class MpdataItem(scrapy.Item):
    """Defines the item fields and specifies processors for each field"""
    name = scrapy.Field(
        input_processor=MapCompose(str.title, str.strip),
        output_processor=Join(' '),
    )
    image = scrapy.Field(output_processor=TakeFirst())
    birthdate = scrapy.Field(output_processor=TakeFirst())
    birthplace = scrapy.Field(
        input_processor=TakeFirst(),
        output_processor=Compose(lambda x: x[0] if len(x[0]) > 2 else "-"),
    )
    profession = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    languages = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    party = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    electoral_district = scrapy.Field(output_processor=TakeFirst())
    first_time_mp = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(', '),
    )
    email = scrapy.Field(output_processor=TakeFirst())
class LegalDocumentLoader(ItemLoader):
    """Process the scraped data.

    The raw HTML is formatted so that opening tags are on new lines and their
    content indented. This allows line-by-line diffs across versions.
    """
    default_output_processor = TakeFirst()

    url_in = Identity()
    url_out = Join()
    title_in = Identity()
    title_out = Join()
    provider_in = Identity()
    provider_out = Join()
    last_updated_in = Identity()
    last_updated_out = Join()
    text_in = MapCompose(prettify_html)  # break lines
    text_out = Join()
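# `prettify_html` is referenced by the loader above but not defined here.
# A minimal sketch of what it might look like, assuming BeautifulSoup is
# available; the body below is an assumption, not the loader's actual helper.
from bs4 import BeautifulSoup


def prettify_html(raw_html):
    # Re-indent the markup so each tag starts on its own line, keeping
    # version-to-version diffs line-based.
    return BeautifulSoup(raw_html, "html.parser").prettify()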
class TruliaItemLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    # We get multiple values when the price is a range, so join them into a
    # single string: "<price 1> - <price 2>".
    price_out = Compose(
        lambda v: take_first_two(v),
        Join(' - '),
        lambda s: s.replace(',', '')
    )
    description_out = Compose(remove_empty)
    features_out = Compose(remove_empty)
    heating_out = Compose(remove_empty)
    floors_out = Compose(remove_empty)
    city_state_out = Join(', ')
    tags_out = Compose(remove_empty)
    attribute_values_out = Compose(remove_empty)
    attribute_names_out = Compose(remove_empty)
    # The area could be "2,500" or a range such as "2,500 - 5,000"; to keep
    # ranges intact we do not convert it to int.
    area_out = Compose(
        TakeFirst(),
        lambda s: s.replace(',', ''),
        str.strip
    )
    bedrooms_out = Compose(TakeFirst(), int)
    bathrooms_out = Compose(TakeFirst(), float)
    deposit_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    year_built_out = Compose(TakeFirst(), int)
    days_on_market_out = Compose(TakeFirst(), int)
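# `take_first_two` and `remove_empty` are used by TruliaItemLoader but not
# defined in this snippet. Plausible sketches of those helpers; both bodies
# are assumptions rather than the project's actual code.
def take_first_two(values):
    # Keep at most the first two extracted values, e.g. the two ends of a
    # price range.
    return values[:2]


def remove_empty(values):
    # Drop empty or whitespace-only strings from the extracted list.
    return [v for v in values if v and v.strip()]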
def parse_other(self, response: HtmlResponse):
    item = response.meta['item']
    loader = QianchengItemLoader(item=item, response=response)
    _extract_info = partial(extract_info, response)

    # The title attribute holds a '|'-separated summary line; "空" ("empty")
    # is used as the placeholder when a value is missing.
    raw_info = _extract_info("//p[@class='msg ltype']/@title")
    info_text = raw_info[0].split("|") if len(raw_info) != 0 else ["空"] * 5
    loader.add_value("experience", info_text[1])
    loader.add_value("education", info_text[2] if len(info_text) == 5 else "空")
    loader.add_value("job_number",
                     info_text[3] if len(info_text) == 5 else info_text[2])
    loader.add_xpath("advantage", '//div[@class="jtag"]/div//span/text()',
                     Compose(Join()))

    # Company details: nature, size and industry.
    info = _extract_info("//div[@class='com_tag']/p/@title")
    loader.add_value("company_nature", info[0] if len(info) != 0 else "空")
    loader.add_value("company_size", info[1] if len(info) != 0 else "空")
    loader.add_value("company_industry", info[2] if len(info) != 0 else "空")
    loader.add_xpath("company_address",
                     "//*[text()='联系方式']/parent::*/parent::*//p/text()",
                     Compose(Join(""), self.replace_all_n))

    # The job description block contains both the job content and the job
    # category ("职能类别"); split the text at the category marker.
    info2 = self.replace_all_n("".join(
        _extract_info("//*[text()='职位信息']/parent::*/parent::*/div//p//text()")))
    loc_div = info2.find("职能类别")
    loader.add_value("job_content", info2[:loc_div])
    loader.add_value("job_kind", info2[loc_div:])
    yield loader.load_item()
class Chapter(Item):
    title_content = Field(
        input_processor=MapCompose(remove_tags, get_title_content),
        output_processor=Join())
    content = Field(
        input_processor=MapCompose(convert_line_break, remove_tags, reformat_chapter_content),
        output_processor=Join('\n\n'))
class NewsLoader(ItemLoader):
    url_out = TakeFirst()
    parent_url_out = TakeFirst()
    published_at_in = MapCompose(date_to_string)
    published_at_out = TakeFirst()
    author_in = MapCompose(trim_author)
    author_out = TakeFirst()
    title_in = MapCompose(unidecode.unidecode, trim)
    title_out = Join()
    description_in = MapCompose(unidecode.unidecode, trim)
    description_out = Join()
    outlet_out = TakeFirst()
    outlet_url_out = TakeFirst()
    type_out = TakeFirst()
    scraped_at_in = MapCompose(date_to_string)
    scraped_at_out = TakeFirst()
    scraped_url_out = TakeFirst()
class BookInfo(Item):
    full_name = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join())
    author = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join())
    last_chapter = Field(
        input_processor=MapCompose(get_last_chapter),
        output_processor=Join())
class IggItem(scrapy.Item):
    title = scrapy.Field(
        input_processor=Compose(TakeFirst(), filter_title, lambda v: v.strip()),
        output_processor=Join(''))
    developer = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))
    publisher = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))
    release_date = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip()),
        output_processor=Join(''))
    genre = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity())
    links = scrapy.Field(
        input_processor=MapCompose(lambda v: v.strip(), filter_empty),
        output_processor=Identity())
class FeedEntryItemLoader(BaseItemLoader):
    default_item_class = FeedEntryItem

    # Field specific
    content_text_in = MapCompose(skip_false, str.strip, remove_tags)
    content_text_out = Join("\n")

    content_html_in = MapCompose(
        skip_false,
        replace_regex,
        build_tree,
        convert_footnotes,
        pullup_elems,
        replace_elems,
        remove_elems,
        change_attribs,
        change_tags,
        cleanup_html,
        convert_iframes,
        lxml_cleaner,
        flatten_tree,
        skip_empty_tree,
        make_links_absolute,
        make_srcset_absolute,
        serialize_tree,
    )
    content_html_out = Compose(Join(), truncate_text)

    # Use sorted to keep the output stable.
    category_out = Compose(set, sorted)

    enclosure_in = Identity()
    enclosure_out = Identity()
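# `skip_false`, used at the head of both MapCompose chains above, is not
# defined in this snippet. A plausible sketch, relying on MapCompose dropping
# None values; the name comes from the loader, the body is an assumption.
def skip_false(value):
    # Returning None makes MapCompose drop empty strings and other falsy
    # values before the rest of the chain runs.
    return value or None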
class ComputerAdLoader(SecondHandAdLoader):
    """Loader for second-hand computer ads."""
    ram_model_in = Identity()
    ram_model_out = Join()
    ram_size_in = Identity()
    ram_size_out = Join()
class ShoesAdLoader(SecondHandAdLoader):
    """Loader for second-hand shoe ads."""
    category_in = Identity()
    category_out = Join()
    size_in = Identity()
    size_out = Join()
class HhLoader(ItemLoader):
    default_item_class = dict

    title_out = TakeFirst()
    salary_in = MapCompose(clear_salary)
    salary_out = Join()
    description_out = Join()
    author_href_in = MapCompose(make_author_link)
    author_href_out = TakeFirst()
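# `clear_salary` and `make_author_link` are referenced by HhLoader but not
# shown here. Rough sketches under assumptions (hh.ru-style pages with
# non-breaking spaces inside salary strings); both bodies are guesses, not
# the project's actual helpers.
from urllib.parse import urljoin


def clear_salary(value):
    # Normalise whitespace such as non-breaking spaces in salary strings.
    return value.replace('\xa0', ' ').strip()


def make_author_link(href):
    # Turn a relative employer link into an absolute URL.
    return urljoin('https://hh.ru', href)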
class HhVacancyLoader(ItemLoader):
    default_item_class = HhVacancyItem

    vacancy_url_out = TakeFirst()
    title_out = TakeFirst()
    salary_out = Join()
    description_out = Join()
    company_url_in = MapCompose(company_url)
    company_url_out = TakeFirst()
class FashionItemLoader(ItemLoader):
    default_input_processor = Compose(normalize)
    length_in = Compose(set, list, sorted)
    brand_out = Join('/')
    price_out = Compose(lambda x: x[0].replace(',', ''))
    wish_out = Compose(TakeFirst(), int)
    category_out = Compose(Join('>'))
    default_output_processor = Compose(set, list, sorted, Join(','))
class User(Item):
    full_name = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join())
    username = Field(
        input_processor=MapCompose(remove_tags, get_username, str.strip),
        output_processor=Join())
    follower = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join())
    following = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join())
class Chapter(Item):
    title_index = Field(
        input_processor=MapCompose(remove_tags, get_title_index),
        output_processor=Join())
    title_content = Field(
        input_processor=MapCompose(remove_tags, get_title_content),
        output_processor=Join())
    content = Field(
        input_processor=MapCompose(replace_break_element, remove_tags, reformat_chapter_content),
        output_processor=Join('\n'))
class ProductItemMeta(scrapy.Item):
    detail_name = scrapy.Field(output_processor=Join())
    brand = scrapy.Field(output_processor=Join())
    description = scrapy.Field(
        input_processor=MapCompose(remove_whitespace),
        output_processor=Join()
    )
    # options = scrapy.Field(serializer=list)
    price = scrapy.Field()
    size_format = scrapy.Field(serializer=str)
    discount_percent = scrapy.Field(serializer=str)
class LermerparserItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field(output_processor=TakeFirst())
    photos = scrapy.Field()
    price = scrapy.Field(
        input_processor=Join('.'),
        output_processor=MapCompose(to_float))
    description = scrapy.Field(
        input_processor=Join(''),
        output_processor=TakeFirst())
    specifications_keys = scrapy.Field()
    specifications_vals = scrapy.Field(
        input_processor=MapCompose(remove_empty_space))
    link = scrapy.Field()
class Juxtapoz_Item(scrapy.Item):
    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst())
    pubtime = Field(output_processor=TakeFirst())
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
class Nytimes_Dir_Item(scrapy.Item):
    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    captions = Field(
        input_processor=Compose(elim_dupes),
        output_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst())
    pubtime = Field(input_processor=MapCompose(iso_time_to_df))
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
class Artag_and_eflux_Item(scrapy.Item):
    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(para_clean))
    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst())
    pubtime = Field(
        input_processor=MapCompose(word_time_to_df),
        output_processor=TakeFirst())
    tag = Field(input_processor=MapCompose(tags_and_unicode))
    url = Field()
    source = Field(output_processor=TakeFirst())
def parse_item(self, response):
    item = ProductItem()
    l = ItemLoader(item=item, response=response)
    l.add_css("title", "span._3mRKt::text", Join(), MapCompose(str.strip))
    l.add_css("sale_price", "div.eP0wn._26-lJ._28iFq::text",
              Join(), MapCompose(str.strip), re='[,.0-9]+')
    l.add_css("full_price", "span._2plVT._35rbh::text", re="[,.0-9]+")
    l.add_css("description", "div._34YUR._1K7NF > span::text",
              MapCompose(str.strip))
    l.add_css("brand", "h1._1psEi > a::text")
    l.add_css("category", "li._1Hb_0:nth-child(4) > a > span::text")
    l.add_value("url", response.url)
    return l.load_item()
class Hyperallergic_Dir_Item(scrapy.Item):
    title = Field(
        input_processor=Join(),
        output_processor=MapCompose(tags_and_unicode))
    para = Field(
        input_processor=Join(),
        output_processor=MapCompose(para_clean))
    captions = Field(input_processor=MapCompose(tags_and_unicode))
    images = Field()
    author = Field(
        input_processor=Join(),
        output_processor=TakeFirst())
    pubtime = Field(
        input_processor=MapCompose(iso_time_to_df),
        output_processor=TakeFirst())
    tag = Field()
    url = Field()
    source = Field(output_processor=TakeFirst())
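# `tags_and_unicode`, shared by the news items above, is not shown in this
# snippet. A rough sketch of the kind of cleanup it presumably performs,
# using w3lib and unidecode; the body is an assumption.
from unidecode import unidecode
from w3lib.html import remove_tags


def tags_and_unicode(value):
    # Strip residual markup, transliterate non-ASCII characters and trim
    # surrounding whitespace.
    return unidecode(remove_tags(value)).strip()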
class AmazonReviewItem(scrapy.Item):
    name = scrapy.Field(output_processor=TakeFirst())
    product_title = scrapy.Field(output_processor=TakeFirst())
    product_url = scrapy.Field(
        input_processor=MapCompose(parse_url),
        output_processor=TakeFirst())
    rating = scrapy.Field(output_processor=TakeFirst())
    review_short = scrapy.Field(output_processor=TakeFirst())
    review_long = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join())
class AuthorLoader(ItemLoader):
    default_item_class = AuthorItem

    name_out = Join()
    site_url_out = TakeFirst()
    field_of_activity_in = MapCompose(parse_field_of_activity)
    author_description_out = TakeFirst()
    url_out = TakeFirst()
def parse_item(self, response):
    """This function parses a property page.

    @url https://www.gumtree.com/p/property-to-rent/one-bedroom-property-near-chiswick-park-tube-station./1405437559
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project server spider date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_value('title', response.meta['title'],
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', '//h2[@itemprop="price"]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//p[@itemprop="description"]/text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address', '//h4[@itemprop="addressLocality"]/text()',
                     MapCompose(str.strip))
    loader.add_xpath('image_urls', '//*[@class="carousel-item"]/img/@src',
                     MapCompose(lambda i: urljoin(response.url, i)))
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse_item(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for a in self.config["attributes"]:
        item.fields[a["name"]] = scrapy.Field()
        processors = []
        if "processors" in a:
            for p in a["processors"]:
                if p == "join":
                    processors.append(Join())
                elif p == "strip":
                    processors.append(MapCompose(str.strip))
        kwargs = {}
        if "regex" in a:
            kwargs["re"] = a["regex"]
        l.add_css(a["name"], a["selector"], *processors, **kwargs)
    item.fields["url"] = scrapy.Field()
    l.add_value("url", response.url)
    return l.load_item()
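# The loader above is driven entirely by configuration. A hypothetical
# `self.config` entry showing the shape it expects ("attributes" with a name,
# a CSS selector, optional processors and an optional regex); the field names
# and selectors below are made up for illustration.
config = {
    "attributes": [
        {"name": "title", "selector": "h1::text", "processors": ["strip", "join"]},
        {"name": "price", "selector": ".price::text", "regex": r"[\d.,]+"},
    ]
}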
def parse_item(self, response):
    """This function parses a property page.

    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    if not response:
        self.log("RESPONSE IS NONE")

    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)

    # Load fields using XPath expressions
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(str.strip), Join())
    l.add_xpath('address',
                '//*[@itemtype="http://schema.org/Place"][1]/text()',
                MapCompose(str.strip))
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urljoin(response.url, i)))

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
class QuotesParquetItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field(
        output_processor=Join()
    )
    author = scrapy.Field()
    tags = scrapy.Field()
class WeekliesScraperItem(Item):
    issue_name = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst()
    )
    issue_number = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst()
    )
    issue_cover_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst()
    )
    issue_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst()
    )
    section_name = Field(
        input_processor=MapCompose(remove_tags, str.strip, str.lower),
        output_processor=TakeFirst()
    )
    article_url = Field(
        input_processor=MapCompose(),
        output_processor=TakeFirst()
    )
    article_title = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=TakeFirst()
    )
    article_authors = Field(
        input_processor=MapCompose(remove_tags, str.strip),
        output_processor=Join(', ')
    )
    article_intro = Field(
        input_processor=MapCompose(remove_tags, str.strip, remove_empty_lines),
        output_processor=Join()
    )
    article_content = Field(
        input_processor=MapCompose(remove_tags, str.strip, remove_empty_lines, remove_xml_tags),
        output_processor=Join('\n')
    )
    article_tags = Field(
        input_processor=MapCompose(str.strip, str.lower),
        output_processor=Join(', ')
    )