class CategoryItemLoader(ItemLoader):
    """Item loader for category pages.

    URL-parsing helpers extract category ids/levels and manufacturer ids;
    the processor declarations below wire them into the loader fields.
    """

    # TODO: move these URL helpers into a shared utility module — they are
    # duplicated in td_spider.py.
    def catIDfromURL(url):
        """Return the numeric category id from a url ending in CatId=NNN.

        Bug fix: the pattern accepts any capitalisation ("catid=",
        "CatID=", ...), but the old code stripped only the literal
        "CatId=" prefix, so other spellings made int() raise ValueError.
        Capture the digits directly instead.
        """
        return int(re.findall(r'[Cc]at[Ii]d=([0-9]+)$', url)[0])

    def catLevelfromURL(url):
        """Return 1 for top-level ("category_tlc") urls, 2 for
        second-level ("category_slc") urls, None otherwise.
        """
        # Note: the previous pattern escaped the underscore (r'\_'),
        # a deprecated no-op escape; plain '_' is equivalent.
        catLevelArry = re.findall(r'(?:category_)([st])(?:lc)', url)
        if catLevelArry:
            if catLevelArry[0] == 't':
                return 1  # top level
            if catLevelArry[0] == 's':
                return 2  # second level

    def linkToMfgID(link):
        """Extract the numeric manufacturer id from a link's @onclick.

        Returns None when the link has no onclick or no MfrId in it.
        Bug fix: digits are captured directly so any capitalisation of
        "MfrId=" parses (the old code only stripped the exact "MfrId=").
        """
        onclick = link.xpath("@onclick").extract()
        if onclick:
            mfgId = re.findall(r'[Mm]fr[Ii]d=([0-9]+)"', onclick[0])
            if mfgId:
                return int(mfgId[0])

    default_input_processor = Identity()
    default_output_processor = Join()

    categoryName_in = TakeFirst()
    categoryName_out = Join()
    tdCategoryID_in = MapCompose(catIDfromURL)
    tdCategoryID_out = TakeFirst()
    tdCategoryParent_in = Identity()
    tdCategoryParent_out = Identity()
    tdCategoryLevel_in = MapCompose(catLevelfromURL)
    tdCategoryLevel_out = TakeFirst()
    manufacturers_in = MapCompose(linkToMfgID)
    manufacturers_out = Identity()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # One loader — and therefore one yielded item — per deal node.
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Input: strip surrounding whitespace from each unicode value;
        # output: join the collected values with a space.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Wire every (field, xpath) pair of item_fields into the loader.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # load_item() runs the processors and materialises the item.
        yield loader.load_item()
class ArticleItemLoader(ItemLoader):
    """Loader for ArticleItem: fields are single-valued by default, with
    site-specific post-processing applied to title and content."""
    default_item_class = ArticleItem
    default_output_processor = TakeFirst()
    # Title: keep the first match, then run the site-specific cleaner.
    title_out = Compose(TakeFirst(), Net39ArticleTitle())
    # Content: concatenate all fragments, then run the content cleaner.
    content_out = Compose(Join(''), Net39ArticleContent())
def parse_article(self, response):
    """
    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    selector = Selector(response)
    loader = XPathItemLoader(LeMondeArt(), selector=selector)
    self.log('\n\nA response from %s just arrived!' % response.url)
    # Strip whitespace on input; space-join the collected values on output.
    text_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # Populate the item field by field; a bad xpath raises ValueError,
    # which is logged instead of aborting the whole article.
    for field, xpath in self.article_item_fields.iteritems():
        try:
            loader.add_xpath(field, xpath, text_input_processor)
        except ValueError:
            self.log("XPath %s not found at url %s" % (xpath, response.url))
    yield loader.load_item()
class ResultsItemsLoader(ItemLoader):
    """Loader for ResultsItem rows scraped from race-result tables.

    Default pipeline: take the first value, coerce to unicode, strip;
    individual fields then parse that string into numbers, times or
    horse-length margins.
    """
    default_item_class = ResultsItem
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)

    Winodds_out = Compose(default_output_processor, try_float)
    FinishTime_out = Compose(default_output_processor, timeprocessor)
    Sec1time_out = Compose(default_output_processor, timeprocessor)
    Sec2time_out = Compose(default_output_processor, timeprocessor)
    Sec3time_out = Compose(default_output_processor, timeprocessor)
    Sec4time_out = Compose(default_output_processor, timeprocessor)
    Sec5time_out = Compose(default_output_processor, timeprocessor)
    Sec6time_out = Compose(default_output_processor, timeprocessor)
    LBW_out = Compose(default_output_processor, horselengthprocessor)
    Draw_out = Compose(default_output_processor, try_int)
    Place_out = Compose(default_output_processor)
    # Bug fix: HorseNumber_out was assigned twice in this class body
    # (first with noentryprocessor, later with try_int).  Only the last
    # assignment is live, so the shadowed duplicate was removed and the
    # effective definition (try_int) kept.
    HorseNumber_out = Compose(default_output_processor, try_int)
    Sec1DBL_out = Compose(default_output_processor, horselengthprocessor)
    Sec2DBL_out = Compose(default_output_processor, horselengthprocessor)
    Sec3DBL_out = Compose(default_output_processor, horselengthprocessor)
    Sec4DBL_out = Compose(default_output_processor, horselengthprocessor)
    Sec5DBL_out = Compose(default_output_processor, horselengthprocessor)
    Sec6DBL_out = Compose(default_output_processor, horselengthprocessor)
    RaceNumber_out = Compose(default_output_processor, try_int)
    DeclarHorseWt_out = Compose(default_output_processor, try_int)
    # Running positions keep every value, space-separated.
    RunningPosition_out = Join(' ')
    image_urls_out = Compose(identity)
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # deals_list_xpath selects each deal container on the page.
    selector = HtmlXPathSelector(response)
    for deal in selector.xpath(self.deals_list_xpath):
        # Fresh loader per deal, scoped to that deal's node.
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)  # trim values
        loader.default_output_processor = Join()                    # space-join
        # Map every configured field to its xpath.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ selector = HtmlXPathSelector(response) details=urlparse(response.request.url) queryStr={x.split('=')[0]:(x.split('=')[1]) for x in details.query.split("&")} print "\n",(urllib.unquote(queryStr['p%5B%5D']).split("=")[1]),queryStr['start'] for deal in selector.select(self.deals_list_xpath): loader = XPathItemLoader(flipkartData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL",unicode(response.request.url, "utf-8")) # adding the category for the request loader.add_value("category",unicode(self.category)) yield loader.load_item()
class ExampleLoader(ItemLoader):
    """Loader for ExampleItem.

    Every input value is whitespace-stripped; each field keeps only its
    first value, except description which joins all values.
    """
    # Default item class produced by this loader.
    default_item_class = ExampleItem
    # Default input and output processors.
    default_input_processor = MapCompose(lambda value: value.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded response
    """
    # Selector over the full response body.
    selector = HtmlXPathSelector(response)
    # Each node matched by content_list_xpath yields one item.
    for content in selector.xpath(self.content_list_xpath):
        loader = XPathItemLoader(RedditLearnPython(), selector=content)
        # Strip whitespace from each unicode value on input;
        # space-join the collected values on output.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Register every (field, xpath) pair with the loader.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # load_item() runs the processors and builds the item.
        yield loader.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # Build one item per matched deal node.
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Whitespace-strip inputs; space-join outputs.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Hook each configured field xpath into the loader.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
class PlayStoreItems(Item):
    """Fields scraped from a Google Play store app page."""
    app_id = Field(output_processor=TakeFirst())
    name = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    category_url = Field(output_processor=TakeFirst())
    # Normalise price labels: drop the " Buy" suffix, map "Install" -> "Free".
    price = Field(
        input_processor=Compose(
            lambda lines: [s.strip().replace(" Buy", "").replace("Install", "Free")
                           for s in lines]),
        output_processor=TakeFirst())
    offers_in_app_purchases = Field(output_processor=TakeFirst())
    # Star counts arrive wrapped in parentheses, e.g. "(1,234)".
    stars_count = Field(
        input_processor=Compose(lambda lines: [s.strip().strip("()") for s in lines]),
        output_processor=Join(''))
    video = Field(output_processor=TakeFirst())
    screenshots = Field()
    description = Field(
        input_processor=Compose(lambda lines: [s.strip() for s in lines]))
    update_date = Field(output_processor=TakeFirst())
    file_size = Field(
        input_processor=Compose(lambda lines: [s.strip() for s in lines]),
        output_processor=TakeFirst())
    installs = Field(
        input_processor=Compose(lambda lines: [s.strip() for s in lines]),
        output_processor=TakeFirst())
    current_version = Field(
        input_processor=Compose(lambda lines: [s.strip() for s in lines]),
        output_processor=TakeFirst())
    requires_android = Field(
        input_processor=Compose(lambda lines: [s.strip() for s in lines]),
        output_processor=TakeFirst())
    offered_by = Field(output_processor=TakeFirst())
    offered_by_url = Field(output_processor=TakeFirst())
class AppItem(CrawledItem):
    """App metadata; markup-laden text fields are cleaned on input."""
    # Shared cleanup: decode entities, drop HTML comments and escape
    # characters, then trim whitespace.
    update_note = Field(
        default='',
        input_processor=MapCompose(unquote_markup, remove_comments,
                                   replace_escape_chars, strip_space),
        output_processor=Join(),
    )
    labels = Field(
        default='',
        input_processor=MapCompose(unquote_markup, remove_comments,
                                   replace_escape_chars, strip_space),
        output_processor=Join(separator=u','),  # labels become a CSV string
    )
    icon_path = Field()
    images_path = Field()
    last_crawl = Field()
class Article(Item):
    """A scraped news article."""
    title = Field(input_processor=MapCompose(strip))
    link = Field()
    content = Field(input_processor=MapCompose(strip),
                    output_processor=Join())
    # Dates are normalised to a common format before being joined.
    date = Field(input_processor=MapCompose(remove_empty, change_date_format),
                 output_processor=Join())
    location = Field(input_processor=MapCompose(remove_empty),
                     output_processor=TakeFirst())
    author = Field()
    keywords = Field()
class AppInfoItemLoader(ItemLoader):
    """Loader for AppInfoItem.

    Inputs are whitespace-stripped and each field keeps its first value
    by default; list-like fields pass through untouched and text blocks
    are joined with field-specific separators.
    """
    default_item_class = AppInfoItem
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
    screenshots_out = Identity()        # keep every screenshot url
    intro_out = Join('<br>')            # paragraphs joined as HTML breaks
    tags_out = Identity()
    permissions_str_out = Join(';')     # permissions as one ;-separated string
    permissions_out = Identity()
    instance_in = Identity()            # bypass the default stripping
class ArticleLoader(ItemLoader):
    """Loader that title-cases and joins names, and strips prices."""
    default_output_processor = TakeFirst()
    name_in = MapCompose(unicode.title)   # "foo bar" -> "Foo Bar"
    name_out = Join()
    price_in = MapCompose(unicode.strip)
class LazyTweetAnswer(Item):
    """One answer scraped from LazyTweet."""
    question_id = Field(input_processor=MapCompose(lambda value: int(value)),
                        output_processor=TakeFirst())
    # Answer text: entity-decode and strip each fragment, then join.
    answer_content = Field(input_processor=MapCompose(remove_entities,
                                                      unicode.strip),
                           output_processor=Join())
    answerer = Field(output_processor=TakeFirst())
    answer_id = Field()
class BookmarkLoader(XPathItemLoader):
    """Loader for bookmark items.

    Every input is entity-decoded and stripped; outputs default to the
    first value, with field-specific parsers for username/date/hash/tags.
    """
    default_input_processor = MapCompose(remove_entities, string.strip)
    default_output_processor = TakeFirst()
    username_out = ParseUsername()
    date_out = ParseDate()
    hash_out = HashStringList()
    tags_out = Join(separator=" ")  # tags become one space-separated string
class FeedEntry(Item):
    """A single feed entry destined for the incident map."""
    id = SingleField()
    title = SingleField()
    link = SingleField()
    # "updated" was deprecated in favour of published / incident_datetime.
    published = SingleField()   # publication date; null -> current timestamp
    summary = Field(input_processor=MapCompose(fix_entities),
                    output_processor=Join(' - '))
    content = Field(input_processor=MapCompose(fix_entities),
                    output_processor=Join('<br/>'))
    lat = SingleField()
    lng = SingleField()
    source_id = SingleField()
    kml_url = SingleField()
    incident_datetime = SingleField()   # date of the incident
    source_item_id = SingleField()      # source-data id (usually == task_id)
class UserLoader(XPathItemLoader):
    """Loader for forum user profiles.

    Numeric and date fields are parsed from their stripped text; all
    fields reduce to their first value.
    """
    default_output_processor = TakeFirst()
    zeta_id_in = MapCompose(unicode.strip, extract_numbers)
    member_number_in = MapCompose(unicode.strip, extract_numbers)
    post_count_in = MapCompose(unicode.strip, to_int)
    signature_in = Join()
    date_birthday_in = MapCompose(unicode.strip, to_datetime_short)
    date_joined_in = MapCompose(unicode.strip, to_datetime_short)
def parse_template(self, response):
    """
    Callback used by Scrapy to process downloaded responses

    //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
    """
    response_body = response.body_as_unicode()
    # A "cups of coffee" banner shifts the pricing divs down by one.
    coffee = 'cups of coffee' in response_body
    # Hoisted: the price-div index was recomputed for each of the three
    # cost fields below.
    price_div = 3 if coffee else 2
    prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
    substr_xpath = 'substring-after(normalize-space({}), "{}")'
    item_fields = {
        'item_hash': '//*[@id="offer_sku"]/text()',
        'title': '//*[@id="thing_name"]/text()',
        'thumbnail': '//*[@id="thing_image"]/@src',
        'description': '//*[@id="description"]',
        'creator': '//*[@id="product_manufacturer"]/text()',
        'when': prop_xpath.format('Released'),
        'bootstrap_version': substr_xpath.format(prop_xpath.format('Bootstrap'),
                                                 'Compatible with '),
        # Prices carry a leading "$" that substring-after strips off.
        'cost_single': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[1]//span/text()'
            .format(price_div), '$'),
        'cost_multiple': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[2]//span/text()'
            .format(price_div), '$'),
        'cost_extended': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[3]//span/text()'
            .format(price_div), '$'),
        'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
    }
    selector = Selector(response)
    loader = ItemLoader(WrapBootstrapTemplate(), selector=selector)
    # Strip whitespace on input; space-join collected values on output.
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # Register every field xpath with the loader.
    for field, xpath in item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
class SinaNewsItemLoader(ItemLoader):
    """Loader for SinaNewsItem.

    Trims every value and keeps the first per field; content and publish
    time get Sina-specific cleanup on input.
    """
    default_item_class = SinaNewsItem
    default_input_processor = MapCompose(lambda value: value.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
    content_in = MapCompose(fo.removeBlankStr, fo.filterHtml)
    publish_time_in = MapCompose(fo.removeBlankStr, fo.getSinaPublishTime)
def __call__(self, values):
    """Join the scraped values into a brand string and return the
    canonical replacement when one of self.replacements matches;
    otherwise return the joined brand unchanged.
    """
    brand = Join()(values)
    # Hoisted out of the loop: the normalised form of the scraped brand
    # (spaces/hyphens removed, lowercased) does not depend on the pair
    # being examined, so it was needlessly recomputed every iteration.
    brand_norm = brand.replace(' ', '').replace('-', '').lower()
    for b, r in self.replacements:
        r_norm = r.replace(' ', '').replace('-', '').lower()
        # NOTE(review): the first test compares the brand against the
        # *replacement* (r), while the second compares it against the
        # alias (b) without normalisation.  It looks like the first was
        # meant to normalise b instead — confirm against the replacement
        # table before changing it.  Behavior preserved as-is.
        if brand_norm == r_norm or b.lower() == brand.lower():
            return r
    return brand
class ZhiHuA(Item):
    """An answer scraped from Zhihu."""
    id = Field(input_processor=MapCompose(lambda value: int(value)),
               output_processor=TakeFirst())
    qid = Field(output_processor=TakeFirst())
    asr = Field(output_processor=TakeFirst())
    # Answer body: entity-decode and strip each fragment, then join.
    content = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                    output_processor=Join())
    score = Field(input_processor=MapCompose(lambda value: int(value)),
                  output_processor=TakeFirst())
class TripAdvisorItem(Item):
    """A TripAdvisor place listing.

    Most text fields share one cleanup pipeline (entity decode + strip)
    and keep only their first value; address joins all fragments instead.
    """
    source = Field(output_processor=TakeFirst())
    source_link = Field(output_processor=TakeFirst())
    name = Field(default='',
                 input_processor=MapCompose(unquote_markup, strip_space),
                 output_processor=TakeFirst())
    rating = Field(default='',
                   input_processor=MapCompose(unquote_markup, strip_space),
                   output_processor=TakeFirst())
    category = Field(default='',
                     input_processor=MapCompose(unquote_markup, strip_space),
                     output_processor=TakeFirst())
    reviews = Field(default='',
                    input_processor=MapCompose(unquote_markup, strip_space),
                    output_processor=TakeFirst())
    price = Field(default='',
                  input_processor=MapCompose(unquote_markup, strip_space),
                  output_processor=TakeFirst())
    city = Field(default='',
                 input_processor=MapCompose(unquote_markup, strip_space),
                 output_processor=TakeFirst())
    # Address fragments are concatenated rather than truncated.
    address = Field(default='',
                    input_processor=MapCompose(unquote_markup, strip_space),
                    output_processor=Join())
    phone = Field()
    hotel_class = Field(default='',
                        input_processor=MapCompose(unquote_markup, strip_space),
                        output_processor=TakeFirst())
    rank_of_city = Field()
    longitude_latitude = Field(default='',
                               input_processor=MapCompose(unquote_markup, strip_space),
                               output_processor=TakeFirst())
    owner_website = Field(default='',
                          input_processor=MapCompose(unquote_markup, strip_space),
                          output_processor=TakeFirst())
    last_crawl = Field()
class TimetableItem(Item):
    """One row of an airport arrivals/departures timetable."""
    airport = Field()
    flight_type = Field()
    flight = Field()
    airline = Field()
    airport_of_departure = Field()
    city_of_departure = Field()
    airport_of_arrival = Field()
    city_of_arrival = Field()
    flight_status = Field()
    # Date/time cells are joined into one string and parsed to a datetime;
    # estimated/actual default to None when the source omits them.
    datetime_scheduled = Field(output_processor=Compose(Join(), to_datetime))
    datetime_estimated = Field(output_processor=Compose(Join(), to_datetime),
                               default=None)
    datetime_actual = Field(output_processor=Compose(Join(), to_datetime),
                            default=None)
    terminal = Field()
    comment = Field()
    checkin_desk = Field(
        output_processor=Compose(Join(), checkin_desk_processor))
class MilkshakeLoader(ItemLoader):
    """Loader for MilkshakeItem: strips tags and whitespace from every
    input value and keeps the first value per field."""
    default_item_class = MilkshakeItem
    default_input_processor = MapCompose(remove_tags, lambda value: value.strip())
    default_output_processor = TakeFirst()
    # Disabled price filter, kept for reference:
    # price_in = MapCompose(remove_tags, filter_price)
    # Description keeps every fragment, space-joined.
    description_out = Join()
class Product(scrapy.Item):
    """A store product with cleaned text and parsed prices."""
    title = Field(input_processor=MapCompose(clean),
                  output_processor=Join())
    url = Field(output_processor=TakeFirst())
    # Prices are parsed from their raw text before the first is kept.
    current_price = Field(input_processor=MapCompose(extract_price),
                          output_processor=TakeFirst())
    regular_price = Field(input_processor=MapCompose(extract_price),
                          output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())
    category_name = Field(input_processor=MapCompose(clean),
                          output_processor=Join())
class BookLoader(ItemLoader):
    """Loader for Book items; maps format icon paths to format names."""
    # Icon image path -> e-book format label.
    format_map = {
        u"/images/adobe_icon.gif": "PDF",
        u"/images/epubDRM_icon.gif": "EPUB",
        u"/images/mobi_icon.gif": "MobiPocket",
    }
    default_item_class = Book
    default_output_processor = TakeFirst()
    author_out = TakeFirst()  # ignore all but the first author
    # "YYYYMM" strings become date objects.
    publish_date_in = MapCompose(lambda raw: datetime.strptime(raw, "%Y%m").date())
    format_in = MapCompose(lambda icon: BookLoader.format_map[icon])
    format_out = Join(", ")
    category_out = Join(", ")
class StackOverflowAnswer(Item):
    """A Stack Overflow answer with its score and acceptance state."""
    answer_id = Field(input_processor=MapCompose(lambda value: int(value)),
                      output_processor=TakeFirst())
    # Answer body: entity-decode and strip each fragment, then join.
    answer_content = Field(input_processor=MapCompose(remove_entities,
                                                      unicode.strip),
                           output_processor=Join())
    answerer = Field(output_processor=TakeFirst())
    marks = Field(input_processor=MapCompose(lambda value: int(value)),
                  output_processor=TakeFirst())
    is_best_answer = Field(output_processor=TakeFirst())
class AppItem(Item):
    """App-store listing fields; text is entity-decoded and stripped."""
    app_id = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                   output_processor=TakeFirst())
    app_type = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                     output_processor=Join())
    title = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                  output_processor=Join())
    description = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                        output_processor=Join())
    score = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                  output_processor=Join())
    author = Field(input_processor=MapCompose(remove_entities, unicode.strip),
                   output_processor=Join())
    icon_url = Field(output_processor=TakeFirst())
    similarity = Field()
    more_from_devs = Field()