def parse_page(self, response):
    # Crawl the image page.
    # print u'~~~~', unicode(response.body, "gbk").encode("utf8")
    # print(self.config["xpathImagesPath"])
    # print(response.xpath(self.config["xpathImagesPath"]))
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.config["id"])
    l.add_value('url', response.url)
    if "imageUrlReplacement" in self.config:
        l.add_value('replace', self.config["imageUrlReplacement"])
    if "xpathImagesPath" in self.config:
        l.add_xpath('image_urls', self.config["xpathImagesPath"])
    if "xpathFilesPath" in self.config:
        l.add_xpath('file_urls', self.config["xpathFilesPath"])
    yield l.load_item()

    # TODO: extract the next-page URL and recurse into parse_page.
    if "xpathNextImageUrl" in self.config:
        nextUrls = response.xpath(self.config["xpathNextImageUrl"])
        if len(nextUrls) > 0:
            nextPage = nextUrls.extract()[0]
            if not nextPage.startswith("http"):
                if nextPage.startswith("/"):
                    nextPage = response.url[0:response.url.index("/", 10) + 1] + nextPage
                else:
                    nextPage = response.url[0:response.url.rfind("/") + 1] + nextPage
            request = scrapy.Request(nextPage, callback=self.parse_page,
                                     cookies={'title': response.request.cookies['title']})
            yield request
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return l.load_item()
def get_player_info(self, response):
    loader = ItemLoader(item=NFL_Player_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    number_and_position = response.xpath(
        '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()
    if type(number_and_position) is list and number_and_position:
        number_and_position = number_and_position[0]
        number = number_and_position.split()[0]
        position = number_and_position.split()[1]
    else:
        number = ''
        position = ''
    loader.add_value('number', number)
    loader.add_value('position', position)
    loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
    loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')
    yield loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('url', response.url)
    l.add_value('name', self.name)
    l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
    return l.load_item()
def parse(self, response):
    """ This function parses a property page.

    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    """
    # 1. First method
    item = PropertiesItem()
    item['title'] = response.xpath('//*[@itemprop="name"][1]/text()').extract()
    item['price'] = response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+')
    item['description'] = response.xpath('//*[@itemprop="description"][1]/text()').extract()
    item['address'] = response.xpath(
        '//*[@itemtype="http://schema.org/'
        'Place"][1]/text()').extract()
    item['image_urls'] = response.xpath('//*[@itemprop="image"][1]/@src').extract()
    return item
    """
    # 2. Second method
    l = ItemLoader(item=PropertiesItem(), response=response)
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()')
    l.add_xpath('price', '//*[@itemprop="price"][1]/text()', re='[.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()')
    l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()')
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')
    return l.load_item()
def parse_item(self, response):
    """ This function parses a property page.

    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)

    # Load fields using XPath expressions
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                MapCompose(unicode.strip, unicode.title))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(unicode.strip), Join())
    l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()',
                MapCompose(unicode.strip))
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())

    return l.load_item()
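# A minimal sketch, not part of the spider above, of the kind of PropertiesItem
# definition such a loader assumes: every field passed to add_xpath()/add_value()
# must be declared on the Item. The field-level processors here are optional and
# only illustrate the alternative to supplying MapCompose/Join inline in the spider.
import scrapy
from scrapy.loader.processors import Join, TakeFirst

class PropertiesItem(scrapy.Item):
    # Primary fields filled from XPath expressions
    title = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())
    description = scrapy.Field(output_processor=Join())
    address = scrapy.Field(output_processor=TakeFirst())
    image_urls = scrapy.Field()
    # Housekeeping fields filled with add_value()
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()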
def parse_movie(self, response):
    loader = ItemLoader(item=DoubanItem(), response=response)
    for attr, xpath in self.settings.getdict('INFO_XPATH').items():
        loader.add_xpath(attr, xpath)
    s = response.xpath('//div[@id="info"]').extract_first()
    for attr, regex in self.settings.getdict('RE').items():
        loader.add_value(attr, re.findall(regex, s))
    loader.add_value('rate', self.parse_rate(response))
    loader.add_value('url', response.url)
    if self.settings.get('ALLOW_COVER') == True:
        image_urls = self._get_urls(
            self.image_base_url,
            urljoin,
            response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
            lambda s: s.split('/')[-1],
        )
        loader.add_value('image_urls', image_urls)
    return loader.load_item()
def parse(self, response):
    l = ItemLoader(item=RentalItem(), response=response)
    l.add_xpath('price', '//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_xpath('adress', '//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    l = ItemLoader(item=NytimesItem(), response=response)
    l.add_xpath('topnews', '//*[contains(@id,"topnews-100")]/h2/a/text()')
    l.add_xpath('sectionnews', '//h3[contains(@class,"story-heading")]/text()')
    x = l.load_item()
    nytdict = dict()
    topnewslist = []
    sectionnewslist = []
    nytdict['date'] = str(datetime.date.today())
    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')))
    nytdict['topnews'] = topnewslist
    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist
    # Dump today's headlines to a date-stamped JSON file.
    filename = datetime.date.today()
    with open('{}.json'.format(filename), 'w') as f:
        json.dump(nytdict, f)
    return l.load_item()
def get_item(self, response):
    loader = ItemLoader(item=expansys_item(), response=response)
    # The URL is a literal value, not an XPath expression, so use add_value().
    loader.add_value('url', 'http://www.allforyou.sg' + response.url)
    loader.add_xpath('title', '//span[contains(@itemprop, "name")]/text()')
    return loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    # l.add_xpath('tag', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_url', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_rate(self, response):
    loader = ItemLoader(item=RateItem(), response=response)
    for attr, xpath in self.settings.getdict('RATE_XPATH').items():
        loader.add_xpath(attr, xpath)
    return loader.load_item()
def parse(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].iteritems():
        if xpath:
            item.fields[name] = Field()
            l.add_xpath(name, xpath)
    return l.load_item()
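# A hypothetical companion to the dynamic-fields parse() above: the request must
# carry a 'fields' mapping (field name -> XPath) in its meta. The URL and XPath
# values here are placeholders, not taken from the original spider.
def start_requests(self):
    fields = {
        'title': '//h1/text()',
        'price': '//*[@class="price"]/text()',
        'summary': None,  # falsy entries are skipped by parse()
    }
    yield scrapy.Request('http://example.com/listing/1',
                         callback=self.parse,
                         meta={'fields': fields})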
def parse(self, response):
    item = ItemLoader(item=OrgItem(), response=response)
    item.add_value('id', self.curr)
    item.add_xpath('name', '//h2[@class="gsc_authors_header"]/text()')
    yield item.load_item()

    next_url = self.next_label_from_db()
    if next_url:
        yield Request(url=next_url, dont_filter=True)
def parse_content(self, response):
    logger.info('Dealing with images: %s', response.url)
    item_load = ItemLoader(item=ScrapyMeizituItem(), response=response)
    item_load.add_value('url', response.url)
    item_load.add_xpath('name', self._x_query['name'])
    item_load.add_xpath('tags', self._x_query['tags'])
    item_load.add_xpath('image_urls', self._x_query['image_urls'])
    return item_load.load_item()
def parse_depth_chart(self, response):
    loader = ItemLoader(item=NFL_Team_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
    loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')
    yield loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PropertiesItem(), response=response)
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()')
    l.add_xpath('price', '//*[@itemprop="price"][1]/text()', re='[.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()')
    l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()')
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')
    return l.load_item()
def parse_stuff(self, response):
    hxs = Selector(response)
    sites = hxs.xpath('//body')
    items_main = []
    for site in sites:
        loader = ItemLoader(item=Items_Main(), response=response)
        loader.add_xpath('fragment', '//*[not(self::script)]/text()')
        items_main.append(loader.load_item())
    return items_main
def parse_accelerator(self, response):
    for sel in response.xpath('//table/tbody/tr'):
        l = ItemLoader(item=SeedDB2Item(), selector=sel)
        l.add_xpath('accelerator', 'td/a/strong/text()')
        l.add_xpath('accelerator_website', 'td/a/@href')
        l.add_xpath('num_cohorts', 'td[3]/span/text()')
        l.add_xpath('num_exits', 'td[4]/span/text()')
        l.add_xpath('num_funding', 'td[5]/span/text()')
        l.add_xpath('num_avg_funding', 'td[6]/span/text()')
        yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=XcspiderItem(), response=response)
    m = response.xpath("//span[@class='ellipsis']/a/@title")
    # print m
    l.add_xpath('dp_content', "//ul/li[@class='main_con']/text()", MapCompose(unicode.strip), Join())
    l.add_xpath('dp_user', "//span[@class='ellipsis']/a/@title", MapCompose(unicode.strip))
    l.add_value('dp_link', response.url)
    l.add_xpath('dp_scence', "//div[@class='f_left']/h1/text()")
    l.add_xpath('dp_provice', "//div[@class='breadbar_v1 cf']/ul/li[4]/a/text()", MapCompose(lambda i: i.replace("景点", '')))
    l.add_xpath('dp_time', "//span[@class='youcate']/text()", MapCompose(unicode.strip))
    return l.load_item()
def parse_item(self, selector, response):
    # Create the loader using the selector
    l = ItemLoader(item=RentalItem(), selector=selector)
    l.add_xpath('price', '(.//span[contains(@class, "item-price")]/text())[1]')
    l.add_xpath('size', './/small/text()[. = "m2"]/../../text()')
    l.add_xpath('rooms', './/small/text()[. = "locali"]/../../text()')
    l.add_xpath('address', './/a[contains(@class, "item-link")]/@title')
    l.add_xpath('elevator', './/span[text()="piano"]/../text()')
    # l.add_xpath('floor', '(.//span[text()="piano"]/../../text())[1]')
    return l.load_item()
def parse_auction_item(self, response):
    loader = ItemLoader(AuctionItems(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    loader.default_output_processor = Join()
    for field, xpath in auction_item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
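# A hypothetical sketch of the auction_item_fields mapping that parse_auction_item()
# iterates over; the field names and XPath expressions below are placeholders, not
# taken from the original spider.
auction_item_fields = {
    'title': '//h1[@class="lot-title"]/text()',
    'current_bid': '//span[@class="current-bid"]/text()',
    'end_time': '//time[@class="auction-end"]/@datetime',
}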
def parse(self, response):
    l = ItemLoader(item=UniprotItem(), response=response)
    l.add_xpath('proteinName', "//*[@id='page-header']/h2/span/text()")
    l.add_value('uniprotAccession', response.url)
    l.add_xpath('uniprotProteinLength', "//*[@id='sequences-section']/div[1]/div[2]/div[1]/span[2]/text()")
    listing = response.xpath("//*[@id='subcellular_location']/div[1]/ul")
    subcellular_location = []
    for li in listing:
        subcellular_location.append(li.xpath("./li/a/text()").extract())
    l.add_value('uniprotLocalization', subcellular_location)
    yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=StartupItem(), response=response)
    l.add_xpath('title', '//*[@id="C-Main-Article-QQ"]//h1/text()')
    l.add_xpath('abstract', '//*[@id="C-Main-Article-QQ"]//p[@class="Introduction"]/text()')

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('scrapy_test'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def parse_item(self, response):
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
    article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
    date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
    if not date_time:
        return
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    item.add_value('date_time', date_time)
    item.add_xpath('title', '//div[@class="hd"]/h1/text()')
    item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
    item.add_xpath('author', '//span[@class="auth"]/text()')
    item.add_value('original_link', response.url)
    elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
    images, content = translate_content(elements)
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
    item.add_value('content', content)
    item.add_value('image_urls', images)
    item.add_value('source', u'腾讯科技')
    item.add_value('category', CATEGORY.TECHNOLOGY)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    # FIXME: fix array issue
    i = ItemLoader(item=SalefinderItem(), response=response)
    title = r'//div[@id="product-details-container"]//h1/text()'
    price = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
    per = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
    image_url = r'//a[@id="product-image-container"]//img/@src'

    i.add_xpath('title', title, MapCompose(unicode.lower))
    i.add_xpath('price', price, re=r'[,.0-9]+')
    i.add_xpath('per', per, re=r'pk|each|kg')
    i.add_xpath('image_url', image_url)
    i.add_value('url', response.url)
    i.add_value('date', date.today().isoformat())

    product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']")
    product_buy_text = product_buy.extract_first().lower()

    # Detect the vendor from a product-buy div
    if 'coles' in product_buy_text:
        i.add_value('vendor', 'coles')
    elif 'woolworths' in product_buy_text:
        i.add_value('vendor', 'woolworths')
    else:
        i.add_value('vendor', 'unknown')

    return i.load_item()
def parse(self, response):
    today = datetime.date.today()
    today_long_date = datetime.datetime.strftime(today, '%A, %d %b %Y')
    today = datetime.datetime.strftime(today, '%A')
    sel = response.xpath
    restaurant = self.get_title(sel)
    l = ItemLoader(item=LunchItem(), response=response)
    l.add_value('restaurant', restaurant)
    l.add_xpath('dishes', "//h4[text()='" + today + "']/following-sibling::table//td[@class='lunch']")
    l.add_value('day', today_long_date)
    yield l.load_item()
def parse(self, response):
    sel = Selector(response)
    table = sel.xpath("//*[@id='imageKey']/tbody/tr")
    for tr in table:
        l = ItemLoader(item=PfamItem(), selector=tr)
        l.add_value('proteinName', response.url)
        l.add_xpath('pfamAccession', "./td[position() = 1 and text() = 'Pfam']/@class")
        l.add_xpath('pfamID', "./td[2]/a/text()")
        l.add_xpath('sequenceStart', "./td[3]/text()")
        l.add_xpath('sequenceEnd', "./td[4]/text()")
        l.add_xpath('proteinLength', '//*[@id="proteinSummaryBlock"]/div[2]/table[1]/tbody/tr[3]/td[2]/text()')
        yield l.load_item()
def parse(self, response):
    sel = Selector(response)
    last_page = sel.xpath('//span[@class="step-links"]/a/text()')[-1].extract()
    self.num_page = int(last_page)
    loader = ItemLoader(item=User(), response=response)
    loader.add_value('uid', self.uid)
    loader.add_xpath('name', '//a[@class="username"]/text()')
    for i in range(1, self.num_page + 1):
        url = self.start_urls[0] + '/' + str(i)
        yield Request(url, callback=self.parse_list, meta={'loader': loader})
def parse_item(self, response):
    # The loader needs the response (or a selector) for add_xpath() to work.
    l = ItemLoader(item=AskspiderItem(), response=response)
    l.add_xpath('q_title', "//h1[@class='ask_title']/text()", MapCompose(unicode.strip), Join())
    l.add_xpath('q_time', "//span[@class='ask_time']/text()", MapCompose(unicode.strip))
    l.add_xpath('q_province', "//div[@class='abouttdd']/ul/li[1]/h3/span/text()", MapCompose(unicode.strip))
    l.add_value('q_link', response.url)
    l.add_xpath('q_user', "//a[@class='ask_username']/text()")
    return l.load_item()
def parse(self, response): if response.xpath(self.TitleXpath).get() is None: raise ValueError("the TitleXpath of Bmbf webpage has changed") elif response.xpath(self.DateXpath).get() is None: raise ValueError("the Bmbf webpage xpath has changed") elif response.xpath(self.UrlXpath).get() is None: raise ValueError("the URl of Bmbf webpage xpath has changed") loader = ItemLoader(item=EventItemBmbf(), response=response) loader.add_xpath("TitleBMBF", self.TitleXpath) loader.add_xpath("DateBMBF", self.DateXpath) loader.add_xpath( "UrlBMBF", self.UrlXpath ) #//div[@class="main"]//div[@class="content"]//div[@class="article-section"]//p/strong item = loader.load_item() # if "TitleBMBF" not in item: raise ValueError("TitleBMBF item is not loaded") elif "DateBMBF" not in item: raise ValueError("DateBMBF item is not loaded") elif "UrlBMBF" not in item: raise ValueError("UrlBMBF item is not loaded") #store crawled data in a dict, then yield it to pipeline mydict = { "title": [], "date": [], "url": [], "paperType": [], "where": [], "deadline": [] } for k in range(int(len(item["UrlBMBF"]))): mydict["title"].append(item["TitleBMBF"][k]) mydict["date"].append( datetime.strptime( item["DateBMBF"][k].replace(" ", "").split("-")[0], '%d.%m.%Y')) #print("date",datetime.strptime( item["DateBMBF"][k].replace(" ","").split("-")[0], '%d.%m.%Y')) #print("----------------------") mydict["url"].append('https://www.bmbf.de/' + item["UrlBMBF"][k]) if (len(item["DateBMBF"][k].split("-")) > 1): #if formatchecker!=datetime.strptime( (item["DateBMBF"][k].replace(" ","")).split("-")[1], '%d.%m.%Y') #raise ValueError("the webpage has changed or the date format has changed") #print(datetime.strptime( (item["DateBMBF"][k].replace(" ","")).split("-")[1], '%d.%m.%Y')) mydict["deadline"].append( datetime.strptime( (item["DateBMBF"][k].replace(" ", "")).split("-")[1], '%d.%m.%Y')) else: mydict["deadline"].append(None) myPipeline = ScrapyProjectPipeline() myPipeline.process_item(mydict, SpiderBmbf) yield mydict #if we put this outside for loop calling pipelines 1 times
def parse_item(self, response):
    l = ItemLoader(item=CollegeNetworkItem(), response=response)
    l.add_xpath('college', "//div[@class='row school-title-wrapper']/p/b/a/text()")
    l.add_xpath('department', "//div[@class='row school-title-wrapper']/p/b/text()[2]")
    l.add_xpath('department_attr', "//div[@class='card-block']/p/b[position() <= 6]/text()")
    l.add_xpath('department_attr_val', "//div[@class='card-block']/p/text()[position() <= 6]")
    l.add_xpath('overlap_college', "//div[@class='card-block']//small/a/text()")
    l.add_xpath('overlap_college_num', "//div[@class='card-block']//small/text()")
    l.add_xpath('applied_region', "//div[@class='card-block']/text()[re:test(., 'x\d{1,3}')]")

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('rtrv_date', datetime.datetime.now())
    return l.load_item()
def parse_comment(self, comment_scope):
    # Extract the data for a single comment
    selector = Selector(text=comment_scope)
    comment_loader = ItemLoader(item=Comment(), selector=selector)
    comment_loader.default_output_processor = TakeFirst()
    comment_loader.add_xpath('comment_id', '//li/@id')
    comment_loader.add_xpath('comment_author', '//div/div[1]/div/span[1]/a/span/text()')
    comment_loader.add_xpath('comment_text', '//div/div[2]/div[2]/div[@class="d-comment__body"]')
    comment_loader.add_xpath('timestamp', '//div/div[1]/div/div/a/time/@datetime')
    comment_loader.add_xpath('parent_comment_id', '//a[@class="js-discussion-author-link"]/@href')
    comment_loader.add_xpath('upvotes', '//div/div[2]/div[1]/span/span[1]/text()')
    return comment_loader.load_item()
def parse_item(self, response): url = response.url item_list = item_code(url, self.web_name, '/loan/(.*?)$') item = ItemLoader(item=YzmSx5170Item(), response=response) item.add_value('web_name', self.web_name) item.add_value('web_code', self.name) item.add_value('url', url) item.add_value('item_code', item_list.get('item_code')) item.add_xpath('title', '//title/text()') item.add_xpath('amount', "//ul[@class='left-1-ul']//li[1]//p[1]") item.add_xpath('rate', "//ul[@class='left-1-ul']//li[2]//p[1]") item.add_xpath('period', "//ul[@class='left-1-ul']//li[3]//p[1]") item.add_xpath( 'loan_using', '//*[contains(text(),"资金用途")]/following-sibling::div[1]/p/text()') # item.add_xpath('loaner_info', '//*[@id="userName"]') item.add_xpath('pay_type', '//*[contains(text(),"还款方式")]/text()') item.add_xpath('progress', "//ol[@class='left-1-ol']//li[2]/span/text()") # invest records i_v = [] invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[2]}|money={lst[2]}|postdate={lst[1]}|status=全部通过}}' invest_records_format = "" tr = response.css('.invest-table').css('tr') if not tr: tr = response.css('.tou.info-tab-main').css('tr') try: for i in tr: lst = i.css('td::text').extract() i_v.append(lst) for n in i_v: invest_records_format += invest_records_temp.format(lst=n) item.add_value('invest_records', invest_records_format) item.add_value('start', i_v[1][1]) item.add_value('end', i_v[-1][1]) except Exception: print(url, 'invest records is error') yield item.load_item()
def parse(self, response): l = ItemLoader(item=FinanceItem(), response=response) l.add_xpath( "Currency", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/thead/tr/th[1]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TimePeriod", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/thead/tr/th[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "CashAndEquivalents", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[1]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "ShortTermInvestments", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[2]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "CashAndShortTermInvestments", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[3]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "AccountsReceivableTradeNet", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[4]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "ReceivablesOther", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[5]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalReceivablesNet", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[6]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalInventory", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[7]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "PrepaidExpenses", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[8]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "OtherCurrentAssetsTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[9]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalCurrentAssets", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[10]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "PropertyPlantEquipmentTotalGross", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[11]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "AccumulatedDepreciationTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[12]/td[2]/span/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "GoodwillNet", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[13]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "IntangiblesNet", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[14]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "LongTermInvestments", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[15]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "OtherLongTermAssetsTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[16]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalAssets", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[17]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "AccountsPayable", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[18]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "AccruedExpenses", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[19]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "NotesPayableShortTermDebt", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[20]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "CurrentPortofLTDebtCapitalLeases", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[21]/td[2]/text()', 
MapCompose(unicode.strip, unicode.title)) l.add_xpath( "OtherCurrentliabilitiesTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[22]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalCurrentLiabilities", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[23]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "LongTermDebt", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[24]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "CapitalLeaseObligations", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[25]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalLongTermDebt", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[26]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalDebt", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[27]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "DeferredIncomeTax", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[28]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "MinorityInterest", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[29]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "OtherLiabilitiesTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[30]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalLiabilities", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[31]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "RedeemablePreferredStockTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[32]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "PreferredStockNonRedeemableNet", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[33]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "CommonStockTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[34]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "AdditionalPaidInCapital", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[35]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "RetainedEarningsAccumulatedDeficit", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[36]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TreasuryStockCommon", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[37]/td[2]/span/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "OtherEquityTotal", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[38]/td[2]/span/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalEquity", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[39]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalLiabilitiesShareholdersEquity", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[40]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "SharesOutsCommonStockPrimaryIssue", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[41]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "TotalCommonSharesOutstanding", '//div[@id="balinterimdiv"]//*[@id="fs-table"]/tbody/tr[42]/td[2]/text()', MapCompose(unicode.strip, unicode.title)) return l.load_item()
def parse_match(self, response, **kwargs): """ Fetches data about particular event from returned content. Creates match item and fills with fetched data. """ html_event_part = HtmlResponse(url=response.url, body=json.loads( response.body)['content1'].encode()) match_loader = ItemLoader(item=MatchItem(), response=html_event_part) match_loader.add_value('id', uuid4()) match_loader.add_value('league_id', kwargs.get('league_id')) match_loader.add_xpath('timestamp', '//p[@data-time]/@data-time') match_loader.add_xpath( 'home_team', '//span[@itemprop="homeTeam"]/span[@itemprop="name"]/@content') match_loader.add_xpath( 'away_team', '//span[@itemprop="awayTeam"]/span[@itemprop="name"]/@content') match_loader.add_xpath('stadium', '//small/span[@itemprop="name"]/text()') match_loader.add_xpath( 'home_result', '//div[contains(@class, "h2h-final-score")]/' 'div[@class="widget-content"]/h2/text()') match_loader.add_xpath( 'away_result', '//div[contains(@class, "h2h-final-score")]/' 'div[@class="widget-content"]/h2/text()') match = match_loader.load_item() yield match html_post_match = HtmlResponse(url=response.url, body=json.loads( response.body)['content2'].encode()) if html_post_match.xpath('//div[@class="w100 cf ac"]'): # if post match statistics data exists statistics_loader = ItemLoader(item=PostMatchStatisticsItem(), response=html_post_match) statistics_loader.add_value('id', uuid4()) statistics_loader.add_value('match_id', match['id']) statistics_loader.add_xpath( 'possession_home', '//span[contains(@class, "possession")]/text()') statistics_loader.add_xpath( 'possession_away', '//span[contains(@class, "possession")]/text()') statistics_loader.add_xpath( 'shots_home', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Shots")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'shots_away', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Shots")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'cards_home', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Cards")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'cards_away', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Cards")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'corners_home', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Corners")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'corners_away', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Corners")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'fouls_home', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Fouls")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'fouls_away', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Fouls")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'offsides_home', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Offsides")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) statistics_loader.add_xpath( 'offsides_away', '//div[@class="w100 m0Auto"]/div/div[contains(text(), "Offsides")]' '/following-sibling::div[contains(@class, "bbox")]/span/text()' ) yield statistics_loader.load_item()
def parse_items(self, response):
    item = ItemLoader(AirbnbItem(), response)
    item.add_xpath('tipo', '//*[@id="summary"]/div/div/div[1]/div/div/div/div[2]/div[2]/div/div[1]/text()')
    item.add_xpath('capacidad', '//*[@id="summary"]/div/div/div[1]/div/div/div/div[2]/div[2]/div/div[2]/text()',
                   MapCompose(lambda i: i[0]))
    yield item.load_item()
def parse_reply(self, response): ''' parse reply to comments, root comment is added if flag ''' if response.meta['flag'] == 'init': #parse root comment for root in response.xpath( '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]' ): new = ItemLoader(item=CommentsItem(), selector=root) new.context['lang'] = self.lang new.add_xpath('source', './/h3/a/text()') new.add_value('reply_to', 'ROOT') new.add_xpath('text', './/div[1]//text()') new.add_xpath('date', './/abbr/text()') new.add_xpath( 'reactions', './/a[contains(@href,"reaction/profile")]//text()') new.add_value('url', response.url) yield new.load_item() #parse all replies in the page for reply in response.xpath( '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]' ): new = ItemLoader(item=CommentsItem(), selector=reply) new.context['lang'] = self.lang new.add_xpath('source', './/h3/a/text()') new.add_value('reply_to', response.meta['reply_to']) new.add_xpath('text', './/div[h3]/div[1]//text()') new.add_xpath('date', './/abbr/text()') new.add_xpath( 'reactions', './/a[contains(@href,"reaction/profile")]//text()') new.add_value('url', response.url) yield new.load_item() back = response.xpath( '//div[contains(@id,"comment_replies_more_1")]/a/@href' ).extract() if back: self.logger.info('Back found, more nested comments') back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, priority=100, meta={ 'reply_to': response.meta['reply_to'], 'flag': 'back', 'url': response.meta['url'], 'index': response.meta['index'] }) else: next_reply = response.meta['url'] self.logger.info( 'Nested comments crawl finished, heading to proper page: {}' .format(response.meta['url'])) yield scrapy.Request( next_reply, callback=self.parse_page, meta={'index': response.meta['index'] + 1}) elif response.meta['flag'] == 'back': #parse all comments for reply in response.xpath( '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]' ): new = ItemLoader(item=CommentsItem(), selector=reply) new.context['lang'] = self.lang new.add_xpath('source', './/h3/a/text()') new.add_value('reply_to', response.meta['reply_to']) new.add_xpath('text', './/div[h3]/div[1]//text()') new.add_xpath('date', './/abbr/text()') new.add_xpath( 'reactions', './/a[contains(@href,"reaction/profile")]//text()') new.add_value('url', response.url) yield new.load_item() #keep going backwards back = response.xpath( '//div[contains(@id,"comment_replies_more_1")]/a/@href' ).extract() self.logger.info('Back found, more nested comments') if back: back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, priority=100, meta={ 'reply_to': response.meta['reply_to'], 'flag': 'back', 'url': response.meta['url'], 'index': response.meta['index'] }) else: next_reply = response.meta['url'] self.logger.info( 'Nested comments crawl finished, heading to home page: {}'. format(response.meta['url'])) yield scrapy.Request( next_reply, callback=self.parse_page, meta={'index': response.meta['index'] + 1})
def parse_video(self, response):
    item = ItemLoader(Video(), response)
    item.add_xpath('titulo', '//h1/text()')
    item.add_xpath('fecha_de_publicacion', '//span[@class="publish-date"]/text()')
    yield item.load_item()
def parse_title(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('TITLE', '//h1[contains(@class, "parchment")]//text()')
    return il.load_item()
def parse_keywords(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('KEYWORDS', '//td[contains(@class, "tabbody")]//ul/li/a/text()')
    return il.load_item()
def parse(self, response): self.crawler.stats.set_value('pages_to_visit', len(self.urls)) loader = ItemLoader(item=SofifaItem(), response=response) col_4_loader = loader.nested_xpath( ".//div[@class='column col-4 text-center']") # GENERAL PLAYER INFORMATION loader.add_xpath('id', ".//div[@class='info']/h1/text()") loader.add_xpath('name', ".//div[@class='info']/h1/text()") loader.add_xpath('full_name', ".//div[@class='meta']/text()") loader.add_xpath( 'age', ".//div[@class='meta']/text()/following-sibling::text()[last()]") loader.add_xpath( 'dob', ".//div[@class='meta']/text()/following-sibling::text()[last()]") loader.add_xpath( 'height', ".//div[@class='meta']/text()/following-sibling::text()[last()]") loader.add_xpath( 'weight', ".//div[@class='meta']/text()/following-sibling::text()[last()]") loader.add_xpath('nationality', ".//div[@class='meta']/a/@title") # GENERAL PLAYER STATS loader.add_xpath( 'preferred_foot', "(.//label[text()='Preferred Foot']/following::text())[1]") loader.add_xpath( 'international_reputation', "(.//label[text()='International Reputation']/following::text())[1]" ) loader.add_xpath( 'weak_foot', "(.//label[text()='Weak Foot']/following::text())[1]") loader.add_xpath( 'skill_moves', "(.//label[text()='Skill Moves']/following::text())[1]") loader.add_xpath( 'work_rate', "(.//label[text()='Work Rate']/following::span/text())[1]") loader.add_xpath( 'body_type', "(.//label[text()='Body Type']/following::span/text())[1]") loader.add_xpath( 'real_face', "(.//label[text()='Real Face']/following::span/text())[1]") # CLUB/TEAM INFORMATION col_4_loader.add_xpath( 'value', "following::text()[contains(., 'Value')]/following::span[1]/text()" ) col_4_loader.add_xpath( 'wage', "following::text()[contains(., 'Wage')]/following::span[1]/text()") loader.add_xpath( 'release_clause', "(.//label[text()='Release Clause']/following::span/text())[1]") loader.add_xpath('club_name', "(.//ul[@class='pl']//a/text())[1]") loader.add_xpath( 'club_rating', ".//div[@class='column col-4'][3]/ul/li[2]/span/text()") loader.add_xpath( 'club_position', "(.//label[text()='Position']/following::text()[1])[1]") loader.add_xpath( 'club_jersey_number', "(.//label[text()='Jersey Number']/following::text()[1])[1]") loader.add_xpath('club_join_date', ".//label[text()='Joined']/following::text()[1]") loader.add_xpath( 'loaned_from', ".//label[text()='Loaned From']/following::a[1]/text()") loader.add_xpath( 'club_contract_end_date', ".//label[text()='Contract Valid Until']/following::text()[1]") loader.add_xpath('team_name', "(.//ul[@class='pl']//a/text())[2]") loader.add_xpath( 'team_rating', ".//div[@class='column col-4'][4]/ul/li[2]/span/text()") loader.add_xpath( 'team_position', "(.//label[text()='Position']/following::text()[1])[2]") loader.add_xpath( 'team_jersey_number', "(.//label[text()='Jersey Number']/following::text()[1])[2]") # PLAYER GAME STATS loader.add_xpath( 'overall_rating', "(.//div[@class='column col-4 text-center']" "/preceding::text()[contains(.,'Overall Rating')])[2]/following::span[1]/text()" ) col_4_loader.add_xpath( 'potential_rating', "following::text()[contains(., 'Potential')]/following::span[1]" "/text()") loader.add_xpath('positions', ".//div[@class='meta']/span/text()") loader.add_xpath('unique_attributes', ".//div[@class='mt-2']/a/text()") if 'GK' in response.xpath( ".//div[@class='meta']/span/text()").getall(): loader.add_xpath( 'DIV', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'HAN', 
".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'KIC', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'REF', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'SPD', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'POS', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) else: loader.add_xpath( 'PAC', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'SHO', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'PAS', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'DRI', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'DEF', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) loader.add_xpath( 'PHY', ".//div[@class='wrapper']//script[contains(text(), 'var overallRating')]/text()" ) # PLAYER DETAILED STATS loader.add_xpath('crossing', "(.//span[../span='Crossing']/text())[1]") loader.add_xpath('finishing', "(.//span[../span='Finishing']/text())[1]") loader.add_xpath('heading_accuracy', "(.//span[../span='Heading Accuracy']/text())[1]") loader.add_xpath('short_passing', "(.//span[../span='Short Passing']/text())[1]") loader.add_xpath('volleys', "(.//span[../span='Volleys']/text())[1]") loader.add_xpath('aggression', "(.//span[../span='Aggression']/text())[1]") loader.add_xpath('interceptions', "(.//span[../span='Interceptions']/text())[1]") loader.add_xpath('positioning', "(.//span[../span='Positioning']/text())[1]") loader.add_xpath('vision', "(.//span[../span='Vision']/text())[1]") loader.add_xpath('penalties', "(.//span[../span='Penalties']/text())[1]") loader.add_xpath('composure', ".//li[contains(text(), 'Composure')]/span/text()") loader.add_xpath('dribbling', "(.//span[../span='Dribbling']/text())[1]") loader.add_xpath('curve', "(.//span[../span='Curve']/text())[1]") loader.add_xpath('fk_accuracy', "(.//span[../span='FK Accuracy']/text())[1]") loader.add_xpath('long_passing', "(.//span[../span='Long Passing']/text())[1]") loader.add_xpath('ball_control', "(.//span[../span='Ball Control']/text())[1]") loader.add_xpath('marking', "(.//span[../span='Marking']/text())[1]") loader.add_xpath('standing_tackle', "(.//span[../span='Standing Tackle']/text())[1]") loader.add_xpath('sliding_tackle', "(.//span[../span='Sliding Tackle']/text())[1]") loader.add_xpath('acceleration', "(.//span[../span='Acceleration']/text())[1]") loader.add_xpath('sprint_speed', "(.//span[../span='Sprint Speed']/text())[1]") loader.add_xpath('agility', "(.//span[../span='Agility']/text())[1]") loader.add_xpath('reactions', "(.//span[../span='Reactions']/text())[1]") loader.add_xpath('balance', "(.//span[../span='Balance']/text())[1]") loader.add_xpath('gk_diving', ".//li[contains(text(), 'GK Diving')]/span/text()") loader.add_xpath('gk_handling', ".//li[contains(text(), 'GK Handling')]/span/text()") loader.add_xpath('gk_kicking', ".//li[contains(text(), 'GK Kicking')]/span/text()") loader.add_xpath( 'gk_positioning', ".//li[contains(text(), 'GK Positioning')]/span/text()") loader.add_xpath('gk_reflexes', ".//li[contains(text(), 'GK Reflexes')]/span/text()") loader.add_xpath('shot_power', "(.//span[../span='Shot Power']/text())[1]") 
loader.add_xpath('jumping', "(.//span[../span='Jumping']/text())[1]") loader.add_xpath('stamina', "(.//span[../span='Stamina']/text())[1]") loader.add_xpath('strength', "(.//span[../span='Strength']/text())[1]") loader.add_xpath('long_shots', "(.//span[../span='Long Shots']/text())[1]") loader.add_xpath( 'traits', ".//h5[text()='Traits']/following-sibling::ul/li/span/text()") # PLAYER REAL OVERALL RATING (POSITIONAL STATS) loader.add_xpath('LS', "(.//div[../div='LS']/following::text())[1]") loader.add_xpath('ST', "(.//div[../div='ST']/following::text())[1]") loader.add_xpath('RS', "(.//div[../div='RS']/following::text())[1]") loader.add_xpath('LW', "(.//div[../div='LW']/following::text())[1]") loader.add_xpath('LF', "(.//div[../div='LF']/following::text())[1]") loader.add_xpath('CF', "(.//div[../div='CF']/following::text())[1]") loader.add_xpath('RF', "(.//div[../div='RF']/following::text())[1]") loader.add_xpath('RW', "(.//div[../div='RW']/following::text())[1]") loader.add_xpath('LAM', "(.//div[../div='LAM']/following::text())[1]") loader.add_xpath('CAM', "(.//div[../div='CAM']/following::text())[1]") loader.add_xpath('RAM', "(.//div[../div='RAM']/following::text())[1]") loader.add_xpath('LM', "(.//div[../div='LM']/following::text())[1]") loader.add_xpath('LCM', "(.//div[../div='LCM']/following::text())[1]") loader.add_xpath('CM', "(.//div[../div='CM']/following::text())[1]") loader.add_xpath('RCM', "(.//div[../div='RCM']/following::text())[1]") loader.add_xpath('RM', "(.//div[../div='RM']/following::text())[1]") loader.add_xpath('LWB', "(.//div[../div='LWB']/following::text())[1]") loader.add_xpath('LDM', "(.//div[../div='LDM']/following::text())[1]") loader.add_xpath('CDM', "(.//div[../div='CDM']/following::text())[1]") loader.add_xpath('RDM', "(.//div[../div='RDM']/following::text())[1]") loader.add_xpath('RWB', "(.//div[../div='RWB']/following::text())[1]") loader.add_xpath('LB', "(.//div[../div='LB']/following::text())[1]") loader.add_xpath('LCB', "(.//div[../div='LCB']/following::text())[1]") loader.add_xpath('CB', "(.//div[../div='CB']/following::text())[1]") loader.add_xpath('RCB', "(.//div[../div='RCB']/following::text())[1]") loader.add_xpath('RB', "(.//div[../div='RB']/following::text())[1]") # COMMUNITY INFORMATION loader.add_xpath( 'followers', "(.//div[@class='operation mt-2']/a/text()[contains(.,'Follow')]" "/following::span)[1]/text()") loader.add_xpath( 'likes', "(.//div[@class='operation mt-2']/a/text()[contains(.,'Like')]" "/following::span)[1]/text()") loader.add_xpath( 'dislikes', "(.//div[@class='operation mt-2']/a/text()[contains(.,'Dislike')]" "/following::span)[1]/text()") # MEDIA loader.add_xpath('face_img', ".//div/div/article/div/img//@data-src") loader.add_xpath('flag_img', ".//div[@class='meta']/a/img/@data-src") loader.add_xpath('club_logo_img', "(.//div/ul/li/figure/img/@data-src)[1]") loader.add_xpath('team_logo_img', "(.//div/ul/li/figure/img/@data-src)[2]") self.logger.info(f'Parse function called on {response.url}') self.logger.info( f"Currently on page {self.crawler.stats.get_value('page_counter')} out of " f"{self.crawler.stats.get_value('pages_to_visit')}") # TODO: enable continued logging of page_counter after a pause/resume. self.crawler.stats.inc_value(key='page_counter', count=1, start=0) print(response.request.headers['User-Agent']) print( f"{self.crawler.stats.get_value('page_counter')} out of {self.crawler.stats.get_value('pages_to_visit')}" ) yield loader.load_item()
def parse_content(self, response):
    item = ItemLoader(item=RealEstateItem(), response=response)
    item.add_value("id", str(uuid1()))
    item.add_value("domain", 'lianjia')
    # Short listing description
    item.add_xpath("title", '//*[@class="header-title"]/text()')
    # Housing estate (community) name
    item.add_xpath("housing_estate", "//*[@class='maininfo-estate-name']/a[1]/text()")
    # Total price, in units of 10,000 CNY
    item.add_xpath("price_num", '//*[@class="price-num"]/text()')
    # Estate address
    item.add_xpath("address", "//*[@class='item-cell maininfo-estate-address']/text()")
    # Room layout
    item.add_xpath("rooms", '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[2]/ul/li[1]/span[2]/text()')
    # Listing code
    item.add_xpath("house_code", '//*[@class="maininfo-minor maininfo-item"]/li[4]/span[2]/text()[1]')
    # Crawled URL
    item.add_value("url", response.url)
    # Floor area
    item.add_xpath("floorage", '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[2]/ul/li[3]/span[2]/text()')
    # Decoration: 0 - bare shell, 1 - basic, 2 - medium, 3 - refined
    item.add_xpath("decoration_situation", '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[2]/span[2]/text()')
    # Unit price per square metre
    item.add_xpath("price_unit_num", '//*[@class="price-unit-num"]/span/text()')
    # Floor
    item.add_xpath("floor", '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[1]/span[2]/text()')
    # Years since the property deed was issued
    item.add_xpath("term", '//*[@id="js-baseinfo-header"]/div[1]/div[2]/div[2]/ul/li[2]/span[2]/text()')
    # Year built
    item.add_xpath("year", '//*[@class="main-item u-tr"]/p[2]/text()')
    # Orientation
    item.add_xpath("orientation", '//*[@id="js-baseinfo-header"]/div[1]/div[1]/div[3]/ul/li[3]/span[2]/text()[1]')
    # Tags
    item.add_xpath("tags", '//*[@id="js-baseinfo-header"]/div[1]/div[4]/div[2]/ul/li/span/text()')
    # City name
    item.add_value("city", "苏州")
    # District
    item.add_value("district", "工业园区")
    # Record creation time
    item.add_value("create_time", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    return item.load_item()
def parse(self, response): print("typeeeeeeeeeeeeeeeeeeeeeee", response) print("typeeeeeeeeeeeeeeeeeeeeeee", type(response)) # if response.xpath(self.DateXpath).get() is None \ # or response.xpath(self.UrlXpath).get() is None\ # or response.xpath(self.DeadlineXpath).get() is None\ # or response.xpath(self.PlaceXpath).get() is None \ # or response.xpath(self.TitleXpath).get() is None : # # raise ValueError("Wiki webpage has changed") if response.xpath(self.DateXpath).get() is None: raise ValueError("the DateXpath of Wiki webpage has changed") if response.xpath(self.UrlXpath).get() is None: raise ValueError("the UrlXpath of Wiki webpage has changed") if response.xpath(self.DeadlineXpath).get() is None: raise ValueError("the DeadlineXpath of Wiki webpage has changed") if response.xpath(self.PlaceXpath).get() is None: raise ValueError("the PlaceXpath of Wiki webpage has changed") if response.xpath(self.TitleXpath).get() is None: raise ValueError("the TitleXpath of Wiki webpage has changed") loader = ItemLoader(item=EventItemWiki(), response=response) #loader of type EventItemWiki loader.add_xpath("DateWiki", self.DateXpath) loader.add_xpath("UrlWiki", self.UrlXpath) loader.add_xpath("DeadlineWiki", self.DeadlineXpath) loader.add_xpath("PlaceWiki", self.PlaceXpath) j = 0 for i in range(int(len(loader.load_item()["PlaceWiki"]))): title = response.xpath(self.TitleXpath)[j + 1].extract() loader.add_value("TitleWiki", title) j = j + 5 item = loader.load_item() # item is loaded with loader data dictionary # if "TitleWiki" not in item or "DateWiki" not in item or "UrlWiki" not in item or "DeadlineWiki" not in item or "PlaceWiki" not in item: #you do it # print("item is not loaded properly") # raise ValueError if "TitleWiki" not in item: raise ValueError("TitleWiki item is not loaded") if "DateWiki" not in item: raise ValueError("DateWiki item is not loaded") if "UrlWiki" not in item: raise ValueError("UrlWiki item is not loaded") if "DeadlineWiki" not in item: raise ValueError("DeadlineWiki item is not loaded") if "PlaceWiki" not in item: raise ValueError("PlaceWiki item is not loaded") #store crawled data to a dict, then yield to pipeline #calling pipelines 1 times mydict = { "title": [], "date": [], "url": [], "paperType": [], "where": [], "deadline": [] } for k in range(int(len(item["TitleWiki"]))): mydict["title"].append(item["TitleWiki"][k]) if item['DateWiki'][k] == "N/A" or item['DateWiki'][ k] == "TBD" or item["DateWiki"][k] == "Online" or item[ "DateWiki"][k] == "ONLINE": mydict["date"].append(None) #mydict["date"].append(item['DateWiki'][k].split("-")[0][0:12].rstrip()) else: mydict["date"].append( datetime.strptime( item['DateWiki'][k].split("-")[0][0:12].rstrip(), '%b %d, %Y')) mydict["url"].append('http://www.wikicfp.com' + item["UrlWiki"][k]) mydict["paperType"].append("Wiki") mydict["where"].append(item["PlaceWiki"][k]) if item["DeadlineWiki"][k] == "TBD" or item["DeadlineWiki"][ k] == "N/A": #TBD mydict["deadline"].append(None) elif (len(item["DeadlineWiki"][k]) >= 12): #with brackets mydict["deadline"].append( datetime.strptime( item['DeadlineWiki'][k].split("(")[0][0:12].rstrip(), '%b %d, %Y')) else: # normal case mydict["deadline"].append( datetime.strptime(item['DeadlineWiki'][k], '%b %d, %Y')) #Calling pipeline myPipeline = ScrapyProjectPipeline() myPipeline.process_item(mydict, SpiderWiki) yield mydict #calling pipelines 1 time
def parse(self, response): try: if response.status == 404: self.append(self.bad_log_file, response.url) elif response.status == 200: selectors = response.xpath( '//*[@id="ContentPlaceHolder1_UpdatePanel1"]/div') del selectors[:2] del selectors[-1] for divs in selectors: #Parse despesas l = ItemLoader(item=Despesa(), selector=divs) l.add_xpath( 'data', './b[contains(text(),"Data")]/following-sibling::text()[1]' .encode('utf-8'), MapCompose(str.strip)) l.add_xpath( 'tipo', './b[contains(text(),"Tipo")]/following-sibling::text()[1]' .encode('utf-8'), MapCompose(str.strip)) l.add_xpath( 'responsavel', u'./b[contains(text(),"Responsável")]/following-sibling::text()[1]', MapCompose(str.strip)) l.add_xpath( 'usuario', './b[contains(text(),"Usuário")]/following-sibling::text()[1]', MapCompose(str.strip)) l.add_xpath( 'valor', './b[contains(text(),"Valor")]/following-sibling::text()[1]' .encode('utf-8'), MapCompose(str.strip)) l.add_xpath( 'localidade', './b[contains(text(),"Localidade")]/following-sibling::text()[1]' .encode('utf-8'), MapCompose(str.strip)) l.add_xpath( 'justificativa', './b[contains(text(),"Justificativa")]/following-sibling::text()[1]' .encode('utf-8'), MapCompose(str.strip, remove_tags, replace_escape_chars, remove_comments)) yield l.load_item() else: self.append(self.bad_log_file, response.url) except Exception as e: self.log('[exception] : %s' % e) #Post request pagination yield scrapy.FormRequest.from_response( response, url="http://www.cms.ba.gov.br/despesa.aspx/", formdata={ '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00', 'ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00': 'ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$dpNoticia$ctl02$ctl00' }, callback=self.parse)
def parse_statistics(self, response): driver = response.meta['driver'] nav_urls = response.meta['nav_urls'] parent_loader = response.meta['loader'] loader = ItemLoader(parent=parent_loader, response=response) fiftytwo_week_high = response.xpath( "//tr/td/span[text()='52 Week High']/parent::td/following-sibling::td[1]/text()" ).get() loader.add_value('fiftytwo_week_high', fiftytwo_week_high) previous_close = locale.atof(loader.get_output_value('previous_close')) one_year_target_est = locale.atof( loader.get_output_value('one_year_target_est')) diff_to_52_week_high = 1 - (previous_close - locale.atof(fiftytwo_week_high)) diff_to_1y_target_est = 1 - (one_year_target_est - previous_close) loader.add_value( 'diff_to_52_week_high', f"{self._round_off_2_decimal(diff_to_52_week_high)}%") loader.add_value( 'diff_to_1y_target_est', f"{self._round_off_2_decimal(diff_to_1y_target_est)}%") forward_pe = self._wait_and_find_elem( driver, "//tr/td/span[text()='Forward P/E']/parent::td/following-sibling::td[1]" ).text loader.add_xpath('forward_pe', forward_pe) market_cap = response.xpath( "//tr/td/span[contains(text(), 'Market Cap')]/parent::td/following-sibling::td[1]/text()" ).get() unit = market_cap[-1] if unit == 'B': multiplier = 1000 elif unit == 'T': multiplier = 1000000 else: multiplier = 1 market_cap = float(market_cap[0:-1]) * multiplier loader.add_value('market_cap', market_cap) peg_ratio = response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('peg_ratio', peg_ratio) loader.add_xpath( 'price_over_sales', "//tr/td/span[contains(text(), 'Price/Sales')]/parent::td/following-sibling::td[1]/text()" ) price_over_book = response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('price_over_book', price_over_book) return_on_assets = response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('return_on_assets', return_on_assets) return_on_equity = response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('return_on_equity', return_on_equity) loader.add_xpath( 'diluted_eps', "//tr/td/span[contains(text(), 'Diluted EPS')]/parent::td/following-sibling::td[1]/text()" ) quarterly_earnings_growth = response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('quarterly_earnings_growth', quarterly_earnings_growth) fwd_annual_dividend_rate = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/span/text()").get() 
loader.add_value('fwd_annual_dividend_rate', fwd_annual_dividend_rate) fwd_annual_dividend_yield = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('fwd_annual_dividend_yield', fwd_annual_dividend_yield) ex_dividend_date = response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/text()").get() or \ response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/span/text()").get() loader.add_value('ex_dividend_date', ex_dividend_date) yield SeleniumRequest(url=nav_urls['profile_url'], callback=self.parse_profile, previous_response=response, meta={ "loader": loader, "nav_urls": nav_urls })
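# NOTE (sketch): parse_statistics assumes two helpers on the spider that are not shown.
# The bodies below are assumptions, not the original implementation: `_wait_and_find_elem`
# is a thin WebDriverWait wrapper and `_round_off_2_decimal` is plain rounding (whether the
# original also scales the ratio to a percentage is unknown). `locale.atof` additionally
# requires a locale with thousands separators to have been configured.
import locale
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class StatisticsHelpers:
    def __init__(self):
        # Assumed: lets locale.atof('1,234.56') parse US-style numbers.
        locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')

    def _round_off_2_decimal(self, value):
        # Round a numeric value to two decimal places.
        return round(float(value), 2)

    def _wait_and_find_elem(self, driver, xpath, timeout=10):
        # Wait until the element is present in the DOM, then return it.
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))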
def parse_item(self, response): items = [] #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get() #item['name'] = response.xpath('//div[@id="name"]').get() #item['description'] = response.xpath('//div[@id="description"]').get() i = 0 while (1): i += 1 if len(response.xpath( '(//*[@class="team-name"])[%d]/text()' % i)) < 1: break else: print("name: %s" % response.xpath( '(//*[@class="team-name"])[%d]/text()' % i).extract()) # print("web: %s" % response.xpath('//html').extract()) l = ItemLoader(item=GameRecord(), response=response) l.add_xpath('name1', '(//*[@class="team-name"])[%d]/text()' % i) l.add_xpath('name2', '(//*[@class="team-name f-toe"])[%d]/text()' % i) l.add_xpath('time', '(//*[@class="td time"])[%d]/text()' % i) l.add_xpath('series', '(//*[@class="f-toe f-csp"])[%d]/text()' % i) l.add_xpath( 'score1', '(//*[@class="vs-data f-csp"])[%d]/@data-matchhomescore' % i, MapCompose(int)) # l.add_xpath('score1', '(//*[@class="td vs"])[%d]/a/@data-matchhomescore' % i) l.add_xpath( 'score2', '(//*[@class="vs-data f-csp"])[%d]/@data-matchawayscore' % i, MapCompose(int)) l.add_value('last_updated', 'today') # you can also use literal values l.add_value('url', response.url) l.add_value('spider', self.name) l.add_value('server', socket.gethostname()) l.add_value('date', datetime.datetime.now()) items.append(l.load_item()) # collect one item per match row return items
def parse_item(self, response): url = response.url item_list = item_code(url, self.web_name, 'id=(.*?)$') print(item_list) item = ItemLoader(item=GdSz6652Item(), response=response) item.add_value('web_name', self.web_name) item.add_value('web_code', self.name) item.add_value('url', url) item.add_value('item_code', item_list.get('item_code')) item.add_css('title', '.title.border-bottom-light::text') item.add_xpath( 'amount', '//*[contains(text(),"借款金额")]/following-sibling::td[1]/text()') item.add_xpath('rate', '//*[contains(text(),"历史年化结算利率")]/../span[1]/text()') item.add_xpath('period', '//*[contains(text(),"借款期限(天)")]/../span[1]/text()') item.add_xpath( 'loan_using', '//*[contains(text(),"借款用途")]/following-sibling::td[1]/text()') # item.add_xpath('loaner_info', '//*[@id="userName"]') item.add_xpath('pay_type', '//*[contains(text(),"回款方式")]/text()') item.add_xpath('progress', '//*[contains(text(),"剩余可出借金额(元)")]/../span[1]/text()') # invest records i_v = [] invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[1]}|money={lst[1]}|postdate={lst[2]}|status=全部通过}}' invest_records_format = "" tr = response.css('#investRecordWrap').css('tr') try: for i in tr: lst = i.css('td::text').extract() if lst: i_v.append(lst) for n in i_v: invest_records_format += invest_records_temp.format(lst=n) item.add_value('invest_records', invest_records_format) item.add_value('start', i_v[-1][2]) item.add_value('end', i_v[0][2]) except Exception: print(url, 'invest records is error') yield item.load_item()
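# NOTE (sketch): `item_code` above is an external helper whose real implementation is not
# shown. A minimal, assumed version based only on how it is called -- item_code(url,
# web_name, 'id=(.*?)$') returning a dict that supports .get('item_code') -- could be:
import re


def item_code(url, web_name, pattern):
    # Pull the item identifier out of the URL with the given regex and return it
    # together with the site name.
    match = re.search(pattern, url)
    return {
        'web_name': web_name,
        'item_code': match.group(1) if match else None,
    }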
def parse(self, response): l = ItemLoader(item=ParlamentItem(), response=response) l.add_xpath( 'ime', '//h2/text()', MapCompose(lambda i: i.replace('\n', ''), str.strip, str.capitalize)) l.add_xpath('prezime', '//h2/span/text()', MapCompose(str.capitalize)) l.add_xpath( 'stranka', '//h4[contains(text(), "stranka")]/following::p[1]/text()', MapCompose( lambda i: i.replace('\n', ''), str.strip, )) l.add_xpath( 'posl_grupa', '//h4[contains(text(), "grupa")]/following::p[1]/a/text()') l.add_xpath('mesto', '//h4[contains(text(), "Mesto")]/following::p[1]/text()') l.add_xpath( 'zanimanje', '//h4[contains(text(), "Zanimanje")]/following::p[1]/text()') l.add_xpath('godina', '//h4[contains(text(), "Godina")]/following::p[1]/text()') l.add_xpath('foto', '//div[@class = "image_holder left"]/img/@src') l.add_xpath('twitter', '//ul[@class = "social-list"]/li[1]/a/@href') l.add_xpath('facebook', '//ul[@class = "social-list"]/li[2]/a/@href') return l.load_item()
def parse_question(self, response): item_loader = ItemLoader(item=ZhihuItemQuestion(), response=response) item_loader.add_value("zhihu_id", response.meta.get("question_id")) item_loader.add_value("url", response.url) item_loader.add_xpath("title", "//h1[@class='QuestionHeader-title']//text()") item_loader.add_xpath("main_content", "//div[@class='QuestionHeader-detail']//text()") item_loader.add_xpath("tag", "//div[@class='QuestionHeader-topics']//text()") item_loader.add_xpath( "focus_num", "//button[@class='Button NumberBoard-item Button--plain']//strong//text()" ) item_loader.add_xpath( "click_num", "//div[@class='NumberBoard-item']//strong//text()") item_loader.add_xpath( "comment_num", "normalize-space(//div[@class='QuestionHeader-Comment']/button/text()[1])" ) item_loader.add_xpath( "answer_num", "normalize-space(//div[@class='List-header']//span//text()[1])") question_item = item_loader.load_item() yield scrapy.Request(url=self.start_answer_url.format( response.meta.get("question_id"), 20, 0), callback=self.parse_answer) yield question_item
def parse_article(self, response): loader = ItemLoader(item=ClaimsItem(), response=response) loader.add_xpath("text", '//div[@class="claim"]/p') loader.add_xpath("rating", '//h5[starts-with(@class,"rating-label")]') loader.add_value("fact_check", response.url) yield loader.load_item()
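# NOTE (sketch): parse_article selects whole <p> and <h5> nodes rather than text(), so the
# markup is presumably stripped by field processors on ClaimsItem. The definition below is
# an assumed example of such an item, not the project's actual code:
import scrapy
from itemloaders.processors import MapCompose, TakeFirst, Join
from w3lib.html import remove_tags


class ClaimsItem(scrapy.Item):
    # Strip tags and whitespace from the selected HTML fragments.
    text = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip),
                        output_processor=Join(' '))
    rating = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip),
                          output_processor=TakeFirst())
    fact_check = scrapy.Field(output_processor=TakeFirst())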
def parse(self, response): sel = Selector(response) productos = sel.xpath('//div[@id="js-product-wrapper"]/article') # sel.css could be used as well. # iterate over every product: for i, elem in enumerate(productos): item = ItemLoader(Producto(), elem) item.add_xpath( # 'imagen', './div[@class="dkt-product__gallery"]/div/div/div/div/picture/source[5]/@srcset') # 'imagen', './div/div/div/div/div/picture/source[position()=4]/@srcset') 'imagen', './div[@class="dkt-product__gallery"]/div/div/div[position()=1]/div/picture/source/source/source/source/source/@srcset' ) item.add_xpath( 'titulo', 'normalize-space(div[@class="dkt-product__infos-wrapper"]/div[@class="dkt-product__infos__link"]/div/div/a/h2/text())' ) item.add_xpath( # both candidate locations are tried; a plain "or" between two strings would always use the first 'precio', [ './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/div[@class="dkt-price__cartridge"]/@data-price', './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/div/@data-price' ] ) item.add_xpath( 'precio_a', 'normalize-space(.//div[@class="dkt-price__cartridge"]/text())' ) item.add_xpath( 'precio_b', 'normalize-space(.//div[@class="dkt-price__cartridge"]/sup/text())' ) item.add_xpath( 'precio_previo', './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/span/span[position()=1]/text()' ) item.add_xpath( 'reduccion', './div[@class="dkt-product__infos-wrapper"]/div/div/div[@class="dkt-product__price"]/div/span/span[position()=2]/text()' ) item.add_xpath( 'marca', './div[@class="dkt-product__infos-wrapper"]/div/div/div/span/span/text()' ) item.add_xpath( 'url', './div[@class="dkt-product__infos-wrapper"]/div[@class="dkt-product__infos__link"]/div/div/a/@href' ) item.add_xpath( 'rating', './div[@class="dkt-product__infos-wrapper"]/div/div/span[@itemprop="ratingValue"]/text()' ) item.add_xpath( 'review', './div[@class="dkt-product__infos-wrapper"]/div/div/span[@itemprop="reviewCount"]/text()' ) item.add_xpath( 'modelId', './div[@class="dkt-product__gallery"]/div/div[position()=1]/div[position()=1]/@data-modelid' ) item.add_value('id', i + random.randrange(10, 4000000)) item.add_value('control_type', 'A') yield item.load_item() # Pagination via the "more products" button: boton_next = response.css('#more_product_a::attr(href)').extract_first() # assumes the "more products" control exposes the next page URL in its href if boton_next: boton_next = response.urljoin(boton_next) # now repeat the process on the new url with the parse callback yield scrapy.Request(url=boton_next, callback=self.parse)
def parse_page(self, response): ''' Parse the given page selecting the posts. Then ask recursively for another page. ''' #select all posts for post in response.xpath( "//div[contains(@data-ft,'top_level_post_id')]"): new = ItemLoader(item=FbcrawlItem(), selector=post) self.logger.info('Parsing post n = {}'.format(abs(self.count))) new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()") new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") #page_url #new.add_value('url',response.url) #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) self.count -= 1 yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item': new}) #load following page #tries to click on "more", otherwise it looks for the appropriate #year for 1-click only and proceeds to click on others new_page = response.xpath( "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href" ).extract() if not new_page: if response.meta['flag'] == self.k and self.k >= self.year: self.logger.info('There are no more, flag set at = {}'.format( self.k)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 self.logger.info('Everything OK, new flag: {}'.format( self.k)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: while not new_page: #sometimes the years are skipped self.logger.info( 'XPATH not found for year {}'.format(self.k - 1)) self.k -= 1 self.logger.info( 'Trying with previous year, flag={}'.format( self.k)) if self.k < self.year: self.logger.info( 'The previous year to crawl is less than the parameter year: {} < {}' .format(self.k, self.year)) self.logger.info( 'This is not handled well, please re-run with -a year="{}" or less' .format(self.k)) break xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() self.logger.info('New page found with flag {}'.format( self.k)) new_page = response.urljoin(new_page[0]) self.k -= 1 self.logger.info('Now going with flag {}'.format(self.k)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: self.logger.info('Crawling has finished with no errors!') else: new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: self.logger.info( 'Page scraped, click on more! flag = {}'.format( response.meta['flag'])) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': response.meta['flag']}) else: self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR') self.logger.info( 'First page scraped, click on more! Flag not set, default flag = {}' .format(self.k)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
def parse_page(self, response): ''' Parse the given page selecting the posts. Then ask recursively for another page. ''' # #open page in browser for debug # from scrapy.utils.response import open_in_browser # open_in_browser(response) #select all posts for post in response.xpath( "//div[contains(@data-ft,'top_level_post_id')]"): many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) date = parse_date(date, {'lang': self.lang}) current_date = datetime.strptime( date, '%Y-%m-%d %H:%M:%S') if date is not None else date print(post) if current_date is None: date_string = post.xpath('.//abbr/text()').get() date = parse_date2([date_string], {'lang': self.lang}) current_date = datetime(date.year, date.month, date.day) if date is not None else date date = str(date) print(current_date) #if 'date' argument is reached stop crawling #if self.date > current_date: # raise CloseSpider('Reached date: {}'.format(self.date)) print("stop2") new = ItemLoader(item=FbcrawlItem(), selector=post) if abs(self.count) + 1 > self.max: raise CloseSpider( 'Reached max num of post: {}. Crawling finished'.format( abs(self.count))) self.logger.info('Parsing post n = {}, post_date = {}'.format( abs(self.count) + 1, date)) new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') new.add_value('date', date) new.add_xpath('post_id', './@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") #page_url #new.add_value('url',response.url) #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) self.count -= 1 yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item': new}) #load following page, try to click on "more" #after few pages have been scraped, the "more" link might disappears #if not present look for the highest year not parsed yet #click once on the year and go back to clicking "more" #new_page is different for groups if self.group == 1: new_page = response.xpath( "//div[contains(@id,'stories_container')]/div[2]/a/@href" ).extract() else: new_page = response.xpath( "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href" ).extract() #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ if not new_page: self.logger.info( '[!] "more" link not found, will look for a "year" link') #self.k is the year link that we look for if 'flag' in response.meta and response.meta[ 'flag'] == self.k and self.k >= self.year: xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: while not new_page: #sometimes the years are skipped this handles small year gaps self.logger.info( 'Link not found for year {}, trying with previous year {}' .format(self.k, self.k - 1)) self.k -= 1 if self.k < self.year: raise CloseSpider( 'Reached date: {}. 
Crawling finished'.format( self.date)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) new_page = response.urljoin(new_page[0]) self.k -= 1 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: self.logger.info('Crawling has finished with no errors!') else: new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: self.logger.info( 'Page scraped, clicking on "more"! new_page = {}'.format( new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': response.meta['flag']}) else: self.logger.info( 'First page scraped, clicking on "more"! new_page = {}'. format(new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
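# NOTE (sketch): the two fbcrawl-style callbacks above call external `parse_date` /
# `parse_date2` helpers that are not included here. The originals are considerably more
# elaborate (multi-language, relative-date handling); the minimal, assumed versions below
# only cover what the calling code relies on: parse_date() turns the post's raw data-ft
# attribute into a '%Y-%m-%d %H:%M:%S' string, and parse_date2() turns an absolute date
# string into a datetime, both returning None when parsing fails.
import re
from datetime import datetime


def parse_date(date, options):
    # `date` is a one-element list holding the data-ft attribute; assume its JSON payload
    # embeds a unix 'publish_time' timestamp somewhere inside it.
    match = re.search(r'"publish_time":(\d+)', date[0] or '')
    if not match:
        return None
    return datetime.fromtimestamp(int(match.group(1))).strftime('%Y-%m-%d %H:%M:%S')


def parse_date2(date, options):
    # `date` is a one-element list with a human-readable date such as '12 January 2019'.
    for fmt in ('%d %B %Y', '%B %d, %Y', '%d %b %Y'):
        try:
            return datetime.strptime(date[0], fmt)
        except (TypeError, ValueError):
            continue
    return None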
def parse_course(self, response): course_details = response.xpath('//form[@id="formCourseSearchDetails"]/div[contains(@id, "courseProfilePanel_")]') if not course_details: return False l = ItemLoader(WcsuwocaCourseItem(), response=response) l.default_output_processor = TakeFirst() l.add_value('institution_name', 'Western Continuing Studies') l.add_value('course_code', response.meta['course_code']) l.add_value('course_name', response.meta['course_name']) l.add_xpath('delivery_types', '//div[@class="courseProfileInstructionMethods"]/span[not(@class)]/span/text()') l.add_value('url', response.url) l.add_xpath('description', 'string(//div[@id="courseProfileOfficialCourseDescription"])') # l.add_value('subject', response.meta['subject']) l.add_value('subject', response.meta['program']) course_section = course_details.xpath('.//div[contains(@id, "courseSectionPanel_")]') if not course_section: return False course_data = course_section[0] # price = course_data.xpath('.//td[@class="tuitionProfileFees"]/text()').get() # price = course_data.xpath('.//tr[descendant::a[contains(., "Course")] and td[span[contains(., "") and @class="creditType" and contains(., "non-credit")]]]/td[@class="tuitionProfileFees"]/text()').get() price = course_data.xpath('.//td[@class="tuitionProfileFees"]/text()') if price: # prices = [p.strip().lstrip('$') for p in price.getall()] prices = list(map(lambda x: ', '.join(x), [re.findall(r'\d*\,?\d+\.\d{2}', p) for p in price.getall()])) prices = ', '.join(prices) prices = prices.split(', ') price = '0.0' for price_val in prices: try: check_zerro_price = float(price_val.replace(',', '')) if check_zerro_price: price = price_val break except ValueError: continue # price = price.strip().lstrip('$') else: return False # price = '0.0' # # Skip courses with price $0.00 # try: # check_zerro_price = float(price.replace(',', '')) # except ValueError: # check_zerro_price = False # if not check_zerro_price: # return False l.add_value('price', [price]) weekdays = course_data.xpath('string(.//div[contains(@class, "sectionScheduleMeetingDays")]//div[contains(@class, "content")])').get() if weekdays: weekdays = weekdays.strip() weekdays = re.sub(r'\s+', '', weekdays) weekdays = weekdays.split(',') else: weekdays = [] l.add_value('days', [weekdays]) # l.add_value('program', 'Continuing Education') # l.add_value('program', response.meta['program']) l.add_xpath('program', '//div[@id="courseProfileCertificates"]//li/a/text()') duration_hours_list = course_data.xpath('string(.//div[contains(@class, "section sectionScheduleMeetingTime")]//div[contains(@class, "content")])').get() if duration_hours_list: duration_hours_list = re.findall(r'\d{1,2}:\d{1,2}\w{2}', duration_hours_list) duration_hours_list = [t.lower() for t in duration_hours_list] else: duration_hours_list = [] l.add_value('duration_hours', [duration_hours_list]) l.add_value('duration_days_week', l.get_collected_values('days')) duration_month_list = course_data.xpath('string(.//div[contains(@class, "section sectionScheduleMeetingDates")]//div[contains(@class, "content")])').get() if duration_month_list: duration_month_list = re.findall(r'\w+\s\d{1,2},\s\d{4}', duration_month_list) if len(duration_month_list) == 2: duration_month_list = [datetime.strptime(d, '%b %d, %Y') for d in duration_month_list] if len(duration_month_list) == 1: duration_month_list = [datetime.strptime(duration_month_list[0], '%b %d, %Y')] else: duration_month_list = [None] l.add_value('duration_months', [duration_month_list]) l.add_value('duration_as_string', [ 
l.get_collected_values('duration_hours'), l.get_collected_values('duration_days_week'), l.get_collected_values('duration_months'), ]) hours_site = course_data.xpath('string(.//div[contains(@class, "sectionContactHours")]//div[contains(@class, "content")])').get() if hours_site: hours_site = hours_site.strip() l.add_value('total_hours', [ l.get_collected_values('duration_hours'), l.get_collected_values('duration_days_week'), hours_site, ]) yield l.load_item()
def parse_reactions(self, response): new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item']) new.context['lang'] = self.lang new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()") new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()") new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()") new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()") new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()") new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()") yield new.load_item()
def parse_season(self, response, **kwargs): """ Parses page with particular league season. Creates `league` item and fills it with data parsed from returned content. """ loader = ItemLoader(item=LeagueItem(), response=response) loader.add_value('id', uuid4()) loader.add_xpath( 'title', '//div[@id="teamSummary"]/h1[contains(@class, "teamName")]/text()') loader.add_value('country_id', kwargs.get('country_id')) loader.add_xpath( 'teams_count', ('//div[@class="league-details"]/div[@class="detail"]' '/div[contains(., "Teams")]/following-sibling::div/text()')) loader.add_xpath( 'season_start', ('//div[@class="league-details"]/div[@class="detail season"]' '/div[contains(., "Season")]/following-sibling::div/text()')) loader.add_xpath( 'season_end', ('//div[@class="league-details"]/div[@class="detail season"]' '/div[contains(., "Season")]/following-sibling::div/text()')) loader.add_xpath('all_matches_count', ('//div[@class="league-details"]/div[@class="detail"]' '/following-sibling::div[contains(., "Matches")]' '/div[@class="w65 fl boldFont"]/text()')) loader.add_xpath('image_url', '//div[@id="teamSummary"]/img/@src') league = loader.load_item() matches = response.xpath( '//div[@id="teamSummary"]/ul[contains(@class, "secondary-nav")]/' 'li[contains(@class, "middle")]/a') # selector href = matches.xpath('@href').get() # matches for this league available for premium account league['blocked'] = True if href else False yield league if href == '#': # parameters for urls query string params = { 'hash': matches.attrib['data-hash'], 'zzz': matches.attrib['data-zzz'], 'cur': matches.attrib['data-z'] } yield FormRequest(url=self.make_url('ajax_league.php'), method='POST', formdata=params, callback=self.parse_matches, cb_kwargs={'league_id': league['id']}) elif href and href != '#': yield response.follow(url=href, callback=self.parse_matches, cb_kwargs={'league_id': league['id']})
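# NOTE (sketch): parse_season calls self.make_url('ajax_league.php'), a helper that is not
# shown, and relies on uuid4/FormRequest imports. An assumed minimal setup, with `base_url`
# as a hypothetical spider attribute (the real spider defines its own):
from urllib.parse import urljoin
from uuid import uuid4
from scrapy import FormRequest


class LeagueSpiderHelpers:
    base_url = 'https://example.com/'  # hypothetical placeholder

    def make_url(self, path):
        # Resolve a relative endpoint against the spider's base URL.
        return urljoin(self.base_url, path)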
def parse_news(self, response): item = ItemLoader(item=Articulo(), response=response) item.add_xpath('titulo', '//h1/text()') item.add_xpath('contenido', '//div[@id="id_text"]//*/text()') yield item.load_item()
def parse(self, response): l = ItemLoader(item=EbookItem(), response=response) # Primary Fields l.add_xpath("title", "//header/h1/text()", MapCompose(lambda i: i.strip())) # TODO add custom pipeline to append subtitle if key doesn't # l.add_xpath('subtitle', # '//header/h4/text()', # MapCompose(lambda i: i.strip()), default=' ') l.add_value("subtitle", "N/A") # not all books have subtitles l.add_xpath( "image", '//img[contains(@class,"attachment-post-thumbnail")]/@src', MapCompose(lambda i: i.strip()), ) l.add_xpath( "author", '//div[contains(@class, "book-detail")]//dd[1]/a/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "isbin", '//div[contains(@class, "book-detail")]//dd[2]/text()', MapCompose(lambda i: i.strip(), lambda i: i.replace("-", "")), ) l.add_xpath( "year", '//div[contains(@class, "book-detail")]//dd[3]/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "pages", '//div[contains(@class, "book-detail")]//dd[4]/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "language", '//div[contains(@class, "book-detail")]//dd[5]/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "file_size", '//div[contains(@class, "book-detail")]//dd[6]/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "file_format", '//div[contains(@class, "book-detail")]//dd[7]/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "category", '//div[contains(@class, "book-detail")]//dd[8]//a/text()', MapCompose(lambda i: i.strip()), ) l.add_xpath( "description", '//div[contains(@class,"entry-content")]', MapCompose( lambda s: s.replace("\n", ""), lambda s: s.replace("\b", ""), lambda s: s.replace("\f", ""), lambda s: s.replace("\r", ""), lambda s: s.replace("\t", ""), lambda s: s.replace("\v", ""), lambda s: s.replace("\x00", ""), lambda i: i.strip(), # TODO check for other stray characters ), ) l.add_xpath( "download_link", '//a[contains(@href,"file")]/@href', MapCompose(lambda s: s.replace(" ", "%20"), lambda i: i.strip()), ) # TODO where to add Housekeeping Fields # l.add_value('url', response.url) # l.add_value('project', self.settings.get('BOT_NAME')) # l.add_value('spider', self.name) # l.add_value('server', socket.gethostname()) # l.add_value('date', date.today()) return l.load_item()
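# NOTE (sketch): the TODO in the ebook callback above mentions a custom pipeline that fills
# in 'subtitle' when a book page has none. An assumed minimal pipeline (the class name is
# illustrative, not the project's actual code):
from itemadapter import ItemAdapter


class DefaultSubtitlePipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # Fall back to a placeholder when the page did not provide a subtitle.
        if not adapter.get('subtitle'):
            adapter['subtitle'] = 'N/A'
        return item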
def parse_item(self, selector, response): """ This function parses a property page @url http://localhost:9312/properties/property_000000.html @returns items 1 @scrapes title price description address image_urls @scrapes url project spider server date """ loader = ItemLoader(item=PropertiesItem(), selector=selector) loader.add_xpath('title', './/*[@itemprop="name"][1]/text()', MapCompose(str.strip, str.title)) loader.add_xpath('price', './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+') loader.add_xpath( 'description', './/*[@itemprop="description"][1]/text()', MapCompose(str.strip, lambda i: i.replace('\r\n', ' '))) loader.add_xpath( 'address', './/*[@itemtype="http://schema.org/Place"][1]/*/text()', MapCompose(str.strip)) loader.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src', MapCompose(lambda i: parse.urljoin(response.url, i))) loader.add_xpath('url', './/*[@itemprop="url"]/@href', MapCompose(lambda i: parse.urljoin(response.url, i))) loader.add_value('project', self.settings.get('BOT_NAME')) loader.add_value('spider', self.name) loader.add_value('server', socket.gethostname()) loader.add_value('date', datetime.datetime.now()) yield loader.load_item()
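# NOTE (sketch): this last parse_item takes a selector, so it is meant to be driven by a
# listing callback that iterates the result cards on an index page. An assumed example of
# such a caller (the listing XPath is illustrative, not the project's actual one):
def parse(self, response):
    # One property card per list entry; delegate each card to parse_item.
    for selector in response.xpath('//*[@itemprop="itemListElement"]'):
        yield from self.parse_item(selector, response)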