def parse_item(self, response):
    l = XPathItemLoader(item=YellowPagesItem(), response=response)
    l.add_xpath('company', '//*[@id="main-content"]/div[1]/div[1]/h1/text()')
    l.add_xpath('st_add', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[1]/text()')
    l.add_xpath('city', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[2]/text()')
    l.add_xpath('phone', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[3]/text()')
    # reviews left
    res = l.load_item()

    # collect the extracted pieces into a plain dict (only res is returned)
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'city' in res:
        results['address'] = results['address'] + res['city']
    if 'phone' in res:
        results['phone'] = res['phone']

    return res
def parse(self, response):
    # hxs = HtmlXPathSelector(response)
    # ads = hxs.select('//div[@class="list-ads"]/a')
    # items = []
    # for ad in ads:
    #     item = LeboncoinItem()
    #     item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
    #     item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
    #     item['url'] = ad.select('@href').extract()
    #     self.log(item['name'])
    #     #print item['name'], ':', item['photo'], '--->', item['url']
    #     #html = '<div><div style="width:150px;height:250px;float:left;text-align:center">\
    #     #<img src="%s" alt="" /><br />\
    #     #<p><a href="%s">%s</a></p>\
    #     #</div></div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
    #     ##print photo
    #     #items.append(item)
    #     ## put in filename
    #     #filename = response.url.split("/")[-4]
    #     #open('/tmp/lbc/'+filename+'.html', 'a').write(html)
    #     #return items
    #     #yield items

    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select('//div[@class="list-ads"]/a'):
        loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
        loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
        loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
        loader.add_xpath('url', '@href')
        loader.add_value('category', response.url.split("/")[-4])
        yield loader.load_item()
def parse_article(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    selector = Selector(response)
    loader = XPathItemLoader(LeMondeArt(), selector=selector)
    self.log('\n\nA response from %s just arrived!' % response.url)

    # define processors
    text_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()

    # Populate the LeMonde Item with the item loader
    for field, xpath in self.article_item_fields.iteritems():
        try:
            loader.add_xpath(field, xpath, text_input_processor)
        except ValueError:
            self.log("XPath %s not found at url %s" % (xpath, response.url))

    # loader.add_value("Url", response.url)
    yield loader.load_item()
def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})

    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)

        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k), level=log.WARNING)
                continue

            val = get_v_x(
                self.macro.expand(v_x),
                utils.convert_type(v.get('parse', {})),
                re=v.get('regex')
            )
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))

            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            yield loader.load_item()
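# A minimal sketch of the `fields` configuration the generic parse_item above
# expects. None of this comes from the source: the field names, XPaths, and
# the shapes handed to utils.convert_type / utils.filter_data / macro.expand
# are illustrative assumptions only. Each field maps to a dict carrying either
# a literal 'value' or an 'xpath', plus optional 'regex', 'parse', 'default'
# and 'filter' keys, which is the contract the if/elif branches rely on.
EXAMPLE_FIELDS = {
    'title': {
        'xpath': '//h1/text()',        # extracted via loader.get_xpath
        'regex': r'\s*(.+?)\s*$',      # optional post-extraction regex
        'default': 'untitled',         # used when extraction yields nothing
    },
    'source': {
        'value': 'URL',                # literal value, run through self.macro.expand
    },
    'price': {
        'xpath': '//span[@class="price"]/text()',
        'parse': {},                   # handed to utils.convert_type (shape assumed)
        'filter': {},                  # handed to utils.filter_data (shape assumed)
    },
}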
def parse(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')

    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)

    entries = x.select('//atom:entry')
    if entries:
        # an itunes rss feed
        for entry in entries:
            id = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(id)), level=log.INFO)
            yield Request('http://itunes.apple.com/lookup?id=' + id[0], callback=self.getItunesTrackJson)
    else:
        # a single feed
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        item = l.load_item()
        yield item
def parse_page(self, response, chart, next_pages):
    hxs = HtmlXPathSelector(response)

    # parse every chart entry
    list = []
    for item in hxs.select('//*[@class="printable-row"]'):
        loader = XPathItemLoader(SingleItem(), selector=item)
        loader.add_xpath('rank', 'div/div[@class="prank"]/text()')
        loader.add_xpath('track', 'div/div[@class="ptitle"]/text()')
        loader.add_xpath('artist', 'div/div[@class="partist"]/text()')
        loader.add_xpath('album', 'div/div[@class="palbum"]/text()')
        single = loader.load_item()
        list.append(dict(single))
    chart['list'] += list

    if len(next_pages) == 0:
        log.msg("Done with %s" % (chart['name']))
        yield chart
    else:
        next_page = next_pages.popleft()
        log.msg("Starting next page (%s) of %s - %s left" % (next_page, chart['name'], len(next_pages)))
        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages))
        yield request
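# A minimal sketch of how parse_page above might be kicked off: the caller
# builds the chart dict (with a 'name' and an accumulating 'list') and a deque
# of the remaining page URLs, then binds both into the callback. This is an
# assumption for illustration; the chart name, URLs, and the method name
# parse_chart_index are not taken from the source.
from collections import deque
from scrapy.http import Request

def parse_chart_index(self, response):
    chart = {'name': 'Hot 100', 'list': []}            # illustrative chart record
    next_pages = deque(['/charts/hot-100?page=2'])     # illustrative follow-up pages
    return Request('http://www.billboard.com/charts/hot-100',
                   callback=lambda r: self.parse_page(r, chart, next_pages))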
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ selector = HtmlXPathSelector(response) details=urlparse(response.request.url) queryStr={x.split('=')[0]:(x.split('=')[1]) for x in details.query.split("&")} print "\n",queryStr['page'] # iterate over deals for deal in selector.select(self.products_list_xpath): loader = XPathItemLoader(JabongData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL",unicode(response.request.url, "utf-8")) # adding the category for the request loader.add_value("category",unicode(self.category)) yield loader.load_item()
def parse_item(self, response):
    url_obj = urlparse(response.url)
    path = url_obj.path
    if path.endswith("/"):
        path = path[:-1]
    page = path.split("/")[-1]

    fullDomain = getDomainName(response.url)  # with HTTP or HTTPS
    domain = fullDomain.split("/")[-2]

    newpath = r'C:\\Users\\****\\scrapy_projects\\tutorial\\' + domain
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    os.chdir(newpath)

    filename = '%s.html' % (domain + " " + page)
    with open(filename, 'wb') as f:
        f.write(response.body)

    links = 'links-%s.txt' % (domain + " " + page)
    content = 'contents-%s.txt' % (domain + " " + page)

    f1.write("\n")
    f1.write(domain + sep)
    f1.write(page + sep)

    # 16 whois attributes (one lookup, reused for every attribute)
    w = whois.whois(response.url)
    f1.write(str(w.whois_server) + sep)
    f1.write(str(w.referral_url) + sep)
    f1.write(str(w.updated_date) + sep)
    f1.write(str(w.creation_date) + sep)
    f1.write(str(w.expiration_date) + sep)
    f1.write(str(w.name_servers) + sep)
    f1.write(str(w.status) + sep)
    f1.write(str(w.emails) + sep)
    f1.write(str(w.dnssec) + sep)
    f1.write(str(w.name) + sep)
    f1.write(str(w.org) + sep)
    f1.write(str(w.address) + sep)
    f1.write(str(w.city) + sep)
    f1.write(str(w.state) + sep)
    f1.write(str(w.zipcode) + sep)
    f1.write(str(w.country) + sep)

    extractLinks(links, response)
    countRelAbsHttpsLinks(links)
    countInOutLinks(links)
    countSlashes(links)
    imagePreloading(links)

    extractText(content, response)
    countSentences(content)
    checkGrammar(content)

    # Average word length (global_wc can be zero, so guard the division)
    avg_word_len = (global_wordLen / global_wc) if global_wc else 0
    f1.write(str("%.2f" % avg_word_len) + sep)

    # Number of words in the page:
    f1.write(str(global_wc) + sep)

    # Downloads images
    loader = XPathItemLoader(item=ImageItem(), response=response)
    loader.add_xpath('image_urls', '//img/@src')

    hashImages()  # Calculates hashes of images downloaded by scrapy

    # Write label into the data file
    f1.write(my_dict.get(fullDomain, "redirect"))

    return loader.load_item()
def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})

    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)

        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k), level=log.WARNING)
                continue

            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))

            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            yield loader.load_item()
def parse_talk(self, response):
    loader = XPathItemLoader(item=Pybr8TalksItem(), response=response)
    loader.add_xpath('title', '//div[@id="proposal"]/h1/text()')
    loader.add_xpath('description', '//div[@class="twocolumn"]/div[2]/text()[2]')
    loader.add_xpath('author_name', '//div[@class="twocolumn"]/div/div[2]/h3/text()')
    loader.add_xpath('author_profile', '//div[@class="twocolumn"]/div/div[2]/text()[3]')
    return loader.load_item()
def _set_loader(self, response, xs, item):
    if not xs:
        self.from_detail_page = True
        item = response.request.meta['item']
        self.loader = XPathItemLoader(item=item, response=response)
        self.loader.default_output_processor = TakeFirst()
    else:
        self.from_detail_page = False
        self.loader = XPathItemLoader(item=item, selector=xs)
        self.loader.default_output_processor = TakeFirst()
def _set_loader(self, response, hxs, item):
    if not hxs:
        self.follow_url = True
        item = response.request.meta["item"]
        self.loader = XPathItemLoader(item=item, response=response)
        self.loader.default_output_processor = TakeFirst()
    else:
        self.follow_url = False
        self.loader = XPathItemLoader(item=item, selector=hxs)
        self.loader.default_output_processor = TakeFirst()
def parse_argument(self, response):
    loader = XPathItemLoader(item=Argument(), response=response)
    id = self.parse_id_from_url(response.url)
    if id:
        loader.add_value('id', id)
    else:
        loader.add_value('id', -1)
    loader.add_xpath('rating', '//b[@id="QuestionRateValue"]/text()')
    loader.add_xpath('essay', '//div[@class="essay"]')
    return loader.load_item()
def _set_loader(self, response, hxs, item):
    if not hxs:
        self.from_detail_page = True
        item = response.request.meta['item']
        self.loader = XPathItemLoader(item=item, response=response)
        self.loader.default_output_processor = TakeFirst()
    else:
        self.from_detail_page = False
        self.loader = XPathItemLoader(item=item, selector=hxs)
        self.loader.default_output_processor = TakeFirst()
def parse_item(self, response):
    l = XPathItemLoader(item=HotfrogItem(), response=response)
    l.add_xpath('company', '/html/body/center/table[2]/text()')
    res = l.load_item()
    return res
def parse(self, response): """ Default callback used by Scrapy to process download response Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link :param response: :return: """ selector = HtmlXPathSelector(response) # iterate over deals for deal in selector.select(self.deals_list_xpath): loader = XPathItemLoader(LivingSocialDeal(), selector=deal) # define processors loader.default_input_processor = TakeFirst() loader.default_input_processor = MapCompose(unicode.strip) loader.default_input_processor = Join() loader.defalut_output_processor = TakeFirst() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response): """"Call back used by Scrapy to download and process response """ selector = HtmlXPathSelector(response) # Go through art statements for statement in selector.select(self.description_xpath): loader = XPathItemLoader(MonetInformation(), selelctor=statement) loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor
def parse(self, response):
    gold = XPathItemLoader(item=FinanceIndex(), response=response)
    gold.add_value("name", "Oro Spot Cierre Londres")
    gold.add_value("unit", "USD")
    gold.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")
    return [gold.load_item()]
def parse_item(self, response):
    # hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(item=PytexasItem(), response=response)
    l.add_xpath('title', '//*/div[@class="span6"]/h2/text()')
    l.add_xpath('speaker', '//*/div[@class="span6"]/h3/text()')
    l.add_xpath('description', '//*/div[@class="span6"]/p[2]/text()')
    # l.add_value('last_updated', 'today')  # you can also use literal values
    return l.load_item()
def parse(self, response):
    ubi = XPathItemLoader(item=FinanceIndex(), response=response)
    ubi.add_value("name", "Uruguay Bond Index")
    ubi.add_value("unit", "bps")
    ubi.add_xpath("value", "//span/text()")
    return [ubi.load_item()]
def get_user(self, selector):
    user_loader = XPathItemLoader(item=YahooUser(), selector=selector)
    user_loader.add_xpath('user_name', './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
    user_loader.add_xpath('user_url', './/span[@class="user"]//a[@class="url"]/@href')
    user_loader.add_value('user_id',
                          re.match(r'http://answers\.yahoo\.com/my/profile\?show=(.*)',
                                   user_loader.get_output_value('user_url')).group(1))
    if user_loader.get_collected_values('user_name'):
        return user_loader.load_item()
    else:
        return None
def improvement(text, url):
    response = http.TextResponse(url=url, body=str(text))
    i = XPathItemLoader(item=TCADImprovementItem(), response=response)
    i.add_xpath('id', '//td[1]/text()')
    i.add_xpath('state_category', '//td[2]/text()')
    i.add_xpath('description', '//td[3]/text()')
    return i.load_item()
def get_bond(name, bondname=None):
    bond = XPathItemLoader(item=Bond(), response=response)
    name, values, _ = get_index_values(name)
    bond.add_value("name", bondname or name)
    bond.add_value("bondcoupon", values[0])
    price, byield = values[2].split("/")
    bond.add_value("bondprice", price)
    bond.add_value("bondyield", byield)
    return bond
def test_wikipedia_links(self):
    navs = self.selector.select_script("return $('#p-navigation li')")
    nav_items = []
    for nav in navs:
        loader = XPathItemLoader(NavItem(), nav)
        loader.add_xpath('name', './/a')
        nav_items.append(loader.load_item())

    expected = [u'Main page', u'Contents', u'Featured content',
                u'Current events', u'Random article', u'Donate to Wikipedia']
    for i, item in enumerate(nav_items):
        self.assertEqual(item['name'], expected[i])
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    orden_compra, anio = re.search(r'wOCabc=(\d+)&wEjercicio=(\d+)',
                                   urlparse(response.url).query).groups()
    for tr in hxs.select('//table[contains(@width, "760")][2]/tr'):
        i = CompraLineaItem()
        l = XPathItemLoader(item=i, selector=tr)
        l.add_xpath('cantidad', 'td[1]/text()')
        l.add_xpath('importe', 'td[2]/text()')
        l.add_xpath('detalle', 'td[3]/text()')
        l.add_value('orden_compra', int(orden_compra))
        l.add_value('anio', int(anio))
        x = l.load_item()
        yield x
def parse(self, response): rate = XPathItemLoader(item=FinanceIndex(), response=response) rate.add_value("name", "Tasa Objetivo BCU") rate.add_value("unit", "%") rate.add_xpath("value", "8.75") #rate.update_only_if_change = True return [rate.load_item()]
def parse(self, response):
    url = response.url
    group_name = url[url.find("group"):].split("/")[1]

    hxs = HtmlXPathSelector(response)
    dls = hxs.select('//dl[@class="obu"]')
    items = []
    for dl in dls:
        item = GroupUserItem()
        l = XPathItemLoader(item=item, selector=dl)
        l.add_xpath("homepage", "dt/a/@href")
        l.add_xpath("image", "dt/a/img/@src")
        l.add_xpath("name", "dd/a/text()")
        l.add_value("group", group_name)
        yield l.load_item()

    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for url in links:
        yield Request(url, callback=self.parse)

    if len(links) < 1:
        p = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
        m = p.search(response.body_as_unicode())
        if m:
            url = m.group(1)
            yield Request(url, callback=self.parse)
def parse_materials(self, response):
    reportnum = response.request.meta['reportnum']
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    if (len(materials) == 0):
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # Skip the first report record because this is the header row
        materials.pop(0)
        if (len(materials) == 0):
            self.log('No materials reports found in response {0}'.format(reportnum), log.INFO)
        else:
            self.log('Retrieved {0} materials records in report {1}'.format(len(materials), reportnum), log.INFO)
            for material in materials:
                l = XPathItemLoader(NrcScrapedMaterial(), material)
                l.name_in = lambda slist: [s[:32] for s in slist]
                l.add_value('reportnum', reportnum)
                for name, params in NrcScrapedMaterial.fields.items():
                    if 'xpath' in params:
                        l.add_xpath(name, params['xpath'])
                item = l.load_item()
                yield item

    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
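# A minimal sketch of how an item such as NrcScrapedMaterial can carry a
# per-field XPath in its Field metadata, which is what the
# `if 'xpath' in params` check above relies on: scrapy's Field() accepts
# arbitrary keyword metadata and exposes it through Item.fields. The class
# and field names below are illustrative assumptions, not the source item.
from scrapy.item import Item, Field

class ExampleScrapedMaterial(Item):
    reportnum = Field()                    # filled with add_value, no xpath metadata
    name = Field(xpath='td[1]/text()')     # picked up by the metadata loop above
    amount = Field(xpath='td[2]/text()')
    unit = Field(xpath='td[3]/text()')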
def get_answer(self, selector, response):
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(), selector=selector)
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value('answerer', self.get_user(selector.select(''.join([
        './/span[@class="answer-meta"]'
    ]))))
    answer_loader.add_xpath('answer_content', ''.join([
        './/span[@class="answer-body"]',
        '//span[@class="answer-status"]//descendant-or-self::text()'
    ]))
    print answer_loader.get_output_value('answer_content')
    a = input()
    return answer_loader.load_item()
def parse(self, response): rate = XPathItemLoader(item=FinanceIndex(), response=response) rate.add_value("name", "Merval") rate.add_value("unit", "") hxs = HtmlXPathSelector(response) rate.add_value("value", hxs.select("//span[contains(@id,'UltimoMerval')]/text()")[0].extract()) return [rate.load_item()]
def parse(self, response): hxs = HtmlXPathSelector(response) for tr in hxs.select('//div[@id="miListView"]/table/tr'): i = ProveedorItem() l = XPathItemLoader(item=i, selector=tr) l.add_xpath('nombre', 'td[1]/text()') l.add_xpath('domicilio', 'td[2]/text()') l.add_xpath('cuit', 'td[3]/text()') l.add_xpath('localidad', 'td[4]/text()') yield l.load_item() for l in self.extractor.extract_links(response): yield Request(l.url, callback=self.parse)
def get_UT_item(self, sel, user_url):
    '''
    Given a topic selector and the user URL, build the user-topic relationship item.
    '''
    ut_loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    ut_loader.add_value('crawled_from', user_url)
    ut_loader.add_value('user_url', '/' + '/'.join(user_url.split('/')[-3:-1]))
    ut_loader.add_xpath('topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return ut_loader.load_item()
def get_user(self, selector):
    user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    user_loader.add_xpath('twitter_username', './a[1]/text()')
    user_loader.add_value('twitter_url',
                          'http://twitter.com/' + user_loader.get_output_value('twitter_username'))
    return user_loader.load_item()
def parse(self, response): items = [] for name, pattern, pos in rates: rate = XPathItemLoader(item=FinanceIndex(), response=response) rate.add_value("name", name) rate.add_value("unit", "%") rate.add_xpath( "value", "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()" % (pattern, pos)) items.append(rate.load_item()) return items
def search_results(self, response):
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    reports = hxs.select('//table[@class="t16Standard"]/tr')
    if (len(reports) == 0):
        self.log('Incident report data not present in response', log.ERROR)
    else:
        # Skip the first report record because this is the header row
        reports.pop(0)
        if (len(reports) == 0):
            self.log('No incident reports found in response', log.WARNING)
        else:
            self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)
            for report in reports:
                l = XPathItemLoader(NrcScrapedReport(), report)
                l.context['base_url'] = response.url
                for name, params in NrcScrapedReport.fields.items():
                    l.add_xpath(name, params['xpath'])
                item = l.load_item()

                if self.db.reportExists(item['reportnum']):
                    self.log('Report {0} already exists. Skipping to next report.'.format(item['reportnum']), log.INFO)
                else:
                    f_request = Request(item['full_report_url'], callback=self.parse_full_report)
                    m_request = Request(item['materials_url'], callback=self.parse_materials)
                    yield item
                    self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')

                    # if self.db.fullReportExists(item['reportnum']):
                    #     self.log('Full report Report {0} already exists. Skipping download.'.format(item['reportnum']), log.INFO)
                    # else:
                    #     yield f_request
                    #
                    # if self.db.materialExists(item['reportnum']):
                    #     self.log('Materials record(s) already exist for report {0}. Skipping download.'.format(item['reportnum']), log.INFO)
                    # else:
                    #     yield m_request

    # get next page of results
    next = hxs.select('//td[@class="pagination"][4]/a/@href')
    if len(next) > 0:
        yield Request(urljoin(response.url, next[0].extract()), callback=self.search_results)
def parse(self, response): items = [] for name, pattern, pos in rates: rate = XPathItemLoader(item=FinanceIndex(), response=response) rate.add_value("name", name) rate.add_value("unit", "%") rate.add_xpath("value", "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()" % (pattern, pos)) items.append(rate.load_item()) return items
def get_UT_item(self, sel, user_url):
    '''
    Given a topic selector and the user URL, build the user-topic relationship item.
    '''
    ut_loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    ut_loader.add_value('crawled_from', user_url)
    ut_loader.add_value('user_url', '/' + '/'.join(user_url.split('/')[-3:-1]))
    ut_loader.add_xpath('topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return ut_loader.load_item()
def parse_item(self, response):
    l = XPathItemLoader(item=YelpItem(), response=response)
    l.add_xpath('company', '//*[@id="wrap"]/div[3]/div[1]/div/div[2]/div[1]/h1/text()')
    for i in range(1, 8):
        l.add_xpath('day', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/th[@scope="row"]/text()')
        l.add_xpath('timings1', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]/span[1]/text()')
        l.add_xpath('timings2', '//*[@id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]/span[2]/text()')
    return l.load_item()
def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)

    # iterate over deals (multiple deals per page)
    for content in selector.xpath(self.content_list_xpath):
        loader = XPathItemLoader(RedditLearnPython(), selector=content)

        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space

        # iterate over fields and add xpaths to the loader;
        # iteritems() lets you iterate over the (k, v) pairs of a dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add the field's xpath to the loader

        # load_item: grabs each item field (link, title, etc.), applies the
        # input/output processors, and yields the item before moving on
        yield loader.load_item()
def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)

    # iterate over deals (multiple deals per page)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

        # define processors
        # An Item Loader has one input processor and one output processor per item field.
        loader.default_input_processor = MapCompose(unicode.strip)  # strip whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space

        # iterate over fields and add xpaths to the loader;
        # iteritems() lets you iterate over the (k, v) pairs of a dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add the field's xpath to the loader

        # load_item: grabs each item field (link, title, etc.), applies the
        # input/output processors, and yields the item before moving on
        yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link """ # Gives ability to select parts of response defined in deals_list_xpath selector = HtmlXPathSelector(response) # Iterate through found deals for deal in selector.xpath(self.deals_list_xpath): # Loads data into item fields defined in items.py loader = XPathItemLoader(LivingSocialDeal(), selector=deal) # Define processors for clean up and joining elements loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # Iterate over item_fields dict and add xpaths to loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select(self.lista_linhas_xpath):
        loader = XPathItemLoader(LinhaItem(), selector=qxs)
        loader.add_xpath('linha', './td[1]/p//text()')
        loader.add_xpath('nome', './td[3]/p//text()')

        link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
        # TODO: this should keep the context and return the data from the next page,
        # but it does not appear to be returning it
        request = Request(link, callback=self.parse_item)
        # pdb.set_trace()
        loader.add_value('ida', request.meta['ida'])
        loader.add_value('volta', request.meta['volta'])

        yield loader.load_item()
def parse_full_report(self, response):
    # need to work around weird bug where lxml can't handle encode=WINDOWS-1252,
    # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it,
    # since XPathItemLoader requires a Response object
    text = unicode(response.body, response.encoding)
    t = TextResponse(url=response.url, body=text.encode('utf-8'), encoding='utf-8')

    l = XPathItemLoader(NrcScrapedFullReport(), response=t)
    url_parts = urlsplit(response.url)
    l.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
    l.add_xpath('full_report_body', '//body')
    l.add_value('full_report_url', response.url)
    item = l.load_item()
    reportnum = item['reportnum']
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def history(text, url):
    response = http.TextResponse(url=url, body=str(text.replace(u'\xa0', '')))
    h = XPathItemLoader(item=TCADValueHistoryItem(), response=response)
    h.add_xpath('year', '//td[1]/text()')
    h.add_xpath('value', '//td[4]/text()')
    return h.load_item()
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    entries = hxs.select(
        '//tr[contains(@class,"trusted tlistrow")]/td[contains(@class, "tlistname")]'
    )
    for entry in entries:
        l = XPathItemLoader(item=TorrentItem(), selector=entry)
        l.add_xpath('torrent', 'a/@href')
        l.add_xpath('title', 'a[contains(@href, "nyaa")]/text()')
        yield l.load_item()
def parseDetalle(self, response):
    # Page 253, Orden 2665 has multiple pages
    if self.need_help(response):
        return

    hxs = HtmlXPathSelector(response)
    viewstate = self.getViewState(hxs, save=False)
    orden_compra = response.request.meta['compra']

    hxs = HtmlXPathSelector(response)
    for tr in hxs.select('//table[@id="ctl00_ContentPlaceHolder1_gvDetalle"]/tr[position() > 1]'):
        i = CompraLineaItem()
        l = XPathItemLoader(item=i, selector=tr)
        l.add_xpath('cantidad', 'td[1]/text()')
        l.add_xpath('unidad_medida', 'td[2]/text()')
        l.add_xpath('detalle', 'td[3]/text()')
        l.add_xpath('importe', 'td[4]/text()')
        x = l.load_item()
        if 'cantidad' in x:
            orden_compra['compra_linea_items'].append(x)

    foundCurrent = False
    lastPage = True  # when no paging
    foundCurrent = False
    for td in hxs.select('//td[@colspan="4"]//td'):
        lastPage = False  # only commit in the last page
        args = self.postBackArgs(td.select('a'))
        if not args:  # page with no links
            lastPage = True
            foundCurrent = True
        elif foundCurrent:
            args = self.formdata(viewstate, *args)
            req = FormRequest(url, formdata=args, callback=self.parseDetalle)
            req.meta['compra'] = orden_compra
            yield req
            break

    if lastPage:
        yield orden_compra
def parse(self, response):
    selector = HtmlXPathSelector(response)
    for startup in selector.select(self.startup_results_xpath):
        loader = XPathItemLoader(SearchResults(), selector=startup)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)
    # looking for deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_category(self, response):
    # The main selector we're using to extract data from the page
    main_selector = HtmlXPathSelector(response)

    # The XPath to website links in the directory page
    xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'

    # Get a list of (sub) selectors to each website node pointed by the XPath
    sub_selectors = main_selector.select(xpath)

    # Iterate over the sub-selectors to extract data for each website
    for selector in sub_selectors:
        item = GoogledirItem()
        l = XPathItemLoader(item=item, selector=selector)
        l.add_xpath('name', 'a/text()')
        l.add_xpath('url', 'a/@href')
        l.add_xpath('description', 'font[2]/text()')

        # Here we populate the item and yield it
        yield l.load_item()
def parse_movie_info(self, response):
    """Scrapes movie information"""
    self.log("Parsing Movie Info")

    hxs = HtmlXPathSelector(response)
    selector = hxs.select('//div[@class="maindetails"]')
    item = MovieItem()

    # set url
    item['url'] = response.url

    # use item loader for other attributes
    l = XPathItemLoader(item=item, selector=selector)
    l.add_xpath('title', './/h1/text()')
    l.add_xpath('release_date',
                './/h5[text()="Release Date:"]/following-sibling::div/text()')
    l.add_xpath('tagline',
                './/h5[text()="Tagline:"]/following-sibling::div/text()')

    yield l.load_item()
def parse(self, response): """Get response from start_urls""" selector = HtmlXPathSelector(response) for deal in selector.xpath(self.xpath_for_deals): loader = XPathItemLoader(LivingSocial(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath.strip()) yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)

    # iterate over deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

        # define processors: remove whitespace
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def myparse(self, response):
    print "myParse"
    selector = HtmlXPathSelector(response)
    # l = selector.select(self.deals_list_xpath)
    l = selector.select('//div[@id="detailed"]')
    ll = l.select('.//div[@class="title4"]/a/text()').extract()
    open(ll[0].strip() + '.html', 'wb').write(response.body)
    print ll[0].strip()

    for deal in l:
        # loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader = XPathItemLoader(MoviesClass(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.default_output_processor = TakeFirst()
        for field, xpath in self.mov_fields.iteritems():
            loader.add_xpath(field, xpath)
            x = deal.select(field).extract()
        yield loader.load_item()