def parse_rc(self, response):
    """Parse a reading-comprehension question page, merge it with its
    stored article file, and write the combined HTML page to disk.

    Returns the loaded item so downstream pipelines can process it.
    """
    loader = XPathItemLoader(item=ParseRcItem(), response=response)
    question_id = self.parse_id_from_url(response.url)
    loader.add_value('questionId', question_id)
    loader.add_xpath('text', '//div[@class="text"]/text()')
    loader.add_xpath('text', '//div[@class="text"]/span/text()')
    loader.add_xpath('answerList', '//div[@class="item clearfix"]/span/text()')
    loader.add_xpath('choiceList', '//div[@class="item clearfix"]/b/text()')
    loader.add_xpath('answer', '//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
    # loader.add_xpath('explanation','//div[@id="DivExplain"]')
    item = loader.load_item()
    if len(item['text']) == 3:
        # Three fragments mean part of the sentence is underlined; the page
        # yields them in the order [head, tail, underlined-part].
        test = (item['text'][0]
                + '<span style="text-decoration:underline;">'
                + item['text'][2] + '</span>' + item['text'][1])
    else:
        test = item['text'][0]
    for filename in self.fileList:
        if filename.find(question_id) == -1:
            continue
        # BUG FIX: the original called "f.close" without parentheses, which
        # never closed the file.  A context manager closes it even on error.
        with open('/home/huwei/origin/rcarticle/' + filename) as f:
            artile = f.read()
        # artile[24:-4] strips the wrapping markup saved around the article.
        content = self.rc_content.format(
            artile[24:len(artile) - 4],
            item['questionId'][0],
            item['questionId'][0], test,
            item['questionId'][0], item['choiceList'][0], item['choiceList'][0], item['answerList'][0],
            item['questionId'][0], item['choiceList'][1], item['choiceList'][1], item['answerList'][1],
            item['questionId'][0], item['choiceList'][2], item['choiceList'][2], item['answerList'][2],
            item['questionId'][0], item['choiceList'][3], item['choiceList'][3], item['answerList'][3],
            item['questionId'][0], item['choiceList'][4], item['choiceList'][4], item['answerList'][4],
            item['questionId'][0], item['answer'][0])
        with open('/home/huwei/gmatclub/rc/' + question_id + '.html', 'w') as wf:
            wf.write(content)
    return item
def get_question(self, selector, response):
    """Load a LazyTweetQuestion item from a post *selector*.

    Relative XPaths are dot-prefixed so both select() and the joined
    expressions search from the selector's own node, not the document root.
    """
    question_loader = XPathItemLoader(item = LazyTweetQuestion(), \
        selector = selector)
    # Question body text: the post body plus any status span inside it.
    question_loader.add_xpath(
        'question_content', ''.join([
            './/span[@class="post-body"]',
            '//span[@class="post-status"]/descendant-or-self::text()'
        ]))
    # not useful -- tags live outside the post node, hence the absolute path
    question_loader.add_xpath(
        'question_tags', ''.join(['//*[@id="post-tags"]/ul/li/a/text()']))
    question_loader.add_xpath(
        'asking_date', ''.join([
            './/span[@class="post-meta"]//span[@class="timestamp"]/text()'
        ]))
    # The asker is parsed out of the post-meta block by get_user().
    question_loader.add_value(
        'asker', self.get_user(
            selector.select(''.join(['.//span[@class="post-meta"]']))))
    question_loader.add_xpath(
        'number_of_answers', ''.join(['.//span[@class="post-meta"]',
                                      '//a[last()]/text()']))
    # The question id is the trailing path segment of the page URL.
    question_loader.add_value('question_id', response.url.split('/')[-1])
    print question_loader.get_output_value('question_tags')  # debug trace
    return question_loader.load_item()
def process_item(self, task_id):
    """Parse the stored full-report text for *task_id* into an
    NrcParsedReport item.

    Generator: yields the parsed item, then marks the task complete.
    """
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    text = report["full_report_body"]
    # Clamp every character to 7-bit ASCII (codepoints above 127 collapse
    # to chr(127)) so the regexes below operate on plain ASCII text.
    text = "".join(chr(min(ord(c), 127)) for c in text)
    t = TextResponse(url=report["full_report_url"],
                     body=text.encode("utf-8"))  # must have utf-8 here
    l = XPathItemLoader(NrcParsedReport(), response=t)
    l.add_value("reportnum", task_id)
    patterns = self.compile_patterns()
    # Each pattern is a (field_name, regex) pair applied to the raw text.
    for p in patterns:
        l.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])
    county = l.get_output_value("county")
    pattern = self.get_area_code_pattern(county)
    if pattern:
        l.add_value("areaid", county)
        # Prefer a block number qualified by the area-code pattern ...
        l.add_value("blockid", text, TakeFirst(), unicode.strip,
                    re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
        # ... falling back to any bare "BLOCK nnn" occurrence.
        l.add_value("blockid", text, TakeFirst(), unicode.strip,
                    re="BLOCK[\s]+([\d]+)")
    item = l.load_item()
    yield item
    self.item_completed(task_id)
def parse_five_chose_one(self, response): print 'parse_five_chose_One' loader = XPathItemLoader(item=ParseFiveSelectOneItem(), response=response) id = self.parse_id_from_url(response.url) loader.add_value('questionId', id) loader.add_xpath('text', '//div[@class="text"]/text()') loader.add_xpath('text', '//div[@class="text"]/span/text()') loader.add_xpath('answerList','//div[@class="item clearfix"]/span/text()') loader.add_xpath('choiceList','//div[@class="item clearfix"]/b/text()') loader.add_xpath('answer','//div[@class="answer clearfix hidden QuesHidden"]/b/text()') loader.add_xpath('explanation','//div[@id="DivExplain"]') item = loader.load_item() if len(item['text']) ==3: test = item['text'][0] + '<span style="text-decoration:underline;">' + item['text'][2] + '</span>'+ item['text'][1] else: test = item['text'][0] content = self.five_chose_one_content.format(item['questionId'][0],test, item['questionId'][0],item['choiceList'][0],item['choiceList'][0],item['answerList'][0], item['questionId'][0],item['choiceList'][1],item['choiceList'][1],item['answerList'][1], item['questionId'][0],item['choiceList'][2],item['choiceList'][2],item['answerList'][2], item['questionId'][0],item['choiceList'][3],item['choiceList'][3],item['answerList'][3], item['questionId'][0],item['choiceList'][4],item['choiceList'][4],item['answerList'][4], item['questionId'][0],item['answer'][0],item['explanation'][0][21:len(item['explanation'][0]) - 6]) wf = open('/home/huwei/gmatclub/ir/' + id + '.html','w') wf.write(content) wf.close() return item
def parse_item(self, response, loop, fields):
    """Generic configuration-driven item extractor.

    *loop* is an XPath selecting repeating nodes (defaults to the whole
    document); *fields* maps item field names to extraction specs with
    keys 'value' or 'xpath' plus optional 'parse', 'regex', 'default'
    and 'filter'.  Yields one item per node that passes every filter.
    """
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            # A spec must provide either a literal 'value' or an 'xpath'.
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contains "value" or "xpath"'.
                        format(k), level=log.WARNING)
                continue
            # Expand macros, apply type conversion, optionally apply regex.
            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break  # any failing filter rejects the whole node
        else:
            # for/else: only yield when no field broke out (all passed).
            yield loader.load_item()
def parse_item(self, response, loop, fields):
    """Configuration-driven extractor: one item per *loop* node.

    Each entry in *fields* describes how to obtain one item field: either
    a literal 'value' or an 'xpath', with optional 'parse' converters,
    'regex', 'default' and 'filter' keys.  A node is yielded only when
    every field passes its filter.
    """
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL':response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k,v in fields.iteritems():
            # A spec must provide either a literal 'value' or an 'xpath'.
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contains "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue
            # Expand macros, apply converters, optionally apply a regex.
            val = get_v_x(
                self.macro.expand(v_x),
                utils.convert_type(v.get('parse', {})),
                re=v.get('regex')
            )
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break  # a failing filter rejects the whole node
        else:
            # for/else: reached only when the inner loop did not break.
            yield loader.load_item()
def parse_sale(self, response):
    """Populate a SaleItem from a single property-sale detail page."""
    l = XPathItemLoader(item=SaleItem(), response=response)
    l.add_value('url', response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    field_specs = (
        ('address', '//h1[@class="address"]/text()', None),
        ('price', '//div[@class="price"]/text()', None),
        ('sale_date', '//th[text()="Last sale:"]/../td/div[last()]/text()', r'on (\w+)'),
        ('bedrooms', '//th[text()="Bedrooms:"]/../td/text()', None),
        ('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', r'(\d+)'),
        ('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', r', (\d+)'),
        ('property_type', '//th[text()="Property type:"]/../td/text()', None),
        ('size', '//th[text()="Size:"]/../td/text()', r'([\d|,]+) sqft'),
        ('lot', '//th[text()="Lot:"]/../td/text()', None),
        ('price_per_sf', '//th[text()="Price/sqft:"]/../td/text()', None),
        ('year_built', '//th[text()="Year built:"]/../td/text()', None),
        ('public_records', 'id("property_public_info_module")/ul/li/span/text()', None),
    )
    for field, xpath, regex in field_specs:
        if regex is None:
            l.add_xpath(field, xpath)
        else:
            l.add_xpath(field, xpath, re=regex)
    return l.load_item()
def parse(self, response):
    """Discover France Info podcast programmes; yield one PodcastItem per
    programme block found on the listing page."""
    x = HtmlXPathSelector(response)
    programs = x.select('//div[@class="item-list"]/ul/li[contains(@class,"views-row")]/div/div/div')
    podcastCount = str(len(programs))
    i = 0
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        # The RSS href doubles as a stable per-brand identifier.
        l.add_xpath('id', 'concat("fri_", .//li/a[@class="rss"]/@href)')
        l.add_value('type', 'disco')
        l.add_xpath('brandId', './/li/a[@class="rss"]/@href')
        l.add_xpath('brandFeed', 'concat("http://www.franceinfo.fr", .//li[contains(@class,"link_rss")]/a[@class="rss"]/@href)')
        l.add_xpath('brandName', './/h3/a/text()')
        l.add_xpath('brandTimes', './/div[@class="views-field-field-emission-texte-diffusion-value"]/text()')
        l.add_xpath('brandDescription', './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        # BUG FIX: 'brandHomepage' was added five times (copy-paste
        # residue); a single add_xpath is sufficient.
        l.add_xpath('brandHomepage', './/h3/a/@href')
        l.add_value('channelId', 'franceinfo')
        l.add_xpath('channelName', '//head/meta[@property="og:site_name"]/@content')
        l.add_xpath('channelDescription', '//head/meta[@property="og:description"]/@content')
        l.add_xpath('channelImage', '//div[@id="header"]/div/span/a/img/@src')
        self.log('Discovering fri [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        yield l.load_item()
def parse(self, response):
    """ Default callback used by Scrapy to process downloaded responses """
    selector = HtmlXPathSelector(response)
    # Split the request's query string into a {param: value} dict.
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: (x.split('=')[1]) for x in details.query.split("&")}
    # Progress trace: the decoded 'p[]' filter value and the page offset.
    print "\n", (urllib.unquote(queryStr['p%5B%5D']).split("=")[1]), queryStr['start']
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(flipkartData(), selector=deal)
        # define processors: strip every extracted string, join the results
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()
def get_question(self, selector, response): # both select function and selector's join function need to add dot to search from relative based directory question_loader = XPathItemLoader(item = LazyTweetQuestion(), \ selector = selector) question_loader.add_xpath('question_content', ''.join([ './/span[@class="post-body"]', '//span[@class="post-status"]/descendant-or-self::text()' ])) # not useful question_loader.add_xpath('question_tags', ''.join([ '//*[@id="post-tags"]/ul/li/a/text()' ])) question_loader.add_xpath('asking_date', ''.join([ './/span[@class="post-meta"]//span[@class="timestamp"]/text()' ])) question_loader.add_value('asker', self.get_user(selector.select(''.join([ './/span[@class="post-meta"]' ])))) question_loader.add_xpath('number_of_answers', ''.join([ './/span[@class="post-meta"]', '//a[last()]/text()' ])) question_loader.add_value('question_id', response.url.split('/')[-1]) print question_loader.get_output_value('question_tags') return question_loader.load_item()
def parse_materials(self, response):
    """Scrape the materials table of an NRC report page.

    Yields one NrcScrapedMaterial item per data row, then marks the bot
    task for this report as DONE.
    """
    reportnum = response.request.meta['reportnum']
    text = unicode (response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select ('//table[@class="t16Standard"]/tr')
    if (len(materials) == 0):
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # Skip the first report record because this is the header row
        materials.pop (0)
        if (len(materials) == 0):
            self.log('No materials reports found in response {0}'
                     .format(reportnum), log.INFO)
        else:
            self.log('Retrieved {0} materials records in report {1}'
                     .format(len(materials), reportnum), log.INFO)
            for material in materials:
                l = XPathItemLoader(NrcScrapedMaterial(), material)
                # Truncate every 'name' value to 32 characters
                # (NOTE(review): limit source not visible here -- presumably
                # a DB column width; confirm).
                l.name_in = lambda slist: [s[:32] for s in slist]
                l.add_value('reportnum', reportnum)
                # Any item field declaring an 'xpath' meta key is extracted
                # directly from the table row.
                for name, params in NrcScrapedMaterial.fields.items():
                    if 'xpath' in params:
                        l.add_xpath(name, params['xpath'])
                item = l.load_item()
                yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse(self, response):
    """Assemble a SearchResultItem from an auction search results page."""
    hxs = HtmlXPathSelector(response)
    name_xpath = "//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value"
    item_name = hxs.select(name_xpath).extract()
    # Stable id: digest of auction id, extracted name and site name.
    item_hash = hashlib.md5(
        '%s::%s::%s' % (self.auction_id, item_name, self.name)).hexdigest()
    # Keep only the non-empty price strings, stripped of whitespace.
    raw_prices = hxs.select("//div[2]//div[2]/text()").extract()
    item_price = [p for p in (s.strip() for s in raw_prices) if p]
    loader = XPathItemLoader(item=SearchResultItem(), response=response)
    loader.add_value("id", item_hash)
    loader.add_value("auction_id", self.auction_id)
    loader.add_value("site", self.name)
    loader.add_xpath("name", name_xpath)
    loader.add_value("link", response.url)
    loader.add_value("price", item_price)
    return loader.load_item()
def parse(self, response):
    """ Default callback used by Scrapy to process downloaded responses """
    selector = HtmlXPathSelector(response)
    # Split the request's query string into a {param: value} dict.
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: (x.split('=')[1]) for x in details.query.split("&")}
    # Progress trace: the current page number.
    print "\n", queryStr['page']
    # iterate over deals
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(JabongData(), selector=deal)
        # define processors: strip every extracted string, join the results
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()
def parse(self, response):
    """Yield one LeboncoinItem per classified ad on a listing page.

    (Removed a large block of commented-out legacy code that built items
    by hand and wrote HTML snippets to /tmp -- superseded by the loader.)
    """
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select('//div[@class="list-ads"]/a'):
        loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
        # Title text, trimmed of surrounding whitespace by the regex.
        loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
        loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
        loader.add_xpath('url', '@href')
        # Category is the 4th-from-last URL path segment.
        loader.add_value('category', response.url.split("/")[-4])
        yield loader.load_item()
def process_item(self, task_id):
    """Parse the stored full report for *task_id*; yield the parsed item
    and then flag the task as completed."""
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    # Force 7-bit ASCII: any higher code point collapses to chr(127).
    ascii_text = ''.join(chr(min(ord(ch), 127)) for ch in report['full_report_body'])
    fake_response = TextResponse(url=report['full_report_url'],
                                 body=ascii_text.encode('utf-8'))  # must have utf-8 here
    loader = XPathItemLoader(NrcParsedReport(), response=fake_response)
    loader.add_value('reportnum', task_id)
    # Each compiled pattern is a (field_name, regex) pair.
    for spec in self.compile_patterns():
        loader.add_value(spec[0], ascii_text, TakeFirst(), unicode.strip, re=spec[1])
    county = loader.get_output_value('county')
    area_pattern = self.get_area_code_pattern(county)
    if area_pattern:
        loader.add_value('areaid', county)
        # Prefer an area-qualified block number, fall back to bare "BLOCK nnn".
        loader.add_value('blockid', ascii_text, TakeFirst(), unicode.strip,
                         re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % area_pattern)
        loader.add_value('blockid', ascii_text, TakeFirst(), unicode.strip,
                         re="BLOCK[\s]+([\d]+)")
    yield loader.load_item()
    self.item_completed(task_id)
def get_answer(self, selector, question_loader):
    """Build a YahooAnswer item from an answer *selector*.

    *question_loader* supplies the parent question's id.
    """
    answer_loader = XPathItemLoader(item = YahooAnswer(), selector = selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath('answer_content',
                            './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value('question_id',
                            question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    # Two alternative page layouts for the rating counter.
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()'
    ]))
    # get the good number or bad number
    marks = answer_loader.get_output_value('marks')
    # BUG FIX: str.find() returns -1 when absent (truthy) and 0 when the
    # match is at position 0 (falsy), so `if marks.find('good'):` was
    # inverted in exactly those cases.  Compare against -1 explicitly.
    if marks.find('good') != -1:
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])
    #bad numbers
    # is best answer
    answer_class = selector.select('./@class').extract()[0]
    if answer_class.find('best') != -1:
        answer_loader.add_value('is_best_answer', 1)
    else:
        answer_loader.add_value('is_best_answer', 0)
    return answer_loader.load_item()
def parse(self, response):
    """Yield a PodcastItem for every programme outline in the OPML feed."""
    xml = XmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    programs = xml.select('//body/outline/outline')
    total = str(len(programs))
    count = 0
    allitems = []
    for program in programs:
        count = count + 1
        loader = XPathItemLoader(PodcastItem(), selector=program)
        # The feed URL doubles as a stable per-brand identifier.
        loader.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
        loader.add_value('audioType', 'disco')
        loader.add_xpath('brandId', './@xmlUrl')
        loader.add_xpath('brandFeed', './@xmlUrl')
        loader.add_xpath('brandName', './@title')
        loader.add_xpath('brandDescription', './@description')
        loader.add_xpath('brandHomepage', './@htmlUrl')
        self.log('Discovering dpc [%s of %s] feeds' % (count, total), level=log.INFO)
        yield loader.load_item()
def parse(self, response):
    """Yield GroupUserItems from a group member page and follow the
    'next' pagination link, with a regex fallback when XPath misses it."""
    page_url = response.url
    # The group name is the path segment right after ".../group/".
    group_name = page_url[page_url.find("group"):].split("/")[1]
    hxs = HtmlXPathSelector(response)
    for dl in hxs.select('//dl[@class="obu"]'):
        loader = XPathItemLoader(item=GroupUserItem(), selector=dl)
        loader.add_xpath("homepage", "dt/a/@href")
        loader.add_xpath("image", "dt/a/img/@src")
        loader.add_xpath("name", "dd/a/text()")
        loader.add_value("group", group_name)
        yield loader.load_item()
    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for next_url in links:
        yield Request(next_url, callback=self.parse)
    if not links:
        # Fallback: grab the hidden pagination link with a regex.
        pattern = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
        match = pattern.search(response.body_as_unicode())
        if match:
            yield Request(match.group(1), callback=self.parse)
def parse_listing(self, response):
    """Load a ListingItem from a property listing detail page."""
    l = XPathItemLoader(item=ListingItem(), response=response)
    l.add_value("url", response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    specs = (
        ("address", '//h1[@class="address"]/text()', None),
        ("price", '//div[@class="price"]/text()', None),
        ("bedrooms", '//th[text()="Bedrooms:"]/../td/text()', None),
        ("bathrooms", '//th[text()="Bathrooms:"]/../td/text()', r"(\d+)"),
        ("powder_rooms", '//th[text()="Bathrooms:"]/../td/text()', r", (\d+)"),
        ("property_type", '//th[text()="Property type:"]/../td/text()', None),
        ("size", '//th[text()="Size:"]/../td/text()', r"([\d|,]+) sqft"),
        ("lot", '//th[text()="Lot:"]/../td/text()', None),
        ("price_per_sf", '//th[text()="Price/sqft:"]/../td/text()', None),
        ("year_built", '//th[text()="Year built:"]/../td/text()', None),
        ("date_listed", '//th[text()="Added on Trulia:"]/../td/text()', None),
        ("mls_id", '//th[text()="MLS/ID:"]/../td/text()', None),
        ("descriptive_title", '//h2[@class="descriptive_title"]/text()', None),
        ("description", '//div[@class="listing_description_module"]/text()', None),
        ("additional_fields", 'id("property_listing_details_module")/ul/li/span/text()', None),
        ("public_records", 'id("property_public_info_module")/ul/li/span/text()', None),
    )
    for field, xpath, regex in specs:
        if regex is None:
            l.add_xpath(field, xpath)
        else:
            l.add_xpath(field, xpath, re=regex)
    return l.load_item()
def parse(self, response):
    """Return the Uruguay Bond Index as a one-element item list."""
    loader = XPathItemLoader(item=FinanceIndex(), response=response)
    loader.add_value("name", "Uruguay Bond Index")
    loader.add_value("unit", "bps")
    loader.add_xpath("value", "//span/text()")
    return [loader.load_item()]
def parse(self, response):
    """Return the London spot gold close as a one-element item list."""
    loader = XPathItemLoader(item=FinanceIndex(), response=response)
    loader.add_value("name", "Oro Spot Cierre Londres")
    loader.add_value("unit", "USD")
    loader.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")
    return [loader.load_item()]
def get_user(self, selector):
    """Load the LazyTweetUser whose link is the first anchor in *selector*."""
    loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    loader.add_xpath('twitter_username', './a[1]/text()')
    # Profile URL = Twitter base URL + extracted username.
    username = loader.get_output_value('twitter_username')
    loader.add_value('twitter_url', r'http://twitter.com/' + username)
    return loader.load_item()
def parse_doctor_detail(self, response): """ This function parses a sample response. Some contracts are mingled with this docstring. @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969 @returns items 1 1 @returns requests 0 0 """ hxs = HtmlXPathSelector(response) l = XPathItemLoader(CYDoctorItem(), hxs) l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()")) shortdesc = hxs.select( "//div[@id='mainColumn']//p[@class='bdFt']/text()").extract() if len(shortdesc) == 1: shortdescStr = shortdesc[0].strip() words = shortdescStr.split() if len(words) == 3: l.add_value('title', words[0]) l.add_value('hospital', words[1]) l.add_value('specialty', words[2]) else: print("title/hostpital/special error.") l.add_xpath( 'specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()") l.add_xpath( 'personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()") l.add_xpath('stars', "//p[@class='right starTxt']/text()") answer = hxs.select( "//div[@id='resolvedData']/p[1]/a/text()").extract() if len(answer) == 1: answerStr = answer[0].strip().replace(u"\xa0", "") m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr) if m.groupdict()["answer_cnt"] is not None: l.add_value('answers', m.groupdict()["answer_cnt"]) review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract() if len(review) == 1: reviewStr = review[0].strip().replace(u"\xa0", "") m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr) if m.groupdict()["review_cnt"] is not None: l.add_value('reviews', m.groupdict()["review_cnt"]) # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()") # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()") ret = l.load_item() print ret yield ret
def parse_argument(self, response):
    """Load an Argument item; the id falls back to -1 when the URL
    carries none."""
    loader = XPathItemLoader(item=Argument(), response=response)
    arg_id = self.parse_id_from_url(response.url)
    loader.add_value('id', arg_id if arg_id else -1)
    loader.add_xpath('rating', '//b[@id="QuestionRateValue"]/text()')
    loader.add_xpath('essay', '//div[@class="essay"]')
    return loader.load_item()
def parse(self, response):
    """Yield one FinanceIndex item per entry in the module-level
    `rates` table of (name, anchor-pattern, column-offset) tuples."""
    xpath_tmpl = "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"
    items = []
    for name, pattern, pos in rates:
        loader = XPathItemLoader(item=FinanceIndex(), response=response)
        loader.add_value("name", name)
        loader.add_value("unit", "%")
        # The value cell sits `pos` columns right of the matching anchor.
        loader.add_xpath("value", xpath_tmpl % (pattern, pos))
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build a BookItem from a product page (store id 3)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//span[@class='kitapismi']/text()")
    loader.add_xpath('isbn', "//span[@class='normalkucuk']/text()", u'ISBN:([0-9]+)')
    loader.add_xpath('author', '//span/a[contains(@href, "/yazar/")]/text()')
    loader.add_xpath('publisher', '//span/a[contains(@href, "/yayinevi/")]/text()')
    loader.add_xpath('price', '//td/text()', u'Kitapyurdu Fiyatı:(.*) TL\.')
    loader.add_value('link', response.url)
    loader.add_value('store', 3)
    return loader.load_item()
def parse(self, response):
    """Emit the BCU target rate as a single FinanceIndex item."""
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Tasa Objetivo BCU")
    rate.add_value("unit", "%")
    # NOTE(review): "8.75" is passed as an XPath *expression* (a numeric
    # literal), not a value -- looks like a hard-coded placeholder;
    # add_value() would be the usual call.  Confirm intent.
    rate.add_xpath("value", "8.75")
    #rate.update_only_if_change = True
    return [rate.load_item()]
def parse_item(self, response):
    """Build a BookItem from a product page (store id 5)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//h1[@class='kitapad14pnt']/b/text()")
    loader.add_xpath('isbn', "//span[@class='kunye']/text()", u'ISBN: ([0-9\-X]+)')
    loader.add_xpath('author', "//span[@class='yazarad12pnt']/a/span[@class='yazarad12pnt']/text()")
    loader.add_xpath('publisher', "//h3[@class='kapakyazisi']/b/font/a/text()")
    loader.add_xpath('price', '//span[@class="kapakyazisi"]/font/b/text()', u'(.*) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 5)
    return loader.load_item()
def get_user(self, selector):
    """Load a LazyTweetUser from *selector*.

    The username comes from the first anchor's text; the profile URL is
    derived by prefixing it with the Twitter base URL.
    """
    user_loader = XPathItemLoader(item = LazyTweetUser(), selector = selector)
    user_loader.add_xpath('twitter_username', ''.join([
        './a[1]/text()'
    ]))
    # join() here is just string concatenation of base URL + username.
    user_loader.add_value('twitter_url', ''.join([
        r'http://twitter.com/',
        user_loader.get_output_value('twitter_username')
    ]))
    return user_loader.load_item()
def parse_item(self, response):
    """Build a BookItem from a product page (store id 2)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//div[@class='boxTanimisim']/div/text()")
    loader.add_xpath('isbn', "//div[@id='tanitimbox']/text()", u'.*ISBN : ([0-9]+)')
    loader.add_xpath('author', "//div[@class='boxTanimVideo']/a/text()")
    loader.add_xpath('publisher', "//h3[@class='boxTanimyayinevi']/a/b/text()")
    loader.add_xpath('price', "//b[@class='pricerange']/text()", u'\s*([0-9,]*) TL \(KDV Dahil\)')
    loader.add_value('link', response.url)
    loader.add_value('store', 2)
    return loader.load_item()
def get_UT_item(self, sel, user_url):
    """Build a ZhiHuU_T (user-topic) relation item from a topic selector."""
    loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    loader.add_value('crawled_from', user_url)
    # Normalize the absolute user URL down to its trailing relative path.
    relative_user = '/' + '/'.join(user_url.split('/')[-3:-1])
    loader.add_value('user_url', relative_user)
    loader.add_xpath('topic_url',
                     './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return loader.load_item()
def parse_item(self, response):
    """Build a BookItem from a product page (store id 6)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//font[@class='baslikt']/strong/text()")
    loader.add_xpath('isbn', '//td/text()', u'.*ISBN: ([0-9\-]+)')
    loader.add_xpath('author', "//td[@class='yazart']/a/text()")
    loader.add_xpath('publisher', "//a[@class='yayineviU']/text()")
    loader.add_xpath('price', "//font[@class='fiyat']/text()", u'([0-9,]+) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 6)
    return loader.load_item()
def parse_item(self, response):
    """Build a BookItem from an ASP.NET product page (store id 4)."""
    loader = XPathItemLoader(item=BookItem(), response=response)
    loader.add_xpath('name', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelAdi']/text()")
    loader.add_xpath('isbn', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelIsbn']/text()")
    loader.add_xpath('author', "//span[@id='ctl00_ContentPlaceHolderMainOrta_LabelYazar']/a/text()")
    loader.add_xpath('publisher', "//a[@id='ctl00_ContentPlaceHolderMainOrta_HyperLinkYayinci']/text()")
    loader.add_xpath('price', "//span[@class='fiyat']/text()", u'(.*) TL')
    loader.add_value('link', response.url)
    loader.add_value('store', 4)
    return loader.load_item()
def parse_doctor_detail(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
    @returns items 1 1
    @returns requests 0 0
    """
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(CYDoctorItem(), hxs)
    l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()"))
    shortdesc = hxs.select("//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
    if len(shortdesc) == 1:
        # The one-line summary is "<title> <hospital> <specialty>".
        shortdescStr = shortdesc[0].strip()
        words = shortdescStr.split()
        if len(words) == 3:
            l.add_value('title', words[0])
            l.add_value('hospital', words[1])
            l.add_value('specialty', words[2])
        else:
            print ("title/hostpital/special error.")
    l.add_xpath('specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
    l.add_xpath('personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
    l.add_xpath('stars', "//p[@class='right starTxt']/text()")
    answer = hxs.select("//div[@id='resolvedData']/p[1]/a/text()").extract()
    if len(answer) == 1:
        answerStr = answer[0].strip().replace(u"\xa0", "")
        # NOTE(review): re.match() returns None when the text does not
        # match; m.groupdict() would then raise AttributeError -- confirm
        # whether the page can ever lack this counter.
        m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
        if m.groupdict()["answer_cnt"]is not None:
            l.add_value('answers', m.groupdict()["answer_cnt"])
    review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
    if len(review) == 1:
        reviewStr = review[0].strip().replace(u"\xa0", "")
        m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
        if m.groupdict()["review_cnt"]is not None:
            l.add_value('reviews', m.groupdict()["review_cnt"])
    # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
    # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")
    ret = l.load_item()
    print ret
    yield ret
def load_compra_items(self, response, orden_compra):
    """Yield one CompraLineaItem per row of the purchase-detail table,
    tagged with *orden_compra*."""
    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//table[contains(@width, "760")][2]/tr'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        loader.add_xpath('cantidad', 'td[1]/text()')
        loader.add_xpath('importe', 'td[2]/text()')
        loader.add_xpath('detalle', 'td[3]/text()')
        loader.add_value('orden_compra', orden_compra)
        yield loader.load_item()
def parse_articles(self, response):
    """Extract an Article item from a news article detail page."""
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(item=Article(), response=response)
    l.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
    l.add_xpath("content", "//div[contains(@class,'article-text')]//p[contains(@class,'body')]")
    l.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
    # NOTE(review): " " is not a valid XPath expression -- this looks like
    # a placeholder for a missing location selector; confirm.
    l.add_xpath("location", " ")
    l.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
    l.add_value("link", response.url)
    l.add_value("author", 'Sainath')
    return l.load_item()
def get_UT_item(self, sel, user_url):
    '''
    given the selector of topic and user url, generate the u_t relationship
    '''
    ut_loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    ut_loader.add_value('crawled_from', user_url)
    # Keep only the trailing relative portion of the absolute user URL.
    ut_loader.add_value('user_url',
                        '/' + '/'.join(user_url.split('/')[-3:-1]))
    ut_loader.add_xpath(
        'topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return ut_loader.load_item()
def get_user(self, selector):
    """Load the YahooUser who authored the node under *selector*.

    Returns the loaded item, or None when no user name was collected.
    """
    user_loader = XPathItemLoader(item = YahooUser(), selector = selector)
    user_loader.add_xpath('user_name', './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
    user_loader.add_xpath('user_url', './/span[@class="user"]//a[@class="url"]/@href')
    # The user id is the profile URL's "show" query parameter.
    # NOTE(review): re.match(...).group(1) raises AttributeError when
    # user_url does not match the profile-URL pattern -- confirm whether
    # that can happen (e.g. anonymous answers).
    user_loader.add_value('user_id', re.match(r'http://answers\.yahoo\.com/my/profile\?show=(.*)',
                                              user_loader.get_output_value('user_url')
                                              ).group(1))
    if user_loader.get_collected_values('user_name'):
        return user_loader.load_item()
    else:
        return None
def parse_faculty_detail(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _name specialty title shortDesc
    """
    hxs = HtmlXPathSelector(response)
    # Follow every clinic-schedule ordering link back into this callback.
    linkExtractor = SgmlLinkExtractor(
        allow=(r"/faculty/\S+/menzhen.htm\?orderby", ), unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parse_faculty_detail)
    # Breadcrumb anchors: a[2] is the hospital, a[3] the specialty.
    specialty = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
    hospital = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()
    # Only table rows containing a 'tda' cell hold doctor data.
    docLinks = hxs.select(
        "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]"
    )
    #docLinks = hxs.select("//table[@id='doc_list_index']/tr")
    for doc in docLinks:
        l = XPathItemLoader(DoctorItem(), doc)
        docNames = doc.select(
            "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
        ).extract()
        if len(docNames) != 0:
            print docNames[0]
            l.add_xpath(
                '_name',
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
            )
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
            l.add_xpath('acadamicDegree', "./td[@class='tda']/li/p[2]/text()")
            l.add_xpath('shortDesc', "./td[@class='tdb']/text()")
            #clinic time todo
            ret = l.load_item()
            #print ret
            yield ret
def parse(self, response):
    """Return one FinanceIndex item (a percentage rate) per entry in `rates`."""
    def build(entry):
        # Each entry is (display name, anchor text to find, column offset).
        label, anchor_text, offset = entry
        loader = XPathItemLoader(item=FinanceIndex(), response=response)
        loader.add_value("name", label)
        loader.add_value("unit", "%")
        loader.add_xpath(
            "value",
            "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"
            % (anchor_text, offset))
        return loader.load_item()

    return [build(entry) for entry in rates]
def parse_lineas(self, response):
    """Yield one CompraLineaItem per data row of the purchase table."""
    page = HtmlXPathSelector(response)
    # position() > 1 skips the header row.
    for row in page.select('//table//tr[position() > 1]'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        for field, xpath in (
                ('cantidad', 'td[4]/text()'),
                ('unidad_medida', 'td[5]/text()'),
                ('importe', 'td[3]/text()'),
                ('importe_total', 'td[6]/text()'),
                ('detalle', 'td[2]/text()')):
            loader.add_xpath(field, xpath)
        # hack, ver ../items.py:50 (TakeFirst())
        loader.add_value('orden_compra',
                         [response.request.meta['orden_de_compra']])
        yield loader.load_item()
def parse_links(self, response):
    """Build an AuctionsItem for one listing page.

    The listing id comes from the `lid` query parameter; all field
    XPaths are configured in the project settings.
    """
    listing_ids = re.findall(r"lid=(\d+)", response.url)
    loader = XPathItemLoader(item=AuctionsItem(), response=response)
    loader.add_value("id", listing_ids[0])
    for field, setting_key in (
            ("auctioneer", 'AUCTION_AUCTIONEER'),
            ("contact_number", 'AUCTION_CONTACT_NUMBER'),
            ("date", 'AUCTION_DATE'),
            ("time", 'AUCTION_TIME'),
            ("location", 'AUCTION_LOCATION')):
        loader.add_xpath(field, settings[setting_key])
    loader.add_value("link", response.url)
    loader.add_xpath("listing", settings['AUCTION_LISTING'])
    return loader.load_item()
def parse(self, response):
    """Yield CompraLineaItems from a purchase-order detail table.

    orden_compra and anio are parsed out of the page URL's query string.
    """
    ids = re.search(r'wOCabc=(\d+)&wEjercicio=(\d+)',
                    urlparse(response.url).query).groups()
    orden_compra, anio = (int(value) for value in ids)
    page = HtmlXPathSelector(response)
    for row in page.select('//table[contains(@width, "760")][2]/tr'):
        loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        loader.add_xpath('cantidad', 'td[1]/text()')
        loader.add_xpath('importe', 'td[2]/text()')
        loader.add_xpath('detalle', 'td[3]/text()')
        loader.add_value('orden_compra', orden_compra)
        loader.add_value('anio', anio)
        yield loader.load_item()
def parse_articles(self, response):
    """Scrape a single article page into an Article item.

    Title, content, date and keywords come from the page markup; link
    and author are constants for this spider.
    """
    # Removed an unused `hxs = HtmlXPathSelector(response)` local -- the
    # loader works directly off the response.
    l = XPathItemLoader(item=Article(), response=response)
    l.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
    l.add_xpath(
        "content",
        "//div[contains(@class,'article-text')]//p[contains(@class,'body')]"
    )
    l.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
    # NOTE(review): " " is not a meaningful XPath expression; kept as-is
    # to preserve behavior, but it looks like a placeholder that never
    # matches -- confirm whether `location` should have a real selector.
    l.add_xpath("location", " ")
    l.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
    l.add_value("link", response.url)
    l.add_value("author", 'Sainath')
    return l.load_item()
def get_answer(self, selector, response):
    """Load a LazyTweetAnswer item from one answer block.

    `selector` points at a single answer node; the question id is taken
    from the last path segment of the page URL.
    """
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
                                    selector=selector)
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value(
        'answerer',
        self.get_user(
            selector.select(''.join(['.//span[@class="answer-meta"]']))))
    answer_loader.add_xpath(
        'answer_content',
        ''.join([
            './/span[@class="answer-body"]',
            '//span[@class="answer-status"]//descendant-or-self::text()'
        ]))
    # BUG FIX: removed leftover debugging -- a `print` of the parsed
    # content and a blocking `a = input()` that froze the whole crawl
    # waiting for console input on every answer.
    return answer_loader.load_item()
def parse_full_report(self, response):
    """Extract the full report body into a NrcScrapedFullReport item.

    Yields the item, then marks the corresponding bot task DONE in the
    local database (runs after the item is consumed, per generator
    semantics).
    """
    # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
    # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
    # since XPathItemLoader requires a Response object
    text = unicode (response.body, response.encoding)
    t = TextResponse (url=response.url, body=text.encode('utf-8'), encoding='utf-8')
    l= XPathItemLoader(NrcScrapedFullReport(), response=t)
    url_parts = urlsplit(response.url)
    # NOTE(review): 'standard_web inc_seq' (containing a space) is used
    # verbatim as the query-string key holding the report number --
    # confirm against the site's actual URLs.
    l.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
    l.add_xpath('full_report_body', '//body')
    l.add_value('full_report_url', response.url)
    item = l.load_item()
    reportnum = item['reportnum']
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def get_user(self, selector, response, label):
    """Load a StackOverflowUser from a post's user-details block.

    `response` and `label` are accepted for interface compatibility but
    are not used by this implementation.
    """
    user_loader = XPathItemLoader(item=StackOverflowUser(),
                                  selector=selector)
    user_loader.add_xpath(
        'user_name',
        ''.join(['.//div[contains(@class, "user-details")]', '/a/text()']))
    user_loader.add_xpath(
        'user_link',
        ''.join(['.//div[contains(@class, "user-details")]', '/a/@href']))
    # CLEANUP: the original fetched get_output_value('user_link') three
    # times and bound one result to an unused local; fetch once.
    user_link = user_loader.get_output_value('user_link')
    if user_link:
        # NOTE(review): the raw profile URL is stored as the user id;
        # the numeric id could be parsed out of the URL instead.
        user_loader.add_value('user_id', user_link)
    return user_loader.load_item()
def parse(self, response):
    """Parse the bus-lines listing table into LinhaItem items."""
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select(self.lista_linhas_xpath):
        loader = XPathItemLoader(LinhaItem(), selector=qxs)
        loader.add_xpath('linha', './td[1]/p//text()')
        loader.add_xpath('nome', './td[3]/p//text()')
        link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
        #TODO: (translated) should keep the context and return the data
        # from the next page, but it seems it is not returning.
        request = Request(link, callback=self.parse_item)
        #pdb.set_trace()
        # NOTE(review): `request` is freshly constructed and never
        # scheduled, so its meta dict is empty -- these lookups raise
        # KeyError unless something populates meta elsewhere.  The
        # author's TODO above acknowledges this is not working; fixing
        # it needs callback chaining (pass the loader via request.meta
        # and finish the item in parse_item).
        loader.add_value('ida', request.meta['ida'])
        loader.add_value('volta', request.meta['volta'])
        yield loader.load_item()
def parse_rental(self, response):
    """Scrape a Trulia rental-details page into a RentalItem."""
    loader = XPathItemLoader(item=RentalItem(), response=response)
    loader.add_value('url', response.url)
    # (field, xpath, regex) triples; a regex of None means plain extraction.
    plan = [
        ('address', '//th[text()="Address:"]/../td/text()', None),
        ('price', '//th[text()="Price:"]/../td/div/text()', None),
        ('price_period', '//th[text()="Price:"]/../td/div/span/text()', None),
        ('bedrooms', '//th[text()="Bedrooms:"]/../td/text()', None),
        ('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', r'(\d+)'),
        ('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', r', (\d+)'),
        ('property_type', '//th[text()="Property type:"]/../td/text()', None),
        ('size', '//th[text()="Size:"]/../td/text()', r'([\d|,]+) sqft'),
        ('lot', '//th[text()="Lot:"]/../td/text()', None),
        ('year_built', '//th[text()="Year built:"]/../td/text()', None),
        ('lease_term', '//th[text()="Terms of lease:"]/../td/text()', None),
        ('pets_allowed', '//th[text()="Pets:"]/../td/text()', None),
        ('date_listed', '//th[text()="Added on Trulia:"]/../td/text()', None),
        ('mls_id', '//th[text()="MLS/ID:"]/../td/text()', None),
        ('descriptive_title', '//h2[@class="descriptive_title"]/text()', None),
        ('description', '//div[@class="listing_description_module"]/text()', None),
        ('additional_fields',
         'id("property_listing_details_module")/ul/li/span/text()', None),
        ('public_records',
         'id("property_public_info_module")/ul/li/span/text()', None),
    ]
    for field, xpath, pattern in plan:
        if pattern is None:
            loader.add_xpath(field, xpath)
        else:
            loader.add_xpath(field, xpath, re=pattern)
    return loader.load_item()
def parse_item(self, response): sel = Selector(response) print response.url app_loader = XPathItemLoader(item=AppItem(), selector=sel) # init the item loader # set app id app_loader.add_value('app_id', parse_id(response.url)) # composite the title app_loader.add_xpath( 'title', '//div[contains(@class, "document-title")]//text()') app_loader.add_xpath( 'description', '//div[contains(@class, "id-app-orig-desc")]//text()') app_loader.add_xpath('score', '//meta[@itemprop="ratingValue"]//@content') app_loader.add_xpath( 'icon_url', '//div[contains(@class, "details-info")]//img[contains(@class, "cover-image")]/@src' ) app_loader.add_xpath( 'author', '//div[@itemprop="author"]//span[@itemprop="name"]//text()') app_loader.add_xpath( 'app_type', '//div[contains(@class, "details-info")]//span[@itemprop="genre"]/text()' ) # get the similarities and the more from developers app_loader.add_xpath( 'similarity', '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=1]//div[contains(@class, "card")]/@data-docid' ) app_loader.add_xpath( 'more_from_devs', '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=2]//div[contains(@class, "card")]/@data-docid' ) # print app_loader.load_item() # print app_loader.get_output_value('app_id') return app_loader.load_item()
def parse_materials(self, response):
    """Yield one NrcScrapedMaterial per data row of the t16Standard table.

    Field XPaths come from the item class itself: every field whose
    metadata declares an 'xpath' entry is extracted automatically.
    """
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    # Guard clauses replace the original nested if/else pyramid.
    if not materials:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        return
    # Skip the first report record because this is the header row.
    materials.pop(0)
    if not materials:
        self.log('No incident reports found in response', log.INFO)
        return
    self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)
    for material in materials:
        l = XPathItemLoader(NrcScrapedMaterial(), material)
        # The report number is embedded in the page URL.
        l.add_value('reportnum', response.url, TakeFirst(), re='P3_SEQNOS:(\d+)')
        for name, params in NrcScrapedMaterial.fields.items():
            if 'xpath' in params:
                l.add_xpath(name, params['xpath'])
        yield l.load_item()
def parse_full_report(self, response):
    """Load the full report body for a previously scraped report.

    Bails out (leaving the task pending, so it is retried on the next
    run) when the response looks empty; otherwise yields the item and
    then marks the bot task DONE in the local database.
    """
    reportnum = response.request.meta['reportnum']
    # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
    # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
    # since XPathItemLoader requires a Response object
    text = unicode(response.body, response.encoding)
    if len(text) < 1000:
        # check for an empty response - if so then bail out; we'll try
        # again next time around
        return
    t = TextResponse(url=response.url,
                     body=text.encode('utf-8'),
                     encoding='utf-8')
    l = XPathItemLoader(NrcScrapedFullReport(), response=t)
    # CLEANUP: removed unused `url_parts = urlsplit(response.url)` --
    # a leftover from the sibling version of this callback that parses
    # the report number out of the query string; here it comes from
    # request.meta instead.
    l.add_value('reportnum', reportnum)
    l.add_xpath('full_report_body', '//body')
    l.add_value('full_report_url', response.url)
    item = l.load_item()
    yield item
    # Runs only after the item is consumed (generator semantics).
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse(self, response):
    """Return the Merval index value as a single-item list.

    The unit is deliberately empty: the Merval is an index level, not a
    percentage.
    """
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Merval")
    rate.add_value("unit", "")
    hxs = HtmlXPathSelector(response)
    # ROBUSTNESS: the original indexed [0] on the selector list, raising
    # IndexError whenever the page layout changes; take the first match
    # only when one exists.
    values = hxs.select(
        "//span[contains(@id,'UltimoMerval')]/text()").extract()
    if values:
        rate.add_value("value", values[0])
    return [rate.load_item()]