def parse(self, response):
    """Yield one LivingSocialDeal item per deal node on the page."""
    hxs = HtmlXPathSelector(response)
    for deal in hxs.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # strip whitespace on the way in, join text fragments on the way out
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parseAPage(self, response):
    """Parse one page of the ASP.NET purchase-orders grid.

    For each data row, yields a FormRequest that fetches the order's
    detail page (carrying the partially-built CompraItem in meta).
    While under MAX_PAGES, also posts back for the previous results page.

    Fix: the original called self.postBackArgs(tr.select('td[9]/a'))
    twice in a row for every row; the redundant second call is removed.
    """
    if self.need_help(response):
        return
    hxs = HtmlXPathSelector(response)
    viewstate = self.getViewState(hxs)
    self.pages += 1
    # Skip the header row of the orders grid.
    for tr in hxs.select(
            '//table[@id="ctl00_ContentPlaceHolder1_gvOrdenes"]/tr[position() > 1]'):
        # __doPostBack arguments of the detail link in column 9.
        detalle = self.postBackArgs(tr.select('td[9]/a'))
        if detalle:
            i = CompraItem()
            l = XPathItemLoader(item=i, selector=tr)
            l.add_xpath('orden_compra', 'td[1]/text()')
            l.add_xpath('fecha', 'td[2]/text()')
            l.add_xpath('importe', 'td[3]/text()')
            l.add_xpath('proveedor', 'td[4]/text()')
            l.add_xpath('destino', 'td[5]/text()')
            l.add_xpath('suministro', 'td[6]/text()')
            l.add_xpath('anio', 'td[7]/text()')
            l.add_xpath('tipo', 'td[8]/text()')
            compra = l.load_item()
            compra['compra_linea_items'] = []
            detalle = self.formdata(viewstate, *detalle)
            req = FormRequest(url, formdata=detalle, callback=self.parseDetalle)
            req.meta['compra'] = compra
            req.meta['viewstate'] = viewstate
            yield req
    # Get previous page
    if self.pages < MAX_PAGES:
        prev = None
        for td in hxs.select('//td[@colspan="9"]//td'):
            args = self.postBackArgs(td.select('a'))
            if args:
                prev = args
            else:
                # The td without a link marks the current page; `prev`
                # holds the postback args of the page just before it.
                # NOTE(review): assumes the current-page cell is never
                # first in the row (prev would be None) — TODO confirm.
                prev = self.formdata(viewstate, *prev)
                req = FormRequest(url, formdata=prev, callback=self.parseAPage)
                yield req
def get_answer(self, selector, response):
    """Load a LazyTweetAnswer item from one answer node.

    Fixes: removed a leftover debug `print` and, critically, a blocking
    `a = input()` pause that froze the crawl on every answer.
    """
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(), selector=selector)
    # The question id is the last path segment of the page URL.
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value(
        'answerer',
        self.get_user(selector.select('.//span[@class="answer-meta"]')))
    answer_loader.add_xpath(
        'answer_content',
        './/span[@class="answer-body"]'
        '//span[@class="answer-status"]//descendant-or-self::text()')
    return answer_loader.load_item()
def parse_full_report(self, response):
    """Yield an NrcScrapedFullReport item for a full-report page.

    lxml cannot handle the encode=WINDOWS-1252 declaration, so the body
    is re-encoded to utf-8 and wrapped in a fresh TextResponse
    (XPathItemLoader requires a Response object).
    """
    text = unicode(response.body, response.encoding)
    utf8_response = TextResponse(url=response.url,
                                 body=text.encode('utf-8'),
                                 encoding='utf-8')
    loader = XPathItemLoader(NrcScrapedFullReport(), response=utf8_response)
    url_parts = urlsplit(response.url)
    loader.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
    loader.add_xpath('full_report_body', '//body')
    loader.add_value('full_report_url', response.url)
    item = loader.load_item()
    reportnum = item['reportnum']
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse(self, response):
    """Get response from start_urls"""
    hxs = HtmlXPathSelector(response)
    for deal in hxs.xpath(self.xpath_for_deals):
        loader = XPathItemLoader(LivingSocial(), selector=deal)
        # normalize whitespace on input, join fragments on output
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # map each item field onto its (stripped) xpath
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath.strip())
        yield loader.load_item()
def search_results(self, response):
    """Parse the incident-report results table, yielding one
    NrcScrapedReport per new row, then follow pagination."""
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    reports = hxs.select('//table[@class="t16Standard"]/tr')
    if len(reports) == 0:
        self.log('Incident report data not present in response', log.ERROR)
    else:
        # First row is the header — drop it.
        reports.pop(0)
        if len(reports) == 0:
            self.log('No incident reports found in response', log.WARNING)
        else:
            self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)
    for report in reports:
        loader = XPathItemLoader(NrcScrapedReport(), report)
        loader.context['base_url'] = response.url
        for name, params in NrcScrapedReport.fields.items():
            loader.add_xpath(name, params['xpath'])
        item = loader.load_item()
        if self.db.reportExists(item['reportnum']):
            self.log('Report {0} already exists. Skipping to next report.'.format(item['reportnum']), log.INFO)
        else:
            # Built but deliberately NOT yielded: full-report and materials
            # downloads are currently disabled (dedup checks to be restored
            # before re-enabling these follow-up requests).
            f_request = Request(
                item['full_report_url'],
                callback=self.parse_full_report)
            m_request = Request(
                item['materials_url'],
                callback=self.parse_materials)
            yield item
            self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')
    # get next page of results
    next_link = hxs.select('//td[@class="pagination"][4]/a/@href')
    if len(next_link) > 0:
        yield Request(urljoin(response.url, next_link[0].extract()),
                      callback=self.search_results)
def parseDetalle(self, response):
    """Parse the line-item detail grid of one purchase order.

    Appends CompraLineaItem rows onto the order carried in
    request.meta['compra']; follows detail-page pagination and yields
    the completed order only once the last page has been processed.
    (Page 253, Orden 2665 has multiple pages.)

    Fixes: removed a duplicated `foundCurrent = False` assignment and a
    second, identical `HtmlXPathSelector(response)` construction.
    """
    if self.need_help(response):
        return
    hxs = HtmlXPathSelector(response)
    viewstate = self.getViewState(hxs, save=False)
    orden_compra = response.request.meta['compra']
    # Skip the header row of the detail grid.
    for tr in hxs.select(
            '//table[@id="ctl00_ContentPlaceHolder1_gvDetalle"]/tr[position() > 1]'):
        i = CompraLineaItem()
        l = XPathItemLoader(item=i, selector=tr)
        l.add_xpath('cantidad', 'td[1]/text()')
        l.add_xpath('unidad_medida', 'td[2]/text()')
        l.add_xpath('detalle', 'td[3]/text()')
        l.add_xpath('importe', 'td[4]/text()')
        x = l.load_item()
        if 'cantidad' in x:
            orden_compra['compra_linea_items'].append(x)
    lastPage = True  # when no paging
    foundCurrent = False
    for td in hxs.select('//td[@colspan="4"]//td'):
        lastPage = False  # only commit in the last page
        args = self.postBackArgs(td.select('a'))
        if not args:  # page with no links: this is the current page marker
            lastPage = True
            foundCurrent = True
        elif foundCurrent:
            # First link after the current page -> request the next page.
            args = self.formdata(viewstate, *args)
            req = FormRequest(url, formdata=args, callback=self.parseDetalle)
            req.meta['compra'] = orden_compra
            yield req
            break
    if lastPage:
        yield orden_compra
def parse(self, response):
    """Yield a TeoniteItem per entry, then follow next-page links."""
    hxs = HtmlXPathSelector(response)
    for entry in hxs.select(self.data_list):
        loader = XPathItemLoader(TeoniteItem(), selector=entry)
        # strip whitespace on input, join text fragments on output
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    # pagination
    for nextp in hxs.select(self.next_page):
        yield response.follow(nextp, callback=self.parse)
def parse(self, response):
    """Yield a detail-page Request per bus-line row, carrying the
    partially-filled loader so the callback can finish the item.

    Fix: the original read request.meta['ida'] / ['volta'] right after
    constructing the Request — nothing had set those keys, so every
    iteration raised KeyError (the old TODO comment acknowledged the
    context was not being carried over). The loader is now passed to
    parse_item via meta; parse_item should add 'ida'/'volta' there and
    call load_item().
    """
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select(self.lista_linhas_xpath):
        loader = XPathItemLoader(LinhaItem(), selector=qxs)
        loader.add_xpath('linha', './td[1]/p//text()')
        loader.add_xpath('nome', './td[3]/p//text()')
        link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
        request = Request(link, callback=self.parse_item)
        # Hand the loader to the detail-page callback instead of reading
        # not-yet-existing meta keys here.
        request.meta['loader'] = loader
        yield request
def parse(self, response):
    """Build one QuestionItem per question summary on the listing page."""
    hxs = HtmlXPathSelector(response)
    for question in hxs.select(self.question_list_xpath):
        loader = XPathItemLoader(QuestionItem(), selector=question)
        loader.add_xpath('title', './/h3/a/text()')
        loader.add_xpath('summary', './/h3/a/@title')
        loader.add_xpath('tags', './/a[@rel="tag"]/text()')
        loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
        loader.add_xpath('posted', './/div[@class="started"]/a[1]/span/@title')
        loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
        loader.add_xpath('answers',
                         './/div[contains(@class, "answered")]/div[1]/text()')
        loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')
        yield loader.load_item()
def get_user(self, selector, response, label):
    """Load a StackOverflowUser item from a user-details node.

    Fix: the original called get_output_value('user_link') three times
    and left an unused `user_id` local; the value is now fetched once.
    (`response` and `label` are kept for interface compatibility.)
    """
    user_loader = XPathItemLoader(item=StackOverflowUser(), selector=selector)
    details_div = './/div[contains(@class, "user-details")]'
    user_loader.add_xpath('user_name', details_div + '/a/text()')
    user_loader.add_xpath('user_link', details_div + '/a/@href')
    user_link = user_loader.get_output_value('user_link')
    if user_link:
        # user_id is (currently) just the profile link itself.
        user_loader.add_value('user_id', user_link)
    return user_loader.load_item()
def parse(self, response):
    """Fill a PersonItem from the search URL's query string plus two
    on-page fields; returns None if any expected parameter is missing."""
    response.body = response.body.replace('\\','').replace('\xa0','')
    loader = XPathItemLoader(item=PersonItem(), response=response)
    # (field, regex) pairs extracted from the request URL, in order
    url_fields = [
        ('first_name', '&qf=(\w+)&'),
        ('middle_name', '&qmi=(\w+)&'),
        ('last_name', '&qn=(\w+)&'),
        ('city', '&qc=(\w+)&'),
        ('state', '&qs=(\w+)&'),
        ('zipcode', '&qz=(\d+)&'),
        ('prop_ref', '&prop_ref=(\d+)'),
    ]
    try:
        for field, pattern in url_fields:
            # IndexError here means the parameter was absent from the URL
            loader.add_value(field, re.findall(pattern, response.url)[0])
        loader.add_xpath('cities', '//div[@class="addresses"]/p/b/text()[1]',
                         re="([^\(]+)")
        loader.add_xpath('age',
                         '//div[@class="greenTopBoxLeft round12_12_0_0"]/p[@class="nameAge"]/text()[2]',
                         re=", Age (\d+)")
    except IndexError:
        pass
    else:
        return loader.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    hxs = HtmlXPathSelector(response)
    # one CrunchBaseEvent per event node
    for event in hxs.select(self.events_list_xpath):
        loader = XPathItemLoader(CrunchBaseEvent(), selector=event)
        # strip whitespace on input, join text fragments on output
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    """Build a SearchResultItem for the current auction search page."""
    hxs = HtmlXPathSelector(response)
    keywords_xpath = "//input[@id='ctl00_ctlPagePlaceHolder_Keywords']/@value"
    item_name = hxs.select(keywords_xpath).extract()
    # Stable id: md5 over auction id, search keywords and site name.
    item_hash = hashlib.md5(
        '%s::%s::%s' % (self.auction_id, item_name, self.name)).hexdigest()
    loader = XPathItemLoader(item=SearchResultItem(), response=response)
    loader.add_value("id", item_hash)
    loader.add_value("auction_id", self.auction_id)
    loader.add_value("site", self.name)
    loader.add_xpath("name", keywords_xpath)
    loader.add_value("link", response.url)
    loader.add_xpath("price", "//td[7]/text()")
    return loader.load_item()
def parse_rental(self, response):
    """Scrape one rental listing page into a RentalItem."""
    loader = XPathItemLoader(item=RentalItem(), response=response)
    loader.add_value('url', response.url)
    # most fields live in simple "<th>Label:</th><td>value</td>" rows
    row = '//th[text()="%s"]/../td/text()'
    loader.add_xpath('address', row % 'Address:')
    loader.add_xpath('price', '//th[text()="Price:"]/../td/div/text()')
    loader.add_xpath('price_period', '//th[text()="Price:"]/../td/div/span/text()')
    loader.add_xpath('bedrooms', row % 'Bedrooms:')
    # "2, 1" style cell: full baths before the comma, powder rooms after
    loader.add_xpath('bathrooms', row % 'Bathrooms:', re=r'(\d+)')
    loader.add_xpath('powder_rooms', row % 'Bathrooms:', re=r', (\d+)')
    loader.add_xpath('property_type', row % 'Property type:')
    loader.add_xpath('size', row % 'Size:', re=r'([\d|,]+) sqft')
    loader.add_xpath('lot', row % 'Lot:')
    loader.add_xpath('year_built', row % 'Year built:')
    loader.add_xpath('lease_term', row % 'Terms of lease:')
    loader.add_xpath('pets_allowed', row % 'Pets:')
    loader.add_xpath('date_listed', row % 'Added on Trulia:')
    loader.add_xpath('mls_id', row % 'MLS/ID:')
    loader.add_xpath('descriptive_title', '//h2[@class="descriptive_title"]/text()')
    loader.add_xpath('description', '//div[@class="listing_description_module"]/text()')
    loader.add_xpath('additional_fields',
                     'id("property_listing_details_module")/ul/li/span/text()')
    loader.add_xpath('public_records',
                     'id("property_public_info_module")/ul/li/span/text()')
    return loader.load_item()
def parse_item(self, response):
    """Scrape one Google Play app detail page into an AppItem.

    Fix: removed the stray debug `print response.url` left in the
    original (noisy on every page of a production crawl).
    """
    sel = Selector(response)
    app_loader = XPathItemLoader(item=AppItem(), selector=sel)
    # app id is derived from the page URL
    app_loader.add_value('app_id', parse_id(response.url))
    app_loader.add_xpath(
        'title', '//div[contains(@class, "document-title")]//text()')
    app_loader.add_xpath(
        'description', '//div[contains(@class, "id-app-orig-desc")]//text()')
    app_loader.add_xpath('score', '//meta[@itemprop="ratingValue"]//@content')
    app_loader.add_xpath(
        'icon_url',
        '//div[contains(@class, "details-info")]//img[contains(@class, "cover-image")]/@src'
    )
    app_loader.add_xpath(
        'author', '//div[@itemprop="author"]//span[@itemprop="name"]//text()')
    app_loader.add_xpath(
        'app_type',
        '//div[contains(@class, "details-info")]//span[@itemprop="genre"]/text()'
    )
    # recommendation clusters: position()=1 holds similar apps,
    # position()=2 holds more apps from the same developer
    app_loader.add_xpath(
        'similarity',
        '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=1]//div[contains(@class, "card")]/@data-docid'
    )
    app_loader.add_xpath(
        'more_from_devs',
        '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=2]//div[contains(@class, "card")]/@data-docid'
    )
    return app_loader.load_item()
def myparse(self, response):
    """Scrape the "detailed" section into MoviesClass items and save the
    raw page to disk under the first title.

    Fixes over the original:
    - the dump file is written via `with` so the handle is closed
      (it was previously leaked by `open(...).write(...)`)
    - removed the dead `Join()` default output processor that was
      immediately overwritten by `TakeFirst()`
    - removed the unused `x = deal.select(field).extract()` per-field call
    - removed leftover debug prints
    """
    hxs = HtmlXPathSelector(response)
    detailed = hxs.select('//div[@id="detailed"]')
    titles = detailed.select('.//div[@class="title4"]/a/text()').extract()
    # Save the raw page for later inspection, named after the first title.
    with open(titles[0].strip() + '.html', 'wb') as dump:
        dump.write(response.body)
    for deal in detailed:
        loader = XPathItemLoader(MoviesClass(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = TakeFirst()
        for field, xpath in self.mov_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_movie_info(self, response):
    """Scrapes movie information"""
    self.log("Parsing Movie Info")
    hxs = HtmlXPathSelector(response)
    details = hxs.select('//div[@class="maindetails"]')
    item = MovieItem()
    # url is set directly; the loader handles the remaining attributes
    item['url'] = response.url
    loader = XPathItemLoader(item=item, selector=details)
    loader.add_xpath('title', './/h1/text()')
    loader.add_xpath('release_date',
                     './/h5[text()="Release Date:"]'
                     '/following-sibling::div/text()')
    loader.add_xpath('tagline',
                     './/h5[text()="Tagline:"]'
                     '/following-sibling::div/text()')
    yield loader.load_item()
def parse_category(self, response):
    """Yield a GoogledirItem for every website listed on a directory page."""
    hxs = HtmlXPathSelector(response)
    # <font> cells in the td following each pagerank cell hold the
    # site's name, link and description
    site_xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'
    for site in hxs.select(site_xpath):
        loader = XPathItemLoader(item=GoogledirItem(), selector=site)
        loader.add_xpath('name', 'a/text()')
        loader.add_xpath('url', 'a/@href')
        loader.add_xpath('description', 'font[2]/text()')
        yield loader.load_item()
def get_answer(self, selector, response):
    """Load a StackOverflowAnswer item from one answer node."""
    loader = XPathItemLoader(item=StackOverflowAnswer(), selector=selector)
    loader.add_xpath(
        'answer_content',
        ".//td[@class='answercell']/div[@class='post-text']/p/text()")
    loader.add_xpath('answer_id', "./@data-answerid")
    loader.add_xpath('marks',
                     ".//span[contains(@class, 'vote-count-post')]/text()")
    # the accepted answer carries "accepted-answer" in its class attribute
    css_class = selector.select('./@class').extract()[0]
    if css_class.find('accepted-answer') != -1:
        loader.add_value('is_best_answer', 1)
    else:
        loader.add_value('is_best_answer', 0)
    # attach the answering user
    loader.add_value('answerer', self.get_user(selector, response, 'answer'))
    return loader.load_item()
def parse_materials(self, response):
    """Yield one NrcScrapedMaterial item per row of the materials table."""
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    if len(materials) == 0:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # First row is the header — drop it.
        materials.pop(0)
        if len(materials) == 0:
            self.log('No incident reports found in response', log.INFO)
        else:
            self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)
    for material in materials:
        loader = XPathItemLoader(NrcScrapedMaterial(), material)
        # report number is embedded in the page URL after "P3_SEQNOS:"
        loader.add_value('reportnum', response.url, TakeFirst(), re='P3_SEQNOS:(\d+)')
        for name, params in NrcScrapedMaterial.fields.items():
            if 'xpath' in params:
                loader.add_xpath(name, params['xpath'])
        item = loader.load_item()
        yield item
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    hxs = HtmlXPathSelector(response)
    # one LivingSocialDeal per deal node
    for deal in hxs.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # strip whitespace on input, join text fragments on output
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def scrape_content_items(self, response):
    """Scrape one page of the FracFocus document grid, yielding only the
    records not already stored in the database."""
    hxs = HtmlXPathSelector(response)
    stats = self.crawler.stats
    page_num = hxs.select(
        '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value'
    ).extract()
    if page_num:
        page_num = page_num[0]
        self.log('%s Scraping page %s' % (response.meta['cookiejar'], page_num),
                 log.INFO)
    else:
        self.log('%s No page number found' % (response.meta['cookiejar']),
                 log.WARNING)
    stats.inc_value('_pages', spider=self)
    rows = hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr')
    for row in rows:
        loader = XPathItemLoader(FracFocusScrape(), row)
        # truncate state/county to 20 chars (DB column width)
        loader.state_in = lambda slist: [s[:20] for s in slist]
        loader.county_in = lambda slist: [s[:20] for s in slist]
        for name, params in FracFocusScrape.fields.items():
            loader.add_xpath(name, params['xpath'])
        item = loader.load_item()
        if item.get('api'):
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                yield item
    if not stats.get_value('_existing_count') and not stats.get_value('_new_count'):
        self.log('%s No records found' % (response.meta['cookiejar']), log.WARNING)
def parse_full_report(self, response):
    """Yield an NrcScrapedFullReport item for a full-report page.

    Works around an lxml bug with encode=WINDOWS-1252 pages by
    re-encoding the body to utf-8 inside a fresh TextResponse
    (XPathItemLoader requires a Response object).
    """
    reportnum = response.request.meta['reportnum']
    text = unicode(response.body, response.encoding)
    # Near-empty response: bail out now, retry on the next crawl.
    if len(text) < 1000:
        return
    utf8_response = TextResponse(url=response.url,
                                 body=text.encode('utf-8'),
                                 encoding='utf-8')
    loader = XPathItemLoader(NrcScrapedFullReport(), response=utf8_response)
    url_parts = urlsplit(response.url)
    loader.add_value('reportnum', reportnum)
    loader.add_xpath('full_report_body', '//body')
    loader.add_value('full_report_url', response.url)
    item = loader.load_item()
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def get_question(self, selector, response):
    """Load a StackOverflowQuestion item for the question on this page."""
    hxs = HtmlXPathSelector(response)
    number_of_answers = hxs.select(
        '//div[@id="answers"]'
        '//div[contains(@class, "answers-subheader")]'
        '/h2/text()').extract()
    loader = XPathItemLoader(item=StackOverflowQuestion(), selector=selector)
    loader.add_xpath(
        'question_content',
        ".//td[@class='postcell']"
        "//div[@class='post-text']/p/text()")
    loader.add_xpath(
        'question_tags',
        ".//div[@class='post-taglist']"
        "//a[@class='post-tag']/text()")
    loader.add_xpath('question_id', './@data-questionid')
    loader.add_xpath('marks',
                     ".//span[contains(@class, 'vote-count-post')]/text()")
    loader.add_value('asker', self.get_user(selector, response, 'question'))
    # subheader reads e.g. "3 Answers" — take the leading count
    loader.add_value('number_of_answers',
                     int(number_of_answers[0].strip().split(' ')[0]))
    question_title = hxs.select(
        '//div[contains(@id, "question-header")]'
        '//a[contains(@class, "question-hyperlink")]/text()').extract()
    loader.add_value('question_title', question_title)
    return loader.load_item()
def parse(self, response):
    """Return one TeamItem per row of the standings table."""
    hxs = HtmlXPathSelector(response)
    teams = hxs.select('//tbody/tr')  # one row per team
    season = hxs.select('//small/text()').extract()
    day = hxs.select('//h3/text()').extract()
    items = []
    for index, team in enumerate(teams):
        loader = XPathItemLoader(item=TeamItem(), response=response,
                                 selector=team)
        # the team name cell may or may not wrap the name in an anchor
        loader.add_xpath('name', 'td[@class="equipo"]/text()')
        loader.add_xpath('name', 'td[@class="equipo"]/a/text()')
        loader.add_value('season', season)
        loader.add_value('day', day)
        # table order == league position (1-based)
        loader.add_value('position', str(index + 1))
        for stat in ('pj', 'pg', 'pe', 'pp', 'gf', 'gc'):
            loader.add_xpath(stat, 'td[@class="%s"]/text()' % stat)
        loader.add_xpath('points', 'td[@class="pts seleccionado"]/text()')
        items.append(loader.load_item())
    return items
def parse_product(self, response):
    """Gather name, description, price and image urls for one product."""
    loader = XPathItemLoader(item=Product(), response=response)
    loader.add_xpath('name', XPATHS['product']['name'])
    loader.add_xpath('description', XPATHS['product']['description'])
    # the price can appear at several alternative locations
    for price_xpath in XPATHS['product']['prices']:
        loader.add_xpath('price', price_xpath)
    loader.add_xpath('image_urls', XPATHS['product']['image_urls'],
                     re='\'(.*?)\'')
    return loader.load_item()
def parse_doctor(self, response):
    """Scrape one haodf.com doctor page into a DoctorDetailItem, then
    request the doctor's thank-you-letter page.

    The page is partially rendered via BigPipe, so the "about" block may
    live either in the DOM or inside an inline onPageletArrive() JS call.
    """
    response_url = response.url
    doctor_id = re.search('doctor/([^\.]*)\.htm', response_url).group(1)
    hxs = Selector(response)

    # --- doctor name ---
    name_list = hxs.xpath("//input[@name='doctor_name']/@value")
    doctor_name = ''
    if len(name_list) != 0:
        doctor_name = name_list[0].extract()

    # --- hospital and department, parsed out of the keywords meta tag ---
    hospital_department_selectors = hxs.xpath("//meta[@name='keywords']/@content")
    hospital = ''
    department = ''
    if len(hospital_department_selectors) != 0:
        keywords = hospital_department_selectors[0].extract()
        hospital_re = r',(?P<hospital>.*?)' + doctor_name
        hospital_match = re.search(hospital_re, keywords)
        if hospital_match != None:
            hospital = hospital_match.group('hospital')
        department_re = hospital + r'(?P<department>.*?)' + doctor_name + ','
        department_match = re.search(department_re, keywords)
        if department_match != None:
            department = department_match.group('department')

    # --- title, parsed out of the description meta tag ---
    title = ''
    title_selectors = hxs.xpath('//meta[@name="description"]/@content')
    if len(title_selectors) != 0:
        title_re_str = doctor_name + r'(?P<doctor_title>.*?)' + u'简介'
        title = re.search(title_re_str, title_selectors[0].extract()).group(1)

    # --- "about" block: in the DOM, or delivered via BigPipe JS ---
    doctor_about_dict = None
    tag_doctor_about_selectors = hxs.xpath(
        '//div[@id="bp_doctor_about"]/div[@class="doctor_about"]')
    if len(tag_doctor_about_selectors) != 0:
        doctor_about_dict = self.parse_doctor_about(tag_doctor_about_selectors)
    else:
        doctor_about_match_list = hxs.xpath(
            '//script[@type="text/javascript"]/text()').re(
            'BigPipe.onPageletArrive\((?P<doctor_about>\{"id":"bp_doctor_about".*\})\);')
        if doctor_about_match_list:
            da_dict = json.loads(doctor_about_match_list[0])
            if 'content' in da_dict:
                doctor_about_hxs = Selector(HtmlResponse(
                    url=response.url,
                    body=da_dict['content'].encode('utf-8')))
                doctor_about_dict = self.parse_doctor_about(doctor_about_hxs)

    # --- weekly schedule: skip the header row (day_part 0) ---
    doctor_schedule = []
    trs = hxs.xpath("//table[@class='doctortimefrom1']/tr")
    day_part = 0
    for itr in trs:
        if 0 != day_part:
            doctor_schedule.extend(self.weekday_operation(itr, day_part))
        # morning row first, then later day parts
        day_part += 1

    # (A disabled disease-list extraction block was removed here; it
    # mirrored the BigPipe fallback above for the "getvote" pagelet.)

    zanwu_re = re.compile(u'暂无')  # "none available" marker
    empty_sub_re = re.compile(r'(<!--.*?-->|\n|\t|\r|[ ])')
    item = XPathItemLoader(DoctorDetailItem(), hxs)
    item.add_value('doctor_id', doctor_id)
    if doctor_name:
        item.add_value('_name', doctor_name)
    if response.meta['city']:
        item.add_value('city', response.meta['city'])
    if hospital:
        item.add_value('hospital', hospital)
    if department:
        item.add_value('department', department)
    if title:
        item.add_value('title', title)
    if doctor_schedule:
        item.add_value('schedule', doctor_schedule)
    else:
        if len(hxs.xpath('//table[@class="doctortimefrom1"]')) == 0:
            for content in hxs.xpath('//script[@type="text/javascript"]/text()').extract():
                if content.find('doctortimefrom1') != -1:
                    item.add_value('schedule', '')  # shouldn't exist in js
                    break
    if doctor_about_dict:
        if 'image_url' in doctor_about_dict:
            item.add_value('image', doctor_about_dict['image_url'])
        if 'bio' in doctor_about_dict:
            bio = doctor_about_dict['bio']
            if zanwu_re.search(bio) != None:
                bio = ''
            if bio:
                item.add_value('bio', empty_sub_re.sub('', bio))
        if 'feature' in doctor_about_dict:
            feature = doctor_about_dict['feature']
            if zanwu_re.search(feature) != None:
                feature = ''
            if feature:
                item.add_value('feature', empty_sub_re.sub('', feature))
    yield item.load_item()

    # Follow up with the doctor's thank-you-letter page, seeding the
    # accumulators the letter callback expects in meta.
    url = u'http://www.haodf.com/doctor/' + doctor_id + u'/jingyan/1.htm'
    l = LetterItem()
    l['doctor_id'] = doctor_id
    letter = []
    disease_item = DoctorDiseaseItem()
    disease_item['doctor_id'] = doctor_id
    req = Request(url, callback=self.parse_letter)
    req.meta['item'] = l
    req.meta['letter'] = letter
    req.meta['disease'] = disease_item
    yield req
def parse(self, response):
    """Scrape a TCAD parcel page into a TCADParcelItem, including its
    nested improvement, segment and value-history tables."""
    # strip backslashes and non-breaking spaces before parsing
    response.body = response.body.replace('\\', '').replace('\xa0', '')
    parcel = XPathItemLoader(item=TCADParcelItem(), response=response)
    parcel.add_value('url', response.url)
    parcel.add_xpath(
        'prop_id',
        '//font[text()="Property ID Number:"]/../../td[3]/font/b/text()')
    parcel.add_xpath(
        'owner',
        '//td[text()="Owner\'s Name"]/../td[@class="reports_blacktxt"]/font/b/text()'
    )
    parcel.add_xpath(
        'owner_address',
        '//td[text()="Owner\'s Name"]/../../tr[2]/td[2]/text()')
    parcel.add_xpath(
        'address',
        '//td[text()="Owner\'s Name"]/../../tr[3]/td[2]/text()')
    parcel.add_xpath(
        'land_value',
        '//font[text()="Land Value"]/../../td[@class="reports_blacktxt"]/p/text()'
    )
    parcel.add_xpath(
        'improvement_value',
        '//font[text()="Improvement Value"]/../../td[@class="reports_blacktxt"]/p/text()'
    )
    parcel.add_xpath(
        'market_value',
        '//font[text()="Total Value"]/../../td[@class="reports_blacktxt"]/p/text()'
    )
    parcel.add_xpath(
        'acreage',
        '//font[text()="Land Acres"]/../../td[@class="reports_blacktxt"]/p/text()'
    )
    parcel.add_xpath(
        'neighborhood',
        '//font[text()="Neighborhood Code"]/../../td[@class="reports_blacktxt"]/text()'
    )
    parcel.add_xpath(
        'improvement_area',
        '//font[text()="Total Living Area"]/../../td[2]//b/text()')

    # Each helper re-parses one extracted <tr> as its own tiny response.
    def improvement(text, url):
        # one TCADImprovementItem per improvements-table row
        row_response = http.TextResponse(url=url, body=str(text))
        i = XPathItemLoader(item=TCADImprovementItem(), response=row_response)
        i.add_xpath('id', '//td[1]/text()')
        i.add_xpath('state_category', '//td[2]/text()')
        i.add_xpath('description', '//td[3]/text()')
        return i.load_item()

    def segment(text, url):
        # one TCADSegmentItem per segments-table row
        row_response = http.TextResponse(url=url,
                                         body=str(text.replace(u'\xa0', '')))
        s = XPathItemLoader(item=TCADSegmentItem(), response=row_response)
        s.add_xpath('improvement_id', '//td[1]/text()')
        s.add_xpath('id', '//td[2]/text()')
        s.add_xpath('type_code', '//td[3]/text()')
        s.add_xpath('description', '//td[4]/text()')
        s.add_xpath('klass', '//td[5]/text()')
        s.add_xpath('year_built', '//td[6]/text()')
        s.add_xpath('area', '//td[7]/text()')
        return s.load_item()

    def history(text, url):
        # one TCADValueHistoryItem per history-table row
        row_response = http.TextResponse(url=url,
                                         body=str(text.replace(u'\xa0', '')))
        h = XPathItemLoader(item=TCADValueHistoryItem(), response=row_response)
        h.add_xpath('year', '//td[1]/text()')
        h.add_xpath('value', '//td[4]/text()')
        return h.load_item()

    hxs = HtmlXPathSelector(response)
    values = hxs.select(
        '//font[text()="Improvement ID"]/../../../../tr[position()>1]'
    ).extract()
    parcel.add_value('improvements',
                     map(improvement, values, [response.url] * len(values)))
    values = hxs.select(
        '//font[text()="Imp ID"]/../../../../tr[position()>1 and position()<last()]'
    ).extract()
    parcel.add_value('segments',
                     map(segment, values, [response.url] * len(values)))
    values = hxs.select(
        '//td[text()="Certified Value History"]/../../../..//td[@colspan="5"]/following::tr[1]'
    ).extract()
    parcel.add_value('historical_values',
                     map(history, values, [response.url] * len(values)))
    return parcel.load_item()
def parse_item(self, response):
    """Collect every <img src> on the page into an ImageItem."""
    loader = XPathItemLoader(item=ImageItem(), response=response)
    loader.add_xpath('image_urls', '//img/@src')
    return loader.load_item()