def parse_showdesk_members_treat(self, resp):
    """Scrape the member "treat" (service-times) table and fan out pagination.

    On page 1, reads the total page count from the pager and yields one
    FormRequest per remaining page (each re-enters this callback).  Then, for
    the current page, yields one SentreeMemberTreatItem per table row, with
    the column headers in 'hs' and the row's cell texts in 'vals'.

    :param resp: response carrying meta['page'] (current page number) and
                 meta['r'] (session/request token forwarded to the server)
    """
    hxs = Selector(resp)
    next_page_nodes = hxs.xpath('//a[@class="next_page"]')
    meta = resp.meta
    # Only the first page schedules the remaining pages, so each page is
    # requested exactly once.
    if next_page_nodes and meta['page'] == 1:
        next_page_node = next_page_nodes[0]
        # The last <li> before the "next page" link holds the final page number.
        total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
        for i in xrange(2, int(total_page) + 1):
            # Copy meta so each request carries its own page number.
            new_meta = dict(meta)
            new_meta['page'] = i
            self.log('%s yield member list page %d' % (self.name, i))
            yield FormRequest(url="http://vip6.sentree.com.cn/shair/timesItem!initTreat.action", formdata={
                'page.currNum' : str(i),
                'page.rpp' : '30',
                'r' : str(meta['r']),
                'set' : 'manage'
            }, callback=self.parse_showdesk_members_treat, meta=new_meta)
    treat_info_tabs = hxs.xpath('//div[@class="page_main"]//div[@class="table-responsive"]/table')
    if not treat_info_tabs:
        # NOTE(review): yielding None before return appears to be this
        # spider's "no data" convention — confirm downstream handles None.
        yield None
        return
    treat_info_tab = treat_info_tabs[0]
    # Header texts, stripped of whitespace (incl. a full-width space).
    ths = str_list_strip_replace(treat_info_tab.xpath('./thead/tr/th/child::text()').extract(), [' ', '\t', '\n', ' '])
    info_nodes = treat_info_tab.xpath('./tbody/tr')
    for i_n in info_nodes:
        infos = []
        info_tds = i_n.xpath('./td')
        for i_t in info_tds:
            # Concatenate all text fragments of the cell into one string.
            info = ''.join(str_list_strip_replace(i_t.xpath('.//child::text()').extract(), [' ', '\t', '\n', ' ']))
            infos.append(info)
        item = SentreeMemberTreatItem()
        item['hs'] = ths
        item['vals'] = infos
        yield item
def parse_item(self, response: Response, selector: Selector):
    """Build one ProductItem from a single product node.

    Collects the <dt>/<dd> parameter pairs into a dict (skipping values
    that contain a newline), then fills a ProductLoader with XPath-driven
    fields plus crawl metadata, and returns the loaded item.

    Any exception is caught and printed, in which case None is returned.
    """
    try:
        names = selector.xpath('.//dt/text()').extract()
        values = selector.xpath('.//dd/text()').extract()
        params = {}
        for raw_name, raw_value in zip(names, values):
            # Multi-line values are layout noise, not real parameters.
            if '\n' in raw_value:
                continue
            params[raw_name.strip(': ')] = raw_value.strip(' ')
        loader = ProductLoader(item=ProductItem(), response=response, selector=selector)
        # Field name -> XPath query, applied in order.
        xpath_fields = (
            ('name', './/*[@itemprop="name"]/text()'),
            ('category', '//*[@id="main"]/h1/text()'),
            ('link', './/*[@itemprop="name"]/../@href'),
            ('price', './/*[@class="price" or @class="price action_special"]/text()'),
            ('price_old', './/*[@class="price-old"]/text()'),
            ('rating', './/*[@class="oh-rating"]/text()'),
        )
        for field, query in xpath_fields:
            loader.add_xpath(field, query)
        loader.add_value('params', params)
        loader.add_value('where_found', response.request.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('parse_datetime', datetime.datetime.now())
        return loader.load_item()
    except Exception as e:
        print(e)
def parse_showdesk_membercards(self, resp):
    """Scrape the member-card table and yield one SentreeMemberCardItem per row.

    The item's 'info' is an OrderedDict mapping column header -> joined cell
    text.  Columns are special-cased by index: the first and the
    second-to-last columns are skipped; the last column joins only direct
    text children; column 3 is joined without a separator.
    """
    hxs = Selector(resp)
    headers = hxs.xpath('//form[@id="cardTypeForm"]//table/thead/tr/th/child::text()').extract()
    if not headers:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        # NOTE(review): yield None appears to be this spider's "no data"
        # convention — confirm downstream handles None.
        yield None
        return
    employee_nodes = hxs.xpath('//form[@id="cardTypeForm"]//table/tbody/tr')
    if not employee_nodes:
        self.log('%s can not find member card info' % self.name, level=log.ERROR)
        yield None
        return
    for e_n in employee_nodes:
        info_nodes = e_n.xpath('td')
        info = OrderedDict({})
        for idx, i_n in enumerate(info_nodes):
            # Skip the first column and the second-to-last column
            # (presumably checkbox/action columns — verify against page).
            if idx == 0 or idx == len(info_nodes) - 2:
                continue
            if idx == len(info_nodes) - 1:
                # Last column: only immediate text children.
                info[headers[idx]] = ' | '.join(str_list_strip_replace(i_n.xpath('./child::text()').extract(), [' ', '\t', '\n', ' ']))
                continue
            sep = ' | '
            if idx == 3:
                # Column 3's fragments form one value; join without separator.
                sep = ''
            info[headers[idx]] = sep.join(str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()), [' ', '\t', '\n', ' ']))
        item = SentreeMemberCardItem()
        item['info'] = info
        # items.append(info)
        yield item
def parse_showdesk_members2(self, resp):
    """Scrape the member list, fan out pagination, and follow into overdrafts.

    On page 1, reads the total page count from the pager and yields one
    FormRequest per remaining page.  For every member row it builds a
    SentreeMembersSimpleItem; if the row exposes an "edit member" onclick
    query string, the item is forwarded (via meta['item']) to
    parse_member_overdraft for debt lookup, otherwise it is yielded
    directly with overdraft '0.0'.
    """
    hxs = Selector(resp)
    next_page_nodes = hxs.xpath('//a[@class="next_page"]')
    meta = resp.meta
    # Only the first page schedules the remaining pages.
    if next_page_nodes and meta['page'] == 1:
        next_page_node = next_page_nodes[0]
        # The last <li> before the "next page" link holds the final page number.
        total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
        for i in xrange(2, int(total_page) + 1):
            new_meta = dict(meta)
            new_meta['page'] = i
            self.log('%s yield member list page %d' % (self.name, i))
            yield FormRequest(url="http://vip6.sentree.com.cn/shair/memberInfo!memberlist.action", formdata={
                'page.currNum' : str(i),
                'page.rpp' : '30',
                'r' : str(meta['r']),
                'set' : 'manage'
            }, callback=self.parse_showdesk_members2, meta=new_meta)
    member_nodes = hxs.xpath('//form[@id="delForm"]//table/tbody/tr')
    if member_nodes:
        for m_n in member_nodes:
            member_tds = m_n.xpath('td')
            info_query_str = None
            try:
                phone = member_tds[1].xpath('a/child::text()').extract()[0].replace(' ', '').strip()
                name = member_tds[2].xpath('span/child::text()').extract()[0].replace(' ', '').strip()
                card_no = member_tds[6].xpath('table/tr/td[1]/a/child::text()').extract()[0].replace(' ', '').strip()
                # Extract the query string between '?' and the closing quote
                # of the onclick handler, e.g. "open('...?a=1&b=2')".
                info_query_str = member_tds[6].xpath('table/tr/td[1]/a/@onclick').extract()[0]
                info_query_str = info_query_str[info_query_str.find('?') + 1:]
                info_query_str = info_query_str[:info_query_str.find("'")]
                card_name = member_tds[6].xpath('table/tr/td[2]/child::text()').extract()[0].replace(' ', '').strip()
                card_type = member_tds[6].xpath('table/tr/td[3]//child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                discont = member_tds[6].xpath('table/tr/td[4]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                timeout = member_tds[6].xpath('table/tr/td[9]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                overage = str_list_strip_replace(member_tds[6].xpath('table/tr/td[7]//child::text()').extract(), [' ', ' ', '\t', '\n'])
            except:
                # Malformed row: log the traceback and skip it.
                self.log(traceback.format_exc())
                continue
            mem_item = SentreeMembersSimpleItem()
            mem_item[u'phone'] = phone
            mem_item[u'name'] = name
            mem_item[u'card_no'] = card_no
            mem_item[u'card_name'] = card_name
            mem_item[u'card_type'] = card_type
            mem_item[u'discont'] = discont
            mem_item[u'timeout'] = timeout
            mem_item[u'overage'] = overage
            if info_query_str:
                new_meta = dict(meta)
                new_meta['item'] = mem_item
                # NOTE(review): time.time() is appended directly to the query
                # string with no separator — presumably a cache-buster the
                # server tolerates; verify.
                yield Request(url='http://vip6.sentree.com.cn/shair/memberArchives!editMember.action?%s%d' % (info_query_str, time.time()), callback=self.parse_member_overdraft, meta=new_meta)
            else:
                mem_item['overdraft'] = '0.0'
                yield mem_item
def parse_item(self, response):
    """Extract a BuscapeItem (title, url, attribute list) from a product page.

    Each entry under the product-details list contributes one
    {"key": ..., "value": ...} dict to the 'attributes' field.
    """
    sel = Selector(response)
    item = BuscapeItem()
    item["title"] = sel.xpath('//h1[@class="name"]/text()').extract()[0]
    item["url"] = response.url
    detail_rows = sel.xpath('//*[@class="product-details"]/ul/li')
    # key is the first matched name text; value keeps the full extracted list.
    item["attributes"] = [
        {
            "key": row.xpath('span[@class="name"]/text()').extract()[0],
            "value": row.xpath('span[@class="value"]/text()').extract(),
        }
        for row in detail_rows
    ]
    return item
def parse(self, response):
    """Parse a search-result listing (Python 2 spider) into Topic_Items.

    For each result <li>: extracts title, content, author, URL and a post
    time decoded from the listing text, then follows the topic URL to
    parse_torrent with the partially-filled item in meta.
    """
    topic_id = response.meta[ 'topic_id' ]
    sel = Selector(text=response.body, type="html")
    print 'starting'
    topic_lists = sel.xpath('//ul[re:test(@id,"results")]/li')
    for topic in topic_lists:
        topic_item = Topic_Item()
        # Re-wrap the fragment so relative XPaths start from this <li>.
        temp_sel = Selector(text=topic.extract())
        topic_item['topic_id'] = topic_id
        title = temp_sel.xpath('//h3/a')[0].extract()
        title = self.parse_html_content(title)
        print title
        topic_item['topic_title']=title
        content = temp_sel.xpath('//p')[0].extract()
        content = self.parse_html_content(content).strip()
        print type(content)
        print content.encode('gbk','ignore')
        topic_item['topic_content']=content
        ttime = temp_sel.xpath('//span[re:test(@class,"green stat")]/text()').extract()[0]
        # repr() of the unicode time token exposes escapes like \u5e74,
        # which are matched below as hex fragments.
        tt = ttime.split()[1].__repr__()
        print tt
        now = datetime.datetime.now()
        if '5e74' in tt:
            # u'\u5e74' ("year") present: absolute date -> "YYYY-M-D 00:00:00" string.
            time_pa = re.findall(self.time_1_pa,ttime.split()[1])[0]
            new_time = str(time_pa[0])+'-'+str(time_pa[1])+'-'+str(time_pa[2])+' '+'00:00:00'
            print time_pa
        elif '5206' in tt:
            # u'\u5206' ("minute"): relative, N minutes ago -> datetime.
            time_pa = re.findall(self.time_2_pa,ttime.split()[1])[0]
            new_time = now - datetime.timedelta(minutes=int(time_pa))
            print time_pa
        elif '5c0f' in tt:
            # u'\u5c0f' (start of "hour"): relative, N hours ago -> datetime.
            time_pa = re.findall(self.time_2_pa,ttime.split()[1])[0]
            new_time = now - datetime.timedelta(hours=int(time_pa))
            print time_pa
        # NOTE(review): if none of the branches match, new_time is unbound
        # (NameError) — and the first branch yields a str while the others
        # yield datetime; downstream must accept both.  Verify.
        print new_time
        topic_item['topic_post_time']= new_time
        poster = ttime.split()[0]
        topic_item['topic_author'] = poster
        url = temp_sel.xpath('//h3/a/@href').extract()[0]
        print url
        topic_item['topic_url']=url
        yield scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})
        print '++++++++++++++++++++++++++++++'
def parse(self, response):
    """Parse a forum search page (Python 2, BeautifulSoup) into Topic_Items.

    Mixes BeautifulSoup navigation (titles, links, reply counts) with
    Scrapy Selector XPath on the prettified fragment (post time, author,
    homepage), then follows each topic URL to parse_torrent.
    """
    topic_kws = response.meta[ 'topic_kws' ]
    all_content = BeautifulSoup(response.body,'html5lib')
    topic_lists = all_content.find_all('li',class_="pbw")
    for topic in topic_lists:
        topic_item = Topic_Item()
        topic_item['topic_db_message'] = topic_kws
        # Selector over the prettified fragment for the XPath-based fields.
        temp_sel = Selector(text=topic.prettify(), type="html")
        title = topic.find_all("a")[0].get_text()
        # print title
        topic_item['topic_title']=title
        url = topic.find_all("a")[0].get('href')
        print url
        topic_item['topic_url']=url
        # Second <p> holds the excerpt; the first is metadata.
        topic_content = topic.find_all("p")[1].get_text()
        print topic_content
        topic_item['topic_content']=topic_content
        post_time = temp_sel.xpath('//p/span/text()')[0].extract().strip()
        print post_time
        topic_item['topic_post_time']=post_time
        author = temp_sel.xpath('//p/span/a/text()')[0].extract().strip()
        # print author
        topic_item['topic_author']=author
        reply_msg = topic.find_all('p',class_='xg1')[0]
        # reply_pattern yields (reply_count, read_count); read_num is unused
        # beyond extraction.
        msg = re.findall(self.reply_pattern,reply_msg.get_text())[0]
        print msg
        reply_num = msg[0]
        read_num = msg[1]
        topic_item['topic_reply']=reply_num
        homepage = temp_sel.xpath('//p/span/a/@href').extract()[0]
        user_id = re.findall(self.userid_pa,homepage)[0]
        print user_id
        topic_item['poster_id']=user_id
        topic_item['homepage'] = homepage
        print '+++++++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})
def parse_quick_facts(self, selector: Selector, quest: Quest):
    """Parse the quick-facts box of a wowhead quest page for the quest giver.

    Looks for a "Start: <a ...>" fragment; when found, stores the NPC name
    and its absolute link on the quest item, otherwise stores "Unknown"
    for both fields.

    :param selector: selector of the quick facts section
    :param quest: quest item to store gathered info in
    :return: None (the quest item is mutated in place)
    """
    matches = selector.re(r"Start:\s(.*</a>)")
    if not matches:
        quest["npc"] = "Unknown"
        quest["npc_link"] = "Unknown"
        return
    anchor = Selector(text=matches[0])
    quest["npc"] = anchor.xpath("//a/text()").get()
    quest["npc_link"] = self.base_url + anchor.xpath("//a/@href").get()
def parse(self, response):
    """Parse a forum search page (Python 2, BeautifulSoup) into Topic_Items.

    Variant of the sibling parse() above with most debug prints commented
    out; behavior is otherwise the same: BeautifulSoup for titles/links/
    reply counts, Selector XPath for time/author/homepage, then a request
    to parse_torrent per topic.
    """
    topic_kws = response.meta["topic_kws"]
    all_content = BeautifulSoup(response.body, "html5lib")
    topic_lists = all_content.find_all("li", class_="pbw")
    for topic in topic_lists:
        topic_item = Topic_Item()
        topic_item["topic_db_message"] = topic_kws
        # Selector over the prettified fragment for the XPath-based fields.
        temp_sel = Selector(text=topic.prettify(), type="html")
        title = topic.find_all("a")[0].get_text()
        # print title
        topic_item["topic_title"] = title
        url = topic.find_all("a")[0].get("href")
        print url
        topic_item["topic_url"] = url
        # Second <p> holds the excerpt; the first is metadata.
        topic_content = topic.find_all("p")[1].get_text()
        # print topic_content
        topic_item["topic_content"] = topic_content
        post_time = temp_sel.xpath("//p/span/text()")[0].extract().strip()
        print post_time
        topic_item["topic_post_time"] = post_time
        author = temp_sel.xpath("//p/span/a/text()")[0].extract().strip()
        # print author
        topic_item["topic_author"] = author
        reply_msg = topic.find_all("p", class_="xg1")[0]
        # reply_pattern yields (reply_count, read_count).
        msg = re.findall(self.reply_pattern, reply_msg.get_text())[0]
        # print msg
        reply_num = msg[0]
        read_num = msg[1]
        topic_item["topic_reply"] = reply_num
        homepage = temp_sel.xpath("//p/span/a/@href").extract()[0]
        user_id = re.findall(self.userid_pa, homepage)[0]
        # print user_id
        topic_item["poster_id"] = user_id
        topic_item["homepage"] = homepage
        print "+++++++++++++++++++++++++++++++++++++++++"
        yield scrapy.Request(url, callback=self.parse_torrent, meta={"topic_item": topic_item})
def parse(self, response):
    """Collect every restaurant data-id on the page plus the page coordinates.

    The longitude/latitude are decoded from the request URL via the
    spider's coordinatesURLTranslator.
    """
    selector = Selector(response)
    locations = Locations()
    locations["restaurantIDs"] = selector.xpath('//a/@data-id').extract()
    locations["coordinates"] = {
        "longitude": self.coordinatesURLTranslator.getLongitude(response.url),
        "latitude": self.coordinatesURLTranslator.getLatitude(response.url),
    }
    return locations
def parse_XML(self, response):
    """Dispatch feed parsing according to the configured node iterator.

    'iternodes' streams nodes; 'xml' and 'html' build a typed Selector
    (registering namespaces) and select //<itertag>.  Any other value
    raises NotSupported; a missing parse_node method raises NotConfigured.
    """
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator in ('xml', 'html'):
        # Both branches are identical apart from the selector type, which
        # matches the iterator name exactly.
        selector = Selector(response, type=self.iterator)
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
def parse(self, response):
    """Yield one LocationCsv per restaurant anchor on the page.

    Each row carries the anchor's data-id and display name, plus the
    latitude/longitude decoded from the request URL.
    """
    selector = Selector(response)
    for node in selector.xpath('//a[contains(@id, "establecimiento")]'):
        row = LocationCsv()
        row["id_restaurante"] = node.css("a::attr(data-id)").extract()
        row["nombre_restaurante"] = node.css("a .result-info h4::text").extract()
        row["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
        row["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
        yield row
class RestaurantIDsGetter(object):
    """Resolves a restaurant's data-id attribute from a page response.

    Wraps the response in a Selector once at construction, then answers
    lookups by href substring.
    """

    def __init__(self, response):
        self.sel = Selector(response)

    def getID(self, url):
        """Return the data-id of the first anchor whose href contains *url*,
        or the sentinel "NoID" when no anchor matches."""
        matches = self.sel.xpath('//a[contains(@href, "' + url + '")]/@data-id').extract()
        return matches[0] if matches else "NoID"
def parse(self, response):
    """Parse a battlenet forum search page (Python 2) into Topic_Items.

    Extracts title, board, author, content, reply count and a post time
    reconstructed from AM/PM-style Chinese markers, then follows each
    topic URL to parse_torrent.
    """
    topic_id = response.meta[ 'topic_id' ]
    sel = Selector(text=response.body, type="html")
    topic_lists = sel.xpath('//div[re:test(@class,"result.*?")]')
    for topic in topic_lists:
        topic_item = Topic_Item()
        # Re-wrap the fragment so relative XPaths start from this result div.
        temp_sel = Selector(text=topic.extract())
        title = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/text()')[0].extract()
        print title
        topic_item['topic_title']=title
        board = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[0].extract()
        print board
        poster = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[1].extract().strip()
        print poster
        topic_item['topic_author']=poster
        main_con = temp_sel.xpath('//div[re:test(@class,"meta")]')[0].extract().strip()
        # post_time_pa captures (date-without-century, AM/PM marker, HH:MM).
        post_time_ = re.findall(self.post_time_pa,main_con)[0]
        post_time_str = '20'+post_time_[0]+' '+post_time_[2]+':00'
        post_time = time.strptime(post_time_str, '%Y-%m-%d %H:%M:%S')
        # print post_time
        # repr-based check for the AM/PM character: \u4e0b = "下" (PM),
        # \u4e0a = "上" (AM).
        if '4e0b' in post_time_[1].__repr__():
            print u'下午'
            # PM: shift the parsed time by 12 hours.
            post_time = self.time_to_datetime(post_time)+ datetime.timedelta(hours=12)
        elif '4e0a' in post_time_[1].__repr__():
            print u'上午'
            # NOTE(review): AM path stores the raw string while PM stores a
            # datetime; downstream must accept both types — verify.
            post_time = post_time_str
        print post_time
        topic_item['topic_post_time']=post_time
        content = temp_sel.xpath('//div[re:test(@class,"content")]/text()')[0].extract().strip()
        print content
        topic_item['topic_content']=content
        url = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/@href').extract()[0]
        url = 'http://www.battlenet.com.cn'+url
        print url
        topic_item['topic_url']=url
        # The last "small" span under the title holds the reply count.
        reply_num = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/span[re:test(@class,"small")]/text()').extract()
        reply_num = reply_num[len(reply_num)-1]
        reply_num = re.findall(self.dig_pattern,reply_num)[0]
        print reply_num
        topic_item['topic_reply']=reply_num
        print '+++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})
def parse_member_overdraft(self, resp):
    """Decide whether a member has an overdraft tab and act accordingly.

    When the member-archive page has no "#tab7" (debt) tab, the member item
    from meta is completed with overdraft '0.0' and yielded.  Otherwise the
    member id and shop id are pulled out of the tab's onclick handler and a
    FormRequest for the debt list is yielded, keeping the item in meta for
    parse_member_overdraft2.
    """
    selector = Selector(resp)
    member = resp.meta['item']
    onclick_nodes = selector.xpath('//ul[@class="tab-nav"]//a[@href="#tab7"]/@onclick')
    if not onclick_nodes:
        # No debt tab: nothing owed.
        member['overdraft'] = '0.0'
        yield member
        return
    # The onclick handler embeds two numbers: member id and shop id, in order.
    numbers = re.findall(r'\d+', onclick_nodes.extract()[0])
    yield FormRequest(
        url='http://vip6.sentree.com.cn/shair/memberArchives!debtlist.action',
        formdata={'id': numbers[0], 'shopid': numbers[1]},
        callback=self.parse_member_overdraft2,
        meta=resp.meta,
    )
def parse(self, response):
    """Scrape a DMOZ-style directory page into a list of Dmozitems.

    Each directory <li> yields one item whose title/link/desc fields keep
    the raw extract() lists.
    """
    selector = Selector(response)
    # Item field -> relative XPath within one directory entry.
    fields = (
        ('title', 'a/text()'),
        ('link', 'a/@href'),
        ('desc', 'text()'),
    )
    results = []
    for node in selector.xpath('//ul[@class="directory-url"]/li'):
        entry = Dmozitem()
        for field, query in fields:
            entry[field] = node.xpath(query).extract()
        results.append(entry)
    return results
def parse(self, response):
    """Parse a Baidu search-result page (Python 2) into Topic_Items.

    Each "result f s3" div yields a topic with title, abstract, author,
    URL and a post date taken from the summary spans; reply count is
    fixed to 0.  Every topic URL is followed to parse_torrent.
    """
    topic_id = response.meta[ 'topic_id' ]
    sel = Selector(text=response.body, type="html")
    topic_lists = sel.xpath('//div[re:test(@class,"result f s3")]')
    for topic in topic_lists:
        topic_item = Topic_Item()
        # Re-wrap the fragment so relative XPaths start from this result div.
        temp_sel = Selector(text=topic.extract())
        title = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/text()').extract()[0].strip()
        # print title
        topic_item['topic_title']=title
        content = temp_sel.xpath('//div[re:test(@class,"c-abstract")]/text()').extract()[0].strip()
        print content
        topic_item['topic_content']=content
        # The third summary span carries the date; normalized to midnight.
        post_time = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span')[2].extract()
        post_time = re.findall(self.post_pa,post_time)[0]+' 00:00:00'
        print post_time
        topic_item['topic_post_time']=post_time
        author = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span/text()')[1].extract()
        print author
        topic_item['topic_author']=author
        url = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/@href').extract()[0]
        print url
        topic_item['topic_url']=url
        # Search results expose no reply count.
        topic_item['topic_reply']=0
        print '+++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})
def parse_member_overdraft2(self, resp):
    """Total up a member's outstanding debt from the debt-list table.

    Column 3 holds the amounts, column 5 the repayment status.  Amounts
    marked 未还清 (unpaid) are added, 已还清 (paid off) subtracted; a
    negative total is clamped to zero.  The result is written onto the
    member item from meta as a one-decimal string and the item is yielded.
    """
    member = resp.meta['item']
    selector = Selector(resp)
    amount_nodes = selector.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
    if not amount_nodes:
        # No debt rows at all.
        member['overdraft'] = '0.0'
        yield member
        return
    amounts = str_list_strip_replace(amount_nodes.extract(), [' ', ' ', '\t', '\n'])
    statuses = str_list_strip_replace(selector.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[5]/font/child::text()').extract(), [' ', ' ', '\t', '\n'])
    total = float(0)
    # Indexed access (not zip) on purpose: a status list shorter than the
    # amount list is an error and should raise IndexError.
    for idx, amount_text in enumerate(amounts):
        amount = float(amount_text)
        status = statuses[idx]
        if u'已还清' in status:
            total -= amount
        elif u'未还清' in status:
            total += amount
    if total < 0:
        total = float(0)
    member['overdraft'] = '%.1f' % total
    yield member
def parse_showdesk_services(self, resp):
    """Scrape the service/price table ("itemset") into SentreeServiceItems.

    Each row becomes one item whose 'info' OrderedDict maps column header
    -> cell text.  Columns are index-special-cased: 0 and the last are
    skipped, 8 (price) is re-read from the matching #pricespan<no> element,
    9 collects the discount divs, and column 1 remembers the row number
    used to locate the price span.
    """
    hxs = Selector(resp)
    headers = hxs.xpath('//table[@id="itemset"]/thead/tr/th/child::text()').extract()
    if not headers:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        # NOTE(review): yield None appears to be this spider's "no data"
        # convention — confirm downstream handles None.
        yield None
        return
    service_nodes = hxs.xpath('//table[@id="itemset"]/tbody/tr')
    if not service_nodes:
        self.log('%s can not find services info' % self.name, level=log.ERROR)
        yield None
        return
    for s_n in service_nodes:
        info_nodes = s_n.xpath('td')
        info = OrderedDict({})
        # Row number, captured at column 1 and used by the column-8 lookup.
        no = None
        for idx, i_n in enumerate(info_nodes):
            # Skip the first and last columns.
            if idx == 0 or idx == len(info_nodes) - 1:
                continue
            if idx == 8:
                # Price lives in a separate span keyed by the row number.
                info[headers[idx]] = str_list_strip_replace(str_list_strip(hxs.xpath('//span[@id="pricespan%s"]' % no).xpath('child::text()').extract()), [' ', '\t', '\n'])
                continue
            if idx == 9:
                # Discounts: one "icddiv" block per discount, joined with
                # ' ||| ' between blocks and ' | ' within a block.
                discount_nodes = i_n.xpath('.//div[starts-with(@id, "icddiv")]')
                discounts = []
                if discount_nodes:
                    for d_n in discount_nodes:
                        discounts.append(' | '.join(str_list_strip_replace(str_list_strip(d_n.xpath('./child::text()').extract()), [' ', '\t', '\n'])))
                info[headers[idx]] = ' ||| '.join(discounts)
                continue
            info[headers[idx]] = ' | '.join(str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()), [' ', '\t', '\n']))
            if idx == 1:
                # Column 1 holds the row number needed for the price span.
                no = info[headers[idx]]
        item = SentreeServiceItem()
        item['info'] = info
        # items.append(info)
        yield item
def parse(self, response):
    """Yield a LocationCsv per restaurant anchor found on the page.

    Same contract as the sibling parse(): the anchor supplies the data-id
    and display name, the request URL supplies the coordinates.
    """
    hxs = Selector(response)
    restaurant_links = hxs.xpath('//a[contains(@id, "establecimiento")]')
    for link in restaurant_links:
        record = LocationCsv()
        record["id_restaurante"] = link.css("a::attr(data-id)").extract()
        record["nombre_restaurante"] = link.css("a .result-info h4::text").extract()
        record["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
        record["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
        yield record
def parse(self, response):
    """Parse a CNKI article page into a list of CnkispiderItems.

    Extracts title, authors (with a fallback XPath for the alternate page
    layout), institution, abstract, keywords, download frequency and quote
    frequency; every value is encoded to UTF-8 bytes.

    :return: list of CnkispiderItem (one per "mainleft" div)
    """
    sel = Selector(response)
    sites = sel.xpath('//div[@class="mainleft"]')
    itemlist = []
    for site in sites:
        item = CnkispiderItem()
        title = site.xpath('//*[@id="chTitle"]/text()').extract()
        # Fill each item attribute, encoding values as UTF-8 bytes.
        item['title'] = [t.encode('utf-8') for t in title]
        author = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[1]/a/text()').extract()
        # BUG FIX: extract() returns a list (possibly empty), never None, so
        # the original `author == None` test could never trigger this
        # fallback XPath for the alternate page layout.  Test emptiness.
        if not author:
            author = site.xpath('//*[@id="content"]/div[1]/div[2]/p[1]/a/text()').extract()
        item['author'] = [a.encode('utf-8') for a in author]
        institution = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[3]/a/text()').extract()
        item['institution'] = [i.encode('utf-8') for i in institution]
        abstract = site.xpath('//*[@id="ChDivSummary"]/text()').extract()
        item['abstract'] = [a.encode('utf-8') for a in abstract]
        keyWord = site.xpath('//*[@id="ChDivKeyWord"]/a/text()').extract()
        item['keyWord'] = [k.encode('utf-8') for k in keyWord]
        # "【下载频次】" = download-frequency label on the page.
        downloadFreq = site.xpath('//*[@id="content"]/div[1]/div[5]/ul/li/text()').re(u'\s*【下载频次】(.*)')
        item['downloadFreq'] = [d.encode('utf-8') for d in downloadFreq]
        quoteFreq = site.xpath('//*[@id="rc3"]/text()').re('\W(\d+)\W')
        item['quoteFreq'] = [q.encode('utf-8') for q in quoteFreq]
        itemlist.append(item)
        # Log each appended item at INFO level.
        log.msg("Appending item...", level=log.INFO)
        log.msg("Append done.", level=log.INFO)
    return itemlist
        yield item
        # NOTE(review): the `yield item` above is an orphaned fragment —
        # it belongs to a function whose body was truncated by whatever
        # mangled this file, and is a syntax error at module level.
# Accumulator inspected by the debug script below (never appended to here).
items = []
if __name__ == '__main__':
    # Ad-hoc debug harness (Python 2): replays a locally saved HTML page
    # through the overdraft XPath and the services parser.
    f = open('e:\\1.html')
    html = ""
    for l in f:
        html += l
    f.close()
    resp = TextResponse(url="", body=html)
    if 1:
        hxs = Selector(resp)
        # First query is discarded; the second (text children) is the one used.
        total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]')
        total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
        if not total_overdraft_nodes:
            overdraft = '0'
        else:
            overdraft = str_list_strip_replace(total_overdraft_nodes.extract(), [' ', ' ', '\t', '\n'])[0]
        print overdraft
        # Early exit: the spider-replay code below is currently disabled.
        sys.exit(0)
    s = SentreeSpider()
    try:
        s.parse_showdesk_services(resp)
    except:
        print traceback.format_exc()
    print json.dumps(obj=items, ensure_ascii=False, indent=4)
def parse_consumer_bill_stream_validate(self, resp):
    """Scrape the bill-audit (水单审查) table into SentreeShuiDanShenChaItems.

    Builds the header list from the table head (dropping the last column),
    then for each bill row collects columns 0-5 into item['data'] as
    [value, is_scalar_flag] pairs.  Column 5 is a nested table of
    sub-details, each holding a further per-employee performance table;
    headers are appended only once per row (recoded_headers flag).
    """
    hxs = Selector(resp)
    # Breadcrumb of the admin menus this data was found under.
    menu = [u'营业记录', u'水单记录', u'水单审查']
    bill_headers = []
    head_nodes = hxs.xpath('//tbody[@id="billBody"]/parent::table/thead/tr/th')
    if not head_nodes:
        self.log('in %s.parse_consumer_bill_stream_validate, can not get table headers.' % self.name, level=log.ERROR)
        # NOTE(review): yield None appears to be this spider's "no data"
        # convention — confirm downstream handles None.
        yield None
        return
    for idx, hd in enumerate(head_nodes):
        # Drop the last header column.
        if idx == len(head_nodes) - 1:
            break
        txts = hd.xpath('child::text()').extract()
        bill_headers.append('/'.join(txts))
    bill_nodes = hxs.xpath('//tbody[@id="billBody"]/tr')
    if bill_nodes:
        for bn in bill_nodes:
            item = SentreeShuiDanShenChaItem()
            item['menu'] = menu
            headers = []
            item['data'] = OrderedDict({})
            data_nodes = bn.xpath('td')
            for idx, dn in enumerate(data_nodes):
                # Only the first six columns are relevant.
                if idx == 6:
                    break
                h = bill_headers[idx]
                if idx == 0 or idx == 4:
                    # Single-value columns: keep the first stripped text.
                    headers.append(h)
                    item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract())[0], True]
                    continue
                if idx == 1 or idx == 2 or idx == 3:
                    # Multi-value columns: keep the whole stripped list.
                    headers.append(h)
                    item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract()), True]
                    continue
                if idx == 5:
                    # Nested detail table: one sub-row per service line,
                    # each with its own employee-performance sub-table.
                    detail = []
                    subtrs = dn.xpath('table/tr')
                    # Headers for the nested columns are recorded only once.
                    recoded_headers = False
                    for tr in subtrs:
                        empperfors = []
                        subdetail = OrderedDict({})
                        subtds = tr.xpath('td')
                        h = bill_headers[idx + 0]
                        if not recoded_headers:
                            headers.append(h)
                        subdetail[h] = [str_list_strip(subtds[0].xpath('descendant::text()').extract()), True]
                        h = bill_headers[idx + 1]
                        if not recoded_headers:
                            headers.append(h)
                        subdetail[h] = [str_list_strip(subtds[1].xpath('descendant::text()').extract())[0], True]
                        subtrs2 = subtds[2].xpath('table/tr')
                        for kdx, tr2 in enumerate(subtrs2):
                            # Last sub-row is a footer/summary: skip it.
                            if kdx == len(subtrs2) - 1:
                                break
                            empperfor = OrderedDict({})
                            subtds2 = tr2.xpath('td')
                            h = bill_headers[idx + 2 + 0]
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[0].xpath('descendant::text()').extract()), True])
                            h = bill_headers[idx + 2 + 1]
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[1].xpath('descendant::text()').extract())[0], True])
                            h = bill_headers[idx + 2 + 2]
                            # Prefix with 员工 ("employee") to disambiguate
                            # from the bill-level header of the same name.
                            h = u'员工' + h
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[2].xpath('descendant::text()').extract())[0], True])
                            empperfors.append(empperfor)
                            recoded_headers = True
                        # 员工业绩 = "employee performance".
                        subdetail[u'员工业绩'] = [empperfors, False]
                        detail.append([subdetail, False])
                        recoded_headers = True
                    item['headers'] = headers
                    # 详情 = "details".
                    item['data'][u'详情'] = [detail, False]
            # items.append(item)
            yield item
def parse_page(self, response):
    """Build a CnbetaItem holding the page's <title> texts and its URL."""
    page = Selector(response)
    item = CnbetaItem()
    item["title"] = page.xpath('//title/text()').extract()
    item['url'] = response.url
    return item