def parse(self, response):
    url = response.url
    if "research" in url:
        categories = response.xpath(".//*[@class='catec']")
        for i in xrange(len(categories) - 1):
            large_categories = categories[i].xpath(".//*[@class='fl']")
            large_category_name = clean_text(
                large_categories.xpath(".//text()").extract()[0].strip())
            mid_categories = categories[i].xpath(".//span")
            for mid_category in mid_categories:
                mid_category_name = clean_text(
                    mid_category.xpath(".//text()").extract()[0].strip())
                page_url = mid_category.xpath(".//@href").extract()[0]
                request = FormRequest(page_url,
                                      callback=self._parse_page_research,
                                      dont_filter=True)
                request.meta["large_category"] = large_category_name
                request.meta["mid_category"] = mid_category_name
                request.meta["first_url"] = page_url
                yield request
    elif "free" in url:
        large_categories = response.xpath(".//*[@class='tul2']//h2//a")
        for i in xrange(len(large_categories)):
            large_category_name = clean_text(
                large_categories[i].xpath(".//text()").extract()[0].strip())
            page_url = large_categories[i].xpath("./@href").extract()[0]
            request = FormRequest(page_url,
                                  callback=self._parse_page_free,
                                  dont_filter=True)
            request.meta["large_category"] = large_category_name
            request.meta["first_url"] = page_url
            yield request
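# clean_text is used by every spider in this file but defined elsewhere.
# Judging from the call sites, it takes a scraped string (or, in one spider,
# a full extract() list) and returns a whitespace-normalized unicode string.
# A minimal sketch under those assumptions -- not the project's actual helper:
import re

def clean_text(text):
    # Some call sites pass extract() lists directly; join them first.
    if isinstance(text, list):
        text = u"".join(text)
    # Collapse internal runs of whitespace and trim the ends.
    return re.sub(ur'\s+', u' ', text).strip()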
def _parse_research(self, response):
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath(".//dt//text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name != None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = page_url
            report_time = clean_text(
                report.xpath(".//*[@class='time']").extract()[0].strip())
            if len(report_time) > 0:
                item["report_revision_time"] = report_time.split(u":")[1]
                date, date_precision = parse_date(item["report_revision_time"])
                try:
                    item["report_revision_time_standard"] = date.replace(
                        tzinfo=pytz.timezone('Asia/Shanghai'))
                except:
                    pass
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国产业洞察网"
            item["price_free"] = False
            yield item
def _parse_item(self, response):
    domain_url = "http://www.chinairn.com/"
    reports = response.xpath("//p[@class='maintittle']")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name != None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = urljoin(domain_url, page_url)
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国行业研究网"
        try:
            self.report_para(item, report)
        except:
            log.msg("Report revision time missed: %s" % item["report_link"],
                    level=log.WARNING)
        item["price_free"] = False
        yield item

    # Page_Limit is expected to be a module-level crawl cap (0 = unlimited).
    current_page = clean_text(
        response.xpath(".//*[@class='hover']/text()").extract()[0])
    if Page_Limit > 0 and int(current_page) > Page_Limit:
        return
    next_page = response.xpath("//a[contains(@class,'down')]")[0]
    last_page_url = next_page.xpath(
        "./following-sibling::a[1]/@href").extract()[0]
    next_page_url = next_page.xpath("./@href").extract()[0]
    if last_page_url != next_page_url:
        url = urljoin(self.base_url, next_page_url)
        request = FormRequest(url, callback=self._parse_item,
                              dont_filter=True)
        request.meta["large_category"] = response.meta["large_category"]
        request.meta["mid_category"] = response.meta["mid_category"]
        yield request
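# parseIndustryName is shared by these spiders but not defined in this file.
# Call sites pass a report title and treat the return value as "matched
# industry name or None". A hypothetical substring lookup against a preloaded
# industry list (INDUSTRY_NAMES is placeholder data, not the real project's):
INDUSTRY_NAMES = [u"汽车", u"医药", u"房地产"]  # placeholder data

def parseIndustryName(report_name):
    for name in INDUSTRY_NAMES:
        if name in report_name:
            return name
    return None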
def parse_item(self, response):
    item = IndustryReportSpiderItem()
    item['report_link'] = response.url
    item['source_name'] = u"艾瑞网"
    item['source_domain'] = self.allowed_domains[0]
    item['report_name'] = clean_text(
        response.xpath("//*[@class='content_title']/text()").extract()[0].strip())
    price = response.xpath(u"//*[contains(text(), '价格')]/text()").extract()[0]
    item['price_free'] = u"免费" in price
    infodatas = response.xpath(
        "//*[@class='content_titleinfoa']/span//text()").extract()
    for text in infodatas:
        try:
            if u"页数" in text:
                item['report_page_count'] = re.findall(ur'([0-9]+)', text)[0]
        except:
            pass
        try:
            if u"图表" in text:
                item['report_graph_count'] = re.findall(ur'([0-9]+)', text)[0]
        except:
            pass
        try:
            if u"-" in text:
                item['report_revision_time'] = text
                # parse_date returns a (datetime, precision) pair at the other
                # call sites in this file; unpack it rather than storing the tuple.
                date, date_precision = parse_date(item['report_revision_time'])
                item['report_revision_time_standard'] = date.replace(
                    tzinfo=pytz.timezone('Asia/Shanghai'))
        except:
            pass
    item['industry_large_category'] = u"信息传输、软件和信息技术服务业"
    try:
        item['industry_mid_category'] = clean_text(
            response.xpath(
                "//*[@class='content_titleinfoa']//a/text()").extract()[0].strip())
    except:
        pass
    # if item['price_free']:
    #     self.browser.get(response.url)
    #     self.browser.find_element_by_xpath("//*[@class='download']/a").click()
    #     WebDriverWait(self.browser, 20).until(EC.presence_of_element_located(
    #         (By.XPATH, ".//*[@id='ButtonBox']/input")))
    #     Confirm = self.browser.find_element_by_xpath(".//*[@id='ButtonBox']/input")
    #     Confirm.click()
    #     WebDriverWait(self.browser, 20).until(EC.staleness_of(Confirm))
    #     if ".pdf" in self.browser.current_url:
    #         item['pdf_Link'] = self.browser.current_url
    return item
def _parse_hg(self, response):
    reports = response.xpath(".//*[@class='yahei f14']")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["report_name"] = clean_text(
                report.xpath(".//a/text()").extract()[0].strip())
            page_url = report.xpath(".//a//@href").extract()[0]
            item["report_link"] = page_url
            report_time = clean_text(
                report.xpath(".//*[@name='deliveddate']/text()").extract()[0].strip())
            # clean_text returns a string, so test for emptiness, not None.
            if report_time:
                item["report_revision_time"] = report_time
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"国研网"
            date, date_precision = parse_date(item["report_revision_time"])
            item["report_revision_time_standard"] = date.replace(
                tzinfo=pytz.timezone('Asia/Shanghai'))
            content_info = self.parseContent(page_url)
            if content_info["free"] == False:
                item["price_free"] = False
            else:
                item["price_free"] = True
            if content_info["url"][0] == "pdf":
                item["pdf_Link"] = content_info["url"][1]
            else:
                item["content_Link"] = content_info["url"][1]
            yield item
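# parseContent (called above) is defined elsewhere on the spider. From its
# usage it must return a mapping like {"free": bool, "url": (kind, link)},
# where kind is "pdf" for direct PDF links, e.g.
# {"free": True, "url": ("pdf", "<direct pdf link>")}.
# This shape is inferred from the call site, not taken from its source.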
def _parse_free(self, response):
    reports = response.xpath(".//*[@class='tul3']//li")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["report_name"] = clean_text(
                report.xpath(".//a//text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name != None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = page_url
            report_content = self.parseTimeContent(page_url)
            if report_content != None:
                item["report_content"] = report_content
            report_time = clean_text(
                report.xpath(".//span//text()").extract()[0].strip())
            if len(report_time) > 0:
                item["report_revision_time"] = report_time
                date, date_precision = parse_date(item["report_revision_time"])
                try:
                    item["report_revision_time_standard"] = date.replace(
                        tzinfo=pytz.timezone('Asia/Shanghai'))
                except:
                    pass
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国产业洞察网"
            item["price_free"] = True
            yield item
def parse(self, response): large_categories = response.xpath("//*[@class='tabContent bluelink']//*[contains(@style, 'padding')]/a") for large_category in large_categories: large_category_name = clean_text(large_category.xpath(".//text()").extract()[0].strip()) mid_categorys = large_category.xpath("./parent::*/following-sibling::*[1]/a") for mid_category in mid_categorys: mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0]) mid_category_url = urljoin(self.base_url, mid_category.xpath("./@href").extract()[0]) request = FormRequest(mid_category_url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category_name"] = large_category_name request.meta["mid_category_name"] = mid_category_name yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='shopleft_bt']//a") middle_categories = response.xpath(".//*[@class='shopnav2']") for i in xrange(len(large_categories)): large_category_name = clean_text(large_categories[i].xpath("./text()").extract()[0].strip()) middle_category_list = middle_categories[i].xpath(".//*[@class='shopleft_wt']") for middle_category in middle_category_list: middle_category_name = clean_text(middle_category.xpath(".//a/text()").extract()) page_url = middle_category.xpath(".//a//@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = large_category_name request.meta["mid_category"] = middle_category_name yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='info']") if len(reports) > 0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta[ "large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text( report.xpath(".//h3//a/text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName( item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url string = clean_text( report.xpath(" //*[@class='rdate']//span/text()").extract( )[0].strip()) temp = self.parseItem(string) if len(temp) == 1: item["report_revision_time"] = temp[0][0] item["report_page_count"] = temp[0][1] item["report_graph_count"] = temp[0][2] date, date_precision = parse_date( item["report_revision_time"]) item["report_revision_time_standard"] = date.replace( tzinfo=pytz.timezone('Asia/Shanghai')) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国投资咨询网" item["price_free"] = False yield item if_nextpage = response.xpath(".//*[@class='zw']") if len(if_nextpage) > 0: if (if_nextpage.xpath(".//text()").extract()[-1] ) == u'下一页': #存在翻页 page_url = if_nextpage.xpath(".//@href").extract()[-1] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta[ "large_category"] request.meta["mid_category"] = response.meta[ "mid_category"] yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='img_des']/a") if len(reports) > 0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta[ "large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text( report.xpath("./text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName( item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url report_time = self.parseTime(item["report_link"]) if report_time != None: item["report_revision_time"] = report_time item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"欧咨网" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace( tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item if len(response.xpath(".//*[@class='page']//@href")) > 1: #存在翻页 page_len = clean_text( response.xpath( ".//*[@class='page']//*[@class='fl_l']/text()"). extract()[0].strip()) nextPageurl = response.xpath( ".//*[@class='page']//@href").extract()[-1] finds = self.pattern_page.findall(page_len) currentPage = finds[0][0] totlePage = finds[0][1] if currentPage != totlePage: url = urljoin(self.base_url, nextPageurl) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta[ "large_category"] request.meta["mid_category"] = response.meta[ "mid_category"] yield request
def report_para(self, item, report):
    revision_time = clean_text(
        report.xpath("..//*[@class='sp1']/text()").extract()[0]
        .split(u":")[1].strip())
    if self.pattern.match(revision_time):
        item["report_revision_time"] = revision_time
    else:
        textlst = report.xpath("../*[@class='main']/text()").extract()[0] \
            .replace(u"】 ", u"【").split(u"【")
        # Scan label/value pairs; the value follows its "...日期" label, so
        # stop one short of the end to keep textlst[i + 1] in bounds.
        for i in range(len(textlst) - 1):
            if textlst[i].endswith(u"日期"):
                item["report_revision_time"] = clean_text(textlst[i + 1].strip())
                break
    try:
        date, date_precision = parse_date(item["report_revision_time"])
        dateTimezone = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
        item["report_revision_time_standard"] = dateTimezone
    except:
        pass
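# parse_date is expected to return a (datetime, precision) pair; the datetime
# is naive, since call sites attach the Asia/Shanghai tzinfo themselves. A
# minimal sketch assuming the common "YYYY-MM-DD" and "YYYY年M月" source
# formats -- an illustration, not the project's real implementation:
from datetime import datetime
import re

def parse_date(text):
    m = re.search(ur'(\d{4})[-年/\.]\s*(\d{1,2})(?:[-月/\.]\s*(\d{1,2}))?', text)
    if m is None:
        raise ValueError("unrecognized date: %r" % text)
    year, month = int(m.group(1)), int(m.group(2))
    day = int(m.group(3)) if m.group(3) else 1
    precision = "day" if m.group(3) else "month"
    return datetime(year, month, day), precision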
def parse(self, response): large_categories = response.xpath(".//*[@id='cateitems']//h3//a") for large_category in large_categories: large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip()) page_url = large_category.xpath("./@href").extract()[0] request = FormRequest(page_url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category"] = large_category_name yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='rptmap']//strong//a") for large_category in large_categories: large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip()) page_url = large_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category"] = large_category_name yield request
def parse_middle_category(self, response):
    report_types = response.xpath(u"//li[contains(text(),'报告')]")
    for report_type in report_types:
        mid_category_url = urljoin(
            self.base_url,
            report_type.xpath(u"./preceding-sibling::span[1]/a/@href").extract()[0])
        request = FormRequest(mid_category_url, callback=self.parse_page,
                              dont_filter=True)
        request.meta["large_category_name"] = response.meta["large_category_name"]
        request.meta["mid_category_name"] = response.meta["mid_category_name"]
        request.meta["report_type"] = clean_text(
            report_type.xpath("./text()").extract()[0].strip())
        request.meta["page_base_url"] = mid_category_url
        yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='shopleft_bt']//a") middle_categories = response.xpath(".//*[@class='shopnav2']") for i in xrange(len(large_categories)): large_category_name = clean_text( large_categories[i].xpath("./text()").extract()[0].strip()) middle_category_list = middle_categories[i].xpath( ".//*[@class='shopleft_wt']") for middle_category in middle_category_list: middle_category_name = clean_text( middle_category.xpath(".//a/text()").extract()) page_url = middle_category.xpath(".//a//@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = large_category_name request.meta["mid_category"] = middle_category_name yield request
def parse(self, response): large_categories = response.xpath("//a[contains(@id, 'xTrade')][not(contains(@id, 'All'))]") for large_category in large_categories: large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip()) id = large_category.xpath("./@id").extract()[0] if id[-1] == u'l': continue page_url = large_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category"] = large_category_name yield request
def parse_middle_category(self, response): mid_categories = response.xpath(".//*[@class='report2']//h2//a") for mid_category in mid_categories: mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip()) page_url = mid_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = mid_category_name request.meta["first_url"] = url yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='rptmap']//strong//a") for large_category in large_categories: large_category_name = clean_text( large_category.xpath("./text()").extract()[0].strip()) page_url = large_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category"] = large_category_name yield request
def parse_item(self, response): item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category_name"] item["industry_mid_category"] = response.meta["mid_category_name"] item["report_name"] = clean_text(response.xpath("//h1/text()").extract()[0].strip()) item["report_type"] = response.meta["report_type"] item["industry_small_chs_name"] = parseIndustryName(item["report_name"]) item["price_free"] = self._parse_price(response) item["report_link"] = response.url item["source_domain"] = self.base_url item["source_name"] = u"中国产业发展研究网" yield item
def _parse_page_free(self, response):
    total_pages = int(clean_text(
        response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
    first_url = response.meta["first_url"]
    request = FormRequest(first_url, callback=self._parse_free,
                          dont_filter=True)
    request.meta["large_category"] = response.meta["large_category"]
    yield request
    if total_pages > 1:
        for i in xrange(1, total_pages):
            next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
            request = FormRequest(next_page, callback=self._parse_free,
                                  dont_filter=True)
            request.meta["large_category"] = response.meta["large_category"]
            yield request
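# The first_url[:-5] slicing above assumes list URLs end in ".html": page 1
# at ".../<list>.html" becomes ".../<list>-p2.html", ".../<list>-p3.html",
# and so on up to total_pages.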
def parse(self, response): large_categories = response.xpath( "//*[@class='tabContent bluelink']//*[contains(@style, 'padding')]/a" ) for large_category in large_categories: large_category_name = clean_text( large_category.xpath(".//text()").extract()[0].strip()) mid_categorys = large_category.xpath( "./parent::*/following-sibling::*[1]/a") for mid_category in mid_categorys: mid_category_name = clean_text( mid_category.xpath("./text()").extract()[0]) mid_category_url = urljoin( self.base_url, mid_category.xpath("./@href").extract()[0]) request = FormRequest(mid_category_url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category_name"] = large_category_name request.meta["mid_category_name"] = mid_category_name yield request
def parse_middle_category(self, response): mid_categories = response.xpath(".//*[@id='catgory_container']//a") for mid_category in mid_categories: mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip()) page_url = mid_category.xpath("./@href").extract()[0] if((mid_category_name!=u'不限') & ("report" in page_url)): url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_firstPage, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = mid_category_name request.meta["first_url"] = url yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='info']") if len(reports)>0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath(".//h3//a/text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName(item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url string =clean_text(report.xpath(" //*[@class='rdate']//span/text()").extract()[0].strip()) temp = self.parseItem(string) if len(temp)==1: item["report_revision_time"] = temp[0][0] item["report_page_count"] = temp[0][1] item["report_graph_count"] = temp[0][2] date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国投资咨询网" item["price_free"] = False yield item if_nextpage = response.xpath(".//*[@class='zw']") if len(if_nextpage)>0: if (if_nextpage.xpath(".//text()").extract()[-1])==u'下一页': #存在翻页 page_url =if_nextpage.xpath(".//@href").extract()[-1] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] yield request
def parse_index_page(self, response):
    self.current_page += 1
    industry = response.xpath('//*[@id="DataList1"]//a')
    for r in industry:
        industry_small_chs_names = clean_text(r.xpath('./text()').extract()[0].strip())
        self.industryList.append(industry_small_chs_names.encode("utf-8"))
    # Flush the collected industry names to disk every 30 pages.
    if self.current_page % 30 == 0:
        self.f.write("\n".join(self.industryList))
        self.f.write("\n")
        print "!" * 50, "flushed to file", "!" * 50
        self.industryList = []
    print "*" * 30, response.meta["page"], "*" * 30
    print "*" * 30, "page", self.current_page, "*" * 30
def parse_middle_category(self, response): mid_categories = response.xpath(".//*[@class='report2']//h2//a") for mid_category in mid_categories: mid_category_name = clean_text( mid_category.xpath("./text()").extract()[0].strip()) page_url = mid_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = mid_category_name request.meta["first_url"] = url yield request
def parse_item(self, response): item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category_name"] item["industry_mid_category"] = response.meta["mid_category_name"] item["report_name"] = clean_text( response.xpath("//h1/text()").extract()[0].strip()) item["report_type"] = response.meta["report_type"] item["industry_small_chs_name"] = parseIndustryName( item["report_name"]) item["price_free"] = self._parse_price(response) item["report_link"] = response.url item["source_domain"] = self.base_url item["source_name"] = u"中国产业发展研究网" yield item
def _parse_item(self, response): reports = response.xpath(".//*[@class='clistdl']") for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath(".//dt/a/text()").extract()[0].strip()) if len(report.xpath(".//dd//*[@class='cxgrep']//@title"))>0: industry = clean_text(report.xpath(".//dd//*[@class='cxgrep']//@title").extract()[0].strip()) else: industry = item["report_name"] industry_small_chs_name = parseIndustryName(industry) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] item["report_link"] = page_url item["report_revision_time"] = clean_text(report.xpath(".//dt/span/text()").extract()[0].strip()) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国报告大厅" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item
def _parse_hy_large(self, response):
    large_categories = response.xpath(".//*[@class='yahei f16 fB']")
    for large_category in large_categories:
        large_category_name = clean_text(
            large_category.xpath(".//text()").extract()[0].strip())
        if u"区域重点行业中小企业季报" not in large_category_name:
            page_url = large_category.xpath(".//@href").extract()[0]
            url = urljoin(self.base_url, page_url)
            request = FormRequest(url, callback=self._parse_hg_mid,
                                  dont_filter=True)
            request.meta["large_category"] = parseIndustryName(large_category_name)
            yield request
def _parse_first(self, response):
    # Convert the page count up front; comparing the raw string against an
    # int would always be truthy in Python 2.
    total_pages = int(clean_text(
        response.xpath(
            ".//*[@id='Content_WebPageDocumentsByUId1_span_totalpage']//text()"
        ).extract()[0].strip()))
    if total_pages >= 1:
        for i in xrange(0, total_pages):
            next_page = response.url + '&curpage=' + str(i + 1)
            request = FormRequest(next_page,
                                  callback=response.meta["callback"],
                                  dont_filter=True)
            request.meta["large_category"] = response.meta["large_category"]
            yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='img_des']/a") if len(reports)>0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath("./text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName(item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url report_time = self.parseTime(item["report_link"]) if report_time != None: item["report_revision_time"] = report_time item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"欧咨网" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item if len(response.xpath(".//*[@class='page']//@href"))>1: #存在翻页 page_len = clean_text(response.xpath(".//*[@class='page']//*[@class='fl_l']/text()").extract()[0].strip()) nextPageurl = response.xpath(".//*[@class='page']//@href").extract()[-1] finds = self.pattern_page.findall(page_len) currentPage = finds[0][0] totlePage = finds[0][1] if currentPage != totlePage: url = urljoin(self.base_url, nextPageurl) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] yield request
def _parse_firstPage(self, response): if len(response.xpath(".//*[@class='counter']/text()"))>=1: #存在翻页 first_url = response.meta["first_url"] page_len = clean_text(response.xpath(".//*[@class='counter']/text()").extract()[0].strip()) finds = self.pattern_page.findall(page_len) totlePage = finds[0][1] for i in xrange(1,int(totlePage)): nextPageurl = "index_" + str(i+1) + ".html" url = urljoin(first_url, nextPageurl) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] request.meta["first_url"] = first_url yield request else: request = FormRequest(response.url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] yield request