# -*- coding: utf-8 -*-
# Parsing callbacks collected from several Scrapy industry-report spiders (Python 2).
import re

import pytz
from scrapy.http import FormRequest
from urlparse import urljoin

# Assumed project-level helpers (exact import paths depend on the crawler package):
# IndustryReportSpiderItem, clean_text, parse_date, parseIndustryName.


def _parse_research(self, response):
    """Paid research reports listed under #ulNewsList (中国产业洞察网)."""
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        item["report_link"] = report.xpath(".//@href").extract()[0]
        report_time = clean_text(report.xpath(".//*[@class='time']").extract()[0].strip())
        if len(report_time) > 0:
            # The time cell reads "label:value"; keep the value part.
            item["report_revision_time"] = report_time.split(u":")[1]
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
            except Exception:
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
def _parse_free(self, response):
    """Free reports listed under .tul3 (中国产业洞察网)."""
    reports = response.xpath(".//*[@class='tul3']//li")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(report.xpath(".//a//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_content = self.parseTimeContent(page_url)
        if report_content is not None:
            item["report_content"] = report_content
        report_time = clean_text(report.xpath(".//span//text()").extract()[0].strip())
        if len(report_time) > 0:
            item["report_revision_time"] = report_time
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
            except Exception:
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = True
        yield item
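# _parse_free above calls self.parseTimeContent(page_url) for the report body.
# The helper itself is not shown here; the sketch below only illustrates the
# assumed contract (fetch the detail page, return its text or None). The
# fetching code and the .content selector are assumptions, not the spider's
# actual implementation.
def parseTimeContent(self, page_url):
    import urllib2
    from scrapy.selector import Selector
    body = urllib2.urlopen(page_url, timeout=10).read()
    sel = Selector(text=body.decode("utf-8", "ignore"))
    # Hypothetical container class for the article body.
    paragraphs = sel.xpath(".//*[@class='content']//text()").extract()
    content = clean_text(u"".join(paragraphs).strip())
    return content if content else None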
def parse_item(self, response):
    """Report detail page (艾瑞网)."""
    item = IndustryReportSpiderItem()
    item['report_link'] = response.url
    item['source_name'] = u"艾瑞网"
    item['source_domain'] = self.allowed_domains[0]
    item['report_name'] = clean_text(response.xpath("//*[@class='content_title']/text()").extract()[0].strip())
    price = response.xpath(u"//*[contains(text(), '价格')]/text()").extract()[0]
    item['price_free'] = u"免费" in price
    infodatas = response.xpath("//*[@class='content_titleinfoa']/span//text()").extract()
    for text in infodatas:
        try:
            if u"页数" in text:
                item['report_page_count'] = re.findall(ur'([0-9]+)', text)[0]
        except Exception:
            pass
        try:
            if u"图表" in text:
                item['report_graph_count'] = re.findall(ur'([0-9]+)', text)[0]
        except Exception:
            pass
        try:
            if u"-" in text:
                item['report_revision_time'] = text
                # parse_date returns (datetime, precision); store only the
                # localized datetime, as the other callbacks do.
                date, date_precision = parse_date(item['report_revision_time'])
                item['report_revision_time_standard'] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
        except Exception:
            pass
    item['industry_large_category'] = u"信息传输、软件和信息技术服务业"
    try:
        item['industry_mid_category'] = clean_text(response.xpath("//*[@class='content_titleinfoa']//a/text()").extract()[0].strip())
    except Exception:
        pass
    # Optional Selenium flow for resolving the PDF link of free reports:
    # if item['price_free']:
    #     self.browser.get(response.url)
    #     self.browser.find_element_by_xpath("//*[@class='download']/a").click()
    #     WebDriverWait(self.browser, 20).until(EC.presence_of_element_located(
    #         (By.XPATH, ".//*[@id='ButtonBox']/input")))
    #     confirm = self.browser.find_element_by_xpath(".//*[@id='ButtonBox']/input")
    #     confirm.click()
    #     WebDriverWait(self.browser, 20).until(EC.staleness_of(confirm))
    #     if ".pdf" in self.browser.current_url:
    #         item['pdf_Link'] = self.browser.current_url
    return item
def _parse_hg(self, response):
    """Report listing (国研网)."""
    reports = response.xpath(".//*[@class='yahei f14']")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(report.xpath(".//a/text()").extract()[0].strip())
        page_url = report.xpath(".//a//@href").extract()[0]
        item["report_link"] = page_url
        report_time = clean_text(report.xpath(".//*[@name='deliveddate']/text()").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"国研网"
        date, date_precision = parse_date(item["report_revision_time"])
        item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
        detail = self.parseContent(page_url)  # avoid shadowing the builtin "dict"
        item["price_free"] = detail["free"]
        if detail["url"][0] == "pdf":
            item["pdf_Link"] = detail["url"][1]
        else:
            item["content_Link"] = detail["url"][1]
        yield item
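# _parse_hg above expects self.parseContent(page_url) to return a mapping of
# the form {"free": <bool>, "url": (<kind>, <link>)}, where kind == "pdf"
# marks a direct PDF download and anything else an HTML content page. That
# shape is taken from how the result is consumed; the implementation below is
# only a sketch with hypothetical selectors, not the spider's real method.
def parseContent(self, page_url):
    import urllib2
    from scrapy.selector import Selector
    body = urllib2.urlopen(page_url, timeout=10).read()
    sel = Selector(text=body.decode("utf-8", "ignore"))
    pdf_links = sel.xpath(".//a[contains(@href, '.pdf')]/@href").extract()
    if pdf_links:
        return {"free": True, "url": ("pdf", pdf_links[0])}
    return {"free": False, "url": ("content", page_url)}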
def _parse_item(self, response): reports = response.xpath(".//*[@class='info']") if len(reports) > 0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta[ "large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text( report.xpath(".//h3//a/text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName( item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url string = clean_text( report.xpath(" //*[@class='rdate']//span/text()").extract( )[0].strip()) temp = self.parseItem(string) if len(temp) == 1: item["report_revision_time"] = temp[0][0] item["report_page_count"] = temp[0][1] item["report_graph_count"] = temp[0][2] date, date_precision = parse_date( item["report_revision_time"]) item["report_revision_time_standard"] = date.replace( tzinfo=pytz.timezone('Asia/Shanghai')) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国投资咨询网" item["price_free"] = False yield item if_nextpage = response.xpath(".//*[@class='zw']") if len(if_nextpage) > 0: if (if_nextpage.xpath(".//text()").extract()[-1] ) == u'下一页': #存在翻页 page_url = if_nextpage.xpath(".//@href").extract()[-1] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta[ "large_category"] request.meta["mid_category"] = response.meta[ "mid_category"] yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='img_des']/a") if len(reports) > 0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta[ "large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text( report.xpath("./text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName( item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url report_time = self.parseTime(item["report_link"]) if report_time != None: item["report_revision_time"] = report_time item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"欧咨网" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace( tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item if len(response.xpath(".//*[@class='page']//@href")) > 1: #存在翻页 page_len = clean_text( response.xpath( ".//*[@class='page']//*[@class='fl_l']/text()"). extract()[0].strip()) nextPageurl = response.xpath( ".//*[@class='page']//@href").extract()[-1] finds = self.pattern_page.findall(page_len) currentPage = finds[0][0] totlePage = finds[0][1] if currentPage != totlePage: url = urljoin(self.base_url, nextPageurl) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta[ "large_category"] request.meta["mid_category"] = response.meta[ "mid_category"] yield request
def report_para(self, item, report):
    revision_time = clean_text(report.xpath("..//*[@class='sp1']/text()").extract()[0].split(u":")[1].strip())
    if self.pattern.match(revision_time):
        item["report_revision_time"] = revision_time
    else:
        # The main block reads like "【label】 value【label】 value..."; normalize
        # the delimiters, then scan label/value pairs for the date label.
        textlst = report.xpath("../*[@class='main']/text()").extract()[0].replace(u"】 ", u"【").split(u"【")
        for i in range(len(textlst) - 1):  # stop one early: the value is read from textlst[i + 1]
            if textlst[i].endswith(u"日期"):
                item["report_revision_time"] = clean_text(textlst[i + 1].strip())
                break
    try:
        date, date_precision = parse_date(item["report_revision_time"])
        item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
    except Exception:
        pass
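# Every callback above unpacks parse_date(text) into (datetime, precision).
# The real helper lives elsewhere in the project; this sketch only illustrates
# the assumed contract for the date formats seen in these listings
# ("2015-06-12", "2015年6月", bare years), and may differ from the actual code.
def parse_date(text):
    import datetime
    match = re.search(ur'(\d{4})[-年/.]?(\d{1,2})?[-月/.]?(\d{1,2})?', text)
    if match is None:
        return None, None  # callers guard the .replace() call with try/except
    year, month, day = match.group(1), match.group(2), match.group(3)
    precision = "day" if day else ("month" if month else "year")
    return datetime.datetime(int(year), int(month or 1), int(day or 1)), precision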
def _parse_item(self, response): reports = response.xpath(".//*[@class='info']") if len(reports)>0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath(".//h3//a/text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName(item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url string =clean_text(report.xpath(" //*[@class='rdate']//span/text()").extract()[0].strip()) temp = self.parseItem(string) if len(temp)==1: item["report_revision_time"] = temp[0][0] item["report_page_count"] = temp[0][1] item["report_graph_count"] = temp[0][2] date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国投资咨询网" item["price_free"] = False yield item if_nextpage = response.xpath(".//*[@class='zw']") if len(if_nextpage)>0: if (if_nextpage.xpath(".//text()").extract()[-1])==u'下一页': #存在翻页 page_url =if_nextpage.xpath(".//@href").extract()[-1] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] yield request
def _parse_item(self, response): reports = response.xpath(".//*[@class='clistdl']") for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath(".//dt/a/text()").extract()[0].strip()) if len(report.xpath(".//dd//*[@class='cxgrep']//@title"))>0: industry = clean_text(report.xpath(".//dd//*[@class='cxgrep']//@title").extract()[0].strip()) else: industry = item["report_name"] industry_small_chs_name = parseIndustryName(industry) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] item["report_link"] = page_url item["report_revision_time"] = clean_text(report.xpath(".//dt/span/text()").extract()[0].strip()) item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"中国报告大厅" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item
def _parse_item(self, response): reports = response.xpath(".//*[@class='img_des']/a") if len(reports)>0: for report in reports: item = IndustryReportSpiderItem() item["industry_large_category"] = response.meta["large_category"] item["industry_mid_category"] = response.meta["mid_category"] item["report_name"] = clean_text(report.xpath("./text()").extract()[0].strip()) industry_small_chs_name = parseIndustryName(item["report_name"]) if industry_small_chs_name != None: item["industry_small_chs_name"] = industry_small_chs_name page_url = report.xpath(".//@href").extract()[0] url = urljoin(self.base_url, page_url) item["report_link"] = url report_time = self.parseTime(item["report_link"]) if report_time != None: item["report_revision_time"] = report_time item["source_domain"] = self.allowed_domains[0] item["source_name"] = u"欧咨网" date, date_precision = parse_date(item["report_revision_time"]) item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai')) item["price_free"] = False yield item if len(response.xpath(".//*[@class='page']//@href"))>1: #存在翻页 page_len = clean_text(response.xpath(".//*[@class='page']//*[@class='fl_l']/text()").extract()[0].strip()) nextPageurl = response.xpath(".//*[@class='page']//@href").extract()[-1] finds = self.pattern_page.findall(page_len) currentPage = finds[0][0] totlePage = finds[0][1] if currentPage != totlePage: url = urljoin(self.base_url, nextPageurl) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = response.meta["large_category"] request.meta["mid_category"] = response.meta["mid_category"] yield request