def _parse_research(self, response):
    """Yield one paid-report item for every ``<li>`` under ``#ulNewsList``.

    ``response.meta`` must carry ``large_category`` and ``mid_category``;
    both are copied onto each item.  The revision time is the second
    ``:``-separated field of the extracted ``class='time'`` node.  A
    timezone-aware copy is stored as ``report_revision_time_standard``
    when the parsed date can be localized; otherwise that field is left
    unset.
    """
    # pytz zones must be attached with localize(); passing them through
    # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
    # (+08:06 for Asia/Shanghai) instead of +08:00.
    shanghai = pytz.timezone('Asia/Shanghai')
    for report in response.xpath(".//*[@id='ulNewsList']//li"):
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        item["report_link"] = report.xpath(".//@href").extract()[0]
        report_time = clean_text(
            report.xpath(".//*[@class='time']").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time.split(u":")[1]
            date, _precision = parse_date(item["report_revision_time"])
            try:
                # NOTE(review): assumes parse_date returns a naive
                # datetime (or None when parsing fails) - confirm.
                item["report_revision_time_standard"] = shanghai.localize(date)
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: keep the raw string when the date cannot
                # be localized.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
 def _parse_free(self, response):
     """Yield one free-report item for every ``<li>`` under ``class='tul3'``.

     ``response.meta`` must carry ``large_category``.  The report body is
     fetched through ``self.parseTimeContent`` and stored only when that
     call returns something other than None.  A timezone-aware revision
     time is stored when the parsed date can be localized.
     """
     # pytz zones must be attached with localize(); passing them through
     # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
     # (+08:06 for Asia/Shanghai) instead of +08:00.
     shanghai = pytz.timezone('Asia/Shanghai')
     for report in response.xpath(".//*[@class='tul3']//li"):
         item = IndustryReportSpiderItem()
         item["industry_large_category"] = response.meta["large_category"]
         item["report_name"] = clean_text(
             report.xpath(".//a//text()").extract()[0].strip())
         industry_small_chs_name = parseIndustryName(item["report_name"])
         if industry_small_chs_name is not None:
             item["industry_small_chs_name"] = industry_small_chs_name
         page_url = report.xpath(".//@href").extract()[0]
         item["report_link"] = page_url
         report_content = self.parseTimeContent(page_url)
         if report_content is not None:
             item["report_content"] = report_content
         report_time = clean_text(
             report.xpath(".//span//text()").extract()[0].strip())
         if report_time:
             item["report_revision_time"] = report_time
             date, _precision = parse_date(report_time)
             try:
                 # NOTE(review): assumes parse_date returns a naive
                 # datetime (or None when parsing fails) - confirm.
                 item["report_revision_time_standard"] = shanghai.localize(date)
             except (AttributeError, TypeError, ValueError, OverflowError):
                 # Best effort: keep the raw string when the date cannot
                 # be localized.
                 pass
         item["source_domain"] = self.allowed_domains[0]
         item["source_name"] = u"中国产业洞察网"
         item["price_free"] = True
         yield item
예제 #3
0
 def parse_item(self, response):
     """Build a single report item from a report detail page.

     Title, price text, page/graph counts, revision time and mid category
     are read from the page; the large category is hard-coded.  The page
     count, graph count, revision time and mid category are optional and
     are simply left unset when the page does not provide them.
     """
     item = IndustryReportSpiderItem()
     item['report_link'] = response.url
     item['source_name'] = u"艾瑞网"
     item['source_domain'] = self.allowed_domains[0]
     item['report_name'] = clean_text(
         response.xpath("//*[@class='content_title']/text()").extract()[0].strip())
     price = response.xpath(u"//*[contains(text(), '价格')]/text()").extract()[0]
     item['price_free'] = u"免费" in price

     # A plain u'' literal is used because the ur'' prefix is a syntax error
     # on Python 3; the pattern contains no backslashes.
     number = re.compile(u'([0-9]+)')
     infodatas = response.xpath(
         "//*[@class='content_titleinfoa']/span//text()").extract()
     for text in infodatas:
         if u"页数" in text:
             found = number.search(text)
             if found:
                 item['report_page_count'] = found.group(1)
         if u"图表" in text:
             found = number.search(text)
             if found:
                 item['report_graph_count'] = found.group(1)
         if u"-" in text:
             item['report_revision_time'] = text
             try:
                 parsed = parse_date(text)
             except Exception:
                 # Best effort: the exceptions parse_date may raise are not
                 # visible here, so keep only the raw string on failure.
                 parsed = None
             # NOTE(review): other callers unpack parse_date() as
             # (date, precision); store only the date part so this field
             # is not a tuple - confirm against parse_date.
             if isinstance(parsed, tuple):
                 parsed = parsed[0]
             if parsed is not None:
                 item['report_revision_time_standard'] = parsed

     item['industry_large_category'] = u"信息传输、软件和信息技术服务业"
     mid_category = response.xpath(
         "//*[@class='content_titleinfoa']//a/text()").extract()
     if mid_category:
         item['industry_mid_category'] = clean_text(mid_category[0].strip())

     # Disabled browser-driven download step, kept for reference:
     # if item['price_free']:
         # self.browser.get(response.url)
         # self.browser.find_element_by_xpath("//*[@class='download']/a").click()
         # WebDriverWait(self.browser, 20).until(EC.presence_of_element_located((By.XPATH, ".//*[@id='ButtonBox']/input")))
         # Confirm = self.browser.find_element_by_xpath(".//*[@id='ButtonBox']/input")
         # Confirm.click()
         # WebDriverWait(self.browser, 20).until(EC.staleness_of(Confirm))
         # if ".pdf" in self.browser.current_url:item['pdf_Link'] = self.browser.current_url
     return item
    def _parse_hg(self, response):
        """Yield one report item for every ``class='yahei f14'`` node.

        ``response.meta`` must carry ``large_category``.  Each report's
        link is passed to ``self.parseContent``; its ``"free"`` entry
        decides ``price_free`` and, for free reports, its ``"url"`` pair
        ``(kind, link)`` is stored as ``pdf_Link`` when ``kind`` is
        ``"pdf"`` and as ``content_Link`` otherwise.
        """
        # pytz zones must be attached with localize(); passing them through
        # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
        # (+08:06 for Asia/Shanghai) instead of +08:00.
        shanghai = pytz.timezone('Asia/Shanghai')
        for report in response.xpath(".//*[@class='yahei f14']"):
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["report_name"] = clean_text(
                report.xpath(".//a/text()").extract()[0].strip())
            page_url = report.xpath(".//a//@href").extract()[0]
            item["report_link"] = page_url
            report_time = clean_text(
                report.xpath(".//*[@name='deliveddate']/text()").extract()
                [0].strip())
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"国研网"
            # Parse the date only when a revision time was actually stored;
            # reading item["report_revision_time"] unconditionally raised
            # KeyError whenever the guard above it was false.
            if report_time is not None:
                item["report_revision_time"] = report_time
                # NOTE(review): assumes parse_date returns a naive
                # datetime - confirm.
                date, _precision = parse_date(report_time)
                item["report_revision_time_standard"] = shanghai.localize(date)

            content = self.parseContent(page_url)
            # Explicit comparison kept on purpose: parseContent's contract is
            # not visible here, and only a value equal to False marks the
            # report as paid.
            if content["free"] == False:  # noqa: E712
                item["price_free"] = False
            else:
                item["price_free"] = True
                link_kind = content["url"][0]
                if link_kind == "pdf":
                    item["pdf_Link"] = content["url"][1]
                else:
                    item["content_Link"] = content["url"][1]
            yield item
 def _parse_research(self, response):
     """Yield one paid-report item for every ``<li>`` under ``#ulNewsList``.

     ``response.meta`` must carry ``large_category`` and ``mid_category``;
     both are copied onto each item.  The revision time is the second
     ``:``-separated field of the extracted ``class='time'`` node.  A
     timezone-aware copy is stored as ``report_revision_time_standard``
     when the parsed date can be localized; otherwise that field is left
     unset.
     """
     # pytz zones must be attached with localize(); passing them through
     # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
     # (+08:06 for Asia/Shanghai) instead of +08:00.
     shanghai = pytz.timezone('Asia/Shanghai')
     for report in response.xpath(".//*[@id='ulNewsList']//li"):
         item = IndustryReportSpiderItem()
         item["industry_large_category"] = response.meta["large_category"]
         item["industry_mid_category"] = response.meta["mid_category"]
         item["report_name"] = clean_text(
             report.xpath(".//dt//text()").extract()[0].strip())
         industry_small_chs_name = parseIndustryName(item["report_name"])
         if industry_small_chs_name is not None:
             item["industry_small_chs_name"] = industry_small_chs_name
         item["report_link"] = report.xpath(".//@href").extract()[0]
         report_time = clean_text(
             report.xpath(".//*[@class='time']").extract()[0].strip())
         if report_time:
             item["report_revision_time"] = report_time.split(u":")[1]
             date, _precision = parse_date(item["report_revision_time"])
             try:
                 # NOTE(review): assumes parse_date returns a naive
                 # datetime (or None when parsing fails) - confirm.
                 item["report_revision_time_standard"] = shanghai.localize(date)
             except (AttributeError, TypeError, ValueError, OverflowError):
                 # Best effort: keep the raw string when the date cannot
                 # be localized.
                 pass
         item["source_domain"] = self.allowed_domains[0]
         item["source_name"] = u"中国产业洞察网"
         item["price_free"] = False
         yield item
 def _parse_free(self, response):
     """Yield one free-report item for every ``<li>`` under ``class='tul3'``.

     ``response.meta`` must carry ``large_category``.  The report body is
     fetched through ``self.parseTimeContent`` and stored only when that
     call returns something other than None.  A timezone-aware revision
     time is stored when the parsed date can be localized.
     """
     # pytz zones must be attached with localize(); passing them through
     # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
     # (+08:06 for Asia/Shanghai) instead of +08:00.
     shanghai = pytz.timezone('Asia/Shanghai')
     for report in response.xpath(".//*[@class='tul3']//li"):
         item = IndustryReportSpiderItem()
         item["industry_large_category"] = response.meta["large_category"]
         item["report_name"] = clean_text(
             report.xpath(".//a//text()").extract()[0].strip())
         industry_small_chs_name = parseIndustryName(item["report_name"])
         if industry_small_chs_name is not None:
             item["industry_small_chs_name"] = industry_small_chs_name
         page_url = report.xpath(".//@href").extract()[0]
         item["report_link"] = page_url
         report_content = self.parseTimeContent(page_url)
         if report_content is not None:
             item["report_content"] = report_content
         report_time = clean_text(
             report.xpath(".//span//text()").extract()[0].strip())
         if report_time:
             item["report_revision_time"] = report_time
             date, _precision = parse_date(report_time)
             try:
                 # NOTE(review): assumes parse_date returns a naive
                 # datetime (or None when parsing fails) - confirm.
                 item["report_revision_time_standard"] = shanghai.localize(date)
             except (AttributeError, TypeError, ValueError, OverflowError):
                 # Best effort: keep the raw string when the date cannot
                 # be localized.
                 pass
         item["source_domain"] = self.allowed_domains[0]
         item["source_name"] = u"中国产业洞察网"
         item["price_free"] = True
         yield item
예제 #7
0
    def _parse_item(self, response):
        """Yield report items for a listing page, then the next-page request.

        One item is produced per ``class='info'`` node.  ``response.meta``
        must carry ``large_category`` and ``mid_category``; both are copied
        onto every item and forwarded to the follow-up request.  Nothing is
        yielded (not even pagination) when the page lists no reports.
        """
        reports = response.xpath(".//*[@class='info']")
        if not reports:
            return

        # pytz zones must be attached with localize(); passing them through
        # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
        # (+08:06 for Asia/Shanghai) instead of +08:00.
        shanghai = pytz.timezone('Asia/Shanghai')
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath(".//h3//a/text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)

            # An XPath starting with "//" is evaluated against the whole
            # document even on a nested selector, so the original expression
            # handed every report the first 'rdate' of the page.  Look inside
            # this report first and fall back to the document-wide lookup
            # only when the report has no 'rdate' descendant.
            rdate = report.xpath(".//*[@class='rdate']//span/text()").extract()
            if not rdate:
                rdate = report.xpath(
                    " //*[@class='rdate']//span/text()").extract()
            temp = self.parseItem(clean_text(rdate[0].strip()))
            if len(temp) == 1:
                item["report_revision_time"] = temp[0][0]
                item["report_page_count"] = temp[0][1]
                item["report_graph_count"] = temp[0][2]
                # NOTE(review): assumes parse_date returns a naive
                # datetime - confirm.
                date, _precision = parse_date(item["report_revision_time"])
                item["report_revision_time_standard"] = shanghai.localize(date)

            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item

        if_nextpage = response.xpath(".//*[@class='zw']")
        if len(if_nextpage) > 0:
            # Pagination exists when the last text node is the "next page"
            # label; the last href is then the next page's link.
            if if_nextpage.xpath(".//text()").extract()[-1] == u'下一页':
                page_url = if_nextpage.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, page_url),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta["large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
예제 #8
0
    def _parse_item(self, response):
        """Yield report items for a listing page, then the next-page request.

        One item is produced per ``class='img_des'`` link.  ``response.meta``
        must carry ``large_category`` and ``mid_category``; both are copied
        onto every item and forwarded to the follow-up request.  The revision
        time comes from ``self.parseTime``; when that returns None the item
        is still yielded, just without the two revision-time fields.
        Nothing is yielded (not even pagination) when the page lists no
        reports.
        """
        reports = response.xpath(".//*[@class='img_des']/a")
        if not reports:
            return

        # pytz zones must be attached with localize(); passing them through
        # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
        # (+08:06 for Asia/Shanghai) instead of +08:00.
        shanghai = pytz.timezone('Asia/Shanghai')
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            report_time = self.parseTime(item["report_link"])
            # Parse the date only when a time was found; reading
            # item["report_revision_time"] unconditionally raised KeyError
            # (aborting the whole page) whenever parseTime returned None.
            if report_time is not None:
                item["report_revision_time"] = report_time
                # NOTE(review): assumes parse_date returns a naive
                # datetime - confirm.
                date, _precision = parse_date(report_time)
                item["report_revision_time_standard"] = shanghai.localize(date)
            item["price_free"] = False
            yield item

        page_links = response.xpath(".//*[@class='page']//@href").extract()
        if len(page_links) > 1:  # pagination exists
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()").
                extract()[0].strip())
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            if currentPage != totlePage:
                # The last pager link is followed as the next page.
                request = FormRequest(urljoin(self.base_url, page_links[-1]),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta["large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
예제 #9
0
 def report_para(self, item, report):
     """Fill the revision-time fields of ``item`` from ``report``'s parent.

     The second ``:``-separated field of the parent's ``class='sp1'`` text
     is used when it matches ``self.pattern``.  Otherwise the parent's
     ``class='main'`` text is split on its bracket markers and the segment
     following the first one that ends with the date label is used.  When
     neither source yields a value the item is left untouched.  A
     timezone-aware copy is stored as ``report_revision_time_standard``
     when the value can be parsed and localized.
     """
     revision_time = clean_text(
         report.xpath("..//*[@class='sp1']/text()").extract()[0]
         .split(u":")[1].strip())
     if self.pattern.match(revision_time):
         item["report_revision_time"] = revision_time
     else:
         textlst = report.xpath("../*[@class='main']/text()").extract()[0].replace(u"】 ", u"【").split(u"【")
         # Walk (label, value) neighbours.  The original index loop ran one
         # past the end of the list and also indexed text+1 unchecked, so a
         # missing or trailing date label raised IndexError.
         for label, value in zip(textlst, textlst[1:]):
             if label.endswith(u"日期"):
                 item["report_revision_time"] = clean_text(value.strip())
                 break
     try:
         date, _precision = parse_date(item["report_revision_time"])
         # pytz zones must be attached with localize(); replace(tzinfo=...)
         # silently uses the zone's LMT offset (+08:06) instead of +08:00.
         # NOTE(review): assumes parse_date returns a naive datetime - confirm.
         item["report_revision_time_standard"] = pytz.timezone('Asia/Shanghai').localize(date)
     except Exception:
         # Deliberately best effort: the revision time may be absent and the
         # exceptions parse_date can raise are not visible from here.
         pass
예제 #10
0
    def _parse_item(self, response):
        """Yield report items for a listing page, then the next-page request.

        One item is produced per ``class='info'`` node.  ``response.meta``
        must carry ``large_category`` and ``mid_category``; both are copied
        onto every item and forwarded to the follow-up request.  Nothing is
        yielded (not even pagination) when the page lists no reports.
        """
        reports = response.xpath(".//*[@class='info']")
        if not reports:
            return

        # pytz zones must be attached with localize(); passing them through
        # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
        # (+08:06 for Asia/Shanghai) instead of +08:00.
        shanghai = pytz.timezone('Asia/Shanghai')
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath(".//h3//a/text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)

            # An XPath starting with "//" is evaluated against the whole
            # document even on a nested selector, so the original expression
            # handed every report the first 'rdate' of the page.  Look inside
            # this report first and fall back to the document-wide lookup
            # only when the report has no 'rdate' descendant.
            rdate = report.xpath(".//*[@class='rdate']//span/text()").extract()
            if not rdate:
                rdate = report.xpath(
                    " //*[@class='rdate']//span/text()").extract()
            temp = self.parseItem(clean_text(rdate[0].strip()))
            if len(temp) == 1:
                item["report_revision_time"] = temp[0][0]
                item["report_page_count"] = temp[0][1]
                item["report_graph_count"] = temp[0][2]
                # NOTE(review): assumes parse_date returns a naive
                # datetime - confirm.
                date, _precision = parse_date(item["report_revision_time"])
                item["report_revision_time_standard"] = shanghai.localize(date)

            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item

        if_nextpage = response.xpath(".//*[@class='zw']")
        if len(if_nextpage) > 0:
            # Pagination exists when the last text node is the "next page"
            # label; the last href is then the next page's link.
            if if_nextpage.xpath(".//text()").extract()[-1] == u'下一页':
                page_url = if_nextpage.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, page_url),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta["large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
 def _parse_item(self, response):
     """Yield one paid-report item for every ``class='clistdl'`` node.

     ``response.meta`` must carry ``large_category`` and ``mid_category``.
     The small-industry name is derived from the ``title`` attribute under
     ``class='cxgrep'`` when one exists, otherwise from the report name.
     """
     # pytz zones must be attached with localize(); passing them through
     # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
     # (+08:06 for Asia/Shanghai) instead of +08:00.
     shanghai = pytz.timezone('Asia/Shanghai')
     for report in response.xpath(".//*[@class='clistdl']"):
         item = IndustryReportSpiderItem()
         item["industry_large_category"] = response.meta["large_category"]
         item["industry_mid_category"] = response.meta["mid_category"]
         item["report_name"] = clean_text(
             report.xpath(".//dt/a/text()").extract()[0].strip())
         # Evaluate the title lookup once instead of once for the test and
         # again for the value.
         titles = report.xpath(".//dd//*[@class='cxgrep']//@title").extract()
         if titles:
             industry = clean_text(titles[0].strip())
         else:
             industry = item["report_name"]
         industry_small_chs_name = parseIndustryName(industry)
         if industry_small_chs_name is not None:
             item["industry_small_chs_name"] = industry_small_chs_name
         item["report_link"] = report.xpath(".//@href").extract()[0]
         item["report_revision_time"] = clean_text(
             report.xpath(".//dt/span/text()").extract()[0].strip())
         item["source_domain"] = self.allowed_domains[0]
         item["source_name"] = u"中国报告大厅"
         # NOTE(review): assumes parse_date returns a naive datetime - confirm.
         date, _precision = parse_date(item["report_revision_time"])
         item["report_revision_time_standard"] = shanghai.localize(date)
         item["price_free"] = False
         yield item
예제 #12
0
    def _parse_item(self, response):
        """Yield report items for a listing page, then the next-page request.

        One item is produced per ``class='img_des'`` link.  ``response.meta``
        must carry ``large_category`` and ``mid_category``; both are copied
        onto every item and forwarded to the follow-up request.  The revision
        time comes from ``self.parseTime``; when that returns None the item
        is still yielded, just without the two revision-time fields.
        Nothing is yielded (not even pagination) when the page lists no
        reports.
        """
        reports = response.xpath(".//*[@class='img_des']/a")
        if not reports:
            return

        # pytz zones must be attached with localize(); passing them through
        # datetime.replace(tzinfo=...) silently uses the zone's LMT offset
        # (+08:06 for Asia/Shanghai) instead of +08:00.
        shanghai = pytz.timezone('Asia/Shanghai')
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            report_time = self.parseTime(item["report_link"])
            # Parse the date only when a time was found; reading
            # item["report_revision_time"] unconditionally raised KeyError
            # (aborting the whole page) whenever parseTime returned None.
            if report_time is not None:
                item["report_revision_time"] = report_time
                # NOTE(review): assumes parse_date returns a naive
                # datetime - confirm.
                date, _precision = parse_date(report_time)
                item["report_revision_time_standard"] = shanghai.localize(date)
            item["price_free"] = False
            yield item

        page_links = response.xpath(".//*[@class='page']//@href").extract()
        if len(page_links) > 1:  # pagination exists
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()").
                extract()[0].strip())
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            if currentPage != totlePage:
                # The last pager link is followed as the next page.
                request = FormRequest(urljoin(self.base_url, page_links[-1]),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta["large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request