Example #1
 def parse(self, response):
     start_url = response.url
     try:
         data = htmlparser.Parser(response.body.decode(self.encoding))
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.xpathall('''//ul[@class="list_009"]/li''')
     # for org in org_list[:5]:
     for org in org_list:
         if org:
             title = org.xpath('''//a/text()''').text().strip()
             # Normalize a Chinese date such as "5月12日 08:30" ('月' = month,
             # '日' = day) into "5-12 08:30", then pull it out with a regex.
             ctime = org.xpath('''//span/text()''').replace(
                 u'月', '-').replace(u'日', '').regex(
                     r'(\d+-\d+ \d+:\d+)').text().strip()
             # The listing omits the year, so prepend the current one.
             y_ctime = datetime.datetime.now().strftime("%Y")
             ctime = y_ctime + '-' + ctime
             c_time = ctime
             org_url = org.xpath('''//a/@href''').text().strip()
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={
                                          'item': item,
                                          "c_time": c_time
                                      },
                                      headers=self.headers,
                                      dont_filter=True)
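This and the later examples call `local_timestamp`, a helper that is not shown in these excerpts. Judging by how the string is assembled above, it plausibly converts a local "%Y-%m-%d %H:%M" string into epoch seconds; a minimal sketch under that assumption:

    import time

    def local_timestamp(ctime):
        # Hypothetical stand-in for the spiders' helper (assumption): parse a
        # local "YYYY-MM-DD HH:MM" string and return an integer Unix timestamp.
        return int(time.mktime(time.strptime(ctime, '%Y-%m-%d %H:%M')))

The real helper may accept other formats as well (the `newsTime` and `publish_time` fields in the JSON examples below could carry seconds), so treat this purely as an illustration.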
Example #2
 def parse(self, response):
     start_url = response.url
     try:
         data = htmlparser.Parser(response.body.decode(self.encoding))
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.xpathall('''//div[@id="newslist"]/a''')
     # for org in org_list[:5]:
     for org in org_list:
         if org:
             title = org.xpath('''//h2/text()''').text().strip()
             ctime = org.xpath('''//div[@class="author"]/span''').regex(
                 r'(\d+-\d+-\d+ \d+:\d+)').text().strip()
             org_url = org.xpath('''//@href''').text().strip()
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={'item': item},
                                      headers=self.headers,
                                      dont_filter=True)
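All of these `parse` methods resolve the scraped `href` against the page URL with the standard-library `urljoin`, which handles root-relative, page-relative, and already-absolute links uniformly; for reference (the base URL is a stand-in):

    from urllib.parse import urljoin

    base = 'https://example.com/news/list.html'
    print(urljoin(base, '/a/1.html'))            # https://example.com/a/1.html
    print(urljoin(base, 'a/1.html'))             # https://example.com/news/a/1.html
    print(urljoin(base, 'https://other.com/x'))  # https://other.com/x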
Example #3
 def parse(self, response):
     start_url = response.url
     try:
         # JSONP response: strip line breaks, then take the JSON payload
         # between the callback's outermost parentheses.
         resp = re.sub('\r|\n|\r\n', '',
                       response.body.decode(self.encoding))
         p_data = re.findall(r'\((.*)\)', resp)[0]
         data = json.loads(p_data)
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.get('data') or []  # guard against a missing "data" key
     for org in org_list:
         if org:
             title = org.get('title')
             ctime = org.get('newsTime')
             org_url = org.get('url')
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={'item': item},
                                      headers=self.headers,
                                      dont_filter=True)
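This variant scrapes a JSONP endpoint: the body arrives as `callback({...})` rather than bare JSON, so the code strips line breaks and takes everything between the outermost parentheses before `json.loads`. A standalone illustration of that unwrapping step, with an invented payload:

    import json
    import re

    resp = 'cb({"data": [{"title": "t", "newsTime": "2024-05-12 08:30", "url": "/a/1.html"}]})'
    payload = re.findall(r'\((.*)\)', resp)[0]  # greedy, so it spans to the last ')'
    print(json.loads(payload)['data'][0]['title'])  # -> t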
Example #4
 def parse(self, response):
     start_url = response.url
     try:
         data = json.loads(response.body.decode(self.encoding))
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.get('data') or []  # guard against a missing "data" key
     for org in org_list:
         if org:
             title = org.get('title')
             ctime = org.get('publish_time')
             org_url = org.get('vurl')
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={'item': item},
                                      headers=self.headers,
                                      dont_filter=True)
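Here the endpoint returns plain JSON, so no unwrapping is needed. The minimal payload shape this `parse` expects can be read off its `.get` calls; the values below are placeholders:

    sample = {
        "data": [
            {
                "title": "some headline",
                "publish_time": "2024-05-12 08:30",
                "vurl": "/news/detail_1.html",
            }
        ]
    }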
Example #5
    def detail_parse(self, response):
        item = response.meta['item']
        try:
            data = htmlparser.Parser(response.body.decode(self.encoding))
        except Exception as e:
            print('second response failed %s' % e)
            return
        url = response.url
        ctime1 = data.xpath(
            '''//p[@class="Wh"]/span[1]/text()''').text().strip()
        ctime2 = data.xpath(
            '''//p[@class="Wh"]/span[2]/text()''').text().strip()
        ctime = ctime1 + ' ' + ctime2
        ctime = local_timestamp(ctime)
        contents = []  # all of the article's text content
        content_list = data.xpathall('''//div[@class="detailCont"]''')
        for con in content_list:
            con = con.text().strip()
            if con:
                contents.append(con)
        content_x = data.xpath('''//div[@class="detailCont"]''').data  # serialized HTML of the body
        content_xml = content_x  # untouched copy of the markup (not used further in this excerpt)
        label = {}
        img_list = data.xpathall('''//div[@class="detailCont"]//img''')
        if img_list:
            for count, image in enumerate(img_list):
                image_dict = {}
                image_url = image.xpath('//@src').text().strip()
                if image_url:
                    image_url = urljoin(url, image_url)
                    node = '#image{}#'.format(count)
                    file_name = image_url.split('/')[-1]
                    image_dict['url'] = image_url
                    image_dict['name'] = ''
                    image_dict['file_name'] = file_name
                    label[node] = image_dict

        table_list = data.xpathall('''//div[@class="detailCont"]//table''')
        if table_list:
            for count, table in enumerate(table_list):
                table_dict = {}
                node = "#table{}#".format(count)
                table_sele = table.data
                table_dict['table_xml'] = table_sele
                node_p = "<p>" + node + "</p>"
                content_x = content_x.replace(table_sele, node_p)
                label[node] = table_dict
        xml = htmlparser.Parser(content_x)
        web_contents = []  # content for direct web display (tables replaced with placeholder nodes)
        content_list = xml.xpathall('''//p''')
        for con in content_list:
            con = con.text().strip()
            if con:
                web_contents.append(con)
        breadcrumb = [
            "首页",
            "宏观",
        ]  # navigation trail: "首页" = Home, "宏观" = Macro
        article_info = {}
        channel = '宏观'  # the "Macro" channel
        accessory = []  # attachments
        # all_acc = data.xpathall('''//div[@class="ewb-info-con"]//a''')
        # if all_acc:
        #     for acc in all_acc:
        #         temp = {}
        #         acc_url = acc.xpath('//@href').text().strip()
        #         if acc_url and '@' not in acc_url:
        #             acc_url = urljoin(url, acc_url)
        #             name = acc.text().strip()
        #             file_name = acc_url.split('/')[-1].split('=')[-1]
        #             temp['url'] = acc_url
        #             temp['name'] = name
        #             temp['file_name'] = file_name
        #             dir_path = os.path.join(self.ori_path, self.dir_name)
        #             if not os.path.isdir(dir_path):
        #                 os.makedirs(dir_path)
        #             path = os.path.join(dir_path, file_name)
        #             dow_img_acc(path, acc_url)
        #             # file_content = parse_main(path)
        #             temp['file_content'] = '' # file_content
        #             accessory.append(temp)
        gtime = int(time.time())
        main_business = ''
        source = data.xpath(
            '''//span[@class="baodao"]/text()''').text().strip()
        webname = '21经济网'  # site name: 21jingji
        domain = self.allowed_domains[0]
        uid = add_uuid(url)
        item["ctime"] = ctime  #
        item["collection_name"] = "news_finance_21jingji_raw"  # 集合名
        item["url"] = url  # 链接
        item["uid"] = uid  # 去重id
        item["contents"] = contents  # 数据处理的内容
        item["web_contents"] = web_contents  # 前端使用的内容
        item["article_info"] = article_info  # 文章的相关信息
        item["label"] = label  # 图片、表格
        item["accessory"] = accessory  # 附件
        item["gtime"] = gtime  # 爬虫时间
        item['breadcrumb'] = breadcrumb  # 导航
        item['channel'] = channel  # 频道
        item["spider_name"] = self.name  # 爬虫名
        item["webname"] = webname  # 网站名
        item["domain"] = domain  # 域名
        item["source"] = source  # 来源
        item["main_business"] = main_business  # 相关行业
        item['path'] = ''  # 附件路径
        yield item
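The distinctive step in `detail_parse` is the placeholder substitution: each `<table>` inside `div.detailCont` is cut out of the serialized HTML and replaced by a `<p>#tableN#</p>` node, with the original markup kept in `label` so a front end can re-expand it (images get `#imageN#` entries in `label` but, in this excerpt, are not substituted into the text). A self-contained sketch of the same string-replacement idea, using lxml instead of the custom `htmlparser` wrapper; the sample HTML is invented:

    from lxml import html

    raw = '<div class="detailCont"><p>intro</p><table><tr><td>1</td></tr></table></div>'
    doc = html.fromstring(raw)

    label = {}
    content_x = html.tostring(doc, encoding='unicode')
    for count, table in enumerate(doc.xpath('//table')):
        node = '#table{}#'.format(count)
        table_xml = html.tostring(table, encoding='unicode', with_tail=False)
        # Swap the literal table markup for a placeholder paragraph, keeping
        # the original XML so the table can be restored later.
        content_x = content_x.replace(table_xml, '<p>{}</p>'.format(node))
        label[node] = {'table_xml': table_xml}

    print(content_x)  # <div class="detailCont"><p>intro</p><p>#table0#</p></div>

Matching on the serialized string (rather than mutating the tree) mirrors what the spider itself does with `content_x.replace(table_sele, node_p)`.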