Example #1
 def parse(self, response):
     message_list = response.xpath('//ul[@class="flfg_03"]/li')
     for message in message_list:
         date = "".join(message.xpath('span/text()').extract())
         title = "".join(message.xpath('a/text()').extract()).replace("· ", "")
         href = "".join(message.xpath('a/@href').extract())
         try:
             date = datetime.datetime.strptime(str(date), '%Y-%m-%d')
         except Exception:
             # Fall back to the crawl time when the date string is malformed
             date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
         if "www.hainan.gov.cn" in href:
             url = href
         else:
             url = response.url + href  # .replace("./", "")
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=23).count()
         # Skip URLs already stored for this site, and the bare homepage link
         if not result and url != "http://www.hainan.gov.cn/":
             yield scrapy.Request(url=url, callback=self.get_detail,
                                  meta={"date": date, "title": title, "laiyuan": response.url})
Example #2
 def parse(self, response):
     # print(response.text)
     message_list = response.xpath('//*[@id="con"]/tr')
     for message in message_list:
         title = "".join(message.xpath('td[2]/a/text()').extract())
         href = "".join(message.xpath('td[2]/a/@href').extract())
         date = "".join(message.xpath('td[3]/text()').extract())
         date = date.replace(".", "-")
         # Skip blank rows: extract data only when a title was found
         if title != "":
             try:
                 date = datetime.datetime.strptime(
                     str(date).replace('/', '-'), '%Y-%m-%d')
                 # print(date)
             except Exception as e:
                 # print(e)
                 date = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
             # print(title, url, date)
             # Press-conference pages use a different layout, hence get_detail_fbh
             if "http://www.mlr.gov.cn" in href:
                 url = href.replace("index.htm", "")
                 # print(url)
                 result = session.query(NewsItemInfo).filter_by(
                     url=url, web_id=8).count()
                 if not result:  # skip URLs already stored for this site
                     yield scrapy.Request(url=url,
                                          callback=self.get_detail_fbh,
                                          meta={
                                              "date": date,
                                              "title": title,
                                              "laiyuan": response.url
                                          })
             else:
                 url = response.url + href
                 result = session.query(NewsItemInfo).filter_by(
                     url=url, web_id=8).count()
                 if not result:  # skip URLs already stored for this site
                     yield scrapy.Request(url=url,
                                          callback=self.get_detail,
                                          meta={
                                              "date": date,
                                              "title": title,
                                              "laiyuan": response.url
                                          })
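Every example in this collection repeats the same existence check before yielding a request. A sketch of a shared helper (hypothetical name `is_seen`; assumes the same SQLAlchemy `session` and `NewsItemInfo` model the examples use):

def is_seen(url, web_id):
    # True when this URL has already been stored for the given site,
    # so the spider can skip re-crawling it
    return session.query(NewsItemInfo).filter_by(url=url, web_id=web_id).count() > 0

With it, each dedup block collapses to `if not is_seen(url, web_id=8): yield scrapy.Request(...)`.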
Example #3
 def parse(self, response):
     message_list = response.xpath('//ul[@class="nr_neirong"]/li')
     # print(len(message_list))
     for message in message_list:
         title = "".join(message.xpath('span/a/text()').extract())
         href = "".join(message.xpath('span/a/@href').extract())
         date = "".join(message.xpath('font/text()').extract())
         if "http" in href:
             url = href
         else:
             url = response.url + href
         # print(title, url, date)
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         if ".pdf" not in str(url[:-5]).lower():
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=57).count()
             if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=url,
                                      callback=self.get_detail,
                                      meta={
                                          "title": title,
                                          "date": date,
                                          "laiyuan": response.url
                                      })
         else:
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=57).count()
             if not result:  # skip URLs already stored for this site
                 item = FagaiweiItem()
                 item["url"] = url
                 item["pub_time"] = date
                 item["title"] = title
                 item["content"] = "可能是图片或表格 打开原网站查看"
                 item["webname"] = "中国期货业协会"
                 item["web_id"] = 57
                 item["keyword"] = keyword.get_keyword(item["content"])
                 yield item
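The `.pdf` test in Example #3 slices characters off the end of the URL, which breaks as soon as a query string or fragment follows the file name. A hedged alternative (hypothetical helper) compares the path's extension instead:

import os
from urllib.parse import urlparse

def is_pdf(url):
    # Compare only the extension of the URL path, so query strings
    # and fragments cannot defeat the check
    return os.path.splitext(urlparse(url).path)[1].lower() == '.pdf'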
Example #4
 def parse(self, response):
     message_list = response.xpath('//ul[@class="list_02 clearfix"]/li')
     # print(len(message_list))
     for message in message_list:
         date = "".join(message.xpath('font/text()').extract())
         # print(date)
         title = "".join(message.xpath('a/text()').extract())
         # print(title)
         href = "".join(message.xpath('a/@href').extract())
         # print(href)
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         if "www.xinhuanet.com" in href:
             result = session.query(NewsItemInfo).filter_by(
                 url=href, web_id=2).count()
             if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=href,
                                      callback=self.get_detail,
                                      meta={
                                          "date": date,
                                          "title": title
                                      })
         elif href.endswith(".pdf"):
             pass
         else:
             url = response.url + href[2:]  # drop the leading "./" before joining
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=2).count()
             if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=url,
                                      callback=self.get_detail,
                                      meta={
                                          "date": date,
                                          "title": title,
                                          "laiyuan": response.url
                                      })
Example #5
 def parse(self, response):
     # print(response.text)
     message_list = response.xpath('//ul[@class="conList_ul"]/li|\
                                     //ul[@class="govpushinfo150203"]/li')
     # print(len(message_list))
     for message in message_list:
         title = "".join(message.xpath('a/text()').extract())
         href = "".join(message.xpath('a/@href').extract())
         date = "".join(message.xpath('span/text()').extract())
         # print(title, href, date)
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         if "http://fangtan.customs.gov.cn" in href:
             url = href
         else:
             url = "http://www.customs.gov.cn" + href
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=11).count()
         if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url=url,
                                  callback=self.get_detail,
                                  meta={
                                      "title": title,
                                      "date": date,
                                      "laiyuan": response.url
                                  })
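Examples #1 through #5 (and most of the ones below) inline the same strptime-with-fallback block. A consolidated sketch (hypothetical helper; on failure it falls back to the current time, exactly as the inline versions do):

import datetime
import time

def parse_date(raw, fmt='%Y-%m-%d'):
    # Normalize the separator, then parse; any failure falls back to the
    # crawl timestamp as a string, mirroring the inline try/except blocks
    try:
        return datetime.datetime.strptime(str(raw).strip().replace('/', '-'), fmt)
    except Exception:
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())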
Example #6
    def parse(self, response):
        # Collect the detail-page URLs from this listing page
        url = response.url
        orurl = 'http://www.xm.gov.cn/'
        contens_urls = response.xpath(
            "//div[@class='gl_list1']//li/a/@href").getall()
        name = response.xpath("//li[@class='on']/a/text()")[-1].get()

        for contens_url in contens_urls:
            if contens_url.startswith('./'):
                contens_url = contens_url.replace('./', '')
                contens_url = url + contens_url
            elif contens_url.startswith('../'):
                contens_url = contens_url.replace('../', '')
                contens_url = orurl + contens_url

            result = session.query(NewsItemInfo).filter_by(url=contens_url,
                                                           web_id=25).count()
            if not result:  # skip URLs already stored for this site
                yield scrapy.Request(url=contens_url,
                                     callback=self.parse_page,
                                     meta={
                                         "url": url,
                                         "name": name
                                     })
Example #7
    def process_file(self, response):
        # Handle detail pages that are files, e.g. company announcements
        # print('this is file !')
        for tmp in response.xpath('//*[@class="content"]/div[@ref]/div[2]'):
            # Build a fresh item per row; reusing one shared item would
            # overwrite fields of items still queued in the pipeline
            item = FagaiweiItem()
            item['web_id'] = 51
            item['url'] = tmp.xpath('./div[1]/a/@href').extract_first()
            item['title'] = tmp.xpath('./div[1]/a/text()').extract_first(
                default='')
            item['web'] = response.meta.get('web')
            item['keyword'] = ''
            item['webname'] = '阿斯达克新闻'
            item_time = tmp.xpath(
                './div[@class="newstime4"]/text()').extract_first(default='')
            item['pub_time'] = ' '.join(item_time.split()[1:]).replace(
                '/', '-')
            item['content'] = '这是一个文件,查看原文链接进行打开!!'  # "This is a file; open the original link to view it"

            result = session.query(NewsItemInfo).filter_by(url=item['url'],
                                                           web_id=51).count()
            if not result:  # skip URLs already stored for this site
                yield item
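A caveat worth calling out for Example #7: creating one `FagaiweiItem` outside the loop and mutating it on every pass lets items still queued in Scrapy's pipeline have their fields overwritten. The safe shape is one fresh item per iteration (a sketch; `rows` stands in for any selector list):

 for row in rows:
     item = FagaiweiItem()  # never reuse an item instance across yields
     item['url'] = row.xpath('./a/@href').extract_first()
     yield item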
Example #8
 def parse(self, response):
     urls = response.xpath("//span[@class='tit']/a/@href").getall()
     # print(urls)
     titles = response.xpath("//span[@class='tit']/a/text()").getall()
     times = response.xpath("//span[@class='time']/text()").getall()
     dabao = zip(urls, titles, times)
     for url, title, time1 in dabao:
         filename = re.findall(r'=(\d+)', url)[0]
         url2 = 'http://php.cnstock.com/news_new/index.php/api/fileview?ID=' + filename + '&db=txt'
         if url2[-4:] == '=txt':
             # print("==================================\n{}".format(durl))
             result = session.query(NewsItemInfo).filter_by(
                 url=url2, web_id=67).count()
             if result:
                 # print("TXT 文件地址: {} 存在".format(url2))
                 pass
             else:
                 content = txt.main(url=url2)
                 item['content'] = content
                 item['web_id'] = 67
                 item['title'] = title
                  # Strip the parentheses around the publication time; a local
                  # variable named `time` would shadow the time module
                  item['pub_time'] = time1.replace('(', '').replace(')', '')
                 item['webname'] = '中国证券网信息披露平台'
                 item['web'] = response.url
                 item['url'] = url2
                 item["keyword"] = keyword.get_keyword(item["content"])
                 yield item
Example #9
 def parse(self, response):
     doc = pq(response.text)
     keyword_list = doc('#topwords li a').text().split()
     url_list = doc('.news-list li')
     for content in url_list.items():
         webname = content('.account').text()
         web = response.url
         url = content('h3 a').attr('href')
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=13).count()
         if not result:  # skip URLs already stored for this site
             title = content('h3').text()
             timestamp = content('.s-p').attr('t')
             pub_time = datetime.datetime.fromtimestamp(
                 int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
             yield scrapy.Request(url,
                                  callback=self.process_detail,
                                  meta={
                                      'webname': webname,
                                      'web': web,
                                      'title': title,
                                      'pub_time': pub_time
                                  })
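`int(timestamp)` in Example #9 raises when the `t` attribute is absent. A guarded variant (a sketch, reusing the fallback style of the other examples):

         timestamp = content('.s-p').attr('t')
         if timestamp:
             pub_time = datetime.datetime.fromtimestamp(
                 int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
         else:
             pub_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())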
Example #10
 def parse(self, response):
     message_list = response.xpath(
         '//div[@class="wscn-tabs__content"]/div/div')
     # print(len(message_list))
     for message in message_list:
         # date = "".join(message.xpath('span/a/text()|span/text()').extract())
         # date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
         title = "".join(
             message.xpath('div/div/a[1]/text()').extract()).replace(
                 " ", "").replace("\n", "")
         href = "".join(message.xpath('div/div/a[1]/@href').extract())
         # print(title, href)
         if "http" in href:
             url = href
         else:
             url = "https://wallstreetcn.com" + href
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=46).count()
         if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url=url,
                                  callback=self.get_detail,
                                  meta={
                                      "title": title,
                                      "laiyuan": response.url
                                  })
Example #11
def url_fagaiwei(response):
    message_list = response.xpath('//ul[@class="list_02 clearfix"]/li')
    for message in message_list:
        date = "".join(message.xpath('font/text()').extract())
        title = "".join(message.xpath('a/text()').extract())
        href = "".join(message.xpath('a/@href').extract())
        if title != "":
            try:
                date = datetime.datetime.strptime(
                    str(date).replace('/', '-'), '%Y-%m-%d')
                # print(date)
            except Exception as e:
                # print(e)
                date = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            if "www.xinhuanet.com" in href:
                url = href
            elif href.endswith("pdf"):
                pass
            elif "http" in href:
                url = href
            else:
                url = response.url + href
            result = session.query(NewsItemInfo).filter_by(url=url,
                                                           web_id=2).count()
            if not result:  # skip URLs already stored for this site
                # Yield a fresh dict per item; mutating one shared dict would
                # clobber earlier results still held by the consumer
                yield {'date': date, 'url': url, 'title': title}
Example #12
 def parse(self, response):
     message_list = response.xpath(
         '//table[@class="sv_yh_14_30"]/tr/td/table/tr')
     # print(len(message_list))
     for message in message_list:
         title = "".join(message.xpath('td[2]/a/text()').extract())
         href = "".join(message.xpath('td[2]/a/@href').extract())
         date = "".join(message.xpath('td[3]/text()').extract())
         # print(title, href, date)
         date = date.replace('[', '').replace(']', '')
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         if href != "":
             url = response.url.replace("index.html", "") + href
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=9).count()
              if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=url,
                                      callback=self.get_detail,
                                      meta={
                                          "title": title,
                                          "date": date,
                                          "laiyuan": response.url
                                      })
Example #13
 def parse(self, response):
     pub_title = '大公报'
     data_title = ''.join(response.xpath("//div[@class='pannel_inner01']/div//text()").getall()) \
         .replace('\n', '')
     web2 = 'http://news.takungpao.com.hk/paper/{}.html'.format(
         time.strftime("%Y%m%d", time.localtime()))
     url2s = response.xpath("//a[@class ='bluelink']/text()").getall()
     for url2 in url2s:
         item = FagaiweiItem()
         match = re.search(r'第(\w+)版', url2)  # e.g. "第A1版" -> "A1"
         if not match:
             continue
         param = match.group(1)
         url = web2 + '?' + param
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=41).count()
         if not result:  # skip URLs already stored for this site
             item['url'] = url
             item['title'] = pub_title + data_title + param
             item['content'] = '该页面为电子版报纸请点原链接查看'  # "E-paper page; open the original link to view"
             item['web'] = response.url
             item['webname'] = pub_title
             item['pub_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime())
             item["keyword"] = keyword.get_keyword(item["content"])
             item['web_id'] = 41
             yield item
Example #14
    def parse(self, response):
        pub_title = '中国上市公司'
        title2 = response.xpath("//span[@class='title']/text()").get()
        webname = pub_title + title2
        urls = response.xpath(
            "//div[@class='list_data']//li/a/@onclick").getall()
        urla = response.url
        data = response.xpath("//span[@class='date']/text()").get()
        # Listing dates omit the year, so prepend it unless already present
        if not data.startswith('2018'):
            data = '2018-' + data
        for url in urls:
            url = 'http://www.cnlist.com' + url.replace("OpenDetail('", '') \
                .replace(',', '?id=').replace("');", '').replace(' ', '').replace("'", "")

            result = session.query(NewsItemInfo).filter_by(url=url,
                                                           web_id=64).count()
            if not result:  # skip URLs already stored for this site
                yield scrapy.Request(url=url,
                                     callback=self.parse_page,
                                     meta={
                                         'url': urla,
                                         'webname': webname,
                                         'data': data
                                     })
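Example #14 rebuilds the detail URL from an onclick attribute of the form OpenDetail('<path>', '<id>'); with five chained replace calls. A hedged regex alternative (hypothetical helper; assumes exactly that onclick shape):

import re

def onclick_to_url(onclick, base='http://www.cnlist.com'):
    # Capture the two OpenDetail arguments in one pass instead of
    # stripping the wrapper text piece by piece
    m = re.match(r"OpenDetail\('\s*([^']+?)\s*',\s*'?\s*([^')]+?)\s*'?\);", onclick)
    if not m:
        return None
    path, doc_id = m.groups()
    return '{}{}?id={}'.format(base, path, doc_id)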
Example #15
 def parse_tz(self, response):
     message_list = response.xpath('//tbody/tr')
     # print(len(message_list))
     for message in message_list:
         title = "".join(message.xpath('td/a/text()').extract())
         href = "".join(message.xpath('td/a/@href').extract())
         date = "".join(message.xpath('td[2]/text()').extract())
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         # print(title, href, date)
         url = href
         if ".pdf" not in str(url[:-5]).lower():
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=57).count()
             if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=url,
                                      callback=self.get_detail,
                                      meta={
                                          "title": title,
                                          "date": date,
                                          "laiyuan": response.url
                                      })
Example #16
    def parse(self, response):
        message_list = response.xpath('//div[@class="lie_main_m"]/ul/li|//div[@class="lie_main_m"]/li')
        for message in message_list:
            title = "".join(message.xpath('a/text()').extract())
            # keyword = "".join(message.xpath('span/text()').extract())
            href = "".join(message.xpath('a/@href').extract())
            date = "".join(message.xpath('a/span/text()').extract())
            # print(href)
            try:
                date = datetime.datetime.strptime(str(date).replace('/', '-'), '%Y-%m-%d')
                # print(date)
            except Exception as e:
                # print(e)
                date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

            url = response.url + href
            # print(title, date, url)
            result = session.query(NewsItemInfo).filter_by(url=url, web_id=20).count()
            if not result:  # skip URLs already stored for this site
                yield scrapy.Request(url=url, callback=self.get_detail,
                                     meta={"date": date,
                                           "title": title.replace("\r", "").replace("\n", "").replace("\t", ""),
                                           "laiyuan": response.url})
Example #17
 def parse(self, response):
     message_list = response.xpath('//div[@class="inner"]/div[2]/ul/li')
     # print(len(message_list))
     for message in message_list:
         title = "".join(message.xpath('a/text()').extract())
         href = "".join(message.xpath('a/@href').extract())
         date = "".join(message.xpath('span/text()').extract())
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         url = "http://www.forestry.gov.cn" + href
         # print(title, url, date)
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=15).count()
         if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url=url,
                                  callback=self.get_detail,
                                  meta={
                                      "title": title,
                                      "date": date,
                                      "laiyuan": response.url
                                  })
Example #18
 def parse(self, response):
     json_str = json.loads(response.text)
     urls = jsonpath.jsonpath(json_str, '$..url')
     pub_times = jsonpath.jsonpath(json_str, '$..dateTime')
     titles = jsonpath.jsonpath(json_str, '$..title')
     contents = jsonpath.jsonpath(json_str, '$..description')
     dabao = zip(urls, titles, pub_times, contents)
     for url, title, pub_time, content in dabao:
         result = session.query(NewsItemInfo).filter_by(url=url,
                                                        web_id=44).count()
         if not result:  # skip URLs already stored for this site
             item = FagaiweiItem()
             item['webname'] = '央视网'
             item['web'] = 'http://news.cctv.com/'
             item['url'] = url
             item['pub_time'] = pub_time
             item['content'] = content
             item['keyword'] = ''
             item['title'] = title
             item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime())
             item['web_id'] = 44
             yield item
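A caution that applies to this example and to Examples #21 and #28: the classic jsonpath module returns False, not an empty list, when an expression matches nothing, and zip() then raises TypeError. A defensive sketch (hypothetical wrapper):

def safe_jsonpath(obj, expr):
    # jsonpath.jsonpath returns False on no match; normalize to a list
    result = jsonpath.jsonpath(obj, expr)
    return result if result else []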
Example #19
 def parse(self, response):
     message_list = response.xpath('//ul/li')
     # print(len(message_list))
     for message in message_list:
         href = "".join(message.xpath('a/@href').extract())
         title = "".join(message.xpath('a/text()').extract())
         date = "".join(message.xpath('span/text()').extract())
         if date != "":
             if "http" in href.lower():
                 url = href
             else:
                 url = "http://www.cea.gov.cn" + href
             date = date.replace('[', '').replace(']', '')
             # print(date)
             try:
                 date = datetime.datetime.strptime(
                     str(date).replace('/', '-'), '%Y-%m-%d %H:%M:%S')
                 # print(date)
             except Exception as e:
                 # print(e)
                 date = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
             result = session.query(NewsItemInfo).filter_by(
                 url=url, web_id=7).count()
              if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url=url,
                                      callback=self.get_detail,
                                      meta={
                                          "date": date,
                                          "title": title,
                                          "laiyuan": response.url
                                      })
Example #20
 def parse_2(self, response):
     message_list = response.xpath(
         '//div[@class="show_body clearfix"]/div[1]/ul/li')
     for message in message_list:
         title = "".join(message.xpath('a/text()').extract())
         href = "".join(message.xpath('a/@href').extract())
         date = "".join(message.xpath('span/text()').extract())
         date = date.replace("年", "-").replace("月", "-").replace("日", "")
         # print(date)
         try:
             date = datetime.datetime.strptime(
                 str(date).replace('/', '-'), '%Y-%m-%d %H:%M')
             # print(date)
         except Exception as e:
             # print(e)
             date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
         # print(title, href, date)
         result = session.query(NewsItemInfo).filter_by(url=href,
                                                        web_id=49).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url=href,
                                  callback=self.get_detail_2,
                                  meta={
                                      "title": title,
                                      "date": date,
                                      "laiyuan": response.url
                                  })
Example #21
 def parse(self, response):
      seen_urls = set()
      json_str = json.loads(response.text)
      urls = jsonpath.jsonpath(json_str, '$..url')
      nodeIds = jsonpath.jsonpath(json_str, '$..nodeId')
      titles = jsonpath.jsonpath(json_str, '$..title')
      datas = jsonpath.jsonpath(json_str, '$..date')
      dabaos = zip(urls, nodeIds, titles, datas)
      # Add unwanted nodeIds here to filter out that whole category of news
      not_node = [1016, 368583, 174585]
      for url, node, title, data in dabaos:
          # print(url)
          if url in seen_urls or int(node) in not_node:
              continue
          seen_urls.add(url)
          result = session.query(NewsItemInfo).filter_by(
              url=url, web_id=33).count()
          if not result:  # skip URLs already stored for this site
              yield scrapy.Request(url=url,
                                   callback=self.parse_page,
                                   meta={
                                       'title': title,
                                       'data': data
                                   })
Example #22
 def parse_js(self, response):
     url_list = re.findall(r'"url"\s?:\s?"(.*?)"', response.text)
     for url in url_list:
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=36).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url, callback=self.process_detail, meta={'web': response.url})
Example #23
 def parse(self, response):
     url_list = response.xpath('//ul[@data-client="scroll"]/li/a/@href | //ul[@class="list_009"]/li/a/@href ').extract()
     for url in url_list:
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=42).count()
         if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url, callback=self.process_detail, meta={'web': response.url})
Example #24
 def parse(self, response):
     url_list = response.xpath('//div[@class="newslist"]/ul/li/div[1]/a/@href').extract()
     for url in url_list:
         new_url = 'http://www.acbgg.com' + url
         result = session.query(NewsItemInfo).filter_by(url=new_url, web_id=83).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request(new_url, callback=self.process_detail, meta={'web': response.url})
Example #25
 def parse(self, response):
     url_list = response.xpath('//*[@id="ent0"]/li//div[@class="news_title"]/em/a/@href').extract()
     # print(url_list)
     for url in url_list:
         url = 'https:' + url  # scheme-relative hrefs; prepend the scheme once
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=36).count()
         if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url, callback=self.process_detail, meta={'web': response.url})
Example #26
 def parse(self, response):
     url_list = response.xpath('//*[@id="mainlist"]/ul/li/p/a/@href|//ul[@id="idData"]/li/p[1]/a/@href').extract()
     for url in url_list:
         if '1' in url:
             result = session.query(NewsItemInfo).filter_by(url=url, web_id=61).count()
              if not result:  # skip URLs already stored for this site
                 yield scrapy.Request(url, callback=self.process_detail, meta={'web': response.url})
Example #27
 def parse(self, response):
      # Extract the article links
     url_list = response.xpath('//div[@class="ct_b_l_list"]//a[@class="ct_b_l_l_tb_tltie"]/@href').extract()
     for url in url_list:
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=60).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url, callback=self.process_detail, meta={"web": response.url})
Example #28
def parse_juchao(response, item):
    PUB_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse/bulletin_detail/true/'
    D_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse/download/'
    json_str = json.loads(response.text)
    urls = jsonpath.jsonpath(json_str, "$..announcementId")
    title1 = jsonpath.jsonpath(json_str, "$..secCode")
    title2 = jsonpath.jsonpath(json_str, "$..secName")
    title3 = jsonpath.jsonpath(json_str, "$..announcementTitle")
    timestamp = jsonpath.jsonpath(json_str, "$..announcementTime")
    pdf = jsonpath.jsonpath(json_str, "$..adjunctUrl")
    if title2 is None:
        title2 = ''
        titles = zip(title1, title3)
    else:
        titles = zip(title1, title2, title3)

    url_contents = zip(urls, titles, timestamp, pdf)
    for url, title, time_local, pdf in url_contents:
        # item = {}
        # Drop None fields, then remove characters that are illegal in
        # Windows file names before using the title as a file name
        title = ' '.join(t for t in title if t)
        title = title.replace('*', '').replace('/', '').replace('<', '').replace('>', '') \
            .replace('|', '').replace(':', '').replace('"', '').replace('?', '') \
            .replace('？', '')

        durl = D_URL + url  # PDF download URL
        if pdf[-4:] == '.PDF':
            # print("==================================\n{}".format(durl))
            result = session.query(NewsItemInfo).filter_by(url=PUB_URL + url,
                                                           web_id=56).count()
            if not result:  # skip announcements already stored for this site
                contents = pdf_to_txt.main(url=durl, fileName=title)
                if len(contents) == 0:
                    item['content'] = '请点击原文链接查看'  # "Please open the original link to view"
                else:
                    item['content'] = '\n'.join(list(contents))

                # announcementTime is a millisecond timestamp
                item['pub_time'] = datetime.datetime.fromtimestamp(
                    int(time_local) / 1000).strftime('%Y-%m-%d %H:%M:%S')
                item['webname'] = '巨潮资讯'
                item['web'] = response.url[0:-7]
                item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                item["keyword"] = keyword.get_keyword(item["content"])
                item['web_id'] = 56
                item['title'] = title
                item['url'] = PUB_URL + url
                yield item
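The long replace chain above strips characters that are illegal in file names before the title is used as one. A compact single-pass sketch (hypothetical helper; the character set mirrors the chain above):

import re

def sanitize_filename(name):
    # Remove Windows-illegal file name characters plus the
    # full-width question mark in one regex substitution
    return re.sub(r'[*/<>|:"?？]', '', name)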
Example #29
 def parse(self, response):
     url_list = re.findall(r'(/article.*?html)', response.text)
     for url in url_list:
         url = url.replace('\\', '')
         result = session.query(NewsItemInfo).filter_by(
             url='http://kan.china.com' + url, web_id=34).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request('http://kan.china.com' + url,
                                  callback=self.process_detail)
Example #30
 def parse(self, response):
     urls = response.xpath("//ul[contains(@class,'article-mini')]//li/a/@href|"
                           "//ul[@class='nf-list']//a/@href").getall()
     urla = response.url
     for url in urls:
         # print("{}+++++++++++++++{}".format(urla, url))
         result = session.query(NewsItemInfo).filter_by(url=url, web_id=66).count()
          if not result:  # skip URLs already stored for this site
             yield scrapy.Request(url=url, callback=self.parse_page, meta={'url': urla})