Example #1
 def parse_news(self, response):
     classify, e_name = response.meta.get('info')
     articles = response.xpath(
         "//div[@class='largeTitle']//article[contains(@class,'js-article-item')]"
     )
     for article in articles:
         link_url = article.xpath(".//div[@class='textDiv']/a/@href").get()
         link_url = response.urljoin(link_url)
         if xredis.sismember('flashnews:' + e_name, get_md5(link_url)):
             continue
         xredis.sadd('flashnews:' + e_name, get_md5(link_url))
         title = article.xpath(".//div[@class='textDiv']/a/text()").get()
         source = article.xpath(
             ".//div[@class='textDiv']/span/span[1]/text()").get()
         source = source.replace('提供者', '').strip()  # drop the "提供者" ("provided by") label
         description = article.xpath(
             ".//div[@class='textDiv']/p/text()").getall()
         description = list_to_str(description)
         yield scrapy.Request(url=link_url,
                              callback=self.parse_time,
                              dont_filter=True,
                              meta={
                                  "info": (title, source, description,
                                           link_url, classify)
                              })
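These examples share a few helpers (get_md5, list_to_str, the xredis client) that the listing does not show. A minimal sketch of what they might look like, assuming xredis is a plain redis.Redis connection used as a set of already-seen article hashes; the connection details below are placeholders:

 import hashlib

 import redis

 # Assumed shared Redis client; host/port/db are placeholders.
 xredis = redis.Redis(host='localhost', port=6379, db=0)


 def get_md5(value):
     """Hex MD5 digest of a string, used as a compact dedup key."""
     return hashlib.md5(value.encode('utf-8')).hexdigest()


 def list_to_str(fragments):
     """Join the text fragments returned by getall() into one string."""
     return ''.join(fragment.strip() for fragment in fragments)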
Example #2
 def parse_list(self, response):
     classify = response.meta.get("info")
     articles = response.xpath("//div[@id='newslist']/a")
     for article in articles:
         article_url = response.urljoin(article.xpath("./@href").get())
         if xredis.sismember('flashnews:yicai_news', get_md5(article_url)):
             continue
         xredis.sadd('flashnews:yicai_news', get_md5(article_url))
         yield scrapy.Request(url=article_url,
                              callback=self.parse_detail,
                              dont_filter=True,
                              meta={"info": classify})
Example #3
 def parse(self, response):
     lis = response.xpath("//ul[@id='dfx-scrolled-block']//li")
     for li in lis:
         link_url = li.xpath(".//div[2]/h3/a/@href").get().strip()
         if xredis.sismember('flashnews:forex_news', get_md5(link_url)):
             continue
         xredis.sadd('flashnews:forex_news', get_md5(link_url))
         title = li.xpath(".//div[2]/h3/a/text()").get().strip()
         pub_time = li.xpath(".//div[1]/h3/text()").get()
         pub_time = parse_pub_time(pub_time)
         description = li.xpath(".//div[2]/div/text()").get().strip()
         yield scrapy.Request(
             url=link_url,
             callback=self.parse_content,
             dont_filter=True,
             meta={"info": (title, link_url, pub_time, description)})
Example #4
 def parse_next(self, response):
     stock_code, stock_market, stock_name = response.meta.get('info')
     time_list = timelist_conversion(
         response.xpath("//div[@class='datelist']/ul/text()").getall())
     articles = response.xpath("//div[@class='datelist']/ul//a")
     for index, article in enumerate(articles):
         title = article.xpath("./text()").get()
         link_url = response.urljoin(article.xpath("./@href").get())
         pub_time = time_list[index]
         link_url_md5 = get_md5(stock_code + link_url)
         # if filter_url('china_notice', link_url_md5):
         #     continue
         yield scrapy.Request(url=link_url,
                              callback=self.parse_vip_stock,
                              # dont_filter=True,
                              meta={
                                  "info": (stock_code, stock_market,
                                           stock_name, title, link_url,
                                           pub_time, link_url_md5)
                              })
Example #5
 def parse_next(self, response):
     stock_code, stock_market, stock_name = response.meta.get('info')
     lis = response.xpath("//ul[@id='js_ggzx']/li")
     for li in lis:
         title = li.xpath("./a/text()").get()
         if title is None:
             continue
         link_url = li.xpath("./a/@href").get()
         if filter_urlkey(link_url):  # skip special URLs
             continue
         pub_time = li.xpath("./span/text()").get()
         link_url_md5 = get_md5(stock_code + title)
         if filter_url('hk_news', link_url_md5):  # deduplicate
             continue
         # Dispatch to a host-specific parser based on the link's domain
         if 'vip.stock.finance.sina.com.cn' in link_url:
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_vip_stock,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
         if 'stock.finance.sina.com.cn' in link_url:
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_stock_finance,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
         if ('finance.sina.com.cn' in link_url
                 or 'cj.sina.com.cn' in link_url
                 or 'tech.sina.com.cn' in link_url):
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_finance_cj,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
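Examples #5–#7 also lean on filter_url and filter_urlkey, which are likewise not shown. Judging by the comments in Example #6 ("skip special URLs", "deduplicate"), they plausibly look like the sketch below; the keyword blacklist and the Redis key layout are assumptions, reusing the xredis client sketched after Example #1:

 def filter_urlkey(link_url):
     """Assumed helper: return True for special/unwanted links that should
     be skipped. The real keyword list is not shown in the listing."""
     blocked_keywords = ('guba.sina.com.cn', 'blog.sina.com.cn')  # placeholders
     return any(keyword in link_url for keyword in blocked_keywords)


 def filter_url(bucket, url_md5):
     """Assumed dedup helper: True if url_md5 was already recorded for this
     bucket, otherwise remember it and return False. The key layout mirrors
     the explicit sismember/sadd calls in Examples #1-#3."""
     key = 'flashnews:' + bucket
     if xredis.sismember(key, url_md5):
         return True
     xredis.sadd(key, url_md5)
     return False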
Example #6
 def parse_next(self, response):
     stock_code, stock_market, stock_name = response.meta.get('info')
     time_list = timelist_conversion(
         response.xpath("//div[@class='datelist']/ul/text()").getall())
     articles = response.xpath("//div[@class='datelist']/ul//a")
     for index, article in enumerate(articles):
         title = article.xpath("./text()").get()
         link_url = article.xpath("./@href").get()
         if filter_urlkey(link_url):  # skip special URLs
             continue
         pub_time = time_list[index]
         link_url_md5 = get_md5(stock_code + title)
         if filter_url('china_news', link_url_md5):  # deduplicate
             continue
         if 'vip.stock.finance.sina.com.cn' in link_url:
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_vip_stock,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
         if 'stock.finance.sina.com.cn' in link_url:
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_stock_finance,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
         if ('finance.sina.com.cn' in link_url
                 or 'cj.sina.com.cn' in link_url
                 or 'tech.sina.com.cn' in link_url):
             yield scrapy.Request(url=link_url,
                                  callback=self.parse_finance_cj,
                                  dont_filter=True,
                                  meta={
                                      "info": (stock_code, stock_market,
                                               stock_name, title, link_url,
                                               pub_time, link_url_md5)
                                  })
             continue
Example #7
 def parse_next(self, response):
     stock_code, stock_market, stock_name = response.meta.get('info')
     lis = response.xpath("//ul[@class='list01']/li")
     for li in lis:
         title = li.xpath("./a/text()").get()
         if title is None:
             continue
         pub_time = li.xpath("./span/text()").get()
         link_url = li.xpath("./a/@href").get()
         link_url_md5 = get_md5(stock_code + link_url)
         # if filter_url('hk_notice',link_url_md5):
         #     continue
         yield scrapy.Request(
             url=link_url,
             callback=self.parse_stock_finance,  # dont_filter=True,
             meta={
                 "info": (stock_code, stock_market, stock_name, title,
                          link_url, pub_time, link_url_md5)
             })