Exemplo n.º 1
0
 def process_item(self, item, spider):
     """Stamp a Sina individual-stock item, send it to Kafka, mark it in redis.

     The item receives ``source``, ``type``, ``id`` and ``scope`` fields, is
     serialized with the news Avro schema and sent to ``kafkaTopic.news``.
     Its URL is then added to ``RedisKeys.sina_individual_crawled`` via
     ``zadd`` together with the first ten characters of ``pub_date`` with
     the dashes removed.

     ``spider`` is not used. Returns the same ``item`` object.
     """
     # Bookkeeping fields written onto the item before serialization.
     item["source"] = SpiderSourceName.sina
     item["type"] = SpiderSourceCode.individual_stock
     item["id"] = shortuuid.uuid()
     item["scope"] = u"个股"

     news_schema = AvroUtils.getNewsSchema()
     payload = AvroUtils.createAvroMemoryRecord(item, news_schema)
     kafka_producer.send(kafkaTopic.news, value=payload)
     # NOTE(review): presumably time.sleep -- pauses one second per item.
     sleep(1)

     log_line = "send data to kafka, from " + item["source"]
     log_line = log_line + " , url: " + item["url"]
     self.logger.info(log_line)

     redis_conn = get_redis_conn()
     crawled_url = item["url"]
     # e.g. "2016-05-15 10:00" -> "20160515" (first ten chars, dashes dropped)
     compact_date = item['pub_date'][:10].replace("-", "")
     redis_conn.zadd(RedisKeys.sina_individual_crawled, crawled_url,
                     compact_date)
     return item
Exemplo n.º 2
0
 def process_item(self, item, spider):
     """Stamp a Baidu stock-opinion item and record its pub_date in redis.

     The item receives ``id``, ``source`` and ``type`` fields. Its
     ``pub_date`` is then passed twice to ``zadd`` on the key
     ``RedisKeys.baidu_opinion_crawled`` suffixed with ``str(item["flag"])``.

     ``spider`` is not used. Returns the same ``item`` object.
     """
     item["id"] = shortuuid.uuid()
     item["source"] = SpiderSourceName.baidu
     item["type"] = SpiderSourceCode.baidu_stock_opinion

     # Kafka publishing is disabled in this pipeline (kept for reference):
     #contents=AvroUtils.createAvroMemoryRecord(item,AvroUtils.getConsensusSchema())
     #kafka_producer.send(kafkaTopic.consensus,value=contents)

     # NOTE(review): the message still says "send data to kafka" although
     # the send above is commented out.
     log_line = "send data to kafka, from " + item["source"]
     log_line = log_line + " , batch: " + str(item["batch"])
     self.logger.info(log_line)
     #sleep(1)

     redis_conn = get_redis_conn()
     redis_key = RedisKeys.baidu_opinion_crawled + str(item["flag"])
     pub_date = item['pub_date']
     redis_conn.zadd(redis_key, pub_date, pub_date)
     return item
Exemplo n.º 3
0
 def process_item(self, item, spider):
     """Stamp a Dongfang news item, send it to Kafka, mark its URL in redis.

     The item receives ``source``, ``type``, ``id`` and ``scope`` fields and
     empty ``code``/``name`` strings, is serialized with the news Avro
     schema and sent to ``kafkaTopic.news``. Its URL is then added to
     ``RedisKeys.dongfang_url_crawl`` via ``zadd`` together with the first
     ten characters of ``pub_date`` with the dashes removed.

     ``spider`` is not used. Returns the same ``item`` object.
     """
     # Bookkeeping fields written onto the item before serialization.
     item["source"] = SpiderSourceName.dongfang
     item["type"] = SpiderSourceCode.dongfang
     item["id"] = shortuuid.uuid()
     item["scope"] = u"新闻"
     item['code'] = ""
     item['name'] = ""

     news_schema = AvroUtils.getNewsSchema()
     payload = AvroUtils.createAvroMemoryRecord(item, news_schema)
     self.logger.info("AvroUtils successful!")
     kafka_producer.send(kafkaTopic.news, value=payload)
     # NOTE(review): presumably time.sleep -- pauses one second per item.
     sleep(1)

     log_line = "send data to kafka, from " + item["source"]
     log_line = log_line + " , url: " + item["url"]
     self.logger.info(log_line)

     redis_conn = get_redis_conn()
     crawled_url = item["url"]
     # First ten characters of pub_date with the dashes dropped.
     compact_date = item['pub_date'][:10].replace("-", "")
     redis_conn.zadd(RedisKeys.dongfang_url_crawl, crawled_url,
                     compact_date)
     return item
Exemplo n.º 4
0
    def process_item(self, item, spider):
        """Stamp a Baidu stock-opinion item and record its pub_date in redis.

        The item receives ``id``, ``source`` and ``type`` fields. Its
        ``pub_date`` is then passed twice to ``zadd`` on the key
        ``RedisKeys.baidu_opinion_crawled`` suffixed with
        ``str(item["flag"])``.

        ``spider`` is not used. Returns the same ``item`` object.
        """
        item["id"] = shortuuid.uuid()
        item["source"] = SpiderSourceName.baidu
        item["type"] = SpiderSourceCode.baidu_stock_opinion

        # Kafka publishing is disabled in this pipeline (kept for reference):
        #contents=AvroUtils.createAvroMemoryRecord(item,AvroUtils.getConsensusSchema())
        #kafka_producer.send(kafkaTopic.consensus,value=contents)

        # NOTE(review): the message still says "send data to kafka" although
        # the send above is commented out.
        log_line = "send data to kafka, from " + item["source"]
        log_line = log_line + " , batch: " + str(item["batch"])
        self.logger.info(log_line)
        #sleep(1)

        redis_conn = get_redis_conn()
        redis_key = RedisKeys.baidu_opinion_crawled + str(item["flag"])
        pub_date = item['pub_date']
        redis_conn.zadd(redis_key, pub_date, pub_date)
        return item
Exemplo n.º 5
0
    def process_item(self, item, spider):
        """Stamp a Sina individual-stock item, send it to Kafka, mark it in redis.

        The item receives ``source``, ``type``, ``id`` and ``scope`` fields,
        is serialized with the news Avro schema and sent to
        ``kafkaTopic.news``. Its URL is then added to
        ``RedisKeys.sina_individual_crawled`` via ``zadd`` together with the
        first ten characters of ``pub_date`` with the dashes removed.

        ``spider`` is not used. Returns the same ``item`` object.
        """
        # Bookkeeping fields written onto the item before serialization.
        item["source"] = SpiderSourceName.sina
        item["type"] = SpiderSourceCode.individual_stock
        item["id"] = shortuuid.uuid()
        item["scope"] = u"个股"

        news_schema = AvroUtils.getNewsSchema()
        payload = AvroUtils.createAvroMemoryRecord(item, news_schema)
        kafka_producer.send(kafkaTopic.news, value=payload)
        # NOTE(review): presumably time.sleep -- pauses one second per item.
        sleep(1)

        log_line = "send data to kafka, from " + item["source"]
        log_line = log_line + " , url: " + item["url"]
        self.logger.info(log_line)

        redis_conn = get_redis_conn()
        crawled_url = item["url"]
        # First ten characters of pub_date with the dashes dropped.
        compact_date = item['pub_date'][:10].replace("-", "")
        redis_conn.zadd(RedisKeys.sina_individual_crawled, crawled_url,
                        compact_date)
        return item
Exemplo n.º 6
0
    def parse(self, response):
        """Read the xueqiu access token and yield one comment request per relation.

        The token is extracted from the page's ``<script>`` content and
        stored in ``self.headers["cookie"]``. Every member of the
        ``RedisKeys.xueqiu_comment_relation`` sorted set is then split on
        ``"&&"`` and turned into a ``scrapy.Request`` for page 1 of that
        article's comment list, handled by ``self.parse_comment``.

        Yields:
            scrapy.Request: carries ``article_hive_id``, ``article_id`` and
            ``user_id`` in ``meta``.

        When no token is found an error is logged and nothing is yielded.
        Members with fewer than three ``"&&"`` fields are logged and skipped.
        """
        # Raw string: "\|" is not a valid escape in a normal string literal.
        access_token_list = response.xpath('//script').re(
            r'SNB.data.access_token.*\|\| "(.*)";')
        if not access_token_list:
            self.logger.error("get access_token error")
            return

        self.headers["cookie"] = "xq_a_token=" + access_token_list[0]

        self.redis_conn = get_redis_conn()
        # Reads the whole sorted set; use zrangebyscore to read only a part.
        xueqiu_comment_relation = self.redis_conn.zrange(
            RedisKeys.xueqiu_comment_relation,
            start=0,
            end=-1,
            desc=False,
            withscores=True)
        for relation in xueqiu_comment_relation:
            # relation is (member, score); only the member is used.
            # NOTE(review): an earlier comment described the member layout as
            # "id&user_id&uuid", but the code reads user_id first and splits
            # on "&&" -- confirm against the code that writes this set.
            result = relation[0].split("&&")
            self.logger.info("article corelation:" + str(result))
            if len(result) < 3:
                # One bad member must not abort the remaining requests.
                self.logger.warning(
                    "skip malformed comment relation: %s", result)
                continue
            user_id = result[0]
            article_id = result[1]
            # shortuuid, used to map comments back to their article
            article_hive_id = result[2]
            comment_url = ("https://xueqiu.com/service/comment/list?id=" +
                           article_id + "&user_id=" + user_id +
                           "&type=status&sort=false&page=1")
            yield scrapy.Request(comment_url,
                                 headers=self.headers,
                                 callback=self.parse_comment,
                                 meta={
                                     "article_hive_id": article_hive_id,
                                     "article_id": article_id,
                                     "user_id": user_id,
                                 })
Exemplo n.º 7
0
 def process_item(self, item, spider):
     """Stamp a Dongfang news item, send it to Kafka, mark its URL in redis.

     The item receives ``source``, ``type``, ``id`` and ``scope`` fields and
     empty ``code``/``name`` strings, is serialized with the news Avro
     schema and sent to ``kafkaTopic.news``. Its URL is then added to
     ``RedisKeys.dongfang_url_crawl`` via ``zadd`` together with the first
     ten characters of ``pub_date`` with the dashes removed.

     ``spider`` is not used. Returns the same ``item`` object.
     """
     # Bookkeeping fields written onto the item before serialization.
     item["source"] = SpiderSourceName.dongfang
     item["type"] = SpiderSourceCode.dongfang
     item["id"] = shortuuid.uuid()
     item["scope"] = u"新闻"
     item['code'] = ""
     item['name'] = ""

     news_schema = AvroUtils.getNewsSchema()
     payload = AvroUtils.createAvroMemoryRecord(item, news_schema)
     self.logger.info("AvroUtils successful!")
     kafka_producer.send(kafkaTopic.news, value=payload)
     # NOTE(review): presumably time.sleep -- pauses one second per item.
     sleep(1)

     log_line = "send data to kafka, from " + item["source"]
     log_line = log_line + " , url: " + item["url"]
     self.logger.info(log_line)

     redis_conn = get_redis_conn()
     crawled_url = item["url"]
     # First ten characters of pub_date with the dashes dropped.
     compact_date = item['pub_date'][:10].replace("-", "")
     redis_conn.zadd(RedisKeys.dongfang_url_crawl, crawled_url,
                     compact_date)
     return item
Exemplo n.º 8
0
 def __init__(self, *a, **kw):
     """Validate the optional ``endDate`` argument and fetch a redis connection.

     ``endDate`` (keyword, optional) is stored as ``self.endDate`` when
     ``TimeUtils.isValidEndDate`` accepts it. Positional arguments and all
     other keywords are ignored here.

     Raises:
         CloseSpider: ``endDate`` was given but rejected by the validator.
     """
     # NOTE(review): no base-class __init__ is called here -- confirm that
     # is intentional.
     # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
     if "endDate" in kw:
         end_date = kw["endDate"]
         if not TimeUtils.isValidEndDate(end_date):
             self.logger.error(end_date +
                               ': error format, must be like 2016-05-15')
             raise CloseSpider(end_date + ' error format')
         self.endDate = end_date

     self.redis_conn = get_redis_conn()
Exemplo n.º 9
0
    def __init__(self, *a, **kw):
        """Validate the optional ``endDate`` argument and fetch a redis connection.

        ``endDate`` (keyword, optional) is stored as ``self.endDate`` when
        ``TimeUtils.isValidEndDate`` accepts it. Positional arguments and
        all other keywords are ignored here.

        Raises:
            CloseSpider: ``endDate`` was given but rejected by the validator.
        """
        # NOTE(review): no base-class __init__ is called here -- confirm
        # that is intentional.
        # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
        if "endDate" in kw:
            end_date = kw["endDate"]
            if not TimeUtils.isValidEndDate(end_date):
                self.logger.error(end_date +
                                  ': error format, must be like 2016-05-15')
                raise CloseSpider(end_date + ' error format')
            self.endDate = end_date

        self.redis_conn = get_redis_conn()
Exemplo n.º 10
0
 def __init__(self, *a, **kw):
     """Validate ``endDate``, fetch a redis connection and seed ``start_urls``.

     ``endDate`` (keyword, optional) is stored as ``self.endDate`` when
     ``TimeUtils.isValidEndDate`` accepts it. One Sina news-list URL
     (page 1, ``sh`` prefix) is appended to ``self.start_urls`` for every
     member of the ``RedisKeys.SHAStockCode`` redis set.

     Raises:
         CloseSpider: ``endDate`` was given but rejected by the validator.
     """
     # NOTE(review): no base-class __init__ is called here -- confirm that
     # is intentional.
     # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
     if "endDate" in kw:
         end_date = kw["endDate"]
         if not TimeUtils.isValidEndDate(end_date):
             self.logger.error(end_date +
                               ': error format, must be like 2016-05-15')
             raise CloseSpider(end_date + ' error format')
         self.endDate = end_date

     self.redis_conn = get_redis_conn()
     # Disabled earlier variant with fixed start URLs, kept for reference:
     #if not self.redis_conn.exists('sina_individual_stock:requests'):
     #    print "set start urls"
     #    self.start_urls = [
     #         "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600000.phtml",
     #     "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600004.phtml"
     #     ]
     sha_stock_codes = self.redis_conn.smembers(RedisKeys.SHAStockCode)
     for code in sha_stock_codes:
         url = ('http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1' % code)
         self.start_urls.append(url)
Exemplo n.º 11
0
    def __init__(self, *a, **kw):
        """Validate ``endDate``, fetch a redis connection and seed ``start_urls``.

        ``endDate`` (keyword, optional) is stored as ``self.endDate`` when
        ``TimeUtils.isValidEndDate`` accepts it. One Sina news-list URL
        (page 1, ``sh`` prefix) is appended to ``self.start_urls`` for every
        member of the ``RedisKeys.SHAStockCode`` redis set.

        Raises:
            CloseSpider: ``endDate`` was given but rejected by the validator.
        """
        # NOTE(review): no base-class __init__ is called here -- confirm
        # that is intentional.
        # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
        if "endDate" in kw:
            end_date = kw["endDate"]
            if not TimeUtils.isValidEndDate(end_date):
                self.logger.error(end_date +
                                  ': error format, must be like 2016-05-15')
                raise CloseSpider(end_date + ' error format')
            self.endDate = end_date

        self.redis_conn = get_redis_conn()
        # Disabled earlier variant with fixed start URLs, kept for reference:
        #if not self.redis_conn.exists('sina_individual_stock:requests'):
        #    print "set start urls"
        #    self.start_urls = [
        #         "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600000.phtml",
        #     "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600004.phtml"
        #     ]
        sha_stock_codes = self.redis_conn.smembers(RedisKeys.SHAStockCode)
        for code in sha_stock_codes:
            url = (
                'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1'
                % code)
            self.start_urls.append(url)
Exemplo n.º 12
0
 def parse(self, response):
     """Read the xueqiu access token and yield one comment request per relation.

     The token is extracted from the page's ``<script>`` content and stored
     in ``self.headers["cookie"]``. Every member of the
     ``RedisKeys.xueqiu_comment_relation`` sorted set is then split on
     ``"&&"`` and turned into a ``scrapy.Request`` for page 1 of that
     article's comment list, handled by ``self.parse_comment``.

     Yields:
         scrapy.Request: carries ``article_hive_id``, ``article_id`` and
         ``user_id`` in ``meta``.

     When no token is found an error is logged and nothing is yielded.
     Members with fewer than three ``"&&"`` fields are logged and skipped.
     """
     # Raw string: "\|" is not a valid escape in a normal string literal.
     access_token_list = response.xpath('//script').re(
         r'SNB.data.access_token.*\|\| "(.*)";')
     if not access_token_list:
         self.logger.error("get access_token error")
         return

     self.headers["cookie"] = "xq_a_token=" + access_token_list[0]

     self.redis_conn = get_redis_conn()
     # Reads the whole sorted set; use zrangebyscore to read only a part.
     xueqiu_comment_relation = self.redis_conn.zrange(
         RedisKeys.xueqiu_comment_relation,
         start=0,
         end=-1,
         desc=False,
         withscores=True)
     for relation in xueqiu_comment_relation:
         # relation is (member, score); only the member is used.
         # NOTE(review): an earlier comment described the member layout as
         # "id&user_id&uuid", but the code reads user_id first and splits
         # on "&&" -- confirm against the code that writes this set.
         result = relation[0].split("&&")
         self.logger.info("article corelation:" + str(result))
         if len(result) < 3:
             # One bad member must not abort the remaining requests.
             self.logger.warning(
                 "skip malformed comment relation: %s", result)
             continue
         user_id = result[0]
         article_id = result[1]
         # shortuuid, used to map comments back to their article
         article_hive_id = result[2]
         comment_url = ("https://xueqiu.com/service/comment/list?id=" +
                        article_id + "&user_id=" + user_id +
                        "&type=status&sort=false&page=1")
         yield scrapy.Request(comment_url,
                              headers=self.headers,
                              callback=self.parse_comment,
                              meta={
                                  "article_hive_id": article_hive_id,
                                  "article_id": article_id,
                                  "user_id": user_id,
                              })
Exemplo n.º 13
0
 def __init__(self):
     """Fetch a redis connection via ``get_redis_conn`` and keep it on the instance."""
     connection = get_redis_conn()
     self.redis_conn = connection
Exemplo n.º 14
0
 def __init__(self):
     """Fetch a redis connection via ``get_redis_conn`` and keep it on the instance."""
     connection = get_redis_conn()
     self.redis_conn = connection
#-*- coding: UTF-8 -*-

import sys
import os                                                                                                                               
from os.path import dirname
path = dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.append(path)
#print path


from scrapy_redis.connection import get_redis_conn

conn = get_redis_conn()

# Load one stock code per line into the "SHAStockCode" redis set.
# The with-block closes the file even if a redis call fails.
with open(os.path.join(path, 'tools', 'SHACode.txt')) as fp:
    for line in fp:
        text = line.strip()
        # Skip blank lines so an empty string is never stored as a code.
        if text:
            conn.sadd("SHAStockCode", text)

# Single-argument print(...) prints the same on Python 2 and Python 3.
print("add success")
#print text
print('contents of SHAStockCode')
codes = conn.smembers('SHAStockCode')
for code in codes:
    print(code)