Exemplo n.º 1
0
 def set_ua_from_fua():
     """
     Read user-agent values from fake_useragent and hand them to Redis.

     The chrome/ie/opera/firefox/safari attributes of a fresh
     fake_useragent.UserAgent are collected into a dict keyed by browser
     name, which is passed to RedisClient.put under the name 'useragents'.
     """
     browsers = ('chrome', 'ie', 'opera', 'firefox', 'safari')
     generator = fake_useragent.UserAgent()
     # Attribute access happens in the listed order, one read per browser.
     agents = {browser: getattr(generator, browser) for browser in browsers}
     RedisClient().put('useragents', agents)
Exemplo n.º 2
0
 def __init__(self):
     """
     Set up the default request headers and load the stored user agents.

     The user-agent mapping is read with RedisClient.get_all('useragents').
     While that read comes back empty, Headers.set_ua_from_fua() is called
     to download a fresh set and the read is repeated.
     """
     self.headers = {
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Connection': 'keep-alive',
     }
     store = RedisClient()
     while True:
         self.ua_dict = store.get_all('useragents')
         if self.ua_dict:
             break
         # Nothing stored yet: repopulate, then read again.
         Headers.set_ua_from_fua()
Exemplo n.º 3
0
 def __init__(self, *args, **kwargs):
     """
     Initialise the base spider with the given arguments, then keep the
     object returned by RedisClient().getInstance() as self.redis_client.
     """
     super(AnchorSpider, self).__init__(*args, **kwargs)
     print("__init__")
     client_wrapper = RedisClient()
     self.redis_client = client_wrapper.getInstance()
Exemplo n.º 4
0
class AnchorSpider(scrapy.Spider):
    """Crawl the Douyu anchor list API page by page.

    1. Every entry of a list page is turned into an anchor item. Rooms that
       already have an entry in Redis are emitted directly; for the others
       the room-info URL is requested and parsed by ``parse_anchor_info``.
    2. The URL of the next list page is built from the running offset and
       handed back to ``parse``.
    """

    name = "douyu_anchor"
    allowed_domains = []
    start_urls = [apiconstants.get_api_douyu_list_url(0)]

    # Number of list entries consumed so far; passed as the next page offset.
    offset = 0

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and keep the RedisClient instance."""
        super(AnchorSpider, self).__init__(*args, **kwargs)
        print("__init__")
        self.redis_client = RedisClient().getInstance()

    def parse(self, response):
        """Parse one list page.

        Yields an ``AncharItem`` for every room whose Redis key exists, a
        room-info ``Request`` for every other room, and finally a ``Request``
        for the next list page unless the current page was empty.
        """
        is_end = False
        anchor_uids = []
        result = json.loads(response.body) if response.body else None
        if result and int(result['error']) == 0:
            result_anchor_list = result['data']
            result_count = len(result_anchor_list)
            print("result_count : " + str(result_count))
            # An empty page marks the end of the listing.
            is_end = result_count == 0

            for anchor_item in result_anchor_list:
                anchor = AncharItem()
                anchor['room_id'] = anchor_item['room_id']
                anchor['room_href'] = anchor_item['url']
                anchor['room_name'] = anchor_item['room_name']
                anchor['room_status'] = anchor_item['show_status']
                anchor['room_thumb'] = anchor_item['room_src']
                anchor['nickname'] = anchor_item['nickname']
                anchor['avatar'] = anchor_item['avatar']
                anchor['sex'] = 0
                anchor['weight'] = 0  # owner_weight
                anchor['cate_id'] = anchor_item['cate_id']
                anchor['start_time'] = anchor_item['show_time']
                anchor['fans_num'] = anchor_item['fans']
                anchor['online_num'] = anchor_item['online']
                # Entries may carry a "jumpUrl" that redirects to an external
                # site (e.g. Penguin live); such entries get no special
                # treatment here.
                anchor_uids.append(anchor['room_id'])

                # NOTE(review): this key presumably has to match the key the
                # item pipeline writes for an anchor - confirm the "1"
                # platform segment against that code.
                anchor_redis_name = 'anchor:1' + ":" + str(anchor['room_id'])
                if self.redis_client.exists(anchor_redis_name):
                    # Already stored: skip the per-room request.
                    yield anchor
                else:
                    # Not stored yet: fetch the room's own data.
                    roominfo_url = apiconstants.get_douyu_roominfo_url(
                        anchor['room_id'])
                    yield Request(url=roominfo_url,
                                  callback=self.parse_anchor_info)

            self.offset = self.offset + result_count

        print(anchor_uids)

        # Queue the next list page unless the listing is exhausted.
        if is_end:
            print("爬取结束")
        else:
            url = apiconstants.get_api_douyu_list_url(self.offset)
            yield Request(url=url, callback=self.parse)

    def parse_anchor_info(self, response):
        """Parse a room-info response and yield one ``AncharItem``.

        Nothing is yielded when the body is empty or the payload's ``error``
        field is non-zero.
        """
        if not response.body:
            return
        result = json.loads(response.body)
        if not result or int(result['error']) != 0:
            return

        result_anchor_info = result['data']
        anchor_info = AncharItem()
        anchor_info['room_id'] = result_anchor_info['room_id']
        # room_href is stored empty here rather than read from the payload.
        anchor_info['room_href'] = ""
        anchor_info['room_name'] = result_anchor_info['room_name']
        anchor_info['room_status'] = result_anchor_info['room_status']
        anchor_info['room_thumb'] = result_anchor_info['room_thumb']
        anchor_info['nickname'] = result_anchor_info['owner_name']
        anchor_info['avatar'] = result_anchor_info['avatar']
        anchor_info['sex'] = 0
        anchor_info['weight'] = 0  # owner_weight
        anchor_info['cate_id'] = result_anchor_info['cate_id']
        anchor_info['cate_name'] = result_anchor_info['cate_name']
        anchor_info['start_time'] = result_anchor_info['start_time']
        anchor_info['fans_num'] = result_anchor_info['fans_num']
        anchor_info['online_num'] = result_anchor_info['online']
        anchor_info['gift_list'] = result_anchor_info['gift']
        yield anchor_info
Exemplo n.º 5
0
 def __init__(self):
     """
     Announce construction on stdout and keep the object returned by
     RedisClient().getInstance() as self.redis_client.
     """
     print("__init__")
     client_wrapper = RedisClient()
     self.redis_client = client_wrapper.getInstance()
Exemplo n.º 6
0
 def __init__(self, dbpool):
     """
     Remember the database connection pool and create a RedisClient.

     dbpool: the pool object later used for database work; it is stored
     as-is on self.dbpool.
     """
     redis_wrapper = RedisClient()
     self.dbpool = dbpool
     self.redis_client = redis_wrapper
Exemplo n.º 7
0
class MysqlTwistedPipeline(object):
    """Item pipeline that writes anchor items to MySQL and Redis.

    MySQL inserts are dispatched through the ``adbapi`` connection pool given
    to the constructor; Redis writes are issued directly in ``process_item``.
    """

    def __init__(self, dbpool):
        """Keep the connection pool and create the RedisClient wrapper."""
        self.dbpool = dbpool
        self.redis_client = RedisClient()

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline around a MySQLdb ``adbapi.ConnectionPool``.

        ``settings`` is not read here; the connection parameters come from
        ``config.DB_config['mysql']`` and ``config.database``.
        """
        # Work on a copy so the shared config dict is not modified.
        dbparms = dict(config.DB_config.get("mysql"))
        dbparms['db'] = config.database
        dbparms['cursorclass'] = MySQLdb.cursors.DictCursor
        dbparms['use_unicode'] = True

        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        """Store one anchor item and return it for later pipeline stages.

        The MySQL insert is scheduled through the pool; failures are passed
        to ``handle_error``. Gifts (when the item has any) are written to one
        Redis hash each, the anchor fields to another hash, and the room id
        is added to the anchor id set.
        """
        # Schedule the MySQL insert through the pool; errors arrive later,
        # so they are routed to an errback instead of being caught here.
        query = self.dbpool.runInteraction(self.do_insert_anthor, item)
        query.addErrback(self.handle_error)

        platform = str(apiconstants.PLATFORM_DOUYU)
        anthor_id = int(item['room_id'])
        redis_conn = self.redis_client.getInstance()

        # Items built from the list page carry no gift list at all.
        gift_list = item.get('gift_list')
        if gift_list:
            for gift in gift_list:
                # Key is platform plus gift id; the id may not be a string.
                gift_redis_name = 'gift:' + platform + ":" + str(gift['id'])
                redis_conn.hmset(gift_redis_name, dict(gift))

        # The anchor hash holds every field except the gift list; the item
        # itself is left untouched for the stages that follow.
        anchor_fields = dict(item)
        anchor_fields.pop('gift_list', None)
        anchor_redis_name = 'anchor:' + platform + ":" + str(anthor_id)
        redis_conn.hmset(anchor_redis_name, anchor_fields)

        anchor_id_list_redis_name = 'anchor_id_list:' + platform
        redis_conn.sadd(anchor_id_list_redis_name, anthor_id)
        return item

    def do_insert_anthor(self, cursor, item):
        """Insert the anchor row unless one exists for platform 1 / room_id."""
        # Values are passed as query parameters, never formatted into the SQL.
        exist_sql = "select 1 from anthor where platform=%s and room_id=%s limit 1"
        cursor.execute(exist_sql, (1, item['room_id']))
        if cursor.fetchone() is None:
            print("不存在主播数据,入库")
            insert_sql = """
                insert into anthor(nickname,avatar,sex,weight,platform,room_id,room_href,room_name,room_thumb,cate_id,fans_num)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
            cursor.execute(insert_sql, (
                item['nickname'], item['avatar'], item['sex'], item['weight'], 1, item['room_id'], item['room_href'],
                item['room_name'], item['room_thumb'], item['cate_id'], item['fans_num']))
        else:
            print("存在主播数据")

    def do_insert_gift(self, cursor, item):
        """Insert the gift row unless one exists for platform 1 / gid."""
        # NOTE(review): the lookup uses platform 1 while the insert uses
        # item['platform'] - confirm which one is intended.
        exist_sql = "select 1 from gift where platform=%s and gid=%s limit 1"
        cursor.execute(exist_sql, (1, item['gid']))
        if cursor.fetchone() is None:
            print("不存在礼物数据,入库")
            # `desc` is a reserved word in MySQL and has to be quoted.
            insert_sql = """
                insert into gift(gid,name,`desc`,intro,platform,cost,contribution)
                VALUES (%s,%s,%s,%s,%s,%s,%s)
            """
            cursor.execute(insert_sql, (
                item['gid'], item['name'], item['desc'], item['intro'], item['platform'], item['cost'], item['contribution']))
        else:
            print("存在礼物数据")

    def handle_error(self, failure):
        """Print the failure of an asynchronous insert."""
        print(failure)