def run_forever():
    while True:
        #### update data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(
                tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # update each goods record in real time
                # Free memory: declared outside the loop the parser would stay alive and grow large, so create it per-iteration and drop it afterwards
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection timing out
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if index % 300 == 0:  # sleep 3 minutes after every 300 updates
                    sleep_time = 3 * 60
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=item[0])
                    # no longer used, because it interfered with goods in the normal state
                    try:  # special-case goods whose page no longer exists
                        if isinstance(tmp_tmp, str) and re.compile(
                                r'^ze').findall(tmp_tmp) != []:
                            print('@@ 该商品的页面已经不存在!此处将其删除!')
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0], ))
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                    except Exception:
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]

                        if item[1] == 1:
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0], ))
                            print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
                        else:
                            print(
                                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                % (item[0], index))
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                    else:  # returned data was empty
                        pass

                else:  # database connection failed
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
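
For reference, the loop above touches only a small surface of `SqlServerMyPageInfoSaveItemPipeline`: the `is_connect_success` flag plus `_select_table` / `_delete_table`. A minimal sketch of that assumed interface follows; the method names come from the call sites above, while the connection details and the convention that a failed `_select_table` returns `None` (which is what makes `list(...)` above raise `TypeError`) are assumptions:

import pymssql  # assumption: the pipeline wraps a SQL Server connection


class SqlServerMyPageInfoSaveItemPipeline:
    """Sketch of the pipeline interface used by run_forever() above."""
    def __init__(self):
        try:
            self.conn = pymssql.connect(
                server='...', user='...', password='...', database='...')
            self.is_connect_success = True
        except Exception:
            self.is_connect_success = False

    def _select_table(self, sql_str, params=None):
        try:
            cur = self.conn.cursor()
            cur.execute(sql_str, params)
            return cur.fetchall()
        except Exception:
            return None  # the caller's list(None) then raises TypeError

    def _delete_table(self, sql_str, params=None):
        cur = self.conn.cursor()
        cur.execute(sql_str, params)
        self.conn.commit()
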
Example no. 2
class Z8Updater(AsyncCrawler):
    """折800常规商品实时更新"""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(self,
                              *params,
                              **kwargs,
                              log_print=True,
                              log_save_path=MY_SPIDER_LOGS_PATH +
                              '/折800/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # concurrency level
        self.concurrency = 10

    async def _get_db_old_data(self):
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=z8_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_ali_obj(self, index) -> None:
        # recreate the parser every 10 tasks to free memory
        if index % 10 == 0:
            try:
                del self.zhe_800
            except AttributeError:
                pass
            collect()
            self.zhe_800 = Zhe800Parse()

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single goods record.
        :param db_goods_info_obj:
        :param index:
        :return: [goods_id, bool success flag]
        '''
        res = False
        await self._get_new_ali_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            self.zhe_800.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            data = self.zhe_800.deal_with_data()
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='z8',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = self.zhe_800.to_right_and_update_data(
                    data=data, pipeline=self.sql_cli)

            else:  # returned data was empty
                pass
        else:  # database connection failed
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(2.)

        return [db_goods_info_obj.goods_id, res]
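
`_get_new_db_conn` is imported from a shared helper module. Judging from this call site it takes the current pipeline, the task index and a logger, and periodically swaps in a fresh connection, mirroring the `index % 50 == 0` reconnect in the synchronous version above. A sketch under that assumption (the `remainder` default is invented for illustration):

async def _get_new_db_conn(db_obj, index, logger, remainder=50):
    # assumption: every `remainder` tasks, drop the old pipeline and reconnect
    if index % remainder == 0:
        logger.info('resetting db connection...')
        db_obj = SqlServerMyPageInfoSaveItemPipeline()
        logger.info('new db connection established.')
    return db_obj
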

    async def _update_db(self):
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.zhe_800 = Zhe800Parse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all batches consumed, exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = Z8DbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                    try:
                        del tasks
                    except:
                        pass

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.zhe_800
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
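
`TasksParamsListObj` is likewise imported from elsewhere. The way `_update_db` drives it, calling `__next__()` until an `AssertionError` signals normal exhaustion, implies a batching iterator roughly like this sketch (the body is an assumption inferred from usage):

class TasksParamsListObj:
    """Sketch: yields `step`-sized slices of the params list and raises
    AssertionError once everything is consumed, which the caller above
    treats as a clean exit."""
    def __init__(self, tasks_params_list, step):
        self.tasks_params_list = tasks_params_list
        self.step = step
        self._pos = 0

    def __next__(self):
        assert self._pos < len(self.tasks_params_list), 'all params consumed'
        res = self.tasks_params_list[self._pos:self._pos + self.step]
        self._pos += self.step
        return res
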
Example no. 3
def run_forever():
    while True:
        #### update data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=vip_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # update each goods record in real time
            # Free memory: declared outside the loop the parser would stay alive and grow large, so create it per-iteration and drop it afterwards
            vip = VipParse()
            if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection timing out
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                print(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
                    (item[0], index))
                vip.get_goods_data(goods_id=[0, item[0]])
                data = vip.deal_with_data()
                if data != {}:
                    data['goods_id'] = item[0]

                    data['shelf_time'], data[
                        'delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                    data['_is_price_change'], data[
                        '_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])

                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=json_2_dict(item[6]), site_id=25)
                    except AttributeError:  # already formatted
                        old_sku_info = item[6]
                    data['_is_price_change'], data[
                        'sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(
                                data['price_info_list'], site_id=25),
                            is_price_change=item[7]
                            if item[7] is not None else 0)

                    vip.to_right_and_update_data(data=data,
                                                 pipeline=tmp_sql_server)
                else:  # returned data was empty
                    pass
            else:  # database connection failed
                print('数据库连接失败,数据库可能关闭或者维护中')
            index += 1
            gc.collect()
            sleep(VIP_SLEEP_TIME)
        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        gc.collect()
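
`_get_price_change_info` is shared between this example and the next one. From its keyword arguments and the `(_is_price_change, _price_change_info)` pair it returns, a plausible sketch is the following; the exact shape of the change-info payload is an assumption:

def _get_price_change_info(old_price, old_taobao_price,
                           new_price, new_taobao_price) -> tuple:
    # flag any difference in either price and keep the old/new pair for auditing
    changed = (str(old_price) != str(new_price)
               or str(old_taobao_price) != str(new_taobao_price))
    change_info = {
        'old_price': old_price,
        'new_price': new_price,
        'old_taobao_price': old_taobao_price,
        'new_taobao_price': new_taobao_price,
    } if changed else {}
    return int(changed), change_info
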
Example no. 4
def run_forever():
    while True:
        # ** must be created inside the loop rather than as a global, otherwise every run keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### update data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Free memory: create the parser here and recreate it periodically instead of keeping one long-lived instance
            kaola = KaoLaParse(logger=my_lg)
            for item in result:  # update each goods record in real time
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # reconnect every 10 iterations to avoid a stale long-lived connection timing out
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    data = kaola._get_goods_data(goods_id=item[1])

                    if data.get('is_delete') == 1:  # handle delisted goods separately
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        kaola.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        if data.get('is_delete') == 1:
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=kl_update_str_2, params=(item[1],), logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            index += 1
                            gc.collect()
                            continue

                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )

                        try:
                            old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=29)
                        except AttributeError:  # already formatted
                            old_sku_info = item[7]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=29),
                            is_price_change=item[8] if item[8] is not None else 0
                        )

                        kaola.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # returned data was empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # database connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
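
`get_shelf_time_and_delete_time` appears in both the VIP and Kaola updaters with the same four arguments. Assuming it only stamps a new timestamp when the `is_delete` flag flips state (everything below is that assumption, not the project's actual implementation):

def get_shelf_time_and_delete_time(tmp_data, is_delete, shelf_time, delete_time):
    # assumption: keep the stored timestamps unless the delete flag changed state
    now = str(get_shanghai_time())
    new_is_delete = tmp_data.get('is_delete', 0)
    if new_is_delete == 1 and is_delete != 1:
        delete_time = now    # the goods just went off the shelf
    elif new_is_delete == 0 and is_delete == 1:
        shelf_time = now     # the goods came back on the shelf
    return shelf_time or now, delete_time
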
Example no. 5
    def deal_with_data(self, *params):
        '''
        Process and store the related flash-sale goods data.
        :param params: related params
        :return:
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, gender, page, goods_url from dbo.chuchujie_xianshimiaosha where site_id=24'
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass

                else:
                    goods_id = item.get('goods_id', '')
                    tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
                    chuchujie.get_goods_data(goods_id=goods_id)
                    goods_data = chuchujie.deal_with_data()

                    if goods_data == {}:  # skip when the returned data is empty
                        pass

                    elif goods_data.get('is_delete', 0) == 1:   # skip when is_delete=1 (stock is 0)
                        print('------>>>| 该商品库存为0,已被抢光!')
                        pass

                    else:   # otherwise parse and insert
                        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

                        # fetch the remaining countdown time
                        tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url,
                            css_selector='p#activityTime span'
                        )
                        # print(tmp_body)

                        try:
                            del my_phantomjs
                        except:
                            pass
                        gc.collect()

                        if tmp_body == '':  # failed to fetch the full mobile-page html
                            sleep(.4)
                            pass

                        else:
                            # p#activityTime span
                            _t = Selector(text=tmp_body).css('p#activityTime span::text').extract_first()
                            if _t is None or _t == '':
                                print('获取到的_t为空值, 严重错误! 请检查!')
                                continue
                            _t = re.compile(r'剩余').sub('', _t)
                            # print(_t)

                            miaosha_end_time = self.get_miaosha_end_time(_t)

                            goods_data['goods_url'] = tmp_url
                            goods_data['goods_id'] = str(goods_id)
                            goods_data['sub_title'] = item.get('sub_title', '')
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': timestamp_to_regulartime(int(time.time())),
                                'miaosha_end_time': timestamp_to_regulartime(int(miaosha_end_time)),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                            goods_data['gender'] = str(item.get('gender', '0'))
                            goods_data['page'] = item.get('page')

                            # pprint(goods_data)
                            # print(goods_data)
                            chuchujie.insert_into_chuchujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            # sleep(CHUCHUJIE_SLEEP_TIME)  # slow down -- skipped here, since the phantomjs init is slow enough already


        else:
            print('数据库连接失败,此处跳过!')

        try:
            del chuchujie
        except:
            pass
        gc.collect()
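
`self.get_miaosha_end_time(_t)` turns the countdown string scraped above (e.g. '1天2小时3分' once the '剩余' prefix is stripped) into an absolute end timestamp. A standalone sketch of that conversion; the real method lives on the class, and the exact countdown format is an assumption:

import re
import time


def get_miaosha_end_time(remaining: str) -> int:
    # '1天2小时3分' -> now + 1 day, 2 hours, 3 minutes (seconds since the epoch)
    m = re.match(r'(?:(\d+)天)?(?:(\d+)小时)?(?:(\d+)分)?', remaining)
    days, hours, minutes = (int(g) if g else 0 for g in m.groups())
    return int(time.time()) + days * 86400 + hours * 3600 + minutes * 60
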
Example no. 6
    def deal_with_data(self, *params):
        '''
        Process and store the related group-buy goods data.
        :param params: params to pass in
        :return:
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline._select_table(sql_str=mg_select_str_1))
            ]
            print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(
                        goods_id)

                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()

                    if goods_data == {}:  # skip when the returned data is empty
                        pass

                    else:  # otherwise parse and insert
                        # normalize
                        goods_data[
                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                goods_data['price_info_list'])
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get(
                            'pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get(
                            'all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))

                        # pprint(goods_data)
                        # print(goods_data)
                        _r = mogujie.insert_into_mogujie_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # insert succeeded; remember the id
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))

                        sleep(MOGUJIE_SLEEP_TIME)  # slow down

        else:
            print('数据库连接失败,此处跳过!')

        try:
            del mogujie
        except:
            pass
        gc.collect()
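
`get_miaosha_begin_time_and_miaosha_end_time` is only ever fed a begin/end dict like the `miaosha_time` one built in the previous example, so it presumably just unpacks the two fields. A sketch assuming that dict shape:

def get_miaosha_begin_time_and_miaosha_end_time(miaosha_time: dict) -> tuple:
    # assumption: the dict carries exactly the two keys built in the examples above
    return (miaosha_time.get('miaosha_begin_time', ''),
            miaosha_time.get('miaosha_end_time', ''))
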
Example no. 7
def run_forever():
    while True:
        #### update data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Free memory: create the parser once here and recreate it periodically instead of keeping a single long-lived instance
            juanpi_pintuan = JuanPiParse()
            for item in result:  # update each goods record in real time
                data = {}
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:    # reconnect every 50 iterations to avoid a stale long-lived connection timing out
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)

                    if item[2] == 1 or pintuan_end_time < int(time.time()):
                        sql_str = r'delete from dbo.juanpi_pintuan where goods_id=%s'
                        tmp_sql_server._delete_table(sql_str=sql_str, params=(item[0],))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()

                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=tmp_sql_server)
                        else:  # returned data was empty
                            pass
                else:  # database connection failed
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
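
The `time.mktime(time.strptime(...))` plus string-slicing combination near the top of the loop can be written more directly with `datetime`; an equivalent sketch:

from datetime import datetime


def pintuan_end_timestamp(end_time_str: str) -> int:
    # '2019-07-20 09:39:00' -> seconds since the epoch (local time), equivalent
    # to the int(str(time.mktime(time.strptime(...)))[0:10]) used above
    return int(datetime.strptime(end_time_str, '%Y-%m-%d %H:%M:%S').timestamp())
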
Example no. 8
class ZWMSpider(AsyncCrawler):
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/',
        )
        self.init_zwm_pwd()
        self.concurrency = 10
        self.num_retries = 6
        self.max_transaction_details_page_num = 20  # last transaction-details page to crawl
        self.max_business_settlement_records_page_num = 20  # last merchant-settlement-records page to crawl
        self.max_business_manage_page_num = 80  # last merchant/store-management page to crawl (raise it if the data ever outgrows this)
        self.login_cookies_dict = {}
        self.sleep_time = 5

    def init_zwm_pwd(self):
        ori_data = ''
        with open(ZWM_PWD_PATH, 'r') as f:
            for line in f:
                ori_data += line.replace('\n', '').replace('  ', '')
        data = json_2_dict(
            json_str=ori_data,
            logger=self.lg,
            default_res={},
        )
        self.zwm_username, self.zwm_pwd = data['username'], data['pwd']
        assert self.zwm_username != '' and self.zwm_pwd != ''

    async def _fck_run(self) -> None:
        while True:
            try:
                login_res = await self._login()
                assert login_res is True, '登录失败, 退出后续同步操作!'

                # fetch all transaction details (we have our own interface for this now, no longer needed)
                # all_transaction_details = await self._get_all_transaction_details()
                # pprint(all_transaction_details)
                # self.lg.info('len_all_transaction_details: {}'.format(len(all_transaction_details)))
                # await self._wash_and_save_all_transaction_details(target_list=all_transaction_details)

                # fetch all merchant settlement records
                self.lg.info('获取所有商户结算记录...')
                all_business_settlement_records = await self._get_all_business_settlement_records_by_something(
                )
                # pprint(all_business_settlement_records)
                self.lg.info('len_now_business_settlement_records: {}'.format(
                    len(all_business_settlement_records)))
                await self._wash_save_all_business_settlement_records(
                    target_list=all_business_settlement_records)
                self.lg.info('\n')

                # fetch all merchant & store management records
                self.lg.info('获取所有商户及门店管理记录 ...')
                all_business_manage_records = await self._get_all_business_manage_records_by_something(
                )
                # pprint(all_business_manage_records)
                self.lg.info('len_all_business_manage_records: {}'.format(
                    len(all_business_manage_records)))
                await self._wash_save_all_business_manage_records(
                    target_list=all_business_manage_records)
                self.lg.info('\n')

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('## 同步完成 ##')
            self.lg.info('休眠 {} minutes ...'.format(self.sleep_time))

            # run on a timer
            await async_sleep(60 * self.sleep_time)

    async def _login(self) -> bool:
        """
        Log in and cache the session cookies.
        :return:
        """
        headers = await self._get_random_pc_headers()
        headers.update({
            'Referer':
            'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp',
        })
        file_load = {
            'loginName': self.zwm_username,
            'userPassword': self.zwm_pwd,
        }
        m = MultipartEncoder(fields=file_load)
        # self.lg.info(m)
        headers.update({'Content-Type': m.content_type})
        login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do'

        with session() as _session:
            try:
                response = _session.post(
                    url=login_url,
                    headers=headers,
                    data=m,
                    proxies=self._get_proxies(),
                )
                login_res = json_2_dict(
                    json_str=response.text,
                    default_res={},
                    logger=self.lg,
                ).get('message', '')
                assert login_res == '登录成功', '登录失败!'
                self.lg.info(login_res)
                self.login_cookies_dict = response.cookies.get_dict()
                assert self.login_cookies_dict != {}, 'self.login_cookies_dict != 空dict!'
                # pprint(self.login_cookies_dict)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                return False

        return True
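
For reference, `_fck_run` is the long-running entry point of this spider. Assuming the `AsyncCrawler` base exposes the `self.loop` event loop used elsewhere in this class, a typical launcher would be:

zwm = ZWMSpider()
try:
    zwm.loop.run_until_complete(zwm._fck_run())
except KeyboardInterrupt:
    pass
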

    async def _wash_save_all_business_manage_records(self, target_list: list):
        """
        Wash and store all unsaved business manage records, and update the ones already stored.
        :param target_list:
        :return:
        """
        all_res = []
        for item in target_list:
            try:
                now_time = get_shanghai_time()
                create_time, modify_time, approval_status_change_time = now_time, now_time, now_time
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                shop_type = item['merType']

                is_high_quality_shop = item['isHighQualityMer']
                if is_high_quality_shop == '否':
                    is_high_quality_shop = 0
                elif is_high_quality_shop == '是':
                    is_high_quality_shop = 1
                else:
                    raise ValueError(
                        'is_high_quality_shop value: {} 异常!'.format(
                            is_high_quality_shop))

                shop_id = item.get('jhmid', '')
                assert shop_id != ''
                shop_chat_name = item.get('merchantName', '')
                assert shop_chat_name != ''
                phone_num = item.get('phone', '')
                assert phone_num != ''
                shop_chant_num = int(item['merchantNum'])
                sale = item['sale']
                is_real_time = 0 if item['isRealTime'] == '未开通' else 1
                approve_date = date_parse(item['approveDate'])
                rate = Decimal(item['rate']).__round__(4)
                account_type = item['accType']
                apply_time = date_parse(item['applyTime'])
                # may be empty
                process_context = item.get('processContext', '')
                is_non_contact = 0 if item['isNonContact'] == '未开通' else 1

                approval_status = item['approvalStatus']
                if approval_status == '待审核':
                    approval_status = 1
                elif approval_status == '审核通过':
                    approval_status = 0
                elif approval_status == '退回':
                    approval_status = 2
                else:
                    raise ValueError(
                        'approval_status value: {} 异常'.format(approval_status))

                # keep the original value: it is constant and unique
                unique_id = item['id']

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            zwm_item = ZWMBusinessManageRecordItem()
            zwm_item['unique_id'] = unique_id
            zwm_item['create_time'] = create_time
            zwm_item['modify_time'] = modify_time
            zwm_item['agent_name'] = agent_name
            zwm_item['top_agent_name'] = top_agent_name
            zwm_item['shop_type'] = shop_type
            zwm_item['is_high_quality_shop'] = is_high_quality_shop
            zwm_item['shop_id'] = shop_id
            zwm_item['shop_chat_name'] = shop_chat_name
            zwm_item['phone_num'] = phone_num
            zwm_item['shop_chant_num'] = shop_chant_num
            zwm_item['sale'] = sale
            zwm_item['is_real_time'] = is_real_time
            zwm_item['approve_date'] = approve_date
            zwm_item['rate'] = rate
            zwm_item['account_type'] = account_type
            zwm_item['apply_time'] = apply_time
            zwm_item['process_context'] = process_context
            zwm_item['is_non_contact'] = is_non_contact
            zwm_item['approval_status'] = approval_status
            zwm_item['approval_status_change_time'] = approval_status_change_time
            all_res.append(dict(zwm_item))

            # spot checks
            # if shop_id == 'YRMPAY100038574':
            # if phone_num == '18192242001':
            # if shop_chat_name == '哇哇叫':
            #     pprint(dict(zwm_item))

        # pprint(all_res)
        await self._insert_or_update_shop_manage_records_table(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _insert_or_update_shop_manage_records_table(self, all_res: list):
        """
        Insert new rows or update existing ones.
        :param all_res:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_2,
                params=None,
                logger=self.lg,
            )
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(
                len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            if unique_id not in db_unique_id_list:
                # insert
                self.lg.info('inserting unique_id: {} ...'.format(unique_id))
                params = await self._get_insert_item_params2(item=item)
                try:
                    res = self.sql_cli._insert_into_table_2(
                        sql_str=zwm_insert_str_2,
                        params=params,
                        logger=self.lg)
                    if res:
                        new_add_count += 1
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
            else:
                db_old_approval_status, db_old_approval_status_change_time = \
                    await self._get_db_old_approval_status_and_approval_status_change_time(
                        db_data=db_data,
                        unique_id=unique_id,
                    )
                item['approval_status_change_time'] = \
                    await self._get_new_approval_status_change_time(
                        db_old_approval_status=db_old_approval_status,
                        db_old_approval_status_change_time=db_old_approval_status_change_time,
                        new_approval_status=item['approval_status'],
                        new_approval_status_change_time=item['approval_status_change_time'])
                # update
                self.lg.info('updating unique_id: {} ...'.format(unique_id))
                params = await self._get_update_item_params(item=item)
                try:
                    res = self.sql_cli._update_table_2(
                        sql_str=zwm_update_str_1,
                        params=params,
                        logger=self.lg)
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue

        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        try:
            del db_data
            del db_unique_id_list
        except:
            pass

        self.lg.info(
            'table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))

    async def _get_new_approval_status_change_time(
            self, db_old_approval_status, db_old_approval_status_change_time,
            new_approval_status, new_approval_status_change_time):
        """
        Work out the new approval_status_change_time: keep the stored time while
        the status is unchanged, otherwise stamp it with the current time.
        :return:
        """
        if db_old_approval_status_change_time is not None:
            new_approval_status_change_time = db_old_approval_status_change_time \
                if db_old_approval_status == new_approval_status \
                else get_shanghai_time()

        return new_approval_status_change_time

    async def _get_db_old_approval_status_and_approval_status_change_time(
            self, db_data: list, unique_id: str) -> tuple:
        """
        Fetch the previously stored approval_status and its change time.
        :param db_data:
        :param unique_id:
        :return:
        """
        for item in db_data:
            if unique_id == item[0]:
                return item[1], item[2]

    async def _get_all_business_manage_records_by_something(self, ):
        """
        Fetch all merchant & store management records.
        :return:
        """
        async def get_tasks_params_list(max_business_manage_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_manage_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_manage_page_num=self.max_business_manage_page_num
            ),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_manage_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
        )

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_manage_records_by_something(
        self,
        page_num: int,
        start_date: str = None,
        end_date: str = None,
    ):
        """
        Fetch one page of merchant & store management records.
        :param page_num:
        :param start_date:      e.g. '2019-01-27 00:00' (overridden below: crawling starts from 2018-01-01)
        :param end_date:        e.g. '2019-07-20 09:39'
        :return:
        """
        # todo crawl from the very beginning until now (i.e. everything), so that old shops
        #  whose approval status changed later still get their status and approval time synced
        # start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] + ' 00:00'
        start_date = '2018-01-01 00:00'
        end_date = (str(get_shanghai_time())
                    if end_date is None else end_date)[0:16]
        self.lg.info('start_date: {}, end_date: {}'.format(
            start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer':
            'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (('_dc', get_now_13_bit_timestamp()), )
        data = {
            'merchantCode': '',
            'accType': '',
            'phone': '',
            'approveDate': '',
            'merchantName': '',
            'processStatus': '',
            'startTime': start_date,
            'endTime': end_date,
            'agentName': '',
            'page': str(page_num),
            'start': str((page_num - 1) * 100),  # offset: 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,
        )
        assert body != '', 'body不为空值!'
        res = json_2_dict(json_str=body, logger=self.lg,
                          default_res={}).get('materialList', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _wash_save_all_business_settlement_records(self, target_list):
        """
        Wash and store all merchant settlement records that are not yet stored.
        :param target_list:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_1,
                params=None,
                logger=self.lg,
            )
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(
                len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        all_res = []
        for item in target_list:
            try:
                create_time = get_shanghai_time()
                shop_name = item.get('merName', '')
                assert shop_name != ''
                shop_id = item.get('mid', '')
                assert shop_id != ''
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                date_settle_type = item['settleType']
                trans_amount = item.get('transAmt', '')
                assert trans_amount != ''
                trans_amount = Decimal(trans_amount).__round__(2)
                service_charge = Decimal(item['mda']).__round__(2)
                accounting_amount = Decimal(item['mnamt']).__round__(2)
                trans_date = date_parse(item['txnDay'])
                trans_status = item['status']
                if trans_status == '已结算':
                    trans_status = 0
                else:
                    raise ValueError(
                        'trans_status: {}, 未知交易状态!'.format(trans_status))
                settle_type = item['type']
                settle_date = date_parse(item['minDay'])
                # build the deterministic unique id
                unique_id = get_uuid3(
                    target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                               str(service_charge) + str(trans_date) + \
                               str(settle_type) + str(settle_date),)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if unique_id in db_unique_id_list:
                # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id))
                continue

            settle_record_item = ZWMBusinessSettlementRecordItem()
            settle_record_item['unique_id'] = unique_id
            settle_record_item['create_time'] = create_time
            settle_record_item['shop_name'] = shop_name
            settle_record_item['shop_id'] = shop_id
            settle_record_item['agent_name'] = agent_name
            settle_record_item['top_agent_name'] = top_agent_name
            settle_record_item['date_settle_type'] = date_settle_type
            settle_record_item['trans_amount'] = trans_amount
            settle_record_item['service_charge'] = service_charge
            settle_record_item['accounting_amount'] = accounting_amount
            settle_record_item['trans_date'] = trans_date
            settle_record_item['trans_status'] = trans_status
            settle_record_item['settle_type'] = settle_type
            settle_record_item['settle_date'] = settle_date
            all_res.append(dict(settle_record_item))

        # pprint(all_res)
        self.lg.info('未存储个数: {}'.format(len(all_res)))
        await self._save_all_business_settlement_records(all_res=all_res)

        try:
            del all_res
        except:
            pass

        return None
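
`get_uuid3` gives each settlement row a deterministic id, so a re-crawled row collapses onto the same `unique_id` and is skipped by the dedup check above. A sketch assuming it wraps `uuid.uuid3` over a fixed namespace:

from uuid import NAMESPACE_DNS, uuid3


def get_uuid3(target_str: str) -> str:
    # same concatenated fields -> same id, which the dedup check relies on
    return str(uuid3(NAMESPACE_DNS, target_str))
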

    async def _save_all_business_settlement_records(self, all_res) -> None:
        """
        Store the newly added merchant settlement records.
        :param all_res:
        :return:
        """
        new_add_count = 0
        for item in all_res:
            # handle rows not yet stored
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1, params=params, logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None

    async def _get_insert_item_params(self, item) -> tuple:
        """
        Build the params tuple to insert.
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['shop_name'],
            item['shop_id'],
            item['agent_name'],
            item['top_agent_name'],
            item['date_settle_type'],
            item['trans_amount'],
            item['service_charge'],
            item['accounting_amount'],
            item['trans_date'],
            item['trans_status'],
            item['settle_type'],
            item['settle_date'],
        ])

    async def _get_insert_item_params2(self, item) -> tuple:
        """
        Params tuple to insert into the zwm_buss_manage_records table.
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
        ])

    async def _get_update_item_params(self, item: dict) -> tuple:
        """
        Params tuple for updating the zwm_buss_manage_records table.
        :param item:
        :return:
        """
        return tuple([
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
            item['unique_id'],
        ])

    async def _wash_and_save_all_transaction_details(self, target_list: list):
        """
        Wash and store all transaction details.
        :param target_list:
        :return:
        """
        pass

    async def _get_all_business_settlement_records_by_something(self):
        """
        Fetch all merchant settlement records.
        :return:
        """
        async def get_tasks_params_list(
                max_business_settlement_records_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_settlement_records_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await
            get_tasks_params_list(max_business_settlement_records_page_num=self
                                  .max_business_settlement_records_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.
            _get_one_page_business_settlement_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
        )

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_settlement_records_by_something(
            self,
            page_num: int,
            start_date: str = None,
            end_date: str = None,
            mid: str = '',
            agent_name: str = '') -> list:
        """
        Fetch one page of merchant settlement records.
        :param page_num:
        :param start_date:              defaults via get_1_on_the_month() to the 5th of the previous month, e.g. '2019-07-01'
        :param end_date:                e.g. '2019-07-16'
        :param mid:                     merchant id
        :param agent_name:              top-level agent name
        :return:
        """
        start_date = str(self.get_1_on_the_month(
        ) if start_date is None else start_date).split(' ')[0]
        # start_date = '2018-01-01'
        end_date = (str(get_shanghai_time())
                    if end_date is None else end_date).split(' ')[0]
        self.lg.info('start_date: {}, end_date: {}'.format(
            start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer':
            'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (('_dc', get_now_13_bit_timestamp()), )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'mid': mid,
            'agentName': agent_name,
            'loginAgentId': self.zwm_username[0:8],  # first 8 characters
            'page': str(page_num),
            'start': str((page_num - 1) * 100),  # offset: 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,
        )
        # self.lg.info(body)
        assert body != '', 'body不为空值!'
        res = json_2_dict(json_str=body, logger=self.lg,
                          default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _get_all_transaction_details(self) -> list:
        """
        Fetch all transaction flow records.
        :return:
        """
        async def _get_tasks_params_list() -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, self.max_transaction_details_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        tasks_params_list = await _get_tasks_params_list()
        tasks_params_list_obj = TasksParamsListObj(
            tasks_params_list=tasks_params_list,
            step=self.concurrency,
        )

        all_res = []
        while True:
            try:
                slice_params_list = tasks_params_list_obj.__next__()
            except AssertionError:
                break

            tasks = []
            for k in slice_params_list:
                page_num = k['page_num']
                self.lg.info(
                    'create task[where page_num: {}]...'.format(page_num))
                func_args = [
                    page_num,
                ]
                tasks.append(
                    self.loop.create_task(
                        unblock_func(
                            func_name=self.
                            _get_one_page_transaction_details_by_something,
                            func_args=func_args,
                            logger=self.lg,
                        )))

            one_res = await async_wait_tasks_finished(tasks=tasks)
            try:
                del tasks
            except:
                pass
            for i in one_res:
                for j in i:
                    all_res.append(j)

        return all_res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_transaction_details_by_something(
        self,
        page_num: int,
        start_date: str = None,
        end_date: str = None,
        transaction_status: str = '',
        mer_name: str = '',
        order_no: str = '',
        mid: str = '',
        agent_name: str = '',
        pay_channel: str = '',
        sale_name: str = '',
    ) -> list:
        """
        Fetch one page of transaction flow records.
        :param page_num:                page to fetch, e.g. 1, 2, 3
        :param start_date:              e.g. '2019-07-16 00:00'
        :param end_date:                e.g. '2019-07-16 10:02'
        :param transaction_status:      all: '' | paid: '1' | refunded: '3'
        :param mer_name:                merchant name to search for
        :param order_no:                order number
        :param mid:                     merchant id
        :param agent_name:              top-level agent name
        :param pay_channel:             all: '' | WeChat: '50' | Alipay: '51' | WeChat barcode: '55' | Alipay barcode: '56' | WeChat mini-program: '67'
        :param sale_name:               sales rep name
        :return:
        """
        res = []
        start_date = self.get_0_00_on_the_day(
        ) if start_date is None else start_date
        end_date = str(get_shanghai_time()) if end_date is None else end_date

        headers = self.get_random_pc_headers()
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer':
            'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (('_dc', get_now_13_bit_timestamp()), )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'type': '2',
            'status': transaction_status,
            'payChannel': pay_channel,
            'orderNo': order_no,
            'merName': mer_name,
            'mid': mid,
            'agentName': agent_name,
            'saleName': sale_name,
            'page': str(page_num),
            'start': str((page_num - 1) * 20),  # row offset: 0, 20, 40, ...
            'limit': '20',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,
        )
        assert body != '', 'body must not be empty!'
        res = json_2_dict(json_str=body, logger=self.lg,
                          default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    def get_0_00_on_the_day(self) -> str:
        """
        Get 00:00 (midnight) of the current day, Shanghai time.
        :return:
        """
        now_time = get_shanghai_time()

        return str(
            datetime(year=now_time.year,
                     month=now_time.month,
                     day=now_time.day))
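
    # Usage sketch: if get_shanghai_time() returns 2019-07-16 10:02:33, then
    # get_0_00_on_the_day() -> '2019-07-16 00:00:00' (str() of a datetime with
    # no time component yields exactly that format).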

    def get_1_on_the_month(self) -> str:
        """
        获取当月的第一天
        :return:
        """
        now_time = get_shanghai_time()
        # 避免月底流水无法获取
        day = 5

        now_month = now_time.month
        if now_month > 1:
            now_month -= 1
        else:
            # now_month为1月份
            now_month = 12

        return str(datetime(
            year=now_time.year,
            month=now_month,
            day=day,
        ))
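
    # Edge-case sketch for the January rollover above: with the year decrement,
    # January maps to December of the *previous* year rather than a future date.
    @staticmethod
    def _prev_month_day5_sketch(now) -> str:
        y, m = (now.year, now.month - 1) if now.month > 1 else (now.year - 1, 12)
        return str(datetime(y, m, 5))
    # _prev_month_day5_sketch(datetime(2019, 1, 10)) -> '2018-12-05 00:00:00'
    # _prev_month_day5_sketch(datetime(2019, 7, 16)) -> '2019-06-05 00:00:00'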

    def _get_proxies(self) -> dict:
        """
        Get a proxy dict.
        :return:
        """
        proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type, )
        assert proxies != {}, 'proxies must not be an empty dict!'

        return proxies

    async def _get_random_pc_headers(self) -> dict:
        """
        :return:
        """
        return self.get_random_pc_headers()

    @staticmethod
    def get_random_pc_headers() -> dict:
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',
        )
        headers.update({
            'Origin': 'https://agent.yrmpay.com',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarytSJCAoaErjNY4IbM',
            'accept': 'text/plain, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return headers

    def __del__(self):
        try:
            del self.lg
            del self.login_cookies_dict
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def run_forever():
    while True:
        # ** must not be a global declared outside the loop, otherwise logs keep going to the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet
        where SiteID=30 and GETDATE()-ModfiyTime>0.3 and MainGoodsID is not null
        order by ID asc'''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            my_lg.error('TypeError: database connection failed... (may be under maintenance)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> all goods_id returned by the db that match the criteria <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('total items to update: {0}'.format(len(result)))

            my_lg.info('about to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declare inside the loop and delete afterwards to release memory
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # real-time update
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # reconnect every 10 items to avoid a stale long-lived connection erroring out
                    my_lg.info('resetting and establishing a new db connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('new db connection established...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                shelf_time=item[5],
                                delete_time=item[6])
                        if data.get('is_delete') == 1:  # handle delisted goods separately
                            my_lg.info('@@@ this item has been delisted...')
                        else:
                            data['_is_price_change'], data[
                                '_price_change_info'] = _get_price_change_info(
                                    old_price=item[3],
                                    old_taobao_price=item[4],
                                    new_price=data['price'],
                                    new_taobao_price=data['taobao_price'])
                            # my_lg.info(str(data['_is_price_change']) + ' ' +str(data['_price_change_info']))

                        yanxuan.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:  # the returned data was empty
                        my_lg.info('------>>>| sleeping 8s...')
                        sleep(8)

                else:  # db connection failed
                    my_lg.error('db connection failed; the db may be down or under maintenance')
                    sleep(5)
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('all data updated'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
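
# A hedged sketch of what a helper like _get_price_change_info could return
# (the real helper is imported elsewhere and not shown in this excerpt; the
# change_info fields below are assumptions for illustration only):
def _price_change_info_sketch(old_price, old_taobao_price,
                              new_price, new_taobao_price):
    is_change = int(old_price != new_price
                    or old_taobao_price != new_taobao_price)
    change_info = {
        'old_price': old_price,
        'new_price': new_price,
        'old_taobao_price': old_taobao_price,
        'new_taobao_price': new_taobao_price,
    } if is_change else {}
    return is_change, change_info
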
Exemplo n.º 10
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet 
        where SiteID=2 and MainGoodsID is not null and GETDATE()-ModfiyTime>1
        order by ID desc
        '''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError: database connection failed... (may be under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all goods_id returned by the db that match the criteria <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('about to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declare inside the loop and delete afterwards to release memory
            ali_1688 = ALi1688LoginAndParse()
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection erroring out
                    print('resetting and establishing a new db connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('new db connection established...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                        % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int):  # handle an int return (e.g. 4041) separately
                        continue

                    if data.get('is_delete') == 1:  # handle goods that were already delisted when first inserted
                        data['goods_id'] = item[0]

                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        print('shelf time:', data['shelf_time'], 'delete time:',
                              data['delete_time'])

                        # print('------>>>| crawled data: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(1.5)  # avoid updating the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        print('shelf time:', data['shelf_time'], 'delete time:',
                              data['delete_time'])
                        '''for this to work, price and taobao_price must stay unchanged after the first crawl; any change is recorded in the _price_change_info field'''
                        # business logic:
                        #   if the backend's modify_time > the conversion time and is_price_change=1, compare against the pricechange data; if identical, do not prompt platform staff to adjust the price
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # print('------>>>| crawled data: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(.3)  # avoid updating the server too frequently
                    else:  # the returned data was empty
                        pass
                else:  # db connection failed
                    print('db connection failed; the db may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('all data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
Exemplo n.º 11
class XiaoHongShuParse(Crawler):
    def __init__(self, logger=None, by_wx=False):
        '''
        :param logger:
        :param by_wx: crawl via the wx mini program (downside: no tags value; upside: sustainable long-term crawling, hard to get banned) √
                      vs crawling via the app (downside: tests show that even with elite proxies, every ~20 fetches earn a 3-5 minute ban; inefficient)
        '''
        super(XiaoHongShuParse, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/小红书/_/',
        )
        self._set_headers()
        self.by_wx = by_wx
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.index = 0
        self.success_insert_db_num = 0
        self.CRAWL_ARTICLE_SLEEP_TIME = 1  # sleep_time per article crawl (wx=1 / app=2)
        self.LONG_SLEEP_TIME = 0  # sleep time after every 10 items crawled
        self.db_share_id = []  # share_ids already present in the db
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        self.headers = {
            'authority': 'www.xiaohongshu.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

    def _get_xiaohongshu_home_aritles_info(self):
        '''
        Simulate the xiaohongshu home-feed json request (mimics the app's home request).
        :return:
        '''
        headers = {
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            # 'device_id': '2AEEF650-2CAE-480F-B30C-CA5CABC26193',
            'Accept': 'application/json',
            'Host': 'www.xiaohongshu.com',
            'User-Agent':
            'discover/5.19.1 (iPhone; iOS 11.0; Scale/3.00) Resolution/1242*2208 Version/5.19.1 Build/5191001 Device/(Apple Inc.;iPhone7,1)',
            # 'Authorization': 'session.1210427606534613282',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'X-Tingyun-Id': 'LbxHzUNcfig;c=2;r=551911068',
        }

        # every param below is required and must not change
        params = (
            ('deviceId', '2AEEF650-2CAE-480F-B30C-CA5CABC26193'),
            ('device_fingerprint',
             '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('device_fingerprint1',
             '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('lang', 'zh'),
            ('num', '10'),
            ('oid', 'homefeed_recommend'),
            ('platform', 'iOS'),
            ('sid', 'session.1210427606534613282'),
            ('sign', 'c9a9eadc6c46823ae3075d7b28fe97fa'),
            ('t', '1531010946'),  # keep the original t to avoid a sign mismatch
            # ('t', int(time.time())),
        )

        url = 'https://www.xiaohongshu.com/api/sns/v6/homefeed'
        body = Requests.get_url_body(url=url,
                                     headers=headers,
                                     params=params,
                                     cookies=None,
                                     high_conceal=True,
                                     ip_pool_type=self.ip_pool_type)
        # self.lg.info(body)
        if body == '':
            self.lg.error('got an empty body! please check!')
            return []

        if re.compile(r'<title>403 Forbidden</title>').findall(body) != []:
            self.lg.info('this fetch was blocked with a 403!')
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            return []

        _ = json_2_dict(body, logger=self.lg).get('data', [])
        # pprint(_)
        if _ == []:
            self.lg.error('got empty data! please check!')
            return []

        _ = [{
            'share_link': item.get('share_link', ''),
            'likes': item.get('likes', 0),
        } for item in _]

        return _

    def _deal_with_home_article(self):
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        # pprint(home_articles_link_list)
        self.lg.info(home_articles_link_list)

        # self.lg.info(str(home_articles_link_list) + '\n')
        data = self._deal_with_articles(articles_list=home_articles_link_list)
        # pprint(data)

        self._save_articles(data=data)

        self.lg.info('one crawl round finished, sleeping {0}s...'.format(self.LONG_SLEEP_TIME))
        sleep(self.LONG_SLEEP_TIME)  # periodic sleep to avoid bans from crawling too often (tests show ~20 fetches trigger a short ban)

        return True

    def _deal_with_articles(self, articles_list):
        '''
        Process the given xiaohongshu article urls (articles_list).
        :param articles_list: article urls to crawl, eg: [{'share_link': 'xiaohongshu url', 'likes': 111}, ...]   # likes may be omitted
        :return: data, a list
        '''
        data = []
        _db = self.my_pipeline._select_table(
            sql_str='select share_id from dbo.daren_recommend')
        if _db is not None and _db != [] and _db != [()]:
            self.db_share_id = [item[0] for item in _db]
            # self.lg.info(self.db_share_id)

        for item in articles_list:
            self.index += 1
            article_link = item.get('share_link', '')
            article_likes = item.get('likes', 0)
            article_id = re.compile(r'/item/(\w+)').findall(article_link)[0]

            if article_id in self.db_share_id:
                self.lg.info('{0} already exists in the db... skip!'.format(article_id))
                continue

            self.lg.info('[+] {0}'.format(article_link))
            if article_link != '':
                if not self.by_wx:  # via the pc site
                    params = (('_at',
                               '499a292d16aed3d80a068fc60e0c1e3ee3410'), )
                    body = Requests.get_url_body(
                        url=article_link,
                        headers=self.headers,
                        params=params,
                        high_conceal=True,
                        ip_pool_type=self.ip_pool_type)
                    # self.lg.info(str(body))
                    try:
                        article_info = re.compile(
                            'window.__INITIAL_SSR_STATE__=(.*?)</script>'
                        ).findall(body)[0]
                        # self.lg.info(str(article_info))
                    except IndexError:
                        self.lg.error('IndexError while extracting article_info! please check!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue

                    article_info = self._wash_article_info(
                        json_2_dict(json_str=article_info, logger=self.lg))
                    # pprint(article_info)
                    article_info = self._parse_page(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                    # pprint(article_info)

                else:  # via the wx mini program
                    # url = "https://www.xiaohongshu.com/wx_mp_api/sns/v1/note/" + article_id
                    # the wx api changed and now requires an Auth param; not handled yet
                    url = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/note/' + article_id
                    params = {
                        "sid":
                        "session.1210427606534613282",  # used by their server to check login expiry (replace it once expired to resume crawling)
                    }
                    body = Requests.get_url_body(
                        url=url,
                        headers=self.headers,
                        params=params,
                        ip_pool_type=self.ip_pool_type)
                    # self.lg.info(str(body))
                    if body == '':
                        self.lg.error('got an empty article body! skip!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue
                    article_info = self._wash_article_info_from_wx(
                        json_2_dict(json_str=body, logger=self.lg))
                    article_info = self._parse_page_from_wx(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                    # pprint(article_info)

                data.append(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            else:
                pass

        self.lg.info('@@@ crawl finished!')
        # pprint(data)

        return data
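
    # Design note: db_share_id is a plain list, so `article_id in
    # self.db_share_id` is a linear scan per article; a set makes the check
    # O(1). A minimal sketch of the swap:
    #     self.db_share_id = {item[0] for item in _db}   # build a set instead
    #     if article_id in self.db_share_id: continue    # O(1) membership test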

    def _parse_page(self, **kwargs):
        '''
        Parse a single article's info.
        :return: a dict
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('NoteView', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())

        error_msg = '出错article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('noteInfo',
                                         {}).get('user',
                                                 {}).get('nickname', '')
            assert nick_name != '', 'nick_name is empty! please check!' + error_msg

            head_url = article_info.get('noteInfo',
                                        {}).get('user', {}).get('image', '')
            assert head_url != '', 'head_url is empty! please check!' + error_msg

            profile = ''  # personal bio / signature (left empty)

            share_id = article_info.get('noteInfo', {}).get('id', '')
            assert share_id != '', 'share_id is empty! please check!' + error_msg

            title = article_info.get('noteInfo', {}).get('title',
                                                         '')  # title defaults to empty
            comment_content = self.wash_sensitive_info(
                article_info.get('noteInfo', {}).get('desc', ''))
            assert comment_content != '', 'comment_content is empty! please check!' + error_msg

            share_img_url_list = [{   # for a video, the first image is its first frame
                'img_url': item.get('original', ''),
                'height': item.get('height'),           # image height / width
                'width': item.get('width'),
            } for item in article_info.get('noteInfo', {}).get('images', [])]
            assert share_img_url_list != [], 'share_img_url_list is an empty list! please check!' + error_msg

            div_body = ''  # left empty by default
            gather_url = article_link

            # the article's original creation date
            tmp_create_time = article_info.get('noteInfo', {}).get('time', '')
            assert tmp_create_time != '', 'create_time is empty! please check!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # 3 = xiaohongshu
            goods_url_list = []  # goods urls to crawl from this article
            share_goods_base_info = []

            tags = self._get_tags(article_info=article_info)

            # video playback url
            tmp_video_url = article_info.get('noteInfo', {}).get('video', '')
            tmp_video_url = 'https:' + tmp_video_url if tmp_video_url != '' else ''
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

            likes = article_likes
            collects = article_info.get('noteInfo', {}).get('collects', None)
            assert collects is not None, 'collects is None! please check!' + error_msg

        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('hit an error: ', exc_info=True)
            return {}

        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects

        return _

    def _parse_page_from_wx(self, **kwargs):
        '''
        Parse a single wx article's info.
        :param kwargs:
        :return: a WellRecommendArticle object
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())

        error_msg = '出错article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', 'nick_name is empty! please check!' + error_msg

            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', 'head_url is empty! please check!' + error_msg

            profile = ''  # personal bio / signature (left empty)

            share_id = article_info.get('id', '')
            assert share_id != '', 'share_id is empty! please check!' + error_msg

            title = self.wash_sensitive_info(article_info.get('title',
                                                              ''))  # title defaults to empty
            comment_content = self.wash_sensitive_info(
                article_info.get('desc', ''))
            assert comment_content != '', 'comment_content is empty! please check!' + error_msg

            share_img_url_list = [{  # for a video, the first image is its first frame
                'img_url': item.get('original', ''),
                'height': item.get('height'),  # image height / width
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], 'share_img_url_list is an empty list! please check!' + error_msg

            div_body = ''  # left empty by default
            gather_url = article_link

            # the article's original creation date
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', 'create_time is empty! please check!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # 3 = xiaohongshu
            goods_url_list = []  # goods urls to crawl from this article
            share_goods_base_info = []

            # the wx api returns no tags
            tags = self._get_tags_from_wx(article_info=article_info)

            # video playback url
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile(r'\?.*').sub('', tmp_video_url)
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, 'collects is None! please check!' + error_msg

        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('hit an error:', exc_info=True)
            return {}

        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects

        return _

    def _save_articles(self, data):
        '''
        Store the data.
        :param data:
        :return:
        '''
        self.lg.info('about to store the articles...')
        sql_str = 'insert into dbo.daren_recommend(share_id, nick_name, head_url, profile, gather_url, title, comment_content, share_img_url_list, div_body, create_time, site_id, tags, video_url, likes, collects) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        for item in data:
            if self.index % 20 == 0:
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

            if self.my_pipeline.is_connect_success:
                share_id = item.get('share_id', '')
                if share_id == '':
                    continue

                self.lg.info(
                    '------>>>| storing share_id: {0}...'.format(share_id))
                try:
                    params = self._get_db_insert_into_params(item=item)
                except Exception:
                    continue
                result = self.my_pipeline._insert_into_table_2(sql_str=sql_str,
                                                               params=params,
                                                               logger=self.lg)
                if result:
                    self.success_insert_db_num += 1

            else:
                self.lg.error('db connection failed! store failed! failing article url: {0}'.format(
                    item.get('gather_url', '')))

        self.lg.info('@' * 9 +
                     ' successfully stored {0} so far!'.format(self.success_insert_db_num))

        return True

    def _get_db_insert_into_params(self, item):
        '''
        Build the params to be stored.
        :param item:
        :return:
        '''
        params = [
            item['share_id'],
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            dumps(item['share_img_url_list'], ensure_ascii=False),
            # dumps(item['goods_id_list'], ensure_ascii=False),
            # dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
            dumps(item['tags'], ensure_ascii=False),
            item['video_url'],
            item['likes'],
            item['collects'],
        ]

        return tuple(params)

    def _get_tags(self, article_info):
        '''
        Get tags.
        :return:
        '''
        tmp_tags = list_duplicate_remove([
            str(item.get('name', '')) for item in article_info.get(
                'noteInfo', {}).get('relatedTags', [])
        ])
        # self.lg.info(str(tmp_tags))
        # join the list to a str, strip sensitive words, split back to a list and drop '' elements to get the final list
        tmp_tags = delete_list_null_str(
            self.wash_sensitive_info('|'.join(tmp_tags)).split('|'))
        tags = [
            {  # tags may be an empty list!
                'keyword': item,
            } for item in tmp_tags
        ]

        return tags
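
    # A walk-through of the tag pipeline above on sample data
    # (list_duplicate_remove / wash_sensitive_info / delete_list_null_str are
    # the project's own helpers):
    #   raw tags   : ['小红书', '穿搭', '穿搭', '']
    #   deduped    : ['小红书', '穿搭', '']                       # duplicates dropped, order kept
    #   washed str : '优秀网|穿搭|'                                # joined on '|', sensitive words swapped
    #   final tags : [{'keyword': '优秀网'}, {'keyword': '穿搭'}]  # split back, '' removed, wrapped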

    def _get_tags_from_wx(self, article_info):
        '''
        Get tags from wx (the wx api returns no tags, hence []).
        :param article_info:
        :return:
        '''

        return []

    def _wash_article_info(self, _dict):
        '''
        Strip unused fields.
        :param _dict:
        :return:
        '''
        try:
            _dict['NoteView']['commentInfo'] = {}  # comment info
            _dict['NoteView']['panelData'] = []  # related notes
        except:
            pass

        return _dict

    def _wash_article_info_from_wx(self, _dict):
        '''
        Strip unused wx fields.
        :param _dict:
        :return:
        '''
        try:
            _dict['data']['mini_program_info'] = {}  # thumbnail info for the recommend home
            _dict['data']['share_info'] = {}  # share info
        except:
            pass

        return _dict

    def wash_sensitive_info(self, data):
        '''
        Wash sensitive words.
        :param data:
        :return:
        '''
        replace_str_list = [
            ('小红书', '优秀网'),
            ('xiaohongshu', '优秀网'),
            ('XIAOHONGSHU', '优秀网'),
            ('某宝', '优秀网'),
            ('薯队长', '秀队长'),
            ('薯宝宝', '秀客'),
            ('红薯们', '秀客们'),
            ('小红薯', '小秀客'),
        ]

        add_sensitive_str_list = [
            '#.*#',
            '@.*?薯',
        ]

        return wash_sensitive_info(
            data=data,
            replace_str_list=replace_str_list,
            add_sensitive_str_list=add_sensitive_str_list,
            is_default_filter=True,
            is_lower=False,
        )
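
    # A minimal sketch of a replace-then-drop washer, assuming
    # wash_sensitive_info applies the (old, new) pairs first and then
    # regex-deletes the add_sensitive_str_list patterns (the real imported
    # helper may differ, e.g. in its default filter and lower-casing):
    @staticmethod
    def _wash_sketch(data, replace_str_list, add_sensitive_str_list):
        for old, new in replace_str_list:
            data = data.replace(old, new)
        for pattern in add_sensitive_str_list:
            data = re.sub(pattern, '', data)
        return data
    # e.g. _wash_sketch('小红书#tag#', [('小红书', '优秀网')], ['#.*#']) -> '优秀网'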

    def __del__(self):
        try:
            del self.lg
            del self.my_pipeline
        except:
            pass
        gc.collect()
Exemplo n.º 12
class RecommendGoodOps(AsyncCrawler):
    """荐好ops"""
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            log_print=True,
            is_new_loop=False,
            log_save_path=MY_SPIDER_LOGS_PATH + '/荐好/ops/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.request_num_retries = 6
        self.article_type = 'zq'
        self.yx_username = input('Enter yx_username: ')
        self.yx_password = input('Enter yx_password: ')
        self.lg.info('yx_username: {}, yx_password: {}'.format(
            self.yx_username, self.yx_password))
        # https is no longer supported (it used to be)
        self.publish_url = 'http://configadmin.yiuxiu.com/Business/Index'
        self.select_sql0 = 'SELECT unique_id FROM dbo.recommend_good_ops_article_id_duplicate_removal'
        self.insert_sql0 = 'INSERT INTO dbo.recommend_good_ops_article_id_duplicate_removal(unique_id, create_time) values(%s, %s)'
        self.min_article_id = 0
        self.max_article_id = 0
        self.driver_headless = True
        # a proxy is required; yx rate-limits frequent ips
        self.driver_use_proxy = True
        # 'recommend good' management label
        self.recommend_good_label_css_selector = 'span.nav-label'
        # set the kaiyan min_article_id / max_article_id
        self.ky_min_article_id, self.ky_max_article_id = 4000, 60000
        # how many article_ids to sample per source
        self.zq_intercept_num = 2
        self.hk_intercept_num = 1
        self.lfd_intercept_num = 1
        self.gxg_intercept_num = 1
        self.pp_intercept_num = 2
        self.kr_intercept_num = 1
        self.dfsp_intercept_num = 1
        self.jrxsp_intercept_num = 1
        self.ky_intercept_num = 1
        # extra count for full-screen videos
        self.lsp_intercept_num = 2
        self.mp_intercept_num = 1
        self.klm_intercept_num = 2
        self.article_parser = None
        # dicts caching each source's article_list (hk = haokan videos, etc.)
        self.hk_cache_dict = {}
        self.lfd_cache_dict = {}
        self.gxg_cache_dict = {}
        self.pp_cache_dict = {}
        self.kr_cache_dict = {}
        self.dfsp_cache_dict = {}
        self.lsp_cache_dict = {}
        self.mp_cache_dict = {}
        self.klm_cache_dict = {}
        self.jrxsp_cache_dict = {}

    async def _fck_run(self):
        # sleep between rounds to avoid publishing too often (7.5 min was tried since 5 min felt too fast to clean up after; with more videos and a higher failure rate it is back to 5 min)
        # sleep_time = 0.
        sleep_time = 60 * 5.
        self.db_article_id_list = await self.get_db_unique_id_list()
        assert self.db_article_id_list != []
        self.lg.info('db_article_id_list_len: {}'.format(
            len(self.db_article_id_list)))

        _timeout = await self.get_auto_publish_articles_timeout()
        while True:
            if get_shanghai_time().hour == 0:
                # sleep through the night
                await async_sleep(60 * 60 * 4.)
            else:
                pass
            try:
                try:
                    await async_wait_for(
                        self.auto_publish_articles(),
                        timeout=_timeout,
                    )
                except AsyncTimeoutError:
                    raise PublishOneArticleFailException

            except (
                    ArticleTitleOverLongException,
                    LoginFailException,
                    ArticleTitleContainSensitiveWordsException,
                    PublishOneArticleFailException,
                    EnterTargetPageFailException,
            ):
                self.lg.error('hit an error:', exc_info=True)
                continue

            except Exception:
                self.lg.error('hit an error:', exc_info=True)

            self.lg.info('sleeping {}s...'.format(sleep_time))
            await async_sleep(sleep_time)

    async def get_auto_publish_articles_timeout(self):
        """
        Compute the timeout for one auto-publish round.
        :return:
        """
        all_intercept_num = self.zq_intercept_num \
                            + self.hk_intercept_num \
                            + self.lfd_intercept_num \
                            + self.gxg_intercept_num \
                            + self.pp_intercept_num \
                            + self.kr_intercept_num \
                            + self.dfsp_intercept_num \
                            + self.lsp_intercept_num \
                            + self.mp_intercept_num \
                            + self.klm_intercept_num \
                            + self.jrxsp_intercept_num \
                            + self.ky_intercept_num
        _timeout = all_intercept_num * 2.5 * 60
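        # worked numbers with the __init__ defaults above:
        #   2+1+1+1+2+1+1+2+1+2+1+1 = 16 items per round,
        #   16 items * 2.5 min = 40 min -> _timeout = 2400.0 seconds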

        return _timeout

    async def get_db_unique_id_list(self) -> list:
        """
        Fetch the unique_id list from the db.
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if not self.sql_cli.is_connect_success:
            raise SqlServerConnectionException

        res = []
        try:
            res = self.sql_cli._select_table(
                sql_str=self.select_sql0,
                logger=self.lg,
            )
        except Exception:
            self.lg.error('hit an error:', exc_info=True)

        res = [] if res is None else res

        return [item[0] for item in res]

    async def auto_publish_articles(self):
        """
        Auto-publish articles.
        :return:
        """
        self.sql_cli = get_new_sql_cli(sql_cli=self.sql_cli)
        if not self.sql_cli.is_connect_success:
            raise SqlServerConnectionException
        else:
            pass

        if self.min_article_id == 0\
            or self.max_article_id == 0:
            self.article_parser = ArticleParser(logger=self.lg)
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type=self.article_type, ))
            assert article_list != []

            self.min_article_id, self.max_article_id = self.get_latest_max_and_min_artcile_id_from_article_list(
                article_list=article_list, )
            self.lg.info('latest min_article_id: {}, max_article_id: {}'.format(
                self.min_article_id,
                self.max_article_id,
            ))
        else:
            pass

        # todo: login keeps failing under shadow mode; turn shadow off (don't use it or lantern), or use shadow's global-proxy mode

        # build the per-source article collections
        # zq_article_list = []
        # hk_article_list = []
        # lfd_article_list = []
        # gxg_article_list = []
        # pp_article_list = []
        # kr_article_list = []
        # dfsp_article_list = []
        # lsp_article_list = []
        # mp_article_list = []
        # klm_article_list = []
        # jrxsp_article_list = []
        zq_article_list = self.get_zq_own_create_article_id_list(
            min_article_id=self.min_article_id,
            max_article_id=self.max_article_id,
        )
        hk_article_list = self.get_hk_article_id_list()
        lfd_article_list = self.get_lfd_article_id_list()
        gxg_article_list = self.get_gxg_article_id_list()
        pp_article_list = self.get_pp_article_id_list()
        kr_article_list = self.get_kr_article_id_list()
        dfsp_article_list = self.get_dfsp_article_id_list()
        lsp_article_list = self.get_lsp_article_id_list()
        mp_article_list = self.get_mp_article_id_list()
        klm_article_list = self.get_klm_article_id_list()
        jrxsp_article_list = self.get_jrxsp_article_id_list()
        ky_article_list = self.get_ky_own_create_article_id_list()

        # for testing
        # article_id = '17300123'
        # article_list = [{
        #     'uid': get_uuid3(target_str='{}::{}'.format('zq', article_id)),
        #     'article_type': 'zq',
        #     'article_id': article_id,
        #     'title': '未知',
        #     'article_url': 'https://focus.youth.cn/mobile/detail/id/{}#'.format(article_id),
        # }]

        # publish order: articles first, videos last (to avoid publishing too many videos)
        article_list = zq_article_list \
                       + pp_article_list \
                       + kr_article_list \
                       + dfsp_article_list \
                       + hk_article_list \
                       + klm_article_list \
                       + jrxsp_article_list \
                       + mp_article_list \
                       + lsp_article_list \
                       + ky_article_list \
                       + lfd_article_list \
                       + gxg_article_list

        assert article_list != []
        # pprint(article_list)

        target_article_list = self.get_target_article_list(
            article_list=article_list)
        if target_article_list == []:
            self.lg.info('target_article_list to publish is empty, pass!')
            return

        driver = None
        try:
            try:
                # launching chromedriver in proxy mode on the rasp fails fairly often, so stick with the mac
                driver = BaseDriver(
                    type=CHROME,
                    executable_path=CHROME_DRIVER_PATH,
                    # keeps erroring locally
                    # type=FIREFOX,
                    # executable_path=FIREFOX_DRIVER_PATH,
                    load_images=True,
                    logger=self.lg,
                    headless=self.driver_headless,
                    driver_use_proxy=self.driver_use_proxy,
                    ip_pool_type=self.ip_pool_type,
                )
                self.login_bg(driver=driver)
                self.get_into_recommend_good_manage(driver=driver)
            except (FZTimeoutError, WebDriverException):
                raise LoginFailException

            for item in target_article_list:
                uid = item.get('uid', '')
                title = item.get('title', '')
                article_url = item.get('article_url', '')
                self.lg.info('publishing article title: {}, article_url: {} ...'.format(
                    title, article_url))
                try:
                    self.publish_one_article(
                        driver=driver,
                        article_url=article_url,
                    )
                except FZTimeoutError:
                    raise PublishOneArticleFailException

                # record the uid locally and insert it into the db
                self.db_article_id_list.append(uid)
                self.sql_cli._insert_into_table_2(
                    sql_str=self.insert_sql0,
                    params=(
                        uid,
                        get_shanghai_time(),
                    ),
                    logger=self.lg,
                )
        except (
                ArticleTitleOverLongException,
                LoginFailException,
                ArticleTitleContainSensitiveWordsException,
                PublishOneArticleFailException,
                EnterTargetPageFailException,
        ) as e:
            # re-raise
            raise e
        except Exception:
            self.lg.error('hit an error:', exc_info=True)

        finally:
            try:
                # ** note: do not simply `del driver`; tests show the browser is not closed properly and keeps running!!
                # del driver
                # quit the browser
                driver.driver.quit()
                # closes only the current window, keeps the browser running
                # driver.driver.close()
                self.lg.info('driver released!')
            except:
                try:
                    driver.driver.quit()
                except:
                    pass
            collect()

        return

    def get_ky_own_create_article_id_list(self):
        """
        Build the ky article_list from the configured id range.
        :return:
        """
        article_id_list = [
            str(article_id) for article_id in range(self.ky_min_article_id,
                                                    self.ky_max_article_id)
        ]

        # sample ky_intercept_num ids at random
        article_id_list = random_sample(article_id_list, self.ky_intercept_num)
        res = [{
            'uid':
            get_uuid3(target_str='{}::{}'.format('ky', article_id)),
            'article_type':
            'ky',
            'title':
            '未知',
            'article_id':
            article_id,
            'article_url':
            'https://www.kaiyanapp.com/detail.html?vid={}'.format(article_id),
        } for article_id in article_id_list]

        return res
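
    # Sketch of a deterministic dedup key in the spirit of get_uuid3 (assumed
    # here to be a uuid3-style wrapper; the namespace choice is an assumption):
    @staticmethod
    def _uid_sketch(article_type, article_id):
        import uuid
        return str(uuid.uuid3(uuid.NAMESPACE_DNS,
                              '{}::{}'.format(article_type, article_id)))
    # _uid_sketch('ky', '4000') always yields the same uid, so an already
    # published article is caught by the db_article_id_list membership check.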

    def get_jrxsp_article_id_list(self):
        """
        Build the jrxsp article_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.jrxsp_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='jrxsp', ))
            self.jrxsp_cache_dict['data'] = article_list
            self.jrxsp_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.jrxsp_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 40 * 60:
                # jrxsp publishes a limited number per day; refresh every 40 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='jrxsp', ))
                self.jrxsp_cache_dict['data'] = article_list
                self.jrxsp_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.jrxsp_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list,
                                         self.jrxsp_intercept_num)

        return article_list

    def get_klm_article_id_list(self):
        """
        Build the klm article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.klm_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='klm', ))
            self.klm_cache_dict['data'] = article_list
            self.klm_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.klm_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # klm publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='klm', ))
                self.klm_cache_dict['data'] = article_list
                self.klm_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.klm_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.klm_intercept_num)

        return article_list

    def get_mp_article_id_list(self):
        """
        Build the mp article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.mp_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='mp', ))
            self.mp_cache_dict['data'] = article_list
            self.mp_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.mp_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # mp publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='mp', ))
                self.mp_cache_dict['data'] = article_list
                self.mp_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.mp_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.mp_intercept_num)

        return article_list

    def get_lsp_article_id_list(self):
        """
        Build the lsp article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.lsp_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='lsp', ))
            self.lsp_cache_dict['data'] = article_list
            self.lsp_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.lsp_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # lsp publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='lsp', ))
                self.lsp_cache_dict['data'] = article_list
                self.lsp_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.lsp_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.lsp_intercept_num)

        return article_list

    def get_dfsp_article_id_list(self):
        """
        Build the dfsp article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.dfsp_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='dfsp', ))
            self.dfsp_cache_dict['data'] = article_list
            self.dfsp_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.dfsp_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # dfsp publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='dfsp', ))
                self.dfsp_cache_dict['data'] = article_list
                self.dfsp_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.dfsp_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.dfsp_intercept_num)

        return article_list

    def get_kr_article_id_list(self):
        """
        Build the kr article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.kr_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='kr', ))
            self.kr_cache_dict['data'] = article_list
            self.kr_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.kr_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # kr publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='kr', ))
                self.kr_cache_dict['data'] = article_list
                self.kr_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.kr_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.kr_intercept_num)

        return article_list

    def get_pp_article_id_list(self):
        """
        Build the pp article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.pp_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='pp', ))
            self.pp_cache_dict['data'] = article_list
            self.pp_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.pp_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # pp publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='pp', ))
                self.pp_cache_dict['data'] = article_list
                self.pp_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.pp_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.pp_intercept_num)

        return article_list

    def get_gxg_article_id_list(self):
        """
        Build the gxg article_id_list.
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.gxg_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='gxg', ))
            self.gxg_cache_dict['data'] = article_list
            self.gxg_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.gxg_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # gxg publishes a limited number per day; refresh every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='gxg', ))
                self.gxg_cache_dict['data'] = article_list
                self.gxg_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.gxg_cache_dict['data']

        if article_list != []:
            # sample a few (interleaved with image/text posts)
            article_list = random_sample(article_list, self.gxg_intercept_num)

        return article_list

    def get_lfd_article_id_list(self):
        """
        Get the target article_id_list for lfd
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.lfd_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='lfd', ))
            self.lfd_cache_dict['data'] = article_list
            self.lfd_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.lfd_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 30 * 60:
                # lfd updates a limited number per day; re-fetch every 30 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='lfd', ))
                self.lfd_cache_dict['data'] = article_list
                self.lfd_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.lfd_cache_dict['data']

        if article_list != []:
            # take 1 (interleaved with image-and-text posts)
            article_list = random_sample(article_list, self.lfd_intercept_num)

        return article_list

    def get_hk_article_id_list(self):
        """
        Get the target article_id_list for hk
        :return:
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        else:
            pass

        if self.hk_cache_dict == {}:
            # first run
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type='hk', ))
            self.hk_cache_dict['data'] = article_list
            self.hk_cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            cache_time = self.hk_cache_dict['cache_time']
            if datetime_to_timestamp(
                    get_shanghai_time()) - cache_time > 12 * 60:
                # re-fetch every 12 minutes
                article_list = self.loop.run_until_complete(
                    self.article_parser.get_article_list_by_article_type(
                        article_type='hk', ))
                self.hk_cache_dict['data'] = article_list
                self.hk_cache_dict['cache_time'] = datetime_to_timestamp(
                    get_shanghai_time())
            else:
                article_list = self.hk_cache_dict['data']

        if article_list != []:
            # take 1 (interleaved with image-and-text posts)
            article_list = random_sample(article_list, self.hk_intercept_num)

        return article_list
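
    # NOTE (editor's sketch): the five *_article_id_list getters above differ
    # only in article_type, cache dict, intercept count and cache TTL. A
    # minimal parametrized version under those assumptions; the helper name
    # `_get_cached_article_list` is hypothetical, not part of the original.
    # e.g.: self._get_cached_article_list('hk', self.hk_cache_dict,
    #                                     self.hk_intercept_num, ttl=12 * 60)
    def _get_cached_article_list(self,
                                 article_type: str,
                                 cache_dict: dict,
                                 intercept_num: int,
                                 ttl: int = 30 * 60) -> list:
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)

        now = datetime_to_timestamp(get_shanghai_time())
        # first run, or cache older than ttl seconds -> re-fetch
        if cache_dict == {} or now - cache_dict['cache_time'] > ttl:
            cache_dict['data'] = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type=article_type, ))
            cache_dict['cache_time'] = now
        article_list = cache_dict['data']

        if article_list != []:
            # take a slice (interleaved with image-and-text posts)
            article_list = random_sample(article_list, intercept_num)

        return article_list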

    def get_latest_max_and_min_artcile_id_from_article_list(
            self, article_list) -> tuple:
        """
        Get the smallest and largest article_id in the latest range (so we can generate ids dynamically ourselves)
        :return: (int, int)
        """
        latest_article_id_list = []
        for item in article_list:
            # e.g. a zq article_id looks like '17296475'
            article_id = item.get('article_id', '')
            if len(article_id) >= 8:
                latest_article_id_list.append(int(article_id))
            else:
                continue

        assert latest_article_id_list != []
        latest_article_id_list = sorted(latest_article_id_list)
        # pprint(latest_article_id_list)

        return (latest_article_id_list[0], latest_article_id_list[-1])
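
    # NOTE (editor's sketch): a full sort is unnecessary just for the two
    # extremes; min()/max() give the same pair. Equivalent to the method
    # above, same item shape assumed (this method name is hypothetical):
    def get_min_and_max_article_id(self, article_list) -> tuple:
        ids = [
            int(item['article_id']) for item in article_list
            if len(item.get('article_id', '')) >= 8
        ]
        assert ids != []
        return min(ids), max(ids)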

    def get_zq_own_create_article_id_list(self, min_article_id: int,
                                          max_article_id: int):
        """
        Build our own created article_id_list
        :return:
        """
        # start from the midpoint to avoid always publishing old news
        middle_article_id = int((min_article_id + max_article_id) / 2)
        self.lg.info('middle_article_id: {}'.format(middle_article_id))
        article_id_list = [
            str(article_id)
            for article_id in range(middle_article_id, max_article_id)
        ]

        # take 3 (zq_intercept_num)
        article_id_list = random_sample(article_id_list, self.zq_intercept_num)
        res = [{
            'uid':
            get_uuid3(target_str='{}::{}'.format('zq', article_id)),
            'article_type':
            'zq',
            'title':
            '未知',
            'article_id':
            article_id,
            'article_url':
            'https://focus.youth.cn/mobile/detail/id/{}#'.format(article_id),
        } for article_id in article_id_list]

        new_res = res

        # local checking disabled
        # article_parser = ArticleParser(logger=self.lg)
        # # article_list = self.loop.run_until_complete(article_parser.get_article_list_by_article_type(
        # #     article_type=self.article_type,))
        # new_res = []
        # for item in res:
        #     article_url = item.get('article_url', '')
        #     try:
        #         self.lg.info('local check url: {}'.format(article_url))
        #         _ = self.loop.run_until_complete(article_parser._parse_article(
        #             article_url=article_url,))
        #         title = _.get('title', '')
        #         assert title != ''
        #         # title must be <= 30 chars
        #         assert len(title) <= 30
        #     except Exception:
        #         continue
        #
        #     item.update({
        #         'title': title,
        #     })
        #     new_res.append(item)

        return new_res

    def get_target_article_list(self, article_list: list) -> list:
        """
        Get the items that have not been published yet
        :return:
        """
        target_article_list = []
        for item in article_list:
            try:
                title = item.get('title', '')
                assert title != ''
                uid = item.get('uid', '')
                assert uid != ''
                article_url = item.get('article_url', '')
                assert article_url != ''
                if uid not in self.db_article_id_list:
                    target_article_list.append(item)
                else:
                    # skip already-published articles
                    self.lg.info('Article already published! [where title: {}, url: {}]'.format(
                        title, article_url))
                    continue
            except Exception:
                self.lg.error('Error encountered:', exc_info=True)
                continue

        return target_article_list
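
    # NOTE (editor's sketch): `uid not in self.db_article_id_list` above is a
    # linear scan per article; if the published list grows large, one upfront
    # set conversion makes each membership test O(1), e.g.:
    #     published_uids = set(self.db_article_id_list)
    #     if uid not in published_uids: ...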

    @fz_set_timeout(seconds=1.5 * 60)
    def login_bg(self, driver: BaseDriver):
        """
        login
        :return:
        """
        self.lg.info('login ...')
        body = driver.get_url_body(
            url=self.publish_url,
            timeout=30,
        )
        try:
            assert body != ''
            driver.find_element(value='input#loginName').send_keys(
                self.yx_username)
            driver.find_element(value='input#loginPwd').send_keys(
                self.yx_password)
            driver.find_element(value='button#subbut').click()
        except (
                NoSuchElementException,
                SeleniumTimeoutException,
                AssertionError,
                WebDriverException,
                AttributeError,
        ):
            # raise a login exception
            raise LoginFailException

        try:
            self.wait_for_recommend_good_label_appear(driver=driver)
        except FZTimeoutError:
            # failed to enter the target page; raise!
            raise EnterTargetPageFailException

    @fz_set_timeout(seconds=10.)
    def wait_for_recommend_good_label_appear(self, driver: BaseDriver):
        """
        Wait until the 荐好管理 (recommend management) label appears
        :param driver:
        :return:
        """
        while True:
            recommend_good_label_text = driver.find_element(
                value=self.recommend_good_label_css_selector).text
            # self.lg.info('recommend_good_label_text: {}'.format(recommend_good_label_text))
            if recommend_good_label_text == '荐好管理':
                break
            else:
                continue

        self.lg.info('login success!')

    @fz_set_timeout(seconds=60.)
    def get_into_recommend_good_manage(self, driver: BaseDriver):
        """
        Enter the 荐好管理 (recommend management) section
        :param driver:
        :return:
        """
        try:
            driver.find_element(
                value=self.recommend_good_label_css_selector).click()
            # wait for the sub-menu label to appear
            sleep(.5)
            driver.find_element(value='a.J_menuItem').click()
        except SeleniumTimeoutException:
            # failed to enter the target page; raise!
            raise EnterTargetPageFailException

    @fz_set_timeout(seconds=2.5 * 60)
    def publish_one_article(self, driver: BaseDriver, article_url: str):
        """
        Publish one image-and-text article
        :param driver:
        :param article_url:
        :return:
        """
        try:
            # switch to the target iframe (index-based switching is sometimes wrong, skipped)
            # driver.switch_to_frame(frame_reference=1)

            iframe_ele_list = driver.find_elements(by=By.TAG_NAME,
                                                   value='iframe')
            # pprint(iframe_ele_list)
            assert iframe_ele_list != []
            target_iframe_ele = iframe_ele_list[1] if len(
                iframe_ele_list) > 1 else iframe_ele_list[0]
            driver.switch_to_frame(frame_reference=target_iframe_ele)
        except (NoSuchFrameException, ) as e:
            # no frame matched (we may already be inside the target iframe, e.g. after an over-long title; switching back finds iframe_ele_list empty)
            raise e

        try:
            # clear the input box
            input_box_ele = driver.find_element(value='input#SnatchUrl')
            input_box_ele.clear()
            # type in the address to collect
            input_box_ele.send_keys(article_url)
            # click the collect button
            driver.find_elements(
                value='span.input-group-btn button')[0].click()

            self.wait_for_delete_img_appear(driver=driver)
        except (FZTimeoutError, NoSuchElementException, WebDriverException):
            # publishing timed out or an element was missing; raise a publish exception
            raise PublishOneArticleFailException

        # read back the input box value
        title = driver.find_element(
            value='input#RecommendName').get_attribute('value')
        self.lg.info('title: {}'.format(title))
        if target_str_contain_some_char_check(
                target_str=title,
                check_char_obj=ARTICLE_TITLE_SENSITIVE_STR_TUPLE):
            raise ArticleTitleContainSensitiveWordsException
        else:
            pass
        if isinstance(title, str) and len(title) > 30:
            # over-long title: do not publish
            self.lg.info('@@@ title too long, cannot publish!! skipping!')
            # an over-long title blocks subsequent articles, so raise instead of returning
            # return
            raise ArticleTitleOverLongException
        else:
            pass

        try:
            # click the publish button
            driver.find_elements(
                value='span.input-group-btn button')[1].click()
        except WebDriverException:
            # handle a single-article publish failure!
            # handles errors like: Message: unknown error: Element <iframe class="J_iframe" name="iframe0"
            raise PublishOneArticleFailException

        # switch back to the main page
        driver.switch_to_default_content()
        # fill in the user to publish as
        random_phone = self.get_random_phone()
        driver.find_element(
            value='input.layui-layer-input').send_keys(random_phone)
        # click OK
        driver.find_element(value='a.layui-layer-btn0').click()

        self.lg.info('url: {} published successfully!'.format(article_url))
        # published; wait 8.5s for the page elements to clear
        sleep(8.5)

        return

    @fz_set_timeout(seconds=70.)
    def wait_for_delete_img_appear(self, driver: BaseDriver):
        """
        Wait until the image appears, exiting on timeout (also avoids publishing image-less articles)
        :return:
        """
        while True:
            # reworked: the direct-call style below is unsuitable; long runs freeze the machine
            try:
                delete_btn_text = driver.find_element(
                    value='div.deletebut').text
            except NoSuchElementException:
                # swallow this exception and keep waiting
                sleep(.3)
                continue

            # previous version, but it kept failing to publish!!
            # delete_btn_text = driver.find_element(value='div.deletebut').text

            # self.lg.info('delete_btn_text: {}'.format(delete_btn_text))
            if delete_btn_text == '删除':
                break
            else:
                continue

        self.lg.info('finished collecting this url!')

    def get_random_phone(self) -> int:
        """
        Pick a random phone number
        :return:
        """
        phone_list = []
        with open('../tools/phone.txt', 'r') as f:
            for line in f:
                try:
                    phone_list.append(int(line.replace('\n', '')))
                except Exception:
                    continue

        # pprint(phone_list)
        random_phone = phone_list[randint(0, len(phone_list) - 1)]
        self.lg.info('random_phone: {}'.format(random_phone))

        return random_phone
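
    # NOTE (editor's sketch): indexing with randint is equivalent to the
    # standard-library helper, assuming `choice` can be imported here:
    #     from random import choice
    #     random_phone = choice(phone_list)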

    def __del__(self):
        try:
            del self.lg
            del self.loop
            del self.db_article_id_list
            del self.publish_url
            del self.article_parser
            del self.hk_cache_dict
        except:
            pass
        collect()
Example #13
def run_forever():
    #### real-time data update
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use sqlalchemy to manage the DB connection pool
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()

        sql_str = 'select GoodsID, IsDelete, MyShelfAndDownTime from dbo.GoodsInfoAutoGet where SiteID=1'
        sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)'
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server._select_table(sql_str=sql_str, params=None)
            result_2 = list(
                tmp_sql_server_2._select_table(sql_str=sql_str_2, params=None))
            # print(result_2)
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('about to start the real-time update, please wait...'.center(100, '#'))
            index = 1

            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:  # real-time update
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    print('new database connection established...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ original goods url: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('this goods_id already exists in the database, skipping!')
                            continue

                        else:
                            print(
                                '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                                % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:  # delisted items still need inserting
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]

                                # print('------>>>| scraped data: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data=tt, pipeline=tmp_sql_server_2)

                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data[
                                    'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                        goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]

                                # print('------>>>| scraped data: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:  # database connection failed
                    print('database connection failed; the DB may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on overseas servers this delay can be shortened, even to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; stagger away from user requests
            print('all data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Example #14
# coding:utf-8
'''
@author = super_fazai
@File    : test_sql_str.py
@Time    : 2018/6/14 07:41
@connect : [email protected]
'''

import sys
sys.path.append('..')

from pprint import pprint
from json import dumps
from my_pipeline import SqlServerMyPageInfoSaveItemPipeline

_ = SqlServerMyPageInfoSaveItemPipeline()
sql_str = 'select gather_url, MainID from dbo.daren_recommend where site_id=2 and MainID is not null'
params = None
result = _._select_table(sql_str=sql_str, params=params)
pprint(result)

# update
# sql_str_2 = 'UPDATE dbo.daren_recommend set share_img_url_list=NULL, goods_id_list=NULL, share_goods_base_info=%s where MainID=579;'
# result = _._update_table(sql_str=sql_str_2, params=params)
# print(result)
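
# NOTE (editor's sketch): site_id could also be passed as a query parameter
# rather than inlined, assuming _select_table forwards `params` to the driver
# the same way _update_table does in the commented example above:
#     sql_str = ('select gather_url, MainID from dbo.daren_recommend '
#                'where site_id=%s and MainID is not null')
#     result = _._select_table(sql_str=sql_str, params=(2, ))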
Example #15
class Z8Updater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None
        self.goods_index = 1
        self.concurrency = 8  # concurrency
        self.delete_sql_str = z8_delete_str_3

    async def _get_db_old_data(self):
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=z8_delete_str_4,
                                              params=None)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=z8_select_str_4))
        except TypeError:
            self.lg.error('TypeError: database connection failed... (possibly under maintenance)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        miaosha_begin_time = json_2_dict(miaosha_time).get(
            'miaosha_begin_time')
        miaosha_begin_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_begin_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_begin_time
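
    # NOTE (editor's sketch): slicing str(time.mktime(...)) down to 10 chars
    # only truncates the float to whole seconds; int() on the float is the
    # direct equivalent for the same '%Y-%m-%d %H:%M:%S' input:
    #     int(time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S')))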

    async def _get_new_z8_obj(self, index):
        if index % 10 == 0:  # can't keep sharing one object, or driver access misbehaves!
            try:
                del self.zhe_800_spike
            except:
                pass
            collect()
            self.zhe_800_spike = Zhe800Spike()

    async def _update_is_delete(self, goods_id) -> bool:
        '''
        Soft-delete (logical delete) a delisted item
        :param goods_id:
        :return:
        '''
        delete_str = 'update dbo.zhe_800_xianshimiaosha set is_delete=1 where goods_id=%s'
        res = self.tmp_sql_server._update_table(sql_str=delete_str,
                                                params=(goods_id, ))
        await async_sleep(.3)

        return res

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single item
        :param item:
        :param index:
        :return:
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_z8_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                res = await self._update_is_delete(goods_id=goods_id)
                self.lg.info(
                    'expired goods_id ({0}), flash-sale start time ({1}), soft-deleted!'.format(
                        goods_id,
                        json.loads(item[1]).get('miaosha_begin_time')))
                index += 1
                self.goods_index = index
                res = True
                await async_sleep(.3)

                return goods_id, res

            elif is_recent_time == 2:
                # may include some expired ones
                self.lg.info('future start time, not updating yet! {}'.format(
                    timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # returned 1: inside the update window
                self.lg.info(
                    '------>>>| updating goods_id ({0}) | --------->>>@ index ({1})'.
                    format(goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    self.lg.error(msg='Error encountered:', exc_info=True)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    assert tmp_data != [], 'session_id does not exist, skipping'
                except AssertionError:  # this session_id has no data, so delete its flash-sale goods
                    self.lg.error(msg='Error encountered:', exc_info=True)
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        msg=
                        'no data under key "jsons" for this session_id! expired goods_id ({0}), flash-sale start time ({1}), deleted!'
                        .format(goods_id, miaosha_begin_time))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)

                    return goods_id, res

                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                # pprint(tmp_data)
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                        data=tmp_data)
                    # pprint(miaosha_goods_list)
                except ValueError:
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                # list of all zids currently under this session_id
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:  # delisted internally
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        'the site removed this item from the flash sale! delisted goods_id ({0}), soft-deleted!'.format(
                            goods_id))
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:  # still listed
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)

        else:  # database connection failed
            self.lg.error('database connection failed; the DB may be down or under maintenance')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)

        return goods_id, res

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update an item that is still listed
        :return:
        '''
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        zhe_800_miaosha = Zhe800Parse()
        res = False
        for item_1 in miaosha_goods_list:
            if item_1.get('zid', '') == goods_id:
                zhe_800_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_miaosha.deal_with_data()
                if goods_data == {}:  # skip if the returned data is empty
                    break

                else:  # otherwise parse and insert
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = str(item_1.get('zid'))
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        # self.lg.info(item_1.get('price'))
                        # self.lg.info(item_1.get('taobao_price'))
                        goods_data['price'] = item_1.get('price')
                        goods_data['taobao_price'] = item_1.get('taobao_price')
                    else:
                        self.lg.info('activity stock for this item is 0')
                        await self._update_is_delete(goods_id=goods_id)
                        break

                    goods_data['sub_title'] = item_1.get('sub_title')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item_1.get('miaosha_time'))

                    if goods_data.get('is_delete', 0) == 1:
                        self.lg.info('item [{0}] is sold out...'.format(goods_id))

                    # self.lg.info(str(goods_data['stock_info']))
                    # self.lg.info(str(goods_data['miaosha_time']))
                    res = zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break
            else:
                pass
        collect()

        return res

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Check whether the timestamp falls within the given date window
        :param timestamp: unix timestamp
        :return: 0: expired, price restored  1: within the update window  2: in the future
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:  # 72 hours (so the backend can sync delistings); only the past 48h through the next 2h need updating
            # if diff_time < -172800:     # (previous value) 48 hours
            return 0  # expired, price restored
        elif -172800 < diff_time < 7200:
            return 1  # yesterday or today, i.e. pending update
        else:
            return 2  # future start, no update needed yet
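
    # NOTE (editor's sketch): with the thresholds above, diff_time = time_1 -
    # time_2 (seconds from now until the sale starts) maps as follows:
    #     diff_time < -259200 (more than 72h in the past)  -> 0  expired
    #     -172800 < diff_time < 7200 (48h ago .. 2h ahead) -> 1  update window
    #     anything else, incl. the -259200..-172800 gap    -> 2  skipped for now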

    async def _update_db(self):
        '''
        Real-time flash-sale data update
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.zhe_800_spike = Zhe800Spike()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all batches consumed; normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('creating task for goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('all data updated'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.zhe_800_spike
            except:
                pass
            collect()

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful info from flash-sale goods data
        :param data: raw data to parse
        :return: list of useful-info dicts
        '''
        miaosha_goods_list = []
        # pprint(data)
        for item in data:
            if item == {}:
                continue
            # pprint(item)
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(
                    str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }

            # zhe800 item id (zid)
            tmp['zid'] = item.get('zid')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock',
                                           0),  # remaining flash-sale quantity
                'stock': item.get('stock', 0),  # total flash-sale stock
            }
            # original price
            tmp['price'] = float(item.get('list_price'))
            # flash-sale price, as a float
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
            # pprint(miaosha_goods_list)

        return miaosha_goods_list
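
    # NOTE (editor's sketch): float(item.get('list_price')) above raises
    # TypeError when the key is missing, since float(None) is invalid; a
    # defensive variant, assuming 0.0 is an acceptable fallback:
    #     tmp['price'] = float(item.get('list_price') or 0.0)
    #     tmp['taobao_price'] = float(item.get('price') or 0.0)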

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.zhe_800_spike
        except:
            pass
        collect()

    def run_forever(self):
        '''
        Real-time data update
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('about to start the real-time update, please wait...'.center(100, '#'))
            index = 1

            self.my_phantomjs = MyPhantomjs()
            for item in result:  # real-time update
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()

                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('resetting and establishing a new database connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('new database connection established...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0], ))
                        print(
                            'expired goods_id (%s)' % item[0],
                            ', group-buy start time (%s), deleted!' %
                            json.loads(item[1]).get('begin_time'))

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # exit the loop
                        pass  # must be pass, not break: goods_id rows from the DB are not in order

                    else:  # returned 1: inside the update window
                        print(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests can't fetch this (certificate auth involved); use phantomjs directly
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('fetched body is empty! skipping')

                        else:
                            try:
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads failed on the body, please check')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                print('docs is []!')
                                print('this item was delisted from the flash-sale event; deleting it here')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0], ))
                                print('delisted goods_id (%s)' % item[0], ', deleted!')
                                pass

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # print(tmp_item_list)
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # group-buy start timestamp
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # print(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                internally delisted (not actually off the shelf -- still selling, so update its goods info but leave the shelf/delist times alone)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    # print('this item was delisted from the flash-sale event; deleting it here')
                                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0], ))
                                    # print('delisted goods_id (%s)' % item[0], ', deleted!')
                                    # pass
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalize
                                        print('+++ internally delisted but still selling; updating its info')
                                        tmp_price_info_list = goods_data[
                                            'price_info_list']
                                        price_info_list = [{
                                            'spec_value':
                                            item_4.get('spec_value'),
                                            'pintuan_price':
                                            item_4.get('detail_price'),
                                            'detail_price':
                                            '',
                                            'normal_price':
                                            item_4.get('normal_price'),
                                            'img_url':
                                            item_4.get('img_url'),
                                            'rest_number':
                                            item_4.get('rest_number'),
                                        } for item_4 in tmp_price_info_list]

                                        goods_data['goods_id'] = item[0]
                                        goods_data[
                                            'price_info_list'] = price_info_list

                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down

                                else:  # still listed
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == item[0]:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}:
                                                pass
                                            else:
                                                # normalize
                                                tmp_price_info_list = goods_data[
                                                    'price_info_list']
                                                price_info_list = [
                                                    {
                                                        'spec_value':
                                                        item_4.get(
                                                            'spec_value'),
                                                        'pintuan_price':
                                                        item_4.get(
                                                            'detail_price'),
                                                        'detail_price':
                                                        '',
                                                        'normal_price':
                                                        item_4.get(
                                                            'normal_price'),
                                                        'img_url':
                                                        item_4.get('img_url'),
                                                        'rest_number':
                                                        item_4.get(
                                                            'rest_number'),
                                                    } for item_4 in
                                                    tmp_price_info_list
                                                ]

                                                goods_data['goods_id'] = item[
                                                    0]
                                                goods_data[
                                                    'price_info_list'] = price_info_list
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data['pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                        pintuan_time=goods_data[
                                                            'pintuan_time'])
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # slow down

                                        else:
                                            pass

                else:
                    print('database connection failed, skipping!')
                    pass

                index += 1
                gc.collect()
            print('all data updated'.center(100, '#'))  # sleep(60*60)
            if get_shanghai_time().hour == 0:  # no updates after midnight
                sleep(60 * 60 * 5.5)
            else:
                sleep(5)
            gc.collect()
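
# NOTE (editor's sketch): the `params=(item[0], )` fixes above need the
# trailing comma -- in Python `(item[0])` is only a parenthesized expression,
# not a tuple, so a bare value would be passed where the DB API expects a
# parameter sequence:
#     ('abc')   == 'abc'     # grouping only
#     ('abc', )              # one-element tuple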
Example #17
    def get_spike_hour_goods_info(self):
        '''
        Construct the data url by simulation and get info for all recent flash-sale goods
        :return:
        '''
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        try:
            self.driver.quit()
        except:
            pass
        gc.collect()

        pinduoduo = PinduoduoParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            if my_pipeline._select_table(sql_str=pd_select_str_3) is None:
                db_goods_id_list = []
            else:
                db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=pd_select_str_3))]

            for item in all_miaosha_goods_list:
                '''
                Note: scraping at 8:30 the next morning returns empty values while the page is loading
                '''
                if item.get('goods_id') != 'None':    # skip goods_id == 'None'
                    if item.get('goods_id', '') in db_goods_id_list:
                        print('this goods_id already exists in the database, skipping')
                        pass
                    else:
                        tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + item.get('goods_id')
                        pinduoduo.get_goods_data(goods_id=item.get('goods_id'))
                        goods_data = pinduoduo.deal_with_data()

                        # print(goods_data)
                        if goods_data == {}:  # skip if the returned data is empty
                            print('goods_data is empty; skipping for now, it will be handled on the next pass')
                            # sleep(3)
                            pass

                        else:  # otherwise parse and insert
                            goods_data['stock_info'] = item.get('stock_info')
                            goods_data['goods_id'] = item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['username'] = '******'
                            goods_data['price'] = item.get('price')  # original special price before the flash sale
                            goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                            goods_data['sub_title'] = item.get('sub_title', '')
                            goods_data['miaosha_time'] = item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))

                            if item.get('stock_info').get('activity_stock') <= 2:
                                # mark as sold out when live flash-sale stock is <= 2
                                print('this flash-sale item is sold out...')
                                goods_data['is_delete'] = 1

                            pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        sleep(PINDUODUO_SLEEP_TIME)

                else:
                    print('this goods_id is "None", skipping')
                    pass
            sleep(5)

        else:
            pass
        try:
            del pinduoduo
        except:
            pass
        gc.collect()
Example #18
    def run_forever(self):
        '''
        Real-time data update
        :return:
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:  # 实时更新数据
                goods_id = item[0]
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli,
                        )
                        print(
                            'expired goods_id (%s)' % goods_id,
                            ', group-buy start time (%s), soft-deleted!' %
                            json.loads(item[1]).get('begin_time'))
                        sleep(.3)

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # exit the loop
                        pass  # must be pass, not break: goods_id rows from the DB are not in order

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests can't fetch this (certificate auth involved); use phantomjs directly
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.get_url_body(url=tmp_url)
                        # print(body)

                        if body == '':
                            print('fetched body is empty! skipping')

                        else:
                            try:
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads failed on the body, please check')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                print('docs is []!')
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=mg_update_str_5,
                                    sql_cli=sql_cli,
                                )
                                sleep(.3)

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # group-buy start timestamp
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # pprint(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                internally delisted (not actually off the shelf -- still selling, so update its goods info but leave the shelf/delist times alone)
                                '''
                                if goods_id not in pintuan_goods_all_goods_id:
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalize
                                        print('+++ internally delisted but still selling; updating its info')
                                        goods_data['goods_id'] = goods_id
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        # pprint(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=sql_cli)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down

                                else:  # still listed
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == goods_id:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=goods_id)
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}:
                                                pass
                                            else:
                                                # normalize
                                                goods_data[
                                                    'goods_id'] = goods_id
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=sql_cli)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # 放慢速度

                                        else:
                                            pass

                else:
                    print('Database connection failed, skipping!')
                    pass

                index += 1
                gc.collect()
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after 0:00
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()
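
# A minimal sketch of the _get_mogujie_pintuan_price_info_list normalizer used
# above, assuming it only coerces each sku's price fields to strings
# (hypothetical field names; the real helper may also deduplicate sku specs):
def _get_mogujie_pintuan_price_info_list_sketch(price_info_list):
    return [{
        'spec_value': sku.get('spec_value', ''),
        'pintuan_price': str(sku.get('pintuan_price', '')),
        'normal_price': str(sku.get('normal_price', '')),
    } for sku in price_info_list]
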
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time 
        from dbo.GoodsInfoAutoGet 
        where SiteID=13 and MainGoodsID is not null'''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id rows returned by the db <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            for item in result:  # real-time update
                # memory optimization: instantiate inside the loop and release afterwards,
                # instead of keeping one long-lived parser object
                pinduoduo = PinduoduoParse()
                if index % 50 == 0:    # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('Resetting and establishing a new db connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New db connection established...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (item[0], index))
                    pinduoduo.get_goods_data(goods_id=item[0])
                    data = pinduoduo.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]

                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )

                        # print('------>>>| crawled data: ', data)
                        pinduoduo.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # returned data was empty
                        pass
                else:  # database connection failed
                    print('Database connection failed; the db may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del pinduoduo
                # except:
                #     pass
                gc.collect()
                # sleep(1)
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:   # no updates after 0:00
            sleep(60*60*5.5)
        else:
            sleep(5)
        # del pinduoduo
        gc.collect()
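
# A minimal sketch of the _get_price_change_info helper called above, assuming
# it only needs to flag a change and record the old/new values (hypothetical;
# the real helper may also maintain a change history):
def _get_price_change_info_sketch(old_price, old_taobao_price, new_price, new_taobao_price):
    is_price_change = int(old_price != new_price or old_taobao_price != new_taobao_price)
    price_change_info = {
        'old_price': old_price, 'new_price': new_price,
        'old_taobao_price': old_taobao_price, 'new_taobao_price': new_taobao_price,
    } if is_price_change else {}
    return is_price_change, price_change_info
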
Exemplo n.º 20
    def deal_with_data(self, *param):
        '''
        Process and store the related seckill goods data
        :param param: related params
        :return:
        '''
        print(60 * '*')
        event_time = param[0]
        item_list = param[1]
        print('seckill start time:', timestamp_to_regulartime(event_time), '\t',
              'corresponding timestamp: ', event_time)
        print(60 * '*')

        mogujie = MoGuJieMiaoShaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=mg_select_str_4))
            db_goods_id_list = [item[0] for item in _]
            for item in item_list:
                goods_id = str(item.get('iid', ''))
                if goods_id in db_goods_id_list:
                    print('goods_id already in the db, skipping')
                    pass
                else:
                    tmp_url = item.get('link', '')
                    # print(tmp_url)
                    try:
                        object_id = re.compile(r'objectId=(\w+)').findall(
                            tmp_url)[0]
                    except IndexError:  # the matched url is not a seckill goods url
                        print('+++++++ not a seckill url: ', tmp_url)
                        continue
                    tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(
                        goods_id, object_id)
                    tmp_ = mogujie.get_goods_id_from_url(tmp_url)
                    mogujie.get_goods_data(goods_id=tmp_)
                    goods_data = mogujie.deal_with_data()
                    if goods_data == {}:  # skip when the returned data is empty
                        pass
                    else:  # otherwise parse and insert
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)

                        # set price to the original price
                        try:
                            tmp_price_list = sorted([
                                round(float(item_4.get('normal_price', '')), 2)
                                for item_4 in goods_data['price_info_list']
                            ])
                            price = round(Decimal(tmp_price_list[-1]), 2)  # original price
                            goods_data['price'] = price
                        except:
                            print('Error while setting price to the original price! Please check')
                            sleep(MOGUJIE_SLEEP_TIME)  # slow down
                            continue

                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            timestamp_to_regulartime(
                                int(item.get('startTime', 0))),
                            'miaosha_end_time':
                            timestamp_to_regulartime(
                                int(item.get('endTime', 0))),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['event_time'] = str(event_time)
                        # pprint(goods_data)
                        # print(goods_data)
                        res = mogujie.insert_into_mogujie_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        if res:
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)

                    sleep(MOGUJIE_SLEEP_TIME)  # slow down

        else:
            print('Database connection failed, skipping!')
            pass

        try:
            del mogujie
        except:
            pass
        gc.collect()
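
# A minimal sketch of timestamp_to_regulartime as used above, assuming it
# formats a whole-second Unix timestamp as 'YYYY-MM-DD HH:MM:SS' (hypothetical;
# the real helper may pin the timezone to Asia/Shanghai):
from datetime import datetime

def timestamp_to_regulartime_sketch(timestamp):
    return datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
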
Exemplo n.º 21
    def get_spike_hour_goods_info(self):
        '''
        Build the data url and fetch all recent flash-sale (seckill) goods info
        :return:
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('session_id to crawl: ', base_session_id)
            data = self._get_one_session_id_data(base_session_id=base_session_id)
            sleep(.3)

            if data.get('data', {}).get('blocks', []) == []:     # session_id does not exist
                pass

            else:                           # session_id exists
                try:
                    _ = str(data.get('data', {}).get('blocks', [])[0].get('deal', {}).get('begin_time', ''))[:10]
                    if _ != '':
                        pass
                    elif data.get('data', {}).get('blocks', [])[0].get('showcase', {}) != {}:   # a future session
                        print('*** future session ***')
                        # pprint(data.get('data', {}))
                        _ = str(data.get('data', {}).get('blocks', [])[1].get('deal', {}).get('begin_time', ''))[:10]
                    else:
                        raise Exception
                    begin_times_timestamp = int(_)  # truncate the millisecond timestamp to its first 10 digits (whole seconds)

                except Exception as e:
                    print('Hit a serious error: ', e)
                    base_session_id += 2
                    continue

                print('seckill time: ', timestamp_to_regulartime(begin_times_timestamp))

                if self.is_recent_time(timestamp=begin_times_timestamp):    # the seckill date is valid
                    try:
                        data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
                    except Exception as e:
                        print('Hit a serious error: ', e)
                        base_session_id += 2
                        continue
                    # pprint(data)

                    if data != []:  # data present
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                        # pprint(miaosha_goods_list)

                        zhe_800 = Zhe800Parse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=z8_select_str_5))]
                            for item in miaosha_goods_list:
                                if item.get('zid', '') in db_goods_id_list:
                                    print('goods_id already in the db, skipping')
                                    pass
                                else:
                                    tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                                    goods_id = zhe_800.get_goods_id_from_url(tmp_url)

                                    zhe_800.get_goods_data(goods_id=goods_id)
                                    goods_data = zhe_800.deal_with_data()

                                    if goods_data == {}:    # skip when the returned data is empty
                                        pass
                                    else:       # otherwise parse and insert
                                        goods_data['stock_info'] = item.get('stock_info')
                                        goods_data['goods_id'] = str(item.get('zid'))
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get('price')
                                        goods_data['taobao_price'] = item.get('taobao_price')
                                        goods_data['sub_title'] = item.get('sub_title')
                                        # goods_data['is_baoyou'] = item.get('is_baoyou')
                                        goods_data['miaosha_time'] = item.get('miaosha_time')
                                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                                        goods_data['session_id'] = str(base_session_id)
                                        # print(goods_data['miaosha_time'])

                                        # print(goods_data)
                                        zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                                        sleep(ZHE_800_SPIKE_SLEEP_TIME)   # slow down

                            sleep(1)
                        else:
                            pass
                        try:
                            del zhe_800
                        except:
                            pass
                        gc.collect()

                    else:       # this session_id has no data
                        print('this session_id has no data under the key jsons')
                        # return {}
                        pass
                else:
                    pass

            base_session_id += 2
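
# A minimal sketch of the is_recent_time check used above, assuming a seckill
# date is "valid" when it falls inside a bounded window around now
# (hypothetical window sizes; the real bounds live elsewhere in the class):
import time

def is_recent_time_sketch(timestamp, past_window=60 * 60 * 24, future_window=60 * 60 * 24 * 2):
    now = int(time.time())
    return (now - past_window) <= int(timestamp) <= (now + future_window)
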
Exemplo n.º 22
    def get_ali_1688_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}

        # 1688 mobile url, e.g.: https://m.1688.com/offer/559836312862.html
        wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(
            goods_id) + '.html'
        print('------>>>| 1688 url to process: ', wait_to_deal_with_url)

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=wait_to_deal_with_url, css_selector='div.d-content')
        # print(body)
        if body == '':
            print('Fetched body is an empty str! Please check!')
            self.result_data = {}
            return {}

        # '''
        # switch to requests
        # '''
        # body = MyRequests.get_url_body(url=wait_to_deal_with_url, headers=self.headers)
        # # print(body)
        #
        # if body == '':
        #     return {}
        # print(body)

        tmp_body = body

        try:
            pull_off_shelves = Selector(
                text=body).css('div.d-content p.info::text').extract_first()
        except:
            pull_off_shelves = ''
        if pull_off_shelves == '该商品无法查看或已下架':  # page says the goods is unavailable or delisted; still perform the insert
            # print('test')
            try:
                tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                is_in_db = tmp_my_pipeline._select_table(
                    sql_str=sql_str, params=(str(goods_id), ))
                # print(is_in_db)
            except Exception as e:
                print('Hit an error:', e)
                print('Database connection failed!')
                self.result_data = {}
                return {}

            if is_in_db != []:  # goods_id was inserted before, so just flip its is_delete flag
                sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                tmp_my_pipeline._update_table(sql_str=sql_str,
                                              params=(goods_id, ))
                print('@@@ goods_id already in the db; setting is_delete=1')
                tmp_data_s = self.init_pull_off_shelves_goods()  # init the fields of a delisted goods
                tmp_data_s['before'] = True  # whether the goods was already in the db
                self.result_data = {}

                return tmp_data_s

            else:  # goods_id not in the db
                print('@@@ goods delisted [and not in the db], ** inserting it...')
                tmp_data_s = self.init_pull_off_shelves_goods()  # init the fields of a delisted goods
                tmp_data_s['before'] = False
                self.result_data = {}

                return tmp_data_s

        body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(
            body)
        if body != []:
            body = body[0]
            body = r'{"beginAmount"' + body
            # print(body)
            body = json.loads(body)
            # pprint(body)

            if body.get('discountPriceRanges') is not None:
                self.result_data = self._wash_discountPriceRanges(body=body)
                return self.result_data
            else:
                print('data is empty!')
                self.result_data = {}  # reset to avoid contaminating the next crawl on save
                return {}

        else:
            print('Parsing... the goods is in a huopin (flash group-buy) campaign; this is the short-term campaign price!')
            body = re.compile(
                r'{"activityId"(.*?)</script></div></div>').findall(tmp_body)
            if body != []:
                body = body[0]
                body = r'{"activityId"' + body
                # print(body)
                body = json.loads(body)
                # pprint(body)

                if body.get('discountPriceRanges') is not None:
                    self.result_data = self._wash_discountPriceRanges(
                        body=body)
                    self.is_activity_goods = True
                    return self.result_data
                else:
                    print('data is empty!')
                    self.result_data = {}  # reset to avoid contaminating the next crawl on save
                    return {}
            else:
                print('Unknown campaign type for this goods; not parsing, skipping!')
                self.result_data = {}  # reset to avoid contaminating the next crawl on save
                return {}
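
# A minimal sketch of init_pull_off_shelves_goods, assuming a delisted goods
# only needs a handful of placeholder fields before being written back
# (hypothetical field set; the real initializer covers the full table schema):
def init_pull_off_shelves_goods_sketch():
    return {
        'is_delete': 1,        # mark the goods as pulled off the shelves
        'price': 0.0,
        'taobao_price': 0.0,
        'price_info_list': [],
    }
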
Exemplo n.º 23
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=3 or SiteID=4 or SiteID=6 order by ID desc'
        sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(
                tmp_sql_server._select_table(sql_str=sql_str_2, params=None))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
            result_2 = []
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id rows returned by the db <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            # memory optimization: re-instantiate periodically and release, instead of
            # keeping one long-lived parser object
            tmall = TmallParse()
            for item in result_2:  # real-time update
                data = {}
                if index % 5 == 0:
                    tmall = TmallParse()
                    gc.collect()

                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('Resetting and establishing a new db connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New db connection established...')

                if tmp_sql_server.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item[0])
                    if goods_id == []:
                        print('@@@ original url: ', item[0])
                        continue
                    else:
                        print(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                            % (goods_id[1], index))
                        data = tmall.get_goods_data(goods_id=goods_id)
                        if isinstance(data, int):
                            continue

                        if data.get('is_delete') == 1:
                            data['goods_id'] = goods_id[1]

                            # improved branching: decide Tmall / Tmall supermarket / Tmall global
                            # from the parsed goods_id
                            #####################################################
                            if goods_id[0] == 0:  # [0, '1111']
                                # build a clean canonical Tmall goods url
                                wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 1:  # [1, '1111']
                                wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 2:  # [2, '1111', 'https://xxxxx']
                                wait_to_deal_with_url = str(
                                    goods_id[2]) + '?id=' + goods_id[1]
                            else:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]

                            # print('------>>>| crawled data: ', data)
                            result = tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=tmp_sql_server)
                            if result is False:
                                print('url of the failing goods: ', item[0])
                            else:
                                pass
                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        else:
                            pass

                        data = tmall.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id[1]

                            # improved branching: decide Tmall / Tmall supermarket / Tmall global
                            # from the parsed goods_id
                            #####################################################
                            if goods_id[0] == 0:  # [0, '1111']
                                # build a clean canonical Tmall goods url
                                wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 1:  # [1, '1111']
                                wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 2:  # [2, '1111', 'https://xxxxx']
                                wait_to_deal_with_url = str(
                                    goods_id[2]) + '?id=' + goods_id[1]
                            else:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]

                            # print('------>>>| crawled data: ', data)
                            tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=tmp_sql_server)
                        else:  # returned data was empty
                            pass
                else:  # database connection failed
                    print('Database connection failed; the db may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del tmall
                # except:
                #     pass
                gc.collect()
                sleep(2)
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after 0:00
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
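
# The branching above keys off the list returned by tmall.get_goods_id_from_url().
# A small illustration of the assumed return shapes and url construction
# (hypothetical helper; the shapes are inferred from the inline comments above):
#   [0, '1111']                   -> regular Tmall item
#   [1, '1111']                   -> Tmall supermarket (chaoshi) item
#   [2, '1111', 'https://xxxxx']  -> Tmall global item carrying its own host
def build_tmall_url_sketch(goods_id):
    if goods_id[0] == 0:
        return 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 1:
        return 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 2:
        return str(goods_id[2]) + '?id=' + goods_id[1]
    return None
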
Exemplo n.º 24
    async def deal_with_all_goods_id(self):
        '''
        Fetch the goods info for every detailed category
        :return: None
        '''
        sort_data = await self.get_all_goods_list()
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        index = 1
        if sql_cli.is_connect_success:
            # plain sql server connection (no result set returned past 3000 rows)
            self.lg.info('Fetching the existing tiantiantejia goods_id from the db, please wait...')
            db_ = list(sql_cli._select_table(sql_str=tb_select_str_6))
            db_goods_id_list = [[item[0], item[2]] for item in db_]
            self.lg.info('Done!')
            # print(db_goods_id_list)
            db_all_goods_id = [i[0] for i in db_goods_id_list]

            for item in sort_data:
                tejia_goods_list = await self.get_tiantiantejia_goods_list(
                    data=item.get('data', []))
                self.lg.info(str(tejia_goods_list))

                for tmp_item in tejia_goods_list:
                    if tmp_item.get(
                            'goods_id', ''
                    ) in db_all_goods_id:  # handle a goods_id that is already in the db
                        tmp_end_time = ''
                        try:
                            tmp_end_time = [
                                i[1] for i in db_goods_id_list
                                if tmp_item.get('goods_id', '') == i[0]
                            ][0]
                            # print(tmp_end_time)
                        except:
                            pass

                        if tmp_end_time != '' \
                                and tmp_end_time < get_shanghai_time():
                            '''
                            * handle a regular goods turning back into a tiantiantejia goods *
                            '''
                            self.lg.info('##### goods switched back from regular to tiantiantejia! #####')
                            # delete first, then re-insert (the old row has expired)
                            _ = await sql_cli.delete_taobao_tiantiantejia_expired_goods_id(
                                goods_id=tmp_item.get('goods_id', ''),
                                logger=self.lg)
                            if _ is False:
                                continue

                            index = await self.insert_into_table(
                                tmp_item=tmp_item,
                                category=item['category'],
                                current_page=item['current_page'],
                                sql_cli=sql_cli,
                                index=index,
                            )

                        else:
                            self.lg.info('goods_id already in the db, skipping')
                            pass

                    else:
                        sql_cli = await _get_new_db_conn(
                            db_obj=sql_cli,
                            index=index,
                            logger=self.lg,
                        )
                        if sql_cli.is_connect_success:
                            index = await self.insert_into_table(
                                tmp_item=tmp_item,
                                category=item['category'],
                                current_page=item['current_page'],
                                sql_cli=sql_cli,
                                index=index,
                            )

                        else:
                            self.lg.error('Database connection failed!')
                            pass

        else:
            self.lg.error('Database connection failed!')
            pass
        collect()

        # sleep for 30 minutes
        self.lg.info('Sleeping 30 minutes to keep the tejia data volume manageable...')
        await async_sleep(60 * 30)

        return True
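
# A minimal sketch of the _get_new_db_conn coroutine awaited above, assuming it
# simply recycles the SqlServerMyPageInfoSaveItemPipeline connection every N
# items, mirroring the "% 50" reconnect logic of the synchronous loops
# (hypothetical interval and logging):
async def _get_new_db_conn_sketch(db_obj, index, logger, remainder=50):
    if index % remainder == 0:
        logger.info('Resetting and establishing a new db connection...')
        db_obj = SqlServerMyPageInfoSaveItemPipeline()
        logger.info('New db connection established...')
    return db_obj
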
Exemplo n.º 25
def run_forever():
    while True:
        # ** must not be a global declared outside the loop, otherwise everything
        # keeps logging into the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=tm_select_str_3))
        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> all matching goods_id rows returned by the db <<<------')
            my_lg.info(str(result))
            my_lg.info('total items to update: {0}'.format(len(result)))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            # memory optimization: re-instantiate periodically and release, instead of
            # keeping one long-lived parser object
            tmall = TmallParse(logger=my_lg)
            for item in result:  # real-time update
                if index % 5 == 0:
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # reconnect every 10 iterations to avoid a stale long-lived connection erroring out
                    my_lg.info('Resetting and establishing a new db connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('New db connection established...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                        % (str(item[1]), str(index)))
                    tmp_item = []
                    if item[0] == 3:  # map the db site_id back to the parser's type code
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    oo = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(oo, int):  # special-case an int return (e.g. 4041)
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue
                    oo_is_delete = oo.get('is_delete', 0)  # avoid the error-sleep below when the goods is simply delisted

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                shelf_time=item[5],
                                delete_time=item[6])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        site_id = tmall._from_tmall_type_get_site_id(
                            type=data['type'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[7]),
                                site_id=site_id)
                        except AttributeError:  # handle values that were already formatted
                            old_sku_info = item[7]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=site_id),
                                is_price_change=item[8]
                                if item[8] is not None else 0)

                        tmall.to_right_and_update_data(data,
                                                       pipeline=tmp_sql_server)
                    else:  # returned data was empty
                        if oo_is_delete == 1:
                            pass
                        else:
                            my_lg.info('------>>>| sleeping 8s...')
                            sleep(8)

                else:  # database connection failed
                    my_lg.error('Database connection failed; the db may be down or under maintenance')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('All data updated'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after 0:00
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
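
# A minimal sketch of get_shelf_time_and_delete_time, assuming it stamps the
# shelf/delete times only on a state transition and otherwise keeps the stored
# values (hypothetical rules; uses get_shanghai_time from this project):
def get_shelf_time_and_delete_time_sketch(tmp_data, is_delete, shelf_time, delete_time):
    now = str(get_shanghai_time())
    if tmp_data.get('is_delete') == 1 and is_delete == 0:
        return shelf_time, now   # just pulled off the shelves: stamp delete_time
    if tmp_data.get('is_delete') == 0 and is_delete == 1:
        return now, delete_time  # back on the shelves: stamp shelf_time
    return shelf_time, delete_time
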
Exemplo n.º 26
    def deal_with_data(self, goods_list):
        '''
        Process and store the related pintuan (group-buy) goods data
        :param goods_list:
        :return:
        '''
        mia = MiaPintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=mia_select_str_1))
            db_goods_id_list = [item[0] for item in _]
            # print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('goods_id already in the db, skipping')
                    pass

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://www.mia.com/item-' + str(
                        goods_id) + '.html'

                    mia.get_goods_data(goods_id=str(goods_id))
                    goods_data = mia.deal_with_data()
                    if goods_data == {}:  # skip when the returned data is empty
                        pass
                    else:  # otherwise parse and insert
                        goods_url = goods_data['goods_url']
                        if re.compile(r'://m.miyabaobei.hk/').findall(
                                goods_url) != []:
                            goods_url = 'https://www.miyabaobei.hk/item-' + str(
                                goods_id) + '.html'
                        else:
                            goods_url = 'https://www.mia.com/item-' + str(
                                goods_id) + '.html'
                        goods_data['goods_url'] = goods_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['pintuan_time'])
                        goods_data['pid'] = item.get('pid')

                        # pprint(goods_data)
                        _r = mia.insert_into_mia_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # inserted
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)

                    sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
        else:
            print('Database connection failed, skipping!')
            pass

        try:
            del mia
        except:
            pass
        collect()
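
# The dedup checks above do `goods_id in db_goods_id_list` against a plain
# list, which is O(n) per lookup. A set keeps the same semantics at O(1)
# membership; a sketch of an equivalent pre-filter:
def filter_new_goods_ids_sketch(candidate_ids, db_goods_id_list):
    seen = set(db_goods_id_list)
    return [gid for gid in candidate_ids if gid not in seen]
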
Exemplo n.º 27
class GoodsKeywordsSpider(Crawler):
    def __init__(self):
        super(GoodsKeywordsSpider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/',
        )
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # sql for inserting into the goods_id_and_keyword_middle_table
        self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1

    def _init_debugging_api(self):
        '''
        Configure the site_ids whose keyword best-sellers get crawled
        :return: dict
        '''
        return {
            1: True,  # taobao
            2: True,  # ali 1688
            3: True,  # tmall
            4: True,  # jd
        }

    def _set_func_name_dict(self):
        self.func_name_dict = {
            'taobao':
            'self._taobao_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'ali':
            'self._ali_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'tmall':
            'self._tmall_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'jd': 'self._jd_keywords_spider(goods_id_list={0}, keyword_id={1})'
        }
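
    # _set_func_name_dict stores eval-style format strings, but the dispatch in
    # _deal_with_goods_id_list below never needs eval: bound methods can be
    # mapped directly. A sketch of the safer equivalent (method names taken
    # from the dispatch below):
    def _set_func_dict_sketch(self):
        self.func_dict = {
            'taobao': self._taobao_keywords_spider,
            'ali': self._1688_keywords_spider,
            'tmall': self._tmall_keywords_spider,
            'jd': self._jd_keywords_spider,
        }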

    def _just_run(self):
        while True:
            # fetch all goods_id already present in the original goods_db
            try:
                result = list(
                    self.my_pipeline._select_table(sql_str=kw_select_str_1))
                self.lg.info('Fetching the goods_id already in the db...')
                result_2 = list(
                    self.my_pipeline._select_table(sql_str=kw_select_str_2))
                self.lg.info('Existing goods_id fetched!')

            except TypeError:
                self.lg.error('TypeError: database connection failed... (possibly under maintenance)')
                result = None
                result_2 = None

            if result is not None and result_2 is not None:
                self.lg.info('------>>> all matching goods_id rows returned by the db <<<------')
                self.lg.info(str(result))
                self.lg.info(
                    '--------------------------------------------------------')

                self.lg.info('About to start the real-time update, please wait...'.center(100, '#'))
                self.add_goods_index = 0  # tracks how many goods have been added
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # release resources promptly
                try:
                    del result_2
                except:
                    pass
                gc.collect()

                for item in result:  # finish every enabled api for one keyword before moving on
                    self.lg.info('Processing id {0}, keyword {1} ...'.format(
                        item[0], item[1]))
                    for type, type_value in self.debugging_api.items(
                    ):  # iterate over the e-commerce sites to crawl
                        if type_value is False:
                            self.lg.info('api is False, skipping!')
                            continue

                        if self.add_goods_index % 20 == 0:
                            self.lg.info('reconnecting the my_pipeline client...')
                            try:
                                del self.my_pipeline
                            except:
                                pass
                            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline(
                            )
                            self.lg.info('my_pipeline client reconnected!')

                        goods_id_list = self._get_keywords_goods_id_list(
                            type=type, keyword=item)
                        self.lg.info(
                            'keyword {0}, fetched goods_id_list: {1}'.format(
                                item[1], str(goods_id_list)))
                        '''process the goods_id_list'''
                        self._deal_with_goods_id_list(
                            type=type,
                            goods_id_list=goods_id_list,
                            keyword_id=item[0])
                        sleep(3)

    def _get_keywords_goods_id_list(self, type, keyword):
        '''
        Get the goods_id_list for one site
        :param type: e-commerce site type
        :param keyword:
        :return:
        '''
        if type == 1:
            self.lg.info('taobao keyword crawl below...')
            goods_id_list = self._get_taobao_goods_keywords_goods_id_list(
                keyword=keyword)
        elif type == 2:
            self.lg.info('ali 1688 keyword crawl below...')
            goods_id_list = self._get_1688_goods_keywords_goods_id_list(
                keyword=keyword)
        elif type == 3:
            self.lg.info('tmall keyword crawl below...')
            goods_id_list = self._get_tmall_goods_keywords_goods_id_list(
                keyword=keyword)
        elif type == 4:
            self.lg.info('jd keyword crawl below...')
            goods_id_list = self._get_jd_goods_keywords_goods_id_list(
                keyword=keyword)

        else:
            goods_id_list = []

        return goods_id_list

    def _deal_with_goods_id_list(self, **kwargs):
        '''
        Dispatch to the site-specific spider
        :param kwargs:
        :return:
        '''
        type = kwargs.get('type', '')
        goods_id_list = kwargs.get('goods_id_list', [])
        keyword_id = kwargs.get('keyword_id', '')

        if type == 1:
            self._taobao_keywords_spider(goods_id_list=goods_id_list,
                                         keyword_id=keyword_id)
        elif type == 2:
            self._1688_keywords_spider(goods_id_list=goods_id_list,
                                       keyword_id=keyword_id)
        elif type == 3:
            self._tmall_keywords_spider(goods_id_list=goods_id_list,
                                        keyword_id=keyword_id)
        elif type == 4:
            self._jd_keywords_spider(goods_id_list=goods_id_list,
                                     keyword_id=keyword_id)
        else:
            pass

        return None

    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Get the goods_id_list for this keyword
        :param keyword: (id, keyword)
        :return: a list
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://s.taobao.com/search?q=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%A4%8F&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
            'authority': 's.taobao.com',
            # 'cookie': 't=70c4fb481898a67a66d437321f7b5cdf; cna=nbRZExTgqWsCAXPCa6QA5B86; l=AkFBuFEM2rj4GbU8Mjl3KsFo0YZa/7Vg; thw=cn; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _cc_=UIHiLt3xSw%3D%3D; tg=0; enc=OFbfiyN19GGi1GicxsjVmrZoFzlt9plbuviK5OuthXYfocqTD%2BL079G%2BIt4OMg6ZrbV4veSg5SQEpzuMUgLe0w%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; miid=763730917900964122; mt=ci%3D-1_1; linezing_session=i72FGC0gr3GTls7K7lswxen2_1527664168714VAPN_1; cookie2=1cf9585e0c6d98c72c64beac41a68107; v=0; _tb_token_=5ee03e566b165; uc1=cookie14=UoTeOZOVOtrsVw%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=14984d833a4647c13d4207c86d0dbd97_1528036508423; _m_h5_tk_enc=a8709d79a833625dc5c42b778ee7f1ee; JSESSIONID=F57610F0B34140EDC9F242BEA0F4800A; isg=BLm5VsJ0xr4M-pvu-R_LcQkeyCNTbqwVe7qvs9vvJODVYtj0JBZ5Sd704WaUEkWw',
        }

        # taobao keyword search results, ranked by sales volume
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            # ('_ksTS', '1528171408340_395'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            # ('spm', 'a21bo.2017.201856-taobao-item.1'),
            ('ie', 'utf8'),
            # ('initiative_id', 'tbindexz_20170306'),
        )

        s_url = 'https://s.taobao.com/search'
        body = Requests.get_url_body(url=s_url,
                                     headers=headers,
                                     params=params,
                                     ip_pool_type=self.ip_pool_type)
        if body == '':
            return []
        else:
            try:
                data = re.compile(r'\((.*)\)').findall(body)[0]
            except IndexError:
                self.lg.error('re failed to extract the taobao data, keyword: {0}'.format(keyword[1]))
                return []

            data = json_2_dict(json_str=data, logger=self.lg)
            if data == {}:
                self.lg.error('fetched taobao search data is an empty dict! keyword: {0}'.format(
                    keyword[1]))
                return []
            else:
                goods_id_list = data.get('mainInfo', {}).get(
                    'traceInfo', {}).get('traceData', {}).get('allNids', [])
                if goods_id_list is None or goods_id_list == []:
                    self.lg.error('taobao search goods_id_list is empty! keyword: {0}'.format(
                        keyword[1]))
                    return []
                else:
                    return goods_id_list
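
    # The regex above strips a fixed jsonp wrapper. A slightly more defensive
    # sketch that tolerates any callback name (assumption: the body is exactly
    # one jsonp call):
    @staticmethod
    def _strip_jsonp_sketch(body):
        m = re.search(r'^[\w$.]+\s*\((.*)\)\s*;?\s*$', body.strip(), re.S)
        return m.group(1) if m else ''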

    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        Get 1688's top sellers for the keyword
        :param keyword:
        :return: a list eg: ['11111', ...]
        '''
        '''Approach 1: crawl the m.1688.com search page, taking only page 1's top sellers by sales volume'''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; ali_ab=113.215.180.118.1523857816418.4; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; _csrf_token=1528708263870; JSESSIONID=9L783sX92-8iXZBHLCgK4fJiFKG9-W66WeuQ-BRgo4; hng=CN%7Czh-CN%7CCNY%7C156; t=70c4fb481898a67a66d437321f7b5cdf; _tb_token_=5ee03e566b165; __cn_logon__=false; h_keys="aa#2018%u5973%u88c5t%u6064"; alicnweb=homeIdttS%3D38414563432175544705031886000168094537%7Ctouch_tb_at%3D1528767881872%7ChomeIdttSAction%3Dtrue; ctoken=YnzGSFi23yEECqVO988Gzealot; _m_h5_tk=1cdad4dba1f1502fb29f57b3f73f5610_1528770803659; _m_h5_tk_enc=64259ec4fe4c33bc4555166994ed7b4d; __cn_logon__.sig=i6UL1cVhdIpbPPA_02yGiEyKMeZR2hBfnaoYK1CcrF4; ali_apache_id=11.182.158.193.1528768195886.327406.1; XSRF-TOKEN=b84fcec8-8bdf-41a5-a5c1-f8d6bfc9f83e; _tmp_ck_0=IlQ2M6x9F5xTkEpGRay66FVl%2BBaIEY076xELE8UtaLcz%2BgR%2FJ2UZOfDeKILA7R2VgXEJ7VYCkEQjS1RcUCwfL%2Br8ZFi0vwyVwyNpQsD2QG0HaihwedkkF9Cp9Ww0Jr%2BZF4la9CTe0AY8d1E1lDF91tD7lMAKIGVSne3V95CfI8VzpiWJ415B1IA0cc9J6IpYzn0mT1xLYnXcBAkDq0gop74NaynWIxw%2BLqmnXr%2BYU2bkOyMxZOBVY9B%2Bb0FU82h3TC9HCM8dGLnK2kxlgR%2B5lyT%2BCCFhhIX%2FioEMtA0TvDpXvRSUKoDTQG%2FCeJiKfy3LxMXmcTs5TBuWkh31F8nDCpLf6%2FlYOGkqeV1WLJeYXVe3SBvZC2O2JcYBQaKHcesETe%2FwTJL1fyc%3D; ad_prefer="2018/06/12 10:18:21"; webp=1; isg=BJWVxP7WYsuzzEf8vnJ3nRJEpJdFFdP4_0ZTRxc4b4wzbrxg3ONSdf5sPHJY2WFc; ali-ss=eyJ1c2VySWQiOm51bGwsImxvZ2luSWQiOm51bGwsInNpZCI6bnVsbCwiZWNvZGUiOm51bGwsIm1lbWJlcklkIjpudWxsLCJzZWNyZXQiOiJ5V3I0UVJGelVSVGp4dWs4aUxPWGl4dDIiLCJfZXhwaXJlIjoxNTI4ODU3MDE5ODMzLCJfbWF4QWdlIjo4NjQwMDAwMH0=; ali-ss.sig=z0qrG8Cj9BhDL_CLwTzgBGcdjSOXtp6YLxgDdTQRcWE',
        }

        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )

        url = 'https://m.1688.com/offer_search/-6161.html'
        body = Requests.get_url_body(url=url,
                                     headers=headers,
                                     params=params,
                                     ip_pool_type=self.ip_pool_type)
        # self.lg.info(str(body))
        if body == '':
            return []
        else:
            try:
                goods_id_list = Selector(text=body).css(
                    'div.list_group-item::attr("data-offer-id")').extract()
                # pprint(goods_id_list)
            except Exception as e:
                self.lg.exception(e)
                self.lg.error('1688 search goods_id_list is empty! keyword: {0}'.format(
                    keyword[1]))
                goods_id_list = []

        return goods_id_list

    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Get Tmall's top sellers for the keyword
        :param keyword:
        :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...] returns urls, not goods_id
        '''
        '''Approach: tmall m-site search'''  # occasionally flaky, but usable
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://list.tmall.com/search_product.htm?q=%B0%A2%B5%CF%B4%EF%CB%B9&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=d',
            'authority': 'list.tmall.com',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; _med=dw:1280&dh:800&pw:2560&ph:1600&ist:0; cq=ccp%3D1; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=zIc9Cy5z0iS95tACxeX82fUsJdrekjC6%2BomP3kNKji1Z9RKwOt%2Fysyyewwf8twcytUGt2yT9AlAh5ASUlds05g%3D%3D; t=70c4fb481898a67a66d437321f7b5cdf; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=5ee03e566b165; cookie2=1cf9585e0c6d98c72c64beac41a68107; tt=tmall-main; pnm_cku822=098%23E1hvHpvUvbpvUvCkvvvvvjiPPFcvsjYnn2dvljEUPmP9sj1HPFsWtj3EP25ptj3PiQhvCvvvpZptvpvhvvCvpvhCvvOv9hCvvvmtvpvIvvCvxQvvvUgvvhVXvvvCxvvvBZZvvUhpvvChiQvv9Opvvho5vvmC3UyCvvOCvhEC0nkivpvUvvCCEppK6NOEvpCWvKXQwCzE%2BFuTRogRD76fdigqb64B9C97%2Bul1B5c6%2Bu0OVC61D70O58TJOymQD40OeutYon29V3Q7%2B3%2Busj7J%2Bu0OaokQD40OeutYLpGCvvpvvPMM; res=scroll%3A990*6982-client%3A472*680-offset%3A472*6982-screen%3A1280*800; _m_h5_tk=69794695b8eeb690d3ef037f6780d514_1529036786907; _m_h5_tk_enc=3e31314740c37d1fb14a26989cdac03c; isg=BN_f5lvy-LULYv0VwEkGMp59bjVjxpc1-mcB0nEsew7VAP6CeRTDNl2Gx5Z-nAte',
        }

        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = Requests.get_url_body(url=s_url,
                                     headers=headers,
                                     params=params,
                                     ip_pool_type=self.ip_pool_type)
        # self.lg.info(str(body))
        if body == '':
            return []
        else:
            data = json_2_dict(json_str=body, logger=self.lg)
            if data == {}:
                self.lg.error('fetched tmall search data is an empty dict! keyword: {0}'.format(
                    keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.lg.error('tmall search goods_id_list is empty! keyword: {0}'.format(
                        keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.lg.exception(e)
                    self.lg.error('tmall search goods_id_list is empty! keyword: {0}'.format(
                        keyword[1]))
                    return []

                return goods_id_list

    def _get_jd_goods_keywords_goods_id_list(self, keyword):
        '''
        Get JD's top sellers for the keyword
        :param keyword:
        :return: [] or ['xxxx', ....]
        '''
        # Approach 1: jd m-site search (via the search api)
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://so.m.jd.com/ware/search.action?keyword=b&area_ids=1,72,2819&sort_type=sort_totalsales15_desc&qp_disable=no&fdesc=%E5%8C%97%E4%BA%AC&t1=1529934870416',
            'authority': 'so.m.jd.com',
            # 'cookie': '3AB9D23F7A4B3C9B=SL4YPRE3Y4C627UCHFP4ROHI54TTYYJKLFSVROZQ57T7K3OUUKSYIVFUJKQHBAUPRANZOTPLCVC2TICTSJG6WEMUII; mba_muid=1523868445027-16c30fbc5f8c54c429; abtest=20180416164812814_35; visitkey=41587293677961039; shshshfpa=9e159581-c64f-e9f4-ad0c-8b6ced0d9f28-1525907842; shshshfpb=1a725fe3148b84c839f009c93fc261f2218f59c61e7f4e6c05af381826; retina=1; webp=1; TrackerID=GGwYSka4RvH3lm0ZwLoO2_qdMpBwRG39BvyBvQaJfzyN5cmdGt4lEMSqqJS-sbDqj4nAUX2HU4sVDGA8vl169D37w4EqceYcH6ysXv46kMVfvVdAPmSMV9LceeO3Cc6Z; whwswswws=; __jdc=122270672; subAbTest=20180604104024339_59; mobilev=html5; m_uuid_new=05C2D24B7D8FFDA8D4243A929A5C6234; intlIpLbsCountrySite=jd; mhome=1; cid=9; M_Identification=3721cafc2442fba2_42b6f64bb933019fdb27c9e124cfd67f; M_Identification_abtest=20180604104040270_32361722; M_Identification=3721cafc2442fba2_42b6f64bb933019fdb27c9e124cfd67f; so_eggsCount=1; warehistory="4764260,10658784927,"; wq_logid=1528080290.1936376147; __jdu=15238681432201722645210; __jda=122270672.15238681432201722645210.1523868143.1528255502.1529934182.18; __jdv=122270672|direct|-|none|-|1529934182053; cn=0; user-key=ecfc3673-cc54-43e2-96bd-fb7a7e700c32; ipLoc-djd=1-72-2799-0; shshshfp=a3b9323dfc6a675230170e6a43efcb81; USER_FLAG_CHECK=d9f73823a80c0305366f70a3b99b9ecb; sid=57ea016fe0ab4b04271e00f01d94d3b9; intlIpLbsCountryIp=60.177.32.78; autoOpenApp_downCloseDate_auto=1529934572240_21600000; wxa_level=1; PPRD_P=UUID.15238681432201722645210; sc_width=1280; wq_area=15_1213_0%7C3; __jdb=122270672.10.15238681432201722645210|18.1529934182; mba_sid=15299345705167145512031951538.7; __wga=1529934993217.1529934585585.1528080039013.1526716673573.6.3; shshshsID=7f3d94fa215b4e53b467f0d5e0563e9c_9_1529934993592',
        }

        params = (
            ('keyword', keyword[1]),
            ('datatype', '1'),
            ('callback', 'jdSearchResultBkCbA'),
            ('page', '1'),
            ('pagesize', '10'),
            ('ext_attr', 'no'),
            ('brand_col', 'no'),
            ('price_col', 'no'),
            ('color_col', 'no'),
            ('size_col', 'no'),
            ('ext_attr_sort', 'no'),
            ('merge_sku', 'yes'),
            ('multi_suppliers', 'yes'),
            ('area_ids', '1,72,2819'),
            ('sort_type', 'sort_totalsales15_desc'),
            ('qp_disable', 'no'),
            ('fdesc', '\u5317\u4EAC'),
            # ('t1', '1529934992189'),
        )

        s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
        body = Requests.get_url_body(url=s_url,
                                     headers=headers,
                                     params=params,
                                     ip_pool_type=self.ip_pool_type)
        # self.lg.info(str(body))
        if body == '':
            return []
        else:
            try:
                data = re.compile(r'jdSearchResultBkCbA\((.*)\)').findall(
                    body)[0]
            except IndexError:
                self.lg.error('IndexError while extracting the jd keyword data! keyword: {0}'.format(
                    keyword[1]))
                return []
            '''The payload mixes \xa0-style escapes with unescaped sequences like \http, which breaks naive json decoding.'''
            data = deal_with_JSONDecodeError_about_value_invalid_escape(
                json_str=data)
            data = json_2_dict(json_str=data, logger=self.lg)
            if data == {}:
                self.lg.error('fetched jd search data is an empty dict! keyword: {0}'.format(
                    keyword[1]))
                return []
            else:
                # note: jd pingou (group-buy) items must be skipped
                # pprint(data)
                data = data.get('data', {}).get('searchm',
                                                {}).get('Paragraph', [])
                # a non-empty 'bp' under 'pinGou' marks a pingou goods; keep only items where it is empty
                if data is not None and data != []:
                    goods_id_list = [
                        item.get('wareid', '') for item in data
                        if item.get('pinGou', {}).get('bp', '') == ''
                    ]

                    return goods_id_list

                else:
                    self.lg.error('fetched data is an empty list, please check!')
                    return []

    def _taobao_keywords_spider(self, **kwargs):
        '''
        抓取goods_id_list的数据,并存储
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https://item.taobao.com/item.htm?id=' + item
            for item in goods_id_list
        ]

        self.lg.info('About to start crawling the goods for this keyword, please wait...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # flag marking whether this goods was inserted
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('Error while extracting goods_id via re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('This goods_id[{0}] already exists in the db!'.format(goods_id))
                result = True  # the already-existing case

            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a single long-lived connection going unresponsive
                    self.lg.info('Resetting and building a new db connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('New db connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ Original goods url: {0}'.format(item))
                        continue

                    else:
                        self.lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id, str(self.add_goods_index)))
                        taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| crawled data: ', data)
                            result = taobao.old_taobao_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:  # db connection failed
                    self.lg.info('db connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only when the goods_id was inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.lg.info('Finished crawling the goods for this keyword!')

        return True

    def _1688_keywords_spider(self, **kwargs):
        '''
        Crawl and store goods info for a keyword on 1688
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https://detail.1688.com/offer/{0}.html'.format(item)
            for item in goods_id_list
        ]

        self.lg.info('About to start crawling the goods for this keyword, please wait...')

        for item in goods_url_list:
            result = False  # reset on every iteration
            try:
                goods_id = re.compile(r'offer/(.*?)\.html').findall(item)[0]
            except IndexError:
                self.lg.error('Error while extracting goods_id via re, please check!')
                continue
            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('This goods_id[{0}] already exists in the db!'.format(goods_id))
                result = True  # the already-existing case
            else:
                ali_1688 = ALi1688LoginAndParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a single long-lived connection going unresponsive
                    self.lg.info('Resetting and building a new db connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('New db connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ Original goods url: {0}'.format(item))
                        continue
                    else:
                        self.lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get(
                                'before') is False:  # an already delisted item
                            # skip goods that have been taken down
                            continue

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:  # db connection failed
                    self.lg.info('db connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                try:
                    del ali_1688
                except:
                    pass
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only when the goods_id was inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.lg.info('Finished crawling the goods for this keyword!')

        return True

    def _tmall_keywords_spider(self, **kwargs):
        '''
        Keyword crawling for tmall
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https:' + re.compile('&skuId=.*').sub('', item)
            for item in goods_id_list
        ]

        self.lg.info('About to start crawling the goods for this keyword, please wait...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # flag marking whether this goods was inserted
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('Error while extracting goods_id via re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('This goods_id[{0}] already exists in the db!'.format(goods_id))
                result = True  # the already-existing case
            else:
                tmall = TmallParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a single long-lived connection going unresponsive
                    self.lg.info('Resetting and building a new db connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('New db connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ Original goods url: {0}'.format(item))
                        continue
                    else:
                        self.lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id[1], str(self.add_goods_index)))
                        tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data[
                                'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                    type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.lg.error('This goods_url is empty! Skipping!')
                                continue

                            result = tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:
                    self.lg.info('db connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only when the goods_id was inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.lg.info('Finished crawling the goods for this keyword!')

        return True

    def _jd_keywords_spider(self, **kwargs):
        '''
        Keyword crawling for jd
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        '''A plain url like [https://item.jd.com/xxxxx.html] works as the initial address because jd redirects to the canonical one, and it can be stored as-is'''
        # so jd goods are not split by type here; all are stored as regular goods with site_id = 7
        goods_url_list = [
            'https://item.jd.com/{0}.html'.format(str(item))
            for item in goods_id_list
        ]

        self.lg.info('About to start crawling the goods for this keyword, please wait...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # flag marking whether this goods was inserted into the db
            try:
                goods_id = re.compile(r'/(\d+)\.html').findall(item)[0]
            except IndexError:
                self.lg.error('Error while extracting goods_id via re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('This goods_id[{0}] already exists in the db!'.format(goods_id))
                result = True  # the already-existing case
            else:
                jd = JdParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a single long-lived connection going unresponsive
                    self.lg.info('Resetting and building a new db connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('New db connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = jd.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ Original goods url: {0}'.format(item))
                        continue
                    else:
                        self.lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id[1], str(self.add_goods_index)))
                        jd.get_goods_data(goods_id)
                        data = jd.deal_with_data(goods_id)
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = item

                            result = jd.old_jd_goods_insert_into_new_table(
                                data, self.my_pipeline)
                        else:
                            pass
                else:
                    self.lg.info('db connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                sleep(1)
                try:
                    del jd
                except:
                    pass
                gc.collect()

            if result:  # only when the goods_id was inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.lg.info('Finished crawling the goods for this keyword!')

        return True

    def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs):
        '''
        Insert the pair into goods_id_and_keyword_middle_table
        :param kwargs:
        :return:
        '''
        goods_id = str(kwargs['goods_id'])
        keyword_id = int(kwargs['keyword_id'])
        # self.lg.info(goods_id)
        # self.lg.info(keyword_id)
        result = False
        '''First check whether the middle table goods_id_and_keyword_middle_table already holds this keyword id for the goods'''
        # NOTE: partial sql fragments need no r'' prefix, plain '' is fine
        try:
            _ = self.my_pipeline._select_table(sql_str=kw_select_str_3,
                                               params=(goods_id, ))
            _ = [i[0] for i in _]
            # pprint(_)
        except Exception:
            self.lg.error(
                'Error running the sql that checks goods_id_and_keyword_middle_table for this keyword id; skipping adding keyword_id for this goods'
            )
            return result

        if keyword_id not in _:
            params = (
                goods_id,
                keyword_id,
            )
            self.lg.info('------>>>| inserting keyword_id {0} for goods_id {1}'.format(
                params[1], params[0]))
            result = self.my_pipeline._insert_into_table_2(
                sql_str=self.add_keyword_id_for_goods_id_sql_str,
                params=params,
                logger=self.lg)

        return result
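
    # For reference: kw_select_str_3 and add_keyword_id_for_goods_id_sql_str are
    # defined elsewhere in the project. Given how they are used above, they are
    # assumed to look roughly like the following (illustrative sketches, not the
    # original definitions):
    #   kw_select_str_3 = 'select keyword_id from dbo.goods_id_and_keyword_middle_table where goods_id=%s'
    #   add_keyword_id_for_goods_id_sql_str = 'insert into dbo.goods_id_and_keyword_middle_table(goods_id, keyword_id) values(%s, %s)'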

    def _add_keyword_2_db_from_excel_file(self):
        '''
        Insert new keywords from an excel file into the db
        :return:
        '''
        excel_file_path = '/Users/afa/Desktop/2018-07-18-淘宝phone-top20万.xlsx'
        self.lg.info('Reading {0}, please wait...'.format(excel_file_path))
        try:
            excel_result = read_info_from_excel_file(
                excel_file_path=excel_file_path)
        except Exception:
            self.lg.error('Hit an error:', exc_info=True)
            return False

        self.lg.info('Read complete!!')
        self.lg.info('Reading the existing keywords from the db...')
        db_keywords = self.my_pipeline._select_table(sql_str=kw_select_str_4)
        db_keywords = [i[0] for i in db_keywords]
        self.lg.info('db keywords read complete!')

        for item in excel_result:
            keyword = item.get('关键词', None)  # '关键词' is the excel column header, meaning "keyword"
            if not keyword:
                continue

            if keyword in db_keywords:
                self.lg.info('The keyword {0} already exists in the db...'.format(keyword))
                continue

            self.lg.info('------>>>| storing keyword {0}'.format(keyword))
            self.my_pipeline._insert_into_table_2(sql_str=kw_insert_str_2,
                                                  params=(str(keyword), 0),
                                                  logger=self.lg)

        self.lg.info('All keywords written!')

        return True

    def __del__(self):
        try:
            del self.lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        try:
            del self.db_existed_goods_id_list
        except:
            pass
        gc.collect()
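

# The spiders above all share one pattern: rebuild the db pipeline every N
# iterations so a single long-lived connection never goes stale. A distilled,
# self-contained sketch of that pattern (all names here are illustrative):
def process_with_periodic_reconnect(items, make_conn, handle, every=20):
    conn = make_conn()  # initial connection
    for i, item in enumerate(items):
        if i and i % every == 0:
            conn = make_conn()  # drop the old connection, start fresh
        handle(conn, item)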
Exemplo n.º 28
    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and fetch all recent flash-sale (miaosha) goods info
        :return:
        '''
        tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

        for tab_id in tab_id_list:
            for index in range(0, 50):
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(index))
                print('Flash-sale list url to crawl: ', tmp_url)

                data = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
                if data == '': break

                try:
                    data = json.loads(data)
                    data = data.get('data', {})
                    # print(data)
                except:
                    break

                if data.get('goodslist') == []:
                    print('goodslist is [] for tab_id={0}, page={1}, skipping'.format(
                        tab_id, index))
                    break
                else:
                    data = data.get('goodslist', [])
                    # print(data)
                    if data == []:
                        print('goodslist is [], skipping')
                    else:
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=data)
                        print(miaosha_goods_list)

                        juanpi = JuanPiParse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15'
                            if my_pipeline._select_table(
                                    sql_str=sql_str) is None:
                                db_goods_id_list = []
                            else:
                                db_goods_id_list = [
                                    item[0] for item in list(
                                        my_pipeline._select_table(
                                            sql_str=sql_str))
                                ]

                            for item in miaosha_goods_list:
                                if item.get('goods_id',
                                            '') in db_goods_id_list:
                                    print('This goods_id already exists in the db, skipping')
                                    pass
                                else:
                                    tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                                        'goods_id')
                                    juanpi.get_goods_data(
                                        goods_id=item.get('goods_id'))
                                    goods_data = juanpi.deal_with_data()

                                    if goods_data == {}:  # skip when the returned data is empty
                                        pass
                                    else:  # otherwise parse and insert
                                        goods_data['stock_info'] = item.get(
                                            'stock_info')
                                        goods_data['goods_id'] = item.get(
                                            'goods_id')
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get(
                                            'price')  # original special price before the flash sale
                                        goods_data['taobao_price'] = item.get(
                                            'taobao_price')  # flash-sale price
                                        goods_data['sub_title'] = item.get(
                                            'sub_title', '')
                                        goods_data['miaosha_time'] = item.get(
                                            'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item.get(
                                                        'miaosha_time'))
                                        goods_data['tab_id'] = tab_id
                                        goods_data['page'] = index

                                        # print(goods_data)
                                        juanpi.insert_into_juanpi_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=my_pipeline)
                                        sleep(.4)  # brief sleep to avoid tripping errors
                            sleep(.65)
                        else:
                            pass
                        try:
                            del juanpi
                        except:
                            pass
                        gc.collect()
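
        # Note: each pass walks tab_id x page (0..49) and breaks out of the page
        # loop early on an empty body, a JSON error, or an empty goodslist, so a
        # full sweep costs at most 9 tabs x 50 pages of requests.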
Exemplo n.º 29
class TaoBaoWeiTaoShareParse(AsyncCrawler):
    def __init__(
        self,
        logger=None,
        *params,
        **kwargs,
    ):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            logger=logger,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/',
        )
        self._set_headers()
        self.msg = ''
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    def _set_headers(self):
        self.headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'referer':
            'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33&params=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome',
            'authority': 'h5api.m.taobao.com',
            # the cookie must stay commented out, otherwise the request is rejected as illegal
            # 'cookie': ''
        }

    async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
        '''
        From a given taobao share short url, resolve target_url, content_id,
        csid, tag_name and tag
        :param taobao_short_url:
        :return:
        '''
        if re.compile(r'contentId').findall(taobao_short_url) != []:
            # first check whether it is already the target address
            target_url = taobao_short_url

        else:
            body = Requests.get_url_body(
                url=taobao_short_url,
                headers=self.headers,
                ip_pool_type=self.ip_pool_type,
            )
            # self.lg.info(str(body))
            if body == '':
                self.lg.error('The fetched body is empty; failing short url: {0}'.format(
                    str(taobao_short_url)))
                # return five values to match the normal return path below
                return '', '', '', '', ''

            try:
                # resolve the short url to its target address
                target_url = re.compile(r'var url = \'(.*?)\';').findall(
                    body)[0]
                self.lg.info('Resolved original url: {}'.format(target_url))
            except IndexError:
                self.lg.error('IndexError while extracting target_url! Failing short url: {0}'.format(
                    str(taobao_short_url)))
                target_url = ''

        try:
            # extract contentId
            content_id = re.compile(r'contentId=(\d+)').findall(target_url)[0]
            self.lg.info(content_id)
        except IndexError:
            self.lg.error('IndexError while extracting content_id! Failing short url: {0}'.format(
                str(taobao_short_url)))
            content_id = ''

        try:
            # extract csid
            csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(
                target_url)[0]
            # self.lg.info(csid)
        except IndexError:
            self.lg.info('This link carries no csid...')
            # self.lg.error('IndexError while extracting csid! Failing short url: {0}'.format(str(taobao_short_url)))
            csid = ''

        try:
            tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag_name = ''

        try:
            tag = re.compile('tag=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag = ''

        return target_url, content_id, csid, tag_name, tag

    async def _get_api_body(self, taobao_short_url):
        '''
        Fetch the body returned by the page's api
        :param taobao_short_url:
        :return: body as str
        '''
        base_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.beehive.detail.contentservicenewv2/1.0/'

        try:
            target_url, content_id, csid, tag_name, tag = await self._get_target_url_and_content_id_and_csid(
                taobao_short_url)
        except ValueError:
            self.lg.error('Hit a ValueError!', exc_info=True)
            return ''

        if content_id == '' and csid == '':  # abnormal exit
            return ''

        data = dumps({
            'businessSpm': '',
            'business_spm': '',
            'contentId': content_id,
            'params': dumps({
                "csid": csid,
            }) if csid != '' else '',  # when there is no csid, do not send this param
            'source': 'weitao_2017_nocover',
            'tagName': tag_name,  # extra param the author added to reach the tags api
            'track_params': '',
            'type': 'h5',
        })
        params = {
            'AntiCreep': 'true',
            'AntiFlood': 'true',
            'api': 'mtop.taobao.beehive.detail.contentservicenewv2',
            'appKey': '12574478',
            'callback': 'mtopjsonp1',
            # 'data': '{"contentId":"200668154273","source":"weitao_2017_nocover","type":"h5","params":"{\\"csid\\":\\"54a52aea54b7c29d289a0e36b2bf2f51\\"}","businessSpm":"","business_spm":"","track_params":""}',
            'data': data,
            'dataType': 'jsonp',
            'data_2': '',
            'jsv': '2.4.11',
            # 'sign': 'e8cb623e58bab0ceb10e9edffdacd5b2',
            # 't': '1527300457911',
            'type': 'jsonp',
            'v': '1.0'
        }
        # TODO new version
        # required params (without cookies, even a correct sign yields no result!)
        # and even with logged-in cookies, continued crawling makes tb respond with an "access denied, are you using a proxy or VPN" page
        result_1 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=self.headers,
            params=params,
            data=data,
            logger=self.lg,
            ip_pool_type=self.ip_pool_type)
        _m_h5_tk = result_1[0]

        if _m_h5_tk == '':
            self.lg.error(
                'The fetched _m_h5_tk is an empty str! Failing short url: {0}'.format(taobao_short_url))

        # request again with _m_h5_tk and the session returned by the first request to get the wanted api data
        result_2 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=self.headers,
            params=params,
            data=data,
            _m_h5_tk=_m_h5_tk,
            session=result_1[1],
            logger=self.lg,
            ip_pool_type=self.ip_pool_type)
        body = result_2[2]

        return body
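
    # Note on the two-step call above: the first get_taobao_sign_and_body()
    # request is made without a token and is mainly used to obtain the _m_h5_tk
    # cookie; the second request reuses that token plus the first request's
    # session to produce a correctly signed call that returns the real api body.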

    async def _deal_with_api_info(self, taobao_short_url):
        '''
        Process the info returned by the api and store it in structured form
        :param taobao_short_url:
        :return:
        '''
        data = await self._get_api_body(taobao_short_url)
        if data == '':
            self.lg.error('The fetched api data is empty!')
            return {}

        try:
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
        except IndexError:
            self.lg.error(
                'Failed to extract the main info via re, IndexError; failing short url: {0}'.format(taobao_short_url))
            data = {}

        try:
            data = await self._wash_api_info(loads(data))
            # pprint(data)
        except Exception as e:
            self.lg.error('Failing short url: {0}'.format(taobao_short_url))
            self.lg.exception(e)
            return {}

        article = await self._get_article(data=data,
                                          taobao_short_url=taobao_short_url)
        pprint(article)

        if article != {} and article.get('share_id', '') != '':
            '''crawl the goods recommended by this article'''
            await self._crawl_and_save_these_goods(
                goods_url_list=article.get('goods_url_list', []))
            '''store this article's info'''
            await self._save_this_article(article=article)

            return True
        else:
            self.lg.info('Failed to get the article! article is an empty dict!')
            return False

    async def _crawl_and_save_these_goods(self, goods_url_list):
        '''
        Crawl the goods recommended by this article
        :param goods_url_list:
        :return:
        '''
        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'

        try:
            result = self.my_pipeline._select_table(sql_str=sql_str)
        except TypeError:
            result = []

        self.lg.info('About to start crawling the goods of this article, please wait...')
        index = 1

        db_all_goods_id_list = [item[0] for item in result]
        for item in goods_url_list:
            try:
                goods_id = re.compile(r'id=(\d+)').findall(
                    item.get('goods_url', ''))[0]
            except IndexError:
                self.lg.error('Error while extracting goods_id via re, please check!')
                continue

            if goods_id in db_all_goods_id_list:
                self.lg.info('This goods_id[{0}] already exists in the db!'.format(goods_id))
                continue

            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg)
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a single long-lived connection going unresponsive
                    self.lg.info('Resetting and building a new db connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('New db connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(
                        item.get('goods_url', ''))
                    if goods_id == '':
                        self.lg.info('@@@ Original goods url: {0}'.format(
                            item.get('goods_url', '')))
                        continue

                    else:
                        self.lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id, str(index)))
                        taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| crawled data: ', data)
                            taobao.old_taobao_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)

                        else:
                            pass

                else:  # db connection failed
                    self.lg.info('db connection failed; the db may be down or under maintenance')
                index += 1
                gc.collect()
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('Finished crawling the goods of this article!')

        return True

    async def _save_this_article(self, article):
        '''
        Store this article's info
        :param article:
        :return:
        '''
        sql_str = 'select share_id from dbo.daren_recommend'
        db_share_id = [
            j[0] for j in list(self.my_pipeline._select_table(sql_str=sql_str))
        ]

        if article.get('share_id') in db_share_id:
            self.lg.info('This share_id({}) already exists in the db, skipping!'.format(
                article.get('share_id', '')))

            return True

        else:
            self.lg.info('About to store this article...')
            if self.my_pipeline.is_connect_success:
                params = await self._get_db_insert_params(item=article)
                # pprint(params)
                sql_str = r'insert into dbo.daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_goods_base_info, div_body, create_time, site_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                self.my_pipeline._insert_into_table_2(sql_str=sql_str,
                                                      params=params,
                                                      logger=self.lg)

                return True
            else:
                self.lg.error('db connection failed! Store failed! Failing article url: {0}'.format(
                    article.get('gather_url', '')))
                return False

    async def _get_db_insert_params(self, item):
        params = (
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['share_id'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            # dumps(item['share_img_url_list'], ensure_ascii=False),
            # dumps(item['goods_id_list'], ensure_ascii=False),
            dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
        )

        return params

    async def _get_article(self, data, taobao_short_url):
        '''
        Extract the wanted info of this article
        :param data:
        :return:
        '''
        try:
            nick_name = data.get('data', {}).get('models',
                                                 {}).get('account',
                                                         {}).get('name', '')
            assert nick_name != '', 'The fetched nick_name is empty!'

            head_url = await self._get_head_url(data=data)

            # the recommender's profile or signature text
            tmp_profile = data.get('data',
                                   {}).get('models',
                                           {}).get('account',
                                                   {}).get('accountDesc', '')
            profile = tmp_profile if tmp_profile is not None else ''

            title = self._wash_sensitive_info(
                data.get('data', {}).get('models',
                                         {}).get('content',
                                                 {}).get('title', ''))
            # self.lg.info(title)
            assert title != '', 'The fetched title is empty! Please check!'

            # the influencer's comment, usable as the text blurb on the recommendations home page
            comment_content = self._wash_sensitive_info(
                data.get('data', {}).get('models',
                                         {}).get('content',
                                                 {}).get('summary', ''))
            '''the weitao capture api: images and goods correspond one-to-one'''
            tmp_goods_list = data.get('data', {}).get('models', {}).get(
                'content', {}).get('drawerList', [])
            assert tmp_goods_list != [], 'The fetched goods_id_list is an empty list! Please check! This article may recommend no goods!'

            share_img_url_list = [{
                'img_url':
                'https:' + item.get('itemImages', [])[0].get('picUrl', '')
            } for item in tmp_goods_list]
            goods_id_list = [{
                'goods_id': item.get('itemId', '')
            } for item in tmp_goods_list]

            # weitao images map one-to-one to goods, so store them as a single field and drop duplicate recommended goods (list dedup preserving the original order)
            share_goods_base_info = list_duplicate_remove([{
                'img_url':
                'https:' + item.get('itemImages', [])[0].get('picUrl', ''),
                'goods_id':
                item.get('itemId', ''),
            } for item in tmp_goods_list])

            # div_body
            div_body = self._wash_sensitive_info(
                await self._get_div_body(rich_text=data.get('data', {}).get(
                    'models', {}).get('content', {}).get('richText', [])))
            # print(div_body)

            # goods urls to crawl, normalized to the taobao format; tmall urls get redirected to tmall by the browser
            goods_url_list = [{
                'goods_url':
                'https://item.taobao.com/item.htm?id=' +
                item.get('goods_id', '')
            } for item in goods_id_list]

            _ = (
                await
                self._get_target_url_and_content_id_and_csid(taobao_short_url))
            gather_url = _[0]
            share_id = _[1]  # i.e. the content_id

            create_time = get_shanghai_time()

            site_id = 2  # taobao weitao

            # tags: urls of extra articles
            tags = await self._get_tags(data=data)
            # pprint(tags)

        except Exception as e:
            self.lg.error('Failing short url: {0}'.format(taobao_short_url))
            self.lg.exception(e)
            return {}

        article = WellRecommendArticle()
        article['nick_name'] = nick_name
        article['head_url'] = head_url
        article['profile'] = profile
        article['share_id'] = share_id
        article['title'] = title
        article['comment_content'] = comment_content
        article['share_img_url_list'] = share_img_url_list
        article['goods_id_list'] = goods_id_list
        article['div_body'] = div_body
        article['gather_url'] = gather_url
        article['create_time'] = create_time
        article['site_id'] = site_id
        article['goods_url_list'] = goods_url_list
        article['tags'] = tags
        article['share_goods_base_info'] = share_goods_base_info

        return article

    async def _get_head_url(self, data):
        '''
        Get the avatar url
        :param data:
        :return:
        '''
        tmp_head_url = data.get('data',
                                {}).get('models',
                                        {}).get('account',
                                                {}).get('accountPic',
                                                        {}).get('picUrl', '')
        if tmp_head_url != '':
            if re.compile('http').findall(tmp_head_url) == []:
                head_url = 'https:' + tmp_head_url
            else:
                head_url = tmp_head_url
        else:
            head_url = ''

        return head_url

    def _wash_sensitive_info(self, data):
        '''
        Scrub sensitive words
        :param data:
        :return:
        '''
        data = re.compile('淘宝|天猫|taobao|tmall|TAOBAO|TMALL').sub('', data)

        return data

    async def _get_tags(self, data):
        '''
        Get the info of the extra articles
        :param data:
        :return:
        '''
        tags = data.get('data', {}).get('models', {}).get('tags', [])
        tags = [{
            'url': unquote(item.get('url', '')),
            'name': item.get('name', ''),
        } for item in tags]

        return tags

    async def _get_div_body(self, rich_text):
        '''
        Build the target article body
        :param rich_text: the raw article to process
        :return:
        '''
        div_body = ''
        for item in rich_text:
            if item.get('resource') is None:
                continue

            for resource_item in item.get('resource', []):  # there may be several
                # resource = item.get('resource', [])[0]
                text = resource_item.get('text', '')  # descriptive text
                picture = resource_item.get('picture', {})  # a descriptive image
                _goods = resource_item.get('item', {})  # a single goods

                if text != '':
                    text = '<p style="height:auto;width:100%">' + text + '</p>' + '<br>'
                    div_body += text
                    continue

                if picture != {}:
                    # use the picture's width/height to build its <img> tag
                    _ = r'<img src="{0}" style="height:{1}px;width:{2}px;"/>'.format(
                        'https:' + picture.get('picUrl', ''),
                        picture.get('picHeight', ''),
                        picture.get('picWidth', ''))
                    _ = _ + '<br>'
                    div_body += _
                    continue

                if _goods != {}:
                    # embed a hidden marker paragraph recording the goods_id (the Chinese literal reads "there is a goods here [goods_id]: {0}")
                    _hiden_goods_id = r'<p style="display:none;">此处有个商品[goods_id]: {0}</p>'.format(
                        _goods.get('itemId', '')) + '<br>'
                    div_body += _hiden_goods_id
                    continue

        return '<div>' + div_body + '</div>' if div_body != '' else ''
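
    # For illustration, a rich_text entry with one text resource and one picture
    # resource would roughly yield (values invented for the example):
    #   <div><p style="height:auto;width:100%">some intro</p><br>
    #   <img src="https://img.example/x.jpg" style="height:300px;width:500px;"/><br></div>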

    async def _wash_api_info(self, data):
        '''
        Trim the api payload
        :param data:
        :return:
        '''
        try:
            data['data']['assets'] = []
            data['data']['models']['config'] = {}
            data['data']['modules'] = []
        except Exception:
            pass

        return data

    def __del__(self):
        try:
            del self.lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        gc.collect()


class CommentRealTimeUpdateSpider(object):
    def __init__(self):
        self._set_logger()
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.sql_str = cm_update_str_1

        if self.debugging_api.get(2):
            self.my_lg.info('Initializing 1688 phantomjs...')
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        if self.debugging_api.get(3) is True \
                or self.debugging_api.get(4) is True \
                or self.debugging_api.get(6) is True:
            self.my_lg.info('Initializing Tmall phantomjs...')
            self.tmall = TmallCommentParse(logger=self.my_lg)

        if self.debugging_api.get(7) is True \
                or self.debugging_api.get(8) is True \
                or self.debugging_api.get(9) is True \
                or self.debugging_api.get(10) is True:
            self.my_lg.info('Initializing JD phantomjs...')
            self.jd = JdCommentParse(logger=self.my_lg)

    def _set_logger(self):
        self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                '/all_comment/实时更新/' +
                                str(get_shanghai_time())[0:10] + '.txt',
                                console_log_level=INFO,
                                file_log_level=ERROR)

    def _init_debugging_api(self):
        '''
        Set which site_ids are enabled for crawling
        :return: dict
        '''
        return {
            1: True,
            2: True,
            3: True,
            4: True,
            6: True,
            7: True,
            8: True,
            9: True,
            10: True,
            11: False,
            12: False,
            13: False,
            25: False,
        }

    def _set_func_name_dict(self):
        self.func_name_dict = {
            'taobao': 'self._update_taobao_comment({0}, {1}, {2})',
            'ali': 'self._update_ali_1688_comment({0}, {1}, {2})',
            'tmall': 'self._update_tmall_comment({0}, {1}, {2})',
            'jd': 'self._update_jd_comment({0}, {1}, {2})',
            'zhe_800': 'self._update_zhe_800_comment({0}, {1}, {2})',
            'juanpi': 'self._update_juanpi_comment({0}, {1}, {2})',
            'pinduoduo': 'self._update_pinduoduo_comment({0}, {1}, {2})',
            'vip': 'self._update_vip_comment({0}, {1}, {2})',
        }
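
    # _just_run() below compiles and exec()s these string templates. A sketch of
    # an equivalent, exec-free dispatch using bound methods (an illustrative
    # alternative, not the original design):
    def _dispatch_without_exec(self, index, goods_id, site_id):
        handlers = {
            1: self._update_taobao_comment,
            2: self._update_ali_1688_comment,
            3: self._update_tmall_comment,
            4: self._update_tmall_comment,
            6: self._update_tmall_comment,
            7: self._update_jd_comment,
            8: self._update_jd_comment,
            9: self._update_jd_comment,
            10: self._update_jd_comment,
            11: self._update_zhe_800_comment,
            12: self._update_juanpi_comment,
            13: self._update_pinduoduo_comment,
            25: self._update_vip_comment,
        }
        handler = handlers.get(site_id)
        if handler is not None:
            handler(index, goods_id, site_id)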

    def _just_run(self):
        while True:
            #### update the data
            self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            #  and GETDATE()-a.modify_time>1
            try:
                result = list(
                    self._comment_pipeline._select_table(
                        sql_str=cm_select_str_1, logger=self.my_lg))
            except TypeError:
                self.my_lg.error('TypeError: the db connection failed...(possibly under maintenance)')
                continue

            self.my_lg.info('------>>> below are all matching goods_id returned by the db <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info(
                '--------------------------------------------------------')
            self.my_lg.info('items to update: {0}'.format(len(result)))

            self.my_lg.info('About to start the real-time data update, please wait...'.center(100, '#'))

            # 1.Taobao 2.Ali1688 3.Tmall 4.Tmall supermarket 5.Juhuasuan 6.Tmall Global 7.JD 8.JD supermarket 9.JD Worldwide 10.JD Pharmacy 11.Zhe800 12.Juanpi 13.Pinduoduo 14.Zhe800 flash sale 15.Juanpi flash sale 16.Pinduoduo flash sale 25.VIP.com
            for index, item in enumerate(
                    result):  # item: (goods_id, site_id)
                if not self.debugging_api.get(item[1]):
                    self.my_lg.info('api is False, skipping! index [%s]' % str(index))
                    continue

                if index % 20 == 0:
                    try:
                        del self._comment_pipeline
                    except:
                        pass
                    self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline(
                    )

                switch = {
                    1: self.func_name_dict.get('taobao'),  # Taobao
                    2: self.func_name_dict.get('ali'),  # Ali 1688
                    3: self.func_name_dict.get('tmall'),  # Tmall
                    4: self.func_name_dict.get('tmall'),  # Tmall supermarket
                    6: self.func_name_dict.get('tmall'),  # Tmall Global
                    7: self.func_name_dict.get('jd'),  # JD
                    8: self.func_name_dict.get('jd'),  # JD supermarket
                    9: self.func_name_dict.get('jd'),  # JD Worldwide
                    10: self.func_name_dict.get('jd'),  # JD Pharmacy
                    11: self.func_name_dict.get('zhe_800'),  # Zhe800
                    12: self.func_name_dict.get('juanpi'),  # Juanpi
                    13: self.func_name_dict.get('pinduoduo'),  # Pinduoduo
                    25: self.func_name_dict.get('vip'),  # VIP.com
                }

                # execute dynamically
                exec_code = compile(
                    switch[item[1]].format(index, item[0], item[1]), '',
                    'exec')
                exec(exec_code)
                sleep(1.1)

    def _update_taobao_comment(self, index, goods_id, site_id):
        '''
        Handle comments of taobao goods
        :param index: index
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| Taobao\t\tindex(%s)' % str(index))

            taobao = TaoBaoCommentParse(logger=self.my_lg)
            _r = taobao._get_comment_data(goods_id=str(goods_id))

            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('The _comment_list of this goods is empty! Skipping!')

            try:
                del taobao
            except:
                self.my_lg.info('del taobao failed!')
            gc.collect()
        else:
            pass

    def _update_ali_1688_comment(self, index, goods_id, site_id):
        '''
        Handle comments of ali 1688 goods
        :param index: index
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| Ali 1688\t\tindex(%s)' % str(index))

            if index % 5 == 0:
                try:
                    del self.ali_1688
                except:
                    self.my_lg.error('del ali_1688 failed!')
                gc.collect()
                self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

            _r = self.ali_1688._get_comment_data(goods_id=goods_id)
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)

            else:
                self.my_lg.info('The _comment_list of this goods is empty! Skipping!')

        else:
            pass

    def _update_tmall_comment(self, index, goods_id, site_id):
        '''
        Handle comments of tmall goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| Tmall\t\tindex(%s)' % str(index))

            if site_id == 3:
                _type = 0
            elif site_id == 4:
                _type = 1
            elif site_id == 6:
                _type = 2
            else:
                return None

            if index % 5 == 0:
                try:
                    del self.tmall
                except:
                    self.my_lg.info('del tmall failed!')
                gc.collect()
                self.tmall = TmallCommentParse(logger=self.my_lg)

            _r = self.tmall._get_comment_data(type=_type,
                                              goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('The _comment_list of this goods is empty! Skipping!')
            gc.collect()
        else:
            pass

    def _update_jd_comment(self, index, goods_id, site_id):
        '''
        Handle comments of jd goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| JD\t\tindex(%s)' % str(index))

            if index % 5 == 0:
                try:
                    del self.jd
                except:
                    self.my_lg.info('del jd failed!')
                gc.collect()
                self.jd = JdCommentParse(logger=self.my_lg)

            _r = self.jd._get_comment_data(goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('The _comment_list of this goods is empty! Skipping!')
        else:
            pass

    def _update_zhe_800_comment(self, index, goods_id, site_id):
        '''
        Handle comments of zhe800 goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_juanpi_comment(self, index, goods_id, site_id):
        '''
        Handle comments of juanpi goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_pinduoduo_comment(self, index, goods_id, site_id):
        '''
        Handle comments of pinduoduo goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_vip_comment(self, index, goods_id, site_id):
        '''
        Handle comments of vip.com goods
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _get_db_update_params(self, item):
        return (
            item['modify_time'],
            dumps(item['_comment_list'], ensure_ascii=False),
            item['goods_id'],
        )
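
    # cm_update_str_1 is defined elsewhere; given the parameter order above it
    # is assumed to look roughly like this (illustrative only, not the original
    # definition):
    #   'update dbo.xxx set modify_time=%s, comment_list=%s where goods_id=%s'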

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.debugging_api
        except:
            pass
        try:
            del self._comment_pipeline
        except:
            pass
        try:
            del self.tmall
        except:
            pass
        gc.collect()