예제 #1
0
    def _just_run(self):
        """Main crawl loop.

        Repeatedly loads keyword rows and the set of already-stored
        goods_id values from the db, then crawls every enabled platform
        api for each keyword and processes the resulting goods ids.
        Loops forever; on db failure it simply retries the next pass.
        """
        while True:
            # Fetch all keyword rows and every goods_id already present
            # in goods_db.
            try:
                result = list(
                    self.my_pipeline._select_table(sql_str=kw_select_str_1))
                self.lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(
                    self.my_pipeline._select_table(sql_str=kw_select_str_2))
                self.lg.info('db中已存在的goods_id获取成功!')

            except TypeError:
                # _select_table returns a non-iterable on connection
                # failure, which makes list() raise TypeError.
                self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
                result_2 = None

            if result is not None and result_2 is not None:
                self.lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.lg.info(str(result))
                self.lg.info(
                    '--------------------------------------------------------')

                self.lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                self.add_goods_index = 0  # counts goods added (drives periodic db reconnects)
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # Release the raw rows promptly to free memory.
                try:
                    del result_2
                except:
                    pass
                collect()

                for item in result:  # finish every enabled api for one keyword before the next
                    self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(
                        item[0], item[1]))
                    for type, type_value in self.debugging_api.items(
                    ):  # iterate over the e-commerce platforms to crawl
                        if type_value is False:
                            self.lg.info('api为False, 跳过!')
                            continue

                        # Refresh the db connection every `remainder`
                        # added goods.
                        self.my_pipeline = _block_get_new_db_conn(
                            db_obj=self.my_pipeline,
                            index=self.add_goods_index,
                            logger=self.lg,
                            remainder=20,
                        )

                        goods_id_list = self._get_keywords_goods_id_list(
                            type=type, keyword=item)
                        self.lg.info(
                            '关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(
                                item[1], str(goods_id_list)))
                        '''处理goods_id_list'''
                        self._deal_with_goods_id_list(
                            type=type,
                            goods_id_list=goods_id_list,
                            keyword_id=item[0])
                        sleep(3)
예제 #2
0
def run_forever():
    """Endless real-time update loop for mia goods data.

    Each pass creates a per-day logger, selects the goods rows to
    refresh, re-crawls each goods_id and writes changes back; delisted
    goods are flagged in the auto goods table.  Sleeps long after
    midnight, briefly otherwise.
    """
    while True:
        # NOTE: the logger must be created inside the loop (not as a
        # global), otherwise every pass keeps logging to the same file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                # Recreate the parser every 5 items to release memory.
                if index % 5 == 0:
                    try:
                        del mia
                    except:
                        pass
                    mia = MiaParse()
                    collect()

                # Refresh the db connection every `remainder` iterations.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item,
                                                          logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:  # handle delisted goods separately
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): this continue skips the
                            # `index += 1` below, stalling the periodic
                            # refresh cadence -- confirm if intentional.
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )

                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:  # returned data is empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # db connection failed or closed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        # Drop the per-pass logger so the next pass opens a fresh file.
        try:
            del my_lg
        except:
            pass
        collect()
예제 #3
0
    def run_forever(self):
        '''
        Real-time update of mia pintuan (group-buy) goods data.

        Selects the stored pintuan rows, and for each one either
        logically deletes it (price reverted / expired), skips it
        (expired but in the waiting window), or re-crawls and updates it.
        :return: None (also returns None early when the db read fails)
        '''
        result = self._get_db_old_data()
        if result is None:
            sleep_time = 20
            print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
            sleep(sleep_time)

            return None

        index = 1
        for item in result:  # real-time update of each row
            goods_id = item[0]
            pid = item[2]
            # end_time format e.g. '2020-04-12 00:00:00'
            pintuan_end_time = json_2_dict(item[1]).get('end_time')
            pintuan_end_time = datetime_to_timestamp(
                string_to_datetime(pintuan_end_time))
            # print(pintuan_end_time)

            data = {}
            # Refresh the db connection every `remainder` iterations.
            self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                                  index=index,
                                                  remainder=50)
            if self.sql_cli.is_connect_success:
                is_recent_time = self.is_recent_time(pintuan_end_time)
                if is_recent_time == 0:
                    # Price has reverted to original (expired): logically
                    # delete the goods from the auto goods table.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mia_update_str_7,
                        sql_cli=self.sql_cli)
                    print('该goods拼团开始时间为({})'.format(
                        json.loads(item[1]).get('begin_time')))
                    sleep(.4)

                elif is_recent_time == 2:
                    # Expired but still in the waiting window: do not
                    # delete yet (delete only when <= 24h remain).
                    pass

                else:  # returned 1: within the update window
                    print(
                        '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                        .format(goods_id, index))
                    data['goods_id'] = goods_id
                    try:
                        data_list = get_mia_pintuan_one_page_api_goods_info(
                            page_num=pid)
                    except ResponseBodyIsNullStrException:
                        # Empty response body: skip this goods.
                        index += 1
                        sleep(.4)
                        continue

                    # TODO The emptiness assertion below caused on-sale
                    # goods to be wrongly delisted, so it is disabled and
                    # everything is updated unconditionally.
                    # try:
                    #     assert data_list != [], 'data_list不为空list!'
                    # except AssertionError as e:
                    #     print(e)
                    #     _handle_goods_shelves_in_auto_goods_table(
                    #         goods_id=goods_id,
                    #         update_sql_str=mia_update_str_7,
                    #         sql_cli=self.sql_cli)
                    #     sleep(.4)
                    #     index += 1
                    #     continue

                    pintuan_goods_all_goods_id = [
                        item_1.get('goods_id', '') for item_1 in data_list
                    ]
                    # print(pintuan_goods_all_goods_id)
                    '''
                    蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品 (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                    '''
                    mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                    if goods_id not in pintuan_goods_all_goods_id:
                        # Delisted internally: still update it (see the
                        # note above -- deleting here caused mistakes).
                        try:
                            goods_data = self._get_mia_pt_one_goods_info(
                                mia_pt_obj=mia_pt,
                                goods_id=goods_id,
                            )
                        except AssertionError:
                            # Returned data was empty: skip this goods.
                            index += 1
                            continue

                        # pprint(goods_data)
                        mia_pt.update_mia_pintuan_table(data=goods_data,
                                                        pipeline=self.sql_cli)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # slow down

                    else:
                        # Still listed: find its entry in the page data.
                        for item_2 in data_list:
                            if item_2.get('goods_id', '') == goods_id:
                                sub_title = item_2.get('sub_title', '')
                                try:
                                    goods_data = self._get_mia_pt_one_goods_info(
                                        mia_pt_obj=mia_pt,
                                        goods_id=goods_id,
                                        sub_title=sub_title,
                                    )
                                except AssertionError:
                                    # Returned data was empty: skip.
                                    continue

                                # pprint(goods_data)
                                mia_pt.update_mia_pintuan_table(
                                    data=goods_data, pipeline=self.sql_cli)
                                sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
                            else:
                                pass

                    try:
                        del mia_pt
                    except:
                        pass

            else:  # db connection failed
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass

            index += 1
            collect()

        print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
def run_forever():
    """Endless real-time update loop for juanpi pintuan (group-buy) goods.

    Each pass purges stale rows, selects the remaining pintuan rows, and
    for each one either logically deletes it (sold out / expired) or
    re-crawls and updates it.  Sleeps long after midnight.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Declare the parser here and periodically recreate it to
            # release memory (a single long-lived instance grows large).
            juanpi_pintuan = JuanPiParse()
            for item in result:  # real-time update of each row
                goods_id = item[0]
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                # Refresh the db connection every `remainder` iterations.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # Malformed pintuan_time json: logically delete
                        # the goods and move on.
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    pintuan_end_time = int(
                        str(
                            time.mktime(
                                time.strptime(pintuan_end_time,
                                              '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)

                    # item[2] == 1 presumably marks sold-out goods (per
                    # the log message below); expired ones are deleted
                    # via the end-time comparison.
                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            continue

                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)

                else:  # db connection failed
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
예제 #5
0
def run_forever():
    """Endless real-time update loop for VIP goods data.

    Re-crawls every stored goods_id and records price, spec and stock
    change history against the previously stored row before writing the
    refreshed data back.  Sleeps long after midnight.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=vip_select_str_1))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError; retry the whole pass.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue

        _block_print_db_old_data(result=result)
        index = 1
        for item in result:  # real-time update of each row
            # Create the parser per item and delete it after use so its
            # memory is released every iteration.
            vip = VipParse()
            # Refresh the db connection every `remainder` iterations.
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             remainder=50)
            if sql_cli.is_connect_success:
                print(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
                    (item[0], index))
                vip.get_goods_data(goods_id=[0, item[0]])
                data = vip.deal_with_data()
                if data != {}:
                    data['goods_id'] = item[0]
                    data['shelf_time'], data[
                        'delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                    # item[6] holds the stored sku info json; it may
                    # already be in formatted form.
                    price_info_list = old_sku_info = json_2_dict(
                        item[6], default_res=[])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=price_info_list, site_id=25)
                    except AttributeError:  # already formatted: keep as-is
                        pass
                    new_sku_info = format_price_info_list(
                        data['price_info_list'], site_id=25)
                    # Record per-sku price changes.
                    data['_is_price_change'], data[
                        'sku_info_trans_time'], price_change_info = _get_sku_price_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_price_change=item[7]
                            if item[7] is not None else 0,
                            db_price_change_info=json_2_dict(item[9],
                                                             default_res=[]),
                            old_price_trans_time=item[12],
                        )
                    # Record overall price changes.
                    data['_is_price_change'], data[
                        '_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'],
                            is_price_change=data['_is_price_change'],
                            price_change_info=price_change_info,
                        )
                    # Track pure spec (variant) changes.
                    data['is_spec_change'], data[
                        'spec_trans_time'] = _get_spec_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_spec_change=item[8]
                            if item[8] is not None else 0,
                            old_spec_trans_time=item[13],
                        )

                    # Track pure stock changes.
                    data['is_stock_change'], data['stock_trans_time'], data[
                        'stock_change_info'] = _get_stock_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_stock_change=item[10]
                            if item[10] is not None else 0,
                            db_stock_change_info=json_2_dict(item[11],
                                                             default_res=[]),
                            old_stock_trans_time=item[14],
                        )

                    vip.to_right_and_update_data(data=data, pipeline=sql_cli)
                else:  # returned data is empty
                    pass
            else:  # db connection failed
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            try:
                del vip
            except:
                pass
            gc.collect()
            sleep(VIP_SLEEP_TIME)
        print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        gc.collect()
예제 #6
0
def run_forever():
    """Endless real-time update loop for NetEase Kaola goods data.

    Each pass creates a per-day logger, selects the goods rows to
    refresh, re-crawls each goods_id and writes changes back.  Goods
    detected as delisted during fetch are updated with shelf/delete
    times; goods detected as delisted during parsing are flagged in the
    auto goods table.  Sleeps long after midnight, briefly otherwise.
    """
    while True:
        # NOTE: the logger must be created inside the loop (not as a
        # global), otherwise every pass keeps logging to the same file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Recreate the parser periodically to release memory (a
            # single long-lived instance grows large).
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:  # real-time update of each row
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()

                # Refresh the db connection every `remainder` iterations.
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Handle goods delisted at fetch time separately:
                        # write back shelf/delete timestamps only.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )

                        try:
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                        except Exception:
                            # bug fix: Logger.error() requires a msg
                            # argument; the original bare
                            # `my_lg.error(exc_info=True)` raised
                            # TypeError and killed the loop.
                            my_lg.error('遇到错误:', exc_info=True)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            # Delisted per parsed data: flag it in the
                            # auto goods table.
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # bug fix: keep the index advancing on this
                            # early-continue path too (matching the
                            # branch above), so the periodic parser /
                            # db-conn refresh cadence is not stalled.
                            index += 1
                            collect()
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)

                    else:  # returned data is empty
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)

                else:  # db connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:
            # No updates after midnight (hour 0).
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
예제 #7
0
def run_forever():
    """Endless real-time update loop for NetEase YanXuan goods data.

    Each pass creates a per-day logger, selects the goods rows to
    refresh, re-crawls each goods_id and writes changes back; delisted
    goods are marked via yx_update_str_2.  Sleeps long after midnight.
    """
    while True:
        # NOTE: the logger must be created inside the loop (not as a
        # global), otherwise every pass keeps logging to the same file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Recreate the parser periodically to release memory (a
            # single long-lived instance grows large).
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # real-time update of each row
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                # Refresh the db connection every `remainder` iterations.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Handle delisted goods separately.
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): this continue skips the
                            # `index += 1` below, stalling the periodic
                            # refresh cadence -- confirm if intentional.
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )

                        yanxuan.to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:  # returned data is empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # db connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
예제 #8
0
    def run_forever(self):
        '''
        Real-time updater for Pinduoduo flash-sale (miaosha) goods.

        Only goods whose sale end time is within the near-term window are
        refreshed; far-future entries (still at original price) are
        skipped, and expired or internally-delisted ones are deleted.
        :return: None
        '''
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=pd_select_str_2))
        except TypeError:
            # _select_table returns a non-iterable on connection failure,
            # making list() raise TypeError.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Declare the parser once here; declaring it inside the loop
            # would repeatedly pay its construction cost.
            pinduoduo_miaosha = PinduoduoParse()

            all_miaosha_goods_list = self.get_all_miaosha_goods_list()

            # All goods_id values currently in the live flash-sale list.
            miaosha_goods_all_goods_id = [
                i.get('goods_id') for i in all_miaosha_goods_list
            ]
            # print(miaosha_goods_all_goods_id)

            for item in result:  # real-time update of each row
                # For pinduoduo, first get this goods' sale end time.
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                # Refresh the db connection every `remainder` iterations.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        # Expired: delete the row.
                        # bug fix: params must be a tuple -- the original
                        # `(item[0])` is just item[0] (missing trailing
                        # comma), so a bare scalar was passed.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0], ))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀结束时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_end_time'))
                        sleep(.3)

                    elif self.is_recent_time(miaosha_end_time) == 2:
                        # Must be pass (not break): rows from the db are
                        # not ordered, so later rows may still qualify.
                        pass

                    else:  # returned 1: within the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        if item[0] not in miaosha_goods_all_goods_id:
                            # The goods_id is no longer in the live
                            # flash-sale list: it was delisted, delete it.
                            # bug fix: params must be a tuple (see above).
                            sql_cli._delete_table(sql_str=self.delete_sql_str,
                                                  params=(item[0], ))
                            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                  item[0])
                            sleep(.3)

                        else:  # still listed
                            for item_1 in all_miaosha_goods_list:
                                if item_1.get('goods_id', '') == item[0]:
                                    pinduoduo_miaosha.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = pinduoduo_miaosha.deal_with_data(
                                    )

                                    if goods_data == {}:  # empty data: skip
                                        pass
                                    else:  # parse and upsert
                                        goods_data['stock_info'] = item_1.get(
                                            'stock_info')
                                        goods_data['goods_id'] = item_1.get(
                                            'goods_id')
                                        if item_1.get('stock_info').get(
                                                'activity_stock') > 0:
                                            goods_data['price'] = item_1.get(
                                                'price')  # pre-sale special price
                                            goods_data[
                                                'taobao_price'] = item_1.get(
                                                    'taobao_price')  # flash-sale price
                                        else:
                                            pass
                                        goods_data['sub_title'] = item_1.get(
                                            'sub_title', '')
                                        goods_data[
                                            'miaosha_time'] = item_1.get(
                                                'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item_1.get(
                                                        'miaosha_time'))

                                        if item_1.get('stock_info').get(
                                                'activity_stock') <= 1:
                                            # Live stock <= 1: mark sold out.
                                            print('该秒杀商品已售罄...')
                                            goods_data['is_delete'] = 1

                                        # print(goods_data)
                                        pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                            data=goods_data, pipeline=sql_cli)
                                    sleep(PINDUODUO_SLEEP_TIME)
                                else:
                                    pass

                else:  # db connection failed
                    print('数据库连接失败,数据库可能关闭或者维护中')

                # bug fix: advance the index (and collect) on every
                # iteration -- originally these sat inside the
                # connect-success branch, so a db failure froze the
                # reconnection cadence driven by `index`.
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(3 * 60)
        gc.collect()
예제 #9
0
    def run_forever(self):
        '''
        Real-time refresh of mogujie pintuan (group-buy) goods.

        Flow: purge stale rows (mg_delete_str_2), select the candidate rows
        (mg_select_str_2), then for each goods row:
          * deal already ended (is_recent_time == 0) -> logical delete;
          * deal too far ahead (is_recent_time == 2) -> skip for a later pass;
          * inside the update window (returns 1)     -> re-scrape the pintuan
            list page via PhantomJS and update the goods row.
        Finally sleeps before returning (long sleep after midnight, short
        sleep otherwise).
        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # A failed db connection makes the pipeline return a non-iterable.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:  # update each db row in real time
                goods_id = item[0]
                pintuan_end_time = json.loads(item[1]).get('end_time')
                # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp (seconds).
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # Recreate the PhantomJS driver every 8 goods to release the
                    # memory the browser process accumulates.
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        # Deal already over: logically take it off the shelves.
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli,
                        )
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 拼团开始时间为(%s), 逻辑删除成功!' %
                            json.loads(item[1]).get('begin_time'))
                        sleep(.3)

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # would exit the loop
                        pass  # must be pass, not break: goods_id rows returned by the db are not guaranteed to be ordered

                    else:  # returned 1: within the to-be-updated window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests cannot fetch this page (certificate validation involved), so use phantomjs directly
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                # The page wraps its json payload inside a <pre> tag.
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                # Empty docs: the goods vanished from the wall -> logical delete.
                                print('得到的docs为[]!')
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=mg_update_str_5,
                                    sql_cli=sql_cli,
                                )
                                sleep(.3)

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # timestamp when the pintuan deal starts
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # pprint(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                # Internally delisted goods are in fact still on
                                # sale, so only the goods info is refreshed, not
                                # its on/off-shelf times (see string note below).
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if goods_id not in pintuan_goods_all_goods_id:
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalize the scraped data
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = goods_id
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        # pprint(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=sql_cli)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down

                                else:  # still on the shelves
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == goods_id:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=goods_id)
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}: pass
                                            else:
                                                # normalize the scraped data
                                                goods_data[
                                                    'goods_id'] = goods_id
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=sql_cli)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # slow down

                                        else:
                                            pass

                else:
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight (hour 0)
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()
def run_forever():
    """
    Endless real-time refresh of zhe800 pintuan (group-buy) goods.

    Each pass: purge stale rows (z8_delete_str_1), select the candidates
    (z8_select_str_2), then re-scrape every goods with Zhe800PintuanParse and
    either logically delete it (dead page, or the db already marks it expired)
    or update its data. After a full pass it sleeps (long after midnight,
    short otherwise) and repeats forever.
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            # A failed db connection makes the pipeline return a non-iterable.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:  # real-time update, one row at a time
                goods_id = item[0]
                db_is_delete = item[1]
                # Create the parser inside the loop and delete it at the end of
                # each iteration: declaring it outside would hold a lot of
                # memory, so allocate-then-release per goods instead.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:  # every 300 updates, rest for 3 minutes
                    sleep_time = 3 * 60
                    sleep(sleep_time)
                    # NOTE(review): this message is printed only after the
                    # sleep has already finished.
                    print('休眠{}s中...'.format(sleep_time))

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # not relied on in the normal path because it would affect normally-listed goods
                    try:  # separately handle the case where the goods page no longer exists
                        # A str return starting with 'ze' marks a dead page.
                        if isinstance(tmp_tmp, str) and re.compile(
                                r'^ze').findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        if db_is_delete == 1:
                            # db already marks it expired: take it off the shelves.
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    else:  # the returned data is empty: nothing to write
                        pass

                else:  # NOTE(review): original comment mislabeled this branch — it is a failed db connection, not empty data
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight (hour 0)
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
예제 #11
0
    def _tmall_keywords_spider(self, **kwargs):
        """
        Collect tmall goods for one keyword.

        Expected kwargs:
            goods_id_list: raw goods urls/ids for the keyword (skuId suffix allowed)
            keyword_id:    db id of the keyword being processed

        Every goods that is newly inserted, or already present in
        self.db_existed_goods_id_list, is linked to the keyword through the
        goods_id/keyword middle table.
        :return: True when the whole list was walked, False if a goods failed
                 the legality check (processing aborts there).
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # Hoist the patterns out of the loop; strip any '&skuId=...' tail.
        sku_tail_pattern = re.compile('&skuId=.*')
        goods_id_pattern = re.compile(r'id=(\d+)')
        goods_url_list = [
            'https:' + sku_tail_pattern.sub('', one)
            for one in goods_id_list
        ]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for goods_url in goods_url_list:
            # Tracks whether this goods was inserted now or already existed,
            # i.e. whether the middle-table link should be written.
            linked = False
            id_hits = goods_id_pattern.findall(goods_url)
            if not id_hits:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue
            goods_id = id_hits[0]

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                linked = True  # already present in the db
            else:
                tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20, )
                if not self.sql_cli.is_connect_success:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    # Returns [site_type, goods_id] or [] on failure.
                    goods_id = tmall.get_goods_id_from_url(goods_url)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                        continue
                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    # get_goods_data fills the parser's internal state.
                    tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'], goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if not self.check_target_data_is_legal(target_data=data):
                            return False

                        linked = tmall.old_tmall_goods_insert_into_new_table(
                            data, pipeline=self.sql_cli)
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if linked:
                # Only goods inserted now or already stored get the keyword link.
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True
예제 #12
0
    def _just_run(self):
        '''
        Main crawl loop (never returns): repeatedly pull the keyword list and
        the set of goods_id already stored, then crawl every enabled
        e-commerce api for each keyword in range.
        '''
        while True:
            result = None
            result_2 = None
            # fetch all goods_id already present in the original goods_db
            try:
                result = list(self.sql_cli._select_table(sql_str=kw_select_str_1))
                self.lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.sql_cli._select_table(sql_str=kw_select_str_2))
                self.lg.info('db中已存在的goods_id获取成功!')
            except TypeError:
                self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

            if result is None or result_2 is None:
                # db unavailable: back off briefly and retry the whole pass
                sleep(15)
                continue

            self.lg.info('db 已存在的goods_id_num: {}'.format(len(result_2)))
            # counter used to locate how many goods have been added
            self.add_goods_index = 0
            self.db_existed_goods_id_list = [item[0] for item in result_2]
            # release the big result set immediately
            try:
                del result_2
            except:
                pass
            collect()

            for item in result:
                keyword_id = item[0]
                keyword = item[1]
                # finish every enabled api for one keyword before moving to the next
                self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(keyword_id, keyword))
                # filtering: keyword_id < 43 is skipped, except ids 25 and 26
                if int(keyword_id) < 43:
                    if int(keyword_id) not in (25, 26):
                        self.lg.info('不在处理的keyword_id范围内, keyword_id: {}, keyword: {}'.format(
                            keyword_id,
                            keyword))
                        continue
                    else:
                        pass
                else:
                    pass

                for type, type_value in self.debugging_api.items():
                    # iterate over the e-commerce platforms to crawl; False means disabled
                    if type_value is False:
                        self.lg.info('api为False, 跳过!')
                        continue

                    self.sql_cli = _block_get_new_db_conn(
                        db_obj=self.sql_cli,
                        index=self.add_goods_index,
                        logger=self.lg,
                        remainder=20,)
                    # NOTE(review): `keyword=item` passes the whole (keyword_id,
                    # keyword) row rather than the keyword string — confirm that
                    # _get_keywords_goods_id_list expects the tuple.
                    goods_id_list = self._get_keywords_goods_id_list(
                        type=type,
                        keyword=item)
                    # pprint(goods_id_list)
                    self.lg.info('关键字为{0}, 获取到的goods_id_list_num: {1}'.format(keyword, len(goods_id_list)))
                    '''处理goods_id_list'''
                    self._deal_with_goods_id_list(
                        type=type,
                        goods_id_list=goods_id_list,
                        keyword_id=keyword_id)
                    sleep(3)
    async def deal_with_tmcs_goods_id_list(self):
        """
        Crawl each goods id queued in self.db_wait_2_save_goods_id_list from
        tmall chaoshi (tmcs) and insert the new goods into the goods table.

        Per id:
          * already in self.db_existed_goods_id_list        -> skip;
          * url unparsable / empty goods_url / <=1 main img -> skip;
          * otherwise insert, and append the id to
            self.db_existed_goods_id_list so this run does not re-collect it.
        :return: True once the whole list has been walked.
        """
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for goods_id in self.db_wait_2_save_goods_id_list:
            # goods_id eg: '61864164616'
            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if self.sql_cli.is_connect_success:
                # An spm param would let get_goods_id_from_url filter by id:
                # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                    goods_id)
                # From here goods_id is a [site_type, goods_id] list ([] on failure).
                goods_id = tmall.get_goods_id_from_url(goods_url)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue
                else:
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id[1], str(self.add_goods_index)))
                    # get_goods_data fills the parser's internal state.
                    tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data[
                            'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                type=data['type'],
                                goods_id=goods_id,
                            )
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if len(data['all_img_url']) <= 1:
                            # BUGFIX: was `return False`, which aborted the whole
                            # remaining batch on the first goods with a single
                            # main image; the log message ("pass") shows the
                            # intent was to skip only this goods.
                            self.lg.info(
                                '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                            continue

                        result = tmall.old_tmall_goods_insert_into_new_table(
                            data=data, pipeline=self.sql_cli)
                        if result:
                            # remember the id to avoid re-collecting it later in this run
                            self.db_existed_goods_id_list.append(goods_id)
                        else:
                            pass
                    else:
                        pass

            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('tmcs已经抓取完毕!')

        return True