# ---- Example #1 ----
def run_forever():
    """Endlessly migrate 1688 goods from the old table into dbo.GoodsInfoAutoGet
    (SiteID=2), refreshing data in a real-time loop.

    Sleeps 5.5 hours once the Shanghai-time hour reaches 0 (no overnight
    updates), otherwise 5 seconds between passes. Relies on module-level
    SqlServerMyPageInfoSaveItemPipeline, ALi1688LoginAndParse,
    get_shanghai_time, sleep and gc.
    """
    while True:
        # Real-time data refresh pass.
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []  # BUGFIX: was left undefined when the first select raised
        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser is re-created periodically below to release the memory it holds.
            ali_1688 = ALi1688LoginAndParse()
            # goods_id values already present in the new table (GoodsInfoAutoGet).
            new_table_ali_1688_all_goods_id_list = list(
                set([item[0] for item in result]))
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)

            # Build the work list from the old table: rows whose goods_id is
            # parseable and not yet present in the new table.
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id +
                        '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ 原地址为: ', item[0])
            print('老表待转数据个数为: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:  # real-time update
                if index % 10 == 0:
                    # Recreate the parser every 10 items to free its memory.
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue  # skip the per-item sleep as well

                    sql_str = r'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                    try:  # duplicates keep showing up, so double-check against the DB
                        is_in_db = list(
                            tmp_sql_server._select_table(
                                # BUGFIX: (goods_id) was not a tuple — DB-API
                                # params must be a sequence: (goods_id,)
                                sql_str=sql_str, params=(goods_id,)))
                    except Exception:  # best-effort: treat a failed lookup as "not found"
                        is_in_db = []
                    if is_in_db != []:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue

                    print(
                        '------>>>| 正在插入的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    tt = ali_1688.get_ali_1688_data(goods_id)
                    if tt.get('is_delete') == 1 and tt.get(
                            'before') is False:  # off-shelf item: still insert it
                        tt['goods_id'] = goods_id
                        tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        tt['username'] = '******'
                        tt['main_goods_id'] = item[1]

                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=tt, pipeline=tmp_sql_server)

                        index += 1
                        gc.collect()
                        sleep(1.2)
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]

                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=data, pipeline=tmp_sql_server)
                    # else: parser returned no data — nothing to insert
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
# ---- Example #2 ----
def run_forever():
    """Endlessly migrate taobao goods from the old table into the new table,
    refreshing data in a real-time loop.

    Sleeps 5.5 hours once the Shanghai-time hour reaches 0 (no overnight
    updates), otherwise 5 seconds between passes. Relies on module-level
    SqlPools, SqlServerMyPageInfoSaveItemPipeline, TaoBaoLoginAndParse,
    TAOBAO_REAL_TIMES_SLEEP_TIME, get_shanghai_time, sleep and gc.
    """
    while True:
        # SqlPools manages a sqlalchemy connection pool for the selects;
        # the pipeline object is the legacy direct connection used for inserts.
        tmp_sql_server = SqlPools()
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []  # BUGFIX: was left undefined when the first select raised
        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # goods_id values already present in the new table.
            new_table_all_goods_id_list = [item[0] for item in result]
            for item in result_2:  # real-time update
                # Fresh parser per item keeps its memory footprint bounded.
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    # NOTE(review): skip paths do not advance `index`, so the
                    # %50 reconnect counts processed items only (as original).
                    if goods_id in new_table_all_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        continue

                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    tt = taobao.get_goods_data(goods_id)
                    if tt.get('is_delete') == 1:  # off-shelf item: still insert it
                        tt['goods_id'] = goods_id
                        tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                            goods_id)
                        tt['username'] = '******'
                        tt['main_goods_id'] = item[1]

                        taobao.old_taobao_goods_insert_into_new_table(
                            data=tt, pipeline=tmp_sql_server_2)

                        index += 1
                        gc.collect()
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                        continue

                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                            goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]

                        taobao.old_taobao_goods_insert_into_new_table(
                            data, pipeline=tmp_sql_server_2)
                    # else: parser returned no data — nothing to insert
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                # Keep requests spaced out so they don't collide with user
                # traffic (can be shortened to 0s on an overseas server).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
# ---- Example #3 ----
def run_forever():
    """Endlessly migrate tmall goods (SiteID 3/4/6) from the old table into
    dbo.GoodsInfoAutoGet, refreshing data in a real-time loop.

    Sleeps 5.5 hours once the Shanghai-time hour reaches 0 (no overnight
    updates), otherwise 5 seconds between passes. Relies on module-level
    SqlServerMyPageInfoSaveItemPipeline, TmallParse, get_shanghai_time,
    sleep and gc.
    """

    def _build_goods_url(goods_id):
        """Build the canonical tmall item url from a parsed goods_id.

        goods_id is [kind, id] or [kind, id, base_url]; kind 0 = tmall,
        1 = tmall supermarket (chaoshi), 2 = tmall international.
        Returns None for an unrecognized kind.
        """
        if goods_id[0] == 0:  # [0, '1111']
            return 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
        elif goods_id[0] == 1:  # [1, '1111']
            return 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
        elif goods_id[0] == 2:  # [2, '1111', 'https://xxxxx']
            # BUGFIX: one of the two duplicated call sites omitted the '?id='
            # separator, producing a malformed url; both now include it.
            return str(goods_id[2]) + '?id=' + goods_id[1]
        return None

    while True:
        # Real-time data refresh pass.
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=3 or SiteID=4 or SiteID=6 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []
        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Parser is re-created periodically below to release the memory it holds.
            tmall = TmallParse()
            for item in result_2:  # real-time update
                if index % 5 == 0:
                    # Recreate the parser every 5 items to free its memory.
                    tmall = TmallParse()
                    gc.collect()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item[0])
                    if goods_id == []:
                        print('@@@ 原地址为: ', item[0])
                        continue
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id[1], index))
                    data = tmall.get_goods_data(goods_id=goods_id)
                    if isinstance(data, int):  # int return signals a fetch failure
                        continue

                    if data.get('is_delete') == 1:  # off-shelf item: still insert it
                        data['goods_id'] = goods_id[1]
                        wait_to_deal_with_url = _build_goods_url(goods_id)
                        if wait_to_deal_with_url is None:
                            continue
                        data['goods_url'] = wait_to_deal_with_url
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]

                        # BUGFIX: was assigned to `result`, clobbering the
                        # outer query-result list mid-loop.
                        insert_ok = tmall.old_tmall_goods_insert_into_new_table(
                            data, pipeline=tmp_sql_server)
                        if insert_ok is False:
                            print('出错商品的地址为: ', item[0])
                        index += 1
                        gc.collect()
                        sleep(1.2)
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = goods_id[1]
                        wait_to_deal_with_url = _build_goods_url(goods_id)
                        if wait_to_deal_with_url is None:
                            continue
                        data['goods_url'] = wait_to_deal_with_url
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]

                        tmall.old_tmall_goods_insert_into_new_table(
                            data, pipeline=tmp_sql_server)
                    # else: parser returned no data — nothing to insert
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()