Пример #1
0
    def _deal_with_data(self):
        '''
        Parse and store the scraped group-buy (pintuan) goods that are not in
        the database yet.

        The entries come from ``self._get_pintuan_goods_info()``; ``item[0]``
        is used as the zid / goods_id and ``item[1]`` is stored as the page.
        Every zid missing from ``dbo.zhe_800_pintuan`` (site_id=17) is fetched
        and parsed with ``Zhe800PintuanParse`` and inserted through
        ``insert_into_zhe_800_pintuan_table``.  Nothing is processed when the
        database connection was not established.

        :return: None
        '''
        zid_list = self._get_pintuan_goods_info()

        zhe_800_pintuan = Zhe800PintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, is_delete from dbo.zhe_800_pintuan where site_id=17'
            # A set gives O(1) membership tests and never holds duplicates,
            # so it does not have to be rebuilt after each insert.
            db_goods_ids = {
                row[0] for row in my_pipeline._select_table(sql_str=sql_str)
            }
            for item in zid_list:
                zid = item[0]
                if zid in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(zid)
                goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)

                zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_pintuan.deal_with_data()

                if goods_data != {}:  # an empty dict means nothing was parsed
                    schedule = goods_data.get('schedule') or []
                    if schedule:
                        goods_data['goods_id'] = str(zid)
                        goods_data['spider_url'] = tmp_url
                        goods_data['username'] = '******'
                        goods_data['page'] = str(item[1])
                        begin_time, end_time = \
                            self.get_pintuan_begin_time_and_pintuan_end_time(
                                schedule=schedule[0])
                        goods_data['pintuan_begin_time'] = begin_time
                        goods_data['pintuan_end_time'] = end_time

                        inserted = zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if inserted:
                            # Remember the id so a later duplicate is skipped.
                            db_goods_ids.add(zid)
                    else:
                        # Without a schedule entry the begin/end times cannot
                        # be derived; skip this item instead of crashing the
                        # whole run with an IndexError.
                        print('该goods_id[{0}]缺少schedule信息, 此处跳过'.format(zid))

                # Throttle after every attempted fetch, inserted or not.
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                gc.collect()

        # Drop the parser before the final collection pass.
        del zhe_800_pintuan
        gc.collect()

        return None
Пример #2
0
def run_forever():
    '''
    Endlessly refresh the zhe800 group-buy goods stored in the database.

    Each pass runs the ``z8_delete_str_1`` statement, selects the rows to
    refresh with ``z8_select_str_2`` and, for every row, re-fetches the goods
    page.  The row is deleted when the page no longer exists or when
    ``item[1] == 1``; otherwise it is updated with the freshly parsed data.
    Between passes the loop sleeps 5 seconds, or 5.5 hours when the
    Shanghai-time hour is 0.

    :return: never returns
    '''
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(
                tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            # presumably the select yields a non-iterable when the connection
            # failed -- TODO confirm against the pipeline implementation
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # enumerate() advances the counter on every row, including rows
            # that leave the iteration early through ``continue``; a manual
            # counter would stall there and trigger a reconnect per row.
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                # A fresh parser per row, dropped again at the end of the
                # iteration, keeps memory usage down.
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived
                    # connection does not become unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # A string result starting with 'ze' is treated as "the
                    # goods page no longer exists".
                    # NOTE(review): an older comment here said this check was
                    # retired because it affected normal goods, yet it is
                    # still active -- confirm which is intended.
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        print('@@ 该商品的页面已经不存在!此处将其删除!')
                        try:
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(goods_id, ))
                        except Exception as e:
                            # Best effort: report the failure and fall through
                            # to the regular path instead of hiding it.
                            print('删除goods_id[{0}]失败: {1}'.format(goods_id, e))
                        else:
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:  # an empty dict means nothing was parsed
                        data['goods_id'] = goods_id

                        if item[1] == 1:
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(goods_id, ))
                            print('该goods_id[{0}]已过期,删除成功!'.format(goods_id))
                        else:
                            print(
                                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                % (goods_id, index))
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                del zhe_800_pintuan
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # pause the updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Пример #3
0
    def get_pintuan_goods_info(self):
        '''
        Collect the zids from the paged deals listing and insert the goods
        that are not in the database yet.

        Pages 0..99 of the ``deals.json`` endpoint are requested through
        ``self.get_url_body`` until a page returns an empty list.  Every
        listed zid that is not already stored is fetched and parsed with
        ``Zhe800PintuanParse`` and inserted through
        ``insert_into_zhe_800_pintuan_table``.

        :return: None
        '''
        zid_list = []
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            tmp_data = self.get_url_body(tmp_url=tmp_url)

            if tmp_data == []:
                print('该tmp_url得到的object为空list, 此处跳过!')
                break

            for entry in tmp_data:
                zid = entry.get('product', {}).get('zid', '')
                # Filter on the zid itself: comparing the whole (zid, page)
                # tuple with '' is always unequal and lets blank zids through.
                if zid not in ('', None):
                    zid_list.append((zid, page))

        zid_list = list(set(zid_list))
        print('该zid_list的总个数为: ', len(zid_list))
        print(zid_list)

        zhe_800_pintuan = Zhe800PintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            # Ids already stored, plus the ones handled during this run; a
            # set keeps the membership test O(1).
            known_goods_ids = {
                row[0]
                for row in my_pipeline.select_zhe_800_pintuan_all_goods_id()
            }
            for zid, page in zid_list:
                if zid in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(zid)
                goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)

                zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_pintuan.deal_with_data()

                if goods_data == {}:  # nothing was parsed, skip this zid
                    continue

                goods_data['goods_id'] = str(zid)
                goods_data['spider_url'] = tmp_url
                goods_data['username'] = '******'
                goods_data['page'] = str(page)

                zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                    data=goods_data, pipeline=my_pipeline)
                # zid_list is de-duplicated per (zid, page), so the same zid
                # can occur once per page; remember it so it is not fetched
                # and inserted a second time in this run.
                known_goods_ids.add(zid)
                sleep(.7)

        # Drop the parser before the final collection pass.
        del zhe_800_pintuan
        gc.collect()
def run_forever():
    '''
    Endlessly refresh the zhe800 group-buy goods stored in the database.

    Each pass runs the ``z8_delete_str_1`` statement, selects the rows to
    refresh with ``z8_select_str_2`` and, for every row, re-fetches the goods
    page.  Rows whose page no longer exists, or whose second column equals 1,
    are handed to ``_handle_goods_shelves_in_auto_goods_table``; all others
    are updated with the freshly parsed data.  Between passes the loop sleeps
    10 minutes, or 5.5 hours when the Shanghai-time hour is 0.

    :return: never returns
    '''
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            # presumably the select yields a non-iterable when the connection
            # failed -- TODO confirm against the pipeline implementation
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # enumerate() advances the counter on every row, including rows
            # that leave the iteration early through ``continue``; a manual
            # counter would stall there and repeat the 3-minute pause (and
            # the reconnect) for each consecutive removed item.
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                db_is_delete = item[1]
                # A fresh parser per row, dropped again at the end of the
                # iteration, keeps memory usage down.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:  # pause for 3 minutes every 300 rows
                    sleep_time = 3 * 60
                    # Announce the pause before it starts, not after.
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # A string result starting with 'ze' is treated as "the
                    # goods page no longer exists".
                    # NOTE(review): an older comment here said this check was
                    # retired because it affected normal goods, yet it is
                    # still active -- confirm which is intended.
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        try:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        except Exception as e:
                            # Best effort: report the failure and fall through
                            # to the regular path instead of hiding it.
                            print('goods_id[{0}]处理失败: {1}'.format(goods_id, e))
                        else:
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:  # an empty dict means nothing was parsed
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        if db_is_delete == 1:
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                del zhe_800_pintuan
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # pause the updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
def run_forever():
    '''
    Endlessly refresh the zhe800 group-buy goods stored in the database.

    Each pass loads the rows from ``select_zhe_800_pintuan_all_goods_id()``
    and re-fetches every goods page.  A row whose second column equals 1 is
    deleted as expired; any other row is updated with the freshly parsed
    data.  Between passes the loop sleeps 5 seconds, or 5.5 hours when the
    Shanghai-time hour is 0.

    :return: never returns
    '''
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_zhe_800_pintuan_all_goods_id())
        except TypeError:
            # presumably the select yields a non-iterable when the connection
            # failed -- TODO confirm against the pipeline implementation
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                # A fresh parser per row keeps memory usage down.
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived
                    # connection does not become unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:  # an empty dict means nothing was parsed
                        data['goods_id'] = goods_id

                        if item[1] == 1:
                            tmp_sql_server.delete_zhe_800_pintuan_expired_goods_id(
                                goods_id=goods_id)
                            print('该goods_id[{0}]已过期,删除成功!'.format(goods_id))
                        else:
                            # Only rows that were kept are updated; updating
                            # a row that was just deleted is a wasted
                            # round-trip.
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(.7)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # pause the updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()