示例#1
0
    def deal_with_data(self, goods_list):
        '''
        Crawl and store every group-buy (pintuan) goods that is not in the db yet.

        Goods whose id is already present in dbo.mia_pintuan (site_id=21) are
        skipped. Every other goods is fetched through MiaPintuanParse, completed
        with the fields taken from the list entry and inserted into the table.

        :param goods_list: iterable of dicts; the keys read here are
            'goods_id', 'sub_title' and 'pid'
        :return: None
        '''
        mia = MiaPintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
            # Ids are kept as str in a set: the id is stored and tracked as
            # str(goods_id), so the lookup has to use the same representation.
            known_goods_ids = {
                str(row[0])
                for row in my_pipeline._select_table(sql_str=sql_str)
            }

            for item in goods_list:
                goods_id = str(item.get('goods_id', ''))
                if goods_id in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                mia.get_goods_data(goods_id=goods_id)
                goods_data = mia.deal_with_data()

                # An empty dict means the parser returned nothing usable: skip it.
                if goods_data != {}:
                    # Pick the desktop url that matches the site the goods was
                    # crawled from. (A findall() result is a list and never
                    # equals '', so a plain substring test is used instead.)
                    if '://m.miyabaobei.hk/' in goods_data['goods_url']:
                        goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
                    else:
                        goods_url = 'https://www.mia.com/item-' + goods_id + '.html'

                    goods_data['goods_url'] = goods_url
                    goods_data['goods_id'] = goods_id
                    goods_data['sub_title'] = item.get('sub_title', '')
                    (goods_data['pintuan_begin_time'],
                     goods_data['pintuan_end_time']) = \
                        self.get_pintuan_begin_time_and_pintuan_end_time(
                            pintuan_time=goods_data['pintuan_time'])
                    goods_data['pid'] = item.get('pid')

                    inserted = mia.insert_into_mia_pintuan_table(
                        data=goods_data, pipeline=my_pipeline)
                    if inserted:
                        # Remember the new id so a duplicate entry later in
                        # goods_list is not crawled again.
                        known_goods_ids.add(goods_id)

                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl
        else:
            print('数据库连接失败,此处跳过!')

        # Drop the parser before forcing a collection.
        del mia
        gc.collect()
示例#2
0
    def run_forever(self):
        '''
        Run one refresh pass over every stored group-buy (pintuan) record.

        All rows of dbo.mia_pintuan with site_id=21 are read and, per row,
        self.is_recent_time(end_timestamp) decides what happens:

        * 0 -> the record is expired and is deleted;
        * 2 -> the record is left untouched;
        * anything else -> the list page of the row's pid is fetched and the
          goods is re-crawled and written back to the table.

        After the pass the method sleeps (5.5 hours when the Shanghai hour is
        0, otherwise 5 seconds) and returns.

        :return: None
        '''
        def _refresh_goods(parser, pipeline, goods_id, sub_title=None):
            # Re-crawl one goods and write the fresh data back to the table.
            parser.get_goods_data(goods_id=goods_id)
            goods_data = parser.deal_with_data()
            if goods_data == {}:  # nothing usable came back: skip
                return

            goods_data['goods_id'] = str(goods_id)
            if sub_title is not None:
                goods_data['sub_title'] = sub_title

            if goods_data['pintuan_time'] == {}:
                # No group-buy time means the goods left the activity; begin
                # and end are both set to the current time.
                now_time = get_shanghai_time()
                begin_time, end_time = now_time, now_time
            else:
                begin_time, end_time = \
                    self.get_pintuan_begin_time_and_pintuan_end_time(
                        pintuan_time=goods_data['pintuan_time'])
            goods_data['pintuan_begin_time'] = begin_time
            goods_data['pintuan_end_time'] = end_time

            parser.update_mia_pintuan_table(data=goods_data, pipeline=pipeline)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl

        def _update_row(item, index, pipeline):
            # Handle one db row (goods_id, miaosha_time json, pid).
            goods_id, pid = item[0], item[2]
            time_info = json.loads(item[1])
            end_timestamp = int(time.mktime(time.strptime(
                time_info.get('end_time'), '%Y-%m-%d %H:%M:%S')))

            status = self.is_recent_time(end_timestamp)
            if status == 0:
                # params must be a real tuple; "(x)" is just x in parentheses.
                pipeline._delete_table(
                    sql_str=self.delete_sql_str, params=(goods_id,))
                print(
                    '过期的goods_id为(%s)' % goods_id,
                    ', 拼团开始时间为(%s), 删除成功!' %
                    time_info.get('begin_time'))
                return
            if status == 2:
                # Skip only this row (never stop the whole pass): the rows are
                # not returned in any particular order.
                return

            print(
                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                % (goods_id, index))

            tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                pid) + '/0/'
            body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
            if body == '':
                print('获取到的body为空值! 此处跳过')
                return

            try:
                tmp_data = json.loads(body)
            except (TypeError, ValueError):
                # An unparsable body says nothing about the goods, so really
                # skip the row; treating it as an empty data_list would delete
                # the record below.
                print('json.loads转换body时出错, 此处跳过!')
                return

            page_goods = tmp_data.get('data_list', [])
            if page_goods == []:
                print('得到的data_list为[]!')
                print('该商品已被下架限时秒杀活动,此处将其删除')
                pipeline._delete_table(
                    sql_str=self.delete_sql_str, params=(goods_id,))
                print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
                return

            # Sub titles ('intro') of the page entries whose 'sku' is this goods.
            sub_titles = [
                goods.get('intro', '')
                for goods in page_goods
                if goods.get('sku', '') == goods_id
            ]

            mia_pintuan = MiaPintuanParse()
            if not sub_titles:
                # Not listed on the page any more. Such goods are still
                # updated rather than deleted, because deleting by pid was
                # found to remove goods that were still on sale.
                _refresh_goods(mia_pintuan, pipeline, goods_id)
            else:
                for sub_title in sub_titles:
                    _refresh_goods(mia_pintuan, pipeline, goods_id,
                                   sub_title=sub_title)

        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            # list() failed on the query result, i.e. nothing iterable came back.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                if index % 50 == 0:
                    # Reconnect every 50 rows so a long-lived connection does
                    # not go stale.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    _update_row(item, index, tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
示例#3
0
    def run_forever(self):
        '''
        Run one refresh pass over every stored group-buy (pintuan) record.

        The rows come from self._get_db_old_data(); each row is read as
        (goods_id, pintuan time json, pid). Per row,
        self.is_recent_time(end_timestamp) decides what happens:

        * 0 -> the goods is handed to
          _handle_goods_shelves_in_auto_goods_table with mia_update_str_7;
        * 2 -> the row is left untouched;
        * anything else -> the goods is re-crawled and written back.

        After the pass the method sleeps (5.5 hours when the Shanghai hour is
        0, otherwise 10 minutes).

        :return: None
        '''
        result = self._get_db_old_data()
        if result is None:
            sleep_time = 20
            print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
            sleep(sleep_time)

            return None

        # enumerate keeps the 1-based row counter correct on every path,
        # including the early "continue"s below.
        for index, item in enumerate(result, start=1):
            goods_id = item[0]
            pid = item[2]
            # Parsed once and reused; 'end_time' looks like 2020-04-12 00:00:00
            pintuan_time = json_2_dict(item[1])
            pintuan_end_time = datetime_to_timestamp(
                string_to_datetime(pintuan_time.get('end_time')))

            self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                                  index=index,
                                                  remainder=50)
            if self.sql_cli.is_connect_success:
                is_recent_time = self.is_recent_time(pintuan_end_time)
                if is_recent_time == 0:
                    # Price is back to normal: take the goods off the shelves.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mia_update_str_7,
                        sql_cli=self.sql_cli)
                    print('该goods拼团开始时间为({})'.format(
                        pintuan_time.get('begin_time')))
                    sleep(.4)

                elif is_recent_time == 2:
                    # Expired but still inside the waiting period: nothing is
                    # done to the row yet.
                    pass

                else:  # inside the window that has to be updated
                    print(
                        '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                        .format(goods_id, index))
                    try:
                        data_list = get_mia_pintuan_one_page_api_goods_info(
                            page_num=pid)
                    except ResponseBodyIsNullStrException:
                        sleep(.4)
                        continue

                    # NOTE: an empty data_list is deliberately not treated as
                    # "off the shelves"; doing so wrongly removed goods that
                    # were still on sale, so every goods is simply updated.
                    pintuan_goods_all_goods_id = [
                        item_1.get('goods_id', '') for item_1 in data_list
                    ]

                    mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                    if goods_id not in pintuan_goods_all_goods_id:
                        # Not listed on the page any more: still updated.
                        try:
                            goods_data = self._get_mia_pt_one_goods_info(
                                mia_pt_obj=mia_pt,
                                goods_id=goods_id,
                            )
                        except AssertionError:
                            # Skip this row when the helper's assertion fails.
                            continue

                        mia_pt.update_mia_pintuan_table(data=goods_data,
                                                        pipeline=self.sql_cli)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl

                    else:
                        # Still listed: update once per matching page entry,
                        # carrying that entry's sub title along.
                        for item_2 in data_list:
                            if item_2.get('goods_id', '') != goods_id:
                                continue

                            sub_title = item_2.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,
                                )
                            except AssertionError:
                                # Skip this entry when the helper's assertion fails.
                                continue

                            mia_pt.update_mia_pintuan_table(
                                data=goods_data, pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl

                    # Drop the parser before the collection below.
                    del mia_pt

            else:
                print('数据库连接失败,数据库可能关闭或者维护中')

            collect()

        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
示例#4
0
    def deal_with_data(self, goods_list):
        '''
        Crawl and store every group-buy (pintuan) goods that is not in the db yet.

        Goods whose id is already returned by the mia_select_str_1 query are
        skipped. Every other goods is fetched through MiaPintuanParse, completed
        with the fields taken from the list entry and inserted into the table.

        :param goods_list: iterable of dicts; the keys read here are
            'goods_id', 'sub_title' and 'pid'
        :return: None
        '''
        mia = MiaPintuanParse(is_real_times_update_call=True)
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            # Ids are kept as str in a set: the id is stored and tracked as
            # str(goods_id), so the lookup has to use the same representation.
            known_goods_ids = {
                str(row[0])
                for row in my_pipeline._select_table(sql_str=mia_select_str_1)
            }

            for item in goods_list:
                goods_id = str(item.get('goods_id', ''))
                if goods_id in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                mia.get_goods_data(goods_id=goods_id)
                goods_data = mia.deal_with_data()

                # An empty dict means the parser returned nothing usable: skip it.
                if goods_data != {}:
                    # Pick the mobile url that matches the site the goods was
                    # crawled from. (A findall() result is a list and never
                    # equals '', so a plain substring test is used instead.)
                    if '://m.miyabaobei.hk/' in goods_data['goods_url']:
                        # The hk address used to be
                        # https://www.miyabaobei.hk/item-{}.html
                        goods_url = 'https://m.miyabaobei.hk/item-{}.html'.format(
                            goods_id)
                    else:
                        goods_url = 'https://m.mia.com/item-' + goods_id + '.html'

                    goods_data['goods_url'] = goods_url
                    goods_data['goods_id'] = goods_id
                    goods_data['sub_title'] = item.get('sub_title', '')
                    (goods_data['pintuan_begin_time'],
                     goods_data['pintuan_end_time']) = \
                        get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['pintuan_time'])
                    goods_data['pid'] = item.get('pid')

                    inserted = mia.insert_into_mia_pintuan_table(
                        data=goods_data, pipeline=my_pipeline)
                    if inserted:
                        # Remember the new id so a duplicate entry later in
                        # goods_list is not crawled again.
                        known_goods_ids.add(goods_id)

                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl
        else:
            print('数据库连接失败,此处跳过!')

        # Drop the parser before forcing a collection.
        del mia
        collect()