示例#1
0
    def deal_with_data(self, *params):
        '''
        Parse the flash-sale goods passed in and store the new ones.

        Goods whose ``goods_id`` is already stored are skipped. For every other
        item the goods data is fetched, the remaining sale time is scraped from
        the mobile detail page, and the record is inserted through the pipeline.
        :param params: positional arguments; ``params[0]`` is the list of goods dicts
        :return: None
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if not my_pipeline.is_connect_success:
            print('数据库连接失败,此处跳过!')
        else:
            # A set gives O(1) membership tests while walking item_list.
            db_goods_ids = {
                row[0]
                for row in my_pipeline._select_table(sql_str=cc_select_str_2)
            }
            remaining_prefix = re.compile(r'剩余')

            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()
                if goods_data == {}:
                    # Nothing came back for this item; skip it.
                    sleep(.5)
                    continue
                if goods_data.get('is_delete', 0) == 1:
                    # is_delete == 1 means the stock is 0.
                    print('------>>>| 该商品库存为0,已被抢光!')
                    sleep(.5)
                    continue

                # A short-lived driver is used only to read the countdown
                # element, then released straight away.
                my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
                try:
                    tmp_body = my_phantomjs.get_url_body(
                        url=tmp_url, css_selector='p#activityTime span')
                finally:
                    del my_phantomjs
                    gc.collect()

                if tmp_body == '':
                    # Failed to get the full html of the mobile page.
                    sleep(.5)
                    continue

                _t = Selector(text=tmp_body).css(
                    'p#activityTime span::text').extract_first()
                # extract_first() yields None when nothing matches; guard it so
                # the substitution below cannot raise TypeError.
                _t = remaining_prefix.sub('', _t) if _t is not None else ''
                if _t == '':
                    # Without a countdown the end time cannot be computed, so
                    # the item is skipped instead of being stored with a bogus
                    # end time.
                    print('获取到的_t为空值, 严重错误! 请检查!')
                    continue

                miaosha_end_time = self.get_miaosha_end_time(_t)
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['sub_title'] = item.get('sub_title', '')
                # The begin time is taken as "now" (the moment of scraping).
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time':
                    timestamp_to_regulartime(int(time.time())),
                    'miaosha_end_time':
                    timestamp_to_regulartime(int(miaosha_end_time)),
                }
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['gender'] = str(item.get('gender', '0'))
                goods_data['page'] = item.get('page')

                res = chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                if res:
                    # Remember the id so a duplicate later in item_list is skipped.
                    db_goods_ids.add(goods_id)
                # No extra sleep here: starting phantomjs is already slow.

        del chuchujie
        gc.collect()
示例#2
0
class GX8899Spider(Crawler):
    def __init__(self, logger=None):
        '''
        Set up the crawler base class and the state used while updating rows.
        :param logger: optional logger forwarded to the base class
        '''
        crawler_options = dict(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/gx8899/_/',
            is_use_driver=True,
            driver_executable_path=PHANTOMJS_DRIVER_PATH,
        )
        super().__init__(**crawler_options)

        self._set_sort_type_name()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        # ids still waiting for an avatar, and how many have been updated so far
        self.id_list = []
        self.update_index = 0

    def _set_sort_type_name(self):
        '''
        Set the list of category names to crawl.
        :return: None
        '''
        # Disabled categories: 'weixin', 'nansheng', 'nvsheng'.
        # NOTE(review): 'weshen' may be a typo; verify against the site.
        category_names = (
            'fengjing', 'jingxuan', 'wupin', 'oumei', 'weimei', 'heibai',
            'baqi', 'xiaoqingxin', 'yijing', 'beiying', 'chouyan', 'sumiao',
            'gexing', 'xiaohai', 'qiche', 'zhiwu', 'shouhui', 'weshen',
            'mingxing', 'jianzhu', 'renwu',
        )
        self.sort_type_name_list = list(category_names)

    def _get_gx8899_all_img_url(self):
        '''
        Crawl every configured category and collect all image urls.

        The combined list is also stored on ``self.fz``.
        :return: list with the results of all categories
        '''
        self.lg.info('即将开始采集gx8899...')
        collected = []
        for category in self.sort_type_name_list:
            category_imgs = self._get_one_sort_type_name_page_info(category)
            if category_imgs != []:
                collected.extend(category_imgs)

        self.lg.info('@@@ 全部头像抓取完毕!')
        self.fz = collected

        return collected

    def _get_new_wait_2_handle_id_list(self):
        '''
        Return the ids waiting to be updated, reloading them when exhausted.

        When ``self.id_list`` is empty a new pipeline is created and up to
        1000 ids are selected. If the query result cannot be read, the method
        sleeps 8 seconds and returns an empty list.
        :return: list of ids (``self.id_list``), or [] on failure
        '''
        sql_str = '''
        select top 1000 id 
        from dbo.sina_weibo
        where sina_type = 'bilibili' and modify_time is null
        '''
        if self.id_list == []:
            self.lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                rows = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [row[0] for row in rows]
            # `except TypeError or IndexError` only caught TypeError, because
            # the `or` expression evaluates to its first operand. A tuple
            # catches both.
            except (TypeError, IndexError):
                sleep(8)
                return []

        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
        '''
        Write one avatar url to the row with the given id.
        :param id: id of the row to update
        :param img_url: image url to store
        :return: False when the update raised an Exception, otherwise True
        '''
        try:
            update_args = (img_url, get_shanghai_time(), id)
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=update_args,
                logger=self.lg
            )
        except Exception:
            succeeded = False
        else:
            succeeded = True

        return succeeded

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        '''
        Crawl every list page of one category and collect its image urls.

        Page 1 is the category root; page n (n >= 2) is ``index_n.html``.
        Crawling stops at the site's 404 page, or after the same page has
        yielded no article links several times in a row.
        :param sort_type_name: category name used in the url path
        :return: list of image urls found in the category
        '''
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        not_found = re.compile(r'<title>404 - 找不到文件或目录。</title>')
        # Upper bound on consecutive empty results for one page. Previously an
        # empty result re-requested the same url forever.
        max_empty_retries = 3
        empty_retries = 0

        page = 1
        res = []
        while True:
            if page == 1:
                url = base_url
            else:
                url = base_url + 'index_{0}.html'.format(page)

            self.lg.info('正在抓取{0}'.format(url))

            if page % 15 == 0:
                # Replace the phantomjs driver every 15 pages.
                try:
                    del self.driver
                except AttributeError:
                    pass
                gc.collect()
                self.driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.lg, ip_pool_type=self.ip_pool_type)
                self.lg.info('[+] phantomjs已重置!')

            body = self.driver.get_url_body(url=url)
            if not_found.search(body) is not None:
                # Past the last page of this category.
                break

            need = Selector(text=body).css('div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                empty_retries += 1
                if empty_retries >= max_empty_retries:
                    self.lg.error('连续{0}次获取到空list, 放弃该分类! 出错地址:{1}'.format(empty_retries, url))
                    break
                # Retry the same page (the page counter is left untouched).
                continue
            empty_retries = 0

            for article_url in need:
                article_imgs = self._get_one_article_page_info(article_url)
                if article_imgs != []:
                    res += article_imgs

                self.lg.info('#### 已更新{0}个id !'.format(self.update_index))

            page += 1

        return res

    def _get_one_article_page_info(self, url):
        '''
        Collect the image urls of one article page and assign them to ids.

        Each image url is written to a randomly chosen pending id. An id is
        removed from the pending list once its update succeeded.
        :param url: article page url
        :return: list of image urls found on the page ([] when the page is empty)
        '''
        body = self.driver.get_url_body(url=url)
        if body == '':
            self.lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []

        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        if need != []:
            self.lg.info('[+] crawl子地址success')
        else:
            self.lg.info('[-] crawl子地址fail')

        # Data update step.
        for img_url in need:
            try:
                pending_ids = self._get_new_wait_2_handle_id_list()
            except Exception:
                # Best effort: a failing reload must not abort the crawl, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                sleep(5)
                continue
            if not pending_ids:
                # Nothing left to update right now (previously detected via
                # the ValueError raised by randint(0, -1)).
                sleep(5)
                continue

            random_id_index = randint(0, len(pending_ids) - 1)
            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url,
            )
            if res:
                self.id_list.pop(random_id_index)
                self.update_index += 1

        return need

    async def _get_one_page_body(self, url, headers):
        '''
        Fetch a page body asynchronously.
        :param url: address to request
        :param headers: request headers
        :return: whatever ``AioHttp.aio_get_url_body`` resolves to
        '''
        return await AioHttp.aio_get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
        )

    def _get_loop_run_result(self, **kwargs):
        '''
        Run ``_get_one_page_body`` to completion on the event loop.
        :param kwargs: ``url`` (default '') and ``headers`` (default {})
        :return: the result of the coroutine
        '''
        event_loop = get_event_loop()
        page_coro = self._get_one_page_body(
            url=kwargs.get('url', ''),
            headers=kwargs.get('headers', {}),
        )

        return event_loop.run_until_complete(page_coro)

    def __del__(self):
        '''
        Drop the driver and logger references, then force a collection.
        '''
        # Delete each attribute independently so a missing ``driver`` no
        # longer prevents ``lg`` from being released.
        for attr_name in ('driver', 'lg'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                pass
        gc.collect()
示例#3
0
class PinduoduoSpike(object):
    def __init__(self):
        '''
        Prepare the request headers and start the browser driver.
        '''
        self._set_headers()
        pool_type = IP_POOL_TYPE
        self.ip_pool_type = pool_type
        self.driver = BaseDriver(
            executable_path=EXECUTABLE_PATH,
            ip_pool_type=pool_type,
        )

    def _set_headers(self):
        '''
        Build the default request headers and store them on ``self.headers``.
        '''
        # 'Accept-Encoding' is intentionally left out (it was disabled).
        # NOTE(review): 'Host' points at m.juanpi.com although this class
        # requests yangkeduo.com urls; confirm whether that is intended.
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
        }
        # A random user agent is picked on every call.
        headers['User-Agent'] = get_random_pc_ua()
        self.headers = headers

    def _get_db_goods_id_list(self) -> list:
        '''
        Load the goods ids that are already stored in the database.
        :return: list with the first column of every selected row
        :raises AssertionError: when the select returns None
        '''
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        rows = my_pipeline._select_table(sql_str=pd_select_str_3)
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        if rows is None:
            raise AssertionError('db_goods_id_list为None!')

        return [row[0] for row in rows]

    def get_spike_hour_goods_info(self):
        '''
        Fetch all upcoming flash-sale goods and store the ones not yet in the db.
        :return: None
        '''
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        # The driver is only needed for the listing; release it early.
        try:
            del self.driver
        except:
            pass
        gc.collect()

        pinduoduo = PinduoduoParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            self.db_goods_id_list = self._get_db_goods_id_list()
            for item in all_miaosha_goods_list:
                # Note from the original author: the list for "tomorrow"
                # fetched at 8:30 is empty because the page is still loading.
                if item.get('goods_id') == 'None':
                    print('该goods_id为"None", 此处跳过')
                    continue
                if item.get('goods_id', '') in self.db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                goods_id = item.get('goods_id')
                tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + goods_id
                pinduoduo.get_goods_data(goods_id=goods_id)
                goods_data = pinduoduo.deal_with_data()

                if goods_data == {}:
                    # Empty result: leave it for the next run.
                    print('得到的goods_data为空值,此处先跳过,下次遍历再进行处理')
                else:
                    stock_info = item.get('stock_info')
                    miaosha_time = item.get('miaosha_time')
                    goods_data['stock_info'] = stock_info
                    goods_data['goods_id'] = goods_id
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    # price: regular special price before the sale
                    goods_data['price'] = item.get('price')
                    # taobao_price: flash-sale price
                    goods_data['taobao_price'] = item.get('taobao_price')
                    goods_data['sub_title'] = item.get('sub_title', '')
                    goods_data['miaosha_time'] = miaosha_time
                    begin_and_end = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item.get('miaosha_time'))
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = begin_and_end

                    remaining = item.get('stock_info', {}).get('activity_stock', 0)
                    if remaining <= 2:
                        # Two or fewer left is treated as sold out.
                        print('该秒杀商品已售罄...')
                        goods_data['is_delete'] = 1

                    pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(
                        data=goods_data, pipeline=my_pipeline)
                sleep(PINDUODUO_SLEEP_TIME)

            sleep(5)

        del pinduoduo
        gc.collect()

    def get_all_miaosha_goods_list(self):
        '''
        Fetch the today / tomorrow / later flash-sale listings and merge them.
        :return: combined list of parsed goods dicts, in that order
        '''
        def extract_json_text(body):
            '''Return the text between <body> tags, or '{}' when absent.'''
            found = re.compile(r'<body>(.*)</body>').findall(body)
            if found:
                return found[0]
            print('获取all_miaosha_goods_list出现索引异常!')
            return '{}'

        # (fetch message, "empty" message, api url) for each listing
        listings = (
            ('待爬取的今日限时秒杀数据的地址为: ',
             '今日秒杀的items为[]',
             'http://apiv4.yangkeduo.com/api/spike/v2/list/today?page=0&size=2000'),
            ('待爬取的明日限时秒杀数据的地址为: ',
             '明日秒杀的items为[]',
             'http://apiv4.yangkeduo.com/api/spike/v2/list/tomorrow?page=0&size=2000'),
            ('待爬取的未来限时秒杀数据的地址为: ',
             '未来秒杀的items为[]',
             'http://apiv4.yangkeduo.com/api/spike/v2/list/all_after?page=0&size=2000'),
        )

        # First download and decode all three listings ...
        decoded = []
        for fetch_msg, _, api_url in listings:
            print(fetch_msg, api_url)
            json_text = extract_json_text(
                body=self.driver.get_url_body(url=api_url))
            decoded.append(self.json_to_dict(tmp_data=json_text))
            sleep(PINDUODUO_SLEEP_TIME)

        # ... then turn each of them into goods dicts and merge.
        all_miaosha_goods_list = []
        for (_, empty_msg, _), raw_items in zip(listings, decoded):
            if raw_items != []:
                all_miaosha_goods_list.extend(
                    self.get_miaoshao_goods_info_list(data=raw_items))
            else:
                print(empty_msg)
        print('当前所有限时秒杀商品list为: ', all_miaosha_goods_list)

        return all_miaosha_goods_list

    def json_to_dict(self, tmp_data):
        '''
        Decode a listing response and return its ``items`` entry.
        :param tmp_data: JSON text of the response
        :return: value stored under 'items' ([] when missing), or [] when the
                 text is not valid JSON or does not decode to a dict
        '''
        try:
            data = json.loads(tmp_data).get('items', [])
        # ValueError: invalid JSON; TypeError: non-text input;
        # AttributeError: the decoded value is not a dict.
        # The unused 'times' list is no longer built: it was never read, and a
        # bad value in it discarded an otherwise valid 'items' list.
        except (ValueError, TypeError, AttributeError):
            print('json.loads转换data的时候出错,data为空')
            data = []
        return data

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful fields of every flash-sale entry.

        Only entries whose start hour is in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST
        are kept.
        :param data: iterable of raw listing entries
        :return: list of goods dicts
        '''
        miaosha_goods_list = []
        for entry in data:
            info = entry.get('data', {})
            start_ts = int(info.get('start_time'))
            begin_time = str(timestamp_to_regulartime(start_ts))
            # characters -8..-6 of the formatted time are read as the hour
            start_hour = begin_time[-8:-6]
            if start_hour not in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
                continue

            # Sales starting at one of the "begin" hours last 30 minutes,
            # all others last 60 minutes.
            if start_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
                duration = 60 * 30
            else:
                duration = 60 * 60
            end_time = str(timestamp_to_regulartime(start_ts + duration))

            total_quantity = info.get('all_quantity', 0)
            miaosha_goods_list.append({
                'miaosha_time': {
                    'miaosha_begin_time': begin_time,
                    'miaosha_end_time': end_time,
                },
                'goods_id': str(info.get('goods_id')),
                # stock left in the sale and total stock
                'stock_info': {
                    'activity_stock':
                    int(total_quantity - info.get('sold_quantity', 0)),
                    'stock': total_quantity,
                },
                # the raw prices are divided by 100 (original price, sale price)
                'price': round(float(info.get('normal_price', '0')) / 100, 2),
                'taobao_price': round(float(info.get('price', '0')) / 100, 2),
            })
        return miaosha_goods_list

    def __del__(self):
        try:
            del self.driver
        except:
            pass
        gc.collect()
示例#4
0
class Pinduoduo_Miaosha_Real_Time_Update(object):
    def __init__(self):
        '''
        Prepare headers, the delete statement and the browser driver.
        '''
        self._set_headers()
        self.delete_sql_str = pd_delete_str_1
        pool_type = IP_POOL_TYPE
        self.ip_pool_type = pool_type
        self.driver = BaseDriver(
            executable_path=EXECUTABLE_PATH,
            ip_pool_type=pool_type,
        )

    def _set_headers(self):
        '''
        Build the default request headers and store them on ``self.headers``.
        '''
        # 'Accept-Encoding' is intentionally left out (it was disabled).
        # NOTE(review): 'Host' points at m.juanpi.com although this class
        # requests yangkeduo.com urls; confirm whether that is intended.
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
        }
        # A random user agent is picked on every call.
        headers['User-Agent'] = get_random_pc_ua()
        self.headers = headers

    def run_forever(self):
        '''
        Run one refresh pass over the flash-sale rows stored in the database.

        Each row is classified with ``is_recent_time`` from its stored end time:
        0 -> the row is deleted; 2 -> the row is left untouched; otherwise the
        row is deleted when its goods_id is no longer in the current listing,
        or refreshed from the listing when it still is.
        After the pass the method sleeps 5.5 hours when the Shanghai-time hour
        is 0, otherwise 3 minutes.
        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=pd_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            pinduoduo_miaosha = PinduoduoParse()

            all_miaosha_goods_list = self.get_all_miaosha_goods_list()
            # goods ids currently listed; a set keeps the lookup O(1)
            listed_goods_ids = {
                goods.get('goods_id') for goods in all_miaosha_goods_list
            }

            for item in result:
                # item[0] is the goods_id, item[1] a JSON text holding the
                # stored 'miaosha_end_time'.
                goods_id = item[0]
                end_time_str = json.loads(item[1]).get('miaosha_end_time')
                # int() of the float replaces slicing the first 10 characters
                # of its string form, which only worked for 10-digit timestamps.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                # presumably reconnects every `remainder` rows; verify against
                # _block_get_new_db_conn
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if not sql_cli.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    continue

                # Classify once so both branch tests see the same clock value.
                status = self.is_recent_time(miaosha_end_time)
                if status == 0:
                    # params is a real 1-tuple now; `(item[0])` was just the
                    # bare value.
                    sql_cli._delete_table(sql_str=self.delete_sql_str,
                                          params=(goods_id,))
                    print('过期的goods_id为(%s)' % goods_id,
                          ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_str)
                    sleep(.3)

                elif status == 2:
                    # Skip rather than break: rows are not ordered by time.
                    pass

                else:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))

                    if goods_id not in listed_goods_ids:
                        # No longer part of the flash-sale listing.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(goods_id,))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                              goods_id)
                        sleep(.3)
                    else:
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') == goods_id:
                                self._update_one_miaosha_goods(
                                    parser=pinduoduo_miaosha,
                                    goods_id=goods_id,
                                    miaosha_item=item_1,
                                    pipeline=sql_cli)

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(3 * 60)
        gc.collect()

    def _update_one_miaosha_goods(self, parser, goods_id, miaosha_item, pipeline):
        '''
        Refresh one stored row from its entry in the current listing.
        :param parser: PinduoduoParse instance used to fetch and store the data
        :param goods_id: id of the goods to refresh
        :param miaosha_item: matching dict from the current listing
        :param pipeline: database pipeline handed to the update call
        :return: None
        '''
        parser.get_goods_data(goods_id=goods_id)
        goods_data = parser.deal_with_data()

        if goods_data != {}:
            stock_info = miaosha_item.get('stock_info')
            goods_data['stock_info'] = stock_info
            goods_data['goods_id'] = miaosha_item.get('goods_id')
            if stock_info.get('activity_stock') > 0:
                # Prices are only overwritten while sale stock remains.
                goods_data['price'] = miaosha_item.get('price')
                goods_data['taobao_price'] = miaosha_item.get('taobao_price')
            goods_data['sub_title'] = miaosha_item.get('sub_title', '')
            goods_data['miaosha_time'] = miaosha_item.get('miaosha_time')
            goods_data['miaosha_begin_time'], goods_data[
                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=miaosha_item.get('miaosha_time'))

            if stock_info.get('activity_stock') <= 1:
                # One or fewer left is treated as sold out.
                print('该秒杀商品已售罄...')
                goods_data['is_delete'] = 1

            parser.to_update_pinduoduo_xianshimiaosha_table(
                data=goods_data, pipeline=pipeline)
        sleep(PINDUODUO_SLEEP_TIME)

    def get_all_miaosha_goods_list(self):
        def get_data(body):
            '''处理返回的body'''
            _ = '{}'
            try:
                _ = re.compile(r'<body>(.*)</body>').findall(body)[0]
            except IndexError:
                print('获取all_miaosha_goods_list出现索引异常!')

            return _

        # 今日秒杀
        tmp_url = 'http://apiv4.yangkeduo.com/api/spike/v2/list/today?page=0&size=2000'
        # print('待爬取的今日限时秒杀数据的地址为: ', tmp_url)
        today_data = get_data(body=self.driver.get_url_body(url=tmp_url))
        today_data = self.json_to_dict(tmp_data=today_data)

        # 明日的秒杀
        tmp_url_2 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/tomorrow?page=0&size=2000'
        # print('待爬取的明日限时秒杀数据的地址为: ', tmp_url_2)
        tomorrow_data = get_data(body=self.driver.get_url_body(url=tmp_url_2))
        tomorrow_data = self.json_to_dict(tmp_data=tomorrow_data)

        # 未来的秒杀
        tmp_url_3 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/all_after?page=0&size=2000'
        # print('待爬取的未来限时秒杀数据的地址为: ', tmp_url_3)
        all_after_data = get_data(body=self.driver.get_url_body(url=tmp_url_3))
        all_after_data = self.json_to_dict(tmp_data=all_after_data)

        if today_data != []:
            today_miaosha_goods_list = self.get_miaoshao_goods_info_list(
                data=today_data)
            # print('今日限时秒杀的商品list为: ', today_miaosha_goods_list)

        else:
            today_miaosha_goods_list = []
            print('今日秒杀的items为[]')

        if tomorrow_data != []:
            tomorrow_miaosha_goods_list = self.get_miaoshao_goods_info_list(
                data=tomorrow_data)
            # print('明日限时秒杀的商品list为: ', tomorrow_miaosha_goods_list)

        else:
            tomorrow_miaosha_goods_list = []
            print('明日秒杀的items为[]')

        if all_after_data != []:
            all_after_miaosha_goods_list = self.get_miaoshao_goods_info_list(
                data=all_after_data)
            # print('未来限时秒杀的商品list为: ', all_after_miaosha_goods_list)

        else:
            all_after_miaosha_goods_list = []
            print('未来秒杀的items为[]')

        all_miaosha_goods_list = today_miaosha_goods_list
        for item in tomorrow_miaosha_goods_list:
            all_miaosha_goods_list.append(item)
        for item in all_after_miaosha_goods_list:
            all_miaosha_goods_list.append(item)
        # print('当前所有限时秒杀商品list为: ', all_miaosha_goods_list)

        return all_miaosha_goods_list

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful fields of every flash-sale entry.

        Only entries whose start hour is in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST
        are kept.
        :param data: iterable of raw listing entries
        :return: list of goods dicts
        '''
        miaosha_goods_list = []
        for entry in data:
            info = entry.get('data', {})
            start_ts = int(info.get('start_time'))
            begin_time = str(timestamp_to_regulartime(start_ts))
            # characters -8..-6 of the formatted time are read as the hour
            start_hour = begin_time[-8:-6]
            if start_hour not in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
                continue

            # Sales starting at one of the "begin" hours last 30 minutes,
            # all others last 60 minutes.
            if start_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
                duration = 60 * 30
            else:
                duration = 60 * 60
            end_time = str(timestamp_to_regulartime(start_ts + duration))

            total_quantity = info.get('all_quantity', 0)
            miaosha_goods_list.append({
                'miaosha_time': {
                    'miaosha_begin_time': begin_time,
                    'miaosha_end_time': end_time,
                },
                'goods_id': str(info.get('goods_id')),
                # stock left in the sale and total stock
                'stock_info': {
                    'activity_stock':
                    int(total_quantity - info.get('sold_quantity', 0)),
                    'stock': total_quantity,
                },
                # the raw prices are divided by 100 (original price, sale price)
                'price': round(float(info.get('normal_price', '0')) / 100, 2),
                'taobao_price': round(float(info.get('price', '0')) / 100, 2),
            })
        return miaosha_goods_list

    def json_to_dict(self, tmp_data):
        '''
        Decode a listing response and return its ``items`` entry.
        :param tmp_data: JSON text of the response
        :return: value stored under 'items' ([] when missing), or [] when the
                 text is not valid JSON or does not decode to a dict
        '''
        try:
            data = json.loads(tmp_data).get('items', [])
        # ValueError: invalid JSON; TypeError: non-text input;
        # AttributeError: the decoded value is not a dict.
        # The unused 'times' list is no longer built: it was never read, and a
        # bad value in it discarded an otherwise valid 'items' list.
        except (ValueError, TypeError, AttributeError):
            print('json.loads转换data的时候出错,data为空')
            data = []
        return data

    def is_recent_time(self, timestamp):
        '''
        Classify a timestamp relative to the current time.
        :param timestamp: unix timestamp (anything ``int()`` accepts)
        :return: 0 when it lies 86400 seconds or more in the past,
                 1 when it lies within the next 7200 seconds,
                 2 in every other case
        '''
        seconds_ahead = int(timestamp) - int(time.time())

        if seconds_ahead <= -86400:
            # Ended at least a day ago; the delay presumably gives the backend
            # time to unlist the goods first (verify against the caller).
            return 0
        if 0 < seconds_ahead <= 7200:
            # Ends within the next two hours: needs a refresh.
            return 1
        # Ended less than a day ago, or more than two hours away.
        return 2

    def __del__(self):
        try:
            del self.driver
        except:
            pass
        gc.collect()
示例#5
0
class MoGuJiePinTuan(object):
    def __init__(self):
        '''Set up the proxy-pool type, the category table and the request headers.'''
        self.ip_pool_type = IP_POOL_TYPE
        self._set_fcid_dict()
        self._set_headers()

    def _set_headers(self):
        '''Build the default request headers and store them on ``self.headers``.'''
        request_headers = {}
        request_headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        # Accept-Encoding (gzip) is intentionally left out, as in the original.
        request_headers['Accept-Language'] = 'zh-CN,zh;q=0.8'
        request_headers['Cache-Control'] = 'max-age=0'
        request_headers['Connection'] = 'keep-alive'
        request_headers['Host'] = 'api.mogujie.com'
        request_headers['Referer'] = 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92'
        # a random desktop user agent for this instance
        request_headers['User-Agent'] = get_random_pc_ua()
        self.headers = request_headers

    def _set_fcid_dict(self):
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Crawl the time-limited group-buy (pintuan) listings of every category
        in ``self.fcid_dict`` and pass the collected items to
        ``self.deal_with_data``.

        Pages 1..99 of each category are requested through a PhantomJS driver.
        Paging of a category stops at the first page whose ``docs`` list is
        empty; a page whose body cannot be parsed is reported and skipped.

        :return: None
        '''
        # The mobile (h5 ``mwp.darwin.get``) endpoint is deliberately not used:
        # the original author noted that its request signature could not be
        # reproduced, so the PC search endpoint is crawled instead.
        goods_list = []
        # The endpoint's JSON arrives wrapped in a <pre> element by the browser.
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

        self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
        for key, fcid in self.fcid_dict.items():
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # Recycle the driver on every fifth page.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)

                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid
                )
                # Plain requests calls get filtered by the site (they worked at
                # first), hence the PhantomJS driver.
                body = self.my_phantomjs.get_url_body(url=tmp_url)

                try:
                    tmp_data = json.loads(pre_pattern.findall(body)[0])
                    tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                except (IndexError, TypeError, ValueError, AttributeError):
                    # no <pre> payload, non-text body, malformed JSON, or a
                    # JSON document that is not an object
                    print('json.loads转换body时出错, 请检查')
                    continue

                if not tmp_item_list:
                    # no more group-buy items in this category
                    break

                # The crawl time is recorded as the start of the group-buy.
                begin_time_timestamp = int(time.time())
                begin_time = timestamp_to_regulartime(timestamp=begin_time_timestamp)
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time': begin_time,
                        'end_time': timestamp_to_regulartime(self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                goods_list.extend(item_list)

                sleep(MOGUJIE_SLEEP_TIME)

        # Persist everything that was collected.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Fetch the details of every crawled group-buy item that is not stored
        yet and insert it into the database.

        :param params: positional arguments; ``params[0]`` is the list of item
                       dicts built by ``get_pintuan_goods_info``
        :return: None
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if not my_pipeline.is_connect_success:
            print('数据库连接失败,此处跳过!')
        else:
            db_goods_id_list = [row[0] for row in list(my_pipeline._select_table(sql_str=mg_select_str_1))]
            print(db_goods_id_list)
            # Ids are compared as strings on both sides, and a set keeps the
            # membership test O(1) for every crawled item.
            known_goods_ids = {str(db_goods_id) for db_goods_id in db_goods_id_list}

            for item in goods_list:
                goods_id = str(item.get('goods_id', ''))
                if goods_id in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://shop.mogujie.com/detail/' + goods_id

                mogujie.get_goods_data(goods_id=goods_id)
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:
                    # nothing could be parsed for this item
                    continue

                # Normalise the parsed data and attach the listing metadata.
                goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = goods_id
                goods_data['pintuan_time'] = item.get('pintuan_time', {})
                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('pintuan_time', {}))
                goods_data['all_sell_count'] = item.get('all_sell_count', '')
                goods_data['fcid'] = str(item.get('fcid'))
                goods_data['page'] = str(item.get('page'))
                goods_data['sort'] = str(item.get('sort', ''))

                _r = mogujie.insert_into_mogujie_pintuan_table(data=goods_data, pipeline=my_pipeline)
                if _r:
                    # remember the id so a duplicate later in this run is skipped
                    known_goods_ids.add(goods_id)

                sleep(MOGUJIE_SLEEP_TIME)  # throttle

        del mogujie
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        处理并得到拼团结束时间
        :param begin_time: 秒杀开始时间戳
        :param left_time: 剩余时间字符串
        :return: end_time 时间戳(int)
        '''
        # 'leftTimeOrg': '6天13小时'
        # 'leftTimeOrg': '13小时57分'

        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)

        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:    # left_time 格式为 '6天13小时'
            day, hour, min = int(tmp[0]), int(tmp[1]), 0

        elif had_day == [] and had_hour != []:  # left_time 格式为 '13小时57分'
            day, hour, min = 0, int(tmp[0]), int(tmp[1])

        elif had_day == [] and had_hour == []:  # left_time 格式为 '36分'
            print('left_time = ', left_time)
            day, hour, min = 0, 0, int(tmp[0])

        else:               # 无天, 小时, 分
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, min = 0, 0, 0

        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            min * 60

        return begin_time + left_end_time_timestamp

    def __del__(self):
        try: del self.my_phantomjs
        except: pass
        gc.collect()
示例#6
0
class Zhe800Spike(object):
    def __init__(self):
        '''Set up the proxy-pool type, the request headers and a PhantomJS driver.'''
        self.ip_pool_type = IP_POOL_TYPE
        self.headers = self._get_pc_headers()
        driver_options = {
            'executable_path': PHANTOMJS_DRIVER_PATH,
            'ip_pool_type': self.ip_pool_type,
        }
        self.my_phantomjs = BaseDriver(**driver_options)

    @staticmethod
    def _get_pc_headers():
        '''Return a new dict of desktop request headers with a random user agent.'''
        pc_headers = dict([
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            # Accept-Encoding (gzip) is intentionally left out, as in the original.
            ('Accept-Language', 'zh-CN,zh;q=0.8'),
            ('Cache-Control', 'max-age=0'),
            ('Connection', 'keep-alive'),
            ('Host', 'zhe800.com'),
            ('User-Agent', get_random_pc_ua()),
        ])
        return pc_headers

    @staticmethod
    def _get_begin_times_timestamp(data) -> int:
        _ = str(
            data.get('data', {}).get('blocks',
                                     [])[0].get('deal',
                                                {}).get('begin_time', ''))[:10]
        if _ != '':
            pass
        elif data.get('data', {}).get('blocks', [])[0].get('showcase',
                                                           {}) != {}:  # 未来时间
            print('*** 未来时间 ***')
            # pprint(data.get('data', {}))
            _ = str(
                data.get('data', {}).get('blocks',
                                         [])[1].get('deal',
                                                    {}).get('begin_time',
                                                            ''))[:10]
        else:
            raise Exception
        begin_times_timestamp = int(
            _)  # 将如 "2017-09-28 10:00:00"的时间字符串转化为时间戳,然后再将时间戳取整

        return begin_times_timestamp

    def _get_db_goods_id_list(self, my_pipeline) -> list:
        '''
        Return the first column of every row selected by ``z8_select_str_5``.

        :param my_pipeline: database pipeline exposing ``_select_table``
        :return: list holding the first field of each returned row
        '''
        rows = my_pipeline._select_table(sql_str=z8_select_str_5)
        return [row[0] for row in list(rows)]

    def get_spike_hour_goods_info(self):
        '''
        Walk the session ids from BASE_SESSION_ID up to (excluding)
        MAX_SESSION_ID in steps of 2, and store the spike goods of every
        session whose start time passes ``self.is_recent_time``.

        :return: None
        '''
        # A range() replaces the manual counter that had to be advanced on
        # every exit path of the old while-loop.
        for base_session_id in range(BASE_SESSION_ID, MAX_SESSION_ID, 2):
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(
                base_session_id=base_session_id)
            sleep(.5)

            blocks = data.get('data', {}).get('blocks', [])
            if blocks == []:
                # nothing was collected for this session id
                continue

            try:
                begin_times_timestamp = self._get_begin_times_timestamp(data)
            except Exception as e:
                print('遇到严重错误: ', e)
                continue

            print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
            if not self.is_recent_time(timestamp=begin_times_timestamp):
                # the session start time is outside the accepted window
                continue

            try:
                deals = [block.get('deal', {}) for block in blocks]
            except Exception as e:
                print('遇到严重错误: ', e)
                continue

            # ``deals`` cannot be empty here because ``blocks`` is not, so the
            # old "no data for this session id" branch was unreachable.
            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

            zhe_800 = Zhe800Parse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                db_goods_id_list = self._get_db_goods_id_list(my_pipeline)
                for item in miaosha_goods_list:
                    if item.get('zid', '') in db_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        continue

                    tmp_url = 'https://shop.zhe800.com/products/' + str(
                        item.get('zid', ''))
                    goods_id = zhe_800.get_goods_id_from_url(tmp_url)

                    zhe_800.get_goods_data(goods_id=goods_id)
                    goods_data = zhe_800.deal_with_data()
                    if goods_data == {}:
                        # nothing could be parsed for this item
                        continue

                    # Attach the spike-specific fields taken from the listing.
                    goods_data['stock_info'] = item.get('stock_info')
                    goods_data['goods_id'] = str(item.get('zid'))
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['price'] = item.get('price')
                    goods_data['taobao_price'] = item.get('taobao_price')
                    goods_data['sub_title'] = item.get('sub_title')
                    goods_data['miaosha_time'] = item.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item.get('miaosha_time'))
                    goods_data['session_id'] = str(base_session_id)

                    res = zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=my_pipeline)
                    if res and goods_id not in db_goods_id_list:
                        db_goods_id_list.append(goods_id)

                    sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle

                sleep(4)

            del zhe_800
            gc.collect()

    def _get_one_session_id_data(self, base_session_id) -> dict:
        '''
        Collect the blocks of one spike session across its result pages.

        Pages 1..19 are requested until the API reports an invalid session or
        ``has_next`` is false. The blocks of the page that reports
        ``has_next == False`` are kept as well; previously the loop stopped
        before storing them, which lost the last page of every session.

        :param base_session_id: session id to query
        :return: ``{'data': {'blocks': [...]}}`` with the blocks of all pages
        '''
        # The JSON payload arrives wrapped in a <pre> element by the browser.
        pre_pattern = re.compile(r'<pre.*?>(.*)</pre>')
        blocks = []
        for _page in range(1, 20):
            # per_page has to stay at 20; per the original author's note the
            # API returns no data for other values.
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page={1}&per_page=20'.format(
                str(base_session_id),
                _page,
            )
            body = self.my_phantomjs.get_url_body(url=tmp_url, )
            try:
                data = json_2_dict(
                    pre_pattern.findall(body)[0],
                    default_res={})
            except Exception:
                # no <pre> payload or an undecodable body: try the next page
                sleep(.3)
                continue

            if data.get('msg', '') == '无效场次':
                print('该session_id不存在,此处跳过')
                break

            page_data = data.get('data', {})
            blocks.extend(page_data.get('blocks', []))

            if not page_data.get('has_next', True):
                print('该session_id没有下页了!!')
                break
            print('正在抓取该session_id的第 {0} 页...'.format(_page))

            sleep(.3)

        return {
            'data': {
                'blocks': blocks,
            }
        }

    def get_miaoshao_goods_info_list(self, data):
        '''
        得到秒杀商品有用信息
        :param data: 待解析的data
        :return: 有用信息list
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # 秒杀开始时间和结束时间
            try:
                tmp['miaosha_time'] = {
                    'miaosha_begin_time':
                    timestamp_to_regulartime(
                        int(str(item.get('begin_time'))[:10])),
                    'miaosha_end_time':
                    timestamp_to_regulartime(
                        int(str(item.get('end_time'))[:10])),
                }
            except ValueError:
                continue

            # 折800商品地址
            tmp['zid'] = item.get('zid')
            # 限时秒杀的库存信息
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock',
                                           0),  # activity_stock为限时抢的剩余数量
                'stock': item.get('stock', 0),  # stock为限时秒杀的总库存
            }
            tmp['price'] = float(item.get('list_price'))
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
            # pprint(miaosha_goods_list)

        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        判断是否在指定的日期差内
        :param timestamp: 时间戳
        :return: True or False
        '''
        time_1 = int(timestamp)
        time_2 = time.time()  # 当前的时间戳
        time_1 = time.localtime(time_1)
        time_2 = time.localtime(time_2)
        if time_1.tm_year > time_2.tm_year:
            print('** 该年份为未来时间年份 **')
            if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # 规定到SPIDER_START_HOUR点到SPIDER_END_HOUR点的商品信息
                print('合法时间')
                # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                return True
            else:
                print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR,
                                                       SPIDER_END_HOUR))
                return False

        if time_1.tm_year == time_2.tm_year:
            if time_1.tm_mon > time_2.tm_mon:  # 先处理得到的time_1的月份大于当前月份的信息(即未来月份的)
                print('** 该月份为未来时间月份 **')
                if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # 规定到SPIDER_START_HOUR点到SPIDER_END_HOUR点的商品信息
                    print('合法时间')
                    # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                    return True
                else:
                    print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                        SPIDER_START_HOUR, SPIDER_END_HOUR))
                    return False

            if time_1.tm_mon >= time_2.tm_mon:  # 如果目标时间的月份时间 >= 当前月份(月份合法, 表示是当前月份或者是今年其他月份)
                if time_1.tm_mday >= time_2.tm_mday - 2:  # 这样能抓到今天的前两天的信息
                    if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # 规定到SPIDER_START_HOUR点到SPIDER_END_HOUR点的商品信息
                        print('合法时间')
                        # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                        return True
                    else:
                        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                            SPIDER_START_HOUR, SPIDER_END_HOUR))
                        return False
                else:
                    print('该日时间已过期, 此处跳过')
                    return False
            else:  # 月份过期
                print('该月份时间已过期,此处跳过')
                return False

        else:
            print('非本年度的限时秒杀时间,此处跳过')
            return False

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
示例#7
0
class MoGuJiePinTuanRealTimesUpdate(object):
    def __init__(self):
        '''Set up the proxy-pool type and the request headers.'''
        self.ip_pool_type = IP_POOL_TYPE
        self._set_headers()

    def _set_headers(self):
        '''Build the default request headers and store them on ``self.headers``.'''
        request_headers = {}
        request_headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        # Accept-Encoding and Referer are intentionally not sent, as in the original.
        request_headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
        request_headers['Cache-Control'] = 'max-age=0'
        request_headers['Connection'] = 'keep-alive'
        request_headers['Host'] = 'list.mogujie.com'
        # a random desktop user agent for this instance
        request_headers['User-Agent'] = get_random_pc_ua()
        self.headers = request_headers

    def run_forever(self):
        '''
        Run one refresh pass over the stored group-buy rows, then sleep.

        Steps:
          1. run the ``mg_delete_str_2`` statement and load the rows selected
             by ``mg_select_str_2``;
          2. for every row, classify its group-buy end time with
             ``self.is_recent_time``: 0 -> mark the goods as off the shelf,
             2 -> leave untouched, anything else -> re-crawl the listing page
             the goods was found on and update the stored record;
          3. sleep 5.5 hours when the Shanghai hour is 0, otherwise 10 minutes.

        Despite the name there is no outer loop in this method; presumably the
        caller invokes it repeatedly -- TODO confirm.

        Each row is read as: [0] goods_id, [1] JSON text holding
        ``begin_time`` / ``end_time``, [2] fcid, [3] page.

        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # per the message below, a TypeError here means the DB connection failed
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:  # refresh every stored row
                goods_id = item[0]
                # item[1] is JSON text; end_time is formatted '%Y-%m-%d %H:%M:%S'
                pintuan_end_time = json.loads(item[1]).get('end_time')
                # convert to a timestamp in seconds (first 10 chars of the
                # float rendered by time.mktime)
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # recycle the PhantomJS driver on every eighth row
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                # presumably hands back a fresh DB connection every
                # `remainder` rows -- TODO confirm against the helper
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        # result 0: take the goods off the shelf
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli,
                        )
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 拼团开始时间为(%s), 逻辑删除成功!' %
                            json.loads(item[1]).get('begin_time'))
                        sleep(.3)

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # NOTE(review): is_recent_time is evaluated a second
                        # time here; it is not visible in this chunk, but if
                        # it is clock-based the two calls could disagree.
                        pass  # must be pass, not break: the rows returned by the DB are not ordered

                    else:  # remaining result: the goods is inside the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        # listing page the goods was found on: page=item[3], fcid=item[2]
                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])

                        # plain requests cannot fetch this (certificate
                        # verification is involved), so PhantomJS is used
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.get_url_body(url=tmp_url)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                # the JSON payload is wrapped in a <pre> element
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            # NOTE(review): a parse failure above leaves
                            # tmp_data == {}, which falls into this branch and
                            # takes the goods off the shelf -- confirm intended.
                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                print('得到的docs为[]!')
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=mg_update_str_5,
                                    sql_cli=sql_cli,
                                )
                                sleep(.3)

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])

                                # the current time is taken as the group-buy start
                                begin_time_timestamp = int(
                                    time.time())
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]

                                # ids of all goods currently on that listing page
                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # Goods missing from the listing page count as
                                # "internally delisted"; per the author's note
                                # below they are still on sale, so only the
                                # goods data is refreshed, not the shelf times.
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if goods_id not in pintuan_goods_all_goods_id:
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalise before writing
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = goods_id
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=sql_cli)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle

                                else:  # still present on the listing page
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == goods_id:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=goods_id)
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}: pass
                                            else:
                                                # normalise and attach the
                                                # freshly crawled time window
                                                # and sales count
                                                goods_data[
                                                    'goods_id'] = goods_id
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=sql_cli)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # throttle

                                        else:
                                            pass

                else:
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight: long pause
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from its start timestamp and
        the remaining-time text.

        The text is expected to look like '6天13小时', '13小时57分' or '36分'.
        Each unit (天 = days, 小时 = hours, 分 = minutes) is parsed on its
        own, so any subset of the three units is accepted and a missing
        unit simply counts as 0.

        :param begin_time: start timestamp, in seconds
        :param left_time: remaining-time string, e.g. '6天13小时'
        :return: begin_time plus the parsed duration, in seconds
        '''
        def _amount(unit):
            # Number written directly in front of the unit marker,
            # or None when that unit does not occur in the text.
            found = re.search(r'(\d+)\s*' + unit, left_time)
            return int(found.group(1)) if found else None

        day = _amount('天')
        hour = _amount('小时')
        minute = _amount('分')

        if day is None and hour is None and minute is None:
            # No recognised unit at all. Keep the historical behaviour of
            # reading a bare leading number as minutes; otherwise report the
            # unparseable text and add nothing to begin_time.
            bare_number = re.search(r'\d+', left_time)
            if bare_number:
                minute = int(bare_number.group())
            else:
                print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)

        left_seconds = \
            (day or 0) * 24 * 60 * 60 + \
            (hour or 0) * 60 * 60 + \
            (minute or 0) * 60

        return begin_time + left_seconds

    def is_recent_time(self, timestamp):
        '''
        Classify a timestamp relative to the current Shanghai time.

        :param timestamp: timestamp to classify (anything int() accepts)
        :return: 0 when the timestamp is more than 24 hours in the past,
                 1 when the timestamp is still ahead of the current time,
                 2 when the timestamp has passed by at most 24 hours
        '''
        target_ts = int(timestamp)
        now_ts = int(datetime_to_timestamp(get_shanghai_time()))
        seconds_ahead = target_ts - now_ts

        if seconds_ahead > 0:
            # Not reached yet: still inside the to-be-updated window.
            return 1

        # The 24-hour (86400 s) grace period lets the backend take the
        # item off the shelf in sync before it is treated as expired.
        if seconds_ahead < -86400:
            return 0

        # Already passed, but still within the grace period: keep waiting,
        # no delete-related handling yet.
        return 2

    def __del__(self):
        '''
        Drop the reference to the my_phantomjs attribute (if the instance
        has one) and then run a garbage-collection pass.
        '''
        try:
            del self.my_phantomjs
        except AttributeError:
            # The attribute was never set (or is already gone): nothing
            # to release. Other exceptions are deliberately not swallowed.
            pass
        gc.collect()