예제 #1
0
    def _tmall_keywords_spider(self, **kwargs):
        """
        tmall对应关键字采集
        :param kwargs:
        :return:
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:
            # item为goods_url
            # 用于判断某个goods是否被插入的参数
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # 原先存在的情况
                pass
            else:
                tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20, )
                if self.sql_cli.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            if not self.check_target_data_is_legal(target_data=data):
                                return False

                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)
                        else:
                            pass

                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:
                # 仅处理goods_id被插入或者原先已存在于db中
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True
예제 #2
0
    def _tmall_keywords_spider(self, **kwargs):
        '''
        tmall对应关键字采集
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https:' + re.compile('&skuId=.*').sub('', item)
            for item in goods_id_list
        ]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item为goods_url
            result = False  # 用于判断某个goods是否被插入的参数
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # 原先存在的情况
                pass
            else:
                tmall = TmallParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # 每20次重连一次,避免单次长连无响应报错
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                            % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data[
                                'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                    type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.my_lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            result = tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # 仅处理goods_id被插入或者原先已存在于db中
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True
    async def deal_with_tmcs_goods_id_list(self):
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for item in self.db_wait_2_save_goods_id_list:
            # eg: '61864164616'
            goods_id = item

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if self.sql_cli.is_connect_success:
                # 加spm 是为了get_goods_id_from_url能筛选, id
                # goods_url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.65a47fb1yR1OUp&id={}'.format(goods_id)
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                    goods_id)
                # 下面这个goods_id为类型加goods_id的list
                goods_id = tmall.get_goods_id_from_url(goods_url)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue
                else:
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id[1], str(self.add_goods_index)))
                    tt = tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data[
                            'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                type=data['type'],
                                goods_id=goods_id,
                            )
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if len(data['all_img_url']) <= 1:
                            self.lg.info(
                                '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                            return False

                        result = tmall.old_tmall_goods_insert_into_new_table(
                            data=data, pipeline=self.sql_cli)
                        if result:
                            # 避免后续重复采集
                            self.db_existed_goods_id_list.append(goods_id)
                        else:
                            pass
                    else:
                        pass

            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('tmcs已经抓取完毕!')

        return True