Exemplo n.º 1
0
def _deal_with_jd_goods(goods_link, my_lg):
    '''
    Crawl a single JD goods page, store it and return API-shaped data.

    :param goods_link: goods link from which the goods_id is extracted
        via _get_jd_goods_id
    :param my_lg: logger used for progress messages and passed on to the
        helper calls
    :return: the result of _error_data(...) when the goods_id cannot be
        matched or the crawl reports empty data, otherwise the result of
        compatible_api_goods_data(...)
    '''
    my_lg.info('进入京东商品处理接口...')

    goods_id = _get_jd_goods_id(goods_link)
    if goods_id == '':
        # Nothing to crawl without a goods_id.
        return _error_data(msg='goods_id匹配失败!请检查url是否正确!')

    crawled = get_one_jd_data(
        wait_to_deal_with_url='https://item.jd.com/{0}.html'.format(goods_id))
    if crawled.get('msg', '') == 'data为空!':
        # The crawler signals failure through this exact message.
        return _error_data(msg='该goods_id:{0}, 抓取数据失败!'.format(goods_id))

    data = _get_right_model_data(
        data=crawled,
        site_id=_from_jd_type_get_site_id(type=crawled.get('jd_type')),
        logger=my_lg)
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    my_lg.info('------>>>| 正在存储的数据为: ' + data.get('goods_id', ''))

    insert_params = _get_db_jd_insert_params(item=data)
    insert_sql = 'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, Account, GoodsName, SubTitle, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, PropertyInfo, DetailInfo, SellCount, SiteID, IsDelete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    # The outcome of the insert is intentionally not acted upon: the
    # crawled data is returned to the caller whether or not it was stored.
    my_pipeline._insert_into_table(sql_str=insert_sql, params=insert_params)

    return compatible_api_goods_data(data=data, my_lg=my_lg)
Exemplo n.º 2
0
    def get_all_user_and_their_recommend_goods_list(self):
        '''
        Crawl the JD "daren" (recommender) feed pages and store new articles and goods.

        For page indexes 1..99 the GetDarenFeeds URL is loaded through
        self.driver, the JSON wrapped in ``square(...)`` is parsed, and for every
        user in ``user_list`` each feed whose share id is not yet in
        dbo.jd_youxuan_daren_recommend is fetched (FeedDetail), turned into a
        result dict, inserted into that table, and every recommended goods_id
        not yet in dbo.GoodsInfoAutoGet is crawled with JdParse and inserted.

        :return: [] when the page body cannot be parsed as JSON or when a feed
            detail response contains no ``square(...)`` payload; otherwise the
            method falls off the end and returns None
        '''
        for index in range(1, 100):
            t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds; three random digits are appended

            # URL of the daren recommendations (ajax request)
            tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&darenType=0&perDarenFeedNum=3&totalpage=1&_={}&callback=jsonpCBKC&g_ty=ls'.format(
                str(index), t
            )

            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.set_page_load_timeout(15)  # set to 15 seconds to avoid data errors

            try:
                self.driver.get(tmp_url)
                self.driver.implicitly_wait(15)
            except Exception as e:  # on timeout, stop loading and carry on with the following steps
                print('-->>time out after 15 seconds when loading page')
                self.driver.execute_script('window.stop()')  # when the page load exceeds the timeout, stop it via JavaScript so the following steps can run
                # pass

            # Strip newlines, tabs and double spaces before extracting the payload.
            body = self.driver.page_source
            body = re.compile(r'\n').sub('', body)
            body = re.compile(r'\t').sub('', body)
            body = re.compile(r'  ').sub('', body)
            # print(body)
            body = re.compile(r'square\((.*)\)').findall(body)

            if body != []:
                body = body[0]
                try:
                    data = json.loads(body)
                    # pprint(data)
                except:
                    print('json.loads转换body得到data时出错!')
                    return []

                if data.get('user_list') is None:   # means there is no more data; the response then looks like square({"errmsg":"","iRet":0,"totalnum":347} )
                    print('body中获取的user_list为None!')
                    # NOTE(review): the loop still goes on to request the remaining page indexes.
                    pass

                else:
                    user_list = data.get('user_list', [])
                    # pprint(user_list)

                    for item in user_list:
                        # daren nickname
                        nick_name = item.get('nickname', '')

                        # daren avatar; a leading 'http:' is removed and re-added unless the value still starts with 'http'
                        head_url = item.get('headurl', '')
                        head_url = re.compile(r'http:').sub('', head_url)
                        if re.compile(r'^http').findall(head_url) != []:
                            pass
                        else:
                            head_url = 'http:' + head_url

                        # personal signature
                        profile = item.get('profile', '')

                        # Already-stored goods ids and share ids are re-read from the DB for every user.
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10'
                        _ = my_pipeline._select_table(sql_str=sql_str)
                        db_goods_id = [j[1] for j in list(_)] if _ is not None else []
                        # print(db_goods_id)
                        sql_str = r'select share_id from dbo.jd_youxuan_daren_recommend'
                        # NOTE(review): if _select_table can return None (as the guard above suggests), list() would raise here.
                        db_share_id = [j[0] for j in list(my_pipeline._select_table(sql_str=sql_str))]
                        # print(db_share_id)
                        jd = JdParse()

                        # info of the goods recommended by the daren
                        feed_list = item.get('feed_list', [])
                        for feed_list_item in feed_list:
                            if feed_list_item.get('shareid', '') in db_share_id:
                                print('该share_id({})已存在于数据库中, 此处跳过!'.format(feed_list_item.get('shareid', '')))
                                pass
                            else:
                                # share_id
                                share_id = feed_list_item.get('shareid', '')
                                article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                                print('------>>>| 正在抓取的jd优选达人推荐文章的地址为: ', 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)

                                # image info
                                # NOTE(review): an empty 'sharepicurl' splits into [''], which yields one URL with an empty file part.
                                tmp_share_img_url_list = []
                                for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                    if re.compile(r'^//').findall(item2) == []:
                                        tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                    else:
                                        tmp_share_img_url = 'http:' + item2
                                    tmp_share_img_url_list.append(tmp_share_img_url)
                                share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]

                                # build the div holding the daren's own photos
                                tmp_img_div_desc = ''
                                for item4 in tmp_share_img_url_list:
                                    tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                    tmp_img_div_desc += tmp_img_div
                                my_img_div = '<div>' + tmp_img_div_desc + '</div>'
                                # print(my_img_div)

                                # fetch goods_id and first_text
                                share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                                try:
                                    self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                    self.driver.get(share_url)
                                    self.driver.implicitly_wait(15)
                                except Exception as e:  # on timeout, stop loading and carry on with the following steps
                                    print('-->>time out after 15 seconds when loading page')
                                    self.driver.execute_script('window.stop()')  # when the page load exceeds the timeout, stop it via JavaScript so the following steps can run
                                    # pass
                                feed_detail_body = self.driver.page_source
                                feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'  ').sub('', feed_detail_body)

                                feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                                # print(feed_data)

                                if feed_data != []:
                                    feed_data = feed_data[0]
                                    try:
                                        feed_data = json.loads(feed_data)
                                    except:
                                        print('json.loads转换feed_data失败,此处跳过!')
                                        # Original note said the "outer else" runs after this break.
                                        # NOTE(review): the break actually leaves the feed_list loop, so the remaining feeds of this user are skipped.
                                        break

                                    # article title
                                    title = feed_data.get('feeddata', {}).get('title', '')
                                    title = re.compile(r'12.12').sub('', title)

                                    # daren comment content
                                    tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                    tmp_comment_content = re.compile(r'&amp;').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                    comment_content = tmp_comment_content

                                    if title == '':
                                        # the title is empty, so the comment content becomes the title and comment_content is cleared
                                        title = comment_content
                                        comment_content = ''
                                    # print('该文章的标题为: ', title)
                                    # print('达人的评论内容为: ', comment_content)

                                    # first_text (first paragraph of comment text in the article)
                                    first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                    first_text = re.compile(r'12.12').sub('', first_text)
                                    first_text = re.compile(r'11.11').sub('', first_text)
                                    # print('first_text为: ', first_text)

                                    # NOTE(review): no default is given, so sku_id is None when 'skuid' is missing;
                                    # it would then pass the != '' check below and later break the URL concatenation.
                                    sku_id = feed_data.get('feeddata', {}).get('skuid')
                                    if sku_id == '0':
                                        # sku_id == '0' means there is no sku_id
                                        sku_id = ''
                                    # print('sku_id为: ', sku_id)

                                    share_id = feed_list_item.get('shareid', '')
                                    tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                    # pprint(tmp_div_body_dict)

                                    if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                        # tmp_div_body_dict['sku_info'] is [], i.e. the second part has no goods_id, so the first sku_id is used instead
                                        goods_id_list = [{'goods_id': sku_id}]
                                    else:
                                        # list of goods_id recommended by this article (the first one is the goods_id used when there is no div_body)
                                        goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                    tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                                    # print('该文章推荐的商品goods_id的list为: ', goods_id_list)
                                    # print(tmp_div_body)

                                else:
                                    print('获取feed_data失败!')
                                    return []

                                # post-processing: fall back to first_text when there is no comment content
                                if comment_content == '':
                                    comment_content = first_text

                                # The bare string below is a no-op statement kept from the original; it labels the
                                # time-zone handling that converts "now" to Shanghai time.
                                '''
                                时区处理,时间处理到上海时间
                                '''
                                tz = pytz.timezone('Asia/Shanghai')  # create the time-zone object
                                now_time = datetime.datetime.now(tz)
                                # truncate to whole seconds and drop the time-zone info (everything from the first '.' on)
                                # NOTE(review): if microsecond happens to be 0 the string has no '.', the offset stays and strptime below would fail — confirm.
                                now_time = re.compile(r'\..*').sub('', str(now_time))
                                # convert the string back to a datetime
                                now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                                create_time = now_time      # creation time

                                result = {
                                    'nick_name': nick_name,                     # daren nickname
                                    'head_url': head_url,                       # daren avatar
                                    'profile': profile,                         # personal signature
                                    'share_id': share_id,                       # share_id of the shared article
                                    'article_url': article_url,                 # original article URL
                                    'title': title,                             # article title
                                    'comment_content': comment_content,         # daren comment content
                                    'share_img_url_list': share_img_url_list,   # list of the daren's own photos
                                    # 'first_text': first_text,                   # first paragraph of comment text in the article
                                    'goods_id_list':goods_id_list,              # list of goods_id of all goods recommended in the article
                                    'div_body': tmp_div_body,                   # main div of the article
                                    'create_time': create_time,                 # creation time of the record
                                }
                                # pprint(result)
                                print(result)
                                params = self._get_db_insert_params(item=result)
                                sql_str = r'insert into dbo.jd_youxuan_daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_img_url_list, goods_id_list, div_body, create_time) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                                my_pipeline._insert_into_table(sql_str=sql_str, params=params)

                                # Crawl and store every recommended goods that is not in the DB yet.
                                print('准备开始抓取该文章中的所有推荐商品'.center(30, '-'))
                                for i in goods_id_list:
                                    if i.get('goods_id', '') in db_goods_id:
                                        print('该goods_id({})已经存在于数据库中, 此处跳过!'.format(i.get('goods_id', '')))
                                        pass
                                    else:
                                        tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                        goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                        jd.get_goods_data(goods_id=goods_id)
                                        tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                        tmp_jd_data['spider_url'] = tmp_goods_id_url
                                        tmp_jd_data['username'] = '******'
                                        # presumably get_goods_id_from_url returns a sequence whose index 1 is the plain id — verify
                                        tmp_jd_data['goods_id'] = goods_id[1]

                                        jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                                print('该文章内推荐的商品全部抓取完毕'.center(30, '-'))

            else:
                print('body为空list!')
Exemplo n.º 3
0
    async def _insert_into_table(self):
        '''
        Write the region tree held in self.target_data into dbo.Region.

        The data is walked three times, once per level, so that all
        first-level rows are inserted before any second-level row and all
        second-level rows before any third-level row. Children of a
        first-level entry are read from its 'o' key, children of a
        second-level entry from its 'w' key. Every row is printed before it
        is inserted.

        :return: True once all three passes have finished
        '''
        pipeline = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'insert into dbo.Region(c_name, code, parent_code, parent_name) values(%s, %s, %s, %s)'

        def _store(row):
            # Echo the (c_name, code, parent_code, parent_name) row, then insert it.
            print(row)
            pipeline._insert_into_table(sql_str=sql_str, params=row)

        # Level 1: top-level regions have an empty parent code and name.
        print('第一级别'.center(100, '-'))
        for top in self.target_data:
            _store((top.get('c_name'), top.get('code'), '', ''))

        # Level 2: children under 'o', parented by the top-level entry.
        print('第二级别'.center(100, '-'))
        for top in self.target_data:
            top_name = top.get('c_name')
            top_code = top.get('code')
            for second in top.get('o'):
                _store((second.get('c_name'), second.get('code'),
                        top_code, top_name))

        # Level 3: children under 'w', parented by the second-level entry.
        print('第三级别'.center(100, '-'))
        for top in self.target_data:
            for second in top.get('o'):
                second_name = second.get('c_name')
                second_code = second.get('code')
                thirds = second.get('w')
                if thirds is None:
                    # e.g. Beijing only goes down to the second level
                    continue
                for third in thirds:
                    _store((third.get('c_name'), third.get('code'),
                            second_code, second_name))

        print('全部写入完毕!'.center(100, '*'))

        return True
Exemplo n.º 4
0
    def run(self):
        '''
        Worker loop: consume coupon responses from the global queue and store them.

        Runs forever. Whenever the global ``coupon_queue`` holds an item, the
        parenthesised JSON payload is parsed, coupons that are expired or valid
        for 36 hours or less are dropped, the claim URL is looked up in the
        global ``goods_id_and_coupon_url_list``, and each remaining coupon is
        inserted into dbo.coupon_info. After a successful insert the unique id
        is remembered in the global ``unique_coupon_id_list`` and the goods'
        Price / TaoBaoPrice / SKUInfo in dbo.GoodsInfoAutoGet are rewritten with
        the coupon applied.

        NOTE(review): when the queue is empty the loop simply continues without
        any sleep, which looks like a busy-wait — confirm this is intended.
        '''
        global coupon_queue, goods_id_and_coupon_url_list, unique_coupon_id_list

        while True:
            sql_cli = None
            try:
                if coupon_queue.qsize() >= 1:
                    # todo: some coupon urls belong to pre-order (deposit) goods; they are not handled here
                    coupon_item = coupon_queue.get()
                    # The [0] below raises IndexError when no parenthesised payload is found;
                    # that case is handled by the outer `except IndexError`.
                    # NOTE(review): the pattern is a non-raw string containing '\(' — a raw string would avoid the invalid-escape warning.
                    ori_coupon_list = json_2_dict(
                        json_str=re.compile('\((.*)\)').findall(coupon_item)
                        [0],
                        default_res={},
                    ).get('data', {}).get('resultList', [])
                    assert ori_coupon_list != []
                    # pprint(ori_coupon_list)

                    # todo: testing showed that coupons for buying several items live in the field 'nCouponInfoMap' of the response
                    # only single-item coupons are supported for now, multi-item coupons are not
                    coupon_list = []
                    for item in ori_coupon_list:
                        try:
                            goods_id = str(item.get('itemId', ''))
                            assert goods_id != ''
                            # a coupon can be used only once per account
                            # display name of the coupon (fixed literal)
                            coupon_display_name = '优惠券'
                            # coupon value, i.e. how much is taken off
                            ori_coupon_value = item.get('couponAmount', '')
                            assert ori_coupon_value != ''
                            coupon_value = str(
                                float(ori_coupon_value).__round__(2))
                            # usage threshold
                            ori_thresold = item.get('couponStartFee', '')
                            assert ori_thresold != ''
                            threshold = str(float(ori_thresold).__round__(2))
                            # Only the first 10 characters of the timestamps are used
                            # (presumably millisecond timestamps cut down to seconds — verify).
                            begin_time = str(
                                timestamp_to_regulartime(
                                    int(
                                        item.get('couponEffectiveStartTime',
                                                 '')[0:10])))
                            end_time = str(
                                timestamp_to_regulartime(
                                    int(
                                        item.get('couponEffectiveEndTime',
                                                 '')[0:10])))
                            # usage description
                            use_method = '满{}元, 减{}元'.format(
                                threshold, coupon_value)

                            if string_to_datetime(
                                    end_time) <= get_shanghai_time():
                                print('该券已过期[goods_id: {}]'.format(goods_id))
                                # already expired
                                continue

                            # Skip coupons whose validity window is 36 hours (1.5 days) or less.
                            if datetime_to_timestamp(string_to_datetime(end_time)) - datetime_to_timestamp(string_to_datetime(begin_time)) \
                                    <= 60 * 60 * 36:
                                print('该券小于1.5天[goods_id: {}], pass'.format(
                                    goods_id))
                                continue

                            # todo: testing showed that one goods may carry the same coupon for several different activity periods, so a goods can have several coupons
                            #  so the one with the latest end time should be taken
                            # that still caused the price to be reduced repeatedly, so a unique id is generated and after one price conversion every coupon of that goods_id counts as handled
                            # generate the unique id
                            # unique_id = str(get_uuid3(
                            #     target_str=goods_id \
                            #                + coupon_value \
                            #                + threshold \
                            #                + str(datetime_to_timestamp(string_to_datetime(begin_time)))[0:10]\
                            #                + str(datetime_to_timestamp(string_to_datetime(end_time)))[0:10]))

                            # todo: because several coupons led to the price being changed several times, the table keeps only one coupon per goods_id
                            # explanation given outward: only the coupon with the biggest discount is stored
                            unique_id = str(get_uuid3(target_str=goods_id))

                            # coupon claim url: first entry of the global list whose goods_id matches
                            # pprint(goods_id_and_coupon_url_list)
                            coupon_url = ''
                            for j in goods_id_and_coupon_url_list:
                                tmp_goods_id = j['goods_id']
                                tmp_coupon_url = j['coupon_url']
                                if goods_id == tmp_goods_id:
                                    print('@@@ 成功匹配到goods_id: {} 的领券地址: {}!!'.
                                          format(goods_id, tmp_coupon_url))
                                    coupon_url = tmp_coupon_url
                                    break
                                else:
                                    continue
                            assert coupon_url != ''

                            coupon_list.append({
                                'unique_id': unique_id,
                                'goods_id': goods_id,
                                'coupon_url': coupon_url,
                                'coupon_display_name': coupon_display_name,
                                'coupon_value': coupon_value,
                                'threshold': threshold,
                                'begin_time': begin_time,
                                'end_time': end_time,
                                'use_method': use_method,
                            })

                        except Exception as e:
                            # Any failure (including the asserts above) skips just this coupon.
                            print(e)
                            continue

                    # pprint(coupon_list)
                    if coupon_list != []:
                        # store
                        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                        if not sql_cli.is_connect_success:
                            raise SqlServerConnectionException

                        for item in coupon_list:
                            unique_id = item['unique_id']
                            goods_id = item['goods_id']
                            if unique_id not in unique_coupon_id_list:
                                save_res = sql_cli._insert_into_table(
                                    sql_str=
                                    'insert into dbo.coupon_info(unique_id, create_time, goods_id, coupon_url, coupon_display_name, coupon_value, threshold, begin_time, end_time, use_method) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                                    params=(
                                        unique_id,
                                        str(get_shanghai_time()),
                                        goods_id,
                                        item['coupon_url'],
                                        item['coupon_display_name'],
                                        Decimal(
                                            item['coupon_value']).__round__(2),
                                        Decimal(
                                            item['threshold']).__round__(2),
                                        item['begin_time'],
                                        item['end_time'],
                                        item['use_method'],
                                    ),
                                    repeat_insert_default_res=False,  # avoid changing the price repeatedly (presumably a duplicate insert then reports False — verify)
                                )
                                if save_res:
                                    # todo: update the price only once; repeated updates would corrupt it
                                    # de-duplicate
                                    unique_coupon_id_list.append(unique_id)
                                    # propagate the price change to the regular goods table
                                    sql_str = '''
                                    select top 1 Price, TaoBaoPrice, SKUInfo
                                    from dbo.GoodsInfoAutoGet
                                    where GoodsID=%s
                                    '''
                                    db_res = []
                                    try:
                                        db_res = list(
                                            sql_cli._select_table(
                                                sql_str=sql_str,
                                                params=(goods_id, ),
                                            ))
                                    except Exception as e:
                                        print(e)

                                    if db_res != []:
                                        # mark the regular goods' price change caused by the coupon
                                        try:
                                            # subtract the coupon value
                                            coupon_value = float(
                                                item['coupon_value'])
                                            threshold = float(
                                                item['threshold'])
                                            # restore the original price
                                            # NOTE(review): multiplying by (1 - CP_PROFIT) is not the exact inverse of the
                                            # (1 + CP_PROFIT) markup applied below — confirm this approximation is intended.
                                            db_price = float(
                                                db_res[0][0]) * (1 - CP_PROFIT)
                                            db_taobao_price = float(
                                                db_res[0][1]) * (1 - CP_PROFIT)
                                            # subtract the coupon (only when the price reaches the threshold), then add CP_PROFIT to get the price to store
                                            new_price = (
                                                (db_price - coupon_value
                                                 if db_price >= threshold else
                                                 db_price) *
                                                (1 + CP_PROFIT)).__round__(2)
                                            new_taobao_price = (
                                                (db_taobao_price -
                                                 coupon_value if
                                                 db_taobao_price >= threshold
                                                 else db_taobao_price) *
                                                (1 + CP_PROFIT)).__round__(2)

                                            new_sku_info = get_new_sku_info_from_old_sku_info_subtract_coupon_and_add_cp_profit(
                                                old_sku_info=json_2_dict(
                                                    json_str=db_res[0][2],
                                                    default_res=[],
                                                ),
                                                threshold=threshold,
                                                coupon_value=coupon_value,
                                            )

                                            # NOTE(review): the statement assigns PriceChangeInfo from the SKUInfo column in the
                                            # same SET clause — confirm whether the old or the new SKUInfo is meant to end up there.
                                            sql_str2 = '''
                                            update dbo.GoodsInfoAutoGet
                                            set Price=%s, TaoBaoPrice=%s, SKUInfo=%s, ModfiyTime=%s, sku_info_trans_time=%s, IsPriceChange=1, PriceChangeInfo=SKUInfo
                                            where GoodsID=%s 
                                            '''
                                            now_time = get_shanghai_time()
                                            sql_cli._update_table(
                                                sql_str=sql_str2,
                                                params=(
                                                    Decimal(new_price
                                                            ).__round__(2),
                                                    Decimal(new_taobao_price).
                                                    __round__(2),
                                                    dumps(new_sku_info,
                                                          ensure_ascii=False),
                                                    now_time,
                                                    now_time,
                                                    goods_id,
                                                ),
                                            )
                                        except Exception as e:
                                            print(e)
                                    else:
                                        pass
                                else:
                                    continue

                            else:
                                continue

                else:
                    continue

            except IndexError:
                # skip index errors coming from the same interface (no payload matched above)
                continue
            except Exception as e:
                print(e)
            finally:
                # Drop the reference to the DB client after every iteration.
                try:
                    del sql_cli
                except:
                    pass