예제 #1
0
def record_device_id(deviceid, platform, original_deviceid):
    """Validate a notification device id and persist it.

    Args:
        deviceid: Device identifier. ``None`` or anything shorter than 5
            characters is rejected.
        platform: Platform constant. Any value other than the Android / iOS
            constants from ``notification_config`` is replaced by the iOS one.
        original_deviceid: Stored in the row next to ``deviceid``.

    Returns:
        ``{'status': -1, 'msg': ...}`` when ``deviceid`` is rejected,
        otherwise whatever
        ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    # Validate before doing anything else: a rejected id needs neither the
    # timestamp nor the platform lookup, and None must not blow up in len().
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    tnow = timeHelper.getNowLong()
    known_platforms = [
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    ]
    if platform not in known_platforms:
        # Unknown platforms are recorded as iOS.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    # record in db (single row; the last column is stored as an empty string)
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]

    ret = crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )

    return ret
예제 #2
0
    def run_tasks_repeated(self):
        """Run ``run_tasks_once`` forever, sleeping between rounds.

        Each round logs its start time, outcome and duration. An exception
        raised by ``run_tasks_once`` is logged and counted as an error; it
        does not stop the loop. In daily mode the sleep lasts until
        ``START_HOUR`` of the next day (plus 10 seconds); otherwise it fills
        what is left of ``interval_seconds`` (plus 10 seconds). The sleep is
        never shorter than ``min_sleep``. This method does not return.
        """
        while True:
            logging.info("\n\n\nJob start at %s" % timeHelper.getNowLong())
            started = time.time()
            try:
                has_error = self.run_tasks_once()
            except Exception as exc:
                logging.error("ERROR GREP LINE >>>>>>" * 5)
                logging.error(exc)
                has_error = True
            logging.info("HAS ERROR TODAY? %s" % has_error)

            finished = time.time()
            elapsed = finished - started
            logging.info("RUN-ONCE TOTALLY TAKING TIME: %s seconds" % elapsed)
            logging.info("-" * 60)

            if not self.is_daily:
                # Whatever is left of the interval, never negative.
                leftover = self.interval_seconds - int(elapsed)
                if leftover < 0:
                    leftover = 0
                remaining = leftover + 10
                logging.info(
                    "Sleep to fill interval, seconds left = %s" % (remaining))
            else:
                until_midnight = timeHelper.getTimeLeftTillTomorrow()
                remaining = until_midnight + self.START_HOUR * 3600 + 10
                logging.info(
                    "Sleep to tomorrow, at %s:00:00 hour: hours left = %s" %
                    (self.START_HOUR, remaining / 3600))

            # Enforce the minimum pause between rounds.
            if remaining < self.min_sleep:
                remaining = self.min_sleep

            time.sleep(remaining)
예제 #3
0
def fill_catalog():
    """Insert the catalog rows and the catalog-to-category mapping rows.

    Walks ``JD_CATALOG_ARRAY`` in order, giving each entry a catalog id
    that starts at 1000 and grows by 1000 per entry. For every entry one
    ``jd_catalog`` row is built, plus one ``jd_catalog_map`` row per name in
    its ``category_ids`` list. The rows from ``_get_fixed_catalog()`` are
    appended to the ``jd_catalog`` batch before writing.

    The values returned by the two bulk writes are printed; nothing is
    returned.
    """
    tnow = timeHelper.getNowLong()

    catalog_id = 1000
    vlist = []  # rows for jd_catalog_map
    tlist = []  # rows for jd_catalog
    for cdict in JD_CATALOG_ARRAY:
        catids = cdict['category_ids']
        catalog_name = cdict['catalog_name'].strip()

        for item in catids:
            # Mapping row = category id prefix columns + catalog columns.
            vtp = list(_get_category_id_prefix_given_category_name(
                item.strip()))
            vtp += [catalog_id, catalog_name, tnow]
            vlist.append(vtp)

        tlist.append([catalog_id, catalog_name, catalog_id, tnow])
        catalog_id += 1000

    sql1 = 'insert into jd_catalog values(%s,%s,%s,%s)'
    tlist += _get_fixed_catalog()
    ar1 = dbhelper.executeSqlWriteMany(sql1, tlist)

    sql2 = 'insert into jd_catalog_map values(%s,%s,%s,%s,%s)'
    ar2 = dbhelper.executeSqlWriteMany(sql2, vlist)

    # Parenthesised single-argument print: same output on Python 2, and no
    # longer a SyntaxError on Python 3.
    print('jd_catalog rows inserted: %s' % ar1)
    print('jd_catalog_map rows inserted: %s' % ar2)
예제 #4
0
def fill_catalog():
    """Insert the catalog rows and the catalog-to-category mapping rows.

    Walks ``JD_CATALOG_ARRAY`` in order, giving each entry a catalog id
    that starts at 1000 and grows by 1000 per entry. For every entry one
    ``jd_catalog`` row is built, plus one ``jd_catalog_map`` row per name in
    its ``category_ids`` list. The rows from ``_get_fixed_catalog()`` are
    appended to the ``jd_catalog`` batch before writing.

    The values returned by the two bulk writes are printed; nothing is
    returned.
    """
    tnow = timeHelper.getNowLong()

    catalog_id = 1000
    vlist = []  # rows for jd_catalog_map
    tlist = []  # rows for jd_catalog
    for cdict in JD_CATALOG_ARRAY:
        catids = cdict['category_ids']
        catalog_name = cdict['catalog_name'].strip()

        for item in catids:
            # Mapping row = category id prefix columns + catalog columns.
            vtp = list(_get_category_id_prefix_given_category_name(
                item.strip()))
            vtp += [catalog_id, catalog_name, tnow]
            vlist.append(vtp)

        tlist.append([catalog_id, catalog_name, catalog_id, tnow])
        catalog_id += 1000

    sql1 = 'insert into jd_catalog values(%s,%s,%s,%s)'
    tlist += _get_fixed_catalog()
    ar1 = dbhelper.executeSqlWriteMany(sql1, tlist)

    sql2 = 'insert into jd_catalog_map values(%s,%s,%s,%s,%s)'
    ar2 = dbhelper.executeSqlWriteMany(sql2, vlist)

    # Parenthesised single-argument print: same output on Python 2, and no
    # longer a SyntaxError on Python 3.
    print('jd_catalog rows inserted: %s' % ar1)
    print('jd_catalog_map rows inserted: %s' % ar2)
예제 #5
0
def record_device_id(deviceid, platform, original_deviceid):
    """Validate a notification device id and persist it.

    Args:
        deviceid: Device identifier. ``None`` or anything shorter than 5
            characters is rejected.
        platform: Platform constant. Any value other than the Android / iOS
            constants from ``notification_config`` is replaced by the iOS one.
        original_deviceid: Stored in the row next to ``deviceid``.

    Returns:
        ``{'status': -1, 'msg': ...}`` when ``deviceid`` is rejected,
        otherwise whatever
        ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    # Validate before doing anything else: a rejected id needs neither the
    # timestamp nor the platform lookup, and None must not blow up in len().
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    tnow = timeHelper.getNowLong()
    known_platforms = [
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    ]
    if platform not in known_platforms:
        # Unknown platforms are recorded as iOS.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    # record in db (single row; the last column is stored as an empty string)
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]

    ret = crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )

    return ret
예제 #6
0
def _get_fixed_catalog():
    """Return the four hard-coded catalog rows, stamped with the current time.

    Each row is a 4-item list whose last element is
    ``timeHelper.getNowLong()``. ``fill_catalog`` appends these rows to its
    ``jd_catalog`` insert batch.
    """
    stamp = timeHelper.getNowLong()
    fixed_entries = (
        ('_ALL_', '全部折扣', 2000000),
        ('_EXPENSIVE_', '超值折扣', 1000000),
        ('HOT', '最新发现', 2500000),
        ('_HISTORY_LOWEST_', '历史最低', 3000000),
    )
    return [[key, label, number, stamp] for key, label, number in fixed_entries]
예제 #7
0
def _get_fixed_catalog():
    """Return the four hard-coded catalog rows, stamped with the current time.

    Each row is a 4-item list whose last element is
    ``timeHelper.getNowLong()``. ``fill_catalog`` appends these rows to its
    ``jd_catalog`` insert batch.
    """
    stamp = timeHelper.getNowLong()
    fixed_entries = (
        ('_ALL_', '全部折扣', 2000000),
        ('_EXPENSIVE_', '超值折扣', 1000000),
        ('HOT', '最新发现', 2500000),
        ('_HISTORY_LOWEST_', '历史最低', 3000000),
    )
    return [[key, label, number, stamp] for key, label, number in fixed_entries]
예제 #8
0
def update_history_lowest_store():
    """Refresh the history-lowest notification store.

    Steps, in order:
      1. insert (ignoring duplicates) one ``(sku_id, now)`` row per sku
         returned by ``getHistoryLowest_SkuIds()``;
      2. replace the job-status row for ``NOTIFICATION_JOB_NAME`` with now;
      3. call ``_removeOldNotifications()``;
      4. call ``_removeOutdated_Nonhistory_lowest()``.

    Returns:
        A 4-item list with the result of each step, in the order above.
    """
    sku_ids = getHistoryLowest_SkuIds()
    dt = timeHelper.getNowLong()

    rows = [[sku_id, dt] for sku_id in sku_ids]
    insert_sql = 'insert ignore into jd_notification_history_lowest values (%s,%s)'
    inserted = dbhelper.executeSqlWriteMany(insert_sql, rows)

    status_sql = 'replace into jd_notification_job_status values("%s","%s")' % (
        NOTIFICATION_JOB_NAME, dt)
    status_written = dbhelper.executeSqlWrite1(status_sql)

    old_removed = _removeOldNotifications()
    outdated_removed = _removeOutdated_Nonhistory_lowest()
    return [inserted, status_written, old_removed, outdated_removed]
예제 #9
0
def _calculate_worthy_values(sku_info_list):
    """Fill in the derived score, price and timestamp columns of each sku row.

    Every row in ``sku_info_list`` is modified in place. Five ratio columns
    are read (a missing value counts as 1.0, and ``max_deduction_ratio`` is
    inverted to ``1.0 - value``), three weighted scores are computed from
    them and stored, then the final price and the update time are stored.

    Args:
        sku_info_list: Iterable of mutable rows indexed by the positions
            that ``_get_column_index`` returns.

    Returns:
        Always 0.
    """
    for sku in sku_info_list:
        param_dict = {}
        for column in ('discount_rate', 'max_deduction_ratio', 'discount',
                       'rf_ratio', 'gift_ratio'):
            raw = sku[_get_column_index(column)]
            if raw is None:
                ratio = 1.0
            elif column == 'max_deduction_ratio':
                ratio = 1.0 - float(raw)
            else:
                ratio = float(raw)
            param_dict[column] = ratio

        # Simplest scheme: one weighted sum per (destination column, weights).
        score_plan = (
            ('worthy_value1', col_worthyvalue_weight_dict_1),
            ('activity_discount_rate', col_worthyvalue_weight_dict_acitivity),
            ('total_discount_rate', col_worthyvalue_weight_dict_deduct_even),
        )
        for target, weights in score_plan:
            score = _calculate_weighted_score(param_dict, weights)
            sku[_get_column_index(target)] = score

        final_price = caculate_final_price(sku, price=None)
        sku[_get_column_index('final_price')] = final_price

        sku[_get_column_index('this_update_time')] = timeHelper.getNowLong()

    return 0
예제 #10
0
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Insert one row describing a user action into ``user_events``.

    A non-empty ``query`` is logged as a search: the catalog id and catalog
    name are stored as empty strings. Otherwise the action is logged as a
    catalog visit: the query is stored as an empty string and the catalog
    name is looked up in ``jd_catalog`` (``'Unknown'`` when no row matches).

    Args:
        device_id: Identifier of the calling device.
        query: Search text, or ``None`` / empty for a catalog visit.
        catalog_id: Catalog being browsed; ignored when ``query`` is set.
        remote_ip: Address of the caller.

    Returns:
        The value returned by the ``dbhelper`` write call.
    """
    catalog_name = ""

    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
        # NOTE(review): this lookup is still string-built because no
        # parameterised read helper is visible from here; catalog_id comes
        # from the caller, so confirm it is validated upstream.
        sql2 = 'select * from jd_catalog where catalog_id="%s"' % catalog_id
        retrows = dbhelper.executeSqlRead(sql2)
        if len(retrows) > 0:
            catalog_name = retrows[0]['catalog_name']
        else:
            catalog_name = 'Unknown'

    # The values are caller-supplied text (free-form search query, IP,
    # device id), so they are bound as parameters instead of being pasted
    # into the statement. Each value is rendered with "%s" first so the
    # stored text matches what the interpolated statement used to store.
    sql_user_event = 'insert into user_events values(%s,%s,%s,%s,%s,%s)'
    event_row = [
        '%s' % value
        for value in (device_id, query, catalog_id, catalog_name,
                      timeHelper.getNowLong(), remote_ip)
    ]
    # NOTE(review): assumes executeSqlWriteMany reports affected rows the
    # same way executeSqlWrite1 does - confirm against dbhelper.
    afr = dbhelper.executeSqlWriteMany(sql_user_event, [event_row])
    return afr
예제 #11
0
    def run_tasks_repeated(self):
        """Run ``run_tasks_once`` forever, sleeping between rounds.

        Each round logs its start time, outcome and duration. An exception
        raised by ``run_tasks_once`` is logged and counted as an error; it
        does not stop the loop. In daily mode the sleep lasts until
        ``START_HOUR`` of the next day (plus 10 seconds); otherwise it fills
        what is left of ``interval_seconds`` (plus 10 seconds). The sleep is
        never shorter than ``min_sleep``. This method does not return.
        """
        while True:
            logging.info("\n\n\nJob start at %s" % timeHelper.getNowLong())
            started = time.time()
            try:
                has_error = self.run_tasks_once()
            except Exception as exc:
                logging.error("ERROR GREP LINE >>>>>>" * 5)
                logging.error(exc)
                has_error = True
            logging.info("HAS ERROR TODAY? %s" % has_error)

            finished = time.time()
            elapsed = finished - started
            logging.info("RUN-ONCE TOTALLY TAKING TIME: %s seconds" % elapsed)
            logging.info("-" * 60)

            if not self.is_daily:
                # Whatever is left of the interval, never negative.
                leftover = self.interval_seconds - int(elapsed)
                if leftover < 0:
                    leftover = 0
                remaining = leftover + 10
                logging.info(
                    "Sleep to fill interval, seconds left = %s" % (remaining))
            else:
                until_midnight = timeHelper.getTimeLeftTillTomorrow()
                remaining = until_midnight + self.START_HOUR * 3600 + 10
                logging.info(
                    "Sleep to tomorrow, at %s:00:00 hour: hours left = %s" %
                    (self.START_HOUR, remaining / 3600))

            # Enforce the minimum pause between rounds.
            if remaining < self.min_sleep:
                remaining = self.min_sleep

            time.sleep(remaining)
예제 #12
0
def _calculate_worthy_values(sku_info_list):
    """Fill in the derived score, price and timestamp columns of each sku row.

    Every row in ``sku_info_list`` is modified in place. Five ratio columns
    are read (a missing value counts as 1.0, and ``max_deduction_ratio`` is
    inverted to ``1.0 - value``), three weighted scores are computed from
    them and stored, then the final price and the update time are stored.

    Args:
        sku_info_list: Iterable of mutable rows indexed by the positions
            that ``_get_column_index`` returns.

    Returns:
        Always 0.
    """
    for sku in sku_info_list:
        param_dict = {}
        for column in ('discount_rate', 'max_deduction_ratio', 'discount',
                       'rf_ratio', 'gift_ratio'):
            raw = sku[_get_column_index(column)]
            if raw is None:
                ratio = 1.0
            elif column == 'max_deduction_ratio':
                ratio = 1.0 - float(raw)
            else:
                ratio = float(raw)
            param_dict[column] = ratio

        # Simplest scheme: one weighted sum per (destination column, weights).
        score_plan = (
            ('worthy_value1', col_worthyvalue_weight_dict_1),
            ('activity_discount_rate', col_worthyvalue_weight_dict_acitivity),
            ('total_discount_rate', col_worthyvalue_weight_dict_deduct_even),
        )
        for target, weights in score_plan:
            score = _calculate_weighted_score(param_dict, weights)
            sku[_get_column_index(target)] = score

        final_price = caculate_final_price(sku, price=None)
        sku[_get_column_index('final_price')] = final_price

        sku[_get_column_index('this_update_time')] = timeHelper.getNowLong()

    return 0
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Insert one row describing a user action into ``user_events``.

    A non-empty ``query`` is logged as a search: the catalog id and catalog
    name are stored as empty strings. Otherwise the action is logged as a
    catalog visit: the query is stored as an empty string and the catalog
    name is looked up in ``jd_catalog`` (``'Unknown'`` when no row matches).

    Args:
        device_id: Identifier of the calling device.
        query: Search text, or ``None`` / empty for a catalog visit.
        catalog_id: Catalog being browsed; ignored when ``query`` is set.
        remote_ip: Address of the caller.

    Returns:
        The value returned by the ``dbhelper`` write call.
    """
    catalog_name = ""

    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
        # NOTE(review): this lookup is still string-built because no
        # parameterised read helper is visible from here; catalog_id comes
        # from the caller, so confirm it is validated upstream.
        sql2 = 'select * from jd_catalog where catalog_id="%s"' % catalog_id
        retrows = dbhelper.executeSqlRead(sql2)
        if len(retrows) > 0:
            catalog_name = retrows[0]['catalog_name']
        else:
            catalog_name = 'Unknown'

    # The values are caller-supplied text (free-form search query, IP,
    # device id), so they are bound as parameters instead of being pasted
    # into the statement. Each value is rendered with "%s" first so the
    # stored text matches what the interpolated statement used to store.
    sql_user_event = 'insert into user_events values(%s,%s,%s,%s,%s,%s)'
    event_row = [
        '%s' % value
        for value in (device_id, query, catalog_id, catalog_name,
                      timeHelper.getNowLong(), remote_ip)
    ]
    # NOTE(review): assumes executeSqlWriteMany reports affected rows the
    # same way executeSqlWrite1 does - confirm against dbhelper.
    afr = dbhelper.executeSqlWriteMany(sql_user_event, [event_row])
    return afr
예제 #14
0
def processItemPromo():
    """Flatten recent promo JSON into promo-item and gift rows and persist them.

    Reads rows from ``jd_promo_item_latest`` newer than
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS``, parses ``promo_json`` and
    builds two row lists:

    * promo items: every entry of ``pickOneTag`` except code ``"17"``, plus
      every entry of ``tags`` whose code is not ``"10"``;
    * gifts: one row per gift of every ``tags`` entry with code ``"10"``.

    Rows whose JSON cannot be parsed are counted and skipped. A gift that
    cannot be read is logged and skipped. Non-empty lists are written through
    ``crawler_helper.persist_db_history_and_lastest_empty_first``; when there
    are gifts, their sku ids are also inserted into ``jd_analytic_sku_gift``.

    Returns:
        ``_generate_mixed_ret`` applied to the three step results. A step
        that was skipped because it had no rows contributes ``None``.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Malformed or non-text JSON: count it and move on. Narrowed
            # from a bare except so Ctrl-C / SystemExit are not swallowed.
            num_error += 1
            continue

        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Code "17" (add-on purchase offers) is counted but not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Code "10" carries a list of gifts instead of a content text.
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ])
                    except Exception as e:
                        # Best effort: one unreadable gift must not stop
                        # the whole run.
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" %
                            sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %
                  num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB
    # ret3 is initialised together with the others: it used to be assigned
    # only when gifts existed, so a gift-less run died with NameError below.
    ret1 = ret2 = ret3 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # record which skus have a gift
        cur_time = timeHelper.getNowLong()
        sglist = [[gg[0], cur_time] for gg in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1, ret2, ret3])
예제 #15
0
def process_promo_detail():
    """Derive deduction (code 15) and discount (code 19) rows from promo items.

    Reads recent rows of ``jd_analytic_promo_item_latest`` joined with price
    and category, then:

    * code 15: for every (reach, deduction, is_repeat) item returned by
      ``_extract_reach_deduction_array`` builds one deduction row with the
      achievable deduction at the current price;
    * code 19: extracts the numbers in the promo text and builds one
      discount row; ``deduct_type`` is 0 for a direct discount, 1 for a
      price reduction and 2 for any other wording;
    * other codes are ignored.

    Entries that cannot be parsed are logged and skipped. Each non-empty row
    list is written through
    ``crawler_helper.persist_db_history_and_lastest_empty_first``.

    Returns:
        ``_generate_mixed_ret`` applied to the two persist results. A
        persist step that had no rows contributes ``None``.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants, built once instead of once per row.
    number_pattern = re.compile(u'[\d.]+', re.U)
    # Wording that identifies the code-19 sub-type; the list position is the
    # deduct_type: 0 = direct discount, 1 = price reduction, 2 = other.
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per item so the error log below never reads an
                # unbound name or a value left over from the previous item.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s" % (reach, deduction))
                    logging.error(e)
                    continue

                could_deduct = 0
                if price >= reach and reach > 0:
                    # A repeating offer applies once per full multiple of
                    # reach; the total is capped at max_deduction.
                    times = price // reach if is_repeat else 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                # A zero price used to abort the whole run with
                # ZeroDivisionError; nothing can be deducted from it.
                single_discount_rate = could_deduct / price if price else 0.0
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # "Buy N items, get a discount / a reduction": deduct_type is
            # the index of the first matching wording, or 2 when none match.
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            # Types 0 and 1 read two numbers, type 2 only the first. Skip
            # the row instead of failing with IndexError when they are
            # missing.
            needed = 2 if deduct_type in (0, 1) else 1
            if len(pts) < needed:
                continue

            reach_num = discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (ValueError, ZeroDivisionError) as e:
                # e.g. a stray "." matched by the pattern, or "0 items".
                logging.error("unusable numbers %s in: %s" % (pts, content))
                logging.error(e)
                continue

            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name,
                content, adurl, origin_dt
            ]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Only persist non-empty lists: len(vlist[0]) raised IndexError when no
    # rows were produced. Skipped steps report None.
    pret15 = pret19 = None
    if len(vlist) > 0:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    if len(vlist19) > 0:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
예제 #16
0
def processItemPromo():
    """Flatten recent promo JSON into promo-item and gift rows and persist them.

    Reads rows from ``jd_promo_item_latest`` newer than
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS``, parses ``promo_json`` and
    builds two row lists:

    * promo items: every entry of ``pickOneTag`` except code ``"17"``, plus
      every entry of ``tags`` whose code is not ``"10"``;
    * gifts: one row per gift of every ``tags`` entry with code ``"10"``.

    Rows whose JSON cannot be parsed are counted and skipped. A gift that
    cannot be read is logged and skipped. Non-empty lists are written through
    ``crawler_helper.persist_db_history_and_lastest_empty_first``; when there
    are gifts, their sku ids are also inserted into ``jd_analytic_sku_gift``.

    Returns:
        ``_generate_mixed_ret`` applied to the three step results. A step
        that was skipped because it had no rows contributes ``None``.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Malformed or non-text JSON: count it and move on. Narrowed
            # from a bare except so Ctrl-C / SystemExit are not swallowed.
            num_error += 1
            continue

        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Code "17" (add-on purchase offers) is counted but not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Code "10" carries a list of gifts instead of a content text.
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ])
                    except Exception as e:
                        # Best effort: one unreadable gift must not stop
                        # the whole run.
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" %
                            sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %
                  num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB
    # ret3 is initialised together with the others: it used to be assigned
    # only when gifts existed, so a gift-less run died with NameError below.
    ret1 = ret2 = ret3 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # record which skus have a gift
        cur_time = timeHelper.getNowLong()
        sglist = [[gg[0], cur_time] for gg in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1, ret2, ret3])
예제 #17
0
def process_promo_detail():
    """Derive deduction and discount rows from recent promo items and persist them.

    Reads rows of ``jd_analytic_promo_item_latest`` (joined with the latest
    price and the category tables) whose ``dt`` is not older than the cutoff
    computed from ``datamining_config.PROMO_ITEM_RECENCY_HOURS``, then:

    * ``code == 15``: for every tier returned by
      ``_extract_reach_deduction_array(content)`` one row is built for the
      ``jd_analytic_promo_deduction`` table.
    * ``code == 19``: the numbers found in ``content`` are turned into one
      row for the ``jd_analytic_promo_discount`` table.
    * any other code is ignored.

    Rows that cannot be parsed are logged and skipped so that one malformed
    promo does not abort the whole batch.

    Returns:
        The value of ``_generate_mixed_ret([pret15, pret19])``; an entry is
        ``None`` when there was nothing to persist for that table.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % cutoff
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop-invariant helpers for the code-19 branch, built once.
    number_pattern = re.compile(u'[\\d.]+', re.U)
    # Index in this list is the deduct_type:
    #   0: direct discount on the total price
    #   1: reduction of the item price
    #   2 (no keyword matched): anything else
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per tier so the error log below never shows unbound
                # or stale values from a previous tier.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                except Exception as e:
                    # Deliberate best effort: a bad tier (unparsable number,
                    # zero reach/price, ...) is logged and skipped.
                    logging.error("reach:%s, deduction:%s" %
                                  (reach, deduction))
                    logging.error(e)
                    continue

                # Amount that buying a single item at `price` would deduct.
                could_deduct = 0
                if price >= reach and reach > 0:
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    # NOTE(review): a max_deduction of 0 caps this at 0 --
                    # confirm whether 0 is meant as "no cap".
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                # A zero price would otherwise raise ZeroDivisionError here.
                single_discount_rate = could_deduct / price if price else 0.0
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1

            # "Buy N items, get a discount or a price reduction": classify by
            # the first keyword found in the promo text.
            deduct_type = len(type_word_list)
            for idx, type_word in enumerate(type_word_list):
                if content.find(type_word) >= 0:
                    deduct_type = idx
                    break

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            # Types 0 and 1 read two numbers, type 2 only the first one.
            needed = 2 if deduct_type in (0, 1) else 1
            if len(pts) < needed:
                logging.error(
                    "code 19: not enough numbers in promo content, "
                    "sku_id=%s, content=%s" % (sku_id, content))
                continue

            discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (ValueError, ZeroDivisionError) as e:
                # e.g. a lone "." matched by the pattern, or a reach of 0.
                logging.error(
                    "code 19: cannot parse promo numbers %s, sku_id=%s" %
                    (pts, sku_id))
                logging.error(e)
                continue

            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name, content,
                adurl, origin_dt
            ]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist only non-empty batches: num_cols is taken from the first row,
    # which does not exist for an empty list.
    pret15 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    pret19 = None
    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
예제 #18
0
        (select * from jd_item_comment_count_latest where CommentCount>=100) a
        left join
        jd_item_category b
        on a.SkuId = b.sku_id
        -- where a.dt > '2015-10-1'

        group by b.category_id
        ) e
        left JOIN
        jd_category c
        on e.category_id = c.id

    ''' %(today)
    return sql


if __name__ == "__main__":

    # Endless driver: run the category base-rating job, report how long it
    # took, then sleep for the number of seconds returned by
    # timeHelper.getTimeLeftTillTomorrow() before running it again.
    while True:
        print('job start: %s' % timeHelper.getNowLong())

        started = time.time()
        result = calculate_base_rating_for_categories()
        print(result)
        elapsed = time.time() - started
        print("Finished in seconds: %s" % elapsed)

        sleep_seconds = timeHelper.getTimeLeftTillTomorrow()
        print("now sleeping for hours: %s" % (sleep_seconds / 3600))

        time.sleep(sleep_seconds)