def gen_level_aggs(col, updata=False):
    feat_path = os.path.join(feats_root,'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        
        level_agg = None
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                if da_agg is None:
                    da_agg = agg
                else:
                    da_agg = da_agg.merge(agg, how='outer')
            if level_agg is None:
                level_agg = da_agg
            else: 
                level_agg = pd.concat([level_agg, da_agg], axis=0)
                level_agg.fillna(0, inplace=True)
                level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        level_agg, _ = reduce_mem_usage(level_agg)
        dump_pickle(level_agg, feat_path)
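# Hedged usage sketch (feats_root, level_cols and the helpers used above are
# assumed to come from this module; the column list below is illustrative):
#
#     for c in ['item_id', 'shop_id', 'item_brand_id']:
#         gen_level_aggs(c, updata=True)
#
# Each call builds 1/2/3-day window aggregates per day and caches them as
# level_aggs_<col>.pkl under feats_root.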
Example #2
def gen_buy_count(file_name='train'):

    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['user_id', 'item_id', 'item_brand_id', 'second_cate', 'shop_id']
    data_select = pd.DataFrame()
    if file_name == 'train':
        for col in cols:
            feature_str = col + '_buy_count'
            buy_all = None
            for day in data.day.unique():
                buy_filter = data.loc[data.day < day, [col, 'is_trade']]
                col_buy_count = buy_filter.groupby([col]).sum().iloc[:, 0]
                today_data = data.loc[data.day == day, [col]]
                today_data[feature_str] = today_data.apply(lambda x: \
                  col_buy_count[x[col]] if x[col] in col_buy_count.index else -1, axis=1)
                buy_all = pd.concat([buy_all, today_data], axis=0)
            data_select[feature_str] = buy_all[feature_str]
    else:
        train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
        for col in cols:
            feature_str = col + '_buy_count'
            buy_filter = train_data.loc[train_data.day <= 24,
                                        [col, 'is_trade']]
            col_buy_count = buy_filter.groupby([col]).sum().iloc[:, 0]

            data_select[feature_str] = data.apply(lambda x: \
               col_buy_count[x[col]] if x[col] in col_buy_count.index else -1, axis=1)
    dump_pickle(data_select, feature_data_path + file_name + '_buy_count')
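# Minimal sketch of the leak-free counting idea used in gen_buy_count, on a
# toy frame (values are made up; only the column names mirror the code above):
import pandas as pd

toy = pd.DataFrame({'user_id':  [1, 1, 2, 1],
                    'day':      [18, 19, 19, 20],
                    'is_trade': [1, 0, 1, 0]})
day = 20
prior = toy.loc[toy.day < day, ['user_id', 'is_trade']]
counts = prior.groupby('user_id')['is_trade'].sum()        # purchases strictly before `day`
today = toy.loc[toy.day == day, ['user_id']].copy()
today['user_id_buy_count'] = today['user_id'].map(counts).fillna(-1)
print(today)                                                # unseen ids would get -1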
def gen_user_feature_click_hour(update=True):
    """生成用户对所有分类属性的当前小时点击量

    file_name: user_(feature_id)_click_hour.pkl
    
    features:
        'user_item_id_click_hour',
        'user_item_brand_id_click_hour', 
        'user_context_page_id_click_hour', 
        'user_shop_id_click_hour',

    """

    data = load_pickle(raw_data_path + 'all_data.pkl')

    feature_list=['item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level', 
                  'item_collected_level', 'item_pv_level',
                  'context_page_id', 
                  'shop_id', 'shop_review_num_level', 'shop_star_level',]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + 'user_'+feature+'_click_hour.pkl'
        if os.path.exists(feature_path) and update == False:
            print('found '+feature_path)
        else:
            print('generating '+feature_path)
            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour', feature]).size().reset_index().rename(
                    columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
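# Toy illustration of the groupby-size pattern above (hypothetical rows, not
# the competition data): count a user's clicks per (day, hour, item).
import pandas as pd

toy = pd.DataFrame({'user_id': [1, 1, 1],
                    'day':     [7, 7, 7],
                    'hour':    [10, 10, 11],
                    'item_id': [100, 100, 200]})
clicks = (toy.groupby(['user_id', 'day', 'hour', 'item_id'])
             .size().reset_index()
             .rename(columns={0: 'user_item_id_click_hour'}))
print(clicks)   # the (1, 7, 10, 100) combination gets a count of 2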
def gen_feature_click_stats(update=True):
    """生成各个分类属性日点击量的统计特征

    file_name: (feature)_click_day_stats.pkl

    example:
        user_id_click_day_mean: average number of clicks per day for this user
        item_id_click_day_max: highest single-day click count for this item

    features:
        'user_id_click_day_mean', 'user_id_click_day_max', 'user_id_click_day_min', 
        'item_id_click_day_mean', 'item_id_click_day_max', 'item_id_click_day_min',
        'item_brand_id_click_day_mean', 'item_brand_id_click_day_max', 'item_brand_id_click_day_min', 
        'shop_id_click_day_mean', 'shop_id_click_day_max', 'shop_id_click_day_min',
        'context_page_id_click_day_mean', 'context_page_id_click_day_max', 'context_page_id_click_day_min',
        'category2_label_click_day_mean', 'category2_label_click_day_max', 'category2_label_click_day_min'
        

    """

    data = load_pickle(raw_data_path + 'all_data.pkl')

    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']

    for feature in tqdm(stats_feature):
        feature_path = feature_data_path + feature + '_click_day_stats.pkl'
        if os.path.exists(feature_path) and update == False:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_stats = gen_feature_click_day_stats(data, feature)
            print(feature_stats.columns)
            dump_pickle(feature_stats, feature_path)
Example #5
def gen_item_stats_feature(updata=False):
    feat_path = os.path.join(feats_root, 'item_click_stats.pkl')
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        dfal = get_nominal_dfal()
        dfal = add_item_total_da_click(dfal)
        dfal = add_item_da_feature_click(dfal)
        print('generating ' + feat_path)
        columns_da = list(
            filter(lambda x: x.endswith('_click_da'), dfal.columns.values))
        columns_ho = list(
            filter(lambda x: x.endswith('_click_ho'), dfal.columns.values))

        tbar = tqdm(columns_da)
        for col in tbar:
            tbar.set_description('add_item_click_stats ' + col)
            dfal = gen_item_click_stats(dfal, col)
        print('add_item_click_stats completed.')

        feat_names = list(
            filter(lambda x: '_click_da_' in x, dfal.columns.values))
        dfal = dfal[feat_names + ['item_id']].drop_duplicates(['item_id'])
        print('gen_item_stats_feature shape:', dfal.shape)
        dump_pickle(dfal, feat_path)
    print('gen_item_stats_feature completed.')
Example #6
def get_vae_vs_beta(epochs, model, model_name, data_dir, betas):

    for beta in betas:

        if beta > 1:
            beta_str = f"{int(beta):04d}"
        else:
            beta_str = f"{beta:.1f}"


        vae = model(dim_z=cf_latent_dim, dim_x=(cf_img_size,cf_img_size,3), 
                    learning_rate=0.0001, kl_weight=beta)
        #vae.compile(optimizer=vae.optimizer, loss = vae.partial_vae_loss)

        # always start from the "warmed up beta=1, 20 epochs weights"
        sv_path = os.path.join(data_dir,"0001")
        vae.load_model(sv_path, 20)  

        # vae = kcv.K_PCVAE(dim_z=cf_latent_dim, dim_x=(cf_img_size,cf_img_size,3), 
        #             learning_rate=0.0001, kl_weight=beta)

        vae.compile(optimizer=vae.optimizer, loss = vae.partial_vae_loss)

        train_history = vae.fit(train_dataset,epochs=epochs, 
                                verbose=0, validation_data=test_dataset)
                                #, initial_epoch = 11 )

        history = train_history.history
        ut.dump_pickle(os.path.join(data_dir,f"history{model_name}_{beta_str}.pkl"), (history,betas,epochs))
        sv_path = os.path.join(data_dir,f"{beta_str}")
        make_dir(sv_path)
        print('save model')
        vae.save_model(sv_path, epochs)
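# Hedged usage sketch (kcv.K_PCVAE, the train/test datasets and the cf_*
# constants are assumed to be defined elsewhere in this project, as the
# commented-out code above suggests; the data_dir path is hypothetical):
#
#     get_vae_vs_beta(epochs=40, model=kcv.K_PCVAE, model_name='K_PCVAE',
#                     data_dir='data/vae_runs', betas=[0.5, 1.0, 2, 4, 8])
#
# Every beta restarts from the warmed-up beta=1 weights, then saves its own
# history pickle and model directory named after beta_str.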
Example #7
def overtrain_vae(model, model_name, data_dir, params, epochs):
    make_dir(data_dir)

    vae = model(dim_z=params['z_dim'],
                dim_x=params['x_dim'],
                learning_rate=0.0001,
                kl_weight=params['kl_weight'])
    loss = vae.partial_vae_loss
    vae.compile(optimizer=vae.optimizer, loss=loss)

    train_history = vae.fit(train_dataset,
                            epochs=epochs,
                            verbose=1,
                            validation_data=test_dataset)
    #, initial_epoch = 11 )

    history = train_history.history
    filename = f"overtrain-{model_name}-kl_weight{params['kl_weight']:03d}.pkl"
    ut.dump_pickle(os.path.join(data_dir, filename), (history, params))

    sv_path = os.path.join(data_dir, f"kl_weight{params['kl_weight']:03d}")

    make_dir(sv_path)
    print('save model')
    vae.save_model(sv_path, epochs)
def denote_direction(input_path, output_dir):
    data_dsc = utils.load_pickle(input_path)
    data_dsc.loc[:, "fut_ret_direction"] = np.nan
    is_pos = data_dsc.loc[:, "fut_ret"] > 0
    data_dsc.loc[is_pos, "fut_ret_direction"] = 1
    data_dsc.loc[~is_pos, "fut_ret_direction"] = -1
    utils.dump_pickle(data_dsc, os.path.join(output_dir, "data_dsc.pkl"))
Example #9
def load_data(start_day=23, end_day=26, load_test=False):
    """
    Load the data assembled by joining the base tables.
    Set load_test=True for the test table.
    """
    if load_test == True:
        trans_train_path = feature_data_path + 'trans_test_' + str(
            start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'test.pkl'
    else:
        trans_train_path = feature_data_path + 'trans_train_' + str(
            start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'train.pkl'

    if os.path.exists(trans_train_path):
        print('found ' + trans_train_path)
        train = pickle.load(open(trans_train_path, 'rb'))
    else:
        print('generating ' + trans_train_path)
        train = load_pickle(raw_train_path)

        train = addTime(train)
        train = train[(train.clickDay >= start_day)
                      & (train.clickDay <= end_day)]
        train = addAd(train)
        train = addPosition(train)
        train = addAppCategories(train)
        train = addUserInfo(train)

        dump_pickle(train, trans_train_path)
    return train
Example #10
def gen_user_search_count(file_name):

    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data = data.loc[:, [
        'user_id', 'item_id', 'shop_id', 'day', 'hour', 'second_cate'
    ]]

    data_select = pd.DataFrame()

    # Aggregate: count searches per (user_id, day)
    user_day_search = data.groupby(['user_id', 'day']).count().iloc[:, 0]
    # Build a (user_id, day) key for each row and use it to look up the grouped counts
    x = data.loc[:, ('user_id', 'day')].values
    k = user_day_search.loc[[tuple(i) for i in x]]
    data_select['user_day_search'] = k.values

    user_hour_search = data.groupby(['user_id', 'day', 'hour']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour')].values
    k = user_hour_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_search'] = k.values

    user_day_item_search = data.groupby(['user_id', 'day',
                                         'item_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'item_id')].values
    k = user_day_item_search.loc[[tuple(i) for i in x]]
    data_select['user_day_item_search'] = k.values

    user_hour_item_search = data.groupby(['user_id', 'day', 'hour',
                                          'item_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'item_id')].values
    k = user_hour_item_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_item_search'] = k.values

    user_day_shop_search = data.groupby(['user_id', 'day',
                                         'shop_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'shop_id')].values
    k = user_day_shop_search.loc[[tuple(i) for i in x]]
    data_select['user_day_shop_search'] = k.values

    user_hour_shop_search = data.groupby(['user_id', 'day', 'hour',
                                          'shop_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'shop_id')].values
    k = user_hour_shop_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_shop_search'] = k.values

    user_day_cate_search = data.groupby(['user_id', 'day',
                                         'second_cate']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'second_cate')].values
    k = user_day_cate_search.loc[[tuple(i) for i in x]]
    data_select['user_day_cate_search'] = k.values

    user_hour_cate_search = data.groupby(
        ['user_id', 'day', 'hour', 'second_cate']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'second_cate')].values
    k = user_hour_cate_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_cate_search'] = k.values

    dump_pickle(data_select,
                feature_data_path + file_name + '_user_search_count')
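# The repeated pattern in gen_user_search_count broadcasts a groupby count
# back onto every row via a MultiIndex lookup. A minimal sketch with made-up data:
import pandas as pd

toy = pd.DataFrame({'user_id': [1, 1, 2],
                    'day':     [7, 7, 7],
                    'item_id': [100, 100, 200]})
counts = toy.groupby(['user_id', 'day']).count().iloc[:, 0]   # MultiIndex (user_id, day)
keys = toy.loc[:, ['user_id', 'day']].values
toy['user_day_search'] = counts.loc[[tuple(k) for k in keys]].values
print(toy)   # both rows of user 1 receive the same per-day count of 2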
Example #11
def main():
    parse = argparse.ArgumentParser()

    parse.add_argument('-A', '--A', type=float, default=0.55, help='A')
    parse.add_argument('-B', '--B', type=float, default=1.5, help='B')
    args = parse.parse_args()

    data_path = 'data/deeplearning_data/xml_data/'
    train_pid_label = load_pickle(data_path + 'train_label.pkl')

    index_label = load_pickle('data/baseline_data/xml_data/all_labels.pkl')
    baseline_inv_prop_file = 'data/baseline_data/xml_data/inv_prop.txt'

    train_label = list(train_pid_label.values())
    train_label = np.concatenate(train_label).tolist()
    label_frequency = dict(Counter(train_label))
    labels, fre = zip(*label_frequency.items())
    fre = np.array(fre)

    N = len(train_pid_label)
    C = (math.log(N) - 1) * (args.B + 1)**args.A
    inv_prop = 1 + C * (fre + args.B)**(-args.A)

    inv_prop_dict = dict(zip(labels, inv_prop.tolist()))
    dump_pickle(inv_prop_dict, data_path + 'inv_prop_dict.pkl')
    #
    # for baseline inv propensity
    with open(baseline_inv_prop_file, 'w') as df:
        for l_ in index_label[:-1]:
            df.write(str(inv_prop_dict[l_]))
            df.write('\n')
        df.write(str(inv_prop_dict[index_label[-1]]))
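# Worked sketch of the inverse-propensity weights computed in main() (the
# numbers below are made up; only the formula mirrors the code above):
import math
import numpy as np

A, B, N = 0.55, 1.5, 100000            # N = number of training points
fre = np.array([5000, 50, 1])          # hypothetical label frequencies
C = (math.log(N) - 1) * (B + 1) ** A
inv_prop = 1 + C * (fre + B) ** (-A)
print(inv_prop)                        # rare labels receive larger weights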
def gen_category_predict_rank(update=True):
    '''Generate the rank of the actual category within the predicted categories

    file_name: category_predict_rank.pkl

    features: category_predict_rank

    '''

    all_data = load_pickle(raw_data_path + 'all_data.pkl')

    feature_path = feature_data_path + 'category_predict_rank.pkl'
    if os.path.exists(feature_path) and update == False:
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        all_data['category_predict_rank'] = all_data.apply(
            lambda row: get_category_predict_rank(
                row['item_category_list'], row['predict_category_property']),
            axis=1)

        all_data['category_3'] = all_data.apply(
            lambda row: get_category_3_predict_rank(
                row['item_category_list'], row['predict_category_property']),
            axis=1)

        all_data = all_data[['category_predict_rank', 'category_3']]
        dump_pickle(all_data, feature_path)
def gen_property_sim(update=True):
    '''Generate the similarity between the actual properties and the predicted properties

    file_name: property_sim.pkl

    features: property_sim

    '''

    all_data = load_pickle(raw_data_path + 'all_data.pkl')

    feature_path = feature_data_path + 'property_sim.pkl'
    if os.path.exists(feature_path) and update == False:
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        all_data['property_sim'] = all_data.apply(
            lambda row: get_property_sim(row['item_category_list'], row[
                'item_property_list'], row['predict_category_property']),
            axis=1)

        all_data = all_data[[
            'property_sim',
        ]]
        dump_pickle(all_data, feature_path)
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):

    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_0'] + dfCvr['label_1'] + alpha * 2)
            #dfCvr['clickDay'] = day
            sub_data = pd.merge(data.loc[data.clickDay == day, ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']], 'left', on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']], feature_path)
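# Tiny numeric illustration of the smoothed historical CVR used above: with
# alpha = 0.25 the ratio is pulled toward 0.5 when the click counts are small.
alpha = 0.25
label_1, label_0 = 2, 8     # hypothetical conversions / non-conversions before `day`
cvr = (label_1 + alpha) / (label_0 + label_1 + alpha * 2)
print(cvr)                  # 2.25 / 10.5 ~= 0.214, versus the raw rate of 0.200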
def gen_user_feature_click_hour():
    """生成用户对所有分类属性的当前小时点击量
    """

    data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    feature_list = [
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
        'item_price_level_bin',
        'item_sales_level_bin',
        'item_property_topic_k_15',
    ]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)

            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
Example #16
def comparing_with_ground_truth(tops, txt_infos, k):
    utils.dump_pickle("result.pkl", tops)
    gt = utils.get_pickle("datasets/qst1_w4/gt_corresps.pkl")
    hypo = utils.get_pickle("result.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@ " + str(k) + " is " + str(mapAtK))

    bbs_gt = np.asarray(
        utils.get_groundtruth("datasets/qst1_w4/text_boxes.pkl")).squeeze()
    bbs_predicted = [[painting.boundingxy for painting in txt_info]
                     for txt_info in txt_infos]
    mean_iou = utils.get_mean_IoU(bbs_gt, bbs_predicted)
    print("Mean Intersection over Union: ", mean_iou)

    texts_gt = utils.get_gt_text("datasets/qst1_w4")
    texts_predicted = [[painting.text for painting in txt_info]
                       for txt_info in txt_infos]
    with open('results.txt', 'w') as f:
        for item in texts_predicted:
            f.write("%s\n" % item)
    mean_lev = utils.compute_lev(texts_gt, texts_predicted)
    print(texts_predicted)
    print("\n")
    print(texts_gt)
    print("Mean Levenshtein distance: ", mean_lev)
Example #17
def gen_bow_features_for_pool(pools, pool_idx, image_dir, feature_file_dir):
    train_files = []
    val_files = []
    test_files = []
    pool = pools['data'][str(pool_idx)]
    for path in pool['train_files']:
        full_path = os.path.join(image_dir, path)
        train_files.append(full_path)
    for path in pool['val_files']:
        full_path = os.path.join(image_dir, path)
        val_files.append(full_path)
    for path in pool['test_files']:
        full_path = os.path.join(image_dir, path)
        test_files.append(full_path)

    train_bow_features, val_bow_features, test_bow_features = extract_BOW_features(
        train_files, val_files, test_files)

    features = {
        'train_features': train_bow_features,
        'val_features': val_bow_features,
        'test_features': test_bow_features,
        'pool_idx': pool_idx
    }

    filepath = get_feature_file_path(pools, pool_idx, feature_file_dir)
    dump_pickle(features, filepath)
    return
def gen_tricks(start_day, end_day):
    """
    Generate trick, first_diff, last_diff and install2click features; joined on global_index
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
Example #19
def gen_feature_click_day_hour(update=True):
    '''
    Compute click counts for feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'user_gender_id',
                                        'context_page_id', 'user_occupation_id', 'user_age_level'],
    counted per day and per hour.

    File name: [feature]_click_hour.pkl
    '''

    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    for feature in tqdm([
            'user_id', 'item_id', 'item_brand_id', 'category2_label',
            'category3_label', 'context_page_id', 'shop_id',
            'item_property_topic_k_15'
    ]):
        feature_path = feature_data_path + '_2_7_' + feature + '_click_day_hour.pkl'  # output path
        if os.path.exists(feature_path) and update == False:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_click_day_hour = all_data.groupby(
                [feature, 'day', 'hour']).size().reset_index().rename(
                    columns={0: feature + '_click_hour'})
            dump_pickle(feature_click_day_hour, feature_path)  # save
Example #20
def linear_regression(train, test, cols_x, col_y, output_dir):
    reg = LinearRegression()
    X_train = train.loc[:, cols_x].copy()
    y_train = train.loc[:, col_y].copy()
    X_test = test.loc[:, cols_x].copy()
    y_test = test.loc[:, col_y].copy()

    X_train.replace(np.nan, 0.0, inplace=True)
    X_test.replace(np.nan, 0.0, inplace=True)

    X_train = X_train.abs()
    y_train = y_train.abs()
    X_test = X_test.abs()
    y_test = y_test.abs()

    # res = np.correlate(X_train.loc[:, cols_x[0]], X_train.loc[:, cols_x[1]])
    # print(res)

    reg.fit(X_train, y_train)
    print(reg.coef_)
    yhat_test = reg.predict(X_test)
    utils.dump_pickle(yhat_test, os.path.join(output_dir,
                                              "scale_yhat_test.pkl"))
    print("insample R2: ", reg.score(X_train, y_train))
    print("outofsample R2: ", reg.score(X_test, y_test))
Example #21
def gen_user_start_installed_cateA():
    """
    Count each user's pre-installed apps per top-level app category.
    Join key: ['userID']
    """
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100
                                                    if x > 100 else x)
    user_install = user_install.merge(app_cate, 'left', 'appID')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).sum()
            dump_pickle(user_install_cate, feature_path)
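# Quick check of the appCategory -> top-level category mapping used above
# (the category codes here are made up): three-digit codes collapse to their
# leading digit, small codes are kept as-is.
for code in [0, 2, 104, 209, 503]:
    print(code, '->', code // 100 if code > 100 else code)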
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            #continue
        print('generating ' + feature_path)
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def run_graph_generate(args):
  print("graph_generate")
  generate_args_base = {
    "sizes": args["cluster_sizes"],
    "p_in": args["p"],
    "p_out": args["q"],
    "seed": args["seed"],
    "cull_disconnected": args["cull_disconnected"],
    "connect_disconnected": args["connect_disconnected"],
    "generator_type": args["generator_type"],
    "out_path": None,
    "visualize": False,
  }

  graphs = {}
  num_graphs = args["num_graphs"]
  for i in range(num_graphs):
    if i == 0 or (i+1) % 100 == 0 or i == (num_graphs - 1):
      print(f"graph {i+1}/{num_graphs}")

    generate_args = generate_args_base.copy()

    if args.get('verbose', False):
      print(f"{i}: {generate_args}")

    graph = generate_appm.main(generate_args)
    graph_id = str(uuid.uuid4())
    graphs[graph_id] = {'args': generate_args, 'graph': graph }

  out_path = (f"{args['results_base']}"
              f"/{datetime.now().strftime(TIMESTAMP_FORMAT)}.pk")

  dump_pickle(graphs, out_path)
def gen_hist_cvr_smooth(start_da, end_da, key, alpha=0.25):
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da <= end_da, [key, 'da', 'is_trade']]
    gc.collect()
    for da in tqdm(np.arange(start_da, end_da + 1)):
        feat_path = os.path.join(
            feats_root, key + '_hist_cvr_smooth_da_' + str(da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            dfcv = dfal.copy().loc[dfal.da < da]
            dfcv.is_trade = dfcv.is_trade.apply(int)
            dfcv = pd.get_dummies(dfcv, columns=['is_trade'], prefix='label')
            dfcv = dfcv.groupby([key], as_index=False).sum()
            dfcv[key + '_cvr'] = (dfcv['label_1'] + alpha) / (
                dfcv['label_0'] + dfcv['label_1'] + alpha * 2)
            result = pd.merge(dfal.loc[dfal.da == da, ['da', key]],
                              dfcv.loc[:, [key, key + '_cvr']],
                              'left',
                              on=[
                                  key,
                              ])
            result.drop_duplicates(['da', key], inplace=True)
            result.sort_values(['da', key], inplace=True)
            dump_pickle(result.loc[:, ['da', key, key + '_cvr']], feat_path)
def gen_dfal():
    dump_nominal_file = os.path.join(utils.cache_root, 'dfda_nominal.pkl')
    dump_textual_file = os.path.join(utils.cache_root, 'dfda_textual.pkl')
    if not os.path.exists(dump_nominal_file):
        tr = pd.read_csv('./input/round1_ijcai_18_train_20180301.txt',
                         sep=' ',
                         dtype={'is_trade': np.uint8})
        tr.is_trade = tr.is_trade.astype(np.int8)
        te = pd.read_csv('./input/round1_ijcai_18_test_b_20180418.txt',
                         sep=' ')
        da = pd.concat([tr, te], axis=0)
        da = utils.add_time_fields(da)

        for col in utils.nominal_cate_cols + utils.identity_cols:
            da[col] = LabelEncoder().fit_transform(da[col])

        for col in utils.ordinal_cate_cols:
            levels = sorted(da[col].unique())
            da[col] = da[col].apply(lambda x: levels.index(x)).astype(np.uint8)

        del da['context_id']
        del da['context_timestamp']
        del da['ts']
        da, _ = utils.reduce_mem_usage(da)
        utils.dump_pickle(da[utils.textual_cols], dump_textual_file)
        utils.dump_pickle(da.drop(utils.textual_cols, axis=1),
                          dump_nominal_file)
    print('gen dfal ok.')
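# Minimal sketch of the two encodings applied in gen_dfal (toy values; the real
# column lists live in utils.nominal_cate_cols / utils.ordinal_cate_cols):
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({'shop_id': ['s9', 's2', 's9'],
                    'user_star_level': [3002, 3000, 3001]})
toy['shop_id'] = LabelEncoder().fit_transform(toy['shop_id'])   # arbitrary dense ids
levels = sorted(toy['user_star_level'].unique())
toy['user_star_level'] = toy['user_star_level'].apply(levels.index).astype(np.uint8)  # rank order
print(toy)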
Example #26
def gen_user_search_time(file_name):
    '''
    Time gap between the current search and, within the same day:
      - the first / last search for this item
      - the first / last search for this shop
      - the first / last search for this brand
      - the first / last search for this category
    '''
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    
    cols = ['item_id','shop_id', 'item_brand_id','second_cate']
    for col in cols:
        data_filter = data[['user_id', col,'day','context_timestamp']].groupby(['user_id', col,'day'])
        max_time = data_filter.agg(max)
        min_time = data_filter.agg(min)
        x = data.loc[:, ('user_id', col, 'day')].values
        m = max_time.loc[[tuple(i) for i in x]]
        n = min_time.loc[[tuple(i) for i in x]]
        data_select['sub_maxtime_'+col] = data['context_timestamp'].values - np.squeeze(m.values)
        data_select['sub_mintime_'+col] = data['context_timestamp'].values - np.squeeze(n.values)
        
        data_select['sub_maxtime_'+col] = data_select['sub_maxtime_'+col].apply(lambda x: x.total_seconds())
        data_select['sub_mintime_'+col] = data_select['sub_mintime_'+col].apply(lambda x: x.total_seconds())
    dump_pickle(data_select, feature_data_path +file_name + '_user_search_time')
Example #27
def gen_user_basic_info(file_name='train', test_day=24):
    data_select = pd.DataFrame()

    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data_select['user_id'] = data['user_id']
    data_select['user_gender_id'] = data['user_gender_id']
    data_select['user_age_level'] = data['user_age_level']
    data_select['user_occupation_id'] = data['user_occupation_id']
    data_select['user_star_level'] = data['user_star_level']

    # Split the search time of day into morning / afternoon / evening / before dawn
    data_select['is_morning'] = (data['hour'].values >=
                                 8) & (data['hour'].values <= 12)
    data_select['is_afternoon'] = (data['hour'].values >
                                   12) & (data['hour'].values <= 17)
    data_select['is_evening'] = (data['hour'].values >
                                 17) & (data['hour'].values <= 23)
    data_select['is_before_dawn'] = (data['hour'].values < 8)

    if file_name == 'train':
        '''
        Keep is_trade here for the later sampling step; remember to drop it before training.
        '''
        data_select['is_trade'] = data['is_trade']
    dump_pickle(data_select,
                feature_data_path + file_name + '_user_basic_info')
Example #28
def add_context_cate(data):

    # Count the concatenated category-property combinations
    context_cate_cols_path = raw_data_path + 'context_cate_cols.pkl'
    if os.path.exists(context_cate_cols_path):
        print("found " + context_cate_cols_path)
        cols = load_pickle(context_cate_cols_path)
        cols = list(map(lambda x: x[0], cols))
    else:
        #cate_dict, cate_cnt, _, _ = search_category_explore(data)
        cols = gen_sorted_search_cate_property(data)
        cols = list(map(lambda x: x[0], cols))
        dump_pickle(cols, context_cate_cols_path)

    feature_path = feature_data_path + 'context_cate_property_feat.pkl'
    data.cate_cols = data.predict_category_property.apply(
        lambda x: str_to_cate_cols(x))
    col_index = 0
    # Whether the current item's concatenated category-property string is among the top 300
    for col in tqdm(cols[:300]):
        data[col] = data.cate_cols.apply(lambda x: 1 if col in x else 0)
        #if col_index % 200 == 0 and col_index > 100:
        #    dump_pickle(data[['instance_id']+cols[:col_index+1]], feature_path)
        col_index += 1
    dump_pickle(data[['instance_id'] + cols[:300]], feature_path)
    return data
Example #29
def gen_shop_da_feature_click(updata=False):
    """生成用户相关所有数据的每天点击统计量"""
    dfal = get_nominal_dfal()
    stats_feat = [
        'item_category_list', 'item_brand_id', 'item_city_id',
        'user_gender_id', 'user_occupation_id', 'item_price_level',
        'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_age_level', 'user_star_level', 'context_page_id', 'item_id',
        'user_id'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'shop_' + feat + '_click_da.pkl')
        if os.path.exists(feat_path) and updata == False:
            tbar.set_description('Found {:>60}'.format(
                os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(
                os.path.basename(feat_path)))
            shop_feat_click_da = dfal.groupby(
                ['shop_id', 'da', feat]).size().reset_index().rename(
                    columns={0: 'agg_shop_%s_click_da' % feat})
            dump_pickle(shop_feat_click_da, feat_path)

    print('gen_shop_da_feature_click completed.')
Example #30
def gen_category(file_name='train'):
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    
    item_cate_col = list(data.item_category_list)
    item_cate = list(map(lambda x: x.split(';'), item_cate_col))
    data['second_cate'] = list(map(lambda x: x[1], item_cate))
    
    dump_pickle(data, path=raw_data_path + file_name + '.pkl')
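# Tiny illustration of the second-level category extraction above (the
# item_category_list format is assumed to be semicolon separated):
s = 'cate_root;cate_second;cate_third'
print(s.split(';')[1])   # -> 'cate_second'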