Exemplo n.º 1
0
def pnone_processing(path="../data/",
                     train_data_name="train_data",
                     test_data_name="test_data"):
    """Compute each user's "pnone" rate and merge it into train/test.

    pnone = fraction of a user's prior orders that contained zero
    reordered items (i.e. the order was entirely first-time products).

    Args:
        path: directory holding the saved train/test dataframes.
        train_data_name: base filename of the train dataframe.
        test_data_name: base filename of the test dataframe.

    Side effects: overwrites the saved train/test dataframes on disk.
    """
    # load the previously processed feature frames
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    print(df_train.shape)
    print(df_test.shape)

    # prior orders are needed to derive the per-user "none" rate
    priors, train, orders, products, aisles, departments = preprocessing_data()

    # pnone = (#prior orders with zero reordered items) / (#prior orders)
    df_user_pnone = pd.DataFrame(
        priors.groupby('user_id').apply(
            lambda user_orders:
                sum(user_orders.groupby('order_id').reordered.sum() == 0)
                / float(user_orders.order_id.unique().shape[0])),
        columns=['pnone'])

    df_train = df_train.merge(df_user_pnone, how='left',
                              left_on=['user_id'], right_index=True)
    df_test = df_test.merge(df_user_pnone, how='left',
                            left_on=['user_id'], right_index=True)

    # save the df_train, df_test
    print("save the processed data")
    # BUG FIX: previously saved to a hard-coded "../data/", ignoring the
    # `path` parameter used for loading; honor `path` so load and save
    # agree (the default value keeps old behavior unchanged).
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
Exemplo n.º 2
0
def extra_feature_processing(path="../data/",
                             train_data_name="train_data",
                             test_data_name="test_data"):
    """Add the per-(user, product) mean add-to-cart position feature.

    Computes, over the prior orders, the average `add_to_cart_order`
    for every (user_id, product_id) pair and merges it into the saved
    train/test dataframes.

    Args:
        path: directory holding the saved train/test dataframes.
        train_data_name: base filename of the train dataframe.
        test_data_name: base filename of the test dataframe.

    Side effects: overwrites the saved train/test dataframes on disk.
    """
    # load the previously processed feature frames
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    print(df_train.shape)
    print(df_test.shape)

    # prior orders supply the add_to_cart_order history
    priors, train, orders, products, aisles, departments = preprocessing_data()

    # mean cart position at which each user adds each product
    user_prod_avg_add_to_cart_order = pd.DataFrame(
        priors.groupby(['user_id', 'product_id'])
        .add_to_cart_order.agg(np.mean))
    user_prod_avg_add_to_cart_order.rename(
        columns={'add_to_cart_order': 'user_prod_avg_add_to_cart_order'},
        inplace=True)
    df_train = df_train.merge(user_prod_avg_add_to_cart_order, how='left',
                              left_on=['user_id', 'product_id'],
                              right_index=True)
    df_test = df_test.merge(user_prod_avg_add_to_cart_order, how='left',
                            left_on=['user_id', 'product_id'],
                            right_index=True)

    # order_dow processing (not implemented yet)
    # priors.groupby(['user_id','product_id']).apply(get_p_dow)

    # load timeline related data (not implemented yet)
    # timeline_data = get_timeline_data()

    # save the df_train, df_test
    print("save the processed data")
    # BUG FIX: previously saved to a hard-coded "../data/", ignoring the
    # `path` parameter used for loading; honor `path` so load and save
    # agree (the default value keeps old behavior unchanged).
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
Exemplo n.º 3
0
def get_processed_data(path="../data/",
                       train_data_name="train_data",
                       test_data_name="test_data"):
    """Return (df_train, df_test), using cached files when available.

    If both "<name>.csv.gz" files exist under `path` they are loaded;
    otherwise the data is processed from scratch and cached to `path`.

    Args:
        path: directory used for both the cache check and the save.
        train_data_name: base filename of the train dataframe.
        test_data_name: base filename of the test dataframe.

    Returns:
        (df_train, df_test) dataframes.
    """
    # os.path.join is robust to a missing trailing slash in `path`
    train_fn = os.path.join(path, train_data_name + ".csv.gz")
    test_fn = os.path.join(path, test_data_name + ".csv.gz")
    if os.path.isfile(train_fn) and os.path.isfile(test_fn):
        # cached copies exist -> just load them
        df_train = common.load_df(path, train_data_name)
        df_test = common.load_df(path, test_data_name)
    else:
        print("no data, start processing")
        df_train, df_test = processing_data()
        # BUG FIX: previously cached to a hard-coded "../data/" while the
        # existence check above used `path`, so a non-default `path` would
        # never find its own cache; save to `path` instead.
        common.save_df(df_train, path, train_data_name, index=False)
        common.save_df(df_test, path, test_data_name, index=False)
    return df_train, df_test
Exemplo n.º 4
0
def extra_reorder_rate_processing(path="../data/",
                                  train_data_name="train_data",
                                  test_data_name="test_data"):
    """Rebuild category features and reorder-rate features.

    Drops the previously (incorrectly) computed category/rate columns,
    re-merges user/product category data, recomputes reorder rates at
    the aisle, department, product-category, (user-category, product)
    and (user-category, product-category) levels, fixes dtypes, then
    splits back into train/test and saves them.

    Args:
        path: directory holding the saved train/test dataframes.
        train_data_name: base filename of the train dataframe.
        test_data_name: base filename of the test dataframe.

    Side effects: overwrites the saved train/test dataframes on disk.
    """

    def _reorder_rate(orders):
        # rate = (total reorders of each user/product pair) divided by
        # (total opportunities to reorder since first purchase); 0.0
        # when the group had no reorder opportunity at all.
        opportunities = sum(orders.user_prod_orders_since_first_ordered)
        if opportunities > 0:
            return sum(orders.user_prod_no_of_orders - 1) / float(opportunities)
        return 0.0

    def _add_group_rate(df, keys, col_name):
        # Compute the per-group reorder rate and merge it back as
        # `col_name`; `keys` may be a single column name or a list.
        rates = pd.DataFrame(df.groupby(keys).apply(_reorder_rate),
                             columns=[col_name])
        merged = df.merge(rates, how='left', left_on=keys, right_index=True)
        # sanity check: the rate must be constant within each group
        assert (merged.groupby(keys)[col_name]
                .apply(lambda x: x.unique().shape[0] == 1).sum()
                == merged.groupby(keys).ngroups)
        return merged

    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    df_all = pd.concat([df_train, df_test])
    assert (df_all.shape[0] == df_train.shape[0] + df_test.shape[0])

    print(df_all.shape)
    print(df_all.keys())

    # the old cat 50/100 columns were computed incorrectly — drop every
    # previously derived category/rate column and recompute from scratch
    drop_cols = [
        'aisle_reorder_rate', 'department_reorder_rate', 'prod_cat_20',
        'prod_cat_50', 'prod_cat_100', 'user_cat_20', 'user_cat_50',
        'user_cat_100', 'user_prod_match_20', 'user_prod_match_50',
        'user_prod_match_100', 'prod_cat_20_reorder_rate',
        'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    df_all.drop(drop_cols, axis=1, inplace=True, errors='ignore')
    print(df_all.shape)

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = \
        get_user_prod_cat_data()

    # merge category data
    df_all = df_all.merge(user_cat_data, how='left', on='user_id')
    df_all = df_all.merge(prod_cat_data, how='left', on='product_id')
    df_all = df_all.merge(user_prod_cat_match_data, how='left',
                          on=['user_id', 'product_id'])

    # aisle / department level reorder rates
    df_all = _add_group_rate(df_all, 'aisle_id', 'aisle_reorder_rate')
    df_all = _add_group_rate(df_all, 'department_id',
                             'department_reorder_rate')

    for num_topics in [20, 50, 100]:
        prod_cat_str = 'prod_cat_' + str(num_topics)
        user_cat_str = 'user_cat_' + str(num_topics)

        # reorder rate per product category
        df_all = _add_group_rate(df_all, prod_cat_str,
                                 prod_cat_str + '_reorder_rate')

        # for a given product, the reorder rate of all users belonging
        # to a particular user category
        df_all = _add_group_rate(df_all, [user_cat_str, 'product_id'],
                                 user_cat_str + '_prod_reorder_rate')

        # reorder rate per (user category, product category) pair
        df_all = _add_group_rate(
            df_all, [user_cat_str, prod_cat_str],
            user_cat_str + '_' + prod_cat_str + '_reorder_rate')

    # set dtypes: categorical columns
    category_cols = [
        'eval_set', 'prod_cat_20', 'prod_cat_50', 'prod_cat_100',
        'user_cat_20', 'user_cat_50', 'user_cat_100'
    ]
    for col in category_cols:
        df_all[col] = df_all[col].astype('category')

    # use float32 for the rate columns to save space
    rate_cols = [
        'aisle_reorder_rate', 'department_reorder_rate',
        'prod_cat_20_reorder_rate', 'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    for col in rate_cols:
        df_all[col] = df_all[col].astype('float32')

    print(df_all.shape)

    # split the train and test rows back out; .copy() avoids pandas
    # chained-assignment on a view when setting 'reordered' below
    df_train = df_all[df_all.eval_set == 'train'].copy()
    df_train['reordered'] = df_train['reordered'].astype(np.uint8)
    # the test rows have no label, so drop 'reordered' entirely
    df_test = (df_all[df_all.eval_set == 'test']).drop(['reordered'], axis=1)

    print(df_train.shape)
    print(df_test.shape)

    # save the df_train, df_test
    print("save the processed data")
    # BUG FIX: previously saved to a hard-coded "../data/", ignoring the
    # `path` parameter used for loading; honor `path` so load and save
    # agree (the default value keeps old behavior unchanged).
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
Exemplo n.º 5
0
            # NOTE(review): incomplete fragment — the enclosing function and
            # loop (defining num_topics, user_prods, id2prod, model_fn_prefix,
            # user_cat_name_prefix, prod_cat_name_prefix, df_user_prods,
            # products, start_time) live outside this chunk.
            # Fit an LDA topic model over the user -> product corpus.
            up_lda = LdaMulticore(corpus=user_prods,
                                  id2word=id2prod,
                                  workers=2,
                                  num_topics=num_topics,
                                  minimum_probability=1e-8,
                                  chunksize=2000,
                                  passes=1)

            # persist the fitted model, tagged with its topic count
            model_fn = model_fn_prefix + str(num_topics) + ".lda"
            print "save model into " + model_fn
            up_lda.save(model_fn)
            # debug reload model
            # up_lda = LdaModel.load(model_fn)

            # derive per-user category assignments from the model and save
            df_user_cat = get_user_cat(up_lda, df_user_prods)
            user_cat_name = user_cat_name_prefix + str(num_topics)
            print "save " + user_cat_name
            common.save_df(df_user_cat, "../data/", user_cat_name, index=False)

            # derive per-product category assignments from the model and save
            df_prod_cat = get_prod_cat(up_lda, products)
            prod_cat_name = prod_cat_name_prefix + str(num_topics)
            print "save " + prod_cat_name
            common.save_df(df_prod_cat, "../data/", prod_cat_name, index=False)

            print time.ctime()

        # report total wall-clock time for the whole run
        print "finished"
        print time.ctime()
        print time.time() - start_time
Exemplo n.º 6
0
        # NOTE(review): incomplete fragment — part of a debug/production
        # if/else whose header and continuation are outside this chunk.
        print "In debug mode, None-debug mode command : python -O " + __file__ + "\n\n"

        # the following is self test code for 10 users
        # load the data
        ten_user_orders = pd.read_csv("/tmp/ten_user_orders.csv.gz", compression='gzip')

        # test for get_user_corpus
        ten_user_corpus = get_user_corpus(ten_user_orders)  

        # check get_user_corpus result: the corpus product ids for one user
        # should match that user's raw product_id list (eyeball comparison)
        print "compare user_corpus"
        print ten_user_corpus[ten_user_corpus.user_id==202277].user_corpus.apply(lambda row: list(zip(*row)[0]))
        print ten_user_orders[ten_user_orders.user_id==202277].product_id.sort_values().tolist()

        # save corpus, then load it back to verify round-tripping
        common.save_df(ten_user_corpus,"/tmp/", "ten_user_corpus", index=False)
        # load corpus back    
        load_ten_user_corpus = common.load_df("/tmp/", "ten_user_corpus")
        print load_ten_user_corpus
    else:
        ### formal code
        IDIR = "../input/"
        priors, train, orders, products, aisles, departments = common.load_raw_data(IDIR)

        # only build corpus for priors which is used for cross-validation
        print('add order info to priors')
        # NOTE(review): DataFrame.set_index(..., inplace=True) returns None,
        # so this rebinds `orders` to None and the join below would fail —
        # likely bug: either drop inplace=True or keep `orders` unassigned.
        orders = orders.set_index('order_id', inplace=True, drop=False)
        priors = priors.join(orders, on='order_id', rsuffix='_')
        priors.drop('order_id_', inplace=True, axis=1)

        # NOTE(review): fragment truncated here — the body of this `if`
        # is outside the visible chunk.
        if "priors" in objects: