def generate_features(flag):
    """Build and pickle the feature frames for the split selected by ``flag``.

    Loads the training artifact, the four customer-history artifacts and the
    two input frames of the chosen split, fits the feature pipeline on the
    training frame, transforms both frames and saves the results as pickles.

    Args:
        flag: ``"test"`` to work on the full train / test files, or ``"val"``
            to work on the tr / val files.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    # Fail fast: an unknown flag used to print a message and then crash
    # further down on an unbound local variable.
    if flag not in ("test", "val"):
        raise ValueError(
            "flag not VALID: {!r} (expected 'test' or 'val')".format(flag)
        )

    if flag == "test":
        tr_artifact_file = FileNames.train_artifact
        hist_artifact_files = [
            FileNames.cust_train_artifact1,
            FileNames.cust_train_artifact2,
            FileNames.cust_train_artifact3,
            FileNames.cust_train_artifact4,
        ]
        tr_file = FileNames.train_v2
        te_file = FileNames.test_v2
        tr_save_file = FileNames.train_features_v1
        te_save_file = FileNames.test_features_v1
    else:  # flag == "val"
        tr_artifact_file = FileNames.tr_artifact
        hist_artifact_files = [
            FileNames.cust_tr_artifact1,
            FileNames.cust_tr_artifact2,
            FileNames.cust_tr_artifact3,
            FileNames.cust_tr_artifact4,
        ]
        tr_file = FileNames.tr_v2
        te_file = FileNames.val_v2
        tr_save_file = FileNames.tr_features_v1
        te_save_file = FileNames.val_features_v1

    tr_artifact = load_pickle(tr_artifact_file)
    hist_artifacts = [
        load_pickle(hist_file) for hist_file in hist_artifact_files
    ]

    # NOTE(review): the meaning of the argument 3 is not visible here
    # (presumably a feature-set version) -- confirm against get_feature_names.
    columns = get_feature_names(3)

    tr_data = load_pickle(tr_file)
    te_data = load_pickle(te_file)

    # The pipeline is constructed with both frames stacked together, but it is
    # fitted on the training frame only.
    all_data = pd.concat([tr_data, te_data])
    pipeline = get_feature_pipeline(tr_artifact, hist_artifacts, all_data)

    x_tr = pipeline.fit_transform(tr_data)
    x_te = pipeline.transform(te_data)

    x_tr = pd.DataFrame(x_tr, columns=columns)
    x_te = pd.DataFrame(x_te, columns=columns)

    x_tr[FieldNames.target] = tr_data[FieldNames.target].values
    # Only the "val" split copies the target onto the held-out frame
    # (presumably the real test file carries no labels -- confirm).
    if flag == "val":
        x_te[FieldNames.target] = te_data[FieldNames.target].values

    save_pickle(x_tr, tr_save_file)
    save_pickle(x_te, te_save_file)
def main():
    """Assemble the enriched train/test frames and pickle them.

    Steps, in order: read train and test; left-join campaign dates and
    demographics; convert the categorical demographic / campaign columns with
    ``map_to_float``; left-join per-coupon lists of item attributes; split the
    training frame into tr / val by campaign id; save all four frames.
    """

    def left_join(frame, extra, key):
        # Every enrichment step is the same left merge on a single key column.
        return pd.merge(frame, extra, on=key, how="left")

    print("Read train and test files")
    train, test = read_train_test()

    print("Read and map campaign start and end dates")
    campaign_data = read_csv(
        FileNames.campaign,
        parse_dates=[
            FieldNames.campaign_start_date,
            FieldNames.campaign_end_date,
        ],
        dayfirst=True,
    )
    train = left_join(train, campaign_data, "campaign_id")
    test = left_join(test, campaign_data, "campaign_id")

    print("Read and map demograhics data")
    demog_data = read_csv(FileNames.demogs)
    train = left_join(train, demog_data, "customer_id")
    test = left_join(test, demog_data, "customer_id")

    column_mappings = (
        (FieldNames.age_range, AGE_MAP),
        (FieldNames.marital_status, MARITAL_STATUS),
        (FieldNames.family_size, FAMILY_SIZE),
        (FieldNames.no_of_children, NO_OF_CHILDREN),
        (FieldNames.campaign_type, CAMPAIGN_TYPE),
    )
    for field, lookup in column_mappings:
        train[field] = map_to_float(train, field, lookup)
        test[field] = map_to_float(test, field, lookup)

    print("Read coupon and item details and merge them")
    coupon_data = read_csv(FileNames.coupon_item)
    item_data = read_csv(FileNames.item)
    coupon_data = left_join(coupon_data, item_data, "item_id")

    print("Map coupon details to train")
    # Collapse to one row per coupon, each attribute collected into a list.
    list_columns = ("item_id", "brand", "brand_type", "category")
    coupon_grouped = coupon_data.groupby("coupon_id").agg(
        {name: list for name in list_columns}
    )
    train = left_join(train, coupon_grouped, "coupon_id")
    test = left_join(test, coupon_grouped, "coupon_id")
    item_set_rename = {'item_id': FieldNames.item_set}
    train = train.rename(columns=item_set_rename)
    test = test.rename(columns=item_set_rename)

    print("split train --> tr and val")
    # Campaigns 11, 12 and 13 form the validation split; the rest is tr.
    is_val_campaign = train[FieldNames.campaign_id].isin([11, 12, 13])
    tr = train.loc[~is_val_campaign]
    val = train.loc[is_val_campaign]

    print("save as pickle")
    for frame, target_file in (
        (train, FileNames.train_v2),
        (test, FileNames.test_v2),
        (tr, FileNames.tr_v2),
        (val, FileNames.val_v2),
    ):
        save_pickle(frame, target_file)
def prepare_transactions():
    """Enrich the customer transactions and pickle full and validation copies.

    Reads the transaction file, left-joins item details, adds a
    percent-discount column (coupon discount divided by selling price) and a
    day-of-week column, then saves the whole frame as the "test" version and
    the rows dated on or before 2013-05-10 as the "val" version.
    """
    transactions = read_csv(
        FileNames.transaction, parse_dates=[FieldNames.transaction_date]
    )
    items = read_csv(FileNames.item)
    transactions = pd.merge(
        transactions, items, on=FieldNames.item_id, how="left"
    )

    coupon_discount = transactions[FieldNames.coupon_discount]
    selling_price = transactions[FieldNames.selling_price]
    transactions[FieldNames.pct_discount] = coupon_discount / selling_price

    transaction_dates = transactions[FieldNames.transaction_date]
    transactions[FieldNames.transaction_dayofweek] = (
        transaction_dates.dt.dayofweek
    )

    # The validation copy keeps only transactions up to the cutoff date.
    up_to_cutoff = transactions[FieldNames.transaction_date] <= "2013-05-10"
    validation_transactions = transactions.loc[up_to_cutoff]

    print("Saving to pickle")
    save_pickle(transactions, FileNames.transaction_test_v1)
    save_pickle(validation_transactions, FileNames.transaction_val_v1)
def save_transaction_artifact(flag):
    """Save artifacts for customer transactions with different conditions.

    Four artifacts are built from the same transaction frame, each from a
    different subset of rows:

    1. all transactions;
    2. transactions with a non-zero coupon discount;
    3. transactions with both a non-zero coupon discount and a non-zero
       other discount;
    4. transactions whose absolute coupon discount exceeds the absolute
       other discount.

    Each subset is passed through ``group_transactions`` and
    ``_get_transaction_artifact`` and the result is pickled.

    Args:
        flag: ``"test"`` to read the full transaction pickle and write the
            ``cust_train_artifact*`` files, or ``"val"`` to read the
            validation transaction pickle and write the ``cust_tr_artifact*``
            files.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    # Fail fast: an unknown flag used to print a message and then crash
    # further down on an unbound local variable.
    if flag not in ("test", "val"):
        raise ValueError(
            "flag not VALID: {!r} (expected 'test' or 'val')".format(flag)
        )

    if flag == "test":
        inp_file = FileNames.transaction_test_v1
        save_files = (
            FileNames.cust_train_artifact1,
            FileNames.cust_train_artifact2,
            FileNames.cust_train_artifact3,
            FileNames.cust_train_artifact4,
        )
    else:  # flag == "val"
        inp_file = FileNames.transaction_val_v1
        save_files = (
            FileNames.cust_tr_artifact1,
            FileNames.cust_tr_artifact2,
            FileNames.cust_tr_artifact3,
            FileNames.cust_tr_artifact4,
        )

    transactions = load_pickle(inp_file)

    # One row filter per artifact, in output order. ``None`` means "use every
    # row". The filters are callables so each mask is only computed when its
    # artifact is about to be built, as in the original sequential code.
    row_filters = (
        None,
        lambda df: np.abs(df[FieldNames.coupon_discount]) > 0,
        lambda df: (
            (np.abs(df[FieldNames.coupon_discount]) > 0)
            & (np.abs(df[FieldNames.other_discount]) > 0)
        ),
        lambda df: (
            np.abs(df[FieldNames.coupon_discount])
            > np.abs(df[FieldNames.other_discount])
        ),
    )

    numbered = enumerate(zip(save_files, row_filters), start=1)
    for number, (save_file, row_filter) in numbered:
        if row_filter is None:
            subset = transactions
        else:
            subset = transactions.loc[row_filter(transactions)]
        grouped = group_transactions(subset)
        artifact = _get_transaction_artifact(grouped)
        save_pickle(artifact, save_file)
        # Drop the intermediates before building the next artifact to keep
        # peak memory down.
        del subset, grouped, artifact
        print("Customer artifact {} done!".format(number))
def save_train_artifact(flag):
    """Create artifact using training data.

    Loads the training frame of the chosen split, wraps it in a
    ``HistoricalArtifact`` keyed by customer and campaign start date, and
    pickles the artifact.

    Args:
        flag: ``"test"`` to build the artifact from the full training file,
            or ``"val"`` to build it from the tr file.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    if flag == "test":
        inp_file = FileNames.train_v2
        save_file = FileNames.train_artifact
    elif flag == "val":
        inp_file = FileNames.tr_v2
        save_file = FileNames.tr_artifact
    else:
        # There was no fallback branch before, so an unknown flag crashed
        # below on an unbound local variable.
        raise ValueError(
            "flag not VALID: {!r} (expected 'test' or 'val')".format(flag)
        )

    tr = load_pickle(inp_file)
    tr_artifact = HistoricalArtifact(
        tr,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.campaign_start_date,
        key_fields=[
            FieldNames.campaign_id,
            FieldNames.coupon_id,
            FieldNames.target,
            FieldNames.item_category,
        ],
    )
    save_pickle(tr_artifact, save_file)