예제 #1
0
def generate_features(flag):
    """Build and pickle the train/test feature frames for one data split.

    Loads the training artifact, the four customer-history artifacts and the
    prepared train/test pickles for the chosen split, builds the feature
    pipeline, fits it on the training rows, transforms both sets and saves
    the two resulting DataFrames.

    Args:
        flag: ``"test"`` to pair ``train_v2`` with ``test_v2``, or ``"val"``
            to pair ``tr_v2`` with ``val_v2``.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    if flag == "test":
        tr_artifact_file = FileNames.train_artifact
        hist_artifact_files = [
            FileNames.cust_train_artifact1,
            FileNames.cust_train_artifact2,
            FileNames.cust_train_artifact3,
            FileNames.cust_train_artifact4,
        ]
        tr_file = FileNames.train_v2
        te_file = FileNames.test_v2
        tr_save_file = FileNames.train_features_v1
        te_save_file = FileNames.test_features_v1
    elif flag == "val":
        tr_artifact_file = FileNames.tr_artifact
        hist_artifact_files = [
            FileNames.cust_tr_artifact1,
            FileNames.cust_tr_artifact2,
            FileNames.cust_tr_artifact3,
            FileNames.cust_tr_artifact4,
        ]
        tr_file = FileNames.tr_v2
        te_file = FileNames.val_v2
        tr_save_file = FileNames.tr_features_v1
        te_save_file = FileNames.val_features_v1
    else:
        # Fail fast: merely printing here would let execution continue and
        # crash below with an unrelated UnboundLocalError.
        raise ValueError(
            "flag must be 'test' or 'val', got {!r}".format(flag))

    tr_artifact = load_pickle(tr_artifact_file)
    hist_artifacts = [
        load_pickle(hist_file) for hist_file in hist_artifact_files
    ]
    columns = get_feature_names(3)
    tr_data = load_pickle(tr_file)
    te_data = load_pickle(te_file)

    # The pipeline is constructed with the concatenation of both sets, but it
    # is fitted on the training rows only.
    all_data = pd.concat([tr_data, te_data])
    pipeline = get_feature_pipeline(tr_artifact, hist_artifacts, all_data)

    x_tr = pipeline.fit_transform(tr_data)
    x_te = pipeline.transform(te_data)

    x_tr = pd.DataFrame(x_tr, columns=columns)
    x_te = pd.DataFrame(x_te, columns=columns)
    x_tr[FieldNames.target] = tr_data[FieldNames.target].values
    # Only the validation split gets the target copied onto the held-out
    # frame. NOTE(review): presumably the test set has no target column --
    # confirm.
    if flag == "val":
        x_te[FieldNames.target] = te_data[FieldNames.target].values
    save_pickle(x_tr, tr_save_file)
    save_pickle(x_te, te_save_file)
def main():
    """Enrich train and test with side tables, split validation, and pickle.

    Steps, in order: read train/test; left-join campaign dates; left-join
    demographics and convert the categorical columns through their mapping
    tables; left-join per-coupon lists of item attributes; rename the item
    list column; carve campaigns 11, 12 and 13 out of train as the validation
    set; save all four frames.
    """

    def _left_join(frame, extra, key):
        # Every enrichment step is the same left join on a single key.
        return pd.merge(frame, extra, on=key, how="left")

    print("Read train and test files")
    train, test = read_train_test()

    print("Read and map campaign start and end dates")
    campaign_data = read_csv(
        FileNames.campaign,
        parse_dates=[
            FieldNames.campaign_start_date,
            FieldNames.campaign_end_date,
        ],
        dayfirst=True,
    )
    train = _left_join(train, campaign_data, "campaign_id")
    test = _left_join(test, campaign_data, "campaign_id")

    print("Read and map demograhics data")
    demog_data = read_csv(FileNames.demogs)
    train = _left_join(train, demog_data, "customer_id")
    test = _left_join(test, demog_data, "customer_id")

    column_maps = (
        (FieldNames.age_range, AGE_MAP),
        (FieldNames.marital_status, MARITAL_STATUS),
        (FieldNames.family_size, FAMILY_SIZE),
        (FieldNames.no_of_children, NO_OF_CHILDREN),
        (FieldNames.campaign_type, CAMPAIGN_TYPE),
    )
    for column, lookup in column_maps:
        train[column] = map_to_float(train, column, lookup)
        test[column] = map_to_float(test, column, lookup)

    print("Read coupon and item details and merge them")
    coupon_data = read_csv(FileNames.coupon_item)
    item_data = read_csv(FileNames.item)
    coupon_data = _left_join(coupon_data, item_data, "item_id")

    print("Map coupon details to train")
    # One row per coupon, each attribute collected into a Python list.
    list_columns = ["item_id", "brand", "brand_type", "category"]
    coupon_grouped = coupon_data.groupby("coupon_id").agg(
        dict.fromkeys(list_columns, list)
    )
    train = _left_join(train, coupon_grouped, "coupon_id")
    test = _left_join(test, coupon_grouped, "coupon_id")

    item_rename = {'item_id': FieldNames.item_set}
    train = train.rename(columns=item_rename)
    test = test.rename(columns=item_rename)

    print("split train --> tr and val")
    in_validation = train[FieldNames.campaign_id].isin([11, 12, 13])
    val = train.loc[in_validation]
    tr = train.loc[~in_validation]

    print("save as pickle")
    save_pickle(train, FileNames.train_v2)
    save_pickle(test, FileNames.test_v2)
    save_pickle(tr, FileNames.tr_v2)
    save_pickle(val, FileNames.val_v2)
def prepare_transactions():
    """Prepare the customer transaction tables for the test and val flows.

    Reads the transaction log, left-joins item details, adds a percentage
    discount column (coupon discount divided by selling price) and the
    transaction day of week, then pickles two frames: the full table and the
    subset with transaction dates up to and including 2013-05-10.
    """
    raw_transactions = read_csv(
        FileNames.transaction,
        parse_dates=[FieldNames.transaction_date],
    )
    item_details = read_csv(FileNames.item)
    cust_transact = pd.merge(
        raw_transactions, item_details, on=FieldNames.item_id, how="left"
    )

    coupon_discount = cust_transact[FieldNames.coupon_discount]
    selling_price = cust_transact[FieldNames.selling_price]
    cust_transact[FieldNames.pct_discount] = coupon_discount / selling_price

    transaction_dates = cust_transact[FieldNames.transaction_date]
    cust_transact[FieldNames.transaction_dayofweek] = (
        transaction_dates.dt.dayofweek
    )

    # Rows on or before the cutoff date form the validation-time history.
    on_or_before_cutoff = (
        cust_transact[FieldNames.transaction_date] <= "2013-05-10"
    )
    cust_transact_tr = cust_transact.loc[on_or_before_cutoff]

    print("Saving to pickle")
    save_pickle(cust_transact, FileNames.transaction_test_v1)
    save_pickle(cust_transact_tr, FileNames.transaction_val_v1)
def save_transaction_artifact(flag):
    """Save artifacts for customer transactions under four filter conditions.

    The prepared transaction table for the chosen split is grouped and turned
    into an artifact four times:

    1. all transactions;
    2. transactions with a non-zero coupon discount;
    3. transactions with both a non-zero coupon and a non-zero other discount;
    4. transactions whose coupon discount exceeds the other discount in
       absolute value.

    Args:
        flag: ``"test"`` to read ``transaction_test_v1`` and write the
            ``cust_train_artifact*`` files, or ``"val"`` to read
            ``transaction_val_v1`` and write the ``cust_tr_artifact*`` files.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    if flag == 'test':
        inp_file = FileNames.transaction_test_v1
        save_file1 = FileNames.cust_train_artifact1
        save_file2 = FileNames.cust_train_artifact2
        save_file3 = FileNames.cust_train_artifact3
        save_file4 = FileNames.cust_train_artifact4
    elif flag == 'val':
        inp_file = FileNames.transaction_val_v1
        save_file1 = FileNames.cust_tr_artifact1
        save_file2 = FileNames.cust_tr_artifact2
        save_file3 = FileNames.cust_tr_artifact3
        save_file4 = FileNames.cust_tr_artifact4
    else:
        # Fail fast: merely printing here would let execution continue and
        # crash below with an unrelated UnboundLocalError.
        raise ValueError(
            "flag must be 'test' or 'val', got {!r}".format(flag))

    transactions = load_pickle(inp_file)

    # Artifact 1: every transaction.
    transactions_grp = group_transactions(transactions)
    artifact = _get_transaction_artifact(transactions_grp)
    save_pickle(artifact, save_file1)
    # Intermediates are dropped after each stage before building the next.
    del artifact, transactions_grp
    print("Customer artifact 1 done!")

    # Artifact 2: a coupon discount was applied.
    transactions2 = transactions.loc[
        np.abs(transactions[FieldNames.coupon_discount]) > 0]
    transactions_grp2 = group_transactions(transactions2)
    artifact = _get_transaction_artifact(transactions_grp2)
    save_pickle(artifact, save_file2)
    del transactions2, transactions_grp2, artifact
    print("Customer artifact 2 done!")

    # Artifact 3: both a coupon discount and another discount were applied.
    transactions3 = transactions.loc[
        (np.abs(transactions[FieldNames.coupon_discount]) > 0)
        & (np.abs(transactions[FieldNames.other_discount]) > 0)]
    transactions_grp3 = group_transactions(transactions3)
    artifact = _get_transaction_artifact(transactions_grp3)
    save_pickle(artifact, save_file3)
    del transactions3, transactions_grp3, artifact
    print("Customer artifact 3 done!")

    # Artifact 4: the coupon discount is larger in magnitude than the other
    # discount.
    transactions4 = transactions.loc[(np.abs(
        transactions[FieldNames.coupon_discount]) > np.abs(
            transactions[FieldNames.other_discount]))]
    transactions_grp4 = group_transactions(transactions4)
    artifact = _get_transaction_artifact(transactions_grp4)
    save_pickle(artifact, save_file4)
    del transactions4, transactions_grp4, artifact
    print("Customer artifact 4 done!")
def save_train_artifact(flag):
    """Create and pickle the historical artifact built from training data.

    Loads the training frame for the chosen split, wraps it in a
    ``HistoricalArtifact`` keyed by customer and campaign start date, and
    saves the result.

    Args:
        flag: ``"test"`` to read ``train_v2`` and write ``train_artifact``,
            or ``"val"`` to read ``tr_v2`` and write ``tr_artifact``.

    Raises:
        ValueError: If ``flag`` is neither ``"test"`` nor ``"val"``.
    """
    if flag == 'test':
        inp_file = FileNames.train_v2
        save_file = FileNames.train_artifact
    elif flag == 'val':
        inp_file = FileNames.tr_v2
        save_file = FileNames.tr_artifact
    else:
        # Without this branch an unknown flag would only surface below as an
        # UnboundLocalError on ``inp_file``.
        raise ValueError(
            "flag must be 'test' or 'val', got {!r}".format(flag))

    tr = load_pickle(inp_file)
    tr_artifact = HistoricalArtifact(
        tr,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.campaign_start_date,
        key_fields=[
            FieldNames.campaign_id,
            FieldNames.coupon_id,
            FieldNames.target,
            FieldNames.item_category,
        ],
    )
    save_pickle(tr_artifact, save_file)