Example #1
def prepare_datasets(data_dir):
    deviceinfo = utils.prepare_device_related_datasets(data_dir)

    # Count number of events per hour for each device (ephpd)
    ephpd = utils.prepare_events_per_hour_per_device_dataset(data_dir)

    # Events spread over 6 windows/splits through the day
    esd = utils.prepare_events_spread_dataset(data_dir)

    # Read the training & test datasets
    train = utils.read_gz(data_dir, "gender_age_train.csv.gz")
    test = utils.read_gz(data_dir, "gender_age_test.csv.gz")

    # Merge train and test with the events per hour per device dataset, ephpd
    train = pd.merge(train, ephpd, how="left")
    test = pd.merge(test, ephpd, how="left")
    for col in list(ephpd.columns.values):
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

    # Merge train and test with the events spread dataset, esd
    train = pd.merge(train, esd, how="left")
    test = pd.merge(test, esd, how="left")
    for col in list(esd.columns.values):
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

    # Merge train and test with a subset of columns of the device info dataset
    df2 = deviceinfo[["device_id", "phone_brand_id", "is_foreign_brand",
                      "device_model_id"]].copy()
    df2 = df2.drop_duplicates(subset=["device_id"], keep="last")
    train = pd.merge(train, df2, how="left", on="device_id")
    test = pd.merge(test, df2, how="left", on="device_id")

    # Prepare the train and test datasets
    hour_of_day_cols = ["h" + str(x) for x in range(24)]
    cols_to_drop = list(hour_of_day_cols)
    test.drop(cols_to_drop, axis=1, inplace=True)
    test.fillna(-1, inplace=True)
    cols_to_drop.extend(["gender", "age"])
    train.drop(cols_to_drop, axis=1, inplace=True)
    # Extract the target labels ("group") before dropping the column
    target = train.group.values
    train = train.drop(["group"], axis=1)
    train.fillna(-1, inplace=True)
    logger.info("train.columns : {}".format(list(train.columns.values)))
    logger.info(train.head())
    return train, test, target
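
The function above returns plain pandas frames, so its output can be fed straight into a scikit-learn workflow. The snippet below is a hypothetical usage sketch, not part of the project: the "data/" path and the choice of LogisticRegression are illustrative assumptions.

# Hypothetical usage sketch (illustrative only; not from the project).
# Assumes the surrounding module context (utils, logger, pd, np) is available.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

train, test, target = prepare_datasets("data/")  # path is an assumption
feature_cols = [c for c in train.columns if c != "device_id"]
X_tr, X_val, y_tr, y_val = train_test_split(
    train[feature_cols], target, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_tr)
print("validation accuracy:", clf.score(X_val, y_val))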
Example #2
File: td2.py Project: nirmalyaghosh/kaggle
def prepare_datasets(data_dir):
    # Bag-of-Apps features based on
    # https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/
    # bag-of-app-id-python-2-27392/code

    # Read App Events
    app_events = utils.read_gz(data_dir, "app_events.csv.gz")
    app_events = app_events.groupby("event_id")["app_id"].apply(
        lambda x: " ".join(set("app_id:" + str(s) for s in x)))

    # Read Events
    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": str})
    events["app_id"] = events["event_id"].map(app_events)
    events = events.dropna()
    del app_events
    events = events[["device_id", "app_id"]]

    # One row per device: the de-duplicated, space-joined set of its app ids
    events = events.groupby("device_id")["app_id"].apply(
        lambda x: " ".join(set(" ".join(str(s) for s in x).split(" "))))
    events = events.reset_index(name="app_id")
    # Expand to multiple rows: one (device_id, app_id) pair per row
    events = pd.concat([pd.Series(row["device_id"], row["app_id"].split(" "))
                        for _, row in events.iterrows()]).reset_index()
    events.columns = ["app_id", "device_id"]

    # Read Phone Brand Device Model
    pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    # Read Train and Test
    train = pd.read_csv(os.path.join(data_dir, "gender_age_train.csv.gz"),
                        dtype={"device_id": str})
    train.drop(["age", "gender"], axis=1, inplace=True)
    test = pd.read_csv(os.path.join(data_dir, "gender_age_test.csv.gz"),
                       dtype={"device_id": str})
    test["group"] = np.nan

    # Encode the target classes ("group") as integers
    Y = train["group"]
    label_group = LabelEncoder()
    Y = label_group.fit_transform(Y)

    # Concat train and test,
    # before concatenating the features (phone_brand, device_model and app_id)
    df_all = pd.concat((train, test), axis=0, ignore_index=True)
    df_all = pd.merge(df_all, pbd, how="left", on="device_id")
    df_all["phone_brand"] = df_all["phone_brand"]\
        .apply(lambda x: "phone_brand:" + str(x))
    df_all["device_model"] = df_all["device_model"]\
        .apply(lambda x: "device_model:" + str(x))
    f1 = df_all[["device_id", "phone_brand"]]   # phone_brand
    f2 = df_all[["device_id", "device_model"]]  # device_model
    f3 = events[["device_id", "app_id"]]        # app_id
    del df_all
    # Rename the 2nd column of each frame to a common name
    f1 = f1.rename(columns={"phone_brand": "feature"})
    f2 = f2.rename(columns={"device_model": "feature"})
    f3 = f3.rename(columns={"app_id": "feature"})

    FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)
    FLS = FLS.reset_index()

    # User-Item Feature
    device_ids = FLS["device_id"].unique()
    feature_cs = FLS["feature"].unique()

    data = np.ones(len(FLS))
    device_id_enc = LabelEncoder().fit(FLS["device_id"])
    row = device_id_enc.transform(FLS["device_id"])
    col = LabelEncoder().fit_transform(FLS["feature"])
    sparse_matrix = sparse.csr_matrix((data, (row, col)),
                                      shape=(len(device_ids), len(feature_cs)))
    # Drop columns that are zero for every device
    sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]
    logger.info("sparse_matrix {}".format(sparse_matrix.shape))

    # Data Prep
    train_row = device_id_enc.transform(train["device_id"])
    train_sp = sparse_matrix[train_row, :]

    test_row = device_id_enc.transform(test["device_id"])
    test_sp = sparse_matrix[test_row, :]

    random_state = cfg["common"]["seed"]
    # `cv` is presumably sklearn.cross_validation (its import is not shown);
    # in scikit-learn >= 0.20 this lives in sklearn.model_selection instead
    X_train, X_val, y_train, y_val = cv.train_test_split(
        train_sp, Y, train_size=.80, random_state=random_state)

    # Feature Selection: keep the top 23% of features ranked by ANOVA F-score
    selector = SelectPercentile(f_classif, percentile=23)
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_val = selector.transform(X_val)
    train_sp = selector.transform(train_sp)
    test_sp = selector.transform(test_sp)
    logger.info("# Num of Features: {}".format(X_train.shape[1]))

    return X_train, X_val, y_train, y_val, test_sp
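
The core of Example #2 is the user-item matrix: every (device_id, feature) pair becomes a 1 in a sparse CSR matrix whose rows are devices and whose columns are features. Below is a minimal, self-contained sketch of that construction on toy data; the device ids and feature strings are made up for illustration.

# Minimal sketch of the user-item matrix technique used above.
# Toy data; the real code uses concatenated phone_brand/device_model/app_id rows.
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder

fls = pd.DataFrame({
    "device_id": ["d1", "d1", "d2", "d2", "d3"],
    "feature": ["phone_brand:a", "app_id:7", "phone_brand:b",
                "app_id:7", "phone_brand:a"],
})
row = LabelEncoder().fit_transform(fls["device_id"])  # device -> row index
col = LabelEncoder().fit_transform(fls["feature"])    # feature -> column index
m = sparse.csr_matrix((np.ones(len(fls)), (row, col)),
                      shape=(fls["device_id"].nunique(),
                             fls["feature"].nunique()))
print(m.toarray())
# [[1. 1. 0.]   d1: app_id:7, phone_brand:a
#  [1. 0. 1.]   d2: app_id:7, phone_brand:b
#  [0. 1. 0.]]  d3: phone_brand:a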