Example #1
def prepare_device_related_datasets(data_dir):
    logger.info("Preparing device related datasets")
    deviceinfo = read_gz(data_dir, "phone_brand_device_model.csv.gz")
    # Get rid of duplicate device ids
    deviceinfo = deviceinfo.drop_duplicates("device_id", keep="first")

    # Extract the phone brand names - translate Chinese to English
    file_path = os.path.join(data_dir, "phone_brands_map.txt")
    if not os.path.exists(file_path):
        phone_brands = pd.unique(deviceinfo.phone_brand.ravel()).tolist()
        phone_brands_map = dict(zip(phone_brands, [None] * len(phone_brands)))
        cols = ["phone_brand", "phone_brand_translated"]
        phone_brands = pd.DataFrame(list(phone_brands_map.items()),
                                    columns=cols)
        phone_brands["is_foreign_brand"] = False  # Needs to be hand coded
        phone_brands.to_csv(file_path, encoding="utf-8-sig", index=False,
                            sep="\t")
    else:
        phone_brands = pd.read_csv(file_path, encoding="utf-8-sig",
                                   index_col=False, sep="\t")

    # Convert the index into a column and rename it brand ID
    phone_brands.reset_index(level=0, inplace=True)
    phone_brands.rename(columns={"index": "phone_brand_id"}, inplace=True)

    # Some device_model values (such as S6, T5, T9, X5, X6, etc.) are
    # associated with more than one phone_brand,
    # so concatenate phone_brand and device_model and then encode the pair
    m_d = deviceinfo.phone_brand.str.cat(deviceinfo.device_model)
    le = preprocessing.LabelEncoder().fit(m_d)
    deviceinfo["device_model_id"] = le.transform(m_d)

    # Merge device info with phone brands
    deviceinfo = pd.merge(deviceinfo, phone_brands)
    return deviceinfo
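
The brand/model concatenation trick above is easier to see on a toy frame; a minimal sketch with made-up rows, since model names such as "S6" can appear under several brands:

import pandas as pd
from sklearn import preprocessing

toy = pd.DataFrame({"phone_brand": ["samsung", "vivo", "samsung"],
                    "device_model": ["S6", "S6", "Note4"]})
# Encoding device_model alone would give both "S6" rows the same id,
# so the brand and model strings are joined first, then label-encoded.
pair = toy.phone_brand.str.cat(toy.device_model)
le = preprocessing.LabelEncoder().fit(pair)
toy["device_model_id"] = le.transform(pair)
print(toy)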
Example #2
def report_grid_search_scores(grid_scores, n_top=5):
    # Utility function to report best scores
    # Credit : http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
    top_scores = sorted(grid_scores, key=operator.itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        rank, mvs = (i + 1), score.mean_validation_score
        logger.info("Model rank {0}, mean validation score {1:.3f}, "\
                    "parameters : {2}".format(rank, mvs, score.parameters))
Example #3
def report_grid_search_scores(grid_scores, n_top=5):
    # Utility function to report best scores
    # Credit : http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
    top_scores = sorted(grid_scores, key=operator.itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        rank, mvs = (i + 1), score.mean_validation_score
        logger.info("Model rank {0}, mean validation score {1:.3f}, "\
                    "parameters : {2}".format(rank, mvs, score.parameters))
Example #4
def make_submission_file(model, predicted_vals, name_prefix):
    ts = time.strftime("%a_%d%b%Y_%H%M%S")
    # First, save the model [See http://stackoverflow.com/a/11169797]
    if model is not None:
        file_name_prefix = "%s%s.model" % (name_prefix, ts)
        _ = joblib.dump(model, os.path.join("submissions", file_name_prefix),
                        compress=9)
    # Next, generate the submissions file
    file_path = os.path.join("submissions", "%s%s.csv" % (name_prefix, ts))
    predicted_vals.to_csv(file_path, index=False,
                          quoting=csv.QUOTE_NONE)
    gzip_file(file_path)
    logger.info("See %s.gz" % file_path)
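
A hedged usage sketch; it assumes the submissions/ directory exists (the function does not create it) and that predicted_vals is a DataFrame already shaped like a Kaggle submission. The column names below are placeholders.

import os
import pandas as pd

os.makedirs("submissions", exist_ok=True)
preds = pd.DataFrame({"device_id": [123, 456], "F23-": [0.9, 0.1]})  # toy values
make_submission_file(model=None, predicted_vals=preds, name_prefix="rf_")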
Example #5
def make_submission_file(model, predicted_vals, name_prefix):
    ts = time.strftime("%a_%d%b%Y_%H%M%S")
    # First, save the model [See http://stackoverflow.com/a/11169797]
    if model is not None:
        file_name_prefix = "%s%s.model" % (name_prefix, ts)
        _ = joblib.dump(model,
                        os.path.join("submissions", file_name_prefix),
                        compress=9)
    # Next, generate the submissions file
    file_path = os.path.join("submissions", "%s%s.csv" % (name_prefix, ts))
    predicted_vals.to_csv(file_path, index=False, quoting=csv.QUOTE_NONE)
    gzip_file(file_path)
    logger.info("See %s.gz" % file_path)
Example #6
def prepare_datasets(data_dir):
    deviceinfo = utils.prepare_device_related_datasets(data_dir)

    # Count number of events per hour for each device (ephpd)
    ephpd = utils.prepare_events_per_hour_per_device_dataset(data_dir)

    # Events spread over 6 windows/splits through the day
    esd = utils.prepare_events_spread_dataset(data_dir)

    # Read the training & test datasets
    train = utils.read_gz(data_dir, "gender_age_train.csv.gz")
    test = utils.read_gz(data_dir, "gender_age_test.csv.gz")

    # Merge train and test with the events per hour per device dataset, ephpd
    train = pd.merge(train, ephpd, how="left")
    test = pd.merge(test, ephpd, how="left")
    for col in list(ephpd.columns.values):
        train[col].fillna(0, inplace=True)
        test[col].fillna(0, inplace=True)

    # Merge train and test with the events spread dataset, esd
    train = pd.merge(train, esd, how="left")
    test = pd.merge(test, esd, how="left")
    for col in list(esd.columns.values):
        train[col].fillna(0, inplace=True)
        test[col].fillna(0, inplace=True)

    # Merge train and test with a subset of columns of the device info dataset
    df2 = deviceinfo[["device_id", "phone_brand_id", "is_foreign_brand",
                      "device_model_id"]].copy()
    df2 = df2.drop_duplicates(subset=["device_id"], keep="last")
    train = pd.merge(train, df2, how="left", on="device_id")
    test = pd.merge(test, df2, how="left", on="device_id")

    # Prepare the train and test datasets
    hour_of_day_cols = ["h" + str(x) for x in np.arange(0, 24).tolist()]
    cols_to_drop = list(hour_of_day_cols)
    test.drop(cols_to_drop, axis=1, inplace=True)
    test.fillna(-1, inplace=True)
    cols_to_drop.extend(["gender", "age"])
    train.drop(cols_to_drop, axis=1, inplace=True)
    target = train.group.values
    train = train.drop(["group"], axis=1)
    train.fillna(-1, inplace=True)
    logger.info("train.columns : {}".format(list(train.columns.values)))
    logger.info(train.head())
    return train, test, target
Example #7
def prepare_events_per_hour_per_device_dataset(data_dir):
    logger.info("Preparing events per hour per device dataset")
    events = read_gz(data_dir, "events.csv.gz")
    events.timestamp = pd.to_datetime(events.timestamp)
    events["time_hour"] = events.timestamp.apply(lambda x: x.hour)

    # Count number of events per hour for each device (ephpd)
    ephpd = pd.crosstab(events["device_id"], events["time_hour"])

    # Rename columns showing number of events per hour
    hour_of_day_cols = ["h" + str(x) for x in np.arange(0, 24).tolist()]
    d = dict(zip(np.arange(0, 24).tolist(), hour_of_day_cols))
    ephpd.rename(columns=d, inplace=True)
    ephpd.reset_index(level=0, inplace=True)
    # Normalize the rows in ephpd by their sums
    ephpd_normalized = ephpd[hour_of_day_cols] \
        .div(ephpd[hour_of_day_cols].sum(axis=1), axis=0)
    ephpd_normalized.head()
    ephpd = pd.merge(ephpd, ephpd_normalized,
                     right_index=True, left_index=True, suffixes=('', '_n'))
    return ephpd
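
The pd.crosstab pivot and the row normalisation are the core of this function; a minimal sketch with invented device ids and hours:

import pandas as pd

toy = pd.DataFrame({"device_id": [1, 1, 1, 2],
                    "time_hour": [7, 7, 22, 7]})
counts = pd.crosstab(toy["device_id"], toy["time_hour"])  # devices x hours
shares = counts.div(counts.sum(axis=1), axis=0)           # each row sums to 1.0
print(counts)
print(shares)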
Example #8
def prepare_events_spread_dataset(data_dir):
    # Calculates spread of events over 6 splits:
    # Morning (5AM-10AM), Mid-day (10AM-2PM), Afternoon (2PM-5PM),
    # Evening (5PM-10PM), Night (10PM-1AM), NightOwl (1AM-5AM)
    logger.info("Preparing the events spread dataset")
    events = read_gz(data_dir,
                     "events.csv.gz",
                     cols_to_read=["event_id", "device_id", "timestamp"])
    events.timestamp = pd.to_datetime(events.timestamp)
    events["time_hour"] = events.timestamp.apply(lambda x: x.hour)
    s = events.groupby(["device_id", "time_hour"]).size()
    df = s.unstack(level=-1)

    # Create hour-of-day columns showing number of events per hour
    hour_of_day_cols = ["h" + str(x) for x in np.arange(0, 24).tolist()]
    d = dict(zip(np.arange(0, 24).tolist(), hour_of_day_cols))
    df.rename(columns=d, inplace=True)
    df = df.fillna(0)

    # Split into 6 splits
    #df["usage_morning"] = df_tmp[["h" + str(x) for x in np.arange(5, 10).tolist()]].sum(axis=1)
    df["h_morning"] = df[["h" + str(x) for x in [5, 6, 7, 8, 9]]].sum(axis=1)
    df["h_midday"] = df[["h" + str(x) for x in [10, 11, 12, 13]]].sum(axis=1)
    df["h_afternoon"] = df[["h" + str(x) for x in [14, 15, 16]]].sum(axis=1)
    df["h_evening"] = df[["h" + str(x)
                          for x in [17, 18, 19, 20, 21]]].sum(axis=1)
    df["h_night"] = df[["h" + str(x) for x in [22, 23, 0]]].sum(axis=1)
    df["h_nightowl"] = df[["h" + str(x) for x in [1, 2, 3, 4]]].sum(axis=1)

    # Drop the hour-of-day columns
    df.drop(hour_of_day_cols, axis=1, inplace=True)

    # Normalize rows
    df = df.div(df.sum(axis=1), axis=0).round(2)

    df.reset_index(level=0, inplace=True)

    return df
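
The groupby/size/unstack step builds the same device-by-hour table that pd.crosstab would, after which the individual hours are collapsed into the six windows; a minimal sketch of one such window sum:

import pandas as pd

toy = pd.DataFrame({"device_id": [1, 1, 2], "time_hour": [6, 18, 18]})
table = toy.groupby(["device_id", "time_hour"]).size().unstack(level=-1).fillna(0)
# Evening window (5PM-10PM): sum whichever of the hour columns are present
evening_hours = [h for h in [17, 18, 19, 20, 21] if h in table.columns]
table["h_evening"] = table[evening_hours].sum(axis=1)
print(table)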
Example #9
def prepare_device_related_datasets(data_dir):
    logger.info("Preparing device related datasets")
    deviceinfo = read_gz(data_dir, "phone_brand_device_model.csv.gz")
    # Get rid of duplicate device ids
    deviceinfo = deviceinfo.drop_duplicates("device_id", keep="first")

    # Extract the phone brand names - translate Chinese to English
    file_path = os.path.join(data_dir, "phone_brands_map.txt")
    if not os.path.exists(file_path):
        phone_brands = pd.unique(deviceinfo.phone_brand.ravel()).tolist()
        phone_brands_map = dict(zip(phone_brands, [None] * len(phone_brands)))
        cols = ["phone_brand", "phone_brand_translated"]
        phone_brands = pd.DataFrame(list(phone_brands_map.items()),
                                    columns=cols)
        phone_brands["is_foreign_brand"] = False  # Needs to be hand coded
        phone_brands.to_csv(file_path,
                            encoding="utf-8-sig",
                            index=False,
                            sep="\t")
    else:
        phone_brands = pd.read_csv(file_path,
                                   encoding="utf-8-sig",
                                   index_col=False,
                                   sep="\t")

    # Convert the index into a column and rename it brand ID
    phone_brands.reset_index(level=0, inplace=True)
    phone_brands.rename(columns={"index": "phone_brand_id"}, inplace=True)

    # Some device_model values (such as S6, T5, T9, X5, X6, etc.) are
    # associated with more than one phone_brand,
    # so concatenate phone_brand and device_model and then encode the pair
    m_d = deviceinfo.phone_brand.str.cat(deviceinfo.device_model)
    le = preprocessing.LabelEncoder().fit(m_d)
    deviceinfo["device_model_id"] = le.transform(m_d)

    # Merge device info with phone brands
    deviceinfo = pd.merge(deviceinfo, phone_brands)
    return deviceinfo
Example #10
def prepare_events_spread_dataset(data_dir):
    # Calculates spread of events over 6 splits:
    # Morning (5AM-10AM), Mid-day (10AM-2PM), Afternoon (2PM-5PM),
    # Evening (5PM-10PM), Night (10PM-1AM), NightOwl (1AM-5AM)
    logger.info("Preparing the events spread dataset")
    events = read_gz(data_dir, "events.csv.gz",
                     cols_to_read=["event_id", "device_id", "timestamp"])
    events.timestamp = pd.to_datetime(events.timestamp)
    events["time_hour"] = events.timestamp.apply(lambda x: x.hour)
    s = events.groupby(["device_id","time_hour"]).size()
    df = s.unstack(level=-1)

    # Create hour-of-day columns showing number of events per hour
    hour_of_day_cols = ["h" + str(x) for x in np.arange(0, 24).tolist()]
    d = dict(zip(np.arange(0, 24).tolist(), hour_of_day_cols))
    df.rename(columns=d, inplace=True)
    df = df.fillna(0)

    # Split into 6 splits
    #df["usage_morning"] = df_tmp[["h" + str(x) for x in np.arange(5, 10).tolist()]].sum(axis=1)
    df["h_morning"] = df[["h" + str(x) for x in [5, 6, 7, 8, 9]]].sum(axis=1)
    df["h_midday"] = df[["h" + str(x) for x in [10, 11, 12, 13]]].sum(axis=1)
    df["h_afternoon"] = df[["h" + str(x) for x in [14, 15, 16]]].sum(axis=1)
    df["h_evening"] = df[["h" + str(x) for x in [17, 18, 19, 20, 21]]].sum(axis=1)
    df["h_night"] = df[["h" + str(x) for x in [22, 23, 0]]].sum(axis=1)
    df["h_nightowl"] = df[["h" + str(x) for x in [1, 2, 3, 4]]].sum(axis=1)

    # Drop the hour-of-day columns
    df.drop(hour_of_day_cols, axis=1, inplace=True)

    # Normalize rows
    df = df.div(df.sum(axis=1), axis=0).round(2)

    df.reset_index(level=0, inplace=True)

    return df
Example #11
def prepare_events_per_hour_per_device_dataset(data_dir):
    logger.info("Preparing events per hour per device dataset")
    events = read_gz(data_dir, "events.csv.gz")
    events.timestamp = pd.to_datetime(events.timestamp)
    events["time_hour"] = events.timestamp.apply(lambda x: x.hour)

    # Count number of events per hour for each device (ephpd)
    ephpd = pd.crosstab(events["device_id"], events["time_hour"])

    # Rename columns showing number of events per hour
    hour_of_day_cols = ["h" + str(x) for x in np.arange(0, 24).tolist()]
    d = dict(zip(np.arange(0, 24).tolist(), hour_of_day_cols))
    ephpd.rename(columns=d, inplace=True)
    ephpd.reset_index(level=0, inplace=True)
    # Normalize the rows in ephpd by their sums
    ephpd_normalized = ephpd[hour_of_day_cols] \
        .div(ephpd[hour_of_day_cols].sum(axis=1), axis=0)
    ephpd_normalized.head()
    ephpd = pd.merge(ephpd,
                     ephpd_normalized,
                     right_index=True,
                     left_index=True,
                     suffixes=('', '_n'))
    return ephpd
Example #12
def find_best_estimator(base_estimator, X, y, section, verbosity=3):
    # grid_search_params_key : key under the indicated section of the
    # configuration YML file containing the grid search parameters
    if not cfg[section]["find_best"]:
        base_estimator.fit(X, y)
        return base_estimator

    cv_nfold = cfg[section]["cv_nfold"]
    name = type(base_estimator).__name__
    grid_search_params_key = "param_dist_%s" % clf_keys[name]
    n_iter = cfg[section]["n_iters"]
    n_jobs = cfg[section]["n_jobs"]
    param_dist = cfg[section][grid_search_params_key]
    random_state = cfg["common"]["seed"]
    scoring = cfg["common"]["grid_search_scoring"]
    if cfg[section]["use_random_search"]:
        logger.info("Using random search to find best %s based on %s score" %\
                    (name, scoring))
        search = grid_search.RandomizedSearchCV(estimator=base_estimator,
                                                param_distributions=param_dist,
                                                n_iter=n_iter,
                                                n_jobs=n_jobs,
                                                cv=cv_nfold,
                                                random_state=random_state,
                                                scoring=scoring,
                                                verbose=verbosity)
    else:
        logger.info("Using grid search to find best %s based on %s score" %\
                    (name, scoring))
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)

    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s." %
                ((time.time() - start), name))
    report_grid_search_scores(search.grid_scores_, n_top=3)
    logger.info(search.best_estimator_)
    return search.best_estimator_
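
The function reads everything from a YAML-backed cfg dict plus a clf_keys lookup, neither of which is shown; the layout below is an assumption reconstructed from the keys accessed above, with illustrative values.

# Hypothetical configuration implied by the lookups above (all names and values are assumptions)
cfg = {
    "common": {"seed": 42, "grid_search_scoring": "log_loss"},
    "approach1": {
        "find_best": True,
        "use_random_search": True,
        "cv_nfold": 5,
        "n_iters": 20,
        "n_jobs": -1,
        "param_dist_rf": {"n_estimators": [100, 300], "max_depth": [None, 10]},
    },
}
clf_keys = {"RandomForestClassifier": "rf"}  # maps estimator class name to its param key suffix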
Example #13
def find_best_estimator(base_estimator, X, y, section, verbosity=3):
    # grid_search_params_key : key under the indicated section of the
    # configuration YML file containing the grid search parameters
    if not cfg[section]["find_best"]:
        base_estimator.fit(X, y)
        return base_estimator

    cv_nfold = cfg[section]["cv_nfold"]
    name = type(base_estimator).__name__
    grid_search_params_key = "param_dist_%s" % clf_keys[name]
    n_iter = cfg[section]["n_iters"]
    n_jobs = cfg[section]["n_jobs"]
    param_dist = cfg[section][grid_search_params_key]
    random_state = cfg["common"]["seed"]
    scoring = cfg["common"]["grid_search_scoring"]
    if cfg[section]["use_random_search"]:
        logger.info("Using random search to find best %s based on %s score" %\
                    (name, scoring))
        search = grid_search.RandomizedSearchCV(estimator=base_estimator,
                                                param_distributions=param_dist,
                                                n_iter=n_iter,
                                                n_jobs=n_jobs,
                                                cv=cv_nfold,
                                                random_state=random_state,
                                                scoring=scoring,
                                                verbose=verbosity)
    else:
        logger.info("Using grid search to find best %s based on %s score" %\
                    (name, scoring))
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)

    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s." %
                ((time.time() - start), name))
    report_grid_search_scores(search.grid_scores_, n_top=3)
    logger.info(search.best_estimator_)
    return search.best_estimator_
Example #14
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size *
                                   (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0


if __name__ == "__main__":
    np.random.seed(730521)
    # Read the train and test data
    data_dir = "data"
    logger.info("Running script for Approach 3")
    train = pd.read_csv(os.path.join(data_dir, "gender_age_train.csv.gz"),
                        index_col="device_id")
    test = pd.read_csv(os.path.join(data_dir, "gender_age_test.csv.gz"),
                       index_col="device_id")
    # Encode the age groups
    y_enc = LabelEncoder().fit(train.group)
    y = y_enc.transform(train.group)
    # Create sparse features
    Xtrain, Xtest = u.prepare_sparse_features_dataset(train, test, data_dir)
    dummy_y = np_utils.to_categorical(y)

    # Create the Keras model and compile it
    model = Sequential()
    inp_dim = Xtrain.shape[1]
    model.add(Dense(10, input_dim=inp_dim, init="normal", activation="relu"))
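
The snippet above starts mid-function; a hedged reconstruction of the generator definition it appears to assume (the function name and default batch size are guesses):

import numpy as np

def batch_generator_predict(X, batch_size=400):
    # Hypothetical wrapper: serve dense mini-batches of a sparse matrix indefinitely,
    # restarting from the first row once every batch has been yielded.
    number_of_batches = int(np.ceil(X.shape[0] / float(batch_size)))
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter == number_of_batches:
            counter = 0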
Example #15
def prepare_sparse_features_dataset(train, test, data_dir):
    # Credit : Data preparation strategy influenced by the script
    # written by dune_dweller,
    # https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels

    # First, read the datasets
    phone = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv"))
    # Get rid of duplicate device ids in phone
    phone = phone.drop_duplicates("device_id", keep="first")\
        .set_index("device_id")
    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         parse_dates=["timestamp"], index_col="event_id")
    appevents = pd.read_csv(os.path.join(data_dir, "app_events.csv.gz"),
                            usecols=["event_id", "app_id", "is_active"],
                            dtype={"is_active":bool})
    applabels = pd.read_csv(os.path.join(data_dir, "app_labels.csv.gz"))

    train["trainrow"] = np.arange(train.shape[0])
    test["testrow"] = np.arange(test.shape[0])

    # Next, create the sparse features
    #Phone brand
    brandencoder = LabelEncoder().fit(phone.phone_brand)
    phone["brand"] = brandencoder.transform(phone["phone_brand"])
    train["brand"] = phone["brand"]
    test["brand"] = phone["brand"]
    Xtr_brand = csr_matrix((np.ones(train.shape[0]),
                           (train.trainrow, train.brand)))
    Xte_brand = csr_matrix((np.ones(test.shape[0]),
                           (test.testrow, test.brand)))
    logger.info("Brand features: train shape {}, test shape {}"
                .format(Xtr_brand.shape, Xte_brand.shape))

    # Device model
    m = phone.phone_brand.str.cat(phone.device_model)
    modelencoder = LabelEncoder().fit(m)
    phone["model"] = modelencoder.transform(m)
    train["model"] = phone["model"]
    test["model"] = phone["model"]
    Xtr_model = csr_matrix((np.ones(train.shape[0]),
                           (train.trainrow, train.model)))
    Xte_model = csr_matrix((np.ones(test.shape[0]),
                           (test.testrow, test.model)))
    logger.info("Model features: train shape {}, test shape {}"
                .format(Xtr_model.shape, Xte_model.shape))

    # Installed apps features
    appencoder = LabelEncoder().fit(appevents.app_id)
    appevents["app"] = appencoder.transform(appevents.app_id)
    napps = len(appencoder.classes_)
    deviceapps = (appevents.merge(events[["device_id"]], how="left",
                                  left_on="event_id",right_index=True)
                           .groupby(["device_id","app"])["app"].agg(["size"])
                           .merge(train[["trainrow"]], how="left",
                                  left_index=True, right_index=True)
                           .merge(test[["testrow"]], how="left",
                                  left_index=True, right_index=True)
                           .reset_index())

    d = deviceapps.dropna(subset=['trainrow'])
    Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
                         shape=(train.shape[0], napps))
    d = deviceapps.dropna(subset=['testrow'])
    Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
                         shape=(test.shape[0], napps))
    logger.info("Apps data: train shape {}, test shape {}"
                .format(Xtr_app.shape, Xte_app.shape))

    # App labels features
    applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
    applabels["app"] = appencoder.transform(applabels.app_id)
    labelencoder = LabelEncoder().fit(applabels.label_id)
    applabels["label"] = labelencoder.transform(applabels.label_id)
    nlabels = len(labelencoder.classes_)

    devicelabels = (deviceapps[["device_id","app"]]
                    .merge(applabels[["app","label"]])
                    .groupby(["device_id","label"])["app"].agg(["size"])
                    .merge(train[["trainrow"]], how='left', left_index=True,
                           right_index=True)
                    .merge(test[["testrow"]], how='left', left_index=True,
                           right_index=True)
                    .reset_index())

    d = devicelabels.dropna(subset=["trainrow"])
    Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                           shape=(train.shape[0], nlabels))
    d = devicelabels.dropna(subset=["testrow"])
    Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                           shape=(test.shape[0], nlabels))
    logger.info("Labels data: train shape {}, test shape {}"
                .format(Xtr_label.shape, Xte_label.shape))

    # Concatenate all features
    Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format="csr")
    Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format="csr")
    # Xtrain and Xtest are of type, scipy.sparse.csr.csr_matrix
    logger.info("All features: train shape {}, test shape {}"
                .format(Xtrain.shape, Xtest.shape))

    return Xtrain, Xtest
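
The repeated csr_matrix((data, (row, col))) pattern is a direct way to one-hot encode a label-encoded column: row indices come from trainrow/testrow and column indices from the encoder. A minimal sketch:

import numpy as np
from scipy.sparse import csr_matrix

rows = np.array([0, 1, 2])       # one row per device
brand_ids = np.array([2, 0, 2])  # label-encoded brand of each device
one_hot = csr_matrix((np.ones(len(rows)), (rows, brand_ids)), shape=(3, 3))
print(one_hot.toarray())
# [[0. 0. 1.]
#  [1. 0. 0.]
#  [0. 0. 1.]]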
Example #16
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    # Level 1 classifier, which does the blending
    # bclf = LogisticRegression(C=0.1, random_state=480, tol=0.005,
    #                           solver="newton-cg")
    bclf = LogisticRegression()
    bclf.fit(blend_train, y_dev)
    y_test_predict = bclf.predict_proba(blend_test)
    log_loss = metrics.log_loss(y_test, y_test_predict)
    return bclf, blend_test, log_loss


if __name__ == "__main__":
    s = "approach1"
    logger.info("Running script for Approach 1, %s", cfg[s]["description"])
    t0 = time.time()

    train, test, target = prepare_datasets("data")
    random_state = cfg["common"]["seed"]
    X_train, X_valid, y_train, y_valid = cv.train_test_split(
        train, target, test_size=0.4, random_state=random_state)
    X_submission = test.values[:, 1:]

    # Transforming the string output to numeric
    label_encoder = LabelEncoder()
    label_encoder.fit(target)
    num_classes = len(label_encoder.classes_)
    y = label_encoder.transform(target)

    # Level 0 classifiers
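
The fragment above picks up after the level-0 loop that fills blend_train and blend_test. A generic, hedged sketch of such a loop is shown below, written for a binary target so each level-0 model contributes one out-of-fold probability column; the original multi-class setup and classifiers are not shown in the snippet.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)  # toy data, stand-in for the real splits
X_dev, y_dev = rng.rand(100, 5), rng.randint(0, 2, 100)
X_test, y_test = rng.rand(40, 5), rng.randint(0, 2, 40)
clfs = [LogisticRegression(C=1.0), LogisticRegression(C=0.1)]
n_folds = 5
folds = np.array_split(np.arange(len(y_dev)), n_folds)

blend_train = np.zeros((len(y_dev), len(clfs)))
blend_test = np.zeros((len(y_test), len(clfs)))
for j, clf in enumerate(clfs):
    blend_test_j = np.zeros((len(y_test), n_folds))
    for i, val_idx in enumerate(folds):
        tr_idx = np.setdiff1d(np.arange(len(y_dev)), val_idx)
        clf.fit(X_dev[tr_idx], y_dev[tr_idx])
        # Out-of-fold predictions become the level-1 training features
        blend_train[val_idx, j] = clf.predict_proba(X_dev[val_idx])[:, 1]
        blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
    # Average the per-fold test-set predictions for this classifier
    blend_test[:, j] = blend_test_j.mean(1)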
Example #17
def gzip_file(f_path):
    with open(f_path, "rb") as f_in, gzip.open(f_path + ".gz", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    logger.info("See %s.gz" % f_path)
Example #18
def gzip_file(f_path):
    with open(f_path, "rb") as f_in, gzip.open(f_path + ".gz", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    logger.info("See %s.gz" % f_path)
Example #19
def prepare_sparse_features_dataset(train, test, data_dir):
    # Credit : Data preparation strategy influenced by the script
    # written by dune_dweller,
    # https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels

    # First, read the datasets
    phone = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv"))
    # Get rid of duplicate device ids in phone
    phone = phone.drop_duplicates("device_id", keep="first")\
        .set_index("device_id")
    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         parse_dates=["timestamp"],
                         index_col="event_id")
    appevents = pd.read_csv(os.path.join(data_dir, "app_events.csv.gz"),
                            usecols=["event_id", "app_id", "is_active"],
                            dtype={"is_active": bool})
    applabels = pd.read_csv(os.path.join(data_dir, "app_labels.csv.gz"))

    train["trainrow"] = np.arange(train.shape[0])
    test["testrow"] = np.arange(test.shape[0])

    # Next, create the sparse features
    #Phone brand
    brandencoder = LabelEncoder().fit(phone.phone_brand)
    phone["brand"] = brandencoder.transform(phone["phone_brand"])
    train["brand"] = phone["brand"]
    test["brand"] = phone["brand"]
    Xtr_brand = csr_matrix(
        (np.ones(train.shape[0]), (train.trainrow, train.brand)))
    Xte_brand = csr_matrix(
        (np.ones(test.shape[0]), (test.testrow, test.brand)))
    logger.info("Brand features: train shape {}, test shape {}".format(
        Xtr_brand.shape, Xte_brand.shape))

    # Device model
    m = phone.phone_brand.str.cat(phone.device_model)
    modelencoder = LabelEncoder().fit(m)
    phone["model"] = modelencoder.transform(m)
    train["model"] = phone["model"]
    test["model"] = phone["model"]
    Xtr_model = csr_matrix(
        (np.ones(train.shape[0]), (train.trainrow, train.model)))
    Xte_model = csr_matrix(
        (np.ones(test.shape[0]), (test.testrow, test.model)))
    logger.info("Model features: train shape {}, test shape {}".format(
        Xtr_model.shape, Xte_model.shape))

    # Installed apps features
    appencoder = LabelEncoder().fit(appevents.app_id)
    appevents["app"] = appencoder.transform(appevents.app_id)
    napps = len(appencoder.classes_)
    deviceapps = (appevents.merge(events[["device_id"]], how="left",
                                  left_on="event_id", right_index=True)
                           .groupby(["device_id", "app"])["app"].agg(["size"])
                           .merge(train[["trainrow"]], how="left",
                                  left_index=True, right_index=True)
                           .merge(test[["testrow"]], how="left",
                                  left_index=True, right_index=True)
                           .reset_index())

    d = deviceapps.dropna(subset=['trainrow'])
    Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
                         shape=(train.shape[0], napps))
    d = deviceapps.dropna(subset=['testrow'])
    Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
                         shape=(test.shape[0], napps))
    logger.info("Apps data: train shape {}, test shape {}".format(
        Xtr_app.shape, Xte_app.shape))

    # App labels features
    applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
    applabels["app"] = appencoder.transform(applabels.app_id)
    labelencoder = LabelEncoder().fit(applabels.label_id)
    applabels["label"] = labelencoder.transform(applabels.label_id)
    nlabels = len(labelencoder.classes_)

    devicelabels = (deviceapps[["device_id", "app"]]
                    .merge(applabels[["app", "label"]])
                    .groupby(["device_id", "label"])["app"].agg(["size"])
                    .merge(train[["trainrow"]], how='left',
                           left_index=True, right_index=True)
                    .merge(test[["testrow"]], how='left',
                           left_index=True, right_index=True)
                    .reset_index())

    d = devicelabels.dropna(subset=["trainrow"])
    Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                           shape=(train.shape[0], nlabels))
    d = devicelabels.dropna(subset=["testrow"])
    Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                           shape=(test.shape[0], nlabels))
    logger.info("Labels data: train shape {}, test shape {}".format(
        Xtr_label.shape, Xte_label.shape))

    # Concatenate all features
    Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format="csr")
    Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format="csr")
    # Xtrain and Xtest are of type, scipy.sparse.csr.csr_matrix
    logger.info("All features: train shape {}, test shape {}".format(
        Xtrain.shape, Xtest.shape))

    return Xtrain, Xtest
Example #20
File: td3.py  Project: nirmalyaghosh/kaggle
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0


if __name__ == "__main__":
    np.random.seed(730521)
    # Read the train and test data
    data_dir = "data"
    logger.info("Running script for Approach 3")
    train = pd.read_csv(os.path.join(data_dir, "gender_age_train.csv.gz"),
                        index_col="device_id")
    test = pd.read_csv(os.path.join(data_dir, "gender_age_test.csv.gz"),
                       index_col="device_id")
    # Encode the age groups
    y_enc = LabelEncoder().fit(train.group)
    y = y_enc.transform(train.group)
    # Create sparse features
    Xtrain, Xtest = u.prepare_sparse_features_dataset(train, test, data_dir)
    dummy_y = np_utils.to_categorical(y)

    # Create the Keras model and compile it
    model = Sequential()
    inp_dim = Xtrain.shape[1]
    model.add(Dense(10, input_dim=inp_dim, init="normal", activation="relu"))
Example #21
def prepare_bag_of_apps_datasets(data_dir):
    # Based on : https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/low-ram-bag-of-apps-python/

    # First, check if the datasets have already been created
    boa_file_path_1 = os.path.join(data_dir, "bag_of_apps_train.h5")
    boa_file_path_2 = os.path.join(data_dir, "bag_of_apps_test.h5")
    if os.path.exists(boa_file_path_1) and os.path.exists(boa_file_path_2):
        logger.info("Reading Bag-of-Apps datasets from {} & {}".format(
            boa_file_path_1, boa_file_path_2))
        a = pd.read_hdf(boa_file_path_1, "a")
        b = pd.read_hdf(boa_file_path_2, "b")
        return a, b

    # Create the datasets
    logger.info("Preparing Bag-of-Apps datasets")
    app_labels = read_gz(data_dir, "app_labels.csv.gz")
    app_labels = app_labels.groupby("app_id")["label_id"]\
        .apply(lambda x: " ".join(str(s) for s in x))

    app_events = read_gz(data_dir, "app_events.csv.gz")
    app_events["app_labels"] = app_events["app_id"].map(app_labels)
    app_events = app_events.groupby("event_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_labels

    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": np.str})
    events["app_labels"] = events["event_id"].map(app_events)
    events = events.groupby("device_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_events

    pbd = pd.read_csv(os.path.join(data_dir,
                                   "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": np.str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    _train = read_gz(data_dir, "gender_age_train.csv.gz")
    _train["app_labels"] = _train["device_id"].map(events)
    _train = pd.merge(_train, pbd, how="left", on="device_id", left_index=True)
    _test = read_gz(data_dir, "gender_age_test.csv.gz")
    _test["app_labels"] = _test["device_id"].map(events)
    _test = pd.merge(_test, pbd, how="left", on="device_id", left_index=True)
    del pbd
    del events

    df_all = pd.concat((_train, _test), axis=0, ignore_index=True)
    split_len = len(_train)
    vec = CountVectorizer(min_df=1, binary=1)
    df_all = df_all[["phone_brand", "device_model", "app_labels"]]\
        .astype(np.str).apply(lambda x: " ".join(s for s in x), axis=1)\
        .fillna("Missing")
    df_tfv = vec.fit_transform(df_all)  # 186716 x 2045 sparse matrix
    _train = df_tfv[:split_len, :]  # 74645 x 2045 sparse matrix
    _test = df_tfv[split_len:, :]  # 112071 x 2045 sparse matrix

    # Converting the sparse matrix into a DataFrame
    a = pd.SparseDataFrame([
        pd.SparseSeries(_train[i].toarray().ravel())
        for i in np.arange(_train.shape[0])
    ])
    b = pd.SparseDataFrame([
        pd.SparseSeries(_test[i].toarray().ravel())
        for i in np.arange(_test.shape[0])
    ])
    # Rename the columns
    app_labels_cols = ["a" + str(x) for x in np.arange(0, a.shape[1]).tolist()]
    d = dict(zip(np.arange(0, a.shape[1]).tolist(), app_labels_cols))
    a.rename(columns=d, inplace=True)
    b.rename(columns=d, inplace=True)
    # Write to file
    a.to_sparse(kind='block')\
        .to_hdf(boa_file_path_1, "a", mode="w", complib="blosc", complevel=9)
    b.to_sparse(kind='block')\
        .to_hdf(boa_file_path_2, "b", mode="w", complib="blosc", complevel=9)
    del _train
    del _test

    # TO USE, DO
    # train = pd.merge(train, a, left_index=True , right_index=True)

    return a, b  # bag-of-apps datasets
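
A minimal sketch of what the CountVectorizer step produces: each device's concatenated brand, model and app-label string becomes a binary bag-of-tokens row. The tokens below are invented.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["samsung galaxy 549 710", "huawei mate 549"]  # toy brand/model/label strings
vec = CountVectorizer(min_df=1, binary=True)
bag = vec.fit_transform(docs)    # sparse matrix, one column per distinct token
print(sorted(vec.vocabulary_))   # ['549', '710', 'galaxy', 'huawei', 'mate', 'samsung']
print(bag.toarray())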
Example #22
File: td2.py  Project: nirmalyaghosh/kaggle
    test_sp = selector.transform(test_sp)
    logger.info("# Num of Features: {}".format(X_train.shape[1]))

    return X_train, X_val, y_train, y_val, test_sp


def train_model(X, y, X_, y_, clf):
    model = utils.find_best_estimator(clf, X, y, section="approach2")
    preds = model.predict_proba(X_)
    log_loss = metrics.log_loss(y_, preds)
    return model, log_loss


if __name__ == "__main__":
    s = "approach2"
    logger.info("Running script for Approach 2, %s", cfg[s]["description"])
    t0 = time.time()

    X_train, X_val, y_train, y_val, test_sp = prepare_datasets("data")
    logger.info("Data prep took {:.2f} seconds".format((time.time() - t0)))

    # Compare a few classifiers
    clfs = [
        (ExtraTreesClassifier(**utils.read_estimator_params(s, "et")), "et"),
        (LogisticRegression(**utils.read_estimator_params(s, "lr")), "lr"),
        (RandomForestClassifier(**utils.read_estimator_params(s, "rf")), "rf")
    ]
    results = []
    for clf in clfs:
        ts = time.time()
        model, log_loss = train_model(X_train, y_train, X_val, y_val, clf[0])
Example #23
File: td2.py  Project: nirmalyaghosh/kaggle
def prepare_datasets(data_dir):
    # Bag-of-Apps features based on
    # https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/
    # bag-of-app-id-python-2-27392/code

    # Read App Events
    app_events = utils.read_gz(data_dir, "app_events.csv.gz")
    app_events = app_events.groupby("event_id")["app_id"].apply(
        lambda x: " ".join(set("app_id:" + str(s) for s in x)))

    # Read Events
    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": np.str})
    events["app_id"] = events["event_id"].map(app_events)
    events = events.dropna()
    del app_events
    events = events[["device_id", "app_id"]]

    events = events.groupby("device_id")["app_id"]\
        .apply(lambda x: " "
               .join(set(str(" ".join(str(s) for s in x)).split(" "))))
    events = events.reset_index(name="app_id")
    # expand to multiple rows
    events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                        for _, row in events.iterrows()]).reset_index()
    events.columns = ["app_id", "device_id"]

    # Read Phone Brand Device Model
    pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": np.str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    # Read Train and Test
    train = pd.read_csv(os.path.join(data_dir, "gender_age_train.csv.gz"),
                        dtype={"device_id": np.str})
    train.drop(["age", "gender"], axis=1, inplace=True)
    test = pd.read_csv(os.path.join(data_dir, "gender_age_test.csv.gz"),
                        dtype={"device_id": np.str})
    test["group"] = np.nan

    Y = train["group"]
    label_group = LabelEncoder()
    Y = label_group.fit_transform(Y)

    # Concat train and test,
    # before concatenating the features (phone_brand, device_model and app_id)
    df_all = pd.concat((train, test), axis=0, ignore_index=True)
    df_all = pd.merge(df_all, pbd, how="left", on="device_id")
    df_all["phone_brand"] = df_all["phone_brand"]\
        .apply(lambda x: "phone_brand:" + str(x))
    df_all["device_model"] = df_all["device_model"]\
        .apply(lambda x: "device_model:" + str(x))
    f1 = df_all[["device_id", "phone_brand"]]   # phone_brand
    f2 = df_all[["device_id", "device_model"]]  # device_model
    f3 = events[["device_id", "app_id"]]    # app_id
    del df_all
    # Rename the 2nd column
    f1.columns.values[1] = "feature"
    f2.columns.values[1] = "feature"
    f3.columns.values[1] = "feature"

    FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)
    FLS = FLS.reset_index()

    # User-Item Feature
    device_ids = FLS["device_id"].unique()
    feature_cs = FLS["feature"].unique()

    data = np.ones(len(FLS))
    device_id_enc = LabelEncoder().fit(FLS["device_id"])
    row = device_id_enc.transform(FLS["device_id"])
    col = LabelEncoder().fit_transform(FLS["feature"])
    sparse_matrix = sparse.csr_matrix((data, (row, col)),
                                      shape=(len(device_ids), len(feature_cs)))
    sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]
    logger.info("sparse_matrix {}".format(sparse_matrix.shape))

    # Data Prep
    train_row = device_id_enc.transform(train["device_id"])
    train_sp = sparse_matrix[train_row, :]

    test_row = device_id_enc.transform(test["device_id"])
    test_sp = sparse_matrix[test_row, :]

    random_state = cfg["common"]["seed"]
    X_train, X_val, y_train, y_val = cv.train_test_split(
        train_sp, Y, train_size=.80, random_state=random_state)

    # Feature Selection
    selector = SelectPercentile(f_classif, percentile=23)
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_val = selector.transform(X_val)
    train_sp = selector.transform(train_sp)
    test_sp = selector.transform(test_sp)
    logger.info("# Num of Features: {}".format(X_train.shape[1]))

    return X_train, X_val, y_train, y_val, test_sp
Example #24
def prepare_bag_of_apps_datasets(data_dir):
    # Based on : https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/low-ram-bag-of-apps-python/

    # First, check if the datasets have already been created
    boa_file_path_1 = os.path.join(data_dir, "bag_of_apps_train.h5")
    boa_file_path_2 = os.path.join(data_dir, "bag_of_apps_test.h5")
    if os.path.exists(boa_file_path_1) and os.path.exists(boa_file_path_2):
        logger.info("Reading Bag-of-Apps datasets from {} & {}"
                    .format(boa_file_path_1, boa_file_path_2))
        a = pd.read_hdf(boa_file_path_1, "a")
        b = pd.read_hdf(boa_file_path_2, "b")
        return a, b

    # Create the datasets
    logger.info("Preparing Bag-of-Apps datasets")
    app_labels = read_gz(data_dir, "app_labels.csv.gz")
    app_labels = app_labels.groupby("app_id")["label_id"]\
        .apply(lambda x: " ".join(str(s) for s in x))

    app_events = read_gz(data_dir, "app_events.csv.gz")
    app_events["app_labels"] = app_events["app_id"].map(app_labels)
    app_events = app_events.groupby("event_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_labels

    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": np.str})
    events["app_labels"] = events["event_id"].map(app_events)
    events = events.groupby("device_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_events

    pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": np.str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    _train = read_gz(data_dir, "gender_age_train.csv.gz")
    _train["app_labels"] = _train["device_id"].map(events)
    _train = pd.merge(_train, pbd, how="left", on="device_id", left_index=True)
    _test = read_gz(data_dir, "gender_age_test.csv.gz")
    _test["app_labels"] = _test["device_id"].map(events)
    _test = pd.merge(_test, pbd, how="left", on="device_id", left_index=True)
    del pbd
    del events

    df_all = pd.concat((_train, _test), axis=0, ignore_index=True)
    split_len = len(_train)
    vec = CountVectorizer(min_df=1, binary=1)
    df_all = df_all[["phone_brand", "device_model", "app_labels"]]\
        .astype(np.str).apply(lambda x: " ".join(s for s in x), axis=1)\
        .fillna("Missing")
    df_tfv = vec.fit_transform(df_all) # 186716 x 2045 sparse matrix
    _train = df_tfv[:split_len, :] # 74645 x 2045 sparse matrix
    _test = df_tfv[split_len:, :] # 112071 x 2045 sparse matrix

    # Converting the sparse matrix into a DataFrame
    a = pd.SparseDataFrame([pd.SparseSeries(_train[i].toarray().ravel())
                            for i in np.arange(_train.shape[0])])
    b = pd.SparseDataFrame([pd.SparseSeries(_test[i].toarray().ravel())
                            for i in np.arange(_test.shape[0])])
    # Rename the columns
    app_labels_cols = ["a" + str(x) for x in np.arange(0, a.shape[1]).tolist()]
    d = dict(zip(np.arange(0, a.shape[1]).tolist(), app_labels_cols))
    a.rename(columns=d, inplace=True)
    b.rename(columns=d, inplace=True)
    # Write to file
    a.to_sparse(kind='block')\
        .to_hdf(boa_file_path_1, "a", mode="w", complib="blosc", complevel=9)
    b.to_sparse(kind='block')\
        .to_hdf(boa_file_path_2, "b", mode="w", complib="blosc", complevel=9)
    del _train
    del _test

    # TO USE, DO
    # train = pd.merge(train, a, left_index=True , right_index=True)

    return a, b # bag-of-apps datasets