Example #1
def create_features():
    print "loading data"
    X = pd.read_csv('data/train.csv')
    X = X.drop(['ROLE_CODE'], axis=1)
    X = X.drop(['ACTION'], axis=1)

    X_test = pd.read_csv('data/test.csv', index_col=0)
    X_test = X_test.drop(['ROLE_CODE'], axis=1)
    X_test['ACTION'] = 0
    X_test = X_test.drop(['ACTION'], axis=1)

    X_all = pd.concat([X_test, X], ignore_index=True)
    # Combine ROLE_TITLE with ROLE_FAMILY (title is treated as a subset of family)
    X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY'])
    X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + (
        10000 * X_all['ROLE_ROLLUP_2'])
    X_all = X_all.drop(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY'],
                       axis=1)

    # Count/freq
    for col in X_all.columns:
        X_all['cnt'+col] = 0
        groups = X_all.groupby([col])
        for name, group in groups:
            count = group[col].count()
            X_all['cnt'+col].ix[group.index] = count
        X_all['cnt'+col] = X_all['cnt'+col].apply(np.log)

    # Percent of dept that is this resource
    # and counts of dept/resource occurrences (tested, not used)
    for col in X_all.columns[1:6]:
        X_all['Duse'+col] = 0.0
        groups = X_all.groupby([col])
        for name, group in groups:
            grps = group.groupby(['RESOURCE'])
            for rsrc, grp in grps:
                X_all['Duse'+col].ix[grp.index] = \
                    float(len(grp.index)) / float(len(group.index))

    # Number of resources that a manager manages
    for col in X_all.columns[0:1]:
    #for col in X_all.columns[0:6]:
        if col == 'MGR_ID':
            continue
        X_all['Mdeps'+col] = 0
        groups = X_all.groupby(['MGR_ID'])
        for name, group in groups:
            X_all['Mdeps'+col].ix[group.index] = len(group[col].unique())

    X_all = X_all.drop(X_all.columns[0:6], axis=1)

    # Now X is the train, X_test is test and X_all is both together
    X = X_all[:][X_all.index >= len(X_test.index)]
    X_test = X_all[:][X_all.index < len(X_test.index)]
    # Convert both parts to matrices before saving
    X = X.as_matrix()
    X_test = X_test.as_matrix()

    save_dataset('bsfeats', X, X_test)
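The count/frequency loop above writes each group's size back row by row through the legacy .ix indexer. For comparison, a minimal sketch of the same log-count feature with current pandas (assuming the same combined X_all frame as in the example) could be:

import numpy as np

def add_count_features(X_all):
    # X_all is assumed to be the combined pandas DataFrame built above.
    # transform('count') aligns each group's size back onto its member rows,
    # so this is the vectorised form of the per-group loop in the example.
    for col in list(X_all.columns):
        X_all['cnt' + col] = np.log(X_all.groupby(col)[col].transform('count'))
    return X_all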
Example #2
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Generate datasets as needed with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations of
    the original variables) with extracted feature sets, with some additional
    variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, features 1 to 260 (consecutive daily returns over one year)
        - residuals: residuals obtained by seasonal decomposition using moving averages
        - stats: different statistics associated with the entire time series
        - expanded: features extracted by module tsfresh
    Feature sets and variants:
    (denoted by the letters after the underscore in the base dataset name):
        - s: the base dataset
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, 'rb'):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "residuals", "stats", "expanded"]

    # Generate the missing datasets
    if len(DATASETS):

        if "basic" in DATASETS:
            #Basic dataset
            save_dataset("basic", X, X_test)

        if "residuals" in DATASETS:
            #Residuals
            X_resid = np.apply_along_axis(get_residuals, 1, X)
            X_test_resid = np.apply_along_axis(get_residuals, 1, X_test)
            save_dataset("residuals", X_resid, X_test_resid)
        else:
            X_resid, X_test_resid = get_dataset("residuals")

        if "stats" in DATASETS:
            # Dataset with series statistics
            x_series = create_series_stats(X, X_resid)
            x_series_test = create_series_stats(X_test, X_test_resid)
            save_dataset("stats", x_series, x_series_test)
        #else:
            #x_series, x_series_test = get_dataset("series")

        if "expanded" in DATASETS:
            # Dataset with rolling means
            x_expanded = extract_features(X)
            x_expanded_test = extract_features(X_test)
            save_dataset("expanded", x_expanded, x_expanded_test)
Example #3
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Generate datasets as needed with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations of
    the original variables) with extracted feature sets, with some additional
    variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, minus role1, role2, and role_code
        - tuples: all order 2 combinations of the original columns
        - triples: all order 3 combinations of the original columns
        - greedy[1,2,3]: three different datasets obtained by performing
            greedy feature selection with different seeds on the triples
            dataset
        - effects: experimental. Created to try out a suggestion by Gxav
            after the competition

    Feature sets and variants:
    (denoted by the letters after the underscore in the base dataset name):
        - s: the base dataset has been sparsified using One-Hot encoding
        - c: the rare features have been consolidated into one category
        - f: extracted features have been appended, with a different set for
            linear models than for tree-based models
        - b: Benjamin's extracted features.
        - d: interactions for the extracted feature set have been added
        - l: the extracted features have been log transformed
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, "rb"):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it", dataset)
                DATASETS.append(dataset.split("_")[0])
    else:
        DATASETS = ["basic", "tuples", "triples", "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.iteritems():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if len(DATASETS):
        bsfeats, bsfeats_test = get_dataset("bsfeats")

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2), normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        Xtr = create_tuples(X)
        Xtr_test = create_tuples(X_test)
        save_dataset("triples", Xtr, Xtr_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test, create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test, create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test, log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test, log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test, log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + "_b", Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + "_f", Xg, Xg_test, feats, feats_test)
            save_dataset(ds + "_fd", Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + "_bd", Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + "_sf", Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + "_sfl", Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + "_sfd", Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + "_sb", Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + "_sbl", Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + "_sbd", Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if issubclass(Xg.dtype.type, np.integer):
                consolidate(Xg, Xg_test)
                save_dataset(ds + "_c", Xg, Xg_test)
                save_dataset(ds + "_cf", Xg, Xg_test, feats, feats_test)
                save_dataset(ds + "_cb", Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + "_sc", Xs, Xs_test)
                save_dataset(ds + "_scf", Xs, Xs_test, feats, feats_test)
                save_dataset(ds + "_scfl", Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + "_scb", Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + "_scbl", Xs, Xs_test, bsfeats_l, bsfeats_l_test)
Example #4
def create_features():
    print "loading data"
    #load training data, delete ROLE_CODE and ACTION
    X = pd.read_csv('data/train.csv')
    X = X.drop(['ROLE_CODE'], axis=1)
    X = X.drop(['ACTION'], axis=1)

    #load testing data, delete ROLE_CODE and ACTION
    X_test = pd.read_csv('data/test.csv', index_col=0)
    X_test = X_test.drop(['ROLE_CODE'], axis=1)
    X_test['ACTION'] = 0
    X_test = X_test.drop(['ACTION'], axis=1)

    # combine test and training data
    X_all = pd.concat([X_test, X], ignore_index=True)
    # I want to combine role_title with role_family
    X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY'])
    #combine ROLLUP1 and ROLLUP2
    X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + (
        10000 * X_all['ROLE_ROLLUP_2'])
    X_all = X_all.drop(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY'],
                       axis=1)
    # Count/freq: put count for each group as another feature
    for col in X_all.columns:
        X_all['cnt'+col] = 0
        groups = X_all.groupby([col])
        for name, group in groups:
            count = group[col].count()
            X_all['cnt'+col].ix[group.index] = count
        X_all['cnt'+col] = X_all['cnt'+col].apply(np.log)  # apply log

    # Percent of dept that is this resource
    # and counts of dept/resource occurrences (tested, not used)
    for col in X_all.columns[1:6]:
        X_all['Duse'+col] = 0.0
        groups = X_all.groupby([col])
        for name, group in groups:
            grps = group.groupby(['RESOURCE'])
            for rsrc, grp in grps:
                X_all['Duse'+col].ix[grp.index] = \
                    float(len(grp.index)) / float(len(group.index))

    # Number of resources that a manager manages
    for col in X_all.columns[0:1]:
    #for col in X_all.columns[0:6]:
        if col == 'MGR_ID':
            continue
        X_all['Mdeps'+col] = 0
        groups = X_all.groupby(['MGR_ID'])
        for name, group in groups:
            X_all['Mdeps'+col].ix[group.index] = len(group[col].unique())

    X_all = X_all.drop(X_all.columns[0:6], axis=1)

    # Now X is the train, X_test is test and X_all is both together
    X = X_all[:][X_all.index >= len(X_test.index)]
    X_test = X_all[:][X_all.index < len(X_test.index)]
    # Convert both parts to matrices before saving
    X = X.as_matrix()
    X_test = X_test.as_matrix()

    save_dataset('bsfeats', X, X_test)
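The nested groupby above computes, for each of columns 1-5, the fraction of that group's rows that carry a given RESOURCE. The same ratio can be sketched with aligned transforms in current pandas (same column names assumed):

def add_resource_share(X_all, col):
    # X_all is assumed to be the combined pandas DataFrame built above.
    # Share of the rows in each `col` group that belong to each RESOURCE,
    # i.e. len(grp) / len(group) from the example, computed without loops.
    group_size = X_all.groupby(col)[col].transform('count')
    pair_size = X_all.groupby([col, 'RESOURCE'])['RESOURCE'].transform('count')
    X_all['Duse' + col] = pair_size / group_size
    return X_all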
Example #5
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Generate datasets as needed with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations of
    the original variables) with extracted feature sets, with some additional
    variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, minus role1, role2, and role_code
        - tuples: all order 2 combinations of the original columns
        - triples: all order 3 combinations of the original columns
        - greedy[1,2,3]: three different datasets obtained by performing
            greedy feature selection with different seeds on the triples
            dataset
        - effects: experimental. Created to try out a suggestion by Gxav
            after the competition

    Feature sets and variants:
    (denoted by the letters after the underscore in the base dataset name):
        - s: the base dataset has been sparsified using One-Hot encoding
        - c: the rare features have been consolidated into one category
        - f: extracted features have been appended, with a different set for
            linear models than for tree-based models
        - b: Benjamin's extracted features.
        - d: interactions for the extracted feature set have been added
        - l: the extracted features have been log transformed
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, 'rb'):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = [
            "basic", "tuples", "triples", "greedy", "greedy2", "greedy3"
        ]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.iteritems():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if len(DATASETS):
        bsfeats, bsfeats_test = get_dataset('bsfeats')

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        Xtr = create_tuples(X)
        Xtr_test = create_tuples(X_test)
        save_dataset("triples", Xtr, Xtr_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats,
                                            basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats,
                                                bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats,
                                            basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats,
                                                lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats,
                                                bsfeats_test,
                                                log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test)
            save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if issubclass(Xg.dtype.type, np.integer):
                consolidate(Xg, Xg_test)
                save_dataset(ds + '_c', Xg, Xg_test)
                save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test)
                save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + '_sc', Xs, Xs_test)
                save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test)
                save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + '_scbl', Xs, Xs_test, bsfeats_l,
                             bsfeats_l_test)
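consolidate is called on the integer datasets but not defined here; the docstring only says that rare features are "consolidated into one category". A possible in-place sketch (the count threshold and the sentinel value are assumptions) is:

import numpy as np

def consolidate(X, X_test, min_count=2):
    # For each column, values seen fewer than min_count times across train and
    # test combined are replaced with a single sentinel category, in place.
    for j in range(X.shape[1]):
        values, counts = np.unique(
            np.concatenate((X[:, j], X_test[:, j])), return_counts=True)
        rare = values[counts < min_count]
        for block in (X, X_test):
            block[np.isin(block[:, j], rare), j] = 0  # sentinel (assumed)
    return X, X_test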
Example #6
def create_features(train='data/train.csv', test='data/test.csv'):
    print "Reading dataset..."
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print "Transforming data..."
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = np.array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one hot encodings for each individual feature in memory
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print "Performing greedy feature selection..."
    score_hist = []
    N = 10
    good_features_list = [
        [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55,
         60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85],
        [0, 1, 7, 8, 9, 10, 36, 37, 38, 41, 42, 43, 47, 51, 53,
         56, 60, 61, 63, 64, 66, 67, 69, 71, 75, 79, 85, 91],
        [0, 7, 9, 24, 36, 37, 41, 42, 47, 53, 61, 63, 64, 67, 69, 71, 75, 85],
        [0, 7, 9, 20, 36, 37, 38, 41, 42, 45, 47,
         53, 60, 63, 64, 67, 69, 71, 81, 85, 86],
    ]

    # Greedy feature selection loop (runs only when no precomputed feature lists are given)
    if not good_features_list:
        good_features = set([])
        while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
            scores = []
            for f in range(len(Xts)):
                if f not in good_features:
                    feats = list(good_features) + [f]
                    Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                    score = cv_loop(Xt, y, model, N)
                    scores.append((score, f))
                    print "Feature: %i Mean AUC: %f" % (f, score)
            good_features.add(sorted(scores)[-1][1])
            score_hist.append(sorted(scores)[-1])
            print "Current features: %s" % sorted(list(good_features))

        # Remove last added feature from good_features
        good_features.remove(score_hist[-1][1])
        good_features = sorted(list(good_features))

    for i, good_features in enumerate(good_features_list):
        suffix = str(i + 1) if i else ''
        Xt = np.vstack(
            (X_train_all[:, good_features], X_test_all[:, good_features]))
        X_train = Xt[:num_train]
        X_test = Xt[num_train:]
        data.save_dataset("greedy%s" % suffix, X_train, X_test)
Example #7
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Create datasets with different sets of features and save them to disk.
    The datasets are created by combining a base feature set (combinations of
    the original variables) with extracted feature sets, with some additional
    variants.
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "tuples", "triples",
                    "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.iteritems():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if len(DATASETS):
        bsfeats, bsfeats_test = get_dataset('bsfeats')

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test,
                                                log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test)
            save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if Xg.dtype == 'int64':
                consolidate(Xg, Xg_test)
                save_dataset(ds + '_c', Xg, Xg_test)
                save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test)
                save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + '_sc', Xs, Xs_test)
                save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test)
                save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + '_scbl', Xs, Xs_test,
                             bsfeats_l, bsfeats_l_test)
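The create_datasets variants read and write pickles under cache/<name>.pkl through save_dataset and get_dataset, whose definitions are not included here. A minimal sketch consistent with that path convention (how the optional extra feature blocks are combined is an assumption) is:

import pickle
import numpy as np

def save_dataset(name, X, X_test, X_extra=None, X_extra_test=None):
    # Optionally append an extra feature block (as in the ds + "_f" style
    # calls above), then pickle the train/test pair. A real implementation
    # would also need to handle sparse matrices.
    if X_extra is not None:
        X = np.hstack((X, X_extra))
        X_test = np.hstack((X_test, X_extra_test))
    with open("cache/%s.pkl" % name, "wb") as f:
        pickle.dump((X, X_test), f)

def get_dataset(name):
    # Load a cached train/test pair, or return None if the file is missing.
    try:
        with open("cache/%s.pkl" % name, "rb") as f:
            return pickle.load(f)
    except IOError:
        return None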
Example #8
def create_features(train='data/train.csv', test='data/test.csv'):
    print "Reading dataset..."
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print "Transforming data..."
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = np.array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one hot encodings for each individual feature in memory
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print "Performing greedy feature selection..."
    score_hist = []
    N = 10
    good_features_list = [
        [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55,
         60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85],
        [0, 1, 7, 8, 9, 10, 36, 37, 38, 41, 42, 43, 47, 51, 53,
         56, 60, 61, 63, 64, 66, 67, 69, 71, 75, 79, 85, 91],
        [0, 7, 9, 24, 36, 37, 41, 42, 47, 53, 61, 63, 64, 67, 69, 71, 75, 85],
        [0, 7, 9, 20, 36, 37, 38, 41, 42, 45, 47,
         53, 60, 63, 64, 67, 69, 71, 81, 85, 86]
    ]

    # Greedy feature selection loop (runs only when no precomputed feature lists are given)
    if not good_features_list:
        good_features = set([])
        while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
            scores = []
            for f in range(len(Xts)):
                if f not in good_features:
                    feats = list(good_features) + [f]
                    Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                    score = cv_loop(Xt, y, model, N)
                    scores.append((score, f))
                    print "Feature: %i Mean AUC: %f" % (f, score)
            good_features.add(sorted(scores)[-1][1])
            score_hist.append(sorted(scores)[-1])
            print "Current features: %s" % sorted(list(good_features))

        # Remove last added feature from good_features
        good_features.remove(score_hist[-1][1])
        good_features = sorted(list(good_features))

    for i, good_features in enumerate(good_features_list):
        suffix = str(i + 1) if i else ''
        Xt = np.vstack((X_train_all[:, good_features],
                        X_test_all[:, good_features]))
        X_train = Xt[:num_train]
        X_test = Xt[num_train:]
        data.save_dataset("greedy%s" % suffix, X_train, X_test)