def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Generate datasets as needed with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations
    of the original variables) with extracted feature sets, with some
    additional variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, features 1 to 260
          (consecutive daily returns over one year)
        - residuals: residuals obtained by seasonal decomposition
          using moving averages
        - stats: different statistics associated with the entire time series
        - expanded: features extracted by the tsfresh module

    Feature sets and variants:
    (denoted by the letters after the underscore in the base dataset name):
        - s: the base dataset
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, 'rb'):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "residuals", "stats", "expanded"]

    # Generate the missing datasets
    if len(DATASETS):
        if "basic" in DATASETS:  # Basic dataset
            save_dataset("basic", X, X_test)

        if "residuals" in DATASETS:  # Residuals
            X_resid = np.apply_along_axis(get_residuals, 1, X)
            X_test_resid = np.apply_along_axis(get_residuals, 1, X_test)
            save_dataset("residuals", X_resid, X_test_resid)
        else:
            X_resid, X_test_resid = get_dataset("residuals")

        if "stats" in DATASETS:  # Dataset with series statistics
            x_series = create_series_stats(X, X_resid)
            x_series_test = create_series_stats(X_test, X_test_resid)
            save_dataset("stats", x_series, x_series_test)
        #else:
            #x_series, x_series_test = get_dataset("series")

        if "expanded" in DATASETS:  # Dataset with tsfresh-extracted features
            x_expanded = extract_features(X)
            x_expanded_test = extract_features(X_test)
            save_dataset("expanded", x_expanded, x_expanded_test)
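
# A minimal sketch of what get_residuals could look like, assuming the
# "seasonal decomposition using moving averages" mentioned in the docstring
# maps to statsmodels' seasonal_decompose. The real helper is defined
# elsewhere in the project; the 5-day period is an assumption, and older
# statsmodels versions name the argument freq instead of period.
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose

def get_residuals_sketch(series, period=5):
    """Residual component of a 1-D return series after removing the
    moving-average trend and the seasonal component."""
    result = seasonal_decompose(series, model='additive', period=period)
    # The centered moving average leaves NaNs at both ends; zero-fill them
    # so np.apply_along_axis keeps a fixed-length vector per row
    return np.nan_to_num(result.resid)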
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Generate datasets as needed with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations
    of the original variables) with extracted feature sets, with some
    additional variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, minus role1, role2, and role_code
        - tuples: all order 2 combinations of the original columns
        - triples: all order 3 combinations of the original columns
        - greedy[1,2,3]: three different datasets obtained by performing
          greedy feature selection with different seeds on the triples
          dataset
        - effects: experimental. Created to try out a suggestion by Gxav
          after the competition

    Feature sets and variants:
    (denoted by the letters after the underscore in the base dataset name):
        - s: the base dataset has been sparsified using one-hot encoding
        - c: the rare features have been consolidated into one category
        - f: extracted features have been appended, with a different set for
          linear models than for tree-based models
        - b: Benjamin's extracted features
        - d: interactions for the extracted feature set have been added
        - l: the extracted features have been log transformed
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, "rb"):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split("_")[0])
    else:
        DATASETS = ["basic", "tuples", "triples",
                    "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.iteritems():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if len(DATASETS):
        bsfeats, bsfeats_test = get_dataset("bsfeats")

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        # NOTE: the "triples" dataset is also built with create_tuples in
        # the source; an order-3 helper may have been intended here
        Xtr = create_tuples(X)
        Xtr_test = create_tuples(X_test)
        save_dataset("triples", Xtr, Xtr_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test,
                                                log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + "_b", Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + "_f", Xg, Xg_test, feats, feats_test)
            save_dataset(ds + "_fd", Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + "_bd", Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + "_sf", Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + "_sfl", Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + "_sfd", Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + "_sb", Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + "_sbl", Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + "_sbd", Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if issubclass(Xg.dtype.type, np.integer):
                consolidate(Xg, Xg_test)
                save_dataset(ds + "_c", Xg, Xg_test)
                save_dataset(ds + "_cf", Xg, Xg_test, feats, feats_test)
                save_dataset(ds + "_cb", Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + "_sc", Xs, Xs_test)
                save_dataset(ds + "_scf", Xs, Xs_test, feats, feats_test)
                save_dataset(ds + "_scfl", Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + "_scb", Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + "_scbl", Xs, Xs_test,
                             bsfeats_l, bsfeats_l_test)
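
# For reference, a minimal sketch of the kind of order-2 combination
# the docstring describes for the "tuples" dataset. The project's real
# create_tuples is defined elsewhere; this hypothetical version simply
# relabels each distinct pair of column values as one integer code.
import itertools
import numpy as np

def create_tuples_sketch(X):
    """X: 2-D integer array of categorical codes; returns one new
    categorical column per pair of original columns."""
    n, m = X.shape
    out = []
    for i, j in itertools.combinations(range(m), 2):
        keymap = {}
        col = np.empty(n, dtype=int)
        for row, pair in enumerate(zip(X[:, i], X[:, j])):
            # Assign each distinct (value_i, value_j) pair the next id
            col[row] = keymap.setdefault(pair, len(keymap))
        out.append(col)
    return np.vstack(out).T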
def create_features():
    print "loading data"
    # Load training data, drop ROLE_CODE and ACTION
    X = pd.read_csv('data/train.csv')
    X = X.drop(['ROLE_CODE'], axis=1)
    X = X.drop(['ACTION'], axis=1)

    # Load test data, drop ROLE_CODE (ACTION is added then immediately
    # dropped, a no-op kept from the source)
    X_test = pd.read_csv('data/test.csv', index_col=0)
    X_test = X_test.drop(['ROLE_CODE'], axis=1)
    X_test['ACTION'] = 0
    X_test = X_test.drop(['ACTION'], axis=1)

    # Combine test and training data
    X_all = pd.concat([X_test, X], ignore_index=True)

    # Combine role_title with role_family
    X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY'])
    # Combine ROLE_ROLLUP_1 and ROLE_ROLLUP_2
    X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + (
        10000 * X_all['ROLE_ROLLUP_2'])
    X_all = X_all.drop(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY'],
                       axis=1)

    # Count/freq: add the log of each group's count as another feature
    for col in X_all.columns:
        X_all['cnt'+col] = 0
        groups = X_all.groupby([col])
        for name, group in groups:
            count = group[col].count()
            X_all['cnt'+col].ix[group.index] = count
        X_all['cnt'+col] = X_all['cnt'+col].apply(np.log)

    # Percent of dept that is this resource
    # And counts of dept/resource occurrences (tested, not used)
    for col in X_all.columns[1:6]:
        X_all['Duse'+col] = 0.0
        groups = X_all.groupby([col])
        for name, group in groups:
            grps = group.groupby(['RESOURCE'])
            for rsrc, grp in grps:
                X_all['Duse'+col].ix[grp.index] = \
                    float(len(grp.index)) / float(len(group.index))

    # Number of resources that a manager manages
    for col in X_all.columns[0:1]:
    #for col in X_all.columns[0:6]:
        if col == 'MGR_ID':
            continue
        X_all['Mdeps'+col] = 0
        groups = X_all.groupby(['MGR_ID'])
        for name, group in groups:
            X_all['Mdeps'+col].ix[group.index] = len(group[col].unique())

    X_all = X_all.drop(X_all.columns[0:6], axis=1)

    # Split back: test rows came first in the concat, so X is the train
    # set, X_test is the test set, and X_all is both together
    X = X_all[:][X_all.index >= len(X_test.index)]
    X_test = X_all[:][X_all.index < len(X_test.index)]

    X = X.as_matrix()
    X_test = X_test.as_matrix()
    save_dataset('bsfeats', X, X_test)
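
# The Count/freq loop above can be expressed as one groupby/transform per
# column, which is much faster and avoids the long-deprecated .ix indexer.
# A sketch under that assumption (hypothetical helper, recent pandas):
import numpy as np

def add_count_features(df):
    """Append a log-count feature 'cnt<col>' for each existing column."""
    for col in list(df.columns):  # snapshot: new columns are added below
        df['cnt' + col] = np.log(df.groupby(col)[col].transform('count'))
    return df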
def create_features(train='data/train.csv', test='data/test.csv'):
    print "Reading dataset..."
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    # Drop the first column (ACTION/id) and the last (ROLE_CODE)
    all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))
    num_train = np.shape(train_data)[0]

    # Transform data
    print "Transforming data..."
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = np.array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one-hot encodings for each individual feature in memory,
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print "Performing greedy feature selection..."
    score_hist = []
    N = 10
    # Precomputed selections; leave this list empty to rerun the search
    good_features_list = [
        [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55,
         60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85],
        [0, 1, 7, 8, 9, 10, 36, 37, 38, 41, 42, 43, 47, 51, 53, 56,
         60, 61, 63, 64, 66, 67, 69, 71, 75, 79, 85, 91],
        [0, 7, 9, 24, 36, 37, 41, 42, 47, 53, 61, 63, 64, 67, 69,
         71, 75, 85],
        [0, 7, 9, 20, 36, 37, 38, 41, 42, 45, 47, 53, 60, 63, 64,
         67, 69, 71, 81, 85, 86],
    ]

    # Greedy feature selection loop
    if not good_features_list:
        good_features = set([])
        # Keep adding features as long as the CV score improves
        while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
            scores = []
            for f in range(len(Xts)):
                if f not in good_features:
                    feats = list(good_features) + [f]
                    Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                    score = cv_loop(Xt, y, model, N)
                    scores.append((score, f))
                    print "Feature: %i Mean AUC: %f" % (f, score)
            good_features.add(sorted(scores)[-1][1])
            score_hist.append(sorted(scores)[-1])
            print "Current features: %s" % sorted(list(good_features))

        # Remove last added feature from good_features
        good_features.remove(score_hist[-1][1])
        good_features = sorted(list(good_features))

    for i, good_features in enumerate(good_features_list):
        suffix = str(i + 1) if i else ''
        Xt = np.vstack((X_train_all[:, good_features],
                        X_test_all[:, good_features]))
        X_train = Xt[:num_train]
        X_test = Xt[num_train:]
        data.save_dataset("greedy%s" % suffix, X_train, X_test)
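
# cv_loop is not defined in this file. A sketch consistent with how it is
# called above: mean AUC over N random train/validation splits. The 80/20
# split and seeding scheme are assumptions, and recent scikit-learn moves
# train_test_split to sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

def cv_loop_sketch(X, y, model, N):
    """Average validation AUC of `model` over N random 80/20 splits."""
    mean_auc = 0.0
    for i in range(N):
        X_train, X_cv, y_train, y_cv = train_test_split(
            X, y, test_size=0.20, random_state=i)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        mean_auc += roc_auc_score(y_cv, preds)
    return mean_auc / N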
def create_datasets(X, X_test, y, datasets=[], use_cache=True):
    """
    Create datasets with different sets of features
    and save them to disk.
    The datasets are created by combining a base feature set (combinations
    of the original variables) with extracted feature sets, with some
    additional variants.
    """
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "tuples", "triples",
                    "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.iteritems():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if len(DATASETS):
        bsfeats, bsfeats_test = get_dataset('bsfeats')

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test,
                                                log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test)
            save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if Xg.dtype == 'int64':
                consolidate(Xg, Xg_test)
                save_dataset(ds + '_c', Xg, Xg_test)
                save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test)
                save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + '_sc', Xs, Xs_test)
                save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test)
                save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + '_scbl', Xs, Xs_test,
                             bsfeats_l, bsfeats_l_test)
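
# consolidate is defined elsewhere; per the docstring for the "_c"
# variants, it merges rare category values into a single bucket. A minimal
# in-place sketch, matching the consolidate(Xg, Xg_test) call above; the
# occurrence threshold and the -1 sentinel label are assumptions.
import numpy as np

def consolidate_sketch(X_train, X_test, min_count=2):
    """Relabel values seen fewer than min_count times (train and test
    combined) to one shared category per column, in place."""
    for j in range(X_train.shape[1]):
        col = np.concatenate((X_train[:, j], X_test[:, j]))
        values, counts = np.unique(col, return_counts=True)
        rare = set(values[counts < min_count])
        for X in (X_train, X_test):
            mask = np.array([v in rare for v in X[:, j]])
            X[mask, j] = -1
    return X_train, X_test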