def _read_feature_file(path):
    """Parse a feature file into its input features and single output feature.

    The file holds one feature name per line.  Exactly one line must start
    with a dash: names above it are input features, and the single name
    below it is the output feature.  Anything after a '#' on a line is
    ignored, surrounding whitespace is stripped, and lines that end up
    empty are skipped.

    Returns an ``(input_features, output_feature)`` tuple, where
    ``input_features`` is a list of names in file order.  The layout rules
    are enforced with assertions.
    """
    with open(path, 'r') as f:
        lines = f.readlines()

    dashed = [i for i, line in enumerate(lines) if line.startswith('-')]
    assert (len(dashed) == 1), "Feature file must have a single dashed line separating input/output features."
    split_at = dashed[0]

    def uncommented(rows):
        # Drop trailing '#' comments and whitespace; keep non-empty names only.
        names = (row.partition('#')[0].strip() for row in rows)
        return [name for name in names if name]

    input_features = uncommented(lines[:split_at])
    output_features = uncommented(lines[split_at + 1:])
    assert (len(output_features) == 1), "Feature file must have exactly one output feature."
    return input_features, output_features[0]


def main():
    """Train (or load) a random forest on a marked CSV and report test results.

    Command-line driven.  The data file is split at random into a training
    part and a test part (``--test_fraction``).  Unless ``--load`` is given,
    a RandomForestClassifier is fitted on the training part using the
    features named in the feature file and pickled to
    ``<model><seed>.pickle``; with ``--load`` that pickle is read instead.

    Outputs:
      * predicted probabilities (second ``predict_proba`` column) for the
        test rows, written to ``--probs`` or ``predicted_probs<seed>.dat``;
      * confusion matrix, classification report, accuracy and the ranked
        feature importances, printed to stdout;
      * a bar chart of the feature importances saved to
        ``feature_importances<seed>.png``.
    """
    p = optparse.OptionParser()
    p.add_option('--model', '-m', default='model', type=str, help='model filename prefix')
    p.add_option('--load', '-L', default=False, action='store_true', help='load model from file')
    p.add_option('--features', '-f', default='features.txt', type=str, help='feature filename')
    p.add_option('--data', '-d', default='data.csv', type=str, help='marked data filename')
    p.add_option('--verbose', '-v', default=False, action='store_true', help='verbosity flag')
    p.add_option('--thresh', '-T', default=0.5, type=float, help='probability threshold to classify True')
    p.add_option('--n_estimators', '-n', default=100, type=int, help='number of random forest estimators')
    p.add_option('--test_fraction', '-t', default=0.25, type=float, help='fraction of data to use for testing')
    p.add_option('--seed', '-s', default=None, type=int, help='random seed')
    p.add_option('--jobs', '-j', default=-1, type=int, help='number of jobs (-1 if maximum)')
    p.add_option('--probs', '-p', default=None, type=str, help='filename for output probabilities')
    opts, _ = p.parse_args()

    # Every generated filename carries the seed (when one is given).
    seed_suffix = '' if opts.seed is None else str(opts.seed)
    model_filename = opts.model + '%s.pickle' % seed_suffix
    probs_filename = ('predicted_probs%s.dat' % seed_suffix) if opts.probs is None else opts.probs

    np.random.seed(opts.seed)

    if opts.verbose:
        print("\nReading marked data from %s..." % opts.data)

    # establish data frame
    df = pd.read_csv(opts.data)
    n_lines = len(df)

    # choose test set as random test_fraction of data, leaving the remainder for training
    n_test = int(opts.test_fraction * n_lines)
    if opts.verbose:
        print("Read %d lines of data -> %d lines (training), %d lines (test)" % (n_lines, n_lines - n_test, n_test))
    test_subset = np.random.permutation(range(n_lines))[:n_test]
    is_train = np.ones(n_lines, dtype=bool)
    is_train[test_subset] = False

    # establish training and test sets
    train, test = df[is_train], df[~is_train]

    if opts.load:
        # NOTE: unpickling can execute arbitrary code -- only load model
        # files that come from a trusted source.
        with open(model_filename, 'rb') as f:
            rfc = pickle.load(f)
        if opts.verbose:
            print("\nLoaded model from '%s'.\n" % model_filename)
    else:
        # set the random forest instance
        rfc = RandomForestClassifier(n_estimators=opts.n_estimators, n_jobs=opts.jobs)
        # The feature lists are stored on the classifier itself so that they
        # are pickled with it and available again on --load.
        rfc.input_features, rfc.output_feature = _read_feature_file(opts.features)

    num_features = len(rfc.input_features)
    assert (num_features > 0), "Feature file must have at least one input feature."

    X = train[rfc.input_features]
    y = train[rfc.output_feature]

    if not opts.load:
        # train the forest
        if opts.verbose:
            print("\nTraining %d random forests..." % opts.n_estimators)
        rfc.fit(X, y)
        # save off the model; the context manager guarantees the pickle is
        # flushed and the handle closed
        with open(model_filename, 'wb') as f:
            pickle.dump(rfc, f)
        if opts.verbose:
            print("\nSaved model to '%s'.\n" % model_filename)

    # make predictions on the test data
    actual = test[rfc.output_feature]
    probs = rfc.predict_proba(test[rfc.input_features])[:, 1]
    pd.Series(probs).to_csv(probs_filename, index=False)
    test_preds = (probs >= opts.thresh)

    conf_df = pd.crosstab(actual, test_preds, rownames=['actual'], colnames=['predicted'])
    # crosstab only emits rows/columns for values that actually occur, so a
    # threshold that yields a single predicted class (or a test split with a
    # single actual class) would break the [1, 1] lookup below.  Pin the
    # layout to classes x {False, True}, filling absent cells with 0.
    conf_df = conf_df.reindex(index=rfc.classes_, columns=[False, True], fill_value=0)
    conf_mat = np.asarray(conf_df)
    class_report = classification_report(actual, test_preds)

    print("\nConfusion Matrix")
    print(conf_df)
    print("\nClassification Report")
    print(class_report)
    accuracy = (conf_mat[0, 0] + conf_mat[1, 1]) / float(np.sum(conf_mat))
    print("Accuracy = %.3f%%" % (100. * accuracy))

    print("\nFeature Importances")
    ranked = sorted(
        ((i, name, importance)
         for i, (name, importance) in enumerate(zip(rfc.input_features, rfc.feature_importances_))),
        key=lambda triple: triple[2],
        reverse=True,
    )
    indices, features, importances = zip(*ranked)
    for name, importance in zip(features, importances):
        # %7.3f has the same width as the former "%3d.%03d" but cannot emit
        # a four-digit fraction when the value rounds up (e.g. 12.9996).
        print("%17s %7.3f%%" % (name, 100. * importance))

    # Spread of each feature's importance across the individual trees,
    # drawn as error bars in the same (ranked) order as the bars.
    stds = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
    n_bars = X.shape[1]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_bars), importances, color='r', yerr=[stds[i] for i in indices], align='center')
    plt.xticks(range(n_bars), features, rotation='vertical')
    plt.xlim([-1, n_bars])
    fig = plt.gcf()
    fig.subplots_adjust(bottom=0.25)  # leave room for the vertical tick labels
    plt.savefig('feature_importances%s.png' % seed_suffix)
def _read_feature_file(path):
    """Parse a feature file into its input features and single output feature.

    The file holds one feature name per line.  Exactly one line must start
    with a dash: names above it are input features, and the single name
    below it is the output feature.  Anything after a '#' on a line is
    ignored, surrounding whitespace is stripped, and lines that end up
    empty are skipped.

    Returns an ``(input_features, output_feature)`` tuple, where
    ``input_features`` is a list of names in file order.  The layout rules
    are enforced with assertions.
    """
    with open(path, 'r') as f:
        lines = f.readlines()

    dashed = [i for i, line in enumerate(lines) if line.startswith('-')]
    assert (len(dashed) == 1), "Feature file must have a single dashed line separating input/output features."
    split_at = dashed[0]

    def uncommented(rows):
        # Drop trailing '#' comments and whitespace; keep non-empty names only.
        names = (row.partition('#')[0].strip() for row in rows)
        return [name for name in names if name]

    input_features = uncommented(lines[:split_at])
    output_features = uncommented(lines[split_at + 1:])
    assert (len(output_features) == 1), "Feature file must have exactly one output feature."
    return input_features, output_features[0]


def main():
    """Train (or load) a random forest on the yoochoose session features.

    Command-line driven.  The training and dev session-feature CSVs are
    combined into the training set; the test session-feature CSV is the
    test set.  Unless ``--load`` is given, a RandomForestClassifier is
    fitted using the features named in the feature file and pickled to
    ``model<seed>.pickle``; with ``--load`` that pickle is read instead.

    Outputs:
      * predicted probabilities (second ``predict_proba`` column) for the
        test rows, written to ``test_probs<seed>``;
      * confusion matrix, classification report, accuracy and the ranked
        feature importances, written to ``test_report<seed>``.
    """
    p = optparse.OptionParser()
    p.add_option('--load', '-L', default=False, action='store_true', help='load model from file')
    p.add_option('--features', '-f', default='features.txt', type=str, help='feature filename')
    p.add_option('--verbose', '-v', default=False, action='store_true', help='verbosity flag')
    p.add_option('--thresh', '-T', default=0.5, type=float, help='probability threshold to classify True')
    p.add_option('--n_estimators', '-n', default=100, type=int, help='number of random forest estimators')
    p.add_option('--seed', '-s', default=None, type=int, help='random seed')
    p.add_option('--jobs', '-j', default=-1, type=int, help='number of jobs (-1 if maximum)')
    opts, _ = p.parse_args()

    # Every generated filename carries the seed (when one is given).
    seed_suffix = '' if opts.seed is None else str(opts.seed)
    model_filename = 'model%s.pickle' % seed_suffix

    np.random.seed(opts.seed)

    if opts.verbose:
        print("\nReading data set...")
    # DataFrame.append was removed in pandas 2.0; concat gives the same
    # row-wise stacking (original row labels kept) on every pandas version.
    train = pd.concat([
        pd.read_csv('yoochoose/data/training_session_features.csv'),
        pd.read_csv('yoochoose/data/dev_session_features.csv'),
    ])
    test = pd.read_csv('yoochoose/data/test_session_features.csv')

    if opts.load:
        # NOTE: unpickling can execute arbitrary code -- only load model
        # files that come from a trusted source.
        with open(model_filename, 'rb') as f:
            rfc = pickle.load(f)
        if opts.verbose:
            print("\nLoaded model from '%s'.\n" % model_filename)
    else:
        # set the random forest instance
        rfc = RandomForestClassifier(n_estimators=opts.n_estimators, n_jobs=opts.jobs)
        # The feature lists are stored on the classifier itself so that they
        # are pickled with it and available again on --load.
        rfc.input_features, rfc.output_feature = _read_feature_file(opts.features)

    num_features = len(rfc.input_features)
    assert (num_features > 0), "Feature file must have at least one input feature."

    X = train[rfc.input_features]
    y = train[rfc.output_feature]

    if not opts.load:
        # train the forest
        if opts.verbose:
            print("\nTraining %d random forests..." % opts.n_estimators)
        rfc.fit(X, y)
        # save off the model; the context manager guarantees the pickle is
        # flushed and the handle closed
        with open(model_filename, 'wb') as f:
            pickle.dump(rfc, f)
        if opts.verbose:
            print("\nSaved model to '%s'.\n" % model_filename)

    # make predictions on the test data
    actual = test[rfc.output_feature]
    probs = rfc.predict_proba(test[rfc.input_features])[:, 1]
    pd.Series(probs).to_csv('test_probs%s' % seed_suffix, index=False)
    test_preds = (probs >= opts.thresh)

    conf_df = pd.crosstab(actual, test_preds, rownames=['actual'], colnames=['predicted'])
    # crosstab only emits rows/columns for values that actually occur, so a
    # threshold that yields a single predicted class (or a test set with a
    # single actual class) would break the [1, 1] lookup below.  Pin the
    # layout to classes x {False, True}, filling absent cells with 0.
    conf_df = conf_df.reindex(index=rfc.classes_, columns=[False, True], fill_value=0)
    conf_mat = np.asarray(conf_df)
    class_report = classification_report(actual, test_preds)
    accuracy = (conf_mat[0, 0] + conf_mat[1, 1]) / float(np.sum(conf_mat))

    # Most important feature first; sorted() is stable, so ties keep the
    # feature-file order.
    ranked = sorted(
        zip(rfc.input_features, rfc.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    report = [
        "\nConfusion Matrix\n",
        str(conf_df) + '\n',
        "\nClassification Report\n",
        class_report + '\n',
        "Accuracy = %.3f%%\n" % (100. * accuracy),
        "\nFeature Importances\n",
    ]
    # %7.3f has the same width as the former "%3d.%03d" but cannot emit a
    # four-digit fraction when the value rounds up (e.g. 12.9996).
    report.extend("%17s %7.3f%%\n" % (name, 100. * importance) for name, importance in ranked)

    with open('test_report%s' % seed_suffix, 'w') as f:
        f.write(''.join(report))