def __init__(self, scaled_features, labels, num_samples, scikit_balancing):
    """
    scaled_features: Must contain the features all scaled to the same range.
    labels: labels corresponding to scaled_features
    num_samples: how many random data points to sample and use from scaled_features
                 for training the feature selector models
    """
    # Select a smaller sample for feature selection.
    indices = numpy.random.choice(scaled_features.shape[0], num_samples, replace=False)
    l1_svm_features = scaled_features[indices, :]
    l1_svm_labels = labels[indices]

    # Manually balance data. Do this on the whole data, because we are training the
    # feature selection on all of the data.
    self.features, self.labels, self.penalty_weights = utils.prepare_train_data(
        l1_svm_features, l1_svm_labels, scikit_balancing, -1)

    # A dictionary from svm cost to trained selector model.
    self.feature_selector_models = dict()

    # Keeps track of the latest model trained for feature selection. All calls for data
    # transformation or feature coefficients are done based on this trained model.
    self.current_model = None
    self.current_transformer = None
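# ----------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the pipeline): this mirrors how
# train_logistic below drives the selector. It assumes the enclosing class is
# exposed as feature_selection.feature_selector and that the variables
# (scaled_train_features, train_labels, scikit_balancing, num_jobs) already
# exist in the caller's scope.
#
#   selector = feature_selection.feature_selector(
#       scaled_train_features, train_labels, len(train_labels), scikit_balancing)
#   selector.select_optimal_set(num_jobs)
#   train_features = selector.transform(train_features)
#   test_features = selector.transform(test_features)
# ----------------------------------------------------------------------------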
def train(model, config):
    input_, label_ = prepare_train_data(config)
    model.train_op = tf.train.AdamOptimizer(
        learning_rate=config.learning_rate).minimize(model.loss)
    # initialize_all_variables is deprecated; use global_variables_initializer.
    tf.global_variables_initializer().run()

    counter = 0
    time_ = time.time()
    model.load("checkpoint")
    print("Starting to train on {} images".format(input_.shape[0]))

    for ep in range(config.epoch):
        batch_i = len(input_) // config.batch_size
        for idx in range(0, batch_i):
            batch_images = input_[idx * config.batch_size:(idx + 1) * config.batch_size]
            batch_labels = label_[idx * config.batch_size:(idx + 1) * config.batch_size]
            counter += 1
            _, err = model.sess.run([model.train_op, model.loss],
                                    feed_dict={
                                        model.images: batch_images,
                                        model.labels: batch_labels
                                    })
            if counter % 100 == 0:
                print("Epoch: [%2d], step: [%2d], time: [%4.4f], loss: [%.8f]" %
                      ((ep + 1), counter, time.time() - time_, err))
            if counter % 1000 == 0:
                model.save("checkpoint", counter)
def train_logistic(train_features, train_labels, test_features, scikit_balancing,
                   train_size, skip_feature_selection, skip_grid_search, penalty, cost,
                   dual, tol, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model
    and the transformed test data.
    """
    # Balance the train data set and create the requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values.
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # The feature selector expects scaled features.
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters to achieve the highest
    # average recall.
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now perform the training on the full train data and check on the test data.
    model = LogisticRegression(penalty=penalty, dual=dual, C=cost, tol=tol,
                               max_iter=5000, class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
def train_and_save_model():
    print("reading data..")
    train_x, train_y, test_x, test_y = prepare_train_data()
    assert train_x.shape[0] == train_y.shape[0]
    assert test_x.shape[0] == test_y.shape[0]

    print("start classifying..")
    model = train_model(train_x, train_y, test_x, test_y)
    model.save(MODEL_DIR, save_format='h5')
def perform_single_svm(input_data):
    """ Perform a single trial of svm with selected features """
    # Extract inputs from the input tuple.
    features = input_data[0]
    labels = input_data[1]
    svm_kernel = input_data[2]
    svm_gamma = input_data[3]
    svm_cost = input_data[4]
    svm_degree = input_data[5]
    scikit_balancing = input_data[6]
    test_size = input_data[7]

    tolerance = 0.005
    cache_size = 6000

    # VERY IMPORTANT: Provide a random state; otherwise multiple workers end up
    # splitting the data in exactly the same way.
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=test_size,
                                         random_state=random.randint(1, 99999999)))
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = svm.SVC(tol=tolerance, cache_size=cache_size, class_weight=penalty_weights,
                    kernel=svm_kernel, gamma=svm_gamma, C=svm_cost, degree=svm_degree)
    model = model.fit(train_features, train_labels)

    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
def perform_single_random_forest(input_data):
    """ Perform a single trial of random forest with selected features """
    # Extract inputs from the input tuple.
    features = input_data[0]
    labels = input_data[1]
    rf_num_trees = input_data[2]
    rf_criterion = input_data[3]
    rf_max_features = input_data[4]
    rf_min_samples_split = input_data[5]
    rf_min_samples_leaf = input_data[6]
    scikit_balancing = input_data[7]
    test_size = input_data[8]

    # VERY IMPORTANT: Provide a random state; otherwise multiple workers end up
    # splitting the data in exactly the same way.
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=test_size,
                                         random_state=random.randint(1, 99999999)))
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = RandomForestClassifier(n_estimators=rf_num_trees, n_jobs=-1,
                                   criterion=rf_criterion,
                                   max_features=rf_max_features,
                                   min_samples_split=rf_min_samples_split,
                                   min_samples_leaf=rf_min_samples_leaf)
    model = model.fit(train_features, train_labels, sample_weight=penalty_weights)

    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
def perform_single_logistic(input_data):
    """ Perform a single trial of logistic regression with selected features """
    # Extract inputs from the input tuple.
    features = input_data[0]
    labels = input_data[1]
    logistic_penalty = input_data[2]
    logistic_cost = input_data[3]
    scikit_balancing = input_data[4]
    test_size = input_data[5]

    tolerance = 0.0005
    max_iterations = 10000

    # VERY IMPORTANT: Provide a random state; otherwise multiple workers end up
    # splitting the data in exactly the same way.
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=test_size,
                                         random_state=random.randint(1, 99999999)))
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = LogisticRegression(penalty=logistic_penalty, C=logistic_cost, tol=tolerance,
                               max_iter=max_iterations, class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
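# ----------------------------------------------------------------------------
# Dispatch sketch (illustration only): the perform_single_* functions above take
# a single packed tuple so they can be mapped over a multiprocessing pool, the
# same pattern used with compute_trial_metrics in the training-size sweep below.
# The hyperparameter values here ('l2', 1.0, 0.2) and the trial/process counts
# are hypothetical.
#
#   import multiprocessing
#
#   trial_inputs = [(features, labels, 'l2', 1.0, scikit_balancing, 0.2)
#                   for _ in range(num_trials)]
#   pool = multiprocessing.Pool(processes=num_processes)
#   trials_metrics = pool.map(perform_single_logistic, trial_inputs)
#   pool.close()
# ----------------------------------------------------------------------------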
def main(_):
    all_sentence, all_tags, all_intent, vocab, dictionary, tags_list, tags_dict, \
        intent_list, intent_dict = prepare_train_data(FLAGS.train_data_file,
                                                      FLAGS.vocab_size)
    train_data, dev_data = split_data(all_sentence, all_tags, all_intent)
    # train_sentence, train_tags, train_intent = train_data
    # dev_sentence, dev_tags, dev_intent = dev_data

    output_path = os.path.join(sys.path[0], 'runs', str(int(time.time())))
    checkpoint_dir = os.path.join(output_path, 'checkpoints')
    os.makedirs(checkpoint_dir, mode=0o755, exist_ok=True)
    save_vocabulary(os.path.join(output_path, 'sentence_vocab'), vocab)
    save_vocabulary(os.path.join(output_path, 'tag_vocab'), tags_list)
    save_vocabulary(os.path.join(output_path, 'intent_vocab'), intent_list)

    model = RNNModel(hidden_size=FLAGS.hidden_size,
                     embed_size=FLAGS.embedding_size,
                     source_vocab_size=len(vocab),
                     tag_vocab_size=len(tags_list),
                     intent_vocab_size=len(intent_list))

    with tf.Session(graph=model.graph) as sess:
        # initialize_all_variables is deprecated; use global_variables_initializer.
        sess.run(tf.global_variables_initializer())
        step = 1
        avg_tag_loss = 0
        avg_intent_loss = 0
        for epoch in range(FLAGS.num_epoch):
            batch_gen = batch_generator(*train_data)
            for sentence_batch, length_batch, tags_batch, intent_batch in batch_gen:
                _, tag_loss, intent_loss = sess.run(
                    [model.train_op, model.tag_loss, model.intent_loss],
                    feed_dict={
                        model.input_x: sentence_batch,
                        model.input_len: length_batch,
                        model.input_tag: tags_batch,
                        model.input_intent: intent_batch,
                        model.keep_prob: FLAGS.dropout_keep_prob
                    })
                avg_tag_loss += tag_loss
                avg_intent_loss += intent_loss
                if step % 20 == 0:
                    avg_tag_loss /= 20
                    avg_intent_loss /= 20
                    # Report the running averages rather than the last batch's losses.
                    print('Step', step, 'Tag loss', avg_tag_loss,
                          'Intent loss', avg_intent_loss)
                    avg_tag_loss = 0
                    avg_intent_loss = 0
                step += 1

            correct_tag, total_tag = 0, 0
            correct_intent, total_intent = 0, 0
            for sentence, tags, intent in zip(*dev_data):
                predict_tags, predict_intent = sess.run(
                    [model.output_tag, model.output_intent],
                    feed_dict={
                        model.input_x: [sentence],
                        model.input_len: [len(sentence)],
                        model.keep_prob: 1.0
                    })
                for tag1, tag2 in zip(tags, predict_tags[0]):
                    if tag1 == tag2:
                        correct_tag += 1
                    total_tag += 1
                if intent == predict_intent[0]:
                    correct_intent += 1
                total_intent += 1

            tag_accuracy = correct_tag / total_tag
            intent_accuracy = correct_intent / total_intent
            print('[Validation]', 'tag acc =', tag_accuracy,
                  ', intent acc =', intent_accuracy, '\n')
            model.saver.save(
                sess,
                os.path.join(checkpoint_dir,
                             '{}_{:.4f}_{:.4f}.ckpt'.format(epoch, tag_accuracy,
                                                            intent_accuracy)))
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Extract features/labels and their names from raw data.
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]

    # We specify absolute train sizes so we can compare across different data sets with
    # different overall sizes. Need to do a dummy split to train and test, so we can
    # figure out the max possible train size after balancing.
    dummy_train_features, dummy_test_features, dummy_train_labels, dummy_test_labels = (
        model_selection.train_test_split(features, labels, test_size=args.test_size))
    dummy_train_features, dummy_train_labels, penalty_weights = utils.prepare_train_data(
        dummy_train_features, dummy_train_labels, args.scikit_balancing, -1)
    max_possible_train_size = dummy_train_features.shape[0]
    # range objects cannot be extended in Python 3, so materialize the list first.
    train_sizes = list(range(400, 15000, 100))
    train_sizes.extend(range(15000, min(max_possible_train_size, 30001), 500))

    metric_names = ["train_size", "test_size", "test_female_size", "test_male_size",
                    "test_true_female", "test_false_female", "test_true_male",
                    "test_false_male", "test_accuracy", "test_female_precision",
                    "test_male_precision", "test_female_recall", "test_male_recall"]

    # Mapping from train size to any of "accuracy", "precision", ... to a list of
    # values, each value corresponding to the result from one trial.
    results = defaultdict(lambda: defaultdict(list))

    finished_trials = 0
    while finished_trials < args.num_trials:
        # Figure out how many parallel processes we should launch to satisfy the number
        # of trials.
        num_processes = min(args.num_processes, args.num_trials - finished_trials)
        replicated_data = list()
        for n in range(0, num_processes):
            # VERY IMPORTANT: Provide a random state, since multiple workers would
            # otherwise split the data in the same way due to an identical initial
            # random state.
            random_seed = random.randint(1, 999999999)
            replicated_data.append((features, labels, train_sizes, random_seed))

        pool = multiprocessing.Pool(processes=num_processes)
        trials_metrics = pool.map(compute_trial_metrics, replicated_data)
        pool.close()
        finished_trials += num_processes

        # Add trial metrics to results by looping over different trials in a list.
        for trial_metrics in trials_metrics:
            # Loop over different train sizes in the dict.
            for train_size in train_sizes:
                metric_values = trial_metrics[train_size]
                # Loop over different metrics.
                for metric in metric_names:
                    results[train_size][metric].append(metric_values[metric])

        print("\nFinished %d trials\n" % finished_trials)

    # Generate the output file and header.
    output_file = open(args.output_filename, "w")
    output_file_writer = csv.writer(output_file)
    output_file_writer.writerow(metric_names)
    for train_size in train_sizes:
        output_file_writer.writerow([train_size,
                                     int(mean(results[train_size]["test_size"])),
                                     int(mean(results[train_size]["test_female_size"])),
                                     int(mean(results[train_size]["test_male_size"])),
                                     int(mean(results[train_size]["test_true_female"])),
                                     int(mean(results[train_size]["test_false_female"])),
                                     int(mean(results[train_size]["test_true_male"])),
                                     int(mean(results[train_size]["test_false_male"])),
                                     mean(results[train_size]["test_accuracy"]),
                                     mean(results[train_size]["test_female_precision"]),
                                     mean(results[train_size]["test_male_precision"]),
                                     mean(results[train_size]["test_female_recall"]),
                                     mean(results[train_size]["test_male_recall"])])
    output_file.close()
def grid_search(score, features, labels, scikit_balancing, algorithm, num_jobs):
    """ Expects the features to be scaled! """
    # Now balance the train data set and create the requested train size.
    features, labels, penalty_weights = utils.prepare_train_data(
        features, labels, scikit_balancing, -1)

    # Set the parameters for grid search and the model based on the algorithm choice.
    if algorithm == 'kernel-svm':
        tuned_parameters = [{'kernel': ['rbf'],
                             'gamma': [0.1, 0.01, 0.001, 0.0001],
                             'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                            {'kernel': ['sigmoid'],
                             'gamma': [0.1, 0.01, 0.001, 0.0001],
                             'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                            {'kernel': ['poly'],
                             'degree': [2, 3, 4],
                             'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
        model = svm.SVC(tol=0.005, cache_size=6000, class_weight=penalty_weights)
    elif algorithm == 'linear-svm':
        tuned_parameters = [{'loss': ['hinge', 'squared_hinge'],
                             'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
        model = svm.LinearSVC(tol=0.005, max_iter=5000, class_weight=penalty_weights)
    elif algorithm == 'logistic':
        tuned_parameters = [{'penalty': ['l1', 'l2'],
                             'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
        model = LogisticRegression(tol=0.0005, max_iter=1000,
                                   class_weight=penalty_weights)
    elif algorithm == 'random-forest':
        tuned_parameters = [{'n_estimators': [100], 'criterion': ['gini', 'entropy'],
                             'max_features': ['sqrt', 'log2', 0.5, 0.8],
                             'min_samples_split': [2], 'min_samples_leaf': [1]},
                            {'n_estimators': [100], 'criterion': ['gini', 'entropy'],
                             'max_features': ['sqrt', 'log2', 0.5, 0.8],
                             'min_samples_split': [5], 'min_samples_leaf': [1, 2]},
                            {'n_estimators': [100], 'criterion': ['gini', 'entropy'],
                             'max_features': ['sqrt', 'log2', 0.5, 0.8],
                             'min_samples_split': [10], 'min_samples_leaf': [2, 5]},
                            {'n_estimators': [100], 'criterion': ['gini', 'entropy'],
                             'max_features': ['sqrt', 'log2', 0.5, 0.8],
                             'min_samples_split': [20], 'min_samples_leaf': [5, 10]},
                            {'n_estimators': [100], 'criterion': ['gini', 'entropy'],
                             'max_features': ['sqrt', 'log2', 0.5, 0.8],
                             'min_samples_split': [50],
                             'min_samples_leaf': [5, 15, 25]}]
        model = RandomForestClassifier(class_weight=penalty_weights)
    elif algorithm == 'knn':
        tuned_parameters = [{'n_neighbors': [1, 2, 3, 4, 5, 10, 15, 20, 30, 50, 70,
                                             100, 150, 200],
                             'metric': ['euclidean', 'manhattan', 'chebyshev'],
                             'algorithm': ['ball_tree', 'kd_tree'],
                             'weights': ['uniform', 'distance']}]
        model = KNeighborsClassifier()
    else:
        sys.exit('Invalid algorithm: ' + algorithm + ' provided')

    scorer = create_scorer(score)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    # Do not pre-dispatch all jobs at once; only dispatch the ones you are running so
    # memory usage does not blow up.
    clf = GridSearchCV(estimator=model, param_grid=tuned_parameters, n_jobs=num_jobs,
                       pre_dispatch="n_jobs", cv=skf, scoring=scorer)
    clf.fit(features, labels)
    return clf
def train_knn(train_features, train_labels, test_features, imbalanced_data, train_size,
              scaling_method, minmax_min, minmax_max, skip_feature_selection,
              skip_grid_search, n_neighbors, weights, algorithm, metric, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model
    and the transformed test data.
    """
    # Balance the train data set and create the requested train size. Here, instead of
    # scikit balancing, we use the imbalanced_data flag and discard the last output,
    # since it is irrelevant to knn. In order not to balance the data, the third
    # argument should be true (simulating scikit balancing), so we pass the
    # imbalanced_data flag in place of scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values.
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # Now that we have limited the data to the requested train size, scale the data.
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters to achieve the highest
    # average recall.
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      imbalanced_data, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now perform the training on the full train data and check on the test data.
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                 algorithm=algorithm, metric=metric)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
def train_svm(train_features, train_labels, test_features, scikit_balancing, train_size,
              scaling_method, minmax_min, minmax_max, skip_feature_selection,
              skip_grid_search, kernel, gamma, cost, degree, num_jobs):
    """
    Balances, extracts the requested train size, imputes, scales and finally performs
    feature selection on the train data. Then it performs grid search and trains a
    model using the best parameters. Performs all the data transformations on test data
    and returns the trained model and the transformed test data.
    """
    # Balance the train data set and create the requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values.
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # Now that we have limited the data to the requested train size, scale the data.
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters to achieve the highest
    # average recall.
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now perform the training on the full train data and check on the test data.
    # We enable probability estimates so that we can identify the top samples.
    model = svm.SVC(tol=0.05, cache_size=6000, class_weight=penalty_weights,
                    kernel=kernel, gamma=gamma, C=cost, degree=degree, probability=True)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
def train_random_forest(train_features, train_labels, test_features, scikit_balancing,
                        train_size, skip_feature_selection, skip_grid_search,
                        max_features, n_estimators, criterion, min_samples_split,
                        min_samples_leaf, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model
    and the transformed test data.
    """
    # Balance the train data set and create the requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values.
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # The feature selector expects scaled features.
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)

    # Grid search was requested: find the best parameters to achieve the highest
    # average recall.
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now perform the training on the full train data and check on the test data.
    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=num_jobs,
                                   criterion=criterion, max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
tf.flags.DEFINE_integer(
    "evaluate_every", 100,
    "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100,
                        "Save model after this many steps (default: 100)")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{} = {}".format(attr, value))
print("")

print("Dataset preparing...")
train_X, train_y = prepare_train_data(FLAGS.pos_data_file, FLAGS.neg_data_file)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y,
                                                    test_size=FLAGS.test_perc)

model = SentimentPredictior(
    MAX_SENT_LEN, MAX_WORD_LEN, LETTERS_COUNT,
    filters=FLAGS.num_filters,
    lr=FLAGS.learning_rate,
    words_take=[int(i) for i in FLAGS.words_take.split(",")],
    dropout=FLAGS.dropout)

print("Start training...")
model.fit(train_X,
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)

    # Figure out the name of the filtering column.
    column_names = df.columns.values.tolist()
    filtering_columns = ['number_of_contacts__allweek__allday__call__mean',
                         'number_of_contacts__call__mean']
    filtering_columns = [x for x in filtering_columns if x in column_names]
    if len(filtering_columns) == 2:
        sys.exit('Both columns ' + str(filtering_columns) + ' are present in data.')
    if len(filtering_columns) == 0:
        sys.exit('None of columns ' + str(filtering_columns) + ' are present in data.')
    filtering_column = filtering_columns[0]

    # Figure out filtering thresholds that satisfy the requested train and test sizes.
    # Need to do a dummy split to train and test, so we can figure out the max possible
    # train size after balancing.
    filtering_thresholds = list()
    for filtering_threshold in arange(0, 7, 0.5):
        data = df[df[filtering_column] >= filtering_threshold].values
        features = data[:, 0:args.label_column]
        labels = data[:, args.label_column].astype(int)
        dummy_train_features, dummy_test_features, dummy_train_labels, dummy_test_labels = (
            model_selection.train_test_split(features, labels,
                                             test_size=args.test_size))
        dummy_train_features, dummy_train_labels, penalty_weights = utils.prepare_train_data(
            dummy_train_features, dummy_train_labels, args.scikit_balancing, -1)
        # This is a good filtering threshold if the number of data points satisfying
        # the threshold exceeds the requested train size.
        if dummy_train_features.shape[0] >= args.train_size:
            filtering_thresholds.append(filtering_threshold)
        else:
            break

    metric_names = [
        "min_active_days", "percentage_data", "train_size", "train_female_size",
        "train_male_size", "test_size", "test_female_size", "test_male_size",
        "test_true_female", "test_false_female", "test_true_male", "test_false_male",
        "test_accuracy", "test_AUC", "test_average_precision", "test_female_precision",
        "test_male_precision", "test_average_recall", "test_female_recall",
        "test_male_recall", "test_average_f1score", "test_female_f1score",
        "test_male_f1score"
    ]

    # Mapping from filtering threshold to any of "accuracy", "precision", ... to a list
    # of values, each value corresponding to the result from one trial.
    results = defaultdict(lambda: defaultdict(list))

    for trial in range(args.num_trials):
        random_seed = random.randint(1, 999999999)
        trial_metrics = compute_trial_metrics(df, filtering_thresholds,
                                              filtering_column, random_seed)
        # Loop over different filtering thresholds in the dict.
        for filtering_threshold in filtering_thresholds:
            metric_values = trial_metrics[filtering_threshold]
            # Loop over different metrics.
            for metric in metric_names:
                results[filtering_threshold][metric].append(metric_values[metric])
        print("\nFinished %d trials\n" % (trial + 1))

    # Generate the output file and header.
    output_file = open(args.output_filename, "w")
    output_file_writer = csv.writer(output_file)
    output_file_writer.writerow(metric_names)
    for filtering_threshold in filtering_thresholds:
        output_file_writer.writerow([
            filtering_threshold,
            mean(results[filtering_threshold]["percentage_data"]),
            int(mean(results[filtering_threshold]["train_size"])),
            int(mean(results[filtering_threshold]["train_female_size"])),
            int(mean(results[filtering_threshold]["train_male_size"])),
            int(mean(results[filtering_threshold]["test_size"])),
            int(mean(results[filtering_threshold]["test_female_size"])),
            int(mean(results[filtering_threshold]["test_male_size"])),
            int(mean(results[filtering_threshold]["test_true_female"])),
            int(mean(results[filtering_threshold]["test_false_female"])),
            int(mean(results[filtering_threshold]["test_true_male"])),
            int(mean(results[filtering_threshold]["test_false_male"])),
            mean(results[filtering_threshold]["test_accuracy"]),
            mean(results[filtering_threshold]["test_AUC"]),
            mean(results[filtering_threshold]["test_average_precision"]),
            mean(results[filtering_threshold]["test_female_precision"]),
            mean(results[filtering_threshold]["test_male_precision"]),
            mean(results[filtering_threshold]["test_average_recall"]),
            mean(results[filtering_threshold]["test_female_recall"]),
            mean(results[filtering_threshold]["test_male_recall"]),
            mean(results[filtering_threshold]["test_average_f1score"]),
            mean(results[filtering_threshold]["test_female_f1score"]),
            mean(results[filtering_threshold]["test_male_f1score"])
        ])
    output_file.close()