def do_iteration(i):
    """Run one cross-validation iteration of the multi-label CRF experiment.

    Relies on names from the enclosing scope (`train`, `test`, `seeds`,
    `labels`, `cv_folds`, `cv_iterator`, `binary`, `vectorizer_method`,
    `log`) -- intended to be mapped over iteration indices in parallel.

    @param i: iteration index; selects this iteration's RNG seed from `seeds`.
    @return: (validation Statistics, testing Statistics) for this iteration.
    """
    print("Iteration " + str(i+1))
    # Deep-copy so the shared train/test frames are never mutated by a worker.
    dev_df = train.copy(deep=True)
    test_df = test.copy(deep=True)
    # Fresh per-iteration RNG seeded from the shared seed list for reproducibility.
    rng = np.random.RandomState()
    rng.seed(seeds[i])
    folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

    # Create the appropriate statistics container for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()
    for train_idx, valid_idx in cv_iterator(folds_i):
        training_fold = dev_df.loc[train_idx, ]
        validation_fold = dev_df.loc[valid_idx, ]

        # shuffle the folds
        training_fold = training_fold.reindex(rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

        stats_valid, stats_test = multi_label_crf(
            labels=labels,
            df_train=training_fold,
            df_valid=validation_fold,
            df_test=test_df,
            binary=binary,
            connectivity='full',
            vectorizer_method=vectorizer_method
        )
        validation_stats_i.merge(stats_valid)
        testing_stats_i.merge(stats_test)

    # NOTE(review): structure reconstructed from collapsed source -- the log
    # writes below are assumed to run once per iteration, after the fold loop;
    # confirm against version history.
    log.write('Iteration {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    testing_stats_i.write(log, 'a')
    return validation_stats_i, testing_stats_i
def do_fold(j):
    """Fit and evaluate one-vs-rest classifiers for cross-validation fold `j`.

    Uses names from the enclosing scope (`folds_i`, `developement_df`,
    `testing_df`, `test_df_i`, `vectorizer_method`, `binary`, `selection`,
    `dag`, `method`, `balanced`, `labels`, `rng`, `scale`, `permute`).

    For every label: vectorise terms, run a randomised hyper-parameter search,
    optionally calibrate probabilities, and accumulate per-label plus
    multi-label statistics.

    @param j: fold index into `folds_i` (each entry is (train_idx, valid_idx)).
    @return: (training, validation, testing) Statistics for this fold.
    """
    print("\tFold " + str(j+1))
    train_idx = folds_i[j][0]
    valid_idx = folds_i[j][1]
    training_fold = developement_df.loc[train_idx, ]
    training_fold = training_fold.reset_index(drop=True)
    validation_fold = developement_df.loc[valid_idx, ]
    validation_fold = validation_fold.reset_index(drop=True)

    # shuffle the folds
    training_stats_i_f = Statistics()
    validation_stats_i_f = Statistics()
    testing_stats_i_f = Statistics()

    # Init the label ranking lists.
    label_pred_proba_train = []
    label_pred_proba_valid = []
    label_pred_proba_test = []
    label_y_train = []
    label_y_valid = []
    label_y_test = []

    # Set up the vectorizer for the bag-of-words representation
    if vectorizer_method == 'tf-idf':
        vectorizer = TfidfVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True,
            sublinear_tf=True, max_df=1.0, min_df=0
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer = CountVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method)))

    selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
    base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)
    for label in sorted(labels):
        print("\t\tFitting for label {}...".format(label))

        # SVMs make the assumption of standardised features. Hence we scale the features
        # avoiding the use of mean to maintain the structure of count sparsity. Scaling
        # May also help with linear model convergence speed.
        x_train_l = vectorizer.transform(training_fold['terms'].values)
        y_train_l = np.asarray(training_fold[label].values, dtype=int)
        x_valid_l = vectorizer.transform(validation_fold['terms'].values)
        y_valid_l = np.asarray(validation_fold[label].values, dtype=int)
        # NOTE(review): features come from `testing_df` but targets come from
        # `test_df_i` -- presumably two names for the same frame; confirm, as
        # a row-order mismatch would silently corrupt the test statistics.
        x_test_l = vectorizer.transform(testing_df['terms'].values)
        y_test_l = np.asarray(test_df_i[label].values, dtype=int)

        if scale:
            # with_mean=False preserves sparsity of the count/tf-idf matrices.
            x_train_l = mean_center(x_train_l, with_mean=False)
            x_valid_l = mean_center(x_valid_l, with_mean=False)
            x_test_l = mean_center(x_test_l, with_mean=False)

        # We generate the folds for randomised search up-front. We hold out one of the folds for
        # Probability calibration so each sampled param set gets calibrated on the same data.
        # This leaves cv_folds-2 folds for randomised search cross-validation.
        # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        base_estimator_l = base_estimators[label]
        # Keep an unfitted clone so calibration below starts from fresh state.
        fresh_estimator = clone(base_estimator_l)

        # Find the best params, then do a final proper calibration.
        params = sk_generate_params(method, selection)
        estimator_l = RandomizedSearchCV(
            estimator=base_estimator_l, param_distributions=params,
            n_iter=60, scoring='f1', cv=3, random_state=rng,
            error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True
        )

        # Test if there's any signal if we permute the labels.
        # Classifier should do poorly if we do so.
        if permute:
            y_train_l = rng.permutation(y_train_l)

        threshold = 0.5
        estimator_l.fit(x_train_l, y_train_l)
        best_params_l = estimator_l.best_params_

        # Calibrate the random forest with the best hyperparameters.
        # (Logistic regression already emits calibrated probabilities.)
        if method not in ['lr']:
            estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l),
                                                 cv=3, method='sigmoid')
            estimator_l.fit(x_train_l, y_train_l)

        # Evaluate Performance characteristics and test on training to check overfitting.
        y_train_prob_l = estimator_l.predict_proba(x_train_l)
        y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
        y_test_prob_l = estimator_l.predict_proba(x_test_l)

        training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
        validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

        # Compute independent test data performance
        testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

        # Get label ranking info; p[1] is the positive-class probability column.
        label_pred_proba_train.append([p[1] for p in y_train_prob_l])
        label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
        label_pred_proba_test.append([p[1] for p in y_test_prob_l])
        label_y_train.append(y_train_l)
        label_y_valid.append(y_valid_l)
        label_y_test.append(y_test_l)

    print(validation_stats_i_f.frame())

    # Compute multi-label performance statistics
    # NOTE(review): np.vstack over a bare zip(...) is a Python-2 idiom; on
    # Python 3 the zip object would need wrapping in list(...) -- verify the
    # target interpreter before porting.
    y = np.vstack(zip(*label_y_train))
    y_prob = np.vstack(zip(*label_pred_proba_train))
    training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_valid))
    y_prob = np.vstack(zip(*label_pred_proba_valid))
    validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_test))
    y_prob = np.vstack(zip(*label_pred_proba_test))
    testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
# NOTE(review): the span below duplicates the tail of `do_fold` above and then
# continues with the body of an enclosing iteration function whose `def` line
# is not visible in this chunk -- it looks like a chunk-extraction artifact.
# Indentation is reconstructed; verify against the original file.
    training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_valid))
    y_prob = np.vstack(zip(*label_pred_proba_valid))
    validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_test))
    y_prob = np.vstack(zip(*label_pred_proba_test))
    testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    return training_stats_i_f, validation_stats_i_f, testing_stats_i_f

    # For each iteration, batch the folds into parallel jobs
    statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs)
    # Each job returns (training, validation, testing) Statistics; merge them
    # into the per-iteration accumulators.
    for (train, val, test) in statistics_objects_i:
        training_stats_i.merge(train)
        validation_stats_i.merge(val)
        testing_stats_i.merge(test)

    log.write('Iteration {}\n'.format(i))
    log.write('Training {}\n'.format(i))
    training_stats_i.write(log, 'a')
    log.write('Validation {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    log.write('Testing {}\n'.format(i))
    testing_stats_i.write(log, 'a')
    statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i])

    # return training_stats_i, validation_stats_i, testing_stats_i
    # containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs)
def multi_label_crf(labels, df_train, df_valid, df_test, binary,
                    connectivity='full', vectorizer_method='count'):
    """Fit a structured multi-label CRF over the label set and evaluate it.

    Builds a bag-of-words representation of the 'terms' column, fits a
    pystruct ``MultiLabelClf`` with a ``OneSlackSSVM`` learner, and merges
    validation (and optionally test) results into Statistics containers.

    @param labels: label names used to binarise the 'label' column.
    @param df_train: training DataFrame with 'terms' and 'label' columns.
    @param df_valid: validation DataFrame with the same columns.
    @param df_test: optional test DataFrame; evaluated only when it is a
        pandas DataFrame.
    @param binary: passed through to the vectorizer's `binary` flag.
    @param connectivity: 'full' (all label pairs, 'ad3' inference), 'tree'
        (Chow-Liu tree over y_train, 'max-product' inference), anything else
        yields an edgeless model with 'unary' inference.
    @param vectorizer_method: 'tf-idf' or 'count'.
    @return: (validation Statistics, test Statistics) when df_test is a
        DataFrame; otherwise the validation Statistics alone.
    @raise TypeError: if vectorizer_method is neither 'tf-idf' nor 'count'.
    """
    stats_container_valid = Statistics()
    stats_container_test = Statistics()

    if vectorizer_method == 'tf-idf':
        # NOTE(review): stop word 'go:' here vs 'go' in the count branch --
        # presumably one of the two is a typo; confirm against the corpus.
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '], binary=binary, lowercase=True,
            sublinear_tf=False, max_df=1.0, min_df=0)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(stop_words=['go', '', ' '],
                                          binary=binary, lowercase=True)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method)))

    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train, vectorizer=vectorizer_node, feature_col='terms',
        label_col='label', select_method=None, continuous_col=[],
        alpha=alpha, percentile=percentile
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid, vectorizer=vectorizer_node, selector=selector_node,
        feature_col='terms', label_col='label', continuous_cols=[]
    )
    y_train = np.asarray([prep.binarise_labels(x, labels) for x in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(x, labels) for x in y_valid], dtype=int)

    # BUG FIX: the original condition was `connectivity == 'full' or
    # connectivity == 'tree'`, which made the `elif connectivity == 'tree'`
    # branch below unreachable -- 'tree' silently got the fully-connected
    # model. 'tree' now selects the Chow-Liu tree as documented.
    if connectivity == 'full':
        n_labels = len(labels)
        edges = np.vstack([x for x in itertools.combinations(range(n_labels), 2)])
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='ad3')
    elif connectivity == 'tree':
        edges = chow_liu_tree(y_train)
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='max-product')
    else:
        edges = None
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='unary')

    # pystruct expects dense arrays.
    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(evaluate_crf_model(x_valid, y_valid, estimator, labels))

    if isinstance(df_test, pd.DataFrame):
        x_node_test, y_test = prep.transform_features(
            df=df_test, vectorizer=vectorizer_node, selector=selector_node,
            feature_col='terms', label_col='label', continuous_cols=[]
        )
        y_test = np.asarray([prep.binarise_labels(x, labels) for x in y_test], dtype=int)
        x_test = x_node_test.toarray()
        stats_container_test.merge(evaluate_crf_model(x_test, y_test, estimator, labels))

    # -------------------- RETURN -------------------- #
    if isinstance(df_test, pd.DataFrame):
        return stats_container_valid, stats_container_test
    else:
        return stats_container_valid
def do_iteration(data):
    """Run one L-LDA cross-validation iteration by shelling out to a Java TMT jar.

    Writes per-fold train/valid/test TSVs under ``llda/models/...``, invokes
    the Stanford TMT jar via ``Popen``, reads the resulting probability CSVs
    back, and accumulates per-label evaluation statistics.

    Uses enclosing-scope names: `train`, `test`, `seeds`, `labels`,
    `cv_folds`, `balanced`, `threshold`, `su_make_dir`, `write_tsv`,
    `get_labels_from_model`, `prep`.

    @param data: tuple whose first element is the iteration index
        (zipped with seeds by the caller; the seed element is unused here).
    @return: (validation Statistics, testing Statistics) for this iteration.
    """
    # for i in xrange(iterations):
    i = data[0]
    print("Iteration " + str(i+1))
    train_i = train.copy(deep=True)
    rng = np.random.RandomState()
    rng.seed(seeds[i])
    su_make_dir('llda/models/iteration-{}'.format(i+1))
    folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
    # Each combination leaves exactly one fold out for validation.
    combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)

    # Create the appropriate statistics container for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()
    for n, j in enumerate(combinations):
        # print('\tFold ' + str(n+1))
        su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
        file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)
        training_folds_j = folds_i[list(j)]
        validation_fold_j = folds_i[[f for f in range(0, cv_folds) if f not in j]]
        assert len(validation_fold_j) == 1
        # Concatenate the selected training folds into a single frame.
        training_fold = reduce(
            lambda x, y: pd.concat([x, y], ignore_index=True, copy=True),
            training_folds_j[1:], training_folds_j[0]
        )
        validation_fold = validation_fold_j[0]

        # shuffle the folds
        if balanced:
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            training_fold = prep.reduce_label_bias(training_fold, labels, 'activation', 5,
                                                   random_state=rng)
        training_fold = training_fold.reindex(rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

        write_tsv(training_fold, file_path + '/train.tsv', test=False)
        write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
        write_tsv(test, file_path + '/test.tsv', test=True)

        # ------------------ CALL JAVA TO LLDA ON THIS PARTAY ----------------- #
        # NOTE(review): this handle is opened every fold and never closed --
        # a small but real file-handle leak; a `with` block would fix it.
        DEVNULL = open(os.devnull, 'w')
        args = [
            'java', '-jar', '-Xmx2048m', 'llda/tmt-0.4.0.jar', 'llda/llda.scala',
            file_path, '/train.tsv', '/valid.tsv', '/test.tsv',
            'model-{}-{}'.format(i+1, n+1), '{}-{}'.format(i+1, n+1)
        ]
        p = Popen(args, stdout=DEVNULL, stderr=STDOUT)
        # NOTE(review): return code is not checked; a failed Java run will
        # surface later as a genfromtxt read error instead.
        p.wait()

        # Perform evaluation
        # First CSV column is skipped ([:, 1:]); remaining columns are taken
        # as per-label probabilities in `labels_j` order.
        validation_proba = np.genfromtxt('llda/results/validation-{}-{}.csv'.format(i+1, n+1),
                                         delimiter=',')[:, 1:]
        test_proba = np.genfromtxt('llda/results/test-{}-{}.csv'.format(i+1, n+1),
                                   delimiter=',')[:, 1:]
        labels_j = get_labels_from_model(file_path + '/llda-cvb0-model-{}-{}'.format(1+i, n+1))

        validation_stats_i_j = Statistics()
        testing_stats_i_j = Statistics()
        for l_index, l in enumerate(labels_j):
            y_validation = [p for p in validation_fold[l].values]
            y_proba_validation = [[1-p, p] for p in validation_proba[:, l_index]]
            y_pred_validation = [int(p >= threshold) for p in validation_proba[:, l_index]]

            y_hprd = [p for p in test[l].values]
            y_proba_hprd = [[1-p, p] for p in test_proba[:, l_index]]
            y_pred_hprd = [int(p >= threshold) for p in test_proba[:, l_index]]

            validation_stats_i_j = evaluate_model(
                y=y_validation, y_pred=y_pred_validation,
                y_pred_prob=y_proba_validation, label=l,
                statistics=validation_stats_i_j, verbose=0
            )
            testing_stats_i_j = evaluate_model(
                y=y_hprd, y_pred=y_pred_hprd,
                y_pred_prob=y_proba_hprd, label=l,
                statistics=testing_stats_i_j, verbose=0
            )
        validation_stats_i.merge(validation_stats_i_j)
        testing_stats_i.merge(testing_stats_i_j)
    return validation_stats_i, testing_stats_i
# NOTE(review): the opening span below duplicates the tail of the preceding
# `do_iteration` (a chunk-extraction artifact); indentation is reconstructed.
# The top-level driver code after it continues past the end of this chunk.
                y_pred_prob=y_proba_hprd, label=l,
                statistics=testing_stats_i_j, verbose=0
            )
        validation_stats_i.merge(validation_stats_i_j)
        testing_stats_i.merge(testing_stats_i_j)
    return validation_stats_i, testing_stats_i


# Fan the iterations out in parallel; each job returns (validation, testing)
# Statistics which are merged into the global accumulators below.
containers = parallel_map(do_iteration, zip(range(iterations), seeds), n_jobs=n_jobs)
valid_containers = [containers[i][0] for i in range(iterations)]
test_containers = [containers[i][1] for i in range(iterations)]
for container in valid_containers:
    validation_stats.merge(container)
for container in test_containers:
    testing_stats.merge(container)

# --------------------- FINAL RESULTS ---------------------------- #
direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')
# NOTE(review): pickle target is opened in text mode 'w' and never closed --
# acceptable on Python 2 (this file uses xrange-era idioms) but would break
# on Python 3, where pickle requires a binary-mode file.
pickle.dump((validation_stats, testing_stats, config), open(direc + '/LLDA-statistics.pkl', 'w'))

results = open(direc + '/LLDA-results.txt', 'w')
results.write("\nRun Settings: \n")
results.write("\tDate: \t\t\t\t{0}\n".format(date))
results.write("\tMethod: \t\t\t{0}\n".format('L-LDA'))
results.write("\tBinary: \t\t\t{0}\n".format('NA'))
results.write("\tBalanced: \t\t\t{0}\n".format(balanced))
results.write("\tChained: \t\t\t{0}\n".format('NA'))