def vectorized_clean_pu(ratio=1.0):
    """Prepare vectorized feature matrices for PU training.

    Loads the cleaned PU corpora, fits a vectorizer on P and U, selects
    features with pseudo-labels (1 for P, 0 for U), and transforms the
    test set with the same pipeline.

    Returns (P, U, X_test, y_test, vec, sel).
    """
    P_raw, U_raw, X_test_raw, y_test = clean_corpus_pu(ratio)
    print("\nPU TRAINING", "(on", 100 * ratio, "% of available data)",
          "\tP: HOC POS + CIVIC", "(", num_rows(P_raw), ")",
          "\tN: HOC NEG + ABSTRACTS (", num_rows(U_raw), ")",
          "\tTEST SET (HOC POS + CIVIC + HOC NEG):", num_rows(X_test_raw))

    # fit the vectorizer on the union of positive and unlabelled sentences
    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, U_raw)))
    P, U = vec.transform(P_raw), vec.transform(U_raw)
    print("Features before selection:", np.shape(P)[1])

    # feature selection using pseudo-labels: 1 for P, 0 for U
    sel = transformers.percentile_selector()
    pseudo_labels = helpers.concatenate((np.ones(num_rows(P)),
                                         np.zeros(num_rows(U))))
    sel.fit(vstack((P, U)), pseudo_labels)
    P, U = sel.transform(P), sel.transform(U)
    X_test = sel.transform(vec.transform(X_test_raw))
    print("Features after selection:", np.shape(P)[1])
    return P, U, X_test, y_test, vec, sel
def vectorize_preselection(P, U, ratio=1.0):
    """generate and select features for ratio of sentence sets"""
    print("Preprocessing corpora for PU learning")
    if ratio < 1.0:
        print("Training on", 100 * ratio, "% of data")
        # subsample both sets with a fixed seed for reproducibility
        P, _ = train_test_split(P, train_size=ratio, random_state=RANDOM_SEED)
        U, _ = train_test_split(U, train_size=ratio, random_state=RANDOM_SEED)

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P, U)))
    P_vec, U_vec = vec.transform(P), vec.transform(U)
    print("Features before selection:", np.shape(U_vec)[1])

    # feature selection with pseudo-labels: +1 for positive, -1 for unlabelled
    sel = transformers.percentile_selector()
    y_pseudo = helpers.concatenate((np.ones(num_rows(P_vec)),
                                    -np.ones(num_rows(U_vec))))
    sel.fit(vstack((P_vec, U_vec)), y_pseudo)
    return sel.transform(P_vec), sel.transform(U_vec), vec, sel
def vectorized_clean_pnu(ratio=1.0):
    """Prepare vectorized feature matrices for semi-supervised (PNU) training.

    Loads cleaned P/N/U corpora, fits vectorizer and feature selector on
    all three sets, and returns the transformed matrices plus the fitted
    transformers: (P, N, U, vec, sel).
    """
    P_raw, N_raw, U_raw = clean_corpus_pnu(ratio)
    print("\nSEMI-SUPERVISED TRAINING", "(on", 100 * ratio, "% of available data)",
          "\tP: HOC POS + CIVIC (", num_rows(P_raw), ")",
          "\tN: HOC NEG (", num_rows(N_raw), ")",
          "\tU: ABSTRACTS (", num_rows(U_raw), ")")

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, N_raw, U_raw)))
    P, N, U = (vec.transform(x) for x in (P_raw, N_raw, U_raw))
    print("Features before selection:", np.shape(P)[1])

    # pseudo-labels for selection: +1 positive, -1 negative, 0 unlabelled
    sel = transformers.percentile_selector()
    y_pseudo = helpers.concatenate((np.ones(num_rows(P)),
                                    -np.ones(num_rows(N)),
                                    np.zeros(num_rows(U))))
    sel.fit(vstack((P, N, U)), y_pseudo)
    P, N, U = (sel.transform(x) for x in (P, N, U))
    print("Features after selection:", np.shape(P)[1])
    return P, N, U, vec, sel
def run_EM_with_RN(P, U, RN, max_pos_ratio=1.0, tolerance=0.05,
                   max_imbalance_P_RN=10.0, clf_selection=True, verbose=False):
    """second step PU method: train NB with P and RN to get probabilistic labels for U, then iterate EM"""
    # cap P for the initial model at max_imbalance_P_RN times |RN|
    # (random subsample) to avoid an overly imbalanced first classifier
    if num_rows(P) > max_imbalance_P_RN * num_rows(RN):
        P_init = np.array(
            random.sample(list(P), int(max_imbalance_P_RN * num_rows(RN))))
    else:
        P_init = P
    if verbose:
        print(
            "\nBuilding classifier from Positive and Reliable Negative set"
        )
    # initial NB: P_init labelled 1, RN labelled 0
    initial_model = build_proba_MNB(concatenate((P_init, RN)),
                                    concatenate((np.ones(num_rows(P_init)),
                                                 np.zeros(num_rows(RN)))),
                                    verbose=verbose)
    if num_rows(U) == 0:
        # nothing unlabelled left to iterate on; return the one-shot model
        print("Warning: EM: All of U was classified as negative.")
        return initial_model
    y_P = np.array([1] * num_rows(P))
    if verbose:
        print(
            "\nCalculating initial probabilistic labels for Reliable Negative and Unlabelled set"
        )
    # soft positive-class probabilities from the initial model seed the EM loop
    ypU = initial_model.predict_proba(U)[:, 1]
    ypN = initial_model.predict_proba(RN)[:, 1]
    if verbose:
        print("\nIterating EM algorithm on P, RN and U\n")
    # RN is merged back into the unlabelled pool, carrying its soft labels
    model = iterate_EM(P, concatenate((RN, U)), y_P, concatenate((ypN, ypU)),
                       tolerance=tolerance, max_pos_ratio=max_pos_ratio,
                       clf_selection=clf_selection, verbose=verbose)
    return model
def rocchio(P, N, alpha=16, beta=4, binary=False):
    """Train a BinaryRocchio prototype classifier on positive examples P
    and negative examples N (labels 1 and 0 respectively).

    The fitted model's predict_proba returns similarity scores.
    NOTE(review): the `binary` flag is accepted but not forwarded anywhere
    in this function — confirm whether it should reach BinaryRocchio.
    """
    features = concatenate((P, N))
    labels = concatenate((ones(num_rows(P)), zeros(num_rows(N))))
    return BinaryRocchio(alpha=alpha, beta=beta).fit(features, labels)
def get_RN_Spy_Docs(P, U, spy_ratio=0.1, max_pos_ratio=0.5, tolerance=0.2,
                    noise_lvl=0.05, verbose=False):
    """First step technique: Compute reliable negative docs from P using Spy Documents and I-EM"""
    # hide a fraction of P ("spies") inside U, then run EM on the rest of P
    P_without_spies, spies = spy_partition(P, spy_ratio)
    model = iterate_EM(P_without_spies, concatenate((U, spies)),
                       tolerance=tolerance, max_pos_ratio=max_pos_ratio,
                       clf_selection=False, verbose=verbose)
    # spy scores define the threshold below which U-docs count as reliable negatives
    spy_scores = model.predict_proba(spies)[:, 1]
    u_scores = model.predict_proba(U)[:, 1]
    U_minus_RN, RN = select_PN_below_score(spy_scores, U, u_scores,
                                           noise_lvl=noise_lvl)
    return U_minus_RN, RN
def biased_SVM_grid_search(P, U, Cs=None, kernel='linear', n_estimators=9,
                           verbose=False):
    """Grid-search C for a bagged LinearSVC with balanced class weights,
    scoring each candidate with the PU measure; return the best estimator.

    NOTE(review): the `kernel` parameter is not applicable to LinearSVC and
    is currently unused (the corresponding grid entries were disabled).
    """
    if Cs is None:
        Cs = [10 ** exp for exp in range(-12, 12, 2)]
    if verbose:
        print(
            "Running Biased-SVM with balanced class weights and grid search over",
            len(Cs), "C values")
    param_grid = {
        # parameters forwarded to the wrapped LinearSVC
        'base_estimator__C': Cs,
        'base_estimator__class_weight': ['balanced'],
        # fit parameters for the Bagging wrapper
        'bootstrap': [True],
        'n_estimators': [n_estimators],
    }
    grid_search = GridSearchCV(BaggingClassifier(LinearSVC()),
                               param_grid=param_grid,
                               scoring=pu_scorer,
                               verbose=0)
    if verbose:
        print("Grid searching parameters for biased-SVM")
    X = concatenate((P, U))
    y = concatenate((ones(num_rows(P)), zeros(num_rows(U))))
    grid_search.fit(X, y)
    if verbose:
        train_report(grid_search.best_estimator_, P, U)
        print("Biased-SVM parameters:", grid_search.best_params_,
              "\tPU score:", grid_search.best_score_)
    return grid_search.best_estimator_
def model_pu_score_record(P_train, U_train, P_test, U_test, m):
    """Train the model factory in record m on (P_train, U_train) and
    evaluate it with the PU score on the held-out P/U sets.

    Returns a stats record with name, factory, pu_score, and the fraction
    of U_test predicted positive.
    """
    trained = m['model'](P_train, U_train)
    predictions = trained.predict(helpers.concatenate((P_test, U_test)))
    split_at = num_rows(P_test)
    y_P, y_U = predictions[:split_at], predictions[split_at:]
    return {'name': m['name'],
            'model': m['model'],
            'pu_score': pu_score(y_P, y_U),
            'ratio_in_U': np.sum(y_U) / num_rows(y_U)}
def best_model_cross_val(P, N, U, fold=10):
    """determine best model, cross validate and return pipeline trained on all data"""
    print("\nFinding best model")
    best = get_best_model(P, N, U)['best']
    print("\nCross-validation\n")
    # pair up the P-folds and N-folds; U is used whole in every fold
    kf = KFold(n_splits=fold, shuffle=True)
    splits = zip(list(kf.split(P)), list(kf.split(N)))
    # TODO doesn't work in parallel
    # if PARALLEL:
    #     with multi.Pool(min(fold, multi.cpu_count())) as p:
    #         stats = list(p.map(partial(eval_fold, best, P, N, U), enumerate(splits), chunksize=1))
    # else:
    #     stats = list(map(partial(eval_fold, best, P, N, U), enumerate(splits)))
    stats = list(map(partial(eval_fold, best, P, N, U), enumerate(splits)))
    # each fold returns [precision, recall, f1, accuracy]
    mean_stats = np.mean(stats, 0)
    print("Cross-validation average: p {}, r {}, f1 {}, acc {}".format(
        mean_stats[0], mean_stats[1], mean_stats[2], mean_stats[3]))
    print("Retraining model on full data")
    # refit the winning vectorizer/selector on ALL data before final training
    vec, sel = best['vectorizer'], best['selector']
    vec.fit(concatenate((P, N, U)))
    P_, N_, U_ = [vec.transform(x) for x in [P, N, U]]
    # pseudo-labels for the selector: +1 positive, -1 negative, 0 unlabelled
    y_pp = concatenate((np.ones(num_rows(P)), -np.ones(num_rows(N)),
                        np.zeros(num_rows(U))))
    sel.fit(concatenate((P_, N_, U_)), y_pp)
    P_, N_, U_ = [(sel.transform(x)) for x in [P_, N_, U_]]
    model = best['untrained_model'](P_, N_, U_)
    print("Ratio of U classified as positive:",
          np.sum(model.predict(U_)) / num_rows(U_))
    print("Returning final model")
    return Pipeline([('vectorizer', vec), ('selector', sel), ('clf', model)])
def clean_corpus_pu(ratio=1.0):
    """Build cleaned P / U sets and a held-out test set for PU training.

    Uses module-level corpora (presumably `hocneg`, `hocpos`, `civic`,
    `abstracts` — defined elsewhere in this module; confirm).
    Returns (P_raw, U_raw, X_test_raw, y_test).
    """
    # remove worst percentage
    # print("\nRemoving CIViC-like sentences from HoC[neg]\n")
    # hocneg_ = cleanup_sources.remove_least_similar_percent(noisy=hocneg, guide=civic, ratio=ratio, percentile=15)
    # print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_least_similar_percent(noisy=hocpos, guide=hocneg_, ratio=ratio, percentile=10)
    # print("\nRemoving CIViC-unlike sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_least_similar_percent(noisy=hocpos_, guide=civic, ratio=ratio, percentile=10,
    #                                                        inverse=True)

    # remove what is ambiguous according to PU training
    print("\nRemoving CIViC-like sentences from HoC[neg]\n")
    hocneg_ = remove_P_from_U(U=hocneg, P=civic, ratio=ratio)
    print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
    hocpos_ = remove_P_from_U(U=hocpos, P=hocneg_, ratio=ratio)
    # print("\nRemoving CIViC-unlike sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_P_from_U(noisy=hocpos, guide=civic, ratio=ratio, inverse=True)

    # 80/20 train/test split of each source, fixed seed for reproducibility
    hocpos_train, hocpos_test = train_test_split(hocpos_, test_size=0.2,
                                                 random_state=RANDOM_SEED)
    civic_train, civic_test = train_test_split(civic, test_size=0.2,
                                               random_state=RANDOM_SEED)
    hocneg_train, X_test_neg = train_test_split(hocneg_, test_size=0.2,
                                                random_state=RANDOM_SEED)
    # P = positives (HoC[pos] + CIViC); U = unlabelled (abstracts + HoC[neg])
    P_raw = helpers.concatenate((hocpos_train, civic_train))
    U_raw = helpers.concatenate((abstracts, hocneg_train))
    X_test_pos = helpers.concatenate((hocpos_test, civic_test))
    if ratio < 1.0:
        # optionally subsample every split to the requested ratio
        P_raw, _ = train_test_split(P_raw, train_size=ratio,
                                    random_state=RANDOM_SEED)
        U_raw, _ = train_test_split(U_raw, train_size=ratio,
                                    random_state=RANDOM_SEED)
        X_test_pos, _ = train_test_split(X_test_pos, train_size=ratio,
                                         random_state=RANDOM_SEED)
        X_test_neg, _ = train_test_split(X_test_neg, train_size=ratio,
                                         random_state=RANDOM_SEED)
    X_test_raw = helpers.concatenate((X_test_pos, X_test_neg))
    y_test = helpers.concatenate((np.ones(num_rows(X_test_pos)),
                                  np.zeros(num_rows(X_test_neg))))
    return P_raw, U_raw, X_test_raw, y_test
def eval_fold(model_record, P, N, U, i_splits):
    """helper function for running cross validation in parallel"""
    fold_no, (p_split, n_split) = i_splits
    P_train, P_test = P[p_split[0]], P[p_split[1]]
    N_train, N_test = N[n_split[0]], N[n_split[1]]
    # pseudo-labels for the preprocessing fit: +1 pos, -1 neg, 0 unlabelled
    y_train_pp = concatenate((np.ones(num_rows(P_train)),
                              -np.ones(num_rows(N_train)),
                              np.zeros(num_rows(U))))
    preproc = clone(Pipeline([('vectorizer', model_record['vectorizer']),
                              ('selector', model_record['selector'])]))
    preproc.fit(concatenate((P_train, N_train, U)), y_train_pp)
    P_, N_, U_, P_test_, N_test_ = [preproc.transform(x)
                                    for x in [P_train, N_train, U, P_test, N_test]]
    model = model_record['untrained_model'](P_, N_, U_)
    y_pred = model.predict(concatenate((P_test_, N_test_)))
    y_test = concatenate((np.ones(num_rows(P_test_)),
                          np.zeros(num_rows(N_test_))))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)
    print("Fold no.", fold_no, "acc", acc, "classification report:\n",
          classification_report(y_test, y_pred))
    return [precision, recall, f1, acc]
def iterate_EM(P, U, y_P=None, ypU=None, tolerance=0.05, max_pos_ratio=1.0,
               clf_selection=False, verbose=False):
    """EM algorithm for positive set P and unlabelled set U

    iterate NB classifier with updated labels for unlabelled set (with optional initial labels)
    until convergence"""
    if y_P is None:
        y_P = ([1.] * num_rows(P))
    if ypU is None:
        ypU = ([0.] * num_rows(U))
    # sentinel that can never be almost_equal to real probabilities,
    # guaranteeing at least one iteration
    ypU_old = [-999]
    iterations = 0
    old_model = None
    new_model = None
    # iterate until U's soft labels stop changing (within tolerance)
    while not almost_equal(ypU_old, ypU, tolerance):
        iterations += 1
        if verbose:
            print("Iteration #", iterations,
                  "\tBuilding new model using probabilistic labels")
        if clf_selection:
            # keep previous model so we can fall back if quality degrades
            old_model = new_model
        new_model = build_proba_MNB(concatenate((P, U)),
                                    concatenate((y_P, ypU)),
                                    verbose=verbose)
        if verbose:
            print("Predicting probabilities for U")
        ypU_old = ypU
        ypU = new_model.predict_proba(U)[:, 1]
        # hard predictions derived from the soft labels
        predU = [round(p) for p in ypU]
        pos_ratio = sum(predU) / num_rows(U)
        if verbose:
            print("Unlabelled instances classified as positive:", sum(predU),
                  "/", num_rows(U), "(", pos_ratio * 100, "%)\n")
        if clf_selection and old_model is not None:
            # abort if the approximated error grew since the last iteration
            if em_getting_worse(old_model, new_model, P, U):
                if verbose:
                    print(
                        "Approximated error has grown since last iteration.\n"
                        "Aborting and returning classifier #", iterations - 1)
                return old_model
        if pos_ratio >= max_pos_ratio:
            # stop early once enough of U is labelled positive
            if verbose:
                print(
                    "Acceptable ratio of positively labelled sentences in U is reached."
                )
            break
    print("Returning final NB after", iterations, "iterations")
    return new_model
def biased_SVM_weight_selection(P, U, Cs_neg=None, Cs_pos_factors=None, Cs=None,
                                kernel='linear', test_size=0.2, verbose=False):
    """run biased SVMs with combinations of class weight values, choose the one with the best pu_measure

    Cs are overall C values, Cs_neg the negative-class weights and
    Cs_pos_factors the multipliers applied to obtain the positive-class
    weight. Evaluates every (C, C_pos, C_neg) combination on a held-out
    split, then retrains the winner on all of P and U and returns it.
    """
    # default parameter ranges
    if Cs is None:
        Cs = [10 ** x for x in range(-12, 12, 2)]
    if Cs_neg is None:
        Cs_neg = [1]  # arange(0.01, 0.63, 0.04)
    if Cs_pos_factors is None:
        Cs_pos_factors = range(1, 1100, 200)
    # expand to all (C, C_pos, C_neg) combinations
    Cs = [(C, C_neg * j, C_neg)
          for C in Cs for C_neg in Cs_neg for j in Cs_pos_factors]
    if verbose:
        print(
            "Running Biased-SVM with range of C and positive class weight factors.",
            num_rows(Cs), "parameter combinations.")
    P_train, P_test = train_test_split(P, test_size=test_size)
    U_train, U_test = train_test_split(U, test_size=test_size)
    X = concatenate((P_train, U_train))
    y = concatenate((ones(num_rows(P_train)), zeros(num_rows(U_train))))
    # BUGFIX: materialize the map into a list. The original kept the lazy
    # iterator; max() exhausted it, so the subsequent print loop saw an
    # empty iterator and printed nothing.
    score_weights = list(map(
        partial(eval_params, X_train=X, y_train=y, P_test=P_test,
                U_test=U_test, kernel=kernel), Cs))
    best_score_params = max(score_weights, key=lambda tup: tup[0])
    for s in score_weights:
        print(s)
    if verbose:
        print("\nBest model has parameters", best_score_params[1],
              "and PU-score", best_score_params[0])
        print("Building final classifier")
    # retrain the winning configuration on the full data
    model = build_biased_SVM(concatenate((P, U)),
                             concatenate((ones(num_rows(P)),
                                          zeros(num_rows(U)))),
                             C_pos=best_score_params[1]['C_pos'],
                             C_neg=best_score_params[1]['C_neg'],
                             C=best_score_params[1]['C'],
                             probability=True,
                             kernel=kernel)
    if verbose:
        train_report(model, P, U)
        print("Returning Biased-SVM with parameters", best_score_params[1],
              "and PU-score", best_score_params[0])
    return model
def clean_corpus_pnu(mode="tolerant", percentiles=(10, 25, 10), ratio=1.0):
    """clean up HoC corpus using PU learning. Modes: "strict", "percentile", default

    default: remove CIViC-like from HoC[neg], HoC[neg]-like from CIViC
    strict: remove CIViC-like from HoC[neg], keep only CIViC-like in HoC[pos]
    percentile: remove percentiles (CIViC-like from HoC[neg], HoC[neg]-like from HoC[pos], CIViC-unlike from HoC[pos)

    Uses module-level corpora (presumably `hocneg`, `hocpos`, `civic`,
    `abstracts` — defined elsewhere in this module; confirm).
    Returns (P_raw, N_raw, U_raw).
    """
    if mode == "percentile":
        # Remove given best/worst percentile of sentences from each set
        print("\nRemoving CIViC-like sentences from HoC[neg] (",
              percentiles[0], "%)\n")
        hocneg_ = remove_most_similar_percent(U=hocneg, P=civic, ratio=ratio,
                                              percentile=percentiles[0])
        print("\nRemoving HoC[neg]-like sentences from HoC[pos] (",
              percentiles[1], "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos, P=hocneg_, ratio=ratio,
                                              percentile=percentiles[1])
        print("\nRemoving CIViC-unlike sentences from HoC[pos] (",
              percentiles[2], "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos_, P=civic, ratio=ratio,
                                              percentile=percentiles[2],
                                              inverse=True)
    elif mode == "strict":
        # Remove "good" sentences from HoC[neg], keep only "good" sentences in HoC[pos]
        print("\nKeeping only CIViC-like sentences in HoC[pos]\n")
        hocpos_ = remove_P_from_U(P=civic, U=hocpos, ratio=ratio, inverse=True)
        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)
    elif mode == "mixed":
        # Remove "good" sentences from HoC[neg], CIViC-unlike and HoC[neg]-like sentences from HoC[pos]
        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)
        print("\nRemoving CIViC-unlike sentences from HoC[pos] (", 75, "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos, P=civic, ratio=ratio,
                                              percentile=75, inverse=True)
        print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
        # NOTE(review): this passes U=hocpos (the UNfiltered corpus), which
        # discards the percentile filtering done just above — U=hocpos_ looks
        # intended; confirm before changing behavior.
        hocpos_ = remove_P_from_U(P=hocneg_, U=hocpos, ratio=ratio)
    else:  # mode == "tolerant"
        # Remove "good" sentences from HoC[neg], remove "bad" sentences in HoC[pos]
        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)
        print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
        hocpos_ = remove_P_from_U(P=hocneg_, U=hocpos, ratio=ratio)
    # P = cleaned HoC[pos] + CIViC, N = cleaned HoC[neg], U = abstracts
    P_raw = helpers.concatenate((hocpos_, civic))
    U_raw = abstracts
    N_raw = hocneg_
    if ratio < 1.0:
        # optionally subsample each set to the requested ratio
        P_raw, _ = train_test_split(P_raw, train_size=ratio,
                                    random_state=RANDOM_SEED)
        N_raw, _ = train_test_split(N_raw, train_size=ratio,
                                    random_state=RANDOM_SEED)
        U_raw, _ = train_test_split(U_raw, train_size=ratio,
                                    random_state=RANDOM_SEED)
    return P_raw, N_raw, U_raw
def iterate_SVM(P, U, RN, max_neg_ratio=0.2, clf_selection=True, kernel=None,
                C=0.1, n_estimators=9, verbose=False):
    """runs an SVM classifier trained on P and RN iteratively, augmenting RN

    after each iteration, the documents in U classified as negative are moved
    to RN until there are none left. max_neg_ratio is the maximum accepted
    ratio of P to be classified as negative by the final classifier. if
    clf_selection is true and the final classifier regards more than
    max_neg_ratio of P as negative, return the initial one.
    """
    y_P = np.ones(num_rows(P))
    y_RN = np.zeros(num_rows(RN))
    # choose estimator: bagged SVC for a real kernel, LinearSVC otherwise
    if kernel is not None:
        if verbose:
            print("Building initial Bagging SVC (", n_estimators, "clfs)",
                  "with Positive and Reliable Negative docs")
        clf = (BaggingClassifier(
            svm.SVC(class_weight='balanced', kernel=kernel, C=C),
            bootstrap=True, n_estimators=n_estimators,
            n_jobs=min(n_estimators, cpu_count()),
            max_samples=(1.0 if n_estimators < 4
                         else 1.0 / (n_estimators - 2))))
    else:
        if verbose:
            print(
                "Building initial linearSVM classifier with Positive and Reliable Negative docs"
            )
        clf = svm.LinearSVC(class_weight='balanced', C=C)
    initial_model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))
    if num_rows(U) == 0:
        print("Warning: SVM: All of U was classified as negative.")
        return initial_model
    if verbose:
        print(
            "Predicting U with initial SVM, adding negatively classified docs to RN for iteration"
        )
    y_U = initial_model.predict(U)
    # Q = U-docs predicted positive (still candidates), W = predicted negative
    Q, W = partition_pos_neg(U, y_U)
    iteration = 0
    model = None
    if num_rows(Q) == 0 or num_rows(W) == 0:
        print(
            "Warning: Returning initial SVM because all of U was assigned label",
            y_U[0])
        return initial_model
    if clf_selection:
        # remember how much of P the initial model already misclassifies
        y_P_initial = initial_model.predict(P)
        initial_neg_ratio = 1 - np.average(y_P_initial)
        if initial_neg_ratio > max_neg_ratio:
            print("Returning initial SVM ({}% of P classified as negative)".
                  format(100 * initial_neg_ratio))
            return initial_model
    # iterate SVM, each turn augmenting RN by the documents in Q classified negative
    while np.size(W) and np.size(Q):
        iteration += 1
        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))
        if verbose:
            print("\nIteration #", iteration,
                  "\tReliable negative examples:", num_rows(RN))
        if kernel is not None:
            clf = (BaggingClassifier(
                svm.SVC(class_weight='balanced', kernel=kernel, C=C),
                bootstrap=True, n_estimators=n_estimators,
                n_jobs=min(n_estimators, cpu_count()),
                max_samples=(1.0 if n_estimators < 4
                             else 1.0 / (n_estimators - 2))))
        else:
            clf = svm.LinearSVC(class_weight='balanced', C=C)
        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))
        y_Q = model.predict(Q)
        Q, W = partition_pos_neg(Q, y_Q)
    if np.size(W):
        # final leftover negatives: absorb them and refit once more.
        # BUGFIX: refresh y_RN after growing RN — the original reused the
        # stale y_RN, making X and y lengths inconsistent in this refit.
        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))
        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))
    if verbose:
        print("Iterative SVM converged. Reliable negative examples:",
              num_rows(RN))
    if clf_selection:
        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by initial SVM:",
                initial_neg_ratio)
        if model is None:
            return initial_model
        y_P_final = model.predict(P)
        final_neg_ratio = 1 - np.average(y_P_final)
        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by final SVM:",
                final_neg_ratio)
        if final_neg_ratio > max_neg_ratio and final_neg_ratio > initial_neg_ratio:
            print(
                iteration,
                "iterations - final SVM discards too many positive examples.",
                "Returning initial SVM instead")
            return initial_model
    print("Returning final SVM after", iteration, "iterations")
    return model
def getBestModel(P_train, U_train, X_test, y_test):
    """Evaluate parameter combinations, save results and return pipeline with best model

    NOTE(review): despite the summary, this returns the best stats record
    (a dict incl. 'model', 'vectorizer', 'selector'), not a Pipeline — confirm.
    """
    print("\nEvaluating parameter ranges for preprocessor and classifiers")
    X_train = concatenate((P_train, U_train))
    # pseudo-labels for preprocessing: 1 for P, 0 for U
    y_train_pp = concatenate(
        (np.ones(num_rows(P_train)), np.zeros(num_rows(U_train))))
    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}
    # parameter grid for the preprocessor (disabled alternatives kept as comments)
    preproc_params = {
        'df_min': [0.002],
        'df_max': [1.0],
        'rules': [True],
        'wordgram_range': [(1, 4)],  # [None, (1, 2), (1, 3), (1, 4)],
        'chargram_range': [(2, 6)],  # [None, (2, 4), (2, 5), (2, 6)],
        'feature_select': [
            partial(transformers.percentile_selector, 'chi2'),
            # partial(transformers.factorization, 'PCA', 10),
            # partial(transformers.factorization, 'PCA', 100),
            # partial(transformers.factorization, 'PCA', 1000),
        ]
    }
    for wordgram, chargram in product(preproc_params['wordgram_range'],
                                      preproc_params['chargram_range']):
        for r in preproc_params['rules']:
            for df_min, df_max in product(preproc_params['df_min'],
                                          preproc_params['df_max']):
                for fs in preproc_params['feature_select']:
                    if wordgram is None and chargram is None:
                        break
                    print(
                        "\n----------------------------------------------------------------",
                        "\nwords:", wordgram, "chars:", chargram,
                        "feature selection:", fs,
                        "\n----------------------------------------------------------------\n"
                    )
                    start_time = time.time()
                    # fit vectorizer + selector for this parameter combination
                    X_train_, X_dev_, vectorizer, selector = prepareTrainTest(
                        trainData=X_train, testData=X_test,
                        trainLabels=y_train_pp, rules=r,
                        wordgram_range=wordgram, feature_select=fs,
                        chargram_range=chargram, min_df_char=df_min,
                        min_df_word=df_min, max_df=df_max)
                    if selector:
                        P_train_ = selector.transform(
                            vectorizer.transform(P_train))
                        U_train_ = selector.transform(
                            vectorizer.transform(U_train))
                    else:
                        P_train_ = vectorizer.transform(P_train)
                        U_train_ = vectorizer.transform(U_train)
                    pp = {'word': wordgram, 'char': chargram}
                    # fit models (disabled candidates kept as comments)
                    iteration = [
                        # {'name': 'i-em', 'model': partial(two_step.i_EM, P_train_, U_train_)},
                        # {'name': 's-em spy=0.1',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name': 's-em spy=0.2',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        {
                            'name': 'roc-svm',
                            'model': partial(two_step.roc_SVM, P_train_, U_train_)
                        },
                        {
                            'name': 'cr_svm noise=0.1',
                            'model': partial(two_step.cr_SVM, P_train_, U_train_,
                                             noise_lvl=0.1)
                        },
                        {
                            'name': 'cr_svm noise=0.2',
                            'model': partial(two_step.cr_SVM, P_train_, U_train_,
                                             noise_lvl=0.2)
                        },
                        {
                            'name': 'cr_svm noise=0.3',
                            'model': partial(two_step.cr_SVM, P_train_, U_train_,
                                             noise_lvl=0.3)
                        },
                        # {'name': 'roc_em', 'model': partial(two_step.roc_EM, P_train_, U_train_)},
                        # {'name': 'spy_svm spy=0.1',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name': 'spy_svm spy=0.2',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        # {'name': 'biased-svm',
                        #  'model': partial(biased_svm.biased_SVM_weight_selection, P_train_, U_train_)},
                        ## {'name': 'bagging-svm',
                        ##  'model': partial(biased_svm.biased_SVM_grid_search, P_train_, U_train_)}
                    ]
                    # eval models
                    if PARALLEL:
                        with multi.Pool(min(multi.cpu_count(),
                                            len(iteration))) as p:
                            iter_stats = list(
                                p.map(partial(model_eval_record, X_dev_,
                                              y_test, U_train_),
                                      iteration, chunksize=1))
                    else:
                        iter_stats = list(
                            map(
                                partial(model_eval_record, X_dev_, y_test,
                                        U_train_), iteration))
                    # finalize records: remove model, add n-gram stats, update best
                    for m in iter_stats:
                        m['n-grams'] = pp
                        m['fs'] = fs()
                        if m['acc'] > results['best']['acc']:
                            # deepcopy happens before m.pop, so 'best' keeps its model
                            results['best'] = deepcopy(m)
                            results['best']['vectorizer'] = vectorizer
                            results['best']['selector'] = selector
                        m.pop('model', None)
                    results['all'].append(iter_stats)
                    print("Evaluated words:", wordgram, "chars:", chargram,
                          "in %s seconds\n" % (time.time() - start_time))
                    print_reports(iter_stats)
    print_results(results)
    # save results to disk
    # NOTE(review): the timestamp contains ':' (from %H:%M:%S), which is not
    # a legal filename character on Windows — confirm target platform.
    with open(
            file_path("./pickles/model_eval{}.pickle".format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))),
            "wb") as f:
        print('saving model stats to disk\n')
        pickle.dump(results, f)
    # ----------------------------------------------------------------
    # check how much of U (abstracts) is supposed to be positive
    # ----------------------------------------------------------------
    best_model = results['best']['model']
    selector = results['best']['selector']
    vectorizer = results['best']['vectorizer']
    print("\nAmount of unlabelled training set classified as positive:")
    if selector:
        transformedU = (selector.transform(vectorizer.transform(U_train)))
    else:
        transformedU = (vectorizer.transform(U_train))
    y_predicted_U = best_model.predict(transformedU)
    print(np.sum(y_predicted_U), "/", num_rows(y_predicted_U), "(",
          np.sum(y_predicted_U) / num_rows(y_predicted_U), ")")
    return results['best']
def get_best_model(P_train, N_train, U_train, X_test=None, y_test=None):
    """Evaluate parameter combinations, save results and return object with stats of all models

    If no test set is supplied, 20% of P and N are split off as a held-out
    test set. Returns the results dict: {'best': record, 'all': [records]}.
    """
    print("Evaluating parameter ranges for preprocessor and classifiers")
    if X_test is None or y_test is None:
        # carve a test set out of P and N (fixed seed for reproducibility)
        P_train, X_test_pos = train_test_split(P_train, test_size=0.2,
                                               random_state=RANDOM_SEED)
        N_train, X_test_neg = train_test_split(N_train, test_size=0.2,
                                               random_state=RANDOM_SEED)
        X_test = concatenate((X_test_pos, X_test_neg))
        y_test = concatenate((np.ones(num_rows(X_test_pos)),
                              np.zeros(num_rows(X_test_neg))))
    X_train = concatenate((P_train, N_train, U_train))
    # pseudo-labels for preprocessing: +1 pos, -1 neg, 0 unlabelled
    y_train_pp = concatenate((np.ones(num_rows(P_train)),
                              -np.ones(num_rows(N_train)),
                              np.zeros(num_rows(U_train))))
    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}
    preproc_params = preproc_param_dict()
    estimators = estimator_list()
    for wordgram, chargram in product(preproc_params['wordgram_range'],
                                      preproc_params['chargram_range']):
        for r in preproc_params['rules']:
            for df_min, df_max in product(preproc_params['df_min'],
                                          preproc_params['df_max']):
                for fs in preproc_params['feature_select']:
                    if wordgram is None and chargram is None:
                        break
                    print("\n----------------------------------------------------------------",
                          "\nwords:", wordgram, "chars:", chargram,
                          "feature selection:", fs,
                          "df_min, df_max:", df_min, df_max, "rules", r,
                          "\n----------------------------------------------------------------\n")
                    start_time = time.time()
                    # fit vectorizer + selector for this parameter combination
                    X_train_, X_test_, vectorizer, selector = prepare_train_test(
                        trainData=X_train, testData=X_test,
                        trainLabels=y_train_pp, rules=r,
                        wordgram_range=wordgram, feature_select=fs,
                        chargram_range=chargram, min_df_char=df_min,
                        min_df_word=df_min, max_df=df_max)
                    if selector:
                        P_train_, N_train_, U_train_ = [
                            (selector.transform(vectorizer.transform(x)))
                            for x in [P_train, N_train, U_train]]
                    else:
                        P_train_, N_train_, U_train_ = [
                            (vectorizer.transform(x))
                            for x in [P_train, N_train, U_train]]
                    # fit models
                    if PARALLEL:
                        with multi.Pool(min(multi.cpu_count(),
                                            len(estimators))) as p:
                            iter_stats = list(
                                p.map(partial(model_eval_record, P_train_,
                                              N_train_, U_train_, X_test_,
                                              y_test),
                                      estimators, chunksize=1))
                    else:
                        iter_stats = list(
                            map(partial(model_eval_record, P_train_, N_train_,
                                        U_train_, X_test_, y_test),
                                estimators))
                    # finalize records: remove model, add n-gram stats, update best
                    for m in iter_stats:
                        # BUGFIX: removed trailing commas that turned these
                        # assignments into 1-tuples ({'word': ...},) and (r,)
                        m['n-grams'] = {'word': wordgram, 'char': chargram}
                        m['rules'] = r
                        m['df_min, df_max'] = (df_min, df_max)
                        m['fs'] = fs()
                        if m['acc'] > results['best']['acc']:
                            results['best'] = deepcopy(m)
                            results['best']['vectorizer'] = vectorizer
                            results['best']['selector'] = selector
                        m.pop('model', None)
                    results['all'].append(iter_stats)
                    print("Evaluated words:", wordgram, "chars:", chargram,
                          "rules:", r, "feature selection:", fs,
                          "min_df:", df_min,
                          "in %s seconds\n" % (time.time() - start_time))
                    print_reports(iter_stats)
    print_results(results)
    return results