def i_EM(P, U, max_imbalance=10.0, max_pos_ratio=1.0, tolerance=0.1, verbose=False):
    """All-in-one PU method: run the I-EM algorithm on P (positive) and U (unlabelled).

    Delegates to ``iterate_EM`` with classifier selection switched off and
    returns whatever model that call produces.  According to the original
    author's notes, the iteration re-fits a Naive Bayes classifier with
    updated labels for U (initially treated as negative) until convergence.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    max_imbalance : currently unused -- the random sub-sampling of U that
        this value was meant to control is disabled in the code.
    max_pos_ratio, tolerance, verbose : forwarded unchanged to ``iterate_EM``.

    Returns
    -------
    The model returned by ``iterate_EM``.
    """
    print("Running I-EM")

    em_options = dict(
        tolerance=tolerance,
        max_pos_ratio=max_pos_ratio,
        clf_selection=False,
        verbose=verbose,
    )
    classifier = iterate_EM(P, U, **em_options)

    if not verbose:
        return classifier

    # Verbose runs additionally report how the model does on the training data.
    train_report(classifier, P, U)
    return classifier
def biased_SVM_grid_search(P, U, Cs=None, kernel='linear', n_estimators=9, verbose=False):
    """Grid-search a bagged LinearSVC ("biased SVM") treating U as negative.

    Every example in P is labelled 1 and every example in U is labelled 0.
    A ``BaggingClassifier`` wrapping ``LinearSVC`` is tuned with
    ``GridSearchCV`` over the candidate ``C`` values, scored by ``pu_scorer``.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    Cs : candidate values for the SVM's ``C``; defaults to
        ``10**x for x in range(-12, 12, 2)``.
    kernel : accepted for interface compatibility but not used here
        (LinearSVC has no kernel parameter).
    n_estimators : number of estimators in the bagging ensemble.
    verbose : print progress, a training report and the chosen parameters.

    Returns
    -------
    ``best_estimator_`` of the fitted grid search.
    """
    if Cs is None:
        Cs = [10**exponent for exponent in range(-12, 12, 2)]

    if verbose:
        print(
            "Running Biased-SVM with balanced class weights and grid search over",
            len(Cs), "C values")

    # Single-valued entries pin a setting; only C is actually searched.
    search_space = {
        'base_estimator__C': Cs,
        'base_estimator__class_weight': ['balanced'],
        # settings of the Bagging wrapper itself
        'bootstrap': [True],
        'n_estimators': [n_estimators],
    }
    bagged_svm = BaggingClassifier(LinearSVC())
    searcher = GridSearchCV(bagged_svm,
                            param_grid=search_space,
                            scoring=pu_scorer,
                            verbose=0)

    if verbose:
        print("Grid searching parameters for biased-SVM")

    features = concatenate((P, U))
    pos_labels = ones(num_rows(P))
    neg_labels = zeros(num_rows(U))
    labels = concatenate((pos_labels, neg_labels))
    searcher.fit(features, labels)

    if verbose:
        train_report(searcher.best_estimator_, P, U)
        print("Biased-SVM parameters:", searcher.best_params_,
              "\tPU score:", searcher.best_score_)

    return searcher.best_estimator_
def standalone_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """One-step Rocchio method.

    Builds a model with ``rocchio(P, U, alpha, beta)``, uses it to predict
    labels for U and splits U with ``partition_pos_neg``.  The split is only
    used for the verbose statistics; the Rocchio model itself is returned.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    alpha, beta : forwarded to ``rocchio``.
    verbose : print the share of U classified as Reliable Negative and a
        training report.

    Returns
    -------
    The model returned by ``rocchio``.
    """
    print("Running Rocchio")
    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    classifier = rocchio(P, U, alpha=alpha, beta=beta)
    predicted = classifier.predict(U)
    # Only the second half of the partition (the Reliable Negatives) is needed.
    _, reliable_neg = partition_pos_neg(U, predicted)

    if not verbose:
        return classifier

    print("Reliable Negative examples in U:", num_rows(reliable_neg),
          "(", 100 * num_rows(reliable_neg) / num_rows(U), "%)")
    train_report(classifier, P, U)
    return classifier
def cr_SVM(P, U, max_neg_ratio=0.1, noise_lvl=0.2, alpha=16, beta=4, kernel=None, C=0.1, verbose=False):
    """Two-step PU technique based on Cosine similarity, Rocchio and SVM.

    Step 1: ``get_RN_cosine_rocchio`` splits U into (U - RN, RN).  Per the
            original author's notes it first finds Potentially Negative docs
            (less similar to mean(P) than ``noise_lvl`` of the docs in P) and
            then Reliable Negative docs via Rocchio.
    Step 2: ``iterate_SVM`` is run starting from P, U - RN and RN.

    The original notes stress that the noise level is crucial and should be
    at least 20% for reasonable results.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    noise_lvl, alpha, beta : forwarded to ``get_RN_cosine_rocchio``.
    kernel, C, max_neg_ratio : forwarded to ``iterate_SVM``.
    verbose : print progress and a final training report.

    Returns
    -------
    The model returned by ``iterate_SVM``.
    """
    print("Running CR-SVM")

    # --- step 1: reliable negatives -------------------------------------
    if verbose:
        print("Determining RN using Cosine Similarity threshold and Rocchio\n")
    remaining_U, reliable_neg = get_RN_cosine_rocchio(
        P, U,
        noise_lvl=noise_lvl,
        alpha=alpha,
        beta=beta,
        verbose=verbose,
    )

    # --- step 2: iterative SVM ------------------------------------------
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    svm = iterate_SVM(
        P, remaining_U, reliable_neg,
        kernel=kernel,
        C=C,
        max_neg_ratio=max_neg_ratio,
        verbose=verbose,
    )

    if verbose:
        train_report(svm, P, U)
    return svm
def s_EM(P, U, spy_ratio=0.1, max_pos_ratio=1.0, tolerance=0.1, noise_lvl=0.1, clf_selection=True, verbose=False):
    """S-EM two-step PU learning (cf. "Partially Supervised Classification...").

    Step 1: ``get_RN_Spy_Docs`` splits U into (U - RN, RN) using Spy Documents.
    Step 2: ``run_EM_with_RN`` iterates EM on P, U - RN and RN.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    spy_ratio, noise_lvl : forwarded to ``get_RN_Spy_Docs``.
    tolerance : forwarded to both steps.
    max_pos_ratio, clf_selection : forwarded to ``run_EM_with_RN``.
    verbose : print progress and a final training report.

    Returns
    -------
    The model returned by ``run_EM_with_RN``.
    """
    print("Running S-EM")

    # Step 1: reliable negatives via spy documents
    if verbose:
        print("Determining confidence threshold using Spy Documents and I-EM\n")
    spy_options = dict(spy_ratio=spy_ratio,
                       tolerance=tolerance,
                       noise_lvl=noise_lvl,
                       verbose=verbose)
    remaining_U, reliable_neg = get_RN_Spy_Docs(P, U, **spy_options)

    # Step 2: EM iterations seeded with the reliable negatives
    if verbose:
        print("\nIterating I-EM with P, U-RN, and RN")
    em_options = dict(tolerance=tolerance,
                      max_pos_ratio=max_pos_ratio,
                      clf_selection=clf_selection,
                      verbose=verbose)
    classifier = run_EM_with_RN(P, remaining_U, reliable_neg, **em_options)

    if not verbose:
        return classifier
    train_report(classifier, P, U)
    return classifier
def roc_EM(P, U, max_pos_ratio=0.5, tolerance=0.1, clf_selection=True, alpha=16, beta=4, verbose=False):
    """Roc-EM two-step PU learning: Rocchio for step 1, EM for step 2.

    Step 1: ``get_RN_rocchio`` splits U into (U - RN, RN).
    Step 2: ``run_EM_with_RN`` iterates EM on P, U - RN and RN.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    alpha, beta : forwarded to ``get_RN_rocchio``.
    tolerance, max_pos_ratio, clf_selection : forwarded to ``run_EM_with_RN``.
    verbose : print progress and a final training report.

    Returns
    -------
    The model returned by ``run_EM_with_RN``.
    """
    print("Running Roc-EM")

    # Step 1: reliable negatives via Rocchio
    if verbose:
        print("Determining RN using Rocchio method\n")
    remaining_U, reliable_neg = get_RN_rocchio(
        P, U, alpha=alpha, beta=beta, verbose=verbose)

    # Step 2: EM iterations seeded with the reliable negatives
    if verbose:
        print("\nIterating I-EM with P, U-RN, and RN")
    classifier = run_EM_with_RN(
        P, remaining_U, reliable_neg,
        tolerance=tolerance,
        max_pos_ratio=max_pos_ratio,
        clf_selection=clf_selection,
        verbose=verbose,
    )

    if verbose:
        train_report(classifier, P, U)
    return classifier
def roc_SVM(P, U, max_neg_ratio=0.1, alpha=16, beta=4, kernel=None, C=0.1, verbose=False):
    """Two-step PU technique based on Rocchio and SVM.

    Step 1: ``get_RN_rocchio`` splits U into (U - RN, RN); per the original
            notes, Reliable Negatives are found by similarity to the mean
            positive / unlabelled vector.
    Step 2: ``iterate_SVM`` is run starting from P, U - RN and RN.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    alpha, beta : forwarded to ``get_RN_rocchio``.
    kernel, C, max_neg_ratio : forwarded to ``iterate_SVM``.
    verbose : print progress and a final training report.

    Returns
    -------
    The model returned by ``iterate_SVM``.
    """
    print("Running Roc-SVM")

    # Step 1: reliable negatives via Rocchio
    if verbose:
        print("Determining RN using Rocchio method\n")
    rocchio_options = dict(alpha=alpha, beta=beta, verbose=verbose)
    remaining_U, reliable_neg = get_RN_rocchio(P, U, **rocchio_options)

    # Step 2: iterative SVM seeded with the reliable negatives
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    svm_options = dict(kernel=kernel,
                       C=C,
                       max_neg_ratio=max_neg_ratio,
                       verbose=verbose)
    svm = iterate_SVM(P, remaining_U, reliable_neg, **svm_options)

    if not verbose:
        return svm
    train_report(svm, P, U)
    return svm
def spy_SVM(P, U, spy_ratio=0.1, max_neg_ratio=0.1, tolerance=0.1, noise_lvl=0.1, verbose=False):
    """Spy-SVM two-step PU learning: Spy Documents for step 1, SVM for step 2.

    Step 1: ``get_RN_Spy_Docs`` splits U into (U - RN, RN) using Spy Documents.
    Step 2: ``iterate_SVM`` is run starting from P, U - RN and RN, with its
            default kernel and C.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    spy_ratio, tolerance, noise_lvl : forwarded to ``get_RN_Spy_Docs``.
    max_neg_ratio : forwarded to ``iterate_SVM``.
    verbose : print progress and a final training report.

    Returns
    -------
    The model returned by ``iterate_SVM``.
    """
    print("Running Spy-SVM")

    # Step 1: reliable negatives via spy documents
    if verbose:
        print("Determining confidence threshold using Spy Documents and I-EM\n")
    remaining_U, reliable_neg = get_RN_Spy_Docs(
        P, U,
        spy_ratio=spy_ratio,
        tolerance=tolerance,
        noise_lvl=noise_lvl,
        verbose=verbose,
    )

    # Step 2: iterative SVM seeded with the reliable negatives
    if verbose:
        print("\nIterating SVM with P, U-RN, and RN")
    svm = iterate_SVM(P, remaining_U, reliable_neg,
                      max_neg_ratio=max_neg_ratio,
                      verbose=verbose)

    if verbose:
        train_report(svm, P, U)
    return svm
def biased_SVM_weight_selection(P, U, Cs_neg=None, Cs_pos_factors=None, Cs=None, kernel='linear', test_size=0.2, verbose=False):
    """Run biased SVMs over a grid of class weights and keep the best one.

    Every combination of ``C``, negative class weight and positive weight
    factor is evaluated with ``eval_params`` on a held-out split of P and U
    (U is labelled negative).  The combination with the highest score is then
    used to build the final classifier on all of P and U.

    Parameters
    ----------
    P, U : positive and unlabelled example sets.
    Cs_neg : candidate negative class weights; defaults to ``[1]``.
    Cs_pos_factors : factors multiplied with each negative weight to obtain
        the positive class weight; defaults to ``range(1, 1100, 200)``.
    Cs : candidate values for the SVM's ``C``; defaults to
        ``10**x for x in range(-12, 12, 2)``.
    kernel : forwarded to ``eval_params`` and ``build_biased_SVM``.
    test_size : share of P and of U held out for scoring.
    verbose : print progress, the winning parameters and a training report.

    Returns
    -------
    The model returned by ``build_biased_SVM`` for the best parameters.

    Raises
    ------
    ValueError
        If the parameter grid is empty (one of the candidate lists is empty).
    """
    # default values
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]
    if Cs_neg is None:
        Cs_neg = [1]
    if Cs_pos_factors is None:
        Cs_pos_factors = range(1, 1100, 200)

    # One (C, C_pos, C_neg) triple per combination to evaluate.
    param_grid = [(C, C_neg * factor, C_neg)
                  for C in Cs
                  for C_neg in Cs_neg
                  for factor in Cs_pos_factors]
    if not param_grid:
        raise ValueError(
            "empty parameter grid: Cs, Cs_neg and Cs_pos_factors must all be non-empty")

    if verbose:
        print(
            "Running Biased-SVM with range of C and positive class weight factors.",
            len(param_grid), "parameter combinations.")

    P_train, P_test = train_test_split(P, test_size=test_size)
    U_train, U_test = train_test_split(U, test_size=test_size)
    X = concatenate((P_train, U_train))
    y = concatenate((ones(num_rows(P_train)), zeros(num_rows(U_train))))

    # Materialise the results: a lazy map() would be exhausted by max() and
    # leave nothing to print afterwards.
    score_weights = [
        eval_params(params,
                    X_train=X, y_train=y,
                    P_test=P_test, U_test=U_test,
                    kernel=kernel)
        for params in param_grid
    ]
    # Each result is indexed as (score, parameter dict).
    best_score_params = max(score_weights, key=lambda tup: tup[0])
    best_score, best_params = best_score_params[0], best_score_params[1]

    for result in score_weights:
        print(result)

    if verbose:
        print("\nBest model has parameters", best_params,
              "and PU-score", best_score)
        print("Building final classifier")

    # Refit on all available data with the winning parameters.
    model = build_biased_SVM(
        concatenate((P, U)),
        concatenate((ones(num_rows(P)), zeros(num_rows(U)))),
        C_pos=best_params['C_pos'],
        C_neg=best_params['C_neg'],
        C=best_params['C'],
        probability=True,
        kernel=kernel)

    if verbose:
        train_report(model, P, U)
        print("Returning Biased-SVM with parameters", best_params,
              "and PU-score", best_score)
    return model