# 2.2 Conclusion - PU-learning bagging: visualise the points with `label=0`,
# i.e. the `unlabeled` data.

# Select the unlabeled rows once; the same rows are drawn in both figures.
unlabeled_mask = y == 0
unlabeled_points = X[unlabeled_mask]

# Marker styling shared by both score plots.
score_scatter_style = dict(linewidth=0, s=50, alpha=0.5, cmap='jet_r')

# Show how the unlabeled data set is scored after the bagging approach.
plt.scatter(
    unlabeled_points.feature1,
    unlabeled_points.feature2,
    c=results.loc[unlabeled_mask, 'output_bag'],
    **score_scatter_style,
)
plt.colorbar(label='Unlabeled样本的预测分值')
plt.title('PU Bagging')
plt.show()

# 3.1 Using `BaggingClassifierPU`
bc = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=1000,
    max_samples=sum(y),
    n_jobs=-1,
)
bc.fit(X, y)

# Keep the second column of the out-of-bag decision function as the score.
results['output_skb'] = bc.oob_decision_function_[:, 1]

# Visualize the approach's result
plt.scatter(
    unlabeled_points.feature1,
    unlabeled_points.feature2,
    c=results.loc[unlabeled_mask, 'output_skb'],
    **score_scatter_style,
)
plt.colorbar(label='Scores given to unlabeled points')
plt.title(r'Using ${\tt BaggingClassifierPU}$')
plt.show()
# Build the sequence vectorizer (project helper) with the settings used for
# these models.
tfidf = seq_vectorizer(ngram_max=4, downsample=40000)
print(f"FOR BaggingClassifierPU MODELS:\t Sequences: {sequences.shape};\t Binding: {binding.shape};\t Number of ligand id values: {len(lig_id_vals)}. ")

spinner = Spinner()
models = {}  # ligand id -> fitted BaggingClassifierPU

for lig_id in lig_id_vals:
    try:
        X, y = fitter_df_maker(lig_id)
        sys.stdout.write(f"Models dict now populated to length {len(models)}; ")
        bc = BaggingClassifierPU(
            DecisionTreeClassifier(),
            n_estimators=estimators,
            # n_jobs=-1,
            max_samples=int(sum(y.values)),
        )
        sys.stdout.write(f"next, fitting on ligand #{lig_id}")
        spinner.start()
        try:
            bc.fit(X, y)
        finally:
            # Stop the spinner even when fitting fails so it is not left
            # running into the next iteration.
            # NOTE(review): assumes Spinner.stop() is safe right after
            # start() -- confirm against the Spinner implementation.
            spinner.stop()
        models[lig_id] = bc
        # sys.stdout.flush()
        # Return to the start of the line so the next status overwrites it:
        # https://stackoverflow.com/questions/23138413/clearing-old-data-from-sys-stdout-in-python
        sys.stdout.write('\r')
    except Exception as exc:
        # Best effort: a ligand that cannot be prepared or fitted is skipped,
        # but the failure is reported instead of being silently discarded.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # still end the run.
        sys.stderr.write(f"\nSkipping ligand #{lig_id}: {exc!r}\n")

# Persisting the fitted models is currently disabled:
# with open('models.pickle', 'wb') as mp:
#     pickle.dump(models, mp)
# Report metrics for the currently fitted `model`. Predictions are computed
# once and reused; previously every metric line re-ran model.predict(X).
baseline_pred = model.predict(X)
print(
    print_cm(sklearn.metrics.confusion_matrix(y_orig, baseline_pred),
             labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, baseline_pred))
print('Recall: ', recall_score(y_orig, baseline_pred))
print('Accuracy: ', accuracy_score(y_orig, baseline_pred))
baseline_f1 = f1_score(y_orig, baseline_pred)
print('f1_score: ', baseline_f1)
f1_orig.append(baseline_f1)

# Fit the PU bagging ensemble on the labels in `y1` and time the fit.
print('Training bagging classifier...')
pu_start = time.perf_counter()
model = BaggingClassifierPU(xgb.XGBClassifier(),
                            n_estimators=50,
                            n_jobs=-1,
                            max_samples=sum(y1))
model.fit(X, y1)
pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)

# train data
print('---- {} ----'.format('PU Bagging'))
pu_pred = model.predict(X)  # one prediction pass for the whole report
print(
    print_cm(sklearn.metrics.confusion_matrix(y_orig, pu_pred),
             labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, pu_pred))
print('Recall: ', recall_score(y_orig, pu_pred))
# Relabel `hidden_size` randomly chosen positives as 0 (sampling without
# replacement), so the models see them as unlabeled.
hidden_index = np.random.choice(y[y == 1].index, replace=False, size=hidden_size)
y.loc[hidden_index] = 0

# Check the new contents of the set
print('%d positive out of %d total' % (sum(y), len(y)))

# Plot the data set, as the models will see it
is_unlabeled = y == 0
unlabeled_points = X[is_unlabeled]
positive_points = X[y == 1]
plt.scatter(
    unlabeled_points.feature1, unlabeled_points.feature2,
    c='k', marker='.', linewidth=1, s=10, alpha=0.5, label='Unlabeled',
)
plt.scatter(
    positive_points.feature1, positive_points.feature2,
    c='b', marker='o', linewidth=0, s=50, alpha=0.5, label='Positive',
)
plt.legend()
plt.title('Data set (as seen by the classifiers)')
plt.show()

bc = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=1000,   # 1000 trees as usual
    max_samples=sum(y),  # Balance the positives and unlabeled in each bag
    n_jobs=-1,           # Use all cores
)
bc.fit(X, y)

# Store the scores assigned by this approach
results = pd.DataFrame(
    {
        'truth': y_orig,  # The true labels
        'label': y,       # The labels to be shown to models in experiment
    },
    columns=['truth', 'label'],
)
results['output_bag_tree'] = bc.oob_decision_function_[:, 1]

# Visualize this approach's results
# NOTE(review): SOURCE is cut off inside this call right after `alpha`; any
# further keyword arguments lie outside this view -- confirm before merging.
plt.scatter(
    unlabeled_points.feature1, unlabeled_points.feature2,
    c=results.loc[is_unlabeled, 'output_bag_tree'],
    linewidth=0, s=50, alpha=0.5,
)
def run(data_train, data_test, clf_name):
    """Fit the classifier selected by ``clf_name`` and score it on the test data.

    Args:
        data_train: Raw training data; handed to ``train_data_process``.
        data_test: Raw test data; handed to ``test_data_process``.
        clf_name: Name of the classifier to use. Must be one of the keys of
            the factory table below (e.g. ``"SVC"``, ``"BaggingClassifierPU"``).

    Returns:
        On success, a dict with the keys ``'train samples'``,
        ``'defective train samples'``, ``'precision'``, ``'recall'``, ``'pf'``,
        ``'F-measure'``, ``'accuracy'`` and ``'AUC'``. ``'pf'`` is
        FP / (FP + TN), or the string ``"no negative samples."`` when the test
        labels contain no negatives. ``'AUC'`` is the score, or the error
        message if ``roc_auc_score`` raises ``ValueError``.
        If fitting, predicting or any other metric raises ``ValueError``, the
        error message is returned as a plain string instead of a dict.

    Raises:
        KeyError: If ``clf_name`` is not a known classifier name.
    """
    X_train, y_train = train_data_process(data_train)
    X_test, y_true = test_data_process(data_test)

    # Factories instead of instances: only the requested estimator is built,
    # rather than constructing all eleven on every call.
    factories = {
        "XGBOD": lambda: XGBOD(random_state=0),
        "KNeighborsClassifier": lambda: KNeighborsClassifier(3),
        "SVC": lambda: SVC(random_state=0),
        "GaussianProcessClassifier":
            lambda: GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecisionTreeClassifier":
            lambda: DecisionTreeClassifier(random_state=0),
        "RandomForestClassifier":
            lambda: RandomForestClassifier(random_state=0),
        "MLPClassifier": lambda: MLPClassifier(random_state=0),
        "AdaBoostClassifier": lambda: AdaBoostClassifier(),
        "GaussianNB": lambda: GaussianNB(),
        "QuadraticDiscriminantAnalysis":
            lambda: QuadraticDiscriminantAnalysis(),
        "BaggingClassifierPU": lambda: BaggingClassifierPU(
            DecisionTreeClassifier(),
            n_estimators=1000,         # 1000 trees as usual
            max_samples=sum(y_train),  # Balance the positives and unlabeled in each bag
            n_jobs=-1,                 # Use all cores
        ),
    }
    # Built outside the try block, as before: an unknown name or a
    # construction failure propagates to the caller.
    clf = factories[clf_name]()

    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Only the negative-class counts are needed, for the 'pf' entry.
        fp = tn = 0
        for truth, predicted in zip(y_true, y_pred):
            if truth:
                continue
            if predicted:
                fp += 1
            else:
                tn += 1

        negatives = fp + tn
        pf = fp / negatives if negatives else "no negative samples."

        # NOTE(review): AUC is computed from the hard predictions returned by
        # predict(), not from scores or probabilities.
        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError as e:
            auc = str(e)

        return {
            'train samples': str(X_train.shape[0]),
            'defective train samples': str(np.sum(y_train)),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'pf': pf,
            'F-measure': f1_score(y_true, y_pred),
            'accuracy': accuracy_score(y_true, y_pred),
            'AUC': auc
        }
    except ValueError as e:
        return str(e)
def train_val_PU(max_a, val, tr1, tr2, loc='./log', start_ep=0, end_ep=45):
    '''
    Train a PU bagging model on two seeds and evaluate it on a third.

    This function trains a triple (a combination of 3 random seeds).

    max_a: the maximum alpha used to train the PU model.
    val: the random seed number that produces the LID sequence for test
    tr1: the 1st random seed number that produces the LID sequence for training
    tr2: the 2nd random seed number that produces the LID sequence for training
    loc: the LID sequences' location
    start_ep: the starting epoch of the LID sequence
    end_ep: the ending epoch of the LID sequence

    Returns a tuple ``(records, header)``:
      * ``header`` is ``['train 1, 2 val', 'recall', bl_label, ...]`` followed
        by the remaining validation labels in sorted order, where ``bl_label``
        is the validation label whose value exceeds 100.
      * ``records`` is ``[[tr1, tr2, val], recall, hits, ...]``: ``hits`` is
        the number of ``bl_label`` rows predicted positive (probability
        >= 0.5), ``recall`` is ``hits / sum(validation loss labels)``, and the
        remaining entries are the positive-prediction counts of the other
        labels, in header order.

    Raises ValueError if a predicted probability is NaN, or if no validation
    label exceeds 100.
    '''
    # Local import keeps the block self-contained; DataFrame.append was
    # removed in pandas 2.0, so pd.concat is used instead.
    import pandas as pd

    records = [[tr1, tr2, val]]

    # Assemble and shuffle the training frame from the two training seeds.
    train1 = LID_assmb(tr1, max_a, start_ep, end_ep, loc)
    train2 = LID_assmb(tr2, max_a, start_ep, end_ep, loc)
    total = pd.concat([train2, train1])
    total = total.sample(frac=1).reset_index(drop=True)  # shuffle everything

    # The last column holds the labels the PU model is trained on; the
    # columns start_ep:end_ep are the features.
    loss_labels = total.iloc[:, -1].to_numpy()
    total = total.iloc[:, start_ep:end_ep]

    bc = BaggingClassifierPU(DecisionTreeClassifier(),
                             n_estimators=1000,
                             max_samples=int(sum(loss_labels)),
                             n_jobs=-1)
    bc.fit(total, loss_labels)

    # Validation frame, same layout: second-to-last column = label used for
    # the per-label summary, last column = loss label.
    v_total = LID_assmb(val, max_a, start_ep, end_ep, loc)
    v_total = v_total.sample(frac=1).reset_index(drop=True)
    v_labels = v_total.iloc[:, -2].to_numpy()
    v_loss_labels = v_total.iloc[:, -1].to_numpy()
    v_total = v_total.iloc[:, start_ep:end_ep]

    pred = bc.predict_proba(v_total)

    # One counter per distinct validation label; remember the label > 100.
    # NOTE(review): if several labels exceed 100, whichever the set yields
    # last is used, as before -- confirm at most one such label exists.
    v_summary = {}
    bl_label = None
    for label in set(v_labels):
        v_summary[label] = 0.0
        if float(label) > 100:
            bl_label = label

    positive_proba = pred[:, 1]
    if np.isnan(positive_proba).any():
        raise ValueError(
            'prediction has illegal value nan, please check the model and data!'
        )

    # Count, per label, the rows predicted positive.
    for label, proba in zip(v_labels, positive_proba):
        if proba >= 0.5:
            v_summary[label] += 1

    if bl_label is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError('no validation label greater than 100 was found')

    hits = v_summary[bl_label]
    records.extend([hits / float(sum(v_loss_labels)), hits])

    other_labels = sorted(set(v_labels) - {bl_label})
    header = ['train 1, 2 val', 'recall', bl_label] + other_labels
    records.extend(v_summary[label] for label in other_labels)
    return (records, header)