def getTrainedModel1(self):
    """Train a ``LabelModel`` on the labeling-function votes for ``self.train``.

    Before training, diagnostics for the labelled LF set are emitted: an LF
    summary, the label coverage, and a classification report for a
    majority-vote baseline.

    Returns:
        tuple: ``(trainer, Y_train)``, where ``trainer`` is whatever
        ``LabelModel.train_model`` returned and ``Y_train`` is the model's
        predictions on the training LF matrix added to ``Y_LF_set``.
    """
    # Build a matrix of LF votes for each comment ticket in the LF set.
    LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

    # True labels for the LF set.
    Y_LF_set = np.array(self.LF_set['resolution'])

    display(
        lf_summary(
            sparse.csr_matrix(LF_matrix),
            Y=Y_LF_set,
            lf_names=self.LF_names.values()))

    # Format instead of concatenating: the coverage value is numeric, and
    # ``str + number`` raises TypeError.
    print("label coverage: {}".format(label_coverage(LF_matrix)))

    # Majority-vote baseline, evaluated against the LF-set ground truth.
    mv = MajorityLabelVoter()
    Y_train_majority_votes = mv.predict(LF_matrix)
    print("classification report:\n{}".format(
        classification_report(Y_LF_set, Y_train_majority_votes)))

    Ls_train = self.make_Ls_matrix(self.train, self.LFs)

    # You can tune the learning rate and class balance.
    model = LabelModel(k=2, seed=123)
    trainer = model.train_model(
        Ls_train,
        n_epochs=2000,
        print_every=1000,
        lr=0.0001,
        class_balance=np.array([0.2, 0.8]))

    # NOTE(review): this adds the LF-set ground-truth labels element-wise to
    # the predictions for the training set. Confirm this is intended (and that
    # both arrays have compatible shapes) rather than, e.g., a concatenation.
    Y_train = model.predict(Ls_train) + Y_LF_set

    print('Trained Label Model Metrics:')
    # NOTE(review): ``[1]`` selects index 1 of the training matrix and of
    # ``Y_train`` (itself derived from the model's own predictions), not a
    # held-out split. Scoring on ``(LF_matrix, Y_LF_set)`` may be what was meant.
    scores = model.score(
        (Ls_train[1], Y_train[1]),
        metric=['accuracy', 'precision', 'recall', 'f1'])
    print(scores)

    return trainer, Y_train
def getTrainedModel2(self):
    """Label ``self.train`` with a label model, then fit a text classifier.

    Steps:
      1. Apply ``self.LFs`` to the training comments and fit a ``LabelModel``.
      2. Store the predicted labels in ``self.train['resolution']`` (this
         mutates ``self.train``) and drop rows labelled ``self.ABSTAIN``.
      3. Fit a bag-of-n-grams logistic regression on the remaining comments
         and compute class probabilities for the test comments.
      4. Train an ``EndModel``; nothing from that step is returned.

    Returns:
        The output of ``clf.predict_proba`` for the vectorised test comments.
    """
    # Apply the LFs to the unlabeled training data.
    # NOTE(review): only the 'comments' column is handed to the applier;
    # confirm the LFs and ``PandasLFApplier`` accept that instead of the
    # full DataFrame.
    applier = PandasLFApplier(self.LFs)
    L_train = applier.apply(self.train['comments'])

    # Train the label model and compute the training labels.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    self.train['resolution'] = label_model.predict(
        L=L_train, tie_break_policy="abstain")

    # Keep only the rows the label model did not abstain on.
    df_train = self.train[self.train.resolution != self.ABSTAIN]
    train_text = df_train.comments.tolist()

    # Keep the fitted vectorizer so the test comments can be projected into
    # the same feature space the classifier was trained on.
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(train_text)

    clf = LogisticRegression(solver="lbfgs")
    clf.fit(X=X_train, y=df_train.resolution.values)

    # Predict on vectorised test comments, not on the raw ``self.test`` object.
    X_test = vectorizer.transform(self.test['comments'])
    prob = clf.predict_proba(X_test)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    end_model = EndModel([1000, 10, 2], seed=123, device=device)
    # NOTE(review): the tuples below pair the train comments with the test
    # comments, and the train labels with the test comments. Confirm the
    # intended (features, labels) pairing and that the inputs match the
    # 1000-wide input layer declared above.
    end_model.train_model(
        (self.train['comments'], self.test['comments']),
        valid_data=(self.train['resolution'], self.test['comments']),
        lr=0.01,
        l2=0.01,
        batch_size=256,
        n_epochs=5,
        checkpoint_metric='accuracy',
        checkpoint_metric_mode='max')

    return prob
candidate_dfs['dev'].curated_dsh, model_type='curve', figsize=(12, 7), plot_title="Disease Associates Gene Dev PRC", metric='PR', font_size=16) # In[21]: label_model = LabelModel(k=2, seed=100) label_model.train_model(validation_data[1][0], n_epochs=1000, verbose=False, lr=0.01, l2=2.067) dev_predictions = convert_labels(label_model.predict(validation_data[1][1]), 'categorical', 'onezero') dev_marginals = label_model.predict_proba(validation_data[1][1])[:, 0] # In[22]: plt.rcParams.update({'font.size': 16}) plt.figure(figsize=(10, 6)) plot_predictions_histogram(dev_predictions, candidate_dfs['dev'].curated_dsh.astype(int).values, title="Prediction Histogram for Dev Set") # In[23]: confusion_matrix( convert_labels(candidate_dfs['dev'].curated_dsh.values, 'onezero',
log_train_every=50) score = label_model.score((Ls[1], Ys[1])) print('Trained Label Model Metrics:') scores = label_model.score((Ls[1], Ys[1]), metric=['accuracy', 'precision', 'recall', 'f1']) mv = MajorityLabelVoter(seed=123) print('Majority Label Voter Metrics:') scores = mv.score((Ls[1], Ys[1]), metric=['accuracy', 'precision', 'recall', 'f1']) Y_train_ps = label_model.predict_proba(Ls[0]) Y_dev_p = label_model.predict(Ls[1]) """ mv2 = MajorityClassVoter() mv2.train_model(np.asarray(new_balance)) """ #=np.asarray(new_balance)) #Y_baseline = mv2.predict(Ls[2]) pickling_on2 = open( "data_encompassing/ar/ar_baseline_{}{}".format(flag0, flag), "wb") pickle.dump(Y_baseline, pickling_on2) print(Y_baseline) # baseline majority: """