def train_model(self,
                df_train: pd.DataFrame,
                application_area_lfs: list,
                analysis_path: str = "output",
                label_output_path: str = "labels.jsonl",
                save_model_path: str = None):
    """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

    :param df_train: The training data for the model
    :type df_train: pd.DataFrame
    :param application_area_lfs: A list of labeling functions to use in training the Label Model
    :type application_area_lfs: list
    :param analysis_path: Folder path where the model output should be stored, defaults to `PROJECT_ROOT/output`
    :type analysis_path: str, optional
    :param label_output_path: Path to the file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
    :type label_output_path: str, optional
    :param save_model_path: A path where the Label Model should be saved. If no path is provided, the model is not saved
    :type save_model_path: str, optional
    """
    file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    applier = PandasLFApplier(lfs=application_area_lfs)
    L_train = applier.apply(df=df_train)

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(L_train=L_train, n_epochs=800, log_freq=100)
    if save_model_path is not None:
        model.save(save_model_path)

    int_labels, prob_labels = model.predict(L=L_train,
                                            return_probs=True,
                                            tie_break_policy="abstain")
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=L_train)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=L_train)
    # Write out both label sets. In the probability outputs, p_rel is the
    # second probability listed.
    assert list(probs_df_train_filtered["paperid"]) == list(
        int_df_train_filtered["paperid"])
    with open(label_output_path, mode="w") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            out.write(
                json.dumps({
                    "id": paper_id,
                    # cast to int and float to get rid of non-serializable numpy types
                    "is_rel": int(int_train_filtered[idx]),
                    "p_rel": float(probs_train_filtered[idx][1])
                }) + "\n")

    # Output LF analysis to a CSV file sorted by coverage
    lf_analysis = LFAnalysis(L=L_train, lfs=application_area_lfs).lf_summary()
    with open(
            f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
            "w") as outfile:
        lf_analysis = lf_analysis.sort_values("Coverage")
        lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
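# A minimal sketch of a labeling function that could appear in
# `application_area_lfs` above, using Snorkel's real @labeling_function
# decorator. The label constants, the keyword, and the "abstract" column are
# illustrative assumptions, not part of the original code.
from snorkel.labeling import labeling_function

ABSTAIN, NOT_RELEVANT, RELEVANT = -1, 0, 1

@labeling_function()
def lf_mentions_keyword(row):
    # Vote RELEVANT when the keyword appears; otherwise abstain so the
    # LabelModel can defer to other labeling functions.
    return RELEVANT if "machine learning" in row.abstract.lower() else ABSTAIN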
def train(self, dataset):
    # Apply labeling functions to the training set
    lfs_applier = PandasLFApplier(lfs=self.lfs)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        lfs_train = lfs_applier.apply(df=dataset)

    # Build the probabilistic label model
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
    label_probs = label_model.predict_proba(lfs_train)

    # Filter out unlabeled data points
    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=dataset,
                                                             y=label_probs,
                                                             L=lfs_train)

    # Featurize data using scikit-learn
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    dataset_train = self.vectorizer.fit_transform(
        df_filtered.sentence.tolist())

    # Replace probabilistic labels with the most likely label
    preds_filtered = probs_to_preds(probs=probs_filtered)

    # Train scikit-learn model
    self.model = LogisticRegression(C=1e3,
                                    solver="liblinear",
                                    multi_class='auto')
    self.model.fit(X=dataset_train, y=preds_filtered)
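# A hedged companion sketch for the class above: inference with the fitted
# vectorizer and logistic regression. The method would live on the same class;
# its name `predict` and the `sentences` argument are assumptions.
def predict(self, sentences):
    # Reuse the n-gram vocabulary learned in train(); never re-fit here.
    X = self.vectorizer.transform(sentences)
    return self.model.predict(X)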
def label_model_trainer(label_model, L_train, df_train):
    """
    To train the extraction model, we first output the probabilities of the
    binary choices, True and False, from our label model. Then, using the
    probabilities, we train our end model.
    """
    # Extract the probabilities for the training set using our label model
    probs_train = label_model.predict_proba(L_train)

    # Since we cannot use the data points that did not receive any labels
    # (i.e. are not covered by our labeling functions), we filter them out,
    # keeping only the data points that received at least one label
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    # `uniform_length`, `rnn_model`, `df_test`, and `Y_test` are assumed to be
    # defined in the surrounding module
    X_train = uniform_length(df_train_filtered)
    model = rnn_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=50)

    X_test = uniform_length(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print(
        f"Test F1 when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='f1')}"
    )
    print(
        f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
    )
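# For reference, snorkel.utils.probs_to_preds used above is essentially a
# row-wise argmax over the class probabilities (plus tie-breaking); a rough
# NumPy equivalent on an illustrative (n, 2) array:
import numpy as np

probs = np.array([[0.9, 0.1], [0.2, 0.8]])
preds = np.argmax(probs, axis=1)  # -> array([0, 1])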
def train_model(label_model, L_train):
    probs_train = label_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=modeler.df_train, y=probs_train, L=L_train)
    print("{} out of {} examples used for training data".format(
        len(df_train_filtered), len(modeler.df_train)))
    return train_model_from_probs(df_train_filtered, probs_train_filtered,
                                  modeler.df_valid, modeler.df_test)
def test_filter_unlabeled_dataframe(self) -> None:
    X = pd.DataFrame(dict(A=["x", "y", "z"], B=[1, 2, 3]))
    y = np.array([[0.25, 0.25, 0.25, 0.25], [1.0, 0.0, 0.0, 0.0],
                  [0.2, 0.3, 0.5, 0.0]])
    L = np.array([[0, 1, -1], [-1, -1, -1], [1, 1, 0]])
    X_filtered, y_filtered = filter_unlabeled_dataframe(X, y, L)
    # The second row of L is all abstains (-1), so only rows 0 and 2 survive.
    # Assert on the comparison: the bare np.array_equal call in the original
    # discarded its result, so the check could never fail.
    self.assertTrue(
        np.array_equal(X_filtered.values,
                       np.array([["x", 1], ["z", 3]], dtype=object)))
    np.testing.assert_array_almost_equal(
        y_filtered, np.array([[0.25, 0.25, 0.25, 0.25], [0.2, 0.3, 0.5, 0.0]]))
def main(output_path: str, training_data: str, gold_labels: str,
         label_output_path: str) -> None:
    df_train, df_test = prepare_data(gold_label_path=gold_labels)
    L_train, L_test, lfns = LF_applier(df_train, df_test)
    Y_test = np.asarray(list(map(encode_labels,
                                 df_test["relevancy"].tolist())),
                        dtype=np.intc)

    # Build noise-aware majority model
    begin_train_time = time()
    label_model = train_model(training_data=df_train,
                              testing_data=df_test,
                              L_train=L_train,
                              save_model=True)
    end_train_time = time()
    print(f"Training time: {end_train_time - begin_train_time}")
    model_analysis(label_model=label_model,
                   training_set=df_train,
                   L_train=L_train,
                   L_test=L_test,
                   Y_test=Y_test,
                   lfs=lfns,
                   output_file=output_path)

    # Get both integer and probability labels for the data, filtering out
    # unlabeled data points:
    # https://www.snorkel.org/use-cases/01-spam-tutorial#filtering-out-unlabeled-data-points
    int_labels, prob_labels = label_model.predict(L=L_train, return_probs=True)
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=L_train)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=L_train)
    # Write out both label sets. In the probability outputs, p_rel is the
    # second probability listed.
    assert list(probs_df_train_filtered["paperid"]) == list(
        int_df_train_filtered["paperid"])
    with open(label_output_path, mode="w") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            out.write(
                json.dumps({
                    "id": paper_id,
                    # cast to int and float to get rid of non-serializable numpy types
                    "is_rel": int(int_train_filtered[idx]),
                    "p_rel": float(probs_train_filtered[idx][1])
                }) + "\n")
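# A small sketch of reading the JSONL written above back into a DataFrame,
# using pandas' line-delimited JSON reader; the file name repeats the default
# from the snippet, the variable name is illustrative.
import pandas as pd

labels_df = pd.read_json("labels.jsonl", lines=True)  # columns: id, is_rel, p_rel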
def get_snorkel_labels(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)
    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=train_df,
                                                             y=L_probs,
                                                             L=L_train)
    return df_filtered, probs_filtered
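# Hypothetical call site for get_snorkel_labels: it wraps each raw function
# with labeling_function(name=lf.__name__), so plain functions go in. The
# "text" column, the label set, and train_df are assumptions for illustration.
def contains_url(row):
    return 1 if "http" in row.text else -1  # -1 means abstain

df_filtered, probs_filtered = get_snorkel_labels(train_df,
                                                 lfs=[contains_url],
                                                 labels=[0, 1])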
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        # Note: this branch returns a 3-tuple, the "weighted" branch a 4-tuple
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def get_label_model_stats(self):
    result = self.label_model.score(L=self.L_dev,
                                    Y=self.Y_dev,
                                    metrics=["f1", "precision", "recall"])
    probs_train = self.label_model.predict_proba(L=self.L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=self.L_train)
    result["training_label_coverage"] = len(probs_train_filtered) / len(
        probs_train)
    # Guard the empty case before dividing; the original computed the ratio
    # first, which raises ZeroDivisionError when nothing is covered.
    if len(probs_train_filtered) == 0:
        result["class_0_ratio"] = 0
    else:
        result["class_0_ratio"] = (probs_train_filtered[:, 0] >
                                   0.5).sum() / len(probs_train_filtered)
    return result
def train(self):
    '''
    Train the logistic regression discriminative model
    '''
    # We pull out the label vectors for ease of use later
    Y_test = self.df_test.label.values
    applier = PandasLFApplier(lfs=self.lfs)
    L_train = applier.apply(df=self.df_train)

    # Use the Label Model to combine the labeling function outputs
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

    # Make predictions
    probs_train = label_model.predict_proba(L=L_train)

    # Filter abstained inputs
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=L_train)

    # Represent each data point as a one-hot vector
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
    X_test = vectorizer.transform(self.df_test.text.tolist())

    # Turn probs into preds
    preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

    # Train logistic regression model
    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=X_train, y=preds_train_filtered)
    print(
        f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
    )
    dump(sklearn_model, 'sklearn_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')
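# A hedged companion sketch: reload the artifacts dumped above and classify
# new text. The file names match the dump() calls; the sample input is
# illustrative.
from joblib import load

vectorizer = load('vectorizer.joblib')
sklearn_model = load('sklearn_model.joblib')
print(sklearn_model.predict(vectorizer.transform(["free money, click here"])))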
def train(self):
    probs_train = self.label_model.predict_proba(L=self.L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=self.L_train)
    if len(df_train_filtered) == 0:
        print("Labeling functions cover none of the training examples!",
              file=sys.stderr)
        return {"micro_f1": 0}
    #from tensorflow.keras.utils import to_categorical
    #df_train_filtered, probs_train_filtered = self.df_dev, to_categorical(self.df_dev["label"].values)
    vectorizer = self.vectorizer
    X_train = vectorizer.transform(df_train_filtered.text.tolist())
    X_dev = vectorizer.transform(self.df_dev.text.tolist())
    X_valid = vectorizer.transform(self.df_valid.text.tolist())
    X_test = vectorizer.transform(self.df_test.text.tolist())

    self.keras_model = get_keras_logreg(input_dim=X_train.shape[1])
    self.keras_model.fit(
        x=X_train,
        y=probs_train_filtered,
        validation_data=(X_valid, preds_to_probs(self.Y_valid, 2)),
        callbacks=[get_keras_early_stopping()],
        epochs=20,
        verbose=0,
    )
    preds_test = self.keras_model.predict(x=X_test).argmax(axis=1)
    #return preds_test
    return self.get_stats(self.Y_test, preds_test)
# %% [markdown]
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model.label_model import LabelModel

L_train = applier.apply(df_train)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01)
preds_train = label_model.predict(L_train)

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    df_train, preds_train, L_train
)
df_train_filtered["rating"] = preds_train_filtered

# %% [markdown]
# ### Rating Prediction Model
# We write a Keras model for predicting ratings given a user's book list and a book (which is being rated).
# The model represents the list of books the user interacted with, `book_idxs`, by learning an embedding for each idx and averaging the embeddings in `book_idxs`.
# It learns another embedding for the `book_idx`, the book to be rated.
# It then concatenates the two embeddings and uses an [MLP](https://en.wikipedia.org/wiki/Multilayer_perceptron) to compute the probability of the `rating` being 1.
# This type of model is common in large-scale recommender systems, for example, the [YouTube recommender system](https://ai.google/research/pubs/pub45530).

# %%
import numpy as np
import tensorflow as tf
from utils import precision_batch, recall_batch, f1_batch
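# %% [markdown]
# A hedged sketch (not the tutorial's exact code) of the model just described:
# average the embeddings of `book_idxs`, separately embed `book_idx`, concatenate,
# and apply an MLP ending in a sigmoid. `n_books`, `embed_dim`, and the layer
# sizes are illustrative assumptions.

# %%
def get_rating_model(n_books: int, embed_dim: int = 64) -> tf.keras.Model:
    book_idxs = tf.keras.layers.Input(shape=(None,), dtype="int64", name="book_idxs")
    book_idx = tf.keras.layers.Input(shape=(1,), dtype="int64", name="book_idx")
    # One embedding table for the user's book list, averaged into a user vector
    user_vec = tf.keras.layers.GlobalAveragePooling1D()(
        tf.keras.layers.Embedding(n_books, embed_dim)(book_idxs))
    # A second embedding table for the candidate book being rated
    book_vec = tf.keras.layers.Flatten()(
        tf.keras.layers.Embedding(n_books, embed_dim)(book_idx))
    hidden = tf.keras.layers.Dense(32, activation="relu")(
        tf.keras.layers.Concatenate()([user_vec, book_vec]))
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    return tf.keras.Model(inputs=[book_idxs, book_idx], outputs=output)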
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written to, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # TODO: Write this df to an output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # Build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # Get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # How many documents the majority model abstained on (received no label);
    # the original reported the number of *covered* documents here
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)
    n_abstained = len(training_set) - len(df_train_filtered)

    # Count the confusion-matrix buckets. get_label_buckets keys are
    # (gold, predicted), so (0, 1) is a false positive and (1, 0) a false
    # negative; the original assignment had the two swapped.
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives = buckets.get((1, 1))
    false_positives = buckets.get((0, 1))
    true_negatives = buckets.get((0, 0))
    false_negatives = buckets.get((1, 0))

    # Write analysis to file (use a distinct handle name so the `output_file`
    # parameter is not shadowed)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as out:
        out.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        out.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        out.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        out.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        out.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        out.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        out.write(f"\n{'Abstained Data Points:':<25} {n_abstained}")
        out.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        out.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        out.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        out.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        out.write(
            f"\n{'Abstained Positives:':<25} {len(buckets.get((1, -1), []))}")
        out.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets.get((0, -1), []))}")
probs_train = label_model.predict_proba(L=L_train)
plot_probabilities_histogram(probs_train[:, SPAM])

# %% [markdown]
# ### Filtering out unlabeled data points

# %% [markdown]
# As we saw earlier, some of the data points in our `train` set received no labels from any of our LFs.
# These data points convey no supervision signal and tend to hurt performance, so we filter them out before training using a
# [built-in utility](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.filter_unlabeled_dataframe.html#snorkel.labeling.filter_unlabeled_dataframe).

# %%
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

# %% [markdown]
# ## 5. Training a Classifier

# %% [markdown]
# In this final section of the tutorial, we'll use the noisy training labels we generated in the last section to train a classifier for our task.
# **The output of the Snorkel `LabelModel` is just a set of labels which can be used with most popular libraries for performing supervised learning, such as TensorFlow, Keras, PyTorch, Scikit-Learn, Ludwig, and XGBoost.**
# In this tutorial, we demonstrate using classifiers from [Keras](https://keras.io) and [Scikit-Learn](https://scikit-learn.org).

# %% [markdown]
# ### Featurization

# %% [markdown]
# For simplicity and speed, we use a simple "bag of n-grams" feature representation: each data point is represented by a one-hot vector marking which words or 2-word combinations are present in the comment text.
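# %% [markdown]
# A minimal sketch of the featurization just described (the tutorial's own
# cell may differ): a binary bag of unigrams and bigrams over the filtered
# training comments.

# %%
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2), binary=True)
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())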
def label_post(inp_path, prefix=""):
    #lfs = [job_inpost, check_subreddit, check_iama]
    lfs = [job_inpost, check_iama]
    context_lens = [100, 3, 2]
    for with_per in [True, False]:
        for clen in context_lens:
            for kw in patterns:
                lfs.append(
                    make_keyword_lf(keyword=kw,
                                    context_len=clen,
                                    with_period=with_per))
    print("created lfs, their count:", len(lfs))

    df_train = pd.read_pickle(inp_path)
    df_train['texts'] = df_train['text'].swifter.apply(
        lambda x: [y.lower() for y in tokenize.sent_tokenize(x)])
    df_train['root_value'] = df_train['value'].swifter.apply(
        lambda x: syn_to_hob[x])
    #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1)
    print("loaded dataset")

    t1 = time.time()
    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    print("time mins ", (time.time() - t1) / 60)
    print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())

    df_l_train = pd.DataFrame(L_train,
                              columns=[str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("*************************************************")
    df_train = df_train.drop(["index"], axis=1)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123)
    probs_train = label_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    print("the length of unfiltered posts",
          len(set(df_train['author'] + "+++++" + df_train['value'])))
    print(
        "the length of filtered posts",
        len(
            set(df_train_filtered['author'] + "+++++" +
                df_train_filtered['value'])))

    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_"
        + prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_"
        + prefix + ".csv")
    #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv")

    verbose = True
    if verbose:
        for i in range(len(lfs)):
            ppath = ("/home/tigunova/PycharmProjects/snorkel_labels/data/"
                     "profession/interesting_datasets/" +
                     str(lfs[i]).split(",")[0] + ".csv")
            df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath)

    auth_hobby_dict = defaultdict(set)
    for index, row in df_train.iterrows():
        # NaN != NaN, so this keeps only rows where both fields are present
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].add(row.value)

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
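# A plausible sketch of the make_keyword_lf factory used above (the project's
# real implementation may differ, and this sketch uses context_len/with_period
# only in the LF name): build a parameterized Snorkel LabelingFunction.
from snorkel.labeling import LabelingFunction

POS, ABSTAIN = 1, -1

def make_keyword_lf(keyword, context_len, with_period):
    def lf(row, keyword=keyword):
        # Vote positive when the keyword occurs in the post text
        return POS if keyword in row.text.lower() else ABSTAIN
    name = f"keyword_{keyword}_ctx{context_len}_per{with_period}"
    return LabelingFunction(name=name, f=lf)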
def get_role_probs(lf_train: pd.DataFrame,
                   filter_abstains: bool = False,
                   lfs: Optional[List[labeling_function]] = None,
                   lf_dev: pd.DataFrame = None,
                   seed: Optional[int] = None,
                   tmp_path: Union[str, Path] = None,
                   use_majority_label_voter=False) -> pd.DataFrame:
    """
    Takes a "raw" data frame, builds argument role examples, (trains a
    LabelModel), calculates event_argument_probs and returns the merged
    argument role examples with event_argument_probs.

    :param use_majority_label_voter: Whether to use a majority label voter instead of the snorkel label model
    :param seed: Seed for use in label model (mu initialization)
    :param filter_abstains: Filters rows where all labeling functions abstained
    :param lf_train: Training dataset which will be labeled using Snorkel
    :param lfs: List of labeling functions
    :param lf_dev: Optional development dataset that can be used to set a prior for the class balance
    :param tmp_path: Path to temporarily store variables that are shared during random repeats
    :return: Merged argument role examples with event_argument_probs
    """
    df_train, L_train = None, None
    df_dev, Y_dev, L_dev = None, None, None
    tmp_train_path, tmp_dev_path = None, None

    # For random repeats, try to load pickled variables from the first run,
    # as they are shared
    if tmp_path:
        tmp_train_path = Path(tmp_path).joinpath("role_train.pkl")
        os.makedirs(os.path.dirname(tmp_train_path), exist_ok=True)
        if tmp_train_path.exists():
            with open(tmp_train_path, 'rb') as pickled_train:
                df_train, L_train = pickle.load(pickled_train)
        if lf_dev is not None:
            tmp_dev_path = Path(tmp_path).joinpath("role_dev.pkl")
            os.makedirs(os.path.dirname(tmp_dev_path), exist_ok=True)
            if tmp_dev_path.exists():
                with open(tmp_dev_path, 'rb') as pickled_dev:
                    df_dev, Y_dev, L_dev = pickle.load(pickled_dev)

    if lfs is None:
        lfs = get_role_list_lfs()
    applier = PandasLFApplier(lfs)

    if L_train is None or df_train is None:
        df_train, _ = build_event_role_examples(lf_train)
        logger.info("Running Event Role Labeling Function Applier")
        L_train = applier.apply(df_train)
        if tmp_path:
            with open(tmp_train_path, 'wb') as pickled_train:
                pickle.dump((df_train, L_train), pickled_train)

    if lf_dev is not None and any(element is None
                                  for element in [df_dev, Y_dev, L_dev]):
        df_dev, Y_dev = build_event_role_examples(lf_dev)
        logger.info("Running Event Role Labeling Function Applier on dev set")
        L_dev = applier.apply(df_dev)
        if tmp_path:
            with open(tmp_dev_path, 'wb') as pickled_dev:
                pickle.dump((df_dev, Y_dev, L_dev), pickled_dev)

    if use_majority_label_voter:
        logger.info(
            "Using MajorityLabelVoter to calculate role class probabilities")
        label_model = MajorityLabelVoter(cardinality=11)
    else:
        label_model = LabelModel(cardinality=11, verbose=True)
        logger.info(
            "Fitting LabelModel on the data and predicting role class probabilities"
        )
        if seed:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            seed=seed,
                            Y_dev=Y_dev)
        else:
            label_model.fit(L_train=L_train,
                            n_epochs=5000,
                            log_freq=500,
                            Y_dev=Y_dev)

    # Evaluate label model on development data
    if df_dev is not None and Y_dev is not None:
        metrics = ["accuracy", "f1_micro", "f1_macro"]
        logger.info("Evaluate on the dev set")
        label_model_metrics = label_model.score(L=L_dev,
                                                Y=Y_dev,
                                                tie_break_policy="random",
                                                metrics=metrics)
        if use_majority_label_voter:
            logger.info('Role Majority Label Voter Metrics')
        else:
            logger.info('Role Label Model Metrics')
        logger.info(
            f"{'Accuracy:':<25} {label_model_metrics['accuracy'] * 100:.1f}%")
        logger.info(
            f"{'F1 (micro averaged):':<25} {label_model_metrics['f1_micro'] * 100:.1f}%"
        )
        logger.info(
            f"{'F1 (macro averaged):':<25} {label_model_metrics['f1_macro'] * 100:.1f}%"
        )

    event_role_probs = label_model.predict_proba(L_train)

    if filter_abstains:
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=event_role_probs, L=L_train)
        merged_event_role_examples = merge_event_role_examples(
            df_train_filtered, probs_train_filtered)
    else:
        # Multiply the probabilities of abstains by zero so that the example
        # is treated as padding in the end model
        merged_event_role_examples = merge_event_role_examples(
            df_train, utils.zero_out_abstains(event_role_probs, L_train))
    return merged_event_role_examples
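# A plausible sketch of what utils.zero_out_abstains does, per the comment
# above (the project's actual helper may differ): zero the probability rows
# of examples that every labeling function abstained on.
import numpy as np

def zero_out_abstains(probs: np.ndarray, L: np.ndarray) -> np.ndarray:
    covered = (L != -1).any(axis=1)  # at least one LF voted on the example
    return probs * covered[:, None]  # all-abstain rows become all-zero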
def label_user(inp_path, prefix=""):
    df_train = pd.read_pickle(inp_path)

    ########## threshold on word similarity
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
    thresh_by_value = df_train.groupby(["root_value"]).apply(
        lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(
        df_train["lexicon_counts"].to_numpy(),
        max(len(df_train) - overall_first,
            0))[max(len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################

    # handled separately: loose vs. strict, pos vs. neg, with vs. without period
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(
            sum, axis=1)
        df_train = df_train.drop(curr_cols, axis=1)

    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] -
            x["total_context:3_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] -
            x["total_context:2_count_" + p]),
                         axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] -
            x["total_context:3_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] -
            x["total_context:2_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p,
            "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] -
            x["total_context:100_period_count_" + p]),
                         axis=1)
    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x],
        axis=1)

    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]

    num_of_thresholds = 3
    step = 100 // num_of_thresholds
    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step,
                                         step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                if "lexicon" in col:
                    thresholds = [3]
                # max_val = max(vals)
                # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #     thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thesold_lf(thresh=thresh,
                                            col_name=col,
                                            next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh,
                                pref=col,
                                previous_threshold=previous_threshold))

    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))
    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs, their number:", len(lfs))
    print("\n".join(str(x) for x in lfs))

    #### validation #####
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well
        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)

        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########

    #### picking the threshold #####
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]

        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
        label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
        wghts = label_model.get_weights()
        print("\n".join(str(x) for x in zip(lfs, wghts)))

        probs_val = label_model.predict_proba(L=L_val)
        probs_df = pd.DataFrame(probs_val, columns=["neg_prob", "pos_prob"])
        df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
        probs_dev = label_model.predict_proba(L=L_dev)
        probs_df = pd.DataFrame(probs_dev, columns=["neg_prob", "pos_prob"])
        df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
        y_true = np.array(df_val["final"])

        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)
            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))
            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))
            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))
            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))
            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))
            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] /
                   pop_size, str(prec)))
        exit(0)
    ###########

    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)

    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")

    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])
    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)
    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ],
                                axis=1)
    print(result_filtered.shape)
    print("****************************************************")
    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")

    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")
    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")

    ### write dict
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
        # NaN != NaN, so this keeps only rows where both fields are present
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])

    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y) for y in sorted(dict(Counter(allowed_labels)).items(),
                               key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))

    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients with a human-labeled note from df
    df_train = df.merge(df_test.mr, indicator=True, how='left', on=['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrices
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y=lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index=True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index=True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=500,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])

    # Evaluate the label model using the labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test,
                                            Y=lab_test,
                                            metrics=[metric],
                                            tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric + ":", label_model_acc * 100))
    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0], )))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" %
          ("null accuracy:",
           np.maximum(1 - np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:, 1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:, 1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index=False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:, 1])

    # Save test data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index=False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
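# A hypothetical downstream step for the pipeline above: reload the filtered
# training set and its soft positive-class labels for end-model training. The
# file names match the saves in main() (np.save appends ".npy"); output_dir is
# the same directory passed to main, and the variable names are illustrative.
import os
import numpy as np
import pandas as pd

df_train_filtered = pd.read_csv(os.path.join(output_dir, "df_train_filtered.csv"))
p_pos = np.load(os.path.join(output_dir, "probs_train_filtered.npy"))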