def test_get_label_buckets(self) -> None:
    y1 = np.array([[2], [1], [3], [1], [1], [3]])
    y2 = np.array([1, 2, 3, 1, 2, 3])
    buckets = get_label_buckets(y1, y2)
    expected_buckets = {(2, 1): [0], (1, 2): [1, 4], (3, 3): [2, 5], (1, 1): [3]}
    expected_buckets = {k: np.array(v) for k, v in expected_buckets.items()}
    np.testing.assert_equal(buckets, expected_buckets)

    y1_1d = np.array([2, 1, 3, 1, 1, 3])
    buckets = get_label_buckets(y1_1d, y2)
    np.testing.assert_equal(buckets, expected_buckets)
def mistakes_df(df, label_model, L_test, y_test):
    """Compute a DataFrame of all the mistakes we've seen."""
    out_dfs = []
    probs_test = label_model.predict_proba(L=L_test)
    preds_test = probs_test >= 0.5
    buckets = get_label_buckets(y_test, L_test[:, 1])
    print(buckets)
    for (actual, predicted) in buckets.keys():
        # Only show mistakes that we actually voted on
        if actual != predicted:
            actual_name = number_to_name_dict[actual]
            predicted_name = number_to_name_dict[predicted]
            out_dfs.append(
                get_mistakes(df,
                             probs_test,
                             buckets=buckets,
                             labels=(actual, predicted),
                             label_names=(actual_name, predicted_name)))
    if len(out_dfs) > 1:
        return out_dfs[0].append(out_dfs[1:])
    else:
        return out_dfs[0]
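# NOTE: `get_mistakes` and `number_to_name_dict` are not defined in this snippet.
# Below is a minimal sketch of what such a helper might look like; the signature
# matches the call above, but the body is an assumption for illustration, not the
# project's actual implementation.
def get_mistakes(df, probs_test, buckets, labels, label_names):
    """Hypothetical helper: return the rows in one (actual, predicted) bucket."""
    actual, predicted = labels
    actual_name, predicted_name = label_names
    idx = buckets[(actual, predicted)]          # positional indices in this bucket
    out = df.iloc[idx].copy()
    out["prob_positive"] = probs_test[idx, 1]   # model confidence for class 1
    out["true_label"] = actual_name
    out["voted_label"] = predicted_name
    return out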
def main():
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    baseApp = LFApplier(lfs)
    labels = baseApp.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())

    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
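# NOTE: The labeling functions referenced above (`lf_contains_link`, `lf_contains_co`,
# `lf_contains_sub`) are not shown in this snippet. A minimal sketch of how one of them
# could be written with Snorkel's @labeling_function decorator follows; the heuristic
# and the `text` attribute are assumptions for illustration only.
from snorkel.labeling import labeling_function

ABSTAIN = -1
POSITIVE = 1


@labeling_function()
def lf_contains_link(x):
    # Vote POSITIVE when the text contains a URL-like token, otherwise abstain.
    return POSITIVE if "http" in x.text.lower() else ABSTAIN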
def test_get_label_buckets_bad_shape(self) -> None:
    with self.assertRaisesRegex(ValueError, "same number of elements"):
        get_label_buckets(np.array([0, 1, 1]), np.array([1, 1]))
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The label model for which we want to output analysis
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")

    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # TODO: Write this df to an output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # how many documents abstained (filter_unlabeled_dataframe keeps covered points)
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # get confusion-matrix buckets; keys are (true label, predicted label)
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives, false_negatives, true_negatives, false_positives = (
        buckets.get((1, 1)), buckets.get((1, 0)), buckets.get((0, 0)),
        buckets.get((0, 1)))

    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(training_set) - len(df_train_filtered)}"
        )
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets[(1, -1)])}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets[(0, -1)])}")
# %%
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

# %% [markdown]
# So even these very simple rules do quite well!
# We might want to pick the `check` rule, since both have high precision and `check` has higher coverage.
# But let's look at our data to be sure.
#
# The helper method `get_label_buckets(...)` groups data points by their predicted label and true label.
# For example, we can find the indices of data points that the LF labeled `SPAM` that actually belong to class `HAM`.
# This may give ideas for where the LF could be made more specific.

# %%
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(Y_dev, L_dev[:, 1])
df_dev.iloc[buckets[(HAM, SPAM)]]

# %% [markdown]
# There's only one row here because `check` produced only one false positive on the `dev` set.
# Now let's take a look at 10 random `train` set data points where `check` labeled `SPAM` to see if it matches our intuition or if we can identify some false positives.

# %%
df_train.iloc[L_train[:, 1] == SPAM].sample(10, random_state=1)

# %% [markdown]
# No clear false positives here, but many look like they could be labeled by `check_out` as well.
# Let's see 10 data points where `check_out` abstained, but `check` labeled.

# %%
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1])
# %% [markdown]
# We might want to pick the `check` rule, since `check` has higher coverage. Let's take a look at 10 random `train` set data points where `check` labeled `SPAM` to see if it matches our intuition or if we can identify some false positives.

# %%
df_train.iloc[L_train[:, 1] == SPAM].sample(10, random_state=1)

# %% [markdown]
# No clear false positives here, but many look like they could be labeled by `check_out` as well.
#
# Let's see 10 data points where `check_out` abstained, but `check` labeled. We can use the `get_label_buckets(...)` helper to group data points by their predicted and/or true labels.

# %%
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(L_train[:, 0], L_train[:, 1])
df_train.iloc[buckets[(ABSTAIN, SPAM)]].sample(10, random_state=1)

# %% [markdown]
# Most of these seem like small modifications of "check out", like "check me out" or "check it out".
# Can we get the best of both worlds?

# %% [markdown]
# ### d) Balance accuracy and coverage

# %% [markdown]
# Let's see if we can use regular expressions to account for modifications of "check out" and get the coverage of `check` plus the accuracy of `check_out`.

# %%
import re
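# %% [markdown]
# A sketch of such a regex-based labeling function is shown below; it assumes the same `SPAM` and `ABSTAIN` constants and the `x.text` field used in the earlier cells.

# %%
from snorkel.labeling import labeling_function


@labeling_function()
def regex_check_out(x):
    # Match "check out", "check it out", "check me out", etc., case-insensitively.
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN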