Example #1
    def test_mv_default(self):
        # fewer than 2 LFs have overlaps
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 0]))

        # fewer than 2 LFs have conflicts
        L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 1]))
Example #2
    def test_save_and_load(self):
        L = np.array([[0, -1, 0], [0, 1, 1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        original_preds = label_model.predict(L)

        dir_path = tempfile.mkdtemp()
        save_path = os.path.join(dir_path, "label_model.pkl")
        label_model.save(save_path)

        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
        shutil.rmtree(dir_path)

        np.testing.assert_array_equal(loaded_preds, original_preds)
Example #3
def labelmodel_predict(L_train, y_true, L_test, return_probs=False, **kwargs):
    kwargs.setdefault('n_epochs', 500)
    kwargs.setdefault('log_freq', 100)

    from snorkel.labeling.model import LabelModel
    n = len(set(y_true[~y_true.isna()].values))
    log.info('y_true values: %s', set(y_true[~y_true.isna()].values))
    label_model = LabelModel(cardinality=n, verbose=True)

    L_train_val = set(L_train.values.flatten())
    y_true_val = set(y_true.values.flatten())
    log.info('Values in L_train but not y_true: %s', L_train_val - y_true_val)
    log.info('Values in y_true but not L_train: %s', y_true_val - L_train_val)

    L_train, Y_dev = to_numbered(L_train, y_true)

    log.info('L_train values: %s, %s', set(L_train.flatten()), type(L_train))
    log.info('Y_dev values: %s, %s', set(Y_dev.flatten()), type(Y_dev))
    log.info('kwargs: %s', kwargs)

    label_model.fit(L_train=L_train, Y_dev=Y_dev[Y_dev != -1], **kwargs)

    y_pred = label_model.predict(to_numbered(L_test, y_true)[0],
                                 return_probs=return_probs)

    if return_probs:
        y_pred, y_score = y_pred
    y_pred = from_numbered(L_test, y_true, y_pred)
    return (y_pred, y_score) if return_probs else y_pred
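
The helpers to_numbered and from_numbered are not shown in this snippet. A minimal sketch of what they might look like, assuming they map arbitrary pandas label values onto Snorkel's 0..k-1 integer convention (with -1 reserved for abstains and missing values) and back:

import numpy as np

def to_numbered(L, y_true):
    # Hypothetical helper: build a stable value -> integer mapping from the
    # observed (non-null) gold labels, then encode both L and y_true with it.
    values = sorted(set(y_true[~y_true.isna()].values))
    mapping = {v: i for i, v in enumerate(values)}
    L_num = np.array([[mapping.get(v, -1) for v in row] for row in L.values])
    Y_num = np.array([mapping.get(v, -1) for v in y_true.values])
    return L_num, Y_num

def from_numbered(L, y_true, y_pred):
    # Hypothetical inverse: map integer predictions back to the original values.
    values = sorted(set(y_true[~y_true.isna()].values))
    inverse = {i: v for i, v in enumerate(values)}
    return np.array([inverse.get(i) for i in y_pred])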
Example #4
    def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This tests the common setting where LFs abstain most of the time, which
        can cause issues if parameter clamping is set too high (see e.g. Issue
        #1422).
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n,
                                               self.m,
                                               self.cardinality,
                                               abstain_multiplier=1000.0)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels *only on non-abstained data points*
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        (idx, ) = np.where(Y_pred != -1)
        acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue #1422
        self.assertEqual(len(idx),
                         np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
Example #5
def get_L_final_filter(L_train, method='model'):
    L_final = []

    if len(L_train[0]) < 3:
        method = 'absolute'
    else:
        method = 'model'

    ## TEMPORARY MEASURE
    method = 'absolute'
    ##

    if method == 'absolute':
        ## Absolute Method: Any 'irrelevant' keywords matched will be flagged as irrelevant
        for array in L_train:
            if 0 in array:
                L_final.append(0)
            else:
                L_final.append(1)
    else:
        ## Label Model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        L_final = label_model.predict(L=L_train, return_probs=False)

    return L_final
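
For reference, the loop in the 'absolute' branch collapses to a single vectorized expression; a sketch assuming L_train can be viewed as a 2-D NumPy array:

import numpy as np

# Flag a row as irrelevant (0) if any LF voted 0, otherwise relevant (1).
L_final = np.where((np.asarray(L_train) == 0).any(axis=1), 0, 1).tolist()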
Example #6
    def test_score(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, -1, 1]))

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))

        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        results = label_model.score(L=L,
                                    Y=np.array([1, 0]),
                                    metrics=["accuracy", "f1"])
        results_expected = dict(accuracy=0.5, f1=2 / 3)
        self.assertEqual(results, results_expected)
Example #7
    def train_model(self,
                    df_train: pd.DataFrame,
                    application_area_lfs: list,
                    analysis_path: str = "output",
                    label_output_path: str = "labels.jsonl",
                    save_model_path: str = None):
        """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

        :param df_train: The training data for the model
        :type df_train: pd.DataFrame
        :param application_area_lfs: A list of labeling functions to use in training the Label Model
        :type application_area_lfs: list
        :param analysis_path: Folder path where the model output should be stored, defaults to `PROJECT_ROOT/output`
        :type analysis_path: str, optional
        :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
        :type label_output_path: str, optional
        :param save_model_path: A path where the Label Model should be saved. If no path is provided, the model is not saved
        :type save_model_path: str, optional
        """
        file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        applier = PandasLFApplier(lfs=application_area_lfs)
        L_train = applier.apply(df=df_train)

        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=L_train, n_epochs=800, log_freq=100)
        if save_model_path is not None:
            model.save(save_model_path)

        int_labels, prob_labels = model.predict(L=L_train,
                                                return_probs=True,
                                                tie_break_policy="abstain")
        probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=int_labels, L=L_train)
        # write out both labels. In the probability outputs, p_rel is the second probability listed
        assert list(probs_df_train_filtered["paperid"]) == list(
            int_df_train_filtered["paperid"])
        with open(f"{label_output_path}", mode="w") as out:
            for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
                out.write(
                    json.dumps({
                        "id": paper_id,
                        # cast to int and float to get rid of nonserializable numpy types
                        "is_rel": int(int_train_filtered[idx]),
                        "p_rel": float(probs_train_filtered[idx][1])
                    }) + "\n")

        # output LF analysis to csv file sorted by coverage
        lf_analysis = LFAnalysis(L=L_train,
                                 lfs=application_area_lfs).lf_summary()
        with open(
                f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
                "w") as outfile:
            lf_analysis = lf_analysis.sort_values("Coverage")
            lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
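
A hypothetical invocation, assuming an instance (pipeline) of the surrounding class, a prepared DataFrame, and a list of labeling functions:

# Hypothetical usage; pipeline, df_train, and lfs stand in for real objects.
pipeline.train_model(df_train,
                     application_area_lfs=lfs,
                     analysis_path="lf_analysis",
                     label_output_path="labels.jsonl",
                     save_model_path="label_model.pkl")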
Example #8
    def test_progress_bar(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100, progress_bar=False)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(
            label_model.predict(L), np.array([1, -1, 1])
        )

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)
Example #9
    def test_predict(self):
        # 3 LFs that always disagree/abstain lead to all abstains
        L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([-1, -1, -1]))

        L = np.array([[0, 1, 0], [0, 1, 0]])
        label_model = self._set_up_model(L)

        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))
        preds = label_model.predict(L)

        true_preds = np.array([0, 0])
        np.testing.assert_array_equal(preds, true_preds)

        preds, probs = label_model.predict(L, return_probs=True)
        true_probs = np.array([[0.99, 0.01], [0.99, 0.01]])
        np.testing.assert_array_almost_equal(probs, true_probs)
Example #10
def main():
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    baseApp = LFApplier(lfs)
    labels = baseApp.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
def calculate_metrics(
    label_model: LabelModel,
    dataset_name: str,
    true_labels: np.ndarray,
    save_to: AbsolutePath,
) -> Dict[str, float]:
    """
    >>> from collections import namedtuple; import tempfile
    >>> def mocked_predictions(l,return_probs,tie_break_policy): return np.array([1, 0, 1]), np.array([[0.1, 0.9], [0.8, 0.2], [0.25, 0.75]])
    >>> def mocked_scores(L,Y,tie_break_policy,metrics):
    ...     return {"f1": 1.0} if metrics == ['f1'] else {"roc_auc": 0.78}
    >>> lm = namedtuple('LM', ['predict', 'score'])(mocked_predictions, mocked_scores)
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([1, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.333, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.404}
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([0, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.0, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.671}
    """
    lines = np.load(str(save_to / f"heuristic_matrix_{dataset_name}.pkl"),
                    allow_pickle=True)

    tie_break_policy = "random"

    Y_pred, Y_prob = label_model.predict(lines,
                                         return_probs=True,
                                         tie_break_policy=tie_break_policy)

    try:
        auc = label_model.score(L=lines,
                                Y=true_labels,
                                tie_break_policy="random",
                                metrics=["roc_auc"])["roc_auc"]
        auc = round(auc, 3)
    except ValueError:
        auc = "n/a"
    f1 = label_model.score(L=lines,
                           Y=true_labels,
                           tie_break_policy="random",
                           metrics=["f1"])["f1"]
    accuracy = sum(Y_pred == true_labels) / float(len(Y_pred))
    mse = np.mean((Y_prob[:, 1] - true_labels)**2)

    return {
        f"label_model_accuracy_{dataset_name}": round(accuracy, 3),
        f"label_model_auc_{dataset_name}": auc,
        f"label_model_f1_{dataset_name}": round(f1, 3),
        f"label_model_mse_{dataset_name}": round(mse, 3),
    }
Example #12
    def test_prec_init(self):
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])

        # test without prec_init
        label_model.fit(L_train=L, n_epochs=1000, seed=123)

        # test with prec_init as float
        prec_init = 0.6
        label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)
        label_model.predict(L)

        # test with prec_init as int
        prec_init = 1
        label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)
        label_model.predict(L)

        # test with prec_init as list
        prec_init = [0.1, 0.2, 0.3]
        label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)
        label_model.predict(L)

        # test with prec_init as np.array
        prec_init = np.array([0.1, 0.2, 0.3])
        label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)
        label_model.predict(L)

        with self.assertRaisesRegex(
            TypeError,
            "prec_init is of type <class 'str'> which is not supported currently.",
        ):
            # test with unsupported type (string)
            prec_init = "skibidi bop mm dada"
            label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)

        with self.assertRaisesRegex(
            ValueError, f"prec_init must have shape {L.shape[1]}."
        ):
            # test with prec_init as list of wrong length (bigger)
            prec_init = np.array([0.1, 0.2, 0.3, 0.4])
            label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)

        with self.assertRaisesRegex(
            ValueError, f"prec_init must have shape {L.shape[1]}."
        ):
            # test with prec_init as list of wrong length (smaller)
            prec_init = np.array([0.1, 0.2])
            label_model.fit(L_train=L, prec_init=prec_init, n_epochs=1000, seed=123)
Example #13
def do_labeling(
    label_model: LabelModel,
    matrix: np.ndarray,
    df: pd.DataFrame,
    label_names: List[str],
) -> pd.DataFrame:
    labels, probs = label_model.predict(L=matrix, return_probs=True)
    probs = np.around(probs, decimals=2)
    df_labeled = df.assign(predicted=Series(labels))

    df_labeled[f"prob_{label_names[0]}"] = Series(probs[:, 0])
    df_labeled[f"prob_{label_names[1]}"] = Series(probs[:, 1])
    df_labeled["prob_class"] = Series(
        np.around(np.copy(probs[:, 1]), decimals=1))
    return df_labeled
Example #14
def get_snorkel_labels(frame_to_train, pkl_name):
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)
    (date_parser_coverage, currency_coverage, zipcode_coverage,
     state_coverage, quantity_coverage, phonenumber_coverage, SSN_coverage,
     first_name_coverage, last_name_coverage,
     percent_coverage) = (L_train != ABSTAIN).mean(axis=0)
    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    },
                          inplace=True)
    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    print(f"date_parser_coverage: {date_parser_coverage * 100:.1f}%")
    print(f"currency_coverage: {currency_coverage * 100:.1f}%")
    print(f"zipcode_coverage: {zipcode_coverage * 100:.1f}%")
    print(f"state_coverage: {state_coverage * 100:.1f}%")
    print(f"quntity_coverage: {quntity_coverage * 100:.1f}%")
    print(f"phonenumber_coverage: {phonenumber_coverage * 100:.1f}%")
    print(f"SSN_coverage: {SSN_coverage * 100:.1f}%")
    print(f"first_name_coverage: {first_name_coverage * 100:.1f}%")
    print(f"last_name_coverage: {last_name_coverage * 100:.1f}%")
    #print(f"alpha_number_coverage: {alpha_number_coverage * 100:.1f}%")
    #     lol= f"{pkl_name}.pkl"
    #     print("File name I got:", lol)
    #     print(f"percent_coverage: {percent_coverge * 100:.1f}%")
    #     with open(lol, 'rb') as f:
    #         label_model = pickle.load(f)
    label_model = LabelModel(cardinality=15, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    frame_to_train.label_number.fillna(0, inplace=True)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
def weak_supervisor(dataframe, model_type):
    labeling_functions = [positive_labeling_function, positive1_labeling_function, negative_labeling_function,
                          negative1_labeling_function]
    pandasApplier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = pandasApplier.apply(df=dataframe)

    if model_type == "label_model":
        # constructing a probabilistic label model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
        dataframe["weak_labels"] = label_model.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe

    else:
        majorityLabelVoter = MajorityLabelVoter()
        dataframe["weak_labels"] = majorityLabelVoter.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe
def model_analysis(label_model: LabelModel,
                   training_set: pd.DataFrame,
                   L_train: np.ndarray,
                   L_test: np.ndarray,
                   Y_test: np.ndarray,
                   lfs: list,
                   output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written to, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")
    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    # TODO: Write this df to a output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test,
                           y_pred=Y_pred,
                           average='weighted',
                           labels=np.unique(Y_pred))

    # how many documents abstained
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)

    # get number of false positives
    buckets = get_label_buckets(Y_test, Y_pred)
    # buckets are keyed by (gold label, predicted label)
    true_positives, false_positives, true_negatives, false_negatives = (
        buckets.get((1, 1)), buckets.get((0, 1)), buckets.get(
            (0, 0)), buckets.get((1, 0)))
    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    with open(f"{'../output/logs/'}{output_file}_run_{timestamp}.txt",
              "w") as output_file:
        output_file.write(
            f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%"
        )
        output_file.write(
            f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%"
        )
        output_file.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        output_file.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        output_file.write(
            f"\n{'Abstained Data Points:':<25} {len(df_train_filtered)}")
        output_file.write(
            f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}"
        )
        output_file.write(
            f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}"
        )
        output_file.write(
            f"\n{'Abstained Positives:':<25} {len(buckets.get((1, -1), []))}")
        output_file.write(
            f"\n{'Abstained Negatives:':<25} {len(buckets.get((0, -1), []))}")
# ## Train LabelModel And Generate Probabilistic Labels

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel

# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)

# %% [markdown]
# As a spot-check for the quality of our LabelModel, we'll score it on the dev set.

# %%
from snorkel.analysis import metric_score

preds_dev = label_model.predict(L_dev)

acc = metric_score(Y_dev, preds_dev, probs=None, metric="accuracy")
print(f"LabelModel Accuracy: {acc:.3f}")

# %% [markdown]
# We see that we get very high accuracy on the development set.
# This is due to the abundance of high quality crowdworker labels.
# **Since we don't have these high quality crowdsourcing labels for the
# test set or new incoming data points, we can't use the LabelModel reliably
# at inference time.**
# In order to run inference on new incoming data points, we need to train a
# discriminative model over the tweets themselves.
# Let's generate a set of labels for that training set.

# %%
LFAnalysis(L_dev, lfs).lf_summary(df_dev.rating.values)

# %% [markdown]
# ### Applying labeling functions to the training set
#
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel

L_train = applier.apply(df_train)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01)
preds_train = label_model.predict(L_train)

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    df_train, preds_train, L_train)
df_train_filtered["rating"] = preds_train_filtered

# %% [markdown]
# ### Rating Prediction Model
# We write a Keras model for predicting ratings given a user's book list and a book (which is being rated).
# The model represents the list of books the user interacted with, `books_idxs`, by learning an embedding for each idx, and averaging the embeddings in `book_idxs`.
# It learns another embedding for the `book_idx`, the book to be rated.
# Then it concatenates the two embeddings and uses an [MLP](https://en.wikipedia.org/wiki/Multilayer_perceptron) to compute the probability of the `rating` being 1.
# This type of model is common in large-scale recommender systems, for example, the [YouTube recommender system](https://ai.google/research/pubs/pub45530).
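
# %% [markdown]
# The model code is not shown in this excerpt; below is only a minimal sketch of
# the architecture described above, assuming `tf.keras` and hypothetical sizes
# (an `n_books` vocabulary, 64-dim embeddings).

# %%
import tensorflow as tf

def build_rating_model(n_books, embed_dim=64):
    # Variable-length list of books the user interacted with, plus the book to rate.
    book_idxs = tf.keras.Input(shape=(None,), dtype="int32", name="book_idxs")
    book_idx = tf.keras.Input(shape=(1,), dtype="int32", name="book_idx")

    # Embed and average the user's book list; embed the candidate book separately.
    list_emb = tf.keras.layers.Embedding(n_books, embed_dim)(book_idxs)
    user_vec = tf.keras.layers.GlobalAveragePooling1D()(list_emb)
    book_vec = tf.keras.layers.Flatten()(
        tf.keras.layers.Embedding(n_books, embed_dim)(book_idx))

    # Concatenate both embeddings and apply an MLP to get P(rating = 1).
    concat = tf.keras.layers.Concatenate()([user_vec, book_vec])
    hidden = tf.keras.layers.Dense(64, activation="relu")(concat)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    return tf.keras.Model(inputs=[book_idxs, book_idx], outputs=output)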
Example #19
print(result)

from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter(cardinality=2)
preds_train_majority = majority_model.predict(L=L_train)

from snorkel.labeling.model import LabelModel
label_model = LabelModel(cardinality=2, verbose=True, device='cuda')
# according to location data, BE tweets = 10-15%
label_model.fit(L_train=L_train,
                n_epochs=500,
                class_balance=[0.15, 0.85],
                log_freq=100,
                seed=82)
preds_train_label = label_model.predict(L=L_train)

L_dev = applier.apply(df=df_dev)
mapping = {'BE': 0, 'NL': 1}
Y_dev = np.array([mapping[i] for i in df_dev['label']])

majority_acc = majority_model.score(L=L_dev,
                                    Y=Y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
label_model_acc = label_model.score(L=L_dev,
                                    Y=Y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

from snorkel.labeling import filter_unlabeled_dataframe
Example #20
processed_train_data = applier.apply(data)
processed_dev_data = applier.apply(data)
logging.info("applied labelling functions to scraped data")
print(LFAnalysis(L=processed_train_data, lfs=lfs).lf_summary())

###############################################################
## FITTING THE GENERATIVE MODEL AND PREDICTING
###############################################################
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=processed_train_data,
                n_epochs=500,
                log_freq=100,
                seed=123)

pred_LM_train = label_model.predict(processed_dev_data)
logging.info("generated noisy labels")
logging.info("writing to DataFrame")

###############################################################
## OUTPUTS
###############################################################
pred_frame = pd.DataFrame(data={
    'title': data['title'],
    'Prediction': pred_LM_train
})
print(pred_frame['Prediction'].value_counts())

filter_ = pred_frame['Prediction'] == 1
pred_frame = pred_frame.loc[filter_]
print(pred_frame)
def labeling_applier(lfs: list,
                     dataset: list,
                     filenames: list,
                     original_images: list = None,
                     save_prefix: str = 'data/ircad_snorkel',
                     log: bool = False):
    """Function to generating label images.

    Parameters
    ----------
    lfs -
        LFs that this applier executes on examples

    dataset -
        List of numpy images 

    filenames - 
        list of filenames corresponding to dataset numpy images
    
    save_perfix - 
        folder save path

    log - 
        if true print status information
    """
    labeled_images = []

    size = 0
    for array in dataset:
        mul = 1
        for e in array.shape:
            mul *= e
        size += mul
    lab_arr = np.zeros((size, len(lfs)), dtype=np.uint8)

    if log: print('Prepare arrays', 'size:', size, 'bytes')

    index = 0
    for array in dataset:
        labeled_array = []
        for func in lfs:
            labeled_array.append(func(array).flatten())
        T = np.array(labeled_array).T
        lab_arr[index:index + T.shape[0], :] = T
        labeled_images.append(Image(T, array.shape))
        index += T.shape[0]

    # lab_arr rows: one vote per LF, e.g. [[1 0 0 1], [0 1 0 1], ...]

    if log: print('Training')
    LM = LabelModel(cardinality=2, verbose=True, device='cuda')
    LM.fit(lab_arr, seed=3333, log_freq=1, class_balance=[0.965, 0.035])

    if log: print('Predict')

    iterator = zip(labeled_images, filenames, range(len(filenames)),
                   range(len(filenames)))
    if original_images is not None:
        iterator = zip(labeled_images, filenames, range(len(filenames)),
                       original_images)

    for array, name, idx, image in iterator:
        save_path = str(Path(save_prefix) / name)
        if log:
            print('Image: ' + str(idx + 1) + '/' + str(len(filenames)) +
                  ' Save path: ' + save_path)

        im_flat = np.zeros(array.shape, dtype=np.uint8).flatten()

        # array.labels: per-pixel LF votes, e.g. [[1 0 0 1], [0 1 0 1], ...]
        p = LM.predict(array.labels)

        # p: one predicted label per pixel; reshape back to the image shape
        p = np.reshape(p, array.shape)
        p = getLargestCC(p)
        p[p > 0] = 255

        new_im = sitk.GetImageFromArray(np.array(p, dtype=np.uint8))
        if original_images is not None:
            new_im.CopyInformation(image)
        writer = sitk.ImageFileWriter()
        writer.SetFileName(save_path)
        writer.Execute(new_im)
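
The getLargestCC helper used above is not defined in this snippet. A plausible sketch, assuming it keeps only the largest connected component of a binary mask and that scikit-image is available:

import numpy as np
from skimage.measure import label as cc_label

def getLargestCC(segmentation):
    # Label connected components, then keep the most frequent non-background one.
    labels = cc_label(segmentation)
    if labels.max() == 0:
        return segmentation  # no foreground present
    largest = np.argmax(np.bincount(labels.ravel())[1:]) + 1
    return (labels == largest).astype(segmentation.dtype)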
for i, (train_idx, test_idx) in enumerate(kf.split(L_data_local)):
    # Define train dataset
    L_train = L_data_local[train_idx]
    Y_train = Y_data_local[train_idx]
    # Define test dataset
    L_test = L_data_local[test_idx]
    Y_test = Y_data_local[test_idx]

    # Evaluate a dependency-informed Snorkel model
    l_model = LabelModel(cardinality=2, verbose=False)
    l_model.fit(L_train, n_epochs=n_epochs, lr=lr)

    try:
        if abstain_rate < 0:
            Y_pred = l_model.predict(L_test, tie_break_policy="abstain")
        else:
            Y_prob = l_model.predict_proba(L_test)
            Y_pred = predict_at_abstain_rate(Y_prob, abstain_rate)

        scores = scorer.score(Y_test, preds=Y_pred)
        all_scores.append(scores)
    except Exception as e:
        print("Iter {}: {}".format(i+1,e))
        continue
    
    # Logging
    print("Iteration " + str(i+1) + ":", scores)

print("-- SUMMARY --")
print("accuracy: AVG {:.3f}, STD {:.3f}".format(np.mean([s["accuracy"] for s in all_scores]), np.std([s["accuracy"] for s in all_scores])))
# %%
label_model.score(L_valid, Y_valid, metrics=["f1_micro"])

# %% [markdown]
# ## 4. Train a Classifier
# You can then use these training labels to train any standard discriminative model, such as [an off-the-shelf ResNet](https://github.com/KaimingHe/deep-residual-networks), which should learn to generalize beyond the LF's we've developed!

# %% [markdown]
# #### Create DataLoaders for Classifier

# %%
from snorkel.classification import DictDataLoader
from model import SceneGraphDataset, create_model

df_train["labels"] = label_model.predict(L_train)

if sample:
    TRAIN_DIR = "data/VRD/sg_dataset/samples"
else:
    TRAIN_DIR = "data/VRD/sg_dataset/sg_train_images"

dl_train = DictDataLoader(
    SceneGraphDataset("train_dataset", "train", TRAIN_DIR, df_train),
    batch_size=16,
    shuffle=True,
)

dl_valid = DictDataLoader(
    SceneGraphDataset("valid_dataset", "valid", TRAIN_DIR, df_valid),
    batch_size=16,
    shuffle=False,
)

from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [
    lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity
]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

# %% [markdown]
# Note that we used the `LabelModel` to label data; however, on many data points, all the labeling functions abstain, and so the `LabelModel` abstains as well.
# We'll filter these data points out of our training set now:

# %%
df_train = df_train[df_train.label != ABSTAIN]

# %% [markdown]
# Our ultimate goal is to use the resulting labeled training data points to train a machine learning model that can **generalize beyond the coverage of the labeling functions and the `LabelModel`**.
# However first we'll explore some of Snorkel's other operators for building and managing training data.

# %% [markdown]
# ## 3) Writing Transformation Functions for Data Augmentation
#
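
# %% [markdown]
# As a preview, a minimal sketch of what a transformation function looks like,
# assuming text data with a `text` field (the replacement rule is purely
# illustrative):

# %%
from snorkel.augmentation import transformation_function

@transformation_function()
def tf_swap_word(x):
    # Return an augmented copy of the data point; returning None discards it.
    x.text = x.text.replace("good", "great")
    return x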
Example #25
def lambda_handler(event, context):

    df = wr.s3.read_csv(path="s3://individualtwitter/preprocessed_data.csv")


    # create spam or not spam labels
    SPAM = 1
    NOT_SPAM = 0
    ABSTAIN = -1

    # create spam labelling functions


    # label as SPAM if there is a link in the tweet

    @labeling_function()
    def lf_contains_link(x):
        return SPAM if "http" in x.text else ABSTAIN

    # create list of words which often occur in spam tweets

    spam_words = ["free", "check", "giveaway", "gift", "present", "subscribe", "follow", "retweet", "luck", 'win']


    # label tweet as spam if a spam word is in the tweet

    @labeling_function()
    def lf_contains_words(x):
        return SPAM if any(i in x.text for i in spam_words) else ABSTAIN


    # webscraped a list of crypto jargon from a website. See file webscraping_crypto_jargon for process

    not_spam_words = ['10x',
                      '51% attack',
                      'address',
                      'addy',
                      'algorithm',
                      'altcoin',
                      'a long position',
                      'a short position',
                      'alt markets',
                      'airdrop',
                      'aml',
                      'angel investor',
                      'application cryptocurrencies',
                      'arbitrage',
                      'arbitrage trading',
                      'ash-draked',
                      'asic',
                      'asic miner',
                      'ath',
                      'atl',
                      'augur',
                      'autonomous decentralized organization',
                      'bagholder',
                      'bears',
                      'bearish',
                      'bearishness',
                      'bear market',
                      'bear trap',
                      'best blockchain',
                      'bit',
                      'block',
                      'blockchain',
                      'block explorer',
                      'block header',
                      'block height',
                      'block reward',
                      'bitcoin protocol',
                      'bitcoin network',
                      'bitcoin-js',
                      'bitcoinqt',
                      'bitcoin days destroyed',
                      'bollinger band',
                      'brain wallet',
                      'btc next rsistance',
                      'btc next support',
                      'btc &amp; xbt',
                      'btd',
                      'btfd',
                      'bugling',
                      'bull trap',
                      'bullish',
                      'bulls',
                      'bullishness',
                      'bull market',
                      'buy area',
                      'buy below',
                      'buy wall',
                      'candle',
                      'choyna',
                      'colored corners',
                      'central ledger',
                      'confirmation',
                      'consensus',
                      'consensus rule',
                      'crypto bubble',
                      'cryptocurrency',
                      'cryptography',
                      'cryptojacking',
                      'cve-2012–2459',
                      'dapp',
                      'dao',
                      'darksend',
                      'darkweb',
                      'data size',
                      'deaf at bounce',
                      'decentralized',
                      'deflation',
                      'demurrage',
                      'desktop wallet',
                      'deterministic wallet',
                      'devcon of ethereum',
                      'difficulty',
                      'd***o',
                      'distributed ledger',
                      'double spending',
                      'dust transactions',
                      'dyor',
                      'eli5',
                      'i am',
                      'ether',
                      'ethereum',
                      'etf',
                      'escrow',
                      'exchange',
                      'fa',
                      'faucet',
                      'fiat money',
                      'following scalp',
                      'fomo',
                      'fork',
                      'block height',
                      'frictionless',
                      'fud',
                      'fudster',
                      'full node?',
                      'fungible',
                      'genesis blocks',
                      'gpu',
                      'halving',
                      'hard cap',
                      'hard fork',
                      'hardware wallet',
                      'hash',
                      'hash rate',
                      'hodl',
                      'ico',
                      'increase leverage',
                      'inflation',
                      'inputs',
                      'jomo',
                      'kimoto gravity well',
                      'kyc',
                      'lambo',
                      'laundry',
                      'leverage',
                      'limit order',
                      'litecoin',
                      'liquidity',
                      'liquidity swap',
                      'macd',
                      'market cap/mcap',
                      'margin trading',
                      'market order',
                      'merged mining',
                      'micro-transaction',
                      'mbtc',
                      'miner',
                      'mining',
                      'mining algorithm',
                      'mining contract',
                      'mining pool',
                      'mining rig',
                      'minting',
                      'mixing service',
                      'mobil wallet',
                      'mooning',
                      'money laundering',
                      'mtgox',
                      'miltisig',
                      'namecoin',
                      'network effect',
                      'nfc',
                      'node',
                      'nonse',
                      'observe the candle',
                      'off blockchain transactions',
                      'orphanded block',
                      'open source',
                      'otc exchange',
                      'output',
                      'output index',
                      'otc',
                      'p2p',
                      'p2pkh',
                      'paper wallet',
                      'peer?',
                      'plasma',
                      'platform cryptocurrencies',
                      'pre-mining',
                      'pre-mining coins',
                      'price bubble',
                      'privacy',
                      'private key',
                      'proof of burn',
                      'proof of existence',
                      'proof of stake',
                      'pow',
                      'proof of work',
                      'public key',
                      'pump and dump',
                      'pubkey',
                      'quantitative easing',
                      'qr code',
                      'recurrent rebilling',
                      'recovery phrase ',
                      'refund',
                      'rekt',
                      'remittance',
                      'response time',
                      'result',
                      'reverse indicator',
                      'ripple',
                      'roi',
                      'rsi',
                      'satoshi',
                      'satoshi nakamoto',
                      'scalability',
                      'scamcoin',
                      'scrypts pubkey',
                      'scrypt',
                      'seed',
                      'self executing contract',
                      'segregated witness soft fork',
                      'segwit (segregated witness)',
                      'sharding',
                      'sidechain',
                      'signature',
                      'signature script',
                      'sigscript',
                      'silk road',
                      'smart contract',
                      'speculator',
                      'spv',
                      'shill',
                      'shitcoin ',
                      'soft cap',
                      'soft fork',
                      'software wallet',
                      'solidity',
                      'stale blocks',
                      'stale block',
                      'stable coin',
                      'surge',
                      'swing',
                      'ta',
                      'taint',
                      'tank',
                      'tcp/ip',
                      'testnet',
                      'the markel tree',
                      'timestamp',
                      'tokens',
                      'tor',
                      'total coin supply',
                      'to the moon',
                      'transaction block',
                      'transactional cryptocurrencies',
                      'transaction fee',
                      'txids',
                      'uri',
                      'utox',
                      'utility cryptocurrencies',
                      'vanity address',
                      'vapourware',
                      'validation rules',
                      'velocity of money',
                      'venture capitalist',
                      'virgin bitcoin',
                      'volatility',
                      'top crypto tool:',
                      'automate trading',
                      'hopper trade bot',
                      'trusted crypto wallet',
                      'super ledger',
                      'wallet',
                      'whale',
                      'whitepaper',
                      'wire transfer',
                      'zerocoin',
                      'zero confirmation transaction']

    # labelling function that labels tweet as not spam if jargon occurs in it

    @labeling_function()
    def lf_contains_notspamwords(x):
        return NOT_SPAM if any(i in x.text for i in not_spam_words) else ABSTAIN

    # labelling function that labels a tweet as not spam if two or more jargon words occur in it

    @labeling_function()
    def count_ns_words(x):
        ns_count = 0
        for j in not_spam_words:
            counting = x.text.split().count(j)
            ns_count = ns_count + counting
        return NOT_SPAM if ns_count >= 2 else ABSTAIN

    # use labelling functions as input for Label Model

    lfs = [lf_contains_link, lf_contains_words, lf_contains_notspamwords, count_ns_words]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df)

    # cardinality counts the classes (SPAM, NOT_SPAM); ABSTAIN (-1) is not a class
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)


    # apply Snorkel labels to manually labelled data

    df["label_snorkel"] = label_model.predict(L=L_train, tie_break_policy="random")