示例#1
0
    def train_model(self,
                    df_train: pd.DataFrame,
                    application_area_lfs: list,
                    analysis_path: str = "output",
                    label_output_path: str = "labels.jsonl",
                    save_model_path: str = None):
        """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

        :param df_train: The training data for the model
        :type df_train: pd.DataFrame
        :param application_area_lfs: A list of labeling functions to use in training the Label Model
        :type application_area_lfs: list
        :param analysis_path: Folder path where the model output should be stored, defaults to `PROJECT_ROOT/output`
        :type analysis_path: str, optional
        :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
        :type label_output_path: str, optional
        :param save_model_path: A path to where the Label Model should be save at. If no path is provided, the model is not saved
        :type save_model_path: str, optional
        """
        file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        applier = PandasLFApplier(lfs=application_area_lfs)
        L_train = applier.apply(df=df_train)

        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=L_train, n_epochs=800, log_freq=100)
        if (save_model_path is not None):
            model.save(save_model_path)

        int_labels, prob_labels = model.predict(L=L_train,
                                                return_probs=True,
                                                tie_break_policy="abstain")
        probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=int_labels, L=L_train)
        # write out both labels. In the probability outputs, p_rel is the second probability listed
        assert list(probs_df_train_filtered["paperid"]) == list(
            int_df_train_filtered["paperid"])
        with open(f"{label_output_path}", mode="w") as out:
            for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
                out.write(
                    json.dumps({
                        "id": paper_id,
                        # cast to int and float to get rid of nonserializable numpy types
                        "is_rel": int(int_train_filtered[idx]),
                        "p_rel": float(probs_train_filtered[idx][1])
                    }) + "\n")

        # output LF analysis to csv file sorted by coverage
        lf_analysis = LFAnalysis(L=L_train,
                                 lfs=application_area_lfs).lf_summary()
        with open(
                f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
                "w") as outfile:
            lf_analysis = lf_analysis.sort_values("Coverage")
            lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
示例#2
0
def run_analysis(
    applied_lf_matrix: np.ndarray,
    lfs: List[LabelingFunction],
    save_csv_to: AbsolutePath,
    save_json_to: AbsolutePath,
    label_series: Optional[Series] = None,
) -> None:
    lf_analysis_summary = LFAnalysis(applied_lf_matrix, lfs).lf_summary(
        Y=label_series.values if label_series is not None else None)
    lf_analysis_summary.to_csv(save_csv_to)
    analysis_dict = lf_analysis_summary.to_dict()
    del analysis_dict["j"]
    with open(save_json_to, "w") as f:
        json.dump(analysis_dict, f, indent=4, sort_keys=True, cls=NumpyEncoder)
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients from df with a human-labeled note
    df_train = df.merge(df_test.mr, indicator=True, how='left', on = ['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test  = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y = lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index = True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index = True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance = [0.3, 0.7])

    # Evaluate the label model using labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(L=L_test, Y=lab_test, metrics=[metric], tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric+":", label_model_acc * 100))

    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" % ("null accuracy:", np.maximum(1-np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:,1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:,1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index = False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:,1])

    # Save training data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index = False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
示例#4
0
def label_user(inp_path, prefix=""):
    df_train = pd.read_pickle(inp_path)

    ########## threshold on word similarity
    take_first = 100
    overall_first = 10000
    global thresh_by_value, overall_thresh
    df_train['root_value'] = df_train['value'].swifter.set_dask_threshold(
        dask_threshold=0.001).allow_dask_on_strings().apply(
            lambda x: syn_to_hob[x])
    thresh_by_value = df_train.groupby(
        ["root_value"]).apply(lambda x: np.partition(
            x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0)
        )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict()
    overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(),
                                  max(len(df_train) - overall_first, 0))[max(
                                      len(df_train) - overall_first, 0)]
    print(overall_thresh)
    #############################

    # separately loose - strict, pos - neg, period - without
    names_pool = [
        "context:2_count_pos", "context:3_count_pos", "context:100_count_pos",
        "context:2_period_count_pos", "context:3_period_count_pos",
        "context:100_period_count_pos", "context:2_count_neg",
        "context:3_count_neg", "context:100_count_neg",
        "context:2_period_count_neg", "context:3_period_count_neg",
        "context:100_period_count_neg"
    ]
    for f_name in names_pool:
        curr_cols = [x for x in df_train.columns if f_name in x]
        df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum,
                                                                        axis=1)
        df_train = df_train.drop(curr_cols, axis=1)
    for p in ["pos", "neg"]:
        df_train["new_total_context:100_count_" + p] = df_train[[
            "total_context:100_count_" + p, "total_context:3_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_count_" + p] - x["total_context:3_count_" +
                                                     p]),
                         axis=1)
        df_train["new_total_context:3_count_" + p] = df_train[[
            "total_context:3_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p
                                                   ]),
                         axis=1)
        df_train["new_total_context:100_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:100_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:100_period_count_" + p] - x[
                "total_context:3_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:3_period_count_" + p] = df_train[[
            "total_context:3_period_count_" + p,
            "total_context:2_period_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:3_period_count_" + p] - x[
                "total_context:2_period_count_" + p]),
                         axis=1)
        df_train["new_total_context:2_count_" + p] = df_train[[
            "total_context:100_period_count_" + p, "total_context:2_count_" + p
        ]].swifter.apply(lambda x: max(
            0, x["total_context:2_count_" + p] - x[
                "total_context:100_period_count_" + p]),
                         axis=1)

    df_train = df_train.drop(
        ["total_" + x for x in names_pool if "2_period_count" not in x],
        axis=1)

    lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue]
    num_of_thesholds = 3
    step = 100 // num_of_thesholds

    for col in df_train:
        if col not in ["author", "value", "idd", "root_value"]:
            if col not in [
                    "pos_prob_mean", "neg_prob_mean", "num_good_posts"
            ]:  # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]:
                thresholds = [0]
                if "lexicon" in col and "unique" not in col:
                    continue
                if True:  # col in ["lexicon_counts", "unique_lexicon_counts"]:
                    vals = df_train[col].to_numpy()
                    thresholds = np.percentile(
                        vals, list(range(0 + step, 99 + step,
                                         step))).astype(int)
                    thresholds = sorted(list(set(thresholds)))
                    if len(thresholds) > 1:
                        thresholds = thresholds[:-1]
                    if "lexicon" in col:
                        thresholds = [3]
                    # max_val = max(vals)
                    # thresholds = list(range(0, int(max_val), int(max_val/5) + 1))
                # elif col == "pos_prob_mean":
                #    thresholds = [0.5 + 0.1 * x for x in range(5)]
                for i in range(len(thresholds)):
                    thresh = thresholds[i]
                    next_threshold = sys.maxsize if i == len(
                        thresholds) - 1 else thresholds[i + 1]
                    previous_threshold = -sys.maxsize if i == 0 else thresholds[
                        i - 1]
                    if "lexicon_counts" not in col:
                        lfs.append(
                            make_thresold_lf(thresh=thresh,
                                             col_name=col,
                                             next_threshold=next_threshold))
                    else:
                        lfs.append(
                            make_lexicon_lf(
                                thresh=thresh,
                                pref=col,
                                previous_threshold=previous_threshold))

    num_annotators = 0
    if num_annotators > 0:
        for i in range(1, num_annotators + 1):
            lfs.append(make_annotator_lf(worker_index=i))

    lfs = [
        x for x in lfs
        if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"])
    ]
    print("created lfs their number", len(lfs))
    print("\n".join(str(x) for x in lfs))

    #### validation #####
    do_val = False
    if do_val:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        df_val = df_val.merge(df_golden, on="auth_val")
        y_val = np.array(df_val["final"])
        df_val = df_val.drop(labels="final", axis=1)
        # create test set as well

        with TQDMDaskProgressBar(desc="Dask Apply"):
            applier = PandasParallelLFApplier(lfs=lfs)
            L_val = applier.apply(df=df_val, n_parallel=num_cpu)
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)

        dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary()
        analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val)
        analysis.to_csv("/home/tigunova/val_analysis.csv")
        dev_analysis.to_csv("/home/tigunova/dev_analysis.csv")
        print(analysis)
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_dev)  #, Y_dev=y_val)
        model_stat = label_model.score(L=L_val, Y=y_val)
        print(model_stat)
        exit(0)
    ###########

    #### picking threshold #####
    do_threshold = False
    if do_threshold:
        df_golden = pd.read_csv(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv"
        )
        name_val = list(df_golden["auth_val"])
        # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x])
        df_train["auth_val"] = df_train[["author", "value"]].swifter.apply(
            lambda x: x["author"] + "+++" + x["value"], axis=1)
        df_val = df_train[df_train.auth_val.isin(name_val)]
        df_dev = df_train[~df_train.auth_val.isin(name_val)]
        pop_size = df_dev.shape[0]
        print("Number val", df_val.shape)
        print("Number dev", df_dev.shape)
        applier = PandasParallelLFApplier(lfs=lfs)
        df_val = df_val.merge(df_golden, on="auth_val")
        L_val = applier.apply(df=df_val, n_parallel=num_cpu)
        val_thresholds = [0.01 * x for x in range(100)]
        label_model = LabelModel(cardinality=2, verbose=True)
        with TQDMDaskProgressBar(desc="Dask Apply"):
            L_dev = applier.apply(df=df_dev, n_parallel=num_cpu)
            label_model.fit(L_dev, class_balance=[0.5, 0.5])  # , Y_dev=y_val)
            wghts = label_model.get_weights()
            print("\n".join(str(x) for x in zip(lfs, wghts)))
            probs_val = label_model.predict_proba(L=L_val)
            probs_df = pd.DataFrame(probs_val,
                                    columns=["neg_prob", "pos_prob"])
            df_val = pd.concat([df_val.reset_index(), probs_df], axis=1)
            probs_dev = label_model.predict_proba(L=L_dev)
            probs_df = pd.DataFrame(probs_dev,
                                    columns=["neg_prob", "pos_prob"])
            df_dev = pd.concat([df_dev.reset_index(), probs_df], axis=1)
            y_true = np.array(df_val["final"])
        for th in val_thresholds:
            y_pred = np.array(
                df_val["pos_prob"].apply(lambda x: 1 if x > th else 0))
            #print("true negatives")
            #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]])
            prec = precision_score(y_true, y_pred)

            pred_labels = y_pred
            true_labels = y_true
            # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
            TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))

            # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
            TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))

            # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
            FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))

            # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
            FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))

            print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN))

            # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr)))
            # print("******************************")
            print("threshold %s, proportion population %.4f, precision %s" %
                  (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] /
                   pop_size, str(prec)))
        exit(0)
    ###########

    with TQDMDaskProgressBar(desc="Dask Apply"):
        applier = PandasParallelLFApplier(lfs=lfs)
        L_train = applier.apply(df=df_train, n_parallel=num_cpu)

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    print(analysis)

    df_l_train = pd.DataFrame(
        L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs])
    print(df_train.shape)
    print(df_l_train.shape)
    df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1)
    print(df_train.shape)
    print("********************************************")

    t4 = time.time()
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train,
                    n_epochs=1000,
                    lr=0.001,
                    log_freq=100,
                    seed=123,
                    class_balance=[0.3, 0.7])

    probs_train = label_model.predict_proba(L=L_train)
    print("labeling model work ", (time.time() - t4) / 60)

    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    probs_df = pd.DataFrame(probs_train_filtered,
                            columns=["neg_prob", "pos_prob"])
    print(df_train_filtered.shape)
    print(probs_df.shape)
    result_filtered = pd.concat([
        df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df
    ],
                                axis=1)
    print(result_filtered.shape)
    print("****************************************************")

    result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv")

    print(df_train_filtered.shape)
    print(probs_df.shape)
    df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df],
                                  axis=1)
    df_train_filtered = df_train_filtered.drop(["index"], axis=1)
    print(df_train_filtered.shape)
    df_train_filtered.to_pickle(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".pkl")
    df_train_filtered.to_csv(
        "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" +
        prefix + ".csv")

    # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv")

    ### write dict
    output_threshold = 0.63
    output_dict = defaultdict(list)
    auth_hobby_dict = defaultdict(list)
    for index, row in result_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            auth_hobby_dict[row.author].append([row.value, row.pos_prob])

    allowed_labels = []
    for index, row in df_train_filtered.iterrows():
        if row.value == row.value and row.author == row.author:
            if row.pos_prob > output_threshold:
                output_dict[row.author].append([row.value] + row.idd +
                                               [row.pos_prob])
                allowed_labels.append(syn_to_hob[row.value])
    print("\n".join([
        str(y) for y in sorted(dict(Counter(allowed_labels)).items(),
                               key=lambda x: x[1])
    ]))
    print(
        "After cropping",
        sum([
            x if x < 500 else 500
            for x in dict(Counter(allowed_labels)).values()
        ]))
    print("users in total", len(output_dict))
    for auth, stuffs in output_dict.items():
        prof = ":::".join(set([x[0] for x in stuffs]))
        prob = ":::".join([str(x[-1]) for x in stuffs])
        msgs = set([x for l in stuffs for x in l[1:-1]])
        output_dict[auth] = [prof] + list(msgs) + [prob]

    with open(
            "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_"
            + prefix + ".txt", "w") as f_out:
        f_out.write(repr(dict(auth_hobby_dict)))
    with open("/home/tigunova/users_profession1.txt", "w") as f_out:
        f_out.write(repr(dict(output_dict)))