示例#1
0
 def cartesian_params(self):
     """Return the parameter grid of this object as a list.

     The instance attributes ``fraction``, ``contamination`` and ``seed``
     are handed to ``product_dict`` as keyword arguments, and whatever
     iterable it returns is materialised into a list (presumably one dict
     per parameter combination -- confirm against ``product_dict``).
     """
     grid_axes = {
         "fraction": self.fraction,
         "contamination": self.contamination,
         "seed": self.seed,
     }
     return list(product_dict(**grid_axes))
示例#2
0
 def cartesian_params(self):
     """Return ``product_dict`` applied to this object's parameter axes.

     ``min_cluster_size`` and ``allow_noise`` are passed as keyword
     arguments; the result of ``product_dict`` is returned unchanged
     (it is not converted to a list here).
     """
     grid_axes = {
         "min_cluster_size": self.min_cluster_size,
         "allow_noise": self.allow_noise,
     }
     combinations = product_dict(**grid_axes)
     return combinations
示例#3
0
 def cartesian_params(self):
     """Return ``product_dict`` applied to the stored ``self.kwargs``.

     ``self.kwargs`` is unpacked into keyword arguments, so it must be a
     mapping with string keys.
     """
     stored_axes = self.kwargs
     return product_dict(**stored_axes)
示例#4
0
                            for i in outliers]

# One 4-tuple per outlier class: (all inliers, [that class], [], [every other
# outlier class]).  The layout matches the 4-tuples unpacked from
# "class_split" further down -- presumably (inliers, outliers, test_inliers,
# test_outliers); confirm against where the split is consumed.
one_to_many = [
    (
        inliers,
        [cls for cls in outliers if cls == held_out],
        [],
        [cls for cls in outliers if cls != held_out],
    )
    for held_out in outliers
]

# how many samples per class are used for all tests
n_classes = [8000]
test_size = 8000

# %%
# Every keyword is one axis of the grid handed to product_dict.
param_combinations = product_dict(
    seed=range(3),
    labeled_data=[1.0],
    fixed_cont=[0.1],
    n_oe=[False, 50, 250],
    use_nn=[True],
    use_umap=[False],
    min_len=[200],
    epochs=[15],
    class_split=standard_split,
    weakly_supervised=[False],
)

# Expand each "class_split" 4-tuple into separate keys and print all
# parameters for the run.
for d in param_combinations:
    train_in, train_out, test_in, test_out = d["class_split"]
    d["inliers"] = train_in
    d["outliers"] = train_out
    d["test_inliers"] = test_in
    d["test_outliers"] = test_out
    # No "pair" key is set in this script; the pop is a no-op unless
    # product_dict adds one.
    d.pop("pair", None)
    # Outlier classes that appear only at test time.
    d["in_test_not_train_outlier"] = [
        cls for cls in test_out if cls not in train_out
    ]
print(param_combinations)
示例#5
0
def main():
    """Run the experiment grid and log train/test scores to a TSV file.

    For every parameter combination returned by ``product_dict`` the data is
    prepared with ``prepare_data``, reduced with ``umap_reduce`` and then
    ``ivis_reduce``, thresholded with ``score_out_preds`` and scored with
    ``get_scores`` -- first on the training frame, then on the test frame
    using the models fitted on the training frame.  After each scoring step
    all results collected so far are rewritten to ``res_path`` as a
    tab-separated file, so partial results survive an interrupted run.

    Returns:
        None.  Results are written to disk and printed.
    """
    standard_split = [
        ([0, 1, 2, 11], [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15])]
    # Alternative split (every ordered pair of single classes).  Not used by
    # the grid below; kept so it can be swapped in for ``standard_split``.
    pairwise_split = list(permutations([[x] for x in range(0, 16)], 2))
    # %%
    param_combinations = product_dict(**dict(
        seed=[42, 43, 44],
        test_size=[0.2],
        labeled_data=[0.1, 0.3,  0.5, 0.8, 1.0],
        fixed_cont=[0.05, 0.1],
        n_oe=[0],
        use_nn=[True],
        pair=standard_split
    ))
    # how many samples per class are used for all tests
    n_class = 3000

    # split the outlier, inlier tuple pairs and print all parameters for run
    for d in param_combinations:
        d["inliers"], d["outliers"] = d["pair"]
        d.pop('pair', None)

    # Alternative dataset:
    # "/home/philipp/projects/dad4td/data/processed/20_news_imdb_vec.pkl"
    data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"
    oe_path = "/home/philipp/projects/dad4td/data/processed/oe_data.pkl"
    res_path = next_path(
        "/home/philipp/projects/dad4td/reports/semisupervised/semisup_rvl_pw_%04d.tsv")

    doc2vec_model = Doc2VecModel("apnews", "apnews", 1.0,
                                 100, 1,
                                 "/home/philipp/projects/dad4td/models/apnews_dbow/doc2vec.bin")

    # load data and get the doc2vec vectors for all of the data used
    df_full = pd.read_pickle(data_path)

    # sample only a portion of the data
    df_full = df_full.groupby('target', group_keys=False).apply(
        lambda df: df.sample(n=n_class, random_state=42))

    # %%
    df_full["vecs"] = doc2vec_model.vectorize(df_full["text"])
    df_full["vecs"] = df_full["vecs"].apply(tuple)

    # %%
    # Rows are collected as plain dicts and the frame is rebuilt before each
    # save: DataFrame.append was removed in pandas 2.0.
    result_rows = []
    for i, params in enumerate(param_combinations):
        print(
            f"\n\n---------------------\n\nRun {i+1} out of {len(param_combinations)}\n\n{params}")

        df, df_test = prepare_data(df_full, **params)

        # UMAP Train
        docvecs, umap_model = umap_reduce(
            df["vecs"].to_list(), df["label"], None, **params)

        # Ivis
        docvecs, ivis_model = ivis_reduce(
            docvecs, df["label"], None, **params)

        # remove OE data, so it's not scored as well
        df["decision_scores"] = docvecs
        df = df.where(df.scorable == 1).dropna()

        # find outliers in 1D scores; the contamination passed in is the
        # normalised count looked up under key -1 (presumably the share of
        # rows labelled as outliers -- confirm the label encoding)
        preds, iqr_model = score_out_preds(df["decision_scores"], None,
                                           contamination=df.outlier_label.value_counts(normalize=True)[-1])

        # score the predictions for outliers
        scores = get_scores(dict(), df["outlier_label"], preds)

        # %%
        #  write the scores to df and save
        scores.update(params)
        scores["data"] = "train"
        # snapshot the row so later mutation of ``scores`` cannot alter it
        result_rows.append(dict(scores))
        result_df = pd.DataFrame(result_rows)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTraining scores:\n{pd.DataFrame([scores], index=[0])}")
        # %%
        # test UMAP and ivis, reusing the models fitted on the training data
        docvecs_test, _ = umap_reduce(
            df_test["vecs"].to_list(), None, umap_model, **params)

        docvecs_test, _ = ivis_reduce(docvecs_test, None, ivis_model, **params)

        # remove OE data, so it's not scored as well
        df_test["decision_scores"] = docvecs_test
        df_test = df_test.where(df_test.scorable == 1).dropna()

        # find outliers in 1D scores
        preds = iqr_model.transform(
            df_test["decision_scores"], thresh_factor=1)

        # score the predictions for outliers
        scores = get_scores(dict(), df_test["outlier_label"], preds)

        # write the scores to df and save
        scores.update(params)
        scores["data"] = "test"
        result_rows.append(dict(scores))
        result_df = pd.DataFrame(result_rows)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
示例#6
0
文件: main.py 项目: MikeynJerry/cs525
import argparse
from ModelHelper import ModelHelper
from utils import product_dict
from tqdm import tqdm

# Multiple configuration trainer: builds, trains and plots one model per
# combination of the command-line values.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Every option takes one or more values (nargs="+").
    for flag, value_type in (
        ("--model_name", str),
        ("--nb_hidden", int),
        ("--window_size", int),
        ("--stride", int),
    ):
        parser.add_argument(flag, nargs="+", type=value_type)
    args = parser.parse_args()
    # vars(args) is the same dict object as args.__dict__.
    search_space = vars(args)

    helper = ModelHelper("beatles.txt")
    configs = list(product_dict(**search_space))
    for config in tqdm(configs):
        window_size, stride = config["window_size"], config["stride"]
        helper.build(window_size, stride)
        model_name, nb_hidden = config["model_name"], config["nb_hidden"]
        model = helper.create_model(model_name, nb_hidden)
        print(helper._label(config))
        history = helper.train(model, nb_epochs=100, verbose=0)
        helper.plot_history(history, config)

    # Final call with the full search space and show_plot=True.
    helper.plot_history(configs=search_space, show_plot=True)