def cartesian_params(self):
    """Return the ``product_dict`` expansion of this object's parameters as a list.

    The ``fraction``, ``contamination`` and ``seed`` attributes are handed to
    ``product_dict`` as keyword arguments, and whatever it yields is
    materialised into a list.
    """
    # NOTE(review): presumably each attribute is an iterable of candidate
    # values and product_dict builds their Cartesian product - confirm.
    axes = {
        "fraction": self.fraction,
        "contamination": self.contamination,
        "seed": self.seed,
    }
    return list(product_dict(**axes))
def cartesian_params(self):
    """Return the ``product_dict`` expansion of this object's parameters.

    The ``min_cluster_size`` and ``allow_noise`` attributes are handed to
    ``product_dict`` as keyword arguments; its result is returned unchanged.
    """
    # NOTE(review): presumably each attribute is an iterable of candidate
    # values and product_dict builds their Cartesian product - confirm.
    axes = {
        "min_cluster_size": self.min_cluster_size,
        "allow_noise": self.allow_noise,
    }
    return product_dict(**axes)
def cartesian_params(self):
    """Return the ``product_dict`` expansion of the stored keyword arguments.

    ``self.kwargs`` is unpacked into ``product_dict`` and the result is
    returned unchanged.
    """
    # NOTE(review): presumably self.kwargs maps parameter names to iterables
    # of candidate values - confirm against where kwargs is assigned.
    axes = self.kwargs
    return product_dict(**axes)
for i in outliers] one_to_many = [(inliers, [j for j in outliers if j == i], [], [j for j in outliers if j != i]) for i in outliers] # how many samples per class are used for all tests n_classes = [8000] test_size = 8000 # %% param_combinations = product_dict(**dict(seed=range(3), labeled_data=[1.0], fixed_cont=[0.1], n_oe=[False, 50, 250], use_nn=[True], use_umap=[False], min_len=[200], epochs=[15], class_split=standard_split, weakly_supervised=[False])) # split the outlier, inlier tuple pairs and print all parameters for run for d in param_combinations: d["inliers"], d["outliers"], d["test_inliers"], d["test_outliers"] = d[ "class_split"] d.pop('pair', None) d["in_test_not_train_outlier"] = [ x for x in d["test_outliers"] if x not in d["outliers"] ] print(param_combinations)
def _record_scores(score_rows, scores, params, data_split, res_path):
    """Tag ``scores`` with the run parameters and persist every row so far.

    ``scores`` is updated in place with ``params`` and a ``"data"`` entry set
    to ``data_split``.  A snapshot of it is appended to ``score_rows`` and the
    accumulated rows are written to ``res_path`` as a tab-separated file, so
    partial results survive an interrupted run.
    """
    scores.update(params)
    scores["data"] = data_split
    # Store a copy so later mutation of ``scores`` cannot alter saved rows.
    score_rows.append(dict(scores))
    # Rebuilding the frame from the row list replaces ``DataFrame.append``,
    # which was removed in pandas 2.0.
    pd.DataFrame(score_rows).to_csv(res_path, sep="\t")


def main():
    """Run the semi-supervised outlier-detection grid and save the scores.

    For every parameter combination the data is prepared, reduced with
    ``umap_reduce`` followed by ``ivis_reduce``, thresholded with
    ``score_out_preds`` and scored with ``get_scores`` - once on the training
    portion and once on the test portion.  After each scoring step all rows
    collected so far are written to a tab-separated report file.
    """
    standard_split = [
        ([0, 1, 2, 11], [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15])]

    # Materialised because the combinations are iterated twice and passed to
    # len() below; this is a cheap copy if product_dict already returns a list.
    param_combinations = list(product_dict(**dict(
        seed=[42, 43, 44],
        test_size=[0.2],
        labeled_data=[0.1, 0.3, 0.5, 0.8, 1.0],
        fixed_cont=[0.05, 0.1],
        n_oe=[0],
        use_nn=[True],
        pair=standard_split,
    )))

    # how many samples per class are used for all tests
    n_class = 3000

    # split the (inlier, outlier) tuple pairs into separate parameters
    for combo in param_combinations:
        combo["inliers"], combo["outliers"] = combo.pop("pair")

    data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"
    res_path = next_path(
        "/home/philipp/projects/dad4td/reports/semisupervised/semisup_rvl_pw_%04d.tsv")

    doc2vec_model = Doc2VecModel(
        "apnews", "apnews", 1.0, 100, 1,
        "/home/philipp/projects/dad4td/models/apnews_dbow/doc2vec.bin")

    # load data and sample only a portion of it: n_class rows per target
    df_full = pd.read_pickle(data_path)
    df_full = df_full.groupby('target', group_keys=False).apply(
        lambda group: group.sample(n=n_class, random_state=42))

    # get the doc2vec vectors for all of the data used
    df_full["vecs"] = doc2vec_model.vectorize(df_full["text"])
    df_full["vecs"] = df_full["vecs"].apply(tuple)

    score_rows = []
    n_runs = len(param_combinations)
    for i, params in enumerate(param_combinations):
        print(
            f"\n\n---------------------\n\nRun {i+1} out of {n_runs}\n\n{params}")

        df, df_test = prepare_data(df_full, **params)

        # UMAP Train
        docvecs, umap_model = umap_reduce(
            df["vecs"].to_list(), df["label"], None, **params)

        # Ivis
        docvecs, ivis_model = ivis_reduce(
            docvecs, df["label"], None, **params)

        # remove OE data, so it's not scored as well
        df["decision_scores"] = docvecs
        df = df.where(df.scorable == 1).dropna()

        # find outliers in 1D scores; contamination is the share of rows
        # whose outlier_label is -1
        preds, iqr_model = score_out_preds(
            df["decision_scores"], None,
            contamination=df.outlier_label.value_counts(normalize=True)[-1])

        # score the predictions for outliers, then record and save
        scores = get_scores(dict(), df["outlier_label"], preds)
        _record_scores(score_rows, scores, params, "train", res_path)
        print(f"\nTraining scores:\n{pd.DataFrame([scores], index=[0])}")

        # test UMAP and ivis with the models fitted on the training data
        docvecs_test, _ = umap_reduce(
            df_test["vecs"].to_list(), None, umap_model, **params)
        docvecs_test, _ = ivis_reduce(docvecs_test, None, ivis_model, **params)

        # remove OE data, so it's not scored as well
        df_test["decision_scores"] = docvecs_test
        df_test = df_test.where(df_test.scorable == 1).dropna()

        # find outliers in 1D scores
        preds = iqr_model.transform(
            df_test["decision_scores"], thresh_factor=1)

        # score the predictions for outliers, then record and save
        scores = get_scores(dict(), df_test["outlier_label"], preds)
        _record_scores(score_rows, scores, params, "test", res_path)
        print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
import argparse
from ModelHelper import ModelHelper
from utils import product_dict
from tqdm import tqdm

# Multiple configuration trainer: every command-line option accepts one or
# more values, and one model is trained per combination produced by
# product_dict.
if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    # (flag, element type) pairs; each flag takes one or more values.
    option_specs = (
        ("--model_name", str),
        ("--nb_hidden", int),
        ("--window_size", int),
        ("--stride", int),
    )
    for flag, value_type in option_specs:
        cli.add_argument(flag, nargs="+", type=value_type)
    options = vars(cli.parse_args())

    trainer = ModelHelper("beatles.txt")
    grid = list(product_dict(**options))

    for cfg in tqdm(grid):
        trainer.build(cfg["window_size"], cfg["stride"])
        net = trainer.create_model(cfg["model_name"], cfg["nb_hidden"])
        print(trainer._label(cfg))
        fit_log = trainer.train(net, nb_epochs=100, verbose=0)
        trainer.plot_history(fit_log, cfg)

    # Final call after all runs, with the full option lists and the plot shown.
    trainer.plot_history(configs=options, show_plot=True)