def test_int_suffix(self):
    """
    1. Testing whether it creates a proper custom suffix for dir names.
    2. Testing whether a newly initialized epc parses existing dir names
       properly.
    """
    path = os.path.join(self.base_path, "ints1")
    suffix = "dummy"
    sep = "^"

    # 1.
    n = 4
    epc = ExperimentsPathController(folder_name_type='int', sep=sep)
    exp_dirs = [sep.join([str(i), suffix]) for i in range(1, n + 1)]
    for _ in range(n):
        epc(path, suffix=suffix)
    act_dirs = os.listdir(path)
    self.assertEqual(set(act_dirs), set(exp_dirs))

    # 2.
    m = 6
    epc = ExperimentsPathController(folder_name_type='int', sep=sep)
    new_exp_dirs = [str(i) for i in range(n + 1, n + m + 1)]
    for _ in range(m):
        epc(path)
    act_dirs = os.listdir(path)
    self.assertEqual(set(act_dirs), set(exp_dirs + new_exp_dirs))
def __init__(self):
    super(PluginInitRunConfig, self).__init__()
    self.exper_descr = "Plug-in initialization phase on reviews."
    self.cuda_device_ids = [0]
    self.epochs = 13
    self.learning_rate = 1e-05

    # GENERAL DATA RELATED #
    # multiplication by the number of devices is needed in order to
    # make it work in the multi-gpu setup
    self.train_groups_per_batch = 20 * len(self.cuda_device_ids)
    self.val_groups_per_batch = 50 * len(self.cuda_device_ids)
    self.eval_groups_per_batch = 17 * len(self.cuda_device_ids)

    # DATA SOURCES #
    self.train_early_term = None
    self.val_early_term = None

    # GENERAL PATHS #
    self.experiments_folder = 'plugin_init'
    self.output_dir = 'runs/%s/%s' % (self.dataset, self.experiments_folder)
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = 'artifacts/amazon/checkpoints/plugin_init.tar'

    # FREEZING AND UNFREEZING #
    self.modules_to_unfreeze = ['plugin']
def __init__(self):
    super(JointTuningRunConfig, self).__init__()
    self.exper_descr = "Joint tuning: the plug-in network and the memory " \
                       "attention."
    self.epochs = 33
    self.learning_rate = 0.0001

    # GENERAL DATA RELATED #
    self.train_groups_per_batch = 15
    self.val_groups_per_batch = 20
    self.eval_groups_per_batch = 17

    # DATA SOURCES #
    self.train_early_term = None
    self.val_early_term = None

    # GENERAL PATHS #
    self.experiments_folder = 'joint_tuning'
    self.output_dir = 'runs/%s/%s' % (self.dataset, self.experiments_folder)
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = 'artifacts/amazon/checkpoints/joint_tuning.tar'
    # self.checkpoint_path = None

    # FREEZING AND UNFREEZING #
    self.modules_to_unfreeze = [
        'plugin',
        '_tr_stack.layers.0.mem_attn', '_tr_stack.layers.1.mem_attn',
        '_tr_stack.layers.2.mem_attn', '_tr_stack.layers.3.mem_attn',
        '_tr_stack.layers.4.mem_attn', '_tr_stack.layers.5.mem_attn'
    ]
def test_int_first_dir(self):
    """Testing if it can create the first directory with a proper int
    number."""
    path = os.path.join(self.base_path, "ints0")
    exp_new_path = os.path.join(path, '1')
    epc = ExperimentsPathController(folder_name_type='int')
    new_path = epc(path)
    # assertTrue(a, b) treats `b` as a failure message and passes
    # vacuously; an equality assertion is what this test intends
    self.assertEqual(exp_new_path, new_path)
def __init__(self):
    super(PluginTuningRunConfig, self).__init__()
    self.exper_descr = "Plug-in tuning phase on summaries."
    self.epochs = 98
    self.learning_rate = 0.0007

    # GENERAL DATA RELATED #
    self.train_groups_per_batch = 30
    self.val_groups_per_batch = 50
    self.eval_groups_per_batch = 17

    # DATA SOURCES #
    self.train_early_term = None
    self.val_early_term = None

    # GENERAL PATHS #
    self.experiments_folder = 'plugin_tuning'
    self.output_dir = 'runs/%s/%s' % (self.dataset, self.experiments_folder)
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = 'artifacts/amazon/checkpoints/plugin_tuning.tar'

    # FREEZING AND UNFREEZING #
    self.modules_to_unfreeze = ['plugin']
def __init__(self):
    super(UnsupRunConfig, self).__init__()
    self.exper_descr = "Unsupervised learning phase with the leave-one-out " \
                       "objective and Oracle"
    self.cuda_device_ids = [0]
    self.epochs = 20
    self.learning_rate = 6e-05

    # GENERAL DATA RELATED #
    # multiplication by the number of devices is needed in order to
    # make it work in the multi-gpu setup
    self.train_groups_per_batch = 6 * len(self.cuda_device_ids)
    self.val_groups_per_batch = 20 * len(self.cuda_device_ids)
    self.eval_groups_per_batch = 13 * len(self.cuda_device_ids)

    # DATA SOURCES #
    self.train_early_term = 5000
    self.val_early_term = 500

    # GENERAL PATHS #
    self.experiments_folder = 'unsup'
    self.output_dir = 'runs/%s/%s' % (self.dataset, self.experiments_folder)
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = 'artifacts/amazon/checkpoints/unsupervised.tar'
def test_date_dir_names(self):
    """Testing correctness of dir names produced in 'date' mode."""
    sep = "__"
    n = 5
    date_format = '%m-%d:%H-%M-%S'
    path = os.path.join(self.base_path, "date")
    epc = ExperimentsPathController(folder_name_type='date', sep=sep,
                                    date_format=date_format)
    for _ in range(n):
        epc(path)
        exp_dir_name = datetime.now().strftime(date_format)
        dirs = os.listdir(path)
        self.assertIn(exp_dir_name, dirs)
        # sleep so that each iteration lands in a distinct second
        time.sleep(1)
def test_date_dir_names_duplicates(self):
    """
    Testing if a correct extra suffix is produced when multiple runs
    happen in the same second.
    """
    sep = "__"
    n = 5
    date_format = '%m-%d:%H-%M-%S'
    path = os.path.join(self.base_path, "date")
    epc = ExperimentsPathController(folder_name_type='date', sep=sep,
                                    date_format=date_format)
    for _ in range(n):
        epc(path)
    exp_dir_name = datetime.now().strftime(date_format)
    exp_dirs = [sep.join([exp_dir_name, str(i)]) for i in range(1, n)] + \
               [exp_dir_name]
    act_dirs = os.listdir(path)
    self.assertEqual(set(act_dirs), set(exp_dirs))
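# The four tests above pin down the controller's contract without showing
# its implementation. Below is a minimal sketch of a controller consistent
# with those tests -- an illustrative assumption, not the project's actual
# implementation (the class name carries a `Sketch` suffix to make that
# explicit): 'int' mode parses existing dir names and continues numbering,
# optionally appending `sep` + suffix; 'date' mode formats the current time
# and appends `sep` + counter on same-second collisions.

import os
from datetime import datetime


class ExperimentsPathControllerSketch:
    """Hypothetical controller inferred from the tests above."""

    def __init__(self, folder_name_type='int', sep='_',
                 date_format='%m-%d:%H-%M-%S'):
        assert folder_name_type in ('int', 'date')
        self.folder_name_type = folder_name_type
        self.sep = sep
        self.date_format = date_format

    def __call__(self, path, suffix=None):
        os.makedirs(path, exist_ok=True)
        if self.folder_name_type == 'int':
            # parse leading integers of existing dir names and continue
            # numbering from the largest one
            used = [int(d.split(self.sep)[0]) for d in os.listdir(path)
                    if d.split(self.sep)[0].isdigit()]
            name = str(max(used) + 1 if used else 1)
            if suffix is not None:
                name = self.sep.join([name, suffix])
        else:
            # 'date' mode: on a same-second collision, append an
            # incrementing counter after the separator
            stamp = datetime.now().strftime(self.date_format)
            name, count = stamp, 1
            while name in os.listdir(path):
                name = self.sep.join([stamp, str(count)])
                count += 1
        new_path = os.path.join(path, name)
        os.makedirs(new_path)
        return new_path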
def __init__(self):
    super(RunHP, self).__init__()

    # GENERAL #
    self.seed = 42
    self.cuda_device_id = 6
    self.device = 'cpu'  # 'cuda' or 'cpu'
    self.training_logging_step = 50  # how often to print internal metrics
    self.epochs = 10  # if set to 0, skips training and jumps straight to evaluation
    self.learning_rate = 0.0005
    self.grads_clip = 0.25

    # GENERAL DATA RELATED #
    self.dataset = 'yelp'
    self.train_max_groups_per_batch = 6
    self.val_max_groups_per_batch = 13
    self.eval_max_groups_per_batch = 20
    self.max_rev_per_group = 8

    # DATA SOURCES #
    # `early_term` limits the number of chunks per epoch
    self.train_early_term = None
    self.val_early_term = None
    self.gener_early_term = 2

    # GENERAL PATHS #
    self.root_path = 'copycat'
    self.experiments_folder = 'first_run'
    self.output_dir = f'{self.root_path}/runs/{self.dataset}/' \
                      f'{self.experiments_folder}'
    self.checkpoint_full_fn = 'checkpoint.tar'
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
    self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

    # DATA PATHS #
    self.base_data_path = f'data/{self.dataset}/'
    self.train_fp = comb_paths(self.base_data_path, "split/train/")
    self.val_fp = comb_paths(self.base_data_path, 'split/val/')
    self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
    self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
    self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

    # ANNEALING #
    self.c_m = 8.
    self.c_r = 0.8
    self.c_kl_ann_max_val = 1.
    self.c_kl_ann_batches = (self.epochs * self.train_early_term
                             if self.train_early_term
                             else self.epochs * 10000)
    self.z_m = 8.
    self.z_c = 0.8
    self.z_kl_ann_max_val = 1.
    self.z_kl_ann_batches = (self.epochs * self.train_early_term
                             if self.train_early_term
                             else self.epochs * 10000)

    # DECODING/GENERATION #
    self.beam_size = 5
    self.beam_len_norm = True
    self.beam_excl_words = []
    self.block_ngram_repeat = 3  # or None
    self.ngram_mirror_window = 3  # or None
    self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
    self.block_consecutive = True
    self.min_gen_seq_len = 20

    # POST-PROCESSING AND ANALYTICS #
    # sacremoses tokenization, detokenization, and truecasing helpers
    mt = MosesTokenizer()
    self.tok_func = partial(mt.tokenize, escape=False)
    self.sent_split_func = nltk.sent_tokenize
    dt = MosesDetokenizer()
    self.detok_func = partial(dt.detokenize, unescape=False)
    true_caser = MosesTruecaser(load_from=self.tcaser_model_path, is_asr=True)
    self.true_case_func = partial(true_caser.truecase, return_str=True,
                                  use_known=True)
    self.analytics_func = partial(ngram_seq_analysis,
                                  tokenizer=self.tok_func,
                                  sent_splitter=self.sent_split_func,
                                  n_grams_to_comp=(2, 3, 4))
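# `comb_paths` is imported from the project's utilities and is not shown in
# this section; judging by the call sites above (variadic string arguments
# returning a single path), it presumably just joins path components. A
# minimal stand-in under that assumption:

import os


def comb_paths(*parts):
    """Hypothetical stand-in: combine path components into one path."""
    return os.path.join(*parts)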