예제 #1
0
    def test_int_suffix(self):
        """
        1. Testing whether it creates a proper custom suffix for dir names.
        2. Whether a newly initialized epc parsed dir names properly.
        """
        path = os.path.join(self.base_path, "ints1")
        suffix = "dummy"
        sep = "^"

        # 1. Each created dir should be named "<int><sep><suffix>",
        # counting from 1.
        n = 4
        epc = ExperimentsPathController(folder_name_type='int', sep=sep)
        exp_dirs = [sep.join([str(i), suffix]) for i in range(1, n + 1)]

        for _ in range(n):
            epc(path, suffix=suffix)

        act_dirs = os.listdir(path)

        # assertSetEqual reports the exact differing elements on failure,
        # unlike assertTrue(a == b) which only prints "False is not true".
        self.assertSetEqual(set(act_dirs), set(exp_dirs))

        # 2. A freshly initialized controller must parse the existing
        # suffixed names and resume numbering at n + 1 (no suffix this time).
        m = 6
        epc = ExperimentsPathController(folder_name_type='int', sep=sep)
        new_exp_dirs = [str(i) for i in range(n + 1, n + m + 1)]
        for _ in range(m):
            epc(path)

        act_dirs = os.listdir(path)

        self.assertSetEqual(set(act_dirs), set(exp_dirs + new_exp_dirs))
예제 #2
0
    def __init__(self):
        """Configure the plug-in initialization run on reviews data."""
        super(PluginInitRunConfig, self).__init__()
        self.exper_descr = "Plug-in initialization phase on reviews."
        self.cuda_device_ids = [0]
        self.epochs = 13
        self.learning_rate = 1e-05

        #   GENERAL DATA RELATED  #
        # Batch sizes are scaled by the device count so that the per-device
        # load stays constant in the multi-gpu setup.
        n_devices = len(self.cuda_device_ids)
        self.train_groups_per_batch = 20 * n_devices
        self.val_groups_per_batch = 50 * n_devices
        self.eval_groups_per_batch = 17 * n_devices

        #   DATA SOURCES  #
        self.train_early_term = None
        self.val_early_term = None

        #   GENERAL PATHS   #
        self.experiments_folder = 'plugin_init'
        self.output_dir = 'runs/%s/%s' % (self.dataset,
                                          self.experiments_folder)
        path_controller = ExperimentsPathController()
        self.output_path = path_controller(self.output_dir)

        self.checkpoint_path = 'artifacts/amazon/checkpoints/plugin_init.tar'

        #   FREEZING AND UNFREEZING   #
        self.modules_to_unfreeze = ['plugin']
예제 #3
0
    def __init__(self):
        """Configure the joint tuning run (plug-in + memory attention)."""
        super(JointTuningRunConfig, self).__init__()
        self.exper_descr = "Joint tuning: the plug-in network and the memory attention."
        self.epochs = 33
        self.learning_rate = 0.0001

        #   GENERAL DATA RELATED  #
        self.train_groups_per_batch = 15
        self.val_groups_per_batch = 20
        self.eval_groups_per_batch = 17

        #   DATA SOURCES  #
        self.train_early_term = None
        self.val_early_term = None

        #   GENERAL PATHS   #
        self.experiments_folder = 'joint_tuning'
        self.output_dir = 'runs/%s/%s' % (self.dataset,
                                          self.experiments_folder)
        path_controller = ExperimentsPathController()
        self.output_path = path_controller(self.output_dir)

        self.checkpoint_path = 'artifacts/amazon/checkpoints/joint_tuning.tar'
        # self.checkpoint_path = None

        #   FREEZING AND UNFREEZING   #
        # Unfreeze the plug-in plus the memory-attention sub-module of each
        # of the 6 transformer layers.
        self.modules_to_unfreeze = ['plugin'] + [
            '_tr_stack.layers.%d.mem_attn' % layer for layer in range(6)
        ]
예제 #4
0
 def test_int_first_dir(self):
     """Testing if can create first directory with a proper int number."""
     path = os.path.join(self.base_path, "ints0")
     exp_new_path = os.path.join(path, '1')
     epc = ExperimentsPathController(folder_name_type='int')
     new_path = epc(path)
     # BUG FIX: the original `assertTrue(exp_new_path, new_path)` passed
     # `new_path` as the *msg* argument, so the assertion only checked that
     # `exp_new_path` is a truthy (non-empty) string and could never fail.
     self.assertEqual(exp_new_path, new_path)
예제 #5
0
    def __init__(self):
        """Configure the plug-in tuning run on summaries data."""
        super(PluginTuningRunConfig, self).__init__()
        self.exper_descr = "Plug-in tuning phase on summaries."
        self.epochs = 98
        self.learning_rate = 0.0007

        #   GENERAL DATA RELATED  #
        self.train_groups_per_batch = 30
        self.val_groups_per_batch = 50
        self.eval_groups_per_batch = 17

        #   DATA SOURCES  #
        self.train_early_term = None
        self.val_early_term = None

        #   GENERAL PATHS   #
        self.experiments_folder = 'plugin_tuning'
        self.output_dir = 'runs/%s/%s' % (self.dataset,
                                          self.experiments_folder)
        path_controller = ExperimentsPathController()
        self.output_path = path_controller(self.output_dir)

        self.checkpoint_path = 'artifacts/amazon/checkpoints/plugin_tuning.tar'

        #   FREEZING AND UNFREEZING   #
        self.modules_to_unfreeze = ['plugin']
예제 #6
0
    def __init__(self):
        """Configure the unsupervised leave-one-out training run."""
        super(UnsupRunConfig, self).__init__()
        self.exper_descr = "Unsupervised learning phase with the leave-one-out " \
                           "objective and Oracle"
        self.cuda_device_ids = [0]
        self.epochs = 20
        self.learning_rate = 6e-05

        #   GENERAL DATA RELATED  #
        # Batch sizes are scaled by the device count so that the per-device
        # load stays constant in the multi-gpu setup.
        n_devices = len(self.cuda_device_ids)
        self.train_groups_per_batch = 6 * n_devices
        self.val_groups_per_batch = 20 * n_devices
        self.eval_groups_per_batch = 13 * n_devices

        #   DATA SOURCES  #
        self.train_early_term = 5000
        self.val_early_term = 500

        #   GENERAL PATHS   #
        self.experiments_folder = 'unsup'
        self.output_dir = 'runs/%s/%s' % (self.dataset, self.experiments_folder)
        path_controller = ExperimentsPathController()
        self.output_path = path_controller(self.output_dir)

        self.checkpoint_path = 'artifacts/amazon/checkpoints/unsupervised.tar'
예제 #7
0
    def test_date_dir_names(self):
        """Testing correctness of date type produced dir_names."""
        sep = "__"
        n = 5
        date_format = '%m-%d:%H-%M-%S'
        path = os.path.join(self.base_path, "date")

        epc = ExperimentsPathController(folder_name_type='date',
                                        sep=sep,
                                        date_format=date_format)

        for _ in range(n):
            epc(path)
            exp_dir_name = datetime.now().strftime(date_format)
            dirs = os.listdir(path)
            # assertIn reports both the missing member and the container on
            # failure, unlike assertTrue(x in y).
            self.assertIn(exp_dir_name, dirs)
            # The date format has 1-second resolution; sleep so that every
            # iteration produces a distinct dir name.
            time.sleep(1)
예제 #8
0
    def test_date_dir_names_duplicates(self):
        """
        Testing if a correct extra prefix is produced when multiple runs happen
        in the same second.
        """
        sep = "__"
        n = 5
        date_format = '%m-%d:%H-%M-%S'
        path = os.path.join(self.base_path, "date")

        epc = ExperimentsPathController(folder_name_type='date',
                                        sep=sep,
                                        date_format=date_format)

        for _ in range(n):
            epc(path)

        # The first dir gets the bare date name; duplicates created within
        # the same second get an extra "<sep><counter>" part appended.
        exp_dir_name = datetime.now().strftime(date_format)
        exp_dirs = [sep.join([exp_dir_name, str(i)]) for i in range(1, n)] + \
                   [exp_dir_name]
        act_dirs = os.listdir(path)

        # assertSetEqual reports the exact differing elements on failure,
        # unlike assertTrue(a == b) which only prints "False is not true".
        self.assertSetEqual(set(act_dirs), set(exp_dirs))
예제 #9
0
    def __init__(self):
        """Hyper-parameters and paths for a training/evaluation run."""
        super(RunHP, self).__init__()

        #   GENERAL  #
        self.seed = 42
        self.cuda_device_id = 6
        self.device = 'cpu'  # 'cuda' or 'cpu'
        self.training_logging_step = 50  # how often to print internal metrics
        self.epochs = 10  # if set to 0 will immediately just to evaluation
        self.learning_rate = 0.0005
        self.grads_clip = 0.25

        # GENERAL DATA RELATED #
        self.dataset = 'yelp'
        self.train_max_groups_per_batch = 6
        self.val_max_groups_per_batch = 13
        self.eval_max_groups_per_batch = 20
        self.max_rev_per_group = 8

        #   DATA SOURCES  #
        # `early_term` limits the number of chunks per epoch
        self.train_early_term = None
        self.val_early_term = None
        self.gener_early_term = 2

        #  GENERAL PATHS   #
        self.root_path = 'copycat'
        self.experiments_folder = 'first_run'
        self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
        self.checkpoint_full_fn = 'checkpoint.tar'
        path_controller = ExperimentsPathController()
        self.output_path = path_controller(self.output_dir)
        self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
        self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

        #   DATA PATHS  #
        self.base_data_path = f'data/{self.dataset}/'
        self.train_fp = comb_paths(self.base_data_path, "split/train/")
        self.val_fp = comb_paths(self.base_data_path, 'split/val/')
        self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
        self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
        self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

        #   ANNEALING   #
        # Both KL terms are annealed over the same number of batches; when
        # `train_early_term` is unset, 10000 batches per epoch are assumed.
        ann_batches = self.epochs * (self.train_early_term or 10000)
        self.c_m = 8.
        self.c_r = 0.8
        self.c_kl_ann_max_val = 1.
        self.c_kl_ann_batches = ann_batches
        self.z_m = 8.
        self.z_c = 0.8
        self.z_kl_ann_max_val = 1.
        self.z_kl_ann_batches = ann_batches

        #   DECODING/GENERATION  #
        self.beam_size = 5
        self.beam_len_norm = True
        self.beam_excl_words = []
        self.block_ngram_repeat = 3  # or None
        self.ngram_mirror_window = 3  # or None
        self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
        self.block_consecutive = True
        self.min_gen_seq_len = 20

        #   POST-PROCESSING AND ANALYTICS #
        tokenizer = MosesTokenizer()
        self.tok_func = partial(tokenizer.tokenize, escape=False)
        self.sent_split_func = nltk.sent_tokenize
        detokenizer = MosesDetokenizer()
        self.detok_func = partial(detokenizer.detokenize, unescape=False)
        # The true-caser model is loaded from the path assigned above.
        truecaser = MosesTruecaser(load_from=self.tcaser_model_path,
                                   is_asr=True)
        self.true_case_func = partial(truecaser.truecase,
                                      return_str=True,
                                      use_known=True)
        self.analytics_func = partial(ngram_seq_analysis,
                                      tokenizer=self.tok_func,
                                      sent_splitter=self.sent_split_func,
                                      n_grams_to_comp=(2, 3, 4))