def get_train(data_path, n_train):
    """Load the e-SNLI training split (labels + free-form explanations).

    If ``n_train`` != -1, a subset directory ``train_<n_train>`` is
    (re)created under ``data_path`` holding only the first ``n_train``
    lines of each training file, and data is loaded from there.

    Args:
        data_path: root directory of the e-SNLI data (must contain 'eSNLI').
        n_train: number of training examples to keep, or -1 for all.

    Returns:
        dict with keys 'label' (np.array of NLI_DIC_LABELS ids) and
        'expl_1' (list of explanation strings), aligned by index.

    NOTE(review): relies on project helpers ``makedirs`` and
    ``copy_first_k_lines_txt`` (presumably copies the first k lines of a
    text file — confirm against their definitions).
    """
    assert 'eSNLI' in data_path
    data_path_train = data_path
    if n_train != -1:
        data_path_train = os.path.join(data_path, "train_" + str(n_train))
        # Rebuild the subset directory from scratch so a stale subset
        # from a previous run cannot be reused.
        if os.path.exists(data_path_train):
            shutil.rmtree(data_path_train)
        makedirs(data_path_train)
        for file in ["preproc1_expl_1.train", "labels.train"]:
            copy_first_k_lines_txt(os.path.join(data_path, file),
                                   os.path.join(data_path_train, file),
                                   n_train)

    data_type = 'train'
    target_label, expl_1 = {}, {}
    expl_1['path'] = os.path.join(data_path_train,
                                  'preproc1_expl_1.' + data_type)
    target_label['path'] = os.path.join(data_path_train,
                                        'labels.' + data_type)

    # Use context managers so file handles are closed deterministically
    # (the original left them open).
    with open(expl_1['path'], 'r') as f:
        expl_1['sent'] = [line.rstrip() for line in f]
    with open(target_label['path'], 'r') as f:
        target_label['data'] = np.array(
            [NLI_DIC_LABELS[line.rstrip('\n')] for line in f])

    # Labels and explanations must be aligned one-to-one.
    assert len(target_label['data']) == len(expl_1['sent'])
    # Python-3 print() for consistency with the rest of the file.
    print(data_path, 'TRAIN ', len(expl_1['sent']))

    data = {'label': target_label['data'], 'expl_1': expl_1['sent']}
    return data
def get_train(data_path, preproc, min_freq, n_train):
    """Load the training split: premises, hypotheses, labels, explanations.

    If ``n_train`` != -1, a subset directory
    ``train_<n_train>_freq<min_freq>`` is (re)created under ``data_path``
    holding only the first ``n_train`` lines of each training file, and
    data is loaded from there.

    Args:
        data_path: root directory of the (e-SNLI / ALLeNLI) data.
        preproc: preprocessing prefix of the explanation file
            (e.g. 'preproc1_').
        min_freq: if > 0, load the 'UNK_freq_<min_freq>_'-prefixed
            explanation file (rare words replaced by UNK upstream —
            confirm against the preprocessing script).
        n_train: number of training examples to keep, or -1 for all.

    Returns:
        dict with keys 's1', 's2', 'expl_1' (lists of strings) and
        'label' (np.array of NLI_DIC_LABELS ids), aligned by index.

    NOTE(review): relies on project helpers ``makedirs`` and
    ``copy_first_k_lines_txt`` defined elsewhere in this project.
    """
    #assert 'eSNLI' in data_path or 'ALLeNLI' in data_path
    data_path_train = data_path
    if n_train != -1:
        data_path_train = os.path.join(
            data_path, "train_" + str(n_train) + "_freq" + str(min_freq))
        if os.path.exists(data_path_train):
            shutil.rmtree(data_path_train)
        # create subset of n_train data from train only
        makedirs(data_path_train)
        for file in [
                "s1.train", "s2.train",
                "UNK_freq_" + str(min_freq) + "_preproc1_expl_1.train",
                "labels.train"
        ]:
            copy_first_k_lines_txt(os.path.join(data_path, file),
                                   os.path.join(data_path_train, file),
                                   n_train)

    freq_prefix = ""
    if min_freq > 0:
        freq_prefix = "UNK_freq_" + str(min_freq) + "_"

    data_type = 'train'
    s1, s2, target_label, expl_1 = {}, {}, {}, {}
    s1['path'] = os.path.join(data_path_train, 's1.' + data_type)
    s2['path'] = os.path.join(data_path_train, 's2.' + data_type)
    expl_1['path'] = os.path.join(
        data_path_train, freq_prefix + preproc + 'expl_1.' + data_type)
    target_label['path'] = os.path.join(data_path_train,
                                        'labels.' + data_type)

    # Context managers close the files deterministically (the original
    # left every handle open).
    with open(s1['path'], 'r') as f:
        s1['sent'] = [line.rstrip() for line in f]
    with open(s2['path'], 'r') as f:
        s2['sent'] = [line.rstrip() for line in f]
    with open(expl_1['path'], 'r') as f:
        expl_1['sent'] = [line.rstrip() for line in f]
    with open(target_label['path'], 'r') as f:
        target_label['data'] = np.array(
            [NLI_DIC_LABELS[line.rstrip('\n')] for line in f])

    # All four streams must be aligned one-to-one.
    assert len(s1['sent']) == len(s2['sent']) == len(
        target_label['data']) == len(expl_1['sent'])
    print(data_path, 'TRAIN ', len(s1['sent']))

    data = {
        's1': s1['sent'],
        's2': s2['sent'],
        'label': target_label['data'],
        'expl_1': expl_1['sent']
    }
    return data
encoder_types = [ 'BLSTMEncoder', 'BLSTMprojEncoder', 'BGRUlastEncoder', 'InnerAttentionYANGEncoder', 'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder' ] # 'InnerAttentionMILAEncoder', assert params.encoder_type in encoder_types, "encoder_type must be in " + str( encoder_types) assert params.decoder_type in ['gru', 'lstm'] assert params.train_set in ['eSNLI', 'ALLeNLI'] assert params.att_type in ['dot', 'lin'] # CUDNN deterministic torch.backends.cudnn.deterministic = params.cudnn_deterministic params.results_dir = params.results_dir + "_" + params.train_set makedirs(params.results_dir) params.save_title += "_dec" + str(params.decoder_type.upper( )) + "_" + params.optimizer + "_Enc" + str(params.enc_rnn_dim) + "_Dec" + str( params.dec_rnn_dim) + "_att_hid" + str(params.att_hid_dim) + "_bs" + str( params.batch_size) + "_gpu" + str(params.gpu) + "__encT" + str( params.max_T_encoder) + "__decT" + str(params.max_T_decoder) if params.fc_dim != 512: params.save_title += "_MLP_dim" + str(params.fc_dim) if params.encoder_type != 'BLSTMEncoder': params.save_title += '_' + params.encoder_type if params.min_freq != 15: params.save_title += "_min_freq" + str(params.min_freq)
# ---- Argument parsing, sanity checks, and run-title construction ----
# NOTE(review): near-duplicate of the validation tail above; `parser`,
# `makedirs`, and `torch` are defined/imported elsewhere in this file.
params = parser.parse_args()
assert(params.gpu is not None)

encoder_types = ['BLSTMEncoder', 'BLSTMprojEncoder', 'BGRUlastEncoder',
                 'InnerAttentionYANGEncoder', 'InnerAttentionNAACLEncoder',
                 'ConvNetEncoder', 'LSTMEncoder']  # 'InnerAttentionMILAEncoder',
assert params.encoder_type in encoder_types, "encoder_type must be in " + str(encoder_types)
assert params.decoder_type in ['gru', 'lstm']
assert params.train_set in ['eSNLI', 'ALLeNLI']
assert params.att_type in ['dot', 'lin']

# CUDNN deterministic
torch.backends.cudnn.deterministic = params.cudnn_deterministic

# Results go into a per-train-set directory, e.g. "<results_dir>_eSNLI".
params.results_dir = params.results_dir + "_" + params.train_set
makedirs(params.results_dir)

# Encode the main hyperparameters into the save title so each run's
# artifacts are distinguishable on disk.
params.save_title += "_dec" + str(params.decoder_type.upper()) + "_" + params.optimizer + "_Enc" + str(params.enc_rnn_dim) + "_Dec" + str(params.dec_rnn_dim) + "_att_hid" + str(params.att_hid_dim) + "_bs" + str(params.batch_size) + "_gpu" + str(params.gpu) + "__encT" + str(params.max_T_encoder) + "__decT" + str(params.max_T_decoder)

# Non-default settings are appended only when they differ from the
# defaults (512 / BLSTMEncoder / 15), keeping default-run titles short.
if params.fc_dim != 512:
    params.save_title += "_MLP_dim" + str(params.fc_dim)

if params.encoder_type != 'BLSTMEncoder':
    params.save_title += '_' + params.encoder_type

if params.min_freq != 15:
    params.save_title += "_min_freq" + str(params.min_freq)

# NOTE(review): this chunk is truncated here — the body of the branch
# below continues past the visible source.
if "sgd" in params.optimizer: