Пример #1
0
def get_train(data_path, n_train):
    """Load the training explanations and labels found under ``data_path``.

    When ``n_train`` is not -1, the directory ``train_<n_train>`` inside
    ``data_path`` is deleted if it exists, recreated, and filled by calling
    ``copy_first_k_lines_txt`` on ``preproc1_expl_1.train`` and
    ``labels.train`` (presumably keeping the first ``n_train`` lines of each,
    judging by the helper's name). The data is then read from that
    directory. With ``n_train == -1`` the files are read directly from
    ``data_path``.

    Args:
        data_path: Directory holding the ``.train`` files; its name must
            contain the substring ``'eSNLI'``.
        n_train: Value forwarded to ``copy_first_k_lines_txt``, or -1 to use
            the files in ``data_path`` as they are.

    Returns:
        A dict with two entries: ``'label'``, a numpy array of the
        ``NLI_DIC_LABELS`` values for each line of the labels file, and
        ``'expl_1'``, a list of the right-stripped explanation lines.

    Raises:
        AssertionError: If ``'eSNLI'`` is not in ``data_path`` or the two
            files do not have the same number of lines.
        KeyError: If a label line is not a key of ``NLI_DIC_LABELS``.
    """
    assert 'eSNLI' in data_path

    data_type = 'train'
    expl_name = 'preproc1_expl_1.' + data_type
    label_name = 'labels.' + data_type

    data_path_train = data_path
    if n_train != -1:
        data_path_train = os.path.join(data_path, "train_" + str(n_train))
        # Always rebuild the subset so stale files from a previous run
        # cannot leak into this one.
        if os.path.exists(data_path_train):
            shutil.rmtree(data_path_train)
        makedirs(data_path_train)
        for file_name in (expl_name, label_name):
            copy_first_k_lines_txt(os.path.join(data_path, file_name),
                                   os.path.join(data_path_train, file_name),
                                   n_train)

    expl_path = os.path.join(data_path_train, expl_name)
    label_path = os.path.join(data_path_train, label_name)

    # Context managers make sure the handles are closed even on error.
    with open(expl_path, 'r') as expl_file:
        explanations = [line.rstrip() for line in expl_file]

    with open(label_path, 'r') as label_file:
        labels = np.array(
            [NLI_DIC_LABELS[line.rstrip('\n')] for line in label_file])

    assert len(labels) == len(explanations)
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("{} TRAIN  {}".format(data_path, len(explanations)))

    return {'label': labels, 'expl_1': explanations}
Пример #2
0
def get_train(data_path, preproc, min_freq, n_train):
    """Load premises, hypotheses, labels and explanations for training.

    The explanation file name is ``<freq_prefix><preproc>expl_1.train``,
    where ``freq_prefix`` is ``"UNK_freq_<min_freq>_"`` when ``min_freq > 0``
    and empty otherwise.

    When ``n_train`` is not -1, the directory
    ``train_<n_train>_freq<min_freq>`` inside ``data_path`` is deleted if it
    exists, recreated, and filled by calling ``copy_first_k_lines_txt`` on
    each of the four input files (presumably keeping the first ``n_train``
    lines of each, judging by the helper's name). The data is then read from
    that directory. With ``n_train == -1`` the files are read directly from
    ``data_path``.

    Args:
        data_path: Directory holding ``s1.train``, ``s2.train``,
            ``labels.train`` and the explanation file.
        preproc: Prefix inserted before ``expl_1.train`` in the explanation
            file name.
        min_freq: Frequency threshold used to build the file-name prefix
            and the subset directory name.
        n_train: Value forwarded to ``copy_first_k_lines_txt``, or -1 to use
            the files in ``data_path`` as they are.

    Returns:
        A dict with ``'s1'``, ``'s2'`` and ``'expl_1'`` (lists of
        right-stripped lines) and ``'label'`` (numpy array of the
        ``NLI_DIC_LABELS`` values for each line of the labels file).

    Raises:
        AssertionError: If the four files do not have the same line count.
        KeyError: If a label line is not a key of ``NLI_DIC_LABELS``.
    """
    data_type = 'train'

    freq_prefix = ""
    if min_freq > 0:
        freq_prefix = "UNK_freq_" + str(min_freq) + "_"

    # File names are computed once and shared by the copy step and the read
    # step, so the subset directory always contains exactly the files that
    # are opened below (previously the copy step hard-coded
    # "UNK_freq_<min_freq>_preproc1_" regardless of `preproc`/`min_freq`).
    s1_name = 's1.' + data_type
    s2_name = 's2.' + data_type
    expl_name = freq_prefix + preproc + 'expl_1.' + data_type
    label_name = 'labels.' + data_type

    data_path_train = data_path
    if n_train != -1:
        data_path_train = os.path.join(
            data_path, "train_" + str(n_train) + "_freq" + str(min_freq))
        # Always rebuild the subset so stale files from a previous run
        # cannot leak into this one.
        if os.path.exists(data_path_train):
            shutil.rmtree(data_path_train)
        makedirs(data_path_train)
        for file_name in (s1_name, s2_name, expl_name, label_name):
            copy_first_k_lines_txt(os.path.join(data_path, file_name),
                                   os.path.join(data_path_train, file_name),
                                   n_train)

    # Context managers make sure the handles are closed even on error.
    with open(os.path.join(data_path_train, s1_name), 'r') as s1_file:
        premises = [line.rstrip() for line in s1_file]
    with open(os.path.join(data_path_train, s2_name), 'r') as s2_file:
        hypotheses = [line.rstrip() for line in s2_file]
    with open(os.path.join(data_path_train, expl_name), 'r') as expl_file:
        explanations = [line.rstrip() for line in expl_file]
    with open(os.path.join(data_path_train, label_name), 'r') as label_file:
        labels = np.array(
            [NLI_DIC_LABELS[line.rstrip('\n')] for line in label_file])

    assert len(premises) == len(hypotheses) == len(labels) == len(
        explanations)
    print(data_path, 'TRAIN ', len(premises))

    return {
        's1': premises,
        's2': hypotheses,
        'label': labels,
        'expl_1': explanations,
    }
Пример #3
0
# Accepted values for params.encoder_type
# ('InnerAttentionMILAEncoder' is deliberately left out).
encoder_types = [
    'BLSTMEncoder',
    'BLSTMprojEncoder',
    'BGRUlastEncoder',
    'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder',
    'ConvNetEncoder',
    'LSTMEncoder',
]

# Reject unsupported option values before any work is done.
assert params.encoder_type in encoder_types, (
    "encoder_type must be in " + str(encoder_types))
assert params.decoder_type in ('gru', 'lstm')
assert params.train_set in ('eSNLI', 'ALLeNLI')
assert params.att_type in ('dot', 'lin')

# Forward the user's choice of deterministic cuDNN kernels to torch.
torch.backends.cudnn.deterministic = params.cudnn_deterministic

# Results go to a directory suffixed with the training-set name.
params.results_dir += "_" + params.train_set
makedirs(params.results_dir)

# Encode the main hyper-parameters in the title used when saving.
params.save_title += "".join([
    "_dec", str(params.decoder_type.upper()),
    "_", params.optimizer,
    "_Enc", str(params.enc_rnn_dim),
    "_Dec", str(params.dec_rnn_dim),
    "_att_hid", str(params.att_hid_dim),
    "_bs", str(params.batch_size),
    "_gpu", str(params.gpu),
    "__encT", str(params.max_T_encoder),
    "__decT", str(params.max_T_decoder),
])

# The settings below only appear in the title when they differ from
# 512 / 'BLSTMEncoder' / 15 (presumably the defaults -- confirm against
# the argument parser).
if params.fc_dim != 512:
    params.save_title += "_MLP_dim" + str(params.fc_dim)
if params.encoder_type != 'BLSTMEncoder':
    params.save_title += '_' + params.encoder_type
if params.min_freq != 15:
    params.save_title += "_min_freq" + str(params.min_freq)
Пример #4
0

# Parse the command line; a GPU id must always be supplied.
params = parser.parse_args()
assert params.gpu is not None

# Accepted values for params.encoder_type
# ('InnerAttentionMILAEncoder' is deliberately left out).
encoder_types = [
	'BLSTMEncoder',
	'BLSTMprojEncoder',
	'BGRUlastEncoder',
	'InnerAttentionYANGEncoder',
	'InnerAttentionNAACLEncoder',
	'ConvNetEncoder',
	'LSTMEncoder',
]

# Reject unsupported option values before any work is done.
assert params.encoder_type in encoder_types, (
	"encoder_type must be in " + str(encoder_types))
assert params.decoder_type in ('gru', 'lstm')
assert params.train_set in ('eSNLI', 'ALLeNLI')
assert params.att_type in ('dot', 'lin')

# Forward the user's choice of deterministic cuDNN kernels to torch.
torch.backends.cudnn.deterministic = params.cudnn_deterministic

# Results go to a directory suffixed with the training-set name.
params.results_dir += "_" + params.train_set
makedirs(params.results_dir)

# Encode the main hyper-parameters in the title used when saving.
params.save_title += "".join([
	"_dec", str(params.decoder_type.upper()),
	"_", params.optimizer,
	"_Enc", str(params.enc_rnn_dim),
	"_Dec", str(params.dec_rnn_dim),
	"_att_hid", str(params.att_hid_dim),
	"_bs", str(params.batch_size),
	"_gpu", str(params.gpu),
	"__encT", str(params.max_T_encoder),
	"__decT", str(params.max_T_decoder),
])

# The settings below only appear in the title when they differ from
# 512 / 'BLSTMEncoder' / 15 (presumably the parser defaults -- confirm).
if params.fc_dim != 512:
	params.save_title += "_MLP_dim" + str(params.fc_dim)
if params.encoder_type != 'BLSTMEncoder':
	params.save_title += '_' + params.encoder_type
if params.min_freq != 15:
	params.save_title += "_min_freq" + str(params.min_freq)

if "sgd" in params.optimizer: