Example #1
    # Unrecognized --dataset value: report and abort
    print('Invalid argument for --dataset!')
    exit()

input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']])
output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']])

true_val = val_data['reply']
true_test = test_data['reply']
input_test = test_data['line']

filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
w2v_path = config['w2v_file']

print('[INFO] Tokenizing input and output sequences')
x, input_word_index = utils.tokenize_sequence(input_sentences, 
                                                filters, 
                                                config['encoder_num_tokens'], 
                                                config['encoder_vocab'])

y, output_word_index = utils.tokenize_sequence(output_sentences, 
                                                filters, 
                                                config['decoder_num_tokens'], 
                                                config['decoder_vocab'])

print('[INFO] Split data into train-validation-test sets')
dataset_sizes = [train_data.shape[0], val_data.shape[0], test_data.shape[0]]
x_train, y_train, x_val, y_val, x_test, y_test = utils.create_data_split(x, y, dataset_sizes)

encoder_embeddings_matrix = utils.create_embedding_matrix(input_word_index,
                                                          config['embedding_size'],
                                                          w2v_path)
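
For orientation, here is a minimal sketch of what utils.tokenize_sequence and utils.create_data_split might look like, assuming the helpers wrap the Keras Tokenizer and slice the concatenated corpus back apart using the recorded dataset sizes; the actual utils module in this project may differ.

# Speculative sketch of the helpers used above (assumed, not the project's actual code).
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_sequence(sentences, filters, max_num_tokens, vocab_size):
    """Fit a tokenizer and return padded index sequences plus its word-to-index dict."""
    tokenizer = Tokenizer(num_words=vocab_size, filters=filters)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_num_tokens, padding='post')
    return padded, tokenizer.word_index

def create_data_split(x, y, dataset_sizes):
    """Slice the concatenated arrays back into train/val/test in the order used above."""
    train_size, val_size, _ = dataset_sizes
    split_1, split_2 = train_size, train_size + val_size
    return (x[:split_1], y[:split_1],
            x[split_1:split_2], y[split_1:split_2],
            x[split_2:], y[split_2:])
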
Example #2
import numpy as np
from sklearn.model_selection import train_test_split

np.random.seed(1337)

snli_data = utils.get_sentences(file_path = config['data'])

print('[INFO] Number of sentences = {}'.format(len(snli_data)))

sentences = [s.strip() for s in snli_data]

np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences,
                                             filters,
                                             config['num_tokens'],
                                             config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
x_train, _x_val_test = train_test_split(x, test_size = 0.1, random_state = 10)
x_val, x_test = train_test_split(_x_val_test, test_size = 0.5, random_state = 10)

w2v = config['w2v_file']
embeddings_matrix = utils.create_embedding_matrix(word_index,
                                                  config['embedding_size'],
                                                  w2v)

# Re-calculate the vocab size based on the word_index dictionary
config['vocab_size'] = len(word_index)

#----------------------------------------------------------------#
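
A rough sketch of utils.create_embedding_matrix, assuming w2v_file points to a gensim-loadable word2vec file; the binary flag, the zero vector for the padding index, and the random initialisation of out-of-vocabulary words are all assumptions.

import numpy as np
from gensim.models import KeyedVectors

def create_embedding_matrix(word_index, embedding_size, w2v_path):
    """Row i of the returned matrix holds the pretrained vector for the word with
    index i; words absent from the word2vec model keep a small random vector."""
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    matrix = np.random.uniform(-0.05, 0.05, size=(len(word_index) + 1, embedding_size))
    matrix[0] = np.zeros(embedding_size)  # index 0 is reserved for padding
    for word, idx in word_index.items():
        if word in w2v:
            matrix[idx] = w2v[word]
    return matrix

Keeping small random vectors (rather than zeros) for out-of-vocabulary words lets those embeddings be fine-tuned from a non-degenerate starting point.
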
Example #3
        label_data = f.readlines()
    # One integer label per line
    for item in label_data:
        item = item.rstrip()
        labels.append(int(item))

    print('[INFO] Number of sentences = {}'.format(len(combined_data)))

    combined_sentences = [s.strip() for s in combined_data]
    input_sentences = [s.strip() for s in input_data]
    output_sentences = [s.strip() for s in output_data]

    print('[INFO] Tokenizing input and output sequences')
    filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
        
    word_index = utils.get_dict(combined_sentences, filters, config['num_tokens'], 100)
    input_sents = utils.tokenize_sequence(input_sentences, filters, config['num_tokens'], word_index)
    output_sents = utils.tokenize_sequence(output_sentences, filters, config['num_tokens'], word_index)

    print('[INFO] Split data into train-validation-test sets')
    input_train, input_val, output_train, output_val, label_train, label_val = train_test_split(
        input_sents, output_sents, labels, test_size = 0.05, random_state = 10)

    w2v = config['w2v_file']
    embeddings_matrix = utils.create_embedding_matrix(word_index,
                                                    config['embedding_size'],
                                                    w2v)

    # Re-calculate the vocab size based on the word_index dictionary
    config['vocab_size'] = len(word_index)

    #----------------------------------------------------------------#
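
Note that this variant's tokenize_sequence takes a pre-built word_index and returns only the padded sequences, unlike the two-return version in Examples #1 and #2. Below is a speculative sketch of get_dict and this variant, again assuming Keras preprocessing; treating the literal 100 passed to get_dict as a vocabulary cap is an assumption.

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_dict(sentences, filters, max_num_tokens, vocab_size):
    """Fit a tokenizer on the combined corpus and return its word-to-index dict.
    max_num_tokens is unused here; it is kept only to mirror the call above."""
    tokenizer = Tokenizer(num_words=vocab_size, filters=filters)
    tokenizer.fit_on_texts(sentences)
    return tokenizer.word_index

def tokenize_sequence(sentences, filters, max_num_tokens, word_index):
    """Map sentences to padded index sequences using an existing dictionary,
    silently dropping words that are not in word_index."""
    sequences = [[word_index[w] for w in text_to_word_sequence(s, filters=filters)
                  if w in word_index]
                 for s in sentences]
    return pad_sequences(sequences, maxlen=max_num_tokens, padding='post')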