# Fallback from the (truncated) --dataset selection above: bail out on an
# unrecognized value
print('Invalid argument for --dataset !')
exit()

input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']])
output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']])

true_val = val_data['reply']
true_test = test_data['reply']
input_test = test_data['line']

filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
w2v_path = config['w2v_file']

print('[INFO] Tokenizing input and output sequences')
x, input_word_index = utils.tokenize_sequence(input_sentences,
                                              filters,
                                              config['encoder_num_tokens'],
                                              config['encoder_vocab'])

y, output_word_index = utils.tokenize_sequence(output_sentences,
                                               filters,
                                               config['decoder_num_tokens'],
                                               config['decoder_vocab'])

print('[INFO] Split data into train-validation-test sets')
dataset_sizes = [train_data.shape[0], val_data.shape[0], test_data.shape[0]]

x_train, y_train, x_val, y_val, x_test, y_test = utils.create_data_split(x, y, dataset_sizes)

encoder_embeddings_matrix = utils.create_embedding_matrix(input_word_index,
                                                          config['embedding_size'],
                                                          w2v_path)
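# --- Sketch of utils.create_data_split (assumption, not the repo's actual code) ---
# The helper is only called above; given that dataset_sizes holds the row counts of
# the train/val/test frames concatenated in that order, a minimal version could
# simply slice the tokenized arrays back into consecutive partitions:
def create_data_split(x, y, dataset_sizes):
    """Split padded sequences x, y into train/val/test along consecutive sizes."""
    train_size, val_size, _ = dataset_sizes
    x_train, y_train = x[:train_size], y[:train_size]
    x_val, y_val = x[train_size:train_size + val_size], y[train_size:train_size + val_size]
    x_test, y_test = x[train_size + val_size:], y[train_size + val_size:]
    return x_train, y_train, x_val, y_val, x_test, y_test
# -----------------------------------------------------------------------------------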
import numpy as np
from sklearn.model_selection import train_test_split

np.random.seed(1337)

snli_data = utils.get_sentences(file_path = config['data'])
print('[INFO] Number of sentences = {}'.format(len(snli_data)))

sentences = [s.strip() for s in snli_data]
np.random.shuffle(sentences)

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
x, word_index = utils.tokenize_sequence(sentences,
                                        filters,
                                        config['num_tokens'],
                                        config['vocab_size'])

print('[INFO] Split data into train-validation-test sets')
x_train, _x_val_test = train_test_split(x, test_size = 0.1, random_state = 10)
x_val, x_test = train_test_split(_x_val_test, test_size = 0.5, random_state = 10)

w2v = config['w2v_file']
embeddings_matrix = utils.create_embedding_matrix(word_index,
                                                  config['embedding_size'],
                                                  w2v)

# Re-calculate the vocab size based on the word_index dictionary
config['vocab_size'] = len(word_index)

#----------------------------------------------------------------#
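# --- Sketch of utils.create_embedding_matrix (assumption, not the repo's actual code) ---
# create_embedding_matrix is used in each script but not defined in this excerpt. One
# plausible version, assuming config['w2v_file'] points to a binary word2vec file, fills
# a (vocab_size + 1, embedding_size) matrix with pretrained vectors and leaves words
# missing from the word2vec vocabulary randomly initialized:
import numpy as np
from gensim.models import KeyedVectors

def create_embedding_matrix(word_index, embedding_dim, w2v_path):
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)  # binary format assumed
    # +1 row because tokenizer indices start at 1; row 0 is reserved for padding
    matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_dim))
    matrix[0] = np.zeros(embedding_dim)
    for word, idx in word_index.items():
        if word in w2v:
            matrix[idx] = w2v[word]
    return matrix
# -----------------------------------------------------------------------------------------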
# f is the open label file handle from the (truncated) context above
label_data = f.readlines()
for item in label_data:
    item = item.rstrip()
    labels.append(int(item))

print('[INFO] Number of sentences = {}'.format(len(combined_data)))

combined_sentences = [s.strip() for s in combined_data]
input_sentences = [s.strip() for s in input_data]
output_sentences = [s.strip() for s in output_data]

print('[INFO] Tokenizing input and output sequences')
filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
word_index = utils.get_dict(combined_sentences, filters, config['num_tokens'], 100)
input_sents = utils.tokenize_sequence(input_sentences, filters, config['num_tokens'], word_index)
output_sents = utils.tokenize_sequence(output_sentences, filters, config['num_tokens'], word_index)

print('[INFO] Split data into train-validation-test sets')
input_train, input_val, output_train, output_val, label_train, label_val = train_test_split(input_sents,
                                                                                             output_sents,
                                                                                             labels,
                                                                                             test_size = 0.05,
                                                                                             random_state = 10)

w2v = config['w2v_file']
embeddings_matrix = utils.create_embedding_matrix(word_index, config['embedding_size'], w2v)

# Re-calculate the vocab size based on the word_index dictionary
config['vocab_size'] = len(word_index)

#----------------------------------------------------------------#
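# --- Sketch of this script's utils.tokenize_sequence variant (assumption) ---
# Unlike the earlier scripts, tokenize_sequence is called here with a pre-built
# word_index and returns only the padded id sequences. A minimal version along
# those lines, using Keras' preprocessing utilities (the real helper may also
# insert start/end tokens or handle OOV words differently):
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_sequence(sentences, filters, max_num_tokens, word_index):
    sequences = []
    for sent in sentences:
        words = text_to_word_sequence(sent, filters=filters, lower=True)
        sequences.append([word_index.get(w, 0) for w in words])  # 0 for unknown words
    return pad_sequences(sequences, maxlen=max_num_tokens, padding='post', truncating='post')
# ------------------------------------------------------------------------------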