import argparse
import os

import datasets


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", choices=["cornell", "opensubs"],
                        help="Name of the dataset.")
    parser.add_argument("--max_len", type=int, default=10,
                        help="Max length of sentences to consider.")
    args = parser.parse_args()

    dataset_path = os.path.join("data", args.dataset)
    if args.dataset == "cornell":
        data = datasets.readCornellData(dataset_path, max_len=args.max_len)
    elif args.dataset == "opensubs":
        data = datasets.readOpensubsData(dataset_path, max_len=args.max_len)
    else:
        raise ValueError("Unrecognized dataset: {!r}".format(args.dataset))

    print("Size of dataset: {}".format(len(data)))
    print("First 10 training pairs:")
    for item in data[:10]:
        print(item)

    # Dump the (request, response) pairs as tab-separated lines.
    print("Writing to a .txt file")
    with open("data/req-res.txt", "w") as f:
        for item in data:
            f.write('{}\t{}\n'.format(item[0], item[1]))
    print("Done...")
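# A minimal sketch of running the script above, assuming main() is the entry
# point and the file is saved as (hypothetically) prepare_data.py next to a
# data/ folder containing the raw corpora:
#
#   python prepare_data.py cornell --max_len 10
#   python prepare_data.py opensubs --max_len 15
if __name__ == "__main__":
    main()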
def make_dataset(self, test_size=0.05, seed=12345):
    # Assumes module-level imports:
    #   import numpy as np
    #   import tensorflow as tf
    #   from sklearn.model_selection import train_test_split
    #   import datasets

    # Read (sentence, reply) pairs and wrap them in start/end tokens.
    cornell = datasets.readCornellData('cornell/', max_len=self.sent_len,
                                       kind=self.kind)
    sentences = ['<start> ' + i[0] + ' <eos>' for i in cornell]
    replies = ['<start> ' + i[1] + ' <eos>' for i in cornell]

    # Keep only pairs where BOTH sides fall within [min_len, max_len] tokens.
    sent_mask = [self.min_len <= len(i.split(' ')) <= self.max_len
                 for i in sentences]
    replies_mask = [self.min_len <= len(i.split(' ')) <= self.max_len
                    for i in replies]
    full_mask = [i and j for (i, j) in zip(sent_mask, replies_mask)]
    sentences = np.array(sentences)[full_mask].tolist()
    replies = np.array(replies)[full_mask].tolist()

    # Fit one tokenizer on both sides so source and target share a vocabulary;
    # index 0 is reserved for padding and out-of-vocabulary words map to <unk>.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=self.num_words, filters='', oov_token='<unk>')
    tokenizer.fit_on_texts(sentences + replies)
    sentences_en = tokenizer.texts_to_sequences(sentences)
    replies_en = tokenizer.texts_to_sequences(replies)
    sentences_en = tf.keras.preprocessing.sequence.pad_sequences(
        sentences_en, maxlen=None, padding='post', value=0)
    replies_en = tf.keras.preprocessing.sequence.pad_sequences(
        replies_en, maxlen=None, padding='post', value=0)

    # Train-test split.
    X_train, X_test, y_train, y_test = train_test_split(
        sentences_en, replies_en, test_size=test_size, random_state=seed)
    buffer_size = len(X_train)
    steps_per_epoch = buffer_size // self.batch_size

    # Batched tf.data pipelines; drop_remainder keeps every batch full-sized.
    dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    dataset = dataset.shuffle(buffer_size, seed=seed).batch(
        self.batch_size, drop_remainder=True)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    # Shuffle buffer sized to the validation split, not the training split.
    val_dataset = val_dataset.shuffle(len(X_test), seed=seed).batch(
        self.batch_size, drop_remainder=True)

    self.tokenizer = tokenizer
    self.dataset = dataset
    self.val_dataset = val_dataset
    self.steps_per_epoch = steps_per_epoch
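# Usage sketch for make_dataset above. The class name ChatbotData and every
# attribute value below are assumptions for illustration; the attributes are
# exactly the ones make_dataset reads (sent_len, kind, min_len, max_len,
# num_words, batch_size).
class ChatbotData:
    make_dataset = make_dataset  # bind the function above as a method

    def __init__(self):
        self.sent_len = 20      # max_len passed to readCornellData
        self.kind = 'all'       # hypothetical value for the `kind` argument
        self.min_len = 2        # shortest allowed sentence, in tokens
        self.max_len = 22       # longest allowed sentence, incl. <start>/<eos>
        self.num_words = 10000  # tokenizer vocabulary cap
        self.batch_size = 64

data = ChatbotData()
data.make_dataset(test_size=0.05, seed=12345)
for src, tgt in data.dataset.take(1):
    print(src.shape, tgt.shape)  # each: (batch_size, padded_sequence_length)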
def read_dataset(dataset_name, max_sentence_length):
    """Load (request, response) pairs for the named dataset."""
    dataset_path = 'data/{}'.format(dataset_name)
    if dataset_name == "cornell":
        data = datasets.readCornellData(dataset_path,
                                        max_len=max_sentence_length)
    elif dataset_name == "opensubs":
        data = datasets.readOpensubsData(dataset_path,
                                         max_len=max_sentence_length)
    elif dataset_name == 'twitter':
        # readTwitterData is assumed to be defined or imported elsewhere;
        # unlike the other readers, it takes no path or max_len argument.
        data = readTwitterData()
    else:
        raise ValueError("Unrecognized dataset: {!r}".format(dataset_name))
    return data
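# Example call, assuming the data/{name} folder layout used by the other
# loaders in this section:
pairs = read_dataset('cornell', max_sentence_length=10)
print(len(pairs))
print(pairs[0])  # a (request, response) pair of strings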
def __init__(self, paths):
    # Pre-trained dialogue embeddings and their dimensionality.
    self.embeddings, self.embeddings_dim = self._load_embeddings(
        paths['DIALOGUE_EMBEDDINGS'])
    # Pre-computed question vectors, unpickled from disk.
    self.question_vectors = unpickle_file(paths['QUESTION_VECTORS'])
    # Raw (question, answer) pairs from the Cornell movie-dialogue corpus.
    self.dialogues = datasets.readCornellData(paths['DIALOGUE_FOLDER'],
                                              max_len=100)
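# Construction sketch. Only the three dict keys come from the code above; the
# class name DialogueManager and the file paths are placeholders.
paths = {
    'DIALOGUE_EMBEDDINGS': 'models/dialogue_embeddings.tsv',
    'QUESTION_VECTORS': 'models/question_vectors.pkl',
    'DIALOGUE_FOLDER': 'data/cornell',
}
manager = DialogueManager(paths)
print(manager.embeddings_dim, len(manager.dialogues))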