def _create_vocab(self, tokens, special_tokens=None):
    """Build the token vocabulary and store it on ``self.vocabulary``.

    Args:
        tokens: Tokens handed to ``Vocabulary.build_vocabulary_from_tokens``.
        special_tokens: Special symbols forwarded to the vocabulary builder.
            When omitted (``None``), ``["<PAD>", "<GO>", "<EOS>"]`` is used.
    """
    if special_tokens is None:
        # A list literal in the signature is created once and shared by every
        # call; build a fresh one here so the builder can never leak
        # mutations from one call into the next.
        special_tokens = ["<PAD>", "<GO>", "<EOS>"]

    # The vocabulary size cap comes from the language section of the config.
    vocab = Vocabulary(vocab_size=self.config.language.input_vocab_size)
    vocab.build_vocabulary_from_tokens(tokens, special_tokens=special_tokens)
    self.vocabulary = vocab
def build(self, filename, split_size=0.8, authors=None, collections=None, families=None):
    """Load a corpus, split it into train/validation parts and encode them.

    The examples returned by ``self.load`` are shuffled in place and unpacked
    into four parallel columns (text, author, family, text type). Each column
    is split with ``LMDataset.split`` and stored on the instance as
    ``raw_train_*`` / ``raw_val_*``. Whichever of ``self.vocabulary``,
    ``self.authors_vocabulary`` and ``self.families_vocabulary`` is still
    ``None`` is built from the training part only. Finally the encoded
    ``train_*`` / ``val_*`` attributes are populated.

    Args:
        filename: Passed through to ``self.load``.
        split_size: Forwarded to ``LMDataset.split`` as ``train_size``.
        authors: Forwarded to ``self.load`` (presumably a filter -- confirm
            against ``load``).
        collections: Forwarded to ``self.load`` (presumably a filter --
            confirm against ``load``).
        families: Forwarded to ``self.load`` (presumably a filter -- confirm
            against ``load``).

    Raises:
        ValueError: If ``self.load`` returns no examples.
    """
    examples = self.load(filename, authors, collections, families)
    if len(examples) == 0:
        # Without this guard the 4-way unpack of ``zip(*examples)`` below
        # fails with an opaque "not enough values to unpack" message.
        raise ValueError(
            f"No examples loaded from {filename!r}; cannot build the dataset."
        )

    random.shuffle(examples)
    raw_x, raw_a, raw_f, raw_ispr = zip(*examples)

    # Report the distinct label values found in the corpus.
    print("================")
    print("Authors\n")
    print(set(raw_a))
    print("================")
    print("Families\n")
    print(set(raw_f))
    print("================")
    print("Text Type\n")
    print(set(raw_ispr))

    # Clean up and tokenize the text column.
    raw_x = self.tokenize([self.preprocess(ex) for ex in raw_x])

    # Split every column with the same train_size.
    # NOTE(review): the columns only stay aligned if LMDataset.split is a
    # deterministic positional split -- confirm.
    self.raw_train_x, self.raw_val_x = LMDataset.split(raw_x, train_size=split_size)
    self.raw_train_a, self.raw_val_a = LMDataset.split(raw_a, train_size=split_size)
    self.raw_train_f, self.raw_val_f = LMDataset.split(raw_f, train_size=split_size)
    self.raw_train_ispr, self.raw_val_ispr = LMDataset.split(raw_ispr, train_size=split_size)

    # Token vocabulary: built from the training texts only, unless one is
    # already set on the instance.
    if self.vocabulary is None:
        x_tokens = [item for sublist in self.raw_train_x for item in sublist]
        self._create_vocab(x_tokens)
    print(f"Vocabulary size: {len(self.vocabulary.rev_dictionary)}")

    # Author vocabulary, again from the training part only.
    if self.authors_vocabulary is None:
        vocab = Vocabulary(vocab_size=self.config.author_vocab_size)
        vocab.build_vocabulary_from_tokens(list(self.raw_train_a))
        self.authors_vocabulary = vocab
    print(f"Authors Vocabulary size: {len(self.authors_vocabulary.rev_dictionary)}")

    self.train_a = self.authors_vocabulary.string2id(self.raw_train_a)
    self.val_a = self.authors_vocabulary.string2id(self.raw_val_a)

    # Family vocabulary, again from the training part only.
    if self.families_vocabulary is None:
        vocab = Vocabulary(vocab_size=self.config.family_vocab_size)
        vocab.build_vocabulary_from_tokens(list(self.raw_train_f))
        self.families_vocabulary = vocab
    print(f"Families Vocabulary size: {len(self.families_vocabulary.rev_dictionary)}")

    self.train_f = self.families_vocabulary.string2id(self.raw_train_f)
    self.val_f = self.families_vocabulary.string2id(self.raw_val_f)

    # The text-type column is stored as-is, without a vocabulary lookup.
    self.train_ispr = self.raw_train_ispr
    self.val_ispr = self.raw_val_ispr

    # Build the model inputs for the training and validation parts.
    self.train_x = self._build_dataset(
        self.raw_train_x,
        insert_go=False,
        max_len=self.config.language.seq_max_len,
        shuffle=False,
    )
    self.val_x = self._build_dataset(
        self.raw_val_x,
        insert_go=False,
        max_len=self.config.language.seq_max_len,
        shuffle=False,
    )

    print(f"TRAINING SET LENGTH: {len(self.train_x)}\n")
    print(f"VALIDATION SET LENGTH: {len(self.val_x)}\n")