Пример #1
0
 def _create_vocab(self, tokens, special_tokens=None):
     """Build the token vocabulary and store it on ``self.vocabulary``.

     A ``Vocabulary`` sized by ``self.config.language.input_vocab_size`` is
     created and populated via ``build_vocabulary_from_tokens``.

     Args:
         tokens: Tokens handed to ``Vocabulary.build_vocabulary_from_tokens``.
         special_tokens: Special tokens forwarded to the vocabulary builder.
             When omitted (``None``), ``["<PAD>", "<GO>", "<EOS>"]`` is used.
     """
     if special_tokens is None:
         # Create the default list per call: a list literal in the signature
         # is shared between calls, so an in-place change made by the
         # vocabulary builder would silently leak into later invocations.
         special_tokens = ["<PAD>", "<GO>", "<EOS>"]
     vocab = Vocabulary(vocab_size=self.config.language.input_vocab_size)
     vocab.build_vocabulary_from_tokens(tokens, special_tokens=special_tokens)
     self.vocabulary = vocab
Пример #2
0
    def build(self, filename, split_size=0.8, authors=None, collections=None, families=None):
        """Load, split and index the dataset, storing the results on ``self``.

        The examples returned by ``self.load`` are shuffled in place and
        unzipped into four parallel columns: text, author, family and the
        column reported as "Text Type" (``ispr``). Each column is split into a
        training and a validation part with ``LMDataset.split``. Any of
        ``self.vocabulary``, ``self.authors_vocabulary`` and
        ``self.families_vocabulary`` that is still ``None`` is built from the
        training part only. Author and family labels are converted with
        ``string2id``; the text columns go through ``self._build_dataset``.

        Attributes written: ``raw_train_*`` / ``raw_val_*`` and ``train_*`` /
        ``val_*`` for ``x``, ``a``, ``f`` and ``ispr``, plus any vocabulary
        that had to be created.

        Args:
            filename: Forwarded to ``self.load``.
            split_size: Passed as ``train_size`` to ``LMDataset.split``.
            authors: Forwarded unchanged to ``self.load``.
            collections: Forwarded unchanged to ``self.load``.
            families: Forwarded unchanged to ``self.load``.

        Raises:
            ValueError: If ``self.load`` returns no examples.
        """

        def fit_label_vocab(labels, vocab_size):
            # Shared by the author and family vocabularies, which are built
            # the same way and differ only in their configured size.
            label_vocab = Vocabulary(vocab_size=vocab_size)
            label_vocab.build_vocabulary_from_tokens(list(labels))
            return label_vocab

        examples = self.load(filename, authors, collections, families)
        # len() rather than truthiness: safe for any sequence type, and fails
        # here with a clear message instead of at the 4-way unpack below.
        if len(examples) == 0:
            raise ValueError(f"No examples loaded from {filename!r}; cannot build the dataset.")

        random.shuffle(examples)
        # Each example must be a 4-item record; transpose rows into columns.
        raw_x, raw_a, raw_f, raw_ispr = zip(*examples)

        # Report the distinct values found in each label column.
        for heading, column in (("Authors\n", raw_a), ("Families\n", raw_f), ("Text Type\n", raw_ispr)):
            print("================")
            print(heading)
            print(set(column))

        # cleanup & tokenize data
        raw_x = self.tokenize([self.preprocess(ex) for ex in raw_x])

        # dataset split: every column is split with the same train_size
        self.raw_train_x, self.raw_val_x = LMDataset.split(raw_x, train_size=split_size)
        self.raw_train_a, self.raw_val_a = LMDataset.split(raw_a, train_size=split_size)
        self.raw_train_f, self.raw_val_f = LMDataset.split(raw_f, train_size=split_size)
        self.raw_train_ispr, self.raw_val_ispr = LMDataset.split(raw_ispr, train_size=split_size)

        if self.vocabulary is None:
            # Token vocabulary is built from the flattened training texts only.
            x_tokens = [token for sequence in self.raw_train_x for token in sequence]
            self._create_vocab(x_tokens)
            print(f"Vocabulary size: {len(self.vocabulary.rev_dictionary)}")

        if self.authors_vocabulary is None:
            # Author vocabulary comes from the training labels only.
            self.authors_vocabulary = fit_label_vocab(self.raw_train_a, self.config.author_vocab_size)
            print(f"Authors Vocabulary size: {len(self.authors_vocabulary.rev_dictionary)}")

        self.train_a = self.authors_vocabulary.string2id(self.raw_train_a)
        self.val_a = self.authors_vocabulary.string2id(self.raw_val_a)

        if self.families_vocabulary is None:
            # Family vocabulary comes from the training labels only.
            self.families_vocabulary = fit_label_vocab(self.raw_train_f, self.config.family_vocab_size)
            print(f"Families Vocabulary size: {len(self.families_vocabulary.rev_dictionary)}")

        self.train_f = self.families_vocabulary.string2id(self.raw_train_f)
        self.val_f = self.families_vocabulary.string2id(self.raw_val_f)

        # The "text type" column is stored as-is, without a vocabulary lookup.
        self.train_ispr = self.raw_train_ispr
        self.val_ispr = self.raw_val_ispr

        # creates x for train and validation with identical settings
        max_len = self.config.language.seq_max_len
        self.train_x = self._build_dataset(self.raw_train_x, insert_go=False, max_len=max_len, shuffle=False)
        self.val_x = self._build_dataset(self.raw_val_x, insert_go=False, max_len=max_len, shuffle=False)

        print(f"TRAINING SET LENGTH: {len(self.train_x)}\n")
        print(f"VALIDATION SET LENGTH: {len(self.val_x)}\n")