Example #1
import torch
from torch.utils.data import Dataset

# `Vocab` is assumed to be a project-local class providing build_vocab(),
# load_from_file(), save(), convert_tokens_to_ids(), and len().


def build_vocab(files, vocabulary=None, mtl=False, name="src", save_dir="/"):
    vocabs = []

    if vocabulary is not None:
        # Load pre-built vocabularies from the given files.
        for v in vocabulary:
            print(f'Loading from {v}')
            vocab = Vocab()
            vocab.load_from_file(v)
            vocabs.append(vocab)
    else:
        if mtl:
            # Multi-task setting: build and save one vocabulary per file.
            for index, f in enumerate(files):
                vocab = Vocab()
                vocab.build_vocab([f])
                vocab.save(f"{save_dir}{name}.vocab.{index}.json")
                vocabs.append(vocab)
        else:
            # Single-task setting: build one shared vocabulary over all files.
            vocab = Vocab()
            vocab.build_vocab(files)
            vocab.save(f"{save_dir}{name}.vocab.json")
            vocabs.append(vocab)

    for index, vocab in enumerate(vocabs):
        print(f'vocabulary size {index+1:d}: {vocab.len():d}')

    return vocabs
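
A minimal usage sketch, assuming the project-local Vocab class above; the file names and save directory are hypothetical:

# Single shared source vocabulary built over two hypothetical files.
src_vocabs = build_vocab(["train.src", "valid.src"], name="src", save_dir="./")

# Multi-task variant: one vocabulary per file, each saved separately.
mtl_vocabs = build_vocab(["task1.src", "task2.src"], mtl=True, name="src", save_dir="./")
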
class ParallelDataset(Dataset):
    '''
    Builds a parallel dataset from aligned source/target files,
    padding or truncating every sentence to max_length tokens.
    '''
    def __init__(self,
                 source_name,
                 target_name,
                 max_length=300,
                 source_vocab=None,
                 target_vocab=None):

        self.data_source = self.read_file(source_name)
        self.data_target = self.read_file(target_name)

        self.max_length = max_length

        # Build vocabularies from the data files when none are supplied.
        self.source_vocab = source_vocab
        if source_vocab is None:
            self.source_vocab = Vocab()
            self.source_vocab.build_vocab([source_name])

        self.target_vocab = target_vocab
        if target_vocab is None:
            self.target_vocab = Vocab()
            self.target_vocab.build_vocab([target_name])

    def __len__(self):
        '''
        Return the number of sentence pairs in the dataset.
        '''
        return len(self.data_source)

    def __getitem__(self, index):
        # Pad/truncate both sentences, then map tokens to vocabulary ids.
        src_tokens = self.padding_sentence(self.data_source[index])
        tgt_tokens = self.padding_sentence(self.data_target[index])

        src_token_ids = self.source_vocab.convert_tokens_to_ids(src_tokens)
        tgt_token_ids = self.target_vocab.convert_tokens_to_ids(tgt_tokens)

        return torch.tensor(src_token_ids), torch.tensor(tgt_token_ids)

    def read_file(self, filename):
        '''
        Read a whitespace-tokenized file into a list of token lists, one per line.
        filename: path of the source/target file
        '''
        data = []
        with open(filename, "r") as f:
            for line in f:
                data.append(line.strip().split())
        return data

    def padding_sentence(self, tokens):
        '''
        Pad a sentence: add <sos>/<eos> markers and fix its length to max_length.
        tokens: list of tokens of a sentence
        '''
        tokens = ['<sos>'] + tokens + ['<eos>']

        if len(tokens) < self.max_length:
            # Pad short sentences up to max_length.
            tokens = tokens + ['<pad>'] * (self.max_length - len(tokens))
        else:
            # Truncate long sentences, keeping the final <eos> marker.
            tokens = tokens[:self.max_length - 1] + ['<eos>']

        return tokens

    def vocabs(self):
        '''
        Return the source and target vocabularies.
        '''
        return self.source_vocab, self.target_vocab
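
A minimal sketch of feeding the dataset to a PyTorch DataLoader; the corpus file names are hypothetical:

from torch.utils.data import DataLoader

# Hypothetical parallel corpus; any whitespace-tokenized files work.
dataset = ParallelDataset("train.src", "train.tgt", max_length=300)
source_vocab, target_vocab = dataset.vocabs()

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for src_batch, tgt_batch in loader:
    # Each batch is a LongTensor of shape (batch_size, max_length).
    print(src_batch.shape, tgt_batch.shape)
    break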