Example #1
import torch
from torch.utils.data import Dataset

# `Vocab` is assumed to be a project-local class providing build_vocab(),
# load_from_file(), save(), convert_tokens_to_ids(), and len().


def build_vocab(files, vocabulary=None, mtl=False, name="src", save_dir="/"):
    vocabs = []

    if vocabulary is not None:
        # Load pre-built vocabularies from the given files.
        for v in vocabulary:
            print(f'Loading from {v}')
            vocab = Vocab()
            vocab.load_from_file(v)
            vocabs.append(vocab)
    else:
        if mtl:
            # Multi-task setting: build and save one vocabulary per file.
            for index, f in enumerate(files):
                vocab = Vocab()
                vocab.build_vocab([f])
                vocab.save(f"{save_dir}{name}.vocab.{index}.json")
                vocabs.append(vocab)
        else:
            # Single-task setting: build one shared vocabulary over all files.
            vocab = Vocab()
            vocab.build_vocab(files)
            vocab.save(f"{save_dir}{name}.vocab.json")
            vocabs.append(vocab)

    for index, vocab in enumerate(vocabs):
        print(f'vocabulary size {index+1:d}: {vocab.len():d}')

    return vocabs
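
A minimal usage sketch, assuming the project-local Vocab class above; the file names and save directory are hypothetical:

# Single shared source vocabulary built over two hypothetical files.
src_vocabs = build_vocab(["train.src", "valid.src"], name="src", save_dir="./")

# Multi-task variant: one vocabulary per file, each saved separately.
mtl_vocabs = build_vocab(["task1.src", "task2.src"], mtl=True, name="src", save_dir="./")
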
class ParallelDataset(Dataset):
    '''
    Builds a parallel dataset from aligned source/target files,
    padding or truncating every sentence to max_length tokens.
    '''
    def __init__(self,
                 source_name,
                 target_name,
                 max_length=300,
                 source_vocab=None,
                 target_vocab=None):

        self.data_source = self.read_file(source_name)
        self.data_target = self.read_file(target_name)

        self.max_length = max_length

        # Build vocabularies from the data files when none are supplied.
        self.source_vocab = source_vocab
        if source_vocab is None:
            self.source_vocab = Vocab()
            self.source_vocab.build_vocab([source_name])

        self.target_vocab = target_vocab
        if target_vocab is None:
            self.target_vocab = Vocab()
            self.target_vocab.build_vocab([target_name])

    def __len__(self):
        '''
        Return the number of sentence pairs in the dataset.
        '''
        return len(self.data_source)

    def __getitem__(self, index):
        # Pad/truncate both sentences, then map tokens to vocabulary ids.
        src_tokens = self.padding_sentence(self.data_source[index])
        tgt_tokens = self.padding_sentence(self.data_target[index])

        src_token_ids = self.source_vocab.convert_tokens_to_ids(src_tokens)
        tgt_token_ids = self.target_vocab.convert_tokens_to_ids(tgt_tokens)

        return torch.tensor(src_token_ids), torch.tensor(tgt_token_ids)

    def read_file(self, filename):
        '''
        Read a whitespace-tokenized file into a list of token lists, one per line.
        filename: path of the source/target file
        '''
        data = []
        with open(filename, "r") as f:
            for line in f:
                data.append(line.strip().split())
        return data

    def padding_sentence(self, tokens):
        '''
        Pad a sentence: add <sos>/<eos> markers and fix its length to max_length.
        tokens: list of tokens of a sentence
        '''
        tokens = ['<sos>'] + tokens + ['<eos>']

        if len(tokens) < self.max_length:
            # Pad short sentences up to max_length.
            tokens = tokens + ['<pad>'] * (self.max_length - len(tokens))
        else:
            # Truncate long sentences, keeping the final <eos> marker.
            tokens = tokens[:self.max_length - 1] + ['<eos>']

        return tokens

    def vocabs(self):
        '''
        Return the source and target vocabularies.
        '''
        return self.source_vocab, self.target_vocab
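
A minimal sketch of feeding the dataset to a PyTorch DataLoader; the corpus file names are hypothetical:

from torch.utils.data import DataLoader

# Hypothetical parallel corpus; any whitespace-tokenized files work.
dataset = ParallelDataset("train.src", "train.tgt", max_length=300)
source_vocab, target_vocab = dataset.vocabs()

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for src_batch, tgt_batch in loader:
    # Each batch is a LongTensor of shape (batch_size, max_length).
    print(src_batch.shape, tgt_batch.shape)
    break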