Example #1
from gluonnlp import Vocab, data, embedding


def get_vocab(datasets):
    # Collect every token from every example across all datasets.
    all_words = [
        word for dataset in datasets for item in dataset for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    # `args` is the script's argparse namespace; `args.embedding_dim` selects
    # the GloVe dimensionality (e.g. 50, 100, 200 or 300 for glove.6B).
    glove = embedding.create('glove',
                             source='glove.6B.' + str(args.embedding_dim) +
                             'd')
    vocab.set_embedding(glove)
    return vocab
Example #2

import gluonnlp
from gluonnlp import Vocab, data


def test_join_embedding():
    # Build two vocabularies over the same tokens and attach a different
    # pre-trained fastText embedding to each (Chinese and simple-English wiki).
    counter = data.Counter(["love", "走秀", "vacation"])
    vocab1 = Vocab(counter)
    vocab2 = Vocab(counter)
    chinese_embedding = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    eng_embedding = gluonnlp.embedding.create('fasttext', source='wiki.simple')

    vocab1.set_embedding(chinese_embedding)
    vocab2.set_embedding(eng_embedding)

    # Element-wise sum of the two embeddings of the same token.
    print(vocab1.embedding['vacation'] + vocab2.embedding['vacation'])
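Both snippets follow the same pattern: build a Vocab from token counts, attach a pre-trained embedding with set_embedding, then index vocab.embedding by token. A minimal, self-contained sketch of that pattern (assuming gluonnlp and mxnet are installed and the glove.6B.50d source can be downloaded):

import gluonnlp as nlp

# Count tokens, build a vocabulary, and attach 50-dimensional GloVe vectors.
counter = nlp.data.count_tokens("the quick brown fox jumps over the lazy dog".split())
vocab = nlp.Vocab(counter)
vocab.set_embedding(nlp.embedding.create('glove', source='glove.6B.50d'))

# Tokens found in the pre-trained file get their GloVe vector; tokens that are
# missing (including '<unk>') are mapped to all-zero vectors by default.
print(vocab.embedding['fox'].shape)    # (50,)
print(vocab.embedding['<unk>'].sum())  # 0.0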
Example #3
    def _get_vocabs(train_examples, dev_examples, emb_file_name,
                    is_cased_embedding, shrink_word_vocab, pool):
        """Create both word-level and character-level vocabularies. Vocabularies are built using
        data from both train and dev datasets.

        Parameters
        ----------
        train_examples : List[dict]
            Tokenized training examples
        dev_examples : List[dict]
            Tokenized dev examples
        emb_file_name : str
            GloVe embedding file name
        is_cased_embedding : bool
            When True, the provided embedding file is cased; otherwise it is uncased
        shrink_word_vocab : bool
            When True, only tokens that have embeddings in the embedding file are retained in
            the word_vocab. Otherwise, tokens without an embedding are kept as well
        pool : Pool
            Multiprocessing pool to use

        Returns
        -------
        word_vocab : Vocab
            Word-level vocabulary
        char_vocab : Vocab
            Char-level vocabulary
        """
        tic = time.time()
        print('Word counters receiving started.')

        word_mapper = SQuADAsyncVocabMapper()
        word_reducer = SQuADAsyncVocabReducer()
        word_mapped = list(
            tqdm.tqdm(word_mapper.run_async(
                itertools.chain(train_examples, dev_examples), pool),
                      total=len(train_examples) + len(dev_examples)))
        word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(
            itertools.chain(*word_mapped)),
                                     total=len(word_mapped))
        word_counts = list(
            tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                      total=len(word_partitioned)))
        print('Word counters received in {:.3f} sec'.format(time.time() - tic))

        tic = time.time()
        print('Char counters receiving started.')
        char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
        char_reducer = SQuADAsyncVocabReducer()
        char_mapped = list(
            tqdm.tqdm(char_mapper.run_async(
                itertools.chain(train_examples, dev_examples), pool),
                      total=len(train_examples) + len(dev_examples)))
        char_partitioned = SQuADDataPipeline._partition(
            itertools.chain(*char_mapped))
        char_counts = list(
            tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                      total=len(char_partitioned)))
        print('Char counters received in {:.3f} sec'.format(time.time() - tic))

        embedding = nlp.embedding.create('glove', source=emb_file_name)

        if is_cased_embedding:
            # For a cased embedding, register several casings of every token so
            # that lookups succeed however the token appears in the data.
            word_counts = itertools.chain(*[
                [(token, count), (token.lower(), count),
                 (token.capitalize(), count), (token.upper(), count)]
                for token, count in word_counts
            ])
        else:
            word_counts = [(token.lower(), count) for token, count in word_counts]

        # When shrink_word_vocab is set, keep only tokens that have a
        # pre-trained vector in the loaded embedding.
        word_vocab = Vocab(
            {
                token: count
                for token, count in word_counts
                if not shrink_word_vocab or token in embedding.token_to_idx
            },
            bos_token=None,
            eos_token=None)
        word_vocab.set_embedding(embedding)
        char_vocab = Vocab({item[0]: item[1]
                            for item in char_counts},
                           bos_token=None,
                           eos_token=None)

        return word_vocab, char_vocab
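The shrink_word_vocab filter above tests membership in embedding.token_to_idx to decide whether a token has a pre-trained vector. A small sketch of the same idea on a toy counter (assuming gluonnlp is installed; the SQuAD-specific mappers and pool are not needed for this part):

import gluonnlp as nlp

# Toy counts standing in for the SQuAD word counters built above.
word_counts = nlp.data.count_tokens(['paris', 'qzxv_not_a_word', 'river'])

embedding = nlp.embedding.create('glove', source='glove.6B.50d')

# Keep only tokens that actually have a pre-trained vector.
shrunk = {token: count for token, count in word_counts.items()
          if token in embedding.token_to_idx}

vocab = nlp.Vocab(nlp.data.Counter(shrunk), bos_token=None, eos_token=None)
vocab.set_embedding(embedding)
print(vocab.idx_to_token)  # special tokens plus 'paris' and 'river'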