Example #1
def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)

        embedding = nlp.embedding.TokenEmbedding(
            unknown_token=None, unknown_lookup=model, allow_extend=True,
            unknown_autoextend=True)

        idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
        if not args.analogy_datasets:
            # Prune tokens not used in evaluation datasets
            eval_tokens_ = set(
                evaluation.get_tokens_in_evaluation_datasets(args))
            idx_to_token = [t for t in idx_to_token if t in eval_tokens_]
        if args.max_vocab_size:
            idx_to_token = idx_to_token[:args.max_vocab_size]

        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(idx_to_token))):
            embedding[idx_to_token] = model[idx_to_token]

    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

    return embedding
Example #2
def evaluate(args, model, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # GloVe does not support computing vectors for OOV words
        eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    # Compute word vectors for the evaluation tokens
    context = get_context(args)
    mx.nd.waitall()

    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = model[eval_tokens]

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0], logfile=os.path.join(
            args.logdir, 'similarity.tsv'), global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0], logfile=os.path.join(
                args.logdir, 'analogy.tsv'))

    return results
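
The token_embedding[eval_tokens] = model[eval_tokens] assignment above relies on TokenEmbedding's batched item assignment together with allow_extend=True. A minimal sketch of just that pattern, with 300-dimensional random vectors standing in for a trained model (assumes mxnet and gluonnlp are importable; the tokens are hypothetical):

import mxnet as mx
import gluonnlp as nlp

tokens = ['hello', 'world']
# Empty, extendable TokenEmbedding: assigning vectors for unseen tokens
# appends them to idx_to_token instead of raising a KeyError.
emb = nlp.embedding.TokenEmbedding(unknown_token=None, allow_extend=True)
emb[tokens] = mx.nd.random.uniform(shape=(len(tokens), 300))
print(emb['hello'].shape)  # -> (300,)
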
Example #3
def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)

        # Add OOV words if the token_embedding can impute them
        token_set = set()
        token_set.update(
            filter(lambda x: x in model,
                   evaluation.get_tokens_in_evaluation_datasets(args)))

        # OOV words will be imputed and added to the
        # token_embedding.idx_to_token etc.
        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(token_set))):
            embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                     allow_extend=True)
            idx_to_tokens = list(token_set)
            embedding[idx_to_tokens] = model[idx_to_tokens]

    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

    return embedding
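
A hedged usage sketch for this helper: the argparse.Namespace below is hypothetical and only spells out embedding_path; with a plain-text vector file the function takes the TokenEmbedding.from_file branch, so no other fields are read.

import argparse

args = argparse.Namespace(embedding_path='vectors.txt')  # hypothetical path
embedding = load_embedding_from_path(args)  # text file -> from_file branch
print(len(embedding.idx_to_token), 'tokens loaded')
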
Example #4
def load_and_evaluate(args):
    """Load the pretrained model and run evaluate."""
    context = utils.get_context(args)
    embedding, model_idx_to_token = get_model(args)

    idx_to_token_set = evaluation.get_tokens_in_evaluation_datasets(args)
    idx_to_token_set.update(model_idx_to_token)
    idx_to_token = list(idx_to_token_set)

    # Compute word vectors for the evaluation tokens
    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    os.makedirs(args.logdir, exist_ok=True)
    results = evaluation.evaluate_similarity(args,
                                             token_embedding,
                                             context[0],
                                             logfile=os.path.join(
                                                 args.logdir,
                                                 'similarity.tsv'))
    results += evaluation.evaluate_analogy(args,
                                           token_embedding,
                                           context[0],
                                           logfile=os.path.join(
                                               args.logdir, 'analogy.tsv'))
Example #5
def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)

        embedding = nlp.embedding.TokenEmbedding(
            unknown_token=None, unknown_lookup=model, allow_extend=True,
            unknown_autoextend=True)

        idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
        if not args.analogy_datasets:
            # Prune tokens not used in evaluation datasets
            eval_tokens_ = set(
                evaluation.get_tokens_in_evaluation_datasets(args))
            idx_to_token = [t for t in idx_to_token if t in eval_tokens_]
        if args.max_vocab_size:
            idx_to_token = idx_to_token[:args.max_vocab_size]

        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(idx_to_token))):
            embedding[idx_to_token] = model[idx_to_token]

    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

    return embedding
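
Because this variant passes unknown_lookup=model and unknown_autoextend=True, the returned embedding can still serve tokens that were pruned from idx_to_token: a lookup miss is imputed from the fastText subword model and cached back into the embedding. A hedged continuation sketch (assumes embedding came from the fastText branch above; the token is hypothetical):

vec = embedding['tyrannosaurus']  # OOV: vector imputed via unknown_lookup
print(vec.shape)
# unknown_autoextend=True should also append the imputed token:
print('tyrannosaurus' in embedding.idx_to_token)
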
Example #6
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)
        eval_tokens = list(eval_tokens_set)

    os.makedirs(args.logdir, exist_ok=True)

    # Compute word vectors for the evaluation tokens
    context = get_context(args)
    idx_to_token = eval_tokens
    mx.nd.waitall()
    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    results = evaluation.evaluate_similarity(args,
                                             token_embedding,
                                             context[0],
                                             logfile=os.path.join(
                                                 args.logdir,
                                                 'similarity.tsv'),
                                             global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(args,
                                               token_embedding,
                                               context[0],
                                               logfile=os.path.join(
                                                   args.logdir, 'analogy.tsv'))

    return results
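
For context, a hedged sketch of how such a helper is typically driven from a training loop; the eval_interval value is hypothetical, and embedding/vocab are assumed to be the trainable model and vocabulary matching the signature above:

eval_interval = 10000  # hypothetical number of steps between evaluations
if (global_step + 1) % eval_interval == 0:
    results = evaluate(args, embedding, vocab, global_step,
                       eval_analogy=False)
    for result in results:
        print(result)
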
Example #7
def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)

        embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                 unknown_lookup=model,
                                                 allow_extend=True,
                                                 unknown_autoextend=True)

        if args.analogy_datasets:
            # Pre-compute all words in vocabulary in case of analogy evaluation
            idx_to_token = sorted(model.token_to_idx,
                                  key=model.token_to_idx.get)
            if args.max_vocab_size:
                idx_to_token = idx_to_token[:args.max_vocab_size]
        else:
            idx_to_token = [
                t for t in evaluation.get_tokens_in_evaluation_datasets(args)
                if t in model.token_to_idx
            ]
            if args.max_vocab_size:
                assert len(idx_to_token) < args.max_vocab_size, \
                    'max_vocab_size unsupported for bin model without analogy evaluation.'

        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(idx_to_token))):
            embedding[idx_to_token] = model[idx_to_token]

    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

    return embedding
Example #8
                load_ngrams=args_.fasttext_load_ngrams,
                allow_extend=True,
                unknown_autoextend=True)
        else:
            token_embedding_ = nlp.embedding.create(
                args_.embedding_name, source=args_.embedding_source)
        name = '-' + args_.embedding_name + '-' + args_.embedding_source
    else:
        token_embedding_ = load_embedding_from_path(args_)
        name = ''

    enforce_max_size(token_embedding_, args_.max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)
    # Auto-extend token_embedding with unknown extra eval tokens
    if token_embedding_.unknown_lookup is not None:
        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
        # pylint: disable=pointless-statement
        token_embedding_[[
            t for t in eval_tokens - known_tokens
            if t in token_embedding_.unknown_lookup
        ]]

        if args_.max_vocab_size is not None and len(
                token_embedding_.idx_to_token) > args_.max_vocab_size:
            logging.warning(
                'Computing embeddings for OOV words that occur '
                'in the evaluation dataset leads to having '
                'more words than --max-vocab-size. '
                'Have %s words (--max-vocab-size %s)',
                len(token_embedding_.idx_to_token), args_.max_vocab_size)

    similarity_results = evaluation.evaluate_similarity(