Example #1
def evaluate(args, model, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # GloVe does not support computing vectors for OOV words
        eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    # Compute word vectors for the evaluation tokens
    context = get_context(args)
    mx.nd.waitall()

    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = model[eval_tokens]

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0], logfile=os.path.join(
            args.logdir, 'similarity.tsv'), global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0], logfile=os.path.join(
                args.logdir, 'analogy.tsv'))

    return results
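A minimal usage sketch for the helper above. It is illustrative only: `args`, `model`, `vocab` and the step counter `num_update` are assumed to be provided by the surrounding training script, which is not shown here.

# Illustrative only: `args`, `model`, `vocab` and the hypothetical step
# counter `num_update` come from the surrounding training script.
results = evaluate(args, model, vocab, global_step=num_update,
                   eval_analogy=not args.no_eval_analogy)
for result in results:
    print(result)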
Example #2
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        if not args.ngram_buckets:
            # Word2Vec does not support computing vectors for OOV words
            eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    if not os.path.isdir(args.logdir):
        os.makedirs(args.logdir)

    # Compute word vectors for the evaluation tokens
    context = get_context(args)
    mx.nd.waitall()

    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = embedding[eval_tokens]

    results = evaluation.evaluate_similarity(args,
                                             token_embedding,
                                             context[0],
                                             logfile=os.path.join(
                                                 args.logdir,
                                                 'similarity.tsv'),
                                             global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(args,
                                               token_embedding,
                                               context[0],
                                               logfile=os.path.join(
                                                   args.logdir, 'analogy.tsv'))

    return results
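All of these helpers call a `get_context` function that is not shown in the snippets. A plausible reconstruction, assuming a hypothetical `args.gpu` option that holds a list of GPU ids (empty or absent meaning CPU):

import mxnet as mx

def get_context(args):
    """Hypothetical sketch of the undefined get_context helper: turn an
    assumed ``args.gpu`` list of device ids into MXNet contexts, falling
    back to a single CPU context when no GPU ids are given."""
    if getattr(args, 'gpu', None):
        return [mx.gpu(int(gpu_id)) for gpu_id in args.gpu]
    return [mx.cpu()]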
Example #3
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # Word2Vec does not support computing vectors for OOV words
        eval_tokens = list(filter(lambda t: t in vocab, eval_tokens_set))

    os.makedirs(args.logdir, exist_ok=True)

    # Compute word vectors for the evaluation tokens
    context = get_context(args)
    idx_to_token = eval_tokens
    mx.nd.waitall()
    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    results = evaluation.evaluate_similarity(args,
                                             token_embedding,
                                             context[0],
                                             logfile=os.path.join(
                                                 args.logdir,
                                                 'similarity.tsv'),
                                             global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(args,
                                               token_embedding,
                                               context[0],
                                               logfile=os.path.join(
                                                   args.logdir, 'analogy.tsv'),
                                               global_step=global_step)

    return results
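Unlike Examples #1 and #2, this variant lets the model build the `TokenEmbedding` itself via `to_token_embedding`, which suits subword-aware models that can compose vectors for tokens outside `vocab`. Under the simplifying assumption that indexing the model with a token list returns one vector per token, a rough equivalent using only the assignment API from the earlier examples would be:

# Rough equivalent of embedding.to_token_embedding(idx_to_token, ctx=context[0]),
# assuming `embedding[idx_to_token]` returns one vector per token.
token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                               allow_extend=True)
token_embedding[idx_to_token] = embedding[idx_to_token]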
Example #4
    enforce_max_size(token_embedding_, args_.max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)
    # Auto-extend token_embedding with unknown extra eval tokens
    if token_embedding_.unknown_lookup is not None:
        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
        # pylint: disable=pointless-statement
        token_embedding_[[
            t for t in eval_tokens - known_tokens
            if t in token_embedding_.unknown_lookup
        ]]

        if len(token_embedding_.idx_to_token) > args_.max_vocab_size:
            logging.warning(
                'Computing embeddings for OOV words that occur '
                'in the evaluation dataset leads to having '
                'more words than --max-vocab-size. '
                'Have %s words (--max-vocab-size %s)',
                len(token_embedding_.idx_to_token), args_.max_vocab_size)

    similarity_results = evaluation.evaluate_similarity(
        args_,
        token_embedding_,
        ctx,
        logfile=os.path.join(args_.logdir, 'similarity{}.tsv'.format(name)))
    analogy_results = evaluation.evaluate_analogy(
        args_,
        token_embedding_,
        ctx,
        logfile=os.path.join(args_.logdir, 'analogy{}.tsv'.format(name)))
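`enforce_max_size` is referenced but not defined in this fragment. A sketch of what such a helper could look like, written here as returning a truncated copy (the original presumably truncates in place) and reusing the public `idx_to_token` / `idx_to_vec` attributes and the assignment API seen in the earlier examples:

import gluonnlp as nlp

def enforce_max_size(token_embedding, max_size):
    """Hypothetical sketch: cap a TokenEmbedding at ``max_size`` tokens by
    copying the first ``max_size`` entries into a fresh, extendable
    TokenEmbedding.  Returns the input unchanged if it is already small
    enough or if no limit is given."""
    if not max_size or len(token_embedding.idx_to_token) <= max_size:
        return token_embedding
    truncated = nlp.embedding.TokenEmbedding(unknown_token=None,
                                             allow_extend=True)
    kept_tokens = token_embedding.idx_to_token[:max_size]
    truncated[kept_tokens] = token_embedding.idx_to_vec[:max_size]
    return truncated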
Example #5
    else:
        token_embedding_ = load_embedding_from_path(args_)
        name = ''

    enforce_max_size(token_embedding_, args_.analogy_max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)

    if args_.similarity_datasets:
        with utils.print_time('find relevant tokens for similarity'):
            tokens = evaluation.get_similarity_task_tokens(args_)
        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            vocab.set_embedding(token_embedding_)
        evaluation.evaluate_similarity(
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'similarity{}.tsv'.format(name)))
    if args_.analogy_datasets:
        with utils.print_time('extend open vocabulary with '
                              'OOV tokens for analogy'):
            tokens = evaluation.get_analogy_task_tokens(args_)
            if token_embedding_.unknown_token is not None:
                tokens.update(token_embedding_.idx_to_token[1:])
            else:
                tokens.update(token_embedding_.idx_to_token)
        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            vocab.set_embedding(token_embedding_)
        evaluation.evaluate_analogy(
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'analogy{}.tsv'.format(name)))
Example #6
    else:
        token_embedding_ = load_embedding_from_path(args_)
        name = ''

    enforce_max_size(token_embedding_, args_.max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)
    # Auto-extend token_embedding with unknown extra eval tokens
    if token_embedding_.unknown_lookup is not None:
        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
        # pylint: disable=pointless-statement
        token_embedding_[[
            t for t in eval_tokens - known_tokens
            if t in token_embedding_.unknown_lookup
        ]]

        if args_.max_vocab_size is not None and len(
                token_embedding_.idx_to_token) > args_.max_vocab_size:
            logging.warning('Computing embeddings for OOV words that occur '
                            'in the evaluation dataset leads to having '
                            'more words than --max-vocab-size. '
                            'Have %s words (--max-vocab-size %s)',
                            len(token_embedding_.idx_to_token),
                            args_.max_vocab_size)

    similarity_results = evaluation.evaluate_similarity(
        args_, token_embedding_, ctx, logfile=os.path.join(
            args_.logdir, 'similarity{}.tsv'.format(name)))
    analogy_results = evaluation.evaluate_analogy(
        args_, token_embedding_, ctx, logfile=os.path.join(
            args_.logdir, 'analogy{}.tsv'.format(name)))
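`load_embedding_from_path` is also only referenced, never shown. Assuming a hypothetical `args_.embedding_path` option pointing at a word-vector text file (one token and its vector per line), it could be as simple as:

import gluonnlp as nlp

def load_embedding_from_path(args_):
    """Hypothetical sketch: load pretrained vectors from a plain-text file
    given by an assumed ``args_.embedding_path`` argument."""
    return nlp.embedding.TokenEmbedding.from_file(args_.embedding_path)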