def evaluate(args, model, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # GloVe does not support computing vectors for OOV words
        eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    # Compute their word vectors
    context = get_context(args)
    mx.nd.waitall()

    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = model[eval_tokens]

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'),
        global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0],
            logfile=os.path.join(args.logdir, 'analogy.tsv'))

    return results
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        if not args.ngram_buckets:
            # Word2Vec does not support computing vectors for OOV words
            eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    if not os.path.isdir(args.logdir):
        os.makedirs(args.logdir)

    # Compute their word vectors
    context = get_context(args)
    mx.nd.waitall()

    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = embedding[eval_tokens]

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'),
        global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0],
            logfile=os.path.join(args.logdir, 'analogy.tsv'))

    return results
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # Word2Vec does not support computing vectors for OOV words
        eval_tokens = list(filter(lambda t: t in vocab, eval_tokens_set))

    os.makedirs(args.logdir, exist_ok=True)

    # Compute their word vectors
    context = get_context(args)
    idx_to_token = eval_tokens
    mx.nd.waitall()

    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'),
        global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0],
            logfile=os.path.join(args.logdir, 'analogy.tsv'),
            global_step=global_step)

    return results
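# Hedged usage note (not part of the original scripts): the evaluate helpers
# above rely on a `get_context` utility to pick the device on which the
# evaluation word vectors are computed. Below is a minimal sketch of such a
# helper, assuming an `args.gpu` list of device ids; the attribute name and
# the CPU fallback are assumptions for illustration, not the scripts' actual
# implementation.
import mxnet as mx


def get_context(args):
    """Return the MXNet contexts to run on: the requested GPUs, else CPU."""
    if not getattr(args, 'gpu', None):
        return [mx.cpu()]
    return [mx.gpu(int(i)) for i in args.gpu]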
enforce_max_size(token_embedding_, args_.max_vocab_size)
known_tokens = set(token_embedding_.idx_to_token)

# Auto-extend token_embedding with unknown extra eval tokens
if token_embedding_.unknown_lookup is not None:
    eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
    # pylint: disable=pointless-statement
    token_embedding_[[
        t for t in eval_tokens - known_tokens
        if t in token_embedding_.unknown_lookup
    ]]

    if len(token_embedding_.idx_to_token) > args_.max_vocab_size:
        logging.warning(
            'Computing embeddings for OOV words that occur '
            'in the evaluation dataset lead to having '
            'more words than --max-vocab-size. '
            'Have %s words (--max-vocab-size %s)',
            len(token_embedding_.idx_to_token), args_.max_vocab_size)

similarity_results = evaluation.evaluate_similarity(
    args_, token_embedding_, ctx,
    logfile=os.path.join(args_.logdir, 'similarity{}.tsv'.format(name)))
analogy_results = evaluation.evaluate_analogy(
    args_, token_embedding_, ctx,
    logfile=os.path.join(args_.logdir, 'analogy{}.tsv'.format(name)))
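# Hedged sketch (an assumption, not the repository's actual code): the
# `enforce_max_size` helper called above presumably truncates a GluonNLP
# TokenEmbedding in place to its first `size` tokens. The private attribute
# names `_idx_to_token`, `_idx_to_vec` and `_token_to_idx` follow
# TokenEmbedding's internals but should be checked against the real
# implementation.
def enforce_max_size(token_embedding, size):
    """Truncate token_embedding in place so it holds at most `size` tokens."""
    if size is not None and len(token_embedding.idx_to_token) > size:
        token_embedding._idx_to_token = token_embedding._idx_to_token[:size]
        token_embedding._idx_to_vec = token_embedding._idx_to_vec[:size]
        token_embedding._token_to_idx = {
            token: idx
            for idx, token in enumerate(token_embedding._idx_to_token)
        }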
else:
    token_embedding_ = load_embedding_from_path(args_)
    name = ''

enforce_max_size(token_embedding_, args_.analogy_max_vocab_size)
known_tokens = set(token_embedding_.idx_to_token)

if args_.similarity_datasets:
    with utils.print_time('find relevant tokens for similarity'):
        tokens = evaluation.get_similarity_task_tokens(args_)
    vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
    with utils.print_time('set {} embeddings'.format(len(tokens))):
        vocab.set_embedding(token_embedding_)
    evaluation.evaluate_similarity(
        args_, vocab.embedding, ctx,
        logfile=os.path.join(args_.logdir, 'similarity{}.tsv'.format(name)))
if args_.analogy_datasets:
    with utils.print_time('extend open vocabulary with '
                          'OOV tokens for analogy'):
        tokens = evaluation.get_analogy_task_tokens(args_)
        if token_embedding_.unknown_token is not None:
            tokens.update(token_embedding_.idx_to_token[1:])
        else:
            tokens.update(token_embedding_.idx_to_token)
    vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
    with utils.print_time('set {} embeddings'.format(len(tokens))):
        vocab.set_embedding(token_embedding_)
    evaluation.evaluate_analogy(
        args_, vocab.embedding, ctx,
        logfile=os.path.join(args_.logdir, 'analogy{}.tsv'.format(name)))
else:
    token_embedding_ = load_embedding_from_path(args_)
    name = ''

enforce_max_size(token_embedding_, args_.max_vocab_size)
known_tokens = set(token_embedding_.idx_to_token)

# Auto-extend token_embedding with unknown extra eval tokens
if token_embedding_.unknown_lookup is not None:
    eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
    # pylint: disable=pointless-statement
    token_embedding_[[
        t for t in eval_tokens - known_tokens
        if t in token_embedding_.unknown_lookup
    ]]

    if args_.max_vocab_size is not None and len(
            token_embedding_.idx_to_token) > args_.max_vocab_size:
        logging.warning('Computing embeddings for OOV words that occur '
                        'in the evaluation dataset lead to having '
                        'more words than --max-vocab-size. '
                        'Have %s words (--max-vocab-size %s)',
                        len(token_embedding_.idx_to_token),
                        args_.max_vocab_size)

similarity_results = evaluation.evaluate_similarity(
    args_, token_embedding_, ctx,
    logfile=os.path.join(args_.logdir, 'similarity{}.tsv'.format(name)))
analogy_results = evaluation.evaluate_analogy(
    args_, token_embedding_, ctx,
    logfile=os.path.join(args_.logdir, 'analogy{}.tsv'.format(name)))
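# Hedged sketch (an assumption for illustration): the `load_embedding_from_path`
# helper used in the else-branches above likely wraps
# nlp.embedding.TokenEmbedding.from_file to read pretrained vectors from a file
# passed on the command line. The `embedding_path` attribute name is an
# assumption, not taken from the original scripts.
def load_embedding_from_path(args_):
    """Load a TokenEmbedding from a word-vector file on disk."""
    return nlp.embedding.TokenEmbedding.from_file(args_.embedding_path)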