def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)
        embedding = nlp.embedding.TokenEmbedding(
            unknown_token=None, unknown_lookup=model, allow_extend=True,
            unknown_autoextend=True)
        idx_to_token = sorted(model._token_to_idx,
                              key=model._token_to_idx.get)
        if not args.analogy_datasets:
            # Prune tokens not used in evaluation datasets
            eval_tokens_ = set(
                evaluation.get_tokens_in_evaluation_datasets(args))
            idx_to_token = [t for t in idx_to_token if t in eval_tokens_]
        if args.max_vocab_size:
            idx_to_token = idx_to_token[:args.max_vocab_size]
        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(idx_to_token))):
            embedding[idx_to_token] = model[idx_to_token]
    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
    return embedding

def evaluate(args, model, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)

        # GloVe does not support computing vectors for OOV words
        eval_tokens_set = filter(lambda t: t in vocab, eval_tokens_set)

        eval_tokens = list(eval_tokens_set)

    # Compute their word vectors
    context = get_context(args)
    mx.nd.waitall()
    token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                   allow_extend=True)
    token_embedding[eval_tokens] = model[eval_tokens]

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'),
        global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0],
            logfile=os.path.join(args.logdir, 'analogy.tsv'))
    return results

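# Usage sketch (illustrative, not part of the original script): calling the
# `evaluate` helper above from a training loop. `args`, `model` and `vocab`
# are assumed to come from the surrounding training script; note that the
# helper caches `eval_tokens` in a module-level global on its first call, so
# the token set is only computed once across evaluation steps.
def _example_evaluation_step(args, model, vocab, global_step):
    # Include the slower analogy evaluation only when it is enabled on the
    # command line, matching the assert inside `evaluate`.
    return evaluate(args, model, vocab, global_step,
                    eval_analogy=not args.no_eval_analogy)
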
def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)
        # Add OOV words if the token_embedding can impute them
        token_set = set()
        token_set.update(
            filter(lambda x: x in model,
                   evaluation.get_tokens_in_evaluation_datasets(args)))
        # OOV words will be imputed and added to the
        # token_embedding.idx_to_token etc.
        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(token_set))):
            embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                     allow_extend=True)
            idx_to_tokens = list(token_set)
            embedding[idx_to_tokens] = model[idx_to_tokens]
    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
    return embedding

def load_and_evaluate(args):
    """Load the pretrained model and run evaluate."""
    context = utils.get_context(args)
    embedding, model_idx_to_token = get_model(args)
    idx_to_token_set = evaluation.get_tokens_in_evaluation_datasets(args)
    idx_to_token_set.update(model_idx_to_token)
    idx_to_token = list(idx_to_token_set)

    # Compute their word vectors
    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    os.makedirs(args.logdir, exist_ok=True)
    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'))
    results += evaluation.evaluate_analogy(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'analogy.tsv'))
    return results

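# Illustrative sketch (attribute names hypothetical except where read by the
# helpers above): the minimal argparse-like namespace `load_and_evaluate`
# expects. A real run builds this via the script's argument parser;
# SimpleNamespace only makes the required fields explicit here.
import types

_example_args = types.SimpleNamespace(
    gpu=None,        # assumed to be read by utils.get_context (CPU if None)
    logdir='logs',   # similarity.tsv / analogy.tsv are written here
    # ... plus the model flags read by get_model() and the dataset flags
    # read by evaluation.get_tokens_in_evaluation_datasets()
)
# load_and_evaluate(_example_args)
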
def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
    """Evaluation helper"""
    if 'eval_tokens' not in globals():
        global eval_tokens

        eval_tokens_set = evaluation.get_tokens_in_evaluation_datasets(args)
        if not args.no_eval_analogy:
            eval_tokens_set.update(vocab.idx_to_token)
        eval_tokens = list(eval_tokens_set)

    os.makedirs(args.logdir, exist_ok=True)

    # Compute their word vectors
    context = get_context(args)
    idx_to_token = eval_tokens
    mx.nd.waitall()
    token_embedding = embedding.to_token_embedding(idx_to_token,
                                                   ctx=context[0])

    results = evaluation.evaluate_similarity(
        args, token_embedding, context[0],
        logfile=os.path.join(args.logdir, 'similarity.tsv'),
        global_step=global_step)
    if eval_analogy:
        assert not args.no_eval_analogy
        results += evaluation.evaluate_analogy(
            args, token_embedding, context[0],
            logfile=os.path.join(args.logdir, 'analogy.tsv'))
    return results

def load_embedding_from_path(args):
    """Load a TokenEmbedding."""
    if 'bin' in args.embedding_path:
        with utils.print_time('load fastText model.'):
            model = \
                nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                    args.embedding_path)
        embedding = nlp.embedding.TokenEmbedding(unknown_token=None,
                                                 unknown_lookup=model,
                                                 allow_extend=True,
                                                 unknown_autoextend=True)
        if args.analogy_datasets:
            # Pre-compute all words in vocabulary in case of analogy
            # evaluation, in vocabulary index order
            idx_to_token = sorted(model.token_to_idx,
                                  key=model.token_to_idx.get)
            if args.max_vocab_size:
                idx_to_token = idx_to_token[:args.max_vocab_size]
        else:
            idx_to_token = [
                t for t in evaluation.get_tokens_in_evaluation_datasets(args)
                if t in model.token_to_idx
            ]
            if args.max_vocab_size:
                assert len(idx_to_token) < args.max_vocab_size, \
                    'max_vocab_size unsupported for bin model ' \
                    'without analogy evaluation.'
        with utils.print_time('compute vectors from subwords '
                              'for {} words.'.format(len(idx_to_token))):
            embedding[idx_to_token] = model[idx_to_token]
    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
    return embedding

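# Usage sketch (illustrative): loading embeddings with the helper above. A
# path containing 'bin' takes the fastText branch; anything else is read as
# a plain text vector file via TokenEmbedding.from_file. The file name and
# helper name are hypothetical.
def _example_load(path='wiki.en.vec'):
    import types
    args = types.SimpleNamespace(embedding_path=path, analogy_datasets=[],
                                 max_vocab_size=None)
    embedding = load_embedding_from_path(args)
    # Look up the vector for a token (an mx.nd.NDArray row).
    return embedding['hello']
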
                load_ngrams=args_.fasttext_load_ngrams, allow_extend=True,
                unknown_autoextend=True)
        else:
            token_embedding_ = nlp.embedding.create(
                args_.embedding_name, source=args_.embedding_source)
        name = '-' + args_.embedding_name + '-' + args_.embedding_source
    else:
        token_embedding_ = load_embedding_from_path(args_)
        name = ''

    enforce_max_size(token_embedding_, args_.max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)

    # Auto-extend token_embedding with unknown extra eval tokens
    if token_embedding_.unknown_lookup is not None:
        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
        # pylint: disable=pointless-statement
        token_embedding_[[
            t for t in eval_tokens - known_tokens
            if t in token_embedding_.unknown_lookup
        ]]
        if args_.max_vocab_size is not None and \
                len(token_embedding_.idx_to_token) > args_.max_vocab_size:
            logging.warning(
                'Computing embeddings for OOV words that occur '
                'in the evaluation dataset leads to having '
                'more words than --max-vocab-size. '
                'Have %s words (--max-vocab-size %s)',
                len(token_embedding_.idx_to_token), args_.max_vocab_size)

    similarity_results = evaluation.evaluate_similarity(

                load_ngrams=args_.fasttext_load_ngrams, allow_extend=True,
                unknown_autoextend=True)
        else:
            token_embedding_ = nlp.embedding.create(
                args_.embedding_name, source=args_.embedding_source)
        name = '-' + args_.embedding_name + '-' + args_.embedding_source
    else:
        token_embedding_ = load_embedding_from_path(args_)
        name = ''

    enforce_max_size(token_embedding_, args_.max_vocab_size)
    known_tokens = set(token_embedding_.idx_to_token)

    # Auto-extend token_embedding with unknown extra eval tokens
    if token_embedding_.unknown_lookup is not None:
        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
        # pylint: disable=pointless-statement
        token_embedding_[[
            t for t in eval_tokens - known_tokens
            if t in token_embedding_.unknown_lookup
        ]]
        if args_.max_vocab_size is not None and \
                len(token_embedding_.idx_to_token) > args_.max_vocab_size:
            logging.warning(
                'Computing embeddings for OOV words that occur '
                'in the evaluation dataset leads to having '
                'more words than --max-vocab-size. '
                'Have %s words (--max-vocab-size %s)',
                len(token_embedding_.idx_to_token), args_.max_vocab_size)
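
# Note on the `pylint: disable=pointless-statement` idiom above: indexing a
# TokenEmbedding that was built with allow_extend=True,
# unknown_autoextend=True and an unknown_lookup computes vectors for the
# requested OOV tokens as a side effect and appends them to idx_to_token.
# Minimal sketch; `ft_model` is assumed to be a loaded
# FasttextEmbeddingModel (see load_embedding_from_path above).
def _example_oov_imputation(ft_model):
    emb = nlp.embedding.TokenEmbedding(unknown_token=None, allow_extend=True,
                                       unknown_autoextend=True,
                                       unknown_lookup=ft_model)
    # pylint: disable=pointless-statement
    emb[['unseenword']]  # side effect: 'unseenword' is added to the vocab
    assert 'unseenword' in emb.idx_to_token
    return emb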