help="Embedding dimension") parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") # parse parameters params = parser.parse_args() # check parameters assert params.src_lang, "source language undefined" assert os.path.isfile(params.src_emb) assert not params.tgt_lang or os.path.isfile(params.tgt_emb) assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) # build logger / model / trainer / evaluator logger = initialize_exp(params) src_emb, tgt_emb, mapping, _ = build_model(params, False) trainer = Trainer(src_emb, tgt_emb, mapping, None, params) evaluator = Evaluator(trainer) # run evaluations to_log = OrderedDict({'n_iter': 0}) evaluator.monolingual_wordsim(to_log) # evaluator.monolingual_wordanalogy(to_log) if params.tgt_lang: evaluator.crosslingual_wordsim(to_log) evaluator.word_translation(to_log) evaluator.sent_translation(to_log) # evaluator.dist_mean_cosine(to_log)
# Evaluation-script chunk (cross-lingual variant): define the remaining CLI
# options (including hyper-params alpha / inv_K), validate them, build the
# model/trainer/evaluator, and run cross-lingual evaluations only.
# NOTE(review): this chunk arrived with all newlines collapsed onto one
# physical line (everything after the first '#' was dead text); the line
# structure is restored here.
parser.add_argument("--src_emb", type=str, default="data/fastText/wiki.en.vec",
                    help="Reload source embeddings")
parser.add_argument("--tgt_emb", type=str, default="data/fastText/wiki.it.vec",
                    help="Reload target embeddings")
parser.add_argument("--max_vocab", type=int, default=200000,
                    help="Maximum vocabulary size (-1 to disable)")
parser.add_argument("--emb_dim", type=int, default=300,
                    help="Embedding dimension")
parser.add_argument("--normalize_embeddings", type=str, default="renorm",
                    help="Normalize embeddings before training")
parser.add_argument("--alpha", type=float, default=0.23,
                    help="Reload the hyper-param alpha")
parser.add_argument("--inv_K", type=int, default=4,
                    help="Reload the hyper-param inv_K")

# parse parameters
params = parser.parse_args()

# check parameters: a source language and readable embedding files are
# required; the evaluation dictionary may be the built-in 'default' set.
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations; results accumulate in to_log.
# NOTE(review): 'row' is passed through to the evaluator — presumably a
# mapping/normalization mode; confirm against Evaluator's signature.
# The grouping under this 'if' was lost in the paste and is inferred.
to_log = OrderedDict({'n_iter': 0})
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log, 'row')
    evaluator.word_translation(to_log, 'row')
    evaluator.dist_mean_cosine(to_log, 'row')
# Evaluation-script chunk: define the embedding-reload CLI options, validate
# them, build the model/trainer/evaluator, and run monolingual plus
# (optionally) cross-lingual evaluations.
# NOTE(review): this chunk arrived with all newlines collapsed onto one
# physical line (everything after the first '#' was dead text); the line
# structure is restored here.

# reload pre-trained embeddings
parser.add_argument("--src_emb", type=str, default="",
                    help="Reload source embeddings")
parser.add_argument("--tgt_emb", type=str, default="",
                    help="Reload target embeddings")
parser.add_argument("--max_vocab", type=int, default=200000,
                    help="Maximum vocabulary size")
parser.add_argument("--emb_dim", type=int, default=300,
                    help="Embedding dimension")
parser.add_argument("--normalize_embeddings", type=str, default="",
                    help="Normalize embeddings before training")

# parse parameters
params = parser.parse_args()

# check parameters: a source language and readable embedding files required
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations; results accumulate in to_log
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
# NOTE(review): grouping of the calls under this 'if' was lost in the
# paste; cross-lingual evaluations only make sense with a target language,
# so they are placed inside — confirm against the original file.
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log)
    evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)
# Evaluation-script chunk (multi-language variant): validate the multi-source
# parameters, build the shared model/trainer/evaluator, reload the best
# checkpoint, and evaluate every requested language pair, reporting the
# average word-translation Precision@1.
# NOTE(review): this chunk arrived with all newlines collapsed onto one
# physical line (everything after the first '#' was dead text); the line
# structure is restored here.
params.eval_pairs.append(f'{lang1}-{lang2}')  # NOTE(review): lang1/lang2 come from a loop that starts before this chunk — confirm

# check parameters: at least one source language, every source embedding
# file readable; the evaluation dictionary may be the built-in 'default'.
assert len(params.src_langs) > 0, "source language undefined"
assert all(os.path.isfile(emb) for emb in params.src_embs)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator, then restore best mappings
logger = initialize_exp(params, dump_params=False, log_name='evaluate.log')
embs, mappings, _ = build_model(params, False)
trainer = Trainer(embs, mappings, None, params)
trainer.reload_best()
evaluator = Evaluator(trainer)

# run evaluations; per-pair results accumulate in to_log and the
# Precision@1 (CSLS, k=10) of each pair is collected in all_wt
to_log = OrderedDict({'n_iter': 0})
all_wt = []
evaluator.monolingual_wordsim(to_log)
for eval_pair in params.eval_pairs:
    parts = eval_pair.split('-')
    assert len(parts) == 2, 'Invalid format for evaluation pairs.'
    src_lang, tgt_lang = parts[0], parts[1]
    logger.info(f'Evaluating language pair: {src_lang} - {tgt_lang}')
    evaluator.crosslingual_wordsim(to_log, src_lang=src_lang, tgt_lang=tgt_lang)
    evaluator.word_translation(to_log, src_lang=src_lang, tgt_lang=tgt_lang)
    all_wt.append(to_log[f'{src_lang}-{tgt_lang}_precision_at_1-csls_knn_10'])
    evaluator.sent_translation(to_log, src_lang=src_lang, tgt_lang=tgt_lang)
# guard: with no evaluation pairs the average would divide by zero
if all_wt:
    logger.info(f"Overall Word Translation Precision@1 over {len(all_wt)} language pairs: {sum(all_wt)/len(all_wt)}")