Example no. 1
    def dist_mean_cosine(self, to_log):
        """
        Mean-cosine model selection criterion.
        """
        mean_cosines = []
        # get normalized embeddings
        for l1, l2 in itertools.permutations(self.params.langs, 2):
            logger.info('Computing mean cosine for languages: {}-{}'.format(l1, l2))
            # map embeddings to shared space
            src_emb = apply_mapping(self.mapping[l1],
                                    self.embs[l1].weight).data
            tgt_emb = apply_mapping(self.mapping[l2],
                                    self.embs[l2].weight).data

            # normalize mapped embeddings
            src_emb = src_emb / src_emb.norm(2, 1,
                                             keepdim=True).expand_as(src_emb)
            tgt_emb = tgt_emb / tgt_emb.norm(2, 1,
                                             keepdim=True).expand_as(tgt_emb)

            # build dictionary
            #for dico_method in ['nn', 'csls_knn_10']:
            for dico_method in ['csls_knn_10']:
                dico_build = 'S2T'
                dico_max_size = 10000
                # temp params / dictionary generation
                _params = deepcopy(self.params)
                _params.dico_method = dico_method
                _params.dico_build = dico_build
                _params.dico_threshold = 0
                _params.dico_max_rank = 10000
                _params.dico_min_size = 0
                _params.dico_max_size = dico_max_size
                s2t_candidates = get_candidates(src_emb, tgt_emb, _params)
                t2s_candidates = get_candidates(tgt_emb, src_emb, _params)
                dico = build_pairwise_dictionary(src_emb, tgt_emb, _params,
                                                 s2t_candidates,
                                                 t2s_candidates, True)
                # mean cosine
                if dico is None:
                    mean_cosine = -1e9
                else:
                    mean_cosine = (
                        src_emb[dico[:dico_max_size, 0]] *
                        tgt_emb[dico[:dico_max_size, 1]]).sum(1).mean()
                    mean_cosine = mean_cosine.item()
                logger.info(
                    "Mean cosine (%s method, %s build, %i max size): %.5f" %
                    (dico_method, _params.dico_build, dico_max_size,
                     mean_cosine))
                to_log['mean_cosine-%s-%s-%i_%s_%s' %
                       (dico_method, _params.dico_build, dico_max_size, l1,
                        l2)] = mean_cosine
                mean_cosines.append(mean_cosine)
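        # average cosine across all language pairs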
        to_log['mean_cosine-%s-%s-%i' %
               (dico_method, _params.dico_build,
                dico_max_size)] = np.mean(mean_cosines)
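
The core of this criterion is compact enough to state on its own: once both embedding matrices are L2-normalized row-wise, the mean cosine over a dictionary of index pairs is just a mean dot product. A minimal sketch, assuming PyTorch tensors and an (n, 2) LongTensor of (src, tgt) indices; the names below are illustrative, not part of the project's API:

import torch

def mean_cosine(src_emb, tgt_emb, dico, max_size=10000):
    """Mean dot product of matched rows; this equals the mean cosine
    because both matrices are assumed L2-normalized row-wise."""
    pairs = dico[:max_size]
    return (src_emb[pairs[:, 0]] * tgt_emb[pairs[:, 1]]).sum(1).mean().item()

# toy usage: two random vocabularies in a shared 300-d space
src = torch.randn(50, 300)
src = src / src.norm(2, 1, keepdim=True)
tgt = torch.randn(50, 300)
tgt = tgt / tgt.norm(2, 1, keepdim=True)
dico = torch.stack([torch.arange(50), torch.randperm(50)], dim=1)
print(mean_cosine(src, tgt, dico))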
Example no. 2
 def dist_mean_cosine(self, to_log):
     """
     Mean-cosine model selection criterion.
     """
     # evaluate all language pairs
     mean_cosines = []
     for i, src_lang in enumerate(self.params.src_langs):
         # mapped word embeddings
         src_emb = apply_mapping(self.mappings[src_lang],
                                 self.embs[src_lang].weight)
         src_emb = src_emb / src_emb.norm(2, 1,
                                          keepdim=True).expand_as(src_emb)
         for j in range(i + 1, len(self.params.all_langs)):
             tgt_lang = self.params.all_langs[j]
             tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                     self.embs[tgt_lang].weight)
             tgt_emb = tgt_emb / tgt_emb.norm(
                 2, 1, keepdim=True).expand_as(tgt_emb)
             # build dictionary
             # for dico_method in ['nn', 'csls_knn_10']:
             for dico_method in ['csls_knn_10']:
                 dico_build = 'S2T'
                 dico_max_size = 10000
                 # temp params / dictionary generation
                 _params = deepcopy(self.params)
                 _params.dico_method = dico_method
                 _params.dico_build = dico_build
                 _params.dico_threshold = 0
                 _params.dico_max_rank = 10000
                 _params.dico_min_size = 0
                 _params.dico_max_size = dico_max_size
                 s2t_candidates = get_candidates(src_emb, tgt_emb, _params)
                 t2s_candidates = get_candidates(tgt_emb, src_emb, _params)
                 dico = build_dictionary(src_emb, tgt_emb, _params,
                                         s2t_candidates, t2s_candidates)
                 # mean cosine
                 if dico is None:
                     mean_cosine = -1e9
                 else:
                     mean_cosine = (
                         src_emb[dico[:dico_max_size, 0]] *
                         tgt_emb[dico[:dico_max_size, 1]]).sum(1).mean()
                 mean_cosine = mean_cosine.item() if isinstance(
                     mean_cosine, torch.Tensor) else mean_cosine
                 logger.info(
                     "%s-%s: Mean cosine (%s method, %s build, %i max size): %.5f"
                     % (src_lang, tgt_lang, dico_method, _params.dico_build,
                        dico_max_size, mean_cosine))
                 to_log['%s-%s-mean_cosine-%s-%s-%i' %
                        (src_lang, tgt_lang, dico_method,
                         _params.dico_build, dico_max_size)] = mean_cosine
                 mean_cosines.append(mean_cosine)
     # average cosine across lang pairs
     to_log['mean_cosine-%s-%s-%i' %
            (dico_method, _params.dico_build,
             dico_max_size)] = np.mean(mean_cosines)
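
This variant relies on get_candidates and build_dictionary from the surrounding project, whose internals are not shown here. A hedged sketch of what an 'S2T' build and a mutual intersection plausibly look like; this is an assumption modeled on MUSE-style dictionary induction, not the project's actual implementation:

import torch

def build_s2t_dictionary(s2t_candidates, max_size=10000):
    """s2t_candidates: (n, 2) LongTensor of (src_id, tgt_id) pairs,
    assumed sorted by decreasing retrieval confidence."""
    return s2t_candidates[:max_size]

def intersect_pairs(s2t_candidates, t2s_candidates):
    """'S2T&T2S'-style build: keep pairs proposed in both directions."""
    s2t = {(int(a), int(b)) for a, b in s2t_candidates}
    t2s = {(int(b), int(a)) for a, b in t2s_candidates}  # flip to (src, tgt)
    both = sorted(s2t & t2s)
    return torch.tensor(both, dtype=torch.long) if both else None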
Example no. 3
 def monolingual_wordanalogy(self, to_log):
     """
     Evaluation on monolingual word analogy.
     """
     analogy_monolingual_scores = {}
     for lang in self.params.all_langs:
         analogy_scores = get_wordanalogy_scores(
             lang, self.vocabs[lang].word2id,
             apply_mapping(self.mappings[lang],
                           self.embs[lang].weight.detach()).cpu().numpy())
         if analogy_scores is None:
             continue
         analogy_monolingual_scores[lang] = np.mean(
             list(analogy_scores.values()))
         logger.info("Monolingual %s word analogy score average: %.5f" %
                     (lang, analogy_monolingual_scores))
         to_log[
             f'{lang}_analogy_monolingual_scores'] = analogy_monolingual_scores[
                 lang]
     if len(analogy_monolingual_scores) == 0:
         return
     avg_analogy_monolingual_score = sum(analogy_monolingual_scores.values(
     )) / len(analogy_monolingual_scores)
     logger.info("Monolingual word analogy score average: %.5f" %
                 avg_analogy_monolingual_score)
     to_log['analogy_monolingual_scores'] = avg_analogy_monolingual_score
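
get_wordanalogy_scores is external to this snippet; analogy benchmarks of this kind are conventionally scored with 3CosAdd over row-normalized vectors. A minimal sketch under that assumption (id2word and the function name are illustrative):

import numpy as np

def analogy_3cosadd(word2id, id2word, emb, a, b, c):
    """Answer 'a is to b as c is to ?' by cosine to (b - a + c);
    emb is assumed L2-normalized row-wise."""
    query = emb[word2id[b]] - emb[word2id[a]] + emb[word2id[c]]
    query = query / np.linalg.norm(query)
    scores = emb @ query
    for w in (a, b, c):  # exclude the query words themselves
        scores[word2id[w]] = -np.inf
    return id2word[int(scores.argmax())]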
Example no. 4
    def monolingual_wordsim(self, to_log):
        """
        Evaluation on monolingual word similarity.
        """
        ws_monolingual_scores = {}
        for lang in self.params.all_langs:
            ws_scores = get_wordsim_scores(
                lang, self.vocabs[lang].word2id,
                apply_mapping(self.mappings[lang],
                              self.embs[lang].weight.detach()).cpu().numpy())
            if ws_scores is None:
                continue
            ws_monolingual_scores[lang] = np.mean(list(ws_scores.values()))
            logger.info("Monolingual %s word similarity score average: %.5f" %
                        (lang, ws_monolingual_scores[lang]))
            to_log[f'{lang}_ws_monolingual_scores'] = ws_monolingual_scores[
                lang]
            to_log.update({f'{lang}_{k}': v for k, v in ws_scores.items()})

        if len(ws_monolingual_scores) == 0:
            return
        avg_ws_monolingual_score = sum(
            ws_monolingual_scores.values()) / len(ws_monolingual_scores)
        logger.info("Monolingual word similarity score average: %.5f" %
                    avg_ws_monolingual_score)
        to_log['ws_monolingual_scores'] = avg_ws_monolingual_score
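
get_wordsim_scores is likewise a project function; word-similarity benchmarks are conventionally scored as the Spearman correlation between human ratings and embedding cosines over in-vocabulary pairs. A sketch under that assumption:

import numpy as np
from scipy.stats import spearmanr

def wordsim_score(pairs, word2id, emb):
    """pairs: iterable of (word1, word2, human_score) benchmark triples."""
    gold, pred = [], []
    for w1, w2, score in pairs:
        if w1 not in word2id or w2 not in word2id:
            continue  # skip out-of-vocabulary pairs
        v1, v2 = emb[word2id[w1]], emb[word2id[w2]]
        gold.append(score)
        pred.append(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2)))
    return spearmanr(gold, pred).correlation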
Example no. 5
    def word_translation(self, to_log):
        """
        Evaluation on word translation.
        """
        # mapped word embeddings
        from itertools import permutations
        for l1, l2 in permutations(self.params.langs, 2):
            torch.cuda.empty_cache()

            path = get_dict_path(self.params.dico_eval,
                                 self.params.dicts_eval_path, l1, l2)
            if not os.path.exists(path):
                logger.info(
                    'Warning: Test dictionary for {}-{} does not exist. Skipping this pair.'
                    .format(l1, l2))
                continue

            src_emb = apply_mapping(self.mapping[l1],
                                    self.embs[l1].weight).data
            src_emb = src_emb.cuda() if self.params.cuda else src_emb.cpu()
            tgt_emb = apply_mapping(self.mapping[l2],
                                    self.embs[l2].weight).data
            tgt_emb = tgt_emb.cuda() if self.params.cuda else tgt_emb.cpu()

            for method in ['nn', 'csls_knn_10']:
                results = get_word_translation_accuracy(
                    l1,
                    self.lang_dico[l1].word2id,
                    src_emb,
                    l2,
                    self.lang_dico[l2].word2id,
                    tgt_emb,
                    method=method,
                    dico_eval=self.params.dico_eval,
                    dicts_eval_path=self.params.dicts_eval_path)
                to_log.update([('%s-%s_%s-%s' % (k, method, l1, l2), v)
                               for k, v in results])
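
Both retrieval methods used above, 'nn' and 'csls_knn_10', reduce to scoring a source-by-target similarity matrix. CSLS (Conneau et al., 2018) corrects for hubness by subtracting each side's mean similarity to its k nearest cross-lingual neighbors. A minimal sketch, assuming row-normalized PyTorch matrices small enough to score in one block:

import torch

def csls_scores(src_emb, tgt_emb, k=10):
    """src_emb: (n, d), tgt_emb: (m, d), both L2-normalized row-wise.
    Returns an (n, m) matrix of CSLS scores."""
    sims = src_emb @ tgt_emb.t()                # plain cosine similarities
    r_src = sims.topk(k, dim=1).values.mean(1)  # mean sim of each src word's k NNs
    r_tgt = sims.topk(k, dim=0).values.mean(0)  # mean sim of each tgt word's k NNs
    return 2 * sims - r_src.unsqueeze(1) - r_tgt.unsqueeze(0)

# translation: nearest target word under CSLS
# preds = csls_scores(src_emb, tgt_emb).argmax(dim=1)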
Example no. 6
    def word_translation(self):
        """
        Evaluation on word translation.
        """
        # mapped word embeddings
        all_emb = {
            l2: apply_mapping(self.mapping[l2],
                              self.embs[l2].weight).data.cpu()
            for l2 in self.params.langs
        }
        results = defaultdict(dict)
        # for computational efficiency, iterate over source languages and calculate all methods for each one
        for src_lang in self.params.langs:
            logger.info('\n\n\n\nSource Language: {}\n\n\n\n'.format(src_lang))
            torch.cuda.empty_cache()

            # get source queries
            paths = self.get_dico_paths(src_lang)
            query_ids = self.aggregate_query_ids(paths, src_lang)
            if query_ids is None:
                logger.info(
                    'Warning: No test dictionary was found for source language {}. Skipping!'
                    .format(src_lang))
                continue

            method = 'csls_knn_10'
            # init translation
            init_trans, top_scores = BI_translation(src_lang,
                                                    query_ids,
                                                    method,
                                                    all_emb,
                                                    cuda=self.params.cuda)

            for inf_met in self.params.multilingual_inference_method:
                logger.info('\n\nMultilingual inference method: {}\n\n'.format(
                    inf_met))

                # improve source word representation, and re-translate
                if inf_met != 'BI':
                    updated_trans, used_langs = update_translation_for_all_langs(
                        self.params.langs,
                        src_lang,
                        query_ids,
                        all_emb,
                        init_trans,
                        method,
                        inf_met,
                        top_scores,
                        cuda=self.params.cuda)
                else:
                    used_langs = None
                    updated_trans = init_trans

                # re-arrange translations for convenience
                translation_by_src_id, used_langs_by_src_id = self.translation_by_src_id(
                    updated_trans, used_langs, src_lang)

                # calculate accuracy and matching per source word
                for tgt_lang, path in paths.items():
                    pair_result = self.get_pair_accuracy(
                        path, src_lang, self.lang_dico[src_lang].word2id,
                        tgt_lang, self.lang_dico[tgt_lang].word2id,
                        translation_by_src_id, method)
                    if inf_met != 'BI':
                        self.print_aux_statistics(src_lang, tgt_lang, path,
                                                  used_langs_by_src_id)
                    results[(src_lang, tgt_lang)][inf_met] = pair_result

        save_results(self.params, results,
                     self.params.multilingual_inference_method)
Example no. 7
    def crosslingual_wordsim(self, to_log, src_lang=None, tgt_lang=None):
        """
        Evaluation on cross-lingual word similarity.
        If src_lang and tgt_lang are not specified, evaluate all src_langs to tgt_lang
        """
        # evaluate all src langs to tgt_lang by default
        if src_lang is None and tgt_lang is None:
            ws_crosslingual_scores = []
            tgt_lang = self.params.tgt_lang
            tgt_emb = self.embs[tgt_lang].weight.detach().cpu().numpy()
            for src_lang in self.params.src_langs:
                src_emb = apply_mapping(
                    self.mappings[src_lang],
                    self.embs[src_lang].weight.detach()).cpu().numpy()
                # cross-lingual wordsim evaluation
                ws_scores = get_crosslingual_wordsim_scores(
                    src_lang,
                    self.vocabs[src_lang].word2id,
                    src_emb,
                    tgt_lang,
                    self.vocabs[tgt_lang].word2id,
                    tgt_emb,
                    ignore_oov=self.params.semeval_ignore_oov)
                if ws_scores is None:
                    continue
                ws_crosslingual_score = np.mean(list(ws_scores.values()))
                ws_crosslingual_scores.append(ws_crosslingual_score)
                logger.info("%s-%s cross-lingual word similarity score: %.5f" %
                            (src_lang, tgt_lang, ws_crosslingual_score))
                to_log[
                    f'{src_lang}_{tgt_lang}_ws_crosslingual_scores'] = ws_crosslingual_score
                to_log.update({
                    f'{src_lang}_{tgt_lang}_{k}': v
                    for k, v in ws_scores.items()
                })

            avg_ws_crosslingual_score = np.mean(ws_crosslingual_scores)
            logger.info("Cross-lingual word similarity score average: %.5f" %
                        avg_ws_crosslingual_score)
            to_log['ws_crosslingual_scores'] = avg_ws_crosslingual_score
        else:
            # only evaluate src_lang to tgt_lang; bridge as necessary
            assert src_lang is not None and tgt_lang is not None
            # encode src
            src_emb = apply_mapping(self.mappings[src_lang],
                                    self.embs[src_lang].weight).cpu().numpy()
            # encode tgt
            tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                    self.embs[tgt_lang].weight).cpu().numpy()
            # cross-lingual wordsim evaluation
            ws_scores = get_crosslingual_wordsim_scores(
                src_lang,
                self.vocabs[src_lang].word2id,
                src_emb,
                tgt_lang,
                self.vocabs[tgt_lang].word2id,
                tgt_emb,
            )
            if ws_scores is None:
                return
            ws_crosslingual_score = np.mean(list(ws_scores.values()))
            logger.info("%s-%s cross-lingual word similarity score: %.5f" %
                        (src_lang, tgt_lang, ws_crosslingual_score))
            to_log[
                f'{src_lang}_{tgt_lang}_ws_crosslingual_scores'] = ws_crosslingual_score
            to_log.update({
                f'{src_lang}_{tgt_lang}_{k}': v
                for k, v in ws_scores.items()
            })
Example no. 8
    def sent_translation(self, to_log, src_lang=None, tgt_lang=None):
        """
        Evaluation on sentence translation.
        If src_lang and tgt_lang are not specified, evaluate all src_langs to tgt_lang
        Only available on Europarl, for en - {de, es, fr, it} language pairs.
        """
        # parameters
        n_keys = 200000
        n_queries = 2000
        n_idf = 300000

        # load europarl data
        if not hasattr(self, 'europarl_data'):
            self.europarl_data = {}

        # evaluate all src langs to tgt_lang by default
        if src_lang is None and tgt_lang is None:
            tgt_lang = self.params.tgt_lang
            for src_lang in self.params.src_langs:
                lang_pair = (src_lang, tgt_lang)
                # load europarl data
                if lang_pair not in self.europarl_data:
                    self.europarl_data[lang_pair] = load_europarl_data(
                        src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf))
                # if no Europarl data for this language pair
                if not self.europarl_data or lang_pair not in self.europarl_data \
                        or self.europarl_data[lang_pair] is None:
                    logger.info(
                        f'Europarl data not found for {src_lang}-{tgt_lang}.')
                    continue

                # mapped word embeddings
                src_emb = apply_mapping(self.mappings[src_lang],
                                        self.embs[src_lang].weight)
                tgt_emb = self.embs[tgt_lang].weight

                # get idf weights
                idf = get_idf(self.europarl_data[lang_pair],
                              src_lang,
                              tgt_lang,
                              n_idf=n_idf)

                for method in ['nn', 'csls_knn_10']:
                    # source <- target sentence translation
                    results = get_sent_translation_accuracy(
                        self.europarl_data[lang_pair],
                        src_lang,
                        self.vocabs[src_lang].word2id,
                        src_emb,
                        tgt_lang,
                        self.vocabs[tgt_lang].word2id,
                        tgt_emb,
                        n_keys=n_keys,
                        n_queries=n_queries,
                        method=method,
                        idf=idf)
                    to_log.update([
                        ('%s_to_%s_%s-%s' % (tgt_lang, src_lang, k, method), v)
                        for k, v in results
                    ])
                    # target <- source sentence translation
                    results = get_sent_translation_accuracy(
                        self.europarl_data[lang_pair],
                        tgt_lang,
                        self.vocabs[tgt_lang].word2id,
                        tgt_emb,
                        src_lang,
                        self.vocabs[src_lang].word2id,
                        src_emb,
                        n_keys=n_keys,
                        n_queries=n_queries,
                        method=method,
                        idf=idf)
                    to_log.update([
                        ('%s_to_%s_%s-%s' % (src_lang, tgt_lang, k, method), v)
                        for k, v in results
                    ])
        else:
            # only evaluate src_lang to tgt_lang; bridge as necessary
            assert src_lang is not None and tgt_lang is not None
            lang_pair = (src_lang, tgt_lang)
            # load europarl data
            if lang_pair not in self.europarl_data:
                self.europarl_data[lang_pair] = load_europarl_data(
                    src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf))
            # if no Europarl data for this language pair
            if not self.europarl_data or lang_pair not in self.europarl_data \
                    or self.europarl_data[lang_pair] is None:
                logger.info(
                    f'Europarl data not found for {src_lang}-{tgt_lang}.')
                return
            # encode src
            src_emb = apply_mapping(self.mappings[src_lang],
                                    self.embs[src_lang].weight)
            # encode tgt
            tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                    self.embs[tgt_lang].weight)
            # get idf weights
            idf = get_idf(self.europarl_data[lang_pair],
                          src_lang,
                          tgt_lang,
                          n_idf=n_idf)

            for method in ['nn', 'csls_knn_10']:
                # source <- target sentence translation
                results = get_sent_translation_accuracy(
                    self.europarl_data[lang_pair],
                    src_lang,
                    self.vocabs[src_lang].word2id,
                    src_emb,
                    tgt_lang,
                    self.vocabs[tgt_lang].word2id,
                    tgt_emb,
                    n_keys=n_keys,
                    n_queries=n_queries,
                    method=method,
                    idf=idf)
                to_log.update([
                    ('%s_to_%s_%s-%s' % (tgt_lang, src_lang, k, method), v)
                    for k, v in results
                ])
                # target <- source sentence translation
                results = get_sent_translation_accuracy(
                    self.europarl_data[lang_pair],
                    tgt_lang,
                    self.vocabs[tgt_lang].word2id,
                    tgt_emb,
                    src_lang,
                    self.vocabs[src_lang].word2id,
                    src_emb,
                    n_keys=n_keys,
                    n_queries=n_queries,
                    method=method,
                    idf=idf)
                to_log.update([
                    ('%s_to_%s_%s-%s' % (src_lang, tgt_lang, k, method), v)
                    for k, v in results
                ])
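
get_idf and get_sent_translation_accuracy are project functions; a plausible reading is that sentences are compared as IDF-weighted bags of word vectors. A minimal sketch under that assumption, where idf maps each word to its inverse-document-frequency weight:

import numpy as np

def sentence_embedding(sentence, word2id, emb, idf):
    """IDF-weighted average of word vectors; None if no word is known."""
    vecs, weights = [], []
    for word in sentence.split():
        if word in word2id and word in idf:
            vecs.append(emb[word2id[word]])
            weights.append(idf[word])
    if not vecs:
        return None
    out = np.average(np.stack(vecs), axis=0, weights=weights)
    return out / np.linalg.norm(out)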
Example no. 9
    def word_translation(self, to_log, src_lang=None, tgt_lang=None):
        """
        Evaluation on word translation.
        If src_lang and tgt_lang are not specified, evaluate all src_langs to tgt_lang
        """
        # evaluate all src langs to tgt_lang by default
        if src_lang is None and tgt_lang is None:
            wt_precisions = []
            tgt_lang = self.params.tgt_lang
            tgt_emb = self.embs[tgt_lang].weight.detach()
            for src_lang in self.params.src_langs:
                # mapped word embeddings
                src_emb = apply_mapping(self.mappings[src_lang],
                                        self.embs[src_lang].weight.detach())

                for method in ['nn', 'csls_knn_10']:
                    results = get_word_translation_accuracy(
                        src_lang,
                        self.vocabs[src_lang].word2id,
                        src_emb,
                        tgt_lang,
                        self.vocabs[tgt_lang].word2id,
                        tgt_emb,
                        method=method,
                        dico_eval=self.params.dico_eval)
                    if results is None:
                        continue
                    to_log.update([
                        ('%s-%s_%s-%s' % (src_lang, tgt_lang, k, method), v)
                        for k, v in results
                    ])
                    if method == 'csls_knn_10':
                        for k, v in results:
                            if k == 'precision_at_1':
                                wt_precisions.append(v)
            avg_precision = np.mean(wt_precisions)
            to_log['precision_at_1-csls_knn_10'] = avg_precision
            logger.info("word translation precision@1: %.5f" % avg_precision)
        else:
            # only evaluate src_lang to tgt_lang; bridge as necessary
            assert src_lang is not None and tgt_lang is not None
            # encode src
            src_emb = apply_mapping(self.mappings[src_lang],
                                    self.embs[src_lang].weight).cpu()
            # encode tgt
            tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                    self.embs[tgt_lang].weight).cpu()
            for method in ['nn', 'csls_knn_10']:
                results = get_word_translation_accuracy(
                    src_lang,
                    self.vocabs[src_lang].word2id,
                    src_emb,
                    tgt_lang,
                    self.vocabs[tgt_lang].word2id,
                    tgt_emb,
                    method=method,
                    dico_eval=self.params.dico_eval)
                if results is None:
                    continue
                to_log.update([
                    ('%s-%s_%s-%s' % (src_lang, tgt_lang, k, method), v)
                    for k, v in results
                ])
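
The ('precision_at_k', value) pairs consumed above come from get_word_translation_accuracy; the metric itself is simple. A hedged sketch, assuming a similarity matrix of NN or CSLS scores and one set of gold target ids per query:

import torch

def precision_at_k(scores, gold, k):
    """scores: (n_queries, n_tgt) similarity matrix;
    gold: list of sets of correct target ids, one set per query."""
    topk = scores.topk(k, dim=1).indices
    hits = sum(bool(gold[i] & set(topk[i].tolist())) for i in range(len(gold)))
    return 100.0 * hits / len(gold)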