Example #1
def _get_metric_data(data_path: Path, feat_groups: str, family_file_path: Path) -> pd.DataFrame:
    data = pd.read_csv(data_path, sep='\t')
    data = pd.pivot_table(data, index=['lang1', 'lang2'], columns='category', values='normalized_score').reset_index()
    cats = [cat.name for cat in Category if should_include(feat_groups, cat)] + ['avg']
    cols = ['lang1', 'lang2'] + cats
    data = data[cols]

    # Get ground-truth distances. `get_families` is called for its side
    # effect of registering the family file before distances are computed.
    get_families(family_file_path)
    dists = get_all_distances()

    def _get_lang(lang: str):
        # Resolve an ISO 639-1 (two-letter) or 639-3 (three-letter) code;
        # `languages` here is presumably pycountry's language registry.
        if len(lang) == 2:
            return languages.get(alpha_2=lang)
        elif len(lang) == 3:
            return languages.get(alpha_3=lang)
        else:
            return None

    def _get_dist(lang1: str, lang2: str):
        lang_struct1 = _get_lang(lang1)
        lang_struct2 = _get_lang(lang2)
        if lang_struct1 is None or lang_struct2 is None:
            return None
        return dists.get((lang_struct1.name, lang_struct2.name), None)

    # Look up a distance for every language pair (assigning directly avoids
    # shadowing the `dists` dict the closure reads), then drop unresolved rows.
    data['dist'] = [_get_dist(lang1, lang2) for lang1, lang2, *_ in data.values]
    cols.append('dist')
    data = data[~data['dist'].isnull()].reset_index(drop=True)
    return data
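
Every example in this listing gates work on should_include(feat_groups, cat). The project's own definitions are not reproduced here; the following is only a minimal sketch of the assumed contract, with hypothetical Category members and a hypothetical encoding of feat_groups as a string of group codes:

from enum import Enum

class Category(Enum):
    # Hypothetical members; the real project defines its own category set.
    PTYPE = 0
    CTYPE = 1
    VTYPE = 2

def should_include(feat_groups: str, cat: Category) -> bool:
    # Hypothetical rule: a category is included when its initial letter
    # appears in the feat_groups string (e.g. 'pcv' includes all three).
    return cat.name[0].lower() in feat_groups
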
Example #2
def _get_embeddings(self):
    emb_dict = dict()
    for cat in Category:
        if should_include(self.feat_groups, cat):
            e = get_enum_by_cat(cat)
            nf = len(e)
            # One zero-initialized (num_features x dim) embedding per category.
            emb_dict[cat.name] = nn.Parameter(torch.zeros(nf, self.dim))
    return nn.ParameterDict(emb_dict)
Example #3
    def forward(self, h: FT) -> Dict[str, FT]:
        shared_h = nn.functional.leaky_relu(
            self.linear(h).refine_names(..., 'shared_repr'), negative_slope=0.1)
        ret = dict()
        for name, layer in self.feat_predictors.items():
            out = layer(shared_h).refine_names(..., name)
            if not should_predict_none(name, new_style=g.new_style):
                # Effectively mask out the NONE index with a very negative logit.
                f_idx = get_none_index(name)
                out[:, f_idx] = -999.9
            ret[Name(name, 'camel')] = out

        # Compose probs for complex feature groups if possible.
        if g.new_style:
            for e in get_needed_categories(g.feat_groups, new_style=True, breakdown=False):
                if e.num_groups() > 1:
                    assert e not in ret
                    part_tensors = [ret[part_enum.get_name()] for part_enum in e.parts()]
                    parts = list()
                    for i, part_tensor in enumerate(part_tensors):
                        conversion = self.conversion_idx[e.get_name().value][:, i]
                        bs = len(part_tensor)
                        part = part_tensor.rename(None).gather(
                            1, conversion.rename(None).expand(bs, -1))
                        parts.append(part)
                    parts = torch.stack(parts, dim=-1)
                    dim_name = e.get_name().value
                    ret[e.get_name()] = parts.sum(dim=-1).refine_names('batch', dim_name)
                    for part_cat in e.parts():
                        del ret[part_cat.get_name()]
        for name in ret:
            ret[name] = torch.log_softmax(ret[name], dim=-1)

        # Deal with conditions for some categories
        for cat, index in conditions.items():
            if should_include(g.feat_groups, cat):
                # Find out the exact value to be conditioned on.
                # TODO(j_luo) ugly Category call.
                condition_e = get_enum_by_cat(Category(index.c_idx))
                condition_name = condition_e.__name__ + ('X' if g.new_style else '')
                cat_name = get_enum_by_cat(cat).__name__ + ('X' if g.new_style else '')

                condition_name = Name(condition_name, 'camel')
                cat_name = Name(cat_name, 'camel')
                condition_log_probs = ret[condition_name][..., index.f_idx]
                # condition_log_probs.align_as(ret[cat_name])
                conditioned = condition_log_probs.rename(None).unsqueeze(dim=-1)
                ret[cat_name] = ret[cat_name] + conditioned
        return ret
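
The conditioning step at the end works in log space: adding condition_log_probs to ret[cat_name] corresponds to multiplying probabilities, i.e. log p(feature, condition) = log p(feature | condition) + log p(condition), with unsqueeze broadcasting the condition's scalar log-probability over all feature values. A toy, self-contained illustration of that broadcast (shapes invented for the example):

import torch

# One condition log-prob per batch item is added to every feature
# log-prob of the conditioned category.
cat_log_probs = torch.log_softmax(torch.randn(4, 7), dim=-1)        # (batch, n_feats)
cond_log_prob = torch.log_softmax(torch.randn(4, 3), dim=-1)[:, 0]  # (batch,)
joint = cat_log_probs + cond_log_prob.unsqueeze(dim=-1)             # (batch, n_feats)
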
Example #4
def __init__(self):
    super().__init__()
    param_dict = dict()
    for cat in Category:
        if should_include(g.feat_groups, cat):
            e = get_enum_by_cat(cat)
            nf = len(e)
            # One square (num_features x num_features) adapter per category.
            param = nn.Parameter(torch.zeros(nf, nf))
            param_dict[cat.name] = param
    self.adapters = nn.ParameterDict(param_dict)
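
Each adapter is a square per-category parameter matrix; presumably these are what the _adapt call in Example #10 applies to the packed word feature matrices, though that method's body is not shown in this listing.
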
Example #5
def _get_embeddings(self):
    emb_dict = dict()
    for cat in Category:
        if should_include(g.feat_groups, cat):
            e = get_enum_by_cat(cat)
            nf = len(e)
            emb_dict[cat.name] = nn.Parameter(torch.zeros(nf, self.dim))
            # Unlike Example #2, the zeros are immediately overwritten
            # with a uniform initialization.
            logging.warning('dense feature embedding init')
            torch.nn.init.uniform_(emb_dict[cat.name], -0.1, 0.1)
    return nn.ParameterDict(emb_dict)
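
This variant differs from Example #2 in two ways: feat_groups comes from the global config g rather than self, and the embeddings are re-initialized uniformly in [-0.1, 0.1] instead of staying at zero. The logging.warning call appears to serve as a debug marker rather than a genuine warning.
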
Example #6
def analyze_scores(self, scores) -> Metrics:
    metrics = Metrics()
    total_loss = 0.0
    total_weight = 0.0
    for name, (losses, weights) in scores.items():
        if should_include(self.feat_groups, name):
            # Weighted per-category loss, plus a running pooled total.
            loss = (losses * weights).sum()
            weight = weights.sum()
            total_loss += loss
            total_weight += weight
            loss = Metric(f'loss_{name.snake}', loss, weight)
            metrics += loss
    metrics += Metric('loss', total_loss, total_weight)
    return metrics
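
Metric and Metrics are repo-internal, but the aggregation itself is plain weighted averaging. A self-contained sketch with made-up numbers:

import torch

# Hypothetical per-category (losses, weights) pairs.
scores = {'ptype': (torch.tensor([0.5, 1.0]), torch.tensor([1.0, 2.0])),
          'ctype': (torch.tensor([0.2]), torch.tensor([4.0]))}

total_loss = sum((l * w).sum() for l, w in scores.values())  # 2.5 + 0.8 = 3.3
total_weight = sum(w.sum() for _, w in scores.values())      # 3.0 + 4.0 = 7.0
mean_loss = total_loss / total_weight                        # ~0.471
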
Example #7
def analyze(
    self,
    scores: Dict[Cat, FT],
    return_scores: bool = False
) -> Union[Metrics, Tuple[Metrics, Dict[Cat, FT]]]:
    metrics = Metrics()
    total_loss = 0.0
    total_weight = 0.0
    for name, (losses, weights) in scores.items():
        if should_include(g.feat_groups, name):
            loss = (losses * weights).sum()
            weight = weights.sum()
            total_loss += loss
            total_weight += weight
            loss = Metric(f'loss_{name.snake}', loss, weight)
            metrics += loss
    metrics += Metric('loss', total_loss, total_weight)
    if return_scores:
        return metrics, scores
    else:
        return metrics
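
Apart from reading feat_groups from the global g instead of self and optionally returning the raw scores alongside the metrics, this is the same aggregation as Example #6.
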
Example #8
def __init__(self, data_path, num_workers, feat_groups: 'p',
             family_file_path: 'p', num_lang_pairs: 'p', data=None):
    if data is None:
        data = _get_metric_data(data_path, feat_groups, family_file_path)
    self.all_langs = sorted(set(data['lang1']))
    self.cats = [cat.name for cat in Category if should_include(feat_groups, cat)] + ['avg']
    super().__init__(data, batch_size=num_lang_pairs, num_workers=num_workers)
Example #9
def effective_categories(self) -> List[Category]:
    ret = list()
    for cat in Category:
        if should_include(g.feat_groups, cat):
            ret.append(cat)
    return ret
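
The loop is equivalent to a list comprehension; assuming the same globals, the method collapses to:

def effective_categories(self) -> List[Category]:
    # Equivalent one-liner over the same filter.
    return [cat for cat in Category if should_include(g.feat_groups, cat)]
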
Example #10
    def _get_scores(
            self, samples: LT, segments: Sequence[SegmentWindow], lengths: LT,
            feat_matrix: LT, source_padding: BT
    ) -> Tuple[PackedWords, DecipherModelScoreReturn]:
        bs = len(segments)

        segment_list = None
        if self.vocab is not None:
            segment_list = [segment.segment_list for segment in segments]
        packed_words = self.pack(samples, lengths, feat_matrix, segments,
                                 segment_list=segment_list)
        packed_words.word_feat_matrices = self._adapt(
            packed_words.word_feat_matrices)

        try:
            # TODO(j_luo) This is actually continuous batching.
            lm_batch = self._prepare_batch(packed_words)
            scores = self._get_lm_scores(lm_batch)
            nlls = list()
            for cat, (nll, weight) in scores.items():
                if should_include(g.feat_groups, cat):
                    nlls.append(nll * weight)
            # nlls = sum(nlls)
            nlls = sum(nlls) / lm_batch.lengths
            bw = packed_words.word_lengths.size('batch_word')
            p = packed_words.word_positions.size('position')
            nlls = nlls.unflatten('batch', [('batch_word', bw), ('position', p)])
            nlls = nlls.sum(dim='position')
            lm_score, in_vocab_score = self._unpack(nlls, packed_words, bs)
        except EmptyPackedWords:
            lm_score = get_zeros(bs, packed_words.num_samples)
            in_vocab_score = get_zeros(bs, packed_words.num_samples)

        word_score = self._get_word_score(packed_words, bs)
        readable_score, unreadable_score = self._get_readable_scores(
            source_padding, samples)

        scores = [lm_score, word_score, in_vocab_score,
                  readable_score, unreadable_score]
        features = torch.stack(scores, new_name='feature')
        phi_score = self.phi_scorer(features).squeeze('score')

        # if g.search:
        #     samples = samples.align_to('length', 'batch', 'sample')
        #     flat_samples = samples.flatten(['batch', 'sample'], 'batch_X_sample')
        #     flat_sample_embeddings = self.tag_embedding(flat_samples)
        #     bxs = flat_samples.size('batch_X_sample')
        #     h0 = get_zeros([1, bxs, 100])
        #     c0 = get_zeros([1, bxs, 100])
        #     with NoName(flat_sample_embeddings):
        #         output, (hn, _) = self.tag_lstm(flat_sample_embeddings, (h0, c0))
        #     tag_score = self.tag_scorer(hn).squeeze(dim=0).squeeze(dim=-1)
        #     tag_score = tag_score.view(samples.size('batch'), samples.size('sample'))
        #     ret['tag_score'] = tag_score.rename('batch', 'sample')
        scores = DecipherModelScoreReturn(lm_score, word_score, in_vocab_score,
                                          readable_score, unreadable_score,
                                          phi_score)

        return packed_words, scores
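
The five stacked scores (lm, word, in_vocab, readable, unreadable) are combined by self.phi_scorer into a single scalar per sample, squeezed out of the 'score' dimension. Its definition is not shown here; a minimal hypothetical stand-in would be a linear map over the stacked 'feature' axis:

import torch.nn as nn

# Hypothetical stand-in for self.phi_scorer: maps the five stacked
# scores to a single scalar per sample.
phi_scorer = nn.Linear(5, 1)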