def _get_metric_data(data_path: Path, feat_groups: str, family_file_path: Path) -> pd.DataFrame:
    """Load per-category language-pair scores and attach ground-truth distances.

    The TSV at ``data_path`` is pivoted so that every ``(lang1, lang2)`` pair
    becomes one row with one column per ``category`` holding its
    ``normalized_score``. Only the categories enabled by ``feat_groups`` plus
    the ``'avg'`` column are kept.

    Args:
        data_path: Tab-separated file with at least the columns ``lang1``,
            ``lang2``, ``category`` and ``normalized_score``.
        feat_groups: Feature-group spec passed to ``should_include``.
        family_file_path: File handed to ``get_families`` before the distance
            table is requested.

    Returns:
        A frame with columns ``lang1``, ``lang2``, the selected categories,
        ``'avg'`` and ``'dist'``. Pairs for which no distance could be looked
        up are dropped and the index is renumbered from 0.

    Raises:
        KeyError: If a selected category (or ``'avg'``) is missing from the
            pivoted table.
    """
    raw = pd.read_csv(data_path, sep='\t')
    data = pd.pivot_table(raw, index=['lang1', 'lang2'], columns='category',
                          values='normalized_score').reset_index()
    cats = [cat.name for cat in Category if should_include(feat_groups, cat)] + ['avg']
    data = data[['lang1', 'lang2'] + cats]

    # Get ground truth distances.
    # NOTE(review): get_families is called only for its side effect; presumably
    # it loads the data that get_all_distances relies on -- confirm.
    get_families(family_file_path)
    # Kept under its own name: the closure below reads this mapping, so it must
    # not be rebound to the per-row result list.
    all_dists = get_all_distances()

    def _get_lang(lang: str):
        """Resolve a 2- or 3-letter code to a language record, else None."""
        if len(lang) == 2:
            return languages.get(alpha_2=lang)
        if len(lang) == 3:
            return languages.get(alpha_3=lang)
        return None

    def _get_dist(lang1: str, lang2: str):
        """Look up the distance for a pair of codes; None if unavailable."""
        lang_struct1 = _get_lang(lang1)
        lang_struct2 = _get_lang(lang2)
        if lang_struct1 is None or lang_struct2 is None:
            return None
        # The distance table is keyed by (name, name) pairs.
        return all_dists.get((lang_struct1.name, lang_struct2.name), None)

    data['dist'] = [_get_dist(lang1, lang2) for lang1, lang2 in zip(data['lang1'], data['lang2'])]
    # Keep only the pairs that have a ground-truth distance.
    return data[data['dist'].notnull()].reset_index(drop=True)
def _get_embeddings(self):
    """Build one zero-initialised embedding table per included category.

    Returns:
        An ``nn.ParameterDict`` keyed by category name. Each parameter has
        ``len(get_enum_by_cat(cat))`` rows and ``self.dim`` columns.
    """
    included = (cat for cat in Category if should_include(self.feat_groups, cat))
    tables = {
        cat.name: nn.Parameter(torch.zeros(len(get_enum_by_cat(cat)), self.dim))
        for cat in included
    }
    return nn.ParameterDict(tables)
def forward(self, h: FT) -> Dict[str, FT]:
    """Predict log-probabilities for every feature group from a hidden state.

    The input is projected through ``self.linear`` and a leaky ReLU, then each
    predictor in ``self.feat_predictors`` produces one score tensor. In the
    new style, the scores of multi-part groups are composed from their parts.
    All outputs are log-softmaxed over the last dimension, and the categories
    listed in ``conditions`` additionally receive the log-probability of the
    value they are conditioned on.

    Args:
        h: Hidden representation fed to ``self.linear``.

    Returns:
        A dict mapping feature-group names to log-probability tensors.
        NOTE(review): the keys are ``Name`` objects (or whatever
        ``e.get_name()`` returns), not plain ``str`` as annotated -- confirm.
    """
    shared_h = nn.functional.leaky_relu(self.linear(h).refine_names(..., 'shared_repr'), negative_slope=0.1)
    ret = dict()
    for name, layer in self.feat_predictors.items():
        out = layer(shared_h).refine_names(..., name)
        if not should_predict_none(name, new_style=g.new_style):
            # Overwrite the score at the "none" index with a large negative
            # value so it is effectively ruled out by the later log_softmax.
            # The ``[:, f_idx]`` indexing treats ``out`` as 2-D.
            f_idx = get_none_index(name)
            out[:, f_idx] = -999.9
        ret[Name(name, 'camel')] = out

    # Compose probs for complex feature groups if possible.
    if g.new_style:
        for e in get_needed_categories(g.feat_groups, new_style=True, breakdown=False):
            if e.num_groups() > 1:
                # NOTE(review): every other access keys ``ret`` by
                # ``e.get_name()``; confirm this membership test is meant to
                # use ``e`` itself.
                assert e not in ret
                part_tensors = [ret[part_enum.get_name()] for part_enum in e.parts()]
                parts = list()
                for i, part_tensor in enumerate(part_tensors):
                    # Column i of the conversion table selects, for each
                    # composite value, the index to read from part i.
                    conversion = self.conversion_idx[e.get_name().value][:, i]
                    bs = len(part_tensor)
                    # Names are dropped before gather/expand and restored
                    # below via refine_names.
                    part = part_tensor.rename(None).gather(1, conversion.rename(None).expand(bs, -1))
                    parts.append(part)
                # Sum the gathered part scores to get the composite score
                # (done before the log_softmax below).
                parts = torch.stack(parts, dim=-1)
                dim_name = e.get_name().value
                ret[e.get_name()] = parts.sum(dim=-1).refine_names('batch', dim_name)
                # The parts are replaced by the composite entry.
                for part_cat in e.parts():
                    del ret[part_cat.get_name()]

    # Normalise every remaining entry; only values are reassigned, so
    # iterating over the dict here is safe.
    for name in ret:
        ret[name] = torch.log_softmax(ret[name], dim=-1)

    # Deal with conditions for some categories
    for cat, index in conditions.items():
        if should_include(g.feat_groups, cat):
            # Find out the exact value to be conditioned on.
            # TODO(j_luo) ugly Category call.
            condition_e = get_enum_by_cat(Category(index.c_idx))
            # New-style names carry an 'X' suffix.
            condition_name = condition_e.__name__ + ('X' if g.new_style else '')
            cat_name = get_enum_by_cat(cat).__name__ + ('X' if g.new_style else '')
            condition_name = Name(condition_name, 'camel')
            cat_name = Name(cat_name, 'camel')
            condition_log_probs = ret[condition_name][..., index.f_idx]
            # Disabled alternative: condition_log_probs.align_as(ret[cat_name])
            # Add the conditioning log-prob to every value of the dependent
            # category (broadcast over the last dimension).
            ret[cat_name] = ret[cat_name] + condition_log_probs.rename(None).unsqueeze(dim=-1)
    return ret
def __init__(self):
    """Create one square, zero-initialised adapter matrix per included category.

    The matrices are stored in ``self.adapters`` (an ``nn.ParameterDict``)
    under the category name; each side has ``len(get_enum_by_cat(cat))``
    entries.
    """
    super().__init__()
    adapters = {}
    for cat in Category:
        if not should_include(g.feat_groups, cat):
            continue
        size = len(get_enum_by_cat(cat))
        adapters[cat.name] = nn.Parameter(torch.zeros(size, size))
    self.adapters = nn.ParameterDict(adapters)
def _get_embeddings(self):
    """Build one uniformly initialised embedding table per included category.

    Each table has ``len(get_enum_by_cat(cat))`` rows and ``self.dim``
    columns, and is filled in place with values drawn from U(-0.1, 0.1).
    A warning is logged for every table that is initialised.

    Returns:
        An ``nn.ParameterDict`` keyed by category name.
    """
    tables = {}
    for cat in Category:
        if not should_include(g.feat_groups, cat):
            continue
        num_feats = len(get_enum_by_cat(cat))
        table = nn.Parameter(torch.zeros(num_feats, self.dim))
        tables[cat.name] = table
        logging.warning('dense feature embedding init')
        torch.nn.init.uniform_(table, -0.1, 0.1)
    return nn.ParameterDict(tables)
def analyze_scores(self, scores) -> Metrics:
    """Turn per-category (losses, weights) pairs into weighted loss metrics.

    Args:
        scores: Mapping whose values unpack as ``(losses, weights)``. Keys are
            tested with ``should_include(self.feat_groups, key)`` and must
            expose a ``snake`` attribute used in the metric name.

    Returns:
        A ``Metrics`` collection with one ``loss_<name>`` entry per included
        category and an aggregate ``loss`` entry over all of them.
    """
    metrics = Metrics()
    loss_sum = 0.0
    weight_sum = 0.0
    for name, (losses, weights) in scores.items():
        if not should_include(self.feat_groups, name):
            continue
        cat_loss = (losses * weights).sum()
        cat_weight = weights.sum()
        loss_sum += cat_loss
        weight_sum += cat_weight
        metrics += Metric(f'loss_{name.snake}', cat_loss, cat_weight)
    metrics += Metric('loss', loss_sum, weight_sum)
    return metrics
def analyze(
        self, scores: Dict[Cat, FT], return_scores: bool = False
) -> Union[Metrics, Tuple[Metrics, Dict[Cat, FT]]]:
    """Turn per-category (losses, weights) pairs into weighted loss metrics.

    Args:
        scores: Mapping whose values unpack as ``(losses, weights)``. Keys are
            tested with ``should_include(g.feat_groups, key)`` and must expose
            a ``snake`` attribute used in the metric name.
        return_scores: When true, also hand back ``scores`` unchanged.

    Returns:
        The ``Metrics`` collection (one ``loss_<name>`` entry per included
        category plus an aggregate ``loss`` entry), or a
        ``(metrics, scores)`` tuple if ``return_scores`` is set.
    """
    metrics = Metrics()
    loss_sum = 0.0
    weight_sum = 0.0
    for name, (losses, weights) in scores.items():
        if not should_include(g.feat_groups, name):
            continue
        cat_loss = (losses * weights).sum()
        cat_weight = weights.sum()
        loss_sum += cat_loss
        weight_sum += cat_weight
        metrics += Metric(f'loss_{name.snake}', cat_loss, cat_weight)
    metrics += Metric('loss', loss_sum, weight_sum)
    return (metrics, scores) if return_scores else metrics
def __init__(self,
             data_path,
             num_workers,
             feat_groups: 'p',
             family_file_path: 'p',
             num_lang_pairs: 'p',
             data=None):
    """Set up the loader over language-pair metric data.

    Args:
        data_path: Passed to ``_get_metric_data`` when ``data`` is not given.
        num_workers: Forwarded to the parent constructor.
        feat_groups: Feature-group spec deciding which categories are kept.
        family_file_path: Passed to ``_get_metric_data`` when ``data`` is not
            given.
        num_lang_pairs: Used as the parent's ``batch_size``.
        data: Pre-built table; when ``None`` it is loaded from ``data_path``.
            It must provide a ``'lang1'`` column.
    """
    table = data if data is not None else _get_metric_data(data_path, feat_groups, family_file_path)
    # Distinct first-language codes, sorted.
    self.all_langs = sorted(set(table['lang1']))
    included = [cat.name for cat in Category if should_include(feat_groups, cat)]
    self.cats = included + ['avg']
    super().__init__(table, batch_size=num_lang_pairs, num_workers=num_workers)
def effective_categories(self) -> List[Category]:
    """Return the categories enabled by ``g.feat_groups``, in ``Category`` iteration order."""
    return [cat for cat in Category if should_include(g.feat_groups, cat)]
def _get_scores(
        self, samples: LT, segments: Sequence[SegmentWindow], lengths: LT, feat_matrix: LT, source_padding: BT
) -> Tuple[PackedWords, DecipherModelScoreReturn]:
    """Pack the sampled words and compute all score components for them.

    Args:
        samples: Passed to ``self.pack`` and ``self._get_readable_scores``.
        segments: One entry per batch element; ``len(segments)`` is used as
            the batch size.
        lengths: Passed to ``self.pack``.
        feat_matrix: Passed to ``self.pack``.
        source_padding: Passed to ``self._get_readable_scores``.

    Returns:
        The packed words together with a ``DecipherModelScoreReturn`` holding
        the LM, word, in-vocab, readable, unreadable and phi scores.
    """
    batch_size = len(segments)
    # Segment lists are only forwarded to ``pack`` when a vocab is available.
    segment_list = [seg.segment_list for seg in segments] if self.vocab is not None else None
    packed_words = self.pack(samples, lengths, feat_matrix, segments, segment_list=segment_list)
    packed_words.word_feat_matrices = self._adapt(packed_words.word_feat_matrices)

    try:
        # TODO(j_luo) This is actually continuous batching.
        lm_batch = self._prepare_batch(packed_words)
        lm_scores = self._get_lm_scores(lm_batch)
        weighted_nlls = [
            nll * weight
            for cat, (nll, weight) in lm_scores.items()
            if should_include(g.feat_groups, cat)
        ]
        # Sum over the included categories, then divide by the batch lengths.
        flat_nlls = sum(weighted_nlls) / lm_batch.lengths
        num_words = packed_words.word_lengths.size('batch_word')
        num_positions = packed_words.word_positions.size('position')
        # Split the flat batch dimension into (word, position) and sum over
        # the positions of each word.
        word_nlls = flat_nlls.unflatten('batch', [('batch_word', num_words), ('position', num_positions)])
        word_nlls = word_nlls.sum(dim='position')
        lm_score, in_vocab_score = self._unpack(word_nlls, packed_words, batch_size)
    except EmptyPackedWords:
        # Nothing to score: fall back to all-zero LM and in-vocab scores.
        lm_score = get_zeros(batch_size, packed_words.num_samples)
        in_vocab_score = get_zeros(batch_size, packed_words.num_samples)

    word_score = self._get_word_score(packed_words, batch_size)
    readable_score, unreadable_score = self._get_readable_scores(source_padding, samples)

    # Stack the individual components along a new 'feature' dimension and let
    # the phi scorer combine them.
    components = [lm_score, word_score, in_vocab_score, readable_score, unreadable_score]
    features = torch.stack(components, new_name='feature')
    phi_score = self.phi_scorer(features).squeeze('score')

    model_scores = DecipherModelScoreReturn(lm_score, word_score, in_vocab_score, readable_score,
                                            unreadable_score, phi_score)
    return packed_words, model_scores