def get_auto_evidences(name, abbreviations, abbrvs_trie):
    # Collect every evidence registered for an abbreviation fragment that
    # occurs in the normalized name; deduplicate before returning.
    frags = EvidenceFinder.find_names(normalize_dataset_ws(name), abbrvs_trie)
    evidences = []
    for f in frags:
        evidences.extend(abbreviations[f])
    return list(set(evidences))
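# Usage sketch (hypothetical data; the trie construction lives elsewhere in
# this module, so it is elided here):
#
#     abbreviations = {"sst": ["stanford sentiment treebank"]}
#     abbrvs_trie = ...  # trie built over abbreviations.keys()
#     get_auto_evidences("SST-2", abbreviations, abbrvs_trie)
#     # -> ["stanford sentiment treebank"], assuming find_names matches "sst"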
def __call__(self, text):
    text = normalize_cell_ws(normalize_dataset_ws(text))
    ds = self.evidence_finder.find_datasets(text)
    ts = self.evidence_finder.find_tasks(text)
    ms = self.evidence_finder.find_metrics(text)
    # Task and metric matches take precedence: drop them from the dataset set.
    ds -= ts
    ds -= ms
    return ts, ds, ms
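# Usage sketch (illustrative; `extractor` stands for an instance of this class,
# and the exact matches depend on the tries its EvidenceFinder was built with):
#
#     ts, ds, ms = extractor("Accuracy on ImageNet classification")
#     # ts, ds, ms hold the task, dataset and metric evidences found in the text.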
def get_basic_dicts(taxonomy):
    # Tasks keep only their normalized name; datasets and metrics additionally
    # get their individual non-stop-word tokens via evidences_from_name.
    tasks = {ts: [normalize_dataset_ws(ts)] for ts in taxonomy.tasks}
    datasets = {ds: EvidenceFinder.evidences_from_name(ds) for ds in taxonomy.datasets}
    metrics = {ms: EvidenceFinder.evidences_from_name(ms) for ms in taxonomy.metrics}
    return tasks, datasets, metrics
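# Usage sketch (hypothetical taxonomy object with `tasks`, `datasets` and
# `metrics` attributes):
#
#     tasks, datasets, metrics = EvidenceFinder.get_basic_dicts(taxonomy)
#     # tasks["Image Classification"] -> ["image classification"]
#     # datasets["CIFAR-100"]         -> full normalized name plus its tokens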
def compute_context_logprobs(self, context, noise, ms_noise, ts_noise, logprobs, axes_logprobs):
    if isinstance(context, str) or context is None:
        # Raw text context: normalize it and gather evidence mentions.
        context = context or ""
        context = normalize_cell_ws(normalize_dataset_ws(context))
        dss = self.evidence_finder.find_datasets(context)
        mss = self.evidence_finder.find_metrics(context)
        tss = self.evidence_finder.find_tasks(context)
        # Metric and task matches take precedence over dataset matches.
        dss -= mss
        dss -= tss
    else:
        # Precomputed context: (tasks, datasets, metrics) evidence counters.
        tss, dss, mss = context
        dss = {normalize_cell(ds): count for ds, count in dss.items()}
        mss = {normalize_cell(ms): count for ms, count in mss.items()}
        tss = {normalize_cell(ts): count for ts, count in tss.items()}
    dss = self._numba_extend_dict(dss)
    mss = self._numba_extend_dict(mss)
    tss = self._numba_extend_dict(tss)
    # Memoize on the evidence counters and noise levels, so repeated contexts
    # reuse a single compute_logprobs call.
    key = (self._hash_counter(tss), self._hash_counter(dss), self._hash_counter(mss),
           noise, ms_noise, ts_noise)
    if key not in self.logprobs_cache:
        lp, alp = compute_logprobs(
            self._taxonomy, self._taxonomy_tasks, self._taxonomy_datasets, self._taxonomy_metrics,
            self.reverse_merged_p, self.reverse_metrics_p, self.reverse_tasks_p,
            dss, mss, tss, noise, ms_noise, ts_noise,
            self.ds_pb, self.ms_pb, self.ts_pb, self.max_repetitions)
        self.logprobs_cache[key] = (lp, alp)
    else:
        lp, alp = self.logprobs_cache[key]
    logprobs += lp
    axes_logprobs[0] += alp[0]
    axes_logprobs[1] += alp[1]
    axes_logprobs[2] += alp[2]
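# Usage sketch (illustrative; `cs` is an instance of this class, and the
# accumulators are assumed to be numpy arrays sized to the taxonomy):
#
#     logprobs = np.zeros(n_entries)
#     axes_logprobs = [np.zeros(n_tasks), np.zeros(n_datasets), np.zeros(n_metrics)]
#     cs.compute_context_logprobs("Accuracy on ImageNet", noise, ms_noise, ts_noise,
#                                 logprobs, axes_logprobs)
#     # logprobs and axes_logprobs are updated in place with this context's scores.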
def evidences_from_name(key):
    x = normalize_dataset_ws(key)
    # Keep the individual non-stop words as extra evidences only when more
    # than one of them remains; otherwise return the full name alone.
    y = [w for w in x.split() if w not in manual_dicts.stop_words]
    return [x] + y if len(y) > 1 else [x]
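# Example (illustrative; the exact output depends on normalize_dataset_ws and
# manual_dicts.stop_words):
#
#     evidences_from_name("CIFAR-100 Dataset")
#     # -> e.g. ["cifar 100 dataset", "cifar", "100"] if "dataset" is a stop
#     #    word and the remaining tokens number more than one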