def test_map_symmetric_difference():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    s = dict(a.symmetric_difference(b))
    assert len(s) == 2
    assert s['baz'] == ((0, 16),)
    assert s['moo'] == ((1, 64),)
def fst(self):
    # Lazily open the on-disk map; fall back to an empty in-memory map
    if self._fst is None:
        if os.path.exists(self._path):
            self._fst = Map(self._path)
        else:
            self._fst = Map.from_iter([])
    return self._fst
def flush(self):
    # Write to log
    for value in self._log_cache.values():
        # Concat current log offset and value length in idx
        offset = self.log.tell()
        idx_row = offset.to_bytes(4, 'big', signed=False)
        idx_row += len(value).to_bytes(4, 'big', signed=False)
        self.idx.write(idx_row)
        # Append payload
        self.log.write(value)

    # Update fst
    # TODO use same dict to store the tuple (value, offset)
    new_fst = Map.from_iter(sorted(self._fst_cache.items()))
    tmp_path = f'{self.fst_path}-tmp'
    with Map.build(tmp_path) as tmp_map:
        for k, vals in self.fst.union(new_fst):
            tmp_map.insert(k, max(v.value for v in vals))

    # Rename tmp file
    os.rename(tmp_path, self.fst_path)

    # Close file descriptors
    self.log.close()
    self.idx.close()

    # Deallocate
    self._idx = None
    self._fst = None
    self._log = None
    self._cache_size = 0
    self._log_cache = OrderedDict()
    self._fst_cache = {}
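# A minimal reader-side sketch, not part of the source: flush() above writes
# fixed 8-byte idx rows (a 4-byte big-endian offset followed by a 4-byte
# big-endian length), so the n-th payload can be located and read back from
# the log file. The function name and the file handles are hypothetical.
def read_entry(idx_fh, log_fh, n):
    idx_fh.seek(n * 8)
    row = idx_fh.read(8)
    offset = int.from_bytes(row[:4], 'big', signed=False)
    length = int.from_bytes(row[4:], 'big', signed=False)
    log_fh.seek(offset)
    return log_fh.read(length)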
def test_map_union():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    u = dict(a.union(b))
    assert len(u) == 3
    assert u['bar'] == ((0, 8), (1, 32))
    assert u['baz'] == ((0, 16),)
    assert u['moo'] == ((1, 64),)
def test_map_union():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    u = dict(a.union(b))
    assert len(u) == 3
    bar_itms = [(itm.index, itm.value) for itm in u['bar']]
    assert bar_itms == [(0, 8), (1, 32)]
    baz_itms = [(itm.index, itm.value) for itm in u['baz']]
    assert baz_itms == [(0, 16)]
    moo_itms = [(itm.index, itm.value) for itm in u['moo']]
    assert moo_itms == [(1, 64)]
def flush(self):
    tmp_path = f'{self._path}-tmp'
    new_fst = Map.from_iter(sorted(self._cache.items()))
    with Map.build(tmp_path) as tmp_map:
        for k, vals in self.fst.union(new_fst):
            tmp_map.insert(k, max(v.value for v in vals))

    # Rename tmp file
    os.rename(tmp_path, self._path)

    # Deallocate
    self._cache = {}
    self._fst = None
def fit_evaluate(self, documents_train: List[Document], documents_eval: List[Document]) -> Score:
    model = build_frequency_dict(documents_train)
    mentions = []
    labels = []

    # Just use the entity that was most often linked with this mention
    for mention, candidates in model.items():
        if candidates:
            label = max(candidates, key=candidates.get)
        else:
            label = ""
        mentions.append(mention)
        labels.append(label)

    le = LabelEncoder()
    le.fit(labels)
    items = [(k, v) for k, v in sorted(zip(mentions, le.transform(labels)))]
    m = Map.from_iter(items)

    gold = {}
    predictions = defaultdict(set)

    # Predict
    for doc in tqdm(documents_eval):
        for sentence in doc.sentences:
            if not len(sentence.entities):
                continue

            for entity in sentence.entities.values():
                key = (doc.name, sentence.idx, entity.start, entity.end)
                gold[key] = entity.uri

            for n in [1, 2, 3]:
                for (begin, end, mention) in generate_ngrams(sentence, n):
                    if len(mention) <= 3:
                        continue

                    for match, label_id in m.search(term=mention, max_dist=self.n):
                        # Only consider matches that have the same tokenization (heuristic)
                        if len(match) <= 3 or match.count(" ") != mention.count(" "):
                            continue

                        label = le.inverse_transform([label_id])[0]
                        key = (doc.name, sentence.idx, begin, end)
                        predictions[key].add(label)

    return precision_recall_f1(gold, predictions)
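# A minimal sketch of what build_frequency_dict could return, assuming the
# Document/sentence/entity attributes used in fit_evaluate above; its body is
# not shown in the source, and mention_text is a hypothetical helper. It maps
# each surface mention to a count of the entity URIs it was linked with, so
# max(candidates, key=candidates.get) picks the most frequent link.
from collections import Counter, defaultdict

def build_frequency_dict_sketch(documents):
    model = defaultdict(Counter)
    for doc in documents:
        for sentence in doc.sentences:
            for entity in sentence.entities.values():
                mention = mention_text(sentence, entity)  # hypothetical helper
                model[mention][entity.uri] += 1
    return model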
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model = self._load_model(user_id)

    if model is None:
        return

    le, items = model
    m = Map.from_iter(items)

    # We iterate over all the candidates and check whether they match
    for (begin, end, term) in chain(
        self._generate_candidates(cas, 3),
        self._generate_candidates(cas, 2),
        self._generate_candidates(cas, 1),
    ):
        for mention, label_id in m.search(term=term, max_dist=2):
            label = le.inverse_transform([label_id])[0]
            prediction = create_prediction(cas, layer, feature, begin, end, label)
            cas.add_annotation(prediction)
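# A sketch of what _generate_candidates might look like (assumed, not from
# the source): it yields (begin, end, text) spans for every n-gram of tokens
# in the CAS. The DKPro token type name is an assumption about the project's
# type system.
TOKEN_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"

def _generate_candidates(cas, n):
    tokens = list(cas.select(TOKEN_TYPE))
    for i in range(len(tokens) - n + 1):
        begin = tokens[i].begin
        end = tokens[i + n - 1].end
        yield begin, end, cas.sofa_string[begin:end]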
from rust_fst import Map

content = open('output.tun', 'rb').read()

# The last 2 bytes store the length of the index fst (big-endian)
idx_len = int.from_bytes(content[-2:], 'big')
print(idx_len)

idx_end = len(content) - 2
idx_start = idx_end - idx_len
with open('tmp.fst', 'wb') as tmp_fh:
    tmp_fh.write(content[idx_start:idx_end])

m = Map(path='tmp.fst')
items = list(m.items())
items.sort(key=lambda x: -x[1])
print(items)

# Walk the offsets in descending order; consecutive offsets delimit the
# individual fst segments stacked in the file
offsets = [o for _, o in items]
for end, start in zip(offsets, offsets[1:]):
    if start == end:
        continue
    with open('tmp.fst', 'wb') as tmp_fh:
        tmp_fh.write(content[start:end])
    print(start, end)
    m = Map(path='tmp.fst')
    print(dict(m))
def test_map_intersection():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    i = dict(a.intersection(b))
    assert len(i) == 1
    assert i['bar'] == ((0, 8), (1, 32))
def test_map_difference():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    d = dict(a.difference(b))
    assert len(d) == 1
    assert d['baz'] == ((0, 16),)
def do_build(path=None, items=TEST_ITEMS, sorted_=True):
    if sorted_:
        it = sorted(items)
    else:
        it = items
    return Map.from_iter(it=it, path=path)
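# A small usage sketch, not from the source: the item list here is made up.
# With path=None the map is built in memory; with a path it is written to
# that file and opened from disk.
items = [('bar', 1), ('baz', 2), ('foo', 0)]

m = do_build(items=items)                    # in-memory map
assert dict(m) == dict(items)

m = do_build(path='test.fst', items=items)   # file-backed map
assert m['baz'] == 2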
def _get_fst_map(self):
    if os.path.exists(self.link_fst):
        return Map(self.link_fst)
    return Map.from_iter([])