Example #1
def test_map_symmetric_difference():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    s = dict(a.symmetric_difference(b))
    assert len(s) == 2
    assert s['baz'] == ((0, 16),)
    assert s['moo'] == ((1, 64),)
Example #2
    @property
    def fst(self):
        # Lazily load the FST from disk on first access; fall back to
        # an empty map until a file has been written.
        if self._fst is None:
            if os.path.exists(self._path):
                self._fst = Map(self._path)
            else:
                self._fst = Map.from_iter([])
        return self._fst
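For context, the accessor above lazily loads the FST from disk on first access and falls back to an empty map when no file exists yet. A minimal self-contained sketch of the same pattern (the class name and constructor are hypothetical):

import os
from rust_fst import Map

class LazyFstStore:
    # Hypothetical wrapper; only the lazy-load pattern mirrors the
    # example above.
    def __init__(self, path):
        self._path = path
        self._fst = None

    @property
    def fst(self):
        if self._fst is None:
            if os.path.exists(self._path):
                self._fst = Map(self._path)
            else:
                self._fst = Map.from_iter([])
        return self._fst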
Example #3
    def flush(self):
        # Write buffered values to the log
        for value in self._log_cache.values():
            # Record the current log offset and the value length in the index
            offset = self.log.tell()
            idx_row = offset.to_bytes(4, 'big', signed=False)
            idx_row += len(value).to_bytes(4, 'big', signed=False)
            self.idx.write(idx_row)
            # Append payload
            self.log.write(value)

        # Update fst
        # TODO use same dict to store the tuple (value, offset)
        new_fst = Map.from_iter(sorted(self._fst_cache.items()))
        tmp_path = f'{self.fst_path}-tmp'
        with Map.build(tmp_path) as tmp_map:
            for k, vals in self.fst.union(new_fst):
                # Merge duplicate keys by keeping the largest value
                tmp_map.insert(k, max(v.value for v in vals))
        # Replace the old fst file with the new one
        os.rename(tmp_path, self.fst_path)

        # Close file descriptors
        self.log.close()
        self.idx.close()
        # Deallocate
        self._idx = None
        self._fst = None
        self._log = None
        self._cache_size = 0
        self._log_cache = OrderedDict()
        self._fst_cache = {}
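Read-side note: each index row written above is exactly 8 bytes (a 4-byte big-endian log offset followed by a 4-byte payload length), so a record can be recovered by seeking to row * 8 in the index. A minimal sketch under that assumption (the function name and file handles are hypothetical):

def read_record(idx_fh, log_fh, row):
    # Each index row is 8 bytes: 4-byte offset + 4-byte length (big-endian).
    idx_fh.seek(row * 8)
    raw = idx_fh.read(8)
    offset = int.from_bytes(raw[:4], 'big')
    length = int.from_bytes(raw[4:], 'big')
    # Seek into the log and read the payload back.
    log_fh.seek(offset)
    return log_fh.read(length)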
Example #4
def test_map_union():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    u = dict(a.union(b))
    assert len(u) == 3
    assert u['bar'] == ((0, 8), (1, 32))
    assert u['baz'] == ((0, 16),)
    assert u['moo'] == ((1, 64),)
Example #5
def test_map_union():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    u = dict(a.union(b))
    assert len(u) == 3
    bar_itms = [(itm.index, itm.value) for itm in u['bar']]
    assert bar_itms == [(0, 8), (1, 32)]
    baz_itms = [(itm.index, itm.value) for itm in u['baz']]
    assert baz_itms == [(0, 16)]
    moo_itms = [(itm.index, itm.value) for itm in u['moo']]
    assert moo_itms == [(1, 64)]
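Examples #4 and #5 test the same union but expect different result shapes: plain (index, value) tuples in one, and objects exposing .index and .value in the other, which suggests the return type changed across rust_fst versions. A small helper that normalizes either shape (the helper itself is hypothetical):

from rust_fst import Map

def as_pairs(vals):
    # Accept either plain tuples or IndexedValue-style objects and
    # return uniform (index, value) tuples.
    return [v if isinstance(v, tuple) else (v.index, v.value) for v in vals]

a = Map.from_iter({'bar': 8, 'baz': 16})
b = Map.from_iter({'bar': 32, 'moo': 64})
u = dict(a.union(b))
assert as_pairs(u['bar']) == [(0, 8), (1, 32)]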
Example #6
    def flush(self):
        tmp_path = f'{self._path}-tmp'
        new_fst = Map.from_iter(sorted(self._cache.items()))
        with Map.build(tmp_path) as tmp_map:
            for k, vals in self.fst.union(new_fst):
                # Merge duplicate keys by keeping the largest value
                tmp_map.insert(k, max(v.value for v in vals))
        # Replace the old fst file with the new one
        os.rename(tmp_path, self._path)

        # Deallocate
        self._cache = {}
        self._fst = None
Example #7
    def fit_evaluate(self, documents_train: List[Document],
                     documents_eval: List[Document]) -> Score:
        model = build_frequency_dict(documents_train)

        mentions = []
        labels = []

        # Just use the entity that was most often linked with this mention
        for mention, candidates in model.items():
            if candidates:
                label = max(candidates, key=candidates.get)
            else:
                label = ""

            mentions.append(mention)
            labels.append(label)

        le = LabelEncoder()
        le.fit(labels)
        items = [(k, v)
                 for k, v in sorted(zip(mentions, le.transform(labels)))]

        m = Map.from_iter(items)

        gold = {}
        predictions = defaultdict(set)

        # Predict
        for doc in tqdm(documents_eval):
            for sentence in doc.sentences:
                if not sentence.entities:
                    continue

                for entity in sentence.entities.values():
                    key = (doc.name, sentence.idx, entity.start, entity.end)
                    gold[key] = entity.uri

                for n in [1, 2, 3]:
                    for (begin, end, mention) in generate_ngrams(sentence, n):
                        if len(mention) <= 3:
                            continue

                        for match, label_id in m.search(term=mention,
                                                        max_dist=self.n):
                            # Only consider matches that have the same tokenization (heuristic)
                            if len(match) <= 3 or match.count(" ") != mention.count(" "):
                                continue

                            label = le.inverse_transform([label_id])[0]
                            key = (doc.name, sentence.idx, begin, end)
                            predictions[key].add(label)

        return precision_recall_f1(gold, predictions)
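The evaluation above depends on a generate_ngrams helper that is not shown. From the way it is consumed, it yields (begin, end, mention) triples per sentence; a plausible sketch, assuming a sentence exposes an ordered token sequence with surface text (the attribute names tokens and text are assumptions):

def generate_ngrams(sentence, n):
    # Yield (begin, end, surface) for every n-gram over the sentence's
    # tokens; whether begin/end are token or character offsets must
    # match entity.start/entity.end in the caller.
    tokens = list(sentence.tokens)
    for i in range(len(tokens) - n + 1):
        yield i, i + n, " ".join(t.text for t in tokens[i:i + n])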
Example #8
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        model = self._load_model(user_id)

        if model is None:
            return

        le, items = model

        m = Map.from_iter(items)

        # We iterate over all candidate spans (longer n-grams first) and check whether they match
        for (begin, end, term) in chain(
            self._generate_candidates(cas, 3), self._generate_candidates(cas, 2), self._generate_candidates(cas, 1)
        ):
            for mention, label_id in m.search(term=term, max_dist=2):
                label = le.inverse_transform([label_id])[0]
                prediction = create_prediction(cas, layer, feature, begin, end, label)
                cas.add_annotation(prediction)
Example #9
from rust_fst import Map

# The container file ends with a 2-byte big-endian length for the
# index FST that immediately precedes it.
with open('output.tun', 'rb') as fh:
    content = fh.read()

idx_len = int.from_bytes(content[-2:], 'big')
print(idx_len)
idx_end = len(content) - 2
idx_start = idx_end - idx_len

# Extract the index FST; it maps keys to byte offsets inside the container.
with open('tmp.fst', 'wb') as tmp_fh:
    tmp_fh.write(content[idx_start:idx_end])

m = Map(path='tmp.fst')
items = list(m.items())
items.sort(key=lambda x: -x[1])  # highest offset first
print(items)
offsets = [o for _, o in items]

# Walk consecutive offsets and load each embedded FST segment.
for end, start in zip(offsets, offsets[1:]):
    if start == end:
        continue
    with open('tmp.fst', 'wb') as tmp_fh:
        tmp_fh.write(content[start:end])
    print(start, end)
    m = Map(path='tmp.fst')
    print(dict(m))
Example #10
def test_map_intersection():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    i = dict(a.intersection(b))
    assert len(i) == 1
    assert i['bar'] == ((0, 8), (1, 32))
Example #11
def test_map_difference():
    a = Map.from_iter({'bar': 8, 'baz': 16})
    b = Map.from_iter({'bar': 32, 'moo': 64})
    d = dict(a.difference(b))
    assert len(d) == 1
    assert d['baz'] == ((0, 16),)
Example #12
def do_build(path=None, items=TEST_ITEMS, sorted_=True):
    if sorted_:
        it = sorted(items)
    else:
        it = items
    return Map.from_iter(it=it, path=path)
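Map.from_iter requires its keys in sorted order, which is why the helper sorts by default; the sorted_=False branch presumably exists to exercise the library's rejection of unsorted input. A hedged usage sketch (the exact exception type is an assumption, hence the broad except):

from rust_fst import Map

items = [('foo', 1), ('bar', 2)]  # deliberately unsorted

try:
    Map.from_iter(items)
except Exception as exc:  # exact exception type depends on the library version
    print('unsorted input rejected:', exc)

m = Map.from_iter(sorted(items))
assert dict(m) == {'bar': 2, 'foo': 1}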
Example #13
    def _get_fst_map(self):
        # Load the link FST from disk, or fall back to an empty map.
        if os.path.exists(self.link_fst):
            return Map(self.link_fst)
        return Map.from_iter([])