예제 #1
0
파일: cached.py 프로젝트: schlevik/tmdm
class Cached(Provider):
    """Provider that serves annotations out of a cache keyed by document id.

    The cached entry for a document is optionally passed through ``getter``
    and then converted by the configured ``schema`` callable. The ``OFFSETS``
    schema means no conversion is applied.
    """
    cache: Dict[str, Any]
    name = 'cached'
    # Maps a schema name to a converter called as ``converter(doc, annotation)``.
    known_schemas = {
        # these assume same tokenisation
        "bio":
        lambda doc, annotation: offsets_from_biluo_tags(
            iob_to_biluo(doc, annotation)),
        "bilou":
        offsets_from_biluo_tags,
        "offsets":
        OFFSETS,
        "list_of_clusters":
        convert_clusters_to_offsets,
        # these provide their own tokenisation

        # annotation: List[Tuple[str,str]]
        "list_of_tuples_bio_flat":
        lambda doc, annotation: get_offsets(doc.text, annotation),

        # annotation: List[List[Tuple[str,str]]]
        "list_of_tuples_bio_stacked":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, annotation),

        # annotation: Tuple[List[str],List[str]]
        "tuple_of_lists_flat":
        lambda doc, annotation: get_offsets(doc.text, zip(*annotation[:2])),

        # NOTE(review): this entry and the next one flatten the input into a
        # single stream of (word, label) pairs before handing it to
        # get_offsets_from_sentences, whereas "list_of_tuples_bio_stacked"
        # passes per-sentence lists to the same helper -- confirm which shape
        # the helper actually expects.

        # annotation: List[Tuple[List[str], List[str]]]
        "list_of_tuples_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for t in annotation for w, l in zip(*t[:2]))),

        # annotation: Tuple[List[List[str]], List[List[str]]]
        "tuple_of_lists_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for ws, ls in zip(*annotation[:2])
                       for w, l in zip(ws, ls)))

        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any],
                                             OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        """Set up an empty cache and optionally load one from ``path``.

        :param schema: key of ``known_schemas``, or a callable taking
            ``(doc, annotation)``; a falsy value selects ``OFFSETS``.
        :param getter: optional callable applied to the cached entry before
            the schema conversion.
        :param path: if given, the cache is loaded from it immediately.
        """
        self.cache = {}
        self.loaded = False
        self.schema = self._resolve_schema(schema)
        self.getter = getter
        if path:
            self.load(path)

    def _resolve_schema(self, schema):
        """Translate the ``schema`` constructor argument into a converter."""
        if not schema:
            return OFFSETS
        try:
            is_known = schema in self.known_schemas
        except TypeError:
            # Unhashable objects cannot be dictionary keys; they may still be
            # valid callables, so fall through instead of crashing.
            is_known = False
        if is_known:
            # Look the converter up on the same mapping the membership test
            # used, so subclasses overriding ``known_schemas`` are honoured.
            return self.known_schemas[schema]
        if callable(schema):
            return schema
        logger.warning(
            f"unknown schema {schema!r} for {self.__class__.__name__}, "
            f"annotations will be returned unconverted")
        return None

    @overrides
    def save(self, path: str):
        """Write the current cache to ``path`` via ``util.save_file``."""
        util.save_file(self.cache, path)

    # TODO: guess schema

    @overrides
    def load(self, path):
        """Replace the cache with ``util.load_file(path)`` and mark it loaded."""
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        """Return the cached annotation for ``doc``, converted by the schema.

        :raises ValueError: if no cache has been loaded yet.
        :return: the converted annotation, the raw cached entry when no
            schema is set, or ``None`` when the cache holds no (or an empty)
            entry for ``doc._.id``.
        """
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if not annotations:
            return None
        if not self.schema:
            logger.info(
                f"no schema loaded for {self.__class__.__name__}, good luck!"
            )
            return annotations
        extracted = self.getter(annotations) if self.getter else annotations
        if self.schema == OFFSETS:
            return extracted
        return self.schema(doc, extracted)
예제 #2
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_commas_in_between():
    """Offsets stay correct when a comma is attached to the preceding word."""
    text = "I like, wedding cake cakes."
    tokens = 'I like , wedding cake cakes .'.split()
    tags = "O O O B-CAKE I-CAKE B-CAKE O".split()
    expected = [(8, 20, "CAKE"), (21, 26, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #3
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_funky_spacing():
    """Offsets stay correct when the text contains runs of several spaces."""
    text = "I like ,    wedding cake cake."
    tokens = 'I like , wedding cake cake .'.split()
    tags = "O O O B-CAKE I-CAKE B-CAKE O".split()
    expected = [(12, 24, "CAKE"), (25, 29, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #4
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_last_tag():
    """An entity on the very last token is still reported."""
    logger.info(f"Working dir: {os.getcwd()}")
    text = "I like cakes"
    tags = "O O B-CAKE".split()
    pairs = list(zip(text.split(), tags))
    expected = [(7, 12, "CAKE")]
    assert get_offsets(text, pairs) == expected
예제 #5
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_last_longer_tags():
    """A multi-token entity that ends the text is reported as one span."""
    text = "I like big cakes"
    tags = "O O B-CAKE I-CAKE".split()
    pairs = list(zip(text.split(), tags))
    expected = [(7, 16, "CAKE")]
    assert get_offsets(text, pairs) == expected
예제 #6
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_consecutive_tags():
    """Two adjacent entities are reported as two separate spans."""
    text = "I like wedding cake cakes."
    tokens = 'I like wedding cake cakes .'.split()
    tags = "O O B-CAKE I-CAKE B-CAKE O".split()
    expected = [(7, 19, "CAKE"), (20, 25, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #7
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_bio_tags():
    """A B-/I- tagged token pair is merged into a single span."""
    text = "I like big cakes."
    tokens = 'I like big cakes .'.split()
    tags = "O O B-CAKE I-CAKE O".split()
    expected = [(7, 16, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #8
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_wrong_capitalisation():
    """Tokens whose casing differs from the text are still aligned."""
    text = "I like cakes."
    tokens = 'i like cakes .'.split()
    tags = "O O B-CAKE O".split()
    expected = [(7, 12, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #9
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_get_offsets_works_with_sane_text():
    """Baseline case: plain text with a single one-token entity."""
    text = "I like cakes."
    tokens = 'I like cakes .'.split()
    tags = "O O B-CAKE O".split()
    expected = [(7, 12, "CAKE")]
    assert get_offsets(text, list(zip(tokens, tags))) == expected
예제 #10
0
파일: test_util.py 프로젝트: schlevik/tmdm
def test_offset_latch_match_returns_position_of_last_token():
    """With return_last_match=True the second result is the last match position."""
    text = "I like ,    wedding cake cake troll"
    tokens = 'I like , wedding cake cake troll'.split()
    tags = "O O O B-CAKE I-CAKE B-CAKE O".split()
    result = get_offsets(text, list(zip(tokens, tags)), return_last_match=True)
    _, last_match = result
    assert last_match == 35