예제 #1
0
def test_scispacy_sentence_splitter():
    """Verify sentence boundaries produced by ``SciSpacySentenceSplitter``.

    A two-sentence biomedical snippet is split, and for each resulting
    sentence the start offset within the original text and the number of
    tokens are checked.
    """
    splitter = SciSpacySentenceSplitter()
    text = "VF inhibits something. ACE-dependent (GH+) issuses too."
    result = splitter.split(text)

    # Exactly two sentences are expected before indexing into the result.
    assert len(result) == 2

    first = result[0]
    assert first.start_pos == 0
    assert len(first.tokens) == 4

    # The second sentence begins right after "VF inhibits something. ".
    second = result[1]
    assert second.start_pos == 23
    assert len(second.tokens) == 9
예제 #2
0
import argparse

from flair.models.sequence_tagger_model import MultiTagger
from flair.tokenization import SciSpacySentenceSplitter
from pathlib import Path
from tqdm import tqdm

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=True, type=Path)
    parser.add_argument("--output_file", required=True, type=Path)
    args = parser.parse_args()

    sentence_splitter = SciSpacySentenceSplitter()
    tagger = MultiTagger.load("hunflair-paper")

    args.output_file.parent.mkdir(parents=True, exist_ok=True)

    with args.input_file.open("r") as f_in, args.output_file.open(
            "w") as f_out:
        lines = f_in.readlines()
        for line in tqdm(lines, total=len(lines)):
            fname, text = line.split("\t")
            sentences = sentence_splitter.split(text)
            tagger.predict(sentences)

            for sentence in sentences:
                for entity in tagger.get_all_spans(sentence):
                    start = entity.start_pos + sentence.start_pos
                    end = entity.end_pos + sentence.start_pos
                    f_out.write(