from xml_extractions.extract_node_values import get_paragraph_from_file


def test_get_paragraph():
    result_keep_no_annotation = get_paragraph_from_file(path="./resources/test/test.xml",
                                                        keep_paragraph_without_annotation=True)
    result_keep_with_annotation = get_paragraph_from_file(path="./resources/test/test.xml",
                                                          keep_paragraph_without_annotation=False)
    assert len(result_keep_no_annotation) == 27
    assert len(result_keep_with_annotation) == 3
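
# Inference from the assertions above (not stated elsewhere in the source):
# keep_paragraph_without_annotation=True returns every paragraph of test.xml (27),
# while False keeps only the paragraphs that carry at least one annotation (3).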


def annotate(model_dir_path: str, files_dir_path: List[str],
             out_dir_path: str) -> None:
    """
    Annotate a sample of the given XML / TXT files and save the annotations into the given directory.

    :param model_dir_path: the directory of the spaCy model
    :param files_dir_path: the paths of the XML / TXT files to annotate
    :param out_dir_path: the directory where to write the annotations
    """

    logging.info("Loading NER model…")
    nlp = get_empty_model(load_labels_for_training=False)
    nlp = nlp.from_disk(model_dir_path)

    # TODO remove when we have retrained
    infixes = nlp.Defaults.infixes + [r':', r"(?<=[\W\d_])-|-(?=[\W\d_])"]
    infixes_regex = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infixes_regex.finditer
    # end of the block to remove after retraining
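    # With the extra infixes above, tokens are also split on ':' and on hyphens
    # adjacent to a digit or non-word character, e.g. "2019-2020" becomes three
    # tokens: "2019", "-", "2020" (expected behaviour, not re-verified here).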

    entity_typename_builder = EntityTypename()

    logging.info("Loading cases…")

    cases: List[Case] = list()
    for path in files_dir_path:
        if path.endswith(".xml"):
            case: Case = get_paragraph_from_file(
                path=path, keep_paragraph_without_annotation=True)
            cases.append(case)
        elif path.endswith(".txt"):
            with open(path) as f:
                lines = f.readlines()
                case: Case = list()
                for line in lines:
                    clean_text = line.strip()
                    if len(clean_text) > 1:
                        basename = os.path.basename(path)
                        basename = basename.split(".")[0]
                        case.append(
                            Paragraph(basename, clean_text, list(), list()))
                cases.append(case)
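
# Hypothetical usage sketch (all values below are placeholders, not from the source):
# annotate(model_dir_path="./resources/model",
#          files_dir_path=["./resources/test/test.xml", "./resources/test/case.txt"],
#          out_dir_path="./resources/annotations")

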
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".xml")
    ]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML",
              desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            # keep at most the first top_n paragraphs of each file
            for paragraph in paragraphs[:top_n]:
                if len(paragraph.text) > 0:
                    s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                    sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise ValueError(
            "No example loaded; either the provided path contains no case or the sample size is too high"
        )

    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")
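    # embedding_storage_mode="cpu" keeps the computed embeddings in CPU memory
    # after prediction; "none" (used further below) discards them to save memory.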

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
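
# Hypothetical invocation (argument values are placeholders):
# main(data_folder="./resources/xml", model_folder="./resources/flair_model", top_n=5)
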
print(len(sys.argv))
assert len(sys.argv) <= 2

if len(sys.argv) == 2:
    param = sys.argv[1]
    train_dataset = "train_data_set" == param
    export_dataset = "export_dataset" == param
else:
    train_dataset = False
    export_dataset = False
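# e.g. running `python <script> train_data_set` sets train_dataset=True,
# `python <script> export_dataset` sets export_dataset=True, and no argument
# leaves both flags False (display mode below); `<script>` is a placeholder.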

TRAIN_DATA: List[Paragraph] = list()
for filename in os.listdir(xml_train_path):
    if filename.endswith(".xml"):
        current_path = os.path.join(xml_train_path, filename)
        TRAIN_DATA += get_paragraph_from_file(
            path=current_path, keep_paragraph_without_annotation=True)

if (not train_dataset) and (not export_dataset):
    print("Prepare training set for display")
    TRAIN_DATA = TRAIN_DATA[:number_of_paragraph_to_display]
elif train_dataset:
    print("Train model")
else:
    print("Save recurrent entities")
print("folder_path:{}".format(xml_train_path))
case_header_content = parse_xml_headers(folder_path=xml_train_path)
print("header_content:{}".format(case_header_content))
current_case_paragraphs = list()
current_case_offsets = list()
previous_case_id: Optional[str] = None
current_item_header = None
# (the construction of `files` is truncated in the source; a plausible
# reconstruction, assuming the same xml_train_path folder used above)
files = [os.path.join(xml_train_path, filename)
         for filename in os.listdir(xml_train_path)]
random.shuffle(files)

with open("./resources/training_data/generated_annotations.txt",
          mode='w') as generated_text:
    with open("./resources/training_data/generated_annotations.ent",
              mode='w') as generated_entities:
        for filename in files:
            if filename.endswith(".xml"):
                try:
                    print(f"--- {filename} ---")
                    text_lines: List[str] = list()
                    offset_lines: List[str] = list()
                    print("read XML")
                    generated_paragraphs: List[Sentence] = list()
                    paragraphs = get_paragraph_from_file(
                        path=filename, keep_paragraph_without_annotation=True)
                    if len(paragraphs) > 50000:
                        for paragraph in paragraphs:  # type: Paragraph
                            if len(paragraph.text) > 0:
                                s = Sentence(text=paragraph.text,
                                             tokenizer=tokenizer)
                                generated_paragraphs.append(s)

                        # Flair's tagger.predict annotates the Sentence objects
                        # in place (newer versions return None, so keep the list)
                        tagger.predict(sentences=generated_paragraphs,
                                       mini_batch_size=32,
                                       verbose=True,
                                       embedding_storage_mode="none")

                        for sentence in generated_paragraphs:
                            text_lines.append(sentence.to_original_text())
Example #6

from xml_extractions.extract_node_values import get_paragraph_from_file
from ner.model_factory import get_empty_model
from resources.config_provider import get_config_default
from viewer.spacy_viewer import view_spacy_docs

config_training = get_config_default()
model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
number_of_paragraph_to_display = int(
    config_training["number_of_paragraph_to_display"])

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)
DEV_DATA = list(DEV_DATA)[:number_of_paragraph_to_display]

doc_annotated = list()

nlp = get_empty_model(load_labels_for_training=True)
# nlp = nlp.from_disk(model_dir_path)

for current_case_id, xml_paragraph, xml_extracted_text, xml_offset in DEV_DATA:
    spacy_matcher_offset = list()
    doc = nlp.make_doc(xml_paragraph)
    for start_offset, end_offset, type_name in xml_offset:
        # https://spacy.io/usage/linguistic-features#section-named-entities
        span_doc = doc.char_span(start_offset, end_offset, label=type_name)
        if span_doc is not None:
            # doc.char_span returns None when the character offsets do not
            # align with token boundaries (i.e. the span would cut a token)
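            spacy_matcher_offset.append(span_doc)  # assumed continuation; the original snippet is truncated here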