model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
number_of_paragraph_to_display = int(config_training["number_of_paragraph_to_display"])

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)
DEV_DATA = list(DEV_DATA)[:number_of_paragraph_to_display]

doc_annotated = list()
nlp = get_empty_model(load_labels_for_training=True)
# nlp = nlp.from_disk(model_dir_path)

for current_case_id, xml_paragraph, xml_extracted_text, xml_offset in DEV_DATA:
    spacy_matcher_offset = list()
    doc = nlp.make_doc(xml_paragraph)
    for start_offset, end_offset, type_name in xml_offset:
        # https://spacy.io/usage/linguistic-features#section-named-entities
        span_doc = doc.char_span(start_offset, end_offset, label=type_name)
        # char_span returns None when the offsets do not align with token boundaries
        if span_doc is not None:
            spacy_matcher_offset.append(span_doc)
        else:
            print("ERROR char offset", doc.text[start_offset:end_offset])
    doc.ents = spacy_matcher_offset
    doc_annotated.append(doc)

# docs = convert_offsets_to_spacy_docs(doc_annotated)
view_spacy_docs(doc_annotated, port=5000)
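
# Standalone sketch (not part of the pipeline above): it only illustrates why char_span
# can return None. The offsets must fall exactly on token boundaries; spacy.blank("en")
# and the sample text below are made up for this illustration.
import spacy

blank_nlp = spacy.blank("en")
sample_doc = blank_nlp.make_doc("Mr John Smith appeared before the court")
print(sample_doc.char_span(3, 13, label="PERSON"))  # "John Smith" -> aligned, returns a Span
print(sample_doc.char_span(3, 11, label="PERSON"))  # "John Smi"   -> misaligned, returns None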
        current_case_offsets.clear()
        previous_case_id = paragraph.case_id
        current_item_header = case_header_content[paragraph.case_id]
        headers_matcher = MatchValuesFromHeaders(current_header=current_item_header,
                                                 threshold_size=3)

    current_case_paragraphs.append(paragraph.text)
    current_case_offsets.append(paragraph.offsets)

print("Number of tags:", sum(len(offsets) for _, _, offsets in doc_annotated))

if train_dataset:
    train_model(data=doc_annotated,
                folder_to_save_model=model_dir_path,
                n_iter=n_iter,
                batch_size=batch_size,
                dropout_rate=dropout_rate)
elif export_dataset:
    with open(training_set_export_path, 'wb') as export_training_set_file:
        pickle.dump(obj=doc_annotated,
                    file=export_training_set_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Display the training set
    docs = convert_offsets_to_spacy_docs(doc_annotated)
    view_spacy_docs(docs, port=5000)
    print("View result in the browser (localhost, port 5000)")
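
# train_model is defined elsewhere in the project and is not shown in this section.
# The function below is a minimal, hypothetical sketch assuming the spaCy 2.x training
# API: it converts the annotated Docs back to (text, {"entities": ...}) tuples, runs a
# standard update loop, and saves the model. Parameter names mirror the call above.
import random
from spacy.util import minibatch

def train_model_sketch(nlp, data, folder_to_save_model, n_iter, batch_size, dropout_rate):
    train_examples = [(doc.text,
                       {"entities": [(ent.start_char, ent.end_char, ent.label_)
                                     for ent in doc.ents]})
                      for doc in data]
    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(train_examples, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=dropout_rate, losses=losses)
        print("NER loss:", losses.get("ner"))
    nlp.to_disk(folder_to_save_model)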
all_docs_to_view: List[Doc] = list()
# last_case_spans = dict()
last_case_docs: List[Doc] = list()
former_case_id = None
entity_typename_builder = EntityTypename()

with tqdm(total=len(DEV_DATA[:number_of_paragraph_to_display]),
          unit=" paragraphs",
          desc="Find entities") as progress_bar:
    for case_id, original_text, _, _ in DEV_DATA[:number_of_paragraph_to_display]:
        if case_id != former_case_id:
            # A new case starts: complete and flush the docs of the previous case
            spans = entity_typename_builder.get_dict()
            complete_case_annotations(last_case_docs, spans)
            all_docs_to_view.extend(last_case_docs)
            last_case_docs.clear()
            entity_typename_builder.clear()
            former_case_id = case_id

        spacy_doc: Doc = nlp(original_text)
        # spacy_doc.user_data['title'] = case_id
        last_case_docs.append(spacy_doc)
        # entities_span = [(ent.text.lower(), ent.label_) for ent in spacy_doc.ents]
        # last_case_spans.update(entities_span)
        entity_typename_builder.add_spacy_entities(spacy_doc=spacy_doc)
        progress_bar.update()

# Flush the docs of the final case, which the check inside the loop never reaches
complete_case_annotations(last_case_docs, entity_typename_builder.get_dict())
all_docs_to_view.extend(last_case_docs)

print("Generate HTML")
view_spacy_docs(all_docs_to_view, port=5000)
print("View result in the browser (port 5000)")
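
# EntityTypename and complete_case_annotations are project helpers whose code is not
# shown in this section. The class below is only a guess at the accumulator half,
# based on the commented-out lines above: it remembers the type name observed for each
# entity surface form within a case, so complete_case_annotations can later propagate
# those types to other mentions in the same case. Names mirror the calls above.
class EntityTypenameSketch:
    def __init__(self):
        self._typename_by_text = dict()

    def add_spacy_entities(self, spacy_doc):
        # Record the label seen for each (lowercased) entity text in the current case
        for ent in spacy_doc.ents:
            self._typename_by_text[ent.text.lower()] = ent.label_

    def get_dict(self):
        return dict(self._typename_by_text)

    def clear(self):
        self._typename_by_text.clear()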