Example #1
def convert_offsets_to_spacy_docs(doc_annotated: list) -> list:
    """
    Convert a list of (case_id, text, offsets) tuples into spaCy Docs with their entities set
    :param doc_annotated: list of tuples (case_id, text, list of offsets)
    :return: list of spaCy Docs
    """
    model = get_empty_model(load_labels_for_training=False)
    docs = list()
    for (index, (case_id, text, tags)) in enumerate(doc_annotated):
        doc: Doc = model.make_doc(text)
        ents = list()
        for (start_offset, end_offset, type_name) in tags:
            span_doc = doc.char_span(start_offset, end_offset, label=type_name)
            if span_doc is not None:
                ents.append(span_doc)
            else:
                print("Issue in offset",
                      "Index: " + str(index),
                      "case: " + case_id,
                      text[start_offset:end_offset],
                      text,
                      sep="|")
        doc.ents = ents
        docs.append(doc)
    return docs
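A minimal usage sketch for the converter above; the case id, sentence, offsets and labels are made-up placeholders:

# Hypothetical annotated input: (case_id, text, [(start_offset, end_offset, label), ...])
annotated = [("case_1",
              "Jean Dupont habite à Lyon.",
              [(0, 11, "PERS"), (21, 25, "ADDRESS")])]
docs = convert_offsets_to_spacy_docs(annotated)
print([(ent.text, ent.label_) for ent in docs[0].ents])
# expected: [('Jean Dupont', 'PERS'), ('Lyon', 'ADDRESS')]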
Example #2
def annotate(model_dir_path: str, files_dir_path: List[str],
             out_dir_path: str) -> None:
    """
    Annotate a sample of the given XML files and save them into the given directory.

    :param model_dir_path: the directory of the Spacy model
    :param files_dir_path: the paths of the XML / TXT files to annotate
    :param out_dir_path: the directory where to write the annotations
    """

    logging.info("Loading NER model…")
    nlp = get_empty_model(load_labels_for_training=False)
    nlp = nlp.from_disk(model_dir_path)

    # TODO remove when we have retrained
    infixes = nlp.Defaults.infixes + [r':', r"(?<=[\W\d_])-|-(?=[\W\d_])"]
    infixes_regex = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infixes_regex.finditer
    # end of the block to delete (see TODO above)

    entity_typename_builder = EntityTypename()

    logging.info("Loading cases…")

    cases: List[Case] = list()
    for path in files_dir_path:
        if path.endswith(".xml"):
            case: Case = get_paragraph_from_file(
                path=path, keep_paragraph_without_annotation=True)
            cases.append(case)
        elif path.endswith(".txt"):
            with open(path) as f:
                lines = f.readlines()
                case: Case = list()
                for line in lines:
                    clean_text = line.strip()
                    if len(clean_text) > 1:
                        basename = os.path.basename(path)
                        basename = basename.split(".")[0]
                        case.append(
                            Paragraph(basename, clean_text, list(), list()))
                cases.append(case)
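A hedged call sketch for annotate; the model and data paths below are placeholders, not paths from the project:

# Placeholder paths: load the spaCy model from model_dir_path, annotate the
# listed XML / TXT files and write the results into out_dir_path.
annotate(model_dir_path="./model",
         files_dir_path=["./data/case_1.xml", "./data/case_2.txt"],
         out_dir_path="./annotations")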
Example #3
def train_model(data: list, folder_to_save_model: str, n_iter: int,
                batch_size: int, dropout_rate: float):
    """
    Train a NER model using Spacy
    :param data: list of tuples (case_id, text, offsets)
    :param folder_to_save_model: where to save the learned model, None to skip; any existing model will be overridden
    :param n_iter: number of training iterations (epochs) of the CNN
    :param batch_size: batch size; larger batches train faster but give less precise updates
    :param dropout_rate: dropout rate; higher values make memorisation harder and improve generalization
    """
    nlp = get_empty_model(load_labels_for_training=True)
    nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    with tqdm(total=n_iter * ceil(len(data) / batch_size),
              unit=" paragraphs",
              desc="Learn NER model") as pbar:
        for itn in range(n_iter):
            pbar.set_description(f"Learn NER model - iteration {itn + 1}")
            losses = {}
            random.shuffle(data)
            batches = util.minibatch(data, batch_size)

            for current_batch_item in batches:
                case_id, texts, annotations = zip(*current_batch_item)
                docs = [nlp.make_doc(text) for text in texts]
                gold_with_unknown_bilou = convert_unknown_bilou_bulk(
                    docs=docs, offsets=annotations)
                nlp.update(
                    docs,  # batch of texts
                    gold_with_unknown_bilou,  # batch of annotations
                    drop=dropout_rate,  # dropout - make it harder to memorise rules
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                pbar.postfix = "loss: " + str(losses['ner'])
                pbar.update()

    # save model to output directory
    if folder_to_save_model is not None:
        folder_to_save_model = Path(folder_to_save_model)
        nlp.to_disk(folder_to_save_model)
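A minimal training sketch for train_model; the two annotated paragraphs and the hyper-parameter values are illustrative only:

# Toy dataset in the expected (case_id, text, offsets) format.
toy_data = [("case_1", "Jean Dupont habite à Lyon.", [(0, 11, "PERS")]),
            ("case_2", "Marie Curie est née à Varsovie.", [(0, 11, "PERS")])]
train_model(data=toy_data,
            folder_to_save_model="./model",  # None to skip saving
            n_iter=5,
            batch_size=2,
            dropout_rate=0.5)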
Example #4
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

from spacy.tokens.doc import Doc

from match_text_unsafe.build_entity_dictionary import EntityTypename
from misc.convert_to_bilou import convert_unknown_bilou, convert_unknown_bilou_bulk, no_action_bilou
from ner.model_factory import get_empty_model
import pytest

pytest.nlp = get_empty_model(load_labels_for_training=True)


def test_bilou_conv():
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")
    offset1 = [(5, 8, "UNKNOWN")]
    assert convert_unknown_bilou(
        doc, offsets=offset1).ner == ['O', '-', 'O', 'O', 'O']
    assert convert_unknown_bilou_bulk(
        [doc], [offset1])[0].ner == ['O', '-', 'O', 'O', 'O']
    offset2 = [(5, 8, "PERS")]
    assert convert_unknown_bilou(
        doc, offsets=offset2).ner == ['O', 'U-PERS', 'O', 'O', 'O']
    offset3 = [(0, 4, "UNKNOWN")]
    assert convert_unknown_bilou(
        doc, offsets=offset3).ner == ['-', 'O', 'O', 'O', 'O']
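For reference, a quick sketch (assuming the empty model's default tokenization) of why the BILOU arrays above contain five tags, one per token, and which token the (5, 8) offset covers:

doc = pytest.nlp.make_doc("Ceci est un test.")
print([token.text for token in doc])  # ['Ceci', 'est', 'un', 'test', '.']
print(doc.char_span(5, 8).text)       # 'est', the span tagged UNKNOWN / PERS above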
Example #5
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

from xml_extractions.extract_node_values import get_paragraph_from_file
from ner.model_factory import get_empty_model
from resources.config_provider import get_config_default

config_training = get_config_default()
model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
nlp = get_empty_model(load_labels_for_training=False)
nlp = nlp.from_disk(model_dir_path)

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)

for case_id, texts, xml_extracted_text, annotations in DEV_DATA:
    doc = nlp(texts)

    spacy_extracted_text_ad_pp = [
        ent.text for ent in doc.ents if ent.label_ in ["ADDRESS", "PERS"]
    ]

    spacy_extracted_text = [ent.text for ent in doc.ents]
    str_rep_spacy = ' '.join(spacy_extracted_text)
    match = [span_xml in str_rep_spacy for span_xml in xml_extracted_text]
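    # Added for illustration (not part of the original excerpt): summarize `match`,
    # i.e. the share of XML-annotated spans found back among the spaCy entities.
    if len(match) > 0:
        print(case_id, f"{sum(match) / len(match):.0%} of XML spans recovered")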
Example #6
def main(data_folder: str, model_path: Optional[str],
         output_model: Optional[str], dev_size: float, nb_epochs: int,
         print_diff: bool) -> None:
    nlp = get_empty_model(load_labels_for_training=True)
    if model_path is not None:
        nlp = nlp.from_disk(path=model_path)
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()
        # ner = nlp.get_pipe("ner")
        # ner.model.learn_rate = 0.0001
    else:
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename)
        for filename in os.listdir(data_folder) if filename.endswith(".txt")
    ]
    random.shuffle(all_annotated_files)

    nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)

    dev_file_names = all_annotated_files[0:nb_doc_dev_set]

    train_file_names = [
        file for file in all_annotated_files if file not in dev_file_names
    ]
    # train_file_names = ["./resources/training_data/generated_annotations.txt"] + train_file_names

    content_to_rate = load_content(txt_paths=train_file_names)
    content_to_rate_test = load_content(txt_paths=dev_file_names)

    print(
        f"nb PERS entities {sum([1 for _, offsets in content_to_rate for o in offsets if o.type == 'PERS'])}"
    )

    if model_path is not None:
        print("evaluation without fine tuning")
        spacy_evaluate(nlp, content_to_rate_test, print_diff)

    optimizer: Optimizer = nlp.resume_training()

    for epoch in range(nb_epochs):
        print(f"------- {epoch}  -------")
        random.shuffle(content_to_rate)
        losses = dict()
        batches = minibatch(content_to_rate, size=compounding(4., 16., 1.001))
        for batch_id, batch in enumerate(
                tqdm(iterable=batches, unit=" batches", desc="Training")):
            try:
                batch_gold = convert_batch_to_gold_dataset(model=nlp,
                                                           batch=batch)
                texts, manual_annotations = zip(
                    *batch_gold)  # type: List[str], List[GoldParse]
                nlp.update(texts,
                           manual_annotations,
                           drop=0.5,
                           losses=losses,
                           sgd=optimizer)

                if batch_id % 10000 == 0:
                    spacy_evaluate(model=nlp,
                                   dev=content_to_rate_test,
                                   print_diff=print_diff)
            except Exception as e:
                print(f"got exception [{e}] on batch id {batch_id}")

        print(f"Epoch {epoch + 1}\nLoss: {losses}\n")
        spacy_evaluate(model=nlp,
                       dev=content_to_rate_test,
                       print_diff=print_diff)

    if output_model is not None:
        nlp.to_disk(output_model)
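A hedged invocation sketch for main; every argument value below is a placeholder:

if __name__ == "__main__":
    main(data_folder="./resources/training_data",  # folder of annotated *.txt files
         model_path=None,           # or the path of an existing model to fine-tune
         output_model="./model",    # where to save the trained model, None to skip
         dev_size=0.2,              # share of the files held out as the dev set
         nb_epochs=5,
         print_diff=False)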