예제 #1
0
def save_train_dev_data(gold_docs, split, train_file, dev_file):
    """Shuffle the gold docs, split them into train/dev sets and write both as JSON.

    Args:
        gold_docs: Annotated docs to split. The caller's collection is not
            modified.
        split: Percentage (0-100) of the docs that goes into the training set;
            the remainder becomes the dev set.
        train_file: Destination handed to ``srsly.write_json`` for the training docs.
        dev_file: Destination handed to ``srsly.write_json`` for the dev docs.
    """
    # Shuffle a copy with a private, fixed-seed generator: the split stays
    # reproducible, the process-wide ``random`` state is not reseeded as a side
    # effect, and the caller's list keeps its order. ``random.Random(27)``
    # produces the same permutation as ``random.seed(27)`` + ``random.shuffle``.
    shuffled_docs = list(gold_docs)
    random.Random(27).shuffle(shuffled_docs)

    # split the gold data into training and evaluation
    num_training_tasks = round(len(shuffled_docs) * split / 100)
    train_docs = shuffled_docs[:num_training_tasks]
    dev_docs = shuffled_docs[num_training_tasks:]

    print(f"{entity_count(train_docs)} training entities")
    print(f"{entity_count(dev_docs)} dev entities")

    # Each split is serialised as a single-element list wrapping the
    # docs_to_json output.
    srsly.write_json(train_file, [docs_to_json(train_docs)])
    srsly.write_json(dev_file, [docs_to_json(dev_docs)])
def format_data_to_jsonl(data, file_path, print_label=False):
    """Write labelled records to ``file_path`` as one docs_to_json object per line.

    Each record must provide ``record['text']`` and ``record['label']``; the
    latter maps a label name to a mapping of mention -> list of
    ``(start, end)`` pairs. For every label only the first mention is kept,
    using its first ``(start, end)`` pair with the end offset incremented by
    one. A record is written only when ``diff_contain_overlapping`` compares
    equal to ``True`` for its collected spans.

    Args:
        data: Iterable of records (wrapped in a tqdm progress bar).
        file_path: Path-like object exposing ``open``; opened for writing as UTF-8.
        print_label: When true, report the set of label names seen.
    """
    seen_labels = set()
    written = 0

    progress = tqdm.tqdm(data, leave=False)

    with file_path.open("w", encoding="utf-8") as out:
        for record in progress:
            text = record['text']
            annotations = record["label"]

            spans = []
            for label, mentions in annotations.items():
                seen_labels.add(label)
                # Every mention is unpacked (so malformed offsets still fail
                # loudly), but only the first resulting span is retained.
                label_spans = [
                    (start_char, end_char + 1, label)
                    for start_char, end_char in (
                        occurrences[0] for occurrences in mentions.values()
                    )
                ]
                spans.append(label_spans[0])

            # Skip records whose span check does not come back as True.
            if not (True == diff_contain_overlapping(spans)):
                continue

            written += 1

            doc = nlp(text)
            doc.ents = spans_from_biluo_tags(doc, biluo_tags_from_offsets(doc, spans))

            out.write(json_dumps(docs_to_json([doc])) + "\n")

    msg.good(f"Finished {file_path} :: {written} rows")
    if print_label:
        msg.info(f"{seen_labels}")
예제 #3
0
    def match_names_for_text(self, characters, text, results_dir, filename=None, tests_variant=False, displacy_option=False, save_ratios=False, save_doc=False):
        """Recognise person entities in ``text`` and optionally persist/visualise them.

        Args:
            characters: Character names forwarded to ``recognize_person_entities``
                (and to ``prepare_list_for_ratios`` in the tests variant).
            text: Input handed to ``recognize_person_entities`` as a whole, or,
                when ``tests_variant`` is true, an iterable of sentences that
                are processed one by one.
            results_dir: Directory prefix for output files; it is concatenated
                directly with ``filename``, so it must end with a separator.
            filename: Output file name. Nothing is written when ``None``.
            tests_variant: Process ``text`` sentence by sentence and write the
                training data as UTF-8 without ASCII escaping.
            displacy_option: Serve the resulting doc(s) with displaCy.
            save_ratios: Also write the matches table under ``ratios\\``.
            save_doc: Also dump the docs_to_json output under ``docs\\``.
        """
        if tests_variant:
            train_data = []
            # Keep every per-sentence doc. Previously ``doc`` was never bound on
            # this path, so ``save_doc`` / ``displacy_option`` raised
            # UnboundLocalError. Both docs_to_json and displacy accept a list
            # of docs as well as a single doc.
            doc = []
            matches_table = prepare_list_for_ratios(characters)
            for sentence in text:
                matches_table_row, data_for_sentence, sentence_doc = self.recognize_person_entities(sentence, characters)
                train_data.append(data_for_sentence[0])
                # The first row of each per-sentence table is skipped.
                matches_table.extend(matches_table_row[1:])
                doc.append(sentence_doc)
        else:
            matches_table, train_data, doc = self.recognize_person_entities(text, characters)

        if filename is not None:
            if save_doc:
                json_data = gold.docs_to_json(doc)
                with open(results_dir + "\\docs\\" + filename, 'w') as result:
                    json.dump(json_data, result)

            if save_ratios:
                write_list_to_file(results_dir + "\\ratios\\" + filename, matches_table)

            if tests_variant:
                with open(results_dir + filename, 'w', encoding='utf8') as result:
                    json.dump(train_data, result, ensure_ascii=False)
            else:
                with open(results_dir + filename, 'w') as result:
                    json.dump(train_data, result)

        if displacy_option:
            displacy.serve(doc, style="ent")
예제 #4
0
def convert_to_spacy_format(data_path, output_path):
    """Convert the annotated data at ``data_path`` and write it as JSON.

    Args:
        data_path: Location of the source data, passed to ``get_labels`` and
            ``yield_docs``.
        output_path: File that receives a JSON list with one ``docs_to_json``
            entry per yielded doc, each tagged with its running index as ``id``.
    """
    # Read from the ``data_path`` argument. The previous code ignored it and
    # used the global ``args.data``, so the function only worked when run as
    # the script entry point and silently disregarded what the caller passed.
    labels = get_labels(data_path)
    with open(output_path, "w") as f_o:
        json_format = []
        for i, doc in enumerate(yield_docs(data_path, labels)):
            print(i)  # progress indicator
            json_format.append(docs_to_json(doc, id=i))
        f_o.write(json.dumps(json_format))
예제 #5
0
def test_roundtrip_docs_to_json():
    """Check that text and categories survive a docs_to_json -> GoldCorpus roundtrip."""
    text = "I flew to Silicon Valley via London."
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    doc.cats = cats
    # Only the first token opens a sentence; every later token continues it.
    for index in range(len(doc)):
        doc[index].is_sent_start = index == 0

    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        json_path = str(json_file)
        # The same file serves as both the train and the dev location.
        goldcorpus = GoldCorpus(json_path, json_path)

    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

    assert len(doc) == goldcorpus.count_train()
    assert text == reloaded_doc.text
    for label in ("TRAVEL", "BAKING"):
        assert label in goldparse.cats
    for label in ("TRAVEL", "BAKING"):
        assert cats[label] == goldparse.cats[label]
예제 #6
0
def test_roundtrip_docs_to_json():
    """Check that annotations survive docs_to_json roundtrips through GoldCorpus.

    Three serialisations are exercised: a JSON file, a JSONL file of train
    dicts, and a JSONL file rewritten from ``goldcorpus.train_tuples``.
    """
    text = "I flew to Silicon Valley via London."
    tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    heads = [1, 1, 1, 4, 2, 1, 5, 1]
    deps = [
        "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"
    ]
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    # Annotate token by token: tag, then dependency label, then head.
    for index, (tag, dep, head_index) in enumerate(zip(tags, deps, heads)):
        token = doc[index]
        token.tag_ = tag
        token.dep_ = dep
        token.head = doc[head_index]
    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
    doc.cats = cats
    doc.is_tagged = True
    doc.is_parsed = True

    def check_roundtrip(goldcorpus):
        """Assert the first training example of ``goldcorpus`` matches the fixture."""
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert tags == goldparse.tags
        assert deps == goldparse.labels
        assert heads == goldparse.heads
        assert biluo_tags == goldparse.ner
        for label in ("TRAVEL", "BAKING"):
            assert label in goldparse.cats
        for label in ("TRAVEL", "BAKING"):
            assert cats[label] == goldparse.cats[label]

    # roundtrip to JSON
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        json_path = str(json_file)
        goldcorpus = GoldCorpus(json_path, json_path)

    check_roundtrip(goldcorpus)

    # roundtrip to JSONL train dicts
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        jsonl_path = str(jsonl_file)
        goldcorpus = GoldCorpus(jsonl_path, jsonl_path)

    check_roundtrip(goldcorpus)

    # roundtrip to JSONL tuples
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        jsonl_path = str(jsonl_file)
        # first write the train dicts ...
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(jsonl_path, jsonl_path)
        # ... then load them and overwrite the file with the tuple form
        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
        goldcorpus = GoldCorpus(jsonl_path, jsonl_path)

    check_roundtrip(goldcorpus)
예제 #7
0
# Input strings to convert; only ``my_text`` is processed here. The lines of
# ``txt_file`` could be read and used instead.
texts = [my_text]

# File names derived from a shared stem.
fname = 'example'
txt_file = f'{fname}.txt'
json_file = f'{fname}.json'

# Run every text through the nlp pipeline and collect the resulting docs.
docs = []
for text in texts:
    doc = nlp(text)
    docs.append(doc)

# JSON-serialisable form of the docs (not used by the write below, which
# converts the docs again).
json_data = docs_to_json(docs)

import srsly
# Write the converted docs, wrapped in a single-element list, to ``json_file``.
srsly.write_json(json_file, [spacy.gold.docs_to_json(docs)])
예제 #8
0
import spacy
import srsly
import json
from spacy.gold import docs_to_json, biluo_tags_from_offsets, spans_from_biluo_tags

nlp = spacy.load('en_core_web_lg')

# Directory holding the numbered input files; converted files go to its
# ``gold`` subdirectory under the same number.
DATA_DIR = "/home/marco/Scrivania/tirocinio-unicredit/news/final_attempt/training_data/sector/cli/train_placeholder"

for i in range(114):
    # Use a context manager so each input handle is closed right after
    # parsing; the previous ``json.load(open(...))`` left all 114 files open
    # until garbage collection. JSON is read explicitly as UTF-8.
    with open(f"{DATA_DIR}/{i}.json", encoding="utf-8") as train_file:
        train_data = json.load(train_file)

    docs = []
    # Each entry is a (kgid, text, annot) triple; ``c`` is a 1-based progress
    # counter printed per entry.
    for c, (kgid, text, annot) in enumerate(train_data, start=1):
        print(c)
        doc = nlp(text)
        # Character-offset entities -> BILUO tags -> spans on the doc.
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)

    srsly.write_json(f"{DATA_DIR}/gold/{i}.json", [docs_to_json(docs)])