Example #1
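This test excerpt comes from spaCy's regression-test suite and assumes module-level imports. A likely import block for spaCy v3 (the exact home of make_tempdir is an assumption; spaCy's own tests pull an equivalent helper from a test-utility module):

from spacy.lang.en import English
from spacy.tokens import DocBin
from spacy.training import Corpus
from spacy.training.converters import json_to_docs
from spacy.util import make_tempdir  # assumption: a temp-dir context manager with this name
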
def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4
Example #2
def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
Example #3
    def process(self):
        """
		Reads text and outputs entities per text body.
		"""

        # prepare staging area
        staging_area = self.dataset.get_staging_area()

        self.dataset.update_status("Preparing data")

        # Load the spacy goods
        nlp = spacy.load("en_core_web_sm")
        # Keep words with a dash in between
        nlp.tokenizer = self.custom_tokenizer(nlp)

        # Disable what has _not_ been selected
        options = ["parser", "tagger", "ner"]
        enable = self.parameters.get("enable", False)

        if not enable:
            self.dataset.update_status("Select at least one of the options.")
            self.dataset.finish(0)
            return

        disable = [option for option in options if option not in enable]

        # Get all ze text first so we can process it in batches
        posts = [
            post["body"] if post["body"] else ""
            for post in self.iterate_items(self.source_file)
        ]

        # Only process datasets under the size limit
        if len(posts) < 100000:
            self.dataset.update_status("Extracting linguistic features")
        else:
            self.dataset.update_status(
                "Extracting linguistic features is currently only available for datasets with less than 100.000 items."
            )
            self.dataset.finish(0)
            return

        # Make sure only the needed information is extracted.
        attrs = []
        if "tagger" not in disable:
            attrs.append("POS")
        if "parser" not in disable:
            attrs.append("DEP")
        if "ner":
            attrs.append("ENT_IOB")
            attrs.append("ENT_TYPE")
            attrs.append("ENT_ID")
            attrs.append("ENT_KB_ID")

        # DocBin for quick saving
        doc_bin = DocBin(attrs=attrs)

        # Start the processing!
        try:
            for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
                doc_bin.add(doc)

                # It's quite a heavy process, so make sure it can be interrupted
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Processor interrupted while iterating through CSV file"
                    )

                if i % 1000 == 0:
                    self.dataset.update_status("Done with post %s out of %s" %
                                               (i, len(posts)))
        except MemoryError:
            self.dataset.update_status(
                "Out of memory. The dataset may be too large to process. Try again with a smaller dataset.",
                is_final=True)
            return

        self.dataset.update_status(
            "Serializing results - this will take a while")

        # Then serialize the NLP docs and the vocab
        doc_bytes = doc_bin.to_bytes()

        # Dump ze data in a temporary folder
        with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
            pickle.dump(doc_bytes, outputfile)

        # create zip of archive and delete temporary files and folder
        self.write_archive_and_finish(staging_area,
                                      compression=zipfile.ZIP_LZMA)
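
The archive written above contains the DocBin bytes wrapped in a pickle. A minimal sketch of reading the entities back out of the extracted file (the file name comes from the code above; using a blank pipeline's vocab is an assumption, and works because the strings travel with the DocBin):

import pickle

import spacy
from spacy.tokens import DocBin

with open("spacy_docs.pb", "rb") as infile:
    doc_bin = DocBin().from_bytes(pickle.load(infile))

vocab = spacy.blank("en").vocab
for doc in doc_bin.get_docs(vocab):
    print([(ent.text, ent.label_) for ent in doc.ents])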
Example #4
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("zh")
matcher = Matcher(nlp.vocab)
# Add the patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matches
    ]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")
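
Note that assigning doc.ents raises if any spans overlap; with these two patterns overlaps are unlikely, but a defensive variant of the loop using spaCy's filter_spans utility would be:

from spacy.util import filter_spans

for doc in nlp.pipe(TEXTS):
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matcher(doc)]
    # Keep only the longest non-overlapping spans before assigning entities
    doc.ents = filter_spans(spans)
    docs.append(doc)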
Example #5
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import generateTrainData

# Load training data from generateTrainData
training_data = generateTrainData.generateTrainData()

nlp = spacy.blank('de')  # load a new spacy model
db = DocBin()  # create a DocBin object

for text, annot in tqdm(training_data):  # data in previous format
    doc = nlp.make_doc(text)  # create doc object from text
    ents = []
    for start, end, label in annot["entities"]:  # add character indexes
        span = doc.char_span(start,
                             end,
                             label=label,
                             alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy")  # save the docbin object
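
The loop above implies the shape generateTrainData is expected to return: a list of (text, annotations) tuples with character offsets under an "entities" key. A hypothetical two-item illustration of that format:

training_data = [
    ("Angela Merkel besuchte Berlin.", {"entities": [(0, 13, "PER"), (23, 29, "LOC")]}),
    ("Siemens hat seinen Sitz in München.", {"entities": [(0, 7, "ORG"), (27, 34, "LOC")]}),
]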
Example #6
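This evaluation script is an excerpt and assumes module-level imports plus a helper _score_and_format defined elsewhere in the same file. A likely import block (hedged; any CLI wiring from the original project is omitted):

import random
from pathlib import Path

import spacy
from spacy.tokens import Doc, DocBin
from spacy.training import Example
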
def main(trained_pipeline: Path, test_data: Path, print_details: bool):
    nlp = spacy.load(trained_pipeline)

    doc_bin = DocBin(store_user_data=True).from_disk(test_data)
    docs = doc_bin.get_docs(nlp.vocab)
    examples = []
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        for name, proc in nlp.pipeline:
            pred = proc(pred)
        examples.append(Example(pred, gold))

        # Optionally print the text, predicted spans, and gold vs. predicted relation labels
        if print_details:
            print()
            print(f"Text: {gold.text}")
            print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
            for value, rel_dict in pred._.rel.items():
                gold_labels = [
                    k for (k, v) in gold._.rel[value].items() if v == 1.0
                ]
                if gold_labels:
                    print(
                        f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
                    )
            print()

    random_examples = []
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        relation_extractor = nlp.get_pipe("relation_extractor")
        get_instances = relation_extractor.model.attrs["get_instances"]
        for (e1, e2) in get_instances(pred):
            offset = (e1.start, e2.start)
            if offset not in pred._.rel:
                pred._.rel[offset] = {}
            for label in relation_extractor.labels:
                pred._.rel[offset][label] = random.uniform(0, 1)
        random_examples.append(Example(pred, gold))

    thresholds = [
        0.000, 0.050, 0.100, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99,
        0.999
    ]
    print()
    print("Random baseline:")
    _score_and_format(random_examples, thresholds)

    print()
    print("Results of the trained model:")
    _score_and_format(examples, thresholds)
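
_score_and_format is not shown in this excerpt. A self-contained sketch of what such a helper could look like, computing micro precision/recall/F over the _.rel predictions at each threshold (a hypothetical implementation, not the original project's code):

def _score_and_format(examples, thresholds):
    for threshold in thresholds:
        tp = fp = fn = 0
        for example in examples:
            gold_rel = example.reference._.rel
            pred_rel = example.predicted._.rel
            for pair, gold_labels in gold_rel.items():
                pred_labels = pred_rel.get(pair, {})
                for label, gold_value in gold_labels.items():
                    gold_pos = gold_value == 1.0
                    pred_pos = pred_labels.get(label, 0.0) >= threshold
                    if pred_pos and gold_pos:
                        tp += 1
                    elif pred_pos and not gold_pos:
                        fp += 1
                    elif gold_pos and not pred_pos:
                        fn += 1
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        print(f"threshold {threshold:.3f}\tP {precision:.2f}\tR {recall:.2f}\tF {f_score:.2f}")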