from spacy.lang.en import English
from spacy.tokens import DocBin
from spacy.training import Corpus
from spacy.training.converters import json_to_docs
from spacy.util import make_tempdir


def test_issue4402():
    """Test that JSON training data converted to a DocBin round-trips through
    Corpus and can be split into the expected number of sentences."""
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4

def test_issue4367():
    """Test that DocBin initialization works with and without custom attrs."""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

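# A minimal sketch (not part of the original tests, text and attrs chosen for
# illustration only) of the usual DocBin round trip that these tests rely on:
# serialize annotated Docs to a bytestring and restore them into a fresh vocab.
import spacy
from spacy.tokens import DocBin


def demo_docbin_roundtrip():
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ORTH", "ENT_IOB", "ENT_TYPE"])
    doc_bin.add(nlp("How should I cook bacon in an oven?"))
    data = doc_bin.to_bytes()  # serialize to a bytestring
    restored = DocBin().from_bytes(data)  # deserialize into a new DocBin
    docs = list(restored.get_docs(nlp.vocab))
    assert len(docs) == 1
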
def process(self):
    """
    Reads text and outputs entities per text body.
    """

    # Prepare the staging area
    staging_area = self.dataset.get_staging_area()

    self.dataset.update_status("Preparing data")

    # Load the spaCy pipeline
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between

    # Disable what has _not_ been selected
    options = ["parser", "tagger", "ner"]
    enable = self.parameters.get("enable", False)

    if not enable:
        self.dataset.update_status("Select at least one of the options.")
        self.dataset.finish(0)
        return

    disable = [option for option in options if option not in enable]

    # Get all the text first so we can process it in batches
    posts = [post["body"] if post["body"] else "" for post in self.iterate_items(self.source_file)]

    # Process the text in batches
    if len(posts) < 100000:
        self.dataset.update_status("Extracting linguistic features")
    else:
        self.dataset.update_status(
            "Extracting linguistic features is currently only available for datasets with fewer than 100,000 items.")
        self.dataset.finish(0)
        return

    # Make sure only the needed information is extracted.
    attrs = []
    if "tagger" not in disable:
        attrs.append("POS")
    if "parser" not in disable:
        attrs.append("DEP")
    if "ner" not in disable:
        attrs.append("ENT_IOB")
        attrs.append("ENT_TYPE")
        attrs.append("ENT_ID")
        attrs.append("ENT_KB_ID")

    # DocBin for quick saving
    doc_bin = DocBin(attrs=attrs)

    # Start the processing!
    try:
        for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
            doc_bin.add(doc)

            # It's quite a heavy process, so make sure it can be interrupted
            if self.interrupted:
                raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")

            if i % 1000 == 0:
                self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))

    except MemoryError:
        self.dataset.update_status(
            "Out of memory. The dataset may be too large to process. Try again with a smaller dataset.",
            is_final=True)
        return

    self.dataset.update_status("Serializing results - this will take a while")

    # Then serialize the NLP docs
    doc_bytes = doc_bin.to_bytes()

    # Dump the data in a temporary folder
    with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
        pickle.dump(doc_bytes, outputfile)

    # Create zip of archive and delete temporary files and folder
    self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)

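# A hedged sketch (not part of the processor above) of how a downstream step
# could read the serialized docs back. The file name "spacy_docs.pb" and the
# use of "en_core_web_sm" for the vocab mirror the processor; everything else
# is an assumption for illustration.
import pickle

import spacy
from spacy.tokens import DocBin


def read_spacy_docs(path):
    nlp = spacy.load("en_core_web_sm")
    with open(path, "rb") as infile:
        doc_bytes = pickle.load(infile)  # the pickled DocBin bytestring
    doc_bin = DocBin().from_bytes(doc_bytes)
    return list(doc_bin.get_docs(nlp.vocab))
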
import json

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("zh")
matcher = Matcher(nlp.vocab)

# Add the patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])

docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")

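# A quick sanity check, sketched as an assumption rather than part of the
# exercise above: load the saved DocBin back and print the matched "GADGET"
# spans that were stored as entities.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("zh")
doc_bin = DocBin().from_disk("./train.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
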
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

import generateTrainData

# Load training data from generateTrainData
training_data = generateTrainData.generateTrainData()

nlp = spacy.blank("de")  # create a blank German pipeline
db = DocBin()  # create a DocBin object

for text, annot in tqdm(training_data):  # data in the previous (character-offset) format
    doc = nlp.make_doc(text)  # create a Doc object from the text
    ents = []
    for start, end, label in annot["entities"]:  # character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy")  # save the DocBin object

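# A minimal verification sketch (an assumption, not in the original script):
# reload the DocBin and count how many entity spans of each label survived the
# char_span alignment, so skipped entities are easy to spot.
from collections import Counter

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("de")
db = DocBin().from_disk("./train.spacy")
label_counts = Counter(ent.label_ for doc in db.get_docs(nlp.vocab) for ent in doc.ents)
print(label_counts)
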
import random
from pathlib import Path

import spacy
from spacy.tokens import Doc, DocBin
from spacy.training import Example


def main(trained_pipeline: Path, test_data: Path, print_details: bool):
    nlp = spacy.load(trained_pipeline)

    doc_bin = DocBin(store_user_data=True).from_disk(test_data)
    docs = doc_bin.get_docs(nlp.vocab)
    examples = []
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        for name, proc in nlp.pipeline:
            pred = proc(pred)
        examples.append(Example(pred, gold))

        # Print the gold and predicted relations, if requested
        if print_details:
            print()
            print(f"Text: {gold.text}")
            print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
            for value, rel_dict in pred._.rel.items():
                gold_labels = [k for (k, v) in gold._.rel[value].items() if v == 1.0]
                if gold_labels:
                    print(
                        f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
                    )
            print()

    random_examples = []
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        relation_extractor = nlp.get_pipe("relation_extractor")
        get_instances = relation_extractor.model.attrs["get_instances"]
        for (e1, e2) in get_instances(pred):
            offset = (e1.start, e2.start)
            if offset not in pred._.rel:
                pred._.rel[offset] = {}
            for label in relation_extractor.labels:
                pred._.rel[offset][label] = random.uniform(0, 1)
        random_examples.append(Example(pred, gold))

    thresholds = [0.000, 0.050, 0.100, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999]
    print()
    print("Random baseline:")
    _score_and_format(random_examples, thresholds)

    print()
    print("Results of the trained model:")
    _score_and_format(examples, thresholds)