def generate_corpus(nlp):
    """Return the training corpus for ``file_name``, building it if needed.

    If a serialized ``.spacy`` corpus already exists it is loaded directly.
    Otherwise the sibling ``.jsonl`` file is read line by line (each line is a
    JSON object with a ``data`` text and an optional ``label`` list), every
    text is lower-cased and tokenized with ``nlp.make_doc``, the entity
    annotations are attached, and the resulting ``DocBin`` is written to disk.

    Args:
        nlp: spaCy pipeline used for tokenization and for reading the corpus.

    Returns:
        The result of calling ``Corpus(corpus_path)`` with ``nlp``.
    """
    directory_path = path.join('app', 'static', 'pickleFiles', 'training_testing')
    corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
    raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

    # A previously serialized corpus takes precedence over the raw export.
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)

    vulnerabilities = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(raw_path, encoding="utf-8") as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            vulnerability = loads(line)
            vulnerabilities.append({
                'description': vulnerability['data'],
                'entities': vulnerability.get('label', []),
            })

    corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
    for vulnerability in vulnerabilities:
        document = nlp.make_doc(vulnerability['description'].lower())

        # doccano annotates labels at character level, whereas nlp.make_doc
        # produces tokens, so the offsets are converted to BILUO tags and
        # then back into token-aligned spans.
        tags = offsets_to_biluo_tags(document, vulnerability['entities'])
        entities = biluo_tags_to_spans(document, tags)
        document.set_ents(entities)

        # Progress/debug output: one label/text pair per recognised entity.
        for entity in document.ents:
            print(entity.label_)
            print(entity.text)
            print('\n')
        print('\nOK\n')

        corpus.add(document)

    print(len(corpus))
    print(list(corpus.get_docs(nlp.vocab)))
    corpus.to_disk(corpus_path)
    return Corpus(corpus_path)(nlp)
def docbin2docs(
    docbin_bytes: bytes,
    language: t.Union[str, Language],
    similarity_method: int = 0,
) -> t.Tuple[Doc, ...]:
    """Deserialize ``DocBin`` bytes into a tuple of documents.

    Args:
        docbin_bytes: Serialized ``DocBin`` payload.
        language: Either a pipeline object whose vocab is used directly, or a
            string that is passed to ``blank`` together with
            ``similarity_method`` to obtain one.
        similarity_method: Only used when ``language`` is a string.

    Returns:
        All documents stored in the payload, in order.
    """
    pipeline = (
        blank(language, similarity_method)
        if isinstance(language, str)
        else language
    )
    restored = DocBin().from_bytes(docbin_bytes)
    return tuple(restored.get_docs(pipeline.vocab))
def convert(lang: str, input_path: Path, output_path: Path):
    """Re-tokenize a serialized corpus with a blank pipeline and save it.

    Every document read from ``input_path`` is rebuilt from its raw text with
    ``spacy.blank(lang)``; the original ``user_data`` and ``ents`` are carried
    over to the rebuilt document before it is written to ``output_path``.
    """
    pipeline = spacy.blank(lang)
    source_bin = DocBin().from_disk(input_path)
    logging.info(f"Read {len(source_bin)} documents from {input_path}.")

    target_bin = DocBin()
    for original in source_bin.get_docs(pipeline.vocab):
        rebuilt = pipeline.make_doc(original.text)
        rebuilt.user_data = original.user_data
        rebuilt.ents = original.ents
        target_bin.add(rebuilt)

    target_bin.to_disk(output_path)
def json_path_to_examples(data_path, NLP):
    """Load a JSON training file and return it as a list of ``Example``s.

    Each example pairs a freshly tokenized prediction document
    (``NLP.make_doc`` on the reference text) with the reference document.
    """
    # There is no good way to convert with a specified vocab, so convert
    # first and then round-trip through a DocBin bound to the right vocab.
    converted = DocBin()
    for parsed in json_to_docs(srsly.read_json(data_path)):
        converted.add(parsed)

    return [
        Example(NLP.make_doc(reference.text), reference)
        for reference in converted.get_docs(NLP.vocab)
    ]
def read_spacy_docs(
    filepath: types.PathLike,
    *,
    format: str = "binary",
    lang: Optional[types.LangLike] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written in binary or pickle format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"binary", "pickle"}): Format of the data that was written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to deserialize data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

        lang: Language with which spaCy originally processed docs, represented as
            the full name of or path on disk to the pipeline, or an already
            instantiated pipeline instance. Note that this is only required
            when ``format`` is "binary".

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "binary" or "pickle", or if ``lang`` is None
            when ``format="binary"``
    """
    if format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            yield from pickle.load(f)
        return

    if format != "binary":
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"}))

    # Binary data can only be decoded against a vocabulary.
    if lang is None:
        raise ValueError(
            "lang=None is invalid. When format='binary', a `spacy.Language` "
            "(well, its associated `spacy.Vocab`) is required to deserialize "
            "the binary data. Note that this should be the same language pipeline "
            "used when processing the original docs!")

    pipeline = spacier.utils.resolve_langlike(lang)
    stored = DocBin().from_disk(filepath)
    yield from stored.get_docs(pipeline.vocab)
def test_serialize_doc_bin():
    """Smoke test: a DocBin survives a bytes round trip without raising."""
    texts = ["Some text", "Lots of texts...", "..."]
    writer = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    for processed in English().pipe(texts):
        writer.add(processed)
    payload = writer.to_bytes()

    # Deserialize later, e.g. in a new process
    fresh_nlp = spacy.blank("en")
    reader = DocBin().from_bytes(payload)
    # Exhaust the generator so every stored document is actually rebuilt.
    list(reader.get_docs(fresh_nlp.vocab))
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
    """Test that custom extensions are correctly serialized in DocBin.

    The ``foo`` extension is registered for the duration of the test and the
    extension registry is reset afterwards, even when the assertion fails, so
    a failing case cannot leave the extension registered for later tests.
    """
    Doc.set_extension("foo", default="nothing")
    try:
        doc = Doc(en_vocab, words=["hello", "world"])
        doc._.foo = "bar"
        doc_bin_1 = DocBin(store_user_data=writer_flag)
        doc_bin_1.add(doc)
        doc_bin_bytes = doc_bin_1.to_bytes()
        doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
        doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
        assert doc_2._.foo == reader_value
    finally:
        # Always undo the global registration made above.
        Underscore.doc_extensions = {}
def read_files(file: Path, nlp: "Language") -> Iterable[Example]:
    """Custom reader that keeps the tokenization of the gold data, and also
    adds the gold GGP annotations as we do not attempt to predict these.

    Yields one ``Example`` per document stored in ``file``; the prediction
    side is rebuilt from the gold tokens' text and whitespace and receives
    the gold entities.
    """
    stored = DocBin().from_disk(file)
    for gold in stored.get_docs(nlp.vocab):
        words = []
        spaces = []
        for token in gold:
            words.append(token.text)
            spaces.append(token.whitespace_)
        pred = Doc(nlp.vocab, words=words, spaces=spaces)
        pred.ents = gold.ents
        yield Example(pred, gold)
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    # This is how extension attribute values are stored in the user data
    extension_key = ("._.", "foo", None, None)

    original = Doc(en_vocab, words=["hello", "world"])
    original.user_data["foo"] = "bar"
    original.user_data[extension_key] = "bar"

    writer = DocBin(store_user_data=True)
    writer.add(original)
    payload = writer.to_bytes()

    reader = DocBin(store_user_data=True).from_bytes(payload)
    restored = list(reader.get_docs(en_vocab))[0]
    assert restored.user_data["foo"] == "bar"
    assert restored.user_data[extension_key] == "bar"
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """Return a single spaCy doc for each age.

    The processed transcripts are loaded from the ``.spacy`` binary named
    after ``params.corpus_name`` when it exists; otherwise the raw
    transcripts are processed and the binary is written to disk.

    Warning: because the binary is cached on disk, the corpus should never be
    modified afterwards; otherwise the binary will also contain unexpected
    modifications.

    Args:
        params: Settings; ``corpus_name`` selects the binary file, and the
            object is passed on to ``load_transcripts`` and ``load_ages``.
        verbose: When true, print the number of transcripts per age.

    Returns:
        Mapping from age to the combined document of all transcripts at that
        age. Ages equal to ``EXCLUDED_AGE`` are left out.

    Raises:
        RuntimeError: If the number of ages differs from the number of docs.
    """
    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(
            f'WARNING: Did not find binary file associated with {params.corpus_name}. Preprocessing corpus...'
        )
        transcripts = load_transcripts(params)
        docs = list(nlp.pipe(transcripts))
        # WARNING: only save to disk if we know that corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Num docs={len(docs)} and num ages={len(ages)}')

    # One pass over the (age, doc) pairs instead of rescanning every document
    # once per distinct age.
    grouped = {}
    for age, doc in zip(ages, docs):
        grouped.setdefault(age, []).append(doc)

    # Insert in sorted age order so iteration order (and the printed report)
    # is the same as iterating the sorted set of ages.
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        age2docs[age] = grouped[age]
        if verbose:
            print(
                f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at same age
    age2doc = {}
    for age, docs_at_age in age2docs.items():
        doc_combined = Doc.from_docs(docs_at_age)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')

    return age2doc
def test_serialize_doc_bin_unknown_spaces(en_vocab):
    """Check that ``has_unknown_spaces`` and the text survive a DocBin round trip."""
    words = ["that", "'s"]

    # No spaces given: whitespace is unknown.
    unknown = Doc(en_vocab, words=words)
    assert unknown.has_unknown_spaces
    assert unknown.text == "that 's "

    # Explicit spaces given: whitespace is known.
    known = Doc(en_vocab, words=words, spaces=[False, False])
    assert not known.has_unknown_spaces
    assert known.text == "that's"

    payload = DocBin(docs=[unknown, known]).to_bytes()
    restored = DocBin().from_bytes(payload)
    re_unknown, re_known = restored.get_docs(en_vocab)

    assert re_unknown.has_unknown_spaces
    assert re_unknown.text == "that 's "
    assert not re_known.has_unknown_spaces
    assert re_known.text == "that's"
def main():
    """Build the document/entity table and exercise spaCy serialization.

    Steps, in order:
      1. Read the keyword -> entity table and the tagged texts from ``data/``,
         normalize the texts, and write the merged document/entity table to
         ``data/doc_entity_df.csv``.
      2. Round-trip one sample sentence through a ``DocBin`` stored in
         ``data/sample``.
      3. Run the current pipeline over every text in ``data/taged_all.csv``,
         saving each doc to ``data/sample`` and printing its entities.
    """
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))

    data = pd.read_csv('data/taged_all.csv')
    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()

    doc_entity_df = doc_entity(data_clean, keywords_dic)
    doc_entity_df = doc_entity_df.merge(data_clean[['id', 'target', 'predict']],
                                        how='left',
                                        left_on='id',
                                        right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Read and write binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)
    # The context manager closes the file; no explicit close() is needed.
    with open('data/sample', "rb") as in_file:
        data = in_file.read()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###################################################################################
    # NOTE(review): `nlp` is the blank pipeline created above at this point,
    # so `doc.ents` is presumably always empty here — confirm whether the
    # trained "en_core_web_md" pipeline was intended.
    # NOTE(review): every iteration writes to the same 'data/sample' path,
    # which also replaces the DocBin sample written earlier — confirm intent.
    data = pd.read_csv('data/taged_all.csv')
    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
        print([(X.text, X.label_) for X in doc.ents])
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 2: Preprocess text in sense2vec's format

    Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
    and outputs a text file with one sentence per line in the expected
    sense2vec format (merged noun phrases, concatenated phrases with
    underscores and added "senses").

    Example input:
    Rats, mould and broken furniture: the scandal of the UK's refugee housing

    Example output:
    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
    """
    source = Path(in_file)
    target_dir = Path(out_dir)

    if not source.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not target_dir.exists():
        target_dir.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")

    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")

    with source.open("rb") as handle:
        payload = handle.read()
    doc_bin = DocBin().from_bytes(payload)
    msg.good(f"Loaded {len(doc_bin)} parsed docs")

    # The output file is named after the input file, with an .s2v suffix.
    output_file = target_dir / f"{source.stem}.s2v"
    lines_count = 0
    words_count = 0
    with output_file.open("w", encoding="utf8") as out:
        progress = tqdm.tqdm(doc_bin.get_docs(nlp.vocab), desc="Docs", unit="")
        for doc in progress:
            merged = merge_phrases(doc)
            words = []
            for token in merged:
                # Whitespace-only tokens are not written.
                if token.is_space:
                    continue
                word, sense = make_spacy_key(token, prefer_ents=True)
                words.append(make_key(word, sense))
            out.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)

    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
def test_serialize_doc_bin():
    """Texts and ``cats`` survive a DocBin bytes round trip."""
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}

    writer = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    for processed in English().pipe(texts):
        processed.cats = cats
        writer.add(processed)
    payload = writer.to_bytes()

    # Deserialize later, e.g. in a new process
    fresh_nlp = spacy.blank("en")
    reader = DocBin().from_bytes(payload)
    reloaded_docs = list(reader.get_docs(fresh_nlp.vocab))

    for position, reloaded in enumerate(reloaded_docs):
        assert reloaded.text == texts[position]
        assert reloaded.cats == cats
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    """Time DocBin deserialization and report the most common entities.

    Loads the pipeline ``model``, reads the serialized DocBin at
    ``docbin_path``, counts tokens and ``(label, text)`` entity pairs over all
    documents, then prints the throughput and the 30 most frequent entities.
    """
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as file_:
        bytes_data = file_.read()

    nr_word = 0
    # Only deserialization and counting are timed, not the file read above.
    start_time = timer()
    entities = Counter()
    docbin = DocBin().from_bytes(bytes_data)
    for doc in docbin.get_docs(nlp.vocab):
        nr_word += len(doc)
        entities.update((e.label_, e.text) for e in doc.ents)
    end_time = timer()

    elapsed = end_time - start_time
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    print(msg.format(nr_word=nr_word, seconds=elapsed, wps=nr_word / elapsed))

    print("Most common entities:")
    for (label, entity), freq in entities.most_common(30):
        print(freq, entity, label)
def test_to_spacy_file_and_back(small_dataset):
    """Entity offsets written to a .spacy file match the input spans on reload."""
    pipeline = spacy.load("en_core_web_sm")
    InputSample.create_spacy_dataset(
        small_dataset,
        output_path="dataset.spacy",
        translate_tags=False,
        spacy_pipeline=pipeline,
        alignment_mode="strict",
    )

    restored = DocBin()
    restored.from_disk("dataset.spacy")
    reloaded_docs = restored.get_docs(vocab=pipeline.vocab)

    for doc, input_sample in zip(reloaded_docs, small_dataset):
        # Compare both sides in order of their start offset.
        expected = sorted(input_sample.spans, key=lambda x: x.start_position)
        actual = sorted(doc.ents, key=lambda x: x.start_char)
        for spacy_ent, input_span in zip(actual, expected):
            assert spacy_ent.start_char == input_span.start_position
            assert spacy_ent.end_char == input_span.end_position
def load_dataset(input_path, output_path, binary=False):
    """Load the dataset at ``input_path`` unless its output already exists.

    Returns:
        ``None`` when ``config.IGNORE_PROCESSED_DATASET`` is set and
        ``output_path`` is an existing file, or when the binary data raises
        ``zlib.error``; an open text file object when ``binary`` is false
        (closing it is left to the caller); otherwise the documents produced
        by ``DocBin.get_docs`` using ``ud_parser.vocab``.
    """
    # Ignore the dataset if the given output exists
    if config.IGNORE_PROCESSED_DATASET and isfile(output_path):
        return None

    if not binary:
        return open(input_path, "r")

    try:
        with open(input_path, "rb") as parsed_dataset_file:
            dataset_bytes = parsed_dataset_file.read()
        stored = DocBin().from_bytes(dataset_bytes)
        return stored.get_docs(ud_parser.vocab)
    except zlib.error:
        # A zlib failure is deliberately reported as "no dataset".
        return None
def extract_docs(self):
    """
    Extracts serialised SpaCy docs from a zipped archive.

    The first member of the archive at ``self.source_file`` is unpickled and
    the resulting bytes are decoded as a DocBin.

    :returns: SpaCy docs.
    """
    nlp = en_core_web_sm.load()  # Load model

    with zipfile.ZipFile(str(self.source_file), "r") as archive:
        member_name = archive.namelist()[0]  # always just one pickle file
        with archive.open(member_name, "r") as pickle_file:
            # NOTE(review): pickle.load executes arbitrary code from the
            # archive; only use this on archives from a trusted source.
            payload = pickle.load(pickle_file)

    # Load DocBin
    stored = DocBin().from_bytes(payload)
    return list(stored.get_docs(nlp.vocab))
def cache_docbin(self, force=False):
    """Return the parsed paragraphs, using the on-disk DocBin cache.

    The cache (``DOCBIN_CACHE`` plus ``VOCAB_CACHE``) is rebuilt from
    ``self.load_paragraphs()`` when ``force`` is true or either cache path is
    missing. The vocab is then reloaded from ``VOCAB_CACHE`` and the cached
    documents are deserialized against it.
    """
    sp = self.sp
    stale = force or not (os.path.isfile(DOCBIN_CACHE)
                          and os.path.isdir(VOCAB_CACHE))
    if stale:
        fresh_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                           store_user_data=True)
        for parsed in sp.pipe(tqdm(self.load_paragraphs())):
            fresh_bin.add(parsed)
        with open(DOCBIN_CACHE, "wb") as out:
            out.write(fresh_bin.to_bytes())
        sp.vocab.to_disk(VOCAB_CACHE)

    # Load the cached vocab before decoding the cached documents.
    sp.vocab.from_disk(VOCAB_CACHE)
    with open(DOCBIN_CACHE, "rb") as cached:
        payload = cached.read()
    return list(DocBin().from_bytes(payload).get_docs(sp.vocab))
def docbin_reader(docbin_file_path: str,
                  spacy_model_name: str = "en_core_web_md",
                  cutoff: Optional[int] = None,
                  nb_to_skip: int = 0):
    """Read a binary file containing a DocBin repository of spacy documents.

    In addition to the file path, we also need to provide the name of the
    spacy model (which is necessary to load the vocabulary), such as
    "en_core_web_md".

    If cutoff is specified, the method will stop after generating the given
    number of documents; a cutoff of zero (or less) generates nothing. If
    nb_to_skip is > 0, the method will skip the given number of documents
    before starting the generation.

    Yields:
        The stored documents, one by one.
    """
    # Reading the binary data from the file; the context manager closes the
    # handle even if reading fails.
    with open(docbin_file_path, "rb") as fd:
        data = fd.read()
    docbin = DocBin(store_user_data=True)
    docbin.from_bytes(data)
    # Release the raw bytes before the documents are generated.
    del data

    # Skip a number of documents
    # NOTE(review): only tokens, spaces and user_data are sliced; if the
    # installed DocBin keeps further per-document lists they would no longer
    # line up after skipping — confirm against the spaCy version in use.
    if nb_to_skip:
        docbin.tokens = docbin.tokens[nb_to_skip:]
        docbin.spaces = docbin.spaces[nb_to_skip:]
        docbin.user_data = docbin.user_data[nb_to_skip:]

    # Nothing to generate: avoid yielding a first document before the cutoff
    # is checked.
    if cutoff is not None and cutoff <= 0:
        return

    # Retrieves the vocabulary
    vocab = get_spacy_model(spacy_model_name).vocab

    # We finally generate the documents one by one
    for generated, doc in enumerate(docbin.get_docs(vocab), start=1):
        yield doc
        if cutoff is not None and generated >= cutoff:
            return
def read_spacy_docs(filepath, vocab_filepath):
    """
    Reads serialized spacy docs from a file into memory.

    Parameters
    ----------
    filepath: str
        File path to serialized spacy docs
    vocab_filepath: str
        File path to the serialized vocab the docs are decoded against

    Returns
    -------
    list of spacy.tokens.doc.Doc
        List of spacy Docs loaded from file
    """
    from spacy.vocab import Vocab

    with open(vocab_filepath, 'rb') as vocab_file:
        vocab_bytes = vocab_file.read()
    vocab = Vocab().from_bytes(vocab_bytes)

    with open(filepath, 'rb') as docs_file:
        docs_bytes = docs_file.read()

    stored = DocBin().from_bytes(docs_bytes)
    return list(stored.get_docs(vocab))
def test_serialize_doc_bin():
    """Texts, cats, span groups and token attributes survive a DocBin round trip."""
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    stored_attrs = ["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"]

    writer = DocBin(attrs=stored_attrs, store_user_data=True)
    for processed in English().pipe(texts):
        processed.cats = cats
        processed.spans["start"] = [processed[0:2]]
        # Deliberately unusual values so they cannot come from defaults.
        processed[0].norm_ = "UNUSUAL_TOKEN_NORM"
        processed[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        writer.add(processed)
    payload = writer.to_bytes()

    # Deserialize later, e.g. in a new process
    fresh_nlp = spacy.blank("en")
    reader = DocBin().from_bytes(payload)
    reloaded_docs = list(reader.get_docs(fresh_nlp.vocab))

    for position, reloaded in enumerate(reloaded_docs):
        first_token = reloaded[0]
        assert reloaded.text == texts[position]
        assert reloaded.cats == cats
        assert len(reloaded.spans) == 1
        assert first_token.norm_ == "UNUSUAL_TOKEN_NORM"
        assert first_token.ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
from pandas import read_csv from glob import glob import spacy from spacy.tokens import DocBin nlp = spacy.load("de_core_news_sm") daten = glob("plenarprotokolle/pp19/*.xml.spacy") spacy_db = {} for datei in daten: protokoll = DocBin(store_user_data=True).from_bytes(open(datei, "rb").read()) protokoll = list(protokoll.get_docs(nlp.vocab)) datei = datei.split("plenarprotokolle/pp19/")[1] spacy_db[datei] = protokoll for f, protokoll in spacy_db.items(): for rede in protokoll: rede.user_data["entitaeten"] = [x.text for x in rede.ents] rede.user_data["entitaeten"] = [x for x in rede.user_data["entitaeten"] if not x == "||"] def collect_classifiers_sentiws(sourcefile): with open(sourcefile) as csv_file: classifiers = read_csv(csv_file, sep="\t", header=None, names=["lemma", "wert", "formen"]) classifiers["formen"] = classifiers["formen"].astype(str) classifiers["formen"] = classifiers["formen"].apply(lambda x: x.split(",")) classifiers[["lemma", "pos"]] = classifiers["lemma"].str.split("|", expand=True) classifiers["lemma"] = classifiers["lemma"].astype(str) for formen, lemma in zip(classifiers.formen, classifiers.lemma):
# Ask for password for the email respones email_address = input("Enter your e-mail address: ") password = getpass("Password for sending emails: ") # Create the arguments extractor test_extractor = ArgumentsExtractor("NOMLEX-plus.1.0.txt") # Create the UD parser, that resulted in odin formated representation nlp = spacy.load("en_ud_model_lg") nlp.tokenizer = _custom_tokenizer(nlp) converter = Converter(False, False, False, 0, False, False, False, False, False, ConvsCanceler()) nlp.add_pipe(converter, name="BART") tagger = nlp.get_pipe('tagger') parser = nlp.get_pipe('parser') # Load the example sentences DATA_PATH = "data/too_clean_wiki/example.txt" with open(DATA_PATH, "r") as example_sentence_file: example_sentences = example_sentence_file.readlines() # Load the parsed example sentences with open(DATA_PATH.replace(".txt", ".parsed"), "rb") as parsed_dataset_file: dataset_bytes = parsed_dataset_file.read() doc_bin = DocBin().from_bytes(dataset_bytes) docs = doc_bin.get_docs(nlp.vocab) parsed_example_sentences = list(doc_bin.get_docs(nlp.vocab)) # Start the server run(host='0.0.0.0', reloader=False, port=5000, server='paste')
def process(self):
    """
    Opens the SpaCy output and gets ze nouns.

    Depending on ``self.parameters["type"]`` this collects plain nouns
    ("nouns"), spaCy noun chunks ("noun_chunks"), or single and compound
    nouns ("nouns_and_compounds") for every document in the source archive,
    then writes a ranked word/count table. If the required annotations were
    not enabled in the previous module, a status message is set and the
    dataset is finished with zero results.
    """
    noun_type = self.parameters["type"]
    enabled = self.source_dataset.parameters["enable"]

    # Validate whether the user enabled the right parameters.
    # Check part of speech tagging
    if "tagger" not in enabled:
        self.dataset.update_status(
            "Enable \"Part-of-speech tagging\" in previous module")
        self.dataset.finish(0)
        return

    # Check dependency parsing if nouns and compound nouns is selected
    if noun_type in ("nouns_and_compounds", "noun_chunks") and "parser" not in enabled:
        self.dataset.update_status(
            "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module"
        )
        self.dataset.finish(0)
        return

    # Valid parameters: extract the SpaCy docs first
    self.dataset.update_status("Unzipping SpaCy docs")
    self.dataset.update_status("Extracting nouns")

    # Store all the nouns in this list, one sub-list per document
    li_nouns = []
    nlp = spacy.load("en_core_web_sm")  # Load the model once

    for doc_file in self.iterate_archive_contents(self.source_file):
        with doc_file.open("rb") as pickle_file:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only use this on archives from a trusted source.
            file = pickle.load(pickle_file)

        # Load DocBin
        doc_bin = DocBin().from_bytes(file)
        docs = list(doc_bin.get_docs(nlp.vocab))

        # Each archive member is handled as soon as it is loaded, so an
        # archive with several (or no) files is processed correctly.
        if noun_type == "nouns":
            # Simply add each word if its POS is "NOUN"
            for doc in docs:
                li_nouns.append(
                    [token.text for token in doc if token.pos_ == "NOUN"])

        elif noun_type == "noun_chunks":
            # Use SpaCy's noun chunk detection
            for doc in docs:
                # Note: this is a workaround for now.
                # Serialization of the SpaCy docs does not
                # work well with dependency parsing after
                # loading. Quick fix: parse again.
                new_doc = nlp(doc.text)
                li_nouns.append([chunk.text for chunk in new_doc.noun_chunks])

        elif noun_type == "nouns_and_compounds":
            # Use a custom script to get single nouns and compound nouns
            for doc in docs:
                post_nouns = []
                noun = ""
                for token in doc:
                    # Check for common nouns (general, e.g. "people")
                    # and proper nouns (specific, e.g. "London")
                    if token.pos_ != "NOUN" and token.pos_ != "PROPN":
                        continue
                    if token.dep_ == "compound":
                        # Remember the modifier until its head noun arrives.
                        # NOTE(review): a second consecutive compound token
                        # replaces the first instead of extending it —
                        # confirm this is intended.
                        noun = token.text
                    elif noun:
                        noun += " " + token.text
                        post_nouns.append(noun)
                        noun = ""
                    else:
                        post_nouns.append(token.text)
                li_nouns.append(post_nouns)

    results = []

    if li_nouns:
        # Also add the data to the original csv file, if indicated.
        if self.parameters.get("overwrite"):
            self.update_parent(li_nouns, noun_type)

        # convert to lower and filter out one-letter words
        all_nouns = []
        for post_n in li_nouns:
            all_nouns.extend(
                str(cap_noun).lower() for cap_noun in post_n
                if len(cap_noun) > 1)

        # Group and rank
        results = [{
            "word": word,
            "count": count
        } for word, count in Counter(all_nouns).most_common()]

    # done!
    if results:
        self.dataset.update_status("Finished")
        self.write_csv_items_and_finish(results)
    else:
        self.dataset.update_status(
            "Finished, but no nouns were extracted.")
        self.dataset.finish(0)
def _pred_from_gold(nlp, gold):
    """Build a prediction doc with the gold tokenization and gold entities."""
    pred = Doc(
        nlp.vocab,
        words=[t.text for t in gold],
        spaces=[t.whitespace_ for t in gold],
    )
    pred.ents = gold.ents
    return pred


def main(trained_pipeline: Path, test_data: Path, print_details: bool):
    """Evaluate a trained relation-extraction pipeline against a random baseline.

    Args:
        trained_pipeline: Path of the pipeline to load with ``spacy.load``.
        test_data: Path of the serialized gold documents (``DocBin``).
        print_details: When true, print text, entity spans and, for every
            candidate pair that has at least one gold label equal to 1.0,
            the gold labels next to the predicted values.

    The gold tokenization and gold entities are reused for the predictions;
    scores for both the trained model and the random baseline are printed by
    ``_score_and_format`` over a fixed list of thresholds.
    """
    nlp = spacy.load(trained_pipeline)
    doc_bin = DocBin(store_user_data=True).from_disk(test_data)

    # --- predictions of the trained pipeline ---
    examples = []
    for gold in doc_bin.get_docs(nlp.vocab):
        pred = _pred_from_gold(nlp, gold)
        for name, proc in nlp.pipeline:
            pred = proc(pred)
        examples.append(Example(pred, gold))

        # Print the gold and prediction, if gold label is not 0
        if print_details:
            print()
            print(f"Text: {gold.text}")
            print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
            for value, rel_dict in pred._.rel.items():
                gold_labels = [
                    k for (k, v) in gold._.rel[value].items() if v == 1.0
                ]
                if gold_labels:
                    print(
                        f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
                    )
            print()

    # --- random baseline: uniform scores for every candidate pair/label ---
    # The component and its instance generator do not change per document,
    # so they are looked up once.
    relation_extractor = nlp.get_pipe("relation_extractor")
    get_instances = relation_extractor.model.attrs["get_instances"]

    random_examples = []
    for gold in doc_bin.get_docs(nlp.vocab):
        pred = _pred_from_gold(nlp, gold)
        for (e1, e2) in get_instances(pred):
            offset = (e1.start, e2.start)
            if offset not in pred._.rel:
                pred._.rel[offset] = {}
            for label in relation_extractor.labels:
                pred._.rel[offset][label] = random.uniform(0, 1)
        random_examples.append(Example(pred, gold))

    thresholds = [
        0.000, 0.050, 0.100, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999
    ]
    print()
    print("Random baseline:")
    _score_and_format(random_examples, thresholds)
    print()
    print("Results of the trained model:")
    _score_and_format(examples, thresholds)
# Parse the text once; `doc_base` is the reference all restored docs are
# printed next to.
nlp = spacy.load("en_core_web_sm")
doc_base = nlp(text)
print("")
print_doc(doc_base)

# Serialize document to disk and bytes
doc_base.to_disk("doc.spacy")
doc_base_bytes = doc_base.to_bytes()

# Serialize using DocBin (only the listed token attributes plus user data)
docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True)
docbin_base.add(doc_base)
docbin_base_bytes = docbin_base.to_bytes()

# Restore document from disk, into a Doc created on a new, empty Vocab
doc = Doc(Vocab())
doc.from_disk("doc.spacy")
print("")
print_doc(doc)

# Restore document from bytes, again starting from a new, empty Vocab
doc = Doc(Vocab())
doc.from_bytes(doc_base_bytes)
print("")
print_doc(doc)

# Restore using DocBin, decoding against the vocab of the loaded pipeline
docbin = DocBin().from_bytes(docbin_base_bytes)
docs = list(docbin.get_docs(nlp.vocab))
print("")
print_doc(docs[0])
def process(self):
    """
    Opens the SpaCy output and gets ze entities.

    Collects, for every document in the source archive, the named entities
    whose label is listed in ``self.parameters["entities"]`` and writes a
    ranked word/entity/count table. If named entity recognition was not
    enabled in the previous module, or the source dataset has more than
    25,000 rows, a status message is set and the dataset is finished with
    zero results.

    Raises:
        ProcessorInterruptedException: If the worker was asked to stop while
            documents were being processed.
    """
    # Validate whether the user enabled the right parameters.
    if "ner" not in self.source_dataset.parameters["enable"]:
        self.dataset.update_status(
            "Enable \"Named entity recognition\" in previous module")
        self.dataset.finish(0)
        return

    if self.source_dataset.num_rows > 25000:
        self.dataset.update_status(
            "Named entity recognition is only available for datasets smaller than 25.000 items."
        )
        self.dataset.finish(0)
        return

    # Extract the SpaCy docs first
    self.dataset.update_status("Unzipping SpaCy docs")

    # Store all the entities in this list, one sub-list per document
    li_entities = []
    nlp = spacy.load("en_core_web_sm")  # Load model

    for doc_file in self.iterate_archive_contents(self.source_file):
        with doc_file.open("rb") as pickle_file:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only use this on archives from a trusted source.
            file = pickle.load(pickle_file)

        # Load DocBin
        doc_bin = DocBin().from_bytes(file)
        docs = list(doc_bin.get_docs(nlp.vocab))

        # Each archive member is handled as soon as it is loaded, so an
        # archive with several (or no) files is processed correctly.
        for doc in docs:
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while processing documents")

            li_entities.append([
                (ent.text, ent.label_)  # Add a tuple
                for ent in doc.ents
                if ent.label_ in self.parameters["entities"]
            ])

    results = []

    if li_entities:
        # Also add the data to the original csv file, if indicated.
        if self.parameters.get("overwrite"):
            self.update_parent(li_entities)

        # Convert to lower and filter out one-letter words. Group on
        # (text, label) tuples rather than a joined string, so entity text
        # that happens to contain the separator cannot be split wrongly.
        all_entities = []
        for post_ents in li_entities:
            for pair in post_ents:
                if pair and len(pair[0]) > 1:
                    all_entities.append((pair[0].lower(), pair[1]))

        # Group and rank
        count_nouns = Counter(all_entities).most_common()

        # List the count per word/entity combination.
        results = [{
            "word": word,
            "entity": label,
            "count": count
        } for (word, label), count in count_nouns]

    # done!
    if results:
        self.dataset.update_status("Finished")
        self.write_csv_items_and_finish(results)
    else:
        self.dataset.update_status(
            "Finished, but no entities were extracted.")
        self.dataset.finish(0)