def test_gold_biluo_overlap(en_vocab):
    """Overlapping entity offsets must be rejected with a ValueError."""
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    # Second span is fully contained in the first -> overlap.
    entities = [
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("I flew to "), len("I flew to San Francisco"), "LOC"),
    ]
    with pytest.raises(ValueError):
        offsets_to_biluo_tags(doc, entities)
def test_cli_converters_conllu_to_docs_name_ner_map(lines):
    """CoNLL-U conversion with an ner_map: PER is renamed, BAD is dropped."""
    input_data = "\n".join(lines)
    ner_map = {"PER": "PERSON", "BAD": ""}
    converted_docs = list(conllu_to_docs(input_data, n_sents=1, ner_map=ner_map))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    paragraphs = converted[0]["paragraphs"]
    assert len(paragraphs) == 1
    assert paragraphs[0]["raw"] == "Dommer FinnEilertsen avstår. "
    assert len(paragraphs[0]["sentences"]) == 1
    sent = paragraphs[0]["sentences"][0]
    tokens = sent["tokens"]
    assert len(tokens) == 5
    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
    assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
    ent_offsets = [(e[0], e[1], e[2]) for e in paragraphs[0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu_to_docs():
    """Basic CoNLL-U to Doc conversion, including BILUO entity tags."""
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    converted_docs = list(conllu_to_docs("\n".join(lines), n_sents=1))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    paragraphs = converted[0]["paragraphs"]
    assert len(paragraphs) == 1
    assert len(paragraphs[0]["sentences"]) == 1
    sent = paragraphs[0]["sentences"][0]
    tokens = sent["tokens"]
    assert len(tokens) == 4
    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"]
    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
    assert [t["head"] for t in tokens] == [1, 2, -1, 0]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
    ent_offsets = [(e[0], e[1], e[2]) for e in paragraphs[0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
def test_gold_biluo_BIL(en_vocab):
    """A three-token entity should produce B-, I- and L- tags."""
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    span = (len("I flew to "), len("I flew to San Francisco Valley"), "LOC")
    tags = offsets_to_biluo_tags(doc, [span])
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_U(en_vocab):
    """A single-token entity should produce a U- (unit) tag."""
    words = ["I", "flew", "to", "London", "."]
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    span = (len("I flew to "), len("I flew to London"), "LOC")
    tags = offsets_to_biluo_tags(doc, [span])
    assert tags == ["O", "O", "O", "U-LOC", "O"]
def tags_from_offsets(
    doc: Doc,
    offsets: List[Dict],
    label_encoding: Optional[str] = "BIOUL",
) -> List[str]:
    """Converts offsets to BIOUL or BIO tags using spacy's `offsets_to_biluo_tags`.

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    offsets
        A list of dicts with start and end character index with respect to the doc,
        and the span label: `{"start": int, "end": int, "label": str}`
    label_encoding
        The label encoding to be used: BIOUL or BIO

    Returns
    -------
    tags (BIOUL or BIO)
    """
    spans = [(span["start"], span["end"], span["label"]) for span in offsets]
    biluo_tags = offsets_to_biluo_tags(doc, spans)
    if label_encoding == "BIO":
        # Collapse the richer BIOUL scheme down to plain BIO.
        return bioul_tags_to_bio_tags(biluo_tags)
    return biluo_tags
def test_gold_biluo_misalign(en_vocab):
    """Offsets that don't align to token boundaries warn and yield '-' tags."""
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    # Entity ends inside the token "Valley." -> misaligned.
    span = (len("I flew to "), len("I flew to San Francisco Valley"), "LOC")
    with pytest.warns(UserWarning):
        tags = offsets_to_biluo_tags(doc, [span])
    assert tags == ["O", "O", "O", "-", "-", "-"]
def generate_corpus(nlp):
    """Build (or load) a serialized .spacy corpus from a .jsonl annotation file.

    If a previously serialized corpus exists on disk it is loaded and returned
    immediately; otherwise the raw .jsonl file (doccano-style: character-level
    offsets under 'label', text under 'data') is converted to a DocBin,
    written to disk, and then loaded.

    Parameters
    ----------
    nlp : the spaCy pipeline used for tokenization (make_doc) and for
        reading the corpus back via Corpus.

    Returns
    -------
    The examples produced by Corpus(corpus_path)(nlp), or None if the corpus
    could not be written.

    NOTE: relies on the module-level `file_name` for the corpus base name.
    """
    directory_path = path.join('data')
    corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
    raw_path = Path(path.join(directory_path, file_name) + ".jsonl")
    # Fast path: corpus already serialized.
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
    vulnerabilities = []
    with open(raw_path) as file:
        # Iterate the file lazily instead of materializing all lines.
        for line in file:
            vulnerability = loads(line)
            vulnerabilities.append({
                'description': vulnerability['data'],
                'entities': vulnerability.get('label', []),
            })
    corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
    for vulnerability in vulnerabilities:
        document = nlp.make_doc(vulnerability['description'].lower())
        # doccano annotates labels at the character level while make_doc
        # produces tokens; align the two via BILUO tags.
        tags = offsets_to_biluo_tags(document, vulnerability['entities'])
        entities = biluo_tags_to_spans(document, tags)
        document.set_ents(entities)
        corpus.add(document)
    corpus.to_disk(corpus_path)
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
    return None  # explicit: serialization failed
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    """Offsets -> BILUO tags -> offsets must round-trip losslessly."""
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
    # BUGFIX: filter the *converted* offsets. The previous code filtered the
    # original `offsets` list, overwriting the round-tripped result and making
    # the final assertion compare `offsets` against itself (vacuously true).
    offsets_converted = [ent for ent in offsets_converted if ent[2]]
    assert offsets_converted == offsets
def test_cli_converters_conllu_to_docs_subtokens():
    """Subtoken ranges (2-3) are merged into one token with combined morphology."""
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    converted_docs = list(
        conllu_to_docs(
            "\n".join(lines), n_sents=1, merge_subtokens=True, append_morphology=True
        )
    )
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    paragraphs = converted[0]["paragraphs"]
    assert len(paragraphs) == 1
    assert paragraphs[0]["raw"] == "Dommer FE avstår. "
    assert len(paragraphs[0]["sentences"]) == 1
    sent = paragraphs[0]["sentences"][0]
    tokens = sent["tokens"]
    assert len(tokens) == 4
    print(tokens)
    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
    assert [t["tag"] for t in tokens] == [
        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
        "PROPN_X__Gender=Fem,Masc|Tense=past",
        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
        "PUNCT",
    ]
    assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
    assert [t["morph"] for t in tokens] == [
        "Definite=Ind|Gender=Masc|Number=Sing",
        "Gender=Fem,Masc|Tense=past",
        "Mood=Ind|Tense=Pres|VerbForm=Fin",
        "",
    ]
    assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
    ent_offsets = [(e[0], e[1], e[2]) for e in paragraphs[0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "U-PER", "O", "O"]
def make_docs(folder, doc_list):
    """Take raw texts and their annotations and append spaCy Docs to doc_list.

    Parameters
    ----------
    folder : folder containing .txt and .out files (for this function to work
        the same folder name must exist in the ../annotated directory)
    doc_list : list the finished documents are appended to
    """
    nlp = spacy.load('ru_core_news_lg')
    raw_dir = 'data/bsnlp2021_train_r1/raw/{folder}/ru'.format(folder=folder)
    for filename in os.listdir(raw_dir):
        # BUGFIX: the format strings previously contained a garbled literal
        # instead of the '{filename}' placeholder, so the passed filename=
        # kwarg was ignored and the same (broken) path was read every time.
        # Annotation file: same base name, '.out' extension (filename[:-3]
        # drops the 'txt' suffix).
        df = pd.read_csv(
            'data/bsnlp2021_train_r1/annotated/{folder}/ru/{filename}{out}'.format(
                folder=folder, filename=filename[:-3], out='out'),
            skiprows=1,
            header=None,
            sep='\t',
            encoding='utf8',
            error_bad_lines=False,
            engine='python')
        # BUGFIX: the file handle was never closed and its variable was later
        # shadowed by a loop index; use a context manager and a distinct name.
        with open('data/bsnlp2021_train_r1/raw/{folder}/ru/{filename}'.format(
                folder=folder, filename=filename), "r", encoding='utf8') as raw_file:
            text = raw_file.read()
        list_words = df.iloc[:, 0].tolist()
        labels = df.iloc[:, 2].tolist()
        # Locate every occurrence of each annotated surface form in the text.
        entities = []
        for n in range(len(list_words)):
            for m in re.finditer(list_words[n].strip(), text):
                entities.append([m.start(), m.end(), labels[n]])
        # Resolve overlapping spans (spaCy requires non-overlapping entities):
        # discarded entries are replaced by '' so indices stay stable.
        for i in range(len(entities)):
            if len(entities[i]) != 3:
                continue
            for j in range(i + 1, len(entities)):
                if len(entities[j]) == 3 and len(entities[i]) == 3:
                    # Same start or same end: keep the longer span.
                    if entities[i][0] == entities[j][0] or entities[i][1] == entities[j][1]:
                        if (entities[i][1] - entities[i][0]) >= (entities[j][1] - entities[j][0]):
                            entities[j] = ''
                        else:
                            entities[i] = ''
                if len(entities[j]) == 3 and len(entities[i]) == 3:
                    # One span starts strictly inside the other: drop the inner one.
                    if entities[i][0] in range(entities[j][0] + 1, entities[j][1]):
                        entities[i] = ''
                    elif entities[j][0] in range(entities[i][0] + 1, entities[i][1]):
                        entities[j] = ''
        entities_cleared = [e for e in entities if len(e) == 3]
        doc = nlp(text)
        tags = offsets_to_biluo_tags(doc, entities_cleared)
        entities_x = biluo_tags_to_spans(doc, tags)
        doc.ents = entities_x
        doc_list.append(doc)