def _bio_to_standoff(bio_tags: List[str], spacy_doc: spacy.tokens.Doc) -> List[Annotation]:
    """Convert BIO tagged document to annotations in standoff format.

    The original spaCy document is used to recreate correct entity offsets.

    Parameters
    ----------
    bio_tags : List[str]
        A BIO tagged sentence. `len(bio_tags) == len(spacy_doc)` has to hold.
    spacy_doc : spacy.tokens.Doc
        The spaCy doc corresponding to the BIO tags.

    Returns
    -------
    List[Annotation]
        The standoff annotations.
    """
    bio_tags = fix_dangling_entities(bio_tags)
    biluo_tags = _bio_to_biluo(bio_tags)
    offsets = offsets_from_biluo_tags(spacy_doc, biluo_tags)
    annotations = []
    for i, offset in enumerate(offsets):
        annotations.append(Annotation(
            text=spacy_doc.char_span(offset[0], offset[1]).text,
            start=offset[0],
            end=offset[1],
            tag=offset[2],
            ann_id='T{}'.format(i),
        ))
    return annotations
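# Usage sketch (illustrative, not from the original source): `Annotation`,
# `fix_dangling_entities` and `_bio_to_biluo` are the module's own helpers,
# so this only shows the call shape, with a blank English pipeline assumed.
import spacy

nlp = spacy.blank("en")
doc = nlp("Alice moved to Berlin")
annotations = _bio_to_standoff(["B-PER", "O", "O", "B-LOC"], doc)
# -> [Annotation(text='Alice', start=0, end=5, tag='PER', ann_id='T0'),
#     Annotation(text='Berlin', start=15, end=21, tag='LOC', ann_id='T1')]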
def parse_sp(ner: List[str], sentence: str) -> List[NERMarkerIdx]:
    """Parse a NER tagged sentence based on spaCy (used for longer sentences)."""
    # U for unit: a single-token entity gets neither B nor I in spaCy's BILUO scheme
    doc = nlp(sentence)
    return [
        NERMarkerIdx(*offset) for offset in offsets_from_biluo_tags(doc, ner)
    ]
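# Illustrative call (`nlp` and `NERMarkerIdx` come from the same module):
#   parse_sp(["O", "U-LOC", "O"], "visit Paris today")
#   -> [NERMarkerIdx(6, 11, "LOC")]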
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
    assert offsets_converted == offsets
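# The same roundtrip, runnable outside the test suite with a blank tokenizer
# (a sketch assuming the spaCy v2 `spacy.gold` API used throughout this page):
import spacy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags

doc = spacy.blank("en")("I flew to Silicon Valley via London.")
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
tags = biluo_tags_from_offsets(doc, offsets)
assert offsets_from_biluo_tags(doc, tags) == offsets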
def _sentence_to_spacy_annotations(self, tokens, tags) -> Tuple[str, Tuple]:
    sentence = " ".join(tokens)
    tags = iob_to_biluo(tags)
    doc = self.nlp(sentence)
    annotations = offsets_from_biluo_tags(doc, tags)
    annotations = [(begin, end, tag)
                   for begin, end, tag in annotations
                   if len(tag) > 0]
    return sentence, annotations
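# Illustrative input/output for the conversion above (values made up):
#   tokens = ["Alice", "visited", "Paris"]
#   tags   = ["B-PER", "O", "B-LOC"]
# iob_to_biluo yields ["U-PER", "O", "U-LOC"], and the method returns
#   ("Alice visited Paris", [(0, 5, "PER"), (14, 19, "LOC")])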
def process(self, line, intent_treshold_score=0.5):
    doc = self.nlp.make_doc(line)
    words_true = [w.text for w in doc]
    length = len(words_true)
    words_true += ['<EOS>']
    words = words_true + ['<PAD>'] * (50 - len(words_true))
    words = np.array(words)
    batch = [{'words': words, 'length': length}]
    decoder_prediction, intent, intent_score = self.model.test(batch)
    # batch only contains one element
    intent = intent[0]
    intent_score = intent_score[0]
    # get the part that corresponds to words (truncate PAD and EOS)
    decoder_prediction = decoder_prediction[:length, 0]
    # clean up <EOS> and <PAD>
    decoder_prediction = [
        t if (t != '<EOS>' and t != '<PAD>') else 'O'
        for t in decoder_prediction
    ]
    biluo_tags = iob_to_biluo(decoder_prediction)
    entities_offsets = offsets_from_biluo_tags(doc, biluo_tags)
    entities = []
    for ent in entities_offsets:
        e_parts = ent[2].split('.')
        if len(e_parts) > 1:
            # role.type
            entity = {'role': e_parts[0], 'type': e_parts[1]}
        else:
            entity = {'role': None, 'type': e_parts[0]}
        value = line[ent[0]:ent[1]]
        entities.append({
            '_entity': entity['type'],
            'role': entity['role'],
            'value': value,
            '_body': value,
            '_start': ent[0],
            '_end': ent[1]
        })
    # now convert to the same format as wit.ai, applying the threshold
    if intent_score < intent_treshold_score:
        intent_result = None
    else:
        intent_result = {'confidence': str(intent_score), 'value': intent}
    entities_result = {}
    for ent in entities:
        if ent['role']:
            entities_result[ent['role']] = ent
        else:
            entities_result[ent['_entity']] = ent
    return intent_result, entities_result
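# Hypothetical call, assuming a trained `self.model` (all values illustrative):
#   intent, entities = nlu.process("turn on the kitchen light")
#   intent   -> {'confidence': '0.93', 'value': 'switch_on'}
#   entities -> keyed by role (or by entity type when no role is predicted),
#               each value carrying '_start'/'_end' character offsets into the
#               input line, as computed by offsets_from_biluo_tags.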
def _sentence_to_spacy_annotations(self, tokens, tags) -> Tuple[str, Tuple]:
    sentence = " ".join(tokens)
    tags = iob_to_biluo(tags)
    doc = self.nlp(sentence)
    annotations = offsets_from_biluo_tags(doc, tags)
    return sentence, annotations
def format_predictions_to_display(doc, predictions, probability_maps, pos=False):
    """Format predictions into the spaCy displaCy format."""
    bert_predictions = []
    iob_tags = []
    tags_formatted = []
    for prediction, probability_map in zip(predictions[0], probability_maps[0]):
        word = list(prediction.keys())[0]
        probas = probability_map[word]
        normalized_probas = list(softmax(np.mean(probas, axis=0)))
        bert_predictions.append(
            (word, prediction[word], np.max(normalized_probas)))
        if pos:
            iob_tags.append("I-" + prediction[word])
        else:
            iob_tags.append(prediction[word])

    biluo_tags = iob_to_biluo(iob_tags)
    tags = offsets_from_biluo_tags(doc, biluo_tags)

    for tag in tags:
        start_token = get_token_for_char(doc, tag[0])
        word_span = doc.text[tag[0]:tag[1]]
        length_of_span = len(word_span.split())
        if length_of_span == 1:
            probs = [bert_predictions[start_token][2]]
        else:
            probs = [
                item[2]
                for item in bert_predictions[start_token:start_token + length_of_span]
            ]
        tags_formatted.append({
            "start": tag[0],
            "end": tag[1],
            "label": tag[2],
            "score": np.prod(probs)
        })
    return bert_predictions, tags_formatted
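# The offset dicts can be passed to displaCy's manual entity renderer once the
# extra "score" key is dropped (a sketch, assuming `doc`, `predictions` and
# `probability_maps` as above):
from spacy import displacy

_, ents = format_predictions_to_display(doc, predictions, probability_maps)
ents = [{"start": e["start"], "end": e["end"], "label": e["label"]}
        for e in ents]
displacy.render({"text": doc.text, "ents": ents, "title": None},
                style="ent", manual=True)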
def predictions_to_doccano(input_file: str, output_file: str, language='en_core_web_sm'):
    """
    Convert AllenNLP output JSON to Doccano style.

    :param input_file str: Input AllenNLP json file.
    :param output_file str: Doccano style output json.
    :param language str: SpaCy language/model name (currently unused; a blank
        English pipeline is used for tokenisation).
    """
    nlp = spacy.blank('en')
    json_lines = []
    with jsonlines.open(input_file) as reader:
        for obj in reader:
            text = obj['sentence']
            doc = nlp(text)
            offsets = offsets_from_biluo_tags(doc, obj['tags'])
            json_line = {'text': text, 'labels': offsets}
            json_lines.append(json_line)
    with jsonlines.open(output_file, mode='w') as writer:
        for line in json_lines:
            writer.write(line)
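# Illustrative input/output (file names made up): a line such as
#   {"sentence": "Flights to Paris", "tags": ["O", "O", "U-LOC"]}
# in the input file becomes
#   {"text": "Flights to Paris", "labels": [[11, 16, "LOC"]]}
# predictions_to_doccano("predictions.jsonl", "doccano.jsonl")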
def brat2spacy(tokenizer, ann, text):
    doc = tokenizer(text)
    words = [i.text for i in doc]
    entity_ids = defaultdict(tuple)
    relation_ids = defaultdict(tuple)
    entities = []
    for line in ann.strip().split('\n'):
        annotation = line.strip().rsplit('\t')
        id_ = annotation[0]
        if id_ == '*':
            ann_type = id_[0]
        else:
            ann_type = annotation_ids[id_[0]]
        if ann_type == 'entity':
            if len(annotation[1:]) == 2:
                span, surface_form = annotation[1:]
                entity_type, start, end = span.split(' ')
                entity_ids[id_] = (int(start), int(end))
                entities.append((int(start), int(end), entity_type))
        if ann_type == 'relation':
            if len(annotation[1:]) == 1:
                rel_type, head, dep = annotation[1].split(' ')
                relation_ids[id_] = (rel_type, head, dep)
    entities.sort(key=lambda x: x[0])
    tags = biluo_tags_from_offsets(doc, entities)
    if relation_ids:
        # mapping from brat ids to doc's id
        brat_doc_ids_map = {}
        for entity in entity_ids:
            span = doc.char_span(*entity_ids[entity])
            if span.end - span.start == 1:
                brat_doc_ids_map[entity] = span.start
            else:
                # raise Warning("Tokenization mismatch, more than 1 spaCy token in ann token span")
                brat_doc_ids_map[entity] = span.start
        ids = range(len(doc))
        heads = defaultdict(int)
        deps = defaultdict(int)
        for rel_id, rel in relation_ids.items():
            dep, token, head = rel
            # brat relation arguments look like "Arg1:T1"; split off the entity id
            token, head = brat_doc_ids_map[token.split(':')[1]], \
                brat_doc_ids_map[head.split(':')[1]]
            heads[head] = token
            deps[head] = dep
        heads = [
            i[1] if i[1] > 0 else i[0] for i in [(i, heads[i]) for i in ids]
        ]
        deps = [
            i[1] if i[1] != 0 else 'ROOT' for i in [(i, deps[i]) for i in ids]
        ]
        assert len(words) == len(heads) == len(deps) == len(tags)
        return GoldParse(doc, words=words, heads=heads, tags=tags,
                         deps=deps, entities=entities), text
    else:
        assert len(words) == len(tags)
        return GoldParse(doc, words=words, tags=tags,
                         entities=offsets_from_biluo_tags(doc, tags)), text
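# Minimal illustrative call (the `annotation_ids` lookup, e.g.
# {'T': 'entity', 'R': 'relation'}, comes from the surrounding module):
import spacy

ann = "T1\tCity 15 21\tBerlin"
text = "Alice moved to Berlin"
gold, raw = brat2spacy(spacy.blank("en").tokenizer, ann, text)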
class Cached(Provider):
    cache: Dict[str, Any]
    name = 'cached'
    known_schemas = {
        # these assume the same tokenisation
        "bio": lambda doc, annotation: offsets_from_biluo_tags(
            doc, iob_to_biluo(annotation)),
        "bilou": offsets_from_biluo_tags,
        "offsets": OFFSETS,
        "list_of_clusters": convert_clusters_to_offsets,
        # these provide their own tokenisation
        # annotation: List[Tuple[str, str]]
        "list_of_tuples_bio_flat": lambda doc, annotation: get_offsets(
            doc.text, annotation),
        # annotation: List[List[Tuple[str, str]]]
        "list_of_tuples_bio_stacked": lambda doc, annotation:
            get_offsets_from_sentences(doc.text, annotation),
        # annotation: Tuple[List[str], List[str]]
        "tuple_of_lists_flat": lambda doc, annotation: get_offsets(
            doc.text, zip(*annotation[:2])),
        # annotation: List[Tuple[List[str], List[str]]]
        "list_of_tuples_of_lists": lambda doc, annotation:
            get_offsets_from_sentences(
                doc.text, ((w, l) for t in annotation for w, l in zip(*t[:2]))),
        # annotation: Tuple[List[List[str]], List[List[str]]]
        "tuple_of_lists_of_lists": lambda doc, annotation:
            get_offsets_from_sentences(
                doc.text, ((w, l) for ws, ls in zip(*annotation[:2])
                           for w, l in zip(ws, ls)))
        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any], OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        self.cache = {}
        self.loaded = False
        if not schema:
            self.schema = OFFSETS
        elif schema in self.known_schemas:
            self.schema = Cached.known_schemas[schema]
        elif isinstance(schema, Callable):
            self.schema = schema
        else:
            self.schema = None
        self.getter = getter
        if path:
            self.load(path)

    @overrides
    def save(self, path: str):
        util.save_file(self.cache, path)

    # TODO: guess schema
    @overrides
    def load(self, path):
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if annotations:
            if self.schema:
                if self.schema == OFFSETS:
                    return self.getter(annotations) if self.getter else annotations
                else:
                    return self.schema(
                        doc,
                        self.getter(annotations) if self.getter else annotations)
            else:
                logger.info(
                    f"no schema loaded for {self.__class__.__name__}, good luck!")
                return annotations
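# Hypothetical usage (Provider, util and the OFFSETS sentinel come from the
# surrounding package; the cache file name is made up): load a cache mapping
# document ids to BILOU tag lists and convert them to offsets on lookup.
provider = Cached(schema="bilou", path="annotations.cache")
spans = provider.annotate_document(doc)  # requires doc._.id to be set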
# doc_toks is usually shorter
# This excerpt sits inside a loop over sentences (hence the `continue` below);
# the guarding `if` condition is an assumption, it is not part of the excerpt.
if len(sentences[i]) != len(doc_toks):
    # realign the IOB tags with the spaCy tokenisation
    j = 0
    k = 0
    new_tags = []
    while j < len(sentences[i]):
        if sentences[i][j] == doc_toks[k]:
            new_tags.append(iobs[i][j])
            j += 1
            k += 1
        else:
            new_tags.append(iobs[i][j])
            k += 1
            j += 2
    tags = iob_to_biluo(new_tags)
else:
    tags = iob_to_biluo(iobs[i])
try:
    entities = offsets_from_biluo_tags(doc, tags)
    e = (detokenized_sent, entities)
    corpus.append(e)
except Exception as err:
    print(err, detokenized_sent)
    continue

print(len(corpus))
corpus = [e for e in corpus if len(e[0]) > 0]
print(len(corpus))
with open("data/interim/corpus.p", "wb") as of:
    pickle.dump(corpus, of)
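# Reading the pickled corpus back (a sketch; same path as above):
import pickle

with open("data/interim/corpus.p", "rb") as f:
    corpus = pickle.load(f)
# corpus: List[Tuple[str, List[Tuple[int, int, str]]]]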