def _bio_to_standoff(bio_tags: List[str], spacy_doc: spacy.tokens.Doc) -> List[Annotation]:
    """Convert BIO tagged document to annotations in standoff format.

    The original spaCy document is used to recreate correct entity offsets.

    Parameters
    ----------
    bio_tags : List[str]
        A BIO tagged sentence. `len(bio_tags) == len(spacy_doc)` has to hold.
    spacy_doc : spacy.tokens.Doc
        The spaCy doc corresponding to the BIO tags.

    Returns
    -------
    List[Annotation]
        The standoff annotations.

    """
    bio_tags = fix_dangling_entities(bio_tags)
    biluo_tags = _bio_to_biluo(bio_tags)
    offsets = offsets_from_biluo_tags(spacy_doc, biluo_tags)

    annotations = []
    for i, offset in enumerate(offsets):
        annotations.append(Annotation(
            text=spacy_doc.char_span(offset[0], offset[1]).text,
            start=offset[0],
            end=offset[1],
            tag=offset[2],
            ann_id='T{}'.format(i),
        ))

    return annotations
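A minimal usage sketch for the converter above. Assumptions: spaCy v2.x (where
offsets_from_biluo_tags lives in spacy.gold), a blank English pipeline whose
tokenization lines up with the BIO tags, and that the surrounding module's helpers
(fix_dangling_entities, _bio_to_biluo, Annotation) are in scope.

import spacy

nlp = spacy.blank('en')
doc = nlp("Barack Obama visited Paris")    # tokens: Barack, Obama, visited, Paris
bio = ["B-PER", "I-PER", "O", "B-LOC"]     # one BIO tag per token

annotations = _bio_to_standoff(bio, doc)
# e.g. Annotation(text='Barack Obama', start=0, end=12, tag='PER', ann_id='T0')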
Example #2
def parse_sp(ner: List[str], sentence: str) -> List[NERMarkerIdx]:
    """Parse a NER tagged sentence based on spaCy (used for longer sentences)."""
    # U marks a single-token (unit) entity, i.e. no B/I/L (spaCy BILUO scheme)
    doc = nlp(sentence)
    return [
        NERMarkerIdx(*offset)
        for offset in offsets_from_biluo_tags(doc, ner)
    ]
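A usage sketch for parse_sp. Assumptions: the module-level nlp pipeline that parse_sp
references is a blank English model, the tag sequence is already BILUO and aligned with
its tokenization, and NERMarkerIdx is a simple (start, end, tag) container.

import spacy
from collections import namedtuple

NERMarkerIdx = namedtuple('NERMarkerIdx', ['start', 'end', 'tag'])  # assumed shape
nlp = spacy.blank('en')

ner = ["U-PER", "O", "U-PER", "O", "U-LOC"]
markers = parse_sp(ner, "Alice met Bob in Berlin")
# e.g. [NERMarkerIdx(start=0, end=5, tag='PER'), NERMarkerIdx(start=10, end=13, tag='PER'), ...]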
Example #3
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
    assert offsets_converted == offsets
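The same roundtrip can be written against spaCy v3+, where these helpers were renamed
and moved to spacy.training (the v2 names used throughout these examples live in
spacy.gold). A sketch assuming spaCy >= 3.0:

import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets

nlp = spacy.blank("en")
doc = nlp("I flew to Silicon Valley via London.")
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]

biluo = offsets_to_biluo_tags(doc, offsets)           # v3 name for biluo_tags_from_offsets
assert biluo_tags_to_offsets(doc, biluo) == offsets   # v3 name for offsets_from_biluo_tags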
Example #5
    def _sentence_to_spacy_annotations(self, tokens,
                                       tags) -> Tuple[str, Tuple]:
        sentence = " ".join(tokens)
        tags = iob_to_biluo(tags)

        doc = self.nlp(sentence)
        annotations = offsets_from_biluo_tags(doc, tags)
        annotations = [(begin, end, tag) for begin, end, tag in annotations
                       if len(tag) > 0]

        return sentence, annotations
Example #6
    def process(self, line, intent_treshold_score=0.5):
        doc = self.nlp.make_doc(line)
        words_true = [w.text for w in doc]
        length = len(words_true)
        words_true += ['<EOS>']
        words = words_true + ['<PAD>'] * (50 - len(words_true))
        words = np.array(words)
        batch = [{'words': words, 'length': length}]
        decoder_prediction, intent, intent_score = self.model.test(batch)
        # batch only contains one element
        intent = intent[0]
        intent_score = intent_score[0]
        # get the part that corresponds to words (truncate PAD and EOS)
        decoder_prediction = decoder_prediction[:length, 0]
        #print(decoder_prediction, intent[0], intent_score)
        # clean up <EOS> and <PAD>
        decoder_prediction = [
            t if (t != '<EOS>' and t != '<PAD>') else 'O'
            for t in decoder_prediction
        ]
        biluo_tags = iob_to_biluo(decoder_prediction)
        entities_offsets = offsets_from_biluo_tags(doc, biluo_tags)
        entities = []
        for ent in entities_offsets:
            e_parts = ent[2].split('.')
            if len(e_parts) > 1:
                # role.type
                entity = {'role': e_parts[0], 'type': e_parts[1]}
            else:
                entity = {'role': None, 'type': e_parts[0]}
            value = line[ent[0]:ent[1]]
            entities.append({
                '_entity': entity['type'],
                'role': entity['role'],
                'value': value,
                '_body': value,
                '_start': ent[0],
                '_end': ent[1]
            })

        # now convert to the same format as wit.ai, applying the threshold
        if intent_score < intent_treshold_score:
            intent_result = None
        else:
            intent_result = {'confidence': str(intent_score), 'value': intent}

        entities_result = {}
        for ent in entities:
            if ent['role']:
                entities_result[ent['role']] = ent
            else:
                entities_result[ent['_entity']] = ent

        return intent_result, entities_result
Example #7
    def _sentence_to_spacy_annotations(self, tokens,
                                       tags) -> Tuple[str, Tuple]:
        sentence = " ".join(tokens)
        tags = iob_to_biluo(tags)

        doc = self.nlp(sentence)
        annotations = offsets_from_biluo_tags(doc, tags)
        # print(sentence)
        # print(tags)
        # print(annotations)

        return sentence, annotations
Example #8

def format_predictions_to_display(doc,
                                  predictions,
                                  probability_maps,
                                  pos=False):
    """Format predictions into spaCy display format."""
    bert_predictions = []
    iob_tags = []
    tags_formatted = []

    for prediction, probability_map in zip(predictions[0],
                                           probability_maps[0]):
        word = list(prediction.keys())[0]
        probas = probability_map[word]
        normalized_probas = list(softmax(np.mean(probas, axis=0)))
        bert_predictions.append(
            (word, prediction[word], np.max(normalized_probas)))
        if pos:
            iob_tags.append("I-" + prediction[word])
        else:
            iob_tags.append(prediction[word])

    biluo_tags = iob_to_biluo(iob_tags)
    tags = offsets_from_biluo_tags(doc, biluo_tags)

    for tag in tags:
        start_token = get_token_for_char(doc, tag[0])
        word_span = doc.text[tag[0]:tag[1]]
        length_of_span = len(word_span.split())
        if length_of_span == 1:
            probs = [bert_predictions[start_token][2]]
        else:
            probs = [
                item[2] for item in bert_predictions[start_token:start_token +
                                                     length_of_span]
            ]
        tags_formatted.append({
            "start": tag[0],
            "end": tag[1],
            "label": tag[2],
            "score": np.prod(probs)
        })
    return bert_predictions, tags_formatted
Example #9
def predictions_to_doccano(input_file: str,
                           output_file: str,
                           language='en_core_web_sm'):
    """
    Convert AllenNLP output json to Doccano style.

    :param input_file str: Input AllenNLP json file.
    :param output_file str: Doccano style output json.
    :param language SpaCy language: For splitting by sentences.
    """
    nlp = spacy.blank('en')
    json_lines = []
    with jsonlines.open(input_file) as reader:
        for obj in reader:
            text = obj['sentence']
            doc = nlp(text)
            offsets = offsets_from_biluo_tags(doc, obj['tags'])

            json_line = {'text': text, 'labels': offsets}
            json_lines.append(json_line)

    with jsonlines.open(output_file, mode='w') as writer:
        for line in json_lines:
            writer.write(line)
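A small end-to-end sketch of the conversion above, writing one hypothetical
AllenNLP-style line ({'sentence': ..., 'tags': [...]}) and reading back the
Doccano-style result. Assumes the tags are BILUO and aligned with the tokenization
of spacy.blank('en'); the file names are placeholders.

import jsonlines

with jsonlines.open('preds.jsonl', mode='w') as writer:
    writer.write({'sentence': 'Alice flew to Berlin',
                  'tags': ['U-PER', 'O', 'O', 'U-LOC']})

predictions_to_doccano('preds.jsonl', 'doccano.jsonl')
# doccano.jsonl then contains:
# {"text": "Alice flew to Berlin", "labels": [[0, 5, "PER"], [14, 20, "LOC"]]}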
Example #10
def brat2spacy(tokenizer, ann, text):
    doc = tokenizer(text)
    words = [i.text for i in doc]
    entity_ids = defaultdict(tuple)
    relation_ids = defaultdict(tuple)
    entities = []
    for line in ann.strip().split('\n'):
        annotation = line.strip().rsplit('\t')
        id_ = annotation[0]
        if id_ == '*':
            ann_type = id_[0]
        else:
            ann_type = annotation_ids[id_[0]]
        if ann_type == 'entity':
            if len(annotation[1:]) == 2:
                span, surface_form = annotation[1:]
                entity_type, start, end = span.split(' ')
                entity_ids[id_] = (int(start), int(end))
                entities.append((int(start), int(end), entity_type))
        if ann_type == 'relation':
            if len(annotation[1:]) == 1:
                rel_type, head, dep = annotation[1].split(' ')
                relation_ids[id_] = (rel_type, head, dep)
    entities.sort(key=lambda x: x[0])
    tags = biluo_tags_from_offsets(doc, entities)
    if relation_ids:
        # mapping from brat ids to doc's id
        brat_doc_ids_map = {}
        for entity in entity_ids:
            span = doc.char_span(*entity_ids[entity])
            if span.end - span.start == 1:
                brat_doc_ids_map[entity] = span.start
            else:
                # raise Warning("Tokenization mismatch, more than 1 spaCy token in ann token span")
                brat_doc_ids_map[entity] = span.start
        ids = range(len(doc))
        heads = defaultdict(int)
        deps = defaultdict(int)
        for rel_id, rel in relation_ids.items():
            dep, token, head = rel
            token, head = brat_doc_ids_map[token.split(
                ':')[1]], brat_doc_ids_map[head.split(':')[1]]
            heads[head] = token
            deps[head] = dep
        heads = [
            i[1] if i[1] > 0 else i[0] for i in [(i, heads[i]) for i in ids]
        ]
        deps = [
            i[1] if i[1] != 0 else 'ROOT' for i in [(i, deps[i]) for i in ids]
        ]
        assert len(words) == len(heads) == len(deps) == len(tags)
        return GoldParse(doc,
                         words=words,
                         heads=heads,
                         tags=tags,
                         deps=deps,
                         entities=entities), text
    else:
        assert len(words) == len(tags)
        return GoldParse(doc,
                         words=words,
                         tags=tags,
                         entities=offsets_from_biluo_tags(doc, tags)), text
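A sketch of feeding brat2spacy a minimal .ann/.txt pair. Assumptions: spaCy v2.x
(GoldParse was removed in v3) and a module-level annotation_ids mapping from brat
ID prefixes to annotation kinds, which the function references but is not shown here.

import spacy

annotation_ids = {'T': 'entity', 'R': 'relation'}   # assumed shape of the elided global

nlp = spacy.blank('en')
text = "Barack Obama visited Paris"
ann = "T1\tPERSON 0 12\tBarack Obama\nT2\tGPE 21 26\tParis"

gold, raw = brat2spacy(nlp.tokenizer, ann, text)
# gold is a spacy.gold.GoldParse built from the two entity spans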
Example #11
File: cached.py  Project: schlevik/tmdm
class Cached(Provider):
    cache: Dict[str, Any]
    name = 'cached'
    known_schemas = {
        # these assume same tokenisation
        "bio":
        lambda doc, annotation: offsets_from_biluo_tags(
            iob_to_biluo(doc, annotation)),
        "bilou":
        offsets_from_biluo_tags,
        "offsets":
        OFFSETS,
        "list_of_clusters":
        convert_clusters_to_offsets,
        # these provide their own tokenisation

        # annotation: List[Tuple[str,str]]
        "list_of_tuples_bio_flat":
        lambda doc, annotation: get_offsets(doc.text, annotation),

        # annotation: List[List[Tuple[str,str]]]
        "list_of_tuples_bio_stacked":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, annotation),

        # annotation: Tuple[List[str],List[str]]
        "tuple_of_lists_flat":
        lambda doc, annotation: get_offsets(doc.text, zip(*annotation[:2])),

        # annotation: List[Tuple[List[str]], Tuple[List[str]]]
        "list_of_tuples_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for t in annotation for w, l in zip(*t[:2]))),

        # annotation: Tuple[List[List[str]], List[List[str]]]
        "tuple_of_lists_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for ws, ls in zip(*annotation[:2])
                       for w, l in zip(ws, ls)))

        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any],
                                             OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        self.cache = {}
        self.loaded = False
        if not schema:
            self.schema = OFFSETS
        elif schema in self.known_schemas:
            self.schema = Cached.known_schemas[schema]
        elif isinstance(schema, Callable):
            self.schema = schema
        else:
            self.schema = None
        self.getter = getter
        if path:
            self.load(path)

    @overrides
    def save(self, path: str):
        util.save_file(self.cache, path)

    # TODO: guess schema

    @overrides
    def load(self, path):
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if annotations:
            if self.schema:
                if self.schema == OFFSETS:
                    return self.getter(
                        annotations) if self.getter else annotations
                else:
                    return self.schema(
                        doc,
                        self.getter(annotations)
                        if self.getter else annotations)
            else:
                logger.info(
                    f"no schema loaded for {self.__class__.__name__}, good luck!"
                )
                return annotations
        # doc_toks is usually shorter
        j = 0
        k = 0
        new_tags = []
        while j < len(sentences[i]):
            if sentences[i][j] == doc_toks[k]:
                new_tags.append(iobs[i][j])
                j += 1
                k += 1
            else:
                new_tags.append(iobs[i][j])
                k += 1
                j += 2
        tags = iob_to_biluo(new_tags)
    else:
        tags = iob_to_biluo(iobs[i])
    try:
        entities = offsets_from_biluo_tags(doc, tags)
        e = (detokenized_sent, entities)
        corpus.append(e)
    except Exception as err:
        print(err, detokenized_sent)
        continue

print(len(corpus))
corpus = [e for e in corpus if len(e[0]) > 0]
print(len(corpus))

with open("data/interim/corpus.p", "wb") as of:
    pickle.dump(corpus, of)