Example #1

import os

from lxml import etree
from tqdm import tqdm
from opennlp_wrapper import SentenceSplitter

opennlp_path = os.environ['OPENNLP']

def extract_from_biocreative(f, annotation_type, split_sentences=False):
    sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')
    tree = etree.parse(f)
    sentences = []
    entities = []
    document_ids = []
    documents = tree.xpath('.//document')

    for document in tqdm(documents):
        document_id = document.xpath('./id')[0].text
        passages = document.xpath('passage')
        uses_passage_offset = len(passages) > 1

        for passage in document.xpath('passage'):
            text = passage.xpath('text/text()')[0]

            if not split_sentences:
                tmp_sentences = text.split('\n')
            else:
                tmp_sentences = sentence_splitter.split(text)
            tmp_entities = [[] for _ in tmp_sentences]

            for annotation in passage.xpath('.//annotation'):
                # skip non-contiguous entities
                if len(annotation.xpath('.//location')) > 1:
                    continue

                is_entity = False

                for infon in annotation.xpath('.//infon'):
                    # annotation_type may be a single string or a list of types
                    if isinstance(annotation_type, str):
                        is_entity |= (infon.get('key') == 'type' and
                                      infon.text == annotation_type)
                    else:
                        is_entity |= (infon.get('key') == 'type' and
                                      infon.text in annotation_type)
                if not is_entity:
                    continue
                offset = int(annotation.xpath('.//location')[0].get('offset'))
                if uses_passage_offset:
                    offset -= int(passage.xpath('./offset/text()')[0])
                length = int(annotation.xpath('.//location')[0].get('length'))
                if not split_sentences:
                    # Walk forward to the sentence containing the entity; the
                    # +1 accounts for the newline removed by text.split('\n').
                    sentence_idx = 0
                    while offset > len(tmp_sentences[sentence_idx]):
                        offset -= len(tmp_sentences[sentence_idx]) + 1
                        sentence_idx += 1
                    end = offset + length
                else:
                    o_end = offset + length
                    o_offset = offset
                    sentence_idx, offset, end = sentence_splitter.map_offsets(
                        o_offset, o_end)
                    # If the entity crosses a sentence boundary, merge the
                    # affected sentences and recompute the offsets.
                    while len(tmp_sentences[sentence_idx]) < end:
                        tmp_sentences = sentence_splitter.merge_sentences(
                            sentence_idx)
                        tmp_entities[sentence_idx] += tmp_entities[sentence_idx + 1]
                        del tmp_entities[sentence_idx + 1]
                        sentence_idx, offset, end = sentence_splitter.map_offsets(
                            o_offset, o_end)
                annotated_entity = tmp_sentences[sentence_idx][offset:end]
                true_entity = annotation.xpath('.//text')[0].text
                assert annotated_entity.lower() == true_entity.lower()

                tmp_entities[sentence_idx] += [(offset, end)]

            document_ids += [document_id] * len(tmp_sentences)
            sentences += tmp_sentences
            entities += tmp_entities

    return sentences, entities, document_ids
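
A minimal usage sketch (the corpus path and entity type below are hypothetical; annotation_type may also be a list of types):

# Hypothetical call; point it at a BioC XML file from your corpus.
sentences, entities, document_ids = extract_from_biocreative(
    'corpus/train.xml', 'Chemical', split_sentences=True)
print(len(sentences), 'sentences,',
      sum(len(spans) for spans in entities), 'entity spans')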
Example #2
from flask import Flask, request, jsonify, session
import time
import codecs
import optparse
import numpy as np
from loader import prepare_sentence
from utils import create_input, iobes_iob, zero_digits
from model import Model
import sys
import os
import logging
from opennlp_wrapper import SentenceSplitter, OpenNLP

app = Flask(__name__)

model = Model(model_path="/usr/huner/models/" + sys.argv[1])
sentence_splitter = SentenceSplitter(os.getenv('OPENNLP'), 'en-sent.bin')
tokenizer = OpenNLP(os.getenv('OPENNLP'), 'TokenizerME', 'en-token.bin')
parameters = model.parameters


def split_sentences(text):
    text = text.strip()
    return sentence_splitter.split(text)


def tokenize(sentence):
    sentence = sentence.strip()
    return tokenizer.parse(sentence).decode().split()
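
The example is cut off before any route definitions. As a sketch of how these helpers could be exposed (the /split endpoint and its JSON response shape are assumptions, not part of the original server):

@app.route('/split', methods=['POST'])
def split():
    # Hypothetical endpoint: sentence-split and tokenize the posted text.
    text = request.get_data(as_text=True)
    return jsonify([tokenize(s) for s in split_sentences(text)])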


Example #3

import argparse
import os
import utils
import tqdm
from lxml import etree
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("input_dir")
parser.add_argument("types")
parser.add_argument("output")
args = parser.parse_args()

types = args.types.split(',')

sentences = []
entities = []
document_ids = []

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

files = [file for file in os.listdir(args.input_dir) if file.endswith('.ann')]

for file in tqdm.tqdm(files):
    with open(os.path.join(args.input_dir, file), 'r') as f_ann:
        with open(os.path.join(args.input_dir, file[:-4]), 'r') as f_txt:
            text = f_txt.read()
            document_id = text.split('\n\n')[0]
            tmp_sentences = sentence_splitter.split(text)
            tmp_entities = [[] for _ in tmp_sentences]
            tree = etree.parse(f_ann)
            for annotation in tree.xpath(".//Annotation"):
                if not annotation.get('type') in types:
                    continue
                o_start, o_end = [int(x) for x in annotation.get('span').split('..')]
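
The loop is truncated here. The span attribute packs the start and end character offsets into a single 'start..end' string, which the line above unpacks; for example:

# Illustrative only: a span attribute of the form "12..19".
o_start, o_end = (int(x) for x in '12..19'.split('..'))
assert (o_start, o_end) == (12, 19)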
Example #4
import argparse
import utils
import os
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("input_dir")
parser.add_argument("type")
parser.add_argument("output")
args = parser.parse_args()

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

sentences = []
entities = []
document_ids = []

ann_files = [file for file in os.listdir(args.input_dir) if file.endswith('.ann')]

for ann_file in ann_files:
    txt_file = ann_file[:-4] + '.txt'
    with open(os.path.join(args.input_dir, ann_file)) as f_ann:
        with open(os.path.join(args.input_dir, txt_file)) as f_txt:
            document_id = os.path.basename(txt_file)[:-4]  # drop the '.txt' extension
            tmp_sentences = sentence_splitter.split(f_txt.read())
            tmp_entities = [[] for _ in tmp_sentences]
            for line in f_ann:
                # File iteration yields '\n' for blank lines, so strip first.
                if not line.strip():
                    continue
                line = line.strip().split('\t')
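
The loop body is cut off after the tab split. In brat-style .ann files the second column holds the entity type and character offsets; a hypothetical continuation (not the original code) might unpack it like this:

# Hypothetical: parse a brat line such as "T1\tDisease 10 24\tcystic fibrosis".
tag_id, type_and_span, mention = line
ann_type, start, end = type_and_span.split()
start, end = int(start), int(end)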
Example #5
import argparse
import os
from opennlp_wrapper import OpenNLP, SentenceSplitter

opennlp_path = os.environ['OPENNLP']

parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("output")
args = parser.parse_args()

pos_tagger = OpenNLP(opennlp_path, 'POSTagger', 'en-pos-maxent.bin')
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

with open(args.input, 'r', encoding='iso-8859-1') as f_in:
    with open(args.output, 'w') as f_out:
        tokens = []
        entities = []
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            if line.startswith('###'):
                i = 0
                for sentence in sentence_splitter.split(' '.join(tokens)):
                    sentence_start = i
                    # Advance the token index until the accumulated token
                    # lengths (plus separating spaces) cover this sentence.
                    length = 0
                    while length < len(sentence):
                        length += len(tokens[i]) + 1
                        i += 1
                    pos_tags = [
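
The list comprehension is truncated. Judging from the tokenizer call in Example #2 (tokenizer.parse(sentence).decode().split()), the POS tagger is presumably driven the same way; a hypothetical completion, assuming the wrapper returns token_TAG pairs:

# Hypothetical: OpenNLP's POSTagger emits pairs like "The_DT dog_NN".
pos_tags = [pair.rsplit('_', 1)[-1]
            for pair in pos_tagger.parse(sentence).decode().split()]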
Example #6
import argparse

from tqdm import tqdm

import utils
import os
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("text")
parser.add_argument("annotations")
parser.add_argument("output")
args = parser.parse_args()

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

sentences = []
entities = []
document_ids = []

with open(args.text, 'r') as f_text:
    with open(args.annotations, 'r') as f_annotations:
        texts = {}
        for line in f_text:
            # File iteration yields '\n' for blank lines, so strip first.
            if not line.strip():
                continue
            parts = line.rstrip('\n').split('\t')
            texts[parts[0]] = (parts[1], parts[2])
        last_doc = ''
        tmp_sentences = []
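
The fragment ends while the per-document state is being initialized. Each line of the text file carries one document: a document id followed by two tab-separated text fields (their exact semantics, e.g. title and abstract, are an assumption here):

# Illustrative only: the mapping built by the loop above.
line = '12345\tSome article title\tAbstract text goes here.'
parts = line.rstrip('\n').split('\t')
texts = {parts[0]: (parts[1], parts[2])}
assert texts['12345'] == ('Some article title', 'Abstract text goes here.')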