import os

from lxml import etree
from tqdm import tqdm

from opennlp_wrapper import SentenceSplitter

# Path to the OpenNLP installation; taken from the OPENNLP environment
# variable, as in the other conversion scripts.
opennlp_path = os.environ['OPENNLP']


def extract_from_biocreative(f, annotation_type, split_sentences=False):
    sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')
    tree = etree.parse(f)
    sentences = []
    entities = []
    document_ids = []
    documents = tree.xpath('.//document')
    for document in tqdm(documents):
        document_id = document.xpath('./id')[0].text
        passages = document.xpath('passage')
        # Documents with more than one passage store annotation offsets
        # relative to the passage rather than to the whole document.
        uses_passage_offset = len(passages) > 1
        for passage in passages:
            text = passage.xpath('text/text()')[0]
            if not split_sentences:
                tmp_sentences = text.split('\n')
            else:
                tmp_sentences = sentence_splitter.split(text)
            tmp_entities = [[] for _ in tmp_sentences]
            for annotation in passage.xpath('.//annotation'):
                # Skip non-contiguous entities
                if len(annotation.xpath('.//location')) > 1:
                    continue
                is_entity = False
                for infon in annotation.xpath('.//infon'):
                    # annotation_type may be a single type or a list of types
                    if isinstance(annotation_type, str):
                        is_entity |= (infon.get('key') == 'type'
                                      and infon.text == annotation_type)
                    else:
                        is_entity |= (infon.get('key') == 'type'
                                      and infon.text in annotation_type)
                if not is_entity:
                    continue
                offset = int(annotation.xpath('.//location')[0].get('offset'))
                if uses_passage_offset:
                    offset -= int(passage.xpath('./offset/text()')[0])
                length = int(annotation.xpath('.//location')[0].get('length'))
                if not split_sentences:
                    # Walk through the newline-split sentences until the
                    # annotation offset falls into the current one.
                    sentence_idx = 0
                    while offset > len(tmp_sentences[sentence_idx]):
                        offset -= len(tmp_sentences[sentence_idx]) + 1
                        sentence_idx += 1
                    end = offset + length
                else:
                    o_end = offset + length
                    o_offset = offset
                    sentence_idx, offset, end = sentence_splitter.map_offsets(
                        o_offset, o_end)
                    # Merge sentences as long as the entity crosses a sentence
                    # boundary produced by the splitter.
                    while len(tmp_sentences[sentence_idx]) < end:
                        tmp_sentences = sentence_splitter.merge_sentences(
                            sentence_idx)
                        tmp_entities[sentence_idx] += tmp_entities[sentence_idx + 1]
                        del tmp_entities[sentence_idx + 1]
                        sentence_idx, offset, end = sentence_splitter.map_offsets(
                            o_offset, o_end)
                annotated_entity = tmp_sentences[sentence_idx][offset:end]
                true_entity = annotation.xpath('.//text')[0].text
                assert annotated_entity.lower() == true_entity.lower()
                tmp_entities[sentence_idx] += [(offset, end)]
            document_ids += [document_id] * len(tmp_sentences)
            sentences += tmp_sentences
            entities += tmp_entities
    return sentences, entities, document_ids
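

if __name__ == '__main__':
    # Hedged usage sketch, not part of the original file: the BioC XML path
    # and the entity types below are placeholder assumptions chosen only to
    # illustrate the call signature of extract_from_biocreative.
    sents, ents, doc_ids = extract_from_biocreative(
        'corpus.bioc.xml', ['Chemical', 'Disease'], split_sentences=True)
    print('%d sentences, %d entity spans'
          % (len(sents), sum(len(e) for e in ents)))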
import os
import sys
import time
import codecs
import optparse
import logging

import numpy as np
from flask import Flask, request, jsonify, session

from loader import prepare_sentence
from utils import create_input, iobes_iob, zero_digits
from model import Model
from opennlp_wrapper import SentenceSplitter, OpenNLP

app = Flask(__name__)

# The model to serve is selected by its directory name on the command line.
model = Model(model_path="/usr/huner/models/" + sys.argv[1])
sentence_splitter = SentenceSplitter(os.getenv('OPENNLP'), 'en-sent.bin')
tokenizer = OpenNLP(os.getenv('OPENNLP'), 'TokenizerME', 'en-token.bin')
parameters = model.parameters


def split_sentences(text):
    text = text.strip()
    return sentence_splitter.split(text)


def tokenize(sentence):
    sentence = sentence.strip()
    return tokenizer.parse(sentence).decode().split()


# Load reverse mappings
import argparse
import os

import tqdm
from lxml import etree

import utils
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("input_dir")
parser.add_argument("types")
parser.add_argument("output")
args = parser.parse_args()

# Comma-separated list of entity types to extract
types = args.types.split(',')

sentences = []
entities = []
document_ids = []

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

files = [file for file in os.listdir(args.input_dir) if file[-4:] == '.ann']
for file in tqdm.tqdm(files):
    with open(os.path.join(args.input_dir, file), 'r') as f_ann:
        # The corresponding text file has the same name without the .ann suffix
        with open(os.path.join(args.input_dir, file[:-4]), 'r') as f_txt:
            text = f_txt.read()
            document_id = text.split('\n\n')[0]
            tmp_sentences = sentence_splitter.split(text)
            tmp_entities = [[] for _ in tmp_sentences]
            tree = etree.parse(f_ann)
            for annotation in tree.xpath(".//Annotation"):
                if annotation.get('type') not in types:
                    continue
                # Annotation spans are given as character offsets 'start..end'
                o_start, o_end = [int(x) for x in annotation.get('span').split('..')]
import argparse
import os

import utils
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("input_dir")
parser.add_argument("type")
parser.add_argument("output")
args = parser.parse_args()

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

sentences = []
entities = []
document_ids = []

ann_files = [file for file in os.listdir(args.input_dir) if file[-4:] == '.ann']
for ann_file in ann_files:
    txt_file = ann_file[:-4] + '.txt'
    with open(os.path.join(args.input_dir, ann_file)) as f_ann:
        with open(os.path.join(args.input_dir, txt_file)) as f_txt:
            # Note: str.strip('.txt') would remove any leading/trailing '.',
            # 't' or 'x' characters, so drop the extension explicitly instead.
            document_id = os.path.splitext(os.path.basename(txt_file))[0]
            tmp_sentences = sentence_splitter.split(f_txt.read())
            tmp_entities = [[] for _ in tmp_sentences]
            # Annotations are stored one per line, tab-separated
            for line in f_ann:
                if not line:
                    continue
                line = line.split('\t')
import argparse
import os

from opennlp_wrapper import OpenNLP, SentenceSplitter

opennlp_path = os.environ['OPENNLP']

parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("output")
args = parser.parse_args()

pos_tagger = OpenNLP(opennlp_path, 'POSTagger', 'en-pos-maxent.bin')
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

with open(args.input, 'r', encoding='iso-8859-1') as f_in:
    with open(args.output, 'w') as f_out:
        tokens = []
        entities = []
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            # Lines starting with '###' separate documents: re-split the
            # tokens collected so far into sentences and map token indices
            # onto those sentences.
            if line[:3] == '###':
                i = 0
                for sentence in sentence_splitter.split(' '.join(tokens)):
                    sentence_start = i
                    length = 0
                    while length < len(sentence):
                        length += len(tokens[i]) + 1
                        i += 1
                    pos_tags = [
import argparse
import os

from tqdm import tqdm

import utils
from opennlp_wrapper import SentenceSplitter

parser = argparse.ArgumentParser()
parser.add_argument("text")
parser.add_argument("annotations")
parser.add_argument("output")
args = parser.parse_args()

opennlp_path = os.environ['OPENNLP']
sentence_splitter = SentenceSplitter(opennlp_path, 'en-sent.bin')

sentences = []
entities = []
document_ids = []

with open(args.text, 'r') as f_text:
    with open(args.annotations, 'r') as f_annotations:
        # Index the documents by id: each line holds a document id followed
        # by two tab-separated text fields.
        texts = {}
        for line in f_text:
            if not line:
                continue
            parts = line.split('\t')
            texts[parts[0]] = (parts[1], parts[2])
        last_doc = ''
        tmp_sentences = []