import argparse

from lxml import etree

import utils

parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("types")
parser.add_argument("output")
args = parser.parse_args()

sentences = []
entities = []
document_ids = []
types = args.types.split(',')

with open(args.input) as f_in:
    tree = etree.parse(f_in)
    for document in tree.xpath('.//document'):
        sentence_elems = document.xpath('.//sentence')
        for sentence in sentence_elems:
            sentences += [sentence.get('text')]
            document_ids += [document.get('id')]
            entities += [[]]
            for entity in sentence.xpath(".//entity"):
                # Keep only entities of the requested types.
                if entity.get('type') not in types:
                    continue
                char_offset = entity.get('charOffset').split('-')
                entities[-1] += [(int(char_offset[0]), int(char_offset[1]))]

with open(args.output, 'w') as f_out:
    utils.write_to_conll(sentences, entities, document_ids, f_out)
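# The conversion scripts in this section all delegate the actual output to
# utils.write_to_conll, whose implementation is not shown here. The function
# below is only a sketch of what such a writer might look like, inferred from
# the call sites (sentences as strings, per-sentence character-offset spans,
# per-sentence document ids, an open file handle, optional tokenizer); the
# name, tag scheme, and doc-id comment line are assumptions, not the real API.
def write_to_conll_sketch(sentences, entities, document_ids, f_out,
                          tokenizer=None):
    """Hypothetical CoNLL writer: one token per line with a B/I/O tag."""
    if tokenizer is None:
        tokenizer = lambda text: text.split()
    for sentence, spans, doc_id in zip(sentences, entities, document_ids):
        f_out.write('# doc_id = {}\n'.format(doc_id))
        offset = 0
        for token in tokenizer(sentence):
            start = sentence.index(token, offset)
            end = start + len(token)
            offset = end
            tag = 'O'
            for span_start, span_end in spans:
                if start >= span_start and end <= span_end:
                    tag = 'B' if start == span_start else 'I'
            f_out.write('{}\t{}\n'.format(token, tag))
        f_out.write('\n')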
# Excerpt from the per-document conversion loop: `j` and `sentence` are bound
# by an enclosing loop over the document's sentence standoffs, `words` holds
# the document's tokens, `word_to_id` maps word ids to token indices, and
# `word_pos` records each token's (sentence index, character offset).
tmp_sentence = []
akt_pos = 0
start = word_to_id[sentence[1].split('..')[0]]
end = word_to_id[sentence[1].split('..')[1]]
for i in range(start, end + 1):
    tmp_sentence += [words[i]]
    word_pos[i] = (j, akt_pos)
    akt_pos += len(words[i]) + 1
tmp_sentences += [tmp_sentence]

tmp_entities = [[] for _ in tmp_sentences]
for protein in protein_tree:
    try:
        start = word_to_id[protein.get('span').split('..')[0]]
        end = word_to_id[protein.get('span').split('..')[-1]]
        tmp_entities[word_pos[start - 1][0]] += [
            (word_pos[start - 1][1],
             word_pos[end - 1][1] + len(words[end - 1]))
        ]
    except KeyError:
        # Discontinuous ("multipart") spans cannot be resolved via word_to_id.
        print('Skipped multipart entity')

sentences += tmp_sentences
entities += tmp_entities
document_ids += [document_id] * len(tmp_sentences)

with open(args.output, 'w') as f_out:
    utils.write_to_conll([' '.join(x) for x in sentences], entities,
                         document_ids, f_out,
                         tokenizer=lambda x: x.split(' '))
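# Why the explicit space tokenizer: the entity offsets above are computed
# against a single-space join of the tokens (`akt_pos += len(words[i]) + 1`),
# so the same join/split must be used when writing. A tiny check with
# hypothetical values (not taken from the corpus):
_toks = ['IL-2', 'binds', 'IL-2R']
_joined = ' '.join(_toks)           # "IL-2 binds IL-2R"
assert _joined.split(' ') == _toks  # round-trips exactly
# offset of the third token is len('IL-2') + 1 + len('binds') + 1 == 11
assert _joined.index('IL-2R') == 11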
sentences[sentence_id] = sentence_text

# Convert token-id based entity annotations into character spans: an entity
# starts at the offset of its first token and ends just before the offset of
# the first following token that is not part of it.
for entity_token_ids in all_entity_token_ids:
    entity_start = None
    for token_idx, (token_id, token_offset) in enumerate(
            zip(token_ids, token_offsets)):
        if token_id in entity_token_ids:
            if entity_start is None:
                entity_start = token_offset
        else:
            if entity_start is not None:
                entities_per_sentence[sentence_id].append(
                    (entity_start, token_offset - 1))
                entity_start = None

with open(args.output, 'w') as f_out:
    sentences = [sentences[s_id] for s_id in sentence_ids_in_order]
    entities = [
        utils.merge_overlapping_entities(entities_per_sentence[s_id])
        for s_id in sentence_ids_in_order
    ]

    def tokenizer(sentence):
        return sentence.split()

    utils.write_to_conll(
        sentences, entities,
        [str(x) for x in range(0, len(sentence_ids_in_order))],
        f_out, tokenizer=tokenizer)
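# utils.merge_overlapping_entities is used above but not defined in this
# excerpt; a minimal sketch of what such a helper could do, assuming
# (start, end) character spans, is:
def merge_overlapping_entities_sketch(spans):
    """Merge overlapping spans into maximal spans."""
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            # Overlaps the previous span: extend it instead of appending.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# e.g. merge_overlapping_entities_sketch([(0, 4), (2, 7), (10, 12)])
# -> [(0, 7), (10, 12)]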
sentences = []
entities = []
with open(args.input, 'r') as f_in:
    tree = etree.parse(f_in)
    for document in tree.xpath('.//document'):
        assert len(document.xpath('passage/text')) == 1
        text = document.xpath('passage/text')[0].text
        tmp_sentences = text.split('\n')
        tmp_entities = [[] for x in tmp_sentences]
        for annotation in document.xpath('.//annotation'):
            # Keep only protein mentions, i.e. annotations whose `type` infon
            # starts with "pm".
            prot = False
            for infon in annotation.xpath('.//infon'):
                prot = prot or (infon.get('key') == 'type'
                                and infon.text[:2] == 'pm')
            if not prot:
                continue
            offset = int(annotation.xpath('.//location')[0].get('offset'))
            length = int(annotation.xpath('.//location')[0].get('length'))
            # The annotation offset is passage-wide; walk over the
            # newline-separated sentences to find the one it falls into.
            i = 0
            while offset > len(tmp_sentences[i]):
                offset -= len(tmp_sentences[i]) + 1
                i += 1
            tmp_entities[i] += [(offset, offset + length)]
        sentences += tmp_sentences
        entities += tmp_entities

with open(args.output, 'w') as f_out:
    utils.write_to_conll(sentences, entities, f_out)
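# Tiny worked example of the offset localization above (values are
# hypothetical, not taken from the corpus): with the passage split into
# ["First sentence.", "Second one."] and a document-wide offset of 18, the
# loop subtracts len("First sentence.") + 1 == 16 and lands at local
# offset 2 inside the second sentence.
_lines = ["First sentence.", "Second one."]
_offset, _i = 18, 0
while _offset > len(_lines[_i]):
    _offset -= len(_lines[_i]) + 1
    _i += 1
assert (_i, _offset) == (1, 2)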
soup = BeautifulSoup(f_in, 'lxml')
for paragraph in soup.find_all('snippet'):
    if len(paragraph.text.strip()) == 0:
        continue
    # Each (newline-separated) sentence seems to be valid HTML on its own,
    # so we extract the source, split at newlines and parse each part again.
    source = "".join(str(elem) for elem in paragraph.children)
    sentences = source.split('\n')
    for sentence in sentences:
        document_id = os.path.basename(os.path.dirname(document))
        sentence_bs = BeautifulSoup(sentence, 'lxml')
        sentence_text, sentence_entities = extract_sentence_info(sentence_bs)
        tmp_sentences = sentence_splitter.split(sentence_text)
        tmp_entities = [[] for _ in tmp_sentences]
        for entity in sentence_entities:
            org_start = entity[0]
            org_end = entity[1]
            idx, start, end = sentence_splitter.map_offsets(org_start, org_end)
            # If the entity crosses a (wrongly split) sentence boundary,
            # merge sentences until it fits into a single one.
            while len(tmp_sentences[idx]) < end:
                tmp_sentences = sentence_splitter.merge_sentences(idx)
                tmp_entities[idx] += tmp_entities[idx + 1]
                del tmp_entities[idx + 1]
                idx, start, end = sentence_splitter.map_offsets(org_start,
                                                                org_end)
            tmp_entities[idx] += [(start, end)]
        all_sentences += tmp_sentences
        all_entities += tmp_entities
        document_ids += [document_id] * len(tmp_sentences)

with open(args.output, 'w') as f_out:
    utils.write_to_conll(all_sentences, all_entities, document_ids, f_out)
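# The sentence_splitter object used above is not defined in this excerpt.
# Below is only a sketch of the interface it appears to expose (split,
# map_offsets, merge_sentences), assuming naive splitting after sentence-final
# punctuation and single-space separation between sentences; the class name
# and splitting rule are assumptions.
import re


class SentenceSplitterSketch(object):
    def __init__(self):
        self._sentences = []

    def split(self, text):
        # Naive splitter: break after ".", "!" or "?" followed by a space.
        self._sentences = re.split(r'(?<=[.!?]) ', text)
        return self._sentences

    def _starts(self):
        # Start offset of each sentence in the original text
        # (sentences are assumed to be separated by a single space).
        starts, pos = [], 0
        for sent in self._sentences:
            starts.append(pos)
            pos += len(sent) + 1
        return starts

    def map_offsets(self, start, end):
        # Map text-level (start, end) to (sentence index, local start, local end).
        starts = self._starts()
        idx = max(i for i, s in enumerate(starts) if s <= start)
        return idx, start - starts[idx], end - starts[idx]

    def merge_sentences(self, idx):
        # Re-join sentence idx with the one following it and return the list.
        self._sentences[idx:idx + 2] = [' '.join(self._sentences[idx:idx + 2])]
        return self._sentences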