def __parse_document(self, tree):
    """
    Build a BioCDocument from a document XML element.

    The id is taken from the text of the ``id`` child element and the
    infons from ``__parse_infons``. Every ``passage``, ``annotation`` and
    ``relation`` child is then parsed and attached, in that order.

    Args:
        tree: the XML element describing one document

    Returns:
        the populated BioCDocument
    """
    doc = BioCDocument()
    doc.id = tree.findtext('id')
    doc.infons = self.__parse_infons(tree)

    # (child tag, parser for that child, method attaching the parsed result)
    handlers = (
        ('passage', self.__parse_passage, doc.add_passage),
        ('annotation', self.__parse_annotation, doc.add_annotation),
        ('relation', self.__parse_relation, doc.add_relation),
    )
    for tag, parse, attach in handlers:
        for node in tree.findall(tag):
            attach(parse(node))
    return doc
def parse_doc(obj: Dict) -> BioCDocument:
    """
    Deserialize a dict obj to a BioCDocument object.

    Args:
        obj: dict with the keys ``id``, ``infons`` and ``passages``; the keys
            ``annotations`` and ``relations`` are optional

    Returns:
        the populated BioCDocument

    Raises:
        KeyError: if ``id``, ``infons`` or ``passages`` is missing
    """
    doc = BioCDocument()
    doc.id = obj['id']
    doc.infons = obj['infons']
    for passage in obj['passages']:
        doc.add_passage(parse_passage(passage))
    # Document-level annotations and relations are each optional on their
    # own: a missing (or null) key is treated as an empty list, so one being
    # absent neither raises nor suppresses the other.
    for annotation in obj.get('annotations') or ():
        doc.add_annotation(parse_annotation(annotation))
    for relation in obj.get('relations') or ():
        doc.add_relation(parse_relation(relation))
    return doc
def __parse_document(self, tree):
    """
    Build a BioCDocument from a document XML element.

    The id is read from the element's ``id`` attribute and the infons from
    ``__parse_infons``. The document text is set only when a ``text`` child
    exists. Every ``passage``, ``sentence``, ``annotation`` and ``relation``
    child is then parsed and attached, in that order.

    Args:
        tree: the XML element describing one document

    Returns:
        the populated BioCDocument

    Raises:
        KeyError: if the element has no ``id`` attribute
    """
    doc = BioCDocument()
    doc.id = tree.attrib['id']
    doc.infons = self.__parse_infons(tree)

    # Leave doc.text untouched when the element carries no text child.
    has_text = tree.find('text') is not None
    if has_text:
        doc.text = tree.findtext('text')

    # (child tag, parser for that child, method attaching the parsed result)
    handlers = (
        ('passage', self.__parse_passage, doc.add_passage),
        ('sentence', self.__parse_sentence, doc.add_sentence),
        ('annotation', self.__parse_annotation, doc.add_annotation),
        ('relation', self.__parse_relation, doc.add_relation),
    )
    for tag, parse, attach in handlers:
        for node in tree.findall(tag):
            attach(parse(node))
    return doc
def __read(self):
    """
    Advance the XML event stream until one complete document has been read.

    A small state machine tracks the nesting level:
    0 = outside the collection, 1 = inside collection, 2 = inside document,
    3 = inside passage, 4 = inside sentence.

    The method returns as soon as a document end tag is seen, leaving the
    finished document in ``self.__document``. When the collection end tag is
    reached, ``self.__document`` is reset to None and the remaining events
    are drained.

    Raises:
        TypeError: if a passage or sentence has no ``offset`` attribute
        ValueError: if an ``offset`` attribute is not an integer
    """
    while self.__has_next():
        event, elem = self.__next_event()
        # state 0: waiting for the collection root
        if self.__state == 0:
            if event == 'start':
                if elem.tag == 'collection':
                    self.__state = 1
                    self.__collection = BioCCollection()
        # state 1: collection
        elif self.__state == 1:
            if event == 'start':
                if elem.tag == 'document':
                    self.__document = BioCDocument()
                    # Attribute form kept for backward compatibility; the
                    # id child element, when present, overrides it below.
                    self.__document.id = elem.get('id')
                    self.__state = 2
            elif event == 'end':
                if elem.tag == 'source':
                    self.__collection.source = elem.text
                elif elem.tag == 'date':
                    self.__collection.date = elem.text
                elif elem.tag == 'key':
                    self.__collection.key = elem.text
                elif elem.tag == 'version':
                    self.__collection.version = elem.text
                elif elem.tag == 'infon':
                    self.__collection.infons[elem.get('key')] = elem.text
                elif elem.tag == 'collection':
                    self.__state = 0
                    self.__document = None
                    self.__passage = None
                    self.__sentence = None
        # state 2: document
        elif self.__state == 2:
            if event == 'start':
                if elem.tag == 'passage':
                    self.__passage = BioCPassage()
                    self.__passage.offset = int(elem.get('offset'))
                    self.__state = 3
                elif elem.tag == 'annotation':
                    self.__document.add_annotation(
                        self.__read_annotation(elem))
                elif elem.tag == 'relation':
                    self.__document.add_relation(
                        self.__read_relation(elem))
            elif event == 'end':
                if elem.tag == 'id':
                    # BioC stores the document id as an id child element.
                    self.__document.id = elem.text
                elif elem.tag == 'infon':
                    self.__document.infons[elem.get('key')] = elem.text
                elif elem.tag == 'document':
                    # Everything below this element has been copied into
                    # the document object; drop the subtree so streaming
                    # does not retain the whole file in memory.
                    elem.clear()
                    self.__state = 1
                    return
        # state 3: passage
        elif self.__state == 3:
            if event == 'start':
                if elem.tag == 'sentence':
                    self.__sentence = BioCSentence()
                    self.__sentence.offset = int(elem.get('offset'))
                    self.__state = 4
                elif elem.tag == 'annotation':
                    self.__passage.add_annotation(
                        self.__read_annotation(elem))
                elif elem.tag == 'relation':
                    self.__passage.add_relation(self.__read_relation(elem))
            elif event == 'end':
                if elem.tag == 'text':
                    self.__passage.text = elem.text
                elif elem.tag == 'infon':
                    self.__passage.infons[elem.get('key')] = elem.text
                elif elem.tag == 'passage':
                    self.__state = 2
                    if self.__passage is not None:
                        self.__document.add_passage(self.__passage)
        # state 4: sentence
        elif self.__state == 4:
            if event == 'start':
                if elem.tag == 'annotation':
                    self.__sentence.add_annotation(
                        self.__read_annotation(elem))
                elif elem.tag == 'relation':
                    self.__sentence.add_relation(
                        self.__read_relation(elem))
            elif event == 'end':
                if elem.tag == 'text':
                    self.__sentence.text = elem.text
                elif elem.tag == 'infon':
                    self.__sentence.infons[elem.get('key')] = elem.text
                elif elem.tag == 'sentence':
                    self.__state = 3
                    if self.__sentence is not None:
                        self.__passage.add_sentence(self.__sentence)
class BioCXMLDocumentReader: """ Reader for the BioC XML format, one document per iteration. """ def __init__(self, source: Union[str, BinaryIO]): # if not isinstance(file, str): # file = str(file) self.file = source self.__context = iter( etree.iterparse(self.file, events=('start', 'end'))) self.__state = 0 self.__event = None self.__elem = None self.__read() def __iter__(self): return self def __next__(self): """ Reads one BioC document from the XML file. Returns: BioCDocument: the BioC document """ if self.__document is None: raise StopIteration else: document = self.__document self.__read() return document def __read(self): while self.__has_next(): event, elem = self.__next_event() if self.__state == 0: if event == 'start': if elem.tag == 'collection': self.__state = 1 self.__collection = BioCCollection() # collection elif self.__state == 1: if event == 'start': if elem.tag == 'document': self.__document = BioCDocument() self.__document.id = elem.get('id') self.__state = 2 elif event == 'end': if elem.tag == 'source': self.__collection.source = elem.text elif elem.tag == 'date': self.__collection.date = elem.text elif elem.tag == 'key': self.__collection.key = elem.text elif elem.tag == 'version': self.__collection.version = elem.text elif elem.tag == 'infon': self.__collection.infons[elem.get('key')] = elem.text elif elem.tag == 'collection': self.__state = 0 self.__document = None self.__passage = None self.__sentence = None # document elif self.__state == 2: if event == 'start': if elem.tag == 'passage': self.__passage = BioCPassage() self.__passage.offset = int(elem.get('offset')) self.__state = 3 elif elem.tag == 'annotation': self.__document.add_annotation( self.__read_annotation(elem)) elif elem.tag == 'relation': self.__document.add_relation( self.__read_relation(elem)) elif event == 'end': if elem.tag == 'infon': self.__document.infons[elem.get('key')] = elem.text elif elem.tag == 'document': self.__state = 1 return # passage elif self.__state == 3: if 
event == 'start': if elem.tag == 'sentence': self.__sentence = BioCSentence() self.__sentence.offset = int(elem.get('offset')) self.__state = 4 elif elem.tag == 'annotation': self.__passage.add_annotation( self.__read_annotation(elem)) elif elem.tag == 'relation': self.__passage.add_relation(self.__read_relation(elem)) elif event == 'end': if elem.tag == 'text': self.__passage.text = elem.text elif elem.tag == 'infon': self.__passage.infons[elem.get('key')] = elem.text elif elem.tag == 'passage': self.__state = 2 if self.__passage is not None: self.__document.add_passage(self.__passage) # sentence elif self.__state == 4: if event == 'start': if elem.tag == 'annotation': self.__sentence.add_annotation( self.__read_annotation(elem)) elif elem.tag == 'relation': self.__sentence.add_relation( self.__read_relation(elem)) elif event == 'end': if elem.tag == 'text': self.__sentence.text = elem.text elif elem.tag == 'infon': self.__sentence.infons[elem.get('key')] = elem.text elif elem.tag == 'sentence': self.__state = 3 if self.__sentence is not None: self.__passage.add_sentence(self.__sentence) def __read_annotation(self, start_elem): ann = BioCAnnotation() ann.id = start_elem.get('id') while self.__has_next(): event, elem = self.__next_event() if event == 'start': pass elif event == 'end': if elem.tag == 'text': ann.text = elem.text elif elem.tag == 'infon': ann.infons[elem.get('key')] = elem.text elif elem.tag == 'location': ann.add_location( BioCLocation(int(elem.get('offset')), int(elem.get('length')))) elif elem.tag == 'annotation': return ann raise RuntimeError("should not reach here") # pragma: no cover def __read_relation(self, start_elem): rel = BioCRelation() rel.id = start_elem.get('id') while self.__has_next(): event, elem = self.__next_event() if event == 'start': pass elif event == 'end': if elem.tag == 'infon': rel.infons[elem.get('key')] = elem.text elif elem.tag == 'node': rel.add_node(BioCNode(elem.get('refid'), elem.get('role'))) if elem.tag == 
'relation': return rel raise RuntimeError("should not reach here") # pragma: no cover def __has_next(self): try: self.__event, self.__elem = next(self.__context) return True except StopIteration: self.__event = None self.__elem = None return False def __next_event(self): return self.__event, self.__elem def get_collection_info(self) -> BioCCollection: """ Reads the collection information: encoding, version, DTD, source, date, key, infons, etc. Returns: the BioC collection that contains only information """ return self.__collection