示例#1
0
文件: decoder.py 项目: bionlplab/bioc
 def __parse_document(self, tree):
     """
     Build a BioCDocument from an XML element.

     The document id is read from the text of the ``id`` child, the infons
     come from ``__parse_infons``, and every ``passage``, ``annotation`` and
     ``relation`` element matched by ``tree.findall`` is parsed and attached,
     in that order.

     Args:
         tree: the XML element describing one document

     Returns:
         the populated BioCDocument
     """
     doc = BioCDocument()
     doc.id = tree.findtext('id')
     doc.infons = self.__parse_infons(tree)

     for node in tree.findall('passage'):
         passage = self.__parse_passage(node)
         doc.add_passage(passage)

     for node in tree.findall('annotation'):
         annotation = self.__parse_annotation(node)
         doc.add_annotation(annotation)

     for node in tree.findall('relation'):
         relation = self.__parse_relation(node)
         doc.add_relation(relation)

     return doc
示例#2
0
文件: decoder.py 项目: bionlplab/bioc
def parse_doc(obj: Dict) -> BioCDocument:
    """
    Deserialize a dict obj to a BioCDocument object.

    Args:
        obj: mapping that must contain the keys ``id``, ``infons`` and
            ``passages``. The keys ``annotations`` and ``relations`` are
            both optional; a missing key is treated as an empty list.

    Returns:
        the populated BioCDocument

    Raises:
        KeyError: if ``id``, ``infons`` or ``passages`` is missing
    """
    doc = BioCDocument()
    doc.id = obj['id']
    doc.infons = obj['infons']
    for passage in obj['passages']:
        doc.add_passage(parse_passage(passage))
    # Annotations and relations are handled the same way: a document that
    # simply has none of them may omit the key altogether.
    for annotation in obj.get('annotations', ()):
        doc.add_annotation(parse_annotation(annotation))
    for relation in obj.get('relations', ()):
        doc.add_relation(parse_relation(relation))
    return doc
示例#3
0
 def __parse_document(self, tree):
     """
     Build a BioCDocument from an XML element whose id is an attribute.

     The id is taken from the element's ``id`` attribute and the infons from
     ``__parse_infons``. If a ``text`` element is found, its text is stored
     on the document. ``passage``, ``sentence``, ``annotation`` and
     ``relation`` elements matched by ``tree.findall`` are then parsed and
     attached, in that order.

     Args:
         tree: the XML element describing one document

     Returns:
         the populated BioCDocument
     """
     doc = BioCDocument()
     doc.id = tree.attrib['id']
     doc.infons = self.__parse_infons(tree)

     # Only assign the text when the element is actually present.
     has_text = tree.find('text') is not None
     if has_text:
         doc.text = tree.findtext('text')

     for node in tree.findall('passage'):
         passage = self.__parse_passage(node)
         doc.add_passage(passage)

     for node in tree.findall('sentence'):
         sentence = self.__parse_sentence(node)
         doc.add_sentence(sentence)

     for node in tree.findall('annotation'):
         annotation = self.__parse_annotation(node)
         doc.add_annotation(annotation)

     for node in tree.findall('relation'):
         relation = self.__parse_relation(node)
         doc.add_relation(relation)

     return doc
示例#4
0
 def __read(self):
     """
     Consume parse events until the current document is complete.

     The method is a state machine over ``self.__state``:

     * 0 - waiting for the opening ``collection`` tag
     * 1 - inside the collection, between documents
     * 2 - inside a document
     * 3 - inside a passage
     * 4 - inside a sentence

     It returns as soon as the end of a ``document`` element is seen, or
     when no more events are available.
     """
     while self.__has_next():
         kind, node = self.__next_event()
         tag = node.tag
         opening = kind == 'start'
         closing = kind == 'end'

         if self.__state == 0:
             # Nothing is recorded until the collection opens.
             if opening and tag == 'collection':
                 self.__state = 1
                 self.__collection = BioCCollection()

         elif self.__state == 1:
             # Collection level: metadata, or the start of a document.
             if opening:
                 if tag == 'document':
                     self.__document = BioCDocument()
                     self.__document.id = node.get('id')
                     self.__state = 2
             elif closing:
                 if tag in ('source', 'date', 'key', 'version'):
                     # The tag name doubles as the attribute name.
                     setattr(self.__collection, tag, node.text)
                 elif tag == 'infon':
                     self.__collection.infons[node.get('key')] = node.text
                 elif tag == 'collection':
                     # End of input: drop all per-document state.
                     self.__state = 0
                     self.__document = None
                     self.__passage = None
                     self.__sentence = None

         elif self.__state == 2:
             # Document level.
             if opening:
                 if tag == 'passage':
                     self.__passage = BioCPassage()
                     self.__passage.offset = int(node.get('offset'))
                     self.__state = 3
                 elif tag == 'annotation':
                     annotation = self.__read_annotation(node)
                     self.__document.add_annotation(annotation)
                 elif tag == 'relation':
                     relation = self.__read_relation(node)
                     self.__document.add_relation(relation)
             elif closing:
                 if tag == 'infon':
                     self.__document.infons[node.get('key')] = node.text
                 elif tag == 'document':
                     # The document is finished; hand control back.
                     self.__state = 1
                     return

         elif self.__state == 3:
             # Passage level.
             if opening:
                 if tag == 'sentence':
                     self.__sentence = BioCSentence()
                     self.__sentence.offset = int(node.get('offset'))
                     self.__state = 4
                 elif tag == 'annotation':
                     annotation = self.__read_annotation(node)
                     self.__passage.add_annotation(annotation)
                 elif tag == 'relation':
                     relation = self.__read_relation(node)
                     self.__passage.add_relation(relation)
             elif closing:
                 if tag == 'text':
                     self.__passage.text = node.text
                 elif tag == 'infon':
                     self.__passage.infons[node.get('key')] = node.text
                 elif tag == 'passage':
                     self.__state = 2
                     if self.__passage is not None:
                         self.__document.add_passage(self.__passage)

         elif self.__state == 4:
             # Sentence level.
             if opening:
                 if tag == 'annotation':
                     annotation = self.__read_annotation(node)
                     self.__sentence.add_annotation(annotation)
                 elif tag == 'relation':
                     relation = self.__read_relation(node)
                     self.__sentence.add_relation(relation)
             elif closing:
                 if tag == 'text':
                     self.__sentence.text = node.text
                 elif tag == 'infon':
                     self.__sentence.infons[node.get('key')] = node.text
                 elif tag == 'sentence':
                     self.__state = 3
                     if self.__sentence is not None:
                         self.__passage.add_sentence(self.__sentence)
示例#5
0
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.

    The reader walks the ``start``/``end`` events produced by
    ``etree.iterparse`` with a small state machine and stops each time a
    ``document`` element closes, so documents are handed out one at a time.
    Document ids and passage/sentence offsets are read from XML attributes.
    """

    # Parser states: the element the event stream is currently inside.
    _OUTSIDE = 0
    _IN_COLLECTION = 1
    _IN_DOCUMENT = 2
    _IN_PASSAGE = 3
    _IN_SENTENCE = 4

    def __init__(self, source: Union[str, BinaryIO]):
        """
        Open ``source`` and read ahead to the first document.

        Args:
            source: a file name or binary file object, passed unchanged to
                ``etree.iterparse``
        """
        self.file = source
        self.__context = iter(
            etree.iterparse(self.file, events=('start', 'end')))
        self.__state = self._OUTSIDE
        self.__event = None
        self.__elem = None
        # Initialise all parse results before the first read. Otherwise input
        # without a <collection> element would leave these attributes unset
        # and __next__ / get_collection_info would fail with AttributeError.
        self.__collection = None
        self.__document = None
        self.__passage = None
        self.__sentence = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document

        Raises:
            StopIteration: when no further document is available
        """
        if self.__document is None:
            raise StopIteration
        document = self.__document
        # Read ahead so the following call knows whether a document exists.
        self.__read()
        return document

    def __read(self):
        """
        Advance the parser until the next document is complete.

        Returns as soon as a ``document`` element closes, leaving it in
        ``self.__document``, or when the event stream is exhausted.
        """
        while self.__has_next():
            event, elem = self.__next_event()
            state = self.__state
            if state == self._OUTSIDE:
                self.__handle_outside(event, elem)
            elif state == self._IN_COLLECTION:
                self.__handle_collection(event, elem)
            elif state == self._IN_DOCUMENT:
                if self.__handle_document(event, elem):
                    return
            elif state == self._IN_PASSAGE:
                self.__handle_passage(event, elem)
            elif state == self._IN_SENTENCE:
                self.__handle_sentence(event, elem)

    def __handle_outside(self, event, elem):
        """Wait for the opening ``collection`` tag and create the collection."""
        if event == 'start' and elem.tag == 'collection':
            self.__state = self._IN_COLLECTION
            self.__collection = BioCCollection()

    def __handle_collection(self, event, elem):
        """Record collection metadata and detect the start of a document."""
        if event == 'start':
            if elem.tag == 'document':
                self.__document = BioCDocument()
                self.__document.id = elem.get('id')
                self.__state = self._IN_DOCUMENT
        elif event == 'end':
            if elem.tag in ('source', 'date', 'key', 'version'):
                # The tag name doubles as the attribute name.
                setattr(self.__collection, elem.tag, elem.text)
            elif elem.tag == 'infon':
                self.__collection.infons[elem.get('key')] = elem.text
            elif elem.tag == 'collection':
                # No more documents: clearing __document ends the iteration.
                self.__state = self._OUTSIDE
                self.__document = None
                self.__passage = None
                self.__sentence = None

    def __handle_document(self, event, elem):
        """Process one event inside a document; return True when it closes."""
        if event == 'start':
            if elem.tag == 'passage':
                self.__passage = BioCPassage()
                self.__passage.offset = int(elem.get('offset'))
                self.__state = self._IN_PASSAGE
            elif elem.tag == 'annotation':
                self.__document.add_annotation(self.__read_annotation(elem))
            elif elem.tag == 'relation':
                self.__document.add_relation(self.__read_relation(elem))
        elif event == 'end':
            if elem.tag == 'infon':
                self.__document.infons[elem.get('key')] = elem.text
            elif elem.tag == 'document':
                self.__state = self._IN_COLLECTION
                return True
        return False

    def __handle_passage(self, event, elem):
        """Process one event inside a passage and attach it when it closes."""
        if event == 'start':
            if elem.tag == 'sentence':
                self.__sentence = BioCSentence()
                self.__sentence.offset = int(elem.get('offset'))
                self.__state = self._IN_SENTENCE
            elif elem.tag == 'annotation':
                self.__passage.add_annotation(self.__read_annotation(elem))
            elif elem.tag == 'relation':
                self.__passage.add_relation(self.__read_relation(elem))
        elif event == 'end':
            if elem.tag == 'text':
                self.__passage.text = elem.text
            elif elem.tag == 'infon':
                self.__passage.infons[elem.get('key')] = elem.text
            elif elem.tag == 'passage':
                self.__state = self._IN_DOCUMENT
                self.__document.add_passage(self.__passage)

    def __handle_sentence(self, event, elem):
        """Process one event inside a sentence and attach it when it closes."""
        if event == 'start':
            if elem.tag == 'annotation':
                self.__sentence.add_annotation(self.__read_annotation(elem))
            elif elem.tag == 'relation':
                self.__sentence.add_relation(self.__read_relation(elem))
        elif event == 'end':
            if elem.tag == 'text':
                self.__sentence.text = elem.text
            elif elem.tag == 'infon':
                self.__sentence.infons[elem.get('key')] = elem.text
            elif elem.tag == 'sentence':
                self.__state = self._IN_PASSAGE
                self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        """
        Consume events up to the closing ``annotation`` tag.

        Args:
            start_elem: the element from the annotation's ``start`` event;
                its ``id`` attribute becomes the annotation id

        Returns:
            the BioCAnnotation with text, infons and locations filled in
        """
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            # Children are only complete (text available) on their end event.
            if event != 'end':
                continue
            if elem.tag == 'text':
                ann.text = elem.text
            elif elem.tag == 'infon':
                ann.infons[elem.get('key')] = elem.text
            elif elem.tag == 'location':
                ann.add_location(
                    BioCLocation(int(elem.get('offset')),
                                 int(elem.get('length'))))
            elif elem.tag == 'annotation':
                return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        """
        Consume events up to the closing ``relation`` tag.

        Args:
            start_elem: the element from the relation's ``start`` event;
                its ``id`` attribute becomes the relation id

        Returns:
            the BioCRelation with infons and nodes filled in
        """
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event != 'end':
                continue
            if elem.tag == 'infon':
                rel.infons[elem.get('key')] = elem.text
            elif elem.tag == 'node':
                rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
            elif elem.tag == 'relation':
                return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        """
        Fetch the next parse event into ``__event`` / ``__elem``.

        Returns:
            True if an event was fetched, False once the stream is exhausted
        """
        try:
            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        """Return the (event, element) pair fetched by ``__has_next``."""
        return self.__event, self.__elem

    def get_collection_info(self) -> 'BioCCollection':
        """
        Reads the collection information: source, date, key, version and
        infons seen so far.

        Returns:
            the BioC collection that contains only information, or None if
            no ``collection`` element was found in the input
        """
        return self.__collection