Exemplo n.º 1
0
def validate_pubtator(content, document):
    """ Returns True if the provided str is a valid pubtator (BioC)
        response for the Document instance, otherwise False.

        content -- the raw BioC response handed to BioCReader
        document -- the Document the response is expected to describe

        The response is valid when it holds exactly one document whose id
        equals document.document_id and exactly two passages (title, then
        abstract) whose offset, text and 'id' infon agree with the first
        and last entries of document.section_set.

        Never raises: any failure (parse error or failed check) is reported
        through client.captureException() and turned into False.
    """
    def _require(condition, message):
        # Explicit raise rather than `assert`: assert statements are removed
        # when Python runs with -O, which would make every response "valid".
        if not condition:
            raise ValueError(message)

    try:
        reader = BioCReader(source=content)
        reader.read()

        # Check general Collection + Document attributes
        bioc_documents = reader.collection.documents
        _require(len(bioc_documents) == 1,
                 'The response included more than the provided Document')
        bioc_document = bioc_documents[0]
        _require(document.document_id == int(bioc_document.id),
                 'The response does not include the requested PMID')
        passages = bioc_document.passages
        _require(len(passages) == 2,
                 'The response document does not include the correct number of sections')

        # Check the Title
        title = passages[0]
        _require(int(title.offset) == 0, 'The title does not start at 0')
        section = document.section_set.first()
        _require(section.text == title.text,
                 'The response title does not equal the provided text')
        _require(section.id == int(title.infons.get('id')),
                 'The response title is not correctly identified')

        # Check the Abstract
        abstract = passages[1]
        _require(int(abstract.offset) >= 1, 'The abstract does not start after 0')
        section = document.section_set.last()
        _require(section.text == abstract.text,
                 'The response abstract does not equal the provided text')
        _require(section.id == int(abstract.infons.get('id')),
                 'The response abstract is not correctly identified')
        return True

    except Exception:
        # Deliberately broad: malformed content, a missing 'id' infon
        # (int(None)) or a failed check all mean "not valid".
        client.captureException()
        return False
Exemplo n.º 2
0
 def get_instance(self):
     """
         Parse self.content with BioCReader and hand back the reader.

         Returns the BioCReader once read() has succeeded, or False (not
         None) when building or reading it raises any exception.
     """
     try:
         reader = BioCReader(source=self.content)
         reader.read()
     except Exception:
         # Content that cannot be parsed is reported as a falsy result
         return False
     else:
         return reader
Exemplo n.º 3
0
 def count_annotations(self):
     """ Returns the int total of annotations over every passage of the
         first document in the Pubtator BioC content (self.content).
         If anything fails while reading or counting, return 0
     """
     total = 0
     try:
         reader = BioCReader(source=self.content)
         reader.read()
         for passage in reader.collection.documents[0].passages:
             total += len(passage.annotations)
     except Exception:
         # A partial count is discarded: an invalid document counts as 0
         return 0
     return total
Exemplo n.º 4
0
    def entity_recognition_df(self,
                              documents=[],
                              users=[],
                              include_pubtator=True,
                              writer=None):
        """ Build a DataFrame of entity recognition annotations.

            Rows come from two places: the annotations returned by the
            get-er-results.sql query (source='db') and, when include_pubtator
            is set, the Disease / Gene / Chemical annotations parsed out of
            the BioC content returned by get-er-pubtator-results.sql.

            documents -- Document instances, ints or digit-only strings; used
                         to build the document filter of both queries
            users -- User instances, ints or digit-only strings; used to
                     build the user filter of the first query
            include_pubtator -- also append the pubtator annotations
            writer -- BioC writer whose documents supply the passage offsets
                      for the db rows; built with Document.objects.as_writer
                      when not provided

            Returns a pandas DataFrame with the DF_COLUMNS columns.

            Raises ValueError when include_pubtator is set but no usable
            document identifier was supplied (the pubtator query needs a
            non-empty id list).

            NOTE(review): the .sql files are opened relative to the current
            working directory -- confirm callers always run from the
            project root.
        """
        # Local import as in the original, but no longer nested inside the
        # `if len(documents)` branch: the name is also needed further down
        # to build the writer, even when no documents were passed.
        from .models import Document

        def _pk_strings(items, model_cls):
            """ Reduce model instances / ints / digit strings to pk strings.
                Anything else is dropped so that only numeric text is ever
                interpolated into the SQL IN (...) lists.
            """
            pk_arr = []
            for item in items:
                if type(item) == model_cls:
                    pk_arr.append(str(item.pk))
                elif type(item) is str or type(item) is unicode:
                    # The digit check applies to str *and* unicode; the
                    # original `a or b and c` only applied it to unicode.
                    if item.isdigit():
                        pk_arr.append(item)
                elif type(item) is int or type(item) is long:
                    pk_arr.append(str(item))
            return pk_arr

        if len(documents):
            doc_arr = _pk_strings(documents, Document)
            filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(
                ','.join(doc_arr))
        else:
            doc_arr = []
            filter_doc_level = ''

        if include_pubtator and not doc_arr:
            # Fail early with a clear message instead of the NameError the
            # missing doc_arr used to trigger at the pubtator step.
            raise ValueError(
                'include_pubtator requires at least one valid document')

        if len(users):
            from django.contrib.auth.models import User
            user_arr = _pk_strings(users, User)
            filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
                'WHERE' if filter_doc_level == '' else 'AND',
                ','.join(user_arr))
        else:
            filter_user_level = ''

        # Pass the model class itself: looking up `.objects.first()` costs a
        # query and breaks when no annotation exists yet.
        content_type_id = str(
            ContentType.objects.get_for_model(EntityRecognitionAnnotation).id)

        df_arr = []

        with open('mark2cure/document/commands/get-er-results.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(content_type_pk=content_type_id,
                                 filter_doc_level=filter_doc_level,
                                 filter_user_level=filter_user_level)

        c = connection.cursor()
        try:
            c.execute(cmd_str)

            # Get the full writer in advance
            if not writer:
                writer = Document.objects.as_writer(documents=documents)

            res = list(c.fetchall())

            # Index the writer's BioC documents by document pk once, rather
            # than rescanning the whole collection for every group.
            docs_by_pk = {}
            for bioc_doc in writer.collection.documents:
                docs_by_pk.setdefault(
                    bioc_doc.infons.get('document_pk'), []).append(bioc_doc)

            # We group the response to reduce BioCDocument offset dict lookups
            # NOTE(review): groupby only merges adjacent rows -- presumably
            # the SQL orders by the document pk column (x[5]); confirm.
            for key, doc_group in groupby(res, lambda x: x[5]):

                bioc_documents = docs_by_pk.get(str(key), [])
                # If a pubtator doesn't exist for the document, we can't include any annotations as the passage offsets need to come from Pubtator
                if len(bioc_documents) == 1:

                    # Use the BioC pubtator file for the offset values
                    offset_dict = {}
                    for passage in bioc_documents[0].passages:
                        offset_dict[int(
                            passage.infons.get('id'))] = passage.offset

                    for x in doc_group:
                        df_arr.append(
                            self._create_er_df_row(
                                uid=x[0],
                                source='db',
                                user_id=x[8],
                                text=x[2],
                                ann_type_idx=x[1],
                                document_pk=x[5],
                                section_id=x[7],
                                section_offset=offset_dict[x[7]],
                                offset_relative=True,
                                start_position=x[3],
                                length=len(x[2])))

        finally:
            c.close()

        if include_pubtator:
            # This is the component that merges the 3 different pubtator
            # responses into 1 main file. It performs selective ordering
            # and precedence for some annotation types / instances
            with open(
                    'mark2cure/document/commands/get-er-pubtator-results.sql',
                    'r') as f:
                cmd_str = f.read()
            cmd_str = cmd_str.format(','.join(doc_arr))

            c = connection.cursor()
            try:
                c.execute(cmd_str)
                res = list(c.fetchall())
            finally:
                c.close()

            # Counter({'Disease': 3676, 'Chemical': 2928, 'Species': 1553, 'Gene': 1544, 'FamilyName': 536, 'DomainMotif': 20}) (Sampling from DB 11/30/2016)
            pubtator_types = ['Disease', 'Gene', 'Chemical']
            for pubtator_content in res:
                # Row layout used below: [1] document pk, [2] BioC content,
                # [3] comma separated section ids (one per passage)
                r = BioCReader(source=pubtator_content[2])
                r.read()
                bioc_document = r.collection.documents[0]

                section_ids = pubtator_content[3].split(',')

                # Iterate over all the annotations in both passages
                for p_idx, passage in enumerate(bioc_document.passages):
                    for annotation in passage.annotations:

                        # Determine some meta-data (UID info) about the BioCAnnotation
                        annotation_type = None
                        uid_type = None
                        uid = None
                        for infon_key, infon_value in annotation.infons.items():
                            if infon_key == 'type':
                                annotation_type = infon_value
                            else:
                                # Any non-'type' infon is taken as the UID;
                                # the last one seen wins
                                uid_type = infon_key
                                uid = infon_value

                        # We're only interested in Pubtator Annotations that are the same concepts users highlight
                        if annotation_type in pubtator_types:
                            # str(location) is split on ':' into start and
                            # length; both stay strings, as before
                            start, length = str(
                                annotation.locations[0]).split(':')
                            df_arr.append(
                                self._create_er_df_row(
                                    uid=uid,
                                    source=uid_type if uid_type else None,
                                    user_id=None,
                                    text=annotation.text,
                                    ann_type_idx=pubtator_types.index(
                                        annotation_type),
                                    document_pk=pubtator_content[1],
                                    section_id=section_ids[p_idx],
                                    section_offset=passage.offset,
                                    offset_relative=False,
                                    start_position=start,
                                    length=length))

        return pd.DataFrame(df_arr, columns=DF_COLUMNS)
Exemplo n.º 5
0
    def as_writer(self, documents=[]):
        '''
            Return a blank BioC Writer that is based off the pubtator content.

            documents -- Document instances, ints or digit-only strings
                         identifying the documents to include

            Documents returned by get-pubtators.sql are taken from their
            pubtator BioC content with the annotations cleared; any requested
            document without a row is rebuilt from its available sections.

            Raises ValueError when no usable document identifier is supplied.

            Problems: This requires every document to have at least 1 pubtator model
            Pros: This prevents us from generating our own BioC file which may
            have inconsistencies
        '''
        from .models import Document

        # Reduce the input to unique pk strings, keeping first-seen order so
        # the query and the fallback loop below are deterministic.
        str_doc_arr = []
        seen = set()
        for d in documents:
            if type(d) == Document:
                pk_str = str(d.pk)
            elif type(d) is str or type(d) is unicode:
                # The digit check applies to str *and* unicode; the original
                # `a or b and c` let arbitrary str values into the SQL.
                if not d.isdigit():
                    continue
                pk_str = d
            elif type(d) is int or type(d) is long:
                pk_str = str(d)
            else:
                continue
            if pk_str not in seen:
                seen.add(pk_str)
                str_doc_arr.append(pk_str)

        if not str_doc_arr:
            # Covers both an empty input and an input with no usable ids,
            # which would otherwise produce an empty SQL IN () list
            raise ValueError('No documents supplied to generator writer')

        with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(','.join(str_doc_arr))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            # Row layout: [0] document pk, [1] BioC content,
            # [2] comma separated section ids (one per passage)
            res = [(x[0], x[1], x[2]) for x in c.fetchall()]
        finally:
            c.close()

        # Documents still waiting for a BioC entry
        pending = list(str_doc_arr)

        writer = bioc_writer(None)
        for pubtator_content in res:
            section_ids = pubtator_content[2].split(',')
            r = BioCReader(source=pubtator_content[1])
            r.read()

            doc = r.collection.documents[0]
            doc.put_infon('document_pk', str(pubtator_content[0]))
            for idx, passage in enumerate(doc.passages):
                passage.clear_annotations()

                # Assumes at most two passages (title, then abstract)
                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section_ids[idx]))

            writer.collection.add_document(doc)

            # Guarded removal: a repeated or unexpected row must not abort
            # the whole writer with a ValueError from list.remove
            document_pk_str = str(pubtator_content[0])
            if document_pk_str in pending:
                pending.remove(document_pk_str)

        # Capture all the documents not available via pubtators
        for document_pk_str in pending:
            # Can optimize this model retrieval but should rarely occur
            document_model = Document.objects.get(pk=document_pk_str)

            bioc_document = BioCDocument()
            bioc_document.id = str(document_model.document_id)
            bioc_document.put_infon('document_pk', document_pk_str)

            passage_offset = 0
            for idx, section in enumerate(document_model.available_sections()):
                passage = BioCPassage()
                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section.pk))
                # (TODO) Missing a "type" infon?
                passage.text = section.text

                # Each passage starts one character after the previous text
                passage.offset = str(passage_offset)
                passage_offset += len(passage.text) + 1

                bioc_document.add_passage(passage)

            writer.collection.add_document(bioc_document)
        return writer