Example #1
0
def validate_pubtator(content, document):
    """ Returns bool if the provided str is a valid
        pubtator (BioC) response for the Document instance
    """
    try:
        reader = BioCReader(source=content)
        reader.read()

        # Check general Collection + Document attributes
        bioc_docs = reader.collection.documents
        assert (len(bioc_docs) == 1), 'The response included more than the provided Document'

        bioc_doc = bioc_docs[0]
        assert (document.document_id == int(bioc_doc.id)), 'The response does not include the requested PMID'
        assert (len(bioc_doc.passages) == 2), 'The response document does not include the correct number of sections'

        # Check the Title
        title_passage = bioc_doc.passages[0]
        assert (int(title_passage.offset) == 0), 'The title does not start at 0'
        section = document.section_set.first()
        assert (section.text == title_passage.text), 'The response title does not equal the provided text'
        assert (section.id == int(title_passage.infons.get('id'))), 'The response title is not correctly identified'

        # Check the Abstract
        abstract_passage = bioc_doc.passages[1]
        assert (int(abstract_passage.offset) >= 1), 'The abstract does not start after 0'
        section = document.section_set.last()
        assert (section.text == abstract_passage.text), 'The response abstract does not equal the provided text'
        assert (section.id == int(abstract_passage.infons.get('id'))), 'The response abstract is not correctly identified'
        return True

    except Exception:
        client.captureException()
        return False
    def test_document_as_bioc_with_pubtator(self):
        """Check the document's BioC Pubtator representation in both formats.

        The JSON payload must mirror the DB document (same PMID, 2 passages
        whose text matches the title/abstract sections, each with at least
        one annotation); the XML form must parse and pass the same
        structural checks.
        """
        # NOTE(review): removed an unused `pub_query_set = Pubtator.objects.filter(...)`
        # local — the queryset was lazy and never evaluated, so it had no effect.
        response = self.client.get('/document/pubtator/{pmid}.json'.format(pmid=self.doc.document_id))
        json_string = response.content
        self.assertNotEqual(json_string, '', msg='API returned empty response for document BioC Pubtator Representation.')

        data = json.loads(json_string)

        # Make sure it's the same document in BioC as DB
        bioc_doc = data.get('collection').get('document')
        passages = bioc_doc.get('passage')
        self.assertEqual(int(bioc_doc.get('id')), self.doc.document_id)
        self.assertEqual(len(passages), 2)
        self.assertEqual(passages[0].get('text'), self.doc.section_set.first().text)
        self.assertEqual(passages[1].get('text'), self.doc.section_set.last().text)

        # Make sure it contains any annotations
        self.assertNotEqual(len(passages[0].get('annotation')), 0)
        self.assertNotEqual(len(passages[1].get('annotation')), 0)

        # We already validated everything in JSON b/c it's easier. Let's just
        # make sure the XML document passes too without specific checks
        response = self.client.get('/document/pubtator/{pmid}.xml'.format(pmid=self.doc.document_id))
        r = BioCReader(source=response.content)
        r.read()
        self.assertEqual(len(r.collection.documents), 1)
        self.assertEqual(int(r.collection.documents[0].id), self.doc.document_id)
        self.assertEqual(len(r.collection.documents[0].passages), 2)
        self.assertNotEqual(len(r.collection.documents[0].passages[0].annotations), 0)
        self.assertNotEqual(len(r.collection.documents[0].passages[1].annotations), 0)
    def handle(self, *args, **options):
        types_arr = []
        errors = 0

        if options['keys']:
            for pubtator in Pubtator.objects.filter(content__isnull=False).all():
                try:
                    r = BioCReader(source=pubtator.content)
                    r.read()

                    for d_idx, document in enumerate(r.collection.documents):
                        for p_idx, passage in enumerate(document.passages):
                            for annotation in r.collection.documents[d_idx].passages[p_idx].annotations:
                                types_arr.append( annotation.infons['type'] )

                except Exception as e:
                    '''
                    print '%' in pubtator.content
                    for sec in pubtator.document.available_sections():
                        print '%' in sec.text
                    print ' - - - '
                    '''
                    errors = errors + 1

            print 'Errors:', errors
            print Counter(types_arr)
Example #4
0
    def as_pubtator_annotation_df(self):
        """ Return a DataFrame of every Pubtator annotation for this document.

        One row per annotation, gathered across the document's pubtator
        responses (up to 3: "GNormPlus", "DNorm", "tmChem"). When the
        document has no valid pubtator content an empty frame with the
        same columns is returned.
        """
        df_columns = ('uid', 'source', 'ann_type', 'text', 'offset', 'location')

        pubtator_dfs = []
        if self.valid_pubtator():
            pubtators = Pubtator.objects.filter(
                document=self,
                session_id='',
                content__isnull=False).all()

            for pubtator in pubtators:
                reader = BioCReader(source=pubtator.content)
                reader.read()

                rows = []
                for passage in reader.collection.documents[0].passages:
                    for annotation in passage.annotations:
                        infons = annotation.infons

                        # The 'type' infon is the concept class; any other
                        # key is treated as a (vocabulary -> identifier) pair.
                        # If several non-'type' keys exist, the last one wins.
                        annotation_type = infons.get('type', None)
                        uid_type = None
                        uid = None
                        for key in infons.keys():
                            if key != 'type':
                                uid_type = key
                                uid = infons.get(key, None)

                        rows.append({
                            'uid': uid,
                            'source': uid_type,

                            'ann_type': annotation_type,
                            'text': str(annotation.text),

                            'offset': int(passage.offset),
                            'location': str(annotation.locations[0])
                        })

                pubtator_dfs.append(pd.DataFrame(rows, columns=df_columns))

        if len(pubtator_dfs):
            return pd.concat(pubtator_dfs)
        else:
            return pd.DataFrame([], columns=df_columns)
Example #5
0
    def as_writer(self, request=None):
        """ Return a BioCWriter whose collection is parsed from this
            instance's stored BioC content.

            `request` is unused but kept for interface compatibility.
        """
        reader = BioCReader(source=self.content)
        reader.read()

        writer = BioCWriter()
        writer.collection = reader.collection
        return writer
Example #6
0
 def get_instance(self):
     """
         Returns the pubtator BioC instance if valid or None
     """
     try:
         r = BioCReader(source=self.content)
         r.read()
         return r
     except Exception:
         # Content that fails to parse as BioC is treated as invalid.
         # BUGFIX: previously returned False, contradicting the docstring;
         # None is equally falsy for callers and matches the documented contract.
         return None
Example #7
0
    def count_annotations(self):
        """ Return the total number of annotations across all passages of
            all documents in this instance's BioC content, or 0 when the
            content is not valid.
        """
        if not self.valid():
            return 0

        reader = BioCReader(source=self.content)
        reader.read()
        return sum(
            len(passage.annotations)
            for doc in reader.collection.documents
            for passage in doc.passages)
Example #8
0
 def count_annotations(self):
     """ Returns an int count of all types of ER annotations in the Pubtator instance
         If none are found or the document is invalid, return 0
     """
     try:
         reader = BioCReader(source=self.content)
         reader.read()
         # Only the first document's passages are counted, matching the
         # one-document-per-pubtator-response layout
         total = 0
         for passage in reader.collection.documents[0].passages:
             total += len(passage.annotations)
         return total
     except Exception:
         # Any parse/shape failure is reported as "no annotations"
         return 0
    def test_document_as_bioc_with_m2c(self):
        """Two users each submit 3 annotations on the same abstract, then the
        anonymous users-BioC export is checked: one document, two passages,
        with all 6 annotations attached to the abstract passage.
        """
        # Submit Annotations (As User 1) so they show up when inspecting the M2C submissions
        self.assertEqual(Annotation.objects.count(), 0)
        self.client.login(username='******', password='******')
        response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)

        # The quest-home view picks the document; annotate its abstract section
        doc = response.context['document']
        abstract = doc.available_sections().last()

        # Annotation submit URL
        url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
        self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 0', 'start': 0}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 1', 'start': 10}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 2', 'start': 20}).status_code, 200)
        self.assertEqual(Annotation.objects.count(), 3)
        # Then submit the document for the Quest
        response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
        self.client.logout()

        # Submit Annotations (As User 2) so they show up when inspecting the M2C submissions
        self.assertEqual(Annotation.objects.count(), 3)
        self.client.login(username='******', password='******')
        response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)

        # Annotation submit URL (same abstract section as User 1)
        url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
        self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 3', 'start': 30}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 4', 'start': 40}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 5', 'start': 50}).status_code, 200)
        self.assertEqual(Annotation.objects.count(), 6)
        # Then submit the document for the Quest
        response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
        self.client.logout()

        # As Anon user, export the documents submissions
        res = self.client.get(reverse('document:read-users-bioc', kwargs={'pubmed_id': doc.document_id, 'format_type': 'xml'}), follow=True)
        self.assertEqual(res.status_code, 200)
        bioc = BioCReader(source=res.content)
        bioc.read()

        # Make sure the BioC document has both users' submissions:
        # title passage has none, abstract passage carries all 6
        self.assertEqual(len(bioc.collection.documents), 1)
        self.assertEqual(int(bioc.collection.documents[0].id), doc.document_id)
        self.assertEqual(len(bioc.collection.documents[0].passages), 2)
        self.assertEqual(len(bioc.collection.documents[0].passages[0].annotations), 0)
        self.assertEqual(len(bioc.collection.documents[0].passages[1].annotations), 6)
Example #10
0
    def valid(self):
        """ Return True when this Pubtator instance holds parseable BioC
            content for a completed (non-session) request, False otherwise.

            Resolves the previous TODO: this used to return either a bool or
            a BioCReader instance; it now always returns a bool. Callers only
            ever used the result for truthiness, so this is compatible.
        """
        # Cached validation result short-circuits the (expensive) parse
        if self.validate_cache:
            return True

        # A non-empty session_id means the fetch is still in flight
        if self.session_id != '':
            return False

        if self.content is None:
            return False

        try:
            r = BioCReader(source=self.content)
            r.read()
            return True
        except Exception:
            # Content that fails to parse is treated as invalid
            return False
    def test_group_for_all_user_annotations(self):
        """ The group-wide BioC export must cover every group document and
            contain every stored annotation.
        """
        self.load_fake_annotations()

        # Fetch the Group BioC as JSON to ensure is online
        response = self.client.get(reverse('api:group-users-bioc', kwargs={'group_pk': self.group.pk, 'format_type': 'json'}))
        self.assertEqual(response.status_code, 200)

        # Fetch the Group BioC for all user annotations
        response = self.client.get(reverse('api:group-users-bioc', kwargs={'group_pk': self.group.pk, 'format_type': 'xml'}))
        self.assertEqual(response.status_code, 200)
        reader = BioCReader(source=response.content)
        reader.read()

        # Does BioC have correct number of Group Documents
        self.assertEqual(len(reader.collection.documents), self.group.get_documents().count())

        # Does BioC have correct number of Group Annotations
        annotation_total = sum(
            len(bioc_passage.annotations)
            for bioc_doc in reader.collection.documents
            for bioc_passage in bioc_doc.passages)
        self.assertEqual(Annotation.objects.count(), annotation_total)
Example #12
0
    def entity_recognition_df(self,
                              documents=[],
                              users=[],
                              include_pubtator=True,
                              writer=None):
        """ Build a DataFrame (DF_COLUMNS) of entity recognition annotations.

        User-submitted ER annotations are read from the DB via
        get-er-results.sql; when `include_pubtator` is True, Disease/Gene/
        Chemical annotations from the stored pubtator responses for the same
        documents are merged in via get-er-pubtator-results.sql.

        Args:
            documents: Document instances, int/long pks, or digit strings
            users: User instances, int pks, or digit strings
            include_pubtator: also merge pubtator-derived annotations
            writer: optional pre-built BioC writer; derived from `documents`
                when not supplied

        Returns:
            pd.DataFrame with DF_COLUMNS columns
        """
        # Normalize the requested documents into pk strings. Hoisted out of
        # the if-branch so the pubtator step below can safely test it even
        # when no documents were supplied (previously a NameError).
        doc_arr = []
        if len(documents):
            from .models import Document
            for d in documents:
                if type(d) == Document:
                    doc_arr.append(str(d.pk))
                # BUGFIX: parentheses added — `and` binds tighter than `or`,
                # so any str previously passed without the isdigit() check
                elif (type(d) is str or type(d) is unicode) and d.isdigit():
                    doc_arr.append(d)
                elif type(d) is int or type(d) is long:
                    doc_arr.append(str(d))
            filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(
                ','.join(doc_arr))
        else:
            filter_doc_level = ''

        if len(users):
            from django.contrib.auth.models import User
            user_arr = []
            for u in users:
                if type(u) == User:
                    user_arr.append(str(u.pk))
                # BUGFIX: tested `d.isdigit()` (wrong loop variable) and had
                # the same or/and precedence problem as above
                elif (type(u) is str or type(u) is unicode) and u.isdigit():
                    user_arr.append(u)
                elif type(u) is int:
                    user_arr.append(str(u))

            filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
                'WHERE' if filter_doc_level == '' else 'AND',
                ','.join(user_arr))
        else:
            filter_user_level = ''

        content_type_id = str(
            ContentType.objects.get_for_model(
                EntityRecognitionAnnotation.objects.first()).id)

        df_arr = []

        cmd_str = ""
        with open('mark2cure/document/commands/get-er-results.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(content_type_pk=content_type_id,
                                 filter_doc_level=filter_doc_level,
                                 filter_user_level=filter_user_level)

        c = connection.cursor()
        try:
            c.execute(cmd_str)

            # Get the full writer in advance
            if not writer:
                writer = Document.objects.as_writer(documents=documents)

            res = [x for x in c.fetchall()]

            # We group the response to reduce BioCDocument offset dict lookups
            for key, doc_group in groupby(res, lambda x: x[5]):

                bioc_documents = filter(
                    lambda d: d.infons.get('document_pk') == str(key),
                    writer.collection.documents)
                # If a pubtator doesn't exist for the document, we can't include any annotations as the passage offsets need to come from Pubtator
                if len(bioc_documents) == 1:

                    # Use the BioC pubtator file for the offset values
                    offset_dict = {}
                    for passage in bioc_documents[0].passages:
                        offset_dict[int(
                            passage.infons.get('id'))] = passage.offset

                    for x in doc_group:
                        df_arr.append(
                            self._create_er_df_row(
                                uid=x[0],
                                source='db',
                                user_id=x[8],
                                text=x[2],
                                ann_type_idx=x[1],
                                document_pk=x[5],
                                section_id=x[7],
                                section_offset=offset_dict[x[7]],
                                offset_relative=True,
                                start_position=x[3],
                                length=len(x[2])))

        finally:
            c.close()

        # BUGFIX: also require a non-empty doc_arr — the SQL below formats a
        # `IN (...)` clause from it and previously raised NameError when
        # `documents` was empty
        if include_pubtator and doc_arr:
            # This is the component that merges the 3 different pubtator
            # responses into 1 main file. It performs selective ordering and
            # precedence for some annotation types / instances.
            cmd_str = ""
            with open(
                    'mark2cure/document/commands/get-er-pubtator-results.sql',
                    'r') as f:
                cmd_str = f.read()
            cmd_str = cmd_str.format(','.join(doc_arr))

            c = connection.cursor()
            try:
                c.execute(cmd_str)
                res = [x for x in c.fetchall()]
            finally:
                c.close()

            # Counter({'Disease': 3676, 'Chemical': 2928, 'Species': 1553, 'Gene': 1544, 'FamilyName': 536, 'DomainMotif': 20}) (Sampling from DB 11/30/2016)
            pubtator_types = ['Disease', 'Gene', 'Chemical']
            for pubtator_content in res:
                r = BioCReader(source=pubtator_content[2])
                r.read()
                bioc_document = r.collection.documents[0]

                section_ids = pubtator_content[3].split(',')

                # Iterate over all the annotations in both passages
                for p_idx, passage in enumerate(bioc_document.passages):
                    for annotation in passage.annotations:

                        # Determine some meta-data (UID info) about the BioCAnnotation
                        annotation_type = None
                        uid_type = None
                        uid = None
                        for key in annotation.infons.keys():
                            if key == 'type':
                                annotation_type = annotation.infons.get(
                                    key, None)
                            else:
                                uid_type = key
                                uid = annotation.infons.get(uid_type, None)

                        # We're only interested in Pubtator Annotations that are the same concepts users highlight
                        if annotation_type in pubtator_types:
                            # locations render as 'start:length'
                            start, length = str(
                                annotation.locations[0]).split(':')
                            df_arr.append(
                                self._create_er_df_row(
                                    uid=uid,
                                    source=uid_type if uid_type else None,
                                    user_id=None,
                                    text=annotation.text,
                                    ann_type_idx=pubtator_types.index(
                                        annotation_type),
                                    document_pk=pubtator_content[1],
                                    section_id=section_ids[p_idx],
                                    section_offset=passage.offset,
                                    offset_relative=False,
                                    start_position=start,
                                    length=length))

        return pd.DataFrame(df_arr, columns=DF_COLUMNS)
Example #13
0
    def entity_recognition_df(self, documents=[], users=[], include_pubtator=True, writer=None):
        """ Build a DataFrame (DF_COLUMNS) of entity recognition annotations.

        User-submitted ER annotations are fetched with inline SQL; when
        `include_pubtator` is True, annotations from the stored pubtator
        responses for the same documents are merged in as well.

        Args:
            documents: Document instances, int/long pks, or digit strings
            users: User instances, int pks, or digit strings
            include_pubtator: also merge pubtator-derived annotations
            writer: optional pre-built BioC writer; derived from `documents`
                when not supplied

        Returns:
            pd.DataFrame with DF_COLUMNS columns
        """
        # Normalize the requested documents into pk strings. Hoisted out of
        # the if-branch so the pubtator step below can safely test it even
        # when no documents were supplied (previously a NameError).
        doc_arr = []
        if len(documents):
            from .models import Document
            for d in documents:
                if type(d) == Document:
                    doc_arr.append(str(d.pk))
                # BUGFIX: parentheses added — `and` binds tighter than `or`,
                # so any str previously passed without the isdigit() check
                elif (type(d) is str or type(d) is unicode) and d.isdigit():
                    doc_arr.append(d)
                elif type(d) is int or type(d) is long:
                    doc_arr.append(str(d))
            filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(','.join(doc_arr))
        else:
            filter_doc_level = ''

        if len(users):
            from django.contrib.auth.models import User
            user_arr = []
            for u in users:
                if type(u) == User:
                    user_arr.append(str(u.pk))
                # BUGFIX: tested `d.isdigit()` (wrong loop variable) and had
                # the same or/and precedence problem as above
                elif (type(u) is str or type(u) is unicode) and u.isdigit():
                    user_arr.append(u)
                elif type(u) is int:
                    user_arr.append(str(u))

            filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
                'WHERE' if filter_doc_level == '' else 'AND',
                ','.join(user_arr))
        else:
            filter_user_level = ''

        content_type_id = str(ContentType.objects.get_for_model(
            EntityRecognitionAnnotation.objects.first()).id)

        df_arr = []

        cmd_str = '''
            SELECT
                `entity_recognition_entityrecognitionannotation`.`id`,
                `entity_recognition_entityrecognitionannotation`.`type`,
                `entity_recognition_entityrecognitionannotation`.`text`,
                `entity_recognition_entityrecognitionannotation`.`start`,
                `document_annotation`.`created`,
                `document_document`.`id` as `document_pk`,
                `document_document`.`document_id` as `pmid`,
                `document_view`.`section_id`,
                `document_view`.`user_id`

            FROM `entity_recognition_entityrecognitionannotation`

            INNER JOIN `document_annotation`
                ON `document_annotation`.`object_id` = `entity_recognition_entityrecognitionannotation`.`id` AND `document_annotation`.`content_type_id` = {content_type_pk}

            INNER JOIN `document_view`
                ON `document_annotation`.`view_id` = `document_view`.`id`

            INNER JOIN `document_section`
                ON `document_view`.`section_id` = `document_section`.`id`

            INNER JOIN `document_document`
                ON `document_document`.`id` = `document_section`.`document_id`

            {filter_doc_level}
            {filter_user_level}
        '''.format(content_type_pk=content_type_id, filter_doc_level=filter_doc_level, filter_user_level=filter_user_level)
        c = connection.cursor()
        try:
            c.execute(cmd_str)

            # Get the full writer in advance
            if not writer:
                writer = Document.objects.as_writer(documents=documents)

            res = [x for x in c.fetchall()]

            # We group the response to reduce BioCDocument offset dict lookups
            for key, doc_group in groupby(res, lambda x: x[5]):

                bioc_documents = filter(lambda d: d.infons.get('document_pk') == str(key), writer.collection.documents)
                # If a pubtator doesn't exist for the document, we can't include any annotations as the passage offsets need to come from Pubtator
                if len(bioc_documents) == 1:

                    # Use the BioC pubtator file for the offset values
                    offset_dict = {}
                    for passage in bioc_documents[0].passages:
                        offset_dict[int(passage.infons.get('id'))] = passage.offset

                    for x in doc_group:
                        df_arr.append(self._create_er_df_row(
                            uid=x[0], source='db', user_id=x[8],
                            text=x[2], ann_type=x[1],
                            document_pk=x[5], section_id=x[7], section_offset=offset_dict[x[7]], offset_relative=True,
                            start_position=x[3], length=len(x[2])))

        finally:
            c.close()

        # BUGFIX: also require a non-empty doc_arr — the SQL below formats an
        # `IN (...)` clause from it and previously raised NameError when
        # `documents` was empty
        if include_pubtator and doc_arr:
            # This is the component that merges the 3 different pubtator
            # responses into 1 main file. It performs selective ordering and
            # precedence for some annotation types / instances.
            cmd_str = '''
                SELECT
                    `document_pubtator`.`id`,
                    `document_pubtator`.`document_id`,
                    `document_pubtator`.`content`,
                    GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`

                FROM `document_pubtator`

                JOIN `document_section`
                    ON `document_section`.`document_id` = `document_pubtator`.`document_id`

                WHERE `document_pubtator`.`content` != ''
                    AND `document_pubtator`.`document_id` IN ({0})

                GROUP BY `document_pubtator`.`id`
            '''.format(','.join(doc_arr))
            c = connection.cursor()
            try:
                c.execute(cmd_str)
                res = [x for x in c.fetchall()]
            finally:
                c.close()

            for pubtator_content in res:
                r = BioCReader(source=pubtator_content[2])
                r.read()
                bioc_document = r.collection.documents[0]

                section_ids = pubtator_content[3].split(',')

                # Iterate over all the annotations in both passages
                for p_idx, passage in enumerate(bioc_document.passages):
                    for annotation in passage.annotations:

                        # Determine some meta-data (UID info) about the BioCAnnotation
                        annotation_type = None
                        uid_type = None
                        uid = None
                        for key in annotation.infons.keys():
                            if key == 'type':
                                annotation_type = annotation.infons.get(key, None)
                            else:
                                uid_type = key
                                uid = annotation.infons.get(uid_type, None)

                        # locations render as 'start:length'
                        start, length = str(annotation.locations[0]).split(':')
                        df_arr.append(self._create_er_df_row(

                            uid=uid, source=uid_type if uid_type else None, user_id=None,
                            text=annotation.text, ann_type=annotation_type if annotation_type else None,
                            document_pk=pubtator_content[1], section_id=section_ids[p_idx], section_offset=passage.offset, offset_relative=False,
                            start_position=start, length=length))

        return pd.DataFrame(df_arr, columns=DF_COLUMNS)
    def test_document_as_bioc_for_pairing(self):
        """A player with no submissions gets a 500 ('no_points_awarded') from
        the pairing BioC endpoint; after an opponent submits 3 annotations
        and the player re-submits the document, the BioC result carries the
        opponent's annotations, 0 points, and the partner's name.
        """
        # Ensure the player views the Q but can't match b/c no Anns exist
        self.client.login(username='******', password='******')

        # Ensure the User info is showing up in the header
        response = self.client.get('/dashboard/')
        self.assertInHTML('<p>Level: Expert</p>', response.content)

        # Ensure no User >> Quest views until after viewed once
        self.assertEqual(UserQuestRelationship.objects.count(), 0)
        response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)
        doc = response.context['document']
        self.assertEqual(UserQuestRelationship.objects.count(), 1)

        # Ensure this returns a 500 for the player b/c there are no submissions yet
        response = self.client.get(reverse('document:results-bioc', kwargs={'task_pk': self.task.pk, 'doc_pk': doc.pk, 'format_type': 'xml'}))
        self.assertEqual(response.status_code, 500)
        self.assertEqual(response.content, 'no_points_awarded')
        self.client.logout()

        #
        # Submit bogus Annotations as opponent to try match again for player
        #
        self.client.login(username='******', password='******')
        self.assertEqual(Annotation.objects.count(), 0)
        response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)

        # Annotation submit URL (abstract section of the same document)
        abstract = doc.available_sections().last()
        url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
        self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 0', 'start': 0}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 1', 'start': 10}).status_code, 200)
        self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 2', 'start': 20}).status_code, 200)
        self.assertEqual(Annotation.objects.count(), 3)

        # Then submit the document for the Quest
        response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
        self.client.logout()

        #
        # Try again as the player to see if comparison uses opponents
        #
        self.client.login(username='******', password='******')
        # Submit this Document without contributing any Annotations
        response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)

        # Fetch the BioC Document again
        response = self.client.get(reverse('document:results-bioc', kwargs={'task_pk': self.task.pk, 'doc_pk': doc.pk, 'format_type': 'xml'}))
        self.assertEqual(response.status_code, 200)
        r = BioCReader(source=response.content)
        r.read()

        # Make sure the BioC document has the opponent's info:
        # title passage empty, abstract passage carries the 3 annotations
        self.assertEqual(len(r.collection.documents), 1)
        self.assertEqual(int(r.collection.documents[0].id), doc.document_id)
        self.assertEqual(len(r.collection.documents[0].passages), 2)
        self.assertEqual(len(r.collection.documents[0].passages[0].annotations), 0)
        self.assertEqual(len(r.collection.documents[0].passages[1].annotations), 3)

        # Annotations are attributed to the opponent; the player earns 0 points
        self.assertEqual(r.collection.documents[0].passages[1].annotations[0].infons['user_name'], 'opponent')
        self.assertEqual(int(r.collection.documents[0].passages[1].annotations[0].infons['type']), 0)
        self.assertEqual(r.collection.documents[0].passages[1].annotations[0].text, 'text annotation 0')
        self.assertEqual(int(r.collection.infons['points']), 0)
        self.assertEqual(r.collection.infons['partner'], 'opponent')
        self.client.logout()
Example #15
0
    def as_writer(self, documents=[]):
        '''
            Return a blank BioC Writer that is based off the pubtator content.

            Problems: This requires every document to have at least 1 pubtator model
            Pros: This prevents us from generating our own BioC file which may
            have inconsistencies

            Args:
                documents: Document instances, int/long pks, or digit strings

            Raises:
                ValueError: when no documents are supplied
        '''
        if len(documents):
            from .models import Document
            doc_arr = []
            for d in documents:
                if type(d) == Document:
                    doc_arr.append(str(d.pk))
                # BUGFIX: parentheses added — `and` binds tighter than `or`,
                # so any str previously passed without the isdigit() check
                elif (type(d) is str or type(d) is unicode) and d.isdigit():
                    doc_arr.append(d)
                elif type(d) is int or type(d) is long:
                    doc_arr.append(str(d))
            str_doc_arr = list(set(doc_arr))
        else:
            raise ValueError('No documents supplied to generator writer')

        cmd_str = ""
        with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(','.join(str_doc_arr))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            res = [(x[0], x[1], x[2]) for x in c.fetchall()]
        finally:
            c.close()

        writer = bioc_writer(None)
        for pubtator_content in res:
            section_ids = pubtator_content[2].split(',')
            r = BioCReader(source=pubtator_content[1])
            r.read()

            # Reuse the pubtator BioC document, stripped of its annotations
            doc = r.collection.documents[0]
            doc.put_infon('document_pk', str(pubtator_content[0]))
            for idx, passage in enumerate(doc.passages):
                passage.clear_annotations()

                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section_ids[idx]))

            writer.collection.add_document(doc)

            # Track which requested documents were satisfied by pubtator content
            str_doc_arr.remove(str(pubtator_content[0]))

        # Capture all the documents not available via pubtators
        for document_pk_str in str_doc_arr:
            # Can optimize this model retrieval but should rarely occur
            document_model = Document.objects.get(pk=document_pk_str)

            bioc_document = BioCDocument()
            bioc_document.id = str(document_model.document_id)
            bioc_document.put_infon('document_pk', document_pk_str)

            # Build passages from the DB sections, computing offsets manually
            # (+1 per passage for the separating whitespace)
            passage_offset = 0
            for idx, section in enumerate(document_model.available_sections()):
                passage = BioCPassage()
                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section.pk))
                # (TODO) Missing a "type" infon?
                passage.text = section.text

                passage.offset = str(passage_offset)
                passage_offset += len(passage.text) + 1

                bioc_document.add_passage(passage)

            writer.collection.add_document(bioc_document)
        return writer
Пример #16
0
    def as_writer(self, documents=None):
        '''
            Return a blank BioC Writer that is based off the pubtator content.

            `documents` may contain Document instances, numeric strings, or
            ints/longs; each is normalized to a string primary key. Raises
            ValueError when no documents are supplied.

            Problems: This requires every document to have at least 1 pubtator model
            Pros: This prevents us from generating our own BioC file which may
            have inconsistencies
        '''
        # NOTE: original default was a mutable `[]`; None is the safe idiom
        # and behaves identically (empty/missing input raises below).
        if not documents:
            raise ValueError('No documents supplied to generator writer')

        from .models import Document
        doc_arr = []
        for d in documents:
            if isinstance(d, Document):
                doc_arr.append(str(d.pk))
            # BUGFIX: the original `type(d) is str or type(d) is unicode and
            # d.isdigit()` parsed as `str OR (unicode AND isdigit)` because
            # `and` binds tighter than `or` — any plain str bypassed the
            # digit check. Only digit strings are valid primary keys.
            elif isinstance(d, (str, unicode)) and d.isdigit():
                doc_arr.append(d)
            elif isinstance(d, (int, long)):
                doc_arr.append(str(d))
        str_doc_arr = list(set(doc_arr))

        # String interpolation is safe here: every entry in str_doc_arr is a
        # validated numeric string, so no SQL-injection vector exists.
        cmd_str = '''
            SELECT
                `document_pubtator`.`document_id`,
                ANY_VALUE(`document_pubtator`.`content`),
                GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`

            FROM `document_pubtator`

            JOIN `document_section`
                ON `document_section`.`document_id` = `document_pubtator`.`document_id`

            WHERE `document_pubtator`.`content` != '' AND `document_pubtator`.`document_id` IN ({0})

            GROUP BY `document_pubtator`.`document_id`;
        '''.format(','.join(str_doc_arr))
        c = connection.cursor()
        try:
            c.execute(cmd_str)
            # Rows: (document_pk, pubtator BioC XML, csv of section ids)
            res = [(row[0], row[1], row[2]) for row in c.fetchall()]
        finally:
            c.close()

        writer = bioc_writer(None)
        for document_pk, content, section_id_csv in res:
            section_ids = section_id_csv.split(',')
            r = BioCReader(source=content)
            r.read()

            doc = r.collection.documents[0]
            doc.put_infon('document_pk', str(document_pk))
            for idx, passage in enumerate(doc.passages):
                # Drop pubtator's own annotations; only structure is wanted.
                passage.clear_annotations()

                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section_ids[idx]))

            writer.collection.add_document(doc)

            # Whatever remains in str_doc_arr has no pubtator content.
            str_doc_arr.remove(str(document_pk))

        # Capture all the documents not available via pubtators
        for document_pk_str in str_doc_arr:
            # Can optimize this model retrieval but should rarely occur
            document_model = Document.objects.get(pk=document_pk_str)

            bioc_document = BioCDocument()
            bioc_document.id = str(document_model.document_id)
            bioc_document.put_infon('document_pk', document_pk_str)

            passage_offset = 0
            for idx, section in enumerate(document_model.available_sections()):
                passage = BioCPassage()
                passage.put_infon('section', ['title', 'paragraph'][idx])
                passage.put_infon('id', str(section.pk))
                # (TODO) Missing a "type" infon?
                passage.text = section.text

                passage.offset = str(passage_offset)
                # +1 accounts for the separator between concatenated sections.
                passage_offset += len(passage.text) + 1

                bioc_document.add_passage(passage)

            writer.collection.add_document(bioc_document)
        return writer
Пример #17
0
    def as_bioc_with_pubtator_annotations(self, request=None):
        '''
            This is a function that merges the 3 different pubtator
            responses into 1 main file. It performs selective
            ordering and precedence for some annotation types / instances:
            only Disease/Gene/Chemical annotations are kept, same-start
            duplicates keep only the longest span, and annotations fully
            shadowed by a longer one are dropped.
        '''
        import collections  # hoisted: was imported inside the passage loop

        approved_types = ['Disease', 'Gene', 'Chemical']
        self.init_pubtator()
        reader = self.as_writer(request)

        pub_query_set = Pubtator.objects.filter(
            document=self,
            session_id='',
            content__isnull=False)

        # Load up our various pubtator responses
        pub_readers = []
        for pubtator in pub_query_set.all():
            r = BioCReader(source=pubtator.content)
            r.read()
            pub_readers.append(r)

        for d_idx, document in enumerate(reader.collection.documents):
            for p_idx, passage in enumerate(document.passages):
                target = reader.collection.documents[d_idx].passages[p_idx]

                # For each passage in each document in the collection
                # add the appropriate annotation
                for p in pub_readers:
                    for annotation in p.collection.documents[d_idx].passages[p_idx].annotations:
                        infons = annotation.infons
                        ann_type = infons['type']
                        if ann_type not in approved_types:
                            continue

                        # Take the last non-'type' infon value as the uid
                        # (pubtator emits one identifier key per annotation).
                        uid = None
                        for key in infons.keys():
                            if key != 'type':
                                uid = infons.get(key, None)

                        annotation.clear_infons()
                        annotation.put_infon('type', str(approved_types.index(ann_type)))
                        annotation.put_infon('user', 'pubtator')
                        annotation.put_infon('uid', str(uid))
                        target.add_annotation(annotation)

                # Remove the shorter annotations when multiple share the
                # same start position
                anns = target.annotations
                ann_offsets = [x.locations[0].offset for x in anns]
                for offset in [o for o, count in collections.Counter(ann_offsets).items() if count > 1]:
                    conflicting_anns = [x for x in anns if x.locations[0].offset == offset]
                    longest_ann = max(conflicting_anns, key=lambda a: int(a.locations[0].length))

                    for ann in conflicting_anns:
                        if ann is not longest_ann:
                            target.remove_annotation(ann)

                # Remove any annotations that are shadowed by a longer one.
                # BUGFIX: the original iterated the live annotation list while
                # removing from it (silently skipping elements) and swallowed
                # the resulting double-removals with a bare `except: pass`.
                # Snapshot first, collect victims, then remove each once.
                snapshot = list(target.annotations)
                doomed = []
                for needle_ann in snapshot:
                    needle_ann_offset = int(needle_ann.locations[0].offset)
                    needle_ann_length = int(needle_ann.locations[0].length)

                    for stack_ann in snapshot:
                        stack_ann_offset = int(stack_ann.locations[0].offset)
                        stack_ann_length = int(stack_ann.locations[0].length)

                        if (needle_ann is not stack_ann
                                and needle_ann_offset >= stack_ann_offset
                                and needle_ann_length < stack_ann_length):
                            doomed.append(needle_ann)
                            break

                for ann in doomed:
                    target.remove_annotation(ann)

        return reader.collection.documents[0]