def read_bioc(request, pubmed_id, format_type):
    '''
        Serve a single document, looked up by its `document_id`, as BioC.

        The document is serialised via `as_bioc_with_passages()`, i.e. without
        any annotations. Responds with a 404 when no Document matches.

        request: the incoming HTTP request, forwarded to `bioc_writer`
        pubmed_id: value matched against `Document.document_id`
        format_type: 'json' yields an application/json response; any other
            value yields text/xml
    '''
    doc = get_object_or_404(Document, document_id=pubmed_id)

    # Build the writer once, only after the lookup succeeded (the original
    # created a second, immediately discarded writer before the lookup).
    writer = bioc_writer(request)
    bioc_document = doc.as_bioc_with_passages()
    writer.collection.add_document(bioc_document)

    if format_type == 'json':
        writer_json = bioc_as_json(writer)
        return HttpResponse(writer_json, content_type='application/json')
    else:
        return HttpResponse(writer, content_type='text/xml')
def group_pubtator_bioc(request, group_pk, format_type):
    '''
        Serve every document of a Group as one BioC collection.

        Each document is serialised via `as_bioc_with_pubtator_annotations()`.
        Responds with a 404 when no Group has the primary key `group_pk`.

        format_type: 'json' yields an application/json response; any other
            value yields text/xml
    '''
    target_group = get_object_or_404(Group, pk=group_pk)
    bioc_out = bioc_writer(request)

    # One BioC document per group member, appended in iteration order
    for member in target_group.get_documents():
        bioc_out.collection.add_document(
            member.as_bioc_with_pubtator_annotations())

    if format_type != 'json':
        return HttpResponse(bioc_out, content_type='text/xml')

    return HttpResponse(bioc_as_json(bioc_out),
                        content_type='application/json')
Exemplo n.º 3
0
 def as_writer(self, request=None):
     '''
         Return a BioC writer whose collection contains this object
         serialised through `as_bioc_with_passages()`.

         request: forwarded unchanged to `bioc_writer` (defaults to None)
     '''
     # Imported at call time, as in the original implementation
     from mark2cure.common.formatter import bioc_writer

     bioc_out = bioc_writer(request)
     passages_doc = self.as_bioc_with_passages()
     bioc_out.collection.add_document(passages_doc)
     return bioc_out
Exemplo n.º 4
0
    def as_writer(self, documents=None):
        '''
            Return a BioC Writer holding one BioC document per requested
            Document primary key.

            Documents for which the pubtator query returns content are built
            from that stored BioC content (annotations cleared, `section` and
            `id` infons set on each passage). Every remaining pk falls back
            to a BioC document generated from the Document's own sections.

            documents: iterable of Document instances, integer pks, or
                purely numeric pk strings. Entries of any other type, and
                non-numeric strings, are ignored.

            Raises ValueError when `documents` is empty / None or contains no
            usable primary key.
        '''
        if not documents:
            raise ValueError('No documents supplied to generator writer')

        from .models import Document

        # Normalise every accepted input to its pk as a string (de-duplicated)
        pending_pks = set()
        for d in documents:
            if isinstance(d, Document):
                pending_pks.add(str(d.pk))
            elif isinstance(d, (str, unicode)):
                # The digit check must apply to byte strings as well: the pks
                # are interpolated straight into the SQL text below.
                # (The original `A or B and C` only checked unicode values.)
                if d.isdigit():
                    pending_pks.add(d)
            elif isinstance(d, (int, long)) and not isinstance(d, bool):
                pending_pks.add(str(d))

        if not pending_pks:
            # Avoid sending an invalid `IN ()` clause to the database
            raise ValueError('No documents supplied to generator writer')

        with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(','.join(sorted(pending_pks)))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            rows = [(x[0], x[1], x[2]) for x in c.fetchall()]
        finally:
            c.close()

        # Passage index -> section name.
        # NOTE(review): a document with more than two passages (or fewer
        # section ids than passages) raises IndexError below -- confirm that
        # two passages per document is a guaranteed invariant.
        section_names = ('title', 'paragraph')

        writer = bioc_writer(None)
        for document_pk, content, section_id_csv in rows:
            section_ids = section_id_csv.split(',')
            r = BioCReader(source=content)
            r.read()

            doc = r.collection.documents[0]
            doc.put_infon('document_pk', str(document_pk))
            for idx, passage in enumerate(doc.passages):
                passage.clear_annotations()

                passage.put_infon('section', section_names[idx])
                passage.put_infon('id', str(section_ids[idx]))

            writer.collection.add_document(doc)

            # Handled via pubtator content; no fallback needed for this pk
            pending_pks.discard(str(document_pk))

        # Capture all the documents not available via pubtators
        for document_pk_str in sorted(pending_pks):
            # Can optimize this model retrieval but should rarely occur
            document_model = Document.objects.get(pk=document_pk_str)

            bioc_document = BioCDocument()
            bioc_document.id = str(document_model.document_id)
            bioc_document.put_infon('document_pk', document_pk_str)

            passage_offset = 0
            for idx, section in enumerate(document_model.available_sections()):
                passage = BioCPassage()
                passage.put_infon('section', section_names[idx])
                passage.put_infon('id', str(section.pk))
                # (TODO) Missing a "type" infon?
                passage.text = section.text

                passage.offset = str(passage_offset)
                # +1 leaves a one-character gap between consecutive passages
                passage_offset += len(passage.text) + 1

                bioc_document.add_passage(passage)

            writer.collection.add_document(bioc_document)
        return writer
Exemplo n.º 5
0
    def as_writer(self, documents=None):
        '''
            Return a BioC Writer holding one BioC document per requested
            Document primary key.

            Documents for which the pubtator query returns content are built
            from that stored BioC content (annotations cleared, `section` and
            `id` infons set on each passage). Every remaining pk falls back
            to a BioC document generated from the Document's own sections.

            documents: iterable of Document instances, integer pks, or
                purely numeric pk strings. Entries of any other type, and
                non-numeric strings, are ignored.

            Raises ValueError when `documents` is empty / None or contains no
            usable primary key.
        '''
        if not documents:
            raise ValueError('No documents supplied to generator writer')

        from .models import Document

        # Normalise every accepted input to its pk as a string (de-duplicated)
        pending_pks = set()
        for d in documents:
            if isinstance(d, Document):
                pending_pks.add(str(d.pk))
            elif isinstance(d, (str, unicode)):
                # The digit check must apply to byte strings as well: the pks
                # are interpolated straight into the SQL text below.
                # (The original `A or B and C` only checked unicode values.)
                if d.isdigit():
                    pending_pks.add(d)
            elif isinstance(d, (int, long)) and not isinstance(d, bool):
                pending_pks.add(str(d))

        if not pending_pks:
            # Avoid sending an invalid `IN ()` clause to the database
            raise ValueError('No documents supplied to generator writer')

        # One row per document that has non-empty pubtator content:
        # (document pk, one content value, comma-joined section ids)
        cmd_str = '''
            SELECT
                `document_pubtator`.`document_id`,
                ANY_VALUE(`document_pubtator`.`content`),
                GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`

            FROM `document_pubtator`

            JOIN `document_section`
                ON `document_section`.`document_id` = `document_pubtator`.`document_id`

            WHERE `document_pubtator`.`content` != '' AND `document_pubtator`.`document_id` IN ({0})

            GROUP BY `document_pubtator`.`document_id`;
        '''.format(','.join(sorted(pending_pks)))
        c = connection.cursor()
        try:
            c.execute(cmd_str)
            rows = [(x[0], x[1], x[2]) for x in c.fetchall()]
        finally:
            c.close()

        # Passage index -> section name.
        # NOTE(review): a document with more than two passages (or fewer
        # section ids than passages) raises IndexError below -- confirm that
        # two passages per document is a guaranteed invariant.
        section_names = ('title', 'paragraph')

        writer = bioc_writer(None)
        for document_pk, content, section_id_csv in rows:
            section_ids = section_id_csv.split(',')
            r = BioCReader(source=content)
            r.read()

            doc = r.collection.documents[0]
            doc.put_infon('document_pk', str(document_pk))
            for idx, passage in enumerate(doc.passages):
                passage.clear_annotations()

                passage.put_infon('section', section_names[idx])
                passage.put_infon('id', str(section_ids[idx]))

            writer.collection.add_document(doc)

            # Handled via pubtator content; no fallback needed for this pk
            pending_pks.discard(str(document_pk))

        # Capture all the documents not available via pubtators
        for document_pk_str in sorted(pending_pks):
            # Can optimize this model retrieval but should rarely occur
            document_model = Document.objects.get(pk=document_pk_str)

            bioc_document = BioCDocument()
            bioc_document.id = str(document_model.document_id)
            bioc_document.put_infon('document_pk', document_pk_str)

            passage_offset = 0
            for idx, section in enumerate(document_model.available_sections()):
                passage = BioCPassage()
                passage.put_infon('section', section_names[idx])
                passage.put_infon('id', str(section.pk))
                # (TODO) Missing a "type" infon?
                passage.text = section.text

                passage.offset = str(passage_offset)
                # +1 leaves a one-character gap between consecutive passages
                passage_offset += len(passage.text) + 1

                bioc_document.add_passage(passage)

            writer.collection.add_document(bioc_document)
        return writer