Пример #1
0
 def test_force_update_pmid_exists(self):
     """
     When a publication with the requested PMID is already stored,
     force_update=True should overwrite its fields with the loaded data
     while leaving its primary key untouched.
     """
     seed = {'pmid': 1, 'title': 'ASDF'}
     stored = factory.create(Publication, seed)
     stored.save()
     self.assertEqual(stored.title, seed['title'])

     load_pmids(['1'], force_update=True)

     # Fetch the record again so the checks see what the loader wrote.
     reloaded = Publication.objects.get(pmid=1)
     expected_fields = (
         ('title',
          'Formate assay in body fluids: application in methanol poisoning.'),
         ('date', date(1975, 6, 1)),
         ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
         ('journal', 'Biochem Med'),
         ('volume', '13'),
         ('issue', '2'),
         ('pages', '117-26'),
     )
     for field_name, expected in expected_fields:
         self.assertEqual(getattr(reloaded, field_name), expected)

     # The primary key must be the one the record had before the update.
     self.assertEqual(reloaded.id, stored.id)
Пример #2
0
 def test_get_multiple_pmid(self):
     """
     load_pmids should retrieve and store several publications from
     pubmed's efetch when given more than one PMID in a single call.
     """
     load_pmids(['1', '2'])

     # (pmid, expected field values), checked in this order.
     expected_by_pmid = (
         (1, (
             ('title',
              'Formate assay in body fluids: application in methanol poisoning.'),
             ('date', date(1975, 6, 1)),
             ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
             ('journal', 'Biochem Med'),
             ('volume', '13'),
             ('issue', '2'),
             ('pages', '117-26'),
         )),
         (2, (
             ('title',
              'Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution.'),
             ('date', date(1975, 10, 27)),
             ('authors', 'Bose KS, Sarma RH'),
             ('journal', 'Biochem Biophys Res Commun'),
             ('volume', '66'),
             ('issue', '4'),
             ('pages', '1173-9'),
         )),
     )
     for pmid, expected_fields in expected_by_pmid:
         publication = Publication.objects.get(pmid=pmid)
         for field_name, expected in expected_fields:
             self.assertEqual(getattr(publication, field_name), expected)
Пример #3
0
 def test_can_load_when_issue_volume_pages_are_null(self):
     """
     Some PMIDs lack an issue (e.g. 9371713), volume, or pages. Such a
     record must still load; here the missing issue is expected to be
     stored as None.
     """
     pmid_without_issue = 9371713
     load_pmids([str(pmid_without_issue)])
     loaded = Publication.objects.get(pmid=pmid_without_issue)
     self.assertIsNone(loaded.issue)
Пример #4
0
 def test_error_pub_id(self):
     """
     Asking for a publication that does not exist should emit a warning
     and must not create any Publication for that PMID.
     """
     missing_pmid = 2764472319
     load_pmids([str(missing_pmid)])
     # The lookup itself has to fail: nothing may have been stored.
     self.assertRaises(Publication.DoesNotExist,
                       Publication.objects.get,
                       pmid=missing_pmid)
Пример #5
0
 def test_load_pub_already_exists(self):
     """
     Loading a publication that already exists should do nothing when force_update is not passed.
     """
     initial = {
         'pmid': 1,
         'title': 'ASDF',
     }
     pub = factory.create(Publication, initial)
     pub.save()
     self.assertEqual(pub.title, initial['title'])
     load_pmids([
         '1',
     ])
     # Re-read the row from the database. The in-memory `pub` is never
     # touched by load_pmids, so asserting on it would pass even if the
     # stored record had been overwritten.
     stored = Publication.objects.get(pmid=1)
     self.assertEqual(stored.title, initial['title'])
     # Still the same row (a duplicate would make the get() above fail).
     self.assertEqual(stored.id, pub.id)
Пример #6
0
 def test_get_single_pmid(self):
     """
     load_pmids should retrieve and store one publication's information
     from pubmed's efetch when given a single PMID.
     """
     load_pmids(['1'])

     loaded = Publication.objects.get(pmid=1)
     expected_fields = (
         ('title',
          'Formate assay in body fluids: application in methanol poisoning.'),
         ('date', date(1975, 6, 1)),
         ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
         ('journal', 'Biochem Med'),
         ('volume', '13'),
         ('issue', '2'),
         ('pages', '117-26'),
     )
     for field_name, expected in expected_fields:
         self.assertEqual(getattr(loaded, field_name), expected)
Пример #7
0
 def test_force_update_pmid_doesnt_exist(self):
     """
     force_update=True on a PMID that has not been stored beforehand
     should still load the publication with the expected field values.
     """
     load_pmids(['1'], force_update=True)

     created = Publication.objects.get(pmid=1)
     expected_fields = (
         ('title',
          'Formate assay in body fluids: application in methanol poisoning.'),
         ('date', date(1975, 6, 1)),
         ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
         ('journal', 'Biochem Med'),
         ('volume', '13'),
         ('issue', '2'),
         ('pages', '117-26'),
     )
     for field_name, expected in expected_fields:
         self.assertEqual(getattr(created, field_name), expected)
Пример #8
0
    def format_annotations(self, annots, xrdb, full_pubs, organism=None):
        """
        Resolve raw annotations into (gene pk, publication id) pairs.

        annots is iterated for gene identifiers and indexed by each
        identifier to obtain that gene's publications.

        xrdb is the type of gene identifier that the annotations are sent
        as: None looks genes up by id, 'Entrez' by entrezid, 'Symbol' by
        standard_name (falling back to systematic_name); any other value is
        treated as the name of a cross-reference database.

        If full_pubs is true, every publication is indexed by 'id'.
        Otherwise every publication is a pubmed ID that is looked up in the
        database and, when missing, loaded with load_pmids first.

        organism, when not None, restricts the Gene lookups to that
        scientific name (the cross-reference lookup is not restricted).

        Returns the tuple (formatted annotations, genes_not_found,
        pubs_not_loaded, multiple_genes_found).  The formatted annotations
        are a frozenset of (gene pk, publication id) tuples, using
        (gene pk, None) for genes without publications; when no gene could
        be resolved it is an empty plain set instead.
        """
        genes_not_found = set()
        multiple_genes_found = set()
        pubs_not_loaded = set()
        pub_ids_by_gene = {}

        gene_lookup = Gene.objects
        if organism is not None:
            gene_lookup = gene_lookup.filter(
                organism__scientific_name=organism)

        # Validate every annotation and collect the ids of the actual
        # gene/publication objects behind it.
        for gene_key in annots:
            try:
                if xrdb is None:
                    gene = gene_lookup.get(id=gene_key)
                elif xrdb == 'Entrez':
                    gene = gene_lookup.get(entrezid=gene_key)
                elif xrdb == 'Symbol':
                    try:
                        gene = gene_lookup.get(standard_name=gene_key)
                    except Gene.DoesNotExist:
                        gene = gene_lookup.get(systematic_name=gene_key)
                else:
                    crossrefs_in_db = CrossRef.objects.filter(
                        crossrefdb__name=xrdb)
                    gene = crossrefs_in_db.get(xrid=gene_key).gene

                pub_ids = set()
                for publication in annots[gene_key]:
                    if full_pubs:
                        # Full publication database objects were sent.
                        pub_ids.add(publication['id'])
                        continue

                    # Only the pubmed ID was sent.
                    found = None
                    try:
                        found = Publication.objects.get(pmid=publication)
                    except Publication.DoesNotExist:
                        # Not in the database yet: load it, then look again.
                        logger.info(
                            "Pubmed ID %s did not exist in the database. "
                            "Loading it now.", publication)
                        load_pmids([publication])
                        try:
                            found = Publication.objects.get(pmid=publication)
                        except Publication.DoesNotExist:
                            # The pubmed ID that was passed probably does
                            # not exist.
                            logger.warning(
                                "Pubmed ID %s could not be loaded from "
                                "Pubmed server. Saving it in version as "
                                "None.", publication)
                            pubs_not_loaded.add(publication)
                    if found:
                        pub_ids.add(found.id)

                pub_ids_by_gene[gene.pk] = pub_ids

            except (Gene.DoesNotExist, CrossRef.DoesNotExist):
                genes_not_found.add(gene_key)

            except Gene.MultipleObjectsReturned:
                multiple_genes_found.add(gene_key)

        formatted = set()
        if pub_ids_by_gene:
            for gene_pk, pub_ids in pub_ids_by_gene.items():
                if pub_ids:
                    # One tuple per publication of this gene.
                    formatted.update(
                        (gene_pk, pub_id) for pub_id in pub_ids)
                else:
                    # No publications for this gene.
                    formatted.add((gene_pk, None))
            formatted = frozenset(formatted)

        return (formatted, genes_not_found, pubs_not_loaded,
                multiple_genes_found)
Пример #9
0
    def handle(self, *args, **options):
        """
        Load GO annotations for one organism and store them as genesets.

        The command
          1. looks up the creating user and the organism,
          2. loads the GO OBO file and the gzipped annotation file (from the
             remote GO locations when the 'remote' option is set, otherwise
             from the 'obo' and 'annot' paths),
          3. maps annotation identifiers to genes and PMIDs to publications,
          4. adds the annotations to the ontology and propagates them,
          5. creates or updates one public Geneset per annotated term and
             saves a Version holding the term's (gene, publication) pairs.

        Options read: 'user', 'organism', 'evcodes' (comma separated
        evidence codes to accept), 'remote', 'obo', 'annot', 'tair',
        'only_wb', 'leading', 'pseudomonas' and 'initial'.

        Calls sys.exit() after logging an error when the user, the organism
        or the OBO file cannot be loaded.
        """
        # Imported locally, as the original code did; only the 'tair'
        # option needs it.
        import re

        user_name = options.get('user')
        try:
            user = User.objects.get(username=user_name)
        except User.DoesNotExist:
            logger.error('The user %s did not exist.',
                         user_name,
                         extra={'options': options})
            sys.exit()

        try:
            org = Organism.objects.get(scientific_name=options.get('organism'))
        except Organism.DoesNotExist:
            logger.error('The organism %s did not exist.',
                         options.get('organism'),
                         extra={'options': options})
            sys.exit()

        accepted_evcodes = None
        if options.get('evcodes'):
            accepted_evcodes = set(options.get('evcodes').split(','))

        gene_ontology = go()
        remote = options.get('remote') is not None
        obo_location = GO_OBO_URL if remote else options.get('obo')
        loaded_obo = gene_ontology.load_obo(obo_location,
                                            remote_location=remote,
                                            timeout=5)

        if not loaded_obo:
            logger.error("Couldn't load OBO file %s with remote equal to %s.",
                         obo_location, remote)
            sys.exit()

        if remote:
            annot_zip_fh = urllib2.urlopen(GO_ASSOC_FTP + '.'.join(
                (GO_ASSOC_PREFIX, GO_NAMES[org.scientific_name],
                 GO_ASSOC_SUFFIX)),
                                           timeout=5)
        else:
            # The file is gzip data, so it has to be read as bytes.
            annot_zip_fh = open(options.get('annot'), 'rb')
        try:
            annot_fh = gzip.GzipFile(fileobj=io.BytesIO(annot_zip_fh.read()))
        finally:
            # Close the source handle even when reading it fails.
            annot_zip_fh.close()

        # Compile the TAIR locus pattern once instead of once per line.
        tair_regex = None
        if options.get('tair'):
            tair_regex = re.compile('AT[0-9MC]G[0-9][0-9][0-9][0-9][0-9]')

        annots = []
        load_pairs = {}
        pubs = set()

        for line in annot_fh:
            if line.startswith('!'):
                continue
            toks = line.strip().split('\t')

            (xrdb, xrid, details, goid, ref, ev,
             annot_date) = (toks[0], toks[1], toks[3], toks[4], toks[5],
                            toks[6], toks[13])

            if tair_regex is not None:
                # Prefer a TAIR locus id found in column 2, then column 9,
                # then the first alias in column 10.
                first_alias = toks[10].split('|')[0]
                if tair_regex.match(toks[2]):
                    xrid = toks[2]
                elif tair_regex.match(toks[9]):
                    xrid = toks[9]
                elif tair_regex.match(first_alias):
                    xrid = first_alias

            if options.get('only_wb') and (toks[0] != 'WB'):
                continue

            if details == 'NOT':
                continue
            if accepted_evcodes is not None and ev not in accepted_evcodes:
                continue

            if options.get('leading') is not None:
                # Drop the leading "<prefix>:" part of the identifier.
                xrid = xrid.split(':')[1]

            load_pairs.setdefault(xrdb, []).append(xrid)

            for ref_item in ref.split('|'):
                if ref_item.startswith('PMID:'):
                    pubs.add(ref_item.split(':')[1])
                else:
                    logger.info("Unknown publication key %s", ref_item)
            annots.append((xrdb, xrid, goid, ref, annot_date))

        xref_cache = {}

        if options.get('pseudomonas'):
            logger.info('Pseudomonas entered')
            # NOTE(review): these cache keys use the un-remapped xrdb while
            # the lookup further down applies DB_REMAP first - confirm that
            # DB_REMAP never contains a database used with this option.
            for (xrdb, xrids) in load_pairs.iteritems():
                gene_objs = Gene.objects.filter(systematic_name__in=xrids)
                logger.info(
                    "Mapped %s Pseudomonas genes from the database using gene systematic name.",
                    gene_objs.count())
                for gene_obj in gene_objs:
                    xref_cache[(xrdb, gene_obj.systematic_name)] = gene_obj

        else:
            for (xrdb, xrids) in load_pairs.iteritems():
                if xrdb in DB_REMAP:
                    xrdb = DB_REMAP[xrdb]
                try:
                    xrdb_obj = CrossRefDB.objects.get(name=xrdb)
                except CrossRefDB.DoesNotExist:
                    logger.warning("Couldn't find the cross reference DB %s.",
                                   xrdb)
                    continue
                xrid_objs = CrossRef.objects.filter(
                    crossrefdb=xrdb_obj).filter(xrid__in=xrids)
                logger.info("Mapped %s cross references from %s",
                            xrid_objs.count(), xrdb)
                for xrid_obj in xrid_objs:
                    xref_cache[(xrdb, xrid_obj.xrid)] = xrid_obj.gene

        load_pmids(pubs)
        pub_cache = {}
        pub_values = Publication.objects.filter(pmid__in=pubs).only(
            'id', 'pmid').values()
        for pub_row in pub_values:
            pub_cache[pub_row['pmid']] = pub_row['id']

        for (xrdb, xrid, goid, ref, annot_date) in annots:
            if xrdb in DB_REMAP:
                xrdb = DB_REMAP[xrdb]
            try:
                gene = xref_cache[(xrdb, xrid)]
            except KeyError:
                logger.info("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
                continue
            # The last PMID reference decides: a later PMID that is not in
            # the cache resets the publication to None.
            pub = None
            for ref_item in ref.split('|'):
                if ref_item.startswith('PMID:'):
                    try:
                        pub = pub_cache[int(ref_item.split(':')[1])]
                    except (KeyError, ValueError):
                        # Unknown PMID, or a PMID that is not a number.
                        pub = None
            gene_ontology.add_annotation(go_id=goid,
                                         gid=gene.pk,
                                         ref=pub,
                                         date=annot_date,
                                         direct=True)

        gene_ontology.populated = True  # mark annotated
        gene_ontology.propagate()  # propagate annotations

        # Evidence clause of the abstract; it is the same for every term.
        # accepted_evcodes is None when no 'evcodes' option was given, and
        # sorting keeps the generated text stable between runs.
        evlist = sorted(accepted_evcodes) if accepted_evcodes else []
        evclause = ''
        if evlist:
            if len(evlist) == 1:
                codes = evlist[0]
            else:
                codes = ', '.join(evlist[:-1]) + ' or ' + evlist[-1]
            evclause = (' Only annotations with evidence coded as ' + codes +
                        ' are included.')
        propagation_note = ('Annotations are propagated through transitive '
                            'closure as recommended by the GO Consortium.' +
                            evclause)

        for (term_id, term) in gene_ontology.go_terms.iteritems():
            if not term.annotations:
                continue

            # make first 50 chars into a slug
            slug = slugify(' '.join(
                (term.go_id, org.scientific_name, term.full_name)))[:50]

            namespace = GO_NAMESPACE_MAP[term.get_namespace()]
            go_id = term.go_id.split(':')[1]
            title = 'GO' + '-' + namespace + '-' + go_id + ':' + term.full_name

            # Build the abstract. It gets its own name so that a term
            # without a description can neither hit an unbound variable nor
            # reuse the version description of the previous term.
            if term.description:
                abstract = term.description + ' ' + propagation_note
            else:
                logger.info("No description on term %s", term)
                abstract = propagation_note

            # Get the geneset, creating it or updating stale fields.
            try:
                gs_obj = Geneset.objects.get(slug=slug, creator=user)
            except Geneset.DoesNotExist:
                gs_obj = Geneset(title=title,
                                 slug=slug,
                                 creator=user,
                                 organism=org,
                                 public=True,
                                 abstract=abstract)
                changed = True
            else:
                changed = False  # flag to know if we need to call save

                # all these genesets should be public
                if not gs_obj.public:
                    gs_obj.public = True
                    changed = True

                if gs_obj.title != title:
                    gs_obj.title = title
                    changed = True

                if gs_obj.abstract != abstract:
                    gs_obj.abstract = abstract
                    changed = True

            if changed:
                gs_obj.save()

            if options.get('initial'):
                # Disable the commit field's auto_now_add; this allows us
                # to set a prior annotation date.
                commit_date = Version._meta.get_field_by_name(
                    'commit_date')[0]
                commit_date.auto_now_add = False
                try:
                    logger.info(
                        'Initial load. Need to construct versions of %s from annotation date.',
                        term.go_id)
                    date_annots = {}
                    for annotation in term.annotations:
                        version_date = timezone.make_aware(
                            datetime.strptime(annotation.date, '%Y%m%d'),
                            timezone.get_default_timezone())
                        date_annots.setdefault(version_date,
                                               []).append(annotation)

                    annots_as_of_date = set()
                    prior_annots = set()
                    prior_version = None
                    # One version per annotation date, oldest first, each
                    # holding everything annotated up to that date.
                    for version_date in sorted(date_annots):
                        dated_annots = date_annots[version_date]
                        annots_as_of_date.update(
                            (annotation.gid, annotation.ref)
                            for annotation in dated_annots)
                        if annots_as_of_date == prior_annots:
                            # nothing changed on this date
                            continue
                        # Snapshot, so later dates do not alter the set
                        # already handed to this version.
                        snapshot = annots_as_of_date.copy()
                        v_obj = Version(geneset=gs_obj,
                                        creator=user,
                                        parent=prior_version,
                                        commit_date=version_date)
                        v_obj.description = "Added " + str(
                            len(dated_annots)
                        ) + " annotations from GO based on the dates provided in the GO annotation file."
                        v_obj.annotations = snapshot
                        v_obj.save()
                        prior_version = v_obj
                        prior_annots = snapshot
                finally:
                    # Re-enable auto_now_add even if building a version
                    # failed, so the field is not left altered.
                    commit_date.auto_now_add = True
            else:
                # Compare against the most recent version, if there is one.
                most_recent_versions = Version.objects.filter(
                    geneset=gs_obj).order_by('-commit_date')[:1]
                current_annots = set([(annotation.gid, annotation.ref)
                                      for annotation in term.annotations])
                version_description = ''
                most_recent_version = None
                if most_recent_versions:
                    most_recent_version = most_recent_versions[0]
                    if most_recent_version.commit_date > timezone.now():
                        logger.error('Version from the future: %s.',
                                     most_recent_version)
                    new = current_annots - most_recent_version.annotations
                    removed = most_recent_version.annotations - current_annots
                    if new or removed:
                        version_description = 'Added ' + str(
                            len(new)) + ' and removed ' + str(
                                len(removed)) + ' annotations from GO.'
                else:
                    version_description = 'Created with ' + str(
                        len(current_annots)) + ' annotations from GO.'
                # An empty description means nothing changed: no new version.
                if version_description:
                    v_obj = Version(geneset=gs_obj,
                                    creator=user,
                                    parent=most_recent_version,
                                    commit_date=timezone.now())
                    v_obj.description = version_description
                    v_obj.annotations = current_annots
                    v_obj.save()