Пример #1
0
    def handle(self, *args, **options):
        """Build human Disease Ontology genesets from OMIM gene annotations.

        Steps, in order:
          1. Look up the creator user (``options['user']``) and the human
             organism; exit with a non-zero status if either is missing.
          2. Download the OBO file at ``DO_URL``, parse it into an ontology
             object and collect the ``xref: OMIM:`` ids of every DOID term.
          3. Download ``mim2gene.txt`` and ``genemap`` from ``OMIM_FTP`` and
             derive, per OMIM disease id, its (gene id, status) pairs.
          4. Annotate ontology terms with the mapped genes, propagate the
             annotations, then create/update one public ``Geneset`` per
             annotated term and add a ``Version`` when annotations changed.
        """
        user_name = options.get('user')
        try:
            user = User.objects.get(username=user_name)
        except User.DoesNotExist:
            logger.error('The user %s did not exist.',
                         user_name,
                         extra={'options': options})
            # A missing prerequisite is a failure; report it in the status.
            sys.exit(1)

        human_name = 'Homo sapiens'  # Only exists for human
        try:
            org = Organism.objects.get(scientific_name=human_name)
        except Organism.DoesNotExist:
            logger.error('The organism %s did not exist.',
                         human_name,
                         extra={'options': options})
            sys.exit(1)

        obo_r = requests.get(DO_URL)
        disease_ontology = go()
        disease_ontology.parse(StringIO(obo_r.text))

        # Map each DOID to the set of OMIM ids cross-referenced in its
        # [Term] stanza. A stanza ends at the first blank line (or at EOF).
        doid_omim = {}
        in_term = False
        doid = None
        for line in obo_r.text.splitlines():
            if line == '[Term]':
                in_term = True
                doid = None  # never inherit the id of the previous stanza
                continue
            if not in_term:
                continue
            if line == '':
                in_term = False
                continue
            if line.startswith('id:'):
                id_match = re.search('DOID:[0-9]+', line)
                doid = id_match.group(0) if id_match else None
            elif line.startswith('xref: OMIM:') and doid is not None:
                omim_match = re.search('[0-9]+', line)
                if omim_match:
                    doid_omim.setdefault(doid, set()).add(omim_match.group(0))

        # NOTE(review): `retr` is not a stock requests.Session method;
        # presumably the session is patched for FTP elsewhere - confirm.
        ftp_auth = ("anonymous", "*****@*****.**")
        s = requests.Session()

        # MIM number -> gene id, restricted to the entry types in LIMIT_TYPE.
        mim_gene = {}
        mim2gene_text = s.retr(OMIM_FTP + 'mim2gene.txt', auth=ftp_auth).text
        for line in mim2gene_text.splitlines():
            toks = line.split('\t')
            if len(toks) < 3:
                # Blank or otherwise short lines carry no mapping.
                continue
            mim, gtype, gid = toks[0], toks[1], toks[2]
            if gtype in LIMIT_TYPE:
                if mim in mim_gene:
                    logger.warning("MIM already exists: %s", mim)
                mim_gene[mim] = gid

        # OMIM disease id -> mim_disease record holding (gene id, status)
        # tuples. The choice of fields relies on info from the genemap.key
        # file from omim.
        mimdiseases = {}
        genemap_text = s.retr(OMIM_FTP + "genemap", auth=ftp_auth).text
        for l in genemap_text.splitlines():
            fields = l.split('|')
            if len(fields) < 16:
                # Not a full genemap record; the indexes below need 16 fields.
                continue
            status = fields[6].strip()
            mim_geneid = fields[9].strip()
            disorders = fields[13].strip()
            # Fields 14 and 15 are continuations of the disorder field.
            for continuation in (fields[14].strip(), fields[15].strip()):
                if continuation != '':
                    disorders = disorders + ' ' + continuation

            if (disorders == '' or status not in LIMIT_STATUS
                    or mim_geneid not in mim_gene):
                continue

            tuple_gid_status = (mim_gene[mim_geneid], status)
            for d in disorders.split(';'):
                # TODO: Make sure to include ? and [
                if '[' in d or '?' in d:
                    continue
                mim_info = re.search(FIND_MIMID, d)
                if not mim_info:
                    continue
                info_split = mim_info.group(0).split(' ')
                mim_disease_id = info_split[1].strip()
                mim_phetype = info_split[2].strip()
                if mim_phetype != LIMIT_PHENO:
                    continue
                if mim_disease_id not in mimdiseases:
                    new_entry = mim_disease()
                    new_entry.mimid = mim_disease_id
                    new_entry.phe_mm = mim_phetype
                    mimdiseases[mim_disease_id] = new_entry
                mim_entry = mimdiseases[mim_disease_id]
                if '{' in d:
                    mim_entry.is_susceptibility = 1
                if tuple_gid_status not in mim_entry.genetuples:
                    mim_entry.genetuples.append(tuple_gid_status)

        logger.debug(disease_ontology.go_terms)
        entrez_gid = {}  # Entrez id -> Gene primary key, to avoid re-querying
        for doid, omim_ids in doid_omim.items():
            term = disease_ontology.get_term(doid)
            if term is None:
                continue
            logger.info("Processing %s", term)
            for omim_id in omim_ids:
                if omim_id not in mimdiseases:
                    continue
                for g in mimdiseases[omim_id].genetuples:
                    try:
                        entrez = int(g[0])
                    except ValueError:
                        logger.warning(
                            "Skipping non-numeric gene id %r for OMIM %s.",
                            g[0], omim_id)
                        continue
                    if entrez not in entrez_gid:
                        try:
                            entrez_gid[entrez] = Gene.objects.get(
                                entrezid=entrez).id
                        except Gene.DoesNotExist:
                            # One unknown gene should not abort the load.
                            logger.warning(
                                "Couldn't find a gene with Entrez id %s.",
                                entrez)
                            continue
                    term.add_annotation(gid=entrez_gid[entrez], ref=None)

        disease_ontology.populated = True  # mark annotated
        disease_ontology.propagate()  # prop annotations

        for (term_id, term) in disease_ontology.go_terms.iteritems():
            if not term.annotations:
                continue
            logger.info("Creating %s", term)
            slug = slugify(' '.join(
                (term.go_id, org.scientific_name,
                 term.full_name)))[:50]  # make first 50 chars into a slug

            doid = term.go_id
            do_num = doid.split(':')[1]
            # construct title
            title = 'DO' + '-' + do_num + ':' + term.full_name

            # construct abstract; OMIM ids are sorted so the text is stable
            # from run to run (set iteration order is not guaranteed).
            omim_clause = ''
            omim_list = sorted(doid_omim.get(doid, ()))
            if omim_list:
                omim_clause = ' Annotations directly to this term are provided by the OMIM disease ID'
                if len(omim_list) == 1:
                    omim_clause = omim_clause + ' ' + omim_list[0]
                else:
                    omim_clause = omim_clause + 's ' + ', '.join(
                        omim_list[:-1]) + ' and ' + omim_list[-1]
                omim_clause = omim_clause + '.'

            abstract = ''
            if term.description:
                abstract += term.description
            abstract += ' Annotations from child terms in the disease ontology are propagated through transitive closure.' + omim_clause
            logger.info(abstract)

            # get geneset, creating it or syncing its fields as needed
            try:
                gs_obj = Geneset.objects.get(slug=slug, creator=user)
                changed = False  # flag to know if we need to call save

                # all these genesets should be public
                if not gs_obj.public:
                    gs_obj.public = True
                    changed = True

                if gs_obj.title != title:
                    gs_obj.title = title
                    changed = True

                if gs_obj.abstract != abstract:
                    gs_obj.abstract = abstract
                    changed = True

            except Geneset.DoesNotExist:
                gs_obj = Geneset(title=title,
                                 slug=slug,
                                 creator=user,
                                 organism=org,
                                 public=True,
                                 abstract=abstract)
                changed = True

            # if anything changed
            if changed:
                gs_obj.save()

            # load annotations; a new Version is written only for a brand-new
            # geneset or when the annotation set differs from the latest one.
            most_recent_versions = Version.objects.filter(
                geneset=gs_obj).order_by('-commit_date')[:1]
            annots = set((annotation.gid, annotation.ref)
                         for annotation in term.annotations)
            version_description = ''
            most_recent_version = None
            if most_recent_versions:
                most_recent_version = most_recent_versions[0]
                if most_recent_version.commit_date > timezone.now():
                    logger.error('Version from the future: %s.',
                                 most_recent_version)
                new = annots - most_recent_version.annotations
                removed = most_recent_version.annotations - annots
                if new or removed:
                    version_description = (
                        'Added ' + str(len(new)) + ' and removed ' +
                        str(len(removed)) +
                        ' annotations from OMIM and the disease ontology.')
            else:
                version_description = (
                    'Created with ' + str(len(annots)) +
                    ' annotations from OMIM and the disease ontology.')
            if version_description:
                v_obj = Version(geneset=gs_obj,
                                creator=user,
                                parent=most_recent_version,
                                commit_date=timezone.now())
                v_obj.description = version_description
                v_obj.annotations = annots
                v_obj.save()
Пример #2
0
    def handle(self, *args, **options):
        """Create or refresh genesets from KEGG records for one organism.

        Options read:
          user              -- username recorded as creator of new objects.
          organism          -- scientific name of the organism to load.
          kegg_record_types -- optional comma-separated record types;
                               defaults to ``KEGG_RECORD_TYPES``.
          gene_id           -- optional gene identifier kind of the KEGG
                               members: ``systematic_name`` or
                               ``standard_name``; Entrez ids when omitted.

        For every KEGG record a ``Geneset`` is created (or its title and
        abstract are synced) and a new ``Version`` is saved when the set of
        annotated genes differs from the most recent version.

        Exits with a non-zero status if the user or organism is missing, and
        returns ``False`` when an unsupported ``gene_id`` is requested.
        """
        user_name = options.get('user')
        try:
            user = User.objects.get(username=user_name)
        except User.DoesNotExist:
            logger.error('The user %s did not exist.', user_name, extra={'options': options})
            # A missing prerequisite is a failure; report it in the status.
            sys.exit(1)

        organism_name = options.get('organism')
        try:
            org = Organism.objects.get(scientific_name=organism_name)
        except Organism.DoesNotExist:
            logger.error('The organism %s did not exist.', organism_name, extra={'options': options})
            sys.exit(1)

        version = get_kegg_version(KEGG_URL_BASE)
        if version is None:
            logger.error('The KEGG api may have changed. Release no longer starts with "Release".')
            # Keep going, but never concatenate None into a description.
            version_label = 'unknown'
        else:
            logger.info('Working with KEGG version %s.', version)
            version_label = version

        requested_types = options.get('kegg_record_types')
        if requested_types:
            kegg_record_types = tuple(requested_types.replace(" ", "").split(","))
            logger.info('Requested KEGG Record Types are: %s', str(kegg_record_types))
        else:
            kegg_record_types = KEGG_RECORD_TYPES
            logger.info('Using pre-set KEGG Record Types')

        # Resolve once which Gene field the KEGG member ids are matched on.
        # None marks an unsupported choice, reported when first needed.
        gene_id = options.get('gene_id')
        if gene_id:
            lookup_field = {
                'systematic_name': 'systematic_name',
                'standard_name': 'standard_name',
            }.get(gene_id)
        else:
            lookup_field = 'entrezid'

        for record_type in kegg_record_types:
            logger.info('Starting record type %s.', record_type)
            record_members = get_kegg_members(KEGG_URL_BASE, KEGG_NAMES[org.scientific_name], record_type)
            for (record, members) in record_members.iteritems():
                if record_type == 'Module':
                    record = record.split('_')[-1]  # for modules, they are prefixed with species_
                slug = slugify(org.scientific_name + ' ' + record)
                gs_info = get_kegg_info(KEGG_URL_BASE, record)
                # make title more search friendly
                gs_info['title'] = 'KEGG-' + record_type + '-' + record + ': ' + gs_info['title']
                try:
                    geneset = Geneset.objects.get(slug=slug)
                    changed = False
                    if geneset.title != gs_info['title']:
                        geneset.title = gs_info['title']
                        changed = True
                    if geneset.abstract != gs_info['abstract']:
                        geneset.abstract = gs_info['abstract']
                        changed = True
                    if changed:
                        geneset.save()
                except Geneset.DoesNotExist:
                    geneset = Geneset(creator=user, title=gs_info['title'], organism=org, abstract=gs_info['abstract'], slug=slug, public=True)
                    geneset.save()

                if lookup_field is None:
                    logger.error('gene_id entered is not supported (yet)')
                    return False
                matched_genes = Gene.objects.filter(**{lookup_field + '__in': members})
                annots = frozenset((gene.pk, None) for gene in matched_genes)

                # A new Version is written only for a brand-new geneset or
                # when the annotations differ from the latest version.
                most_recent_versions = Version.objects.filter(geneset=geneset).order_by('-commit_date')[:1]
                description = ''
                most_recent_version = None
                if most_recent_versions:
                    most_recent_version = most_recent_versions[0]
                    if most_recent_version.commit_date > timezone.now():
                        logger.error('Version from the future: %s.', most_recent_version)
                    new = annots - most_recent_version.annotations
                    removed = most_recent_version.annotations - annots
                    if new or removed:
                        description = 'Added ' + str(len(new)) + ' and removed ' + str(len(removed)) + ' annotations from KEGG version ' + version_label + '.'
                else:
                    description = 'Created with ' + str(len(annots)) + ' annotations from KEGG version ' + version_label + '.'
                if description:
                    v_obj = Version(geneset=geneset, creator=user, parent=most_recent_version, commit_date=timezone.now())
                    v_obj.description = description
                    v_obj.annotations = annots
                    v_obj.save()
Пример #3
0
    def handle(self, *args, **options):
        """Load Gene Ontology annotations and store them as genesets.

        Options read:
          user, organism -- creator username and organism scientific name.
          evcodes        -- optional comma-separated evidence codes to keep.
          remote         -- when not None, fetch the OBO and annotation files
                            from ``GO_OBO_URL`` / ``GO_ASSOC_FTP``; otherwise
                            read the paths given by ``obo`` and ``annot``.
          tair           -- prefer a TAIR locus id found in columns 3, 10
                            or 11 of the annotation line as the gene id.
          only_wb        -- keep only lines whose database is ``WB``.
          leading        -- when not None, drop the ``prefix:`` of each id.
          pseudomonas    -- map ids through ``Gene.systematic_name`` instead
                            of cross references.
          initial        -- build one ``Version`` per annotation date
                            instead of a single version dated now.

        Exits with a non-zero status if the user, the organism or the OBO
        file cannot be loaded.
        """
        user_name = options.get('user')
        try:
            user = User.objects.get(username=user_name)
        except User.DoesNotExist:
            logger.error('The user %s did not exist.',
                         user_name,
                         extra={'options': options})
            # A missing prerequisite is a failure; report it in the status.
            sys.exit(1)

        organism_name = options.get('organism')
        try:
            org = Organism.objects.get(scientific_name=organism_name)
        except Organism.DoesNotExist:
            logger.error('The organism %s did not exist.',
                         organism_name,
                         extra={'options': options})
            sys.exit(1)

        accepted_evcodes = None
        if options.get('evcodes'):
            accepted_evcodes = set(options.get('evcodes').split(','))

        gene_ontology = go()
        remote = options.get('remote') is not None
        obo_location = GO_OBO_URL if remote else options.get('obo')
        loaded_obo = gene_ontology.load_obo(obo_location,
                                            remote_location=remote,
                                            timeout=5)
        if not loaded_obo:
            logger.error("Couldn't load OBO file %s with remote equal to %s.",
                         obo_location, remote)
            sys.exit(1)

        if remote:
            assoc_name = '.'.join((GO_ASSOC_PREFIX,
                                   GO_NAMES[org.scientific_name],
                                   GO_ASSOC_SUFFIX))
            annot_zip_fh = urllib2.urlopen(GO_ASSOC_FTP + assoc_name,
                                           timeout=5)
        else:
            # The file is gzip data, so it must be read as bytes.
            annot_zip_fh = open(options.get('annot'), 'rb')
        try:
            annot_fh = gzip.GzipFile(fileobj=io.BytesIO(annot_zip_fh.read()))
        finally:
            annot_zip_fh.close()

        # Per-line options and the TAIR pattern are loop invariants.
        tair_regex = None
        if options.get('tair'):
            import re
            tair_regex = re.compile('AT[0-9MC]G[0-9][0-9][0-9][0-9][0-9]')
        only_wb = options.get('only_wb')
        strip_leading = options.get('leading') is not None

        annots = []
        load_pairs = {}  # database name -> ids seen for that database
        pubs = set()

        for line in annot_fh:
            if line.startswith('!'):
                continue
            toks = line.strip().split('\t')

            (xrdb, xrid, details, goid, ref, ev,
             date) = (toks[0], toks[1], toks[3], toks[4], toks[5], toks[6],
                      toks[13])

            if tair_regex is not None:
                first_alias = toks[10].split('|')[0]
                if tair_regex.match(toks[2]):
                    xrid = toks[2]
                elif tair_regex.match(toks[9]):
                    xrid = toks[9]
                elif tair_regex.match(first_alias):
                    xrid = first_alias

            if only_wb and (toks[0] != 'WB'):
                continue

            # The qualifier column may combine flags with '|' (for example
            # 'NOT|contributes_to'); any NOT marks a negative annotation.
            if 'NOT' in details.split('|'):
                continue
            if accepted_evcodes is not None and ev not in accepted_evcodes:
                continue

            if strip_leading:
                xrid = xrid.split(':')[1]

            load_pairs.setdefault(xrdb, []).append(xrid)

            for ref_item in ref.split('|'):
                if ref_item.startswith('PMID:'):
                    pubs.add(ref_item.split(':')[1])
                else:
                    logger.info("Unknown publication key %s", ref_item)
            annots.append((xrdb, xrid, goid, ref, date))

        xref_cache = {}  # (database name, id) -> Gene

        if options.get('pseudomonas'):
            logger.info('Pseudomonas entered')
            # NOTE(review): keys here use the raw database name while the
            # lookup below applies DB_REMAP first; confirm no Pseudomonas
            # database name is remapped.
            for (xrdb, xrids) in load_pairs.iteritems():
                gene_objs = Gene.objects.filter(systematic_name__in=xrids)
                logger.info(
                    "Mapped %s Pseudomonas genes from the database using gene systematic name.",
                    gene_objs.count())
                for gene_obj in gene_objs:
                    xref_cache[(xrdb, gene_obj.systematic_name)] = gene_obj

        else:
            for (xrdb, xrids) in load_pairs.iteritems():
                if xrdb in DB_REMAP:
                    xrdb = DB_REMAP[xrdb]
                try:
                    xrdb_obj = CrossRefDB.objects.get(name=xrdb)
                except CrossRefDB.DoesNotExist:
                    logger.warning("Couldn't find the cross reference DB %s.",
                                   xrdb)
                    continue
                xrid_objs = CrossRef.objects.filter(
                    crossrefdb=xrdb_obj).filter(xrid__in=xrids)
                logger.info("Mapped %s cross references from %s",
                            xrid_objs.count(), xrdb)
                for xrid_obj in xrid_objs:
                    xref_cache[(xrdb, xrid_obj.xrid)] = xrid_obj.gene

        load_pmids(pubs)
        pub_cache = {}  # pmid -> Publication id
        pub_values = Publication.objects.filter(pmid__in=pubs).only(
            'id', 'pmid').values()
        for pub in pub_values:
            pub_cache[pub['pmid']] = pub['id']

        for (xrdb, xrid, goid, ref, date) in annots:
            if xrdb in DB_REMAP:
                xrdb = DB_REMAP[xrdb]
            try:
                gene = xref_cache[(xrdb, xrid)]
            except KeyError:
                logger.info("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
                continue
            pub = None
            for ref_item in ref.split('|'):
                if ref_item.startswith('PMID:'):
                    try:
                        pub = pub_cache[int(ref_item.split(':')[1])]
                    except (KeyError, ValueError):
                        # Unknown or malformed PMID: no publication.
                        pub = None
            gene_ontology.add_annotation(go_id=goid,
                                         gid=gene.pk,
                                         ref=pub,
                                         date=date,
                                         direct=True)

        gene_ontology.populated = True  # mark annotated
        gene_ontology.propagate()  # prop annotations

        # Evidence clause of the abstract; identical for every term. Codes
        # are sorted so the text is stable from run to run, and a missing
        # 'evcodes' option simply yields no clause.
        evlist = sorted(accepted_evcodes) if accepted_evcodes else []
        evclause = ''
        if evlist:
            evclause = ' Only annotations with evidence coded as '
            if len(evlist) == 1:
                evclause = evclause + evlist[0]
            else:
                evclause = evclause + ', '.join(
                    evlist[:-1]) + ' or ' + evlist[-1]
            evclause = evclause + ' are included.'

        for (term_id, term) in gene_ontology.go_terms.iteritems():
            if not term.annotations:
                continue
            slug = slugify(' '.join(
                (term.go_id, org.scientific_name,
                 term.full_name)))[:50]  # make first 50 chars into a slug

            namespace = GO_NAMESPACE_MAP[term.get_namespace()]
            go_id = term.go_id.split(':')[1]
            # construct title
            title = 'GO' + '-' + namespace + '-' + go_id + ':' + term.full_name

            # construct abstract; it is always defined, even for a term
            # without a description.
            abstract = ''
            if term.description:
                abstract += term.description
            else:
                logger.info("No description on term %s", term)
            abstract += ' Annotations are propagated through transitive closure as recommended by the GO Consortium.' + evclause

            # get geneset, creating it or syncing its fields as needed
            try:
                gs_obj = Geneset.objects.get(slug=slug, creator=user)
                changed = False  # flag to know if we need to call save

                # all these genesets should be public
                if not gs_obj.public:
                    gs_obj.public = True
                    changed = True

                if gs_obj.title != title:
                    gs_obj.title = title
                    changed = True

                if gs_obj.abstract != abstract:
                    gs_obj.abstract = abstract
                    changed = True

            except Geneset.DoesNotExist:
                gs_obj = Geneset(title=title,
                                 slug=slug,
                                 creator=user,
                                 organism=org,
                                 public=True,
                                 abstract=abstract)
                changed = True

            # if anything changed
            if changed:
                gs_obj.save()

            if options.get('initial'):
                # disable commit field's auto_now_add, allows us to set a
                # prior annotation date
                commit_date = Version._meta.get_field_by_name(
                    'commit_date')[0]
                commit_date.auto_now_add = False
                try:
                    logger.info(
                        'Initial load. Need to construct versions of %s from annotation date.',
                        term.go_id)
                    date_annots = {}  # aware datetime -> annotations
                    for annotation in term.annotations:
                        annot_date = timezone.make_aware(
                            datetime.strptime(annotation.date, '%Y%m%d'),
                            timezone.get_default_timezone())
                        date_annots.setdefault(annot_date,
                                               []).append(annotation)
                    annots_as_of_date = set()
                    prior_annots = set()
                    prior_version = None
                    for (annot_date,
                         dated_annots) in sorted(date_annots.items()):
                        annots_as_of_date.update(
                            (annotation.gid, annotation.ref)
                            for annotation in dated_annots)
                        if annots_as_of_date == prior_annots:
                            # if nothing changed, continue
                            continue
                        v_obj = Version(geneset=gs_obj,
                                        creator=user,
                                        parent=prior_version,
                                        commit_date=annot_date)
                        v_obj.description = "Added " + str(
                            len(dated_annots)
                        ) + " annotations from GO based on the dates provided in the GO annotation file."
                        v_obj.annotations = annots_as_of_date
                        v_obj.save()
                        prior_version = v_obj
                        prior_annots = annots_as_of_date.copy()
                finally:
                    # re-enable auto_now_add even if a save failed
                    commit_date.auto_now_add = True
            else:
                # load annotations; a new Version is written only for a
                # brand-new geneset or when annotations changed.
                most_recent_versions = Version.objects.filter(
                    geneset=gs_obj).order_by('-commit_date')[:1]
                term_annots = set((annotation.gid, annotation.ref)
                                  for annotation in term.annotations)
                version_description = ''
                most_recent_version = None
                if most_recent_versions:
                    most_recent_version = most_recent_versions[0]
                    if most_recent_version.commit_date > timezone.now():
                        logger.error('Version from the future: %s.',
                                     most_recent_version)
                    new = term_annots - most_recent_version.annotations
                    removed = most_recent_version.annotations - term_annots
                    if new or removed:
                        version_description = 'Added ' + str(
                            len(new)) + ' and removed ' + str(
                                len(removed)) + ' annotations from GO.'
                else:
                    version_description = 'Created with ' + str(
                        len(term_annots)) + ' annotations from GO.'
                if version_description:
                    v_obj = Version(geneset=gs_obj,
                                    creator=user,
                                    parent=most_recent_version,
                                    commit_date=timezone.now())
                    v_obj.description = version_description
                    v_obj.annotations = term_annots
                    v_obj.save()