示例#1
0
def genelist2fs(gl):
    """Build a feature-set dictionary for a list of gene identifiers.

    The identifiers are mapped to CDS feature IDs with ``ids2cds``, and the
    attributes of those features are fetched from the CDMI entity API.  When
    a feature has no function of its own, the function reported for its
    locus (resolved through ``cds2locus`` and ``fids_to_functions``) is used
    instead, if there is one.

    :param gl: list of gene identifier strings (joined into the description)
    :return: dict with a ``description`` string and an ``elements`` dict
        keyed by CDS feature ID; only IDs returned by ``get_entity_Feature``
        are included
    """
    qid2cds = ids2cds(gl)
    fs = {
        "description": "Feature set generated by " + ",".join(gl),
        "elements": {}
    }
    cdmie = CDMI_EntityAPI(URLS.cdmi)
    cdmic = CDMI_API(URLS.cdmi)
    cds_ids = qid2cds.values()
    cds2l = cds2locus(cds_ids)
    lfunc = cdmic.fids_to_functions(cds2l.values())

    fm = cdmie.get_entity_Feature(
        cds_ids,
        ['feature_type', 'source_id', 'sequence_length', 'function', 'alias'])
    for i in cds_ids:
        if i not in fm:
            continue
        feature = fm[i]
        if not feature['function']:
            # cds2locus() only returns the CDS IDs it could resolve to a
            # locus, so a plain cds2l[i] would raise KeyError for the rest.
            locus = cds2l.get(i)
            if locus is not None and locus in lfunc:
                feature['function'] = lfunc[locus]
        fs['elements'][i] = {
            "data": {
                'type': feature['feature_type'],
                'id': i,
                'dna_sequence_length': int(feature['sequence_length']),
                'function': feature['function'],
                'aliases': feature['alias']
            }
        }
    return fs
示例#2
0
def getOtuGenomeIds(count, config):
    ''' Fetch the representative OTU genome IDs from the CDMI.

        @param count Number of entities to retrieve in each all_entities_OTU call
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return List of genome IDs flagged as representative, list of the
            representative genome IDs that are also flagged as prokaryotic
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the OTU entities; a page shorter than count ends the scan.
    otudict = {}
    offset = 0
    while True:
        page = cdmi_entity.all_entities_OTU(offset, count, ["id"])
        otudict.update(page)
        offset += count
        if len(page) < count:
            break

    # Pull the representative flag (on the relationship) and the
    # prokaryotic flag and ID (on the target genome) for every OTU.
    otuids = getFieldFromEntity(otudict, "id")
    gendict = cdmi_entity.get_relationship_IsCollectionOf(otuids, [], ["representative"], ["id", "prokaryotic"])
    isrep = getFieldFromRelationship(gendict, "representative", "rel")
    isprok = getFieldFromRelationship(gendict, "prokaryotic", "to")
    genomeid = getFieldFromRelationship(gendict, "id", "to")

    otus = []
    prokotus = []
    for position, genome in enumerate(genomeid):
        # Non-representative genomes are ignored entirely.
        if int(isrep[position]) != 1:
            continue
        otus.append(genome)
        if int(isprok[position]) == 1:
            prokotus.append(genome)
    return otus, prokotus
示例#3
0
def getGenomeNeighborhoodsAndRoles(genomes, config):
    ''' Collect contig locations and functional roles for the genes of some genomes.

        @param genomes List of genome IDs
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return List of (contig ID, feature ID, begin, dir) tuples sorted by
            contig ID and then by begin position, dictionary keyed by feature
            ID of the list of role IDs linked to that feature
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # genomesToPegs() takes the configuration as its second argument.
    pegs = genomesToPegs(genomes, config)

    # Get contigs
    fidlocdict = cdmi_entity.get_relationship_IsLocatedIn(pegs, [], ["begin", "dir"], ["id"])
    fids = getFieldFromRelationship(fidlocdict, "from_link", "rel")
    begins = getFieldFromRelationship(fidlocdict, "begin", "rel")
    dirs = getFieldFromRelationship(fidlocdict, "dir", "rel")
    cids = getFieldFromRelationship(fidlocdict, "id", "to")

    # Sort by contig first, then by start location.
    tuplist = sorted(
        ((cid, fid, int(begin), direction)
         for cid, fid, begin, direction in zip(cids, fids, begins, dirs)),
        key=operator.itemgetter(0, 2))

    # Now get the roles for all of these IDs.
    # Note that a single protein can have multiple roles.
    roledict = cdmi_entity.get_relationship_HasFunctional(fids, [], [], ["id"])
    rolefids = getFieldFromRelationship(roledict, "from_link", "rel")
    roles = getFieldFromRelationship(roledict, "id", "to")
    fidToRoles = {}
    for fid, role in zip(rolefids, roles):
        fidToRoles.setdefault(fid, []).append(role)
    return tuplist, fidToRoles
示例#4
0
def getDlitFids(count, config):
    ''' Collect the feature IDs reachable from the publications stored in the CDMI.

        The chain followed is Publication -> (Concerns) -> protein sequence
        -> (IsProteinFor) -> feature.

        @param count Number of entities to retrieve in each all_entities_Publication call
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return List of feature IDs linked to a publication through a protein sequence
    '''

    # NOTE(review): this CDMI_API client is constructed but never used below.
    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the publications; a page shorter than count ends the scan.
    pubdict = {}
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Publication(offset, count, ["id"])
        pubdict.update(page)
        offset += count
        if len(page) < count:
            break
    pubids = getFieldFromEntity(pubdict, "id")
    sys.stderr.write("Found %d publication IDs\n" % len(pubids))

    # Publications -> protein sequences they concern.
    concerns = cdmi_entity.get_relationship_Concerns(pubids, [], [], ["id"])
    pubseqs = getFieldFromRelationship(concerns, "id", "to")
    sys.stderr.write("Found %d protein sequences from publications\n" % len(pubseqs))

    # Protein sequences -> features they are the protein for.
    proteinFor = cdmi_entity.get_relationship_IsProteinFor(pubseqs, [], [], ["id"])
    return getFieldFromRelationship(proteinFor, "id", "to")
示例#5
0
def subsystemFids(count, config):
    ''' Query the CDMI for a list of feature IDs in the subsystems.

        @param count Number of entities to retrieve in each all_entities_Subsystem call
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return List of unique subsystem feature IDs (order is not defined)
    '''

    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the subsystems; a page shorter than count ends the scan.
    ssdict = dict()
    start = 0
    done = False
    while not done:
        subdict = cdmi_entity.all_entities_Subsystem(start, count, ["id"])
        ssdict.update(subdict)
        start += count
        if len(subdict) < count:
            done = True
    ssids = getFieldFromEntity(ssdict, "id")
    sys.stderr.write('Found %d subsystems\n' %(len(ssids)))

    # Now get the FIDs within those subsystems.
    # Break the complete list into smaller sub-lists to avoid timeouts.
    start = 0
    increment = 10
    ssfids = []
    while start < len(ssids):
        try:
            ssfiddict = cdmi.subsystems_to_fids(ssids[start:start + increment], [])
        except HTTPError as e:
            # Retry the same position with a smaller chunk.  Floor division
            # keeps the chunk size an integer so it stays usable as a slice
            # bound.  NOTE(review): once the size is 1 this retries forever
            # if the error persists.
            if increment > 1:
                increment //= 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        for key in ssfiddict:
            for ssfid in ssfiddict[key]:
                for arr in ssfiddict[key][ssfid]:
                    # The feature IDs are the second element of each entry.
                    if len(arr) > 1:
                        ssfids.extend(arr[1])

        # Move to next sub-list
        start += increment

    # Uniquify!
    return list(set(ssfids))
示例#6
0
def fidsToRoles(fidlist, config):
    ''' Given a list of feature IDs return a dictionary from FID to the list of roles the encoding gene
        performs and a dictionary from roles to the FIDs performing them.

        @param fidlist List of feature IDs
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return Dictionary keyed by feature ID of list of roles encoding gene performs, dictionary
            keyed by role of list of feature IDs performing the role
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 1000
    # We have to use sets here because a bug(?) in get_relationship_HasFunctional allows multiple identical
    # links between fids and roles.
    # See for example what happens when you call it on g.9647.peg.2332
    fidRoleSets = {}
    roleFidSets = {}
    while start < len(fidlist):
        try:
            roledict = cdmi_entity.get_relationship_HasFunctional(fidlist[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # Retry the same position with a smaller chunk.  Floor division
            # keeps the chunk size an integer so it stays usable as a slice
            # bound.
            if increment > 1:
                increment //= 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        flist = getFieldFromRelationship(roledict, "from_link", "rel")
        rolelist = getFieldFromRelationship(roledict, "id", "to")
        for fid, role in zip(flist, rolelist):
            fidRoleSets.setdefault(fid, set()).add(role)
            roleFidSets.setdefault(role, set()).add(fid)

        # Move to next sub-list
        start += increment

    # Convert back to lists to not break other functions.
    fidsToRoleLists = dict((fid, list(roles)) for fid, roles in fidRoleSets.items())
    rolesToFidLists = dict((role, list(fids)) for role, fids in roleFidSets.items())
    return fidsToRoleLists, rolesToFidLists
示例#7
0
def cds2locus(gids):
    """Map each ID in *gids* to the ID two IsEncompassedIn hops away.

    The first query resolves the given IDs, the second resolves the results
    of the first (presumably CDS -> mRNA -> locus, going by the name).
    IDs that cannot be followed through both hops are left out of the result.

    :param gids: iterable of feature IDs
    :return: dict mapping input ID to the ID reached after both hops
    """
    cdmie = CDMI_EntityAPI(URLS.cdmi)

    # First hop: input ID -> encompassing feature.
    first_hop = {}
    for entry in cdmie.get_relationship_IsEncompassedIn(gids, [], ['to_link'], []):
        link = entry[1]
        source = link['from_link']
        first_hop[source] = link['to_link']

    # Second hop: encompassing feature -> its own encompassing feature.
    second_hop = {}
    for entry in cdmie.get_relationship_IsEncompassedIn(first_hop.values(), [],
                                                        ['to_link'], []):
        link = entry[1]
        source = link['from_link']
        second_hop[source] = link['to_link']

    # Keep only the IDs that resolved through both hops.
    resolved = {}
    for gid in gids:
        if gid in first_hop and first_hop[gid] in second_hop:
            resolved[gid] = second_hop[first_hop[gid]]
    return resolved
示例#8
0
def complexRoleLinks(count, config):
    ''' Build the two-way mapping between complexes and their required roles.

        Links whose "optional" flag equals 1 are skipped, so only required
        roles appear in either dictionary.

        @note OBSOLETE - will be replaced by Chris's roles_to_reactions() function
        @param count Number of entities to retrieve in each all_entities_Complex call
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return Dictionary keyed by complex ID of the list of required roles,
            dictionary keyed by role of the list of complex IDs
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the complexes; a page shorter than count ends the scan.
    cplxdict = {}
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Complex(offset, count, ["id"])
        cplxdict.update(page)
        offset += count
        if len(page) < count:
            break
    cplxlist = getFieldFromEntity(cplxdict, "id")

    # Fetch the roles that trigger those complexes.
    roledict = cdmi_entity.get_relationship_IsTriggeredBy(cplxlist, [], ["optional"], ["id"])
    cplx = getFieldFromRelationship(roledict, "from_link", "rel")
    opt = getFieldFromRelationship(roledict, "optional", "rel")
    role = getFieldFromRelationship(roledict, "id", "to")

    complexToRequiredRoles = {}
    requiredRolesToComplex = {}
    for position, complexId in enumerate(cplx):
        # Optional components are left out for now; it is unclear how they
        # would enter a likelihood calculation.
        if int(opt[position]) == 1:
            continue
        roleId = role[position]
        # Note - this becomes an all-AND GPR - (role1 AND role2 AND ... )
        complexToRequiredRoles.setdefault(complexId, []).append(roleId)
        requiredRolesToComplex.setdefault(roleId, []).append(complexId)
    return complexToRequiredRoles, requiredRolesToComplex
示例#9
0
def genomesToPegs(genomes, config):
    ''' Return the protein-encoding gene feature IDs owned by the given genomes.

        @param genomes List of genome IDs
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return List of feature IDs whose feature_type is "peg" or "CDS"
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])
    owned = cdmi_entity.get_relationship_IsOwnerOf(genomes, [], [], ["id", "feature_type"])
    featureIds = getFieldFromRelationship(owned, "id", "to")
    featureTypes = getFieldFromRelationship(owned, "feature_type", "to")

    # Keep protein-encoding genes only (not e.g. operons, operators, etc...).
    # Their type is CDS now but will possibly be changed to peg later, so
    # both spellings are accepted.
    return [
        fid
        for position, fid in enumerate(featureIds)
        if featureTypes[position] in ("peg", "CDS")
    ]
示例#10
0
def reactionComplexLinks(count, config):
    ''' Build the two-way mapping between reactions and complexes.

        The links are read from the IsStepOf relationship of every Reaction
        entity (the ER model) rather than from reactions_to_complexes.

        @note OBSOLETE - will be replaced by Chris's roles_to_reactions() function
        @param count Number of entities to retrieve in each all_entities_Reaction call
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return Dictionary keyed by reaction ID of the list of linked complex IDs,
            dictionary keyed by complex ID of the list of linked reaction IDs
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the reactions; a page shorter than count ends the scan.
    rxndict = {}
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Reaction(offset, count, ['id'])
        rxndict.update(page)
        offset += count
        if len(page) < count:
            break
    rxns = getFieldFromEntity(rxndict, "id")

    links = cdmi_entity.get_relationship_IsStepOf(rxns, [], [], ["id"])
    rxnlist = getFieldFromRelationship(links, "from_link", "rel")
    cplxlist = getFieldFromRelationship(links, "id", "to")

    rxnToComplex = {}
    complexToRxn = {}
    for position, rxnId in enumerate(rxnlist):
        complexId = cplxlist[position]
        rxnToComplex.setdefault(rxnId, []).append(complexId)
        complexToRxn.setdefault(complexId, []).append(rxnId)

    return rxnToComplex, complexToRxn
示例#11
0
def filterFidsByOtus(fidlist, otus, config):
    '''
    Obsolete (I think this isn't used any more)

    Return the feature IDs from fidlist whose owning organism is listed in
    otus (the representative organism IDs).

    @param fidlist List of feature IDs
    @param otus Collection of organism IDs to keep
    @param config Dictionary of configuration variables (reads "cdmi_url")
    @return List of feature IDs owned by one of the organisms in otus
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Look up the organism that owns each feature.  Features for which no
    # organism comes back are simply absent from the result.
    ownership = cdmi_entity.get_relationship_IsOwnedBy(fidlist, [], [], ["id"])
    ownedFids = getFieldFromRelationship(ownership, "from_link", "rel")
    owners = getFieldFromRelationship(ownership, "id", "to")

    return [
        ownedFids[position]
        for position, organism in enumerate(owners)
        if organism in otus
    ]
def addRxnProbabilitiesToBiochemistryJson(reaction_probability_file, biochemistry_json_file, output_file):
    '''Annotate every reaction of a biochemistry JSON file with its probability.

    Reaction UUIDs found in the biochemistry JSON are mapped to ModelSEED IDs
    through the 'ModelSEED' / 'reactions' alias set, and ModelSEED IDs are
    mapped to KBase reaction IDs through the CDMI Reaction entities.

    For each reaction the following fields are written: 'probability', 'GPR'
    (a plain string), 'complexinfo', and the three flags 'reactionHasComplex',
    'oneComplexHasRepresentativeRoles' and
    'allComplexesHaveRepresentativeRoles'.

    If a reaction has an entry in the probability file, the values from that
    entry are used (a probability of 0 there means the complex was defined but
    not found in the organism).  If it has no entry, the probability is left
    at 0, the strings are empty and all three flags stay False, which marks
    the zero as a database limitation rather than lack of genetic evidence.

    The process exits with status 2 if output_file already exists.

    @param reaction_probability_file Tab-delimited file; column 0 is the KBase
        reaction ID, column 1 the probability, column 3 the complex info and
        column 4 the GPR
    @param biochemistry_json_file Path of the biochemistry JSON file to read
    @param output_file Path of the modified biochemistry JSON file to write
    '''

    if os.path.exists(output_file):
        sys.stderr.write("Modified biochemistry JSON file %s already exists!\n" %(output_file))
        # sys.exit() is always available, unlike the site-provided exit().
        sys.exit(2)

    # KBase ID --> (probability, complex_info, GPR)
    kbaseIdToInfo = {}
    with open(reaction_probability_file, "r") as handle:
        for line in handle:
            spl = line.strip("\r\n").split("\t")
            kbaseIdToInfo[spl[0]] = ( spl[1], spl[3], spl[4] )

    # Model ID --> Kbase ID
    cdmi_entity = CDMI_EntityAPI(CDMI_URL)
    rxniddict = cdmi_entity.all_entities_Reaction(MINN, COUNT, ["source_id"])
    kbaseIds = getFieldFromEntity(rxniddict, "id")
    modelIds = getFieldFromEntity(rxniddict, "source_id")
    modelToKbase = dict(zip(modelIds, kbaseIds))

    # Different biochemistries will (I think?) have different UUIDs
    # for all the reactions in them... but they will have links set up
    # to the model IDs. At least, I HOPE so.
    with open(biochemistry_json_file, "r") as handle:
        resp = json.load(handle)

    # UUID --> Model ID
    uuidToModelId = {}
    for aliasSet in resp["aliasSets"]:
        if aliasSet["source"] == "ModelSEED" and aliasSet["attribute"] == "reactions":
            # The alias table maps a reaction ID to a LIST of UUIDs, so more
            # than one UUID may go with the same reaction ID.  All of them
            # are given the probability of that reaction.
            for modelId, uuids in aliasSet["aliases"].items():
                for uuid in uuids:
                    uuidToModelId[uuid] = modelId
            # We found the one we need, no need to go through the rest of them...
            break

    # Complex states that count as "has representative roles":
    #   CPLX_FULL     [ok]
    #   CPLX_NOTTHERE [ok]
    #   CPLX_PARTIAL  [sort of ok - treated as OK for this purpose]
    # whereas CPLX_NOREPS [bad] means the roles attached to the complex had
    # no representatives in the BLAST db.
    representedTags = ("CPLX_FULL", "CPLX_NOTTHERE", "CPLX_PARTIAL")

    # Now iterate over all of the reactions and add the appropriate
    # probabilities to each of them.
    for rxn in resp["reactions"]:
        myProbability = 0
        myComplexInfo = ""
        myGPR = ""
        # These flags indicate database issues that could bias the probabilities.
        #
        # If all of them are False or if only reactionHasComplex is True, the
        # probability will be 0 but it is due to missing data.
        #
        # If only allComplexesHaveRepresentativeRoles is False, there is some
        # missing data but still enough to test presence of some subunits of
        # some complexes.
        reactionHasComplex = False
        oneComplexHasRepresentativeRoles = False
        allComplexesHaveRepresentativeRoles = False

        # If the database versions are consistent the first two lookups
        # should always succeed.
        myuuid = rxn["uuid"]
        if myuuid in uuidToModelId:
            modelId = uuidToModelId[myuuid]
            if modelId in modelToKbase:
                kbaseId = modelToKbase[modelId]
                # Only the case if there are complexes associated with the reaction.
                if kbaseId in kbaseIdToInfo:
                    reactionHasComplex = True
                    myProbability, myComplexInfo, myGPR = kbaseIdToInfo[kbaseId]
                    if "CPLX_NOREPS" in myComplexInfo:
                        # At least one complex lacks representatives; the
                        # probability is still meaningful if another complex
                        # of the reaction has them.
                        oneComplexHasRepresentativeRoles = any(
                            tag in myComplexInfo for tag in representedTags)
                    else:
                        # No complex is CPLX_NOREPS.
                        oneComplexHasRepresentativeRoles = True
                        allComplexesHaveRepresentativeRoles = True

        rxn["probability"] = myProbability
        rxn["complexinfo"] = myComplexInfo
        rxn["GPR"] = myGPR
        rxn["reactionHasComplex"] = reactionHasComplex
        rxn["oneComplexHasRepresentativeRoles"] = oneComplexHasRepresentativeRoles
        rxn["allComplexesHaveRepresentativeRoles"] = allComplexesHaveRepresentativeRoles

    # Close the output explicitly so the JSON is flushed to disk.
    with open(output_file, "w") as handle:
        json.dump(resp, handle, indent=4)
示例#13
0
def go_anno_net(meth, net_obj_id=None):
    """Add Gene Ontology annotation to network gene nodes

    :param net_obj_id: Network object id
    :type net_obj_id: kbtypes.KBaseNetworks.Network
    :return: Workspace id
    :rtype: kbtypes.Unicode
    :output_widget: ValueListWidget
    """
    # NOTE(review): the docstring fields above look like they are consumed by
    # the calling framework, so its text is left exactly as it was - confirm
    # before rewording it.
    meth.stages = 5

    meth.advance("Prepare annotation service")
    # The network is loaded from, and the result saved to, the current workspace.
    source_ws = meth.workspace_id
    target_ws = meth.workspace_id

    meth.advance("Load network object")
    workspace = Workspace2(token=meth.token, wsid=source_ws)
    ontology = Ontology(url=URLS.ontology)

    network = workspace.get(net_obj_id)
    node_index = Node(network['nodes'], network['edges'])

    id_server = IDServerAPI(URLS.ids)
    cdmi_api = CDMI_API(URLS.cdmi)
    # NOTE(review): this entity client is constructed but never used below.
    cdmi_entity_api = CDMI_EntityAPI(URLS.cdmi)

    def _is_gene_id(node_id):
        # CDS and locus IDs are always kept; anything else is kept unless it
        # looks like a cluster ('clst', 'cluster...') or contains 'ps.'.
        if 'CDS' in node_id or 'locus' in node_id:
            return True
        return not ('clst' in node_id
                    or node_id.startswith('cluster')
                    or 'ps.' in node_id)

    gids = [
        node_id for node_id in sorted(node_index.ugids.keys())
        if _is_gene_id(node_id)
    ]

    meth.advance("Get relationships from central data model")
    eids = kb_id2ext_id(id_server, gids, 100)
    gids2cds = ids2cds(gids)
    cds_ids = gids2cds.values()
    cds2l = cds2locus(cds_ids)
    locus_ids = cds2l.values()

    node_count = len(network['nodes'])
    edge_count = len(network['edges'])
    meth.advance("Annotate ({:d} nodes, {:d} edges)".format(node_count,
                                                            edge_count))
    go_terms = ontology.get_goidlist(cds_ids, [], [])
    go_annotation = ()
    locus_functions = cdmi_api.fids_to_functions(locus_ids)
    cds_functions = cdmi_api.fids_to_functions(cds_ids)
    annotate_nodes(network,
                   ots=go_terms,
                   oan=go_annotation,
                   funcs=locus_functions,
                   funcs_org=cds_functions,
                   eids=eids,
                   gids2cds=gids2cds,
                   cds2l=cds2l)

    meth.advance("Save annotated object to workspace {}".format(target_ws))
    annotated_name = net_obj_id + ".ano"
    annotated = {
        'type': 'KBaseNetworks.Network',
        'data': network,
        'name': annotated_name,
        'meta': {
            'original': net_obj_id
        }
    }
    workspace.save_objects({'workspace': target_ws, 'objects': [annotated]})

    return _workspace_output(annotated_name)
示例#14
0
def filterFidsByOtusOptimized(featureIdList, rolesToFids, otuRepsToMembers, config):
    ''' Filter feature IDs by OTU (optimized version).

        To minimize the amount of redundancy in the list of target proteins, filter
        the feature IDs so there is at most one protein from each OTU for each
        functional role.

        @param featureIdList List of unfiltered feature IDs
        @param rolesToFids Dictionary keyed by role of list of feature IDs
        @param otuRepsToMembers Dictionary keyed by OTU representative to list of OTU members
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
            of list of feature IDs; entries with empty lists are removed
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each feature ID.
    # If this fails to find an organism we don't want it anyway...
    fidToOrganism = dict() # Map feature IDs to organisms

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 100000
    while start < len(featureIdList):
        try:
            ownedBy = cdmi_entity.get_relationship_IsOwnedBy(featureIdList[start:start + increment], [], ['from_link'], ['id'])
        except HTTPError as e:
            # Retry the same position with a smaller chunk.  Floor division
            # keeps the chunk size an integer so it stays usable as a slice
            # bound.
            if increment > 1:
                increment //= 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        fidList = getFieldFromRelationship(ownedBy, "from_link", "rel")
        organismList = getFieldFromRelationship(ownedBy, "id", "to")
        fidToOrganism.update(zip(fidList, organismList))
        start += increment

    # Add all possible keys to the dictionaries and initialize the value.
    # Then we don't have to check if the key exists in the main loop below.
    keptFidsToRoles = dict((fid, list()) for fid in featureIdList)
    keptRolesToFids = dict((role, list()) for role in rolesToFids)

    # Find the feature ID (protein) from each OTU for each functional role.
    otuCounter = 0
    for otuRepresentative in otuRepsToMembers:
        # This loop takes a very long time so print a message every so often
        # to track progress.
        otuCounter += 1
        if otuCounter % 10 == 0:
            sys.stderr.write('Processed %d OTUs at %s\n' %(otuCounter, now()))

        members = otuRepsToMembers[otuRepresentative]

        # Check every functional role.
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrganism:
                    continue
                organism = fidToOrganism[fid]

                # If the organism is the representative we keep it and go to the next role
                if organism == otuRepresentative:
                    keepFid = fid
                    break

                # Otherwise remember a feature from any other member of the
                # OTU (whichever is seen last) in case the representative has
                # no example of the role, and keep looking.
                if organism in members:
                    keepFid = fid

            # Add to the dictionaries if we are keeping the feature ID.
            if keepFid is not None:
                keptFidsToRoles[keepFid].append(role)
                keptRolesToFids[role].append(keepFid)

    # Drop the entries that never received a value.
    keptFidsToRoles = dict((fid, roles) for fid, roles in keptFidsToRoles.items() if roles)
    keptRolesToFids = dict((role, fids) for role, fids in keptRolesToFids.items() if fids)

    return keptFidsToRoles, keptRolesToFids
示例#15
0
def filterFidsByOtusBetter(fidsToRoles, rolesToFids, oturepsToMembers, config):
    '''Attempt to do a more intelligent filtering of FIDs by OTU.

    Given all FIDs attached to a role in the unfiltered set we do the following:

    Initialize KEEP
    For each OTU and each role:
       If role is found in the representative, add to KEEP and continue;
       Otherwise, iterate over other genomes.
           If role is found in one other genome, add to KEEP and continue;

    This process should make our calculation less sensitive to the choice of OTUs...

    @param fidsToRoles Dictionary keyed by feature ID (only the keys are used)
    @param rolesToFids Dictionary keyed by role of list of feature IDs
    @param oturepsToMembers Dictionary keyed by OTU representative to list of OTU members
    @param config Dictionary of configuration variables (reads "cdmi_url")
    @return Dictionary keyed by kept feature ID of list of roles, dictionary keyed
        by role of list of kept feature IDs, list of roles from rolesToFids for
        which no feature was kept
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each fid
    # If this fails to find an organism we don't want it anyway...
    # A real list is needed because it is sliced below.
    allFids = list(fidsToRoles)
    ownership = []
    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 5000
    while start < len(allFids):
        try:
            od = cdmi_entity.get_relationship_IsOwnedBy(allFids[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # Retry the same position with a smaller chunk.  Floor division
            # keeps the chunk size an integer so it stays usable as a slice
            # bound.
            if increment > 1:
                increment //= 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        ownership.extend(od)
        start += increment
    ownedFids = getFieldFromRelationship(ownership, "from_link", "rel")
    orglist = getFieldFromRelationship(ownership, "id", "to")
    fidToOrg = dict(zip(ownedFids, orglist))

    keptFidsToRoles = {}
    keptRolesToFids = {}

    # For each OTU
    for oturep in oturepsToMembers:
        members = oturepsToMembers[oturep]
        # for each role
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrg:
                    continue
                org = fidToOrg[fid]
                # If the organism is the representative we keep it and go to the next role
                if org == oturep:
                    keepFid = fid
                    break
                # Otherwise remember a feature from any other member of the
                # OTU (whichever is seen last) in case the representative has
                # no example of the role, and keep looking.
                if org in members:
                    keepFid = fid
            if keepFid is not None:
                keptFidsToRoles.setdefault(keepFid, []).append(role)
                keptRolesToFids.setdefault(role, []).append(keepFid)

    # If the OTUs are comprehensive this should be empty.
    missingRoles = list(set(rolesToFids.keys()) - set(keptRolesToFids.keys()))

    return keptFidsToRoles, keptRolesToFids, missingRoles