def genelist2fs(gl):
    ''' Build a feature set dictionary for the CDS features matching a gene list.

        The IDs in gl are translated by ids2cds(), the resulting features are
        looked up in the CDMI, and one element is emitted per feature found.
        When a feature has no function of its own, the function of the locus
        returned for it by cds2locus() is used instead (if there is one).

        @param gl List of gene ID strings (also joined into the description)
        @return Dictionary with a "description" string and an "elements"
            dictionary keyed by feature ID
    '''
    qid2cds = ids2cds(gl)
    fs = {
        "description": "Feature set generated by " + ",".join(gl),
        "elements": {}
    }
    cdmie = CDMI_EntityAPI(URLS.cdmi)
    cdmic = CDMI_API(URLS.cdmi)

    # Materialize the values so the same concrete list is handed to the
    # service clients and iterated below.
    cds_ids = list(qid2cds.values())
    cds2l = cds2locus(cds_ids)
    lfunc = cdmic.fids_to_functions(list(cds2l.values()))
    fm = cdmie.get_entity_Feature(
        cds_ids,
        ['feature_type', 'source_id', 'sequence_length', 'function', 'alias'])

    for cds_id in cds_ids:
        if cds_id not in fm:
            continue
        feature = fm[cds_id]
        # cds2locus() only returns entries for IDs it could resolve, so a
        # missing entry must not raise here.
        locus_id = cds2l.get(cds_id)
        if not feature['function'] and locus_id is not None and locus_id in lfunc:
            feature['function'] = lfunc[locus_id]
        fs['elements'][cds_id] = {
            "data": {
                'type': feature['feature_type'],
                'id': cds_id,
                'dna_sequence_length': int(feature['sequence_length']),
                'function': feature['function'],
                'aliases': feature['alias']
            }
        }
    return fs
def getOtuGenomeIds(count, config):
    ''' Retrieve the genome IDs of the representative OTU genomes from the CDMI.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return List of all OTU genome IDs, list of only prokaryote OTUs
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through every OTU entity; a page shorter than count is the last one.
    otudict = dict()
    offset = 0
    while True:
        page = cdmi_entity.all_entities_OTU(offset, count, ["id"])
        otudict.update(page)
        offset += count
        if len(page) < count:
            break

    # Ask which genomes are flagged as representative and which are prokaryotic.
    otuids = getFieldFromEntity(otudict, "id")
    gendict = cdmi_entity.get_relationship_IsCollectionOf(otuids, [], ["representative"], ["id", "prokaryotic"])
    isrep = getFieldFromRelationship(gendict, "representative", "rel")
    isprok = getFieldFromRelationship(gendict, "prokaryotic", "to")
    genomeid = getFieldFromRelationship(gendict, "id", "to")

    otus = []
    prokotus = []
    for index, genome in enumerate(genomeid):
        # Only representative genomes are reported at all.
        if int(isrep[index]) != 1:
            continue
        otus.append(genome)
        if int(isprok[index]) == 1:
            prokotus.append(genome)
    return otus, prokotus
def getGenomeNeighborhoodsAndRoles(genomes, config):
    ''' Get the ordered feature locations and the roles of the protein-encoding
        genes in a list of genomes.

        @param genomes List of genome IDs
        @param config Dictionary of configuration variables
        @return List of (contig ID, feature ID, begin, direction) tuples sorted by
            contig ID and then by begin, dictionary keyed by feature ID of list
            of roles
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # genomesToPegs() needs the configuration to reach the CDMI.
    pegs = genomesToPegs(genomes, config)

    # Get the contig, start location and direction of every feature.
    fidlocdict = cdmi_entity.get_relationship_IsLocatedIn(pegs, [], ["begin", "dir"], ["id"])
    fids = getFieldFromRelationship(fidlocdict, "from_link", "rel")
    begins = getFieldFromRelationship(fidlocdict, "begin", "rel")
    dirs = getFieldFromRelationship(fidlocdict, "dir", "rel")
    cids = getFieldFromRelationship(fidlocdict, "id", "to")

    # Sort by contig first, then by start location.
    tuplist = sorted(
        ((cid, fid, int(begin), direction)
         for cid, fid, begin, direction in zip(cids, fids, begins, dirs)),
        key=operator.itemgetter(0, 2))

    # Now get the roles for all of these IDs.
    # Note that a single protein can have multiple roles.
    roledict = cdmi_entity.get_relationship_HasFunctional(fids, [], [], ["id"])
    rolefids = getFieldFromRelationship(roledict, "from_link", "rel")
    roles = getFieldFromRelationship(roledict, "id", "to")
    fidToRoles = {}
    for fid, role in zip(rolefids, roles):
        fidToRoles.setdefault(fid, []).append(role)
    return tuplist, fidToRoles
def getDlitFids(count, config):
    ''' Query the CDMI for a list of feature IDs with direct literature evidence (dlits).

        Publications are paged out of the CDMI, followed through the Concerns
        relationship and then through the IsProteinFor relationship to reach
        the feature IDs.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return List of literature feature IDs
    '''

    # Only the entity API is needed here.
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through every publication; a page shorter than count is the last one.
    pubdict = dict()
    start = 0
    while True:
        subdict = cdmi_entity.all_entities_Publication(start, count, ["id"])
        pubdict.update(subdict)
        start += count
        if len(subdict) < count:
            break

    pubids = getFieldFromEntity(pubdict, "id")
    sys.stderr.write("Found %d publication IDs\n" %(len(pubids)))

    pub2seq = cdmi_entity.get_relationship_Concerns(pubids, [], [], ["id"])
    pubseqs = getFieldFromRelationship(pub2seq, "id", "to")
    sys.stderr.write("Found %d protein sequences from publications\n" %(len(pubseqs)))

    seq2fids = cdmi_entity.get_relationship_IsProteinFor(pubseqs, [], [], ["id"])
    fids = getFieldFromRelationship(seq2fids, "id", "to")
    return fids
def subsystemFids(count, config):
    ''' Query the CDMI for a list of feature IDs in the subsystems.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return List of subsystem feature IDs (duplicates removed, order not
            preserved)
    '''

    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through every subsystem; a page shorter than count is the last one.
    ssdict = dict()
    start = 0
    while True:
        subdict = cdmi_entity.all_entities_Subsystem(start, count, ["id"])
        ssdict.update(subdict)
        start += count
        if len(subdict) < count:
            break
    ssids = getFieldFromEntity(ssdict, "id")
    sys.stderr.write('Found %d subsystems\n' %(len(ssids)))

    # Now get the list of FIDs within those subsystems.
    # Break the complete list into smaller sub-lists to avoid timeouts.
    start = 0
    increment = 10
    ssfids = []
    while start < len(ssids):
        try:
            ssfiddict = cdmi.subsystems_to_fids(ssids[start:start + increment], [])
        except HTTPError as e:
            # Halve the sub-list size and retry the same position. Floor
            # division keeps the size an integer so it stays usable as a
            # slice bound.
            # NOTE: once the size is 1 the same request is simply retried
            # until it succeeds.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        for key in ssfiddict:
            for ssfid in ssfiddict[key]:
                for arr in ssfiddict[key][ssfid]:
                    # The feature IDs are the second item of each entry.
                    if len(arr) > 1:
                        ssfids.extend(arr[1])

        # Move to next sub-list
        start += increment

    # Uniquify!
    return list(set(ssfids))
def fidsToRoles(fidlist, config):
    ''' Given a list of feature IDs return a dictionary from FID to the list of
        roles the encoding gene performs and a dictionary from roles to the
        FIDs performing them.

        @param fidlist List of feature IDs
        @param config Dictionary of configuration variables
        @return Dictionary keyed by feature ID of list of roles encoding gene
            performs, dictionary keyed by role of list of feature IDs
            performing the role
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 1000
    fidRoleSets = {}
    roleFidSets = {}
    while start < len(fidlist):
        try:
            roledict = cdmi_entity.get_relationship_HasFunctional(fidlist[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # Halve the sub-list size and retry the same position. Floor
            # division keeps the size an integer so it stays usable as a
            # slice bound.
            # NOTE: once the size is 1 the same request is simply retried
            # until it succeeds.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        flist = getFieldFromRelationship(roledict, "from_link", "rel")
        rolelist = getFieldFromRelationship(roledict, "id", "to")
        # We have to use sets here because a bug(?) in get_relationship_HasFunctional
        # allows multiple identical links between fids and roles.
        # See for example what happens when you call it on g.9647.peg.2332
        for fid, role in zip(flist, rolelist):
            fidRoleSets.setdefault(fid, set()).add(role)
            roleFidSets.setdefault(role, set()).add(fid)

        # Move to next sub-list
        start += increment

    # Convert back to lists to not break other functions.
    fidToRoleLists = dict((fid, list(roles)) for fid, roles in fidRoleSets.items())
    roleToFidLists = dict((role, list(fids)) for role, fids in roleFidSets.items())
    return fidToRoleLists, roleToFidLists
def cds2locus(gids):
    ''' Map each input ID to the ID reached by following the IsEncompassedIn
        relationship twice.

        Judging by the names used here the two hops are presumably
        CDS -> mRNA -> locus; confirm against the CDMI schema.

        @param gids Iterable of feature IDs to start from
        @return Dictionary keyed by input ID of the ID two hops away; inputs for
            which either hop is missing are left out
    '''
    cdmie = CDMI_EntityAPI(URLS.cdmi)

    def encompassing(ids):
        # One hop: from_link -> to_link for every relationship returned.
        links = cdmie.get_relationship_IsEncompassedIn(ids, [], ['to_link'], [])
        hop = {}
        for link in links:
            hop[link[1]['from_link']] = link[1]['to_link']
        return hop

    mrnas = encompassing(gids)
    locus = encompassing(mrnas.values())

    lgids = {}
    for gid in gids:
        if gid not in mrnas:
            continue
        parent = mrnas[gid]
        if parent in locus:
            lgids[gid] = locus[parent]
    return lgids
def complexRoleLinks(count, config):
    ''' Query the CDM for a list of links from complexes to roles.

        Only roles listed as "required" are included in the links.

        @note OBSOLETE - will be replaced by Chris's roles_to_reactions() function
        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return Dictionary keyed by complex ID to a list of required roles,
            dictionary keyed by role of a list of complex IDs
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through every complex; a page shorter than count is the last one.
    cplxdict = dict()
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Complex(offset, count, ["id"])
        cplxdict.update(page)
        offset += count
        if len(page) < count:
            break
    cplxlist = getFieldFromEntity(cplxdict, "id")

    # Get a list of roles linked to those complexes
    roledict = cdmi_entity.get_relationship_IsTriggeredBy(cplxlist, [], ["optional"], ["id"])
    cplx = getFieldFromRelationship(roledict, "from_link", "rel")
    opt = getFieldFromRelationship(roledict, "optional", "rel")
    role = getFieldFromRelationship(roledict, "id", "to")

    complexToRequiredRoles = {}
    requiredRolesToComplex = {}
    for index, complexId in enumerate(cplx):
        # For now - we don't want to deal with the "optional" components.
        if int(opt[index]) == 1:
            continue
        roleId = role[index]
        # Note - this becomes an all-AND GPR - (role1 AND role2 AND ... )
        complexToRequiredRoles.setdefault(complexId, []).append(roleId)
        requiredRolesToComplex.setdefault(roleId, []).append(complexId)
    return complexToRequiredRoles, requiredRolesToComplex
def genomesToPegs(genomes, config):
    ''' Return the feature IDs of the protein-encoding genes owned by the
        specified genomes.

        @param genomes List of genome IDs
        @param config Dictionary of configuration variables
        @return List of feature IDs for protein-encoding genes in specified genomes
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])
    fiddict = cdmi_entity.get_relationship_IsOwnedBy(genomes, [], [], ["id", "feature_type"])
    fidlist = getFieldFromRelationship(fiddict, "id", "to")
    typelist = getFieldFromRelationship(fiddict, "feature_type", "to")

    # We want protein-encoding genes only (not e.g. operons, operators, etc...)
    # The type of protein-encoding genes is CDS now but will possibly be changed to peg later...
    proteinTypes = ("peg", "CDS")
    return [fid for index, fid in enumerate(fidlist) if typelist[index] in proteinTypes]
def reactionComplexLinks(count, config):
    ''' Query the CDM for a list of links from reactions to complexes.

        @note OBSOLETE - will be replaced by Chris's roles_to_reactions() function
        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return Dictionary keyed by reaction ID to lists of complexes performing them,
            dictionary keyed by complex ID to list of reactions they perform.
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # The API was recently changed to use model IDs and to not use the
    # reactions_to_complexes but use the ER model instead, so the reactions
    # are paged out as entities here. A page shorter than count is the last one.
    rxndict = dict()
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Reaction(offset, count, ['id'])
        rxndict.update(page)
        offset += count
        if len(page) < count:
            break
    rxns = getFieldFromEntity(rxndict, "id")

    cplxdict = cdmi_entity.get_relationship_IsStepOf(rxns, [], [], ["id"])
    rxnlist = getFieldFromRelationship(cplxdict, "from_link", "rel")
    cplxlist = getFieldFromRelationship(cplxdict, "id", "to")

    rxnToComplex = {}
    complexToRxn = {}
    for index, rxnId in enumerate(rxnlist):
        cplxId = cplxlist[index]
        rxnToComplex.setdefault(rxnId, []).append(cplxId)
        complexToRxn.setdefault(cplxId, []).append(rxnId)
    return rxnToComplex, complexToRxn
def filterFidsByOtus(fidlist, otus, config):
    ''' Keep only the feature IDs whose owning organism is one of the OTUs.

        Obsolete (I think this isn't used any more).

        @param fidlist List of feature IDs
        @param otus Collection of representative organism IDs (OTUs)
        @param config Dictionary of configuration variables
        @return List of the feature IDs found in an OTU
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each fid
    # If this fails to find an organism we don't want it anyway...
    orgdict = cdmi_entity.get_relationship_IsOwnedBy(fidlist, [], [], ["id"])
    flist = getFieldFromRelationship(orgdict, "from_link", "rel")
    olist = getFieldFromRelationship(orgdict, "id", "to")

    return [flist[index] for index, organism in enumerate(olist) if organism in otus]
def addRxnProbabilitiesToBiochemistryJson(reaction_probability_file, biochemistry_json_file, output_file):
    ''' Add reaction probabilities to every reaction in a biochemistry JSON file.

        The biochemistry JSON is searched for reaction UUIDs. A dictionary is
        created (using the alias set for 'ModelSEED' reactions) from reaction
        UUID to ModelSEED ID, and another (from the CDMI) from ModelSEED ID to
        KBase reaction ID.

        Every reaction in the output gets these fields:
          probability  - value read from the probability file, or 0 when no
                         value is available for the reaction
          complexinfo  - complex information string from the probability file,
                         or "" when not available
          GPR          - proposed GPR string from the probability file, or ""
          reactionHasComplex, oneComplexHasRepresentativeRoles and
          allComplexesHaveRepresentativeRoles - flags that tell a probability
                         of 0 caused by missing data apart from one caused by
                         lack of genetic evidence (see the comments in the code)

        Note that a probability taken from the file is stored as the string
        read from the file while the default is the integer 0.

        The process exits with status 2 if output_file already exists.

        @param reaction_probability_file Path to tab-delimited file; column 1 is
            the KBase reaction ID, column 2 the probability, column 4 the
            complex information and column 5 the GPR
        @param biochemistry_json_file Path to biochemistry JSON file to read
        @param output_file Path to modified biochemistry JSON file to write
        @return Nothing
    '''

    if os.path.exists(output_file):
        sys.stderr.write("Modified biochemistry JSON file %s already exists!\n" %(output_file))
        sys.exit(2)

    # KBase ID --> (probability, complex_info, GPR)
    kbaseIdToInfo = {}
    with open(reaction_probability_file, "r") as handle:
        for line in handle:
            spl = line.strip("\r\n").split("\t")
            kbaseIdToInfo[spl[0]] = ( spl[1], spl[3], spl[4] )

    # Model ID --> Kbase ID
    cdmi_entity = CDMI_EntityAPI(CDMI_URL)
    rxniddict = cdmi_entity.all_entities_Reaction(MINN, COUNT, ["source_id"])
    kbaseIds = getFieldFromEntity(rxniddict, "id")
    modelIds = getFieldFromEntity(rxniddict, "source_id")
    modelToKbase = dict(zip(modelIds, kbaseIds))

    # Different biochemistries will (I think?) have different UUIDs
    # for all the reactions in them... but they will have links set up
    # to the model IDs. At least, I HOPE so.
    with open(biochemistry_json_file, "r") as handle:
        resp = json.load(handle)

    # UUID --> Model ID
    uuidToModelId = {}
    for aliasSet in resp["aliasSets"]:
        if aliasSet["source"] == "ModelSEED" and aliasSet["attribute"] == "reactions":
            aliasDict = aliasSet["aliases"]
            for modelId in aliasDict:
                # aliasDict is really a dict from reaction ID to a LIST of UUIDs,
                # implying that more than one UUID can go with the same reaction
                # ID. If that happens all of them are assigned the probability
                # of that reaction.
                for uuid in aliasDict[modelId]:
                    uuidToModelId[uuid] = modelId
            # We found the one we need, no need to go through the rest of them...
            break

    # Tags in the complex information that mean a complex had representatives
    # for its roles:
    #   CPLX_FULL     [ok]
    #   CPLX_NOTTHERE [ok]
    #   CPLX_PARTIAL  [sort of ok - treated as OK for this purpose]
    # while CPLX_NOREPS [bad] means the roles attached to a complex had no
    # representatives.
    tagsWithReps = ("CPLX_FULL", "CPLX_NOTTHERE", "CPLX_PARTIAL")

    # Now iterate over all of the reactions and add the appropriate
    # probabilities to each of them.
    for rxn in resp["reactions"]:
        myProbability = 0
        myComplexInfo = ""
        myGPR = ""

        # These flags indicate database issues that could bias the probabilities.
        #
        # If all of them are False or if only reactionHasComplex is True, the
        # probability will be 0 but it is due to missing data.
        #
        # If only allComplexesHaveRepresentativeRoles is False, there is some
        # missing data but at least one complex attached to the reaction had a
        # representative that could be used to calculate a probability.
        reactionHasComplex = False
        oneComplexHasRepresentativeRoles = False
        allComplexesHaveRepresentativeRoles = False

        # If the database versions are consistent the UUID and the model ID
        # should always be found. The KBase ID is only found if there are
        # complexes associated with the reaction.
        info = None
        myuuid = rxn["uuid"]
        if myuuid in uuidToModelId:
            modelId = uuidToModelId[myuuid]
            if modelId in modelToKbase:
                info = kbaseIdToInfo.get(modelToKbase[modelId])

        if info is not None:
            reactionHasComplex = True
            myProbability, myComplexInfo, myGPR = info
            someComplexLacksReps = "CPLX_NOREPS" in myComplexInfo
            someComplexHasReps = any(tag in myComplexInfo for tag in tagsWithReps)
            # Without any CPLX_NOREPS every complex has representatives. With
            # one, at least one other complex must carry one of the tags above.
            allComplexesHaveRepresentativeRoles = not someComplexLacksReps
            oneComplexHasRepresentativeRoles = (not someComplexLacksReps) or someComplexHasReps

        rxn["probability"] = myProbability
        rxn["complexinfo"] = myComplexInfo
        rxn["GPR"] = myGPR
        rxn["reactionHasComplex"] = reactionHasComplex
        rxn["oneComplexHasRepresentativeRoles"] = oneComplexHasRepresentativeRoles
        rxn["allComplexesHaveRepresentativeRoles"] = allComplexesHaveRepresentativeRoles

    # Close the output explicitly so the JSON is flushed to disk on return.
    with open(output_file, "w") as handle:
        json.dump(resp, handle, indent=4)
def go_anno_net(meth, net_obj_id=None):
    """Add Gene Ontology annotation to network gene nodes

    :param net_obj_id: Network object id
    :type net_obj_id: kbtypes.KBaseNetworks.Network
    :return: Workspace id
    :rtype: kbtypes.Unicode
    :output_widget: ValueListWidget
    """
    # NOTE(review): the docstring above is left untouched because its fields
    # look like they are consumed by tooling - confirm before rewording it.
    meth.stages = 5

    meth.advance("Prepare annotation service")
    # Load from and save to the current workspace.
    wsid = meth.workspace_id
    ws_save_id = meth.workspace_id

    meth.advance("Load network object")
    wsd = Workspace2(token=meth.token, wsid=wsid)
    oc = Ontology(url=URLS.ontology)
    net_object = wsd.get(net_obj_id)
    nc = Node(net_object['nodes'], net_object['edges'])
    idc = IDServerAPI(URLS.ids)
    cdmic = CDMI_API(URLS.cdmi)

    # Keep IDs naming a CDS or a locus, plus any other ID that does not match
    # one of the excluded patterns ('clst', 'cluster' prefix, 'ps.').
    gids = [
        gid for gid in sorted(nc.ugids.keys())
        if 'CDS' in gid or 'locus' in gid or (
            'clst' not in gid and not gid.startswith('cluster') and 'ps.' not in gid)
    ]

    meth.advance("Get relationships from central data model")
    eids = kb_id2ext_id(idc, gids, 100)
    gids2cds = ids2cds(gids)
    # Materialize the values so concrete lists are handed to the services.
    cgids = list(gids2cds.values())
    cds2l = cds2locus(cgids)
    # ignore original locus ids in gids
    lgids = list(cds2l.values())

    meth.advance("Annotate ({:d} nodes, {:d} edges)".format(
        len(net_object['nodes']), len(net_object['edges'])))
    ots = oc.get_goidlist(cgids, [], [])
    oan = ()
    funcs = cdmic.fids_to_functions(lgids)
    funcs_org = cdmic.fids_to_functions(cgids)
    annotate_nodes(net_object, ots=ots, oan=oan, funcs=funcs, funcs_org=funcs_org, eids=eids,
                   gids2cds=gids2cds, cds2l=cds2l)

    meth.advance("Save annotated object to workspace {}".format(ws_save_id))
    annotated_name = net_obj_id + ".ano"
    obj = {
        'type': 'KBaseNetworks.Network',
        'data': net_object,
        'name': annotated_name,
        'meta': {
            'original': net_obj_id
        }
    }
    wsd.save_objects({'workspace': ws_save_id, 'objects': [obj]})
    return _workspace_output(annotated_name)
def filterFidsByOtusOptimized(featureIdList, rolesToFids, otuRepsToMembers, config):
    ''' Filter feature IDs by OTU (optimized version).

        To minimize the amount of redundancy in the list of target proteins, filter
        the feature IDs so there is at most one protein from each OTU for each
        functional role.

        @param featureIdList List of unfiltered feature IDs
        @param rolesToFids Dictionary keyed by role of list of feature IDs
        @param otuRepsToMembers Dictionary keyed by OTU representative to list of OTU members
        @param config Dictionary of configuration variables
        @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
            of list of feature IDs
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each feature ID.
    # If this fails to find an organism we don't want it anyway...
    fidToOrganism = dict() # Map feature IDs to organisms

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 100000
    while start < len(featureIdList):
        try:
            ownedBy = cdmi_entity.get_relationship_IsOwnedBy(featureIdList[start:start + increment], [], ['from_link'], ['id'])
        except HTTPError as e:
            # Halve the sub-list size and retry the same position. Floor
            # division keeps the size an integer so it stays usable as a
            # slice bound.
            # NOTE: once the size is 1 the same request is simply retried
            # until it succeeds.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        # Extract the feature ID from from_link and the organism from id.
        fidList = getFieldFromRelationship(ownedBy, "from_link", "rel")
        organismList = getFieldFromRelationship(ownedBy, "id", "to")
        fidToOrganism.update(zip(fidList, organismList))

        # Move to next sub-list
        start += increment

    # Add all possible keys to the dictionaries and initialize the value.
    # Then we don't have to check if the key exists in the main loop below.
    keptFidsToRoles = dict((fid, list()) for fid in featureIdList)
    keptRolesToFids = dict((role, list()) for role in rolesToFids)

    # Find the feature ID (protein) from each OTU for each functional role.
    otuCounter = 0
    for otuRepresentative in otuRepsToMembers:
        # This loop takes a very long time so print a message every so often
        # to track progress.
        otuCounter += 1
        if otuCounter % 10 == 0:
            sys.stderr.write('Processed %d OTUs at %s\n' %(otuCounter, now()))

        # Build the membership set once per OTU instead of scanning the
        # member list for every feature ID of every role.
        members = set(otuRepsToMembers[otuRepresentative])

        # Check every functional role.
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrganism:
                    continue
                organism = fidToOrganism[fid]

                # If the organism is the representative we keep it and go to the next role
                if organism == otuRepresentative:
                    keepFid = fid
                    break

                # Otherwise remember a member of the OTU (without paying attention
                # to WHICH one) in case there are no examples of the role in the
                # representative organism, but keep looking for the representative.
                if organism in members:
                    keepFid = fid

            # Add to the dictionaries if we are keeping the feature ID.
            if keepFid is not None:
                keptFidsToRoles[keepFid].append(role)
                keptRolesToFids[role].append(keepFid)

    # Drop the entries that never received a value.
    keptFidsToRoles = dict((fid, roles) for fid, roles in keptFidsToRoles.items() if roles)
    keptRolesToFids = dict((role, fids) for role, fids in keptRolesToFids.items() if fids)

    return keptFidsToRoles, keptRolesToFids
def filterFidsByOtusBetter(fidsToRoles, rolesToFids, oturepsToMembers, config):
    ''' Attempt to do a more intelligent filtering of FIDs by OTU.

        Given all FIDs attached to a role in the unfiltered set we do the following:

        Initialize KEEP
        For each OTU and each role:
           If role is found in the representative, add to KEEP and continue;
           Otherwise, iterate over other genomes.
               If role is found in one other genome, add to KEEP and continue;

        This process should make our calculation less sensitive to the choice of OTUs...

        @param fidsToRoles Dictionary keyed by feature ID (only the keys are used)
        @param rolesToFids Dictionary keyed by role of list of feature IDs
        @param oturepsToMembers Dictionary keyed by OTU representative to list of OTU members
        @param config Dictionary of configuration variables
        @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
            of list of feature IDs, list of roles for which no feature ID was kept
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each fid
    # If this fails to find an organism we don't want it anyway...
    # A real list is needed because the feature IDs are sliced below.
    fidlist = list(fidsToRoles)
    orgdict = []

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 5000
    while start < len(fidlist):
        try:
            od = cdmi_entity.get_relationship_IsOwnedBy(fidlist[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # Halve the sub-list size and retry the same position. Floor
            # division keeps the size an integer so it stays usable as a
            # slice bound.
            # NOTE: once the size is 1 the same request is simply retried
            # until it succeeds.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        orgdict.extend(od)

        # Move to next sub-list
        start += increment

    ownedFids = getFieldFromRelationship(orgdict, "from_link", "rel")
    orglist = getFieldFromRelationship(orgdict, "id", "to")
    fidToOrg = dict(zip(ownedFids, orglist))

    keptFidsToRoles = {}
    keptRolesToFids = {}

    # For each OTU
    for oturep in oturepsToMembers:
        # Build the membership set once per OTU instead of scanning the
        # member list for every feature ID of every role.
        members = set(oturepsToMembers[oturep])

        # for each role
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrg:
                    continue
                org = fidToOrg[fid]

                # If the organism is the representative we keep it and go to the next role
                if org == oturep:
                    keepFid = fid
                    break

                # Otherwise remember a member of the OTU (without paying attention
                # to WHICH one) in case there are no examples of the role in the
                # representative organism, but keep looking for the representative.
                if org in members:
                    keepFid = fid

            if keepFid is not None:
                keptFidsToRoles.setdefault(keepFid, []).append(role)
                keptRolesToFids.setdefault(role, []).append(keepFid)

    # If the OTUs are comprehensive this should be empty.
    missingRoles = list(set(rolesToFids) - set(keptRolesToFids))

    return keptFidsToRoles, keptRolesToFids, missingRoles