def getRawResults(params):
    '''
    params: a 4-tuple of roundup params like (qdb, sdb, div, evalue).
    Looks up the orthologs for params in the current web release and formats
    them for download.
    returns: a string with one newline-terminated line per ortholog, each line
    holding the external query sequence id, the external subject sequence id,
    and the distance, separated by tabs.  Returns None if there are no
    orthologs for params.
    '''
    queryDb, subjectDb, divergence, evalueThreshold = params
    dbPair = roundup_common.makePair(queryDb, subjectDb)

    # fetch the (query_sequence_id, subject_sequence_id, distance) rows.
    orthologs = roundup_db.getOrthologs(release=webconfig.CURRENT_RELEASE,
                                        qdb=dbPair[0], sdb=dbPair[1],
                                        divergence=divergence, evalue=evalueThreshold)

    # collect every distinct sequence id so external ids can be looked up in one call.
    uniqueIds = set()
    for row in orthologs:
        uniqueIds.update((row[0], row[1]))
    sequenceData = roundup_db.getSequenceIdToSequenceDataMap(
        release=webconfig.CURRENT_RELEASE, sequenceIds=list(uniqueIds))

    # no orthologs means no results, which callers see as None (not an empty string).
    if not orthologs:
        return None

    # one tab-separated line per ortholog, using external sequence ids.
    extIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    lines = []
    for qid, sid, dist in orthologs:
        lines.append('{}\t{}\t{}\n'.format(sequenceData[qid][extIdKey],
                                           sequenceData[sid][extIdKey],
                                           dist))
    return ''.join(lines)
def getOrthData(params):
    '''
    params: a 4-tuple of roundup params like (qdb, sdb, div, evalue).
    Looks up the orthologs for params in the current web release and maps the
    sequence ids of each ortholog to external sequence ids.
    returns: a tuple (params, orthologs), where orthologs is a list of
    (external query sequence id, external subject sequence id, distance)
    tuples with the distance converted to a string.  The list is empty if
    there are no orthologs for params.
    '''
    queryDb, subjectDb, divergence, evalueThreshold = params
    dbPair = roundup_common.makePair(queryDb, subjectDb)

    # fetch the (query_sequence_id, subject_sequence_id, distance) rows.
    dbRows = roundup_db.getOrthologs(release=webconfig.CURRENT_RELEASE,
                                     qdb=dbPair[0], sdb=dbPair[1],
                                     divergence=divergence, evalue=evalueThreshold)

    # collect every distinct sequence id so external ids can be looked up in one call.
    uniqueIds = set()
    for row in dbRows:
        uniqueIds.update((row[0], row[1]))
    sequenceData = roundup_db.getSequenceIdToSequenceDataMap(
        release=webconfig.CURRENT_RELEASE, sequenceIds=list(uniqueIds))

    # swap the database sequence ids for external ids and stringify the distance.
    extIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    orthologs = []
    for qid, sid, dist in dbRows:
        orthologs.append((sequenceData[qid][extIdKey],
                          sequenceData[sid][extIdKey],
                          str(dist)))
    return (params, orthologs)
def doOrthologyQuery(query_desc=None, tc_only=False, db_cursor_read_buffer_size=DEFAULT_DB_CURSOR_READ_BUFFER_SIZE,
                     genome=None, limit_genomes=None, genomes=None, seq_ids=None, divergence=None, evalue=None,
                     go_term=False, gene_name=False, outputPath=None, sortGenomes=True,
                     distance_lower_limit=None, distance_upper_limit=None, release=None, dataset=None, **keywords):
    '''
    query_desc: string describing the query being run.  used by the web to let the user know what query was run
    to generate these results.
    tc_only: if true only transitively closed clusters are returned: clusters that have an edge between every
    pair of nodes and that contain a sequence from every genome in the results.
    db_cursor_read_buffer_size: not used by this function.
    genome: get orthologs with a sequence from this genome.  if it appears in the results, it is the first column.
    limit_genomes: get orthologs with a sequence in a genome from limit_genomes.
    genomes: get orthologs where both sequences are from genomes.
    seq_ids: a list of external_sequence_ids/accession numbers/GIs.  if not empty, it is used to restrict
    orthologs to only those that have either query_id or subject_id in seq_ids.  the items must be hashable.
    divergence: get orthologs calculated with this divergence threshold.
    evalue: get orthologs calculated with this evalue threshold.
    go_term: if true, a mapping of seq ids to go terms is returned for the seq ids in the orthology results.
    gene_name: if true, a mapping of seq ids to gene names is returned for the seq ids in the orthology results.
    outputPath: if given, the result is written to this path with util.dumpObject and None is returned.
    sortGenomes: if true, the genome columns are sorted.
    distance_lower_limit, distance_upper_limit: passed to makeLowerAndUpperLimitFilterFuncs to build the two
    predicates that every ortholog must satisfy to be included.
    release: the release to query.  also stored in the result.
    dataset: stored in the result.
    keywords: ignored.  here for historical compatibility reasons.

    This function queries the database to get a list of orthologs and possibly gene names and go terms associated
    with those orthologs.  The orthologs are grouped into clusters (connected subgraphs).

    returns: a dict with the keys 'query_desc', 'release', 'dataset', 'type', 'headers', 'rows', 'orthologs',
    'divergence', 'evalue', 'seq_id_to_data_map', and 'genome_id_to_genome_map', plus 'has_gene_names' if
    gene_name is true and 'has_go_terms' and 'term_map' if go_term is true.  Returns None if outputPath is given.
    '''
    extIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    genomeIdKey = roundup_common.GENOME_ID_KEY

    tableDesc = {'query_desc': query_desc, 'release': release, 'dataset': dataset}
    distanceLowerLimitFilter, distanceUpperLimitFilter = makeLowerAndUpperLimitFilterFuncs(distance_lower_limit,
                                                                                           distance_upper_limit)

    # build the lookup set once so that each membership test below is O(1), not a scan of the list.
    seqIdFilter = frozenset(seq_ids) if seq_ids else None

    with roundup_db.connCM() as conn:
        pairs = makePairsForGenomeParams(genome, limit_genomes, genomes)
        # orthologsLists is a list of lists of (query_sequence_id, subject_sequence_id, distance) tuples
        orthologsLists = [roundup_db.getOrthologs(release, qdb=pair[0], sdb=pair[1], divergence=divergence,
                                                  evalue=evalue, conn=conn)
                          for pair in pairs]

        def filteredOrthologs():
            # yield, in order, the orthologs of every pair that pass both distance filters.
            for ortholog in itertools.chain.from_iterable(orthologsLists):
                if distanceLowerLimitFilter(ortholog) and distanceUpperLimitFilter(ortholog):
                    yield ortholog

        # sequence ids are collected from every distance-filtered ortholog, before the seq_ids restriction,
        # so the sequence data map and the genome columns cover more than just the clustered orthologs.
        sequenceIdSet = set()
        for ortholog in filteredOrthologs():
            sequenceIdSet.add(ortholog[0])
            sequenceIdSet.add(ortholog[1])
        sequenceIds = list(sequenceIdSet)

        # get sequence data map from sequenceId to external_id, genome_id, gene_name.
        sequenceIdToSequenceDataMap = roundup_db.getSequenceIdToSequenceDataMap(release, sequenceIds, conn=conn)

        # cluster orthologs, limiting by seq_ids
        clusterer = clustering.EdgeClusterer(storeEdges=True)
        for ortholog in filteredOrthologs():
            # skip orthologs where neither the query nor the subject external id is in seq_ids
            if (seqIdFilter is not None
                    and sequenceIdToSequenceDataMap[ortholog[0]][extIdKey] not in seqIdFilter
                    and sequenceIdToSequenceDataMap[ortholog[1]][extIdKey] not in seqIdFilter):
                continue
            clusterer.cluster(ortholog)

        # get genome database ids and the genome for each id
        genomeIds = list({sequenceIdToSequenceDataMap[seqId][genomeIdKey] for seqId in sequenceIds})
        resultGenomes = [roundup_db.getGenomeForId(release, id=genomeId, conn=conn) for genomeId in genomeIds]

        # map genome to genomeId and back.  built before resultGenomes is reordered, while the lists are parallel.
        genomeToGenomeId = dict(zip(resultGenomes, genomeIds))
        genomeIdToGenome = dict(zip(genomeIds, resultGenomes))

        # sorted genomes, with genome keyword (if any) at front.
        if sortGenomes:
            resultGenomes.sort()
        if genome and genome in resultGenomes:
            resultGenomes.remove(genome)
            resultGenomes.insert(0, genome)

        # map genomeId to column in result rows
        genomeIdToCol = {genomeToGenomeId[name]: col for col, name in enumerate(resultGenomes)}

        # add each cluster to the cluster table
        # each row contains the genes for each genome in the correct column and the avg distance of the cluster edges.
        clusterTable = []
        clusterOrthologsList = []
        headerRow = resultGenomes + ['Average Evolutionary Distance']
        for clusterId, cluster in clusterer.clusterIdToNodes.items():
            # NOTE(review): the edges are recorded before the tc_only check, so 'orthologs' also contains the
            # clusters that tc_only leaves out of 'rows'.  kept as is; confirm whether consumers rely on it.
            clusterOrthologsList.append(clusterer.clusterIdToEdges[clusterId])
            numEdges = clusterer.clusterIdToNumEdges[clusterId]

            # if tc_only, do not report non-transitively closed clusters or cluster-classes.
            if tc_only:
                numNodes = len(cluster)
                numEdgesIfComplete = (numNodes * (numNodes - 1)) // 2
                numClassesInCluster = len({sequenceIdToSequenceDataMap[gene][genomeIdKey] for gene in cluster})
                if numEdges < numEdgesIfComplete or numClassesInCluster != len(genomeIds):
                    continue

            # initialize lists for genes in each genome belonging to cluster
            clusterRow = [[] for _ in genomeIds]
            # tack on avg dist to end of row.
            avgEdgeDist = clusterer.clusterIdToSumDistances[clusterId] / float(numEdges)
            clusterRow.append('%.3f' % avgEdgeDist)
            try:
                for gene in cluster:
                    genomeId = sequenceIdToSequenceDataMap[gene][genomeIdKey]
                    clusterRow[genomeIdToCol[genomeId]].append(gene)
            except Exception:
                # log the state needed to diagnose a bad lookup, then let the error propagate.
                logging.debug('gene: %s', gene)
                logging.debug('genomes: %s', resultGenomes)
                logging.debug('genomeIds: %s', genomeIds)
                logging.debug('sequenceIdToSequenceDataMap: %s', sequenceIdToSequenceDataMap)
                logging.debug('genomeIdToCol: %s', genomeIdToCol)
                raise
            clusterTable.append(clusterRow)

        tableDesc['type'] = 'clusters'
        tableDesc['headers'] = headerRow
        tableDesc['rows'] = clusterTable
        tableDesc['orthologs'] = clusterOrthologsList
        tableDesc['divergence'] = divergence
        tableDesc['evalue'] = evalue

        # copy only the fields that were asked for into the per-sequence data of the result.
        seqIdDataMap = {}
        for seqId, seqData in sequenceIdToSequenceDataMap.items():
            seqIdDataMap[seqId] = {extIdKey: seqData[extIdKey], genomeIdKey: seqData[genomeIdKey]}
        if gene_name:
            tableDesc['has_gene_names'] = True
            for seqId, seqData in sequenceIdToSequenceDataMap.items():
                seqIdDataMap[seqId][roundup_common.GENE_NAME_KEY] = seqData[roundup_common.GENE_NAME_KEY]
        if go_term:
            tableDesc['has_go_terms'] = True
            (sequenceIdToTermsMap, termMap) = roundup_db.getSequenceIdToTermsMap(release, sequenceIds, conn=conn)
            for seqId in sequenceIdToSequenceDataMap:
                # sequences without terms get an empty list.
                seqIdDataMap[seqId][roundup_common.TERMS_KEY] = sequenceIdToTermsMap.get(seqId, [])
            tableDesc['term_map'] = termMap
        tableDesc['seq_id_to_data_map'] = seqIdDataMap
        tableDesc['genome_id_to_genome_map'] = genomeIdToGenome

    if outputPath:
        util.dumpObject(tableDesc, outputPath)
        return None
    return tableDesc