def get_consistency(p, t, phyid, seedid, species_list, n, processed_seedids):
    """Return consistency of given family across collateral trees from given phylome.

    Every leaf of `t` that shares the species code of `seedid` is used as the
    seed of a collateral tree (fetched through `process_tree`).  Each collateral
    tree contributes 1 to the signal list when `process_tree` returns a truthy
    `groups` value and 0 otherwise.  Seed-species leaves found in collateral
    trees are queued as well, so the walk covers the whole family.

    Args:
        p: connection object, only forwarded to `process_tree`.
        t: tree of the seed protein; must provide `get_leaf_names()`.
        phyid: phylome identifier, forwarded to `process_tree`.
        seedid: identifier of the seed protein whose tree `t` is.
        species_list: forwarded to `process_tree`.
        n: forwarded to `process_tree`.
        processed_seedids: set updated IN PLACE with every visited protid.

    Returns:
        Tuple `(cs, processed_seedids)`; `cs` is the mean of the signals
        (the seed tree itself always counts as 1).
    """
    seedsp = _get_spcode(seedid)

    def _seed_species_leaves(tree):
        # a list comprehension (not filter) so the result is a real list
        # on Python 3 as well
        return [name for name in tree.get_leaf_names()
                if _get_spcode(name) == seedsp]

    # current tree is ok
    signals = [1]
    # every protid ever put on the work list; guarantees each one is
    # processed - and scored - exactly once, even if many collateral trees
    # list it among their leaves
    queued = set()
    pending = []
    for name in _seed_species_leaves(t):
        if name not in queued:
            queued.add(name)
            pending.append(name)

    while pending:
        protid = pending.pop()
        # add to processed seedids, so it's visited only once by the caller
        processed_seedids.add(protid)
        # the seed tree is already accounted for by the initial signal
        if protid == seedid:
            continue
        groups, tree = process_tree(p, phyid, n, species_list, protid)
        # 1 if correct groups in collateral tree, 0 if inconsistent
        signals.append(1 if groups else 0)
        if not tree:
            continue
        # collateral trees can have different gene content, so keep
        # extending the work list with newly discovered paralogs
        for name in _seed_species_leaves(tree):
            if name not in queued:
                queued.add(name)
                pending.append(name)

    cs = np.mean(signals)
    return cs, processed_seedids
def get_orthologs(phyprot, phyid, p, code2score=None):
    """Return orthologs for given protid.

    Args:
        phyprot: protein identifier to look up.
        phyid: phylome identifier.
        p: connection object forwarded to `get_seed_or_collateral_tree`.
        code2score: optional dict mapping species code -> list of levels;
            updated IN PLACE when given.  A fresh dict is created per call
            when omitted (previously a single default dict was shared by
            all calls).

    Returns:
        Tuple `(orthologs, code2score, bestSeedid)`; `orthologs` is a set
        that always contains `phyprot` itself.
    """
    if code2score is None:
        code2score = {}
    orthologs = set((phyprot, ))

    # get tree
    t, bestSeedid = get_seed_or_collateral_tree(phyprot, phyid, p)
    if not t:
        info = "[WARNING] No tree for %s in phylome %s has been found.\n"
        sys.stderr.write(info % (phyprot, phyid))
        return orthologs, code2score, bestSeedid

    # root tree
    if phyid in ROOTED_PHYLOMES:
        outgroup = t.get_farthest_oldest_leaf(ROOTED_PHYLOMES[phyid])
        t.set_outgroup(outgroup)
    else:
        t.set_outgroup(t.get_midpoint_outgroup())

    # get phyprot node
    leaf = t.get_leaves_by_name(phyprot)[0]

    # walk the speciation ('S') events of the leaf; `level` is the 1-based
    # rank of the event in the order returned by get_my_evol_events()
    level = 0
    for event in leaf.get_my_evol_events():
        if event.etype != 'S':
            continue
        level += 1
        # orthologs are the sequences on the opposite side of the event
        if phyprot in event.in_seqs:
            seqs = event.out_seqs
        else:
            seqs = event.in_seqs
        for o in seqs:
            orthologs.add(o)
            code = _get_spcode(o)
            # NOTE(review): a species code that is not yet a key gets the
            # level stored twice (initial list + append below); kept as in
            # the original - confirm whether that is intended.
            if code not in code2score:
                code2score[code] = [level, ]
            code2score[code].append(level)
    return orthologs, code2score, bestSeedid
def get_consistency(p, t, phyid, seedid, species_list, n, processed_seedids):
    """Return consistency of given family across collateral trees from given phylome.

    The seed tree `t` counts as one consistent signal.  Each other leaf of
    the seed species is then used to fetch a collateral tree through
    `process_tree`; a truthy `groups` result adds 1, anything else adds 0.
    Seed-species leaves of the collateral trees are visited too.

    Args:
        p: connection object, only passed on to `process_tree`.
        t: seed tree; must provide `get_leaf_names()`.
        phyid: phylome identifier, passed on to `process_tree`.
        seedid: identifier of the seed protein.
        species_list: passed on to `process_tree`.
        n: passed on to `process_tree`.
        processed_seedids: set that is updated IN PLACE with visited protids.

    Returns:
        Tuple `(cs, processed_seedids)` where `cs` is `np.mean` of the
        collected signals.
    """
    # current tree is ok
    signals = [1]
    processed = set()
    # get seed sp paralogs (list comprehension: also a list on Python 3)
    seedsp = _get_spcode(seedid)
    protids = [x for x in t.get_leaf_names() if _get_spcode(x) == seedsp]
    # process all protids - with iterative update of protids
    while protids:
        protid = protids.pop()
        # the same protid may have been queued by several trees before it
        # was handled; score it only the first time it is popped
        if protid in processed:
            continue
        processed.add(protid)
        # add to processed seedids, so it's visited only once
        processed_seedids.add(protid)
        # skip current tree
        if protid == seedid:
            continue
        # get list of groups A1-B1 A2-B2 and so on
        groups, collateral = process_tree(p, phyid, n, species_list, protid)
        if groups:
            # correct groups in collateral tree
            signals.append(1)
        else:
            # collateral tree is inconsistent with seed tree
            signals.append(0)
        # skip if no tree
        if not collateral:
            continue
        # update protids list - important, as gene content differs between
        # trees of diverged families
        for _protid in collateral.get_leaf_names():
            if _get_spcode(_protid) == seedsp and _protid not in processed:
                protids.append(_protid)
    # calculate cs
    cs = np.mean(signals)
    return cs, processed_seedids
def get_orthologs(t, seedname, phyid, species_list):
    """Collect orthologs of `seedname` from a rooted tree.

    Evolutionary events are obtained with `get_evolEvents`; when none are
    returned the result is an empty list.  Otherwise the speciation ('S')
    events of the seed node are scanned in order and every `out_seqs`
    member whose species code is in `species_list` is collected.  Scanning
    stops after the first event at whose end the number of distinct species
    seen (seed species included) equals `len(species_list)`.

    Returns:
        List starting with `seedname`, followed by the collected sequences.
    """
    evol_events, seed_node = get_evolEvents(t, phyid, seedname)
    if not evol_events:
        return []

    wanted = len(species_list)
    found = [seedname]
    seen_codes = [_get_spcode(seedname)]
    for event in seed_node.get_my_evol_events():
        # only speciation events define orthology
        if event.etype != 'S':
            continue
        for seq in event.out_seqs:
            code = _get_spcode(seq)
            if code not in species_list:
                continue
            found.append(seq)
            if code not in seen_codes:
                seen_codes.append(code)
        # checked once per event, so an event is always consumed completely
        if len(seen_codes) == wanted:
            break
    return found
def get_orthologs(phyprot, phyid, p, code2score=None):
    """Return orthologs for given protid.

    The tree is fetched with `get_seed_or_collateral_tree`, rooted (using
    the `ROOTED_PHYLOMES` entry for `phyid` when present, the midpoint
    outgroup otherwise) and the speciation events of the `phyprot` leaf are
    scanned for sequences on the other side of each event.

    Args:
        phyprot: protein identifier to look up.
        phyid: phylome identifier.
        p: connection object passed to `get_seed_or_collateral_tree`.
        code2score: optional dict of species code -> list of event ranks.
            It is modified IN PLACE when supplied; when left out a new dict
            is created for this call instead of reusing one shared default.

    Returns:
        Tuple `(orthologs, code2score, bestSeedid)`.  `orthologs` is a set
        containing at least `phyprot`.
    """
    # never use a dict literal as default: it would be shared between calls
    if code2score is None:
        code2score = {}
    orthologs = set((phyprot, ))

    t, bestSeedid = get_seed_or_collateral_tree(phyprot, phyid, p)
    if not t:
        sys.stderr.write(
            "[WARNING] No tree for %s in phylome %s has been found.\n"
            % (phyprot, phyid))
        return orthologs, code2score, bestSeedid

    # root tree
    if phyid in ROOTED_PHYLOMES:
        t.set_outgroup(t.get_farthest_oldest_leaf(ROOTED_PHYLOMES[phyid]))
    else:
        t.set_outgroup(t.get_midpoint_outgroup())

    node = t.get_leaves_by_name(phyprot)[0]
    speciations = [e for e in node.get_my_evol_events() if e.etype == 'S']
    # w is the 1-based rank of the speciation event in the returned order
    for w, s in enumerate(speciations, 1):
        seqs = s.out_seqs if phyprot in s.in_seqs else s.in_seqs
        for o in seqs:
            orthologs.add(o)
            spcode = _get_spcode(o)
            # NOTE(review): for a species code missing from the dict the
            # rank ends up stored twice; preserved from the original code,
            # confirm whether it is deliberate.
            if spcode not in code2score:
                code2score[spcode] = [w, ]
            code2score[spcode].append(w)
    return orthologs, code2score, bestSeedid
def generate_orthogroups(orthologs, species_list, collpase_inparalogs=True):
    """Combine orthologs into orthogroups following the order of `species_list`.

    Orthologs are first bucketed by species code (`_get_spcode`).  Species
    are then visited in `species_list` order; species without orthologs are
    skipped.

    * The first species that has orthologs opens one group per ortholog.
    * With `collpase_inparalogs` true, all orthologs of every later species
      are appended to the LAST existing group.
    * Otherwise the existing groups are replicated once per ortholog of the
      species and each ortholog is appended to one replica block, so each
      resulting group receives exactly one sequence of that species.

    Args:
        orthologs: iterable of sequence identifiers.
        species_list: species codes, in the order they should be combined.
        collpase_inparalogs: see above (parameter name kept as spelled by
            existing callers).

    Returns:
        List of orthogroups, each a list of sequence identifiers.
    """
    # bucket orthologs by species, keeping input order inside each bucket
    sp2orth = {}
    for o in orthologs:
        sp2orth.setdefault(_get_spcode(o), []).append(o)

    orthogroups = []
    for sp in species_list:
        if sp not in sp2orth:
            continue
        members = sp2orth[sp]
        if not orthogroups:
            # populate orthogroups with first orthologs
            for o in members:
                orthogroups.append([o])
        elif collpase_inparalogs:
            # collapse in-paralogs into the last group
            for o in members:
                orthogroups[-1].append(o)
        else:
            # multiply groups when in-paralogs found: one replica block per
            # member of this species
            originals = list(orthogroups)
            for _ in range(len(members) - 1):
                for orthogroup in originals:
                    orthogroups.append(list(orthogroup))
            # floor division keeps this an int on Python 3 as well
            block_size = len(orthogroups) // len(members)
            index = 0
            for o in members:
                for _ in range(block_size):
                    try:
                        orthogroups[index].append(o)
                    except IndexError:
                        print("Error! index: %s\n%s\n%s" % (
                            index, orthologs, orthogroups))
                    # make sure only one in-paralog of the species is added
                    # to each orthogroup
                    index += 1
    return orthogroups
def generate_orthogroups(orthologs, species_list, collpase_inparalogs=True):
    """Build orthogroups from `orthologs`, one species of `species_list` at a time.

    Sequences are grouped by the species code returned by `_get_spcode`.
    Species are processed in `species_list` order and species with no
    sequences are ignored:

    * first species with sequences: each sequence starts its own group;
    * later species, `collpase_inparalogs` true: every sequence is appended
      to the last group;
    * later species, `collpase_inparalogs` false: the current groups are
      copied once per additional sequence and each sequence is appended to
      one block of copies.

    Args:
        orthologs: iterable of sequence identifiers.
        species_list: ordered species codes.
        collpase_inparalogs: selects the merging mode described above.

    Returns:
        List of lists of sequence identifiers.
    """
    sp2orth = {}
    for o in orthologs:
        sp = _get_spcode(o)
        try:
            sp2orth[sp].append(o)
        except KeyError:
            # first sequence of this species
            sp2orth[sp] = [o]

    orthogroups = []
    for sp in species_list:
        if sp not in sp2orth:
            continue
        sp_orthologs = sp2orth[sp]
        # collapse in-paralogs
        if orthogroups and collpase_inparalogs:
            orthogroups[-1].extend(sp_orthologs)
        # add another species orthologs
        elif orthogroups:
            # snapshot of the groups as they are before replication
            orthogroups_org = orthogroups[:]
            # multiply o.groups when in-paralogs found
            for _ in sp_orthologs[1:]:
                orthogroups.extend(og[:] for og in orthogroups_org)
            # `//` is integer division on both Python 2 and 3
            per_ortholog = len(orthogroups) // len(sp_orthologs)
            index = 0
            for o in sp_orthologs:
                for _ in range(per_ortholog):
                    try:
                        orthogroups[index].append(o)
                    except IndexError:
                        print("Error! index: %s\n%s\n%s" % (
                            index, orthologs, orthogroups))
                    # advance even after an error so each group is tried once
                    index += 1
        # populate orthogroups with first orthologs
        else:
            orthogroups.extend([o] for o in sp_orthologs)
    return orthogroups
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows,
            annotationFn, verbose):
    """Generate orthologous gene profile.

    Reads proteins from a FASTA `handle`, looks up the orthologs of each one
    with `get_orthologs` and writes a tab-separated table with the number of
    orthologs per species to `out`.

    Args:
        handle: FASTA input; record ids look like `protid|gene_...` (the
            gene part is optional).
        out: object with a `write()` method receiving the table.
        phyid: phylome identifier.
        protid2phylmeFn: forwarded to `get_protid2phyid`.
        spCode: forwarded to `get_protid2phyid`.
        speciesInRows: if true, one row per species and one column per
            protein; otherwise one row per protein.
        annotationFn: annotation file loaded with `load_annotation`; may be
            empty/None.
        verbose: if true, warn on stderr about proteins for which only the
            query itself was found.

    Species are ordered by the mean of their `code2score` values; species
    without any score are placed last (the mean of an empty list is NaN,
    which would otherwise make the sort order unreliable).
    """
    # get phylomeDB connection
    p = _getConnection()

    # get protids and gene names
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|')  # e.g. orf19.3038|TPS2
        protid = fields[0]
        gene = protid
        # get gene name if present
        if len(fields) > 1:
            gene = fields[1].split('_')[0]
        protids.append(protid)
        genes.append(gene)

    # load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    # get species info
    code2name = get_species_in_phylome(phyid, p)

    # define empty profiles
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    # get phylomedb ids
    protid2seedid = {}
    k = 0
    total = len(protids)
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s \r" % (i, total, protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p,
                                                      code2score)
        protid2seedid[protid] = seedid
        # fill profiles
        for o in orthologs:
            code2profile[_get_spcode(o)][i - 1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n" % (len(orthologs), protid, phyprot))
    # write info; `total` (not the loop variable) so empty input is safe
    sys.stderr.write("%s proteins; %s with orthologs\n" % (total, k))

    def _mean_score(code):
        scores = code2score[code]
        # unscored species sort last instead of producing a NaN key
        return np.mean(scores) if scores else float('inf')

    # species order is the same for header and every row: compute it once
    codes = sorted(code2name, key=_mean_score)

    def _short_name(code):
        # first letter of the first word + '.' + second word of the name
        name = code2name[code][1]
        return '%s.%s' % (name[0], name.split()[1])

    if not speciesInRows:
        # header
        header = '#Protid\tGene\tSeedID'
        for code in codes:
            header += '\t%s' % _short_name(code)
        header += '\tAnnotation'
        rows = [header]
        # data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            if gene == protid:
                gene = ''
            seedid = protid2seedid.get(protid) or ""
            cells = ['%s\t%s\t%s' % (protid, gene, seedid)]
            for code in codes:
                cells.append('%s' % code2profile[code][j])
            if protid in prot2ann:
                cells.append("%s" % prot2ann[protid])
            rows.append('\t'.join(cells))
    else:
        header = '#Species'
        protidLine = '#Protid'
        annLine = '#Annotation'
        for protid, gene in zip(protids, genes):
            header += '\t%s' % gene
            if gene != protid:
                protidLine += '\t%s' % protid
            else:
                protidLine += '\t'
            protidLine += "\t%s" % (protid2seedid.get(protid) or "")
            if protid in prot2ann:
                annLine += "\t%s" % prot2ann[protid]
            else:
                annLine += '\t'
        rows = [header]
        for code in codes:
            cells = [_short_name(code)]
            for orthologNo in code2profile[code]:
                cells.append('%s' % orthologNo)
            rows.append('\t'.join(cells))
        # each summary line goes on its own row (they used to be glued to
        # the end of the last species row)
        rows.append(protidLine)
        rows.append(annLine)
    out.write('\n'.join(rows))
def process_phylome(phyid, species_list=None, one2one=True,
                    collpase_inparalogs=False, missingSpeciesTh=0.10,
                    step=100):
    """Generate (or load cached) orthogroups for all seed trees of a phylome.

    If not `species_list`, all species of given phylome are taken.  The seed
    species of the phylome is appended to `species_list` when missing (a
    list passed by the caller is modified in place).

    Results are cached in `phylome<phyid>_orthogroups_<nspecies>_<th>.txt`
    in the working directory: when that file exists it is loaded and no
    tree is processed.

    Args:
        phyid: phylome identifier.
        species_list: species codes to consider; optional.
        one2one: forwarded to `get_orthogroups`.
        collpase_inparalogs: forwarded to `get_orthogroups`.
        missingSpeciesTh: orthogroups covering less than
            `1 - missingSpeciesTh` of the species are counted but not kept.
        step: number of trees between two progress messages on stdout.

    Returns:
        Tuple `(all_orthogroups, orthoFpath, species_list)`.
    """
    print("Generating orthogroups...")
    all_orthogroups = []
    p = _getConnection()

    # get some necessary info
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        species_list = [proteomeID.split('.')[0]
                        for proteomeID in proteomes_in_phylome]
    print(" for %s species: %s" % (len(species_list), ", ".join(species_list)))

    # make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append(seed_sp)

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % (
        phyid, len(species_list), missingSpeciesTh)
    if os.path.isfile(orthoFpath):
        print(" Loading orthologous groups from file: %s" % orthoFpath)
        # `with` closes the cache file once it has been read
        with open(orthoFpath) as cached:
            for line in cached:
                all_orthogroups.append(line.strip().split('\t'))
        return all_orthogroups, orthoFpath, species_list

    # loading seedids
    phylome_seedids = p.get_phylome_seed_ids(phyid)[0]

    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    # NOTE(review): if processing is interrupted the partially written file
    # stays on disk and will be loaded as a complete cache on the next run.
    with open(orthoFpath, 'w') as outFile:
        for seedid in phylome_seedids:
            trees_dict = p.get_best_tree(seedid, phyid)
            if not trees_dict:
                continue
            t = trees_dict['tree']
            if not t:
                continue
            trees += 1
            # process orthogroups
            orthogroups = get_orthogroups(t, seedid, phyid, species_list,
                                          one2one, collpase_inparalogs)
            for og in orthogroups:
                # distinct species present in this orthogroup
                _curSpecies = set(_get_spcode(o) for o in og)
                species_coverage = len(_curSpecies) * 1.0 / len(species_list)
                if species_coverage < 1 - missingSpeciesTh:
                    # not enough species in orthogroup
                    low_species_cov += 1
                else:
                    # save in orthogroup file and add orthogroup to list
                    outFile.write("\t".join("%s" % o for o in og) + '\n')
                    all_orthogroups.append(og)
                    i += 1
            # progress report every `step` processed trees
            if trees > pI:
                pI += step
                sys.stdout.write(" %s %s %s\t%s\r" % (
                    trees, i, seedid, datetime.now() - pt))
                pt = datetime.now()
    # finish the progress line
    print("")
    print(" Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s." % (trees, not_included, len(phylome_seedids), i, low_species_cov, 1 - missingSpeciesTh))
    return all_orthogroups, orthoFpath, species_list