예제 #1
0
def get_consistency(p, t, phyid, seedid, species_list, n, processed_seedids):
    """Return consistency of given family across
    collateral trees from given phylome.

    Every leaf of `t` that belongs to the seed species (the species code of
    `seedid`) is visited. For each of them except `seedid` itself,
    `process_tree` is called and one signal is recorded: 1 if it returned
    non-empty groups, 0 otherwise. Seed-species leaves found in the trees
    returned by `process_tree` are visited as well.

    Args:
        p: passed through to `process_tree` (presumably the phylomeDB
            connection - confirm against caller).
        t: tree of the seed protein; must provide `get_leaf_names()`.
        phyid: phylome id, passed through to `process_tree`.
        seedid: protein id whose own tree `t` is; counted as consistent.
        species_list: passed through to `process_tree`.
        n: passed through to `process_tree`.
        processed_seedids: set updated in place with every visited protein.

    Returns:
        Tuple `(cs, processed_seedids)`, where `cs` is the mean of all
        recorded signals (the seed tree itself always contributes a 1).
    """
    #current tree is ok
    signals = [1]
    #get seed sp paralogs
    seedsp = _get_spcode(seedid)
    protids = []
    #every protid that has ever been queued; guarantees that each
    #collateral tree contributes exactly one signal
    queued = set()

    def enqueue_seed_species_leaves(tree):
        for name in tree.get_leaf_names():
            if name not in queued and _get_spcode(name) == seedsp:
                queued.add(name)
                protids.append(name)

    enqueue_seed_species_leaves(t)
    #process all protids - with iterative update of protids
    while protids:
        #take the most recently queued protid
        protid = protids.pop()
        #add to processed seedids, so it's visited only once
        processed_seedids.add(protid)
        #skip current tree
        if protid == seedid:
            continue
        #get list of groups A1-B1 A2-B2 and so on
        groups, t = process_tree(p, phyid, n, species_list, protid)
        #add 1 if correct groups in collateral tree
        #or 0 if collateral tree is inconsistent with tree
        signals.append(1 if groups else 0)
        #skip if no t
        if not t:
            continue
        #update protids list - important, as gene content of trees
        #may differ for some diverged families
        enqueue_seed_species_leaves(t)

    #calculate cs
    cs = np.mean(signals)
    return cs, processed_seedids
예제 #2
0
def get_orthologs(phyprot, phyid, p, code2score=None):
    """Return orthologs for given protid.

    The tree is fetched with `get_seed_or_collateral_tree`, rooted (on the
    farthest/oldest leaf for phylomes listed in `ROOTED_PHYLOMES`, at the
    midpoint otherwise) and the speciation events (`etype == 'S'`) of the
    `phyprot` leaf are scanned. For every such event the sequences on the
    opposite side from `phyprot` are reported as orthologs.

    Args:
        phyprot: protein id; has to be a leaf name of the fetched tree.
        phyid: phylome id.
        p: passed through to `get_seed_or_collateral_tree`.
        code2score: optional dict {species code: [event ordinal, ...]},
            updated in place. A fresh dict is created when omitted.

    Returns:
        Tuple `(orthologs, code2score, bestSeedid)`. `orthologs` is a set
        that always contains `phyprot`. For every ortholog, the 1-based
        ordinal of the speciation event in which it was found is appended
        to the `code2score` entry of its species.
    """
    #a fresh dict per call; a `{}` default would be shared between calls
    if code2score is None:
        code2score = {}
    orthologs = set((phyprot, ))
    #get tree
    t, bestSeedid = get_seed_or_collateral_tree(phyprot, phyid, p)
    if not t:
        info = "[WARNING] No tree for %s in phylome %s has been found.\n"
        sys.stderr.write(info % (phyprot, phyid))
        return orthologs, code2score, bestSeedid

    #root tree
    if phyid in ROOTED_PHYLOMES:
        outgroup = t.get_farthest_oldest_leaf(ROOTED_PHYLOMES[phyid])
        t.set_outgroup(outgroup)
    else:
        t.set_outgroup(t.get_midpoint_outgroup())

    #get phyprot node
    l = t.get_leaves_by_name(phyprot)[0]

    #get orthologs
    w = 0
    for s in l.get_my_evol_events():
        if s.etype != 'S':
            continue
        w += 1
        #orthologs are the sequences on the other side of the speciation
        seqs = s.out_seqs if phyprot in s.in_seqs else s.in_seqs
        for o in seqs:
            orthologs.add(o)
            #record each hit exactly once, also for a species seen first time
            code2score.setdefault(_get_spcode(o), []).append(w)

    return orthologs, code2score, bestSeedid
예제 #3
0
def get_consistency( p,t,phyid,seedid,species_list,n,processed_seedids ):
    """Return consistency of given family across
    collateral trees from given phylome.

    Every leaf of `t` that belongs to the seed species (the species code of
    `seedid`) is visited. For each of them except `seedid` itself,
    `process_tree` is called and one signal is recorded: 1 if it returned
    non-empty groups, 0 otherwise. Seed-species leaves found in the trees
    returned by `process_tree` are visited as well.

    Args:
        p: passed through to `process_tree` (presumably the phylomeDB
            connection - confirm against caller).
        t: tree of the seed protein; must provide `get_leaf_names()`.
        phyid: phylome id, passed through to `process_tree`.
        seedid: protein id whose own tree `t` is; counted as consistent.
        species_list: passed through to `process_tree`.
        n: passed through to `process_tree`.
        processed_seedids: set updated in place with every visited protein.

    Returns:
        Tuple `(cs, processed_seedids)`, where `cs` is the mean of all
        recorded signals (the seed tree itself always contributes a 1).
    """
    #current tree is ok
    signals = [1]
    #get seed sp paralogs
    seedsp = _get_spcode(seedid)
    protids = []
    #every protid that has ever been queued; guarantees that each
    #collateral tree contributes exactly one signal
    queued = set()

    def enqueue_seed_species_leaves(tree):
        for name in tree.get_leaf_names():
            if name not in queued and _get_spcode(name) == seedsp:
                queued.add(name)
                protids.append(name)

    enqueue_seed_species_leaves(t)
    #process all protids - with iterative update of protids
    while protids:
        #take the most recently queued protid
        protid = protids.pop()
        #add to processed seedids, so it's visited only once
        processed_seedids.add(protid)
        #skip current tree
        if protid == seedid:
            continue
        #get list of groups A1-B1 A2-B2 and so on
        groups, t = process_tree(p, phyid, n, species_list, protid)
        #add 1 if correct groups in collateral tree
        #or 0 if collateral tree is inconsistent with tree
        signals.append(1 if groups else 0)
        #skip if no t
        if not t:
            continue
        #update protids list - important, as gene content of trees
        #may differ for some diverged families
        enqueue_seed_species_leaves(t)

    #calculate cs
    cs = np.mean(signals)
    return cs, processed_seedids
예제 #4
0
def get_orthologs( t,seedname,phyid,species_list ):
    """
    Takes rooted tree as input.
    Return a single list of sequence names: `seedname` followed by the
    `out_seqs` members of the seed node's speciation events whose species
    code is in `species_list`.
    Scanning stops after the first event at which the number of distinct
    species collected (seed species included) equals len(species_list).
    An empty list is returned when `get_evolEvents` reports no events.
    """
    #get evol_events
    evolEvents, seednode = get_evolEvents(t, phyid, seedname)
    if not evolEvents:
        return []

    #speciation events only
    speciations = [ev for ev in seednode.get_my_evol_events() if ev.etype == 'S']

    orthologs = [seedname]
    seen_codes = set([_get_spcode(seedname)])
    for ev in speciations:
        for seq in ev.out_seqs:
            code = _get_spcode(seq)
            if code not in species_list:
                continue
            orthologs.append(seq)
            seen_codes.add(code)
        #all requested species collected - no need to look further
        if len(seen_codes) == len(species_list):
            break
    return orthologs
예제 #5
0
def get_orthologs( t,seedname,phyid,species_list ):
    """
    Takes rooted tree as input.
    Return a single list of sequence names: `seedname` followed by the
    `out_seqs` members of the seed node's speciation events whose species
    code is in `species_list`.
    Scanning stops after the first event at which the number of distinct
    species collected (seed species included) equals len(species_list).
    An empty list is returned when `get_evolEvents` reports no events.
    """
    #get evol_events
    evolEvents, seednode = get_evolEvents(t, phyid, seedname)
    if not evolEvents:
        return []

    #speciation events only
    speciations = [ev for ev in seednode.get_my_evol_events() if ev.etype == 'S']

    orthologs = [seedname]
    seen_codes = set([_get_spcode(seedname)])
    for ev in speciations:
        for seq in ev.out_seqs:
            code = _get_spcode(seq)
            if code not in species_list:
                continue
            orthologs.append(seq)
            seen_codes.add(code)
        #all requested species collected - no need to look further
        if len(seen_codes) == len(species_list):
            break
    return orthologs
예제 #6
0
def get_orthologs(phyprot, phyid, p, code2score=None):
    """Return orthologs for given protid.

    The tree is fetched with `get_seed_or_collateral_tree`, rooted (on the
    farthest/oldest leaf for phylomes listed in `ROOTED_PHYLOMES`, at the
    midpoint otherwise) and the speciation events (`etype == 'S'`) of the
    `phyprot` leaf are scanned. For every such event the sequences on the
    opposite side from `phyprot` are reported as orthologs.

    Args:
        phyprot: protein id; has to be a leaf name of the fetched tree.
        phyid: phylome id.
        p: passed through to `get_seed_or_collateral_tree`.
        code2score: optional dict {species code: [event ordinal, ...]},
            updated in place. A fresh dict is created when omitted.

    Returns:
        Tuple `(orthologs, code2score, bestSeedid)`. `orthologs` is a set
        that always contains `phyprot`. For every ortholog, the 1-based
        ordinal of the speciation event in which it was found is appended
        to the `code2score` entry of its species.
    """
    #a fresh dict per call; a `{}` default would be shared between calls
    if code2score is None:
        code2score = {}
    orthologs = set((phyprot, ))
    #get tree
    t, bestSeedid = get_seed_or_collateral_tree(phyprot, phyid, p)
    if not t:
        info = "[WARNING] No tree for %s in phylome %s has been found.\n"
        sys.stderr.write(info % (phyprot, phyid))
        return orthologs, code2score, bestSeedid

    #root tree
    if phyid in ROOTED_PHYLOMES:
        outgroup = t.get_farthest_oldest_leaf(ROOTED_PHYLOMES[phyid])
        t.set_outgroup(outgroup)
    else:
        t.set_outgroup(t.get_midpoint_outgroup())

    #get phyprot node
    l = t.get_leaves_by_name(phyprot)[0]

    #get orthologs
    w = 0
    for s in l.get_my_evol_events():
        if s.etype != 'S':
            continue
        w += 1
        #orthologs are the sequences on the other side of the speciation
        seqs = s.out_seqs if phyprot in s.in_seqs else s.in_seqs
        for o in seqs:
            orthologs.add(o)
            #record each hit exactly once, also for a species seen first time
            code2score.setdefault(_get_spcode(o), []).append(w)

    return orthologs, code2score, bestSeedid
예제 #7
0
def generate_orthogroups(orthologs, species_list, collpase_inparalogs=True):
    """Arrange orthologs into orthogroups, following the order of species_list.

    Orthologs are first grouped by species code (`_get_spcode`). Species are
    then processed in `species_list` order; species without orthologs are
    skipped. The first species with orthologs starts one group per ortholog.
    For each following species:
      - with `collpase_inparalogs` set, all its orthologs are appended to
        the last group only;
      - otherwise the groups are multiplied: every existing group is
        combined with each in-paralog, so each resulting group holds
        exactly one sequence of that species.

    Args:
        orthologs: iterable of sequence names.
        species_list: species codes defining the processing order.
        collpase_inparalogs: see above (parameter name kept as spelled,
            callers pass it by keyword).

    Returns:
        List of orthogroups, each a list of sequence names. Empty if no
        species from `species_list` has orthologs.
    """
    #group orthologs by species, keeping the input order
    sp2orth = {}
    for o in orthologs:
        sp2orth.setdefault(_get_spcode(o), []).append(o)

    orthogroups = []
    for sp in species_list:
        if sp not in sp2orth:
            continue
        paralogs = sp2orth[sp]
        #populate orthogroups with first orthologs
        if not orthogroups:
            orthogroups = [[o] for o in paralogs]
        #collapse in-paralogs
        # NOTE(review): only the last group is extended, also when the
        # first species started several groups - confirm this is intended
        elif collpase_inparalogs:
            orthogroups[-1].extend(paralogs)
        #multiply o.groups when in-paralogs found: one full copy of the
        #existing groups per in-paralog, each copy extended by that paralog
        else:
            orthogroups = [og + [o] for o in paralogs for og in orthogroups]

    return orthogroups
예제 #8
0
def generate_orthogroups( orthologs,species_list,collpase_inparalogs=True ):
    """Arrange orthologs into orthogroups, following the order of species_list.

    Orthologs are first grouped by species code (`_get_spcode`). Species are
    then processed in `species_list` order; species without orthologs are
    skipped. The first species with orthologs starts one group per ortholog.
    For each following species:
      - with `collpase_inparalogs` set, all its orthologs are appended to
        the last group only;
      - otherwise the groups are multiplied: every existing group is
        combined with each in-paralog, so each resulting group holds
        exactly one sequence of that species.

    Args:
        orthologs: iterable of sequence names.
        species_list: species codes defining the processing order.
        collpase_inparalogs: see above (parameter name kept as spelled,
            callers pass it by keyword).

    Returns:
        List of orthogroups, each a list of sequence names. Empty if no
        species from `species_list` has orthologs.
    """
    #group orthologs by species, keeping the input order
    sp2orth = {}
    for o in orthologs:
        sp2orth.setdefault(_get_spcode(o), []).append(o)

    orthogroups = []
    for sp in species_list:
        if sp not in sp2orth:
            continue
        paralogs = sp2orth[sp]
        #populate orthogroups with first orthologs
        if not orthogroups:
            orthogroups = [[o] for o in paralogs]
        #collapse in-paralogs
        # NOTE(review): only the last group is extended, also when the
        # first species started several groups - confirm this is intended
        elif collpase_inparalogs:
            orthogroups[-1].extend(paralogs)
        #multiply o.groups when in-paralogs found: one full copy of the
        #existing groups per in-paralog, each copy extended by that paralog
        else:
            orthogroups = [og + [o] for o in paralogs for og in orthogroups]

    return orthogroups
예제 #9
0
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows, \
            annotationFn, verbose):
    """Generate orthologous gene profile and write it to `out` as a table.

    For every FASTA record in `handle` the number of orthologs found in each
    species of the phylome is counted (via `get_orthologs`). Species are
    ordered by the mean of their `code2score` values; species without any
    score are placed last.

    Args:
        handle: FASTA input readable by `SeqIO.parse`. Record ids look like
            `protid|gene_...`; the gene part is optional.
        out: object with a `write` method receiving the whole table.
        phyid: phylome id.
        protid2phylmeFn, spCode: passed through to `get_protid2phyid`.
        speciesInRows: if false, one row per protein and one column per
            species; if true, the table is transposed and the #Protid,
            #SeedID and #Annotation rows follow the species rows.
        annotationFn: optional file name passed to `load_annotation`.
        verbose: if true, warn on stderr about proteins without orthologs.

    Returns:
        None. Progress and a summary are written to stderr.
    """
    def cell(value):
        #format any value as a single table cell
        return '%s' % (value,)

    def species_label(code):
        #first letter of the name + its second word, ie `H.sapiens` for
        #a name like `Homo sapiens`
        name = code2name[code][1]
        return '%s.%s' % (name[0], name.split()[1])

    def mean_score(code):
        #np.mean([]) is NaN, which breaks sorting - put such species last
        scores = code2score[code]
        return np.mean(scores) if scores else float('inf')

    #get phylomeDB connection
    p = _getConnection()

    #get protids and gene names
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|') #orf19.3038|TPS2
        protid = fields[0]
        #get gene name if present, otherwise fall back to protid
        gene = fields[1].split('_')[0] if len(fields) > 1 else protid
        protids.append(protid)
        genes.append(gene)

    #load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    #get species info
    code2name = get_species_in_phylome(phyid, p)

    #define empty profiles
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    #get phylomedb ids
    protid2seedid = {}
    k = 0
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s   \r"%(i, len(protids), protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p, code2score)
        protid2seedid[protid] = seedid
        #fill profiles
        for o in orthologs:
            code2profile[_get_spcode(o)][i-1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n"%(len(orthologs), protid, phyprot))
    #write info; len() instead of the loop variable, which is unbound
    #when there are no proteins at all
    sys.stderr.write("%s proteins; %s with orthologs\n"%(len(protids), k))

    #scores are final now, so the species order is computed only once
    codes = sorted(code2name, key=mean_score)

    ###print summary
    rows = []
    if not speciesInRows:
        #header
        rows.append(['#Protid', 'Gene', 'SeedID']
                    + [species_label(code) for code in codes]
                    + ['Annotation'])
        #data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            if gene == protid:
                gene = ''
            seedid = protid2seedid.get(protid) or ""
            row = [protid, gene, seedid]
            row.extend(code2profile[code][j] for code in codes)
            if protid in prot2ann:
                row.append(prot2ann[protid])
            rows.append(row)
    else:
        #every metadata row holds exactly one cell per protein, so that
        #its columns line up with the species rows
        protidLine = ['#Protid']
        seedidLine = ['#SeedID']
        annLine = ['#Annotation']
        for protid, gene in zip(protids, genes):
            protidLine.append(protid if gene != protid else '')
            seedidLine.append(protid2seedid.get(protid) or "")
            annLine.append(prot2ann.get(protid, ''))
        rows.append(['#Species'] + genes)
        for code in codes:
            rows.append([species_label(code)] + code2profile[code])
        #each metadata row starts on its own line
        rows.extend([protidLine, seedidLine, annLine])

    out.write('\n'.join('\t'.join(cell(v) for v in row) for row in rows))
예제 #10
0
def process_phylome( phyid,species_list=None,one2one=True,collpase_inparalogs=False,missingSpeciesTh=0.10,step=100 ):
    """Generate orthogroups for all seed trees of given phylome.

    If not species_list, all species of given phylome are taken. The seed
    species of the phylome is always added. Results are cached in the file
    `phylome<phyid>_orthogroups_<no. of species>_<missingSpeciesTh>.txt`
    in the current directory: if it exists, groups are loaded from it,
    otherwise it is created.

    Args:
        phyid: phylome id.
        species_list: species codes to include; it is not modified.
        one2one, collpase_inparalogs: passed through to `get_orthogroups`.
        missingSpeciesTh: a group is kept only if the fraction of
            species_list it covers is at least 1-missingSpeciesTh.
        step: progress is reported roughly every `step` trees.

    Returns:
        Tuple `(all_orthogroups, orthoFpath, species_list)`, where
        `species_list` is the list actually used (seed species included).
    """
    print("Generating orthogroups...")
    all_orthogroups = []
    p = _getConnection() #get some neccesary info
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        species_list = [proteomeID.split('.')[0] for proteomeID in proteomes_in_phylome]
    else:
        #work on a copy, so the caller's list is left untouched
        species_list = list(species_list)

    print(" for %s species: %s" % (len(species_list), ", ".join(species_list)))
    #make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append(seed_sp)

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % (phyid, len(species_list), missingSpeciesTh)
    if os.path.isfile(orthoFpath):
        print(" Loading orthologous groups from file: %s" % orthoFpath)
        with open(orthoFpath) as cached:
            for line in cached:
                line = line.strip()
                #blank lines carry no group
                if line:
                    all_orthogroups.append(line.split('\t'))
        return all_orthogroups, orthoFpath, species_list

    phylome_seedids = p.get_phylome_seed_ids(phyid)[0] #loading seedids
    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    #write to a temporary file and rename it once complete, so that
    #an interrupted run never leaves a partial cache file behind
    tmpFpath = orthoFpath + '.tmp'
    with open(tmpFpath, 'w') as outFile:
        for seedid in phylome_seedids:
            trees_dict = p.get_best_tree(seedid, phyid)
            t = trees_dict['tree'] if trees_dict else None
            if not t:
                #no tree for this seed - reported as skipped in the summary
                not_included += 1
                continue
            trees += 1
            #process orthogroups
            orthogroups = get_orthogroups(t, seedid, phyid, species_list, one2one, collpase_inparalogs)
            for og in orthogroups:
                #only the first sequence of every species is written
                members = []
                _curSpecies = set()
                for o in og:
                    spCode = _get_spcode(o)
                    if spCode not in _curSpecies:
                        _curSpecies.add(spCode)
                        members.append("%s" % (o,))
                species_coverage = len(_curSpecies) * 1.0 / len(species_list)
                #if not enough species in orthogroup
                if species_coverage < 1 - missingSpeciesTh:
                    low_species_cov += 1
                else:
                    outFile.write("\t".join(members) + "\n") #save in orthogroup file
                    # NOTE(review): the complete group is returned here,
                    # while a later cached run returns only the sequences
                    # written to the file - confirm which one callers expect
                    all_orthogroups.append(og) #and add orthogroup to list
                    i += 1

            if trees > pI:
                pI += step
                sys.stdout.write("   %s %s %s\t%s\r" % (trees, i, seedid, datetime.now() - pt))
                pt = datetime.now()
    os.rename(tmpFpath, orthoFpath)
    #finish the progress line
    print("")
    print(" Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s." % (trees, not_included, len(phylome_seedids), i, low_species_cov, 1 - missingSpeciesTh))
    return all_orthogroups, orthoFpath, species_list
예제 #11
0
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows, \
            annotationFn, verbose):
    """Generate orthologous gene profile and write it to `out` as a table.

    For every FASTA record in `handle` the number of orthologs found in each
    species of the phylome is counted (via `get_orthologs`). Species are
    ordered by the mean of their `code2score` values; species without any
    score are placed last.

    Args:
        handle: FASTA input readable by `SeqIO.parse`. Record ids look like
            `protid|gene_...`; the gene part is optional.
        out: object with a `write` method receiving the whole table.
        phyid: phylome id.
        protid2phylmeFn, spCode: passed through to `get_protid2phyid`.
        speciesInRows: if false, one row per protein and one column per
            species; if true, the table is transposed and the #Protid,
            #SeedID and #Annotation rows follow the species rows.
        annotationFn: optional file name passed to `load_annotation`.
        verbose: if true, warn on stderr about proteins without orthologs.

    Returns:
        None. Progress and a summary are written to stderr.
    """
    def cell(value):
        #format any value as a single table cell
        return '%s' % (value,)

    def species_label(code):
        #first letter of the name + its second word, ie `H.sapiens` for
        #a name like `Homo sapiens`
        name = code2name[code][1]
        return '%s.%s' % (name[0], name.split()[1])

    def mean_score(code):
        #np.mean([]) is NaN, which breaks sorting - put such species last
        scores = code2score[code]
        return np.mean(scores) if scores else float('inf')

    #get phylomeDB connection
    p = _getConnection()

    #get protids and gene names
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|') #orf19.3038|TPS2
        protid = fields[0]
        #get gene name if present, otherwise fall back to protid
        gene = fields[1].split('_')[0] if len(fields) > 1 else protid
        protids.append(protid)
        genes.append(gene)

    #load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    #get species info
    code2name = get_species_in_phylome(phyid, p)

    #define empty profiles
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    #get phylomedb ids
    protid2seedid = {}
    k = 0
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s   \r"%(i, len(protids), protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p, code2score)
        protid2seedid[protid] = seedid
        #fill profiles
        for o in orthologs:
            code2profile[_get_spcode(o)][i-1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n"%(len(orthologs), protid, phyprot))
    #write info; len() instead of the loop variable, which is unbound
    #when there are no proteins at all
    sys.stderr.write("%s proteins; %s with orthologs\n"%(len(protids), k))

    #scores are final now, so the species order is computed only once
    codes = sorted(code2name, key=mean_score)

    ###print summary
    rows = []
    if not speciesInRows:
        #header
        rows.append(['#Protid', 'Gene', 'SeedID']
                    + [species_label(code) for code in codes]
                    + ['Annotation'])
        #data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            if gene == protid:
                gene = ''
            seedid = protid2seedid.get(protid) or ""
            row = [protid, gene, seedid]
            row.extend(code2profile[code][j] for code in codes)
            if protid in prot2ann:
                row.append(prot2ann[protid])
            rows.append(row)
    else:
        #every metadata row holds exactly one cell per protein, so that
        #its columns line up with the species rows
        protidLine = ['#Protid']
        seedidLine = ['#SeedID']
        annLine = ['#Annotation']
        for protid, gene in zip(protids, genes):
            protidLine.append(protid if gene != protid else '')
            seedidLine.append(protid2seedid.get(protid) or "")
            annLine.append(prot2ann.get(protid, ''))
        rows.append(['#Species'] + genes)
        for code in codes:
            rows.append([species_label(code)] + code2profile[code])
        #each metadata row starts on its own line
        rows.extend([protidLine, seedidLine, annLine])

    out.write('\n'.join('\t'.join(cell(v) for v in row) for row in rows))
예제 #12
0
def process_phylome( phyid,species_list=None,one2one=True,collpase_inparalogs=False,missingSpeciesTh=0.10,step=100 ):
    """Generate orthogroups for all seed trees of given phylome.

    If not species_list, all species of given phylome are taken. The seed
    species of the phylome is always added. Results are cached in the file
    `phylome<phyid>_orthogroups_<no. of species>_<missingSpeciesTh>.txt`
    in the current directory: if it exists, groups are loaded from it,
    otherwise it is created.

    Args:
        phyid: phylome id.
        species_list: species codes to include; it is not modified.
        one2one, collpase_inparalogs: passed through to `get_orthogroups`.
        missingSpeciesTh: a group is kept only if the fraction of
            species_list it covers is at least 1-missingSpeciesTh.
        step: progress is reported roughly every `step` trees.

    Returns:
        Tuple `(all_orthogroups, orthoFpath, species_list)`, where
        `species_list` is the list actually used (seed species included).
    """
    print("Generating orthogroups...")
    all_orthogroups = []
    p = _getConnection() #get some neccesary info
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        species_list = [proteomeID.split('.')[0] for proteomeID in proteomes_in_phylome]
    else:
        #work on a copy, so the caller's list is left untouched
        species_list = list(species_list)

    print(" for %s species: %s" % (len(species_list), ", ".join(species_list)))
    #make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append(seed_sp)

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % (phyid, len(species_list), missingSpeciesTh)
    if os.path.isfile(orthoFpath):
        print(" Loading orthologous groups from file: %s" % orthoFpath)
        with open(orthoFpath) as cached:
            for line in cached:
                line = line.strip()
                #blank lines carry no group
                if line:
                    all_orthogroups.append(line.split('\t'))
        return all_orthogroups, orthoFpath, species_list

    phylome_seedids = p.get_phylome_seed_ids(phyid)[0] #loading seedids
    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    #write to a temporary file and rename it once complete, so that
    #an interrupted run never leaves a partial cache file behind
    tmpFpath = orthoFpath + '.tmp'
    with open(tmpFpath, 'w') as outFile:
        for seedid in phylome_seedids:
            trees_dict = p.get_best_tree(seedid, phyid)
            t = trees_dict['tree'] if trees_dict else None
            if not t:
                #no tree for this seed - reported as skipped in the summary
                not_included += 1
                continue
            trees += 1
            #process orthogroups
            orthogroups = get_orthogroups(t, seedid, phyid, species_list, one2one, collpase_inparalogs)
            for og in orthogroups:
                #only the first sequence of every species is written
                members = []
                _curSpecies = set()
                for o in og:
                    spCode = _get_spcode(o)
                    if spCode not in _curSpecies:
                        _curSpecies.add(spCode)
                        members.append("%s" % (o,))
                species_coverage = len(_curSpecies) * 1.0 / len(species_list)
                #if not enough species in orthogroup
                if species_coverage < 1 - missingSpeciesTh:
                    low_species_cov += 1
                else:
                    outFile.write("\t".join(members) + "\n") #save in orthogroup file
                    # NOTE(review): the complete group is returned here,
                    # while a later cached run returns only the sequences
                    # written to the file - confirm which one callers expect
                    all_orthogroups.append(og) #and add orthogroup to list
                    i += 1

            if trees > pI:
                pI += step
                sys.stdout.write("   %s %s %s\t%s\r" % (trees, i, seedid, datetime.now() - pt))
                pt = datetime.now()
    os.rename(tmpFpath, orthoFpath)
    #finish the progress line
    print("")
    print(" Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s." % (trees, not_included, len(phylome_seedids), i, low_species_cov, 1 - missingSpeciesTh))
    return all_orthogroups, orthoFpath, species_list