def orthoFromSampleRecs(nfrec, outortdir, nsample=[], methods=['mixed'],
                        foutdiffog=None, outputOGperSampledRecGT=True, colourTreePerSampledRecGT=False,
                        graphCombine=None, majRuleCombine=None, **kw):
    """Infer orthologous groups (OGs) from each sampled reconciled gene tree and combine them.

    For every reconciled gene tree read from `nfrec`, OGs are computed with
    getOrthologues() for the requested `methods`; results are then optionally
    written per sampled tree and/or combined over the sample through a graph
    whose edge weights count how often two genes fell in the same OG.

    Parameters:
      nfrec       -- path to the reconciliation file; the family name is the part
                     of its basename that precedes the first '-'.
      outortdir   -- output directory; files are written under
                     <outortdir>/<method>/<fam>_<method>* (sub-directories are not
                     created here).
      nsample     -- passed to parseALERecFile(); when non-empty its items are also
                     used as identifiers of the sampled trees, otherwise the
                     0-based rank of each tree is used.
      methods     -- any of 'strict', 'unicopy', 'mixed'; 'strict' OGs are also
                     computed when only 'mixed' is requested, as they are fed to
                     the 'mixed' call.
      foutdiffog  -- optional open file handle; one tab-separated line per sampled
                     tree is written (family, sample id, three OG summaries, three
                     pairwise overlaps).
      outputOGperSampledRecGT   -- write '<...>.orthologs.per_sampled_tree'.
      colourTreePerSampledRecGT -- write '<...>_orthologous_groups.nex'.
      graphCombine   -- community detection method name passed to
                        getVertexClustering() on the full weighted graph.
      majRuleCombine -- frequency threshold (fraction of the sample); edges with
                        weight <= majRuleCombine * sample size are dropped before
                        taking connected components.
      **kw        -- 'verbose', 'userefspetree' and 'colourCombinedTree' are read
                     here; the whole dict is also forwarded to getOrthologues()
                     and enforceUnicity().

    Returns None; all results are written to files.
    """
    def countShared(refogs, queryogs):
        # number of groups of `queryogs` also found in `refogs`, as a string;
        # 'NA' when either set of groups is missing or empty
        if not (refogs and queryogs):
            return 'NA'
        return str(sum(1 for og in queryogs if og in refogs))

    verbose = kw.get('verbose')
    fam = os.path.basename(nfrec).split('-', 1)[0]
    if verbose:
        print("\n# # # %s" % fam)
    # collect the desired sample from the reconciliation file
    dparserec = parseALERecFile(nfrec, skipLines=True, skipEventFreq=True, nsample=nsample, returnDict=True)
    lrecgt = dparserec['lrecgt']
    refspetree = dparserec['spetree'] if kw.get('userefspetree') else None
    colourCombinedTree = kw.get('colourCombinedTree')
    ddogs = {}
    dnexustrans = {}
    drevnexustrans = {}
    ltaxnexus = []
    llabs = []
    for i, recgenetree in enumerate(lrecgt):
        g = nsample[i] if nsample else i
        if verbose:
            print(recgenetree)
            print("\n# # reconciliation sample %d" % g)
        N = recgenetree.nb_leaves()
        dlabs = {}
        if set(['strict', 'mixed']) & set(methods):
            if verbose:
                print("\n# strict_ogs:\n")
            strict_ogs, unclassified, dlabs = getOrthologues(recgenetree, method='strict', refspetree=refspetree, dlabs=dlabs, **kw)
            n1 = summaryOGs(strict_ogs, dlabs, N, verbose)
        else:
            strict_ogs = unclassified = None
            n1 = 'NA'
        if 'unicopy' in methods:
            if verbose:
                print("\n# unicopy_ogs:\n")
            unicopy_ogs, notrelevant, dlabs = getOrthologues(recgenetree, method='unicopy', refspetree=refspetree, dlabs=dlabs, **kw)
            n2 = summaryOGs(unicopy_ogs, dlabs, N, verbose)
        else:
            unicopy_ogs = None
            n2 = 'NA'
        if 'mixed' in methods:
            if verbose:
                print("\n# mixed_ogs:\n")
            mixed_ogs, unclassified, dlabs = getOrthologues(recgenetree, method='mixed', strict_ogs=strict_ogs, unclassified=unclassified, refspetree=refspetree, dlabs=dlabs, **kw)
            # n3 must be defined in this branch: it is joined into the foutdiffog line below
            n3 = summaryOGs(mixed_ogs, dlabs, N, verbose)
        else:
            mixed_ogs = None
            n3 = 'NA'

        if foutdiffog or verbose:
            o12 = countShared(strict_ogs, unicopy_ogs)
            o13 = countShared(strict_ogs, mixed_ogs)
            o23 = countShared(unicopy_ogs, mixed_ogs)
            if verbose:
                print("\n# summary:\n")
                print("overlap strict_ogs with unicopy_ogs: %s" % o12)
                print("overlap strict_ogs with mixed_ogs: %s" % o13)
                print("overlap unicopy_ogs with mixed_ogs: %s" % o23)
            if foutdiffog:
                foutdiffog.write('\t'.join([fam, str(g), n1, n2, n3, o12, o13, o23]) + '\n')

        if colourTreePerSampledRecGT or colourCombinedTree:
            if i == 0:
                recgenetree, dnexustrans, drevnexustrans, ltaxnexus = indexCleanTreeLabels(recgenetree, dlabs)
            else:
                # reuse the label translation tables built from the first tree
                recgenetree, dnexustrans, drevnexustrans, ltaxnexus = indexCleanTreeLabels(
                    recgenetree, dlabs,
                    dnexustrans=dnexustrans, drevnexustrans=drevnexustrans, ltaxnexus=ltaxnexus, update=False)

        ddogs[g] = {'strict': strict_ogs, 'unicopy': unicopy_ogs, 'mixed': mixed_ogs}
        if verbose:
            print("\n# # # # # # # #")
        if i == 0:
            # collect the leaf labels; just do once
            llabs = sorted(dlabs.values())

    R = len(lrecgt)
    gs = nsample if nsample else range(R)
    for method in methods:
        nfoutrad = os.path.join(outortdir, method, "%s_%s" % (fam, method))
        if colourTreePerSampledRecGT:
            logs = [ddogs[g][method] for g in gs]
            # fixed: the sample id is now actually interpolated in each tree name
            writeRecGeneTreesColouredByOrthologs(
                lrecgt, logs, nfoutrad + "_orthologous_groups.nex", drevnexustrans,
                treenames=["tree_%d" % g for g in gs], ltax=ltaxnexus, dtranslate=dnexustrans, figtree=True)
        if outputOGperSampledRecGT:
            with open(nfoutrad + ".orthologs.per_sampled_tree", 'w') as foutort:
                for g in gs:
                    ogs = ddogs[g][method]
                    # one OG per line, genes space-separated; '#' line closes each sampled tree
                    foutort.write('\n'.join([' '.join(x) for x in ogs]) + '\n#\n')
        if graphCombine or majRuleCombine:
            ## for later output
            # could also use the ALE consensus tree, which has branch supports but has no lengths
            recgt0 = lrecgt[0] if colourCombinedTree else None
            ## first make a dict of edge frequencies
            dedgefreq = {}
            for g in gs:
                for og in ddogs[g][method]:
                    if len(og) == 1:
                        # singleton OG: recorded as a self-edge
                        orfan = og[0]
                        combogs = [(orfan, orfan)]
                    else:
                        # all pairs of genes in the OG
                        combogs = combinations(sorted(og), 2)
                    for combo in combogs:
                        dedgefreq[combo] = dedgefreq.get(combo, 0) + 1
            ## build a graph of connectivity of the genes in OGs, integrating over the sample
            gOG = igraph.Graph()
            gOG.add_vertices(len(llabs))
            gOG.vs['name'] = llabs
            # first make a full weighted graph
            if dedgefreq:
                edges, freqs = zip(*dedgefreq.items())
            else:
                # no OG was collected: keep an edgeless graph rather than fail on unpacking
                edges, freqs = (), ()
            gOG.add_edges(list(edges))
            gOG.es['weight'] = list(freqs)
            if majRuleCombine:
                ## make a majority rule unweighted graph
                mjgOG = gOG.copy()
                minfreq = majRuleCombine * R
                # use strict majority (assuming the parameter majRuleCombine=0.5, the default)
                # to avoid obtaining family-wide single components
                mjdropedges = [e.index for e in mjgOG.es if e['weight'] <= minfreq]
                # remove the low-freq edges from the graph
                mjgOG.delete_edges(mjdropedges)
                if verbose:
                    print("Majority Rule Consensus network: droped %d edges with weight <= %d from the full network (%d edges)" % (len(mjdropedges), minfreq, len(gOG.es)))
                # find connected components (i.e. perform clustering)
                compsOGs = mjgOG.components()
                # resolve conflicts in orthology classification
                mjgOG, compsOGs = enforceUnicity(mjgOG, compsOGs, getVertexClustering, communitymethod='components', **kw)
                # write results
                writeGraphCombinedOrthologs(
                    nfoutrad, "majrule_combined_%f" % majRuleCombine, mjgOG, compsOGs, llabs,
                    colourCombinedTree=colourCombinedTree, recgt=recgt0, drevnexustrans=drevnexustrans,
                    ltax=ltaxnexus, dtranslate=dnexustrans, ltreenames=["tree_0"], figtree=True)
            if graphCombine:
                # find communities (i.e. perform clustering) in full weighted graph
                commsOGs = getVertexClustering(gOG, graphCombine)
                # resolve conflicts in orthology classification
                gOG, commsOGs = enforceUnicity(gOG, commsOGs, getVertexClustering, maxdrop=20, communitymethod=graphCombine, **kw)
                # write results
                writeGraphCombinedOrthologs(
                    nfoutrad, 'graph_combined_%s' % graphCombine, gOG, commsOGs, llabs,
                    colourCombinedTree=colourCombinedTree, recgt=recgt0, drevnexustrans=drevnexustrans,
                    ltax=ltaxnexus, dtranslate=dnexustrans, ltreenames=["tree_0"], figtree=True)
if not os.path.isdir(dirlineageout): os.mkdir(dirlineageout) curfamily = None curlineage = None curspetree = None dnodefreq = {} ltrans = [] header = linesplit(flineagecommevents.readline()) for line in flineagecommevents: family, lineage, event, freq, evtype, reclabel, donlabel = linesplit(line) if family!=curfamily: if dirrec: # load pobability density of gene presence of the whole gene family over the species tree # !!! when reconciliation used partially collapsed species tree, requires a matching of uncollapsed to collapsed nodes (NOT IMPLEMENTED) if family not in dfamspetree: nfrec = os.path.join(dirrec, "%s%s"%(family, recfilesuffix)) recspetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt = parseALERecFile(nfrec, reftreelen=refspetree) for node in recspetree: node.branchwidth = float(dnodeevt[node.label()][-1])/scaleFreqToWidth dfamspetree[family] = recspetree else: recspetree = dfamspetree[family] curfamily = family if lineage!=curlineage: if not (curlineage is None): # write out previous lineage projection nfoutrad = os.path.join(dirlineageout, "lineage_%s_projection"%curlineage) curspetree.write_newick(nfoutrad+".nwk", ignoreBS=False) curspetree.writeSvgTree(nfoutrad+".svg", padleaves=True, supports=False, phylofact=10000, branchwidths='branchwidth', \ treetype='species', transfers=ltrans, textorbit=5, modstyle="stroke-width:1; ", \ transfercolor=transferColor, transferpathtype='arc', transferwidth='freq') if dirrec: curspetree = copy.deepcopy(recspetree)
def parseRec(nfrec, refspetree=None, ALEmodel='undated', drefspeeventTup2Ids=None, onlyLineages=[], recordEvTypes='DTS', minFreqReport=0, returnDict=True,
             lineageTableOutDir=None, noTranslateSpeTree=False, allEventByLineageByGenetree=False, verbose=False):
    """Parse a sample of reconciled gene trees and count the sampled events by gene lineage.

    Parameters:
      nfrec          -- path to the reconciliation file.
      refspetree     -- reference species tree; used to translate the species tree
                        read from the file (unless noTranslateSpeTree) and, when
                        set, checked to have the same topology as the result.
      ALEmodel       -- forwarded to the tree/event parsers; with 'dated', the
                        outside taxon label is added to the name translation map.
      drefspeeventTup2Ids -- forwarded to translateEventLineage(); when set, events
                        are written to the table as str(event).
      onlyLineages   -- forwarded to eventLineages() as `onlyLeaves`.
      recordEvTypes  -- event types to record; also part of the table file name.
      minFreqReport  -- when > 0, events seen in a lineage in less than this
                        fraction of the sampled trees are discarded.
      returnDict     -- store the event counts in the returned dict.
      lineageTableOutDir -- when set, write the tab-separated table
                        '<basename(nfrec)>.<recordEvTypes>.eventlineages' there.
      noTranslateSpeTree -- skip translation of the species tree labels.
      allEventByLineageByGenetree -- keep every lineage's event list for every
                        sampled tree (can be a very heavy object) and derive the
                        counts from it afterwards.
      verbose        -- print the trees and the name translation map.

    Returns a dict with key 'nfrec', and if returnDict is true:
      'devtlineagecount': {gene leaf label: {event: count over the sample}}
    plus, if allEventByLineageByGenetree is also true:
      'allrectevtlineages': {gene leaf label: [event lineage of each sampled tree]}
      'dexactevt': cache of event frequencies filled by pAr.parseRecGeneTree(),
                   irrespective of the lineage in which they occurred.

    Raises ValueError when neither returnDict nor lineageTableOutDir is set.
    """
    from collections import Counter
    from itertools import chain

    if not (returnDict or lineageTableOutDir):
        raise ValueError("no output option chosen")
    print(nfrec)
    # parse reconciliation file and extract collapsed species tree, mapping of events (with freq.)
    # on the species tree, and reconciled gene trees
    colspetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt = pAr.parseALERecFile(nfrec)
    nsample = len(lrecgt)
    recgtsample = ''.join(recgtlines)
    if not noTranslateSpeTree:
        tcolspetree, dcol2fullspenames = translateRecStree(colspetree, refspetree)
    else:
        # no need to translate
        tcolspetree, dcol2fullspenames = colspetree, {}
    if refspetree:
        assert refspetree.hasSameTopology(tcolspetree, checkInternalLabels=True)
    if verbose:
        if refspetree:
            # guarded so that verbose mode does not fail when no reference tree is given
            print('refspetree: %s' % (refspetree.newick(ignoreBS=True),))
        print('colspetree: %s' % (colspetree.newick(ignoreBS=True),))
        print('dcol2fullspenames: %s' % (dcol2fullspenames,))
    if ALEmodel == 'dated':
        # add reference for '#OUTSIDE#' taxon
        dcol2fullspenames[outtaxlab] = outtaxlab

    # parse reconciled gene trees
    # and extract (exact) event-wise event frequency
    dexactevt = {}
    devtlineagecount = {}
    allrectevtlineages = {}
    for recgt in lrecgt:
        # gather scenario-specific events (i.e. dependent on reconciled gene tree topology,
        # which varies among the sample);
        # here events involving a replacement clade (RC) or leaf (CC) are excluded
        # * 'dexactevt' is used as cache to store frequencies of events as inferred from regex
        #   searches of the event pattern; these frequencies are not specific to gene lineages,
        #   but aggregate the counts over the whole gene family
        # * 'dlevt' is of no use and here returned empty because of fillDTLSdict=False
        dlevt, dnodeallevt = pAr.parseRecGeneTree(
            recgt, colspetree, ALEmodel=ALEmodel, dexactevt=dexactevt, recgtsample=recgtsample,
            nsample=nsample, fillDTLSdict=False, recordEvTypes=recordEvTypes,
            excludeTaggedLeaves=collapsedcladetag, excludeTaggedSubtrees=replacementcladetag, verbose=verbose)
        evtlineages = eventLineages(recgt, dnodeallevt, ALEmodel=ALEmodel, onlyLeaves=onlyLineages, recordEvTypes=recordEvTypes)
        print('evtlineages: %s' % (evtlineages,))
        tevtlineages = translateEventLineage(evtlineages, dcol2fullspenames, drefspeeventTup2Ids)
        print('tevtlineages: %s' % (tevtlineages,))
        if allEventByLineageByGenetree:
            # store all events in a lineage, for all the lineages in the reconciled gene tree,
            # for all the reconciled gene trees in the sample. IT CAN BE A VERY HEAVY OBJECT.
            for geneleaflab, evtlineage in tevtlineages.items():
                allrectevtlineages.setdefault(geneleaflab, []).append(evtlineage)
        else:
            # aggregate counts immediately: more efficient in memory use
            for geneleaflab, evtlineage in tevtlineages.items():
                for evtup in evtlineage:
                    lineagecount = devtlineagecount.setdefault(geneleaflab, {})
                    lineagecount[evtup] = lineagecount.get(evtup, 0) + 1

    if allEventByLineageByGenetree:
        devtlineagecount = {}
        for geneleaflab, allreclineages in allrectevtlineages.items():
            # combine event counts across the sample
            fevent = dict(Counter(chain.from_iterable(allreclineages)))
            if minFreqReport > 0:
                # skip the low-frequency events (filtering is per event, as in the other branch)
                fevent = {evtup: n for evtup, n in fevent.items() if float(n) / nsample >= minFreqReport}
            devtlineagecount[geneleaflab] = fevent
    elif minFreqReport > 0:
        # cleanup by deleting low-frequency events a posteriori
        for eventlineage in devtlineagecount.values():
            # iterate over a copy of the items, as entries are deleted on the way
            for evtup, fevent in list(eventlineage.items()):
                if float(fevent) / nsample < minFreqReport:
                    del eventlineage[evtup]

    # optionally write out events gene by gene (those that occurred at least once above a gene
    # in [rooted] reconciled gene tree, and at which frequency)
    if lineageTableOutDir:
        nfTableEventsOut = os.path.join(lineageTableOutDir, "%s.%s.eventlineages" % (os.path.basename(nfrec), recordEvTypes))
        with open(nfTableEventsOut, 'w') as fTableOut:
            for geneleaflab in sorted(devtlineagecount):
                for evtup, freq in devtlineagecount[geneleaflab].items():
                    if drefspeeventTup2Ids:
                        fields = (geneleaflab, str(evtup), str(freq))
                    else:
                        # the event is a tuple of strings: one column per element
                        fields = (geneleaflab,) + evtup + (str(freq),)
                    fTableOut.write('\t'.join(fields) + '\n')
        print("stored events listed by gene lineage in '%s'" % nfTableEventsOut)
    sys.stdout.flush()

    retd = {'nfrec': nfrec}
    if returnDict:
        retd['devtlineagecount'] = devtlineagecount
        if allEventByLineageByGenetree:
            retd['allrectevtlineages'] = allrectevtlineages
            retd['dexactevt'] = dexactevt
    # always hand back the result dict: previously nothing was returned when returnDict was true
    return retd