예제 #1
0
def orthoFromSampleRecs(nfrec, outortdir, nsample=[], methods=['mixed'], \
                        foutdiffog=None, outputOGperSampledRecGT=True, colourTreePerSampledRecGT=False, \
                        graphCombine=None, majRuleCombine=None, **kw):
	"""Infer orthologous groups (OGs) from the sampled reconciled gene trees of one family and write them out.
	
	The reconciliation file is parsed with parseALERecFile(); for every reconciled gene tree of the sample, 
	OGs are computed with getOrthologues() using each requested method. Results can be written per sampled tree 
	and/or combined over the whole sample through a graph in which genes are vertices and 
	the weight of an edge counts in how many sampled trees the two genes were classified in the same OG.
	
	Arguments:
	- nfrec: path to the reconciliation file; the family name is the part of its base name before the first '-'.
	- outortdir: root output folder; files are written as <outortdir>/<method>/<family>_<method>* 
	  (the <method> sub-folders are not created here).
	- nsample: identifiers of the sampled reconciliations; passed on to parseALERecFile() and used to label 
	  the samples. When empty, samples are labelled by their rank in the parsed list.
	- methods: any of 'strict', 'unicopy', 'mixed'. The 'strict' OGs are also computed when only 'mixed' 
	  is requested, as they are given as input to the 'mixed' classification.
	- foutdiffog: optional object with a write() method; receives one tab-delimited line per sample with 
	  the family, the sample label, the three OG summaries and the three pairwise overlaps ('NA' when not computed).
	- outputOGperSampledRecGT: write '<...>.orthologs.per_sampled_tree', one OG per line (space-delimited genes), 
	  with a '#' line closing each sample.
	- colourTreePerSampledRecGT: write '<...>_orthologous_groups.nex' through writeRecGeneTreesColouredByOrthologs().
	- graphCombine: name of the community detection method handed to getVertexClustering() 
	  to cluster the full weighted graph.
	- majRuleCombine: fraction f of the sample size R; edges with weight <= f*R are dropped 
	  and the connected components of the remaining graph are reported.
	- **kw: 'verbose', 'userefspetree' and 'colourCombinedTree' are read here; 
	  the whole dict is also passed to getOrthologues() and enforceUnicity().
	
	Nothing is returned; the results are the files written.
	"""
	verbose = kw.get('verbose')
	fam = os.path.basename(nfrec).split('-', 1)[0]
	if verbose: print("\n# # # %s"%fam)
	# collect the desired sample from the reconciliation file
	dparserec = parseALERecFile(nfrec, skipLines=True, skipEventFreq=True, nsample=nsample, returnDict=True)
	lrecgt = dparserec['lrecgt']
	# the species tree stored in the reconciliation file is only used on request
	refspetree = dparserec['spetree'] if kw.get('userefspetree') else None
	colourCombinedTree = kw.get('colourCombinedTree')
	
	ddogs = {}
	dnexustrans = {}
	drevnexustrans = {}
	ltaxnexus = []
	llabs = []
	for i, recgenetree in enumerate(lrecgt):
		# label of the sample: user-provided identifier if any, rank in the list otherwise
		g = nsample[i] if nsample else i
		if verbose: print(recgenetree)
		if verbose: print("\n# # reconciliation sample %d"%g)
		N = recgenetree.nb_leaves()
		dlabs = {}
		if set(['strict', 'mixed']) & set(methods):
			if verbose: print("\n# strict_ogs:\n")
			strict_ogs, unclassified, dlabs = getOrthologues(recgenetree, method='strict', refspetree=refspetree, dlabs=dlabs, **kw)
			n1 = summaryOGs(strict_ogs, dlabs, N, verbose)
		else:
			strict_ogs = unclassified = None; n1 = 'NA'
		if 'unicopy' in methods:
			if verbose: print("\n# unicopy_ogs:\n")
			unicopy_ogs, notrelevant, dlabs = getOrthologues(recgenetree, method='unicopy', refspetree=refspetree, dlabs=dlabs, **kw)
			n2 = summaryOGs(unicopy_ogs, dlabs, N, verbose)
		else:
			unicopy_ogs = None; n2 = 'NA'
		if 'mixed' in methods:
			if verbose: print("\n# mixed_ogs:\n")
			# the 'mixed' classification builds on the result of the 'strict' one
			mixed_ogs, unclassified, dlabs = getOrthologues(recgenetree, method='mixed', strict_ogs=strict_ogs, unclassified=unclassified, refspetree=refspetree, dlabs=dlabs, **kw) #
			n3 = summaryOGs(mixed_ogs, dlabs, N, verbose)
		else:
			mixed_ogs = None; n3 = 'NA'
		
		if foutdiffog or verbose: 
			# number of OGs that are identical between two classifications
			o12 = str(sum([int(o in strict_ogs) for o in unicopy_ogs])) if (strict_ogs and unicopy_ogs) else 'NA'
			o13 = str(sum([int(o in strict_ogs) for o in mixed_ogs])) if (strict_ogs and mixed_ogs) else 'NA'
			o23 = str(sum([int(o in unicopy_ogs) for o in mixed_ogs])) if (mixed_ogs and unicopy_ogs) else 'NA'
		if verbose:
			print("\n# summary:\n")
			print("overlap strict_ogs with unicopy_ogs: %s"%o12)
			print("overlap strict_ogs with mixed_ogs: %s"%o13)
			print("overlap unicopy_ogs with mixed_ogs: %s"%o23)
		if foutdiffog:
			# NOTE(review): assumes summaryOGs() returns a string (n1, n2, n3 are joined as is) - TODO confirm
			foutdiffog.write('\t'.join([fam, str(g), n1, n2, n3, o12, o13, o23])+'\n')
		
		if colourTreePerSampledRecGT or colourCombinedTree:
			if i==0:
				recgenetree, dnexustrans, drevnexustrans, ltaxnexus = indexCleanTreeLabels(recgenetree, dlabs)
			else:
				# re-use the translation tables built from the first tree of the sample
				recgenetree, dnexustrans, drevnexustrans, ltaxnexus = indexCleanTreeLabels(recgenetree, dlabs, \
				         dnexustrans=dnexustrans, drevnexustrans=drevnexustrans, ltaxnexus=ltaxnexus, update=False)
		
		ddogs[g] = {'strict':strict_ogs, 'unicopy':unicopy_ogs, 'mixed':mixed_ogs}
		if verbose: print("\n# # # # # # # #")
		if i==0:
			# collect the leaf labels; just do once
			llabs = sorted(dlabs.values())
	
	R = len(lrecgt)
	gs = nsample if nsample else range(R)
	for method in methods:
		nfoutrad = os.path.join(outortdir, method, "%s_%s"%(fam, method))
		if colourTreePerSampledRecGT:
			logs = [ddogs[g][method] for g in gs]
			# name each tree after its sample label (the format was previously never applied, 
			# so that all trees were given the literal name 'tree_%d')
			writeRecGeneTreesColouredByOrthologs(lrecgt, logs, nfoutrad+"_orthologous_groups.nex", drevnexustrans, \
				treenames=["tree_%d"%g for g in gs], ltax=ltaxnexus, dtranslate=dnexustrans, figtree=True)
		if outputOGperSampledRecGT:
			with open(nfoutrad+".orthologs.per_sampled_tree", 'w') as foutort:
				for g in gs:
					ogs = ddogs[g][method]
					foutort.write('\n'.join([' '.join(x) for x in ogs])+'\n#\n')
		
		if graphCombine or majRuleCombine:
			## for later output
			recgt0 = lrecgt[0] if colourCombinedTree else None 
			# could also use the ALE consensus tree, which has branch supports but has no lengths
			## first make a dict of edge frequencies
			dedgefreq = {}
			for g in gs:
				ogs = ddogs[g][method]
				for og in ogs:
					if len(og)==1:
						# orphan gene: record it as a self-loop
						orfan = og[0] ; combo = (orfan, orfan)
						dedgefreq[combo] = dedgefreq.get(combo, 0) + 1
					else:
						# get all pairs of genes in the OG; sorting makes the pair key independent of the gene order in the OG
						combogs = combinations(sorted(og), 2)
						# add the counts
						for combo in combogs:
							dedgefreq[combo] = dedgefreq.get(combo, 0) + 1
			## build a graph of connectivity of the genes in OGs, integrating over the sample
			gOG = igraph.Graph()
			gOG.add_vertices(len(llabs))
			gOG.vs['name'] = llabs
			# first make a full weighted graph
			# add the edges to the graph
			# NOTE(review): the unpacking below fails if no OG at all was recorded (empty 'dedgefreq')
			edges, freqs = zip(*dedgefreq.items())
			gOG.add_edges(edges)
			gOG.es['weight'] = freqs
			if majRuleCombine:
				## make a majority rule unweighted graph
				mjgOG = gOG.copy()
				# select edges with frequency below the threshold
				mjdropedges = []
				minfreq = majRuleCombine*R
				for e in mjgOG.es:
					# use strict majority (assuming the parameter majRuleCombine=0.5, the default) to avoid obtaining family-wide single components
					if e['weight'] <= minfreq: mjdropedges.append(e.index)
				# remove the low-freq edges to the graph
				mjgOG.delete_edges(mjdropedges)
				if verbose: print("Majority Rule Consensus network: droped %d edges with weight <= %d from the full network (%d edges)"%(len(mjdropedges), minfreq, len(gOG.es)))
				# find connected components (i.e. perform clustering)
				compsOGs = mjgOG.components()
				# resolve conflicts in orthology classification
				mjgOG, compsOGs = enforceUnicity(mjgOG, compsOGs, getVertexClustering, communitymethod='components', **kw)
				# write results
				writeGraphCombinedOrthologs(nfoutrad, "majrule_combined_%f"%majRuleCombine, mjgOG, compsOGs, llabs, \
                                             colourCombinedTree=colourCombinedTree, recgt=recgt0, drevnexustrans=drevnexustrans, \
                                             ltax=ltaxnexus, dtranslate=dnexustrans, ltreenames=["tree_0"], figtree=True)
			if graphCombine:
				# find communities (i.e. perform clustering) in full weighted graph
				commsOGs = getVertexClustering(gOG, graphCombine)
				# resolve conflicts in orthology classification
				gOG, commsOGs = enforceUnicity(gOG, commsOGs, getVertexClustering, maxdrop=20, communitymethod=graphCombine, **kw)
				# write results
				writeGraphCombinedOrthologs(nfoutrad, 'graph_combined_%s'%graphCombine, gOG, commsOGs, llabs, \
                                             colourCombinedTree=colourCombinedTree, recgt=recgt0, drevnexustrans=drevnexustrans, \
                                             ltax=ltaxnexus, dtranslate=dnexustrans, ltreenames=["tree_0"], figtree=True)
	if not os.path.isdir(dirlineageout): os.mkdir(dirlineageout)
	curfamily = None
	curlineage = None
	curspetree = None
	dnodefreq = {}
	ltrans = []
	header = linesplit(flineagecommevents.readline())
	for line in flineagecommevents:
		family, lineage, event, freq, evtype, reclabel, donlabel = linesplit(line)
		if family!=curfamily:
			if dirrec:
				# load pobability density of gene presence of the whole gene family over the species tree
				# !!! when reconciliation used partially collapsed species tree, requires a matching of uncollapsed to collapsed nodes (NOT IMPLEMENTED)
				if family not in dfamspetree:
					nfrec = os.path.join(dirrec, "%s%s"%(family, recfilesuffix))
					recspetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt = parseALERecFile(nfrec, reftreelen=refspetree)
					for node in recspetree:
						node.branchwidth = float(dnodeevt[node.label()][-1])/scaleFreqToWidth
					dfamspetree[family] = recspetree
				else:
					recspetree = dfamspetree[family]
			curfamily = family
		if lineage!=curlineage:
			if not (curlineage is None):
				# write out previous lineage projection
				nfoutrad = os.path.join(dirlineageout, "lineage_%s_projection"%curlineage)
				curspetree.write_newick(nfoutrad+".nwk", ignoreBS=False)
				curspetree.writeSvgTree(nfoutrad+".svg", padleaves=True, supports=False, phylofact=10000, branchwidths='branchwidth', \
		                                  treetype='species', transfers=ltrans, textorbit=5, modstyle="stroke-width:1; ", \
		                                  transfercolor=transferColor, transferpathtype='arc', transferwidth='freq')
			if dirrec: curspetree = copy.deepcopy(recspetree)
def parseRec(nfrec, refspetree=None, ALEmodel='undated', drefspeeventTup2Ids=None, onlyLineages=[], recordEvTypes='DTS', minFreqReport=0, returnDict=True, \
             lineageTableOutDir=None, noTranslateSpeTree=False, allEventByLineageByGenetree=False, verbose=False):
	"""parse reconciled gene tree sample, returning sampled events by gene lineage
	
	Arguments:
	- nfrec: path to the reconciliation file, parsed with pAr.parseALERecFile().
	- refspetree: optional reference species tree. Unless 'noTranslateSpeTree' is set, it is handed 
	  to translateRecStree() together with the species tree found in the reconciliation file; 
	  otherwise, if provided, it is only checked to have the same topology as that tree.
	- ALEmodel: when 'dated', the outside taxon label is added to the species name translation table.
	- drefspeeventTup2Ids: passed to translateEventLineage(); when set, events are written 
	  to the output table as a single field rather than as one field per element of the event tuple.
	- onlyLineages: passed to eventLineages() as its 'onlyLeaves' argument.
	- recordEvTypes: event types to record; also used in the name of the output table.
	- minFreqReport: events observed at a frequency (count / sample size) below this value are not reported.
	- returnDict: include the event counts in the returned dict.
	- lineageTableOutDir: if set, folder where to write '<basename of nfrec>.<recordEvTypes>.eventlineages', 
	  a tab-delimited table of events by gene lineage with their count.
	- allEventByLineageByGenetree: keep every observed lineage of events in memory (can be heavy) 
	  rather than aggregating the counts on the fly; enables the more detailed output described below.
	
	Returns a dict that always holds 'nfrec' and, if 'returnDict' is true: 
	{
	 'devtlineagecount': <dict of all events and total observed frequency by lineage>, 
	}
	and, only if allEventByLineageByGenetree is also true: 
	{
	 'allrectevtlineages': <dict of all single observed events by lineage by gene tree in the sample>, 
	 'dexactevt': <dict of frequencies of events, irrespective of the lineage in which they ocurred>
	}
	
	Raises ValueError if neither 'returnDict' nor 'lineageTableOutDir' is set.
	"""
	if not (returnDict or lineageTableOutDir): raise ValueError("no output option chosen")
	print(nfrec)
	# parse reconciliation file and extract collapsed species tree, mapping of events (with freq.) on the species tree, and reconciled gene trees
	colspetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt = pAr.parseALERecFile(nfrec)
	nsample = len(lrecgt)
	recgtsample = ''.join(recgtlines)
	if not noTranslateSpeTree:
		tcolspetree, dcol2fullspenames = translateRecStree(colspetree, refspetree)
	else:
		# no need to translate
		tcolspetree, dcol2fullspenames = colspetree, {}
		if refspetree:
			assert refspetree.hasSameTopology(tcolspetree, checkInternalLabels=True)
	if verbose:
		# 'refspetree' is optional: do not call its methods when it was not provided
		print('refspetree: %s'%(refspetree.newick(ignoreBS=True) if refspetree is not None else None,))
		print('colspetree: %s'%(colspetree.newick(ignoreBS=True),))
		print('dcol2fullspenames: %s'%(dcol2fullspenames,))
	if ALEmodel=='dated':
		# add reference for '#OUTSIDE#' taxon
		dcol2fullspenames[outtaxlab] = outtaxlab
	# parse reconciled gene trees
	# and extract (exact) event-wise event frequency
	dexactevt = {}
	devtlineagecount = {}
	allrectevtlineages = {}
	for i, recgt in enumerate(lrecgt):
		# gather scenario-specific events (i.e. dependent on reconciled gene tree topology, which varies among the sample)
		dlevt, dnodeallevt = pAr.parseRecGeneTree(recgt, colspetree, ALEmodel=ALEmodel, dexactevt=dexactevt, recgtsample=recgtsample, \
		                                          nsample=nsample, fillDTLSdict=False, recordEvTypes=recordEvTypes, \
		                                          excludeTaggedLeaves=collapsedcladetag, excludeTaggedSubtrees=replacementcladetag, verbose=verbose)
		# here events involving a replacement clade (RC) or leaf (CC) are excluded
		# * 'dexactevt' is used as cache to store frequencies of events as inferred from regex searches of the event pattern
		# these frequencies are not specific to gene lineages, but aggregate the counts over the whole gene family
		# * 'dlevt' is of no use and here returned empty because of fillDTLSdict=False
		# would it not be empty, it could be translated to the full reference tree with:
		# tdlevt = {etype:translateEventList(ldtl, dcol2fullspenames, drefspeevents) for etype, ldtl in dlevt.iteritems()}
		evtlineages = eventLineages(recgt, dnodeallevt, ALEmodel=ALEmodel, onlyLeaves=onlyLineages, recordEvTypes=recordEvTypes)
		print('evtlineages: %s'%(evtlineages,))
		tevtlineages = translateEventLineage(evtlineages, dcol2fullspenames, drefspeeventTup2Ids)
		print('tevtlineages: %s'%(tevtlineages,))
		
		if allEventByLineageByGenetree:
			# one way to proceed is to build the object 'allrectevtlineages'
			# a dict that contains all events in a lineage, 
			# for all the lineages in reconciled gene tree, 
			# for all the reconciled gene trees in the ALE sample.
			# IT CAN BE A VERY HEAVY OBJECT.
			for geneleaflab, evtlineage in tevtlineages.items():
				allrectevtlineages.setdefault(geneleaflab, []).append(evtlineage)
		else:
			# another way is to aggregate data immediately
			# might be slower due to many updates of the 'devtlineagecount' dict,
			# but more efficient in memory use
			for geneleaflab, evtlineage in tevtlineages.items():
				for evtup in evtlineage:
					# the lineage entry is only created once it has at least one event
					dlineagecount = devtlineagecount.setdefault(geneleaflab, {})
					dlineagecount[evtup] = dlineagecount.get(evtup, 0) + 1
	
	if allEventByLineageByGenetree:
		devtlineagecount = {}
		for geneleaflab, allreclineages in allrectevtlineages.items():
			# combine event counts across the sample
			fevent = {}
			for evtlineage in allreclineages:
				for evtup in evtlineage:
					fevent[evtup] = fevent.get(evtup, 0) + 1
			if minFreqReport>0:
				# skips the low-frequency events (same rule as in the a posteriori cleanup below)
				fevent = {evtup:nevt for evtup, nevt in fevent.items() if float(nevt)/nsample >= minFreqReport}
			devtlineagecount[geneleaflab] = fevent
	elif minFreqReport>0:
		# cleanup by deleting low-frequency events a posteriori
		for eventlineage in devtlineagecount.values():
			# iterate over a copy of the items as entries are deleted on the way
			for evtup, fevent in list(eventlineage.items()):
				if float(fevent)/nsample < minFreqReport:
					del eventlineage[evtup]
	
	# optionally write out events gene by gene (those that occured at least once above a gene in [rooted] reconciled gene tree, and at which frequency)
	if lineageTableOutDir:
		nfTableEventsOut = os.path.join(lineageTableOutDir, "%s.%s.eventlineages"%(os.path.basename(nfrec), recordEvTypes))
		with open(nfTableEventsOut, 'w') as fTableOut:
			for geneleaflab in sorted(devtlineagecount):
				eventlineage = devtlineagecount[geneleaflab]
				for evtup, freq in eventlineage.items():
					if drefspeeventTup2Ids: fTableOut.write('\t'.join((geneleaflab, str(evtup), str(freq)))+'\n')
					else: fTableOut.write('\t'.join((geneleaflab,)+evtup+(str(freq),))+'\n')
		print("stored events listed by gene lineage in '%s'"%nfTableEventsOut)
	
	sys.stdout.flush()
	retd = {}
	retd['nfrec'] = nfrec
	if returnDict:
		retd['devtlineagecount'] = devtlineagecount
		if allEventByLineageByGenetree:
			retd['allrectevtlineages'] = allrectevtlineages
			retd['dexactevt'] = dexactevt
	# always hand back the result: it used to be returned only when 'returnDict' was false, 
	# so that the requested data were built and then lost
	return retd