예제 #1
0
def clique_with_peel(seed_i, cursor):
	"""
	Search for a quasi-clique containing seed_i, peel the seed graph, then search again.

	seed_i --- seed node; every returned clique must contain it
	cursor --- passed through unchanged to seed_graph() and peel()

	What the code does, in order:
	  1. build the seed graph and drop every node whose degree is
	     <= CLIQUE_MIN_SIZE * QUASI_GAMMA - 1
	  2. run p.grasp() at QUASI_GAMMA, started from the seed
	  3. call peel() with that clique, then run p.grasp() again on G
	     (peel() presumably shrinks G in place -- confirm against its definition)
	  4. hand the second result to p.local_extra() with the relaxed gamma 0.6

	Returns a tuple (Q0, Q) of node lists: Q0 is the step-3 clique, Q the clique
	after step 4.  Returns ([], []) whenever the seed is missing from the graph
	or from a clique.
	"""
	# TESTING (TODO: DELETE LATER) relaxed gamma for quasi=0.8 followed by quasi=0.6
	RELAXED_GAMMA = 0.6
	THRESHOLD = CLIQUE_MIN_SIZE * QUASI_GAMMA - 1

	G = seed_graph(seed_i, cursor)
	# first time we're lenient
	G.delete_nodes_from([n for n in G.nodes_iter() if G.degree(n) <= THRESHOLD])
	sys.stderr.write("initial seed graph gets {0} nodes, {1} edges\n".format(
			G.number_of_nodes(), G.number_of_edges()))

	G_nodes = G.nodes()
	try:
		seed_index = G_nodes.index(seed_i)
	except ValueError:
		# the degree filter above may have removed the seed itself; treat that
		# like every other "seed got kicked out" case instead of crashing
		sys.stderr.write("seed node is not in the initial seed graph (pruned for low degree?)\n")
		return ([], [])
	S, H = p.convert_graph_connectivity_to_sparse(G, G_nodes)
	tQ = p.grasp(S, H, QUASI_GAMMA, QUASI_MAXITR, seed_index)
	# grasp() works on positions in G_nodes; translate back to graph nodes
	Q = [G_nodes[i] for i in tQ]

	sys.stderr.write("before peel clique size {0}, graph {1} nodes, {2} edges\n".format(
			len(Q), G.number_of_nodes(), G.number_of_edges()))

	if seed_i not in Q:
		sys.stderr.write("seed node got kicked out PART 1. NOOOOO!!!\n")
		return ([], [])  #cHANGE BACK LATER

	peel(G, seed_i, Q, QUASI_GAMMA, cursor)
	sys.stderr.write("after peel seed graph gets {0} nodes, {1} edges\n".format(
			G.number_of_nodes(), G.number_of_edges()))

	G_nodes = G.nodes()
	try:
		seed_index = G_nodes.index(seed_i)
	except ValueError:
		sys.stderr.write("seed node is not in the seed graph after peel\n")
		return ([], [])
	S, H = p.convert_graph_connectivity_to_sparse(G, G_nodes)
	sys.stderr.write("index of seed i {0}\n".format(seed_index))
	tQ = p.grasp(S, H, QUASI_GAMMA, QUASI_MAXITR, seed_index)
	sys.stderr.write("after peel clique size {0}\n".format(len(tQ)))
	# snapshot the clique BEFORE local_extra(): tQ is re-read afterwards, so
	# local_extra() presumably modifies it in place
	Q0 = [G_nodes[i] for i in tQ]
	p.local_extra(H, tQ, RELAXED_GAMMA)
	sys.stderr.write("after local quasi=0.6 clique size {0}\n".format(len(tQ)))

	Q = [G_nodes[i] for i in tQ]

	if seed_i not in Q:
		sys.stderr.write("seed node got kicked out PART 2. NOOOOO!!!\n")
		return ([], [])

	# sanity check, delete later (TODO)
	if len(Q) <= CLIQUE_MIN_SIZE:
		return (Q0, Q) # this line is just for avoiding error at the sanity check, delete later
	threshold = RELAXED_GAMMA * len(Q) # CHANGE THIS BACK LATER #threshold = QUASI_GAMMA * len(Q)
	for q in Q:
		# every member must be adjacent to at least `threshold` members of Q
		assert sum(G.has_edge(q, x) for x in Q) >= threshold

	return (Q0, Q) # CHANGE BACK TO return Q later
예제 #2
0
def init_cluster_by_clique(fasta_filename, qver_get_func, bestn=100, ece_penalty=1, ece_min_len=20, nproc=8, maxScore=-1000):
    """
    fasta_filename --- initial fasta filename, probably called aloha.fa_split00.fa
    qver_get_func --- function that returns QVs on reads
    bestn --- parameter in BLASR, higher helps in finding perfect cliques but bigger output
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate to input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-blasr input then iteratively find all mutually exclusive cliques (in decreasing size)
    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by ICE

    BLASR is only run when <fasta_filename>.self.blasr does not exist yet; an
    existing file is reused as is.  Sequences that end up in no clique each
    get a singleton cluster.
    """
    out_filename = fasta_filename + '.self.blasr'

    if os.path.exists(out_filename):
        sys.stderr.write("{0} already exists. No need to run BLASR.\n".format(out_filename))
    else:
        # argument list instead of a shell string: file names containing spaces
        # or shell metacharacters are passed to blasr verbatim
        cmd = ["blasr", fasta_filename, fasta_filename,
               "-m", "5",
               "-maxLCPLength", "15",
               "-nproc", "{0}".format(nproc),
               "-maxScore", "{0}".format(maxScore),
               "-bestn", "{0}".format(bestn),
               "-nCandidates", "{0}".format(bestn),
               "-out", out_filename]
        sys.stderr.write(" ".join(cmd) + "\n")
        subprocess.check_call(cmd)

    G = nx.Graph()
    for r in iCEC.blasr_against_ref(out_filename, is_FL=True, sID_starts_with_c=False, qver_get_func=qver_get_func, ece_penalty=ece_penalty, ece_min_len=ece_min_len):
        if r[0] == r[1]: continue # self hit, ignore
        # only records whose last field is set become edges
        if r[-1] is not None:
            sys.stderr.write("adding edge {0},{1}\n".format(r[0], r[1]))
            G.add_edge(r[0], r[1])

    uc = {}
    used = set()  # ids already placed in a clique; a set keeps the lookups below O(1)
    ind = 0

    # visit nodes by decreasing degree (degrees are taken once, before any removal).
    # dict()/list() are no-ops on the dict/list the networkx 1.x calls return and
    # also accept the views/iterators returned by newer networkx releases.
    deg = sorted(dict(G.degree()).items(), key=lambda x: x[1], reverse=True)
    for node, _ in deg:
        if node not in G: continue  # already consumed by an earlier clique
        # just get the immediate neighbors since we're looking for perfect cliques
        G_prime = G.subgraph([node] + list(G.neighbors(node)))
        G_prime_nodes = list(G_prime.nodes())
        S, H = pClique.convert_graph_connectivity_to_sparse(G_prime, G_prime_nodes)
        seed_i = G_prime_nodes.index(node)
        tQ = pClique.grasp(S, H, 1., 5, seed_i)
        # len() rather than truthiness: the type grasp() returns is not visible here
        if len(tQ) > 0:
            c = [G_prime_nodes[i] for i in tQ]
            uc[ind] = c
            ind += 1
            used.update(c)
            G.remove_nodes_from(c)

    # (an earlier version enumerated nx.find_cliques(G) instead; that was
    # inefficient on large graphs)

    # everything not in a clique becomes its own cluster
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            if r.id not in used:
                uc[ind] = [r.id]
                ind += 1

    return uc
예제 #3
0
# Text written at the top of every .gv file.  The tab-indented continuation
# lines are part of the emitted text and are kept byte-for-byte.
_GV_HEADER = ("graph test{\n"
		"\t\t\tedge [ dir=none ];\n"
		"\t\t\tnode [ style=filled, fontsize=2.0, height=0.1, width=0.1, fixedsize=true ];\n"
		"\t\t\t")


def _write_shuffled_fasta(rfam_fam, shuffle_ratio, fasta_filename):
	"""Write the family's sequences (TP) plus shuffle_ratio shuffles of each (FP); return dict of fasta id --> running index."""
	dummy_id = 0
	nodes_to_index = {}
	with open(fasta_filename, 'w') as f:
		with get_conn_ncRNA() as cursor:
			# NOTE(review): rfam_fam is interpolated straight into the SQL text.
			# Switch to a parameterized query once the driver's paramstyle is confirmed.
			cursor.execute("select id,seq from Rfam_fasta where rfam_fam='{fam}' order by id".format(fam=rfam_fam))
			for _id, seq in cursor.fetchall():
				acc = "TP{0}_{1}".format(dummy_id, _id)
				f.write(">{id}\n{seq}\n".format(id=acc, seq=seq))
				nodes_to_index[acc] = dummy_id
				dummy_id += 1
				# presumably primes ushuffle so that each shuffle2() call below
				# yields another shuffle of seq -- confirm against the ushuffle docs
				ushuffle.shuffle(seq, len(seq), 2)
				for _ in range(shuffle_ratio):
					acc = "FP{0}_{1}".format(dummy_id, _id)
					f.write(">{id}\n{seq}\n".format(id=acc, seq=ushuffle.shuffle2()))
					nodes_to_index[acc] = dummy_id
					dummy_id += 1
	return nodes_to_index


def _load_homology_graph(blast_output):
	"""Read <blast_output>.parsed/.sets_for_nodes/.nodes_to_index; return (G, sets_for_nodes, nodes_to_index)."""
	G = Graph()
	sets_for_nodes = {}
	nodes_to_index = {}
	with open(blast_output+'.parsed') as handle:
		for line in handle:
			raw = [int(v) for v in line.strip().split('\t')]
			G.add_edge(raw[0], raw[1])
	with open(blast_output+'.sets_for_nodes') as handle:
		for line in handle:
			raw = [int(v) for v in line.strip().split('\t')]
			sets_for_nodes[raw[0]] = {'nodes_ind':raw[1],'start':raw[2],'end':raw[3]}
	with open(blast_output+'.nodes_to_index') as handle:
		for line in handle:
			raw = line.strip().split()
			nodes_to_index[int(raw[0])] = raw[1]
	return G, sets_for_nodes, nodes_to_index


def _count_negative_controls(G, nodes_to_index, sets_for_nodes):
	"""Count the nodes of G whose fasta id starts with 'FP'."""
	return sum(1 for n in G.nodes_iter()
			if nodes_to_index[sets_for_nodes[n]['nodes_ind']].startswith('FP'))


def _prune_low_degree_nodes(G, min_degree):
	"""Delete nodes of degree < min_degree from G, repeating until none is left."""
	low = [n for n in G.nodes_iter() if G.degree(n) < min_degree]
	while len(low) > 0:
		G.delete_nodes_from(low)
		low = [n for n in G.nodes_iter() if G.degree(n) < min_degree]


def _write_clique_graphviz(gv_filename, G, Q, scan_result, nodes_to_index, sets_for_nodes):
	"""Draw G to gv_filename (clique blue, scanned nodes orange, rest grey); return the {'TP','FP'} counts of scanned nodes."""
	in_clique = set(Q)
	scanned = {'TP':0, 'FP':0}
	with open(gv_filename, 'w') as outf:
		outf.write(_GV_HEADER)
		for n in G.nodes_iter():
			acc = nodes_to_index[sets_for_nodes[n]['nodes_ind']] # something like TP1_NC_XXXX.... or FP10_NC_XXXX....
			prefix = acc[:acc.find('_')]
			shape = 'circle' if acc.startswith('TP') else 'box'
			if n in in_clique:
				outf.write("{0} [color=dodgerblue1, shape={1}];\n".format(n, shape))
			elif acc in scan_result:
				outf.write("{0} [color=darkorange, shape={1}];\n".format(n, shape))
				scanned[prefix[:2]] += 1
			else:
				outf.write("{0} [color=grey, shape={1}];\n".format(n, shape))
		for (n1, n2) in G.edges_iter(data=False):
			outf.write("{0} -- {1};\n".format(n1, n2))
		outf.write("}")
	return scanned


def new_cluster_pipe(rfam_fam, shuffle_ratio):
	"""
	Build a TP/FP benchmark for one Rfam family and mine perfect cliques from its homology graph.

	rfam_fam --- Rfam family id; used in the DB query and in every output file name
	shuffle_ratio --- int, number of shuffled (FP) sequences written per real (TP) sequence

	Files written: Rfam_<fam>_shuffle<X>X.report always; the .fna fasta, the
	WU-BLAST output and <blast_output>.nodes_to_index only when the BLAST
	output does not exist yet; one .gv file per clique for which
	run_cmfinder() returns a scan result.

	Returns None normally.  Returns (Q, G) early when the sanity check finds
	that a clique reported by p.grasp() is missing an edge.
	"""
	assert type(shuffle_ratio) is int
	output_prefix = "Rfam_{fam}_shuffle{X}X".format(fam=rfam_fam, X=shuffle_ratio)
	fasta_filename = output_prefix+'.fna'
	blast_output = "{input}.M8N7Q16R2W3E2.WUblast".format(input=fasta_filename)

	# `with` closes the report on the early return below and on any exception
	with open(output_prefix+'.report', 'w') as report_f:
		if not os.path.exists(blast_output):
			nodes_to_index = _write_shuffled_fasta(rfam_fam, shuffle_ratio, fasta_filename)
			start_t = time.time()
			# now blast it
			os.system("xdformat -n -o {input} {input}".format(input=fasta_filename))
			# the tabs in the middle of the command come from the original
			# line continuation and are kept so the command text is unchanged
			os.system(("blastn -d {input} -i {input} -M 8 -N -7 -Q 16 -R 2 -E 2 "
					"\t\t\t\t-W 3 -mformat 2 -cpus 4 -o {output}").format(input=fasta_filename, output=blast_output))
			report_f.write("(1)  BLAST TIME: {0} sec\n".format(time.time()-start_t))

			# now parse the blast
			nodes_to_index = c1.NodesToIndex(nodes_to_index, -1)
			G = Graph()
			c1.step1_process_blast(blast_output=blast_output,
					score_cutoff=35, nodes_to_index=nodes_to_index, G=G, program='WU')
			sys.stderr.write("Homology graph has {0} nodes, {1} edges....\n".format(
					G.number_of_nodes(), G.number_of_edges()))
			# presumably this call writes the .parsed and .sets_for_nodes files
			# that are read back below -- confirm in c1
			c1.export_to_db(G, nodes_to_index, 0, blast_output)
			# invert nodes_to_index into dict nodes_ind --> acc id
			index_to_acc = dict((ind, acc) for acc, ind in nodes_to_index.d.items())
			with open(blast_output+'.nodes_to_index', 'w') as handle:
				for ind, acc in index_to_acc.items():
					handle.write("{0}\t{1}\n".format(ind, acc))

		# read back the .parsed and .sets_for_nodes files
		G, sets_for_nodes, nodes_to_index = _load_homology_graph(blast_output)

		tmp = _count_negative_controls(G, nodes_to_index, sets_for_nodes)
		report_f.write("(2)  AFTER parsing BLAST, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))

		# remove low deg (< NEW_MIN_CLIQUE_SIZE) nodes
		_prune_low_degree_nodes(G, NEW_MIN_CLIQUE_SIZE)
		tmp = _count_negative_controls(G, nodes_to_index, sets_for_nodes)
		report_f.write("(3)  AFTER recursively removing nodes of degree < 3, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))
		report_f.write("----------------------------------------------------------------------------\n")
		report_f.write("OUT\tDIR\tCLIQUE_SIZE\tSCANNED_TP\tSCANNED_FP\tCM_time\n")
		report_f.write("----------------------------------------------------------------------------\n")

		# for now just brute force....go through node by node as seeds
		dummy_round = 0
		while G.number_of_nodes()>=NEW_MIN_CLIQUE_SIZE and G.number_of_edges()>=NEW_MIN_CLIQUE_SIZE:
			# find perfect max cliques with a random starting node
			G_nodes = G.nodes()
			S, H = p.convert_graph_connectivity_to_sparse(G, G_nodes)
			tQ = p.grasp(S, H, gamma=1.0, maxitr=20, given_starting_node=None)
			Q = [G_nodes[i] for i in tQ]
			if len(Q) < NEW_MIN_CLIQUE_SIZE: # delete these nodes
				G.delete_nodes_from(Q)
				continue
			# PERFECT CLIQUE SANITY TESTING, DELETE LATER
			# explicit test instead of assert + bare except: still runs under
			# python -O and no longer swallows unrelated exceptions
			for x in Q:
				for y in Q:
					if x != y:
						sys.stderr.write("testing.... {0} {1}\n".format(x, y))
						if not G.has_edge(x, y):
							return Q, G
			Q.sort()
			sys.stderr.write("clique is... {0}\n".format(Q))
			prefix = output_prefix + str(dummy_round) + '_size' + str(len(Q)) + '_'
			dummy_round += 1
			start_t = time.time()
			scan_dir, scan_result = run_cmfinder(Q, nodes_to_index, sets_for_nodes, prefix, os.path.abspath(fasta_filename))
			cm_time = time.time()-start_t
			if scan_result is not None:
				gv_filename = os.path.basename(scan_dir)+'.gv'
				scanned = _write_clique_graphviz(gv_filename, G, Q, scan_result, nodes_to_index, sets_for_nodes)
				report_f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(gv_filename,scan_dir,len(Q),scanned['TP'],scanned['FP'],cm_time))
				report_f.flush()
			# delete the clique's edges from the graph
			G.delete_edges_from(itertools.combinations(Q, 2))
			# again, remove low-degree nodes
			_prune_low_degree_nodes(G, NEW_MIN_CLIQUE_SIZE)