예제 #1
0
def get_guild_values(parameters, targets, source_to_genes, method='s', dump_file=None):
    if dump_file is None:
	dump_file = parameters.get("guild_file")  
    if os.path.exists(dump_file):
	source_to_target_to_score = cPickle.load(open(dump_file))
	return source_to_target_to_score 
    source_to_target_to_score = {} 
    f = open(dump_file + ".txt", 'w')
    f.write("source\ttarget\tscore\n")
    for target in targets:
	target_mod = text_utilities.convert_to_R_string(target)
	out_file = parameters.get("output_dir") + "/%s.n%s" % (target_mod, method) 
	if not os.path.exists(out_file):
	    print "File not found:", out_file
	    raise ValueError("GUILD values missing!")
	node_to_score = dict(line.strip("\n").split() for line in open(out_file).readlines())
	values = map(float, numpy.array(node_to_score.values()))
	m = numpy.mean(values)
	s = numpy.std(values)
	for source, genes in source_to_genes.iteritems(): 
	    score = numpy.mean([(float(node_to_score[gene]) - m) / s for gene in genes])
	    d = source_to_target_to_score.setdefault(source, {})
	    d[target] = score
	    f.write("%s\t%s\t%f\n" % (source, target, score))
    f.close()
    if dump_file is not None:
	cPickle.dump(source_to_target_to_score, open(dump_file, 'w'))
    return source_to_target_to_score
예제 #2
0
def run_guild_on_cluster(parameters, target_to_geneids, run_mode='array job', method = 's'):
    """
    run_mode: array job | run local 
    method: netshort 's' | netrank 'r'
    """
    network_lcc_file = parameters.get("network_file")
    executable_path = parameters.get("guild_path")
    output_dir = parameters.get("output_dir") + "/"
    qname = "all.q"
    delay = 10
    network = wrappers.get_network(parameters.get("network_file"), only_lcc = True) # already using LCC file
    nodes = network.nodes()
    for target, geneids in target_to_geneids.iteritems():
	#print target, len(geneids_target)
	target = text_utilities.convert_to_R_string(target)
	target_to_score = dict((gene, 1.0) for gene in geneids)
	out_file = parameters.get("output_dir") + "/%s.n%s" % (target, method) 
	if os.path.exists(out_file):
	    continue
	if run_mode == "run local":
	    qName = None
	elif run_mode != "array job":
	    raise ValueError("Unknown run_mode: %s" % run_mode)
	score_command = wrappers.run_guild(target, target_to_score, nodes, network_lcc_file, output_dir, executable_path, background_score = 0.01, qname = qname, method = method) 
    return
예제 #3
0
def get_proximity_values(parameters, source_to_geneids, target_to_geneids, dump_file=None, convert_names=True, md5_conversion=False):
    if dump_file is None:
	dump_file = parameters.get("proximity_file")  
    if os.path.exists(dump_file):
	try:
	    source_to_target_to_proximity, source_to_target_to_d = cPickle.load(open(dump_file))
	except: # For old dumps storing only z
	    raise ValueError("Update proximity dump to store d in addition to z!") # print
	    source_to_target_to_proximity = cPickle.load(open(dump_file))
	    source_to_target_to_d = None
	return source_to_target_to_proximity, source_to_target_to_d 
    source_to_target_to_proximity = {} # before source was stored as R string
    source_to_target_to_d = {} 
    f = open(dump_file + ".txt", 'w')
    f.write("source\ttarget\tz\td\n")
    for source, geneids_source in source_to_geneids.iteritems(): 
	source_mod = source
	if convert_names:
	    source_mod = text_utilities.convert_to_R_string(source) 
	if md5_conversion:
	    md5 = hashlib.md5("-".join(sorted(geneids_source))).hexdigest()
	    source_mod = md5
	source_to_target_to_proximity[source] = {}
	source_to_target_to_d[source] = {}
	for target in target_to_geneids:
	    target_mod = target
	    if convert_names:
		target_mod = text_utilities.convert_to_R_string(target)
		#target_mod = target_mod.lower() 
	    out_file = parameters.get("output_dir") + "/%s_%s.out" % (source_mod, target_mod) 
	    if not os.path.exists(out_file):
		print "File not found:", out_file
		#continue 
		raise ValueError("Proximity values missing!")
	    z, d, m, s = open(out_file).readline().strip("\n").split()
	    source_to_target_to_proximity[source][target] = float(z)
	    source_to_target_to_d[source][target] = float(d)
	    f.write("%s\t%s\t%s\t%s\n" % (source, target, z, d))
    f.close()
    if dump_file is not None:
	cPickle.dump((source_to_target_to_proximity, source_to_target_to_d), open(dump_file, 'w'))
    return source_to_target_to_proximity, source_to_target_to_d
예제 #4
0
def guildify_multiple(network_lcc_file,
                      from_file,
                      to_file,
                      output_dir,
                      out_file,
                      method="s",
                      executable_path=None):
    network = get_network(network_lcc_file,
                          only_lcc=False)  # already using LCC
    nodes = set(network.nodes())
    disease_to_genes, disease_to_category = get_diseasome_genes(to_file,
                                                                nodes=nodes)
    drug_to_targets, drug_to_category = get_diseasome_genes(from_file,
                                                            nodes=nodes)
    f = open(out_file, 'w')
    f.write("source\ttarget\tscore\n")
    for target, geneids in disease_to_genes.iteritems():
        #print target, len(geneids)
        target_mod = text_utilities.convert_to_R_string(target)
        target_to_score = dict((gene, 1.0) for gene in geneids)
        node_file = output_dir + "%s.n%s" % (target_mod, method)
        if os.path.exists(node_file):
            print "Skipping existing:", node_file
            continue
        run_guild(target_mod,
                  target_to_score,
                  nodes,
                  network_lcc_file,
                  output_dir,
                  executable_path,
                  background_score=0.01,
                  qname=None,
                  method=method)
        node_to_score = dict(
            line.strip("\n").split() for line in open(node_file).readlines())
        values = map(float, numpy.array(node_to_score.values()))
        m = numpy.mean(values)
        s = numpy.std(values)
        for source, geneids in drug_to_targets.iteritems():
            score = -numpy.mean([(float(node_to_score[gene]) - m) / s
                                 for gene in geneids])
            f.write("%s\t%s\t%f\n" % (source, target, score))
    f.close()
    return
예제 #5
0
파일: wrappers.py 프로젝트: smw1414/toolbox
def guildify_multiple(network_file, to_file, output_dir, from_file=None, out_file="guild.txt", method="s", executable_path=None):
    """
    to_file: seeds
    If from_file is not None, returns a dictionary containing average z scores of targets to source, otherwise returns empty dictionary
    method: d | s | r | w | p
    (netshort | netscore | page rank | random walk | propagation)
    """
    if from_file is not None and os.path.exists(out_file):
	target_to_source_score = dict(line.strip("\n").split() for line in open(out_file).readlines())
	return target_to_source_score 
    target_to_source_score = {}
    network = get_network(network_file, only_lcc = True) # using LCC 
    if network_file.endswith(".lcc"):
	network_lcc_file = network_file
    else:
	network_lcc_file = network_file + ".lcc"
    nodes = set(network.nodes())
    disease_to_genes, disease_to_category = get_diseasome_genes(to_file, nodes = nodes)
    if not os.path.exists(output_dir):
	print "Creating output directory", output_dir
	os.makedirs(output_dir)
    # Generate background file (for P-value calculation)
    if not os.path.exists(output_dir + "/background.node"):
	node_to_degree = dict(network.degree())
	n = max(map(len, disease_to_genes.values()))
	values = node_to_degree.items()
	values.sort(key=lambda x: -x[1])
	#k = 1.0 * max(node_to_degree.values())
	values = set(zip(*values[:n])[0])
	f = open(output_dir + "/background.node", 'w')
	for node, degree in node_to_degree.iteritems():
	    #score = degree/k
	    if node in values: score = 1
	    else: score = 0.01
	    f.write("%s %f\n" % (node, score))
	f.close()
    if from_file is not None:
	drug_to_targets, drug_to_category = get_diseasome_genes(from_file, nodes = nodes)
	f = open(out_file, 'w')
	f.write("source\ttarget\tscore\n")
    for target, geneids in disease_to_genes.iteritems():
	#print target, len(geneids)
	target_mod = text_utilities.convert_to_R_string(target)
	target_to_score = dict((gene, 1.0) for gene in geneids)
	node_file = output_dir + "%s.n%s" % (target_mod, method) 
	if os.path.exists(node_file):
	    print "Skipping existing:", node_file
	    continue
	run_guild(target_mod, target_to_score, nodes, network_lcc_file, output_dir, executable_path, background_score = 0.01, qname = "print", method = method) #!
	node_to_score = dict(line.strip("\n").split() for line in open(node_file).readlines())
	if from_file is not None:
	    values = map(float, numpy.array(node_to_score.values()))
	    m = numpy.mean(values)
	    s = numpy.std(values)
	    for source, geneids in drug_to_targets.iteritems():
		score = -numpy.mean([(float(node_to_score[gene]) - m) / s for gene in geneids])
		f.write("%s\t%s\t%f\n" % (source, target, score))
		d = target_to_source_score.setdefault(target, {})
		d[source] = score
    if from_file is not None:
	f.close()
    return target_to_source_score
예제 #6
0
def run_proximity_on_cluster(parameters, source_to_geneids, target_to_geneids, run_mode='array job', convert_names=True, md5_conversion=False, n_start = 0, n_end = 640000):
    """
    run_mode: array job | single job | run local | run cluster
    """
    network_file = parameters.get("network_file")
    n_random = int(parameters.get("n_random"))
    min_bin_size = int(parameters.get("n_node_in_bins"))
    seed = int(parameters.get("random_seed"))
    executable_path = parameters.get("executable_path")
    try:
	cluster_dir = parameters.get("cluster_dir")
	network_file = network_file.replace(parameters.get("base_dir"), cluster_dir) 
    except:
	cluster_dir = None
    qname = "all.q"
    delay = 10
    values = []
    source_to_md5 = {}
    md5_to_sources = {}
    #n_start, n_end = 500000, 640000 #15000 638952 
    increment = 500
    if run_mode == "run cluster":
	i = n_start
	while i < n_end+1:
	    experiment_count = get_number_of_jobs_in_queues()
	    while experiment_count > 60: 
		sleep(delay)
		experiment_count = get_number_of_jobs_in_queues()
	    #input_file = parameters.get("data_dir") + "/input/"
	    #if not os.path.exists(input_file):
	    #	continue
	    #out_file = [ word for word in open(input_file).readline().strip("\n").split() if word.endswith(".out") ][0]
	    #if os.path.exists(out_file):
	    #	continue
	    score_command = "-p %s/input/ -i %d -j %d" % (parameters.get("data_dir"), i, i + increment)
	    os.system("sbatch -x node30 run_proximity.sh %s" % score_command)  
	    i += increment
	return
    for source, geneids_source in source_to_geneids.iteritems(): 
	#print source, len(geneids_source)
	if convert_names:
	    source = text_utilities.convert_to_R_string(source)
	if md5_conversion:
	    md5 = hashlib.md5("-".join(sorted(geneids_source))).hexdigest()
	    source_to_md5[source] = (md5, geneids_source)
	    if md5 in md5_to_sources:
		md5_to_sources.setdefault(md5, set()).add(source)
		continue
	    else:
		md5_to_sources.setdefault(md5, set()).add(source)
	    source = md5
	for target, geneids_target in target_to_geneids.iteritems():
	    #print target, len(geneids_target)
	    if convert_names:
		target = text_utilities.convert_to_R_string(target)
	    out_file = parameters.get("output_dir") + "/%s_%s.out" % (source, target) 
	    if cluster_dir is not None:
		out_file = out_file.replace(parameters.get("base_dir"), cluster_dir) 

	    if os.path.exists(out_file):
	    	continue
	    score_command = ' -x %d -m %d -n %d -e "%s" -o "%s" -s "%s" -t "%s"' % (seed, min_bin_size, n_random, network_file, out_file, ",".join(geneids_source), ",".join(geneids_target))
	    if run_mode != "array job" and run_mode != "run cluster":
		score_command = executable_path + score_command
	    if run_mode == "array job":
		print "%s" % (score_command.replace('"', ''))
		values.append(score_command.replace('"', ''))
	    elif run_mode == "single job":
		print "qsub -cwd -S /bin/bash -o out -e err -v PATH=$PATH -v PYTHONPATH=$PYTHONPATH -q %s -N %s_%s -b y %s" % (qname, source[:3], target[:3], score_command)
	    elif run_mode == "run local":
		print "%s" % score_command
		os.system(score_command)
	    elif run_mode == "run cluster":
		experiment_count = get_number_of_jobs_in_queues()
		while experiment_count > 60: 
		    sleep(delay)
		    experiment_count = get_number_of_jobs_in_queues()
		#print score_command
		#os.system("qsub -cwd -S /bin/bash -o out -e err -v PATH=$PATH -v PYTHONPATH=$PYTHONPATH -q %s -N %s_%s -b y %s" % (qname, source[:3], target[:3], score_command))
		#os.system("sbatch -x node30 run_proximity.sh -f ../data/input/%i.txt" % i)
		os.system("sbatch -x node30 run_proximity.sh %s" % score_command)  
	    else:
		raise ValueError("Unknown run_mode: %s" % run_mode)
    n = 0
    for md5, sources in md5_to_sources.iteritems():
	if len(sources) > 1:
	    n += len(sources) - 1
	    print len(sources)
	    for source in sources:
		val, targets = source_to_md5[source]
		for source2 in sources:
		    val, targets2 = source_to_md5[source2]
		    if targets != targets2:
			print targets, targets2
    print len(source_to_geneids), n, len(md5_to_sources)
    return values