import os
import sys
from sys import argv
from common import Params


def main():
    dir_path = os.path.dirname(os.path.realpath(__file__))
    config = Params(argv[1])
    scheduler = argv[2]
    collective_list = config.getStrlst("collectives")
    omb_path = config.getStr("omb_collective_directory")
    imb_bin = config.getStr("imb_binary")
    num_rank_list = config.getIntlst("number_of_ranks")
    max_num_node = config.getInt("max_num_node")
    num_core_per_node = config.getInt("number_of_cores_per_node")
    num_run = config.getInt("number_of_runs_per_test")

    job_directory = dir_path + "/collective_jobs"
    for collective in collective_list:
        params = Params(job_directory + "/" + collective + ".job")

        if not os.path.exists(dir_path + "/output"):
            os.makedirs(dir_path + "/output")
        if not os.path.exists(dir_path + "/output/" + collective):
            os.makedirs(dir_path + "/output/" + collective)

        num_alg = params.getInt("number_of_algorithms")
        exclude_alg = params.getIntlst("exclude_algorithms")
        two_proc_alg = -1
        try:
            two_proc_alg = params.getInt("two_proc_alg")
        except Exception as e:
            print "No two proc algorithm for " + collective

        f = open(dir_path + "/output/" + collective + "/" + collective + "_coltune.sh", "w")

        # Scheduler-specific job header.
        print >> f, "#!/bin/sh"
        print >> f, "#"
        if scheduler == "slurm":
            print >> f, "#SBATCH --job-name=" + collective
            print >> f, "#SBATCH --output=res.txt"
            print >> f, "#"
            print >> f, "#SBATCH --ntasks-per-node=" + str(num_core_per_node)
            print >> f, "#SBATCH --time=1000:00:00"
            print >> f, "#SBATCH --nodes=" + str(max_num_node)
        elif scheduler == "sge":
            print >> f, "#$ -j y"
            print >> f, "#$ -pe mpi %d" % (max_num_node * num_core_per_node)
            print >> f, "#"
            print >> f, "#$ -cwd"
            print >> f, "#"
            print >> f, "echo Got $NSLOTS processors."
        else:
            print "Unknown scheduler. Aborting..."
            sys.exit()
        print >> f, ""

        # One mpirun line per (algorithm, rank count, run) combination.
        # imb_collectives is assumed to be a module-level list (defined elsewhere in
        # this script) of the collectives that are benchmarked with the IMB binary.
        for num_rank in num_rank_list:
            for alg in range(num_alg + 1):
                if alg in exclude_alg or (alg == two_proc_alg and num_rank > 2):
                    continue
                print >> f, "# ", alg, num_rank, "ranks"
                for run_id in xrange(num_run):
                    if collective in imb_collectives:
                        prg_name = imb_bin + " -npmin %d %s " % (num_rank, collective)
                    else:
                        prg_name = omb_path + "/osu_" + collective
                    cmd = "mpirun --np %d " % (num_rank)
                    cmd += "--mca coll_tuned_use_dynamic_rules 1 --mca coll_tuned_" \
                        + collective + "_algorithm " + str(alg)
                    cmd += " " + prg_name
                    cmd += " >& " + dir_path + "/output/" + collective + "/" \
                        + str(alg) + "_" + str(num_rank) + "ranks" + "_run" \
                        + str(run_id) + ".out"
                    print >> f, cmd
                print >> f, ""
        f.close()
        print "Job script written to " + collective + "_coltune.sh successfully!"
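
# Illustrative invocation of main() (a sketch; the file name "gen_jobs.py" is an
# assumption, not taken from this module):
#
#     python gen_jobs.py tuning.cfg slurm
#
# argv[1] names the Params configuration file and argv[2] selects the scheduler
# ("slurm" or "sge"). For each collective listed in the config, main() writes
# output/<collective>/<collective>_coltune.sh, which can then be submitted with
# sbatch (SLURM) or qsub (SGE).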
def writeDecision(config, dir_path, outfil):
    collective_list = config.getStrlst("collectives")
    num_rank_list = config.getIntlst("number_of_ranks")
    num_run = config.getInt("number_of_runs_per_test")
    num_coll = len(collective_list)
    output_dir = dir_path + "/output"
    job_dir = dir_path + "/collective_jobs"

    f = open(outfil, "w")
    print >> f, "%-10s" % num_coll, "# Number of collectives"

    for collective in collective_list:
        if not os.path.exists(dir_path + "/output/" + collective):
            print "Collective " + collective + " output not detected. Exiting."
            return

        params = Params(job_dir + "/" + collective + ".job")
        num_alg = params.getInt("number_of_algorithms")
        exclude_alg = params.getIntlst("exclude_algorithms")
        two_proc_alg = -1
        try:
            two_proc_alg = params.getInt("two_proc_alg")
        except Exception as e:
            print "No two proc algorithm for " + collective

        raw_dir = dir_path + "/output/" + collective

        # Collect per-rank-count results. NumRankResult, writeResult, writeDetail
        # and coll_id_from_name are assumed to be defined elsewhere in this module
        # (or imported from common); they are not part of this section.
        coll_result = {}
        for num_rank in num_rank_list:
            coll_result[num_rank] = NumRankResult(config, num_alg, exclude_alg,
                                                  two_proc_alg, raw_dir, num_rank,
                                                  collective)
        writeResult(num_rank_list, coll_result, raw_dir + "/best.out")
        print "Result written for " + collective + " to " + collective + "/best.out"

        print >> f, "%-10s" % coll_id_from_name(collective), "# Collective ID for", collective
        com_sizes = len(num_rank_list)
        print >> f, "%-10s" % com_sizes, "# Number of com sizes"

        for num_rank in num_rank_list:
            nod_result = coll_result[num_rank]
            print >> f, "%-10s" % num_rank, "# Com size"
            best = Params(output_dir + "/" + collective + "/best.out")
            best_alg = 0
            # Open MPI requires the rules for each com size to start at message
            # size 0; the default first rule is "0 0 0 0\n". If the measured data
            # already starts at message size 0 (e.g. barrier, or collectives
            # benchmarked by IMB), that default line is replaced by the measured
            # rules built below.
            if nod_result.msgsizlst()[0] == 0:
                num_sizes = 0
                size_output = ""
            else:
                num_sizes = 1
                size_output = "0 0 0 0\n"
            for i, msg_siz in enumerate(nod_result.msgsizlst()):
                new_alg = nod_result.selectAlg()[i]
                if new_alg == best_alg:
                    continue
                best_alg = new_alg
                num_sizes += 1
                size_output += str(msg_siz)
                size_output += " " + str(best_alg)
                size_output += " 0"
                size_output += " 0\n"
            print >> f, "%-10s" % num_sizes, "# Number of msg sizes"
            print >> f, size_output,

        writeDetail(params, coll_result, raw_dir + "/detail.out", num_alg,
                    exclude_alg, two_proc_alg, num_run, num_rank_list)
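
# Sketch of the decision file that writeDecision() emits (values are illustrative
# only; real IDs come from coll_id_from_name and real sizes from the measurements):
#
#     1          # Number of collectives
#     <id>       # Collective ID for <collective>
#     2          # Number of com sizes
#     16         # Com size
#     1          # Number of msg sizes
#     0 0 0 0
#     32         # Com size
#     2          # Number of msg sizes
#     0 0 0 0
#     8192 4 0 0
#
# Each rule row is "<msg_size> <algorithm> 0 0"; the two trailing fields are extra
# rule values that this script always leaves at 0.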