def main(file_list, jar, xml, db, out, queue, walltime, engine, num_jobs, vmem, log_level, port, local):
    """
    Specify the path to a .json file as created by the fetch_runs.py script
    via the FILE_LIST argument. num_jobs will be created and executed on the
    cluster.
    """
    level = logging.INFO
    if log_level == "DEBUG":
        level = logging.DEBUG
    elif log_level == "WARN":
        level = logging.WARN
    elif log_level == "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=level,
    )

    df = pd.read_json(file_list)
    logger.info("Read {} runs from .json file".format(len(df)))

    # get data files
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    db_path = path.abspath(db)
    outpath = path.abspath(out)
    output_directory = path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output and temporary data to {}".format(output_directory))

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df,
                         engine, queue, vmem, num_jobs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out, df)
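# A minimal sketch of what the `make_jobs` helper used above could look like:
# the run list is split into `num_jobs` chunks and each chunk is wrapped in a
# gridmap Job. The worker function `run_fact_tools` is hypothetical and the
# exact keyword set is an assumption (the real erna helper may also forward
# engine/walltime options); only `gridmap.Job` itself and `numpy` imported as
# `np` are real dependencies assumed here.
def make_jobs(jarpath, xmlpath, db_path, output_directory, df, engine, queue,
              vmem, num_jobs, walltime):
    jobs = []
    for num, df_chunk in enumerate(np.array_split(df, num_jobs)):
        job = gridmap.Job(
            run_fact_tools,  # hypothetical per-chunk worker
            [jarpath, xmlpath, df_chunk, db_path, output_directory, num],
            queue=queue,
            mem_free='{}mb'.format(vmem),
            name='fact_tools_{}'.format(num),
        )
        jobs.append(job)
    return jobs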
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue,
         walltime, engine, num_runs, vmem, log_level, port, source, conditions,
         max_delta_t, local, password):
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data to {}".format(out))

    factdb = sqlalchemy.create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_runs = erna.load(earliest_night, latest_night, data_dir,
                        source_name=source,
                        timedelta_in_minutes=max_delta_t,
                        factdb=factdb,
                        data_conditions=data_conditions)

    logger.info("Would process {} jobs with {} runs per job".format(
        len(df_runs) // num_runs, num_runs))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df_runs,
                         engine, queue, vmem, num_runs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
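# `dcc.conditions` above is a module-level mapping from the name of a data
# check selection to the list of cuts applied when picking runs from the FACT
# run database. A minimal sketch of the shape of that mapping; the key and the
# cut strings are purely illustrative placeholders, not erna's actual values.
conditions = {
    'standard': [
        'fZenithDistanceMax < 30',
        'fMoonZenithDistance > 100',
        'fTriggerRateMedian > 40',
    ],
}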
def check_process_jobs(wait_sec, local):
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec), (16, wait_sec)]
    expected = list(map(compute_factorial, inputs))
    function_jobs = make_jobs(inputs, compute_factorial)
    outputs = process_jobs(function_jobs, quiet=False, local=local)
    eq_(expected, outputs)
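# The test above relies on helpers defined elsewhere in the test module. A
# minimal sketch of what they might look like (the exact bodies are
# assumptions; only Job and process_jobs are the real gridmap API):
from math import factorial
from time import sleep

from gridmap import Job


def compute_factorial(args):
    # Each input is a (number, seconds) tuple so every job burns some wall time.
    number, wait_sec = args
    sleep(wait_sec)
    return factorial(number)


def make_jobs(inputs, function):
    # One Job per input tuple; each Job calls function(args) on a worker.
    return [Job(function, [args], mem_free='100M') for args in inputs]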
def main():
    """
    run a set of jobs on cluster
    """
    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

    print("=====================================")
    print("======== Submit and Wait ========")
    print("=====================================")
    print("")

    functionJobs = make_jobs()

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(functionJobs, max_processes=4)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
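# `make_jobs()` above takes no arguments, so the example's jobs are presumably
# hard-coded in the script. A minimal sketch under that assumption (the
# `sleep_walk` payload is hypothetical; Job is the real gridmap class):
from time import sleep

from gridmap import Job


def sleep_walk(sec):
    # Trivial payload: sleep on the worker node, then report back.
    sleep(sec)
    return 'slept for {} seconds'.format(sec)


def make_jobs():
    # One Job per sleep duration.
    return [Job(sleep_walk, [sec], mem_free='100M') for sec in (5, 10, 15, 20)]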
def check_idle_parent_process(wait_sec):
    '''
    Make sure that we don't kill idle parents that have active children.
    '''
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec), (16, wait_sec)]
    expected = list(map(compute_factorial, inputs))
    outputs = process_jobs([Job(pool_map_factorial, [inputs])], quiet=False)[0]
    eq_(expected, outputs)
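# `pool_map_factorial` above is another helper from the test module. A minimal
# sketch, assuming it fans the inputs out over a local multiprocessing pool
# inside a single grid job, so the parent grid process sits idle while its
# children do the work (which is exactly what this test exercises):
from multiprocessing import Pool


def pool_map_factorial(inputs):
    pool = Pool(processes=4)
    try:
        return pool.map(compute_factorial, inputs)
    finally:
        pool.close()
        pool.join()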
def RunPerPool(vcfFile, id, sampledir, count, args):
    """This will run the pool to be analyzed.

    :param str vcfFile: str of vcf file name
    :param str id: str of sample id
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None
    """
    jobs = []
    if(os.path.isfile(vcfFile)):
        jobId = "run_ppg_" + str(count) + "_" + str(id)
        cmdList = []
        cmd = args.python + " " + args.ppg + " " + vcfFile + " " + sampledir + " -s " + id + " --iAnnotateSV " + args.ias + " --genome hg19"
        # cmd = str(cmd)
        threads = int(args.threads)
        threads = threads + 1
        qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
            " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + sampledir + " -sync y " + " -b y " + cmd
        print "qsub_cmd:", qsub_cmd, "\n"
        cmdList.append(qsub_cmd)
        job = Job(
            RunJob,
            cmdList,
            kwlist=None,
            cleanup=True,
            mem_free="2G",
            name=jobId,
            num_slots=1,
            queue=args.queue)
        jobs.append(job)
    print("sending function jobs to cluster")
    print("")
    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)
    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
    return
def RunPerPool(vcfFile, id, sampledir, count, args):
    """
    This will run the pool to be analyzed.

    :param str vcfFile: str of vcf file name
    :param str id: str of sample id
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None
    """
    jobs = []
    if (os.path.isfile(vcfFile)):
        jobId = "run_dhs_" + str(count) + "_" + str(id)
        cmdList = []
        cmd = args.python + " " + args.dhs + " " + vcfFile + " " + sampledir + \
            " --iAnnotateSV " + args.ias + " --genome hg19" + " -hsl " + args.hotspotFile + " -bl " + args.blackListGenes + " -kgl " + args.genesToKeep
        # cmd = str(cmd)
        threads = int(args.threads)
        threads = threads + 1
        qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
            " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + sampledir + " -sync y " + " -b y " + cmd
        print "qsub_cmd:", qsub_cmd, "\n"
        cmdList.append(qsub_cmd)
        job = Job(RunJob, cmdList, kwlist=None, cleanup=True, mem_free="2G",
                  name=jobId, num_slots=1, queue=args.queue)
        jobs.append(job)
    print("sending function jobs to cluster")
    print("")
    job_outputs = process_jobs(jobs, max_processes=10,
                               temp_dir='/dmp/analysis/SCRATCH/',
                               white_list=None, quiet=False, local=False)
    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
    return
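# `RunJob` is the callable handed to each gridmap Job in the wrappers above,
# but it is defined elsewhere in those modules. A minimal sketch under the
# assumption that it simply shells out the pre-built qsub command string and
# returns its exit status (name and behaviour are assumptions; subprocess is
# standard library):
import subprocess


def RunJob(cmd):
    # Because the qsub command is submitted with "-sync y", this call blocks
    # until the underlying cluster job finishes.
    returncode = subprocess.call(cmd, shell=True)
    return returncode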
def main(jar, xml, out, mc_path, queue, walltime, engine, num_jobs, vmem,
         log_level, port, local):
    '''
    Script to execute fact-tools on MonteCarlo files. Use the MC_PATH argument
    to specify the folders containing the MC.
    '''
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    erna.ensure_output(out)
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    drspath = erna.mc_drs_file()
    logger.info('Using drs file at {}'.format(drspath))

    # get data files
    files = []
    for folder in tqdm(mc_path):
        # print("Entering folder {}".format(folder))
        pattern = path.join(folder, '**/*_Events.fit*')
        f = glob.glob(pattern, recursive=True)
        files = files + f

    num_files = len(files)
    logger.info("Found {} files.".format(num_files))
    if num_files == 1:
        logger.error("Need more than one file to work with.")
        return
    if num_jobs > num_files:
        logger.error("You specified more jobs than files. This doesn't make sense.")
        return

    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    mc_paths_array = np.array(files)
    drs_paths_array = np.repeat(np.array(drspath), len(mc_paths_array))

    job_list = make_jobs(jarpath, xmlpath, mc_paths_array, drs_paths_array,
                         engine, queue, vmem, num_jobs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out)
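# Note on the glob pattern used above: the '**' wildcard only recurses into
# subdirectories when glob is called with recursive=True (Python 3.5+).
# Illustrative example with a placeholder directory:
import glob

mc_files = glob.glob('/data/mc/**/*_Events.fit*', recursive=True)
print(len(mc_files), "Events files found at any depth below /data/mc")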
def main():
    """
    run a set of jobs on cluster
    """
    print("=====================================")
    print("======== Submit and Wait ========")
    print("=====================================")
    print("")

    functionJobs = make_jobs()

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(functionJobs)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
def main():
    """
    run a set of jobs on cluster
    """
    args = parser.parse_args()
    engine = args.engine
    queue = args.queue
    vmem = args.vmem
    port = args.port
    local = args.local
    level = args.logging

    if level == 'DEBUG':
        level = logging.DEBUG
    elif level == 'WARN':
        level = logging.WARN
    elif level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    print("=====================================")
    print("======== Submit and Wait ========")
    print("=====================================\n")

    functionJobs = make_jobs(engine, queue, vmem)

    if local:
        print('Running jobs locally')
    else:
        print("Sending function jobs to cluster engine: {}. Into queue: {} \n".format(engine, queue))

    job_outputs = process_jobs(functionJobs, max_processes=4, port=port, local=local)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
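# The entry point above reads its options from a module-level `parser` defined
# elsewhere. A minimal sketch of an argparse setup that would provide the
# attributes used above (the flag names and defaults are assumptions):
import argparse

parser = argparse.ArgumentParser(description='Run a set of jobs on the cluster.')
parser.add_argument('--engine', default='SGE', help='DRMAA engine to submit to')
parser.add_argument('--queue', default='all.q', help='queue to submit jobs into')
parser.add_argument('--vmem', default='2G', help='memory request per job')
parser.add_argument('--port', type=int, default=None, help='port for the job monitor')
parser.add_argument('--local', action='store_true', help='run jobs locally instead')
parser.add_argument('--logging', default='INFO', choices=['DEBUG', 'INFO', 'WARN'],
                    help='log level name')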
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, args):
    """This will run the pool to be analyzed.

    :param str titleFile: str of meta information file
    :param str outdir: str of output directory
    :param list HSmetricsFileList: list of picard hsmetrics files
    :param list bamFileList: list of bam files
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None
    """
    # Run Preprocess
    titleFileDF = pd.read_csv(titleFile, sep='\t', header=0, keep_default_na='True')
    groupByPatientId = titleFileDF.groupby('Patient_ID')
    baseNames = {}
    jobs = []
    poolidRegXcompile = re.compile('.*[PoolNormal|PooledNormal].*')
    poolHsmetricsFile = filter(poolidRegXcompile.match, HSmetricsFileList).pop()
    poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop()
    for patientID, group in groupByPatientId:
        print patientID, ":"
        tsampleId = ''
        tBamFile = ''
        nBamFile = ''
        basename = ''
        nsampleId = ''
        if(os.path.isdir(outdir)):
            if(args.verbose):
                print "Pool Output Dir:", outdir, "exists!!!"
        else:
            os.mkdir(outdir)
            os.chmod(outdir, 0o755)
        for count, row in group.iterrows():
            sampleId = row.loc['Sample_ID']
            patientId = row.loc['Patient_ID']
            sampleClass = row.loc['Class']
            idRegXcompile = re.compile('.*' + sampleId + '.*')
            if(sampleClass == "Tumor"):
                basename = sampleId
                tBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(tBamFile, os.path.join(outdir, os.path.basename(tBamFile)))
                tBamFile = os.path.join(outdir, os.path.basename(tBamFile))
                tsampleId = sampleId
            if(sampleClass == "Normal"):
                nBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(nBamFile, os.path.join(outdir, os.path.basename(nBamFile)))
                nBamFile = os.path.join(outdir, os.path.basename(nBamFile))
                nsampleId = sampleId
                nHSmetricsFile = filter(idRegXcompile.match, HSmetricsFileList).pop()
                decision = SelectNormal(nHSmetricsFile, poolHsmetricsFile)
                if(decision == 'UnMatched'):
                    nBamFile = poolbamFile
                else:
                    if(args.verbose):
                        print "Matched Sample\n"
        if(os.path.isfile(tBamFile) and (os.path.isfile(nBamFile))):
            jobId = "iCallSV_" + str(count) + "_" + str(basename)
            cmdList = []
            cmd = args.python + " " + args.icsv + " -sc " + args.conf + " -bbam " + nBamFile + " -abam " + \
                tBamFile + " -aId " + tsampleId + " -bId " + nsampleId + " -op " + tsampleId + " -o " + outdir + " -v"
            # cmd = str(cmd)
            threads = int(args.threads)
            threads = threads + 1
            qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
                " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + outdir + " -sync y " + " -b y " + cmd
            print "qsub_cmd:", qsub_cmd, "\n"
            cmdList.append(qsub_cmd)
            job = Job(
                RunJob,
                cmdList,
                kwlist=None,
                cleanup=True,
                mem_free="2G",
                name=jobId,
                num_slots=1,
                queue=args.queue)
            jobs.append(job)
    print("sending function jobs to cluster")
    print("")
    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)
    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
    return
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, args):
    """This will run the pool to be analyzed.

    :param str titleFile: str of meta information file
    :param str outdir: str of output directory
    :param list HSmetricsFileList: list of picard hsmetrics files
    :param list bamFileList: list of bam files
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None
    """
    # Run Preprocess
    titleFileDF = pd.read_csv(titleFile, sep='\t', header=0, keep_default_na='True')
    groupByPatientId = titleFileDF.groupby('Patient_ID')
    baseNames = {}
    jobs = []
    poolidRegXcompile = re.compile('.*[PoolNormal|PooledNormal].*')
    poolHsmetricsFile = filter(poolidRegXcompile.match, HSmetricsFileList).pop()
    poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop()
    for patientID, group in groupByPatientId:
        print patientID, ":"
        tsampleId = ''
        tBamFile = ''
        nBamFile = ''
        basename = ''
        nsampleId = ''
        if (os.path.isdir(outdir)):
            if (args.verbose):
                print "Pool Output Dir:", outdir, "exists!!!"
        else:
            os.mkdir(outdir)
            os.chmod(outdir, 0o755)
        for count, row in group.iterrows():
            sampleId = row.loc['Sample_ID']
            patientId = row.loc['Patient_ID']
            sampleClass = row.loc['Class']
            idRegXcompile = re.compile('.*' + sampleId + '.*')
            if (sampleClass == "Tumor"):
                basename = sampleId
                tBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(tBamFile, os.path.join(outdir, os.path.basename(tBamFile)))
                tBamFile = os.path.join(outdir, os.path.basename(tBamFile))
                tsampleId = sampleId
            if (sampleClass == "Normal"):
                nBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(nBamFile, os.path.join(outdir, os.path.basename(nBamFile)))
                nBamFile = os.path.join(outdir, os.path.basename(nBamFile))
                nsampleId = sampleId
                nHSmetricsFile = filter(idRegXcompile.match, HSmetricsFileList).pop()
                decision = SelectNormal(nHSmetricsFile, poolHsmetricsFile)
                if (decision == 'UnMatched'):
                    nBamFile = poolbamFile
                else:
                    if (args.verbose):
                        print "Matched Sample\n"
        if (os.path.isfile(tBamFile) and (os.path.isfile(nBamFile))):
            jobId = "iCallSV_" + str(count) + "_" + str(basename)
            cmdList = []
            cmd = args.python + " " + args.icsv + " -sc " + args.conf + " -bbam " + nBamFile + " -abam " + \
                tBamFile + " -aId " + tsampleId + " -bId " + nsampleId + " -op " + tsampleId + " -o " + outdir + " -v"
            # cmd = str(cmd)
            threads = int(args.threads)
            threads = threads + 1
            qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
                " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + outdir + " -sync y " + " -b y " + cmd
            print "qsub_cmd:", qsub_cmd, "\n"
            cmdList.append(qsub_cmd)
            job = Job(RunJob, cmdList, kwlist=None, cleanup=True, mem_free="2G",
                      name=jobId, num_slots=1, queue=args.queue)
            jobs.append(job)
    print("sending function jobs to cluster")
    print("")
    job_outputs = process_jobs(jobs, max_processes=10,
                               temp_dir='/dmp/analysis/SCRATCH/',
                               white_list=None, quiet=False, local=False)
    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
    return
def run_configuration(config_file, local=False, overwrite=True, queue='all.q', hosts=None, write_summary=True, quiet=False, ablation=0, resume=False, log_level=logging.INFO): """ Takes a configuration file and runs the specified jobs on the grid. Parameters ---------- config_file : str Path to the configuration file we would like to use. local : bool, optional Should this be run locally instead of on the cluster? Defaults to ``False``. overwrite : bool, optional If the model files already exist, should we overwrite them instead of re-using them? Defaults to ``True``. queue : str, optional The DRMAA queue to use if we're running on the cluster. Defaults to ``'all.q'``. hosts : list of str, optional If running on the cluster, these are the machines we should use. Defaults to ``None``. write_summary : bool, optional Write a TSV file with a summary of the results. Defaults to ``True``. quiet : bool, optional Suppress printing of "Loading..." messages. Defaults to ``False``. ablation : int, optional Number of features to remove when doing an ablation experiment. If positive, we will perform repeated ablation runs for all combinations of features removing the specified number at a time. If ``None``, we will use all combinations of all lengths. If 0, the default, no ablation is performed. If negative, a ``ValueError`` is raised. Defaults to 0. resume : bool, optional If result files already exist for an experiment, do not overwrite them. This is very useful when doing a large ablation experiment and part of it crashes. Defaults to ``False``. log_level : str, optional The level for logging messages. Defaults to ``logging.INFO``. Returns ------- result_json_paths : list of str A list of paths to .json results files for each variation in the experiment. Raises ------ ValueError If value for ``"ablation"`` is not a positive int or ``None``. OSError If the lenth of the ``FeatureSet`` name > 210. """ try: # Read configuration (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher, hasher_features, id_col, label_col, train_set_name, test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objectives, probability, pipeline, results_path, pos_label_str, feature_scaling, min_feature_count, folds_file, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, save_cv_models, use_folds_file_for_grid_search, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path, custom_metric_path, learning_curve_cv_folds_list, learning_curve_train_sizes, output_metrics) = parse_config_file(config_file, log_level=log_level) # get the main experiment logger that will already have been # created by the configuration parser so we don't need anything # except the name `experiment`. logger = get_skll_logger('experiment') # Check if we have gridmap if not local and not _HAVE_GRIDMAP: local = True logger.warning('gridmap 0.10.1+ not available. Forcing local ' 'mode. To run things on a DRMAA-compatible ' 'cluster, install gridmap>=0.10.1 via pip.') # No grid search or ablation for learning curve generation if task == 'learning_curve': if ablation is None or ablation > 0: ablation = 0 logger.warning("Ablating features is not supported during " "learning curve generation. 
Ignoring.") # if we just had a train file and a test file, there are no real featuresets # in which case there are no features to ablate if len(featuresets) == 1 and len(featuresets[0]) == 1: if ablation is None or ablation > 0: ablation = 0 logger.warning( "Not enough featuresets for ablation. Ignoring.") # if performing ablation, expand featuresets to include combinations of # features within those sets if ablation is None or ablation > 0: # Make new feature set lists so that we can iterate without issue expanded_fs = [] expanded_fs_names = [] for features, featureset_name in zip(featuresets, featureset_names): features = sorted(features) featureset = set(features) # Expand to all feature combinations if ablation is None if ablation is None: for i in range(1, len(features)): for excluded_features in combinations(features, i): expanded_fs.append( sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Otherwise, just expand removing the specified number at a time else: for excluded_features in combinations(features, ablation): expanded_fs.append( sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Also add version with nothing removed as baseline expanded_fs.append(features) expanded_fs_names.append(featureset_name + '_all') # Replace original feature set lists featuresets = expanded_fs featureset_names = expanded_fs_names elif ablation < 0: raise ValueError('Value for "ablation" argument must be either ' 'positive integer or None.') # the list of jobs submitted (if running on grid) if not local: jobs = [] # the list to hold the paths to all the result json files result_json_paths = [] # check if the length of the featureset_name exceeds the maximum length # allowed for featureset_name in featureset_names: if len(featureset_name) > 210: raise OSError( 'System generated file length "{}" exceeds the ' 'maximum length supported. Please specify names of ' 'your datasets with "featureset_names". If you are ' 'running ablation experiment, please reduce the ' 'length of the features in "featuresets" because the' ' auto-generated name would be longer than the file ' 'system can handle'.format(featureset_name)) # if the task is learning curve, and ``metrics`` was specified, then # assign the value of ``metrics`` to ``grid_objectives`` - this lets # us piggyback on the parallelization of the objectives that is already # set up for us to use if task == 'learning_curve' and len(output_metrics) > 0: grid_objectives = output_metrics # if there were no grid objectives provided, just set it to # a list containing a single None so as to allow the parallelization # to proceeed and to pass the correct default value of grid_objective # down to _classify_featureset(). 
if not grid_objectives: grid_objectives = [None] # Run each featureset-learner-objective combination for featureset, featureset_name in zip(featuresets, featureset_names): for learner_num, learner_name in enumerate(learners): for grid_objective in grid_objectives: # for the individual job name, we need to add the feature set name # and the learner name if grid_objective is None or len(grid_objectives) == 1: job_name_components = [ experiment_name, featureset_name, learner_name ] else: job_name_components = [ experiment_name, featureset_name, learner_name, grid_objective ] job_name = '_'.join(job_name_components) # change the prediction prefix to include the feature set prediction_prefix = join(prediction_dir, job_name) # the log file that stores the actual output of this script (e.g., # the tuned parameters, what kind of experiment was run, etc.) logfile = join(log_path, '{}.log'.format(job_name)) # Figure out result json file path result_json_path = join(results_path, '{}.results.json'.format(job_name)) # save the path to the results json file that will be written result_json_paths.append(result_json_path) # If result file already exists and we're resuming, move on if resume and (exists(result_json_path) and getsize(result_json_path)): logger.info( 'Running in resume mode and %s exists, ' 'so skipping job.', result_json_path) continue # create job if we're doing things on the grid job_args = {} job_args["experiment_name"] = experiment_name job_args["task"] = task job_args["sampler"] = sampler job_args["feature_hasher"] = feature_hasher job_args["hasher_features"] = hasher_features job_args["job_name"] = job_name job_args["featureset"] = featureset job_args["featureset_name"] = featureset_name job_args["learner_name"] = learner_name job_args["train_path"] = train_path job_args["test_path"] = test_path job_args["train_set_name"] = train_set_name job_args["test_set_name"] = test_set_name job_args["shuffle"] = do_shuffle job_args["model_path"] = model_path job_args["prediction_prefix"] = prediction_prefix job_args["grid_search"] = do_grid_search job_args["grid_objective"] = grid_objective job_args['output_metrics'] = output_metrics job_args["suffix"] = suffix job_args["log_file"] = logfile job_args["log_level"] = log_level job_args["probability"] = probability job_args["pipeline"] = pipeline job_args["results_path"] = results_path job_args["sampler_parameters"] = ( fixed_sampler_parameters if fixed_sampler_parameters else dict()) job_args["fixed_parameters"] = ( fixed_parameter_list[learner_num] if fixed_parameter_list else dict()) job_args["param_grid"] = (param_grid_list[learner_num] if param_grid_list else None) job_args["pos_label_str"] = pos_label_str job_args["overwrite"] = overwrite job_args["feature_scaling"] = feature_scaling job_args["min_feature_count"] = min_feature_count job_args["grid_search_jobs"] = grid_search_jobs job_args["grid_search_folds"] = grid_search_folds job_args["folds_file"] = folds_file job_args["cv_folds"] = cv_folds job_args["save_cv_folds"] = save_cv_folds job_args["save_cv_models"] = save_cv_models job_args[ "use_folds_file_for_grid_search"] = use_folds_file_for_grid_search job_args["do_stratified_folds"] = do_stratified_folds job_args["label_col"] = label_col job_args["id_col"] = id_col job_args["ids_to_floats"] = ids_to_floats job_args["quiet"] = quiet job_args["class_map"] = class_map job_args["custom_learner_path"] = custom_learner_path job_args["custom_metric_path"] = custom_metric_path job_args[ "learning_curve_cv_folds"] = learning_curve_cv_folds_list[ 
learner_num] job_args[ "learning_curve_train_sizes"] = learning_curve_train_sizes if not local: jobs.append( Job(_classify_featureset, [job_args], num_slots=(MAX_CONCURRENT_PROCESSES if (do_grid_search or task == 'learning_curve') else 1), name=job_name, queue=queue)) else: _classify_featureset(job_args) # Call get_skll_logger again after _classify_featureset # calls are finished so that any warnings that may # happen after this point get correctly logged to the # main logger logger = get_skll_logger('experiment') # submit the jobs (if running on grid) if not local and _HAVE_GRIDMAP: if log_path: job_results = process_jobs(jobs, white_list=hosts, temp_dir=log_path) else: job_results = process_jobs(jobs, white_list=hosts) _check_job_results(job_results) # write out the summary results file if (task == 'cross_validate' or task == 'evaluate') and write_summary: summary_file_name = experiment_name + '_summary.tsv' with open(join(results_path, summary_file_name), 'w', newline='') as output_file: _write_summary_file(result_json_paths, output_file, ablation=ablation) elif task == 'learning_curve': output_file_name = experiment_name + '_summary.tsv' output_file_path = join(results_path, output_file_name) with open(output_file_path, 'w', newline='') as output_file: _write_learning_curve_file(result_json_paths, output_file) # generate the actual plot if we have the requirements installed generate_learning_curve_plots(experiment_name, results_path, output_file_path) finally: # Close/remove any logger handlers close_and_remove_logger_handlers(get_skll_logger('experiment')) return result_json_paths
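# For reference, run_configuration is importable and can be driven from Python
# rather than through the SKLL `run_experiment` command line. A minimal usage
# sketch; the configuration file name is a placeholder:
from skll.experiments import run_configuration

# Run everything locally and collect the per-job result JSON paths.
result_json_paths = run_configuration('evaluate.cfg', local=True, quiet=True)
for json_path in result_json_paths:
    print(json_path)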
def run_configuration(config_file, local=False, overwrite=True, queue='all.q', hosts=None, write_summary=True, quiet=False, ablation=0, resume=False): """ Takes a configuration file and runs the specified jobs on the grid. :param config_path: Path to the configuration file we would like to use. :type config_path: str :param local: Should this be run locally instead of on the cluster? :type local: bool :param overwrite: If the model files already exist, should we overwrite them instead of re-using them? :type overwrite: bool :param queue: The DRMAA queue to use if we're running on the cluster. :type queue: str :param hosts: If running on the cluster, these are the machines we should use. :type hosts: list of str :param write_summary: Write a tsv file with a summary of the results. :type write_summary: bool :param quiet: Suppress printing of "Loading..." messages. :type quiet: bool :param ablation: Number of features to remove when doing an ablation experiment. If positive, we will perform repeated ablation runs for all combinations of features removing the specified number at a time. If ``None``, we will use all combinations of all lengths. If 0, the default, no ablation is performed. If negative, a ``ValueError`` is raised. :type ablation: int or None :param resume: If result files already exist for an experiment, do not overwrite them. This is very useful when doing a large ablation experiment and part of it crashes. :type resume: bool :return: A list of paths to .json results files for each variation in the experiment. :rtype: list of str """ # Initialize logger logger = logging.getLogger(__name__) # Read configuration (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher, hasher_features, id_col, label_col, train_set_name, test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objective, probability, results_path, pos_label_str, feature_scaling, min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path) = _parse_config_file(config_file) # Check if we have gridmap if not local and not _HAVE_GRIDMAP: local = True logger.warning('gridmap 0.10.1+ not available. Forcing local ' 'mode. 
To run things on a DRMAA-compatible ' 'cluster, install gridmap>=0.10.1 via pip.') # if performing ablation, expand featuresets to include combinations of # features within those sets if ablation is None or ablation > 0: # Make new feature set lists so that we can iterate without issue expanded_fs = [] expanded_fs_names = [] for features, featureset_name in zip(featuresets, featureset_names): features = sorted(features) featureset = set(features) # Expand to all feature combinations if ablation is None if ablation is None: for i in range(1, len(features)): for excluded_features in combinations(features, i): expanded_fs.append(sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Otherwise, just expand removing the specified number at a time else: for excluded_features in combinations(features, ablation): expanded_fs.append(sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Also add version with nothing removed as baseline expanded_fs.append(features) expanded_fs_names.append(featureset_name + '_all') # Replace original feature set lists featuresets = expanded_fs featureset_names = expanded_fs_names elif ablation < 0: raise ValueError('Value for "ablation" argument must be either ' 'positive integer or None.') # the list of jobs submitted (if running on grid) if not local: jobs = [] # the list to hold the paths to all the result json files result_json_paths = [] # check if the length of the featureset_name exceeds the maximum length # allowed for featureset_name in featureset_names: if len(featureset_name) > 210: raise OSError('System generated file length "{}" exceeds the ' 'maximum length supported. Please specify names of ' 'your datasets with "featureset_names". If you are ' 'running ablation experiment, please reduce the ' 'length of the features in "featuresets" because the' ' auto-generated name would be longer than the file ' 'system can handle'.format(featureset_name)) # Run each featureset-learner combination for featureset, featureset_name in zip(featuresets, featureset_names): for learner_num, learner_name in enumerate(learners): # for the individual job name, we need to add the feature set name # and the learner name job_name_components = [experiment_name, featureset_name, learner_name] job_name = '_'.join(job_name_components) # change the prediction prefix to include the feature set prediction_prefix = join(prediction_dir, job_name) # the log file that stores the actual output of this script (e.g., # the tuned parameters, what kind of experiment was run, etc.) 
temp_logfile = join(log_path, '{}.log'.format(job_name)) # Figure out result json file path result_json_path = join(results_path, '{}.results.json'.format(job_name)) # save the path to the results json file that will be written result_json_paths.append(result_json_path) # If result file already exists and we're resuming, move on if resume and (exists(result_json_path) and os.path.getsize(result_json_path)): logger.info('Running in resume mode and %s exists, so skipping' ' job.', result_json_path) continue # create job if we're doing things on the grid job_args = {} job_args["experiment_name"] = experiment_name job_args["task"] = task job_args["sampler"] = sampler job_args["feature_hasher"] = feature_hasher job_args["hasher_features"] = hasher_features job_args["job_name"] = job_name job_args["featureset"] = featureset job_args["featureset_name"] = featureset_name job_args["learner_name"] = learner_name job_args["train_path"] = train_path job_args["test_path"] = test_path job_args["train_set_name"] = train_set_name job_args["test_set_name"] = test_set_name job_args["shuffle"] = do_shuffle job_args["model_path"] = model_path job_args["prediction_prefix"] = prediction_prefix job_args["grid_search"] = do_grid_search job_args["grid_objective"] = grid_objective job_args["suffix"] = suffix job_args["log_path"] = temp_logfile job_args["probability"] = probability job_args["results_path"] = results_path job_args["sampler_parameters"] = (fixed_sampler_parameters if fixed_sampler_parameters else dict()) job_args["fixed_parameters"] = (fixed_parameter_list[learner_num] if fixed_parameter_list else dict()) job_args["param_grid"] = (param_grid_list[learner_num] if param_grid_list else None) job_args["pos_label_str"] = pos_label_str job_args["overwrite"] = overwrite job_args["feature_scaling"] = feature_scaling job_args["min_feature_count"] = min_feature_count job_args["grid_search_jobs"] = grid_search_jobs job_args["grid_search_folds"] = grid_search_folds job_args["cv_folds"] = cv_folds job_args["do_stratified_folds"] = do_stratified_folds job_args["label_col"] = label_col job_args["id_col"] = id_col job_args["ids_to_floats"] = ids_to_floats job_args["quiet"] = quiet job_args["class_map"] = class_map job_args["custom_learner_path"] = custom_learner_path if not local: jobs.append(Job(_classify_featureset, [job_args], num_slots=(MAX_CONCURRENT_PROCESSES if do_grid_search else 1), name=job_name, queue=queue)) else: _classify_featureset(job_args) test_set_name = basename(test_path) # submit the jobs (if running on grid) if not local and _HAVE_GRIDMAP: if log_path: job_results = process_jobs(jobs, white_list=hosts, temp_dir=log_path) else: job_results = process_jobs(jobs, white_list=hosts) _check_job_results(job_results) # write out the summary results file if (task == 'cross_validate' or task == 'evaluate') and write_summary: summary_file_name = experiment_name + '_summary.tsv' file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(join(results_path, summary_file_name), file_mode) as output_file: _write_summary_file(result_json_paths, output_file, ablation=ablation) return result_json_paths
def run_configuration(config_file, local=False, overwrite=True, queue='all.q', hosts=None, write_summary=True, quiet=False, ablation=0, resume=False): """ Takes a configuration file and runs the specified jobs on the grid. :param config_path: Path to the configuration file we would like to use. :type config_path: str :param local: Should this be run locally instead of on the cluster? :type local: bool :param overwrite: If the model files already exist, should we overwrite them instead of re-using them? :type overwrite: bool :param queue: The DRMAA queue to use if we're running on the cluster. :type queue: str :param hosts: If running on the cluster, these are the machines we should use. :type hosts: list of str :param write_summary: Write a tsv file with a summary of the results. :type write_summary: bool :param quiet: Suppress printing of "Loading..." messages. :type quiet: bool :param ablation: Number of features to remove when doing an ablation experiment. If positive, we will perform repeated ablation runs for all combinations of features removing the specified number at a time. If ``None``, we will use all combinations of all lengths. If 0, the default, no ablation is performed. If negative, a ``ValueError`` is raised. :type ablation: int or None :param resume: If result files already exist for an experiment, do not overwrite them. This is very useful when doing a large ablation experiment and part of it crashes. :type resume: bool :return: A list of paths to .json results files for each variation in the experiment. :rtype: list of str """ # Initialize logger logger = logging.getLogger(__name__) # Read configuration (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher, hasher_features, id_col, label_col, train_set_name, test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objectives, probability, results_path, pos_label_str, feature_scaling, min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path) = _parse_config_file(config_file) # Check if we have gridmap if not local and not _HAVE_GRIDMAP: local = True logger.warning('gridmap 0.10.1+ not available. Forcing local ' 'mode. 
To run things on a DRMAA-compatible ' 'cluster, install gridmap>=0.10.1 via pip.') # if performing ablation, expand featuresets to include combinations of # features within those sets if ablation is None or ablation > 0: # Make new feature set lists so that we can iterate without issue expanded_fs = [] expanded_fs_names = [] for features, featureset_name in zip(featuresets, featureset_names): features = sorted(features) featureset = set(features) # Expand to all feature combinations if ablation is None if ablation is None: for i in range(1, len(features)): for excluded_features in combinations(features, i): expanded_fs.append(sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Otherwise, just expand removing the specified number at a time else: for excluded_features in combinations(features, ablation): expanded_fs.append(sorted(featureset - set(excluded_features))) expanded_fs_names.append( featureset_name + '_minus_' + _munge_featureset_name(excluded_features)) # Also add version with nothing removed as baseline expanded_fs.append(features) expanded_fs_names.append(featureset_name + '_all') # Replace original feature set lists featuresets = expanded_fs featureset_names = expanded_fs_names elif ablation < 0: raise ValueError('Value for "ablation" argument must be either ' 'positive integer or None.') # the list of jobs submitted (if running on grid) if not local: jobs = [] # the list to hold the paths to all the result json files result_json_paths = [] # check if the length of the featureset_name exceeds the maximum length # allowed for featureset_name in featureset_names: if len(featureset_name) > 210: raise OSError('System generated file length "{}" exceeds the ' 'maximum length supported. Please specify names of ' 'your datasets with "featureset_names". If you are ' 'running ablation experiment, please reduce the ' 'length of the features in "featuresets" because the' ' auto-generated name would be longer than the file ' 'system can handle'.format(featureset_name)) # Run each featureset-learner combination for featureset, featureset_name in zip(featuresets, featureset_names): for learner_num, learner_name in enumerate(learners): for grid_objective in grid_objectives: # for the individual job name, we need to add the feature set name # and the learner name if len(grid_objectives) == 1: job_name_components = [experiment_name, featureset_name, learner_name] else: job_name_components = [experiment_name, featureset_name, learner_name, grid_objective] job_name = '_'.join(job_name_components) # change the prediction prefix to include the feature set prediction_prefix = join(prediction_dir, job_name) # the log file that stores the actual output of this script (e.g., # the tuned parameters, what kind of experiment was run, etc.) 
temp_logfile = join(log_path, '{}.log'.format(job_name)) # Figure out result json file path result_json_path = join(results_path, '{}.results.json'.format(job_name)) # save the path to the results json file that will be written result_json_paths.append(result_json_path) # If result file already exists and we're resuming, move on if resume and (exists(result_json_path) and os.path.getsize(result_json_path)): logger.info('Running in resume mode and %s exists, ' 'so skipping job.', result_json_path) continue # create job if we're doing things on the grid job_args = {} job_args["experiment_name"] = experiment_name job_args["task"] = task job_args["sampler"] = sampler job_args["feature_hasher"] = feature_hasher job_args["hasher_features"] = hasher_features job_args["job_name"] = job_name job_args["featureset"] = featureset job_args["featureset_name"] = featureset_name job_args["learner_name"] = learner_name job_args["train_path"] = train_path job_args["test_path"] = test_path job_args["train_set_name"] = train_set_name job_args["test_set_name"] = test_set_name job_args["shuffle"] = do_shuffle job_args["model_path"] = model_path job_args["prediction_prefix"] = prediction_prefix job_args["grid_search"] = do_grid_search job_args["grid_objective"] = grid_objective job_args["suffix"] = suffix job_args["log_path"] = temp_logfile job_args["probability"] = probability job_args["results_path"] = results_path job_args["sampler_parameters"] = (fixed_sampler_parameters if fixed_sampler_parameters else dict()) job_args["fixed_parameters"] = (fixed_parameter_list[learner_num] if fixed_parameter_list else dict()) job_args["param_grid"] = (param_grid_list[learner_num] if param_grid_list else None) job_args["pos_label_str"] = pos_label_str job_args["overwrite"] = overwrite job_args["feature_scaling"] = feature_scaling job_args["min_feature_count"] = min_feature_count job_args["grid_search_jobs"] = grid_search_jobs job_args["grid_search_folds"] = grid_search_folds job_args["cv_folds"] = cv_folds job_args["save_cv_folds"] = save_cv_folds job_args["do_stratified_folds"] = do_stratified_folds job_args["label_col"] = label_col job_args["id_col"] = id_col job_args["ids_to_floats"] = ids_to_floats job_args["quiet"] = quiet job_args["class_map"] = class_map job_args["custom_learner_path"] = custom_learner_path if not local: jobs.append(Job(_classify_featureset, [job_args], num_slots=(MAX_CONCURRENT_PROCESSES if do_grid_search else 1), name=job_name, queue=queue)) else: _classify_featureset(job_args) test_set_name = basename(test_path) # submit the jobs (if running on grid) if not local and _HAVE_GRIDMAP: if log_path: job_results = process_jobs(jobs, white_list=hosts, temp_dir=log_path) else: job_results = process_jobs(jobs, white_list=hosts) _check_job_results(job_results) # write out the summary results file if (task == 'cross_validate' or task == 'evaluate') and write_summary: summary_file_name = experiment_name + '_summary.tsv' file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(join(results_path, summary_file_name), file_mode) as output_file: _write_summary_file(result_json_paths, output_file, ablation=ablation) return result_json_paths
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, segmentFileList, args, jobqueue): # Run Preprocess titleFileDF = pd.read_csv(titleFile, sep='\t', header=0, keep_default_na='True') groupByPatientId = titleFileDF.groupby('Patient_ID') baseNames = {} jobs = [] poolidRegXcompile = re.compile('.*[PoolNormal|PooledNormal].*') poolHsmetricsFile = filter(poolidRegXcompile.match, HSmetricsFileList).pop() poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop() for patientID, group in groupByPatientId: print patientID, ":" outTargetFile = '' tBamFile = '' nBamFile = '' basename = '' toutdir = '' if(os.path.isdir(outdir)): if(args.verbose): print "Pool Output Dir:", outdir, "exists!!!" else: os.mkdir(outdir) os.chmod(outdir, 0o755) for count, row in group.iterrows(): bcId = row.loc['Barcode'] poolId = row.loc['Pool'] sampleId = row.loc['Sample_ID'] patientId = row.loc['Patient_ID'] sampleClass = row.loc['Class'] idRegXcompile = re.compile('.*' + sampleId + '.*') if(sampleClass == "Tumor"): toutdir = outdir + "/" + sampleId if(os.path.isdir(toutdir)): if(args.verbose): print "Output Dir:", toutdir, "exists!!!" else: os.mkdir(toutdir) os.chmod(toutdir, 0o755) outTargetFile = toutdir + "/" + sampleId + "_targetRegion.bed" txt_fh = open(outTargetFile, "wb") txt_fh.write("chrom\tloc.start\tloc.end\n") basename = sampleId tBamFile = filter(idRegXcompile.match, bamFileList).pop() segfile = filter(idRegXcompile.match, segmentFileList).pop() if(segfile): segFileDF = pd.read_csv(segfile, sep=' ', header=0, keep_default_na='True') for segcount, segrow in segFileDF.iterrows(): chr = segrow.loc['chrom'] start = segrow.loc['loc.start'] end = segrow.loc['loc.end'] txt_fh.write(str(chr) + "\t" + str(start) + "\t" + str(end) + "\n") txt_fh.close() if(sampleClass == "Normal"): nBamFile = filter(idRegXcompile.match, bamFileList).pop() nHSmetricsFile = filter(idRegXcompile.match, HSmetricsFileList).pop() (decision) = SelectNormal(nHSmetricsFile, poolHsmetricsFile) if(decision == 'UnMatched'): nBamFile = poolbamFile else: if(args.verbose): print "Matched Sample\n" if(os.path.isfile(tBamFile) and (os.path.isfile(nBamFile))and (os.path.isfile(outTargetFile))): # Make Bai and Soft-Link Bam and Bai Files# Tumor Bam file_dir, this_filename = os.path.split(tBamFile) destTBamFile = toutdir + "/" + this_filename tBaiFile = this_filename tBaiFile = tBaiFile[:-1] tBaiFile = tBaiFile + "i" destTBaiFile = tBaiFile[:-4] destTBaiFile = destTBaiFile + ".bam.bai" destTBaiFile = toutdir + "/" + destTBaiFile tBaiFile = file_dir + "/" + tBaiFile if(os.path.isfile(destTBamFile)): print destTBamFile, "File already exists!!" else: os.symlink(tBamFile, destTBamFile) if(os.path.isfile(destTBaiFile)): print destTBaiFile, "File already exists!!" else: os.symlink(tBaiFile, destTBaiFile) tBamFile = destTBamFile # Make Bai and Soft-Link Bam and Bai Files#Noraml Bam File file_dir, this_filename = os.path.split(nBamFile) destNBamFile = toutdir + "/" + this_filename nBaiFile = this_filename nBaiFile = nBaiFile[:-1] nBaiFile = nBaiFile + "i" destNBaiFile = nBaiFile[:-4] destNBaiFile = destNBaiFile + ".bam.bai" destNBaiFile = toutdir + "/" + destNBaiFile nBaiFile = file_dir + "/" + nBaiFile # print destNBamFile,"\n",destNBaiFile,"\n",nBamFile,"\n",nBaiFile if(os.path.isfile(destNBamFile)): print destNBamFile, "File already exists!!" else: os.symlink(nBamFile, destNBamFile) if(os.path.isfile(destNBaiFile)): print destNBaiFile, "File already exists!!" 
else: os.symlink(nBaiFile, destNBaiFile) nBamFile = destNBamFile jobId_preprocess = "Preprocess_" + str(count) + "_" + str(basename) baseNames[basename] = toutdir + "#" + jobId_preprocess outpklFile = toutdir + "/" + basename + '.MixClone.input.pkl' if(os.path.isfile(outpklFile)): continue else: cmdList = [] cmd = args.python + " " + args.mixclone + " preprocess " + args.ref + " " + outTargetFile + " " + nBamFile + " " + tBamFile + " " + basename + \ " --min_depth " + args.minDepth + " --min_base_qual " + args.minBQ + " --min_map_qual " + args.minMQ + " --process_num " + args.threads #cmd = str(cmd) threads = int(args.threads) threads = threads + 1 qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_preprocess + " -o " + jobId_preprocess + ".stdout" + " -e " + \ jobId_preprocess + ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + toutdir + " -sync y " + " -b y " + cmd print "qsub_cmd:", qsub_cmd, "\n" cmdList.append(qsub_cmd) job = Job( RunJob, cmdList, kwlist=None, cleanup=True, mem_free="2G", name=jobId_preprocess, num_slots=1, queue=args.queue) jobs.append(job) print("sending function jobs to cluster") print("") job_outputs = process_jobs( jobs, max_processes=10, temp_dir='/dmp/analysis/SCRATCH/', white_list=None, quiet=False, local=False) print("results from each job") for (i, result) in enumerate(job_outputs): print("Job {0}- result: {1}".format(i, result)) # RunModel count = 0 jobs = [] for basename, jdata in baseNames.iteritems(): (toutdir, jobId_preprocess) = jdata.split('#', 1) jobId_runmodel = "RunModel_" + str(count) + "_" + str(basename) outputbasename = basename + '_output' outpklFileRegex = toutdir + "/" + basename + '*.MixClone.output.pkl' outpklFiles = glob.glob(outpklFileRegex) if(outpklFiles): continue else: cmdList = [] cmd = args.python + " " + args.mixclone + " run_model " + basename + " " + \ outputbasename + " --max_copynumber 8 --subclone_num 3 --max_iters 30 --stop_value 1e-6" qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_runmodel + " -o " + jobId_runmodel + ".stdout" + " -e " + \ jobId_runmodel + ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp 1" + " -wd " + toutdir + " -sync y " + "-b y " + cmd print "qsub_cmd:", qsub_cmd, "\n" cmdList.append(qsub_cmd) job = Job( RunJob, cmdList, kwlist=None, cleanup=True, mem_free="2G", name=jobId_runmodel, num_slots=1, queue=args.queue) jobs.append(job) count = count + 1 print("sending function jobs to cluster") print("") job_outputs = process_jobs( jobs, max_processes=10, temp_dir='/dmp/analysis/SCRATCH/', white_list=None, quiet=False, local=False) print("results from each job") for (i, result) in enumerate(job_outputs): print("Job {0}- result: {1}".format(i, result)) # Run BAF count = 0 jobs = [] for basename, jdata in baseNames.iteritems(): (toutdir, jobId_preprocess) = jdata.split('#', 1) outputbasename = basename + '_output' jobId_PP = "PostPorcess_" + str(count) + "_" + str(basename) cmdList = [] cmd = args.python + " " + args.mixclone + " postprocess " + outputbasename qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_PP + " -o " + jobId_PP + ".stdout" + " -e " + jobId_PP + \ ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp 1" + " -wd " + toutdir + " -sync y " + " -b y " + cmd print "qsub_cmd:", qsub_cmd, "\n" cmdList.append(qsub_cmd) job = Job( RunJob, cmdList, kwlist=None, cleanup=True, mem_free="2G", name=jobId_PP, num_slots=1, queue=args.queue) jobs.append(job) count = count + 1 print("sending function jobs to cluster") print("") job_outputs 
= process_jobs( jobs, max_processes=10, temp_dir='/dmp/analysis/SCRATCH/', white_list=None, quiet=False, local=False) print("results from each job") for (i, result) in enumerate(job_outputs): print("Job {0}- result: {1}".format(i, result)) return
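# `SelectNormal` (used by the RunPerPool functions above) is defined elsewhere;
# it returns 'Matched' or 'UnMatched' to decide whether the patient's own
# normal BAM or the pooled-normal BAM is used. A minimal sketch under the
# assumption that the decision is based on the MEAN_TARGET_COVERAGE column of
# the Picard HsMetrics file; the 50x threshold is purely illustrative.
def SelectNormal(nHSmetricsFile, poolHsmetricsFile, min_coverage=50.0):
    # poolHsmetricsFile is accepted for signature compatibility but unused in
    # this sketch.
    def mean_target_coverage(hsmetrics_file):
        with open(hsmetrics_file) as fh:
            rows = [line.rstrip("\n").split("\t")
                    for line in fh
                    if line.strip() and not line.startswith("#")]
        header, values = rows[0], rows[1]
        return float(values[header.index("MEAN_TARGET_COVERAGE")])

    if mean_target_coverage(nHSmetricsFile) >= min_coverage:
        return 'Matched'
    return 'UnMatched'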