def autoGeneratePrereqs(self):
    """Instantiate the upstream step objects and store them as attributes.

    Sets inpt, sort, molecule_stats, split, split_summary,
    pairwise_alignment and pairwise_summary. Every parameterised step
    receives its own copy of self.vital_parameters.
    """
    workspace = self.workspace

    self.inpt = Input(workspace)

    sorter = Sort(workspace, copy(self.vital_parameters))
    self.sort = sorter
    self.molecule_stats = sorter.getMoleculeStats()

    splitter = Split(workspace, copy(self.vital_parameters))
    self.split = splitter
    self.split_summary = Summarize(workspace, splitter)

    pairwise = PairwiseAlignment(workspace, copy(self.vital_parameters))
    self.pairwise_alignment = pairwise
    self.pairwise_summary = Summarize(workspace, pairwise)
def autoGeneratePrereqs(self):
    """Instantiate the upstream step objects and store them as attributes.

    Sets inpt, sort, molecule_stats, split, split_summary,
    pairwise_alignment, pairwise_summary, assembly, assembly_summary,
    merge_assembly and group_manifest. Every parameterised step receives
    its own copy of self.vital_parameters.
    """
    workspace = self.workspace

    self.inpt = Input(workspace)

    sorter = Sort(workspace, copy(self.vital_parameters))
    self.sort = sorter
    self.molecule_stats = sorter.getMoleculeStats()

    splitter = Split(workspace, copy(self.vital_parameters))
    self.split = splitter
    self.split_summary = Summarize(workspace, splitter)

    pairwise = PairwiseAlignment(workspace, copy(self.vital_parameters))
    self.pairwise_alignment = pairwise
    self.pairwise_summary = Summarize(workspace, pairwise)

    assembly = Assembly(workspace, copy(self.vital_parameters))
    self.assembly = assembly
    self.assembly_summary = Summarize(workspace, assembly)
    self.merge_assembly = Merge(workspace, assembly)
    self.group_manifest = GroupManifest(workspace, assembly)
class RefineB0(Step):
    """Step that builds bng_ref_aligner command lines for each split block.

    One command is generated per block reported by the Split prerequisite,
    with the merged RefineA output passed as the -ref argument. Commands
    are grouped into script chunks of at most max_job_count commands.
    """

    def __init__(self, workspace, vital_parameters):
        """Store the tunables used by writeCode() and build prerequisites.

        workspace        -- project workspace (work_dir, binaries, resources)
        vital_parameters -- shared parameter object (fp, fn, pval,
                            min_molecule_len, min_molecule_sites, blocks)
        """
        self.workspace = workspace
        self.vital_parameters = vital_parameters
        self.quality = None

        self.output_prefix = "refineB0"
        self.color = 1
        self.aligned_site_threshold = 5
        self.max_coverage = 100
        self.enable_multi_mode = True

        self.internal_split_ratio = 0.20
        self.internal_trimmed_coverage_ratio = 0.35
        # TODO this file doesn't exist in other assemblies...
        self.cnt_file = "refineB0_max_id"
        self.min_contig_len = 100.0
        self.allow_no_splits = True
        self.allow_infinite_splits = False

        self.min_end_coverage = 6.99
        self.scale_bias_wt = 0
        self.min_likelihood_ratio = 1e2
        self.max_query_alignment = 4
        self.max_reference_alignment = 6

        self.max_repeat_shift = 2
        self.repeat_pval_ratio = 0.01
        self.repeat_log_pval_ratio = 0.7
        self.repeat_min_shift_ratio = 0.6

        self.min_gap_flanking_sites = 2
        self.output_trimmed_coverage = True
        self.normalize_trimmed_coverage = True
        self.min_gap_flanking_len = 55
        self.last_non_chimeric_site_after_gap = 2

        self.split_molecules_with_outliers = True
        self.outlier_pvals_per_true_positive = 1e-5
        self.end_outlier_prior_probability = 1e-4
        self.pval_after_refinement = 1
        self.faster_refinement_resolution = ""

        self.count_splits_with_largest_ids = True
        self.contig_split_version = ""
        self.reduced_contig_resolution_divided_by_two = 2.0
        self.overwrite_output = True

        self.hash_window = 5
        self.hash_min_sites = 3
        self.hash_sd_max = 2.4
        self.hash_sd_rms = 1.5
        self.hash_relative_error = 0.05
        self.hash_offset_kb = 5.0
        self.hash_max_insert_errors = 1
        self.hash_max_probe_errors = 1
        self.hash_max_unresolved_sites = 1
        self.hash_file = ""
        self.hash_threshold = ""
        self.hashdelta = 10

        self.reduced_molecule_resolution = 1.2
        self.insert_threads = 4
        self.skip_alignment_statistic_computation = True

        self.sd = 0.2
        self.sf = 0.2
        self.sr = 0.03
        self.res = 3.3

        self.regex_acceptible_output_file = ".*.bnx"
        self.write_output_to_file = True
        self.write_errors_to_file = True

        # Maximum number of aligner commands placed in one script chunk.
        self.max_job_count = 2

        self.autoGeneratePrereqs()

    def writeCode(self):
        """Return a list of shell script chunks, one aligner call per block.

        Every chunk starts with the same header (cd into the work dir,
        create and enter the step dir, pwd) followed by up to
        max_job_count aligner command lines.
        """
        step_dir = self.getStepDir()
        header = "cd " + self.workspace.work_dir + "\n"
        header += "mkdir -p " + step_dir + "\n"
        header += "cd " + step_dir + "\n"
        header += "pwd\n"

        if self.allow_no_splits:
            nosplit = "2"
        elif self.allow_infinite_splits:
            nosplit = "0"
        else:
            nosplit = "1"

        # Insertion order is the order of the flags on the command line.
        flags = OrderedDict()
        flags["-i"] = "placeholder"  # replaced per block below
        flags["-o"] = self.output_prefix
        flags["-maxthreads"] = str(self.getThreads())
        # NOTE(review): unlike -grouped and -XmapStatRead, -ref (and the
        # per-block -i below) carry no "../" prefix even though the script
        # cds into the step directory -- confirm the intended paths.
        flags["-ref"] = self.merge_refineA.getOutputFile()
        flags["-T"] = str(self.vital_parameters.pval)
        flags["-usecolor"] = str(self.color)
        flags["-A"] = str(self.aligned_site_threshold)
        flags["-extend"] = "1"
        flags["-MaxCov"] = str(self.max_coverage)
        if self.enable_multi_mode:
            flags["-MultiMode"] = ""
        flags["-contigsplit"] = " ".join([
            str(self.internal_split_ratio),
            str(self.internal_trimmed_coverage_ratio),
            self.cnt_file,
        ])
        flags["-MinSplitLen"] = str(self.min_contig_len)
        flags["-nosplit"] = nosplit
        flags["-EndTrim"] = str(self.min_end_coverage)
        flags["-biaswt"] = str(self.scale_bias_wt)
        flags["-LRbias"] = str(self.min_likelihood_ratio)
        flags["-deltaX"] = str(self.max_query_alignment)
        flags["-deltaY"] = str(self.max_reference_alignment)
        flags["-RepeatMask"] = " ".join([
            str(self.max_repeat_shift),
            str(self.repeat_pval_ratio),
        ])
        flags["-RepeatRec"] = " ".join([
            str(self.repeat_log_pval_ratio),
            str(self.repeat_min_shift_ratio),
        ])
        flags["-CovTrim"] = str(self.min_gap_flanking_sites)
        if self.output_trimmed_coverage:
            flags["-ReplaceCov"] = ""
        if self.normalize_trimmed_coverage:
            flags["-TrimNorm"] = ""
        flags["-CovTrimLen"] = str(self.min_gap_flanking_len)
        flags["-TrimNormChim"] = str(self.last_non_chimeric_site_after_gap)
        if self.split_molecules_with_outliers:
            flags["-TrimOutlier"] = ""
        flags["-outlier"] = str(self.outlier_pvals_per_true_positive)
        flags["-endoutlier"] = str(self.end_outlier_prior_probability)
        flags["-endoutlierFinal"] = str(self.pval_after_refinement)
        flags["-Mprobeval"] = str(self.faster_refinement_resolution)
        if self.count_splits_with_largest_ids:
            flags["-splitcnt"] = ""
        flags["-splitrev"] = str(self.contig_split_version)
        flags["-rres"] = str(self.reduced_contig_resolution_divided_by_two)
        if self.overwrite_output:
            flags["-f"] = ""
        flags["-refine"] = "0"
        flags["-hashgen"] = " ".join([
            str(self.hash_window),
            str(self.hash_min_sites),
            str(self.hash_sd_max),
            str(self.hash_sd_rms),
            str(self.hash_relative_error),
            str(self.hash_offset_kb),
            str(self.hash_max_insert_errors),
            str(self.hash_max_probe_errors),
            str(self.hash_max_unresolved_sites),
        ])
        flags["-hash"] = " ".join([self.hash_file, str(self.hash_threshold)])
        flags["-hashdelta"] = str(self.hashdelta)
        flags["-mres"] = str(self.reduced_molecule_resolution)
        # Fixed spelling: this key used to be "-insertThreasds".
        flags["-insertThreads"] = str(self.insert_threads)
        if self.skip_alignment_statistic_computation:
            flags["-nostat"] = ""
        flags["-maxmem"] = str(self.getMem())
        flags["-FP"] = str(self.vital_parameters.fp)
        flags["-FN"] = str(self.vital_parameters.fn)
        flags["-sd"] = str(self.sd)
        flags["-sf"] = str(self.sf)
        flags["-sr"] = str(self.sr)
        flags["-res"] = str(self.res)
        flags["-grouped"] = "../" + self.group_manifest.getOutputFile()
        flags["-mapped"] = "placeholder"  # replaced per block below
        flags["-output-filter"] = self.regex_acceptible_output_file
        flags["-id"] = "placeholder"  # replaced per block below
        if self.write_output_to_file:
            flags["-stdout"] = ""
        if self.write_errors_to_file:
            flags["-stderr"] = ""
        flags["-XmapStatRead"] = "../" + self.molecule_stats.getOutputFile()
        flags["-minlen"] = str(self.vital_parameters.min_molecule_len)
        flags["-minsites"] = str(self.vital_parameters.min_molecule_sites)

        code_parts = []
        batch = ""
        jobs_in_batch = 0
        for block in xrange(1, self.split.vital_parameters.blocks + 1):
            jobs_in_batch += 1
            flags["-i"] = self.split.getOutputFile(block)
            flags["-mapped"] = "refineB0_id" + str(block) + "_mapped"
            flags["-id"] = str(block)

            command = [self.workspace.binaries["bng_ref_aligner"]]
            for flag in flags:
                command.append(flag)
                command.append(flags[flag])
            batch += " ".join(command) + "\n"

            if jobs_in_batch >= self.max_job_count:
                code_parts.append(header + batch)
                batch = ""
                jobs_in_batch = 0

        # Flush the final, partially filled chunk.
        if batch:
            code_parts.append(header + batch)
        return code_parts

    def getStepDir(self):
        """Return the step directory name, derived from the vital parameters."""
        params = self.vital_parameters
        return "_".join([
            "refineB0",
            "fp" + str(params.fp),
            "fn" + str(params.fn),
            "pval" + str(params.pval),
            "minlen" + str(params.min_molecule_len),
            "minsites" + str(params.min_molecule_sites),
        ])

    def getOutputFile(self):
        """Return '<step dir>/<output prefix>.<extension>'."""
        return (self.getStepDir() + "/" + self.output_prefix + "."
                + self.getOutputFileExtension())

    def getOutputFileExtension(self):
        """Return the output file extension ("bnx").

        An earlier definition returning "contigs" was shadowed by this one
        and never took effect; it has been removed.
        """
        return "bnx"

    def autoGeneratePrereqs(self):
        """Instantiate every upstream step object and store it on self."""
        workspace = self.workspace

        self.inpt = Input(workspace)
        self.sort = Sort(workspace, copy(self.vital_parameters))
        self.molecule_stats = self.sort.getMoleculeStats()
        self.split = Split(workspace, copy(self.vital_parameters))
        self.split_summary = Summarize(workspace, self.split)
        self.pairwise_alignment = PairwiseAlignment(
            workspace, copy(self.vital_parameters))
        self.pairwise_summary = Summarize(workspace, self.pairwise_alignment)
        self.assembly = Assembly(workspace, copy(self.vital_parameters))
        self.assembly_summary = Summarize(workspace, self.assembly)
        self.merge_assembly = Merge(workspace, self.assembly)
        self.refineA = RefineA(workspace, copy(self.vital_parameters))
        self.refineA_summary = Summarize(workspace, self.refineA)
        self.merge_refineA = Merge(workspace, self.refineA)
        self.group_manifest = GroupManifest(workspace, self.refineA)

    def getPrereq(self):
        """Return the step that must finish first (the group manifest)."""
        return self.group_manifest

    def getMem(self):
        """Return the workspace's medium memory allotment."""
        return self.workspace.resources.getMediumMemory()

    def getTime(self):
        """Return the workspace's large time allotment."""
        return self.workspace.resources.getLargeTime()

    def getThreads(self):
        """Return the workspace's medium thread allotment."""
        return self.workspace.resources.getMediumThreads()
class PairwiseAlignment(Step):
    """Step that builds bng_ref_aligner commands for every pair of blocks.

    For N split blocks, one command is generated for each pair (i, j) with
    i <= j, i.e. N*(N+1)/2 jobs, grouped into script chunks of at most
    max_job_count commands.
    """

    def __init__(self, workspace, vital_parameters):
        """Store alignment tunables, size the job batches, build prereqs.

        workspace        -- project workspace (work_dir, binaries, resources)
        vital_parameters -- shared parameter object (fp, fn, pval, blocks)
        """
        self.workspace = workspace
        self.vital_parameters = vital_parameters

        self.color = 1
        self.sd = 0.2
        self.sf = 0.2
        self.sr = 0.03
        self.res = 3.3

        self.min_alignment_sites = 5
        self.min_alignment_score = 1
        self.outlier_pval = 0.0001
        self.endoutlier_pval = 0

        self.repeat_max_shift = 2
        self.repeat_pval_change = 0.01
        self.repeat_pval_ratio = 0.7
        self.repeat_min_change = 0.6

        self.hash_window = 5
        self.hash_min_sites = 3
        self.hash_sd_max = 2.2
        self.hash_sd_rms = 1.2
        self.hash_relative_error = 0.05
        self.hash_offset_kb = 3.0
        self.hash_max_insert_errors = 1
        self.hash_max_probe_errors = 1
        self.hash_max_unresolved_sites = 1

        self.target_resolution = 1.2
        self.allow_no_splits = True
        self.allow_infinite_splits = False
        self.overwrite_output = True
        self.send_output_to_file = True
        self.send_error_to_file = True

        # This Split shares (does not copy) vital_parameters.
        split = Split(self.workspace, self.vital_parameters)
        total_blocks = split.total_job_count
        # n*(n+1) is always even, so floor division is exact; using "//"
        # keeps the count an integer under any division semantics.
        self.total_job_count = total_blocks * (total_blocks + 1) // 2

        approx_mins_per_job = 270.0
        self.max_job_count = self.getTime() * (60.0 / approx_mins_per_job) - 1
        if self.max_job_count < 1:
            self.max_job_count = 1

        self.autoGeneratePrereqs()

    def _scriptHeader(self):
        """Return the shell lines that create and enter the step directory."""
        step_dir = self.getStepDir()
        header = "cd " + self.workspace.work_dir + "\n"
        header += "mkdir -p " + step_dir + "\n"
        header += "cd " + step_dir + "\n"
        return header

    def writeCode(self):
        """Return shell script chunks covering all outstanding block pairs.

        Pairs whose '<output>.align' file already exists under the step
        directory are skipped. Returns ["# do nothing"] when no command
        had to be generated.
        """
        if self.allow_no_splits:
            nosplit = "2"
        elif self.allow_infinite_splits:
            nosplit = "0"
        else:
            nosplit = "1"

        # Insertion order is the order of the flags on the command line.
        flags = OrderedDict()
        flags["-usecolor"] = str(self.color)
        flags["-FP"] = str(self.vital_parameters.fp)
        flags["-FN"] = str(self.vital_parameters.fn)
        flags["-sd"] = str(self.sd)
        flags["-sf"] = str(self.sf)
        flags["-sr"] = str(self.sr)
        flags["-res"] = str(self.res)
        flags["-T"] = str(self.vital_parameters.pval)

        # Memory budget per thread, never below 1.
        maxmem = int(self.getMem() / self.getThreads())
        if maxmem < 1:
            maxmem = 1
        flags["-maxmem"] = str(maxmem)

        flags["-o"] = "placeholder"  # replaced per pair below
        flags["-A"] = str(self.min_alignment_sites)
        flags["-S"] = str(self.min_alignment_score)
        flags["-outlier"] = str(self.outlier_pval)
        flags["-endoutlier"] = str(self.endoutlier_pval)
        flags["-RepeatMask"] = " ".join([
            str(self.repeat_max_shift),
            str(self.repeat_pval_change),
        ])
        flags["-RepeatRec"] = " ".join([
            str(self.repeat_pval_ratio),
            str(self.repeat_min_change),
        ])
        flags["-hashgen"] = " ".join([
            str(self.hash_window),
            str(self.hash_min_sites),
            str(self.hash_sd_max),
            str(self.hash_sd_rms),
            str(self.hash_relative_error),
            str(self.hash_offset_kb),
            str(self.hash_max_insert_errors),
            str(self.hash_max_probe_errors),
            str(self.hash_max_unresolved_sites),
        ])
        flags["-hash"] = ""
        flags["-mres"] = str(self.target_resolution)
        flags["-nosplit"] = nosplit
        flags["-maxthreads"] = str(self.getThreads())
        flags["-XmapStatRead"] = "../" + str(self.molecule_stats.getOutputFile())
        if self.overwrite_output:
            flags["-f"] = ""
        if self.send_output_to_file:
            flags["-stdout"] = ""
        if self.send_error_to_file:
            flags["-stderr"] = ""

        code_parts = []
        batch = ""
        jobs_in_batch = 0
        total_blocks = self.split.total_job_count
        job_index = 0
        for i in xrange(1, total_blocks + 1):
            first_file = "../" + self.split.getOutputFile(i)
            for j in range(i, total_blocks + 1):
                second_file = "../" + self.split.getOutputFile(j)
                job_index += 1
                output_name = 'pairwise%dof%d' % (job_index, self.total_job_count)
                flags["-o"] = output_name
                # NOTE(review): this existence check resolves against the
                # Python process's current directory -- presumably the
                # work dir; confirm against the caller.
                if path.exists(self.getStepDir() + "/" + output_name + ".align"):
                    continue

                flags["-i"] = first_file
                if i == j:
                    # A block aligned against itself takes one input only.
                    for stale in ("-first", "-1", "-i "):
                        flags.pop(stale, None)
                else:
                    flags["-first"] = ""
                    flags["-1"] = ""
                    # The trailing space makes this a distinct dict key, so
                    # a second "-i" can be emitted on the command line.
                    flags["-i "] = second_file

                command = [self.workspace.binaries["bng_ref_aligner"]]
                for flag in flags:
                    command.append(flag)
                    command.append(flags[flag])

                # Guard in the script too, so finished pairs are not redone.
                batch += "if [ ! -e " + output_name + ".align ]\n"
                batch += "then\n"
                batch += " " + " ".join(command) + "\n"
                batch += "fi\n"

                jobs_in_batch += 1
                if jobs_in_batch >= self.max_job_count:
                    code_parts.append(self._scriptHeader() + batch)
                    batch = ""
                    jobs_in_batch = 0

        if batch != "":
            # Only this trailing chunk prints the working directory; kept
            # as-is so the generated scripts are unchanged.
            code_parts.append(self._scriptHeader() + "pwd\n" + batch)

        if len(code_parts) == 0:
            return ["# do nothing"]
        return code_parts

    def getStepDir(self):
        """Return the step directory name (input dir plus fp/fn/pval)."""
        params = self.vital_parameters
        return "_".join([
            "pairwise",
            self.inpt.getStepDir(),
            "fp" + str(params.fp),
            "fn" + str(params.fn),
            "pval" + str(params.pval),
        ])

    def autoGeneratePrereqs(self):
        """Instantiate the upstream step objects and store them on self."""
        workspace = self.workspace
        self.inpt = Input(workspace)
        self.sort = Sort(workspace, copy(self.vital_parameters))
        self.molecule_stats = self.sort.getMoleculeStats()
        self.split = Split(workspace, copy(self.vital_parameters))
        self.split_summary = Summarize(workspace, self.split)

    def getPrereq(self):
        """Return the step that must finish first (the split summary)."""
        return self.split_summary

    def getMem(self):
        """Return the workspace's large memory allotment."""
        return self.workspace.resources.getLargeMemory()

    def getTime(self):
        """Return the workspace's large time allotment."""
        return self.workspace.resources.getLargeTime()

    def getThreads(self):
        """Return the workspace's large thread allotment."""
        return self.workspace.resources.getLargeThreads()

    def getOutputFile(self):
        """Always raise: this step has no single output file."""
        raise Exception("Pairwise doesn't have an output file, per se")

    def getOutputFileExtension(self):
        """Return the extension of the per-pair output files ("align")."""
        return "align"
class Assembly(GenericAssembly):
    """Step that builds the bng_assembler command and summarises its result.

    writeCode() emits a single script; createQualityObject() derives
    contig statistics from the .cmap files in the step directory and from
    the step's output file.
    """

    def __init__(self, workspace, vital_parameters):
        """Store assembler tunables and build the prerequisite steps.

        workspace        -- project workspace (work_dir, binaries, resources)
        vital_parameters -- shared parameter object (fp, fn, pval,
                            min_molecule_len, min_molecule_sites)
        """
        self.workspace = workspace
        self.vital_parameters = vital_parameters
        self.quality = None  # loaded or computed lazily

        self.sd = 0.2
        self.sf = 0.2
        self.sr = 0.03
        self.res = 3.3
        self.color = 1

        self.alignment_score_threshold = 1
        self.max_rel_coverage_multiple = 100
        self.max_rel_coverage_absolute = 200
        self.max_rel_coverage_absolute_2 = 30
        self.bulge_coverage = 20
        self.max_coverage = 10
        self.min_coverage = 10
        self.min_average_coverage = 5
        self.min_maps = 5
        self.min_contig_len = 0.0
        self.end_trim = 1
        self.chimera_pval = 0.001
        self.chimera_num = 3
        self.fast_bulge = 1000
        self.fragile_preserve = False
        self.draftsize = 1
        self.min_duplicate_len = 1
        self.binary_output = True
        self.min_snr = 2
        self.output_prefix = "unrefined"

        self.add_alignment_filter = True
        self.alignment_filter_threshold = 100
        self.alignment_filter_minlen_change = 2.0
        self.alignment_filter_pval_change = 0.5

        self.overwrite_output = True
        self.hide_branches = True
        self.send_output_to_file = True
        self.send_errors_to_file = True

        self.total_job_count = 1

        self.autoGeneratePrereqs()

    def writeCode(self):
        """Return a one-element list holding the assembler shell script."""
        step_dir = self.getStepDir()
        code = "cd " + self.workspace.work_dir + "\n"
        # "-p" matches the sibling steps and keeps a re-run from tripping
        # over an already existing directory.
        code += "mkdir -p " + step_dir + "\n"
        code += "cd " + step_dir + "\n"
        code += "pwd\n"

        # Insertion order is the order of the flags on the command line.
        flags = OrderedDict()
        flags["-if"] = "../" + str(self.split_summary.getOutputFile())
        flags["-af"] = "../" + str(self.pairwise_summary.getOutputFile())
        flags["-XmapStatRead"] = "../" + str(self.molecule_stats.getOutputFile())
        flags["-usecolor"] = str(self.color)
        flags["-FP"] = str(self.vital_parameters.fp)
        flags["-FN"] = str(self.vital_parameters.fn)
        flags["-sd"] = str(self.sd)
        flags["-sf"] = str(self.sf)
        flags["-sr"] = str(self.sr)
        flags["-res"] = str(self.res)
        flags["-T"] = str(self.vital_parameters.pval)
        flags["-S"] = str(self.alignment_score_threshold)
        flags["-MaxRelCoverage"] = " ".join([
            str(self.max_rel_coverage_multiple),
            str(self.max_rel_coverage_absolute),
            str(self.max_rel_coverage_absolute_2),
        ])
        flags["-BulgeCoverage"] = str(self.bulge_coverage)
        flags["-MaxCoverage"] = str(self.max_coverage)
        flags["-MinCov"] = str(self.min_coverage)
        flags["-MinAvCov"] = str(self.min_average_coverage)
        flags["-MinMaps"] = str(self.min_maps)
        flags["-MinContigLen"] = str(self.min_contig_len)
        flags["-EndTrim"] = str(self.end_trim)
        flags["-refine"] = "0"
        flags["-PVchim"] = " ".join([
            str(self.chimera_pval),
            str(self.chimera_num),
        ])
        flags["-FastBulge"] = str(self.fast_bulge)
        flags["-FragilePreserve"] = "1" if self.fragile_preserve else "0"
        # Previously hard-coded to "1", which ignored self.draftsize.
        flags["-draftsize"] = str(self.draftsize)
        flags["-SideBranch"] = str(self.min_duplicate_len)
        flags["-contigs_format"] = "1" if self.binary_output else "0"
        flags["-maxthreads"] = str(self.getThreads())

        # Memory budget per thread, never below 1.
        maxmem = int(self.getMem() / self.getThreads())
        if maxmem < 1:
            maxmem = 1
        flags["-maxmem"] = str(maxmem)

        flags["-minlen"] = str(self.vital_parameters.min_molecule_len)
        flags["-minsites"] = str(self.vital_parameters.min_molecule_sites)
        flags["-minSNR"] = str(self.min_snr)
        flags["-o"] = str(self.output_prefix)
        if self.add_alignment_filter:
            flags["-AlignmentFilter"] = " ".join([
                str(self.alignment_filter_threshold),
                str(self.alignment_filter_minlen_change),
                str(self.alignment_filter_pval_change),
            ])
        if self.overwrite_output:
            flags["-force"] = ""
        if self.hide_branches:
            flags["-SideChain"] = ""
        if self.send_output_to_file:
            flags["-stdout"] = ""
        if self.send_errors_to_file:
            flags["-stderr"] = ""

        command = [self.workspace.binaries["bng_assembler"]]
        for flag in flags:
            command.append(flag)
            command.append(flags[flag])
        code += " ".join(command) + "\n"
        return [code]

    def getStepDir(self):
        """Return the step directory name (input dir plus vital parameters)."""
        params = self.vital_parameters
        return "_".join([
            "assembly",
            self.inpt.getStepDir(),
            "fp" + str(params.fp),
            "fn" + str(params.fn),
            "pval" + str(params.pval),
            "minlen" + str(params.min_molecule_len),
            "minsites" + str(params.min_molecule_sites),
        ])

    def autoGeneratePrereqs(self):
        """Instantiate the upstream step objects and store them on self."""
        workspace = self.workspace
        self.inpt = Input(workspace)
        self.sort = Sort(workspace, copy(self.vital_parameters))
        self.molecule_stats = self.sort.getMoleculeStats()
        self.split = Split(workspace, copy(self.vital_parameters))
        self.split_summary = Summarize(workspace, self.split)
        self.pairwise_alignment = PairwiseAlignment(
            workspace, copy(self.vital_parameters))
        self.pairwise_summary = Summarize(workspace, self.pairwise_alignment)

    def getPrereq(self):
        """Return the step that must finish first (the pairwise summary)."""
        return self.pairwise_summary

    def isComplete(self):
        """Return True when the step's output file exists."""
        return path.exists(self.getOutputFile())

    def createQualityObject(self):
        """Compute contig statistics, store them in self.quality and save.

        Raises Exception when the step is not complete yet.
        """
        if not self.isComplete():
            raise Exception("The step is not complete yet")

        count = 0
        total_length = 0.0
        lengths = []
        label_occurrences = 0
        label_count = 0
        # This glob relies on there not being a merged .cmap in the same
        # directory (i.e. Summarize has not been run).
        for cmap_name in glob(self.getStepDir() + "/*.cmap"):
            seen_contigs = set()  # contig ids already counted in this file
            for label in CmapFile(cmap_name).parse():
                if label.contig_id not in seen_contigs:
                    count += 1
                    total_length += label.contig_len
                    seen_contigs.add(label.contig_id)
                    lengths.append(label.contig_len)
                label_occurrences += label.occurrences
                label_count += 1

        # NOTE(review): with no .cmap files (or no labels) the indexing and
        # divisions below raise IndexError / ZeroDivisionError.
        sorted_lengths = sorted(lengths, reverse=True)
        minlen = sorted_lengths[-1]
        maxlen = sorted_lengths[0]

        # N50: walking from the longest contig down, the length at which
        # the running total first reaches half of the total length.
        n50 = 0
        length_included_in_n50 = 0
        target_length_included = total_length / 2.0
        for length in sorted_lengths:
            length_included_in_n50 += length
            if length_included_in_n50 >= target_length_included:
                n50 = length
                break

        # NOTE(review): the value after the last "=" of the last
        # comma-separated field is taken from every line starting with "C";
        # the last such line wins, and nummaps stays undefined if there is
        # none -- confirm the expected file layout.
        with open(self.getOutputFile()) as contig_file:
            for line in contig_file:
                if line[0] != "C":
                    continue
                nummaps = line.split(",")[-1].split("=")[-1]

        self.quality = Quality(
            length=total_length,
            count=count,
            average_length=total_length / count,
            n50=n50,
            min=minlen,
            max=maxlen,
            average_occurrences=float(label_occurrences) / label_count,
            total_mols_aligned=nummaps,
            avg_mols_aligned=float(nummaps) / count)
        self.saveQualityObjectToFile()

    def _loadedQuality(self):
        """Return self.quality, loading it from file first if it is unset."""
        if self.quality is None:
            self.loadQualityObjectFromFile()
        return self.quality

    def getQuality_count(self):
        """Return the contig count from the quality object."""
        return self._loadedQuality().count

    def getQuality_length(self):
        """Return the total contig length from the quality object."""
        return self._loadedQuality().length

    def getQuality_averageLength(self):
        """Return the average contig length from the quality object."""
        return self._loadedQuality().average_length

    def getQuality_n50(self):
        """Return the N50 value from the quality object."""
        return self._loadedQuality().n50

    def getQuality_max(self):
        """Return the maximum contig length from the quality object."""
        return self._loadedQuality().max

    def getQuality_min(self):
        """Return the minimum contig length from the quality object."""
        return self._loadedQuality().min

    def getQuality_averageOccurrences(self):
        """Return the average label occurrences from the quality object."""
        return self._loadedQuality().average_occurrences

    def getQuality_totalMolsAligned(self):
        """Return total_mols_aligned from the quality object."""
        return self._loadedQuality().total_mols_aligned

    def getQuality_avgMolsAligned(self):
        """Return avg_mols_aligned from the quality object."""
        return self._loadedQuality().avg_mols_aligned

    def getMem(self):
        """Return the workspace's large memory allotment."""
        return self.workspace.resources.getLargeMemory()

    def getTime(self):
        """Return the workspace's medium time allotment."""
        return self.workspace.resources.getMediumTime()

    def getThreads(self):
        """Return the thread count for this step (always 1)."""
        return 1
class RefineA(GenericAssembly):
    """Step that runs bng_assembler with -refine 2 per manifest line.

    The generated script reads the group manifest line by line and invokes
    the assembler once per non-comment line, passing the first and last
    whitespace-separated fields of that line to -contigs.
    """

    def __init__(self, workspace, vital_parameters):
        """Store refinement tunables and build the prerequisite steps.

        workspace        -- project workspace (work_dir, binaries, resources)
        vital_parameters -- shared parameter object (fp, fn, pval,
                            min_molecule_len, min_molecule_sites)
        """
        self.workspace = workspace
        self.vital_parameters = vital_parameters
        self.quality = None

        self.sd = 0.2
        self.sf = 0.2
        self.sr = 0.03
        self.res = 3.3

        self.usecolor = 1
        self.use_multi_mode = True
        self.consensus_end_coverage = 0.99
        self.bias_for_low_likelihood_ratio = 1e2
        self.refinement_length_accuracy = ""
        self.largest_query_map_interval = 4
        self.largest_reference_map_interval = 6
        self.outlier_pval = 1e-5
        self.end_outlier_prior_probability = 0.00001
        self.contigs_format = 1
        self.overwrite_output = True
        self.output_prefix = "refineA"
        self.send_output_to_file = True
        self.send_errors_to_file = True

        self.total_job_count = 1

        self.autoGeneratePrereqs()

    def writeCode(self):
        """Return a one-element list holding the refinement shell script."""
        step_dir = self.getStepDir()
        code = "cd " + self.workspace.work_dir + "\n"
        # "-p" matches the sibling steps and keeps a re-run from tripping
        # over an already existing directory.
        code += "mkdir -p " + step_dir + "\n"
        code += "cd " + step_dir + "\n"
        code += "pwd\n"

        # Insertion order is the order of the flags on the command line.
        flags = OrderedDict()
        flags["-i"] = "../" + self.sort.getOutputFile()
        # $group_start / $group_end are shell variables assigned inside the
        # while-loop emitted further down.
        flags["-contigs"] = " ".join([
            "../" + self.assembly.getOutputFile(),
            "$group_start",
            "$group_end",
        ])
        flags["-maxthreads"] = str(self.getThreads())
        flags["-T"] = str(self.vital_parameters.pval)
        flags["-usecolor"] = str(self.usecolor)
        flags["-extend"] = "1"
        flags["-refine"] = "2"
        if self.use_multi_mode:
            flags["-MultiMode"] = ""
        flags["-EndTrim"] = str(self.consensus_end_coverage)
        flags["-LRbias"] = str(self.bias_for_low_likelihood_ratio)
        flags["-Mprobeval"] = str(self.refinement_length_accuracy)
        flags["-deltaX"] = str(self.largest_query_map_interval)
        flags["-deltaY"] = str(self.largest_reference_map_interval)
        flags["-outlier"] = str(self.outlier_pval)
        flags["-endoutlier"] = str(self.end_outlier_prior_probability)
        flags["-contigs_format"] = str(self.contigs_format)
        if self.overwrite_output:
            flags["-force"] = ""
        flags["-FP"] = str(self.vital_parameters.fp)
        flags["-FN"] = str(self.vital_parameters.fn)
        flags["-sd"] = str(self.sd)
        flags["-sf"] = str(self.sf)
        flags["-sr"] = str(self.sr)
        flags["-res"] = str(self.res)
        flags["-o"] = self.output_prefix
        if self.send_output_to_file:
            flags["-stdout"] = ""
        if self.send_errors_to_file:
            flags["-stderr"] = ""
        flags["-XmapStatRead"] = "../" + self.molecule_stats.getOutputFile()

        command = [self.workspace.binaries["bng_assembler"]]
        for flag in flags:
            command.append(flag)
            command.append(flags[flag])

        # Shell loop over the manifest: lines starting with "#" are
        # skipped; the first and last fields become the group bounds.
        code += "let contig_num=0\n"
        code += "while read line\n"
        code += "do\n"
        code += " if [[ $line == \"#\"* ]]; then continue; fi\n"
        code += " let contig_num+=1\n"
        code += " group_start=`echo $line | awk '{print $1}'`\n"
        code += " group_end=`echo $line | awk '{print $NF}'`\n"
        code += " " + " ".join(command) + "\n"
        code += "done < ../" + self.group_manifest.getOutputFile()
        return [code]

    def getStepDir(self):
        """Return the step directory name (input dir plus vital parameters)."""
        params = self.vital_parameters
        return "_".join([
            "refineA",
            self.inpt.getStepDir(),
            "fp" + str(params.fp),
            "fn" + str(params.fn),
            "pval" + str(params.pval),
            "minlen" + str(params.min_molecule_len),
            "minsites" + str(params.min_molecule_sites),
        ])

    def autoGeneratePrereqs(self):
        """Instantiate the upstream step objects and store them on self."""
        workspace = self.workspace
        self.inpt = Input(workspace)
        self.sort = Sort(workspace, copy(self.vital_parameters))
        self.molecule_stats = self.sort.getMoleculeStats()
        self.split = Split(workspace, copy(self.vital_parameters))
        self.split_summary = Summarize(workspace, self.split)
        self.pairwise_alignment = PairwiseAlignment(
            workspace, copy(self.vital_parameters))
        self.pairwise_summary = Summarize(workspace, self.pairwise_alignment)
        self.assembly = Assembly(workspace, copy(self.vital_parameters))
        self.assembly_summary = Summarize(workspace, self.assembly)
        self.merge_assembly = Merge(workspace, self.assembly)
        self.group_manifest = GroupManifest(workspace, self.assembly)

    def getPrereq(self):
        """Return the step that must finish first (the group manifest)."""
        return self.group_manifest

    def getMem(self):
        """Return the workspace's medium memory allotment."""
        return self.workspace.resources.getMediumMemory()

    def getTime(self):
        """Return the workspace's large time allotment."""
        return self.workspace.resources.getLargeTime()

    def getThreads(self):
        """Return the workspace's medium thread allotment."""
        return self.workspace.resources.getMediumThreads()
def autoGeneratePrereqs(self):
    """Instantiate the upstream step objects and store them as attributes.

    Sets inpt, sort and molecule_stats; Sort receives its own copy of
    self.vital_parameters.
    """
    workspace = self.workspace

    self.inpt = Input(workspace)

    sorter = Sort(workspace, copy(self.vital_parameters))
    self.sort = sorter
    self.molecule_stats = sorter.getMoleculeStats()
class Split(Step):
    """Step that builds bng_ref_aligner commands splitting the sorted input.

    The block count comes from vital_parameters.blocks when it is set;
    otherwise it is derived by scanning the input file.
    """

    def __init__(self, workspace, vital_parameters):
        """Build prerequisites, settle the block count, size the batches.

        workspace        -- project workspace (work_dir, binaries, resources)
        vital_parameters -- shared parameter object; its 'blocks' attribute
                            is filled in here when it is None
        """
        self.workspace = workspace
        self.vital_parameters = vital_parameters
        self.overwrite_output = True
        self.send_output_to_file = True
        self.send_error_to_file = True

        self.autoGeneratePrereqs()

        if vital_parameters.blocks is not None:
            self.total_job_count = vital_parameters.blocks
        else:
            input_path = self.workspace.work_dir + "/" + self.inpt.getOutputFile()
            with open(input_path) as input_file:
                record_count = 0
                site_total = 0
                for line in input_file:
                    marker = line[0]
                    if marker == "0":
                        # Lines starting with "0" are counted one each.
                        record_count += 1
                    elif marker == "1":
                        # Lines starting with "1" add their field count
                        # minus the leading field.
                        site_total += len(line.split()) - 1
                # One block per 80000 "0" lines or per 1e6 sites,
                # whichever needs more blocks.
                by_records = int(math.ceil(record_count / 80000.0))
                by_sites = int(math.ceil(site_total / 1e6))
                block_count = max(by_records, by_sites)
                self.total_job_count = block_count
                # Stored on vital_parameters so later steps see it too.
                self.vital_parameters.blocks = block_count

        approx_mins_per_job = 5
        self.max_job_count = self.getTime() * (60.0 / approx_mins_per_job) - 3
        if self.max_job_count < 1:
            self.max_job_count = 1

    def writeCode(self):
        """Return shell script chunks, one aligner command per block.

        A chunk is closed once it holds max_job_count commands or the
        last block has been added.
        """
        flags = OrderedDict()
        flags["-i"] = "../" + self.sort.getOutputFile()
        flags["-o"] = "placeholder"  # replaced per block below
        flags["-maxthreads"] = str(self.getThreads())
        flags["-merge"] = ""
        flags["-bnx"] = ""
        if self.overwrite_output:
            flags["-f"] = ""
        if self.send_output_to_file:
            flags["-stdout"] = ""
        if self.send_error_to_file:
            flags["-stderr"] = ""

        scripts = []
        pending = ""
        pending_jobs = 0
        total = self.total_job_count
        for block in xrange(1, total + 1):
            command = [self.workspace.binaries["bng_ref_aligner"]]
            flags["-o"] = "_".join(["split", str(block), "of", str(total)])
            flags["-subsetbin"] = str(block) + " " + str(total)
            for flag, value in flags.items():
                command.extend((flag, value))
            pending += " ".join(command) + "\n"
            pending_jobs += 1

            batch_full = pending_jobs >= self.max_job_count
            if batch_full or block == total:
                script = "cd " + self.workspace.work_dir + "\n"
                script += "mkdir -p " + self.getStepDir() + "\n"
                script += "cd " + self.getStepDir() + "\n"
                script += pending
                script += "pwd\n"
                scripts.append(script)
                pending_jobs = 0
                pending = ""
        return scripts

    def getStepDir(self):
        """Return the step directory name (input dir plus block count)."""
        pieces = [
            "split",
            self.inpt.getStepDir(),
            "blockCount" + str(self.total_job_count),
        ]
        return "_".join(pieces)

    def autoGeneratePrereqs(self):
        """Instantiate the upstream step objects and store them on self."""
        workspace = self.workspace
        self.inpt = Input(workspace)
        sorter = Sort(workspace, copy(self.vital_parameters))
        self.sort = sorter
        self.molecule_stats = sorter.getMoleculeStats()

    def getPrereq(self):
        """Return the step that must finish first (the sort step)."""
        return self.sort

    def getMem(self):
        """Return the workspace's medium memory allotment."""
        resources = self.workspace.resources
        return resources.getMediumMemory()

    def getTime(self):
        """Return the workspace's small time allotment."""
        resources = self.workspace.resources
        return resources.getSmallTime()

    def getThreads(self):
        """Return the workspace's small thread allotment."""
        resources = self.workspace.resources
        return resources.getSmallThreads()

    def getOutputFile(self, block_num):
        """Return the path of the .bnx file written for block block_num."""
        file_name = ("split_" + str(block_num) + "_of_"
                     + str(self.total_job_count) + ".bnx")
        return self.getStepDir() + "/" + file_name

    def getOutputFileExtension(self):
        """Return the extension of the per-block output files ("bnx")."""
        return "bnx"