def blasr_align(self):
    """Write a shell script that blasr-aligns one query chunk (q_sn) against
    one target chunk (t_sn), submit it via run_script(), and block until the
    job-done sentinel file appears.
    """
    q_fofn = self.query_fofn
    target_fa = self.target_fa
    target_sa = self.target_sa          # suffix array for the target fasta
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]      # query chunk serial number
    t_sn = self.parameters["t_sn"]      # target chunk serial number
    out_fn = os.path.join( output_dir, "q%05d_t%05d.m4" % (q_sn, t_sn))
    script_fn = os.path.join( output_dir, "q%05d_t%05d.sh" % (q_sn, t_sn))
    config = self.parameters["config"]
    blasr_opt = config["blasr_opt"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    #blasr_cmd = """blasr %s %s -sa %s -noSplitSubreads -bestn 16 -nCandidates 32 -maxScore -1000 -minMatch 12 -maxLCPLength 15 -nproc 16 -m 4 -out %s""" % (fn(q_fofn), fn(target_fa), fn(target_sa), out_fn)
    # -m 4 selects the tabular m4 output format consumed by the filter step.
    blasr_cmd = """blasr {query} {target} -sa {target_sa} {blasr_opt} -noSplitSubreads -m 4 -out {out_fn}"""
    blasr_cmd = blasr_cmd.format( query=fn(q_fofn), target=fn(target_fa), target_sa=fn(target_sa), blasr_opt = blasr_opt, out_fn=out_fn )
    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write(blasr_cmd+"\n")
        script_file.write("touch %s" % fn(self.job_done))
    job_name = self.URL.split("/")[-1]
    # uuid suffix keeps the scheduler job name unique across re-submissions.
    job_name += str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_dm,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(self.job_done), task=self, job_name=job_name )
def run_falcon_asm_task(self):
    """Generate and submit the final FALCON assembly script: export the
    preads DB, filter overlaps, build the overlap graph, and emit contigs;
    then wait for the falcon_asm_done sentinel.
    """
    wd = self.parameters["wd"]
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]
    pread_dir = self.parameters["pread_dir"]
    script_dir = os.path.join( wd )
    script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" )
    script = []
    script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
    script.append( "cd %s" % pread_dir )
    # Dump the preads Dazzler DB to fasta for the assembler.
    script.append( "DB2Falcon preads")
    script.append( "cd %s" % wd )
    script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
    overlap_filtering_setting = config["overlap_filtering_setting"]
    length_cutoff_pr = config["length_cutoff_pr"]
    script.append( """fc_ovlp_filter.py --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                   (overlap_filtering_setting, length_cutoff_pr) )
    script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
    script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr)
    # Writes the contig fasta outputs from the overlap graph.
    script.append( """fc_graph_to_contig.py""" )
    script.append( """touch %s\n""" % fn(self.falcon_asm_done))
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script))
    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]  # unique per submission
    job_data = {"job_name": job_name,
                "cwd": wd,
                "sge_option": config["sge_option_fc"],
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name )
def query_filter(self):
    """Submit a script that gathers this query chunk's q*_t*.m4 blasr
    outputs into a fofn and filters them with query_m4_filtering.py,
    then wait for the job-done sentinel.
    """
    #print self.parameters
    #print [fn(f) for f in self.inputs.values()]
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]
    script_fn = os.path.join( output_dir, "qf%05d.sh" % q_sn)
    qf_fofn = os.path.join( output_dir, "qf%05d_input.fofn" % (q_sn, ) )
    # Fix: ``config`` was referenced below without ever being assigned in
    # this function (NameError unless an identically-named global happened
    # to exist); fetch it from self.parameters like every sibling task does.
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]
    sge_option_qf = config["sge_option_qf"]
    length_cutoff_pr = config["length_cutoff_pr"]
    bestn = config["bestn"]
    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("""find %s -name "q[0-9]*_t[0-9]*.m4" > %s\n""" % (output_dir, qf_fofn))
        script_file.write("""query_m4_filtering.py %s 1 0 %d %d %s\n""" % (qf_fofn, bestn, length_cutoff_pr, fn(self.qf_out) ))
        script_file.write("""touch %s\n""" % fn(self.job_done) )
    job_name = self.URL.split("/")[-1]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_qf,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(self.job_done), task=self, job_name=job_name )
def blasr_align(self):
    """Write a shell script that blasr-aligns one query chunk (q_sn) against
    one target chunk (t_sn), submit it via run_script(), and block until the
    job-done sentinel file appears.
    """
    q_fofn = self.query_fofn
    target_fa = self.target_fa
    target_sa = self.target_sa          # suffix array for the target fasta
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]      # query chunk serial number
    t_sn = self.parameters["t_sn"]      # target chunk serial number
    out_fn = os.path.join( output_dir, "q%05d_t%05d.m4" % (q_sn, t_sn))
    script_fn = os.path.join( output_dir, "q%05d_t%05d.sh" % (q_sn, t_sn))
    config = self.parameters["config"]
    blasr_opt = config["blasr_opt"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    #blasr_cmd = """blasr %s %s -sa %s -noSplitSubreads -bestn 16 -nCandidates 32 -maxScore -1000 -minMatch 12 -maxLCPLength 15 -nproc 16 -m 4 -out %s""" % (fn(q_fofn), fn(target_fa), fn(target_sa), out_fn)
    blasr_cmd = """blasr {query} {target} -sa {target_sa} {blasr_opt} -noSplitSubreads -m 4 -out {out_fn}"""
    blasr_cmd = blasr_cmd.format( query=fn(q_fofn), target=fn(target_fa), target_sa=fn(target_sa), blasr_opt = blasr_opt, out_fn=out_fn )
    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write(blasr_cmd+"\n")
        script_file.write("touch %s" % fn(self.job_done))
    # NOTE(review): unlike the uuid-suffixed variant of this task, the job
    # name here is just the URL tail - presumably unique per (q_sn, t_sn)
    # task; confirm no scheduler-name collisions on re-runs.
    job_name = self.URL.split("/")[-1]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_dm,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(self.job_done), task=self, job_name=job_name )
def run_daligner(self):
    """Wrap one daligner command in a logging shell script, submit it, and
    wait for the job-done sentinel.  The script also symlinks each block's
    .las outputs into the corresponding ../m_XXXXX merge dir.
    """
    daligner_cmd = self.parameters["daligner_cmd"]
    job_uid = self.parameters["job_uid"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]   # NOTE(review): unused here
    db_prefix = self.parameters["db_prefix"]
    nblock = self.parameters["nblock"]
    script_dir = os.path.join( cwd )
    script_fn = os.path.join( script_dir , "rj_%s.sh" % (job_uid))
    log_path = os.path.join( script_dir, "rj_%s.log" % (job_uid))
    script = []
    script.append( "cd %s" % cwd )
    script.append( "hostname >> %s" % log_path )
    script.append( "date >> %s" % log_path )
    # Jason's time path does not work on Centos (where time has no path!?!)
    # this code is also rather fugly - the time output is not logged - encapsulate in brackets
    script.append( "(time "+ daligner_cmd + ") >> %s 2>&1 " % log_path )
    # NOTE(review): the sentinel is touched BEFORE the symlink loop below
    # runs inside the generated script - confirm downstream tasks only
    # depend on the sentinel after this whole script exits.
    script.append( "touch %s" % fn( self.job_done ) )
    for p_id in xrange( 1, nblock+1 ):
        script.append( """ for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """ % (db_prefix, p_id, db_prefix, p_id) )
    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))
    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]   # unique scheduler job name
    # NOTE(review): job_data carries no "sge_option", unlike sibling tasks -
    # confirm run_script() supplies a default for this job_type.
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn( self.job_done ), task=self, job_name=job_name )
def task_report_pre_assembly(self):
    """Compute pre-assembly statistics and write them as a JSON report.

    Reads the raw-reads Dazzler DB and the preads fofn, resolves the
    effective seed-length cutoff via support.get_length_cutoff() (the
    auto-calculated value wins when the configured cutoff is negative),
    and dumps the result of stats_preassembly.calc_dict() as
    pretty-printed JSON to the pre_assembly_report file.
    """
    # TODO(CD): Bashify this, in case it is slow.
    i_raw_reads_db_fn = fn(self.raw_reads_db)
    i_preads_fofn_fn = fn(self.preads_fofn)
    i_length_cutoff_fn = fn(self.length_cutoff_fn)
    o_json_fn = fn(self.pre_assembly_report)
    cfg = self.parameters
    genome_length = int(cfg.get('genome_size', 0))  # different name in falcon
    length_cutoff = int(cfg['length_cutoff'])
    length_cutoff = support.get_length_cutoff(length_cutoff, i_length_cutoff_fn)
    kwds = {
        'i_raw_reads_db_fn': i_raw_reads_db_fn,
        'i_preads_fofn_fn': i_preads_fofn_fn,
        'genome_length': genome_length,
        'length_cutoff': length_cutoff,
    }
    fc_run_logger.info('Report inputs: {}'.format(repr(kwds)))
    report_dict = stats_preassembly.calc_dict(**kwds)
    content = json.dumps(report_dict, sort_keys=True, indent=4,
                         separators=(',', ': '))
    fc_run_logger.info('Report stats:\n{}'.format(content))
    # Fix: use a context manager so the report file is flushed and closed
    # deterministically instead of whenever the anonymous handle is GC'd.
    with open(o_json_fn, 'w') as out:
        out.write(content)
def query_filter(self):
    """Submit a script that gathers this query chunk's q*_t*.m4 blasr
    outputs into a fofn and filters them with query_m4_filtering.py,
    then wait for the job-done sentinel.
    """
    #print self.parameters
    #print [fn(f) for f in self.inputs.values()]
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]
    script_fn = os.path.join(output_dir, "qf%05d.sh" % q_sn)
    qf_fofn = os.path.join(output_dir, "qf%05d_input.fofn" % (q_sn, ))
    # Fix: ``config`` was used without being defined in this scope
    # (NameError unless an identically-named global existed); fetch it
    # from self.parameters, consistent with the other tasks.
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]
    sge_option_qf = config["sge_option_qf"]
    length_cutoff_pr = config["length_cutoff_pr"]
    bestn = config["bestn"]
    with open(script_fn, "w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(
            install_prefix=install_prefix))
        script_file.write("""find %s -name "q[0-9]*_t[0-9]*.m4" > %s\n""" %
                          (output_dir, qf_fofn))
        script_file.write("""query_m4_filtering.py %s 1 0 %d %d %s\n""" %
                          (qf_fofn, bestn, length_cutoff_pr, fn(self.qf_out)))
        script_file.write("""touch %s\n""" % fn(self.job_done))
    job_name = self.URL.split("/")[-1]
    job_name += str(uuid.uuid1())[:8]  # keep scheduler job names unique
    job_data = {
        "job_name": job_name,
        "cwd": os.getcwd(),
        "sge_option": sge_option_qf,
        "script_fn": script_fn
    }
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
def quiver_reseq(self): config = self.config sge_option_ck = config["sge_option_ck"] sge_option_qv = config["sge_option_qv"] big_tmpdir = config["big_tmpdir"] try: os.makedirs("quiver_reseq") except: pass SEYMOUR_HOME = config["SEYMOUR_HOME"] if SEYMOUR_HOME == None: print "SEYMOUR_HOME not set, bypass quiver consensus step" return 0 job_name = "QuiverReq_"+str(uuid.uuid4()) quiver_script = """#!/bin/bash export SEYMOUR_HOME=%s . $SEYMOUR_HOME/etc/setup.sh cd %s/quiver_reseq cp ../CA/9-terminator/asm.ctg.fasta . referenceUploader -c -p $PWD -n assembly -f asm.ctg.fasta --skipIndexUpdate compareSequences.py --info --useGuidedAlign --algorithm=blasr --nproc=24 --noXML --h5mode=w --h5fn=out.cmp.h5 --minAccuracy=0.70 --minLength=200 -x -nCandidates 50 -x -minMatch 12 -x -bestn 1 -x -minPctIdentity 70.0 %s assembly/ loadPulses %s out.cmp.h5 -metrics DeletionQV,IPD,InsertionQV,PulseWidth,QualityValue,MergeQV,SubstitutionQV,DeletionTag -byread cmph5tools.py sort out.cmp.h5 --tmp %s variantCaller.py --algorithm quiver -j 16 --referenceFilename assembly/sequence/assembly.fasta --parameters best -o output.gff -o output.fasta -o output.fastq -q 0 -X 80 -x 5 --mapQvThreshold 0 out.cmp.h5 """ % (SYMOURE_HOME, os.getcwd(), fn(self.input_fofn), fn(self.input_fofn), big_tmpdir) with open("scripts/quiver_reseq.sh", "w") as f: print >>f, quiver_script os.system( """qsub -sync y {sge_option_qv} -N {jn} -o {cwd}/sge_log -j y -S /bin/bash scripts/quiver_reseq.sh """.format(jn=job_name, cwd=os.getcwd(), sge_option_qv = sge_option_qv) ) with open("scripts/quiver_done.sh","w") as f: print >>f, "echo done > %s" % fn(self.Quiver_done) os.system("bash scripts/quiver_done.sh")
def run_falcon_asm_task(self):
    """Generate and submit the final FALCON assembly script (with shell
    tracing and an EXIT trap writing <done>.exit), then wait for the
    falcon_asm_done sentinel.
    """
    wd = self.parameters["wd"]
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]
    pread_dir = self.parameters["pread_dir"]
    # Fix: ``db_file`` was referenced below via fn(db_file) without ever
    # being bound in this function (NameError at runtime); bind it from
    # the task object, as task_run_falcon_asm does.
    db_file = self.db_file
    script_dir = os.path.join( wd )
    script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" )
    script = []
    script.append( "set -vex" )
    # Guarantee an .exit marker even on failure.
    script.append( "trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done) )
    script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
    script.append( "cd %s" % pread_dir )
    # Write preads4falcon.fasta, in 1-preads_ovl:
    script.append( "DB2Falcon -U preads")
    script.append( "cd %s" % wd )
    script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
    overlap_filtering_setting = config["overlap_filtering_setting"]
    length_cutoff_pr = config["length_cutoff_pr"]
    script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                   (fn(db_file), overlap_filtering_setting, length_cutoff_pr) )
    script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
    script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr)  # TODO: drop this logfile
    # Write 'p_ctg.fa' and 'a_ctg.fa':
    script.append( """fc_graph_to_contig.py""" )
    script.append( """touch %s""" % fn(self.falcon_asm_done))
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script))
    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_fc"]
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_data['job_name'])
def gather_qm4(self):
    """Collect the .m4 mapping outputs from all input tasks and write
    their paths, sorted, one per line, into the qm4 fofn."""
    m4_paths = sorted(fn(inp) for inp in self.inputs.values())
    with open(fn(self.qm4_fofn), "w") as fofn:
        for path in m4_paths:
            if path.endswith("m4"):
                fofn.write(path + "\n")
def run_merge_task(self):
    """Wrap a pre-generated merge script ("merge_script") in a logging
    shell wrapper, submit it, and wait for the job-done sentinel."""
    p_script_fn = self.parameters["merge_script"]
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]   # NOTE(review): unused here
    script_dir = os.path.join( cwd )
    script_fn = os.path.join( script_dir , "rp_%05d.sh" % (job_id))
    log_path = os.path.join( script_dir, "rp_%05d.log" % (job_id))
    script = []
    script.append( "cd %s" % cwd )
    script.append( "hostname >> %s" % log_path )
    script.append( "date >> %s" % log_path )
    # Jason's time path does not work on Centos (where time has no path!?!)
    # this code is also rather fugly - the time output is not logged - encapsulate in brackets
    script.append( "(time bash %s) >> %s 2>&1 " % (p_script_fn, log_path) )
    script.append( "touch %s" % fn( self.job_done ) )
    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))
    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]   # unique scheduler job name
    # NOTE(review): job_data carries no "sge_option", unlike sibling tasks -
    # confirm run_script() supplies a default for this job_type.
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn( self.job_done ), task=self, job_name=job_name )
def check_r_cns_task(self):
    """Write the fofn of consensus pread fasta files (out*.fa under
    <rawread_dir>/preads), sorted, then touch the cns_done sentinel.
    NOTE(review): rawread_dir is not bound in this function - presumably a
    module-level global; confirm."""
    fasta_paths = sorted(glob.glob("%s/preads/out*.fa" % rawread_dir))
    with open(fn(self.pread_fofn), "w") as fofn:
        for path in fasta_paths:
            fofn.write(path + "\n")
    os.system("touch %s" % fn(self.cns_done))
def task_build_pdb(self):
    """Build the preads Dazzler DB via support.build_pdb() and wait for the
    submitted job.  (Essentially the same as build_rdb(), but the subtle
    differences are tricky to consolidate into one function.)"""
    job_done = fn(self.pdb_build_done)
    preads_db = fn(self.preads_db)
    run_jobs_fn = fn(self.run_jobs)
    # Start from a clean slate so a re-run regenerates everything.
    remove(job_done, preads_db, run_jobs_fn)
    config = self.parameters["config"]
    work_dir = self.parameters["work_dir"]
    script_fn = os.path.join(work_dir, "prepare_pdb.sh")
    support.build_pdb(
        input_fofn_fn=fn(self.pread_fofn),
        config=config,
        job_done=job_done,
        script_fn=script_fn,
        run_jobs_fn=run_jobs_fn,
    )
    run_script_and_wait_and_rm_exit(self.URL, script_fn, job_done, self,
                                    job_type=config['job_type'],
                                    sge_option=config['sge_option_pda'])
def task_run_falcon_asm(self): wd = self.parameters["wd"] #self.db2falcon_done db_file = fn(self.db_file) job_done = fn(self.falcon_asm_done) config = self.parameters["config"] pread_dir = self.parameters["pread_dir"] script_dir = os.path.join(wd) script_fn = os.path.join(script_dir, "run_falcon_asm.sh") # Generate las.fofn in run-dir. system('cd {}; find {}/m_*/ -name "*.las" >| las.fofn'.format( wd, pread_dir)) las_fofn_fn = 'las.fofn' args = { 'las_fofn_fn': las_fofn_fn, 'preads4falcon_fasta_fn': os.path.join(pread_dir, 'preads4falcon.fasta'), 'db_file_fn': db_file, 'config': config, 'job_done': job_done, 'script_fn': script_fn, } support.run_falcon_asm(**args) run_script_and_wait_and_rm_exit(self.URL, script_fn, job_done, self, job_type=config['job_type'], sge_option=config['sge_option_fc'])
def build_rdb(self):
    """Write and submit prepare_db.sh: import the raw-read fasta files into
    a Dazzler DB ("raw_reads"), split it into blocks, and emit run_jobs.sh
    with the HPCdaligner command lines; then wait for the done sentinel."""
    input_fofn = self.input_fofn
    input_fofn_fn = fn(input_fofn)
    rdb_build_done = self.rdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff"]   # passed to HPCdaligner as -H
    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
    pa_DBsplit_option = config["pa_DBsplit_option"]
    script_fn = os.path.join( work_dir, "prepare_db.sh" )
    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done\n".format(input_fofn_fn = input_fofn_fn))
        script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
        script_file.write("HPCdaligner %s -H%d raw_reads > run_jobs.sh\n" % (pa_HPCdaligner_option, length_cutoff))
        script_file.write("touch {rdb_build_done}\n".format(rdb_build_done = fn(rdb_build_done)))
    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid1())[:8]  # unique per submission
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_da,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(rdb_build_done), task=self, job_name=job_name )
def run_merge_task(self):
    """Submit a wrapper script that runs a pre-generated merge script under
    /usr/bin/time and touches the job-done sentinel only on success, then
    block until the sentinel appears."""
    merge_script_fn = self.parameters["merge_script"]
    block_id = self.parameters["job_id"]
    run_dir = self.parameters["cwd"]
    config = self.parameters["config"]
    done_fn = fn(self.job_done)
    wrapper_fn = os.path.join(run_dir, "rp_%05d.sh" % block_id)
    log_fn = os.path.join(run_dir, "rp_%05d.log" % block_id)
    lines = [
        "source %s/bin/activate\n" % config["install_prefix"],
        "cd %s" % run_dir,
        "hostname >> %s" % log_fn,
        "date >> %s" % log_fn,
        # NB: the double space after the script name is kept from the
        # original string concatenation.
        "/usr/bin/time bash %s  >> %s 2>&1 && touch %s"
        % (merge_script_fn, log_fn, done_fn),
    ]
    with open(wrapper_fn, "w") as out:
        out.write("\n".join(lines))
    job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid1())[:8])
    job_data = {
        "job_name": job_name,
        "cwd": run_dir,
        "sge_option": config["sge_option_la"],
        "script_fn": wrapper_fn
    }
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_name)
def task_report_pre_assembly(self):
    """Prepare the pre-assembly stats report job: resolve the effective
    length cutoff, then delegate script generation to
    support.run_report_pre_assembly(); the framework later runs
    self.generated_script_fn."""
    i_raw_reads_db_fn = fn(self.raw_reads_db)
    i_preads_fofn_fn = fn(self.preads_fofn)
    i_length_cutoff_fn = fn(self.length_cutoff_fn)
    o_json_fn = fn(self.pre_assembly_report)
    cfg = self.parameters
    genome_length = int(cfg.get('genome_size', 0)) # different name in falcon
    length_cutoff = int(cfg['length_cutoff'])
    # Update length_cutoff if auto-calc (when length_cutoff is negative).
    # i_length_cutoff_fn was created long ago, so no filesystem issues.
    length_cutoff = support.get_length_cutoff(length_cutoff, i_length_cutoff_fn)
    cwd = self.parameters['cwd']
    mkdir(cwd)
    script_fn = os.path.join(cwd , 'run_report_pre_assembly.sh')
    job_done = os.path.join(cwd, 'report_pa_done')
    kwds = {
        'i_raw_reads_db_fn': i_raw_reads_db_fn,
        'i_preads_fofn_fn': i_preads_fofn_fn,
        'genome_length': genome_length,
        'length_cutoff': length_cutoff,
        'o_json_fn': o_json_fn,
        'job_done': job_done,
        'script_fn': script_fn,
    }
    fc_run_logger.info('Report inputs: {}'.format(repr(kwds)))
    support.run_report_pre_assembly(**kwds)
    self.generated_script_fn = script_fn
def task_daligner_gather(self):
    """Gather all daligner .las outputs: create the per-block merge dirs
    (m_XXXXX), group the .las paths by merge dir, hand them to
    only_these_symlinks() for (re-)linking, and touch the da_done
    sentinel."""
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))
    # Create m_* dirs.
    for block in xrange(1, nblock + 1):
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        mkdir(mdir)
        # TODO: Remove existing symlinks?
    job_rundirs = [
        os.path.dirname(fn(dal_done)) for dal_done in out_dict.values()
    ]
    # Symlink all daligner *.las.
    links = collections.defaultdict(list)
    for block, las_path in support.daligner_gather_las(job_rundirs):
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        #las_path = os.path.relpath(las_path, mdir)
        links[mdir].append(las_path)
    only_these_symlinks(links)
    system("touch %s" % da_done)
def task_build_rdb(self):
    """Build the raw-reads Dazzler DB via support.build_rdb() and wait for
    the submitted job to finish."""
    job_done = fn(self.rdb_build_done)
    raw_reads_db = fn(self.raw_reads_db)
    run_jobs_fn = fn(self.run_jobs)
    # Start from a clean slate so a re-run regenerates everything.
    remove(job_done, raw_reads_db, run_jobs_fn)
    config = self.parameters["config"]
    work_dir = self.parameters["work_dir"]
    script_fn = os.path.join(work_dir, "prepare_rdb.sh")
    support.build_rdb(
        input_fofn_fn=fn(self.input_fofn),
        config=config,
        job_done=job_done,
        script_fn=script_fn,
        run_jobs_fn=run_jobs_fn,
    )
    run_script_and_wait_and_rm_exit(self.URL, script_fn, job_done, self,
                                    job_type=config['job_type'],
                                    sge_option=config['sge_option_da'])
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    """Scan <wd>/run_jobs.sh for daligner command lines and create one
    PypeTask per command.

    Each task gets its own job_<uid> dir (uid = md5 of the command line)
    with the DB files symlinked in.  When pread_aln is true, the command
    is rewritten to use daligner_p.

    Returns (tasks, tasks_out): the list of PypeTask objects and a dict
    mapping "ajob_<uid>" -> job_done file object.

    Fixes: blank lines in the .db file and in run_jobs.sh caused
    IndexError on l[0]; `pread_aln == True` replaced with a truth test;
    dead locals (new_db, job_id) removed.
    """
    tasks = []
    tasks_out = {}
    # Read the block count from the Dazzler .db file (defaults to 1 block).
    nblock = 1
    if os.path.exists(fn(db_file)):
        with open(fn(db_file)) as f:
            for l in f:
                l = l.strip().split()
                # Guard short/empty lines to avoid IndexError.
                if len(l) >= 3 and l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    break
    # One merge directory per block, by convention.
    for pid in xrange(1, nblock + 1):
        support.make_dirs("%s/m_%05d" % (wd, pid))
    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            # Stable 8-char uid derived from the command line itself.
            job_uid = hashlib.md5(l).hexdigest()[:8]
            l = l.split()
            # Skip blank/non-daligner lines (l[0] raised IndexError on blanks).
            if not l or l[0] != "daligner":
                continue
            support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
            call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
                wd, job_uid, db_prefix, db_prefix, db_prefix,
            )
            rc = os.system(call)
            if rc:
                raise Exception("Failure in system call: %r -> %d" % (call, rc))
            job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
            if pread_aln:
                l[0] = "daligner_p"
            parameters = {
                "daligner_cmd": " ".join(l),
                "cwd": os.path.join(wd, "job_%s" % job_uid),
                "job_uid": job_uid,
                "config": config,
                "nblock": nblock,
                "db_prefix": db_prefix,
            }
            make_daligner_task = PypeTask(
                inputs={"rdb_build_done": rdb_build_done},
                outputs={"job_done": job_done},
                parameters=parameters,
                TaskType=PypeThreadTaskBase,
                URL="task://localhost/d_%s_%s" % (job_uid, db_prefix),
            )
            tasks.append(make_daligner_task(run_daligner))
            tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
def run_daligner(self):
    """Wrap one daligner command in a shell script and submit it to SGE,
    waiting for the job-done sentinel.

    NOTE(review): the PATH hack, the " -pe smp 6 -q huasm " option string
    and job_type="SGE" are hard-coded here, unlike the config-driven
    variants of this task - presumably an early/experimental version;
    confirm before reuse."""
    daligner_cmd = self.parameters["daligner_cmd"]
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    script_dir = os.path.join(cwd)
    script_fn = os.path.join(script_dir, "rj_%05d.sh" % (job_id))
    log_path = os.path.join(script_dir, "rj_%05d.log" % (job_id))
    script = []
    script.append("export PATH=~/task2014/dazzler/DALIGNER/:$PATH")
    script.append("cd %s" % cwd)
    # The sentinel is touched only if daligner exits 0.
    script.append("/usr/bin/time " + daligner_cmd + (" >& %s " % log_path) +
                  (" && touch %s" % fn(self.job_done)))
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script))
    job_name = self.URL.split("/")[-1]
    job_name += "-" + str(uuid.uuid1())[:8]  # unique per submission
    job_data = {
        "job_name": job_name,
        "cwd": cwd,
        "sge_option": " -pe smp 6 -q huasm ",
        "script_fn": script_fn
    }
    run_script(job_data, job_type="SGE")
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
def check_p_merge_check_task(self):
    """Mark the pread-merge stage as complete: touch the p_merge_done
    sentinel and install a no-op script for the framework to run via
    self.generated_script_fn.
    """
    wdir = os.path.dirname(fn(self.p_merge_done))
    mkdir(wdir)
    system("touch %s" % fn(self.p_merge_done))
    script_fn = os.path.join(wdir, 'noop.sh')
    # Fix: write through a context manager so the handle is flushed and
    # closed deterministically instead of whenever it is GC'd.
    with open(script_fn, 'w') as script_file:
        script_file.write('echo NOOP raw')
    self.generated_script_fn = script_fn
def task_run_consensus(self):
    """Prepare the consensus-calling script for one merged .las block via
    support.run_consensus(); the framework later runs
    self.generated_script_fn."""
    merge_job_done = fn(self.job_done)
    out_file_fn = fn(self.out_file)
    out_done = fn(self.out_done)
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    prefix = self.parameters["prefix"]
    script_dir = os.path.join(cwd)
    script_fn = os.path.join(script_dir, "c_%05d.sh" % (job_id))
    db_fn = os.path.abspath(
        '{cwd}/../../{prefix}'.format(**locals()))  # ASSUMING 2-levels deep
    merge_job_dir = os.path.dirname(merge_job_done)
    # by convention, we assume the name of the .las file
    las_fn = os.path.abspath(
        '{merge_job_dir}/{prefix}.{job_id}.las'.format(**locals()))
    args = {
        'db_fn': db_fn,
        'las_fn': las_fn,
        'out_file_fn': out_file_fn,
        'config': config,
        'job_done': out_done,
        'script_fn': script_fn,
    }
    support.run_consensus(**args)
    self.generated_script_fn = script_fn
def run_merge_task(self):
    """Wrap a pre-generated merge script in a traced shell wrapper
    (set -vex, with an EXIT trap writing <job_done>.exit), submit it, and
    wait for the job_done sentinel."""
    p_script_fn = self.parameters["merge_script"]
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    job_done = self.job_done
    config = self.parameters["config"]
    sge_option_la = config["sge_option_la"]
    install_prefix = config["install_prefix"]
    script_dir = os.path.join( cwd )
    script_fn = os.path.join( script_dir , "rp_%05d.sh" % (job_id))
    script = []
    script.append( "set -vex" )
    # The trap guarantees an .exit marker even on failure.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = fn(job_done)) )
    script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
    script.append( "cd %s" % cwd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "time bash %s" % p_script_fn )
    script.append( "touch {job_done}".format(job_done = fn(job_done)) )
    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))
    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_option_la
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(job_done), task=self, job_name=job_data['job_name'])
def task_run_las_merge(self):
    """Symlink this block's .las inputs into the merge cwd and generate the
    merge script via support.run_las_merge(); the framework later runs
    self.generated_script_fn."""
    gathered_las_fn = fn(self.gathered_las)
    script = self.parameters["merge_script"]
    job_id = self.parameters["job_id"]  # aka "block"
    cwd = self.parameters["cwd"]
    mkdir(cwd)
    gathered_dict = read_gathered_las(gathered_las_fn)
    las_paths = gathered_dict[job_id]
    for las_path in las_paths:
        src = os.path.relpath(las_path, cwd)
        tgt = os.path.join(cwd, os.path.basename(las_path))
        fc_run_logger.debug('symlink {!r} -> {!r}'.format(src, tgt))
        # NOTE(review): os.symlink raises OSError if tgt already exists
        # (e.g. on a re-run) - confirm the framework cleans cwd first.
        os.symlink(src, tgt)
    job_done = fn(self.job_done)
    config = self.parameters["config"]
    script_dir = os.path.join(cwd)
    script_fn = os.path.join(script_dir, "rp_%05d.sh" % (job_id))
    args = {
        'script': script,
        'config': config,
        'job_done': job_done,
        'script_fn': script_fn,
    }
    support.run_las_merge(**args)
    self.generated_script_fn = script_fn
def run_falcon_asm_task(self):
    """Generate and submit the final FALCON assembly script, then wait for
    the falcon_asm_done sentinel.

    Fix: the job was submitted with a hard-coded job_type="SGE" while
    every sibling task uses config["job_type"]; made consistent so
    non-SGE schedulers work.
    """
    wd = self.parameters["wd"]
    config = self.parameters["config"]
    install_prefix = config["install_prefix"]
    pread_dir = self.parameters["pread_dir"]
    script_dir = os.path.join( wd )
    script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" )
    script = []
    script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
    script.append( "cd %s" % pread_dir )
    # Dump the preads Dazzler DB to fasta for the assembler.
    script.append( "DB2Falcon preads")
    script.append( "cd %s" % wd )
    script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
    overlap_filtering_setting = config["overlap_filtering_setting"]
    length_cutoff_pr = config["length_cutoff_pr"]
    script.append( """fc_ovlp_filter.py --fofn las.fofn %s \
--n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr) )
    script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
    script.append( """fc_ovlp_to_graph.py preads.ovl > fc.log""" )
    script.append( """fc_graph_to_contig.py""" )
    script.append( """touch %s\n""" % fn(self.falcon_asm_done))
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script))
    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid1())[:8]  # unique per submission
    job_data = {"job_name": job_name,
                "cwd": wd,
                "sge_option": config["sge_option_fc"],
                "script_fn": script_fn }
    # Fix: was run_script(job_data, job_type = "SGE")
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name )
def check_r_cns_task(self):
    """Write the fofn of consensus pread fasta files (out*.fasta under
    <rawread_dir>/preads), sorted, then touch the cns_done sentinel.
    NOTE(review): rawread_dir is not bound in this function - presumably a
    module-level global; confirm."""
    fasta_files = sorted(glob.glob("%s/preads/out*.fasta" % rawread_dir))
    with open(fn(self.pread_fofn), "w") as fofn:
        fofn.writelines(p + "\n" for p in fasta_files)
    system("touch %s" % fn(self.cns_done))
def gather_qm4(self):
    """Collect the .m4 mapping outputs from all input tasks and write
    their paths, sorted, one per line, into the qm4 fofn."""
    paths = [fn(obj) for obj in self.inputs.values()]
    paths.sort()
    lines = [p + "\n" for p in paths if p.endswith("m4")]
    with open(fn(self.qm4_fofn), "w") as out:
        out.writelines(lines)
def task_daligner_gather(self):
    """Gather daligner .las outputs: create the per-block merge dirs and
    symlink each .las (by relative path) into the m_* dir of its left
    block, then touch the da_done sentinel."""
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))
    # Create m_* dirs.
    for block in xrange(1, nblock + 1):
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        mkdir(mdir)
        # TODO: Remove existing symlinks?
    # Symlink all daligner *.las.
    # Could be L1.* or preads.*
    # Group 1 captures the first block number; the optional second
    # ".<digits>" group matches the two-block filename form.
    re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')
    for dal_done in out_dict.values():
        job_rundir = os.path.dirname(fn(dal_done))
        for las_fn in os.listdir(job_rundir):
            mo = re_las.search(las_fn)
            if not mo:
                continue
            block = int(
                mo.group(1))  # We will merge in the m_* dir of the left block.
            mdir = os.path.join(
                main_dir,
                'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
            las_path = os.path.join('..', os.path.basename(job_rundir), las_fn)
            cmd = 'ln -sf {} {}'.format(las_path, mdir)
            system(cmd)
    system("touch %s" % da_done)
def run_daligner(self):
    """Run one daligner command through a traced shell script (set -vex,
    EXIT trap writing <job_done>.exit), link each block's .las outputs into
    ../m_XXXXX, touch the job_done sentinel, and wait for it."""
    daligner_cmd = self.parameters["daligner_cmd"]
    job_uid = self.parameters["job_uid"]
    cwd = self.parameters["cwd"]
    job_done = self.job_done
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    db_prefix = self.parameters["db_prefix"]
    nblock = self.parameters["nblock"]
    script_dir = os.path.join( cwd )
    script_fn = os.path.join( script_dir , "rj_%s.sh" % (job_uid))
    script = []
    script.append( "set -vex" )
    # The trap guarantees an .exit marker even on failure.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = fn(job_done)) )
    script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
    script.append( "cd %s" % cwd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "time "+ daligner_cmd )
    # Link each block's .las outputs into that block's merge dir.
    for p_id in xrange( 1, nblock+1 ):
        script.append( """ for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """ % (db_prefix, p_id, db_prefix, p_id) )
    script.append( "touch {job_done}".format(job_done = fn(job_done)) )
    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))
    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_option_da
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(job_done), task=self, job_name=job_data['job_name'])
def task_daligner_gather(self):
    """Create the per-block m_XXXXX merge dirs and symlink every daligner
    .las output (by relative path) into the merge dir of its block, then
    touch the da_done sentinel."""
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))
    # One merge dir per block, by convention (pbsmrtpipe works differently).
    for blk in xrange(1, nblock + 1):
        mkdir(os.path.join(main_dir, 'm_%05d' % blk))
    rundirs = [os.path.dirname(fn(done)) for done in out_dict.values()]
    # Symlink all daligner *.las into their block's merge dir.
    for blk, las_abspath in support.daligner_gather_las(rundirs):
        merge_dir = os.path.join(main_dir, 'm_%05d' % blk)
        rel_las = os.path.relpath(las_abspath, merge_dir)
        system('ln -sf %s %s' % (rel_las, merge_dir))
    system("touch %s" % da_done)
def build_pdb(self):
    """Write and submit prepare_pdb.sh: import the pread fasta files into a
    Dazzler DB ("preads"), split it, and emit run_jobs.sh via HPCdaligner;
    then wait for the done sentinel.  The script traps EXIT to write
    <done>.exit even on failure."""
    input_fofn = self.pread_fofn
    input_fofn_fn = fn(input_fofn)
    pdb_build_done = self.pdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_pda = config["sge_option_pda"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff_pr"]  # used for both -x and -H
    ovlp_HPCdaligner_option = config["ovlp_HPCdaligner_option"]
    ovlp_DBsplit_option = config["ovlp_DBsplit_option"]
    script_fn = os.path.join( work_dir, "prepare_pdb.sh" )
    with open(script_fn,"w") as script_file:
        script_file.write("set -vex\n")
        script_file.write("trap 'touch {pdb_build_done}.exit' EXIT\n".format(pdb_build_done = fn(pdb_build_done)))
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("hostname\n")
        script_file.write("date\n")
        script_file.write("fasta2DB -v preads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
        script_file.write("DBsplit -x%d %s preads\n" % (length_cutoff, ovlp_DBsplit_option))
        script_file.write("HPCdaligner %s -H%d preads > run_jobs.sh\n" % (ovlp_HPCdaligner_option, length_cutoff))
        script_file.write("touch {pdb_build_done}\n".format(pdb_build_done = fn(pdb_build_done)))
    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_option_pda
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(pdb_build_done), task=self, job_name=job_data['job_name'])
def dist_map(self):
    """Distribute blasr mapping of the seed reads across SGE jobs.

    Splits the seed fasta into dist_map_num_chunk chunk files (reads are
    assigned to chunks by hash of their name), submits one blasr qsub job
    per chunk, and submits a final synchronous "mapping_done" job held on
    all of them that writes the m4_data_done sentinel.
    """
    config = self.config
    dist_map_num_chunk = config["dist_map_num_chunk"]
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    blasr_opt = config["blasr_opt"]
    #set_up_script = "fastasplit %s %s/ -c %d" % (fn(self.seed_fasta), directory_for_dist_map, dist_map_num_chunk)
    #os.system(set_up_script)
    fasta_file = pbcore.io.FastaReader(fn(self.seed_fasta))
    out_files = []
    for i in range(dist_map_num_chunk):
        out_files.append( open( "%s/%s_chunk_%07d" % (directory_for_dist_map, os.path.basename(fn( self.seed_fasta)), i), "w"))
    # Hash-partition reads into the chunk files.
    for s in fasta_file:
        g = hash(s.name) % dist_map_num_chunk
        out_file = out_files[g]
        out_file.write(">%s\n" % s.name)
        out_file.write("%s\n" % s.sequence)
    for i in range(dist_map_num_chunk):
        out_files[i].close()
    fasta_file.file.close()
    # .format fills the {install_prefix}/{blasr_opt} fields now; the %s/%01d
    # placeholders are filled per-chunk below.
    align_script_template = """\
. {install_prefix}/bin/activate
cd %s/%s
blasr {blasr_opt} -m 4 -out m4_%s.dat %s %s
""".format(install_prefix=install_prefix, blasr_opt=blasr_opt)
    job_name = "dist_map_" + str(uuid.uuid4())
    i = 0
    for chunk_name in glob.glob( "%s/%s_chunk_*" % (directory_for_dist_map, os.path.basename(fn(self.seed_fasta)))):
        script = align_script_template % ( os.getcwd(), directory_for_dist_map, os.path.basename(chunk_name), fn(self.normalized_fasta), os.path.basename(chunk_name))
        with open("scripts/dist_map_%02d.sh" % i, "w") as f:
            print >> f, script
        os.system("qsub -N {jn} {sge_option_dm} -o {cwd}/sge_log -j y\
 -S /bin/bash scripts/dist_map_{jid:02d}.sh".format( jn=job_name + "_%02d" % i, cwd=os.getcwd(), sge_option_dm=sge_option_dm, jid=i))
        i += 1
    # Watcher job: -hold_jid on all chunk jobs; -sync y blocks until it runs.
    with open("scripts/mapping_done.sh", "w") as f:
        print >> f, "echo done > %s" % fn(self.m4_data_done)
    os.system( """qsub -sync y {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y\
 -S /bin/bash scripts/mapping_done.sh""".format( jn=job_name, cwd=os.getcwd(), sge_option_ck=sge_option_ck))
def get_phased_reads(self):
    """Assign reads to haplotype phases by majority vote over their variants.

    Reads three inputs: a read-id -> read-name map, a variant map
    (variant columns + read id), and the phased-variant file ("V" lines
    pairing two variants into phase 0/1 of a phase block).  For each read,
    counts how many of its variants fall on each phase of each phase block;
    a read is assigned to a phase only when one phase leads by more than 1
    vote.  Output lines: read_id ctg_id block_id phase count0 count1 name.
    """
    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)
    parameters = self.parameters
    ctg_id = parameters["ctg_id"]
    phased_read_fn = fn(self.phased_read_file)
    # read id (int) -> original read name
    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]
    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            # variant key is "pos_ref_alt" built from the first 3 columns
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)
    # variant key -> (phase_block_id, 0 or 1)
    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[l[3]] = (pb_id, 0)
            variant_to_phase[l[4]] = (pb_id, 1)
    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}      # (block, phase) -> vote count for this read
            pl = set()   # phase blocks this read touches
            for v in list(read_to_variants[r]):
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[p] = vl.get(p, 0) + 1
                    pl.add(p[0])
            pl = list(pl)
            pl.sort()
            # Require a margin of >1 vote before committing to a phase.
            for p in pl:
                if vl.get((p, 0), 0) - vl.get((p, 1), 0) > 1:
                    print >> out_f, r, ctg_id, p, 0, vl.get((p, 0), 0), vl.get(
                        (p, 1), 0), rid_map[r]
                elif vl.get((p, 1), 0) - vl.get((p, 0), 0) > 1:
                    print >> out_f, r, ctg_id, p, 1, vl.get((p, 0), 0), vl.get(
                        (p, 1), 0), rid_map[r]
def h5fofn_to_fasta(self):
    """Dump fasta from the HDF5 fofn, then build sorted target/query fofns.

    Runs the external h5fofn_to_fasta.py converter, collects the produced
    *_t.fa / *_q.fa files into fofn lists, and touches the done sentinel.
    """
    params = self.parameters
    fasta_dir = params["fasta_dir"]
    # Convert the h5 fofn into per-movie fasta files.
    dump_cmd = "h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\
        (fn(self.input_fofn), fasta_dir, params["min_length"], params["min_read_score"])
    os.system(dump_cmd)
    # Sorted fofns of the target and query fasta files for the mapper.
    os.system("""find %s -name "*_t.fa" | sort > %s""" % (fasta_dir, fn(self.target_fa_fofn)))
    os.system("""find %s -name "*_q.fa" | sort > %s""" % (fasta_dir, fn(self.query_fa_fofn)))
    # Completion sentinel for downstream tasks.
    os.system("touch %s" % fn(self.fasta_dump_done))
def get_phased_reads(self):
    """Assign reads to haplotype phases by majority vote over their variants.

    For each read, counts how many of its variants fall on phase 0 vs
    phase 1 of each phase block (from the phased-variant "V" lines); the
    read is assigned to a phase only when one side leads by more than one
    vote.  Output: read_id ctg_id block_id phase count0 count1 read_name.

    Bug fix: `parameters` was read without ever being assigned from
    self.parameters, so this function raised NameError on every call.
    """
    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)
    # BUG FIX: was `ctg_id = parameters["ctg_id"]` with `parameters` undefined.
    ctg_id = self.parameters["ctg_id"]
    phased_read_fn = fn(self.phased_read_file)

    # read id (int) -> original read name
    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]

    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            # variant key is "pos_ref_alt" built from the first 3 columns
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)

    # variant key -> (phase_block_id, 0 or 1)
    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            # line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[l[3]] = (pb_id, 0)
            variant_to_phase[l[4]] = (pb_id, 1)

    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}      # (block, phase) -> vote count for this read
            pl = set()   # phase blocks this read touches
            for v in read_to_variants[r]:
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[p] = vl.get(p, 0) + 1
                    pl.add(p[0])
            # Require a margin of >1 vote before committing to a phase.
            for p in sorted(pl):
                if vl.get((p, 0), 0) - vl.get((p, 1), 0) > 1:
                    fields = (r, ctg_id, p, 0, vl.get((p, 0), 0), vl.get((p, 1), 0), rid_map[r])
                    out_f.write(" ".join(str(x) for x in fields) + "\n")
                elif vl.get((p, 1), 0) - vl.get((p, 0), 0) > 1:
                    fields = (r, ctg_id, p, 1, vl.get((p, 0), 0), vl.get((p, 1), 0), rid_map[r])
                    out_f.write(" ".join(str(x) for x in fields) + "\n")
def split_fofn_task(self):
    """Split the query and target fofns into chunks for distributed mapping."""
    cfg = self.parameters["config"]
    dist_map_dir = self.parameters["dist_map_dir"]
    # (fofn data object, name prefix, chunk size) for the two inputs.
    split_jobs = (
        (self.query_fa_fofn, "query", cfg["q_chunk_size"]),
        (self.target_fa_fofn, "target", cfg["t_chunk_size"]),
    )
    for fofn_obj, prefix, chunk_size in split_jobs:
        split_fofn(fn(fofn_obj), dist_map_dir, prefix, chunk_size,
                   incremental=True, allow_fraction=True)
    # Completion sentinel for downstream tasks.
    os.system("touch %s" % fn(self.split_fofn_done))
def build_rdb(self):
    """Build (or extend) the Dazzler DB of raw reads and generate the
    HPCdaligner job script (run_jobs.sh) for the raw-read overlap stage.

    Supports incremental runs: if raw_reads.db already exists, the current
    block count is read from it so HPCdaligner only emits jobs for the new
    blocks (last_block-$LB).  Submits the generated shell script and waits
    for the rdb_build_done sentinel.
    """
    input_fofn = self.input_fofn
    input_fofn_fn = fn(input_fofn)
    rdb_build_done = self.rdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff"]
    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
    pa_DBsplit_option = config["pa_DBsplit_option"]
    openending = config["openending"]
    script_fn = os.path.join( work_dir, "prepare_db.sh" )

    # Detect a pre-existing DB so we can append instead of rebuilding.
    last_block = 1
    new_db = True
    if os.path.exists( os.path.join(work_dir, "raw_reads.db") ):
        with open(  os.path.join(work_dir, "raw_reads.db") ) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    last_block = int(l[2])
                    new_db = False
                    break

    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("hostname >> db_build.log\n")
        script_file.write("date >> db_build.log\n")
        script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done >> db_build.log \n".format(input_fofn_fn = input_fofn_fn))
        if new_db  == True:
            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
        # LB = current last block; with openending we stop one block early.
        if openending == True:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
        else:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > run_jobs.sh\n" % (pa_HPCdaligner_option, length_cutoff, last_block) )
        script_file.write("touch {rdb_build_done}\n".format(rdb_build_done = fn(rdb_build_done)))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_da,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(rdb_build_done), task=self, job_name=job_name )
def check_r_cns_task(self):
    """Collect per-chunk consensus fasta paths into preads_fofn and mark done.

    Writes the sorted list of input fasta paths to the preads fofn, touches
    the cns_done sentinel, and records a no-op script for the framework.

    Bug fix: the no-op script was written via a bare
    ``open(...).write(...)``, leaking the file handle and relying on GC to
    flush it; it is now written through a context manager.
    """
    with open(fn(self.preads_fofn), "w") as f:
        for fa_fn in sorted(fn(plf) for plf in self.inputs.values()):
            f.write("%s\n" % fa_fn)
    wdir = os.path.dirname(fn(self.cns_done))
    #mkdir(wdir) We SHOULD need this! TODO
    system("touch %s" % fn(self.cns_done))
    script_fn = os.path.join(wdir, 'noop.sh')
    # BUG FIX: close (and flush) the script file deterministically.
    with open(script_fn, 'w') as script_file:
        script_file.write('echo NOOP raw')
    self.generated_script_fn = script_fn
def get_rid_to_phase_all(self):
    """Concatenate all per-contig rid-to-phase files into one output file.

    Inputs are processed in sorted filename order so the output is
    deterministic.

    Bug fixes: input files and the output file were never closed (handle
    leak), and ``list.extend(file.read())`` accumulated the content one
    character at a time before re-joining it.
    """
    rid_to_phase_all_fn = fn(self.rid_to_phase_all)
    inputs_fn = sorted(fn(f) for f in self.inputs.values())
    with open(rid_to_phase_all_fn, "w") as out:
        for fname in inputs_fn:
            # Stream each input straight through; no per-character list.
            with open(fname) as in_f:
                out.write(in_f.read())
def prepare_seed_reads(self):
    """Write reads longer than length_cutoff to the seed fasta, uppercased."""
    min_len = self.config["length_cutoff"]
    reader = pbcore.io.FastaReader(fn(self.normalized_fasta))
    with open(fn(self.seed_fasta), 'w') as seed_out:
        for rec in reader:
            # Keep only reads strictly longer than the cutoff.
            if len(rec.sequence) <= min_len:
                continue
            seed_out.write(">%s\n" % rec.name)
            seed_out.write("%s\n" % rec.sequence.upper())
def get_preassembled_reads(self):
    """Submit the distributed preassembly (error-correction) jobs via SGE.

    Generates one shell script per chunk that runs
    generate_preassemble_reads.py over the m4 mapping data, submits each as
    a qsub job, then submits a synchronous watcher job held on all of them
    that writes the preassembly_done sentinel.
    """
    config = self.config
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_pa = config["sge_option_pa"]
    bestn = config["bestn"]
    tmpdir = config["tmpdir"]
    install_prefix = config["install_prefix"]
    num_chunk = config["preassembly_num_chunk"]
    min_cov = config["min_cov"]
    max_cov = config["max_cov"]
    trim_align = config["trim_align"]
    trim_plr = config["trim_plr"]
    q_nproc = config["q_nproc"]
    #set_up_script = "cp generate_preassemble_reads.py %s/" % directory_for_dist_map
    #os.system(set_up_script)
    # %01d placeholders are the chunk id, repeated for each log redirection;
    # the lone %s in the middle is the generate_preassemble_reads command.
    SGE_script_template = """. %s/bin/activate
cd %s/%s
echo start: `date` > %01d"_job.log"
hostname >> %01d"_job.log"
ls -l m4*.dat >> %01d"_job.log"
%s >> %01d"_job.log"
echo end: `date` >> %01d"_job.log"
"""
    job_name = "preassembly_" + str(uuid.uuid4())
    for j_id in range(0, num_chunk):
        #TODO: use real template lib
        g_plr_str = "generate_preassemble_reads.py %01d %s %s %d %s %d %d %d %d %d %d" % (
            j_id, fn(self.normalized_fasta), fn(self.seed_fasta), bestn,
            tmpdir, num_chunk, min_cov, max_cov, trim_align, trim_plr,
            q_nproc)
        script = SGE_script_template % (install_prefix,
                                        os.getcwd(), directory_for_dist_map,
                                        j_id, j_id, j_id, g_plr_str, j_id,
                                        j_id)
        with open("scripts/preassembly_%02d.sh" % j_id, "w") as f:
            print >> f, script
        os.system("qsub -N {jn} {sge_option_pa} -o {cwd}/sge_log -j y\
 -S /bin/bash scripts/preassembly_{jid:02d}.sh".format(
            jn=job_name + "_%02d" % j_id, cwd=os.getcwd(),
            sge_option_pa=sge_option_pa, jid=j_id))
    # Watcher job: -hold_jid on all chunk jobs; -sync y blocks until it runs.
    with open("scripts/preassembly_done.sh", "w") as f:
        print >> f, "echo done > %s" % fn(self.preassembly_done)
    os.system(
        """qsub -sync y {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y -S /bin/bash scripts/preassembly_done.sh"""
        .format(jn=job_name, cwd=os.getcwd(), sge_option_ck=sge_option_ck))
def prepare_seed_reads(self):
    """Copy reads longer than the configured cutoff into the seed fasta,
    uppercasing the sequence on the way."""
    cutoff = self.config["length_cutoff"]
    fasta_in = pbcore.io.FastaReader(fn(self.normalized_fasta))
    with open(fn(self.seed_fasta), 'w') as out_f:
        for read in fasta_in:
            seq = read.sequence
            if len(seq) > cutoff:
                out_f.write(">%s\n" % read.name)
                out_f.write("%s\n" % seq.upper())
def task_run_quiver(self):
    """Polish one contig with quiver: align reads with pbalign, then call
    consensus with variantCaller, producing cns fasta+fastq.

    Builds a shell script (faidx -> sam-to-bam -> pbalign -> variantCaller),
    submits it via the configured job runner, and waits for the job_done
    sentinel.  The variantCaller step is allowed to fail softly (|| echo).

    Bug fix: the failure message was misspelled "quvier failed"; it now
    reads "quiver failed" so log greps for the tool name work.
    """
    ref_fasta = fn(self.ref_fasta)
    read_sam = fn(self.read_sam)
    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)
    job_uid = self.parameters["job_uid"]
    wd = self.parameters["wd"]
    config = self.parameters["config"]
    ctg_id = self.parameters["ctg_id"]

    smrt_bin = config["smrt_bin"]
    sge_quiver = config["sge_quiver"]
    job_type = config["job_type"]
    samtools = os.path.join( smrt_bin, "samtools")
    pbalign = os.path.join( smrt_bin, "pbalign")
    makePbi = os.path.join( smrt_bin, "makePbi")
    variantCaller = os.path.join( smrt_bin, "variantCaller")

    script_dir = os.path.join( wd )
    script_fn =  os.path.join( script_dir , "cns_%s.sh" % (ctg_id))

    script = []
    script.append( "set -vex" )
    # Leave a ".exit" sentinel on any exit so a failed job is detectable.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
    script.append( "cd %s" % wd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "cd {wd}".format(wd = wd) )
    script.append( "{samtools} faidx {ref_fasta}".format( samtools=samtools, ref_fasta=ref_fasta ) )
    script.append( "{samtools} view -b -S {read_sam} > {ctg_id}.bam".format( samtools=samtools, read_sam = read_sam, ctg_id = ctg_id ) )
    script.append( "{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
 --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
 --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
 {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam".format( pbalign=pbalign , ctg_id = ctg_id, ref_fasta = ref_fasta))
    script.append( "#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam".format(makePbi = makePbi, ref_fasta = ref_fasta, ctg_id = ctg_id) )
    # BUG FIX: message was "quvier failed".
    script.append( "({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
 -o {cns_fasta} -o {cns_fastq}) || echo quiver failed".format( variantCaller = variantCaller,
                                                               ctg_id = ctg_id,
                                                               ref_fasta = ref_fasta,
                                                               cns_fasta=cns_fasta,
                                                               cns_fastq=cns_fastq ))
    script.append( "date" )
    script.append( "touch {job_done}".format(job_done = job_done) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_quiver
    run_script(job_data, job_type = job_type)
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
def dist_map(self):
    """Distribute blasr mapping of the seed reads across SGE jobs.

    Splits the seed fasta into dist_map_num_chunk chunk files (reads are
    assigned by hash of their name), submits one blasr qsub job per chunk,
    and submits a final synchronous "mapping_done" job held on all of them
    that writes the m4_data_done sentinel.
    """
    config = self.config
    dist_map_num_chunk = config["dist_map_num_chunk"]
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    blasr_opt = config["blasr_opt"]
    #set_up_script = "fastasplit %s %s/ -c %d" % (fn(self.seed_fasta), directory_for_dist_map, dist_map_num_chunk)
    #os.system(set_up_script)
    fasta_file = pbcore.io.FastaReader(fn(self.seed_fasta))
    out_files = []
    for i in range(dist_map_num_chunk):
        out_files.append( open( "%s/%s_chunk_%07d" % (directory_for_dist_map, os.path.basename(fn(self.seed_fasta)), i), "w"))
    # Hash-partition reads into the chunk files.
    for s in fasta_file:
        g = hash(s.name) % dist_map_num_chunk
        out_file = out_files[g]
        out_file.write(">%s\n" % s.name)
        out_file.write("%s\n" % s.sequence)
    for i in range(dist_map_num_chunk):
        out_files[i].close()
    fasta_file.file.close()
    # .format fills {install_prefix}/{blasr_opt} now; the %s placeholders
    # are filled per-chunk below.
    align_script_template = """\
. {install_prefix}/bin/activate
cd %s/%s
blasr {blasr_opt} -m 4 -out m4_%s.dat %s %s
""".format(install_prefix = install_prefix, blasr_opt=blasr_opt)
    job_name = "dist_map_"+str(uuid.uuid4())
    i = 0
    for chunk_name in glob.glob("%s/%s_chunk_*" % ( directory_for_dist_map, os.path.basename(fn(self.seed_fasta))) ):
        script = align_script_template % (os.getcwd(), directory_for_dist_map,
                                          os.path.basename(chunk_name),
                                          fn(self.normalized_fasta),
                                          os.path.basename(chunk_name))
        with open("scripts/dist_map_%02d.sh" % i,"w") as f:
            print >>f, script
        os.system("qsub -N {jn} {sge_option_dm} -o {cwd}/sge_log -j y\
 -S /bin/bash scripts/dist_map_{jid:02d}.sh".format(jn=job_name+"_%02d" % i,
                                                    cwd=os.getcwd(),
                                                    sge_option_dm = sge_option_dm,
                                                    jid=i))
        i += 1
    # Watcher job: -hold_jid on all chunk jobs; -sync y blocks until it runs.
    with open("scripts/mapping_done.sh","w") as f:
        print >>f, "echo done > %s" % fn(self.m4_data_done)
    os.system("""qsub -sync y {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y\
 -S /bin/bash scripts/mapping_done.sh""".format(jn=job_name,
                                                cwd=os.getcwd(),
                                                sge_option_ck=sge_option_ck))
def build_rdb(self): #essential the same as build_rdb() but the subtle differences are tricky to consolidate to one function
    """Build (or extend) the raw-read Dazzler DB and generate run_jobs.sh.

    Variant of build_rdb(): loads all fasta files in one `fasta2DB -f`
    call, writes a `set -vex` script with a trap sentinel, and supports
    incremental runs by reading the existing raw_reads.db block count so
    HPCdaligner only emits jobs for the new blocks.
    """
    input_fofn = self.input_fofn
    input_fofn_fn = fn(input_fofn)
    rdb_build_done = self.rdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff"]
    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
    pa_DBsplit_option = config["pa_DBsplit_option"]
    openending = config["openending"]
    script_fn = os.path.join( work_dir, "prepare_rdb.sh" )

    # Detect a pre-existing DB so we can append instead of rebuilding.
    last_block = 1
    new_db = True
    if os.path.exists( os.path.join(work_dir, "raw_reads.db") ):
        with open( os.path.join(work_dir, "raw_reads.db") ) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    last_block = int(l[2])
                    new_db = False
                    break

    with open(script_fn,"w") as script_file:
        script_file.write("set -vex\n")
        # Leave a ".exit" sentinel on any exit so a failed job is detectable.
        script_file.write("trap 'touch {rdb_build_done}.exit' EXIT\n".format(rdb_build_done = fn(rdb_build_done)))
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("hostname\n")
        script_file.write("date\n")
        #script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done\n".format(input_fofn_fn = input_fofn_fn))
        script_file.write("fasta2DB -v raw_reads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
        if new_db  == True:
            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
        # LB = current last block; with openending we stop one block early.
        if openending == True:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
        else:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > run_jobs.sh\n" % (pa_HPCdaligner_option, length_cutoff, last_block))
        script_file.write("touch {rdb_build_done}\n".format(rdb_build_done = fn(rdb_build_done)))

    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_option_da
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(rdb_build_done), task=self, job_name=job_data['job_name'])
def task_run_blasr(self):
    """Align reads to one contig with blasr and produce a sorted, indexed BAM.

    Builds a shell script (blasr -> samtools view/sort -> samtools index),
    submits it via the configured job runner, and waits for the job_done
    sentinel file.
    """
    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    job_uid = self.parameters["job_uid"]
    wd = self.parameters["wd"]
    ctg_id = self.parameters["ctg_id"]

    config = self.parameters["config"]
    smrt_bin = config["smrt_bin"]
    sge_blasr_aln = config["sge_blasr_aln"]
    job_type = config["job_type"]
    blasr = os.path.join(smrt_bin, "blasr")
    samtools = os.path.join(smrt_bin, "samtools")

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, "aln_{ctg_id}.sh".format(ctg_id=ctg_id))

    script = []
    script.append("set -vex")
    # Leave a ".exit" sentinel on any exit so a failed job is detectable.
    script.append(
        "trap 'touch {job_done}.exit' EXIT".format(job_done=job_done))
    script.append("cd %s" % wd)
    script.append("hostname")
    script.append("date")
    script.append("cd {wd}".format(wd=wd))
    script.append(
        "time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
 -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
 -minMatch 12 -nproc 24 -sam -out tmp_aln.sam".format(blasr=blasr,
                                                      read_fasta=read_fasta,
                                                      ref_fasta=ref_fasta))
    # samtools sort's second arg is the output prefix (pre-1.x CLI).
    script.append(
        "{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted".
        format(samtools=samtools, ctg_id=ctg_id))
    script.append("{samtools} index {ctg_id}_sorted.bam".format(
        samtools=samtools, ctg_id=ctg_id))
    script.append("rm tmp_aln.sam")
    script.append("date")
    script.append("touch {job_done}".format(job_done=job_done))

    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_blasr_aln
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
def make_fofn_abs(self):
    """Copy i_fofn to o_fofn, but with relative filenames expanded for CWD.
    """
    src_fn = fn(self.i_fofn)
    dst_fn = fn(self.o_fofn)
    #cwd = self.parameters["cwd"]
    # Refuse to clobber the input with the output.
    assert os.path.abspath(dst_fn) != os.path.abspath(src_fn)
    with open(src_fn) as ifs, open(dst_fn, 'w') as ofs:
        for raw in ifs:
            name = raw.strip()
            if not name:
                continue  # skip blank lines
            ofs.write('%s\n' % os.path.abspath(name))
def make_fofn_abs(self):
    """Copy i_fofn to o_fofn, but with relative filenames expanded for CWD.
    """
    i_fofn_fn = fn(self.i_fofn)
    o_fofn_fn = fn(self.o_fofn)
    #cwd = self.parameters["cwd"]
    # Guard: input and output must be different files.
    assert os.path.abspath(o_fofn_fn) != os.path.abspath(i_fofn_fn)
    # Read everything first, then write the expanded paths.
    with open(i_fofn_fn) as ifs:
        abs_paths = [os.path.abspath(s)
                     for s in (line.strip() for line in ifs) if s]
    with open(o_fofn_fn, 'w') as ofs:
        ofs.writelines('%s\n' % p for p in abs_paths)
def task_phasing(self):
    """Run the phasing step for one contig.

    Builds a shell script that runs fc_phasing.py on the contig's alignment
    BAM and then fc_phasing_readmap.py to map read ids to phases, submits
    it via the configured job runner, and waits for the job_done sentinel.
    The commented-out section is the (not yet enabled) phase-aware overlap
    filtering / haplotig construction pipeline.
    """
    ref_fasta = fn(self.ref_fasta)
    aln_bam = fn(self.aln_bam)

    job_done = fn(self.job_done)

    job_uid = self.parameters["job_uid"]
    wd = self.parameters["wd"]
    ctg_id = self.parameters["ctg_id"]

    config = self.parameters["config"]
    sge_phasing = config["sge_phasing"]
    job_type = config["job_type"]

    script_dir = os.path.join( wd )
    script_fn =  os.path.join( script_dir , "p_%s.sh" % (ctg_id))

    script = []
    script.append( "set -vex" )
    # Leave a ".exit" sentinel on any exit so a failed job is detectable.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
    script.append( "cd %s" % wd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "cd {wd}".format(wd = wd) )
    script.append( "fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ../".format( aln_bam = aln_bam,
                                                                                                                ref_fasta = ref_fasta,
                                                                                                                ctg_id = ctg_id ))
    script.append( "fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads".format(ctg_id = ctg_id) )
    #script.append( "fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn\
    # --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500\
    # --db ../../1-preads_ovl/preads.db --rid_phase_map ./rid_to_phase > preads.p_ovl")
    #TODO: make it configurable
    #script.append( "fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log" )
    #script.append( "fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_phase_path ./ --ctg_id {ctg_id}\
    # --rid_phase_map ./rid_to_phase --fasta ../../1-preads_ovl/preads4falcon.fasta".format(ctg_id = ctg_id))
    #script.append( "fc_dedup_h_tigs.py" )
    script.append( "date" )
    script.append( "touch {job_done}".format(job_done = job_done) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_phasing
    run_script(job_data, job_type = job_type)
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
def run_p_task(self):
    """Run one daligner p-job script through SGE and wait for its done-file.

    Wraps the provided p_file script in a small runner that sets up PATH,
    logs via /usr/bin/time, and touches job_done on success.
    """
    params = self.parameters
    p_script_fn = params["p_file"]
    job_id = params["job_id"]
    cwd = params["cwd"]
    script_fn = os.path.join(cwd, "rp_%05d.sh" % job_id)
    log_path = os.path.join(cwd, "rp_%05d.log" % job_id)
    done_fn = fn(self.job_done)
    lines = [
        "export PATH=~/task2014/dazzler/DALIGNER/:$PATH",
        "cd %s" % cwd,
        # Run under time, capture all output, mark success with the done-file.
        "/usr/bin/time bash %s  >& %s  && touch %s" % (p_script_fn, log_path, done_fn),
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(lines))
    job_name = self.URL.split("/")[-1] + "-" + str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": " -pe smp 2 -q huasm ",
                "script_fn": script_fn}
    run_script(job_data, job_type="SGE")
    wait_for_file(done_fn, task=self, job_name=job_name)
def run_consensus_task(self):
    """Run one consensus (error-correction) chunk: LA4Falcon piped into
    fc_consensus.py, submitted through the configured scheduler.

    Writes two scripts: cp_*.sh (the actual LA4Falcon | fc_consensus
    pipeline) and c_*.sh (the SGE wrapper that runs it under /usr/bin/time
    and touches the per-chunk done-file), then waits for that done-file.
    """
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    sge_option_cns = config["sge_option_cns"]
    install_prefix = config["install_prefix"]
    script_dir = os.path.join( cwd )
    script_fn =  os.path.join( script_dir , "c_%05d.sh" % (job_id))
    log_path = os.path.join( script_dir, "c_%05d.log" % (job_id))
    prefix = self.parameters["prefix"]
    falcon_sense_option = config["falcon_sense_option"]
    length_cutoff = config["length_cutoff"]

    with open( os.path.join(cwd, "cp_%05d.sh" % job_id), "w") as c_script:
        print >> c_script, "source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix)
        print >> c_script, "cd .."
        # Trailing comma: keep the pipe and fc_consensus on one shell line.
        print >> c_script, """LA4Falcon -H%d  -o -f:%s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id),
        print >> c_script, """fc_consensus.py %s > %s""" % (falcon_sense_option, fn(self.out_file))

    script = []
    script.append( "source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix) )
    script.append( "cd %s" % cwd )
    # Run under time, capture all output, mark success with the done-file.
    script.append( ("/usr/bin/time bash cp_%05d.sh " % job_id )  + ( " >& %s " % log_path ) + ( " && touch c_%05d_done" % job_id  ) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": sge_option_cns,
                "script_fn": script_fn }
    run_script(job_data, job_type = "SGE")
    wait_for_file( os.path.join(cwd,"c_%05d_done" % job_id) , task=self, job_name=job_name )
def build_p_rdb_task(self):
    """Normalize preads into 80-column fasta files, load them into the
    "preads" Dazzler DB, and generate the overlap job script.

    Reads each fasta listed in pread_fofn, drops reads shorter than
    length_cutoff_pr or containing non-ACGT bases, renames the survivors
    to "prolog_<serial>/<n>/0_<len>", wraps sequences at 80 columns, loads
    each normalized file with fasta2DB, then runs DBsplit/HPCdaligner and
    touches rdb_build_done.

    Bug fix: the original line-wrapping code referenced the wrap-loop index
    after the loop, which raised NameError for reads shorter than 80 bases
    and emitted a spurious blank line when the length was an exact multiple
    of 80.  It also computed an unused `name` variable (dead code, removed).
    """
    config = self.parameters["config"]
    pread_dir = self.parameters["pread_dir"]
    length_cutoff = config["length_cutoff_pr"]

    fa_serial = 0
    with open(fn(self.pread_fofn)) as fofn:
        fa_fns = [line.strip() for line in fofn if line.strip()]
    for fa_fn in fa_fns:
        c = 0  # per-file read counter, used in the normalized read names
        fa_serial += 1
        with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
            for r in FastaReader(fa_fn):
                seq = r.sequence
                if len(seq) < length_cutoff:
                    continue
                # Skip reads containing any base outside A/C/G/T.
                if any(cc not in ("A", "C", "G", "T") for cc in seq):
                    continue
                p_norm.write(">prolog_%05d/%d/%d_%d\n" % (fa_serial, c, 0, len(seq)))
                # BUG FIX: wrap at 80 columns with a stride loop; no stale
                # index, no blank line for exact multiples of 80.
                for i in range(0, len(seq), 80):
                    p_norm.write(seq[i:i + 80] + "\n")
                c += 1
        os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial))

    os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
    os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
    os.system("cd %s; touch rdb_build_done" % pread_dir)
def run_merge_task(self):
    """Run one LAmerge script through the scheduler and wait for its done-file.

    Wraps the provided merge_script in a runner that activates the install
    environment, logs hostname/date, runs under /usr/bin/time, and touches
    job_done on success.
    """
    params = self.parameters
    merge_script_fn = params["merge_script"]
    job_id = params["job_id"]
    cwd = params["cwd"]
    config = params["config"]
    script_fn = os.path.join(cwd, "rp_%05d.sh" % job_id)
    log_path = os.path.join(cwd, "rp_%05d.log" % job_id)
    done_fn = fn(self.job_done)
    lines = [
        "source {install_prefix}/bin/activate\n".format(install_prefix=config["install_prefix"]),
        "cd %s" % cwd,
        "hostname >> %s" % log_path,
        "date >> %s" % log_path,
        # Run under time, append all output to the log, mark success.
        "/usr/bin/time bash %s  >> %s 2>&1 && touch %s" % (merge_script_fn, log_path, done_fn),
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(lines))
    job_name = self.URL.split("/")[-1] + "-" + str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": config["sge_option_la"],
                "script_fn": script_fn}
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_name)
def run_daligner(self):
    """Run one daligner command via the scheduler and symlink its .las
    outputs into the per-block m_* merge directories.

    Builds a shell script that activates the environment, runs the daligner
    command under /usr/bin/time, touches job_done on success, and then for
    every block links the produced *.las files into ../m_<block>.  Waits
    for the job_done sentinel.
    """
    daligner_cmd = self.parameters["daligner_cmd"]
    job_uid = self.parameters["job_uid"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    db_prefix = self.parameters["db_prefix"]
    nblock = self.parameters["nblock"]

    script_dir = os.path.join( cwd )
    script_fn =  os.path.join( script_dir , "rj_%s.sh" % (job_uid))
    log_path = os.path.join( script_dir, "rj_%s.log" % (job_uid))

    script = []
    script.append( "source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix) )
    script.append( "cd %s" % cwd )
    script.append( "hostname >> %s" % log_path )
    script.append( "date >> %s" % log_path )
    # Run under time, append output to the log, mark success with job_done.
    script.append( "/usr/bin/time "+ daligner_cmd + ( " >> %s 2>&1 " % log_path ) + ( " && touch %s" % fn( self.job_done ) ) )

    # Link each block's .las outputs into the matching merge directory.
    for p_id in xrange( 1, nblock+1 ):
        script.append( """ for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """ % (db_prefix, p_id, db_prefix, p_id) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": sge_option_da,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn( self.job_done ), task=self, job_name=job_name )
def task_run_daligner(self):
    """Generate one daligner job script via support.run_daligner, submit it,
    and wait for its done-file."""
    params = self.parameters
    config = params["config"]
    job_done = fn(self.job_done)
    script_fn = os.path.join(params["cwd"], "rj_%s.sh" % params["job_uid"])
    # Script generation is delegated to the shared support helper.
    support.run_daligner(
        daligner_cmd=params["daligner_cmd"],
        db_prefix=params["db_prefix"],
        nblock=params["nblock"],
        config=config,
        job_done=job_done,
        script_fn=script_fn,
    )
    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_da"]
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])