def main(args): samples = ["MDX_22_AGTTCC_L003_R1_001", "MDX_23_ATGTCA_L003_R1_001", "MDX_24_CCGTCC_L003_R1_001", "WT_21_AGTCAA_L003_R1_001", "WT_25_GTAGAG_L003_R1_001", "WT_42_GTCCGC_L003_R1_001"] datadir = "/vol1/home/brownj/projects/leinwand/data/20121101" adapters = "%s/adapters.fa" % datadir resultsdir = "/vol1/home/brownj/projects/leinwand/results/common" fastqc_script = "/vol1/home/brownj/opt/fastqc/fastqc" picard = "/vol1/home/brownj/opt/picard-tools-1.79" reference_fasta = "/vol1/home/brownj/ref/mm9/mm9.fa" gmapdb = "/vol1/home/brownj/ref/gmapdb" gsnapcmd = "gsnap -D {} -d mm9 --gunzip \ --batch=5 --nofails --nthreads=4 --format=sam -v snp128_strict_wholeChrs {} \ | samtools view -ShuF 4 - \ | samtools sort -o - {}.temp -m 9500000000 > {}" chrom_sizes = "/vol1/home/brownj/ref/mm9/mm9.sizes" if args.clobber: ngseq.clobber_previous(resultsdir) # ngseq.fastqc(fastqc_script, samples, datadir) bsub.poll(ngseq.trimadapter(datadir, adapters)) # bsub.poll(ngseq.gsnap(samples, datadir, resultsdir, gmapdb, gsnapcmd)) # ngseq.alignment_stats(resultsdir, picard, reference_fasta) ngseq.cleanup(resultsdir) # create genomedata archive in results/common bam_pattern = "/vol1/home/brownj/projects/leinwand/results/common/*/*.bam" output_dir = "/vol1/home/brownj/projects/leinwand/results/common"
def readoutfile(file, jobid): """ parse lsf log (.out, .err) file :param file: lsf log file (.out) :param jobid: lsf job id :return: exitcode of lsf job """ if not os.path.isfile(file): bsub.poll(jobid) else: with open(file) as myfile: lines = myfile.readlines() exitcode = None for line in lines: hits = regexes['exit_code'].search(line) if hits is None: pass elif hits.group(1) is not None: exitcode = 0 elif hits.group(2) is not None: exitcode = int(hits.group(2)) print("Final exit code is ", exitcode) print(type(exitcode)) return exitcode
def launch_lsf(self, command_strings, verbose=False, output='/dev/null'):
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    job_ids = [bsub('phyml_task', o='/dev/null', e='/dev/null', verbose=verbose)(cmd).job_id
               for cmd in command_strings]
    bsub.poll(job_ids)
    os.chdir(curr_dir)
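# The idiom used throughout these snippets comes from the `bsub` package:
# bsub(job_name, **lsf_kwargs) builds a submitter, calling it with a shell
# command submits the job and returns an object carrying .job_id, and
# bsub.poll() blocks until the given job id(s) leave the queue. A minimal
# standalone example (the job name and command are illustrative):
from bsub import bsub

job = bsub("example_task", verbose=True)("echo hello")  # submits via LSF
bsub.poll(job.job_id)                                   # wait for completion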
def main():
    base = "/vol1/home/brownj/projects/davidson"
    data = base + "/data/20120924"
    results = base + "/results/common"
    samples = ["1", "2", "3", "4", "5", "6"]
    joinscript = base + "/bin/join_reads.py"
    seeds = "/vol1/home/brownj/projects/davidson/data/20120924/tr_ab_v.fa"

    # bsub.poll(trim(samples, data))
    bsub.poll(join(samples, data, joinscript))
    assemble(samples, data, results, seeds)
def alignment_stats(results_path, picard_path, ref_fasta):
    for bam in getfilelist(results_path, "*.bam"):
        cmd = "samtools index %s" % bam
        if not op.exists("%s.bai" % bam):
            jobid = bsub("index", verbose=True)(cmd)
            bsub.poll(jobid)
        cmd = ("java -Xmx8g -jar %s/CollectMultipleMetrics.jar "
               "INPUT=%s REFERENCE_SEQUENCE=%s ASSUME_SORTED=true OUTPUT=metrics "
               "PROGRAM=CollectAlignmentSummaryMetrics "
               "PROGRAM=QualityScoreDistribution "
               "PROGRAM=MeanQualityByCycle") % (picard_path, bam, ref_fasta)
        bsub("alignment_summary", verbose=True)(cmd)
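# `getfilelist` is defined elsewhere in these scripts. The call sites imply it
# returns paths matching a glob pattern under a directory; a minimal sketch
# under that assumption (the one-level-deep search is a guess, since several
# callers expect to find files inside per-sample subdirectories):
import glob
import os.path as op

def getfilelist(path, pattern):
    """Return files matching `pattern` directly under `path` or one level down."""
    return sorted(glob.glob(op.join(path, pattern)) +
                  glob.glob(op.join(path, "*", pattern)))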
def assemble(samples, data_dir, results_dir, seed_fa):
    """assemble using SSAKE."""
    # jobs = []
    for sample in samples:
        # was `datadir`, an undefined name; fixed to match the parameter
        fastas = ngseq.getfilelist(data_dir, sample + ".jnd.fa.gz")
        assert(len(fastas) == 1)
        gzipfasta = fastas[0]
        outdir = "%s/%s" % (results_dir, sample)
        fasta = outdir + "/" + op.splitext(op.basename(gzipfasta))[0]
        if not op.exists(fasta):
            bsub.poll(ngseq.extract(gzipfasta, fasta))
        cmd = ("SSAKE -f " + fasta + " -s " + seed_fa + " -m 40 -o 50 -r 0.8 -b " +
               sample + " -p 1 -v 1 -d 200 -e 0.75 -k 10 -a 0.5 -x 50")
        jobid = bsub("3prime_seed_extension", cwd=outdir,
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
def launch_lsf(self, command_strings, verbose=False):
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    job_launcher = bsub('treeCl_gtp_task', verbose=verbose)
    # only silence the log files when not debugging; the original also set
    # o/e to /dev/null at construction, which made this debug check a no-op
    if not self.debug:
        job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'
    job_ids = [job_launcher(cmd).job_id for cmd in command_strings]
    self.job_ids.update(job_ids)
    bsub.poll(job_ids)
    os.chdir(curr_dir)
def novoalign(samples, datadir, resultsdir, index, genome):
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult):
            continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = ("novoalignCS -c 1 -d " + index + " -f " + fastq +
               " -F BFASTQ -o SAM -r Random -e 100 -s 8 -l 20"
               " | samtools view -ShuF4 - | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + alignresult)
        jobid = bsub("novoalign", n="1",
                     R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
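# `extract` is also defined elsewhere. Its usage, bsub.poll(extract(src, dst)),
# implies it submits a decompression job and returns the job id. A minimal
# sketch under that assumption (the job name and zcat command are illustrative):
def extract(gzipped, dest):
    """Decompress `gzipped` to `dest` on the cluster; return the LSF job id."""
    cmd = "zcat {} > {}".format(gzipped, dest)
    return bsub("extract", verbose=True)(cmd).job_id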
def counts(samples, result_path, peak_ext, bam_ext):
    # get the consensus peaks
    f = open("%s/peak_coordinates.bed" % result_path, 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(result_path, "*%s" % peak_ext))
    for c in consensus:
        # fixing formatting from bedtool object; multiinter stores the number
        # of overlapping replicates in the name field as a string, so cast it
        replicate_counts = int(c.name)
        if replicate_counts < 2:
            continue
        fields = [c.chrom, c.start, c.stop,
                  "%s:%d-%d\n" % (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(result_path, sample + "*%s" % bam_ext)
        assert(len(bams) == 1)
        outdir = result_path.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult):
            continue
        cmd = "bedtools coverage -abam %s -b %s > %s" % (bams[0], f.name, countsresult)
        jobid = bsub(sample + "_counts",
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header="chrom start stop name a_overlaps_in_b "
                                      "b_with_nonzero length_b frac_b_nonzero".split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(sys.stdout, sep="\t", header=True)
def _launch_lsf_dynamic_memory(self, command_strings, verbose=False):
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    memory = self.get_memory_requirements()
    job_ids = []
    for i, cmd in enumerate(command_strings):
        memory_reqd = memory[i]
        job_launcher = bsub('treeCl_dynamic_phyml_task',
                            R='rusage[mem={}]'.format(memory_reqd),
                            M=memory_reqd, verbose=verbose)
        if not self.debug:
            job_launcher.kwargs['o'] = '/dev/null'
            job_launcher.kwargs['e'] = '/dev/null'
        job_ids.append(job_launcher(cmd).job_id)
    self.job_ids.update(job_ids)
    bsub.poll(job_ids)
    os.chdir(curr_dir)
def _launch_lsf_fixed_memory(self, command_strings, minmem=4096, verbose=False):
    """Uses bsub package to send phyml jobs to lsf"""
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    job_launcher = bsub('treeCl_static_phyml_task',
                        R='rusage[mem={}]'.format(minmem),
                        M=minmem, verbose=verbose)
    # overwrite kwargs pertaining to output log files
    if not self.debug:
        job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'
    job_ids = [job_launcher(cmd).job_id for cmd in command_strings]
    self.job_ids.update(job_ids)
    bsub.poll(job_ids)
    os.chdir(curr_dir)
def rum(samples, datadir, resultsdir, index):
    """align to index using rum"""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".trim.fastq.gz")
        assert(len(fastqs) == 1)
        outdir = resultsdir + "/" + sample
        alignresult = outdir + "/" + sample + ".bam"
        alternatealignresult = outdir + "/RUM.sam"
        if op.exists(alignresult) or op.exists(alternatealignresult):
            continue
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = ("rum_runner align -v -i " + index + " -o " + outdir +
               " --chunks 5 --dna --nu-limit 2 --variable-length-reads --name " +
               sample + " " + fastq)
        jobid = bsub("rum", n="5",
                     R="select[mem>28] rusage[mem=28] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def main():
    fastqc()  # does this actually work?
    bsub.poll(concat())
    bsub.poll(align())
    bsub.poll(cleanup())
    indexbams()
def main(args):
    samples = ['2Som_chip1_GCCAAT_L006_R1_001', '2Som_chip2_GTCCGC_L006_R1_001',
               '2Som_Input_GTGAAA_L006_R1_001', '31hpt_Chip1_CAGATC_L006_R1_001',
               '31hpt_Chip2_ACAGTG_L006_R1_001', '31hpt_Input_TGACCA_L006_R1_001']
    controls = ['2Som_Input_GTGAAA_L006_R1_001', '31hpt_Input_TGACCA_L006_R1_001']
    datadir = "/vol1/home/brownj/projects/artinger/data/20121101"
    resultsdir = "/vol1/home/brownj/projects/artinger/results/common"
    fastqc_script = "/vol1/home/brownj/opt/fastqc/fastqc"
    picard = "/vol1/home/brownj/opt/picard-tools-1.79"
    rumindex = "/vol1/home/brownj/ref/rum/zebrafish"
    reference_fasta = "/vol1/home/brownj/ref/zebrafish/Danio_rerio.Zv9.68.fa"
    gmapdb = "/vol1/home/brownj/ref/gmapdb"
    # the %s is filled with the RUM index now; the {} placeholders are left
    # for a later .format() call
    rumcmd = ("rum_runner align -v -i %s -o {} --chunks 5 --dna --nu-limit 2 "
              "--variable-length-reads --name {} {}") % rumindex
    macscmd = "macs14 -t {} -f BAM -n {} -g 1400000000 -w --single-profile --call-subpeaks"
    gsnapcmd = ("gsnap -D {} -d zebrafish --gunzip --npaths=1 --quiet-if-excessive "
                "--batch=5 --nofails --nthreads=4 --format=sam {} "
                "| samtools view -ShuF 4 - "
                "| samtools sort -o - {}.temp -m 9500000000 > {}")

    if args.clobber:
        clobber_previous(resultsdir)

    fastqc(fastqc_script, samples, datadir)
    bsub.poll(trim(datadir, "*R1_001.fastq.gz"))
    bsub.poll(gsnap(samples, datadir, resultsdir, gmapdb, gsnapcmd))
    # alignment_stats(resultsdir, picard, reference_fasta)
    bsub.poll(macs(samples, resultsdir, controls, macscmd))
    cleanup(resultsdir)
def counts(samples, resultsdir):
    """build a count matrix over consensus peaks"""
    # get the consensus peaks
    f = open(resultsdir + "/peak_coordinates.bed", 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(resultsdir, "*peaks.bed.gz"))
    for c in consensus:
        # multiinter stores the number of overlapping replicates in the name
        # field as a string, so cast it before comparing
        replicate_counts = int(c.name)
        if replicate_counts < 2:
            continue
        fields = [c.chrom, c.start, c.stop, "%s:%d-%d\n" % (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + "*.hg19_novoalign.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult):
            continue
        cmd = "bedtools coverage -abam " + bams[0] + " -b " + f.name + " > " + countsresult
        jobid = bsub(sample + "_counts",
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        # was split on ".hg19_novoalign.bam", which never matches a .counts
        # filename; split on the actual extension instead
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header="chrom start stop name a_overlaps_in_b "
                                      "b_with_nonzero length_b frac_b_nonzero".split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(resultsdir + "/sample_counts.csv", sep=",", header=True)
def main():
    samples = ['RS_input_CCGTCC_L005_R1_001', 'RS_iso_ATGTCA_L005_R1_001',
               'RS_tbet_CTTGTA_L005_R1_001']
    control = 'RS_input_CCGTCC_L005_R1_001'
    datadir = "/vol1/home/brownj/projects/marrack/data/20121101"
    resultsdir = "/vol1/home/brownj/projects/marrack/results/common"
    rumindex = "/vol1/home/brownj/ref/rum/mm9"

    fastqc(samples, datadir, resultsdir)
    bsub.poll(trim(datadir, "*R1_001.fastq.gz"))
    bsub.poll(rum(samples, datadir, resultsdir, rumindex))
    bsub.poll(postprocessrum(resultsdir))
    bsub.poll(macs(samples, resultsdir, control))
    cleanup(resultsdir)
def bowtiealign(samples, index, genome):
    """align to index using bowtie"""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(DATA, sample + "_*.trm.fq.gz")  # single end
        assert(len(fastqs) == 1)
        outdir = RESULTS + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult):
            continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = ("bowtie -p4 --best --sam -q " + index + " " + fastq +
               " | samtools view -ShuF4 - | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + alignresult)
        jobid = bsub(PI + ".bowtie", n="4",
                     R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def main():
    hairpinindex = "/vol1/home/brownj/ref/mirbase/19/hairpin19"
    matureindex = "/vol1/home/brownj/ref/mirbase/19/mature19"
    tuberculosisindex = "/vol1/home/brownj/ref/tuberculosis/H37Rv"

    fastqc(SAMPLES)
    bsub.poll(trimadapters(DATA))
    # Bowtie
    bsub.poll(bowtiealign(SAMPLES, matureindex, "mature"))
    bsub.poll(bowtiealign(SAMPLES, tuberculosisindex, "H37Rv"))
    removefastqs()
def main(): """ Main call to the data_provider scripts. :return: None """ error_list = list() get_args() prop = properties(properties_file) lsf = prop.lsf #print(prop) conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost, prop.dbname, prop.dbport) data_provider_list = get_list(conn) print(data_provider_list) process_jobids = {} for data_provider_stage in data_provider_list: print(data_provider_stage.process_id, data_provider_stage.selection_id, data_provider_stage.stage_list) if not data_provider_stage.check_started(conn): print("\nTo be started job: process_id:{}\ collection id: {} dataprovider id: {} ".format( data_provider_stage.process_id, data_provider_stage.selection_id, data_provider_stage.stage_list)) data_provider_stage.set_started(conn) process_dir = prop.workdir + data_provider_stage.process_id print("Creating process directory:{}".format(process_dir)) create_processing_dir(process_dir) account_name = get_datahub_names(conn, data_provider_stage.process_id) print("account to be processed:{}".format(account_name)) files = get_file_names(conn, data_provider_stage.process_id) print("Files to be downloaded:{}".format(files)) pass_word = get_datahub_account_password(conn, account_name) process_id = data_provider_stage.process_id jobids = download_datahub_file(account_name, pass_word, files, process_dir, process_id, lsf, dryrun=False) """ We should be able to capture the .err and .out lsf output into the database. Maybe define a a generic lsf_stat class, that will match in .out the "Successfully completed" string if true set length of error_list to 0 other wise logs the full path to the .out file in database """ if not lsf: #if len(error_list) != 0: if len(error_list): final_errors = '\n'.join( str(v).replace("'", "") for v in error_list) data_provider_stage.set_error(conn, final_errors) else: data_provider_stage.set_finished(conn) elif lsf: err = [ os.getcwd() + '/data_provider_' + process_id + '.' + y for y in [x + '.err' for x in jobids] ] out = [ os.getcwd() + '/data_provider_' + process_id + '.' + y for y in [x + '.out' for x in jobids] ] final_errors = '\n'.join(str(v).replace("'", "") for v in out) print(final_errors) process_jobids[process_id] = out error_list = list() if lsf: print(process_jobids) """ We should check for the content of lsmyfile.out file and store the full path of the error and out file in DB """ if lsf: for data_provider_stage in data_provider_list: process_id = data_provider_stage.process_id for lsf_out in process_jobids[process_id]: print('*' * 100) print(lsf_out) print('*' * 100) jobid = lsf_out.split('.')[-2] bsub.poll(jobid) if os.path.isfile(lsf_out): print( "Processing lsmyfile.out for: jobid {}".format(jobid)) print("Processing: {}".format(lsf_out)) print('*' * 100) localexitcode = readoutfile(lsf_out, jobid) print(localexitcode) if localexitcode != 0: final_errors = lsf_out + ' with exit code ' + str( localexitcode) data_provider_stage.set_error(conn, final_errors) else: data_provider_stage.set_finished(conn) print('*' * 100) else: print("Awaiting completion of: jobid {}".format(jobid)) print("Processing: {}".format(lsf_out)) print('*' * 100) #bsub.poll(jobid) if os.path.isfile(lsf_out): localexitcode = readoutfile(lsf_out, jobid) print(localexitcode) if localexitcode != 0: final_errors = lsf_out + ' with exit code ' + str( localexitcode) data_provider_stage.set_error(conn, final_errors) else: data_provider_stage.set_finished(conn) else: bsub.poll(jobid) conn.close()