def get_xml(dirname, analysis_id, logger):
    """ Download the metadata XML for an analysis ID using cgquery """
    print "Downloading XML"
    print "Analysis ID = %s" % analysis_id
    xml_file = "%s.xml" % os.path.join(dirname, analysis_id)
    cmd = ['cgquery', '-o', xml_file, 'analysis_id=%s' % analysis_id]
    pipelineUtil.log_function_time('cgquery', analysis_id, cmd, logger)
    return xml_file

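# NOTE: pipelineUtil.log_function_time is not defined in this module. From its
# call sites here it is assumed to run `cmd` as a subprocess, log the elapsed
# wall-clock time under the given tag and ID, and return the command's exit
# code. A minimal sketch under that assumption (illustrative only, not the
# actual helper):
#
#     import subprocess, time
#     def log_function_time(tag, uuid, cmd, logger=None):
#         start = time.time()
#         exit_code = subprocess.call([str(x) for x in cmd])
#         if logger is not None:
#             logger.info("%s\t%s\telapsed=%.1fs\texit=%d"
#                         % (tag, uuid, time.time() - start, exit_code))
#         return exit_code
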
def decompress(filename, workdir):
    """ Unpack fastq files """
    if filename.endswith(".tar"):
        cmd = ['tar', '-xvf', filename, '-C', workdir]
    elif filename.endswith(".gz"):
        cmd = ['tar', '-xzvf', filename, '-C', workdir]
    else:
        raise Exception('Unknown input file extension for file %s' % filename)
    pipelineUtil.log_function_time("tar", filename, cmd)

def run_pipeline(args, workdir, analysis_id, logger):
    """ align datasets using STAR and compute expression using cufflinks """

    for filename in os.listdir(workdir):
        if filename.endswith(".tar") or filename.endswith(".tar.gz"):
            tar_file_in = os.path.join(workdir, filename)
            break

    star_output_dir = os.path.join(workdir, 'star_2_pass')
    if os.path.isdir(star_output_dir):
        pipelineUtil.remove_dir(star_output_dir)
    os.mkdir(star_output_dir)

    bam = "%s_star.bam" % os.path.join(star_output_dir, analysis_id)
    if not os.path.isfile(bam):
        star_cmd = ['time', '/usr/bin/time', 'python', args.star_pipeline,
                    '--genomeDir', args.genome_dir,
                    '--runThreadN', args.p,
                    '--tarFileIn', tar_file_in,
                    '--workDir', workdir,
                    '--out', bam,
                    '--genomeFastaFile', args.genome_fasta_file,
                    '--sjdbGTFfile', args.gtf]
        if args.quantMode != "":
            star_cmd.append('--quantMode')
            star_cmd.append(args.quantMode)
        pipelineUtil.log_function_time("STAR", analysis_id, star_cmd, logger)

    remote_bam_path = "%s_star.bam" % os.path.join(args.bucket, analysis_id, analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, remote_bam_path, bam)

    cufflinks_cmd = ['time', '/usr/bin/time', 'python', args.cufflinks_pipeline,
                     '--bam', bam,
                     '--gtf', args.gtf,
                     '--analysis_id', analysis_id,
                     '--out', star_output_dir,
                     '--p', args.p,
                     '--multi_read_correct', 'True']
    pipelineUtil.log_function_time("CUFFLINKS", analysis_id, cufflinks_cmd, logger)

    cuffout_genes_local = os.path.join(star_output_dir, "genes.fpkm_tracking")
    cuffout_genes_remote = os.path.join(args.bucket, "cufflinks", "star_gene",
                                        "%s.genes.fpkm_tracking" % analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, cuffout_genes_remote, cuffout_genes_local)

    cuffout_isoforms_local = os.path.join(star_output_dir, "isoforms.fpkm_tracking")
    cuffout_isoforms_remote = os.path.join(args.bucket, "cufflinks", "star_iso",
                                           "%s.isoforms.fpkm_tracking" % analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, cuffout_isoforms_remote, cuffout_isoforms_local)

    pipelineUtil.remove_dir(star_output_dir)

def cufflinks_compute(args, logger=None):
    """ compute rna-seq expression using cufflinks """
    cmd = ['cufflinks']
    if args.multi_read_correct == 'True':
        cmd.append('--multi-read-correct')
    if args.frag_bias_correct == 'True':
        cmd.append('--frag-bias-correct')
    cmd += ['--GTF', args.gtf,
            '--output-dir', args.out,
            '--num-threads', str(args.p),
            args.bam]
    print cmd
    pipelineUtil.log_function_time('cufflinks', args.analysis_id, cmd, logger)

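# For illustration only (paths and values below are hypothetical): with
# multi_read_correct='True' and frag_bias_correct left unset, cufflinks_compute
# above is expected to build an argv along the lines of
#
#     ['cufflinks', '--multi-read-correct',
#      '--GTF', 'genes.gtf', '--output-dir', 'star_2_pass',
#      '--num-threads', '8', 'sample_star.bam']
#
# before handing it to pipelineUtil.log_function_time for execution.
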
def rna_seq_qc(rna_seq_qc_path, bam_file, uuid, outdir, ref_genome, gtf, logger=None):
    """ Perform RNA-SeQC on the post-alignment BAM file """
    if os.path.isfile(bam_file) and os.path.isfile(rna_seq_qc_path) and os.path.isfile(gtf):
        cmd = ['java', '-jar', rna_seq_qc_path,
               '-o', outdir,
               '-r', ref_genome,
               '-s', '%s|%s|%s' % (uuid, bam_file, uuid),
               '-t', gtf]
        exit_code = pipelineUtil.log_function_time('RNAseq_qc', uuid, cmd, logger)
    else:
        raise Exception("Cannot find one of rnaseq-qc %s, bam %s or gtf %s"
                        % (rna_seq_qc_path, bam_file, gtf))
    if exit_code != 0:
        if logger is not None:
            logger.error("Broad's RNA-SeQC returned non-zero exit code %s" % exit_code)
    return exit_code

def bam_index(bam_file, uuid, logger=None):
    """ Index the resultant post-alignment BAM file """
    if os.path.isfile(bam_file):
        cmd = ['samtools', 'index', '-b', bam_file]
        exit_code = pipelineUtil.log_function_time("BamIndex", uuid, cmd, logger)
        if exit_code == 0:
            # samtools index writes the index next to the input BAM
            assert(os.path.isfile('%s.bai' % bam_file))
    else:
        raise Exception("Cannot find bam file %s" % bam_file)
    if exit_code != 0:
        if logger is not None:
            logger.error("Samtools index returned non-zero exit code %s" % exit_code)
    return exit_code

def fastqc(fastqc_path, reads_1, reads_2, rg_id_dir, analysis_id, logger=None):
    """ perform pre-alignment qc checks using fastqc """
    if not os.path.isdir(rg_id_dir):
        raise Exception("Invalid directory: %s" % rg_id_dir)
    fastqc_results = os.path.join(rg_id_dir, "fastqc_results")
    if not os.path.isdir(fastqc_results):
        os.mkdir(fastqc_results)
    if reads_2 != "":
        cmd = [fastqc_path, reads_1, reads_2, '--outdir', fastqc_results, '--extract']
    else:
        cmd = [fastqc_path, reads_1, '--outdir', fastqc_results, '--extract']
    exit_code = pipelineUtil.log_function_time("FastQC", analysis_id, cmd, logger)
    if exit_code != 0:
        if logger is not None:
            logger.error('FastQC returned a non-zero exit code: %s' % exit_code)

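# FastQC with --extract writes, per input file, a <reads>_fastqc.zip archive and
# an unpacked <reads>_fastqc/ directory (fastqc_data.txt, summary.txt, ...) into
# the fastqc_results directory created above; exact file names depend on the
# FastQC version in use.
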
def bam_to_fastq(fastq_dir, bam_file, analysis_id, logger=None):
    """ Convert input BAM to Fastq files """
    tmp_fastq = os.path.join(fastq_dir, 'tmp')
    cmd = ['bamtofastq',
           'filename=%s' % bam_file,
           'outputdir=%s' % fastq_dir,
           'tryoq=1',
           'collate=1',
           'outputperreadgroup=1',
           'T=%s' % tmp_fastq]
    exit_code = pipelineUtil.log_function_time('Biobambam', analysis_id, cmd, logger)
    if exit_code == 0:
        # rename biobambam's .fq outputs to .fastq so downstream steps find them
        for filename in os.listdir(fastq_dir):
            if filename.endswith(".fq"):
                new_filename = filename.replace(".fq", ".fastq")
                os.rename(os.path.join(fastq_dir, filename),
                          os.path.join(fastq_dir, new_filename))
    else:
        if logger is not None:
            logger.error("Biobambam BamToFastq conversion of %s returned a non-zero exit code %s"
                         % (analysis_id, exit_code))

def reorder_bam(picard_path, bam_file, uuid, outdir, ref_genome, logger=None):
    """ Reorder the BAM file according to the reference genome """
    if os.path.isfile(bam_file) and os.path.isfile(picard_path) and os.path.isfile(ref_genome):
        outbam = os.path.join(outdir, '%s.reorder.bam' % uuid)
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, 'ReorderSam',
               'I=%s' % bam_file,
               'O=%s' % outbam,
               'R=%s' % ref_genome,
               'VALIDATION_STRINGENCY=LENIENT',
               'TMP_DIR=%s' % tmp_dir]
        exit_code = pipelineUtil.log_function_time("picard_reorder_sam", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile(outbam))
    else:
        raise Exception("Cannot find one of bam %s, picard path %s or reference genome %s"
                        % (bam_file, picard_path, ref_genome))
    if exit_code != 0:
        if logger is not None:
            logger.error("Picard ReorderSam returned non-zero exit code %s" % exit_code)
    return outbam

def collect_rna_seq_metrics(picard_path, bam_file, uuid, outdir, ref_flat, logger=None):
    """ Collect RNA-seq metrics using Picard """
    if os.path.isfile(picard_path) and os.path.isfile(bam_file):
        tmp_dir = os.path.join(outdir, 'tmp')
        outfile = os.path.join(outdir, "%s.rna_seq_metrics.txt" % uuid)
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, "CollectRnaSeqMetrics",
               "METRIC_ACCUMULATION_LEVEL=READ_GROUP",
               "I=%s" % bam_file,
               "O=%s" % outfile,
               "STRAND=NONE",
               "REF_FLAT=%s" % ref_flat,
               "VALIDATION_STRINGENCY=LENIENT",
               "TMP_DIR=%s" % tmp_dir]
        exit_code = pipelineUtil.log_function_time("RNAseq_metrics", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile(outfile))
    else:
        raise Exception("Invalid path to picard or bam")
    if exit_code != 0:
        if logger is not None:
            logger.error("Picard CollectRnaSeqMetrics returned non-zero exit code %s" % exit_code)
    return exit_code

def validate_bam_file(picard_path, bam_file, uuid, outdir, logger=None):
    """ Validate resulting post-alignment BAM file """
    if os.path.isfile(picard_path) and os.path.isfile(bam_file):
        outfile = os.path.join(outdir, "%s.validate" % uuid)
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, "ValidateSamFile",
               "I=%s" % bam_file,
               "O=%s" % outfile,
               "VALIDATION_STRINGENCY=LENIENT",
               "TMP_DIR=%s" % tmp_dir]
        exit_code = pipelineUtil.log_function_time("ValidateSAM", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile(outfile))
    else:
        raise Exception("Invalid path to picard or BAM")
    if exit_code != 0:
        if logger is not None:
            logger.error("Picard ValidateSamFile returned non-zero exit code %s" % exit_code)
    return exit_code

def add_or_replace_read_group(picard_path, bam_file, outdir, uuid, rg_id,
                              rg_lb="Unknown", rg_pl="Unknown", rg_pu="Unknown",
                              rg_sm="Unknown", logger=None):
    """ Replace the @RG tag in the reads and header """
    outbam = '%s.addRG.bam' % os.path.join(outdir, uuid)
    if os.path.isfile(bam_file) and os.path.isfile(picard_path):
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, 'AddOrReplaceReadGroups',
               'I=%s' % bam_file,
               'O=%s' % outbam,
               'RGID=%s' % rg_id,
               'RGLB=%s' % rg_lb,
               'RGPL=%s' % rg_pl,
               'RGPU=%s' % rg_pu,
               'RGSM=%s' % rg_sm,
               'VALIDATION_STRINGENCY=LENIENT',
               'TMP_DIR=%s' % tmp_dir]
        exit_code = pipelineUtil.log_function_time('AddOrReplaceReadGroups', uuid, cmd, logger)
    else:
        raise Exception("Cannot find bam file %s or path to picard %s" % (bam_file, picard_path))
    if exit_code != 0:
        if logger is not None:
            logger.error("Picard AddOrReplaceReadGroups returned non-zero exit code %s" % exit_code)
    if os.path.isfile(outbam):
        print "returning file now %s" % outbam
        return outbam
    else:
        raise Exception('Could not add or replace read groups. Check log file for errors')

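# A hypothetical end-to-end use of the QC helpers above for a single aligned BAM.
# The ordering and the names picard, in_bam, ref_fa, ref_flat, rnaseqc_jar and log
# are illustrative only and are not taken from the pipeline driver:
#
#     bam = add_or_replace_read_group(picard, in_bam, outdir, uuid, rg_id, logger=log)
#     bam = reorder_bam(picard, bam, uuid, outdir, ref_fa, logger=log)
#     bam_index(bam, uuid, logger=log)
#     validate_bam_file(picard, bam, uuid, outdir, logger=log)
#     collect_rna_seq_metrics(picard, bam, uuid, outdir, ref_flat, logger=log)
#     rna_seq_qc(rnaseqc_jar, bam, uuid, outdir, ref_fa, gtf, logger=log)
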
    log_file = os.path.join(sub_dir, log_file)
    logp = open(log_file, "r")
    for line in logp:
        if "CUFFLINKS_TIME" in line:
            line = line.split()
            f.write("%s\t%s\t%s\n" % (line[4], line[5], metadata["downloadable_file_size"]))
    f.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='label_dataset.py')
    parser.add_argument('--dirname', default='/home/ubuntu/SCRATCH/lung_results')
    parser.add_argument('--disease', help='disease to be labeled')
    args = parser.parse_args()
    #collect_metrics(args.dirname)
    for filename in os.listdir(args.dirname):
        if filename.endswith('fpkm_tracking'):
            analysis_id = filename.split(".")[0]
            #metadata = extract_metadata(args.dirname, analysis_id, None)
            #print 'disease= %s' % metadata['disease']
            cmd = ['mv', '%s' % os.path.join(args.dirname, filename),
                   '%s' % os.path.join(args.dirname, args.disease,
                                       '%s_%s' % (args.disease, analysis_id))]
            """
            print metadata
            if metadata['disease'] != "":
                if not os.path.isdir(os.path.join(args.dirname, metadata['disease'])):
                    os.mkdir(os.path.join(args.dirname, metadata['disease']))
                cmd = ['mv', '%s' % os.path.join(args.dirname, filename),
                       '%s' % os.path.join(args.dirname, metadata["disease"],
                                           '%s_%s' % (metadata['disease'], analysis_id))]
            """
            pipelineUtil.log_function_time('mv', analysis_id, cmd, None)

def run_pipeline(args, workdir, analysis_id, fastq_dir, logger):
    """ align datasets using STAR and compute expression using cufflinks """

    tar_file_in = args.input_file
    qc_dir = os.path.join(workdir, 'qc')
    if not os.path.isdir(qc_dir):
        os.mkdir(qc_dir)

    decompress(tar_file_in, fastq_dir)
    for fname in os.listdir(fastq_dir):
        if fname.endswith("_1.fastq.gz") or fname.endswith("_1.fastq"):
            reads_1 = os.path.join(fastq_dir, fname)
        if fname.endswith("_2.fastq.gz") or fname.endswith("_2.fastq"):
            reads_2 = os.path.join(fastq_dir, fname)
    qc.fastqc(args.fastqc_path, reads_1, reads_2, qc_dir, analysis_id, logger)

    star_output_dir = os.path.join(workdir, 'star_2_pass')
    if os.path.isdir(star_output_dir):
        pipelineUtil.remove_dir(star_output_dir)
    os.mkdir(star_output_dir)

    bam = "%s_star.bam" % os.path.join(star_output_dir, analysis_id)
    if not os.path.isfile(bam):
        star_cmd = ['time', '/usr/bin/time', 'python', args.star_pipeline,
                    '--genomeDir', args.genome_dir,
                    '--runThreadN', args.p,
                    '--tarFileIn', tar_file_in,
                    '--workDir', workdir,
                    '--out', bam,
                    '--genomeFastaFile', args.genome_fasta_file,
                    '--sjdbGTFfile', args.gtf]
        if args.quantMode != "":
            star_cmd.append('--quantMode')
            star_cmd.append(args.quantMode)
        pipelineUtil.log_function_time("STAR", analysis_id, star_cmd, logger)

    exit_code = 1

    # Fix mate information for BAM
    exit_code, fix_mate_out = post_alignment_qc.fix_mate_information(args.picard, bam,
                                                                     analysis_id, workdir, logger)
    if exit_code == 0:
        os.remove(bam)
        assert(not os.path.isfile(bam))
        os.rename(fix_mate_out, bam)
        assert(os.path.isfile(bam))

    # validate the post alignment BAM file
    post_alignment_qc.validate_bam_file(args.picard, bam, analysis_id, qc_dir, logger)

    # collect RNA-seq metrics
    post_alignment_qc.collect_rna_seq_metrics(args.picard, bam, analysis_id, qc_dir,
                                              args.ref_flat, logger)

    # quantify using cufflinks
    cufflinks_cmd = ['time', '/usr/bin/time', 'python', args.cufflinks_pipeline,
                     '--bam', bam,
                     '--gtf', args.gtf,
                     '--analysis_id', analysis_id,
                     '--out', star_output_dir,
                     '--p', args.p,
                     '--multi_read_correct', 'True']
    pipelineUtil.log_function_time("CUFFLINKS", analysis_id, cufflinks_cmd, logger)

"--outSAMstrandField", str(args.outSAMstrandField), "--outSAMunmapped", str(args.outSAMunmapped) ] if args.keepJunctions: cmd = cmd.append("--keepJunctions") cmd = cmd.append(str(args.keepJunctions)) if not args.metaDataTab == None: cmd = cmd.append("--metaDataTab") cmd = cmd.append(str(args.metaDataTab)) if not args.outSAMattrRGline == None: cmd = cmd.append("--outSAMattrRGline") cmd = cmd.append(str(args.outSAMattrRGline)) if not args.outSAMattrRGfile == None: cmd = cmd.append("--outSAMattrRGfile") cmd = cmd.append(str(args.outSAMattrRGfile)) logger.info('Starting Alignment with STAR') exit_code = pipelineUtil.log_function_time("STAR_ALIGN", args.id, cmd, logger) if exit_code == 0: logger.info('Starting post alignment QC') post_aln_qc(args, args.out, logger) else: logger.error('STAR returned a non-zero exit code %s' %exit_code)