def bwa_aln_paired(bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, logger):
    pe_realn_dir = os.path.join(realn_dir, 'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam = os.path.basename(fastqbasename + '.bam')
    outbam_path = os.path.join(pe_realn_dir, outbam)  # was `se_realn_dir`, which is undefined in this function
    if pipe_util.already_step(pe_realn_dir, 'pe_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1_path = os.path.join(fastq_dir, read1)
        f2_path = os.path.join(fastq_dir, read2)
        sai1 = fastqbasename + '_1.sai'
        sai2 = fastqbasename + '_2.sai'
        sai1_path = os.path.join(pe_realn_dir, sai1)
        sai2_path = os.path.join(pe_realn_dir, sai2)
        bwa_aln_cmd1 = ['bwa', 'aln', '-t', '24', reference_fasta_path, f1_path]
        bwa_aln_cmd2 = ['bwa', 'aln', '-t', '24', reference_fasta_path, f2_path]
        with open(sai1_path, 'wb') as sai1_open:
            pipe_util.do_command(bwa_aln_cmd1, logger, stdout=sai1_open, stderr=subprocess.PIPE)
        with open(sai2_path, 'wb') as sai2_open:
            pipe_util.do_command(bwa_aln_cmd2, logger, stdout=sai2_open, stderr=subprocess.PIPE)
        bwa_aln_sampe_cmd = ['bwa', 'sampe', '-a', '500', reference_fasta_path, sai1_path, sai2_path, f1_path, f2_path]
        samtools_cmd = ['samtools', 'view', '-Shb', '-o', outbam_path, '-']
        cmdlist = list()
        cmdlist.append(bwa_aln_sampe_cmd)
        cmdlist.append(samtools_cmd)
        pipe_util.do_piped_commands(cmdlist, logger)
        pipe_util.create_already_step(pe_realn_dir, 'pe_' + fastqbasename, logger)
    return outbam_path
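# `pipe_util.do_piped_commands` is not shown in this section. A minimal sketch of
# what such a helper presumably does -- chaining each command's stdout into the
# next command's stdin, as in `bwa sampe ... | samtools view ...` above -- follows.
# The function name and logging behavior here are assumptions, not this repo's
# actual implementation.
import subprocess

def do_piped_commands_sketch(cmdlist, logger):
    """Run cmdlist[0] | cmdlist[1] | ... and return the final stdout."""
    procs = []
    prev_stdout = None
    for cmd in cmdlist:
        logger.info('piping: %s' % ' '.join(cmd))
        proc = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE propagate to the upstream process
        prev_stdout = proc.stdout
        procs.append(proc)
    last_output, _ = procs[-1].communicate()
    for proc in procs:
        proc.wait()
    return last_output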
def bam_to_fastq(uuid, bam_path, engine, logger):
    uuid_dir = os.path.dirname(bam_path)
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = ['bamtofastq', 'filename=' + bam_path, 'outputdir=' + fastq_dir,
               'tryoq=1', 'collate=1', 'outputperreadgroup=1', 'T=' + tempfq]
        pipe_util.do_command(cmd, logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
    return
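# `pipe_util.already_step` / `create_already_step` are used throughout this module
# as idempotency checkpoints but are not defined here. A minimal sketch of the
# likely flag-file pattern, under the assumption that a completed step is marked
# by touching an empty marker file in the step directory (names illustrative):
import os

def already_step_sketch(step_dir, step_name, logger):
    flag_path = os.path.join(step_dir, 'already_' + step_name)
    logger.info('checking for completion flag: %s' % flag_path)
    return os.path.exists(flag_path)

def create_already_step_sketch(step_dir, step_name, logger):
    flag_path = os.path.join(step_dir, 'already_' + step_name)
    logger.info('writing completion flag: %s' % flag_path)
    open(flag_path, 'w').close()  # touch an empty marker file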
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if pipe_util.already_step(sd_dir, ref_name + "_dict", logger):
        logger.info("already completed step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    else:
        logger.info("running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java", "-d64", "-Xmx16G", "-jar", picard_path,
            "CreateSequenceDictionary",
            "R=" + reference_fasta_name,
            "O=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["sequence_dictionary"] = sd_file_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "sequence_dictionary": sd_file_path}
        table_name = "time_mem_picard_CreateSequenceDictionary"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        pipe_util.create_already_step(sd_dir, ref_name + "_dict", logger)
    return sd_file_path
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = [
            'bamtofastq',
            'S=' + uuid + '.fq',
            'filename=' + bam_path,
            'outputdir=' + fastq_dir,
            'tryoq=1',
            'collate=1',
            'outputperreadgroup=1',
            'T=' + tempfq,
            'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY'
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bamtofastq'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
        logger.info('completed running step `bamtofastq` of: %s' % bam_path)
    return
def bam_mark_duplicates(uuid, bam_path, thread_count, engine, logger):
    merge_dir = os.path.dirname(bam_path)
    merge_parent_dir = os.path.dirname(merge_dir)
    md_dir = os.path.join(merge_parent_dir, 'md')
    os.makedirs(md_dir, exist_ok=True)
    logger.info('md_dir=%s' % md_dir)
    step_dir = md_dir
    outbam = os.path.basename(bam_path)
    outbam_path = os.path.join(md_dir, outbam)
    logger.info('outbam_path=%s' % outbam_path)
    if pipe_util.already_step(step_dir, 'markduplicates', logger):
        logger.info('already completed step `markduplicates` of: %s' % bam_path)
    else:
        logger.info('running step `markduplicates` of: %s' % bam_path)
        tmpfile = os.path.join(md_dir, 'tmpfile_md')
        cmd = ['bammarkduplicates2', 'markthreads=' + thread_count, 'rmdup=0', 'md5=1', 'index=1',
               'level=-1', 'tmpfile=' + tmpfile, 'I=' + bam_path, 'O=' + outbam_path]
        output = pipe_util.do_command(cmd, logger)
        # store time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'thread_count': thread_count}
        table_name = 'time_mem_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'markduplicates', logger)
        logger.info('completed running step `markduplicates` of: %s' % bam_path)
    return outbam_path
def get_file_md5(uuid, file_path, engine, logger):
    file_dir = os.path.dirname(file_path)
    file_name = os.path.basename(file_path)
    file_shortname, file_ext = os.path.splitext(file_name)
    file_md5_name = file_name + '.md5'
    file_md5_path = os.path.join(file_dir, file_md5_name)
    if pipe_util.already_step(file_dir, file_name + '_md5sum', logger):
        logger.info('already completed step `md5sum` of: %s' % file_path)
        with open(file_md5_path, 'r') as file_md5_path_open:
            file_md5 = file_md5_path_open.readline().strip()
        return file_md5
    else:
        cmd = ['md5sum', file_path]
        output = pipe_util.do_command(cmd, logger)
        file_md5 = output.split()[0].decode()
        with open(file_md5_path, 'w') as file_md5_path_open:
            file_md5_path_open.write(file_md5)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['file_path'] = file_path
        logger.info('df=%s' % df)
        unique_key_dict = {'uuid': uuid, 'file_path': file_path}
        table_name = 'time_mem_md5'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(file_dir, file_name + '_md5sum', logger)
        return file_md5
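# Shelling out to `md5sum` ties this step to GNU coreutils. A portable alternative
# (a sketch, not what this pipeline uses) computes the digest in-process with
# hashlib, reading in chunks so large BAMs never have to fit in memory:
import hashlib

def file_md5_sketch(file_path, chunk_size=2 ** 20):
    md5 = hashlib.md5()
    with open(file_path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()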
def run_hc(uuid, bam_path, reference_fasta_path, scratch_dir, engine, thread_count, logger):
    vcf_dir = os.path.join(scratch_dir, uuid, 'hc')
    os.makedirs(vcf_dir, exist_ok=True)
    logger.info('hc vcf_dir=%s' % vcf_dir)
    bamname = os.path.basename(bam_path)
    bambase, bamext = os.path.splitext(bamname)
    outvcf = bambase + '.vcf'
    vcf_path = os.path.join(vcf_dir, outvcf)
    logger.info('vcf_path=%s' % vcf_path)
    home_dir = os.path.expanduser('~')
    if pipe_util.already_step(vcf_dir, 'hc_' + bambase, logger):
        logger.info('already completed step `HaplotypeCaller` of: %s' % bam_path)
    else:
        # do work
        gatk_path = os.path.join(home_dir, 'bin', 'GenomeAnalysisTK.jar')
        tmp_dir = os.path.join(scratch_dir, 'tmp')
        shellcmd = ('java -d64 -Djava.io.tmpdir=' + tmp_dir + ' -jar ' + gatk_path +
                    ' --analysis_type HaplotypeCaller --generate_md5 -nct ' + thread_count +
                    ' --output_mode EMIT_VARIANTS_ONLY --input_file ' + bam_path +
                    ' --reference_sequence ' + reference_fasta_path +
                    ' --out ' + vcf_path)  # + ' -L "1:500000-900000"'
        logger.info('shellcmd=%s' % shellcmd)
        cmd = shlex.split(shellcmd)
        logger.info('cmd=%s' % cmd)
        output = pipe_util.do_command(cmd, logger)
        # store timing/mem results in db; uuid+vcf_path form the unique key
        df = time_util.store_time(uuid, cmd, output, logger)
        df['vcf_path'] = vcf_path
        logger.info('df=%s' % df)
        table_name = 'time_mem_gatk_hc'  # variable; consider making it a parameter
        unique_key_dict = {'uuid': uuid, 'vcf_path': vcf_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # done flag
        pipe_util.create_already_step(vcf_dir, 'hc_' + bambase, logger)
    return
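# Building the GATK invocation as one shell string and then splitting it with
# `shlex.split` yields the same argv list that the other steps construct by hand;
# shlex also handles quoting, which matters if the commented-out interval flag
# (-L "1:500000-900000") is ever re-enabled. A small self-contained illustration:
import shlex

argv = shlex.split('java -jar gatk.jar --analysis_type HaplotypeCaller -L "1:500000-900000"')
# argv == ['java', '-jar', 'gatk.jar', '--analysis_type', 'HaplotypeCaller', '-L', '1:500000-900000']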
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, 'validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1  # 2147483647 (signed 32-bit INT_MAX): effectively unlimited error output
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, 'validate', logger)
    if pipe_util.already_step(step_dir, 'validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, 'validate_db', logger)
        logger.info('completed storing `picard validate` to db')
def sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger):
    sump_dir = os.path.dirname(muse_call_output_path)
    input_name = os.path.basename(muse_call_output_path)
    input_base, input_ext = os.path.splitext(input_name)
    sample_base, sample_ext = os.path.splitext(input_base)
    logger.info('MuSE_sump_dir=%s' % sump_dir)
    step_dir = sump_dir
    muse_sump_output = input_base + '.vcf'
    muse_sump_output_path = os.path.join(sump_dir, muse_sump_output)
    logger.info('muse_sump_output_path=%s' % muse_sump_output_path)
    if pipe_util.already_step(step_dir, sample_base + '_MuSE_sump', logger):
        logger.info('already completed step `MuSE sump` of: %s' % input_name)
    else:
        logger.info('running step `MuSE sump` of the tumor bam: %s' % input_name)
        home_dir = os.path.expanduser('~')
        muse_path = os.path.join(home_dir, 'tools', 'MuSEv1.0rc_submission_c039ffa')
        cmd = [muse_path, 'sump', '-I', muse_call_output_path, '-E', '-O', muse_sump_output_path,
               '-D', dbsnp_known_snp_sites]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['muse_call_output'] = muse_call_output_path
        df['muse_sump_output'] = muse_sump_output_path
        unique_key_dict = {'uuid': uuid, 'muse_call_output': muse_call_output_path,
                           'muse_sump_output': muse_sump_output_path}
        table_name = 'time_mem_MuSE_sump_wxs'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, sample_base + '_MuSE_sump', logger)
        logger.info('completed running `MuSE sump` of the tumor bam: %s' % input_name)
    return muse_sump_output_path
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, bam_name + '_validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1  # 2147483647 (signed 32-bit INT_MAX): effectively unlimited error output
        cmd = ['java', '-d64', '-Xmx16G', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['validate_file'] = validate_file
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'validate_file': validate_file}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate', logger)
        logger.info('completed running step `picard validate` of: %s' % bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate_db', logger)
        logger.info('completed storing `picard validate` to db')
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path, engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base, logger):
            logger.info('already completed step `picard sort` of: %s' % bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
                   'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam, 'OUTPUT=' + outbam_path,
                   'TMP_DIR=' + outdir_path, 'CREATE_INDEX=true', 'REFERENCE_SEQUENCE=' + reference_fasta_path]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path, 'reference_fasta_path': reference_fasta_path}
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir_path, 'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' % bam_name)
    return out_bam_path_list
def get_s3_md5(s3_bucket, analysis_id, s3_object, logger):
    # `analysis_id` was referenced but never defined in the original; it is assumed
    # to be the per-upload directory component and is taken as a parameter here
    s3_path = os.path.join('s3://', s3_bucket, analysis_id, s3_object)
    cmd = ['s3cmd', 'info', s3_path]
    output = pipe_util.do_command(cmd, logger)
    for line in output.decode().splitlines():
        if 'MD5' in line:
            s3_md5 = line.split(':')[1].strip()
            return s3_md5
    return None
def pull_cgquery_xml_to_file(uuid, outputxml, logger):
    file_dir = os.path.dirname(outputxml)
    if pipe_util.already_step(file_dir, 'cgquery_xml', logger):
        logger.info('already completed step `cgquery` of: %s' % uuid)
        return
    else:
        logger.info('running command `cgquery` of: %s' % uuid)
        cmd = ['cgquery', '-a', 'analysis_id=' + uuid, '-o', outputxml]
        output = pipe_util.do_command(cmd, logger)
        pipe_util.create_already_step(file_dir, 'cgquery_xml', logger)
    return
def do_picard_collectwgsmetrics(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    home_dir = os.path.expanduser('~')
    picard_dir = os.path.join(home_dir, 'tools', 'picard-tools')
    stats_outfile = 'picard_collectwgsmetrics_' + bam_base + '.txt'
    stats_path = os.path.join(step_dir, stats_outfile)
    if pipe_util.already_step(step_dir, 'picard_collectwgsmetrics', logger):
        logger.info('already completed step `picard_collectwgsmetrics` of: %s' % bam_path)
    else:
        logger.info('running step `picard_collectwgsmetrics` of: %s' % bam_path)
        cmd = ['java', '-d64', '-jar', os.path.join(picard_dir, 'picard.jar'), 'CollectWgsMetrics',
               'INPUT=' + bam_path, 'OUTPUT=' + stats_path, 'REFERENCE_SEQUENCE=' + reference_fasta_path,
               'INCLUDE_BQ_HISTOGRAM=true', 'VALIDATION_STRINGENCY=LENIENT']
        picard_cwgsm_output = pipe_util.do_command(cmd, logger)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, picard_cwgsm_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_cwgsm'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_collectwgsmetrics', logger)
        logger.info('completed running step `picard_collectwgsmetrics` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'picard_collectwgsmetrics_db', logger):
        logger.info('already stored `picard collectwgsmetrics` of %s to db' % bam_path)
    else:
        data_dict = picard_wgs_to_dict(uuid, bam_path, stats_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'picard_collectwgsmetrics'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_collectwgsmetrics_db', logger)
        logger.info('completed storing `picard collectwgsmetrics` to db')
    return
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        home_dir = os.path.expanduser('~')  # expand the user's home dir rather than hardcoding /home/ubuntu
        cmd = [os.path.join(home_dir, 'tools/FastQC/fastqc'), '--extract', fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base, logger)
    return
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger, be_lenient):
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        #tmpfile = os.path.join(merge_dir, 'tmpfile')
        home_dir = os.path.expanduser('~')
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true', 'SORT_ORDER=coordinate',
               'OUTPUT=' + outbam_path, 'TMP_DIR=' + out_dir]
        for input_bam in bam_path_list:
            cmd.append('INPUT=' + input_bam)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        unique_key_dict = {'uuid': uuid, 'bam_name': outbam_name}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' % outbam_path)
    return outbam_path
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + ".bai"
    if pipe_util.already_step(out_dir, bam_name + "_index", logger):
        logger.info("already completed step `samtools index` of %s" % bam_path)
    else:
        logger.info("running step `samtools index` of %s" % bam_path)
        cmd = ["samtools", "index", bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_samtools_index"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, bam_name + "_index", logger)  # was missing: without it the step reruns every time
        logger.info("completed running `samtools index` of %s" % bam_path)
    return bai_path
def do_samtools_stats(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    stats_outfile = 'stats_' + bam_base + '.txt'
    stats_path = os.path.join(step_dir, stats_outfile)
    if pipe_util.already_step(step_dir, 'samtools_stats', logger):
        logger.info('already completed step `samtools stats` of: %s' % bam_path)
    else:
        logger.info('running step `samtools stats` of: %s' % bam_path)
        cmd = ['samtools', 'stats', bam_path]
        stats_output = pipe_util.do_command(cmd, logger)
        with open(stats_path, 'w') as stats_path_open:
            stats_path_open.write(stats_output.decode())  # was a char-by-char loop via a no-op str.format()
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, stats_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_stats'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_stats', logger)
        logger.info('completed running step `samtools stats` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'samtools_stats_db', logger):
        logger.info('already stored `samtools stats` of %s to db' % bam_path)
    else:
        data_dict = samtools_stats_to_dict(uuid, bam_path, stats_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_stats'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_stats_db', logger)
        logger.info('completed storing `samtools stats` to db')
    return
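# `samtools_stats_to_dict` is referenced above but not defined in this section. A
# minimal sketch of such a parser, based on the documented `samtools stats` output
# where summary-number lines look like "SN\t<key>:\t<value>"; the function name and
# return shape here are assumptions:
def samtools_stats_to_dict_sketch(stats_path):
    data = {}
    with open(stats_path) as stats_open:
        for line in stats_open:
            if line.startswith('SN\t'):
                _, key, value = line.rstrip('\n').split('\t')[:3]
                data[key.rstrip(':')] = value
    return data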
def samtools_faidx(uuid, reference_fasta_name, engine, logger):
    ref_file = os.path.basename(reference_fasta_name)
    fai_path = reference_fasta_name + ".fai"
    out_dir = os.path.dirname(reference_fasta_name)
    if pipe_util.already_step(out_dir, ref_file + "_faidx", logger):
        logger.info("already completed step `samtools faidx` of %s" % reference_fasta_name)
    else:
        logger.info("running step `samtools faidx` of %s" % reference_fasta_name)
        cmd = ["samtools", "faidx", reference_fasta_name]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["fai_path"] = fai_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "fai_path": fai_path}
        table_name = "time_mem_samtools_faidx"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, ref_file + "_faidx", logger)
        logger.info("completed running `samtools faidx` of %s" % reference_fasta_name)
    return fai_path
def picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if os.path.isfile(sd_file_path):
        logger.info("reference_dict_path=%s" % sd_file_path)
    else:
        sd_file_path = picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger)
        logger.info("reference_dict_path=%s" % sd_file_path)
    srt_dir = os.path.dirname(muse_vcf)
    vcf_name = os.path.basename(muse_vcf)
    vcf_base, vcf_ext = os.path.splitext(vcf_name)
    srt_vcf = vcf_base + ".srt" + vcf_ext
    srt_vcf_path = os.path.join(srt_dir, srt_vcf)
    if pipe_util.already_step(srt_dir, vcf_name + "_sorted", logger):
        logger.info("already completed step `Picard SortVcf` of %s" % muse_vcf)
    else:
        logger.info("running step `Picard SortVcf` of %s" % muse_vcf)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java", "-d64", "-Xmx16G", "-jar", picard_path,
            "SortVcf",
            "I=" + muse_vcf,
            "O=" + srt_vcf_path,
            "SD=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["MuSE_VCF"] = muse_vcf
        df["MuSE_sorted_VCF"] = srt_vcf_path
        unique_key_dict = {"uuid": uuid, "MuSE_VCF": muse_vcf, "MuSE_sorted_VCF": srt_vcf_path}
        table_name = "time_mem_picard_SortVcf"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard SortVcf` of %s" % muse_vcf)
        pipe_util.create_already_step(srt_dir, vcf_name + "_sorted", logger)
    return srt_vcf_path
def get_s3_objects(uuid, bucket, name, destination, s3cfg_dir, engine, logger):
    if pipe_util.already_have(destination, name, logger):
        logger.info('already have object(s) %s in %s' % (name, destination))
    else:
        logger.info('downloading object(s) %s to %s' % (name, destination))
        base_name = os.path.splitext(name)[0]
        s3_path = os.path.join('s3://', bucket, base_name)
        home_dir = os.path.expanduser('~')
        s3cmd_path = os.path.join(home_dir, '.local', 'bin', 's3cmd')
        cmd = [s3cmd_path, '-c', os.path.join(s3cfg_dir, '.s3cfg'), 'sync', s3_path, destination]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bucket'] = bucket
        df['name'] = name
        table_name = 'time_mem_s3_sync'
        unique_key_dict = {'bucket': bucket, 'name': name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_have(destination, name, logger)
        logger.info('finished downloading object(s) %s to %s' % (name, destination))
    return
def bam_sort(uuid, bam_path, bam_path_list, reference_fasta_path, thread_count, engine, logger):
    uuid_dir = os.path.dirname(bam_path)
    realn_dir = os.path.join(uuid_dir, 'realn')
    out_bam_path_list = list()
    input_thread_count = str(int(int(thread_count) / 2))  # split threads evenly between reading and writing
    output_thread_count = input_thread_count
    logger.info('bamsort input_thread_count=%s' % input_thread_count)
    logger.info('bamsort output_thread_count=%s' % output_thread_count)
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'sort_' + bam_base, logger):
            logger.info('already completed step `sort` of: %s' % bam_name)
        else:
            logger.info('running step `sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            cmd = ['bamsort', 'I=' + input_bam, 'O=' + outbam_path,
                   'inputthreads=' + input_thread_count, 'outputthreads=' + output_thread_count,
                   'calmdnm=1', 'calmdnmreference=' + reference_fasta_path, 'calmdnmrecompindetonly=1',
                   'tmpfile=' + tmpfile, 'index=1']
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = bam_path
            df['reference_fasta_path'] = reference_fasta_path
            df['thread_count'] = thread_count
            unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path,
                               'thread_count': thread_count}
            table_name = 'time_mem_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir_path, 'sort_' + bam_base, logger)
            logger.info('completed running step `sort` of: %s' % bam_name)
    return out_bam_path_list
def tabix_index(uuid, dbsnp_known_snp_sites, engine, logger):
    dbsnp_file = os.path.basename(dbsnp_known_snp_sites)
    dbsnp_tbi_path = dbsnp_known_snp_sites + ".tbi"
    out_dir = os.path.dirname(dbsnp_known_snp_sites)
    if pipe_util.already_step(out_dir, dbsnp_file + "_tbi", logger):
        logger.info("already completed step `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    else:
        logger.info("running step `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
        cmd = ["tabix", "-p", "vcf", dbsnp_known_snp_sites]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["dbsnp_known_snp_sites"] = dbsnp_known_snp_sites
        df["dbsnp_tbi_path"] = dbsnp_tbi_path
        unique_key_dict = {
            "uuid": uuid,
            "dbsnp_known_snp_sites": dbsnp_known_snp_sites,
            "dbsnp_tbi_path": dbsnp_tbi_path,
        }
        table_name = "time_mem_tabix_index_dbsnp_bgz"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, dbsnp_file + "_tbi", logger)
        logger.info("completed running `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    return dbsnp_tbi_path
def main():
    parser = argparse.ArgumentParser("Graph generation", description="Generate graphs for different miRNA stats")
    # Logging flag
    parser.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument("-s", "--sam_path", required=True, help="Path to SAM file")
    parser.add_argument("-f", "--filtered_taglen", required=True, help="Path to filtered_taglength.csv")
    parser.add_argument("-v", "--softclip_taglen", required=True, help="Path to softclip_taglength.csv")
    parser.add_argument("-a", "--adapter_report", required=True, help="Path to adapter report")
    parser.add_argument("-c", "--chastity_taglen", required=True, help="Path to chastity_taglength.csv")
    parser.add_argument("-l", "--alignment_stats", required=True, help="Path to alignment_stats.csv")
    parser.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    parser.add_argument("-r", "--barcode", required=True, help="BAM barcode")
    # Optional DB flags
    parser.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    parser.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")
    args = parser.parse_args()

    sam_path = args.sam_path
    filtered_taglen = args.filtered_taglen
    softclip_taglen = args.softclip_taglen
    adapter_report = args.adapter_report
    chastity_taglen = args.chastity_taglen
    alignment_stats = args.alignment_stats
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging("mir_profiler_graph", args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqlite case
        sqlite_name = "mir_profiler_graph" + uuid + ".db"
        engine_path = "sqlite:///" + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level="SERIALIZABLE")

    # Generate the graphs for the annotation data
    logger.info("Beginning: Annotation graph generation")
    graph_CMD = [
        "perl", "/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/graph_libs.pl",
        "-s", sam_path,
        "-f", filtered_taglen,
        "-o", softclip_taglen,
        "-a", adapter_report,
        "-c", chastity_taglen,
        "-t", alignment_stats,
    ]
    output = pipe_util.do_command(graph_CMD, logger)

    # store time/mem to db
    df = time_util.store_time(uuid, graph_CMD, output, logger)
    df["bam_name"] = barcode
    unique_key_dict = {"uuid": uuid, "bam_name": barcode}
    table_name = "time_mem_mir_graph"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info("Completed: Annotation graph generation")
def main():
    parser = argparse.ArgumentParser('SAM Annotator', description='Annotates the SAM files with miRNA hits')
    # Logging flag
    parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument('-e', '--species_code', required=True, choices=['hsa'], help='Organism species code.')
    parser.add_argument('-s', '--sam_path', required=True, help='Path to directory containing bams.')
    parser.add_argument('-w', '--db_connect', required=True, help='Path to db_connection file')
    parser.add_argument('-u', '--uuid', required=True, help='UUID/GDC_ID for the harmonized BAM.')
    parser.add_argument('-r', '--barcode', required=True, help='BAM barcode')
    # Optional DB flags
    parser.add_argument('-y', '--db_cred_s3url', required=False, help='String s3url of the postgres db_cred file')
    parser.add_argument('-z', '--s3cfg_path', required=False, help='Path to the s3cfg file.')
    args = parser.parse_args()

    species_code = args.species_code
    sam_path = args.sam_path
    connect_path = args.db_connect
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_annotator', args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqlite case
        sqlite_name = 'mir_profiler_annotator' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Annotate the SAM files
    logger.info('Beginning: SAM file annotation')
    annotate_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/annotation/annotate.pl',
                    '-d', connect_path, '-o', species_code, '-s', sam_path]
    output = pipe_util.do_command(annotate_CMD, logger)
    df = time_util.store_time(uuid, annotate_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_sam_annotator'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: SAM file annotation')
def main():
    parser = argparse.ArgumentParser('miRNA matrix mimat development',
                                     description='Mature miRNA gene expression matrix generation')
    # Logging flag
    parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument('-w', '--db_connect', required=True, help='Name of desired miRbase.')
    parser.add_argument('-e', '--species_code', required=True, choices=['hsa'], help='Organism species code.')
    parser.add_argument('-s', '--sam_path', required=True, help='Path to SAM file')
    parser.add_argument('-m', '--mirna_path', required=True, help='Path to miRNA.txt file')
    parser.add_argument('-x', '--crossmapped_path', required=True, help='Path to crossmapped.txt file')
    parser.add_argument('-u', '--uuid', required=True, help='UUID/GDC_ID for the harmonized BAM.')
    parser.add_argument('-r', '--barcode', required=True, help='BAM barcode')
    # Optional DB flags
    parser.add_argument('-y', '--db_cred_s3url', required=False, help='String s3url of the postgres db_cred file')
    parser.add_argument('-z', '--s3cfg_path', required=False, help='Path to the s3cfg file.')
    args = parser.parse_args()

    db_connect = args.db_connect
    species_code = args.species_code
    sam_path = args.sam_path
    mirna_path = args.mirna_path
    crossmapped_path = args.crossmapped_path
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_mimat', args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqlite case
        sqlite_name = 'mir_profiler_mimat' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Generate the mature miRNA expression matrix from the alignment annotations
    logger.info('Beginning: Mature miRNA gene expression matrix generation')
    mimat_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/expression_matrix_mimat.pl',
                 '-d', db_connect, '-o', species_code, '-s', sam_path, '-r', mirna_path, '-c', crossmapped_path]
    output = pipe_util.do_command(mimat_CMD, logger)
    df = time_util.store_time(uuid, mimat_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_expn_mimat'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: Mature miRNA gene expression matrix generation')
bam_analysis_id = bam_analysis_id.strip('/')
scratch_dir = args['scratch_dir']
thread_count = args['thread_count']

def get_s3_objects(uuid, bucket, name, destination, logger):
    #sync_name = name.split('.')[0]  # temp hack to get reference.dict needed by GATK UG/HC
    if pipe_util.already_have(destination, name, logger):
        logger.info('already have object(s) %s in %s' % (name, destination))
    else:
        logger.info('downloading object(s) %s to %s' % (name, destination))
        base_name = os.path.splitext(name)[0]
        s3_path = os.path.join('s3://', bucket, base_name)
        cmd = ['s3cmd', 'sync', s3_path, destination]
        output = pipe_util.do_command(cmd, logger)
        pipe_util.create_have(destination, name, logger)
        df = time_util.store_time(uuid, cmd, output, logger)

def main():
    ## logging
    uuid = pipe_util.get_uuid_from_path(bam_analysis_id)
    logging.basicConfig(filename='vcf_' + uuid + '.log', level=logging.DEBUG, filemode='a',
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d_%H:%M:%S_%Z')
    logger = logging.getLogger(__name__)
    ## open stats and timing db
    home_dir = os.path.expanduser('~')
    db_path = os.path.join(home_dir, 'vcf_pipe.sqlite')
def main():
    parser = argparse.ArgumentParser("TCGA", description="TCGA formatted results generation")
    # Logging flag
    parser.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument("-w", "--db_connect", required=True, help="Name of desired miRbase.")
    parser.add_argument("-g", "--genome_version", required=True, choices=["hg38"], help="Genome Version of Annotation.")
    parser.add_argument("-e", "--species_code", required=True, choices=["hsa"], help="Organism species code.")
    parser.add_argument("-s", "--sam_path", required=True, help="Path to directory containing bams.")
    parser.add_argument("-p", "--mirna_species", required=True, help="Path to mirna_species.txt")
    parser.add_argument("-x", "--crossmapped", required=True, help="Path to crossmapped.txt")
    parser.add_argument("-i", "--isoforms", required=True, help="Path to isoforms.txt")
    parser.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    parser.add_argument("-r", "--barcode", required=True, help="BAM barcode")
    # Optional DB flags
    parser.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    parser.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")
    args = parser.parse_args()

    connect_path = args.db_connect
    genome_version = args.genome_version
    species_code = args.species_code
    sam_path = args.sam_path
    mirna_species = args.mirna_species
    crossmapped = args.crossmapped
    isoforms = args.isoforms
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging("mir_profiler_tcga", args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqlite case
        sqlite_name = "mir_profiler_tcga" + uuid + ".db"
        engine_path = "sqlite:///" + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level="SERIALIZABLE")

    # Generate TCGA formatted results
    logger.info("Beginning: TCGA formatted results generation")
    tcga_CMD = [
        "perl", "/home/ubuntu/bin/mirna-profiler/v0.2.7/code/custom_output/tcga/tcga.pl",
        "-d", connect_path,
        "-o", species_code,
        "-g", genome_version,
        "-s", sam_path,
        "-r", mirna_species,
        "-c", crossmapped,
        "-i", isoforms,
    ]
    output = pipe_util.do_command(tcga_CMD, logger)
    df = time_util.store_time(uuid, tcga_CMD, output, logger)
    df["bam_name"] = barcode
    unique_key_dict = {"uuid": uuid, "bam_name": barcode}
    table_name = "time_mem_mir_tcga"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info("Completed: TCGA formatted results generation")
def do_samtools_flagstat(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    flagstat_outfile = 'samtools_flagstat_' + bam_base + '.txt'
    flagstat_path = os.path.join(step_dir, flagstat_outfile)
    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base, logger):
        logger.info('already completed step `samtools flagstat` of: %s' % bam_path)
    else:
        logger.info('running step `samtools flagstat` of: %s' % bam_path)
        cmd = ['samtools', 'flagstat', bam_path]
        flagstat_output = pipe_util.do_command(cmd, logger)
        with open(flagstat_path, 'w') as flagstat_path_open:
            flagstat_path_open.write(flagstat_output.decode())  # was a char-by-char loop via a no-op str.format()
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, flagstat_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        table_name = 'time_mem_samtools_flagstat'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_flagstat_' + bam_base, logger)
        logger.info('completed running step `samtools flagstat` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base + '_db', logger):
        logger.info('already stored `samtools flagstat` of %s to db' % bam_path)
    else:
        data_dict = samtools_flagstat_to_dict(uuid, bam_path, flagstat_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_flagstat'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_flagstat_' + bam_base + '_db', logger)
        logger.info('completed storing `samtools flagstat` of %s to db' % bam_path)
    return
def main():
    parser = argparse.ArgumentParser('SAM alignment stats',
                                     description='Generate alignment stats for the miRNA in the annotated SAM file')
    # Logging flag
    parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument('-s', '--sam_path', required=True, help='Path to SAM file')
    parser.add_argument('-a', '--adapter_path', required=True, help='Path to adapter report')
    parser.add_argument('-u', '--uuid', required=True, help='UUID/GDC_ID for the harmonized BAM.')
    parser.add_argument('-r', '--barcode', required=True, help='BAM barcode')
    # Optional DB flags
    parser.add_argument('-y', '--db_cred_s3url', required=False, help='String s3url of the postgres db_cred file')
    parser.add_argument('-z', '--s3cfg_path', required=False, help='Path to the s3cfg file.')
    args = parser.parse_args()

    sam_path = args.sam_path
    adapter_path = args.adapter_path
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_stats', args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqlite case
        sqlite_name = 'mir_profiler_stats' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Get stats from the alignment annotations
    logger.info('Beginning: Alignment stats generation')
    stats_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/alignment_stats.pl',
                 '-s', sam_path, '-a', adapter_path]
    output = pipe_util.do_command(stats_CMD, logger)
    df = time_util.store_time(uuid, stats_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_alignment_stats'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: Alignment stats generation')
def get_file_size(uuid, file_path, engine, logger):
    cmd = ['ls', '-l', file_path]
    output = pipe_util.do_command(cmd, logger)
    filesize = output.split()[4].decode()  # field 5 of `ls -l` output is the size in bytes
    logger.info('%s filesize=%s' % (file_path, filesize))
    return filesize
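# Parsing `ls -l` output is shell- and locale-sensitive. A sketch of the portable
# in-process alternative using the standard library; the timing/db bookkeeping
# done elsewhere in this module is deliberately omitted here:
import os

def get_file_size_sketch(file_path):
    return str(os.path.getsize(file_path))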