def filter_call(sample):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    log.info('Filtering {0} by coverage'.format(sample))
    helpers.filter_vcf_by_coverage_cutoffs(
        vcf=(sample + in_file_extension),
        cutoff_table=mglobals.coverage_cutoffs)
    log.info('Filtering {0} according to SNP file: {1}'.format(
        sample, mglobals.current_snp_file))
    dgrp_intersect_command = [
        'nice', '-n', '5',
        'intersectBed',
        '-a', (sample + '_covfil.vcf'),  # the output of the helper
                                         # function above.
        '-b', mglobals.current_snp_file,
        '-wa'
    ]
    sample_dgrp_intersect = sample + out_file_extension
    with open(sample_dgrp_intersect, 'w') as out:
        helpers.sub_call(dgrp_intersect_command, stdout=out)

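# helpers.filter_vcf_by_coverage_cutoffs lives in helpers.py and is not shown
# in this section. Below is a minimal sketch of the kind of filter it performs,
# assuming (these are assumptions, not the real implementation) that
# cutoff_table maps chromosome -> (min_depth, max_depth) and that read depth
# can be pulled from a DP= entry in the VCF INFO column; the '_covfil.vcf'
# output name matches the comment in filter_call above.
import re


def _example_coverage_filter(vcf, cutoff_table):
    """Hypothetical stand-in for helpers.filter_vcf_by_coverage_cutoffs."""
    out_name = vcf.rsplit('.', 1)[0] + '_covfil.vcf'
    with open(vcf) as fin, open(out_name, 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)  # keep header lines untouched
                continue
            fields = line.split('\t')
            chrom, info = fields[0], fields[7]
            depth_match = re.search(r'DP=(\d+)', info)  # assumed depth tag
            if depth_match is None:
                continue
            depth = int(depth_match.group(1))
            low, high = cutoff_table.get(chrom, (0, float('inf')))
            if low <= depth <= high:
                fout.write(line)
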
def tophat_call(sample, ref_fasta):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    ref_fasta_base = ref_fasta.split('.')[0]
    mismatches = '2'
    # Split the machine's cores evenly across the samples being aligned.
    number_of_samples = len(mglobals.samples_list)
    threads_per_sample = mglobals.cpu_count // number_of_samples
    threads = str(threads_per_sample)
    log.info('threads per sample ' + threads)
    log.info('tophat: aligning sample {} with ref fasta {}'.format(
        sample, ref_fasta))
    tophat_params = [
        'nice', '-n', '5',
        'tophat',
        '-p', threads,                     # threads for this sample
        '-G', mglobals.dros_gtf,           # known gene annotation
        '--transcriptome-index=../transcriptome_data/known',
        '-N', mismatches,                  # max mismatches per read
        '--b2-L', '20',                    # bowtie2 seed length
        '--b2-N', '1',                     # bowtie2 seed mismatches
        '--read-edit-dist', mismatches,
        '-o', (sample + '_thout'),         # per-sample output directory
        '--no-novel-juncs',                # only use junctions from the GTF
        ref_fasta_base,
        join(mglobals.samples_path, (sample + '.fastq'))
    ]
    helpers.sub_call(tophat_params)
    log.info('tophat: finished analyzing sample: {} with ref fasta: {}'.format(
        sample, ref_fasta))

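# Worked example of the thread split above (hypothetical numbers): with
# mglobals.cpu_count = 40 and 8 samples in mglobals.samples_list,
# 40 // 8 = 5, so each concurrent TopHat run is launched with '-p 5';
# integer division means any leftover cores (e.g. 2 of 42) simply stay idle.
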
def variant_calls_call(sample):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    log.info('Varscan: creating vcf for: ' + sample)
    # The thresholds are very permissive (p-value 1, tiny minimum variant
    # frequency, no strand filter), leaving most filtering to the downstream
    # coverage/SNP-file steps rather than to VarScan itself.
    varscan_command = [
        'nice', '-n', '5',
        'java', '-jar', mglobals.varscan_path,
        'mpileup2snp', (sample + in_file_extension),
        '--min-coverage', '2',
        '--min-avg-qual', '20',
        '--strand-filter', '0',
        '--p-value', '1',
        '--min-var-freq', '1e-10',
        '--output-vcf', '1',
    ]
    output_file = sample + out_file_extension
    with open(output_file, 'w') as out:
        helpers.sub_call(varscan_command, stdout=out)
    log.info('varscan finished for: ' + sample)

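# helpers.sub_call is defined elsewhere in the repo; the calls in this section
# only rely on it taking a command list plus optional subprocess keyword
# arguments such as stdout. A minimal sketch of such a wrapper (an assumption,
# not the actual helpers.py code):
import subprocess


def _example_sub_call(command, **kwargs):
    # Log the command, run it, and raise if the exit status is non-zero.
    # Extra keyword arguments (e.g. stdout=<file handle>) are passed
    # straight through to subprocess.
    log.info('Running: ' + ' '.join(command))
    subprocess.check_call(command, **kwargs)
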
def tophat_call(sample, ref_fasta):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    ref_fasta_base = ref_fasta.split('.')[0]
    mismatches = '5'
    mate_inner = sample.split('_')[2]
    number_of_samples = len(mglobals.samples_list)
    threads_per_sample = mglobals.cpu_count // number_of_samples
    threads = str(threads_per_sample)
    log.info('threads per sample ' + threads)
    log.info('tophat: aligning sample {} with ref fasta {}'.format(
        sample, ref_fasta))
    tophat_params = [
        'nice', '-n', '5',
        'tophat',
        '-p', threads,
        '-G', mglobals.dros_gtf,
        '--transcriptome-index=../transcriptome_data/known',
        '-N', mismatches,
        '--b2-L', '20',
        '--b2-N', '1',
        '--read-edit-dist', mismatches,
        '-o', (sample + '_thout'),
        '--no-novel-juncs',
        '--mate-inner-dist', mate_inner,
        ref_fasta_base,
        join(mglobals.samples_path, (sample + '_trim_R1.fastq')),
        join(mglobals.samples_path, (sample + '_trim_R2.fastq'))
    ]
    helpers.sub_call(tophat_params)
    log.info(
        'tophat: finished analyzing sample: {} with ref fasta: {}'.format(
            sample, ref_fasta))

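# mate_inner comes straight out of the sample name: sample.split('_')[2]
# assumes the third underscore-separated field of the name encodes the
# --mate-inner-dist value. For example (hypothetical sample name):
#
#     >>> 'lineA_rep1_200'.split('_')[2]
#     '200'
#
# so a sample named 'lineA_rep1_200' is aligned with --mate-inner-dist 200;
# a name with fewer than three fields would raise an IndexError here.
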
def trim_call(sample):
    log.info('Trimming sample {}'.format(sample))
    trimmed_path = join(mglobals.trimmed_path, sample)
    p_trim_R1 = trimmed_path + '_trim_R1.fastq'
    u_trim_R1 = trimmed_path + '_u_trim_R1.fastq'
    p_trim_R2 = trimmed_path + '_trim_R2.fastq'
    u_trim_R2 = trimmed_path + '_u_trim_R2.fastq'
    if os.path.exists(p_trim_R1) and os.path.exists(p_trim_R2):
        log.info('Sample already trimmed, linking from trimmed_path')
        # The try block is needed because symlink creation fails if the
        # link already exists.
        try:
            os.symlink(p_trim_R1, os.path.basename(p_trim_R1))
            os.symlink(p_trim_R2, os.path.basename(p_trim_R2))
        except OSError:
            pass
    else:
        threads = str(mglobals.cpu_count // 10)
        # This is optimal for ~40 samples but would probably crash with a
        # higher number.
        log.info('Sample not already trimmed, trimming now')
        trim_params = [
            'nice', '-n', '5',
            'java', '-jar', mglobals.trimmomatic_path,
            'PE', '-threads', threads, '-phred33',
            sample + '_R1.fastq', sample + '_R2.fastq',  # input read pairs
            p_trim_R1, u_trim_R1,   # paired / unpaired R1 output
            p_trim_R2, u_trim_R2,   # paired / unpaired R2 output
            'SLIDINGWINDOW:4:20',   # cut when the 4-base window mean quality < 20
            'TRAILING:20',          # trim trailing bases below quality 20
            'MINLEN:50'             # drop reads shorter than 50 bp after trimming
        ]
        helpers.sub_call(trim_params)
        log.info('Finished trimming sample {}, linking in.'.format(sample))
        os.symlink(p_trim_R1, os.path.basename(p_trim_R1))
        os.symlink(p_trim_R2, os.path.basename(p_trim_R2))

def annotate_call(sample):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    log.info('Annotating ' + sample + in_file_extension + ' with ' +
             mglobals.current_genes_file)
    gtf_intersect_command = [
        'nice', '-n', '5',
        'intersectBed',
        '-a', (sample + in_file_extension),
        '-b', mglobals.current_genes_file,
        '-wa', '-wb'   # write both the VCF record and the GTF feature it overlaps
    ]
    sample_gtf_intersect = sample + out_file_extension
    with open(sample_gtf_intersect, 'w') as out:
        helpers.sub_call(gtf_intersect_command, stdout=out)

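# With -wa -wb, intersectBed writes each overlapping VCF record followed by
# the full GTF line it hit, so the GTF attribute string ends up in the last
# tab-separated field of every output line. A minimal sketch of pulling gene
# IDs back out of annotate_call's output (the gene_id attribute key is the
# usual GTF convention and is assumed here, as is the _example_ name):
import re


def _example_genes_hit(intersect_file):
    """Hypothetical: collect gene_id values whose features overlap a variant."""
    genes = set()
    with open(intersect_file) as handle:
        for line in handle:
            attributes = line.rstrip('\n').split('\t')[-1]  # GTF attribute column
            gene_match = re.search(r'gene_id "([^"]+)"', attributes)
            if gene_match:
                genes.add(gene_match.group(1))
    return genes
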
def pileup_call(sample, ref_fasta):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    log.info('mpileup: creating .mpileup file for {} with ref fasta: {}'.format(
        sample, ref_fasta))
    pileup_command = [
        'nice', '-n', '5',
        'samtools', 'mpileup',
        '-B',            # disable BAQ recalculation
        '-d10000000',    # effectively no per-position depth cap
        '-f', ref_fasta,
        join((sample + '_thout'), 'filter.bam')
    ]
    output_file = sample + out_file_extension
    with open(output_file, 'w') as out:
        helpers.sub_call(pileup_command, stdout=out)
    log.info('mpileup: finished for {} with ref fasta: {}'.format(
        sample, ref_fasta))

def tophat_call(sample, ref_fasta):
    if mglobals.original:
        os.chdir(join(mglobals.original_path, sample))
    else:
        os.chdir(join(mglobals.alternate_path, sample))
    ref_fasta_base = ref_fasta.split('.')[0]
    mismatches = '5'
    number_of_samples = len(mglobals.samples_list)
    threads_per_sample = mglobals.cpu_count // number_of_samples
    threads = str(threads_per_sample)
    log.info('threads per sample ' + threads)
    log.info('tophat: aligning sample {} with ref fasta {}'.format(
        sample, ref_fasta))
    tophat_params = [
        'nice', '-n', '5',
        'tophat',
        '-p', threads,
        '-G', mglobals.dros_gtf,
        '--transcriptome-index=../transcriptome_data/known',
        '-N', mismatches,
        '--b2-L', '20',
        '--b2-N', '1',
        '--read-edit-dist', mismatches,
        '-o', (sample + '_thout'),
        '--no-novel-juncs',
        ref_fasta_base,
        join(mglobals.samples_path, (sample + '.fastq'))
    ]
    helpers.sub_call(tophat_params)
    log.info('tophat: finished analyzing sample: {} with ref fasta: {}'.format(
        sample, ref_fasta))

def build_fastas_call(sample):
    os.chdir(join(mglobals.original_path, sample))
    log.info('Beginning to build alternate fasta for: ' + sample)
    fixed_vcf = sample + '_fix.vcf'
    log.info('Removing duplicated annotations (per transcript annotations)')
    helpers.remove_dups(input_f=(sample + in_file_extension),
                        output_f=(sample + '.temp'))
    log.info('Removing duplicate alleles and adding header')
    # The fact that the original vcf was named sample.vcf is hardcoded
    # here. Be careful.
    helpers.vcf_fix(template_f=(sample + '.vcf'),
                    input_f=(sample + '.temp'),
                    output_f=fixed_vcf)
    # Delete temporary file
    os.remove(sample + '.temp')
    log.info('Creating alternate fasta')
    new_fasta = sample + '_unfixed.fa'
    helpers.sub_call(['nice', '-n', '5',
                      'java', '-Xmx2g', '-jar', mglobals.gatk_path,
                      '-R', 'genome.fa',
                      '-T', 'FastaAlternateReferenceMaker',
                      '-o', new_fasta,
                      '--variant', fixed_vcf])
    # Fix the fasta
    log.info('Fixing gatk fasta')
    # If you change this name, you need to change the alternate fastas
    # list as well.
    final_fasta = sample + '.fa'
    helpers.fasta_fix(input_f=new_fasta, output_f=final_fasta)
    # Delete the unfixed version
    os.remove(new_fasta)
    log.info('Moving new fasta to: ' + join(mglobals.alternate_path, sample))
    shutil.move(final_fasta, join(mglobals.alternate_path, sample))
    log.info('Indexing new fasta')
    os.chdir(join(mglobals.alternate_path, sample))
    helpers.sub_call(['bowtie2-build', '-f', final_fasta, sample])

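# Each *_call function above operates on a single sample (tophat_call and
# pileup_call also take the reference fasta), so a driver only has to fan a
# step out over mglobals.samples_list. A minimal sketch of such a driver
# (an assumption about how the steps are invoked, not code from this section):
import multiprocessing


def _example_run_step(step, extra_args=()):
    """Hypothetical driver: run one pipeline step for every sample in parallel."""
    pool = multiprocessing.Pool(processes=len(mglobals.samples_list))
    results = [pool.apply_async(step, (sample,) + tuple(extra_args))
               for sample in mglobals.samples_list]
    pool.close()
    pool.join()
    for result in results:
        result.get()  # re-raise any exception from a worker

# e.g. _example_run_step(variant_calls_call)
#  or  _example_run_step(pileup_call, ('genome.fa',))
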