def convert_sam_to_bam(sam):
    util.info('Converting %s to BAM format to save disk space...' % sam)
    # os.path.splitext removes the extension safely; str.strip('.sam') would
    # strip any of those characters from both ends of the path
    bam_file = os.path.splitext(sam)[0] + '.bam'
    cmdArgs = ['samtools', 'view', '-bh', sam, '-o', bam_file]
    util.call(cmdArgs)
    os.remove(sam)
    return bam_file
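# A minimal usage sketch (hypothetical path): the source SAM is deleted once
# the BAM has been written, so keep a copy if the SAM is still needed.
#
#     bam_path = convert_sam_to_bam('reads/sample1.sam')
#     # -> 'reads/sample1.bam'; 'reads/sample1.sam' is removed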
def gatk_merge_vcfs(dir_name, strain_vcf_paths, genome_fasta_path, num_cpu=util.MAX_CORES):
    merge_file_path = _get_merged_vcf_path(dir_name, strain_vcf_paths.keys(), CALLER_GATK)

    if os.path.exists(merge_file_path):
        util.info("%s already exists and won't be overwritten..." % merge_file_path)

    else:
        cmd_args = list(util.JAVA) + [
            '-jar', exe.EXE[CALLER_GATK],
            '-T', 'GenotypeGVCFs',
            '-R', genome_fasta_path,
            # '-nt', str(min(8, num_cpu)),  # Seems to fail with multiple CPU threads...
            '-o', merge_file_path]

        for strain in strain_vcf_paths:
            cmd_args += ['-V', strain_vcf_paths[strain]]

        util.call(cmd_args)

    return merge_file_path
def call_genotype_gatk(strain_bam_paths, genome_fasta_path, num_cpu, out_dir, sub_dir_name):
    # GATK pipeline - parallelise strains in Python
    genome_index_file = genome_fasta_path + '.fai'
    genome_dict_file = os.path.splitext(genome_fasta_path)[0] + '.dict'

    if not os.path.exists(genome_index_file):
        util.info('Making index for genome FASTA file %s' % genome_fasta_path)
        cmd_args = [exe.EXE['samtools'], 'faidx', genome_fasta_path]
        util.call(cmd_args)

    if not os.path.exists(genome_dict_file):
        util.info('Making dict file for genome FASTA file %s' % genome_fasta_path)
        cmd_args = [exe.EXE['samtools'], 'dict', genome_fasta_path, '-o', genome_dict_file]
        util.call(cmd_args)

    strains = sorted(strain_bam_paths)
    bam_paths = [strain_bam_paths[s] for s in strains]  # Each parallel call will be sent one of these
    common_args = [genome_fasta_path, sub_dir_name]  # All tasks share this

    vcf_paths = util.parallel_split_job(gatk_haplotype_job, bam_paths, common_args,
                                        num_cpu, collect_output=True)

    # BAM and VCF paths are in corresponding order
    strain_vcf_paths = {strains[i]: p for i, p in enumerate(vcf_paths)}

    # Multi-sample merge
    merged_vcf_path = gatk_merge_vcfs(out_dir, strain_vcf_paths, genome_fasta_path, num_cpu=num_cpu)

    return merged_vcf_path
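# Sketch of the expected inputs (hypothetical names and paths): strain_bam_paths
# maps strain names to cleaned BAM files; the merged multi-sample VCF path is returned.
#
#     strain_bam_paths = {'strainA': 'data/strainA_srt_clean.bam',
#                         'strainB': 'data/strainB_srt_clean.bam'}
#     vcf = call_genotype_gatk(strain_bam_paths, 'genome.fa', num_cpu=4,
#                              out_dir='out', sub_dir_name='vcf')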
def gatk_haplotype_job(bam_file_path, genome_fasta_path, sub_dir_name):
    path_root, file_ext = os.path.splitext(bam_file_path)
    dir_name, file_root = os.path.split(path_root)
    vcf_dir_name = os.path.join(dir_name, sub_dir_name)
    # vcf_file_path = os.path.join(vcf_dir_name, '%s_hap.vcf' % (file_root))
    gvcf_file_path = os.path.join(vcf_dir_name, '%s_hap.g.vcf' % (file_root))

    if os.path.exists(gvcf_file_path):
        util.info("VCF file %s already exists. Skipping haplotype calling for %s"
                  % (gvcf_file_path, file_root))
        return gvcf_file_path

    util.makedirs(vcf_dir_name, exist_ok=True)
    util.info('Creating GVCF file for %s using GATK' % file_root)

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE[CALLER_GATK],
        '-T', 'HaplotypeCaller',
        '-R', genome_fasta_path,
        '-I', bam_file_path,
        '-o', gvcf_file_path,
        '-ERC', 'GVCF',
        '-variant_index_type', 'LINEAR',       # Deprecated for GATK 4.0
        '-variant_index_parameter', '128000']  # Deprecated for GATK 4.0

    util.call(cmd_args)

    return gvcf_file_path
def sam_parser(sam_file, aligner, remove_sam=True, convert_to_bam=True):
    # convert_to_bam mirrors the pipeline flag of the same name; it was
    # previously referenced here without being defined in this scope

    base, ext = os.path.splitext(sam_file)  # splitext avoids the str.strip() pitfall
    counts_file = base + '_lib_guidecounts.txt'
    counts_log = base + '_lib_guidecounts.log'
    sam_parser_to_guide_counts = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'sam_parser_to_guide_counts.sh')

    if convert_to_bam:  # Input is a BAM file
        util.info('Removing sam header from %s in order to proceed to read counting...' % sam_file)
        # Remove the header and any unaligned reads. This is particularly important
        # when using bowtie because the --no-unal flag doesn't really work.
        temp = base + '_temp.sam'
        cmdArgs = ['samtools', 'view', '-F', '4', sam_file, '-o', temp]
        util.call(cmdArgs)

        util.info('Counting reads from %s...' % sam_file)
        cmdArgs = [sam_parser_to_guide_counts, temp, counts_file, aligner]
        util.call(cmdArgs, stderr=counts_log)
        os.remove(temp)

    else:
        util.info('Counting reads from %s...' % sam_file)
        cmdArgs = [sam_parser_to_guide_counts, sam_file, counts_file, aligner]
        util.call(cmdArgs, stderr=counts_log)

    if remove_sam and ext == '.sam':  # '==' not 'is': identity comparison of strings is unreliable
        os.remove(sam_file)

    return counts_file
def plot_coverage(coverage_files, output=None):
    outdir = "/".join(coverage_files[0].split("/")[0:-2])

    if output is None:
        output = util.get_rand_string(8)
        # The '% output' belongs inside the call, otherwise it is applied to
        # util.info's return value
        util.info("Header for output files has not been specified. "
                  "Random string %s will be used instead..." % output)

    all_strains_cov = "%s/%s_all_strains_cov.txt" % (outdir, output)
    fileObj1 = open(all_strains_cov, "w")
    # Header gains a Strain column to match the three fields written per row
    fileObj1.write("Strain\tGenome_cov\tExon_cov\n")

    for f in coverage_files:
        strain = os.path.basename(f).split("_")[0]

        fileObj = open(f, "r")
        line1 = fileObj.readline().split(" ")
        genome_cov = line1[4].rstrip()
        line2 = fileObj.readline().split(" ")
        exon_cov = line2[7].rstrip()
        fileObj.close()

        fileObj1.write("\t".join([strain, genome_cov, exon_cov]) + "\n")

    fileObj1.close()
    util.info("File saved as %s..." % all_strains_cov)

    cmdArgs = ['Rscript', '--vanilla', exe.EXE['phc'], all_strains_cov, output]
    util.call(cmdArgs)
def freebayes_genotype_job(region, genome_fasta_path, bam_paths):
    out_vcf_path = 'temp_%s_freebayes.vcf' % region

    if not os.path.exists(out_vcf_path):
        cmd_args = [exe.EXE['freebayes'],
                    # '--no-mnps',     # make this optional
                    # '--no-complex',  # make this optional
                    '-f', genome_fasta_path,
                    '-r', region,
                    '-v', out_vcf_path]  # , '--ploidy', '2']
        cmd_args += bam_paths
        util.call(cmd_args)

    return out_vcf_path
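# Example call (hypothetical paths): regions use the samtools-style
# 'chrom:start-end' string that call_genotype_freebayes generates below.
#
#     vcf = freebayes_genotype_job('chr1:0-500000', 'genome.fa',
#                                  ['strainA.bam', 'strainB.bam'])
#     # -> 'temp_chr1:0-500000_freebayes.vcf' in the working directory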
def gatk_select_vars(strain_name, merged_vcf_path, genome_fasta_path,
                     file_tag='extracted', homozygous=True):
    dir_name, file_name = os.path.split(merged_vcf_path)
    out_vcf_path = os.path.join(dir_name, '%s_%s.vcf' % (strain_name, file_tag))
    # Original naming: "%s_sorted_f3_F4_q1_mark_dups_w_mate_cig_gatk_hap_call_extracted.vcf" % strain_name

    util.info('Creating VCF file for %s' % strain_name)

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE[CALLER_GATK],
        '-T', 'SelectVariants',
        '-R', genome_fasta_path,
        '-V', merged_vcf_path,
        '-o', out_vcf_path,
        '-sn', 'sample_%s' % strain_name]

    if homozygous:
        cmd_args += ['-select', "vc.getGenotype('sample_%s').isHomVar()" % strain_name]  # Check quotes
    else:
        # cmd_args += ['-select', "! vc.getGenotype('sample_%s').isHomRef()" % strain_name]
        cmd_args += ['-select',
                     "vc.getGenotype('sample_%s').isHomVar() || "
                     "vc.getGenotype('sample_%s').isHet() && "
                     "! vc.getGenotype('sample_%s').isHomRef()"
                     % (strain_name, strain_name, strain_name)]

    util.call(cmd_args)
    util.info('All done for strain %s. VCF file can be found in %s' % (strain_name, out_vcf_path))
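# For example, for strain 'S1' with homozygous=True the JEXL expression passed
# to SelectVariants is "vc.getGenotype('sample_S1').isHomVar()", so only sites
# where that sample is homozygous for the alternate allele are retained.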
def call_genotype_freebayes(strain_bam_paths, genome_fasta_path, num_cpu, out_dir, sub_dir_name):
    # FreeBayes pipeline
    strain_names, bam_file_paths = zip(*list(strain_bam_paths.items()))
    merge_file_path = _get_merged_vcf_path(out_dir, bam_file_paths, CALLER_FREEBAYES)

    if os.path.exists(merge_file_path):
        util.info("%s exists and won't be overwritten. Skipping..." % merge_file_path)

    else:
        temp_file_path_a = util.get_temp_path(merge_file_path)

        # Make regions for parallelisation, splitting all chromosomes according
        # to the number of CPUs
        chromo_sizes = util.get_bam_chromo_sizes(bam_file_paths[0])
        regions = []
        region_fmt = '%s:%d-%d'

        for chromo, size in chromo_sizes:
            step = int(size / num_cpu) + 1  # Will be rounded up
            i = 0
            j = step

            while j < size:
                regions.append(region_fmt % (chromo, i, j))
                i = j
                j += step

            regions.append(region_fmt % (chromo, i, size))

        # Call haplotypes for all strains at once, split into parallel regions
        common_args = [genome_fasta_path, bam_file_paths]
        region_vcf_paths = util.parallel_split_job(freebayes_genotype_job, regions,
                                                   common_args, num_cpu, collect_output=True)

        # Combine the regions which were run in parallel
        util.info('Combining freebayes regions')

        out_file_obj = open(temp_file_path_a, 'w')
        write = out_file_obj.write

        for i, region_vcf in enumerate(region_vcf_paths):
            with open(region_vcf) as file_obj:
                for line in file_obj:
                    if '\n' in line:
                        if line[0] == '#':
                            if i == 0:  # Keep the header from the first region only
                                write(line)
                        else:
                            write(line)
                    else:
                        util.critical('No end of line in %s. Exiting...' % region_vcf)

        out_file_obj.close()

        cmd_args = [exe.EXE['vcfuniq']]
        util.call(cmd_args, stdin=temp_file_path_a, stdout=merge_file_path)

        # Clean up temp files
        os.unlink(temp_file_path_a)

        for file_path in region_vcf_paths:
            os.unlink(file_path)

    return merge_file_path
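# Worked example of the region chunking above: a 10,000,000 bp chromosome with
# num_cpu=4 gives step = int(10000000 / 4) + 1 = 2500001, producing the regions
#
#     chr1:0-2500001, chr1:2500001-5000002,
#     chr1:5000002-7500003, chr1:7500003-10000000
#
# The final region is truncated at the chromosome end.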
def genome_map(aligner, strain_name, strain_num, fastq_paths, genome_index_path,
               genome_fasta_path, num_cpu=util.MAX_CORES):
    dir_name, base_name = os.path.split(fastq_paths[0])
    path_root = os.path.join(dir_name, strain_name)
    sam_file_path = '%s.sam' % path_root

    if os.path.exists(sam_file_path):
        util.info("SAM file %s already exists. Skipping genome mapping" % sam_file_path)

    else:
        util.info("Running aligner %s on %s..." % (aligner, strain_name))

        if aligner == ALIGNER_BWA:
            rg_header = "@RG\\tID:%s\\tSM:sample_%s\\tPL:illumina\\tLB:lib%d\\tPU:unit%d" % (
                strain_name, strain_name, strain_num, strain_num)
            cmd_args = [exe.EXE[ALIGNER_BWA], 'mem',
                        '-t', str(num_cpu),
                        '-M',
                        '-R', rg_header,
                        # genome_index_path] + list(fastq_paths)
                        genome_fasta_path] + list(fastq_paths)
            util.call(cmd_args, stdout=open(sam_file_path, 'w'))

        elif aligner == ALIGNER_BT2:
            cmd_args = [exe.EXE[ALIGNER_BT2],
                        '--sensitive',
                        '-x', genome_index_path,
                        '-p', str(num_cpu),
                        '-q',  # FASTQ input
                        '--rg-id', strain_name,
                        '--rg', "SM:sample_%s\tPL:illumina\tLB:lib%d\tPU:unit%d" % (
                            strain_name, strain_num, strain_num),
                        '-S', sam_file_path]

            if len(fastq_paths) > 1:
                cmd_args += ['-1', fastq_paths[0], '-2', fastq_paths[1]]
            else:
                cmd_args += ['-U', fastq_paths[0]]

            util.call(cmd_args)

        else:  # bbmap
            cmd_args = [exe.EXE[ALIGNER_BBMAP],
                        'ref=%s' % genome_fasta_path,
                        'path=%s' % genome_index_path,
                        'sam=1.3',
                        'in=%s' % fastq_paths[0],
                        'out=%s' % sam_file_path,
                        't=%d' % num_cpu,
                        'rgid=%s' % strain_name,
                        'rgsm=sample_%s' % strain_name,
                        'rgpl=illumina',
                        'rglb=lib%d' % strain_num,
                        'rgpu=unit%d' % strain_num]

            if len(fastq_paths) > 1:
                cmd_args += ['in2=%s' % fastq_paths[1]]

            util.call(cmd_args)

        util.info('Done %s genome alignment for strain %s' % (aligner, strain_name))

    return sam_file_path
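# For example, for strain 'S1' with strain_num=1 the BWA read-group string above
# expands to "@RG\tID:S1\tSM:sample_S1\tPL:illumina\tLB:lib1\tPU:unit1";
# the literal '\t' is interpreted as a tab by bwa mem itself.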
def bedtools_coverage(bam_file_path, genome_fasta_path, exon_gff_file_path):
    dir_name, base_name = os.path.split(bam_file_path)
    file_root = os.path.splitext(base_name)[0]
    dir_name = os.path.join(dir_name, 'coverage')
    util.makedirs(dir_name, exist_ok=True)  # Not the os version, to be Python 2 and 3 compatible

    genome_cvr_file_path = os.path.join(dir_name, base_name + '.genomecov')
    exon_cvr_file_path = os.path.join(dir_name, base_name + '_exon.coverage')
    exon_cvr_temp_file_path = os.path.join(dir_name, base_name + '_exon.coverage.temp')
    R_cvr_file_path = os.path.join(dir_name, base_name + '_R_coverage.out')

    if os.path.exists(R_cvr_file_path):
        util.info("Coverage file %s already exists. Skipping coverage calculations"
                  % (R_cvr_file_path,))
        return

    bedtools_exe = exe.EXE['bedtools']

    util.info("Running bedtools genomecov...")
    # cmd_args = [bedtools_exe, 'genomecov', '-ibam', bam_file_path, '-g', genome_fasta_path]
    cmd_args = [bedtools_exe, 'genomecov', '-pc', '-ibam', bam_file_path]
    util.call(cmd_args, stdout=genome_cvr_file_path)
    util.info("Done... Results saved in: %s" % genome_cvr_file_path)

    util.info("Converting %s into a sorted bed file..." % bam_file_path)
    temp_dir = os.path.join(dir_name, 'TEMP_%s' % uuid.uuid4())
    os.makedirs(temp_dir)
    temp_bed_file1 = os.path.join(temp_dir, '%s.bed' % file_root)
    temp_bed_file2 = os.path.join(temp_dir, '%s_sortBed.bed' % file_root)
    cmd_args = [bedtools_exe, 'bamtobed', '-i', bam_file_path]
    util.call(cmd_args, stdout=temp_bed_file1)
    # cmd_args = [bedtools_exe, 'sort', '-i', temp_bed_file1]
    cmd_args = ['sort', '-k1,1', '-k2,2n', '--batch-size=5', temp_bed_file1]  # Update to reduce RAM usage
    util.call(cmd_args, stdout=temp_bed_file2)
    util.info("Done... Results saved in temporary directory: %s" % temp_dir)

    util.info("Running bedtools coverage...")
    # cmd_args = [bedtools_exe, 'coverage', '-hist', '-a', exon_gff_file_path, '-b', temp_bed_file2]
    cmd_args = [bedtools_exe, 'coverage', '-sorted', '-hist',
                '-a', exon_gff_file_path, '-b', temp_bed_file2]  # Update to reduce RAM usage
    util.call(cmd_args, stdout=exon_cvr_file_path)
    util.info("Done... Results saved in: %s" % exon_cvr_file_path)

    # To calculate exon coverage in R, extract all lines starting with "all"
    # from the exon.coverage file into a temporary file
    cmd_args = ['grep', 'all', exon_cvr_file_path]
    util.call(cmd_args, stdout=exon_cvr_temp_file_path)

    util.info("Running R to compute mean genome coverage and mean exon coverage...")
    cmd_args = ['Rscript', '--vanilla', exe.EXE['mgcr'],
                genome_cvr_file_path, exon_cvr_temp_file_path]
    util.call(cmd_args, stdout=R_cvr_file_path)

    util.info("Deleting temporary directory and files...")
    shutil.rmtree(temp_dir)  # Performs the deletion announced above; assumes shutil is imported at module level
def sam_cleanup(sam_file_path, num_cpu=2):
    file_tag = util.FILE_TAG
    path_root, file_ext = os.path.splitext(sam_file_path)
    strain_name = os.path.basename(path_root)

    bam_file_path = '%s%ssrt.bam' % (path_root, file_tag)
    clean_bam_path = '%s%ssrt_%s.bam' % (path_root, file_tag, CLEAN_TAG)
    out_bam_path = '%s%ssrt_%s_%s.bam' % (path_root, file_tag, CLEAN_TAG, PICARD_TAG)
    metrics_file_path = '%s%ssrt_%s_%s_metrics.txt' % (path_root, file_tag, CLEAN_TAG, PICARD_TAG)

    if os.path.exists(out_bam_path):
        util.info("BAM file %s already exists. Skipping SAM cleanup" % out_bam_path)
        return out_bam_path

    util.info("Converting SAM file from genome aligner output into sorted BAM...")
    cmd_args = [exe.EXE['samtools'], 'sort',
                '-O', 'bam',
                # '-@', str(num_cpu),
                '-o', bam_file_path,
                sam_file_path]
    util.call(cmd_args)

    util.info('Removing unmapped reads, PCR duplicates and low-quality reads '
              '(MAPQ below 1), keeping only properly mapped read pairs...')
    # Log strains individually; use exe.EXE for consistency with other samtools calls
    cmd_args = [exe.EXE['samtools'], 'view', '-b',
                '-f', '3', '-F', '4', '-q', '1', bam_file_path]
    util.call(cmd_args, stdout=open(clean_bam_path, 'wb'))

    util.info("Marking duplicate reads using Picard")
    cwd = os.getcwd()
    os.chdir('/')  # Picard is picky about relative paths

    cmd_args = list(util.JAVA)
    cmd_args += ['-jar', exe.EXE['picard'],
                 'MarkDuplicatesWithMateCigar',
                 'I=%s' % clean_bam_path,
                 'O=%s' % out_bam_path,
                 'M=%s' % metrics_file_path]
    util.call(cmd_args)
    os.chdir(cwd)

    util.info("Indexing %s" % out_bam_path)
    util.call([exe.EXE['samtools'], 'index', out_bam_path])

    util.info('Done BAM clean-up for strain %s' % strain_name)

    return out_bam_path
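# Sketch of the file naming produced above, assuming util.FILE_TAG is '_'
# (an assumption; the real tag lives in util):
#
#     input:  data/strainA.sam
#     sorted: data/strainA_srt.bam
#     clean:  data/strainA_srt_<CLEAN_TAG>.bam
#     final:  data/strainA_srt_<CLEAN_TAG>_<PICARD_TAG>.bam  (+ .bai index
#             and a Picard duplicate-marking metrics file)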
def execute_CAM(self):
    """This function collects variables and starts the CAM process."""

    if self.csv_opt == "Create":
        self.csv_file = self.csv_create.csv_file
        self.contrast = 'Condition'
    elif self.csv_opt == "Upload":
        self.csv_file = self.csv_upload.csv_file
        fileObj = open(self.csv_file, 'r')
        line = fileObj.readline().rstrip('\n').split('\t')
        self.contrast = line[3]
        fileObj.close()
    else:
        show_error_message('Please provide a samples file.')
        self.csv_file = None
        self.contrast = None
        return  # Cannot proceed without a samples file

    self.soft = self.soft_opt.selected
    self.lib = self.lib_opt.selected
    self.seq = self.seq_opt.selected
    self.al = self.al_opt.selected
    self.fa_file = self.fa_file_frame.lbox.text()
    self.tgalore_args = self.tgalore.lbox.text()
    self.fastqc_args = self.fastqc.lbox.text()
    self.al_args = self.aligner.lbox.text()
    self.cpu_args = self.cpu.lbox.text()
    self.flags = []

    # Arguments to run CAM
    args = [self.csv_file, self.fa_file]

    soft_dict = {'MAGeCK': 'mageck', 'Bagel': 'bagel'}
    dict_aux = {'Bowtie': 'bowtie', 'Bowtie2': 'bowtie2'}
    dict_guides = {'Bassik': 'bassik', 'Other': 'other'}
    dict_args = {'al': dict_aux[self.al],
                 'crispr_software': soft_dict[self.soft],
                 'guide_library': dict_guides[self.lib]}

    if len(self.tgalore_args) > 0:
        dict_args['trim_galore'] = '"%s"' % self.tgalore_args
    if len(self.fastqc_args) > 0:
        dict_args['fastqc_args'] = '"%s"' % self.fastqc_args
    if len(self.al_args) > 0:
        dict_args['aligner_args'] = '"%s"' % self.al_args
    if self.seq == 'single-end':
        self.flags.append('-se')
    if len(self.cpu_args) > 0:
        dict_args['cpu'] = self.cpu_args

    for key, item in dict_args.items():
        key = '-' + key
        args.append('='.join([key, str(item)]))

    args += self.flags

    # Run CAM on the LMB cluster as a qsub job
    if self.qsub.isChecked():
        if self.seq == 'paired-end':
            args = args + ['-pe', self.pe_tags]

        command = ' '.join(args)
        command = ('module load python3/3.7.1\nmodule load multiqc\n'
                   'python3 /net/nfs1/public/genomics/CAM/CAM.py %s ' % command)

        temp = 'job_' + util.get_rand_string(5) + ".sh"
        tempObj = open(temp, 'w')
        tempObj.write(command)
        tempObj.close()

        qsubArgs = ['qsub', '-cwd', '-j', 'y', '-V']

        if self.node.isChecked():
            qsubArgs = qsubArgs + ['-l', 'dedicated=24', temp]
        else:
            if len(self.cpu_args) > 0:
                cpu = dict_args['cpu']
            else:
                cpu = '4'
            qsubArgs = qsubArgs + ['-pe', 'smp', cpu, temp]

        util.call(qsubArgs)
        show_pop_up(msg='Job submitted to LMB cluster!')
        os.remove(temp)

    # Run CAM on the local machine
    else:
        if self.seq == 'paired-end':
            args = args + ['-pe'] + self.pe_tags.split(' ')

        CAM = '%s/CAM.py' % os.path.dirname(os.path.realpath(__file__))
        args = ['python3', CAM] + args
        # Pass the argument list directly: combining a list with shell=True
        # would execute only the first element on POSIX systems
        util.call(args)
def run_aligner(trimmed_fq, fastq_dirs, aligner='bowtie2', guide_library='bassik',
                reference_fasta=None, genome_index=None, num_cpu=util.MAX_CORES,
                is_single_end=True, pair_tags=['r_1','r_2'], aligner_args=None,
                convert_to_bam=True):

    # Generate genome indexes if not provided
    if aligner == 'bowtie':
        genome_index_default = os.path.dirname(reference_fasta) + '/bt-genome/'
        index_builder = 'bowtie-build'
    elif aligner == 'bowtie2':
        genome_index_default = os.path.dirname(reference_fasta) + '/bt2-genome/'
        index_builder = 'bowtie2-build'

    if aligner in ['bowtie', 'bowtie2']:
        if genome_index is None:
            genome_index = genome_index_default
            util.warn("Folder where %s indices are located hasn't been specified. "
                      "Program will default to %s..." % (aligner, genome_index))

        base = '.'.join(os.path.basename(reference_fasta).split('.')[:-1])
        base = genome_index + base

        if not os.path.exists(genome_index):
            os.mkdir(genome_index)
            util.info('%s indices not found. Generating indices...' % aligner)
            cmdArgs = [index_builder, reference_fasta, base]
            util.call(cmdArgs)

        genome_index = base

    # Alignment
    util.info('Aligning reads using %s...' % aligner)

    def format_aligner_input(trimmed_fq, aligner, aligner_args, is_single_end, convert_to_bam):
        if aligner == 'bowtie':
            ext = 'bt'
        elif aligner == 'bowtie2':
            ext = 'bt2'

        if is_single_end:  # Paired-end input is not handled here
            sam_log_list = []

            # enumerate() keeps fastq_dirs in step with trimmed_fq; the previous
            # counter was never incremented
            for k, f in enumerate(trimmed_fq):
                fo = fastq_dirs[k] + '/' + os.path.basename(f)
                sam = fo + '.%s.sam' % ext
                log = fo + '.%s.log' % ext
                sam_log_list.append([f, sam, log])

            return sam_log_list

    if aligner == 'bowtie':
        if convert_to_bam:
            sam_args = ['-S', '--no-unal']
        else:
            sam_args = []

        if aligner_args is None:
            # Allow no mismatches and report reads that align only once
            aligner_args = ['-v', '0', '-m', '1', '--strata', '--best']

        if guide_library == 'bassik':
            aligner_args = aligner_args + ['-5', '1']

        sam_log_list = format_aligner_input(trimmed_fq=trimmed_fq, aligner=aligner,
                                            aligner_args=aligner_args,
                                            is_single_end=is_single_end,
                                            convert_to_bam=convert_to_bam)
        file_list = []

        for f, sam, log in sam_log_list:
            if convert_to_bam:
                wd = os.path.dirname(sam)
                sam_header = '.'.join(os.path.basename(sam).split('.')[:-1])
                check_exists = wd + '/' + sam_header + '.bam'
            else:
                check_exists = sam

            file_list.append(sam)

            # pragui.exists_skip() determines whether the next step should go
            # ahead. It skips the step if the file path provided exists, which
            # prevents overwriting files and also saves processing time.
            if pragui.exists_skip(check_exists):
                cmdArgs = [aligner] + aligner_args + ['-p', str(num_cpu), genome_index, f] + sam_args + [sam]
                util.call(cmdArgs, stderr=log)

    if aligner == 'bowtie2':
        if convert_to_bam:
            header_opt = []
        else:
            header_opt = ['--no-hd']

        if aligner_args is None:
            # Allow no mismatches and no pre-alignment before the multiseed heuristic
            aligner_args = ['-N', '0', '--no-1mm-upfront', '--score-min', 'L,0,0',
                            '--no-unal'] + header_opt

        if guide_library == 'bassik':
            aligner_args = aligner_args + ['-5', '1']

        sam_log_list = format_aligner_input(trimmed_fq=trimmed_fq, aligner=aligner,
                                            aligner_args=aligner_args,
                                            is_single_end=is_single_end,
                                            convert_to_bam=convert_to_bam)
        file_list = []

        for f, sam, log in sam_log_list:
            if convert_to_bam:
                wd = os.path.dirname(sam)
                sam_header = '.'.join(os.path.basename(sam).split('.')[:-1])
                check_exists = wd + '/' + sam_header + '.bam'
            else:
                check_exists = sam

            file_list.append(sam)

            if pragui.exists_skip(check_exists):
                cmdArgs = [aligner] + aligner_args + ['-p', str(num_cpu), '-x', genome_index,
                                                      '-U', f, '-S', sam]
                util.call(cmdArgs, stderr=log)

    # Convert sam to bam
    if convert_to_bam is True:
        file_list = []

        for f, sam, log in sam_log_list:
            wd = os.path.dirname(sam)
            sam_header = '.'.join(os.path.basename(sam).split('.')[:-1])
            check_exists = wd + '/' + sam_header + '.bam'

            if pragui.exists_skip(check_exists):
                file = convert_sam_to_bam(sam=sam)
            else:
                file = check_exists

            file_list.append(file)

    return file_list
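# Minimal usage sketch (hypothetical paths), assuming trimmed single-end FASTQs
# and a bowtie2 index to be built next to the library FASTA:
#
#     bams = run_aligner(['trim/s1_trimmed.fq'], ['trim'],
#                        aligner='bowtie2', guide_library='bassik',
#                        reference_fasta='lib/guides.fa')
#     # -> ['trim/s1_trimmed.fq.bt2.bam'] with convert_to_bam=True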
def cross_fil_background(strain_vcf_files, out_vcf_path=None, min_num_obs=3, ref_allele=None):
    # ref_allele: optional list of reference-allele strings, previously referenced
    # without being defined here; when given, genotype columns containing one of
    # them are not counted as variant observations

    for file_path in strain_vcf_files:
        is_ok, msg = util.check_regular_file(file_path)

        if not is_ok:
            util.critical(msg)

    if not out_vcf_path:
        out_vcf_path = 'bg_s%d_m%d.vcf' % (len(strain_vcf_files), min_num_obs)

    file_root, file_ext = os.path.splitext(out_vcf_path)
    comb_vcf_path = '%s_comb_input.vcf' % file_root
    temp_comb_vcf_path = util.get_temp_path(comb_vcf_path)

    util.info('Creating background VCF file for %d input files' % (len(strain_vcf_files)))

    # Combine each strain's diploid variants into a combined VCF
    # vcfcombine: combine multiple VCF files together, handling samples when
    # alternate allele descriptions are identical
    # vcfintersect -u any better?
    cmd_args = [exe.EXE['vcfcombine']]
    cmd_args += strain_vcf_files
    util.call(cmd_args, stdout=temp_comb_vcf_path)

    # Check chromosome sorting
    cmd_args = [exe.EXE['vcfstreamsort'], '-a']
    util.call(cmd_args, stdin=temp_comb_vcf_path, stdout=comb_vcf_path)
    os.unlink(temp_comb_vcf_path)

    # Filter on the number of samples represented in the genotype fields,
    # i.e. for vars that occur at least a given number of times across strains
    out_file_obj = open(out_vcf_path, 'w')
    write = out_file_obj.write
    num_samples = None  # Filled from header info

    with open(comb_vcf_path) as file_obj:
        for line in file_obj:
            if line[0] == '#':
                if line[1:6] == 'CHROM':
                    header = line.split()

                    if len(header) < 10:
                        util.critical('Cannot filter sample genotypes in a VCF file without '
                                      'FORMAT and sample/genotype information')
                    else:
                        sample_names = header[9:]  # Could use in future to track which strains are selected
                        num_samples = len(sample_names)

                write(line)

            else:
                data = line.split()
                genotypes = data[9:]

                if ref_allele is None:
                    num_obs = num_samples - genotypes.count('.')
                else:
                    genotypes2 = []
                    for r in range(len(ref_allele)):
                        genotypes2 = genotypes2 + [ref_allele[r] in x for x in genotypes]
                    num_obs = num_samples - genotypes2.count(True)

                if num_obs >= min_num_obs:
                    write(line)

    out_file_obj.close()
    util.info('Background VCF file output at "%s"' % (out_vcf_path,))
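# Illustration of the genotype-count filter on one VCF data line (hypothetical
# record): columns 10+ hold per-sample genotype fields and '.' marks a missing
# call, so with genotypes = ['1/1', '.', '0/1', '.'] and num_samples = 4,
# num_obs = 4 - 2 = 2, which fails min_num_obs=3 and the record is dropped.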
def subtract_background(strain_vcf_path, background_vcf_path, genome_fasta_path,
                        out_dir, genome_version, interval_length, output_tag):
    bg_path_root, bg_file_ext = os.path.splitext(background_vcf_path)
    bg_file_root = os.path.basename(bg_path_root)

    util.info('Filtering VCF file %s' % strain_vcf_path)

    path_root, file_ext = os.path.splitext(strain_vcf_path)

    if out_dir:
        file_root = os.path.basename(path_root)
        path_root = os.path.join(out_dir, file_root)

    # Combines background name and sample/strain name
    path_root = '%s%s%s' % (path_root, output_tag, bg_file_root)

    out_vcf_path = path_root + '.vcf'
    out_vcf_path = util.get_safe_file_path(out_vcf_path)  # Avoid overwrites

    # Path root should have changed if a substitute name was used
    path_root, file_ext = os.path.splitext(out_vcf_path)

    snpeff_vcf_path = path_root + '_SnpEff.vcf'
    snpeff_summ_path = path_root + '_summary.html'
    snpsift_tab_path = path_root + '_SnpSift.tabular'

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE['gatk'],
        '-T', 'SelectVariants',
        '-R', genome_fasta_path,
        '-V', strain_vcf_path,
        '--discordance', background_vcf_path,
        '-o', out_vcf_path]
    util.call(cmd_args)

    # Run SnpEff on the resulting VCF file
    util.info('Running SnpEff on %s' % out_vcf_path)
    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE['snpeff'],
        '-v', '-upDownStreamLen', str(interval_length),
        '-stats', snpeff_summ_path,
        genome_version, out_vcf_path]
    util.call(cmd_args, stdout=snpeff_vcf_path)

    # Create tabular output from the VCF file using SnpSift
    util.info('Running SnpSift on %s' % snpeff_vcf_path)
    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE['snpsift'],
        'extractFields', snpeff_vcf_path,
        '-s', ',', '-e', '.',
        'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'DP',
        'ANN[*].ERRORS', 'ANN[*].GENEID', 'ANN[*].GENE', 'ANN[*].BIOTYPE',
        'ANN[*].TRID', 'ANN[*].RANK', 'ANN[*].EFFECT',
        'ANN[*].IMPACT',  # Stray ':' removed from the field name
        'ANN[*].HGVS_P', 'ANN[*].HGVS_C',
        'ANN[*].CDS_POS', 'ANN[*].CDS_LEN', 'ANN[*].DISTANCE']
    util.call(cmd_args, stdout=snpsift_tab_path)

    util.info('Results saved to %s and similarly named analysis files' % out_vcf_path)