def removeDuplicates(runCFG, bam_files, threads=1):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    checkexists(os.path.join(outDir, 'rm_dups'))
    outDir = os.path.join(outDir, 'rm_dups')

    #notify starting to remove duplicates
    procTitle('Remove Duplicates', runCFG)
    print('\nSniffles: Removing duplicate reads')

    #get time at start
    start = time.time()

    #generate commands
    cmds = []
    output_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]

        #remove duplicate reads command
        cmd = f'java -Xmx2g -jar /tools/picard.jar MarkDuplicates I=/in_dir/{id}.bam O=/out_dir/{id}.bam REMOVE_DUPLICATES=true M=/out_dir/{id}.removeDupMetrics.txt'
        cmds.append(cmd)

        #add output path to finished list
        output_list.append(os.path.join(outDir, f'{id}.bam'))

    #set up multiprocessing
    pool = mp.Pool(processes=threads)

    #denote start of duplicate-read removal in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Removing Duplicates\n')

        #start multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/reads', {path: "/in_dir", outDir: "/out_dir"}] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished removing duplicates in {runtime} seconds')
    return output_list
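# Usage sketch (hypothetical paths; assumes `cfg` is the parsed run config used
# throughout this module and the bams came from an earlier mapping step):
#
#   deduped = removeDuplicates(cfg, ['out/mapping/sampleA.bam'], threads=4)
#   # -> ['<outdir>/rm_dups/sampleA.bam'], with Picard metrics written alongside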
def mapping(runCFG, param_paths, outDir, threads=1):
    logfile = os.path.join(runCFG['exec']['outdir'], runCFG['exec']['logfile'])
    num_jobs, num_threads = cpu_count(threads)
    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])

        #check output folder exists
        checkexists(outDir)

        if read2 != '':
            #generate command for paired-end reads
            cmd = f"bash -c 'bowtie2 -x /reference/{reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} -p {num_threads} --local | samtools view -bS - | samtools sort -o /output/{id}.bam'"
        else:
            #generate command for interleaved reads
            cmd = f"bash -c 'bowtie2 -x /reference/{reference_sequence_name} --interleaved /reads/{read1} -p {num_threads} --local | samtools view -bS - | samtools sort -o /output/{id}.bam'"
        cmds.append(cmd)

        #data for next stage
        output_bam_list.append(os.path.join(outDir, f'{id}.bam'))

    #set up multiprocessing
    pool = mp.Pool(processes=num_jobs)

    #notify starting mapping
    procTitle('Mapping Reads')
    print('\nSniffles: Started mapping')

    #get start time
    start = time.time()

    #denote start of mapping in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')

        #start multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/reads', {ref_path: "/reference", read_path: "/reads", outDir: "/output"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    #get end time
    end = time.time()

    #get total runtime
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished mapping in {runtime} seconds')
    return output_bam_list
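# Each param_paths entry is an (id, read1, read2, reference) tuple, as built in
# the map-to-consensus block of the main script. A hypothetical call:
#
#   bams = mapping(cfg, [('sampleA', 'trimmed/sampleA_1P.fastq.gz',
#                         'trimmed/sampleA_2P.fastq.gz', 'refs/H3N2.fasta')],
#                  cfg['exec']['outdir'] + '/mapping', threads=4)
#   # -> ['<outdir>/mapping/sampleA.bam']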
def indexing(runCFG, *paths):
    logfile = os.path.join(runCFG['exec']['outdir'], runCFG['exec']['logfile'])
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle('Indexing Reference Genome')
    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)

        #index reference
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name}'
        with open(logfile, 'a') as outlog:
            outlog.write("*************************\n")
            outlog.write("Bowtie2 indexing the reference\n")
            copyfile(reference_sequence_abspath, os.path.join(outDir, reference_sequence_name))
            outlog.write(cd.call(cmd, '/data', {outDir: "/data"}))
            outlog.write("*************************\n")
def indexing(runCFG, *paths):
    logfile = runCFG['exec']['logfile']
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle("Indexing reference sequence", runCFG)
    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)

        #index reference; ' --threads {threads}' could be appended once this function takes a threads argument
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name} --quiet'
        with open(logfile, 'a') as outlog:
            outlog.write("***********\n")
            outlog.write("Bowtie2 indexing the reference\n")
            copyfile(reference_sequence_abspath, os.path.join(outDir, reference_sequence_name))
            outlog.write(cd.call(cmd, '/data', {outDir: "/data"}))
            outlog.write("***********\n")
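# Usage sketch (hypothetical path): the reference is copied into
# <outdir>/ref_sequence and the bowtie2 index files are built next to it, so a
# later mapping() call can mount that directory:
#
#   indexing(cfg, 'refs/H3N2.fasta')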
            cfg['exec']['outdir'] + '/norm_mapping', numThreads)

#generate consensus
if cfg['exec']['generateConsensus']:
    fasta_list = consensus(cfg, bam_list, numThreads)

#map reads to consensus
if cfg['exec']['mapToConsensus']:
    mapping_list = []
    indexing(cfg, *fasta_list)
    for id in readData.runtime['trimmed']:
        for fasta in fasta_list:
            fasta_id = os.path.basename(fasta).split('.')[0]
            if fasta_id == id:
                mapping_list.append((id,
                                     readData.runtime['trimmed'][id][0],
                                     readData.runtime['trimmed'][id][1],
                                     os.path.abspath(fasta)))
    mapping(cfg, mapping_list, cfg['exec']['outdir'] + '/map_to_consensus', numThreads)

#call snps
if cfg['exec']['callSNPs']:
    snpcaller(cfg, bam_list, numThreads)

sc.procTitle('Finished Sniffles')
end = time.time()
runtime = round(end - start, 2)
runtime = str(datetime.timedelta(seconds=runtime))
print(f'Sniffles: Finished with a total runtime of {runtime}.')
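# Based on the keys referenced throughout this script, the parsed `cfg` dict is
# assumed to look roughly like the following (illustrative values only):
#
#   cfg = {
#       'exec': {'outdir': 'sniffles_out', 'logfile': 'sniffles.log',
#                'referenceSequences': ['H3N2.fasta'], 'replicates': False,
#                'generateConsensus': True, 'mapToConsensus': True, 'callSNPs': True},
#       'postprocessing': {'gtfFileNames': ['H3N2.gtf']},
#       'snpcalling': {'minCoverage': 100, 'snpQualityThreshold': 30,
#                      'snpFrequency': 0.01, 'consensusFrequency': 0.5},
#   }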
def snpcaller(runCFG, bam_files, threads=1):
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'snp_calls')
    checkexists(outDir)

    #set reference sequence
    reference_sequence_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])

    #starting time point
    start = time.time()
    procTitle('SNP Calling')
    print('\nSniffles: Started calling SNPs')

    bams = []
    sample_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)

    #generate mpileup
    cmd1 = 'bash -c \'samtools mpileup -ABR -d 1000000 {bams} -f /ref/{reference_sequence_name} > all.mpileup && '.format(
        bams=' '.join(bams),
        reference_sequence_name=reference_sequence_name)

    #call snps
    snp_frequency = runCFG['snpcalling']['snpFrequency']
    min_cov = runCFG['snpcalling']['minCoverage']
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']
    cmd2 = 'java -jar /tools/varscan.jar mpileup2cns all.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{samples}") > all_snps.vcf\''.format(
        min_cov=min_cov,
        snp_qual_threshold=snp_qual_threshold,
        snp_frequency=snp_frequency,
        samples='\n'.join(sample_list))

    #combine commands into a single container invocation
    cmd = cmd1 + cmd2

    #future code block for annotating aa changes
    #if runCFG['exec']['annotateAAChanges']:
    #    pass #TODO add annotater for annotating aa changes

    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        results = cd.call(cmd, '/outfile', {
            reference_sequence_path: "/ref",
            path: "/infile",
            outDir: "/outfile"
        })
        outlog.write('-----------\n')
        outlog.write(results)

        #denote end of logs
        outlog.write('***********\n')

    #get end time
    end = time.time()

    #get total runtime
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished calling snps in {runtime} seconds')
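# For two inputs sampleA.bam and sampleB.bam against ref.fasta, the assembled
# container command would look roughly like this (illustrative, abbreviated):
#
#   bash -c 'samtools mpileup -ABR -d 1000000 /infile/sampleA.bam /infile/sampleB.bam \
#       -f /ref/ref.fasta > all.mpileup && java -jar /tools/varscan.jar mpileup2cns \
#       all.mpileup ... --vcf-sample-list <(echo -e "sampleA\nsampleB") > all_snps.vcf'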
def consensus(runCFG, bam_list, threads=1):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'consensus')
    checkexists(outDir)

    #notify starting consensus generation
    procTitle('Generate Consensus')
    print('\nSniffles: Started generating consensus vcf')

    #get start time
    overall_start = time.time()
    start = time.time()

    #set reference sequence
    reference_sequence_abspath = os.path.abspath(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(reference_sequence_abspath)
    reference_sequence_dir = runCFG['exec']['outdir'] + '/ref_sequence'

    #command list
    cmds = []
    vcf_list = []
    for path in bam_list:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]

        #run varscan mpileup2cns to generate vcf with consensus information
        minCov = runCFG['snpcalling']['minCoverage']
        quality = runCFG['snpcalling']['snpQualityThreshold']
        freq = runCFG['snpcalling']['consensusFrequency']

        #make multiway pileup using samtools
        cmd1 = f'bash -c \'samtools mpileup -ABd 1000000 /infile/{file_name} -f /ref/{reference_sequence_name} -o {id}.pileup && '
        cmd2 = f'java -jar /tools/varscan.jar mpileup2cns {id}.pileup --min-coverage {minCov} --min-avg-qual {quality} --min-var-freq {freq} --strand-filter 1 --output-vcf 1 > {id}.vcf\''
        cmds.append(cmd1 + cmd2)
        vcf_list.append(os.path.join(outDir, f'{id}.vcf'))

    #setup multiprocessing
    pool = mp.Pool(processes=threads)

    #start multiprocessing
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Generating Consensus\n')
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/outfile', {reference_sequence_dir: "/ref", path: "/infile", outDir: "/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    #check if a vcf file is empty; if it is, skip the id and remove the vcf file
    filtered_vcf_list = []
    for path in vcf_list:
        try:
            if os.path.getsize(path) > 0:
                filtered_vcf_list.append(path)
            else:
                os.remove(path)
        except OSError:
            pass

    end = time.time()
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished generating the consensus vcf in {runtime} seconds')

    start = time.time()
    print('\nSniffles: Generating consensus fasta')

    #command list for compressing files
    cmds = []
    out_fasta = []
    for vcf in filtered_vcf_list:
        full_path = os.path.abspath(vcf)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]

        #compress the vcf with bgzip, index it, and build the consensus fasta
        cmd = f'bash -c \'bgzip {id}.vcf && tabix {id}.vcf.gz && bcftools consensus -f /ref/{reference_sequence_name} {id}.vcf.gz -o {id}.fasta\''
        out_fasta.append(os.path.join(outDir, f'{id}.fasta'))
        cmds.append(cmd)

    #start multiprocessing
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Creating consensus Fasta\n')
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/outfile', {reference_sequence_dir: "/ref", outDir: "/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    end = time.time()
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished generating consensus fasta in {runtime} seconds')

    #determine overall runtime
    end = time.time()
    runtime = round(end - overall_start, 2)
    print(f'\nSniffles: Finished generating consensus sequence in {runtime} seconds')
    return out_fasta
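# Usage sketch (hypothetical): bam_list normally comes from mapping(); the
# function returns one consensus fasta per non-empty per-sample vcf:
#
#   fastas = consensus(cfg, ['out/mapping/sampleA.bam'], threads=4)
#   # -> ['<outdir>/consensus/sampleA.fasta']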
sc.checkexists(outDir)
cfg['exec']['outdir'] = os.path.join(outDir, cfg['exec']['outdir'])
try:
    os.mkdir(cfg['exec']['outdir'])
except FileExistsError:
    cfg['exec']['outdir'] = cfg['exec']['outdir'] + '_' + str(int(time.time()))
    os.mkdir(cfg['exec']['outdir'])
outDir = cfg['exec']['outdir']
logfile = os.path.join(outDir, cfg['exec']['logfile'])
cfg['exec']['logfile'] = logfile

startRunMessage = f"Beginning run at {strftime('%a, %d %b %Y %I:%M:%S %p', time.localtime())}"
sc.procTitle(startRunMessage, cfg)
with open(logfile, 'a') as outlog:
    outlog.write(startRunMessage + "\n")

cfg['Errors'] = []
for reference, gtf in zip(list(cfg['exec']['referenceSequences']), cfg['postprocessing']['gtfFileNames']):
    sc.procTitle(f"Processing samples for reference sequence {reference.split('.')[0]}", cfg)

    #assign reference sequence value and gtf value to current reference and gtf
    cfg['exec']['referenceSequence'] = os.path.join(os.getcwd(), reference)
    cfg['postprocessing']['gtfFileName'] = os.path.join(os.getcwd(), gtf)
    try:
        inDir = os.path.abspath(args.i)
def normCoverage(runCFG, bam_files, threads=1):
    #NOTE: normalizing with bbnorm uses all available memory, so it can only be run serially
    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir, 'normalized'))
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'normalized')

    #notify starting to normalize coverage
    procTitle('Normalize Coverage')
    print('\nSniffles: Normalizing read coverage')

    #get time at start
    start = time.time()

    #denote start of coverage normalization in logs
    with open(logfile, 'a') as outlog:
        outlog.write('********************\n')
        outlog.write('Normalizing coverage\n')

        #run normalization
        output_list = []
        for path in bam_files:
            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]

            #get reads from mapped bamfile
            cmd_get_reads = f'bash -c \'samtools fastq /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq -2 /out_dir/{id}_mapped_2.fastq && '

            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            cmd_normalization = f'seqtk sample -s100 /out_dir/{id}_mapped_1.fastq {total_reads} > {id}_1.fastq && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq {total_reads} > {id}_2.fastq\''

            #start docker containers and run
            outlog.write(f'{id}-----------\n')
            stdout = cd.call(cmd_get_reads + cmd_normalization, '/out_dir', {path: "/bam_files", outDir: "/out_dir"})
            outlog.write(stdout)
            outlog.write('-----------\n')
            output_list.append([os.path.join(outDir, f'{id}_1.fastq'), os.path.join(outDir, f'{id}_2.fastq')])

            #cleanup intermediate files
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq')
            except OSError:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq')
            except OSError:
                pass

        outlog.write('********************\n')

    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished normalizing read coverage in {runtime} seconds')
    return output_list
def normCoverage(runCFG, bam_files, threads=1):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir, 'normalized'))
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir, 'normalized')

    #notify starting coverage normalization
    procTitle("Downsampling with seqtk to normalize coverage", runCFG)

    #get time at start
    start = time.time()

    #denote start of coverage normalization in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Downsampling with seqtk to normalize coverage\n')

        #run normalization
        output_list = []
        for path in bam_files:
            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]

            #get reads from mapped bamfile
            cmd_get_reads = f'bash -c \'samtools collate /bam_files/{id}.bam collating && samtools fastq -n /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq.gz -2 /out_dir/{id}_mapped_2.fastq.gz'

            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            cmd_normalization = f' && seqtk sample -s100 /out_dir/{id}_mapped_1.fastq.gz {total_reads} > /out_dir/{id}_1.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq.gz {total_reads} > /out_dir/{id}_2.fastq.gz\''

            if runCFG['exec']['unpaired']:
                cmd_get_reads += f' -0 /out_dir/{id}_mapped_U.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_U.fastq.gz {total_reads} > /out_dir/{id}_U.fastq.gz'
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz'),
                    os.path.join(outDir, f'{id}_U.fastq.gz')
                ])
            else:
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz')
                ])

            #start docker containers and run
            outlog.write(f'{id}\n-----------\n')
            stdout = cd.call(cmd_get_reads + cmd_normalization, '/out_dir', {path: "/bam_files", outDir: "/out_dir"})
            outlog.write(stdout)

            #cleanup intermediate files
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq.gz')
            except OSError:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq.gz')
            except OSError:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_U.fastq.gz')
            except OSError:
                pass

        outlog.write('***********\n')

    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished normalizing read coverage in {runtime}')
    return output_list
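# Usage sketch (hypothetical): downsample each mapped bam to
# cfg['exec']['totalReads'] reads per orientation; returns per-sample fastq paths:
#
#   normalized = normCoverage(cfg, ['out/mapping/sampleA.bam'])
#   # -> [['<outdir>/normalized/sampleA_1.fastq.gz', '<outdir>/normalized/sampleA_2.fastq.gz']]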
def VCFannotator(runCFG, vcffiles):
    # read in reference sequences and gtfs and store information about protein sequences;
    # create a dictionary of gene names and start/stop sites, allowing for more than one start/stop site

    #import file location parameters from config file
    outDir = os.path.join(runCFG['exec']['outdir'], 'vcf_annotations')
    checkexists(outDir)
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    refseqfasta = runCFG['exec']['referenceSequence']
    refseqname = refseqfasta.split(".")[0]
    if runCFG['exec']['mapToConsensus']:
        refseqfasta = os.path.join(runCFG['exec']['outdir'], 'ref_sequence', refseqfasta)

    #get start time
    start1 = time.time()
    procTitle('Annotating SNPs', runCFG)

    #Extract coding sequence coordinates from gtf files:
    #coding_regions is a dictionary of dictionaries (format segment:gene:[[startExon1, stopExon1], [startExon2, stopExon2]])
    coding_regions = {}
    with open(runCFG['postprocessing']['gtfFileName'], "r") as gtf:
        for line in gtf:
            if line.strip("\n") != "":  # ignore blank lines (otherwise throws an index error)
                line = line.replace("/", "_")
                lineitems = line.split("\t")
                segment_name = lineitems[0]
                annotation_type = lineitems[2]
                start = int(lineitems[3]) - 1  # adding the -1 here for 0 indexing
                stop = int(lineitems[4]) - 1  # adding the -1 here for 0 indexing
                gene_name = lineitems[8]
                gene_name = gene_name.split(";")[0]
                gene_name = gene_name.replace("gene_id ", "")
                gene_name = gene_name.replace("\"", "")

                if annotation_type.lower() == "cds":
                    if segment_name not in coding_regions:
                        coding_regions[segment_name] = {}
                        coding_regions[segment_name][gene_name] = [[start, stop]]
                    elif segment_name in coding_regions and gene_name not in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name] = [[start, stop]]
                    elif gene_name in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name].append([start, stop])

    # pull in reference fasta file, separate gene segments into a dictionary
    ref_segments = {}
    for seq in SeqIO.parse(refseqfasta, "fasta"):
        refseqname = str(seq.id).replace("/", "_")
        sequence = str(seq.seq).lower()
        ref_segments[refseqname] = sequence

    # use gene coordinates to create coding sequences from reference sequences
    #Reminder of current data structures:
    #coding_regions[segment][gene]: coordinates of genes
    #ref_segments[nameofsegment]: sequence
    transcripts = {}
    for segment in coding_regions:
        for gene in coding_regions[segment]:
            transcripts[gene] = ""
            coordinates = coding_regions[segment][gene]  # define the coding regions for each gene
            for start, stop in coordinates:  # loop through start/stop sites in coding regions
                sequence_chunk = ref_segments[segment][start:stop + 1]
                transcripts[gene] = transcripts[gene] + sequence_chunk  # append each piece of the transcript together

    # loop through each transcript to make sure that it begins with a start codon and ends with a stop codon
    #for t in transcripts:
    #    if transcripts[t][0:3] != start_codon:
    #        print("WARNING! " + refseqname + " " + t + " does not contain a start codon! The first three nucleotides are " + transcripts[t][0:3])
    #    if transcripts[t][-3:] not in stop_codons:
    #        print("WARNING! " + refseqname + " " + t + " does not contain a stop codon! These are the last 3 nucleotides: " + transcripts[t][-3:])

    print(vcffiles)
    if os.path.isdir(vcffiles):
        vcffiles = glob.glob(vcffiles + "/*.vcf")
    elif type(vcffiles) == list:
        if vcffiles[0].split(".")[-1] == "vcf":
            pass
        else:
            print("vcffiles has no vcf files!")

    listofmutstoExport = []

    ##Loop through each vcf file and annotate amino acid changes
    print(vcffiles)
    for i, vcfname in tqdm(enumerate(vcffiles)):
        with open(vcfname, "r") as TextVCF:
            for index, line in enumerate(TextVCF, 0):
                if "#CHROM" in line:
                    rowstoskip = index

        #Read the vcf file into a pandas DataFrame
        print(vcfname)
        try:
            vcfDF = pd.read_csv(vcfname, sep='\t', skiprows=rowstoskip)
        except OSError:
            print("\n" + vcfname + " did not open appropriately. Please check file.\n")

        #fix the / in chrom bug
        vcfDF["#CHROM"] = vcfDF["#CHROM"].str.replace("/", "_")

        #extract frequencies for the list of muts to export
        #To make this easier, assume each VCF has only one sample. This code DOES NOT WORK for more than one sample per VCF.
        freqlocation = vcfDF.loc[0, "FORMAT"].split(":").index("FREQ")
        try:
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(":").str[freqlocation].astype('float')
        except ValueError:
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(":").str[freqlocation].str.rstrip('%').astype('float') / 100
        except:
            raise

        listofmuts = []

        #loop through each line in vcfDF; extract chrom, pos, reference nucleotide, alternate nucleotide
        for chrom, pos, ref, alt, freq in zip(vcfDF['#CHROM'], vcfDF["POS"], vcfDF["REF"].str.lower(), vcfDF["ALT"].str.lower(), vcfDF["FREQ"]):
            pos -= 1  #subtract one from position to convert from VCF's 1 indexing to python's 0
            #loop through each gene potentially applicable to that position (i.e., all on this chromosome)
            for gene in coding_regions[chrom].keys():
                priorExonLength = 0
                for start, stop in coding_regions[chrom][gene]:  #loop through each exon of the gene
                    #if pos is in this exon, calculate codon, reference aa, and variant aa
                    if pos in range(start, stop):
                        #within_gene_position is the position in this exon (pos - startOfExon), plus the length of any prior exons
                        within_gene_position = pos - start + priorExonLength
                        codon_pos = within_gene_position % 3
                        alternatetranscript = transcripts[gene][:within_gene_position] + alt + transcripts[gene][within_gene_position + 1:]
                        codon = transcripts[gene][(within_gene_position - codon_pos):(within_gene_position + (3 - codon_pos))]
                        variantcodon = alternatetranscript[(within_gene_position - codon_pos):(within_gene_position + (3 - codon_pos))]
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)

                        #Catch errors in annotation calculations where the math results in an incorrect codon
                        if codon[codon_pos] != ref:
                            print("Something's quite wrong here. The reference SNP is not what it should be.")
                            print(f"\n\nchrom: {chrom}, gene: {gene}")
                            print(f"\npos: {pos} within_gene_position: {within_gene_position}\ncodon_pos: {codon_pos} codon: {codon} variantcodon: {variantcodon}\n\n")
                            print(f"ref: {ref} alt: {alt} ref_aa: {ref_aa} \nvariant_aa: {variant_aa}\n aa_num: {aa_num}\n")
                            print(transcripts[gene])

                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)

                        if ref_aa != variant_aa:
                            listofmuts.append([chrom, gene, pos + 1, str(ref_aa + aa_num + variant_aa)])
                            if freq > 0.01 and freq < 0.99:
                                listofmutstoExport.append({
                                    "segment": chrom,
                                    'gene': gene,
                                    'position': pos + 1,
                                    'frequency': float(freq),
                                    'AAchange': str(ref_aa + aa_num + variant_aa)
                                })
                        elif ref_aa == variant_aa:
                            listofmuts.append([chrom, gene, pos + 1, "."])
                        break  #if pos is in this exon, stop looping through exons
                    else:
                        #The next exon begins after the length of this exon, i.e., after the stop point minus the start point
                        priorExonLength += (stop + 1 - start)
                #else clause only executes if the for loop finishes without breaking (i.e., if pos is never within the gene being examined)
                else:
                    listofmuts.append([chrom, gene, pos + 1, "not in ORF"])
                    continue  #continue on to the next gene

        AAchange = pd.DataFrame(listofmuts, columns=['#CHROM', 'gene', 'POS', 'AAchange'])
        vcfDF = vcfDF.merge(AAchange, how='left', on=['#CHROM', 'POS'])
        vcfDF['gene'] = vcfDF['gene'].astype(str)
        vcfDF['gene'] = vcfDF['gene'].replace("NA", "NA gene")
        annotatedVCFname = os.path.basename(vcfname).split(".")[0] + ".annotated_vcf"
        outputfile = vcfDF.to_csv(os.path.join(outDir, annotatedVCFname), sep="\t", index=None, header=True)
        vcffiles[i] = annotatedVCFname

    #importantMuts = pd.DataFrame(listofmutstoExport)
    #print(importantMuts)
    #importantMuts = importantMuts.groupby(['segment', 'gene', 'position', 'AAchange'], as_index=False).mean()
    #importantMuts = importantMuts.loc[importantMuts['freq']>0.02 & importantMuts['freq']<0.98]
    #importantMuts['freq'] = importantMuts['freq']/len(vcffiles)
    #importantMutsexport = importantMuts.to_csv(os.path.join(outDir, "allMutationsPresent.tsv"), sep='\t', index=None, header=True)

    #get end time
    end = time.time()

    #get total runtime
    runtimeSeconds = end - start1
    runtime = datetime.timedelta(seconds=runtimeSeconds)
    print(f'\nSniffles: Finished annotating snps in {str(runtime)}')
    return vcffiles
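# Worked example of the codon arithmetic above (illustrative numbers): for a
# variant at 0-based pos 10 in an exon starting at 4 with no prior exons,
#   within_gene_position = 10 - 4 + 0 = 6
#   codon_pos = 6 % 3 = 0            -> first base of its codon
#   codon = transcripts[gene][6:9]   -> the third codon of the transcript
#   aa_num = 6 // 3 + 1 = 3          -> reported as amino acid position 3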
def RePlow(runCFG, bam_files, threads=1):
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    bamfilespath = os.path.dirname(bam_files[0])  #os.path.join(outDir, "norm_mapping")
    outDir = os.path.join(outDir, 'snp_calls')
    reference_sequence_path = os.path.join(runCFG['exec']['outdir'], 'ref_sequence')
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
    # if os.path.isdir(reference_sequence_path) and os.listdir("ref_sequence") != os.listdir(reference_sequence_path):
    #     rmtree(reference_sequence_path)
    #     copytree("ref_sequence", reference_sequence_path)
    # if not os.path.isdir(reference_sequence_path):
    #     copytree("ref_sequence", reference_sequence_path)

    #starting time point
    start = time.time()
    procTitle('Analyzing SNPs with RePlow', runCFG)

    bams = []
    sample_list = []
    repDict = {}

    #create list of bam files to run
    print("bam_files:")
    print(bam_files)
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)
        print(file_name)

        #if processing replicate runs, create dictionary samplename:[list of replicate bam files for that sample]
        if runCFG['exec']['replicates']:
            repBreakdown = runCFG['exec']['replicateNotation'].split("_")
            repBreakdown = "_".join(repBreakdown[:-1])
            repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
            repBreakdown = repBreakdown.split("Sample")
            repKey = file_name[file_name.find(repBreakdown[0]) + len(repBreakdown[0]):file_name.find(repBreakdown[1])]
            if repKey not in repDict.keys():
                repDict[repKey] = [file_name]
            else:
                repDict[repKey].append(file_name)
    print(repDict)

    #import SNP quality parameters from config
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']
    consensus_frequency = runCFG['snpcalling']['consensusFrequency']
    mut_rate = runCFG['replow_settings']['mutrate']
    map_qual_threshold = runCFG['replow_settings']['mapquality']

    #make a default .bed file from the reference that instructs RePlow to call SNPs on the whole genome
    chrom = []
    chromEnd = []
    with open(os.path.join(reference_sequence_path, reference_sequence_name), 'r') as refseq:
        for line in refseq.readlines():
            if line[0] == ">":
                chrom.append(line[1:].rstrip())
            elif len(chrom) != len(chromEnd):
                chromEnd.append(len(line) - 2)  #-1 to adjust for zero indexing in a bed file
            else:
                chromEnd[-1] += len(line) - 1  #extend the running length for multi-line sequences

    bedfile = pd.DataFrame()
    bedfile["chrom"] = chrom
    bedfile["chromStart"] = 0
    bedfile["chromEnd"] = pd.Series(chromEnd)
    print(reference_sequence_name)
    bedfilename = "".join(reference_sequence_name.split(".")[:-1]) + ".bed"
    bedcsv = bedfile.to_csv(os.path.join(reference_sequence_path, bedfilename), index=False, header=False, sep="\t")

    #Prep RePlow cmds
    cmds = []
    keycmds = []
    indexcmds = []

    #Add command to index the reference sequence for RePlow
    indexcmds.append(f'samtools index /ref/{reference_sequence_name}')

    #add commands to list for multiprocessing
    ###TO BE IMPROVED: Currently commands are only generated when running RePlow on replicate sequencing runs.
    ### While that is the typical case, this should eventually be expanded to cover single runs as well.

    #run through each sample name in the replicate dictionary
    # print("repDict:")
    print(repDict)
    for key in repDict.keys():
        #generate command to index each replicate bam file
        for i, bam in enumerate(repDict[key]):
            repDict[key][i] = "/data/" + bam
            indexcmds.append(f'samtools index {repDict[key][i]}')

        #create comma-delimited list of replicate bam files; generate a unique RePlow command for each sample
        bamslist = ','.join(repDict[key])
        #replowcmd = f'java -cp dependency/*:classes tgil.replow.RePlow -r /ref/{reference_sequence_name} -b {bamslist} -T /ref/{bedfilename} -R /usr/bin/Rscript -f {consensus_frequency} -q {snp_qual_threshold} -Q {map_qual_threshold} -m {mut_rate} -o /output -L {key}'
        outlogHeader = f"\"{key}\n-----------\n\""
        replowcmd = f'bash -c \'printf {outlogHeader} >> {os.path.join("/logfile", os.path.basename(logfile))} && java -jar /source/RePlow-1.1.0.jar -r /ref/{reference_sequence_name} -b {bamslist} -T /ref/{bedfilename} -R /usr/bin/Rscript -f {consensus_frequency} -q {snp_qual_threshold} -Q {map_qual_threshold} -m {mut_rate} -o /output -L {key}\''
        keycmds.append(replowcmd)

    print("Indexcmds:")
    print(indexcmds)
    print("Keycmds:")
    print(keycmds)

    #generate multiprocessing pool
    pool = mp.Pool(processes=threads)

    #index files first
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('RePlow\n')
        outlog.write('***********\n')

        #run commands in docker containers with multiprocessing
        indexresults = pool.starmap_async(
            cd.call,
            [[cmd, '/source', {os.path.join(os.getcwd(), "replow"): "/source", bamfilespath: "/data", outDir: "/output", reference_sequence_path: "/ref", os.path.dirname(logfile): "/logfile"}] for cmd in indexcmds])
        pool.close()
        pool.join()

        pool = mp.Pool(processes=threads)
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/source', {os.path.join(os.getcwd(), "replow"): "/source", bamfilespath: "/data", outDir: "/output", reference_sequence_path: "/ref", os.path.dirname(logfile): "/logfile"}] for cmd in keycmds])
        pool.close()
        pool.join()

        stdouts = indexresults.get() + results.get()
        for stdout in stdouts:
            outlog.write(stdout)

        #Convert .call output into .vcf
        for file in glob.glob(outDir + "/*.call"):
            samplename = calltoVCF(file, outDir)
            outlog.write(f"Call file {samplename} converted to VCF format")

        #denote end of logs
        outlog.write('***********\n')

    #get end time
    end = time.time()

    #get total runtime
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished calling snps in {runtime}')
    return os.path.join(outDir, samplename + ".vcf")
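# Illustration of the replicate-key parsing above (hypothetical notation): with
# replicateNotation = r'Sample-\d_...', the code reduces the pattern to the
# literal pieces around 'Sample' ('' and '-'), so a file named 'flu01-2.bam'
# yields repKey = 'flu01', and 'flu01-1.bam' and 'flu01-2.bam' are both grouped
# under repDict['flu01'].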
def snpcaller(runCFG, bam_files, threads=1):
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir, 'snp_calls')
    checkexists(outDir)

    #set reference sequence
    reference_sequence_path = os.path.dirname(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])

    #starting time point
    start = time.time()
    if runCFG['exec']['replicates']:
        message = 'Calling replicate SNPs with Varscan'
    else:
        message = 'Calling SNPs with Varscan'
    procTitle(message, runCFG)

    bams = []
    sample_list = []
    listofVCFs = []
    repDict = {}

    #Create list of bam files to call SNPs on
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)

        #if processing replicate runs, create dictionary samplename:[list of replicate vcf files for that sample];
        #this dictionary will be used later to merge and average replicate vcfs
        if runCFG['exec']['replicates']:
            repBreakdown = runCFG['exec']['replicateNotation'].split("_")
            repBreakdown = "_".join(repBreakdown[:-1])
            repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
            repBreakdown = repBreakdown.split("Sample")
            repKey = file_name[file_name.find(repBreakdown[0]) + len(repBreakdown[0]):file_name.find(repBreakdown[1])]
            vcf_name = id + ".vcf"
            listofVCFs.append(vcf_name)
            if repKey not in repDict.keys():
                repDict[repKey] = [vcf_name]
            else:
                repDict[repKey].append(vcf_name)

    #import SNP calling quality parameters from config file
    snp_frequency = runCFG['snpcalling']['snpFrequency']
    min_cov = runCFG['snpcalling']['minCoverage']
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']

    #generate commands to call variants
    cmds = []
    for bam, sample in zip(bams, sample_list):
        #mpileup command
        outlogHeader = f"{bam.split('/')[-1].split('.')[0]}\n-----------\n"
        cmd1 = f'printf \"{outlogHeader}\" >> {os.path.join("/logfile", os.path.basename(logfile))} && samtools mpileup -ABR -d 1000000 {bam} -f /ref/{reference_sequence_name} > {sample}.mpileup'

        #varscan command
        cmd2 = f'java -jar /tools/varscan.jar mpileup2snp {sample}.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{sample}") > {sample}_temp.vcf'

        #compress and normalize vcf
        cmd3 = f'bcftools norm -c sw -m - -f /ref/{reference_sequence_name} -o {sample}.vcf {sample}_temp.vcf && rm {sample}_temp.vcf'

        if not runCFG['exec']['replicates']:
            listofVCFs.append(os.path.join(outDir, f"{sample}.vcf"))

        #add commands to list for multiprocessing
        cmds.append("bash -c \'" + cmd1 + " && " + cmd2 + " && " + cmd3 + "\'")

    #initialize multiprocessing pool
    pool = mp.Pool(processes=threads)

    #open logfile
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')

        #run commands with multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/outfile', {reference_sequence_path: "/ref", path: "/infile", outDir: "/outfile", os.path.dirname(logfile): "/logfile"}] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        print('finished all results')
        for stdout in stdouts:
            outlog.write(stdout)

        #if processing replicate runs, merge and average SNP calls
        if runCFG['exec']['replicates']:
            listofVCFs = VCFaverager(runCFG, repDict, listofVCFs)
            outlog.write(str(listofVCFs))
            outlog.write('-----------\n')

        #Combine sample vcfs into one master VCF:
        #allSNPs = VCFcombiner(runCFG, listofVCFs, "allVarscanSNVs.vcf")
        #outlog.write(f"\nCombined all vcf files into master vcf file allVarscanSNVs.vcf\n")
        outlog.write('-----------\n')

        #denote end of logs
        outlog.write('***********\n')

    #get end time
    end = time.time()

    #get total runtime
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished calling snps in {runtime}')
    return listofVCFs
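# Usage sketch (hypothetical): bam_files comes from mapping()/normCoverage();
# returns per-sample vcf paths, or merged/averaged vcfs when replicates are on:
#
#   vcfs = snpcaller(cfg, ['out/norm_mapping/sampleA.bam'], threads=4)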
def trim(readData, runCFG, threads, ids=''):
    #parameters
    minlength = runCFG['trimmomatic']['minlength']
    windowsize = runCFG['trimmomatic']['windowSize']
    qscore = runCFG['trimmomatic']['qscore']
    adapterpath = "/tools/adapters/" + runCFG['trimmomatic']['adaptersFileName']
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])

    #set up list of ids to trim
    if not ids:
        ids = readData.ids

    #generate commands for each trim job
    cmds = []
    for id in ids:
        #main command
        main_cmd = 'java -jar /tools/trimmomatic.jar '

        #get read path
        if readData.reads[id]:
            read_path = os.path.dirname(os.path.abspath(readData.reads[id].fwd))
            read1_basename = os.path.basename(readData.reads[id].fwd)
            read2_basename = os.path.basename(readData.reads[id].rev)

            #determine args
            if runCFG['trimmomatic']['removeAdapters']:
                if runCFG['trimmomatic']['paired']:
                    args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                    readData.add_runtime('trimmed', id,
                                         f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                                         f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                else:
                    args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                    readData.add_runtime('trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
            else:
                if runCFG['trimmomatic']['paired']:
                    args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                    readData.add_runtime('trimmed', id,
                                         f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                                         f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                else:
                    args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                    readData.add_runtime('trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')

            #prepare command and add to list
            sample_cmd = main_cmd + args
            cmds.append(sample_cmd)

    #make out dir if it doesn't already exist
    try:
        os.mkdir(os.path.join(outDir, 'trimmed'))
    except FileExistsError:
        pass

    #set up multiprocessing
    pool = mp.Pool(processes=threads)

    #notify starting trimming
    procTitle('Quality Trimming')
    print('\nSniffles: Started quality trimming')

    #start timer
    start = time.time()

    #denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')

        #begin multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/data', {read_path: "/data", os.path.join(outDir, 'trimmed'): "/output"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    #get time
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished trimming in {runtime} seconds')
def trim(readData, runCFG, threads=1, ids=''):
    #parameters
    minlength = runCFG['trimmomatic']['minlength']
    windowsize = runCFG['trimmomatic']['windowSize']
    qscore = runCFG['trimmomatic']['qscore']
    adapterpath = "/Trimmomatic-0.36/adapters/" + runCFG['trimmomatic']['adaptersFileName']
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']

    #set up list of ids to trim
    if not ids:
        ids = readData.ids

    #generate commands for each trim job
    cmds = []
    #rmspacecmds = []
    for id in ids:
        #get read path
        if readData.reads[id]:
            read_path = os.path.dirname(os.path.abspath(readData.reads[id].fwd))
            read1_basename = os.path.basename(readData.reads[id].fwd)
            read2_basename = os.path.basename(readData.reads[id].rev)

            #main command
            outlogHeader = f"\"{id}\n-----------\n\""
            containerLogpath = os.path.join("/logfile", os.path.basename(logfile))
            main_cmd = f'bash -c \'printf {outlogHeader} >> {containerLogpath} && java -jar /tools/trimmomatic.jar '
            #regexExpression = 's/(^@.*) (.*)/\\1_\\2/g'

            #determine args
            if runCFG['trimmomatic']['removeAdapters']:
                if runCFG['trimmomatic']['paired']:
                    args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                    readData.add_runtime('trimmed', id,
                                         f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                                         f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                    #rmspaces = f"bash -c 'sed -re \"{regexExpression}\" -i /output/{id}_trimmed_1P.fastq.gz && sed -re \"{regexExpression}\" -i /output/{id}_trimmed_2P.fastq'"
                else:
                    args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                    readData.add_runtime('trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
                    #rmspaces = f"bash -c 'sed -re \"{regexExpression}\" -i /output/{id}_trimmed.fastq'"
            else:
                if runCFG['trimmomatic']['paired']:
                    args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                    readData.add_runtime('trimmed', id,
                                         f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                                         f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                    #rmspaces = f"bash -c 'sed -re \"{regexExpression}\" -i /output/{id}_trimmed_1P.fastq && sed -re \"{regexExpression}\" -i /output/{id}_trimmed_2P.fastq'"
                else:
                    args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                    readData.add_runtime('trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
                    #rmspaces = f"bash -c 'sed -re \"{regexExpression}\" -i /output/{id}_trimmed.fastq'"

            #prepare command and add to list
            sample_cmd = main_cmd + args
            #rmspacecmds.append(rmspaces)
            cmds.append(sample_cmd)

    for cmd in cmds:
        print(cmd)

    #make out dir if it doesn't already exist
    try:
        os.mkdir(os.path.join(outDir, 'trimmed'))
    except FileExistsError:
        pass

    #set up multiprocessing
    pool = mp.Pool(processes=threads)
    # pool2 = mp.Pool(processes=threads)

    #notify starting trimming
    procTitle("Started quality trimming", runCFG)

    #start timer
    start = time.time()

    #denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')
        outlog.write('***********\n')

        #begin multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/data', {read_path: "/data", os.path.join(outDir, 'trimmed'): "/output", os.path.dirname(logfile): "/logfile"}] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write(stdout)

        # results = pool2.starmap_async(cd.call, [[cmd, '/data', {read_path: "/data", os.path.join(outDir, 'trimmed'): "/output", os.path.dirname(logfile): "/logfile"}] for cmd in rmspacecmds])
        # pool2.close()
        # pool2.join()
        # stdouts = results.get()
        # for stdout in stdouts:
        #     outlog.write(stdout)

        #denote end of logs
        outlog.write('***********\n')

    #get time
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'Sniffles: Finished trimming in {runtime}\n')
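# Usage sketch (hypothetical): `readData` is the module's read-tracking object;
# trimmed outputs are registered under readData.runtime['trimmed'] for the
# mapping stage:
#
#   trim(readData, cfg, threads=4)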