def create_index(reference_fasta_file, bowtie_path, samtools_path, tmpdir, logger):
    'Build a bowtie2 index and an fai index from the given reference FASTA'
    fasta_file = os.path.join(tmpdir, 'reference.fasta')
    shutil.copyfile(reference_fasta_file, fasta_file)
    bt2_index = fasta_file + '.1.bt2'
    fai_index = fasta_file + '.fai'

    log_writer.info_header(logger, 'Building bowtie2 index for {}...'.format(fasta_file))
    if os.path.exists(bt2_index):
        log_writer.write_log(logger, 'Bowtie2 index for {} is already built...'.format(fasta_file), 'info')
    else:
        bowtie2_index = bowtie_path + '-build'
        subprocess.call([bowtie2_index, '-f', fasta_file, fasta_file])

    log_writer.info_header(logger, 'Building samtools index for {}...'.format(fasta_file))
    if os.path.exists(fai_index):
        log_writer.write_log(logger, 'Samtools index for {} is already built...'.format(fasta_file), 'info')
    else:
        subprocess.call([samtools_path, 'faidx', fasta_file])
    return fasta_file
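# A minimal usage sketch for create_index (the paths below are hypothetical,
# not taken from this module). After the call, tmpdir holds the copied FASTA
# next to its bowtie2 index files (reference.fasta.1.bt2 ... reference.fasta.rev.2.bt2)
# and the samtools fai index (reference.fasta.fai).
#
#     import logging
#     logger = logging.getLogger('stdout_stderr_logger')
#     fasta = create_index('/data/refs/genome.fasta',   # hypothetical reference
#                          '/usr/local/bin/bowtie2',    # hypothetical bowtie2 path
#                          '/usr/local/bin/samtools',   # hypothetical samtools path
#                          '/tmp/run1/', logger)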
def clean_up(tmp, logger):
    'Remove temporary files'
    log_writer.info_header(logger, 'Removing temporary files: ' + tmp)
    try:
        shutil.rmtree(tmp)
    except OSError:
        # log the failure before re-raising so it is visible in the run log
        log_writer.error_header(logger, 'Failed to remove ' + tmp)
        raise
def get_from_config(CONFIG_FILE, get_items=None):
    '''
    Returns data for the calling function from CONFIG_FILE.

    Args
        CONFIG_FILE, string : environment variable, loaded using module, pointing to the config file
        get_items, list     : optional, list of items to return from the config file

    Return
        data-structure        : default return is the data structure represented by the
                                config file for the calling function
        items, unpacked tuple : if get_items is specified, the variables specified by the
                                config file for the calling function

    The config_file.yml must represent a dictionary whose primary keys are
    function names:

        function_using_FOO:
            FOO: value_of_foo

    'function_using_FOO' is retrieved from the call stack and used to select
    which part of the config_file.yml to return.

    N.B. calling code requesting a single item needs a trailing comma to unpack
    the one-element return:

        return_item, = get_from_config( config_file, get_items=['return_item'] )

    MGGoulden 20130906 amended 20130919
    '''
    import os
    import yaml
    import logging
    import inspect
    # from common_modules import log_writer  # setup_logger, write_log, error_header, info_header, log_process

    # get a pointer to the logger
    logger = logging.getLogger('stdout_stderr_logger')
    log_txt = 'IN get_from_config( CONFIG_FILE = ' + CONFIG_FILE + ', get_items = ' + str(get_items) + ')'
    log_writer.info_header(logger, log_txt)

    # sanity check: log and fail fast rather than dying later on a KeyError
    if CONFIG_FILE not in os.environ:
        log_txt = 'The environment variable (' + CONFIG_FILE + ') is not available; module load may be required; quitting ...'
        log_writer.error_header(logger, log_txt)
        raise EnvironmentError(log_txt)
    elif not os.path.exists(os.environ[CONFIG_FILE]):
        log_txt = ('The environment variable (' + CONFIG_FILE + ') points to a file (' +
                   os.environ[CONFIG_FILE] + ') which does not exist; module amendment required; quitting ...')
        log_writer.error_header(logger, log_txt)
        raise EnvironmentError(log_txt)

    # who calls? - identify the calling function from the stack
    caller = inspect.stack()[1][3]
    if caller == 'try_and_except':
        caller = inspect.stack()[2][3]

    read_me = os.environ[CONFIG_FILE]
    with open(read_me, 'r') as f:
        CONFIG = yaml.safe_load(f)  # safe_load: the config is plain data, no arbitrary object construction
    if not get_items:
        return CONFIG[caller]
    else:
        return [CONFIG[caller][x] for x in get_items]
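# A minimal sketch of the config layout get_from_config expects. Everything
# below (the environment variable name, YAML keys and values) is an
# illustrative assumption: the primary YAML key must equal the name of the
# calling function, so the example function is named to match.
#
# YAML file pointed to by $MY_CONFIG:
#     my_pipeline_step:
#         bowtie_path: /usr/local/bin/bowtie2
#         samtools_path: /usr/local/bin/samtools
def my_pipeline_step():
    # two items come back as a list and unpack normally
    bowtie_path, samtools_path = get_from_config('MY_CONFIG', get_items=['bowtie_path', 'samtools_path'])
    # a single item still comes back as a one-element list: note the trailing comma
    bowtie_path, = get_from_config('MY_CONFIG', get_items=['bowtie_path'])
    return bowtie_path, samtools_path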
def pileup(sorted_bamfile, reference, samtools, outdir, logger):
    'Create an mpileup file'
    log_writer.info_header(logger, 'Create mpileup file')
    filename = os.path.join(outdir, os.path.basename(sorted_bamfile).split('.')[0] + '.mpileup')
    # -B disables BAQ computation, -A counts anomalous read pairs and
    # -f FILE supplies the indexed reference sequence file
    process = subprocess.Popen([samtools, 'mpileup', '-B', '-A', '-f', reference, sorted_bamfile],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    with open(filename, 'w') as pileupFile:
        for l in process.stdout:
            pileupFile.write(l)
    process.wait()
    log_writer.log_process(logger, process, log_error_to='info')
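# A minimal sketch (not part of the pipeline) of how a downstream step might
# read the file pileup() writes, assuming standard six-column samtools mpileup
# output: chromosome, 1-based position, reference base, read depth, read
# bases and base qualities.
def read_mpileup(path):
    with open(path) as handle:
        for line in handle:
            chrom, pos, ref_base, depth, bases, quals = line.rstrip('\n').split('\t')[:6]
            yield chrom, int(pos), ref_base, int(depth), bases, quals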
def modify_bowtie_sam(samfile, logger):
    'Modify SAM formatted output from Bowtie to maintain secondary alignments for downstream pileup'
    with open(samfile) as sam, open(samfile + '.mod', 'w') as sam_mod:
        log_writer.info_header(logger, 'Modifying SAM formatted output from Bowtie to maintain secondary alignments for downstream pileup...')
        for line in sam:
            if not line.startswith('@'):
                fields = line.split('\t')
                flag = int(fields[1])
                # clear the 'not primary alignment' bit (256/0x100) so the pileup
                # counts these reads; a bitwise mask replaces the original
                # "flag - 256 if flag > 256", which corrupted flags such as 512
                # that have higher bits set but not 0x100
                flag &= ~0x100
                sam_mod.write('\t'.join([fields[0], str(flag)] + fields[2:]))
            else:
                sam_mod.write(line)
    return samfile + '.mod'
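# A worked example (illustrative helper, not called by the pipeline) of why
# clearing bit 256 works: 0x100 is the SAM 'not primary alignment' flag.
def _clear_secondary_bit_example():
    assert (272 & ~0x100) == 16    # secondary + reverse strand (272) -> plain reverse strand (16)
    assert (16 & ~0x100) == 16     # primary alignments pass through unchanged
    assert (512 & ~0x100) == 512   # QC-fail (512) untouched; 512 - 256 would have mangled it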
def mapping(fastq_file, reference, bowtie_path, samtools_path, outdir, logger):
    sample = os.path.basename(fastq_file).split('.')[0]
    # create index if not available
    tmp = outdir + "/{0}_tmp/".format(sample)
    if not os.path.exists(tmp):
        os.mkdir(tmp)
    reference_fasta_file = create_index(reference, bowtie_path, samtools_path, tmp, logger)
    fastq1 = fastq_file
    fastq2 = fastq_file.replace('1.fastq', '2.fastq')  # derive the mate file from the forward read name
    samfile = tmp + sample + '.sam'
    bamfile = tmp + sample + '.bam'
    sorted_bam_prefix = outdir + '/' + sample + '.sorted'
    if not os.path.isfile(sorted_bam_prefix + '.bam'):  # change this to bamfiles/prefix.sorted.bam
        # create sam file; -k 99999 reports up to 99999 alignments per read and
        # -D 20 -R 3 -N 0 -L 20 -i S,1,0.50 are bowtie2's --very-sensitive settings
        log_writer.info_header(logger, "Running bowtie to generate sam file")
        subprocess.call([
            bowtie_path, '--fr', '--minins', '300', '--maxins', '1100',
            '-x', reference_fasta_file, '-1', fastq1, '-2', fastq2, '-S', samfile,
            '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'
        ])  # write to tmp
        # clear the secondary-alignment bit to allow reads to map in more than one location
        sam_mod = modify_bowtie_sam(samfile, logger)
        # convert to bam
        log_writer.info_header(logger, "Running samtools to convert sam to bam file")
        subprocess.call([samtools_path, 'view', '-bS', '-o', bamfile, sam_mod])
        # sort bam file (old samtools syntax: sort <in.bam> <out.prefix>)
        log_writer.info_header(logger, "Sort the bam file")
        subprocess.call([samtools_path, 'sort', bamfile, sorted_bam_prefix])
        # index bam file
        log_writer.info_header(logger, "Index the BAM file")
        subprocess.call([samtools_path, 'index', sorted_bam_prefix + '.bam'])
        # get mapping stats and record them in the log
        output = subprocess.check_output([samtools_path, "flagstat", sorted_bam_prefix + '.bam'])
        log_writer.write_log(logger, output, 'info')
    ## clean up all unnecessary files
    #clean_up(tmp, logger)
    return sorted_bam_prefix + '.bam', reference_fasta_file
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
    Function
    Generate a pileup file using the SAMtools mpileup command.
    NB: -B disables BAQ computation, -A counts anomalous read pairs and
    -f supplies the indexed reference sequence.

    The options for the method:
        tmp_dir[str]: the path to where the pileup file will be created
        sorted_bam_file[str]: the path to the BAM file location
        refFn[str]: the path to the reference file location
        samtools[str]: the path to the SAMtools command
        logger[logging.Logger]: the logger to which stderr and stdout are logged
    """
    #1. Index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    #2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    log_writer.info_header(logger, "Generate pileup file")
    process = subprocess.Popen([samtools, 'mpileup', '-B', '-A', '-f', refFn, sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    with open(pileFn, 'w') as pileupFile:
        for l in process.stdout:
            pileupFile.write(l)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
def flanking_regions(profile_file_directory, output_directory, logger):
    """
    Function
    (1) Extract flanking regions of 100bp upstream and downstream of each MLST
        locus by BLASTing against a reference genome. BLAST uses the first locus
        sequence as a query.
    (2) Create a summary.txt file (a tab-delimited text file listing the paths
        to the loci and flanking sequences)

    The options of the method:
        profile_file_directory[str]: the path to the directory holding reference.seq,
            profile.txt and the locus variant sequence (*.fas) files
        output_directory[str]: the path to where the summary.txt file will be created
        logger[logging.Logger]: the logger to which stderr and stdout are logged
    """
    reference_fasta_file = profile_file_directory + "/reference.seq"
    refseq_record = SeqIO.read(reference_fasta_file, "fasta", generic_dna)
    locus_files = sorted(glob.glob(profile_file_directory + "/*.fas"))
    summary_file_handle = open(output_directory + "/summary.txt", "w")
    for seq in locus_files:
        (seqDir, seqFileName) = os.path.split(seq)
        (seqBaseName, ext) = os.path.splitext(seqFileName)
        # use the first variant of each locus as the BLAST bait
        bait = seqBaseName + "_bait.fasta"
        log_writer.info_header(logger, "create bait file")
        process = subprocess.Popen(
            ['seqret', seq, '-firstonly', '-auto', '-out', output_directory + '/' + bait],
            stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
        cline = NcbiblastnCommandline(query=output_directory + '/' + bait,
                                      db=profile_file_directory + "/reference",
                                      evalue=0.001,
                                      out=output_directory + "/my_blast_tmp.xml",
                                      outfmt=5)
        stdout_log_output, stderr_log_output = cline()
        result_handle = open(output_directory + "/my_blast_tmp.xml")
        blast_record = NCBIXML.read(result_handle)
        query_length = blast_record.query_letters
        for alignment in blast_record.alignments:
            hsp = alignment.hsps[0]
            # only accept hits covering more than half of the query
            if hsp.align_length / float(query_length) > 0.5:
                # extend the subject coordinates by the query bases the HSP does
                # not cover, so the flanks sit against the full locus
                if hsp.sbjct_start > hsp.sbjct_end:
                    subject_start = hsp.sbjct_start + (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end - (query_length - hsp.query_end)
                    revcomp = -1  # hit is on the reverse strand
                else:
                    subject_start = hsp.sbjct_start - (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end + (query_length - hsp.query_end)
                    revcomp = 1
                left_coords = [min(subject_start, subject_end) - 100,
                               min(subject_start, subject_end) - 1]
                right_coords = [max(subject_start, subject_end) + 1,
                                max(subject_start, subject_end) + 100]
                left_cmd = ["seqret ", reference_fasta_file,
                            " -sbegin ", str(left_coords[0]),
                            " -send ", str(left_coords[1]),
                            " -osformat fasta -auto -out " + output_directory + "/tmp_left_flank.fasta"]
                os.system(''.join(left_cmd))
                right_cmd = ["seqret ", reference_fasta_file,
                             " -sbegin ", str(right_coords[0]),
                             " -send ", str(right_coords[1]),
                             " -osformat fasta -auto -out " + output_directory + "/tmp_right_flank.fasta"]
                os.system(''.join(right_cmd))
                left_record = SeqIO.read(output_directory + "/tmp_left_flank.fasta", "fasta")
                if revcomp < 0:
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                right_record = SeqIO.read(output_directory + "/tmp_right_flank.fasta", "fasta")
                if revcomp < 0:
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""
                out_handle = open(output_directory + "/" + seqBaseName + "_flanks.fasta", "w")
                out_handle.write(right_record.format("fasta"))
                out_handle.write(left_record.format("fasta"))
                out_handle.close()
        summary_file_handle.write('\t'.join([
            seqBaseName, seq,
            output_directory + "/" + seqBaseName + "_flanks.fasta"
        ]) + "\n")
    summary_file_handle.close()
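# A worked example (invented numbers, not from the pipeline) of the coordinate
# extension performed above for a forward-strand hit: the HSP may not cover the
# whole query, so the subject coordinates are pushed outward by the uncovered
# query bases before the 100 bp flank windows are taken.
def _flank_coords_example():
    query_length = 450
    query_start, query_end = 11, 440        # HSP covers query bases 11..440
    sbjct_start, sbjct_end = 10011, 10440   # forward strand: start < end
    subject_start = sbjct_start - (query_start - 1)        # 10001: back up the 10 uncovered bases
    subject_end = sbjct_end + (query_length - query_end)   # 10450: extend by the 10 uncovered bases
    left_coords = [subject_start - 100, subject_start - 1]  # [9901, 10000]
    right_coords = [subject_end + 1, subject_end + 100]     # [10451, 10550]
    return left_coords, right_coords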
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
    Function
    (1) Concatenate the flanking regions to the corresponding locus variant
        sequences in fasta format. The newly concatenated sequences are then
        indexed by Bowtie2.
    (2) Extract and store as pickled objects:
        a. locus-variant names (loci.pkl)
        b. start and end positions of the locus variant sequences, without the
           flanking sequences (ranges.pkl)
        c. locus variant sequences (refSeqs.pkl)

    The options of the method:
        specFn[str]: a tab-delimited text file listing the paths to the flanking
            and locus sequences (summary.txt)
        tmp_dir[str]: the path to where refSeqs.pkl, ranges.pkl and loci.pkl will be created
        bowtie[str]: the command used to index the reference sequence
        logger[logging.Logger]: the logger to which stderr and stdout are logged

    Return
        loci[list]: loci names
    """
    (specDir, summaryFileName) = os.path.split(specFn)
    spc = []
    for l in open(specFn):
        spc.append(l.split())
    refFn = os.path.join(tmp_dir, "reference.fa")
    rf = open(refFn, "w")
    ranges = {}
    loci = []
    refSeqs = {}
    for (loc, variantsFn, flanksFn) in spc:
        loci.append(loc)
        fs = {}
        with open(os.path.join(specDir, flanksFn)) as f:
            for r in Bio.SeqIO.parse(f, "fasta"):
                fs[r.id] = r.seq
        with open(os.path.join(specDir, variantsFn)) as f:
            for r in Bio.SeqIO.parse(f, "fasta"):
                # build up-flank + variant + down-flank as one reference record
                s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
                s += fs['up']
                s += r.seq
                s += fs['down']
                Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)], rf, "fasta")
                ranges[r.id] = (len(fs['up']), len(fs['up']) + len(r.seq))
                refSeqs[r.id] = s
    rf.close()
    # start and end positions of the locus variant sequences (without the flanking sequences)
    rangesFn = os.path.join(tmp_dir, "ranges.pkl")
    with open(rangesFn, 'wb') as f:
        pickle.dump(ranges, f)
    lociFn = os.path.join(tmp_dir, "loci.pkl")
    with open(lociFn, 'wb') as f:
        pickle.dump(loci, f)
    refSeqsFn = os.path.join(tmp_dir, "refSeqs.pkl")  # locus variant sequences
    with open(refSeqsFn, 'wb') as f:
        pickle.dump(refSeqs, f)
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    # generate index of the reference fasta for mapping
    process = subprocess.Popen([bowtie2_index, refFn, refFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # remove the summary file itself; the original ran "rm -f summary.txt",
    # which only hit whichever summary.txt happened to be in the cwd
    if os.path.exists(specFn):
        os.remove(specFn)
    return loci
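# A minimal sketch (not part of the pipeline; tmp_dir and the locus-variant id
# below are hypothetical) of consuming the pickled artifacts written above:
# ranges.pkl maps each locus-variant id to the (start, end) of the bare allele
# within its flanked reference record, so the allele can be sliced back out of
# the sequence stored in refSeqs.pkl.
def _load_allele_example(tmp_dir, allele_id='adk-1'):
    with open(os.path.join(tmp_dir, 'ranges.pkl'), 'rb') as f:
        ranges = pickle.load(f)
    with open(os.path.join(tmp_dir, 'refSeqs.pkl'), 'rb') as f:
        refSeqs = pickle.load(f)
    start, end = ranges[allele_id]
    return refSeqs[allele_id][start:end]  # flanks stripped off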
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir, bowtie, samtools, id, logger):
    """
    This function runs bowtie to map the fastq files against the reference file provided.

    :param fastqs: directory that contains two fastq files. It will be in the following format:
        id.workflow.version.suffix, e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
    :type fastqs: directory
    :param reference_fasta_file_path: reference fasta file, defined in the find_serotype function.
    :type reference_fasta_file_path: file
    :param bowtie: path to bowtie
    :type bowtie: path
    :param samtools: path to samtools
    :type samtools: path
    :returns: sorted and indexed bam file.
    :rtype: file
    """
    try:
        os.makedirs(output_dir + "/tmp")
    except OSError:
        if os.path.isdir(output_dir + "/tmp"):
            # the directory already exists, which is fine
            pass
        else:
            # there was an error on creation, so make sure we know about it
            raise
    null = open(os.devnull, 'w')
    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(output_dir + "/tmp", id + '.tmp')  # temporary sam output
    sam = os.path.join(output_dir + "/tmp", id + '.sam')
    bam = os.path.join(output_dir + "/tmp", id + '.bam')
    # copy the reference fasta file to the tmp directory and index it
    reference_fasta_file = output_dir + "/tmp/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference index")
    # generate index of the reference fasta for mapping
    process = subprocess.Popen([bowtie_index, reference_fasta_file, reference_fasta_file],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)
    # run bowtie
    cmd = [bowtie]
    cmd += ['--fr', '--minins', '300', '--maxins', '1100',
            '-x', reference_fasta_file, '-1', fastqs[0], '-2', fastqs[1], '-S', sam,
            '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50']
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # run remove_secondary_mapping_bit to clear the secondary-alignment bit (256) in the sam
    # file; the sam_parsed file is the output that is used to convert to bam
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)
    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam_parsed],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # sort bam (old samtools syntax: sort <in.bam> <out.prefix>)
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', bam, bam_sorted],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return bam_out
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools, ids, logger):
    """
    Function
    (1) Map each read set to each of the possible locus variants by calling
        Bowtie2 (with very sensitive options) and create a tmp file
    (2) Convert the tmp file to a sam file by clearing the secondary alignment bit
    (3) Convert the sam file to a BAM file
    (4) Sort the BAM file

    The options for the method:
        tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files will be created
        fastq_files[list]: the paths to the fastq file locations
        refFn[str]: the path to the reference file location
        expand[bool]: if True, clear the secondary-alignment bit before conversion
        bowtie[str]: the path to the Bowtie2 command
        samtools[str]: the path to the SAMtools command
        ids[str]: unique identifier number
        logger[logging.Logger]: the logger to which stderr and stdout are logged

    Return
        sorted_bam_file[str]: sorted BAM file
    """
    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')
    if expand:
        #1. Creating tmp file
        log_writer.info_header(logger, "Creating tmp file")
        # -k 99999 = report up to 99999 good alignments per read;
        # --very-sensitive expands to -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
        process = subprocess.Popen([bowtie, '--fr', '--no-unal',
                                    '--minins', '300', '--maxins', '1100',
                                    '-x', refFn,
                                    '-1', fastq_files[0], '-2', fastq_files[1],
                                    '-S', tmp, '-k', '99999',
                                    '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'],
                                   stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
        #2. remove_secondary_mapping_bit opens and closes its own files; the
        # redundant open() calls around it (which truncated sam early) are gone
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)
    else:
        log_writer.info_header(logger, "Creating sam file")
        process = subprocess.Popen([bowtie, '--fr', '--no-unal',
                                    '--minins', '300', '--maxins', '1100',
                                    '-x', refFn,
                                    '-1', fastq_files[0], '-2', fastq_files[1],
                                    '-S', sam, '-k', '99999',
                                    '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50'],
                                   stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
    #3. Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    #4. Sort bam file (old samtools syntax: sort <in.bam> <out.prefix>)
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return sorted_bam_file
def getNovelAllele(variant, locus, fastq_files, bowtie, samtools, ids, tmp_dir, logger):
    """
    Function
    Generate SAM, BAM and pileup files for a novel allele

    The options for the method:
        variant[str]: locus variant number
        locus[str]: locus name
        fastq_files[list]: the paths to the fastq files
        bowtie[str]: the path to the Bowtie2 command
        samtools[str]: the path to the SAMtools command
        ids[str]: sample unique identifier number
        tmp_dir[str]: the path to where the SAM, BAM and pileup files will be created
    """
    refSeqs = pickle.load(open(os.path.join(tmp_dir, "refSeqs.pkl"), 'rb'))
    allele_name = locus + "-" + variant
    typeFn = os.path.join(tmp_dir, allele_name + ".fa")
    typeFile = open(typeFn, "w")
    s = refSeqs[allele_name]
    Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)], typeFile, "fasta")
    typeFile.close()
    # index the reference for this allele
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "index reference sample")
    process = subprocess.Popen([bowtie2_index, typeFn, typeFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # create sam and bam files
    log_writer.info_header(logger, "Creating sam and bam files")
    bam = create_bam_file(tmp_dir, fastq_files, typeFn, False, bowtie, samtools, ids, logger)
    # rename the bam file after the allele
    bamFn = os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
    process = subprocess.Popen(['mv', bam, bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # generate pileup file
    piFn = os.path.join(tmp_dir, ids + "." + allele_name + '.pileup')
    log_writer.info_header(logger, "generate pileup file")
    process = subprocess.Popen([samtools, 'mpileup', '-B', '-A', '-cf', typeFn, bamFn],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    with open(piFn, 'wb') as f:
        for l in process.stdout:
            f.write(l)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir, bowtie, samtools, id, logger, threads):
    """
    This function runs bowtie to map the fastq files against the reference file provided.

    :param fastqs: directory that contains two fastq files. It will be in the following format:
        id.workflow.version.suffix, e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
    :type fastqs: directory
    :param reference_fasta_file_path: reference fasta file, defined in the find_serotype function.
    :type reference_fasta_file_path: file
    :param bowtie: path to bowtie
    :type bowtie: path
    :param samtools: path to samtools
    :type samtools: path
    :param threads: number of threads passed to bowtie and samtools
    :type threads: int
    :returns: sorted and indexed bam file.
    :rtype: file
    """
    try:
        os.makedirs(output_dir + "/tmp")
    except OSError:
        if os.path.isdir(output_dir + "/tmp"):
            # the directory already exists, which is fine
            pass
        else:
            # there was an error on creation, so make sure we know about it
            raise
    null = open(os.devnull, 'w')
    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(output_dir + "/tmp", id + '.tmp')  # temporary sam output
    sam = os.path.join(output_dir + "/tmp", id + '.sam')
    bam = os.path.join(output_dir + "/tmp", id + '.bam')
    # copy the reference fasta file to the tmp directory and index it
    reference_fasta_file = output_dir + "/tmp/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference index")
    # generate index of the reference fasta for mapping
    process = subprocess.Popen([bowtie_index, reference_fasta_file, reference_fasta_file],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)
    # run bowtie
    cmd = [bowtie]
    cmd += ['--fr', '--minins', '300', '--maxins', '1100',
            '-x', reference_fasta_file, '-1', fastqs[0], '-2', fastqs[1], '-S', sam,
            '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50',
            '-p', str(threads)]
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # run remove_secondary_mapping_bit to clear the secondary-alignment bit (256) in the sam
    # file; the sam_parsed file is the output that is used to convert to bam
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)
    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, '-@', str(threads), sam_parsed],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # sort bam (new samtools syntax: sort -o <out.bam>)
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', '-o', bam_sorted + ".bam", '-@', str(threads), bam],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return bam_out