def test_map_to_ref_seq_01(self):
    """
    Map the flanking reads to the reference and compare the sorted output,
    converted to SAM, against the gold standard SAM files.
    """
    # map both sets of flanking reads and collect the sorted BAM paths
    mapped = mapping_to_ref.map_to_ref_seq(self.ref, self.sample,
                                           self.left_flanking, self.right_flanking,
                                           self.tmp_folder, self.out_folder,
                                           '1', False)
    sorted_bams = (mapped['left_sorted'], mapped['right_sorted'])

    # Sets filecmp's module-level BUFSIZE to 1024 * 10 (10 KiB).
    # NOTE(review): presumably meant to make the comparison look at more of
    # each file - confirm this value is what was intended.
    filecmp.BUFSIZE = 1024 * 10

    # gold standard files, referenced by absolute path (left first, then right)
    gold_sams = (
        '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_left_CP010781.1.sorted.sam',
        '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_right_CP010781.1.sorted.sam',
    )

    # BAM files contain a header specifying the samtools version, so the
    # comparison is done on SAM conversions written next to each BAM
    samtools_runner = mapping_to_query.RunSamtools()
    for bam in sorted_bams:
        run_commands.run_command(samtools_runner.view_bam_to_sam(bam, bam + '.sam'), shell=True)

    # content comparison (shallow=False) of each converted file with its gold file
    for bam, gold in zip(sorted_bams, gold_sams):
        self.assertTrue(filecmp.cmp(bam + '.sam', gold, shallow=False))
def doBlast(blast_input, blast_output, database):
    """
    Run a nucleotide BLAST of blast_input against database.

    Two commands are handed to run_command, in order:
    makeblastdb (nucleotide type) on database, then blastn with
    blast_input as the query, using tabular output format 6 with the columns
    qseqid qlen sacc pident length slen sstart send evalue bitscore qcovs.

    The '>' element followed by blast_output presumably becomes a shell
    redirect once run_command assembles the command - confirm against
    run_command.
    """
    # build the BLAST database from the subject sequences
    makeblastdb_cmd = ['makeblastdb', '-dbtype nucl', '-in', database]
    run_command(makeblastdb_cmd, shell=True)

    # query the database, requesting the custom tabular columns
    tabular_format = '-outfmt "6 qseqid qlen sacc pident length slen sstart send evalue bitscore qcovs"'
    blastn_cmd = ['blastn', '-query', blast_input, '-db', database,
                  tabular_format, '>', blast_output]
    run_command(blastn_cmd, shell=True)
def bwa_index(fasta):
    """
    Make sure a bwa index is available for the given fasta.

    The index counts as present when the file fasta + '.bwt' exists; in that
    case only a log message is written. Otherwise 'bwa index' is run on the
    fasta through run_command.
    """
    if not os.path.exists(fasta + '.bwt'):
        logging.info('Building bwa index for {}...'.format(fasta))
        run_command(['bwa', 'index', fasta], shell=True)
        return

    logging.info('Index for {} is already built...'.format(fasta))
def run(car):
    """
    Drive the car either from the test file or from an interactive session.

    When the module-level test_mode flag is truthy, run_test is called with
    test_file and the car. Otherwise commands are read from standard input
    one line at a time and passed to run_command for as long as car.running
    is truthy, with the car's state printed after each command.

    Pressing Ctrl-C during the interactive session ends it: a message is
    printed and car.check_simulation_end() is called.
    """
    if test_mode:
        print('RUNNING TEST!')
        run_test(test_file, car)
        return

    try:
        print('RUNNING SIMULATION!')
        while car.running:
            command_ = input()
            run_command(command_, car)
            car.print_state()
        print('SIMULATION OVER.')
    # Catch the class itself: an except clause given an instance
    # (KeyboardInterrupt()) raises TypeError instead of handling Ctrl-C.
    except KeyboardInterrupt:
        print('EXITING SIMULATION!')
        # NOTE(review): kept inside the interrupt handler, directly after the
        # exit message - confirm it is not meant to run on every exit path.
        car.check_simulation_end()
def get_ref_positions(reference, is_query, positions_list):
    """
    Find known IS sites in the reference and append them to positions_list.

    The IS query is BLASTed against the reference via doBlast, writing to a
    temporary file in the current working directory named
    <query file name>_<reference file name>.tmp. Every hit whose column 3 is
    at least 90 and whose column 10 is at least 95 (with the output format
    requested by doBlast, these are pident and qcovs) becomes a Position
    spanning the smaller to the larger of columns 6 and 7 (sstart, send).
    Its orientation is 'R' when column 6 is greater than column 7, otherwise
    'F', and its isolate_dict marks the reference name with '+'.

    The temporary file is removed afterwards.

    Returns the (mutated) positions_list and the reference file name.
    """
    # file names (without directories) are used to label the temp output
    query_basename = os.path.basename(is_query)
    ref_name = os.path.basename(reference)
    blast_output = os.path.join(os.getcwd(), query_basename + '_' + ref_name + '.tmp')

    # BLAST the IS query against the reference
    doBlast(is_query, blast_output, reference)

    with open(blast_output) as blast_hits:
        for hit in blast_hits:
            columns = hit.strip('\n').split('\t')
            # a known site has to match the query at least 90% with coverage of 95
            if not (float(columns[3]) >= 90 and float(columns[10]) >= 95):
                continue
            start, end = int(columns[6]), int(columns[7])
            known_site = Position(min(start, end), max(start, end))
            # subject coordinates running backwards mean a reverse hit
            known_site.orientation = 'R' if start > end else 'F'
            known_site.isolate_dict = {ref_name: '+'}
            positions_list.append(known_site)

    # clean up the temporary BLAST output
    run_command(['rm', blast_output], shell=True)

    return positions_list, ref_name
def test_map_to_ref_seq_01(self):
    """
    Map the flanking reads to the reference and compare the sorted output,
    converted to SAM, against the gold standard SAM files.
    """
    # map both sets of flanking reads and collect the sorted BAM paths
    mapped = mapping_to_ref.map_to_ref_seq(self.ref, self.sample,
                                           self.left_flanking, self.right_flanking,
                                           self.tmp_folder, self.out_folder,
                                           '1', False)
    sorted_bams = (mapped['left_sorted'], mapped['right_sorted'])

    # Sets filecmp's module-level BUFSIZE to 1024 * 10 (10 KiB).
    # NOTE(review): presumably meant to make the comparison look at more of
    # each file - confirm this value is what was intended.
    filecmp.BUFSIZE = 1024 * 10

    # gold standard files, referenced by absolute path (left first, then right)
    gold_sams = (
        '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_left_CP010781.1.sorted.sam',
        '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_right_CP010781.1.sorted.sam',
    )

    # BAM files contain a header specifying the samtools version, so the
    # comparison is done on SAM conversions written next to each BAM
    samtools_runner = mapping_to_query.RunSamtools()
    for bam in sorted_bams:
        run_commands.run_command(samtools_runner.view_bam_to_sam(bam, bam + '.sam'), shell=True)

    # content comparison (shallow=False) of each converted file with its gold file
    for bam, gold in zip(sorted_bams, gold_sams):
        self.assertTrue(filecmp.cmp(bam + '.sam', gold, shallow=False))
def map_to_ref_seq(ref_seq, sample_name, left_flanking, right_flanking, tmp, out, bwa_threads, bwa_all):
    """
    Map the left and right flanking reads to a reference sequence and
    produce sorted, indexed BAM files for each side.

    Steps, in order: build the output file names, write the reference to a
    temporary fasta, make sure it is bwa-indexed, run 'bwa mem' for each side
    into the 'left_sam' / 'right_sam' files, convert each SAM to BAM, sort,
    and index the sorted BAMs.

    ref_seq: reference record; its .id is used for file naming and the record
        itself is written out with create_tmp_file.
    sample_name, tmp, out: passed to set_ref_output_filenames.
    left_flanking, right_flanking: read files given to 'bwa mem'.
    bwa_threads: value for 'bwa mem -t'. Presumably a string, since it is
        placed directly in the command's argument list - confirm against
        run_command.
    bwa_all: when truthy, '-a' is added to 'bwa mem' so that all alignments
        are reported.

    Returns the dictionary of file names from set_ref_output_filenames.
    """
    filenames = set_ref_output_filenames(sample_name, ref_seq.id, tmp, out)

    # write the reference to a temp fasta and index it
    ref_seq_file = create_tmp_file(ref_seq, filenames['ref_tmp'], 'fasta')
    bwa_index(ref_seq_file)

    samtools_runner = RunSamtools()

    # Run bwa mem exactly once per side. '-a' (report all alignments) is
    # added only when bwa_all is set; running a second, default mapping into
    # the same SAM files would overwrite the all-alignments output.
    bwa_mem = ['bwa', 'mem', '-t', bwa_threads]
    if bwa_all:
        bwa_mem.append('-a')
    run_command(bwa_mem + [ref_seq_file, left_flanking, '>', filenames['left_sam']], shell=True)
    run_command(bwa_mem + [ref_seq_file, right_flanking, '>', filenames['right_sam']], shell=True)

    # convert sams to bams
    run_command(samtools_runner.view(filenames['left_bam'], filenames['left_sam']), shell=True)
    run_command(samtools_runner.view(filenames['right_bam'], filenames['right_sam']), shell=True)

    # sort bams
    run_command(samtools_runner.sort(filenames['left_sorted'], filenames['left_bam']), shell=True)
    run_command(samtools_runner.sort(filenames['right_sorted'], filenames['right_bam']), shell=True)

    # index sorted bams
    run_command(samtools_runner.index(filenames['left_sorted']), shell=True)
    run_command(samtools_runner.index(filenames['right_sorted']), shell=True)

    return filenames
def create_bed_files(filenames, cutoff, merging):
    """
    Build the coverage, merged, intersect, closest and unpaired BED files
    for the left and right sorted BAMs.

    filenames: dictionary of input and output paths; read here, never modified.
    cutoff: depth cutoff handed to filter_on_depth.
    merging: value for 'bedtools merge -d'.

    If the closestBed run on the two filtered merged files raises
    BedtoolsError, create_typing_output is called with None for its five
    remaining arguments and the function returns straight away. If either
    unpaired closestBed run raises BedtoolsError, an empty output file is
    created when none (or only an empty one) exists.

    Returns filenames.
    """
    left_bam, right_bam = filenames['left_sorted'], filenames['right_sorted']

    def merge_regions(source_key, target_key):
        # bedtools merge of one BED file into another, using the merging distance
        run_command(['bedtools', 'merge', '-d', merging, '-i', filenames[source_key],
                     '>', filenames[target_key]], shell=True)

    def closest_or_empty(a_key, b_key, target_key):
        # closestBed; on BedtoolsError make sure an (empty) output file exists
        try:
            run_command(['closestBed', '-a', filenames[a_key], '-b', filenames[b_key],
                         '-d', '>', filenames[target_key]], shell=True)
        except BedtoolsError:
            target = filenames[target_key]
            if not os.path.isfile(target) or os.stat(target).st_size == 0:
                open(target, 'w').close()

    # coverage information for each side
    for bam, cov_key in ((left_bam, 'left_cov'), (right_bam, 'right_cov')):
        run_command(['bedtools', 'genomecov', '-ibam', bam, '-bg', '>', filenames[cov_key]],
                    shell=True)

    # merge the unfiltered coverage
    for cov_key, merged_key in (('left_cov', 'left_merged'), ('right_cov', 'right_merged')):
        merge_regions(cov_key, merged_key)

    # keep only regions passing the depth cutoff, then merge those
    for cov_key, final_key in (('left_cov', 'left_final_cov'), ('right_cov', 'right_final_cov')):
        filter_on_depth(filenames[cov_key], filenames[final_key], cutoff)
    for final_key, bed_key in (('left_final_cov', 'left_merged_bed'),
                               ('right_final_cov', 'right_merged_bed')):
        merge_regions(final_key, bed_key)

    # regions present on both sides
    run_command(['bedtools', 'intersect', '-a', filenames['left_merged_bed'],
                 '-b', filenames['right_merged_bed'], '-wo', '>', filenames['intersect']],
                shell=True)

    # regions that are close but not overlapping
    try:
        run_command(['closestBed', '-a', filenames['left_merged_bed'],
                     '-b', filenames['right_merged_bed'], '-d', '>', filenames['closest']],
                    shell=True)
    except BedtoolsError:
        # one or more of the inputs are empty: report no hits and stop here
        create_typing_output(filenames, None, None, None, None, None)
        return filenames

    # unpaired hits: filtered regions on one side against unfiltered on the other
    closest_or_empty('left_merged_bed', 'right_merged', 'left_unpaired')
    closest_or_empty('left_merged', 'right_merged_bed', 'right_unpaired')

    return filenames
def create_bed_files(filenames, cutoff, merging):
    """
    Build the coverage, merged, intersect, closest and unpaired BED files
    for the left and right sorted BAMs.

    filenames: dictionary of input and output paths; read here, never modified.
    cutoff: depth cutoff handed to filter_on_depth.
    merging: value for 'bedtools merge -d'.

    If the closestBed run on the two filtered merged files raises
    BedtoolsError, create_typing_output is called with None for its five
    remaining arguments and the function returns straight away. If either
    unpaired closestBed run raises BedtoolsError, an empty output file is
    created when none (or only an empty one) exists.

    Returns filenames.
    """
    left_bam, right_bam = filenames['left_sorted'], filenames['right_sorted']

    def merge_regions(source_key, target_key):
        # bedtools merge of one BED file into another, using the merging distance
        run_command(['bedtools', 'merge', '-d', merging, '-i', filenames[source_key],
                     '>', filenames[target_key]], shell=True)

    def closest_or_empty(a_key, b_key, target_key):
        # closestBed; on BedtoolsError make sure an (empty) output file exists
        try:
            run_command(['closestBed', '-a', filenames[a_key], '-b', filenames[b_key],
                         '-d', '>', filenames[target_key]], shell=True)
        except BedtoolsError:
            target = filenames[target_key]
            if not os.path.isfile(target) or os.stat(target).st_size == 0:
                open(target, 'w').close()

    # coverage information for each side
    for bam, cov_key in ((left_bam, 'left_cov'), (right_bam, 'right_cov')):
        run_command(['bedtools', 'genomecov', '-ibam', bam, '-bg', '>', filenames[cov_key]],
                    shell=True)

    # merge the unfiltered coverage
    for cov_key, merged_key in (('left_cov', 'left_merged'), ('right_cov', 'right_merged')):
        merge_regions(cov_key, merged_key)

    # keep only regions passing the depth cutoff, then merge those
    for cov_key, final_key in (('left_cov', 'left_final_cov'), ('right_cov', 'right_final_cov')):
        filter_on_depth(filenames[cov_key], filenames[final_key], cutoff)
    for final_key, bed_key in (('left_final_cov', 'left_merged_bed'),
                               ('right_final_cov', 'right_merged_bed')):
        merge_regions(final_key, bed_key)

    # regions present on both sides
    run_command(['bedtools', 'intersect', '-a', filenames['left_merged_bed'],
                 '-b', filenames['right_merged_bed'], '-wo', '>', filenames['intersect']],
                shell=True)

    # regions that are close but not overlapping
    try:
        run_command(['closestBed', '-a', filenames['left_merged_bed'],
                     '-b', filenames['right_merged_bed'], '-d', '>', filenames['closest']],
                    shell=True)
    except BedtoolsError:
        # one or more of the inputs are empty: report no hits and stop here
        create_typing_output(filenames, None, None, None, None, None)
        return filenames

    # unpaired hits: filtered regions on one side against unfiltered on the other
    closest_or_empty('left_merged_bed', 'right_merged', 'left_unpaired')
    closest_or_empty('left_merged', 'right_merged_bed', 'right_unpaired')

    return filenames
def map_to_ref_seq(ref_seq, sample_name, left_flanking, right_flanking, tmp, out, bwa_threads, bwa_all):
    """
    Map the left and right flanking reads to a reference sequence and
    produce sorted, indexed BAM files for each side.

    Steps, in order: build the output file names, write the reference to a
    temporary fasta, make sure it is bwa-indexed, run 'bwa mem' for each side
    into the 'left_sam' / 'right_sam' files, convert each SAM to BAM, sort,
    and index the sorted BAMs.

    ref_seq: reference record; its .id is used for file naming and the record
        itself is written out with create_tmp_file.
    sample_name, tmp, out: passed to set_ref_output_filenames.
    left_flanking, right_flanking: read files given to 'bwa mem'.
    bwa_threads: value for 'bwa mem -t'. Presumably a string, since it is
        placed directly in the command's argument list - confirm against
        run_command.
    bwa_all: when truthy, '-a' is added to 'bwa mem' so that all alignments
        are reported.

    Returns the dictionary of file names from set_ref_output_filenames.
    """
    filenames = set_ref_output_filenames(sample_name, ref_seq.id, tmp, out)

    # write the reference to a temp fasta and index it
    ref_seq_file = create_tmp_file(ref_seq, filenames['ref_tmp'], 'fasta')
    bwa_index(ref_seq_file)

    samtools_runner = RunSamtools()

    # Run bwa mem exactly once per side. '-a' (report all alignments) is
    # added only when bwa_all is set; running a second, default mapping into
    # the same SAM files would overwrite the all-alignments output.
    bwa_mem = ['bwa', 'mem', '-t', bwa_threads]
    if bwa_all:
        bwa_mem.append('-a')
    run_command(bwa_mem + [ref_seq_file, left_flanking, '>', filenames['left_sam']], shell=True)
    run_command(bwa_mem + [ref_seq_file, right_flanking, '>', filenames['right_sam']], shell=True)

    # convert sams to bams
    run_command(samtools_runner.view(filenames['left_bam'], filenames['left_sam']), shell=True)
    run_command(samtools_runner.view(filenames['right_bam'], filenames['right_sam']), shell=True)

    # sort bams
    run_command(samtools_runner.sort(filenames['left_sorted'], filenames['left_bam']), shell=True)
    run_command(samtools_runner.sort(filenames['right_sorted'], filenames['right_bam']), shell=True)

    # index sorted bams
    run_command(samtools_runner.index(filenames['left_sorted']), shell=True)
    run_command(samtools_runner.index(filenames['right_sorted']), shell=True)

    return filenames
def map_to_is_query(sample, is_query, output_sample, min_clip, max_clip, threads):
    """
    Map a sample's reads to an IS query and collect the reads flanking it.

    In order, this:
    - creates <output_sample>/<is_query.id> and its 'tmp' subfolder
    - sets up the output file names
    - writes the IS query to a temporary fasta and bwa-indexes it
    - maps sample.forward and sample.reverse to the query with 'bwa mem'
    - pulls two sets of reads out of the SAM with samtools view
      (smallF=36 for the left side; smallF=4, bigF=40 for the right side)
    - turns both BAMs into fastq files with 'bedtools bamtofastq'
    - extracts clipped reads between min_clip and max_clip
    - concatenates clipped reads and fastq reads into the final read files

    Returns a 4-tuple: the final left reads path, the final right reads path,
    the IS output folder and the IS temp folder.
    """
    samtools_runner = RunSamtools()

    # output folders for this IS query inside the sample's folder
    query_folder = os.path.join(output_sample, is_query.id)
    query_tmp_folder = os.path.join(output_sample, is_query.id, 'tmp')
    make_directories([query_folder, query_tmp_folder])
    logging.info('Created output folder %s', query_folder)

    filenames = set_output_filenames(query_tmp_folder, sample.prefix, is_query.id, query_folder)

    # temp fasta of the IS query, indexed for bwa
    query_fasta = create_tmp_file(is_query, filenames['query_tmp'], 'fasta')
    bwa_index(query_fasta)

    # map the sample's reads to the query
    bwa_cmd = ['bwa', 'mem', '-t', threads, query_fasta,
               str(sample.forward), str(sample.reverse), '>', filenames['sam']]
    run_command(bwa_cmd, shell=True)

    # pull out unmapped reads flanking the IS, one BAM per side
    left_view = samtools_runner.view(filenames['left_bam'], filenames['sam'], smallF=36)
    run_command(left_view, shell=True)
    right_view = samtools_runner.view(filenames['right_bam'], filenames['sam'], smallF=4, bigF=40)
    run_command(right_view, shell=True)

    # turn the BAMs into reads for mapping
    for bam_key, reads_key in (('left_bam', 'left_reads'), ('right_bam', 'right_reads')):
        run_command(['bedtools', 'bamtofastq', '-i', filenames[bam_key],
                     '-fq', filenames[reads_key]], shell=True)

    # soft clipped reads within the requested size range
    logging.info('Extracting soft clipped reads that are <= %s bp and >= %s bp',
                 str(max_clip), str(min_clip))
    extract_clipped_reads(filenames['sam'], min_clip, max_clip,
                          filenames['left_clipped'], filenames['right_clipped'])

    # final fastq per side = clipped reads followed by the flanking reads
    for clipped_key, reads_key, final_key in (('left_clipped', 'left_reads', 'left_final'),
                                              ('right_clipped', 'right_reads', 'right_final')):
        run_command(['cat', filenames[clipped_key], filenames[reads_key],
                     '>', filenames[final_key]], shell=True)

    logging.info('Successfully extracted reads flanking left and right end of IS query')

    return filenames['left_final'], filenames['right_final'], query_folder, query_tmp_folder