예제 #1
0
    def test_map_to_ref_seq_01(self):
        """
        Map the flanking reads to the reference and check that both sorted
        alignments match the gold standard SAM files.
        """
        mapped = mapping_to_ref.map_to_ref_seq(
            self.ref, self.sample, self.left_flanking, self.right_flanking,
            self.tmp_folder, self.out_folder, '1', False)
        left_bam = mapped['left_sorted']
        right_bam = mapped['right_sorted']

        # Override the chunk size used by filecmp.
        # NOTE(review): 1024 * 10 is 10 KiB, not the 8 Mb an earlier comment
        # mentioned; the comparisons below use shallow=False - confirm whether
        # this override is still needed.
        filecmp.BUFSIZE = 1024 * 10

        # NOTE(review): absolute, machine-specific paths to the expected output.
        gold_left_sorted_sam = '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_left_CP010781.1.sorted.sam'
        gold_right_sorted_sam = '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_right_CP010781.1.sorted.sam'

        # BAM files carry a header recording the samtools version, so convert
        # them to SAM before doing the comparison.
        samtools_runner = mapping_to_query.RunSamtools()
        left_sam = left_bam + '.sam'
        right_sam = right_bam + '.sam'
        left_to_sam = samtools_runner.view_bam_to_sam(left_bam, left_sam)
        run_commands.run_command(left_to_sam, shell=True)
        right_to_sam = samtools_runner.view_bam_to_sam(right_bam, right_sam)
        run_commands.run_command(right_to_sam, shell=True)

        # The converted SAM files must match the gold standard content exactly.
        left_matches = filecmp.cmp(left_sam, gold_left_sorted_sam, shallow=False)
        self.assertTrue(left_matches)
        right_matches = filecmp.cmp(right_sam, gold_right_sorted_sam, shallow=False)
        self.assertTrue(right_matches)
예제 #2
0
def doBlast(blast_input, blast_output, database):
    '''
    Run a nucleotide BLAST of blast_input against database.

    First builds a nucleotide BLAST database from `database` with
    makeblastdb, then runs blastn and redirects its tabular output
    (qseqid, qlen, sacc, pident, length, slen, sstart, send, evalue,
    bitscore, qcovs) into blast_output. Both commands are executed
    through run_command.
    '''
    make_db_cmd = ['makeblastdb', '-dbtype nucl', '-in', database]
    run_command(make_db_cmd, shell=True)

    # Column layout requested from blastn; callers index into these fields.
    tabular_format = '-outfmt "6 qseqid qlen sacc pident length slen sstart send evalue bitscore qcovs"'
    blastn_cmd = ['blastn', '-query', blast_input, '-db', database, tabular_format, '>', blast_output]
    run_command(blastn_cmd, shell=True)
예제 #3
0
def bwa_index(fasta):
    """
    Make sure a bwa index exists for the given fasta file.

    The presence of `<fasta>.bwt` is taken as proof that the index has
    already been built; otherwise `bwa index` is run on the fasta.
    """
    index_marker = fasta + '.bwt'

    if not os.path.exists(index_marker):
        logging.info('Building bwa index for {}...'.format(fasta))
        run_command(['bwa', 'index', fasta], shell=True)
        return

    logging.info('Index for {} is already built...'.format(fasta))
 def run(car):
     """
     Drive the car either from a test file or interactively from stdin.

     When the module-level `test_mode` flag is truthy, `run_test` is called
     with the module-level `test_file` and the car. Otherwise commands are
     read from stdin one line at a time and passed to `run_command` for as
     long as `car.running` is truthy, after which the final state is printed.
     Pressing Ctrl-C during the interactive loop ends the simulation through
     `car.check_simulation_end()`.
     """
     if test_mode:
         print('RUNNING TEST!')
         run_test(test_file, car)
         return

     try:
         print('RUNNING SIMULATION!')
         while car.running:
             command_ = input()
             run_command(command_, car)
         car.print_state()
         print('SIMULATION OVER.')
     # The except clause must name the exception class; the previous
     # `KeyboardInterrupt()` instance made Ctrl-C raise a TypeError instead
     # of being handled here.
     except KeyboardInterrupt:
         print('EXITING SIMULATION!')
         car.check_simulation_end()
예제 #5
0
def get_ref_positions(reference, is_query, positions_list):
    '''
    Get the coordinates of known IS sites in the reference.

    BLASTs the IS query against the reference (via doBlast), writing the
    hits to a temporary file in the current working directory, and appends
    one Position per qualifying hit to positions_list. Each Position gets
    an `orientation` ('F' or 'R') and an `isolate_dict` marking the
    reference as present ('+').

    reference: path to the reference sequence file.
    is_query: path to the IS query sequence file.
    positions_list: list that the new Position objects are appended to
        (modified in place).

    Returns the positions list and the reference file name (used by
    callers for file naming).
    '''
    # Name the temp file after the IS query and the reference
    is_name = os.path.split(is_query)[1]
    ref_name = os.path.split(reference)[1]
    blast_output = os.path.join(os.getcwd(), is_name + '_' + ref_name + '.tmp')

    try:
        # Do a BLAST of the IS query and the reference
        doBlast(is_query, blast_output, reference)

        # Open the BLAST output and get IS query sites
        with open(blast_output) as out:
            for line in out:
                record = line.strip('\n')
                # A blank line carries no hit; skip it instead of failing
                # on the field lookups below.
                if not record:
                    continue
                fields = record.split('\t')
                # To be a known site, the hit has to match the query with at
                # least 90% identity (field 3) and 95% coverage (field 10)
                if float(fields[3]) >= 90 and float(fields[10]) >= 95:
                    left_pos = int(fields[6])
                    right_pos = int(fields[7])
                    # Start after end means the hit is on the reverse strand
                    orientation = 'R' if left_pos > right_pos else 'F'
                    new_pos = Position(min(left_pos, right_pos),
                                       max(left_pos, right_pos))
                    new_pos.orientation = orientation
                    new_pos.isolate_dict = {ref_name: '+'}
                    positions_list.append(new_pos)
    finally:
        # Always remove the temp file, even when BLAST or parsing failed.
        # os.remove avoids spawning a shell and copes with any characters
        # in the path.
        if os.path.exists(blast_output):
            os.remove(blast_output)

    return positions_list, ref_name
예제 #6
0
def get_ref_positions(reference, is_query, positions_list):
    '''
    Get the coordinates of known IS sites in the reference.

    Runs doBlast with the IS query against the reference, writing the hits
    to a temporary file in the current working directory. Every hit that
    passes the identity and coverage thresholds is turned into a Position
    carrying an `orientation` ('F' or 'R') and an `isolate_dict` that marks
    the reference as present ('+'), and is appended to positions_list.

    reference: path to the reference sequence file.
    is_query: path to the IS query sequence file.
    positions_list: list that receives the new Position objects
        (modified in place).

    Returns the positions list and the reference file name (used by
    callers for file naming).
    '''
    # Name the temp file after the IS query and the reference
    is_name = os.path.split(is_query)[1]
    ref_name = os.path.split(reference)[1]
    blast_output = os.path.join(os.getcwd(), is_name + '_' + ref_name + '.tmp')

    try:
        # Do a BLAST of the IS query and the reference
        doBlast(is_query, blast_output, reference)

        # Open the BLAST output and get IS query sites
        with open(blast_output) as out:
            for line in out:
                record = line.strip('\n')
                # Skip blank lines rather than crashing on missing fields
                if not record:
                    continue
                fields = record.split('\t')
                # To be a known site, the hit has to match the query with at
                # least 90% identity (field 3) and 95% coverage (field 10)
                if float(fields[3]) >= 90 and float(fields[10]) >= 95:
                    left_pos = int(fields[6])
                    right_pos = int(fields[7])
                    # Start after end means the hit is on the reverse strand
                    orientation = 'R' if left_pos > right_pos else 'F'
                    new_pos = Position(min(left_pos, right_pos), max(left_pos, right_pos))
                    new_pos.orientation = orientation
                    new_pos.isolate_dict = {ref_name: '+'}
                    positions_list.append(new_pos)
    finally:
        # Clean up the temp file on every exit path; os.remove needs no
        # shell and is safe for paths with spaces or special characters.
        if os.path.exists(blast_output):
            os.remove(blast_output)

    return positions_list, ref_name
예제 #7
0
    def test_map_to_ref_seq_01(self):
        """
        After mapping to the reference, the reads in both sorted BAM files
        must match the gold standard SAM files.
        """
        outputs = mapping_to_ref.map_to_ref_seq(self.ref, self.sample,
                                                self.left_flanking,
                                                self.right_flanking,
                                                self.tmp_folder,
                                                self.out_folder, '1', False)
        left_sorted = outputs['left_sorted']
        right_sorted = outputs['right_sorted']

        # Override filecmp's chunk size.
        # NOTE(review): the value is 10 KiB (1024 * 10), not the 8 Mb an
        # earlier comment claimed - confirm the intended size.
        filecmp.BUFSIZE = 1024 * 10

        # NOTE(review): the expected files live at absolute, user-specific paths.
        gold_left_sorted_sam = '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_left_CP010781.1.sorted.sam'
        gold_right_sorted_sam = '/Users/jane/Desktop/ismap_v2/gold_standard_files/9262_1#29_right_CP010781.1.sorted.sam'

        # The BAM header records the samtools version, so the comparison is
        # done on SAM conversions of the sorted BAM files instead.
        samtools_runner = mapping_to_query.RunSamtools()
        comparisons = [
            (left_sorted, gold_left_sorted_sam),
            (right_sorted, gold_right_sorted_sam),
        ]

        # Convert both BAM files first ...
        for sorted_bam, _ in comparisons:
            conversion = samtools_runner.view_bam_to_sam(sorted_bam,
                                                         sorted_bam + '.sam')
            run_commands.run_command(conversion, shell=True)

        # ... then compare each converted file against its gold standard.
        for sorted_bam, gold_sam in comparisons:
            self.assertTrue(
                filecmp.cmp(sorted_bam + '.sam', gold_sam, shallow=False))
예제 #8
0
def map_to_ref_seq(ref_seq, sample_name, left_flanking, right_flanking, tmp,
                   out, bwa_threads, bwa_all):
    """
    Map the left and right flanking reads to a reference sequence with
    bwa mem, then convert, sort and index the alignments with samtools.

    ref_seq: reference sequence record; its `id` is used for file naming
        and the record is written to a temporary fasta file.
    sample_name, tmp, out: passed to set_ref_output_filenames to build the
        output file names.
    left_flanking, right_flanking: read files to map.
    bwa_threads: value passed to `bwa mem -t`.
        NOTE(review): presumably a string, since it is placed directly in
        the command list - confirm against run_command.
    bwa_all: when true, bwa mem is run with `-a` so that all alignments
        are reported.

    Returns the dictionary of file names from set_ref_output_filenames.
    """
    filenames = set_ref_output_filenames(sample_name, ref_seq.id, tmp, out)

    # make temp file
    ref_seq_file = create_tmp_file(ref_seq, filenames['ref_tmp'], 'fasta')

    # index the ref seq
    bwa_index(ref_seq_file)

    # set up samtools
    samtools_runner = RunSamtools()

    # Map each read set exactly once. Previously the '-a' run was followed
    # by an unconditional plain run that overwrote the same SAM files, so
    # bwa_all had no effect on the result.
    bwa_mem = ['bwa', 'mem', '-t', bwa_threads]
    if bwa_all:
        # report all alignments
        bwa_mem.append('-a')
    run_command(
        bwa_mem + [ref_seq_file, left_flanking, '>', filenames['left_sam']],
        shell=True)
    run_command(
        bwa_mem + [ref_seq_file, right_flanking, '>', filenames['right_sam']],
        shell=True)

    # convert sams to bams
    run_command(samtools_runner.view(filenames['left_bam'],
                                     filenames['left_sam']),
                shell=True)
    run_command(samtools_runner.view(filenames['right_bam'],
                                     filenames['right_sam']),
                shell=True)
    # sort bams
    run_command(samtools_runner.sort(filenames['left_sorted'],
                                     filenames['left_bam']),
                shell=True)
    run_command(samtools_runner.sort(filenames['right_sorted'],
                                     filenames['right_bam']),
                shell=True)
    # index sorted bams
    run_command(samtools_runner.index(filenames['left_sorted']), shell=True)
    run_command(samtools_runner.index(filenames['right_sorted']), shell=True)

    return filenames
예제 #9
0
def create_bed_files(filenames, cutoff, merging):
    """
    Build the coverage, merged, intersect and closest BED files for the
    left and right sorted BAM files.

    filenames: dictionary of input and output paths; read here, and
        returned unchanged.
    cutoff: depth cutoff handed to filter_on_depth.
    merging: distance passed to `bedtools merge -d`.

    If the closestBed run on the filtered regions raises BedtoolsError,
    create_typing_output is called with no hits and the function returns
    early. If either unpaired closestBed run fails, an empty output file is
    created for it when none (or an empty one) exists.
    """
    left_bam = filenames['left_sorted']
    right_bam = filenames['right_sorted']

    def coverage(bam_path, cov_key):
        # Per-region coverage of a sorted BAM in bedgraph format
        run_command(['bedtools', 'genomecov', '-ibam', bam_path, '-bg', '>',
                     filenames[cov_key]],
                    shell=True)

    def merge_regions(in_key, out_key):
        # Merge regions that lie within `merging` of each other
        run_command(['bedtools', 'merge', '-d', merging, '-i',
                     filenames[in_key], '>', filenames[out_key]],
                    shell=True)

    def closest(a_key, b_key, out_key):
        # Report, for each region in a, the closest region in b and distance
        run_command(['closestBed', '-a', filenames[a_key], '-b',
                     filenames[b_key], '-d', '>', filenames[out_key]],
                    shell=True)

    def ensure_file(out_key):
        # Create an empty file unless a non-empty one is already there
        path = filenames[out_key]
        if not (os.path.isfile(path) and os.stat(path).st_size != 0):
            open(path, 'w').close()

    # Coverage for both sides, then merged regions of the unfiltered coverage
    coverage(left_bam, 'left_cov')
    coverage(right_bam, 'right_cov')
    merge_regions('left_cov', 'left_merged')
    merge_regions('right_cov', 'right_merged')

    # Keep only the high coverage regions for further analysis
    filter_on_depth(filenames['left_cov'], filenames['left_final_cov'], cutoff)
    filter_on_depth(filenames['right_cov'], filenames['right_final_cov'],
                    cutoff)
    merge_regions('left_final_cov', 'left_merged_bed')
    merge_regions('right_final_cov', 'right_merged_bed')

    # Regions where left and right overlap
    run_command(['bedtools', 'intersect', '-a', filenames['left_merged_bed'],
                 '-b', filenames['right_merged_bed'], '-wo', '>',
                 filenames['intersect']],
                shell=True)

    # Regions that are close but not overlapping
    try:
        closest('left_merged_bed', 'right_merged_bed', 'closest')
    except BedtoolsError:
        # One or more of the inputs are empty: report no hits and stop
        create_typing_output(filenames, None, None, None, None, None)
        return filenames

    # Check unpaired hits against the unfiltered regions of the other side;
    # a failure here just means an empty unpaired file is needed.
    try:
        closest('left_merged_bed', 'right_merged', 'left_unpaired')
    except BedtoolsError:
        ensure_file('left_unpaired')
    try:
        closest('left_merged', 'right_merged_bed', 'right_unpaired')
    except BedtoolsError:
        ensure_file('right_unpaired')

    return filenames
예제 #10
0
def create_bed_files(filenames, cutoff, merging):
    """
    Produce the BED files used for typing from the sorted left and right
    BAM files: coverage, merged regions, depth-filtered merged regions,
    their intersection, and closest-region reports.

    filenames: dictionary of input and output paths (returned as is).
    cutoff: depth cutoff passed to filter_on_depth.
    merging: distance given to `bedtools merge -d`.

    A BedtoolsError from the main closestBed run ends the function early
    after create_typing_output has been called with no hits. A
    BedtoolsError from an unpaired closestBed run results in an empty
    unpaired file being created if no non-empty one exists.
    """
    sorted_left = filenames['left_sorted']
    sorted_right = filenames['right_sorted']

    # Coverage information for each side
    for bam, cov_key in ((sorted_left, 'left_cov'), (sorted_right, 'right_cov')):
        genomecov_cmd = ['bedtools', 'genomecov', '-ibam', bam, '-bg', '>', filenames[cov_key]]
        run_command(genomecov_cmd, shell=True)

    # Merge the unfiltered coverage regions
    for cov_key, merged_key in (('left_cov', 'left_merged'), ('right_cov', 'right_merged')):
        merge_cmd = ['bedtools', 'merge', '-d', merging, '-i', filenames[cov_key], '>', filenames[merged_key]]
        run_command(merge_cmd, shell=True)

    # Keep only regions passing the coverage cutoff
    for cov_key, final_key in (('left_cov', 'left_final_cov'), ('right_cov', 'right_final_cov')):
        filter_on_depth(filenames[cov_key], filenames[final_key], cutoff)

    # Merge the filtered coverage regions
    for final_key, bed_key in (('left_final_cov', 'left_merged_bed'), ('right_final_cov', 'right_merged_bed')):
        merge_cmd = ['bedtools', 'merge', '-d', merging, '-i', filenames[final_key], '>', filenames[bed_key]]
        run_command(merge_cmd, shell=True)

    # Intersecting left/right regions
    intersect_cmd = ['bedtools', 'intersect', '-a', filenames['left_merged_bed'], '-b',
                     filenames['right_merged_bed'], '-wo', '>', filenames['intersect']]
    run_command(intersect_cmd, shell=True)

    # Close but non-overlapping regions; failure means there is nothing to type
    try:
        closest_cmd = ['closestBed', '-a', filenames['left_merged_bed'], '-b', filenames['right_merged_bed'],
                       '-d', '>', filenames['closest']]
        run_command(closest_cmd, shell=True)
    except BedtoolsError:
        create_typing_output(filenames, None, None, None, None, None)
        return filenames

    # Unpaired hits: each filtered side against the unfiltered other side.
    # On failure, make sure an (empty) unpaired file exists for later steps.
    unpaired_runs = (
        ('left_merged_bed', 'right_merged', 'left_unpaired'),
        ('left_merged', 'right_merged_bed', 'right_unpaired'),
    )
    for a_key, b_key, unpaired_key in unpaired_runs:
        try:
            unpaired_cmd = ['closestBed', '-a', filenames[a_key], '-b', filenames[b_key],
                            '-d', '>', filenames[unpaired_key]]
            run_command(unpaired_cmd, shell=True)
        except BedtoolsError:
            unpaired_path = filenames[unpaired_key]
            missing = not os.path.isfile(unpaired_path)
            if missing or os.stat(unpaired_path).st_size == 0:
                with open(unpaired_path, 'w'):
                    pass

    return filenames
예제 #11
0
def map_to_ref_seq(ref_seq, sample_name, left_flanking, right_flanking, tmp, out, bwa_threads, bwa_all):
    """
    Map the left and right flanking reads to a reference sequence with
    bwa mem, then convert, sort and index the alignments with samtools.

    ref_seq: reference sequence record; its `id` is used for file naming
        and the record is written to a temporary fasta file.
    sample_name, tmp, out: passed to set_ref_output_filenames to build the
        output file names.
    left_flanking, right_flanking: read files to map.
    bwa_threads: value passed to `bwa mem -t`.
        NOTE(review): presumably a string, since it is placed directly in
        the command list - confirm against run_command.
    bwa_all: when true, bwa mem is run with `-a` so that all alignments
        are reported.

    Returns the dictionary of file names from set_ref_output_filenames.
    """
    filenames = set_ref_output_filenames(sample_name, ref_seq.id, tmp, out)

    # make temp file
    ref_seq_file = create_tmp_file(ref_seq, filenames['ref_tmp'], 'fasta')

    # index the ref seq
    bwa_index(ref_seq_file)

    # set up samtools
    samtools_runner = RunSamtools()

    # The two mapping modes are mutually exclusive. Without the `else`, the
    # plain run overwrote the SAM files written by the '-a' run, so bwa_all
    # had no effect on the output.
    if bwa_all:
        # Map reads to reference, reporting all alignments
        run_command(['bwa', 'mem', '-t', bwa_threads, '-a', ref_seq_file, left_flanking, '>', filenames['left_sam']],
                    shell=True)
        run_command(['bwa', 'mem', '-t', bwa_threads, '-a', ref_seq_file, right_flanking, '>', filenames['right_sam']],
                    shell=True)
    else:
        # map reads to the reference sequence
        run_command(['bwa', 'mem', '-t', bwa_threads, ref_seq_file, left_flanking, '>', filenames['left_sam']],
                    shell=True)
        run_command(['bwa', 'mem', '-t', bwa_threads, ref_seq_file, right_flanking, '>', filenames['right_sam']],
                    shell=True)

    # convert sams to bams
    run_command(samtools_runner.view(filenames['left_bam'], filenames['left_sam']), shell=True)
    run_command(samtools_runner.view(filenames['right_bam'], filenames['right_sam']), shell=True)
    # sort bams
    run_command(samtools_runner.sort(filenames['left_sorted'], filenames['left_bam']), shell=True)
    run_command(samtools_runner.sort(filenames['right_sorted'], filenames['right_bam']), shell=True)
    # index sorted bams
    run_command(samtools_runner.index(filenames['left_sorted']), shell=True)
    run_command(samtools_runner.index(filenames['right_sorted']), shell=True)

    return filenames
예제 #12
0
def map_to_is_query(sample, is_query, output_sample, min_clip, max_clip,
                    threads):
    """
    Map a sample's reads to an IS query and collect the reads flanking it.

    Steps, in order:
    - create the output and tmp folders for this IS inside output_sample
    - set up the output file names and write the query to a temp fasta
    - index the query with bwa and map the sample's forward and reverse
      reads to it
    - use samtools view to pull the left and right flanking reads into BAM
      files and convert those to fastq with bedtools
    - extract the soft clipped reads between min_clip and max_clip bp
    - concatenate the clipped reads and the flanking reads into the final
      left and right fastq files

    Returns a tuple of (left final reads path, right final reads path,
    IS output folder, IS tmp folder).
    """
    samtools_runner = RunSamtools()

    # Output folders: <output_sample>/<query id> and its tmp subfolder
    query_dir = os.path.join(output_sample, is_query.id)
    query_tmp_dir = os.path.join(output_sample, is_query.id, 'tmp')
    make_directories([query_dir, query_tmp_dir])
    logging.info('Created output folder %s', query_dir)

    filenames = set_output_filenames(query_tmp_dir, sample.prefix,
                                     is_query.id, query_dir)

    # Write the IS query to a temp fasta, index it, and map the reads to it
    query_fasta = create_tmp_file(is_query, filenames['query_tmp'], 'fasta')
    bwa_index(query_fasta)
    forward_reads = str(sample.forward)
    reverse_reads = str(sample.reverse)
    mapping_cmd = ['bwa', 'mem', '-t', threads, query_fasta, forward_reads,
                   reverse_reads, '>', filenames['sam']]
    run_command(mapping_cmd, shell=True)

    # Pull out the unmapped reads flanking the IS, one BAM per side
    flank_filters = (
        ('left_bam', {'smallF': 36}),
        ('right_bam', {'smallF': 4, 'bigF': 40}),
    )
    for bam_key, flag_kwargs in flank_filters:
        view_cmd = samtools_runner.view(filenames[bam_key], filenames['sam'],
                                        **flag_kwargs)
        run_command(view_cmd, shell=True)

    # Turn the flanking BAM files into fastq reads for mapping
    for bam_key, reads_key in (('left_bam', 'left_reads'),
                               ('right_bam', 'right_reads')):
        to_fastq_cmd = ['bedtools', 'bamtofastq', '-i', filenames[bam_key],
                        '-fq', filenames[reads_key]]
        run_command(to_fastq_cmd, shell=True)

    # Extract reads that are only partially mapped (soft clipped)
    logging.info(
        'Extracting soft clipped reads that are <= %s bp and >= %s bp',
        str(max_clip), str(min_clip))
    extract_clipped_reads(filenames['sam'], min_clip, max_clip,
                          filenames['left_clipped'],
                          filenames['right_clipped'])

    # Final fastq per side = clipped reads followed by flanking reads
    for clipped_key, reads_key, final_key in (
            ('left_clipped', 'left_reads', 'left_final'),
            ('right_clipped', 'right_reads', 'right_final')):
        concat_cmd = ['cat', filenames[clipped_key], filenames[reads_key],
                      '>', filenames[final_key]]
        run_command(concat_cmd, shell=True)

    logging.info(
        'Successfully extracted reads flanking left and right end of IS query')

    left_final = filenames['left_final']
    right_final = filenames['right_final']
    return left_final, right_final, query_dir, query_tmp_dir