Пример #1
0
def main(args, outs):
    """Run the configured aligner on one chunk of reads.

    The chunk (``args.chunk``) provides ``read1``, ``read2``,
    ``reads_interleaved`` and ``read_group``.  Output is directed to
    ``outs.default`` through ``tenkit.align.Aligner``.
    """
    chunk = args.chunk

    if chunk['reads_interleaved']:
        # Interleaved input is handed to the aligner as a single value.
        reads = chunk['read1']
    else:
        # Separate files: both mates are required.
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        mate = chunk['read2']
        if mate is None:
            reads = [chunk['read1']]
        else:
            reads = [chunk['read1'], mate]

    chunk_aligner = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner

    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    chunk_aligner.output_alignment(
        aligner=aligner_name,
        aligner_params={
            'ref_fasta': reference_fasta,
            'algorithm': args.aligner_method,
        },
        num_threads=args.__threads,
        read_group_header=rg_header)
Пример #2
0
def main(args, outs):
    """Run the configured aligner on one chunk of reads.

    The chunk (``args.chunk``) provides ``read1``, ``read2``,
    ``reads_interleaved`` and ``read_group``.  Output is directed to
    ``outs.output`` through ``tenkit.align.Aligner``.
    """
    # Assigning an empty list here silences a weird non-failure in
    # --strict=error mode.
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.default = []

    chunk = args.chunk

    if chunk['reads_interleaved']:
        # Interleaved input is handed to the aligner as a single value.
        reads = chunk['read1']
    else:
        # Separate files: both mates are required.
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        mate = chunk['read2']
        if mate is None:
            reads = [chunk['read1']]
        else:
            reads = [chunk['read1'], mate]

    chunk_aligner = tenkit.align.Aligner(reads, outs.output)
    aligner_name = args.aligner

    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    chunk_aligner.output_alignment(
        aligner=aligner_name,
        aligner_params={
            'ref_fasta': reference_fasta,
            'algorithm': args.aligner_method,
        },
        num_threads=args.__threads,
        read_group_header=rg_header)
Пример #3
0
def create_unaligned_bam(args, outs):
    """Write one chunk of FASTQ reads to ``outs.genome_output`` as unaligned BAM.

    The BAM header is assembled from an ``@HD`` line, one ``@SQ`` line per
    row of ``chrNameLength.txt`` in the STAR reference directory, and one
    ``@RG`` line per entry of ``args.read_groups``.  Every read from
    ``args.read_chunk`` -- followed by its mate from ``args.read2_chunk``
    when that is set -- is written with flag 4 (segment unmapped) and an
    ``RG`` tag equal to ``args.read_group``.

    Side effect: a scratch header file named ``tmphdr`` is written to the
    current working directory and is left in place afterwards.

    All pysam/FASTQ handles are closed even if reading or writing fails.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_lines = ['@HD\tVN:1.4\n']

    # SQ header lines
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as f:
        for line in f:
            chr_name, chr_len = line.strip().split('\t')
            header_lines.append(
                '@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines. make_rg_header output is expected to carry literal
    # backslash-t sequences; turn them into real tabs for the SAM header.
    for packed_rg in args.read_groups:
        header_lines.append(
            tk_bam.make_rg_header(packed_rg).replace('\\t', '\t') + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object
    with open('tmphdr', 'w') as f:
        f.write(''.join(header_lines))

    # Handles are registered as they are opened so that the finally clause
    # closes exactly the ones that exist, whatever step failed.
    opened = []
    try:
        samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)
        opened.append(samfile)

        outbam = pysam.AlignmentFile(
            outs.genome_output, 'wb', template=samfile)
        opened.append(outbam)

        fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
        opened.append(fastq_file1)

        fastq_file2 = None
        if args.read2_chunk:
            fastq_file2 = cr_io.open_maybe_gzip(args.read2_chunk)
            opened.append(fastq_file2)

        read1s = tk_fasta.read_generator_fastq(fastq_file1)
        read2s = (tk_fasta.read_generator_fastq(fastq_file2)
                  if fastq_file2 else [])

        # A single record object is reused for every write.
        record = pysam.AlignedSegment()
        record.flag = 4

        def write_unaligned(read):
            """Fill the shared record from a (name, seq, qual) read and write it."""
            name, seq, qual = read
            # Only the part of the FASTQ name before the first space is kept.
            record.query_name = name.split(' ')[0]
            # Sequence first, qualities second: pysam documents that
            # assigning query_sequence resets query_qualities.
            record.query_sequence = seq
            record.query_qualities = tk_fasta.get_qvs(qual)
            record.set_tag('RG', read_group, 'Z')
            outbam.write(record)

        for read1, read2 in itertools.izip_longest(read1s, read2s):
            write_unaligned(read1)
            if read2 is not None:
                write_unaligned(read2)
    finally:
        for handle in reversed(opened):
            handle.close()
Пример #4
0
def bwa_align_unpaired(ref_fasta,
                       read_fastq,
                       out_name,
                       algorithm='ALN',
                       max_hits=None,
                       read_group_header=None,
                       num_threads=24):
    """Run bwa on reads without using pairing information; write BAM to out_name.

    Args:
        ref_fasta: reference path handed to bwa.
        read_fastq: a single reads file (must not be a list).
        out_name: path of the final BAM.  ``out_name + '.sam'`` (and
            ``out_name + '.sai'`` for ALN) are used as temporary files and
            removed after a successful run.
        algorithm: ``'MEM'`` (``bwa mem``) or ``'ALN'`` (``bwa aln`` followed
            by ``bwa samse``).
        max_hits: ALN only; when truthy it is passed to ``bwa samse -n``.
        read_group_header: MEM only; passed to ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: value for bwa's ``-t`` option.

    Raises:
        AssertionError: if ``read_fastq`` is a list.
        NotSupportedException: if ``algorithm`` is neither 'MEM' nor 'ALN'.
    """
    assert not isinstance(read_fastq, list)

    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    # Temp file shared by both algorithms
    sam_name = out_name + '.sam'

    if algorithm == 'MEM':
        # "with" guarantees the SAM handle is closed even when bwa fails.
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call([
                'bwa', 'mem', '-t',
                str(num_threads), '-M', '-R', read_group_header, ref_fasta,
                read_fastq
            ],
                                      stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', sam_name])

    elif algorithm == 'ALN':
        index_name = out_name + '.sai'

        with open(index_name, 'w') as index_file:
            log_subprocess.check_call(
                ['bwa', 'aln', '-t',
                 str(num_threads), ref_fasta, read_fastq],
                stdout=index_file)

        # Same samse call either way; -n is added only when max_hits is set.
        samse_cmd = ['bwa', 'samse']
        if max_hits:
            samse_cmd += ['-n', str(max_hits)]
        samse_cmd += [ref_fasta, index_name, read_fastq]

        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(samse_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', index_name])
        subprocess.check_call(['rm', sam_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
Пример #5
0
def _bwa_mem_to_bam(bwa_cmd, out_name):
    """Pipe the output of ``bwa_cmd`` through ``samtools view -bSh -`` into out_name.

    samtools' stderr is captured in ``out_name + '_ERRORS'``.  After samtools
    finishes, the bwa process is reaped and a non-zero bwa exit status is
    raised as ``subprocess.CalledProcessError`` so that a failed aligner does
    not silently leave a truncated BAM behind.
    """
    with open(out_name, 'w') as out_file, \
            open(out_name + '_ERRORS', 'w') as errors_file:
        # NOTE(review): assumes log_subprocess.Popen returns a
        # subprocess.Popen-like object (stdout / wait) -- confirm.
        ps = log_subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE)
        try:
            log_subprocess.check_call(['samtools', 'view', '-bSh', '-'],
                                      stdin=ps.stdout,
                                      stdout=out_file,
                                      stderr=errors_file)
        finally:
            # Close our end of the pipe first so bwa cannot block on a
            # reader that has gone away, then reap the process.
            ps.stdout.close()
            bwa_status = ps.wait()

    if bwa_status != 0:
        raise subprocess.CalledProcessError(bwa_status, bwa_cmd)


def bwa_align_paired(ref_fasta,
                     read_fastq,
                     out_name,
                     algorithm='ALN',
                     max_hits=None,
                     read_group_header=None,
                     num_threads=24):
    """Run the bwa paired-end aligner on reads and write a BAM to out_name.

    Algorithm choices are currently
    MEM: ``bwa mem`` (better for longer reads).  ``read_fastq`` is either a
         list of exactly two files (read 1, read 2) or a single interleaved
         file, in which case ``-p`` is passed to bwa.
    ALN: ``bwa aln`` + ``bwa sampe`` (better for shorter reads).
         ``read_fastq`` must be a single interleaved file, i.e. the reads of
         a pair are alternating; it is split into two temporary files.
    Haven't yet implemented BWA-SW.

    Args:
        ref_fasta: reference path handed to bwa.
        read_fastq: reads input, see above.
        out_name: path of the final BAM; also the prefix of the temporary
            files (removed after a successful ALN run) and of the
            ``_ERRORS`` file written by the MEM path.
        algorithm: ``'MEM'`` or ``'ALN'``.
        max_hits: ALN only; when truthy it is passed to ``bwa sampe -n``.
        read_group_header: MEM only; passed to ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: value for bwa's ``-t`` option.

    Raises:
        AssertionError: MEM with a list that does not hold exactly two files.
        subprocess.CalledProcessError: MEM when ``bwa mem`` exits non-zero.
        NotSupportedException: if ``algorithm`` is neither 'MEM' nor 'ALN'.
    """
    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        bwa_cmd = ['bwa', 'mem']
        if isinstance(read_fastq, list):
            assert len(read_fastq) == 2
            fastq_args = [read_fastq[0], read_fastq[1]]
        else:
            # Single file: tell bwa that mates are interleaved.
            bwa_cmd.append('-p')
            fastq_args = [read_fastq]
        bwa_cmd += [
            '-t',
            str(num_threads), '-M', '-R', read_group_header, ref_fasta
        ]
        bwa_cmd += fastq_args

        _bwa_mem_to_bam(bwa_cmd, out_name)

    elif algorithm == 'ALN':
        # Temp file names
        temp_fastq_name1 = out_name + '1.fastq'
        temp_fastq_name2 = out_name + '2.fastq'
        index_name_1 = out_name + '1.sai'
        index_name_2 = out_name + '2.sai'
        sam_name = out_name + '.sam'

        # Create the temp non-interleaved files
        with open(read_fastq, 'r') as in_fastq, \
                open(temp_fastq_name1, 'w') as temp_fastq1, \
                open(temp_fastq_name2, 'w') as temp_fastq2:
            tk_fasta.uninterleave_fastq(in_fastq, temp_fastq1, temp_fastq2)

        # Create the bwa index files, one per mate
        for fastq_name, index_name in ((temp_fastq_name1, index_name_1),
                                       (temp_fastq_name2, index_name_2)):
            with open(index_name, 'w') as index_file:
                log_subprocess.check_call([
                    'bwa', 'aln', '-t',
                    str(num_threads), ref_fasta, fastq_name
                ],
                                          stdout=index_file)

        # Create the SAM file; -n is added only when max_hits is set.
        sampe_cmd = ['bwa', 'sampe']
        if max_hits:
            sampe_cmd += ['-n', str(max_hits)]
        sampe_cmd += [
            ref_fasta, index_name_1, index_name_2, temp_fastq_name1,
            temp_fastq_name2
        ]
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(sampe_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Clean up temporary files
        for temp_name in (temp_fastq_name1, temp_fastq_name2, index_name_1,
                          index_name_2, sam_name):
            subprocess.check_call(['rm', temp_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)