예제 #1
0
def parse_m5_record(r_m5_record):
    """Turn a parsed m5 alignment record into clipped alignment pairs.

    Args:
        r_m5_record: Mapping holding the m5 fields read here: 'tStrand',
            'qStrand', 'qAlignedSeq', 'tAlignedSeq', 'tStart' and 'tName'.

    Returns:
        Tuple ``(alignVals, genome_loc, start_clipped_bases,
        end_clipped_bases)`` built from the values returned by
        ``clip_m5_alignment``.

    Raises:
        NotImplementedError: If the target (reference) strand is not '+'.
    """
    if r_m5_record['tStrand'] != '+':
        # call-form raise is valid on both Python 2 and Python 3
        raise NotImplementedError(
            'Mapping indicates negative strand reference mapping.')

    # pair up query/target alignment columns; reverse complement both
    # sequences when the query mapped to the negative strand
    if r_m5_record['qStrand'] == "+":
        alignVals = list(zip(r_m5_record['qAlignedSeq'],
                             r_m5_record['tAlignedSeq']))
    else:
        alignVals = list(zip(nh.rev_comp(r_m5_record['qAlignedSeq']),
                             nh.rev_comp(r_m5_record['tAlignedSeq'])))

    alignVals, start_clipped_bases, end_clipped_bases, genome_loc \
        = clip_m5_alignment(
            alignVals, int(r_m5_record['tStart']),
            r_m5_record['qStrand'], r_m5_record['tName'])

    return (alignVals, genome_loc, start_clipped_bases, end_clipped_bases)
예제 #2
0
def align_to_genome(basecalls, genome_filename, graphmap_path):
    """Align a read to a genome with graphmap and return the alignment.

    The read is written to a temporary FASTA file, graphmap is run with m5
    output, and the first line of that output is parsed. Both temporary
    files are removed before returning or raising.

    Args:
        basecalls: Sequence of base strings; joined to form the read. The
            first five entries are joined to form the FASTA record name.
        genome_filename: Reference passed to graphmap via ``-r``.
        graphmap_path: Path of the graphmap executable.

    Returns:
        Tuple ``(alignVals, genome_location)`` where ``alignVals`` is a list
        of (query base, target base) pairs and ``genome_location`` is
        ``genomeLoc(tStart, qStrand, tName)``.

    Raises:
        OSError: If graphmap cannot be run or its output cannot be read.
        NotImplementedError: If no alignment was produced or the target
            strand is not '+'.
    """
    ## align to genome
    # text mode so the str payload can be written on any Python version
    read_fp = NamedTemporaryFile(mode='w', delete=False, suffix='.fasta')
    out_fn = None
    try:
        read_fp.write(">" + ''.join(basecalls[:5]) + '\n' +
                      ''.join(basecalls) + '\n')
        read_fp.close()
        # only the path is needed: graphmap writes to it by name and the
        # result is re-opened for reading below
        out_fp = NamedTemporaryFile(delete=False)
        out_fn = out_fp.name
        out_fp.close()
        try:
            # suppress output from graphmap with devnull sink
            with open(os.devnull, 'w') as FNULL:
                call([
                    graphmap_path, 'align', '-r', genome_filename, '-d',
                    read_fp.name, '-o', out_fn, '-L', 'm5'
                ],
                     stdout=FNULL,
                     stderr=STDOUT)

            with open(out_fn) as align_fp:
                alignment = dict(
                    zip(GRAPHMAP_FIELDS, align_fp.readline().split()))
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate
            raise OSError('Problem running/parsing graphmap. Ensure ' +
                          'you have a compatible version installed.')
    finally:
        # delete=False files are never removed automatically
        read_fp.close()
        for tmp_fn in (read_fp.name, out_fn):
            if tmp_fn is not None and os.path.exists(tmp_fn):
                os.remove(tmp_fn)

    # an empty or truncated output line leaves some fields missing
    if len(alignment) != len(GRAPHMAP_FIELDS):
        raise NotImplementedError('Graphmap did not produce alignment.')

    if alignment['tStrand'] != '+':
        raise NotImplementedError(
            'Graphmap indicates negative strand reference mapping.')

    ## flip read and genome to match raw signal if neg strand mapped
    if alignment['qStrand'] == "+":
        alignVals = list(
            zip(alignment['qAlignedSeq'], alignment['tAlignedSeq']))
    else:
        alignVals = list(
            zip(rev_comp(alignment['qAlignedSeq']),
                rev_comp(alignment['tAlignedSeq'])))

    return alignVals, genomeLoc(int(alignment['tStart']), alignment['qStrand'],
                                alignment['tName'])
예제 #3
0
def write_most_signif(files1, files2, num_regions, qval_thresh,
                      corrected_group, basecall_subgroups, seqs_fn, num_bases,
                      test_type, obs_filter, min_test_vals, stats_fn, fasta_fn,
                      fishers_method_offset):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Statistics are loaded from ``stats_fn`` when it names an existing file
    and are computed from the two read sets otherwise. Region sequences are
    taken from ``fasta_fn`` when it is given, and from the reads when it is
    not. Each region is written as a FASTA-style record, reverse
    complemented for '-' strand regions.
    """
    have_stats_file = stats_fn is not None and os.path.isfile(stats_fn)
    need_reads = fasta_fn is None or not have_stats_file

    if have_stats_file:
        if VERBOSE:
            sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    # reads are needed to compute statistics and/or to derive sequences
    if need_reads:
        if VERBOSE:
            sys.stderr.write('Parsing files.\n')
        parsed_reads1 = nh.parse_fast5s(
            files1, corrected_group, basecall_subgroups)
        parsed_reads2 = nh.parse_fast5s(
            files2, corrected_group, basecall_subgroups)
        raw_read_coverage1 = nh.filter_reads(parsed_reads1, obs_filter)
        raw_read_coverage2 = nh.filter_reads(parsed_reads2, obs_filter)

    if not have_stats_file:
        if VERBOSE:
            sys.stderr.write('Calculating statistics.\n')
        all_stats = ns.get_all_significance(
            raw_read_coverage1, raw_read_coverage2, test_type,
            min_test_vals, stats_fn, fishers_method_offset)

    plot_intervals = ns.get_most_signif_regions(
        all_stats, num_bases, num_regions, qval_thresh)

    if fasta_fn is not None:
        # slice each region out of the reference, skipping regions whose
        # chromosome is absent from the FASTA file
        fasta_records = nh.parse_fasta(fasta_fn)
        reg_seqs = []
        for p_int, (chrm, start, strand, reg_name) in plot_intervals:
            if chrm in fasta_records:
                ref_seq = fasta_records[chrm][start:start + num_bases]
                reg_seqs.append((p_int, ref_seq))
    else:
        reg_seqs = get_region_sequences(
            plot_intervals, raw_read_coverage1, raw_read_coverage2,
            num_bases, corrected_group)

    # write one record per region sequence
    if VERBOSE:
        sys.stderr.write('Outputting region seqeuences.\n')
    with open(seqs_fn, 'w') as seqs_fp:
        for reg_i, reg_seq in reg_seqs:
            # first interval whose identifier matches this sequence
            chrm, start, strand, stat = next(
                interval for interval_i, interval in plot_intervals
                if interval_i == reg_i)
            out_seq = nh.rev_comp(reg_seq) if strand == '-' else reg_seq
            record = '>{0}::{1:d}::{2} {3}\n{4}\n'.format(
                chrm, start, strand, stat, ''.join(out_seq))
            seqs_fp.write(record)
예제 #4
0
def parse_sam_record(r_sam_record, genome_index):
    """Build a pairwise read/reference alignment from a SAM record.

    Args:
        r_sam_record: Mapping holding the SAM fields read here: 'cigar',
            'flag', 'seq', 'rName' and 'pos' (1-based).
        genome_index: Mapping from reference name to a sliceable reference
            sequence.

    Returns:
        Tuple ``(alignVals, genome_location, start_clipped_bases,
        end_clipped_bases)``. ``alignVals`` is a list of
        (query base, reference base) pairs with '-' marking a gap,
        ``genome_location`` is ``genomeLoc(start, strand, rName)`` with a
        0-based start, and the clipped counts are the query bases removed
        from each end (in read orientation).

    Raises:
        RuntimeError: If the CIGAR string has no aligned (M/=/X) operation.
        AssertionError: If the read sequence length disagrees with the
            CIGAR string.
    """
    # parse cigar string
    cigar = [(int(reg_len), reg_type)
             for reg_len, reg_type in CIGAR_PAT.findall(r_sam_record['cigar'])]
    # at least one aligned stretch is required; this also covers an empty
    # cigar and keeps the edge-trimming loops below from running off the end
    if not any(reg_type in 'M=X' for _, reg_type in cigar):
        raise RuntimeError('Invalid cigar string produced.')

    strand = '-' if int(r_sam_record['flag']) & 0x10 else '+'
    if strand == '-':
        cigar = cigar[::-1]

    # record clipped bases and remove from query seq as well as cigar
    qSeq = r_sam_record['seq'] if strand == '+' else nh.rev_comp(
        r_sam_record['seq'])
    start_clipped_bases = 0
    end_clipped_bases = 0
    # handle clipping elements (H and S); hard clipped bases are not
    # present in the query sequence, soft clipped bases are
    if cigar[0][1] == 'H':
        start_clipped_bases += cigar[0][0]
        cigar = cigar[1:]
    if cigar[-1][1] == 'H':
        end_clipped_bases += cigar[-1][0]
        cigar = cigar[:-1]
    if cigar[0][1] == 'S':
        start_clipped_bases += cigar[0][0]
        qSeq = qSeq[cigar[0][0]:]
        cigar = cigar[1:]
    if cigar[-1][1] == 'S':
        end_clipped_bases += cigar[-1][0]
        # explicit end index: [:-0] would empty the sequence
        qSeq = qSeq[:len(qSeq) - cigar[-1][0]]
        cigar = cigar[:-1]

    ref_start = int(r_sam_record['pos']) - 1
    tLen = sum(reg_len for reg_len, reg_type in cigar if reg_type in 'MDN=X')
    tSeq = genome_index[r_sam_record['rName']][ref_start:ref_start + tLen]
    if strand == '-': tSeq = nh.rev_comp(tSeq)

    # make the cigar start and end with matched bases. I/P are treated as
    # query-only and everything else as reference-only, matching the
    # tLen/qLen accounting and the pairwise construction below.
    while cigar[0][1] not in 'M=X':
        reg_len, reg_type = cigar[0]
        if reg_type in 'IP':
            # unaligned query bases at the edge count as clipped
            qSeq = qSeq[reg_len:]
            start_clipped_bases += reg_len
        else:
            tSeq = tSeq[reg_len:]
            # on '+' the front of tSeq is the low-coordinate end
            if strand == '+':
                ref_start += reg_len
        cigar = cigar[1:]
    while cigar[-1][1] not in 'M=X':
        reg_len, reg_type = cigar[-1]
        if reg_type in 'IP':
            qSeq = qSeq[:len(qSeq) - reg_len]
            end_clipped_bases += reg_len
        else:
            tSeq = tSeq[:len(tSeq) - reg_len]
            # on '-' tSeq is reverse complemented, so its tail is the
            # low-coordinate end of the reference
            if strand == '-':
                ref_start += reg_len
        cigar = cigar[:-1]

    qLen = sum(reg_len for reg_len, reg_type in cigar if reg_type in 'MIP=X')
    # explicit raise (same exception type as before) so the check survives
    # running under python -O
    if len(qSeq) != qLen:
        raise AssertionError('Read sequence from SAM and ' +
                             'cooresponding cigar string do not agree.')

    # create pairwise alignment via zipped pairs; walk both sequences with
    # offsets rather than re-slicing the remainder on every operation
    alignVals = []
    q_pos = 0
    t_pos = 0
    for reg_len, reg_type in cigar:
        if reg_type in 'M=X':
            alignVals.extend(zip(qSeq[q_pos:q_pos + reg_len],
                                 tSeq[t_pos:t_pos + reg_len]))
            q_pos += reg_len
            t_pos += reg_len
        elif reg_type in 'IP':
            alignVals.extend(zip(qSeq[q_pos:q_pos + reg_len], repeat('-')))
            q_pos += reg_len
        else:
            alignVals.extend(zip(repeat('-'), tSeq[t_pos:t_pos + reg_len]))
            t_pos += reg_len

    return (alignVals,
            genomeLoc(ref_start, strand, r_sam_record['rName']),
            start_clipped_bases, end_clipped_bases)