def parse_m5_record(r_m5_record):
    """Turn one m5 alignment record into clipped (query, reference) pairs.

    Args:
        r_m5_record: mapping providing the keys 'tStrand', 'qStrand',
            'qAlignedSeq', 'tAlignedSeq', 'tStart' and 'tName'.

    Returns:
        Tuple ``(alignVals, genome_loc, start_clipped_bases,
        end_clipped_bases)``; all four values come from
        ``clip_m5_alignment`` (only their order is changed here).

    Raises:
        NotImplementedError: if the record's 'tStrand' is not '+'.
    """
    if r_m5_record['tStrand'] != '+':
        # Call form of raise: valid on both Python 2 and Python 3.
        raise NotImplementedError(
            'Mapping indicates negative strand reference mapping.')

    q_aligned = r_m5_record['qAlignedSeq']
    t_aligned = r_m5_record['tAlignedSeq']
    if r_m5_record['qStrand'] != "+":
        # Query not on the plus strand: both aligned strings go through
        # nh.rev_comp before being paired up.
        q_aligned = nh.rev_comp(q_aligned)
        t_aligned = nh.rev_comp(t_aligned)
    # list() keeps the pairs a concrete list even where zip is lazy.
    alignVals = list(zip(q_aligned, t_aligned))

    alignVals, start_clipped_bases, end_clipped_bases, genome_loc \
        = clip_m5_alignment(
            alignVals, int(r_m5_record['tStart']),
            r_m5_record['qStrand'], r_m5_record['tName'])

    return (alignVals, genome_loc, start_clipped_bases,
            end_clipped_bases)
def align_to_genome(basecalls, genome_filename, graphmap_path):
    """Align one read to a genome with graphmap and return the alignment.

    The read is written to a temporary FASTA file, graphmap is run with
    m5 output into a second temporary file, and the first line of that
    output is parsed against ``GRAPHMAP_FIELDS``. Both temporary files
    are removed before returning or raising.

    Args:
        basecalls: sequence of base characters for the read; the first
            five are also used as the FASTA record name.
        genome_filename: reference path passed to graphmap via ``-r``.
        graphmap_path: path of the graphmap executable.

    Returns:
        Tuple ``(alignVals, genome_loc)`` where ``alignVals`` is the list
        of (query base, reference base) pairs and ``genome_loc`` is
        ``genomeLoc(int(tStart), qStrand, tName)``.

    Raises:
        OSError: if running graphmap or reading its output fails.
        NotImplementedError: if no complete alignment line was produced
            or the alignment's 'tStrand' is not '+'.
    """
    tmp_paths = []
    try:
        # Text mode: the FASTA content is built from str values.
        read_fp = NamedTemporaryFile(
            mode='w', delete=False, suffix='.fasta')
        tmp_paths.append(read_fp.name)
        try:
            read_fp.write(">" + ''.join(basecalls[:5]) + '\n' +
                          ''.join(basecalls) + '\n')
        finally:
            read_fp.close()

        # Only the path is needed; graphmap writes the file itself.
        out_fp = NamedTemporaryFile(delete=False)
        tmp_paths.append(out_fp.name)
        out_fp.close()

        try:
            # suppress output from graphmap with devnull sink
            with open(os.devnull, 'w') as FNULL:
                call([graphmap_path, 'align',
                      '-r', genome_filename,
                      '-d', read_fp.name,
                      '-o', out_fp.name,
                      '-L', 'm5'],
                     stdout=FNULL, stderr=STDOUT)
            # Re-open by name so the read sees what graphmap wrote.
            with open(out_fp.name) as aln_fp:
                alignment = dict(zip(GRAPHMAP_FIELDS,
                                     aln_fp.readline().split()))
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit propagate.
            raise OSError('Problem running/parsing graphmap. Ensure ' +
                          'you have a compatible version installed.')
    finally:
        # delete=False means nothing else cleans these files up.
        for tmp_path in tmp_paths:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; never mask the real outcome.
                pass

    ## flip read and genome to match raw signal if neg strand mapped
    if len(alignment) != len(GRAPHMAP_FIELDS):
        raise NotImplementedError('Graphmap did not produce alignment.')
    if alignment['tStrand'] != '+':
        raise NotImplementedError(
            'Graphmap indicates negative strand reference mapping.')

    q_aligned = alignment['qAlignedSeq']
    t_aligned = alignment['tAlignedSeq']
    if alignment['qStrand'] != "+":
        q_aligned = rev_comp(q_aligned)
        t_aligned = rev_comp(t_aligned)
    alignVals = list(zip(q_aligned, t_aligned))

    return alignVals, genomeLoc(
        int(alignment['tStart']), alignment['qStrand'],
        alignment['tName'])
def write_most_signif(
        files1, files2, num_regions, qval_thresh, corrected_group,
        basecall_subgroups, seqs_fn, num_bases, test_type, obs_filter,
        min_test_vals, stats_fn, fasta_fn, fishers_method_offset):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Statistics are loaded with ``ns.parse_stats`` when ``stats_fn`` names
    an existing file; otherwise they are computed from the two read sets
    with ``ns.get_all_significance``. Region sequences are sliced from
    ``fasta_fn`` when it is given (regions whose chromosome is missing
    from the FASTA are skipped) and otherwise obtained from
    ``get_region_sequences``. Reads are only parsed when one of those
    two steps needs them.

    Each output record is ``>chrm::start::strand stat`` followed by the
    sequence on its own line; for '-' strand regions the sequence is
    passed through ``nh.rev_comp`` first.

    Returns:
        None.
    """
    stats_on_disk = stats_fn is not None and os.path.isfile(stats_fn)
    calc_stats = not stats_on_disk
    need_reads = calc_stats or fasta_fn is None

    if stats_on_disk:
        if VERBOSE:
            sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    if need_reads:
        if VERBOSE:
            sys.stderr.write('Parsing files.\n')
        raw_read_coverage1 = nh.parse_fast5s(
            files1, corrected_group, basecall_subgroups)
        raw_read_coverage2 = nh.parse_fast5s(
            files2, corrected_group, basecall_subgroups)
        raw_read_coverage1 = nh.filter_reads(
            raw_read_coverage1, obs_filter)
        raw_read_coverage2 = nh.filter_reads(
            raw_read_coverage2, obs_filter)

    if calc_stats:
        if VERBOSE:
            sys.stderr.write('Calculating statistics.\n')
        all_stats = ns.get_all_significance(
            raw_read_coverage1, raw_read_coverage2, test_type,
            min_test_vals, stats_fn, fishers_method_offset)

    plot_intervals = ns.get_most_signif_regions(
        all_stats, num_bases, num_regions, qval_thresh)

    if fasta_fn is not None:
        fasta_records = nh.parse_fasta(fasta_fn)
        reg_seqs = []
        for region_id, (chrm, start, strand, reg_name) in plot_intervals:
            if chrm not in fasta_records:
                continue
            window = fasta_records[chrm][start:start + num_bases]
            reg_seqs.append((region_id, window))
    else:
        reg_seqs = get_region_sequences(
            plot_intervals, raw_read_coverage1, raw_read_coverage2,
            num_bases, corrected_group)

    # write one record per region that has a sequence
    if VERBOSE:
        sys.stderr.write('Outputting region seqeuences.\n')
    with open(seqs_fn, 'w') as seqs_fp:
        for region_id, region_seq in reg_seqs:
            # first interval entry carrying this region id
            region_info = next(
                info for other_id, info in plot_intervals
                if other_id == region_id)
            chrm, start, strand, stat = region_info
            if strand == '-':
                region_seq = nh.rev_comp(region_seq)
            record = '>{0}::{1:d}::{2} {3}\n{4}\n'.format(
                chrm, start, strand, stat, ''.join(region_seq))
            seqs_fp.write(record)
def parse_sam_record(r_sam_record, genome_index):
    """Build a pairwise alignment from one SAM record and the reference.

    Args:
        r_sam_record: mapping providing the keys 'cigar', 'flag', 'seq',
            'rName' and 'pos'.
        genome_index: mapping from reference name to a sliceable
            reference sequence.

    Returns:
        Tuple ``(alignVals, genome_loc, start_clipped_bases,
        end_clipped_bases)``. ``alignVals`` is a list of
        (query base, reference base) pairs with '-' marking gaps.
        ``genome_loc`` is ``genomeLoc(start, strand, rName)`` where
        ``start`` is the 0-based leftmost reference position covered by
        ``alignVals``. The clipped counts are the query bases dropped
        before/after the aligned part (hard clips, soft clips and
        insertions at the alignment edges).

    Raises:
        RuntimeError: if the cigar string yields no operations, or none
            that align query bases to reference bases.
        AssertionError: if the query length disagrees with the cigar.
    """
    # parse cigar string
    cigar = [(int(reg_len), reg_type) for reg_len, reg_type in
             CIGAR_PAT.findall(r_sam_record['cigar'])]
    if len(cigar) < 1:
        raise RuntimeError('Invalid cigar string produced.')

    strand = '-' if int(r_sam_record['flag']) & 0x10 else '+'
    if strand == '-':
        # work in read orientation for reverse-strand mappings
        cigar = cigar[::-1]

    # record clipped bases and remove from query seq as well as cigar
    qSeq = r_sam_record['seq']
    if strand == '-':
        qSeq = nh.rev_comp(qSeq)
    start_clipped_bases = 0
    end_clipped_bases = 0

    # hard clips (H): bases are only counted, nothing is cut from qSeq
    if cigar and cigar[0][1] == 'H':
        start_clipped_bases += cigar[0][0]
        cigar = cigar[1:]
    if cigar and cigar[-1][1] == 'H':
        end_clipped_bases += cigar[-1][0]
        cigar = cigar[:-1]
    # soft clips (S): bases are counted and cut from qSeq
    if cigar and cigar[0][1] == 'S':
        start_clipped_bases += cigar[0][0]
        qSeq = qSeq[cigar[0][0]:]
        cigar = cigar[1:]
    if cigar and cigar[-1][1] == 'S':
        end_clipped_bases += cigar[-1][0]
        qSeq = qSeq[:len(qSeq) - cigar[-1][0]]
        cigar = cigar[:-1]

    ref_start = int(r_sam_record['pos']) - 1
    tLen = sum(reg_len for reg_len, reg_type in cigar
               if reg_type in 'MDN=X')
    tSeq = genome_index[r_sam_record['rName']][
        ref_start:ref_start + tLen]
    if strand == '-':
        tSeq = nh.rev_comp(tSeq)

    # Make the cigar start and end with matched bases. Per the SAM
    # CIGAR definition I consumes query only, D/N consume reference
    # only and P consumes neither, so edge insertions are cut from the
    # query (and counted as clipped) while edge deletions are cut from
    # the reference.
    lead_ref_trim = 0
    while cigar and cigar[0][1] not in 'M=X':
        reg_len, reg_type = cigar[0]
        if reg_type in 'DN':
            tSeq = tSeq[reg_len:]
            lead_ref_trim += reg_len
        elif reg_type == 'I':
            qSeq = qSeq[reg_len:]
            start_clipped_bases += reg_len
        cigar = cigar[1:]
    trail_ref_trim = 0
    while cigar and cigar[-1][1] not in 'M=X':
        reg_len, reg_type = cigar[-1]
        if reg_type in 'DN':
            tSeq = tSeq[:len(tSeq) - reg_len]
            trail_ref_trim += reg_len
        elif reg_type == 'I':
            qSeq = qSeq[:len(qSeq) - reg_len]
            end_clipped_bases += reg_len
        cigar = cigar[:-1]
    if not cigar:
        raise RuntimeError('Invalid cigar string produced.')

    # Reference bases trimmed at the left end of the reference span
    # move the reported start; for '-' strand that is the trailing end
    # because cigar and tSeq are in read orientation.
    if strand == '+':
        ref_start += lead_ref_trim
    else:
        ref_start += trail_ref_trim

    qLen = sum(reg_len for reg_len, reg_type in cigar
               if reg_type in 'MI=X')
    assert len(qSeq) == qLen, 'Read sequence from SAM and ' + \
        'cooresponding cigar string do not agree.'

    # create pairwise alignment via zipped pairs
    alignVals = []
    q_pos = 0
    t_pos = 0
    for reg_len, reg_type in cigar:
        if reg_type in 'M=X':
            alignVals.extend(zip(qSeq[q_pos:q_pos + reg_len],
                                 tSeq[t_pos:t_pos + reg_len]))
            q_pos += reg_len
            t_pos += reg_len
        elif reg_type == 'I':
            alignVals.extend(zip(qSeq[q_pos:q_pos + reg_len],
                                 repeat('-')))
            q_pos += reg_len
        elif reg_type in 'DN':
            alignVals.extend(zip(repeat('-'),
                                 tSeq[t_pos:t_pos + reg_len]))
            t_pos += reg_len
        # P (padding) consumes neither sequence: nothing to emit

    return (alignVals,
            genomeLoc(ref_start, strand, r_sam_record['rName']),
            start_clipped_bases, end_clipped_bases)