def _align_to_mature(seq, hairpin, mature):
    """
    Locate the start of a read by aligning it to the mature sequence.

    Args:
        *seq*: sequence of the read, passed to ``align``.

        *hairpin*: precursor data handed to ``get_mature_sequence``.

        *mature*: mature annotation; its first element is converted
            with ``int`` and used as the reference start.

    Returns:
        *int*: number of ``-`` characters in the first 8 characters of the
        first element returned by ``align``, minus 4, plus
        ``int(mature[0])``.
    """
    reference = get_mature_sequence(hairpin, mature)
    aligned_read = align(seq, reference)[0]
    annotated_start = mature[0]
    # NOTE(review): the 8-character window and the -4 offset presumably
    # mirror flanking nucleotides added by get_mature_sequence -- confirm.
    leading_gaps = aligned_read[:8].count("-")
    start = leading_gaps - 4 + int(annotated_start)
    logger.debug("PROST::align:sequence to mature %s" % aligned_read)
    logger.debug("PROST::align:start: %s -> %s" % (annotated_start, start))
    return start
def test_locala(self):
    """
    Testing pairwise alignment.

    Prints the first element returned by ``align`` for three reads
    compared against the same reference: an identical read, a read
    missing one nucleotide and a read with one changed nucleotide.
    """
    from mirtop.mirna.realign import align
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statement is a SyntaxError on Python 3.
    print("\nExamples of perfect match, deletion, mutation")
    print(align("TGAGGTAGTAGGTTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT")[0])
    print(align("TGAGGTGTAGGTTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT")[0])
    print(align("TGAGGTAGTAGGCTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT")[0])
def test_locala(self):
    """
    Testing pairwise alignment.

    Runs ``align`` on a fixed list of (read, reference) pairs and prints
    the first element of each result, in the order listed.
    """
    from mirtop.mirna.realign import align

    examples = (
        ("TGAGTAGTAGGTTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT"),
        ("TGAGGTGTAGGTTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT"),
        ("TGAGGTAGTAGGCTGTATAGTT", "TGAGGTAGTAGGTTGTATAGTT"),
        ("TGANTAGTAGNTTGTATNGTT", "TGAGTAGTAGGTTGTATAGTTT"),
        ("TGANTAGTNGNTTGTATNGTT", "TGAGTATAGGCCTTGTATAGTT"),
        ("NCANAGTCCAAGNTCATN", "TCATAGTCCAAGGTCATG"),
    )
    print("\nExamples of perfect match, deletion, mutation")
    for query, reference in examples:
        print(align(query, reference)[0])
def tune(seq, precursor, start, cigar):
    """
    The actual fn that will realign the sequence.

    Args:
        *seq (str)*: sequence of the read.

        *precursor (str)*: sequence of the precursor.

        *start (int)*: offset into ``precursor`` where the reference
            window begins.

        *cigar*: when truthy, ``cigar_correction`` is used to obtain the
            aligned read and reference; otherwise the read is aligned
            with ``align`` against a window of ``len(seq)`` characters.

    Returns:
        *tuple* with:
            subs (list): ``[position, read nt, reference nt]`` entries.
            add: trailing nts of the read (``-`` removed) when the
                mismatch pattern of the last three positions is one of
                the known addition patterns, otherwise an empty list.
            cigar: value of ``make_cigar`` for the final read/reference.
    """
    # Mismatch layouts over the last three positions (1 = mismatch) that
    # are reported as a 3' addition instead of substitutions.
    addition_patterns = [[1, 1, 0], [1, 0, 1], [0, 1, 0],
                         [0, 1, 1], [0, 0, 1], [1, 1, 1]]

    if not cigar:
        window = precursor[start:start + len(seq)]
        seq, mature, _score, _position, _size = align(seq, window)
        cigar = make_cigar(seq, mature)
    else:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])

    # Drop a single gap character at either end of the aligned read.
    seq = seq[1:] if seq.startswith("-") else seq
    seq = seq[:-1] if seq.endswith("-") else seq
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    length = len(seq)
    tail_start = length - 3
    mismatches = {i for i in range(length) if seq[i] != mature[i]}

    # Mismatches before the last three positions are always substitutions.
    subs = [[i, seq[i], mature[i]] for i in mismatches if i < tail_start]

    tail = range(tail_start, length)
    pattern = [1 if i in mismatches else 0 for i in tail]
    tail_errors = [i for i in tail if i in mismatches]

    add = []
    if pattern in addition_patterns:
        add = seq[tail_errors[0]:].replace("-", "")

    # Tail mismatches that did not yield an addition count as substitutions.
    if not add and tail_errors:
        subs.extend([i, seq[i], mature[i]] for i in tail_errors)

    return subs, add, make_cigar(seq, mature)
def create_iso(name, mir, seq, numsim, exp):
    """
    Simulate isomiR reads for every miRNA of one precursor.

    Args:
        *name*: key of the precursor in ``mir``.

        *mir*: nested mapping; ``mir[name][mirna]`` is the info object
            handed to ``variation``.

        *seq (str)*: precursor sequence.

        *numsim*: number of variants drawn per miRNA (converted with
            ``int``).

        *exp*: when truthy a random count is drawn for each variant,
            otherwise every variant gets a count of 1.

    Returns:
        The object returned by ``body.create`` for the simulated reads.

    Side effects:
        Calls ``write_fastq`` and ``write_collapse_fastq`` with the
        module-level names ``full_fq`` and ``clean_fq``.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # The count is drawn before the duplicate check so the order
            # of random draws stays the same as before.
            # NOTE(review): a negative binomial draw can be 0, which
            # registers a read with zero counts -- confirm this is wanted.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5,
             iso.t3, iso.subs, iso.add) = variation(info, seq)
            # Each simulated sequence is kept only once per precursor.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])

    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def create_iso(name, mir, seq, numsim, exp):
    """
    Build simulated isomiR reads for one precursor and return their GFF.

    Args:
        *name*: key of the precursor in ``mir``.

        *mir*: nested mapping; ``mir[name][mirna]`` is the info object
            handed to ``variation``.

        *seq (str)*: precursor sequence.

        *numsim*: number of variants drawn per miRNA (converted with
            ``int``).

        *exp*: when truthy a random count is drawn for each variant,
            otherwise every variant gets a count of 1.

    Returns:
        The object returned by ``body.create`` for the simulated reads.

    Side effects:
        Calls ``write_fastq`` and ``write_collapse_fastq`` with the
        module-level names ``full_fq`` and ``clean_fq``.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    n_variants = int(numsim)
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(n_variants):
            # expression: drawn ahead of the duplicate check so the order
            # of random draws is unchanged.
            # NOTE(review): the draw may be 0, giving a read with zero
            # counts -- confirm this is intended.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5,
             iso.t3, iso.subs, iso.add) = variation(info, seq)
            # Skip sequences already produced for this precursor.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            read = realign.hits()
            read.set_sequence(randSeq)
            read.counts = e
            read.set_precursor(name, iso)
            reads[query_name] = read
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])

    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def tune(seq, precursor, start, cigar):
    """
    The actual fn that will realign the sequence to find the nt changes
    at 5', 3' sequence and nt variations.

    Args:
        *seq (str)*: sequence of the read.

        *precursor (str)*: sequence of the precursor.

        *start (int)*: start position of sequence on the precursor, +1.
            A negative value is reset to 0 and the reference window used
            for ``align`` is shortened by the same amount.

        *cigar (str)*: similar to SAM CIGAR attribute. When empty, the
            read is aligned with ``align`` and the cigar is derived with
            ``make_cigar``.

    Returns:
        *tuple* with:
            subs (list): substitutions, ``[position, read nt, reference nt]``
            add (str): nt added to the end, joined in the order they are
                collected (from the 3' end inwards)
            cigar (str): updated cigar
    """
    end = len(seq)
    if start < 0:
        end = end + start
        start = 0
    if cigar:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])
    else:
        seq, mature, _score, _position, _size = align(
            seq, precursor[start:start + end])
        cigar = make_cigar(seq, mature)
    # Drop a single gap character at either end of the aligned read.
    if seq.startswith("-"):
        seq = seq[1:]
    if seq.endswith("-"):
        seq = seq[:-1]
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    # Positions where the aligned read differs from the reference.
    error = set()
    for pos in range(0, len(seq)):
        if seq[pos] != mature[pos]:
            error.add(pos)

    subs, add = [], []

    # Scan at most the last five positions from the 3' end. The stop is
    # clamped at -1 so reads shorter than five nt never use negative
    # indexes, which would wrap around the read or raise IndexError.
    prob = 0  # becomes 1 once a mismatch has been seen in the scan
    add_position = []
    scan_stop = max(len(seq) - 6, -1)
    for e in range(len(seq) - 1, scan_stop, -1):
        if e in error:
            prob = 1
        if prob == 1:
            add.append(seq[e])
            add_position.append(e)
        # Matching A/T seen before any mismatch is kept as a candidate.
        if e not in error and prob == 0 and seq[e] in ["A", "T"]:
            add.append(seq[e])
            add_position.append(e)
            continue
        if e not in error:
            # A matching position ends the scan: drop the last candidate
            # and, if no mismatch was ever seen, discard all of them.
            if add:
                add.pop()
                add_position.pop()
            if prob == 0:
                add = []
                add_position = []
            break

    # Mismatches not explained as 3' additions are substitutions.
    for e in error:
        if e not in add_position:
            subs.append([e, seq[e], mature[e]])

    logger.debug("TUNE:: %s %s" % (subs, add))

    return subs, "".join(add), make_cigar(seq, mature)