Exemplo n.º 1
0
def createMapping(set_a, set_b):
    """Map every two-word string of ``set_a`` to its closest string in ``set_b``.

    Each string is split on whitespace into exactly two words. The distance
    between two strings is the smaller of the summed word-wise edit distances
    in straight order and in swapped order. A string of ``set_a`` whose best
    distance exceeds 4 is mapped to ``None``. One diagnostic line per
    ``set_a`` entry is printed.

    Returns:
        dict mapping each element of ``set_a`` to an element of ``set_b`` or
        ``None``.
    """
    def word_distance(left, right):
        return edlib.align(left, right)['editDistance']

    mapping = {}
    for name_a in set_a:
        words_a = name_a.split()
        best_name, best_dist = None, 1e4
        for name_b in set_b:
            words_b = name_b.split()
            assert len(words_a) == 2
            assert len(words_b) == 2

            straight = (word_distance(words_a[0], words_b[0])
                        + word_distance(words_a[1], words_b[1]))
            swapped = (word_distance(words_a[0], words_b[1])
                       + word_distance(words_a[1], words_b[0]))
            dist = min(straight, swapped)
            # Strict comparison: the first of several equally good matches wins.
            if dist < best_dist:
                best_name, best_dist = name_b, dist
        mapping[name_a] = best_name if best_dist <= 4 else None
        print(name_a, best_name, best_dist)

    return mapping
Exemplo n.º 2
0
def revprimerStrip(file):
    """Split a FASTQ file into primer-trimmed reads and reads lacking the primer.

    For each read, ``args.rev_primer`` is searched inside the sequence
    (edlib "HW" mode, at most ``args.primer_mismatch`` edits). When found,
    everything up to and including the primer is removed; the remainder is
    then searched for ``RevForPrimer`` and, if present, cut just before it.
    Trimmed reads go to ``<tmpdir>/<base>.good``; reads without the first
    primer are written unchanged to ``<tmpdir>/<base>.bad``.

    ``tmpdir``, ``args``, ``RevForPrimer`` and ``amptklib`` are read from the
    enclosing module scope.

    Args:
        file: path of the input FASTQ file.
    """
    base = os.path.basename(file).split('.')[0]
    goodseq = os.path.join(tmpdir, base + '.good')
    badseq = os.path.join(tmpdir, base + '.bad')
    # The input handle is managed here as well, so it is closed even when
    # parsing or writing fails (it used to be left open).
    with open(goodseq, 'w') as good, open(badseq, 'w') as bad, \
            open(file) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            foralign = edlib.align(args.rev_primer,
                                   seq,
                                   mode="HW",
                                   k=args.primer_mismatch,
                                   additionalEqualities=amptklib.degenNuc)
            # edlib reports -1 when no match within k edits exists.
            if foralign["editDistance"] < 0:
                bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                continue

            # Drop everything through the last base of the primer match.
            ForCutPos = foralign["locations"][0][1] + 1
            Seq = seq[ForCutPos:]
            Qual = qual[ForCutPos:]

            # Look for the second primer in what is left of the read.
            revalign = edlib.align(RevForPrimer,
                                   Seq,
                                   mode="HW",
                                   task="locations",
                                   k=args.primer_mismatch,
                                   additionalEqualities=amptklib.degenNuc)
            if revalign["editDistance"] >= 0:
                RevCutPos = revalign["locations"][0][0]
                Seq = Seq[:RevCutPos]
                Qual = Qual[:RevCutPos]
            good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
Exemplo n.º 3
0
def remove_title_page(txt_file_without_tp, txt_file_with_tp, num_lines,
                      out_dir):
    """Write a copy of ``txt_file_with_tp`` with its leading title page removed.

    The first ``num_lines`` lines of the title-page-free text are compared
    (lower-cased, by edit distance) against the first ``num_lines`` lines of
    the other text while an increasing number of its leading lines is
    dropped. As soon as the distance rises above the lowest value seen so
    far, the text starting one line before the current offset is taken as the
    result. If the distance never rises, a warning is issued and the file is
    copied unchanged.

    Args:
        txt_file_without_tp: open text file lacking the title page.
        txt_file_with_tp: open text file that may start with a title page;
            its ``name`` attribute is used to derive the output path.
        num_lines: number of leading lines taken into account.
        out_dir: directory handed to ``out_file_name``.
    """
    lines_without = txt_file_without_tp.readlines()
    lines_with = txt_file_with_tp.readlines()

    reference_head = ''.join(lines_without[:num_lines]).lower()

    def distance_from(offset):
        # Edit distance once the first `offset` lines have been skipped.
        candidate = ''.join(lines_with[offset:num_lines]).lower()
        return edlib.align(reference_head, candidate)['editDistance']

    lowest = distance_from(0)
    stripped = None
    for offset in range(num_lines):
        current = distance_from(offset)
        if current > lowest:
            # Skipping this line made things worse: the body starts one line
            # earlier.
            stripped = ''.join(lines_with[offset - 1:])
            break
        if current < lowest:
            lowest = current

    if stripped is None:
        warnings.warn('No title page found')
    out_file = out_file_name(out_dir, txt_file_with_tp.name)
    if stripped is None:
        shutil.copy2(txt_file_with_tp.name, out_file)
    else:
        with codecs.open(out_file, 'w', encoding='utf8') as f:
            f.write(stripped)
Exemplo n.º 4
0
def check_library_barcode(b1: str, b2: str) -> bool:
    """Return True when both barcodes are close to the library sequences.

    ``b1`` is globally aligned against ``_LIB5_SEQ`` and ``b2`` against
    ``_LIB3_SEQ``; each must be within an edit distance of 3. Only some
    libraries carry a library barcode. Empty barcodes are not special-cased.
    """
    pairs = ((b1, _LIB5_SEQ), (b2, _LIB3_SEQ))
    distances = [
        edlib.align(barcode, expected, mode="NW",
                    task="distance")["editDistance"]
        for barcode, expected in pairs
    ]
    return all(distance <= 3 for distance in distances)
Exemplo n.º 5
0
def detect_read_strand(read_5p_seq, upstream_context_fwd,
                       upstream_context_rev):
    """Guess the strand of a read from the context found at its 5' end.

    Both context sequences are searched inside ``read_5p_seq`` (edlib "HW"
    mode, at most 5 edits). The read is called "+" only when the forward
    context matches strictly better than the reverse one; otherwise "-".

    Returns:
        "+" or "-".
    """
    def context_distance(context):
        distance = edlib.align(context, read_5p_seq, mode="HW",
                               k=5)["editDistance"]
        # edlib signals "no match within k" with -1. Treat that as the worst
        # possible score, otherwise a missing context would beat a real hit.
        return float("inf") if distance < 0 else distance

    fwd_distance = context_distance(upstream_context_fwd)
    rev_distance = context_distance(upstream_context_rev)
    return "+" if fwd_distance < rev_distance else "-"
Exemplo n.º 6
0
def sequences_overlap(query, targets, min_match=30, check_revcom=True):
    """Tell whether ``query`` overlaps any sequence in ``targets``.

    A target counts as overlapping when the value returned by
    ``cigar_to_max_operation`` for the infix ("HW") alignment exceeds
    ``min_match``. With ``check_revcom`` the reverse complement of the query
    is tried as well, target by target.
    """
    def longest_operation(sequence, target):
        cigar = align(sequence, target, 'HW', 'path')['cigar']
        return cigar_to_max_operation(cigar)

    for target in targets:
        if longest_operation(query, target) > min_match:
            return True
        if check_revcom and longest_operation(revcom(query),
                                              target) > min_match:
            return True
    return False
Exemplo n.º 7
0
def get_direction(query, ref):
    """Return ``query`` in the orientation that best matches ``ref``.

    The query and its reverse complement are each globally aligned to the
    reference. The forward record is returned only when its edit distance is
    strictly smaller; on a tie the reverse complement is returned.
    """
    reference = str(ref.seq)
    forward, reverse = query, query.reverse_complement()
    d_fwd, d_rev = [
        edlib.align(str(candidate.seq), reference, task="distance",
                    mode="NW")["editDistance"]
        for candidate in (forward, reverse)
    ]
    if d_fwd < d_rev:
        return forward
    return reverse
Exemplo n.º 8
0
    def compare_polished_sequences(self, final_sequences):
        """Write alignments of consecutive polished sequences to ``report.txt``.

        For every ``i`` in ``1 .. num_iters - 1`` the sequences ``i`` and
        ``i + 1`` are aligned twice: as they are, and after homopolymer
        compression. Each raw edlib result is printed under a header line into
        ``<outdir>/report.txt``.
        """
        report_fn = os.path.join(self.params.outdir, 'report.txt')
        with open(report_fn, 'w') as f:
            for i in range(1, self.params.num_iters):
                current, following = final_sequences[i], final_sequences[i + 1]
                print(f'Alignment polishing seq {i} vs {i+1}:',
                      edlib.align(current, following),
                      sep='\n', file=f)

                compressed = [compress_homopolymer(seq)
                              for seq in (current, following)]
                print(f'Alignment homopolymer compressed polishing seq {i} vs {i+1}:',
                      edlib.align(*compressed),
                      sep='\n', file=f)
Exemplo n.º 9
0
 def create_synteny_matrix_mul(self, gene_seq, g1, g2, n):
     """Build global and local pairwise similarity matrices for two gene lists.

     For every pair ``(g1[i], g2[j])`` that does not involve the placeholder
     ``"NULL_GENE"``, four values are computed, each divided by the length of
     the longer of the two sequences:

     * ``sm[i][j][0]``: global (NW) edit distance
     * ``sm[i][j][1]``: global edit distance against the reversed sequence
     * ``sml[i][j][0]``: score from ``local_pairwise_align_ssw``
     * ``sml[i][j][1]``: the same score against the reversed sequence

     If any gene has no sequence, an empty sequence, or an alignment fails,
     two all-zero matrices are returned instead.

     Args:
         gene_seq: mapping from gene name to sequence string.
         g1, g2: gene names; positions ``0 .. n - 1`` are read.
         n: edge length of the matrices.

     Returns:
         Tuple ``(sm, sml)`` of arrays with shape ``(n, n, 2)``.
     """
     def zero_pair():
         return np.zeros((n, n, 2)), np.zeros((n, n, 2))

     # Every real gene must have a sequence before any alignment is started.
     for genes in (g1, g2):
         for gene in genes:
             if gene == "NULL_GENE":
                 continue
             try:
                 gene_seq[gene]
             # Exception, not BaseException: Ctrl-C and SystemExit must still
             # propagate.
             except Exception:
                 return zero_pair()

     sm = np.zeros((n, n, 2))
     sml = np.zeros((n, n, 2))
     for i in range(n):
         if g1[i] == "NULL_GENE":
             continue
         seq_i = gene_seq[g1[i]]
         if seq_i == "":
             return zero_pair()
         for j in range(n):
             if g2[j] == "NULL_GENE":
                 continue
             seq_j = gene_seq[g2[j]]
             if seq_j == "":
                 return zero_pair()
             norm_len = max(len(seq_i), len(seq_j))
             try:
                 result = ed.align(seq_i, seq_j, mode="NW", task="distance")
                 sm[i][j][0] = result["editDistance"] / norm_len
                 result = ed.align(seq_i, seq_j[::-1],
                                   mode="NW",
                                   task="distance")
                 sm[i][j][1] = result["editDistance"] / norm_len
                 _, score, _ = local_pairwise_align_ssw(DNA(seq_i), DNA(seq_j))
                 sml[i][j][0] = score / norm_len
                 _, score, _ = local_pairwise_align_ssw(
                     DNA(seq_i), DNA(seq_j[::-1]))
                 sml[i][j][1] = score / norm_len
             # Best effort: any alignment failure voids the whole result, but
             # interpreter-level signals are no longer swallowed.
             except Exception:
                 return zero_pair()
     return sm, sml
 def replace_word(self, word, normalize=True):
     """Alias ``word`` to the closest word of the limited vocabulary.

     Every entry of ``self.limited_vocab`` is scored by its edit distance to
     ``word`` (computed with edlib's ``align``). With ``normalize`` set,
     non-zero distances are divided by the ``alphabetLength`` edlib reports
     for that alignment. The word stored in ``self.idx2word`` at the position
     of the lowest score is chosen, and ``self.word2idx[word]`` is pointed at
     that word's index.
     """
     def score(vocab_word):
         outcome = align(vocab_word, word)
         distance = outcome['editDistance']
         if normalize and distance != 0:
             distance /= outcome['alphabetLength']
         return distance

     scores = np.array([score(vocab_word)
                        for vocab_word in self.limited_vocab])
     replacement_word = self.idx2word[np.argmin(scores)]
     self.word2idx[word] = self.word2idx[replacement_word]
Exemplo n.º 11
0
def edist(lst):
    """Return the global edit distance between ``lst[0]`` and ``lst[1]``.

    Both items are converted with ``str``. IUPAC ambiguity codes are treated
    as equal to every base they stand for, and ``U`` as equal to ``T``. When
    one of the strings is empty the length of the other one is returned
    without calling edlib.

    Args:
        lst: indexable holding the two sequences at positions 0 and 1.

    Returns:
        The edit distance as an int.
    """
    query = str(lst[0])
    target = str(lst[1])
    if len(query) == 0:
        return len(target)
    if len(target) == 0:
        return len(query)

    # Bases covered by each ambiguity code. 'N' now includes 'A'; the table
    # used to list ('N', 'C') twice and had no ('N', 'A') entry.
    ambiguity = (
        ('R', 'AG'), ('Y', 'CTU'), ('K', 'GTU'), ('M', 'AC'),
        ('S', 'CG'), ('W', 'ATU'), ('B', 'CGTU'), ('D', 'AGTU'),
        ('H', 'ACTU'), ('V', 'ACG'), ('N', 'ACGTU'),
    )
    equalities = [('U', 'T')]
    equalities.extend(
        (code, base) for code, bases in ambiguity for base in bases)

    result = edlib.align(query, target, mode="NW",
                         additionalEqualities=equalities)
    return result["editDistance"]
Exemplo n.º 12
0
    def identity_test(self, target_identity, read_length, error_model, qscore_model):
        """Simulate reads and assert that their identity matches the target.

        ``self.trials`` random fragments of ``read_length`` are run through
        the read simulator. Each read's identity (taken from the edlib CIGAR
        of fragment vs. read) must lie within
        ``self.read_delta * (1 - target_identity)`` of the target, and the
        mean over all trials within ``self.mean_delta * (1 - target_identity)``.
        Progress is printed when the module-level ``VERBOSE`` flag is set.
        """
        error_fraction = 1.0 - target_identity
        read_delta = self.read_delta * error_fraction
        mean_delta = self.mean_delta * error_fraction

        if VERBOSE:
            print(f'\nRead length: {read_length}, target identity: {target_identity}')
            print(f'    allowed error per read:   {read_delta:.4f}')
            print(f'    allowed error in mean:    {mean_delta:.4f}')
            print('    identities: ', end='')

        read_identities = []
        for trial in range(1, self.trials + 1):
            frag = badread.misc.get_random_sequence(read_length)
            seq, _qual, _, _ = badread.simulate.sequence_fragment(
                frag, target_identity, error_model, qscore_model)
            cigar = edlib.align(frag, seq, task='path')['cigar']
            read_identity = badread.misc.identity_from_edlib_cigar(cigar)
            read_identities.append(read_identity)

            if VERBOSE:
                # Start a fresh, indented line after every 20th value.
                separator = '\n                ' if trial % 20 == 0 else ' '
                print(f'{read_identity:.4f}', flush=True, end=separator)

            self.assertAlmostEqual(read_identity, target_identity, delta=read_delta)

        mean_identity = statistics.mean(read_identities)
        if VERBOSE:
            print('\r' if self.trials % 20 == 0 else '\n', end='')
            print(f'    mean:       {mean_identity:.4f}')

        self.assertAlmostEqual(mean_identity, target_identity, delta=mean_delta)
        if VERBOSE:
            print('    PASS')
Exemplo n.º 13
0
    def get_aligement(self, fast5_path, ref_path, verbose, fasta_out_dir=None, ref=None):
        """Basecall one fast5 file and score the result against its reference.

        The target sequence is the last line of ``ref_path``. The basecalled
        sequence is aligned to it with edlib and the CIGAR operations are
        tallied.

        Args:
            fast5_path: input file passed to ``self.basecall_sample``.
            ref_path: text file whose last line is the target sequence.
            verbose: print the basecalled and target sequences.
            fasta_out_dir: optional directory for a ``<stem>.fasta`` output.
            ref: forwarded unchanged to ``self.basecall_sample``.

        Returns:
            Tuple ``(nedit, acc, length, cigar_stat)``: edit distance divided
            by the target length, number of ``=`` positions divided by the
            summed CIGAR length, length of the basecalled sequence, and a
            ``defaultdict`` of per-operation counts.
        """
        started = perf_counter()
        if fasta_out_dir is None:
            fasta_out = None
        else:
            stem = os.path.splitext(fast5_path)[0].split('/')[-1]
            fasta_out = os.path.join(fasta_out_dir, stem + ".fasta")
        basecalled = self.basecall_sample(fast5_path, fasta_out=fasta_out, ref=ref)
        with open(ref_path) as ref_file:
            target = ref_file.readlines()[-1]

        if verbose:
            print("Basecalled:\n", basecalled)
            print("Target\n", target)
        self.logger.debug("fast5_path %s, ref_path %s", fast5_path, ref_path)
        result = edlib.align(basecalled, target, task='path')
        cigar = result['cigar']
        cigar_pairs = butils.cigar_str_to_pairs(cigar)

        self.logger.debug("\nBasecalled: %s\nTarget    : %s",
                          butils.query_align_string(basecalled, cigar_pairs),
                          butils.reference_align_string(target, cigar_pairs)
                          )

        self.logger.debug("extCigar %s", cigar)
        self.logger.debug("Whole time %.3f", perf_counter() - started)

        # Tally how many positions each CIGAR operation covers.
        cigar_stat = defaultdict(int)
        for num, op in breakCigar(cigar):
            cigar_stat[op] += num
        total = sum(cigar_stat.values())

        acc = cigar_stat['='] / total
        nedit = result['editDistance'] / len(target)
        return nedit, acc, len(basecalled), cigar_stat
Exemplo n.º 14
0
    def custom_sequencing_adapter_check(self, r1_seq):
        ''' Check for custom sequencing adapter on r1

        Every adapter in ``self.custom_seq_adapter`` is prefix-aligned (edlib
        "SHW" mode) against the start of the read. The adapter with the lowest
        edit distance relative to its own length wins, and is accepted when
        that ratio is below 0.18.

        :param bytes r1_seq: R1 sequence
        :rtype int
        :returns end pos of adapter, -1 if not found
        '''
        # NOTE: Will check multiple adapters for all reads
        # Should the adapter be identified first and only that sequence used going forward ?
        # Maybe add this logic in the trim_custom_sequencing_adapter function.
        best_alignment = None
        best_score = None
        for adapter in self.custom_seq_adapter:
            if not adapter:
                # An empty adapter cannot be scored relative to its length.
                continue
            alignment = edlib.align(adapter,
                                    r1_seq[0:len(adapter) + 3],
                                    mode="SHW",
                                    task="locations")
            # Normalise by the adapter's length. Dividing by the number of
            # configured adapters made the threshold depend on the list size.
            score = float(alignment["editDistance"]) / len(adapter)
            if best_score is None or score < best_score:
                best_alignment = alignment
                best_score = score

        # No usable adapter configured, or none close enough: not found.
        if best_score is None or best_score >= 0.18:
            return -1
        return best_alignment["locations"][-1][1]
Exemplo n.º 15
0
def _cutr(reads, adp, th, r, len_list=None):
    """Trim adapter ``adp`` from the last ``r`` bases of every read, in place.

    Each read is a mutable sequence ``[name, sequence]`` or
    ``[name, sequence, quality]``. The adapter is searched (edlib "HW" mode)
    inside the final ``r`` bases; when the alignment identity exceeds ``th``
    the read, and its quality string if present, is truncated at the start of
    the match. Reads shorter than ``2 * r`` are skipped.

    Args:
        reads: indexable collection of reads; modified in place.
        adp: adapter sequence.
        th: minimum identity (exclusive) for a hit.
        r: number of trailing bases that are searched.
        len_list: optional list collecting read lengths. Lengths are only
            appended when the list passed in is already non-empty.

    Returns:
        Tuple ``(iden_max, match_num, cut_pos)``: best identity seen (-1 when
        nothing matched), number of trimmed reads, and for each trimmed read
        the number of bases between the match start and the end of the search
        window.
    """
    iden_max = -1
    match_num = 0
    cut_pos = []
    skip_num = 0
    repat = re.compile(r'(\d+)[DHIMNPSX=]{1}')
    # Guard against an empty batch; reads[0] used to raise IndexError.
    has_qual = len(reads) > 0 and len(reads[0]) > 2
    for read in reads:
        read_length = len(read[1])
        if len_list:
            len_list.append(read_length)
        if read_length < 2 * r:
            skip_num += 1
            continue
        result = edlib.align(adp, read[1][-r:], mode="HW", task='path')
        # Identity = 1 - edits / alignment length (sum of all CIGAR counts).
        aligned_length = np.sum(
            [int(count) for count in repat.findall(result['cigar'])])
        identity = 1.0 - float(result['editDistance'] / aligned_length)
        if identity > th:
            hit_start = result['locations'][0][0]
            start = read_length - r + hit_start
            cut_pos.append(r - hit_start)
            match_num += 1
            if identity > iden_max:
                iden_max = identity
            read[1] = read[1][:start]
            if has_qual:
                read[2] = read[2][:start]

    logger.info("%d reads were skipped due to their short lengths.", skip_num)
    return (iden_max, match_num, cut_pos)
Exemplo n.º 16
0
def get_pairwise_alignments(seqs):
    """Globally align every pair of sequences and report identity and indels.

    Args:
        seqs: mapping from sequence name to sequence.

    Returns:
        Tuple ``(pairwise_cigars, percent_identities, max_indels)`` of dicts
        keyed by name pairs. CIGARs are stored once per pair as ``(a, b)``
        with ``a`` preceding ``b`` in ``seqs``; the other two dicts hold both
        orders.
    """
    section_header('Pairwise global alignments')
    explanation(
        'Trycycler uses the edlib aligner to get global alignments between all pairs of '
        'sequences. This can help you to spot any problematic sequences that should be '
        'excluded before continuing. If you see any sequences with notably worse '
        "identities or max indels, you can remove them (delete the contig's FASTA) and "
        'run this command again.')
    seq_names = list(seqs.keys())
    max_seq_name_len = max(len(name) for name in seq_names)
    pairwise_cigars = {}
    percent_identities = {}
    max_indels = {}

    for i, a in enumerate(seq_names):
        seq_a = seqs[a]
        for b in seq_names[i + 1:]:
            seq_b = seqs[b]
            # Names are padded so the columns line up across rows.
            log(a.rjust(max_seq_name_len), end='')
            log(' vs ', end='')
            log((b + '...').ljust(max_seq_name_len + 3), end=' ')

            cigar = edlib.align(seq_a, seq_b, mode="NW", task="path")['cigar']
            percent_identity, max_indel = identity_and_max_indel_from_cigar(
                cigar)
            log(f'{percent_identity:.2f}% identity, max indel = {max_indel}')

            pairwise_cigars[(a, b)] = cigar
            for key in ((a, b), (b, a)):
                percent_identities[key] = percent_identity
                max_indels[key] = max_indel
    log()

    return pairwise_cigars, percent_identities, max_indels
Exemplo n.º 17
0
def get_global_aln_results(ref_seq, query_seq, min_seq_len):
    """
    Globally align two sequences and return ``(delta_len, idt, cov)`` in the
    form expected by the legacy deduplication code.

    ``delta_len`` is the query length minus the reference length. When either
    sequence is shorter than ``min_seq_len`` no alignment is run and identity
    and coverage are both 0.0. Otherwise identity is the lower of the
    query-based and reference-based identities and coverage is 1.0.

    Currently unused - it was used in an intermediate version, and might be
    useful at some point in the future.
    """
    ref_len, query_len = len(ref_seq), len(query_seq)
    log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (ref_len, query_len))
    delta_len = query_len - ref_len

    if min(ref_len, query_len) < min_seq_len:
        return delta_len, 0.0, 0.0

    result = edlib.align(query_seq, ref_seq, mode="NW", task="path")

    num_m, num_i, num_d, total_len = count_cigar_ops(result['cigar'])
    # Exact matches are all aligned columns minus the edits; mismatches are
    # the remaining match-type columns.
    num_eq = (num_m + num_i + num_d) - result['editDistance']
    num_x = num_m - num_eq

    ungapped = num_eq + num_x
    idt_query = float(num_eq) / float(ungapped + num_i)
    idt_ref = float(num_eq) / float(ungapped + num_d)
    idt = min(idt_query, idt_ref)

    log('  - Alignment stats: num_m = %d, num_i = %d, num_d = %d, total_len = %d, num_eq = %d, num_x = %d' % (num_m, num_i, num_d, total_len, num_eq, num_x))

    return delta_len, idt, 1.0
Exemplo n.º 18
0
def sequences_match(seq1, seq2, compare='5p_clip', min_length=4):
    """
    Decide whether two sequences are likely the same.

    Only a 10-base window is compared: the first 10 bases for
    ``compare='3p_clip'``, the last 10 otherwise. Windows that are not longer
    than ``min_length`` never match. Two full 10-base windows match when
    their edit distance is below 2; shorter windows match when one is a
    prefix (``3p_clip``) or suffix (otherwise) of the other.

    >>> s1, s2 = 'AAAAAAAAAA', 'AAAAAAAAATGC'
    >>> sequences_match(s1, s2, compare='3p_clip')
    True
    >>> sequences_match('TTGAAAAAAA', s2, compare='3p_clip')
    False
    >>> sequences_match(s1[::-1], s2[::-1], compare='5p_clip')
    True
    >>> sequences_match('A', 'A')
    False
    """
    anchored_at_start = compare == '3p_clip'
    window = slice(None, 10) if anchored_at_start else slice(-10, None)
    seq1, seq2 = seq1[window], seq2[window]

    if len(seq1) <= min_length or len(seq2) <= min_length:
        return False
    if len(seq1) >= 10 and len(seq2) >= 10:
        return align(seq1, seq2)['editDistance'] < 2
    if anchored_at_start:
        return seq1.startswith(seq2) or seq2.startswith(seq1)
    return seq1.endswith(seq2) or seq2.endswith(seq1)
Exemplo n.º 19
0
def print_alignment(s1, s2, k=None, mode='NW', width=60):
    """Print the edlib alignment of ``s1`` against ``s2`` in blocks of three rows.

    Each block shows up to ``width`` columns: the aligned ``s1``, a status
    row (``|`` equal, ``X`` mismatch, ``-`` gap) and the aligned ``s2``,
    followed by an empty line.

    Args:
        s1: query sequence.
        s2: target sequence.
        k: maximum edit distance passed to edlib; defaults to the length of
            the longer sequence.
        mode: edlib alignment mode.
        width: number of alignment columns per printed block.
    """
    if k is None:
        k = max(len(s1), len(s2))
    alignment = edlib.align(s1, s2, task='path', k=k, mode=mode)
    start, _end = alignment['locations'][0]
    # NOTE(review): parse_cigar is assumed to return the two gapped sequences
    # as its 3rd and 4th values - confirm against its definition.
    _, _, a1, a2 = parse_cigar(alignment['cigar'], s1, s2[start:])

    def column_status(c1, c2):
        if c1 == c2:
            return '|'
        if c1 == '-' or c2 == '-':
            return '-'
        return 'X'

    status = ''.join(column_status(c1, c2) for c1, c2 in zip(a1, a2))
    a1 = ''.join(a1)
    a2 = ''.join(a2)

    # Cut all three rows at the same fixed columns. textwrap.wrap may break a
    # line at a '-' gap character, which wrapped the sequence rows differently
    # from the status row and misaligned the output.
    for offset in range(0, len(status), width):
        print(a1[offset:offset + width])
        print(status[offset:offset + width])
        print(a2[offset:offset + width])
        print("")
Exemplo n.º 20
0
def min_ed(max_insertion, q_ins):
    """Thread ``q_ins`` onto ``max_insertion`` using their global alignment.

    Returns ``q_ins`` with a ``-`` inserted for every CIGAR ``I`` position, or
    an empty string when the alignment contains a ``D`` operation, because
    ``max_insertion`` must stay unaltered.
    """
    cigar = edlib.align(max_insertion, q_ins, task="path", mode="NW")["cigar"]
    # do not allow deletions in max_insertion: because we do not want to alter this sequence
    if "D" in cigar:
        return ""

    operations = [(int(count), op)
                  for count, op in re.findall(r'(\d+)([=DXSMI])', cigar)]

    pieces = []
    consumed = 0
    for count, op in operations:
        # No deletions can occur here, so every operation is either an
        # 'I' (emit gap characters) or a match/mismatch (copy from q_ins).
        if op == "I":
            pieces.append("-" * count)
        else:
            pieces.append(q_ins[consumed:consumed + count])
            consumed += count
    return "".join(pieces)
 def getRatio(self, stringA=None, stringB=None):
     """
     Return the similarity ratio ``1 - distance / max(len(a), len(b))``.

     With the default (None) arguments the ratio of the instance's own two
     strings is returned, using the stored edit distance. Passing two strings
     is helpful for substring comparison (.getRatio(SubS1, SubS2)). If either
     explicit argument is not a string, -1 is returned. Two empty strings are
     identical and yield 1.0.

     >>> ED1 = EditDistance("testABC1", "testABB2")
     >>> ED1.getRatio()
     0.75
     >>> ED1.getRatio("Ala ma kota", "Alan ma psa")
     0.6363636363636364
     >>> ED1.getRatio(1,"999")
     -1
     >>> ED1.getRatio(3987,4789)
     -1
     """
     if stringA is None and stringB is None:
         stringA, stringB = self.string1, self.string2
         alignmentEditDistance = self.editDistance
     elif not (isinstance(stringA, str) and isinstance(stringB, str)):
         return -1
     elif not stringA or not stringB:
         # Against an empty string the distance is the other string's length;
         # no alignment is needed.
         alignmentEditDistance = max(len(stringA), len(stringB))
     else:
         # Only the distance is used, so the default edlib task is enough.
         alignmentEditDistance = edlib.align(stringA, stringB)["editDistance"]

     stringLenMax = max(len(stringA), len(stringB))
     if stringLenMax == 0:
         # Avoid dividing by zero for two empty strings.
         return 1.0
     return 1 - (float(alignmentEditDistance) / float(stringLenMax))
Exemplo n.º 22
0
def align(query, pattern_info, max_ed, normalise=False):
    """Locate a UMI pattern inside ``query`` and extract the UMI.

    Args:
        query: sequence that is searched.
        pattern_info: tuple ``(pattern, wildcard, equalities, forward)``.
        max_ed: maximum edit distance for the infix ("HW") alignment.
        normalise: when true, return only the bases aligned to wildcard
            positions of the pattern (``N`` where the query has a gap) and
            require the result to be 16 long.

    Returns:
        ``(edit_distance, umi)``, or ``(None, None)`` when the pattern is not
        found within ``max_ed`` edits.

    Raises:
        RuntimeError: the normalised UMI is not 16 characters long.
    """
    pattern, wildcard, equalities, forward = pattern_info

    result = edlib.align(
        pattern,
        query,
        task="path",
        mode="HW",
        k=max_ed,
        additionalEqualities=equalities,
    )
    ed = result["editDistance"]
    if ed == -1:
        return None, None

    if not normalise:
        # Return the matched stretch of the query as it is.
        first_hit = result["locations"][0]
        return ed, query[first_hit[0]:first_hit[1] + 1]

    # Extract and normalise UMI
    nice = edlib.getNiceAlignment(result, pattern, query)
    columns = zip(nice["query_aligned"], nice["target_aligned"])
    umi = "".join(
        "N" if target_char == "-" else target_char
        for pattern_char, target_char in columns
        if pattern_char == wildcard
    )

    if len(umi) != 16:
        raise RuntimeError("UMI length incorrect: {}".format(umi))

    return ed, umi
Exemplo n.º 23
0
def edlib_traceback_allow_ends(x, y, mode="NW", task="path", k=1, end_threshold = 0):
    """Align ``x`` to ``y`` and discount short indels at the alignment ends.

    When the CIGAR starts or ends with an insertion or deletion of at most
    ``end_threshold`` positions, that run is subtracted from the edit
    distance. Without a CIGAR (no alignment found, or a ``task`` other than
    "path") the distance is returned as reported by edlib.

    Returns:
        Tuple ``(ed, locations, cigar)``.
    """
    result = edlib.align(x, y, mode=mode, task=task, k=k)
    ed = result["editDistance"]
    locations = result["locations"]
    cigar = result["cigar"]

    if cigar:
        operations = [(int(count), op)
                      for count, op in re.findall(r'(\d+)([=DXSMI])', cigar)]

        def discount(operation):
            # Length to forgive for an end operation, 0 if it does not qualify.
            count, op = operation
            if op in ("D", "I") and count <= end_threshold:
                return count
            return 0

        ed -= discount(operations[0])
        # A CIGAR made of a single run is both its first and its last
        # operation; it must only be discounted once.
        if len(operations) > 1:
            ed -= discount(operations[-1])

    return ed, locations, cigar
Exemplo n.º 24
0
def edist(lst):
    """Return ``(edit_distance, cigar)`` of the global alignment of two items.

    ``lst[0]`` and ``lst[1]`` are converted with ``str``. If either one is
    empty, ``(-1, "")`` is returned without calling edlib.
    """
    query = str(lst[0])
    if not query:
        return -1, ""
    target = str(lst[1])
    if not target:
        return -1, ""
    alignment = edlib.align(query, target, mode="NW", task="path")
    return alignment["editDistance"], alignment["cigar"]
Exemplo n.º 25
0
def get_align_index_path(query: Iterable,
                         target: Iterable) -> List[CharAlignToken]:
    """Return one alignment token per element of ``query``.

    The query is aligned to the target with edlib. Walking the expanded
    alignment path, each query position yields a ``CharAlignToken`` built
    from the current target index and the path action at that position. ``D``
    actions, and ``=`` / ``X`` actions once recorded, advance the target
    index.

    Returns:
        List of tokens, or an empty list when edlib returns no CIGAR.
    """
    cigar = edlib.align(query, target, task="path")["cigar"]
    if cigar is None:
        return []
    actions = expand_cigar_format(cigar)

    tokens = []
    target_pos = 0
    action_pos = 0
    for query_pos in range(len(query)):
        # 'D' steps advance the target only; skip them before this element.
        while actions[action_pos] == "D":
            target_pos += 1
            action_pos += 1

        action = actions[action_pos]
        action_pos += 1

        tokens.append(CharAlignToken(target_pos, action))
        if action == "=":
            assert query[query_pos] == target[target_pos]
        if action in ("=", "X"):
            target_pos += 1

    return tokens
Exemplo n.º 26
0
def exec(peptide, time_node):
    """Align one FASTA peptide against every sequence of the activity dataset.

    ``peptide`` (FASTA text) is written to the service's job file and parsed
    back. Exactly one record is accepted. That sequence is then searched
    (edlib "HW" mode) in each entry of the ``sequence`` column of
    ``data_values_activity_non_modified.csv``.

    Args:
        peptide: FASTA-formatted text holding a single sequence.
        time_node: not used by this function.

    Returns:
        ``{"summary_alignment": [...]}`` with one dict per dataset row
        (aligned input, match line, aligned dataset sequence, sequence id and
        edit distance), or the string ``"error"`` when the input holds no
        record or more than one.
    """
    fasta_path = "../src/public/jobs/service3/service3.fasta"
    # The context manager closes the file even if the write fails.
    with open(fasta_path, "w") as handle:
        handle.write(peptide)

    # Parse once; the file used to be read twice (emptiness check + loop).
    records = list(SeqIO.parse(fasta_path, "fasta"))
    if not records:
        return "error"
    count = len(records)
    sequence_input = str(records[-1].seq)
    print(count)
    print(sequence_input)
    if count > 1:
        return "error"
    dataset = pd.read_csv("data_values_activity_non_modified.csv")

    dict_response = []
    for target, sequence_id in zip(dataset['sequence'],
                                   dataset['index_sequence']):
        align_result = edlib.align(sequence_input, target,
                                   mode="HW", task="path")
        view_alignment = edlib.getNiceAlignment(align_result, sequence_input,
                                                target)
        dict_response.append({
            "input_sequence": view_alignment['query_aligned'],
            "space_format": view_alignment['matched_aligned'],
            "compare_sequence": view_alignment['target_aligned'],
            "id_sequence": str(sequence_id),
            'distance_sequences': str(align_result['editDistance']),
        })

    # export result alignment
    return {"summary_alignment": dict_response}
Exemplo n.º 27
0
def edist_nw(lst):
    """Return the global edit distance of ``lst[0]`` and ``lst[1]``, capped at 500.

    Both items are converted with ``str``. The result is -1 when either
    string is empty, and also when the distance exceeds 500 (edlib's
    "not found within k" value). The return value is always an int; the
    empty case used to return the tuple ``(-1, "")``.
    """
    query = str(lst[0])
    if not query:
        return -1
    target = str(lst[1])
    if not target:
        return -1
    return edlib.align(query, target, mode="NW", k=500)["editDistance"]
Exemplo n.º 28
0
def distance_matrix(sequences):
    """
    Construct a symmetric matrix of pairwise global edit distances.

    Every record is first oriented like the first one (via
    ``get_direction``); the oriented sequences are then aligned pairwise.

    Args:
        sequences: sequence records exposing ``.seq``.

    Returns:
        Integer array of shape ``(n, n)`` with a zero diagonal; shape
        ``(0, 0)`` for empty input.
    """
    n = len(sequences)
    # Always a 2-D integer matrix, also for n == 0.
    dists = np.zeros((n, n), dtype=int)
    if n == 0:
        return dists

    base_seq = sequences[0]
    adjusted_sequences = [
        get_direction(s, base_seq)
        for s in tqdm(sequences, desc="{:<10}".format("prescan"))
    ]
    # Convert each record to a plain string once instead of once per pair.
    oriented = [str(record.seq) for record in adjusted_sequences]

    for i in tqdm(range(n), desc="{:<10}".format("align")):
        for j in range(i + 1, n):
            d = edlib.align(oriented[i], oriented[j], task="distance",
                            mode="NW")["editDistance"]
            dists[i, j] = d
            dists[j, i] = d
    return dists
Exemplo n.º 29
0
def find_barcode_locations(center, barcodes, primer_max_ed):
    """Find barcodes in a center using edlib.

    Args:
        center: sequence that is searched.
        barcodes: mapping from primer accession to primer sequence; primers
            may contain IUPAC ambiguity codes.
        primer_max_ed: maximum edit distance for a hit.

    Returns:
        List of ``(accession, start, end, edit_distance)`` tuples, one per
        primer that was found, taken from its first reported location.
    """
    # Bases matched by each primer symbol, so that edlib understands IUPAC
    # codes. Equivalent to:
    #   from Bio.Data import IUPACData
    #   [(i, k) for i, j in IUPACData.ambiguous_dna_values.items() for k in j]
    symbol_bases = (
        ('A', 'A'), ('C', 'C'), ('G', 'G'), ('T', 'T'),
        ('M', 'AC'), ('R', 'AG'), ('W', 'AT'), ('S', 'CG'),
        ('Y', 'CT'), ('K', 'GT'), ('V', 'ACG'), ('H', 'ACT'),
        ('D', 'AGT'), ('B', 'CGT'), ('X', 'GATC'), ('N', 'GATC'),
    )
    IUPAC_map = [(symbol, base)
                 for symbol, bases in symbol_bases for base in bases]

    hits = []
    for primer_acc, primer_seq in barcodes.items():
        outcome = edlib.align(primer_seq, center,
                              mode="HW", task="locations", k=primer_max_ed,
                              additionalEqualities=IUPAC_map)
        ed, locations = outcome["editDistance"], outcome["locations"]
        print(locations, ed)
        if not locations:
            continue
        first = locations[0]
        hits.append((primer_acc, first[0], first[1], ed))
    return hits
Exemplo n.º 30
0
def edist(lst):
    """Return the global edit distance between ``lst[0]`` and ``lst[1]``.

    Both items are converted with ``str``. If either one is empty the
    sentinel 100500 is returned without calling edlib.
    """
    query = str(lst[0])
    if not query:
        return 100500
    target = str(lst[1])
    if not target:
        return 100500
    return edlib.align(query, target, mode="NW")["editDistance"]
Exemplo n.º 31
0
def correct_basecalled(bucketed_basecall, reference, nedit_tol=0.2):
    """Redistribute the reference sequence over the buckets of a basecall.

    The buckets are concatenated and aligned to ``reference`` with edlib.
    Walking the alignment, each reference character is appended to the bucket
    that the currently aligned basecalled character came from.

    Args:
        bucketed_basecall: sequence of strings, one per bucket.
        reference: reference sequence.
        nedit_tol: maximum allowed edit distance divided by the reference
            length.

    Returns:
        List of strings, one per input bucket.

    Raises:
        TooLargeEditDistance: the normalised edit distance exceeds
            ``nedit_tol``.
    """
    basecalled = "".join(bucketed_basecall)
    # origin[k] is the index of the bucket that basecalled[k] came from.
    origin = np.zeros(len(basecalled), dtype=np.int32)
    idx = 0
    for i, b in enumerate(bucketed_basecall):
        origin[idx:idx + len(b)] = i
        idx += len(b)

    result_set = edlib.align(basecalled, reference, task="path")
    nedit = result_set['editDistance'] / len(reference)
    if nedit > nedit_tol:
        raise TooLargeEditDistance(
            "Normalized edit distance is large...%.3f" % nedit
        )

    result = ["" for _ in bucketed_basecall]
    idx_ref = 0
    idx_bcalled = 0

    # NOTE(review): breakCigar is assumed to yield (count, operation) pairs and
    # the CIGAR_* names to be collections of operation characters defined
    # elsewhere in the module - confirm.
    for num, op in breakCigar(result_set['cigar']):
        for _ in range(num):
            if op in CIGAR_MATCH_MISSMATCH:
                # Aligned column: the reference base goes to the current bucket.
                result[origin[idx_bcalled]] += reference[idx_ref]
                # Clamped so the index stays valid for a later lookup.
                idx_bcalled = min(idx_bcalled + 1, len(basecalled) - 1)
                idx_ref += 1
            elif op in CIGAR_INSERTION:
                # Basecalled character without a reference counterpart: skip it.
                idx_bcalled += 1
            elif op in CIGAR_DELETION:
                # Reference base without a basecalled counterpart: credit it to
                # the bucket of the current basecalled position.
                result[origin[idx_bcalled]] += reference[idx_ref]
                idx_ref += 1
    return result
Exemplo n.º 32
0
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` by k-mer matching and score it with edlib.

    An 8-mer lookup of the target is built through ``kup``; the best matching
    range of the query is located and, when it spans more than 100 query
    bases, the two ranges are globally aligned with edlib.

    Returns:
        A list that is empty or holds one tuple
        ``(q_id, 0, s1, e1, len(q_seq), s2, e2, len(t_seq), delta_len, idt,
        cov)``, where ``idt`` is the fraction of the query range not covered
        by edits and ``cov`` the fraction of the query inside the range.
    """
    aln_data = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    # try/finally keeps the kup buffers from leaking when a later step
    # raises; the release order is the same as on the normal path.
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)

        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            kmer_match = kmer_match_ptr[0]

            if kmer_match.count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2

                    # Argument order now follows the labels of the message:
                    # len1 is the query length, followed by (e1 - s1).
                    range_info = (s1, e1, len(q_seq), e1 - s1,
                                  s2, e2, e2 - s2, len(t_seq))

                    log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
                        *range_info))

                    if e1 - s1 > 100:
                        log('Calling edlib.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
                            *range_info))

                        # Align using Edlib instead of DWA.
                        edlib_result = edlib.align(q_seq[s1:e1], seq0[s2:e2],
                                                   mode="NW")

                        delta_l = len(q_seq) - len(t_seq)
                        cov = float(e1 - s1) / float(len(q_seq))
                        idt = (float(e1 - s1 - edlib_result['editDistance'])
                               / float(e1 - s1))

                        aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2,
                                         len(seq0), delta_l, idt, cov))
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data
Exemplo n.º 33
0
except:
  print("ERROR: PLEASE RUN pip install edlib")
  sys.exit(1)

# read_length is the 2nd argument; default 150 bp.
# Only a missing or non-numeric argument falls back to the default.
try:
  read_length = int(sys.argv[2])
except (IndexError, ValueError):
  read_length = 150

# Maximum number of mismatches is the 3rd argument; default 3
try:
  mismatch = int(sys.argv[3])
except (IndexError, ValueError):
  mismatch = 3

# Optional 4th argument: 'contigs' prints matching records as FASTA, anything
# else (or nothing) prints only their ids.
print_contigs = len(sys.argv) > 4 and sys.argv[4] == 'contigs'

# FASTA file is the 1st argument
for rec in SeqIO.parse(sys.argv[1], 'fasta'):

  start = str(rec.seq[:read_length])
  # Clamp at 0: for a record shorter than read_length a negative start index
  # would select only part of the sequence instead of all of it.
  end = str(rec.seq[max(len(rec.seq) - read_length, 0):])

  # A record whose two ends are (nearly) identical is reported.
  if edlib.align(start, end)['editDistance'] <= mismatch:
    if print_contigs:
      print(">" + str(rec.id))
      print(str(rec.seq))
    else:
      print(rec.id)
Exemplo n.º 34
0
# Benchmark: average time per call of edlib, editdistance and Levenshtein on
# prefixes of two sequences, each read from the first line of its file.
with open('../../test_data/Enterobacteria_Phage_1/mutated_90_perc_oneline.fasta', 'r') as f:
    queryFull = f.readline()
print('Read query: ', len(queryFull) ,' characters.')

with open('../../test_data/Enterobacteria_Phage_1/Enterobacteria_phage_1_oneline.fa', 'r') as f:
    targetFull = f.readline()
print('Read target: ', len(targetFull) ,' characters.')


def _seconds_per_call(stmt, runs):
    """Average time of one evaluation of ``stmt`` in the module's globals."""
    return timeit.timeit(stmt=stmt, number=runs, globals=globals()) / runs


for seqLen in (30, 100, 1000, 10000, 50000):
    query, target = queryFull[:seqLen], targetFull[:seqLen]
    # Fewer repetitions for longer inputs, but at least one.
    numRuns = max(1000000000 // (seqLen**2), 1)

    print('Sequence length: ', seqLen)

    edlibTime = _seconds_per_call("edlib.align(query, target)", numRuns)
    print('Edlib: ', edlibTime)
    print(edlib.align(query, target))

    editdistanceTime = _seconds_per_call("editdistance.eval(query, target)",
                                         numRuns)
    print('editdistance: ', editdistanceTime)

    levenshteinTime = _seconds_per_call("Levenshtein.distance(query, target)",
                                        numRuns)
    print('levenshtein: ', levenshteinTime)

    print('edlib is %f times faster than editdistance.' % (editdistanceTime / edlibTime))
    print('edlib is %f times faster than Levenshtein.' % (levenshteinTime / edlibTime))
Exemplo n.º 35
0
import sys
import edlib

# Smoke test: each entry pairs an alignment result with the edit distance it
# must report (text input, bytes input, and infix search with extra
# equalities). The exit status is non-zero when any check fails.
checks = [
    (edlib.align("telephone", "elephant"), 3),
    (edlib.align(b"telephone", b"elephant"), 3),
    (edlib.align("ACTG", "CACTRT", mode="HW", task="path", additionalEqualities=[("R", "A"), ("R", "G")]), 0),
]

testFailed = any(
    not (result and result["editDistance"] == expected)
    for result, expected in checks
)

print("Some of the tests failed!" if testFailed else "All tests passed!")

sys.exit(testFailed)