def createMapping(set_a, set_b):
    """Pair every entry of ``set_a`` with its closest entry in ``set_b``.

    Each entry is split on whitespace and must consist of exactly two words
    (checked with ``assert`` whenever a pair is actually compared).  The
    distance between two entries is the smaller of the summed edit distances
    for the "same order" and the "swapped order" word pairings.

    Returns a dict keyed by the entries of ``set_a``.  The value is the best
    entry of ``set_b``, or ``None`` when the best distance is above 4 (this
    includes the case where ``set_b`` is empty); rejected pairs are printed.
    """
    def _dist(left, right):
        return edlib.align(left, right)['editDistance']

    mapping = {}
    for entry_a in set_a:
        words_a = entry_a.split()
        best_entry, best_dist = None, 1e4
        for entry_b in set_b:
            words_b = entry_b.split()
            assert len(words_a) == 2
            assert len(words_b) == 2
            same_order = _dist(words_a[0], words_b[0]) + _dist(words_a[1], words_b[1])
            swapped = _dist(words_a[0], words_b[1]) + _dist(words_a[1], words_b[0])
            dist = min(same_order, swapped)
            # Strict comparison: on ties the earliest candidate wins.
            if dist < best_dist:
                best_entry, best_dist = entry_b, dist
        if best_dist > 4:
            mapping[entry_a] = None
            print(entry_a, best_entry, best_dist)
        else:
            mapping[entry_a] = best_entry
    return mapping
def revprimerStrip(file):
    """Trim primers from a FASTQ file and split reads into good/bad files.

    For every record, ``args.rev_primer`` is searched in the read (edlib
    infix mode, at most ``args.primer_mismatch`` edits).  When it is found,
    everything up to and including the match is removed, and the remainder is
    searched for ``RevForPrimer``; if that is found too, the read is cut at
    the start of that second match.  Such reads go to ``<base>.good`` inside
    ``tmpdir``.  Reads in which the first primer is not found are written
    unchanged to ``<base>.bad``.

    :param file: path of the input FASTQ file; the output base name is the
        part of its file name before the first dot.
    """
    base = os.path.basename(file).split('.')[0]
    goodseq = os.path.join(tmpdir, base + '.good')
    badseq = os.path.join(tmpdir, base + '.bad')
    # The input handle is managed by the same ``with`` statement so that it is
    # closed deterministically (it used to be opened inline and never closed).
    with open(goodseq, 'w') as good, open(badseq, 'w') as bad, \
            open(file) as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            foralign = edlib.align(args.rev_primer, seq, mode="HW",
                                   k=args.primer_mismatch,
                                   additionalEqualities=amptklib.degenNuc)
            # edlib reports editDistance == -1 when nothing fits within k.
            if foralign["editDistance"] >= 0:
                ForCutPos = foralign["locations"][0][1] + 1
                Seq = seq[ForCutPos:]
                Qual = qual[ForCutPos:]
                # align reverse
                revalign = edlib.align(RevForPrimer, Seq, mode="HW",
                                       task="locations",
                                       k=args.primer_mismatch,
                                       additionalEqualities=amptklib.degenNuc)
                if revalign["editDistance"] >= 0:
                    RevCutPos = revalign["locations"][0][0]
                    Seq = Seq[:RevCutPos]
                    Qual = Qual[:RevCutPos]
                good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
            else:
                bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
def remove_title_page(txt_file_without_tp, txt_file_with_tp, num_lines, out_dir):
    """Write a copy of ``txt_file_with_tp`` with its leading title page removed.

    The first ``num_lines`` lines of the file without a title page are
    compared (lower-cased, by edit distance) against the window
    ``lines_with[i:num_lines]`` for growing ``i``.  The distance is expected
    to shrink while leading lines are dropped; the first ``i`` at which it
    grows again marks the cut, and everything from line ``i - 1`` onwards is
    written to the output file (UTF-8).

    If the distance never grows, a warning is issued and the input file is
    copied unchanged.

    :param txt_file_without_tp: open text file object of the reference text.
    :param txt_file_with_tp: open text file object (its ``name`` is used to
        derive the output path via ``out_file_name``).
    :param num_lines: number of leading lines to take into account.
    :param out_dir: output directory passed to ``out_file_name``.
    """
    result = None

    lines_without = txt_file_without_tp.readlines()
    lines_with = txt_file_with_tp.readlines()

    # The reference text does not depend on the loop variable: build it once.
    without_txt = ''.join(lines_without[:num_lines]).lower()
    with_txt = ''.join(lines_with[:num_lines]).lower()
    prev_ld = edlib.align(without_txt, with_txt)['editDistance']

    # i == 0 would repeat the baseline alignment above (equal distance, no
    # state change), so the scan starts at 1.
    for i in range(1, num_lines):
        with_txt = ''.join(lines_with[i:num_lines]).lower()
        ld = edlib.align(without_txt, with_txt)['editDistance']
        if ld > prev_ld:
            result = ''.join(lines_with[i - 1:])
            break
        elif ld < prev_ld:
            prev_ld = ld

    if result is None:
        warnings.warn('No title page found')
        out_file = out_file_name(out_dir, txt_file_with_tp.name)
        shutil.copy2(txt_file_with_tp.name, out_file)
    else:
        out_file = out_file_name(out_dir, txt_file_with_tp.name)
        with codecs.open(out_file, 'w', encoding='utf8') as f:
            f.write(result)
def check_library_barcode(b1: str, b2: str) -> bool:
    """Check both barcodes against the expected library barcode sequences.

    ``b1`` is globally aligned to ``_LIB5_SEQ`` and ``b2`` to ``_LIB3_SEQ``;
    the result is True only when both edit distances are at most 3.
    Only some libraries carry a library barcode.
    """
    def _global_distance(barcode: str, expected: str) -> int:
        hit = edlib.align(barcode, expected, mode="NW", task="distance")
        return hit["editDistance"]

    first_dist = _global_distance(b1, _LIB5_SEQ)
    second_dist = _global_distance(b2, _LIB3_SEQ)
    return not (first_dist > 3 or second_dist > 3)
def detect_read_strand(read_5p_seq, upstream_context_fwd, upstream_context_rev):
    """Guess the strand of a read from the context found at its 5' end.

    Both context sequences are searched inside ``read_5p_seq`` (edlib infix
    mode, at most 5 edits).  The read is called ``"+"`` only when the forward
    context matches strictly better than the reverse context; otherwise
    ``"-"`` is returned (including ties and the case where neither matches).

    :returns: ``"+"`` or ``"-"``.
    """
    def _bounded_distance(context):
        dist = edlib.align(context, read_5p_seq, mode="HW", k=5)["editDistance"]
        # edlib returns -1 when no alignment exists within k.  Treat that as
        # "infinitely far": comparing the raw -1 would make a missing forward
        # hit beat any real reverse hit (and a real forward hit lose against
        # a missing reverse hit).
        return float("inf") if dist < 0 else dist

    dist_fwd = _bounded_distance(upstream_context_fwd)
    dist_rev = _bounded_distance(upstream_context_rev)
    return "+" if dist_fwd < dist_rev else "-"
def sequences_overlap(query, targets, min_match=30, check_revcom=True):
    """Tell whether ``query`` overlaps any of the sequences in ``targets``.

    For each target the query is aligned in infix ('HW') mode and the value
    of ``cigar_to_max_operation`` for the resulting CIGAR is compared with
    ``min_match``; a value strictly above it counts as an overlap.  With
    ``check_revcom`` the reverse complement of the query is tried as well.
    """
    def _max_operation(seq, target):
        return cigar_to_max_operation(align(seq, target, 'HW', 'path')['cigar'])

    for target in targets:
        if _max_operation(query, target) > min_match:
            return True
        if check_revcom and _max_operation(revcom(query), target) > min_match:
            return True
    return False
def get_direction(query, ref):
    """Return ``query`` in the orientation that is closest to ``ref``.

    The record and its reverse complement are both globally aligned to
    ``ref`` (comparing their ``.seq`` as strings).  The original record is
    returned only when it is strictly closer; on a tie the reverse
    complement is returned.
    """
    reference = str(ref.seq)

    def _global_distance(record):
        hit = edlib.align(str(record.seq), reference, task="distance", mode="NW")
        return hit["editDistance"]

    flipped = query.reverse_complement()
    if _global_distance(query) < _global_distance(flipped):
        return query
    return flipped
def compare_polished_sequences(self, final_sequences):
    """Write pairwise comparisons of consecutive polishing results.

    Creates ``report.txt`` in ``self.params.outdir``.  For every ``i`` in
    ``1 .. num_iters - 1`` the sequences ``i`` and ``i + 1`` are aligned with
    edlib twice: as they are, and after ``compress_homopolymer``.  Each raw
    edlib result is written below a one-line heading.
    """
    report_path = os.path.join(self.params.outdir, 'report.txt')
    with open(report_path, 'w') as report:
        for idx in range(1, self.params.num_iters):
            current = final_sequences[idx]
            following = final_sequences[idx + 1]

            plain = edlib.align(current, following)
            print(f'Alignment polishing seq {idx} vs {idx+1}:', plain,
                  sep='\n', file=report)

            compressed = edlib.align(compress_homopolymer(current),
                                     compress_homopolymer(following))
            print(f'Alignment homopolymer compressed polishing seq {idx} vs {idx+1}:',
                  compressed, sep='\n', file=report)
def create_synteny_matrix_mul(self, gene_seq, g1, g2, n):
    """Build two ``(n, n, 2)`` similarity matrices for two gene neighbourhoods.

    ``g1`` and ``g2`` are sequences of gene names; the placeholder
    ``"NULL_GENE"`` is skipped.  For every remaining pair ``(g1[i], g2[j])``:

    * ``sm[i][j]`` holds the global edit distance (edlib) between the two
      gene sequences, against the second sequence as is (index 0) and
      reversed (index 1);
    * ``sml[i][j]`` holds the score returned by ``local_pairwise_align_ssw``
      for the same two comparisons.

    All values are divided by the length of the longer sequence.  Two
    all-zero matrices are returned instead when a gene cannot be looked up in
    ``gene_seq``, when a looked-up sequence is empty, or when any alignment
    step raises.
    """
    null_gene = "NULL_GENE"

    def _blank_pair():
        return np.zeros((n, n, 2)), np.zeros((n, n, 2))

    # Every named gene must be retrievable before any alignment is attempted.
    for genes in (g1, g2):
        for gene in genes:
            if gene == null_gene:
                continue
            try:
                gene_seq[gene]
            except BaseException:
                return _blank_pair()

    sm, sml = _blank_pair()
    for i in range(n):
        if g1[i] == null_gene:
            continue
        seq_a = gene_seq[g1[i]]
        if seq_a == "":
            return _blank_pair()
        for j in range(n):
            if g2[j] == null_gene:
                continue
            seq_b = gene_seq[g2[j]]
            if seq_b == "":
                return _blank_pair()
            norm_len = max(len(seq_a), len(seq_b))
            try:
                forward = ed.align(seq_a, seq_b, mode="NW", task="distance")
                sm[i][j][0] = forward["editDistance"] / norm_len
                backward = ed.align(seq_a, seq_b[::-1], mode="NW", task="distance")
                sm[i][j][1] = backward["editDistance"] / norm_len
                _, local_score, _ = local_pairwise_align_ssw(DNA(seq_a), DNA(seq_b))
                sml[i][j][0] = local_score / norm_len
                _, local_score, _ = local_pairwise_align_ssw(DNA(seq_a), DNA(seq_b[::-1]))
                sml[i][j][1] = local_score / norm_len
            except BaseException:
                return _blank_pair()
    return sm, sml
def replace_word(self, word, normalize=True):
    """Map ``word`` onto the index of its closest word in the limited vocabulary.

    Every entry of ``self.limited_vocab`` is aligned to ``word`` with edlib's
    ``align`` (used instead of a Levenshtein implementation for speed).  With
    ``normalize`` a non-zero edit distance is divided by the
    ``alphabetLength`` reported for that alignment.  The position of the
    smallest score is looked up in ``self.idx2word`` and ``word`` is given the
    same ``self.word2idx`` entry as that replacement word.
    """
    def _normalized_score(vocab_word):
        alignment = align(vocab_word, word)
        distance = alignment['editDistance']
        if distance == 0:
            return distance
        return distance / alignment['alphabetLength']

    if normalize:
        scores = np.array([[_normalized_score(entry) for entry in self.limited_vocab]])
    else:
        scores = np.array([align(entry, word)['editDistance']
                           for entry in self.limited_vocab])

    closest = self.idx2word[np.argmin(scores)]
    self.word2idx[word] = self.word2idx[closest]
def edist(lst):
    """Return the global edit distance between ``lst[0]`` and ``lst[1]``.

    Both items are converted with ``str``.  If one of them is empty the
    length of the other one is returned without aligning.  Otherwise the
    sequences are aligned globally (edlib ``NW`` mode) with nucleotide
    ambiguity codes counted as matches: ``U`` equals ``T`` and each IUPAC
    code (R, Y, K, M, S, W, B, D, H, V, N) equals the bases it stands for.

    :param lst: indexable holding the two sequences at positions 0 and 1.
    :returns: the edit distance as an int.
    """
    first, second = str(lst[0]), str(lst[1])
    if len(first) == 0:
        return len(second)
    if len(second) == 0:
        return len(first)

    iupac_equalities = [
        ('U', 'T'),
        ('R', 'A'), ('R', 'G'),
        ('Y', 'C'), ('Y', 'T'), ('Y', 'U'),
        ('K', 'G'), ('K', 'T'), ('K', 'U'),
        ('M', 'A'), ('M', 'C'),
        ('S', 'C'), ('S', 'G'),
        ('W', 'A'), ('W', 'T'), ('W', 'U'),
        ('B', 'C'), ('B', 'G'), ('B', 'T'), ('B', 'U'),
        ('D', 'A'), ('D', 'G'), ('D', 'T'), ('D', 'U'),
        ('H', 'A'), ('H', 'C'), ('H', 'T'), ('H', 'U'),
        ('V', 'A'), ('V', 'C'), ('V', 'G'),
        # N stands for any base.  The table used to list ('N', 'C') twice and
        # had no ('N', 'A'), so N against A was counted as a mismatch.
        ('N', 'A'), ('N', 'C'), ('N', 'G'), ('N', 'T'), ('N', 'U'),
    ]
    result = edlib.align(first, second, mode="NW",
                         additionalEqualities=iupac_equalities)
    return result["editDistance"]
def identity_test(self, target_identity, read_length, error_model, qscore_model):
    # Simulates self.trials reads from random fragments and asserts that the
    # identity of each read, and the mean identity over all reads, is within
    # a tolerance of target_identity.
    #
    # The tolerances scale with the expected error rate (1 - identity):
    # self.read_delta applies to individual reads, self.mean_delta to the mean.
    target_errors = 1.0 - target_identity
    read_delta = self.read_delta * target_errors
    mean_delta = self.mean_delta * target_errors
    if VERBOSE:
        print(f'\nRead length: {read_length}, target identity: {target_identity}')
        print(f' allowed error per read: {read_delta:.4f}')
        print(f' allowed error in mean: {mean_delta:.4f}')
        print(' identities: ', end='')
    read_identities = []
    for i in range(self.trials):
        frag = badread.misc.get_random_sequence(read_length)
        seq, qual, _, _ = badread.simulate.sequence_fragment(frag, target_identity,
                                                             error_model, qscore_model)
        # Identity is measured from the edlib alignment of the original
        # fragment against the simulated read.
        cigar = edlib.align(frag, seq, task='path')['cigar']
        read_identity = badread.misc.identity_from_edlib_cigar(cigar)
        read_identities.append(read_identity)
        if VERBOSE:
            # A line break is emitted after every 20th value.
            print('{:.4f}'.format(read_identity), flush=True,
                  end='\n ' if (i+1) % 20 == 0 else ' ')
        self.assertAlmostEqual(read_identity, target_identity, delta=read_delta)
    mean_identity = statistics.mean(read_identities)
    if VERBOSE:
        print('\r' if self.trials % 20 == 0 else '\n', end='')
        print(f' mean: {mean_identity:.4f}')
    self.assertAlmostEqual(mean_identity, target_identity, delta=mean_delta)
    if VERBOSE:
        print(' PASS')
def get_aligement(self, fast5_path, ref_path, verbose, fasta_out_dir=None, ref=None):
    # Basecalls one fast5 file, aligns the result to the reference read from
    # ref_path and returns alignment statistics.
    #
    # Returns a 4-tuple:
    #   nedit      - edit distance divided by the length of the target
    #   acc        - number of '=' positions divided by the total CIGAR length
    #   length     - length of the basecalled sequence
    #   cigar_stat - defaultdict mapping each CIGAR operation to its total count
    t = perf_counter()
    fasta_out = None
    if fasta_out_dir is not None:
        # Output FASTA is named after the fast5 file (directory and extension
        # stripped; the path is split on '/').
        fasta_out = os.path.join(fasta_out_dir,
                                 os.path.splitext(fast5_path)[0].split('/')[-1] + ".fasta")
    basecalled = self.basecall_sample(fast5_path, fasta_out=fasta_out, ref=ref)
    with open(ref_path) as f:
        # Only the last line of the reference file is used as the target.
        # NOTE(review): readlines() keeps a trailing newline if the file ends
        # with one, which would then take part in the alignment - confirm.
        target = f.readlines()[-1]
    if verbose:
        print("Basecalled:\n", basecalled)
        print("Target\n", target)
    self.logger.debug("fast5_path %s, ref_path %s", fast5_path, ref_path)
    result = edlib.align(basecalled, target, task='path')
    cigar_pairs = butils.cigar_str_to_pairs(result['cigar'])
    self.logger.debug("\nBasecalled: %s\nTarget : %s",
                      butils.query_align_string(basecalled, cigar_pairs),
                      butils.reference_align_string(target, cigar_pairs)
                      )
    self.logger.debug("extCigar %s", result['cigar'])
    self.logger.debug("Whole time %.3f", perf_counter() - t)
    # Tally how many alignment columns each CIGAR operation accounts for.
    cigar_stat = defaultdict(int)
    total = 0
    for num, op in breakCigar(result['cigar']):
        total += num
        cigar_stat[op] += num
    # NOTE(review): raises ZeroDivisionError if the CIGAR yields no
    # operations or the target is empty.
    acc = cigar_stat['='] / total
    nedit = result['editDistance'] / len(target)
    return nedit, acc, len(basecalled), cigar_stat
def custom_sequencing_adapter_check(self, r1_seq):
    ''' Check for custom sequencing adapter on r1

    Every adapter in ``self.custom_seq_adapter`` is aligned as a prefix
    (edlib ``SHW`` mode) to the first ``len(adapter) + 3`` bases of the read,
    and the adapter with the lowest score is kept (the first one wins ties).

    :param bytes r1_seq: R1 sequence
    :rtype int
    :returns end pos of adapter, -1 if not found (best score not below 0.18,
             or no adapters configured)
    '''
    # NOTE: Will check multiple adapters for all reads
    # Should the adapter be identified first and only that sequence used going forward ?
    # Maybe add this logic in the trim_custom_sequencing_adapter function.
    best_alignment = None
    best_score = None
    for adapter in self.custom_seq_adapter:
        alignment = edlib.align(adapter, r1_seq[0:len(adapter) + 3],
                                mode="SHW", task="locations")
        # NOTE(review): the divisor is the size of ``custom_seq_adapter``
        # (what the loop iterates over), not ``len(adapter)``.  Kept as is;
        # confirm whether a per-adapter length was intended.
        score = float(alignment["editDistance"]) / len(self.custom_seq_adapter)
        if best_score is None or score < best_score:
            best_alignment = alignment
            best_score = score

    # With no adapters there is nothing to compare; previously this fell
    # through to ``None < 0.18`` and raised a TypeError.
    if best_score is None:
        return -1
    if best_score < 0.18:
        return best_alignment["locations"][-1][1]
    return -1
def _cutr(reads, adp, th, r, len_list=[]):
    # Searches the adapter `adp` in the last `r` bases of every read and, when
    # the identity of the hit exceeds `th`, truncates the read IN PLACE at the
    # start of the hit (read[1], and read[2] too when the reads have more than
    # two fields).
    #
    # Returns (iden_max, match_num, cut_pos):
    #   iden_max  - best identity among the trimmed reads (-1 if none)
    #   match_num - number of reads that were trimmed
    #   cut_pos   - for each trimmed read, the number of bases removed
    #
    # NOTE(review): `len_list` is a mutable default, and read lengths are only
    # appended when the list passed in is already non-empty - confirm intent.
    iden_max = -1
    match_num = 0
    cut_pos = []
    skip_num = 0
    # Extracts the run lengths from a CIGAR string.
    repat = re.compile(r'(\d+)[DHIMNPSX=]{1}')
    if len(reads[0]) > 2:
        has_qual = True
    else:
        has_qual = False
    for read in reads:
        read_length = len(read[1])
        if not len_list:
            pass
        else:
            len_list.append(read_length)
        # Reads shorter than twice the search window are left untouched.
        if read_length < 2 * r:
            #logger.debug("%s is too short: %d bp. Skip." % (read[0], read_length))
            skip_num += 1
            continue
        result = edlib.align(adp, read[1][-r:], mode="HW", task='path')
        # identity = 1 - edit distance / alignment length (sum of CIGAR runs)
        identity = 1.0 - float(result['editDistance'] / np.sum(
            [int(i) for i in repat.findall(result['cigar'])]))
        if identity > th:
            # Convert the hit start inside the window to a read coordinate.
            start = len(read[1]) - r + result['locations'][0][0]
            cut_pos.append(r - result['locations'][0][0])
            match_num += 1
            if identity > iden_max:
                iden_max = identity
            read[1] = read[1][:start]
            if has_qual:
                read[2] = read[2][:start]
    logger.info("%d reads were skipped due to their short lengths." % skip_num)
    return (iden_max, match_num, cut_pos)
def get_pairwise_alignments(seqs):
    """Globally align every pair of sequences and log identity statistics.

    ``seqs`` maps sequence names to sequences.  Each unordered pair is
    aligned once with edlib (``NW`` mode, with path) and reported in a
    column-aligned log line.

    Returns three dicts: the CIGAR keyed by ``(a, b)`` in iteration order
    only, and the percent identity and max indel keyed by both ``(a, b)``
    and ``(b, a)``.
    """
    section_header('Pairwise global alignments')
    explanation(
        'Trycycler uses the edlib aligner to get global alignments between all pairs of '
        'sequences. This can help you to spot any problematic sequences that should be '
        'excluded before continuing. If you see any sequences with notably worse '
        'identities or max indels, you can remove them (delete the contig\'s FASTA) and '
        'run this command again.')

    seq_names = list(seqs.keys())
    name_width = max(len(name) for name in seq_names)
    pairwise_cigars = {}
    percent_identities = {}
    max_indels = {}

    for pos, name_a in enumerate(seq_names):
        seq_a = seqs[name_a]
        for name_b in seq_names[pos + 1:]:
            seq_b = seqs[name_b]
            log(name_a.rjust(name_width), end='')
            log(' vs ', end='')
            log((name_b + '...').ljust(name_width + 3), end=' ')

            cigar = edlib.align(seq_a, seq_b, mode="NW", task="path")['cigar']
            percent_identity, max_indel = identity_and_max_indel_from_cigar(cigar)
            log(f'{percent_identity:.2f}% identity, max indel = {max_indel}')

            pairwise_cigars[(name_a, name_b)] = cigar
            for key in ((name_a, name_b), (name_b, name_a)):
                percent_identities[key] = percent_identity
            for key in ((name_a, name_b), (name_b, name_a)):
                max_indels[key] = max_indel
    log()
    return pairwise_cigars, percent_identities, max_indels
def get_global_aln_results(ref_seq, query_seq, min_seq_len):
    """
    Globally align ``query_seq`` to ``ref_seq`` and return
    ``(delta_len, idt, cov)`` in the form the legacy deduplication code
    expects.

    ``delta_len`` is the query length minus the reference length.  When
    either sequence is shorter than ``min_seq_len`` no alignment is done and
    identity and coverage are both 0.0.  Otherwise identity is the lower of
    the query-based and reference-based identities and coverage is 1.0.

    Currently unused - it was used in an intermediate version, and might be
    useful at some point in the future.
    """
    ref_len, query_len = len(ref_seq), len(query_seq)
    log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (ref_len, query_len))

    delta_len = query_len - ref_len
    if min(ref_len, query_len) < min_seq_len:
        return delta_len, 0.0, 0.0

    result = edlib.align(query_seq, ref_seq, mode="NW", task="path")
    num_m, num_i, num_d, total_len = count_cigar_ops(result['cigar'])

    # Matching columns = all counted columns minus the edits.
    num_eq = (num_m + num_i + num_d) - result['editDistance']
    num_x = num_m - num_eq

    aligned = num_eq + num_x
    idt_query = float(num_eq) / float(aligned + num_i)
    idt_ref = float(num_eq) / float(aligned + num_d)
    idt = min(idt_query, idt_ref)
    cov = 1.0

    log(' - Alignment stats: num_m = %d, num_i = %d, num_d = %d, total_len = %d, num_eq = %d, num_x = %d' % (num_m, num_i, num_d, total_len, num_eq, num_x))
    return delta_len, idt, cov
def sequences_match(seq1, seq2, compare='5p_clip', min_length=4):
    """
    Decide whether two sequences are likely the same.

    Only a 10-base window is compared: the first 10 bases for
    ``compare='3p_clip'``, the last 10 otherwise.  Both windows must be
    longer than ``min_length``.  Full 10-base windows match when their edit
    distance is below 2; shorter windows match when one is a prefix
    (``3p_clip``) or suffix (otherwise) of the other.

    >>> s1, s2 = 'AAAAAAAAAA', 'AAAAAAAAATGC'
    >>> sequences_match(s1, s2, compare='3p_clip')
    True
    >>> sequences_match('TTGAAAAAAA', s2, compare='3p_clip')
    False
    >>> sequences_match(s1[::-1], s2[::-1], compare='5p_clip')
    True
    >>> sequences_match('A', 'A')
    False
    """
    anchored_at_start = compare == '3p_clip'
    if anchored_at_start:
        seq1, seq2 = seq1[:10], seq2[:10]
    else:
        seq1, seq2 = seq1[-10:], seq2[-10:]

    if len(seq1) <= min_length or len(seq2) <= min_length:
        return False

    if len(seq1) >= 10 and len(seq2) >= 10:
        return align(seq1, seq2)['editDistance'] < 2

    if anchored_at_start:
        return seq1.startswith(seq2) or seq2.startswith(seq1)
    return seq1.endswith(seq2) or seq2.endswith(seq1)
def print_alignment(s1, s2, k=None, mode='NW', width=60):
    """Print the edlib alignment of ``s1`` against ``s2`` in blocks of ``width``.

    Each block consists of three rows: the aligned ``s1``, a status row
    (``|`` equal, ``-`` gap in either row, ``X`` mismatch) and the aligned
    ``s2``, followed by an empty line.

    :param k: maximum edit distance passed to edlib; defaults to the length
        of the longer input.
    :param mode: edlib alignment mode.
    :param width: number of alignment columns per printed block.
    """
    if k is None:
        k = max(len(s1), len(s2))
    alignment = edlib.align(s1, s2, task='path', k=k, mode=mode)
    cigar = alignment['cigar']
    # The target is cut at the start of the first reported location.
    st, _ = alignment['locations'][0]
    _, _, a1, a2 = parse_cigar(cigar, s1, s2[st:])

    def _column_status(c1, c2):
        if c1 == c2:
            return '|'
        if c1 == '-' or c2 == '-':
            return '-'
        return 'X'

    status = ''.join(_column_status(c1, c2) for c1, c2 in zip(a1, a2))
    a1 = ''.join(a1)
    a2 = ''.join(a2)

    # Slice at fixed offsets rather than using textwrap.wrap: wrap() may
    # break after a '-' that sits between letters, so the three rows could
    # be split at different columns and no longer line up.
    for offset in range(0, len(status), width):
        print(a1[offset:offset + width])
        print(status[offset:offset + width])
        print(a2[offset:offset + width])
        print("")
def min_ed(max_insertion, q_ins):
    """Pad ``q_ins`` with ``-`` so that it lines up with ``max_insertion``.

    The two strings are aligned globally with edlib.  If the alignment
    contains a ``D`` operation the empty string is returned, because
    ``max_insertion`` must not be altered.  Otherwise the characters of
    ``q_ins`` are copied for every ``=``/``X`` run and ``-`` is emitted for
    every ``I`` run.
    """
    cigar = edlib.align(max_insertion, q_ins, task="path", mode="NW")["cigar"]

    # do not allow deletions in max_insertion: because we do not want to alter this sequence
    if "D" in cigar:
        return ""

    pieces = []
    consumed = 0
    for run_length, operation in re.findall(r'(\d+)([=DXSMI])', cigar):
        run_length = int(run_length)
        if operation == "I":
            # nothing of q_ins is consumed here, only padding is added
            pieces.append("-" * run_length)
            continue
        # '=' or 'X': thread in the (mis)matching characters of q_ins
        pieces.append(q_ins[consumed:consumed + run_length])
        consumed += run_length
    return "".join(pieces)
def getRatio(self, stringA=None, stringB=None):
    """
    Return the edit-distance ratio ``1 - distance / max(len(a), len(b))``.

    Without arguments the strings and the edit distance stored on the
    instance are used.  Two explicit strings can be given instead, which is
    handy for comparing substrings (.getRatio(SubS1,Subs2)).  If the
    arguments are not both ``str`` the result is -1.

    >>> ED1 = EditDistance("testABC1", "testABB2")
    >>> ED1.getRatio()
    0.75
    >>> ED1.getRatio("Ala ma kota", "Alan ma psa")
    0.6363636363636364
    >>> ED1.getRatio(1,"999")
    -1
    >>> ED1.getRatio(3987,4789)
    -1
    """
    if stringA is None and stringB is None:
        stringA, stringB = self.string1, self.string2
        distance = self.editDistance
    elif type(stringA) is str and type(stringB) is str:
        distance = edlib.align(stringA, stringB, task="path")["editDistance"]
    else:
        return -1

    longest = max(len(stringA), len(stringB))
    return 1 - (float(distance) / float(longest))
def align(query, pattern_info, max_ed, normalise=False):
    """Locate ``pattern`` in ``query`` and extract the UMI.

    ``pattern_info`` is a 4-tuple ``(pattern, wildcard, equalities, forward)``.
    The pattern is searched in infix mode with at most ``max_ed`` edits.

    Returns ``(None, None)`` when nothing is found.  Otherwise returns
    ``(edit_distance, umi)``: without ``normalise`` the UMI is the matched
    slice of ``query``; with ``normalise`` it is assembled from the target
    characters aligned to wildcard positions of the pattern (``N`` where the
    target has a gap) and must be exactly 16 characters long.

    :raises RuntimeError: if the normalised UMI is not 16 characters long.
    """
    pattern, wildcard, equalities, forward = pattern_info

    result = edlib.align(
        pattern,
        query,
        task="path",
        mode="HW",
        k=max_ed,
        additionalEqualities=equalities,
    )
    ed = result["editDistance"]
    if ed == -1:
        return None, None

    if not normalise:
        first_hit = result["locations"][0]
        return ed, query[first_hit[0]:first_hit[1] + 1]

    # Extract and normalise UMI
    nice = edlib.getNiceAlignment(result, pattern, query)
    umi = "".join(
        "N" if target_char == "-" else target_char
        for pattern_char, target_char in zip(nice["query_aligned"], nice["target_aligned"])
        if pattern_char == wildcard
    )
    if len(umi) != 16:
        raise RuntimeError("UMI length incorrect: {}".format(umi))
    return ed, umi
def edlib_traceback_allow_ends(x, y, mode="NW", task="path", k=1, end_threshold = 0):
    """Align ``x`` to ``y`` and discount short indels at the alignment ends.

    If the CIGAR starts and/or ends with an insertion or deletion run that is
    no longer than ``end_threshold``, the length of that run is subtracted
    from the edit distance.  When edlib returns no CIGAR the edit distance is
    passed through unchanged.

    :returns: ``(edit_distance, locations, cigar)``.
    """
    result = edlib.align(x, y, mode=mode, task=task, k=k)
    ed = result["editDistance"]
    locations = result["locations"]
    cigar = result["cigar"]
    if not cigar:
        return ed, locations, cigar

    runs = [(int(length), op) for length, op in re.findall(r'(\d+)([=DXSMI])', cigar)]
    # First and last run are inspected independently, so a CIGAR consisting
    # of a single run is looked at twice.
    for length, op in (runs[0], runs[-1]):
        if op in ("D", "I") and length <= end_threshold:
            ed -= length
    return ed, locations, cigar
def edist(lst):
    """Return ``(edit_distance, cigar)`` for ``lst[0]`` versus ``lst[1]``.

    Both items are converted with ``str`` and aligned globally with edlib.
    If either of them is empty, ``(-1, "")`` is returned instead.
    """
    first = str(lst[0])
    if not first:
        return -1, ""
    second = str(lst[1])
    if not second:
        return -1, ""

    alignment = edlib.align(first, second, mode="NW", task="path")
    return alignment["editDistance"], alignment["cigar"]
def get_align_index_path(query: Iterable, target: Iterable) -> List[CharAlignToken]:
    """Return one ``CharAlignToken`` per element of ``query``.

    The query is aligned to the target with edlib and the CIGAR is expanded
    with ``expand_cigar_format`` (presumably to one operation per alignment
    column - verify against that helper).  Each token carries the current
    target index and the operation that applies to the query element.
    ``D`` operations only advance the target index.  An empty list is
    returned when edlib reports no CIGAR.
    """
    cigar = edlib.align(query, target, task="path")["cigar"]
    if cigar is None:
        return []

    operations = expand_cigar_format(cigar)
    tokens = []
    target_pos = 0
    op_pos = 0
    for query_pos in range(len(query)):
        # Step over target-only columns first.
        while operations[op_pos] == "D":
            target_pos += 1
            op_pos += 1
        operation = operations[op_pos]
        tokens.append(CharAlignToken(target_pos, operation))
        if operation == "=":
            assert query[query_pos] == target[target_pos]
        if operation in ("=", "X"):
            target_pos += 1
        op_pos += 1
    return tokens
def exec(peptide, time_node):
    # Aligns a single FASTA-formatted peptide against every sequence of the
    # CSV dataset and returns the alignments.
    #
    # peptide:   FASTA text; it is written to a fixed job file and parsed back.
    # time_node: not used in this function.
    #
    # Returns the string "error" when the FASTA text holds no record or more
    # than one record; otherwise {"summary_alignment": [...]} with one dict
    # per dataset row.
    # NOTE(review): the name shadows the builtin `exec`.
    file = open("../src/public/jobs/service3/service3.fasta", "w")
    file.write(peptide)
    file.close()
    fasta = SeqIO.parse("../src/public/jobs/service3/service3.fasta", "fasta")
    if(any(fasta) == False): #False when `fasta` is empty
        return "error"
    # The file is parsed a second time because any() consumed the first parser.
    count = 0
    for record in SeqIO.parse("../src/public/jobs/service3/service3.fasta", "fasta"):
        sequence_input = str(record.seq)
        count = count+1
    print(count)
    print(sequence_input)
    if (count > 1):
        return "error"
    dataset = pd.read_csv("data_values_activity_non_modified.csv")
    dict_response = []
    for i in range(len(dataset)):
        # Infix alignment of the input inside each dataset sequence.
        align_result = edlib.align(sequence_input, dataset['sequence'][i], mode = "HW", task = "path")
        view_alignment = edlib.getNiceAlignment(align_result, sequence_input, dataset['sequence'][i])
        dict_aligment = {"input_sequence":view_alignment['query_aligned'],
                         "space_format":view_alignment['matched_aligned'],
                         "compare_sequence": view_alignment['target_aligned'],
                         "id_sequence" : str(dataset['index_sequence'][i]),
                         'distance_sequences':str(align_result['editDistance'])}
        dict_response.append(dict_aligment)
    dict_data_results = {"summary_alignment":dict_response}
    #export result alignment
    return dict_data_results
def edist_nw(lst):
    """Return the global edit distance between ``lst[0]`` and ``lst[1]``.

    Both items are converted with ``str`` and aligned with edlib in ``NW``
    mode with a cut-off of ``k=500``.

    NOTE(review): for an empty input the tuple ``(-1, "")`` is returned,
    while the normal path returns a bare int - callers have to cope with
    both shapes; confirm which one is intended.
    """
    first = str(lst[0])
    if not first:
        return -1, ""
    second = str(lst[1])
    if not second:
        return -1, ""

    alignment = edlib.align(first, second, mode="NW", k=500)
    return alignment["editDistance"]
def distance_matrix(sequences):
    """
    Build a symmetric matrix of pairwise global edit distances.

    Every record is first oriented with ``get_direction`` relative to the
    first record; the oriented sequences are then aligned pairwise with
    edlib (``NW`` mode).  The diagonal stays zero.  For an empty input the
    (empty) array is returned right away.
    """
    size = len(sequences)
    dists = np.array([np.zeros(size, dtype=int) for _ in range(size)])
    if dists.shape[0] == 0:
        return dists

    anchor = sequences[0]
    oriented = []
    for record in tqdm(sequences, desc="{:<10}".format("prescan")):
        oriented.append(get_direction(record, anchor))

    for row in tqdm(range(len(dists)), desc="{:<10}".format("align")):
        query = str(oriented[row].seq)
        # Only the upper triangle is computed; the result is mirrored.
        for col in range(row + 1, len(dists[row])):
            target = str(oriented[col].seq)
            distance = edlib.align(query, target, task="distance", mode="NW")["editDistance"]
            dists[row][col] = distance
            dists[col][row] = distance
    return dists
def find_barcode_locations(center, barcodes, primer_max_ed):
    """Find barcodes in a center using edlib.

    ``barcodes`` maps an accession to a primer sequence.  Each primer is
    searched in ``center`` (infix mode, at most ``primer_max_ed`` edits,
    IUPAC ambiguity codes accepted).  The locations and edit distance of
    every search are printed.

    Returns a list of ``(accession, start, end, edit_distance)`` tuples, one
    for each primer with at least one location (the first location is used).
    """
    # IUPAC equivalence map so that edlib accepts ambiguity codes in primers.
    # Same pairs, in the same order, as produced by:
    #   from Bio.Data import IUPACData
    #   [(i, k) for i, j in IUPACData.ambiguous_dna_values.items() for k in j]
    ambiguous_bases = (
        ('A', 'A'), ('C', 'C'), ('G', 'G'), ('T', 'T'),
        ('M', 'AC'), ('R', 'AG'), ('W', 'AT'), ('S', 'CG'),
        ('Y', 'CT'), ('K', 'GT'), ('V', 'ACG'), ('H', 'ACT'),
        ('D', 'AGT'), ('B', 'CGT'), ('X', 'GATC'), ('N', 'GATC'),
    )
    IUPAC_map = [(code, base) for code, bases in ambiguous_bases for base in bases]

    hits = []
    for primer_acc, primer_seq in barcodes.items():
        # additionalEqualities=IUPAC_map lets edlib understand IUPAC codes
        outcome = edlib.align(primer_seq, center,
                              mode="HW", task="locations", k=primer_max_ed,
                              additionalEqualities=IUPAC_map)
        ed = outcome["editDistance"]
        locations = outcome["locations"]
        print(locations, ed)
        if not locations:
            continue
        start, end = locations[0][0], locations[0][1]
        hits.append((primer_acc, start, end, ed))
    return hits
def edist(lst):
    """Return the global edit distance between ``lst[0]`` and ``lst[1]``.

    Both items are converted with ``str`` and aligned with edlib in ``NW``
    mode.  If either of them is empty the sentinel value 100500 is returned
    instead of aligning.
    """
    first = str(lst[0])
    if not first:
        return 100500
    second = str(lst[1])
    if not second:
        return 100500

    alignment = edlib.align(first, second, mode="NW")
    return alignment["editDistance"]
def correct_basecalled(bucketed_basecall, reference, nedit_tol=0.2):
    # Redistributes the characters of `reference` over the buckets of
    # `bucketed_basecall`, guided by the alignment of the concatenated
    # basecall to the reference.
    #
    # Returns a list with one string per input bucket.
    # Raises TooLargeEditDistance when edit distance / len(reference) exceeds
    # nedit_tol.
    basecalled = "".join(bucketed_basecall)
    # origin[k] = index of the bucket that basecalled character k came from
    origin = np.zeros(len(basecalled), dtype=np.int32)
    idx = 0
    for i, b in enumerate(bucketed_basecall):
        origin[idx:idx + len(b)] = i
        idx += len(b)
    result_set = edlib.align(basecalled, reference, task="path")
    nedit = result_set['editDistance'] / len(reference)
    if nedit > nedit_tol:
        raise TooLargeEditDistance(
            "Normalized edit distance is large...%.3f" % nedit
        )
    result = ["" for _ in bucketed_basecall]
    idx_ref = 0
    idx_bcalled = 0
    for num, op in breakCigar(result_set['cigar']):
        for _ in range(num):
            if op in CIGAR_MATCH_MISSMATCH:
                # The reference character goes to the bucket of the current
                # basecalled character; the basecall index is clamped to the
                # last valid position.
                result[origin[idx_bcalled]] += reference[idx_ref]
                idx_bcalled = min(idx_bcalled + 1, len(basecalled) - 1)
                idx_ref += 1
            elif op in CIGAR_INSERTION:
                # Basecalled character without a reference counterpart: dropped.
                # NOTE(review): this increment is not clamped like the one above.
                idx_bcalled += 1
            elif op in CIGAR_DELETION:
                # Reference character without a basecalled counterpart: it is
                # attached to the bucket of the current basecalled position.
                result[origin[idx_bcalled]] += reference[idx_ref]
                idx_ref += 1
    return result
def get_aln_data(t_seq, q_seq):
    # Maps q_seq onto t_seq via the k-mer lookup in `kup` and, when the mapped
    # query range is longer than 100, aligns the two mapped ranges with edlib.
    #
    # Returns a list that is empty or holds one tuple:
    #   (q_id, 0, s1, e1, len(q_seq), s2, e2, len(t_seq), delta_l, idt, cov)
    aln_data = []
    K = 8  # k-mer size
    seq0 = t_seq
    # The structures returned by kup.allocate_* / kup.find_* are released
    # explicitly with the matching kup.free_* calls at the end.
    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
    q_id = "dummy"
    kmer_match_ptr = kup.find_kmer_pos_for_seq(
        q_seq, len(q_seq), K, sda_ptr, lk_ptr)
    kmer_match = kmer_match_ptr[0]
    if kmer_match.count != 0:
        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
        aln_range = aln_range_ptr[0]
        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
        # NOTE(review): in both log messages the values passed for "len1" and
        # "(e1 - s1)" are e1 - s1 and len(q_seq), i.e. swapped against the labels.
        log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
            s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
        if e1 - s1 > 100:
            log('Calling edlib.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
            # Align using Edlib instead of DWA.
            edlib_result = edlib.align(q_seq[s1:e1], seq0[s2:e2], mode="NW")
            delta_l = len(q_seq) - len(t_seq)
            # coverage: mapped part of the query relative to the whole query
            cov = float(e1 - s1) / float(len(q_seq))
            # identity: non-edited share of the mapped query range
            idt = float(e1 - s1 - edlib_result['editDistance']) / float(e1 - s1)
            aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2,
                             len(seq0), delta_l, idt, cov))
        kup.free_aln_range(aln_range_ptr)
    kup.free_kmer_match(kmer_match_ptr)
    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data  # , x, y
except: print("ERROR: PLEASE RUN pip install edlib") sys.exit(1) #read_length is 2nd argument; default 150 bp try: read_length = int(sys.argv[2]) except: read_length = 150 # Max # of mismatches is 3rd argument; default 3 try: mismatch = int(sys.argv[3]) except: mismatch = 3 #FASTA file is 1st argument for rec in SeqIO.parse(sys.argv[1], 'fasta'): start = str(rec.seq[:read_length]) end = str(rec.seq[len(rec.seq)-read_length:]) if edlib.align(start,end)['editDistance'] <= mismatch: if sys.argv[4] == 'contigs': print(">" + str(rec.id)) print(str(rec.seq)) else: print(rec.id)
# Benchmark: times edlib.align against editdistance.eval and
# Levenshtein.distance on prefixes of two sequences read from the test data.

# Only the first line of each file is read.
with open('../../test_data/Enterobacteria_Phage_1/mutated_90_perc_oneline.fasta', 'r') as f:
    queryFull = f.readline()
print('Read query: ', len(queryFull) ,' characters.')
with open('../../test_data/Enterobacteria_Phage_1/Enterobacteria_phage_1_oneline.fa', 'r') as f:
    targetFull = f.readline()
print('Read target: ', len(targetFull) ,' characters.')
for seqLen in [30, 100, 1000, 10000, 50000]:
    # `query` and `target` must stay module-level names: the timeit statements
    # below are strings evaluated against globals().
    query = queryFull[:seqLen]
    target = targetFull[:seqLen]
    # Fewer repetitions for longer sequences (scales with 1 / seqLen**2),
    # but at least one run.
    numRuns = max(1000000000 // (seqLen**2), 1)
    print('Sequence length: ', seqLen)
    # Each reported time is the average per call.
    edlibTime = timeit.timeit(stmt="edlib.align(query, target)",
                              number=numRuns, globals=globals()) / numRuns
    print('Edlib: ', edlibTime)
    print(edlib.align(query, target))
    editdistanceTime = timeit.timeit(stmt="editdistance.eval(query, target)",
                                     number=numRuns, globals=globals()) / numRuns
    print('editdistance: ', editdistanceTime)
    levenshteinTime = timeit.timeit(stmt="Levenshtein.distance(query, target)",
                                    number=numRuns, globals=globals()) / numRuns
    print('levenshtein: ', levenshteinTime)
    print('edlib is %f times faster than editdistance.' % (editdistanceTime / edlibTime))
    print('edlib is %f times faster than Levenshtein.' % (levenshteinTime / edlibTime))
# Smoke test for the edlib Python binding: runs three alignments with known
# edit distances and exits with a non-zero status if any of them is wrong.
import sys
import edlib

testFailed = False

# str input, default arguments
result = edlib.align("telephone", "elephant")
if not (result and result["editDistance"] == 3):
    testFailed = True

# bytes input
result = edlib.align(b"telephone", b"elephant")
if not (result and result["editDistance"] == 3):
    testFailed = True

# infix mode with additional equalities: 'R' also matches 'A' and 'G'
result = edlib.align("ACTG", "CACTRT", mode="HW", task="path", additionalEqualities=[("R", "A"), ("R", "G")])
if not (result and result["editDistance"] == 0):
    testFailed = True

if testFailed:
    print("Some of the tests failed!")
else:
    print("All tests passed!")

# The flag is used directly as the exit status (False -> 0, True -> 1).
sys.exit(testFailed)