def _get_sentences_and_offsets(txt_handle, ss_handle):
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))
        t_char = None
        started_at = txt_handle.tell()
        started_at_read = txt_handle_reads
        while True:
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                assert False, ('could not align all sentences for: '
                               '"{}" and "{}" stopped at the sentence: "{}" '
                               'aligner in state: {}').format(
                                   txt_handle.name, ss_handle.name, s_text,
                                   aligner.__repr__())
            try:
                if aligner.align(t_char):
                    source_text = _str(aligner)
                    # We are aligned!
                    s_starts_and_sentences.append((
                        #txt_handle.tell() - len(source_text),
                        #started_at,
                        started_at_read,
                        Sentence(source_text, [])))
                    #last_end += aligner.char_cnt
                    break
            except MisalignedError:
                started_at = txt_handle.tell()
                started_at_read = txt_handle_reads

    #s_starts_and_sentences.sort()
    return s_starts_and_sentences
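# Hedged usage sketch (not from the original module): the function expects two
# open file handles, the raw text and its sentence-split counterpart with one
# sentence per line. The file names below are purely illustrative.
if __name__ == '__main__':
    with open('document.txt') as txt, open('document.ss') as ss:
        for start_read, sentence in _get_sentences_and_offsets(txt, ss):
            print start_read, sentence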
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing
    variants. Implements core functionality for assessing variants, either
    coding or noncoding. Subclasses must evaluate the variant DNA sequences
    that are being counted.
    """
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()
        except KeyError as key:
            raise EnrichError("Missing required config value "
                              "'{key}'".format(key=key), self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(config['wild type']
                                            ['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None

    def is_coding(self):
        return self.wt_protein is not None

    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if
        *coding* is ``True``. The *sequence* may only contain ``ACGT``, but
        may contain whitespace (which will be removed). If *coding*,
        *sequence* must be in-frame.
        """
        sequence = "".join(sequence.split())  # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError("WT DNA sequence contains unexpected "
                              "characters", self.name)
        if len(sequence) % 3 != 0 and coding:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = ""
            for i in xrange(0, len(self.wt_dna), 3):
                self.wt_protein += codon_table[self.wt_dna[i:i + 3]]
        else:
            self.wt_protein = None

    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align
        the *variant_dna* to the wild type sequence. Returns a list of HGVS
        variant strings.

        Aligned variants are stored in a local dictionary to avoid
        recomputing alignments. This dictionary should be cleared after all
        variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner` \
        dramatically increases runtime.
        """
        if variant_dna in self.aligner_cache:
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x],
                                            post=variant_dna[y])
            elif cat == "insertion":
                if y > length:
                    dup = variant_dna[y:y + length]
                    if dup == variant_dna[y - length:y]:
                        mut = "dup{seq}".format(seq=dup)
                    else:
                        mut = "_{pos}ins{seq}".format(pos=x + 2, seq=dup)
                else:
                    mut = "_{pos}ins{seq}".format(
                        pos=x + 2, seq=variant_dna[y:y + length])
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations

    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if
        there are an excess of mismatches (indicating a possible indel),
        local alignment is performed using :py:meth:`align_variant` if this
        option has been selected in the configuration.

        Each variant is stored as a comma-delimited string of mutations in
        HGVS format. Returns a list of HGVS variant strings. Returns an
        empty list if the variant is wild type. Returns None if the variant
        was discarded due to excess mismatches.
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError("Variant DNA sequence contains unexpected "
                              "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append((i, "{pre}>{post}".format(
                        pre=self.wt_dna[i], post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError:  # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) / 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos)
                elif variant_protein[pos / 3] == self.wt_protein[pos / 3]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[pos / 3]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if len(mutation_strings) > 0:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string

    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels*
        is ``False``, mutations in a variant that contains an
        insertion/deletion/duplication are not counted.

        For coding sequences, amino acid substitutions are counted
        independently of the corresponding nucleotide change.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        # create new dictionaries
        self.df_dict['mutations_nt'] = dict()
        if self.is_coding():
            self.df_dict['mutations_aa'] = dict()

        if not include_indels:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask
        else:
            variant_data = self.df_dict['variants']

        if self.is_coding():
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                # get just the nucleotide changes
                for m in mutations:
                    m = m.split(" (")[0]
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count
                # get the amino acid changes
                aa_changes = re.findall(
                    "p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]", variant)
                for a in aa_changes:
                    try:
                        self.df_dict['mutations_aa'][a] += count
                    except KeyError:
                        self.df_dict['mutations_aa'][a] = count
        else:
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                for m in mutations:
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count

        self.df_dict['mutations_nt'] = \
            pd.DataFrame.from_dict(self.df_dict['mutations_nt'],
                                   orient="index", dtype="int32")
        if self.is_coding():
            self.df_dict['mutations_aa'] = \
                pd.DataFrame.from_dict(self.df_dict['mutations_aa'],
                                       orient="index", dtype="int32")
# -*- coding: utf-8 -*-
from text import *
from lexicon import *
from aligner import Aligner

text = Text()
lexicon = Lexicon()
text.parse("../data/77b.txt")
lexicon.parse("../data/arapaho_lexicon.json")

aligner = Aligner(text, lexicon)
aligner.align("../data/new_test_text_file.txt", "../data/test_log_file.txt")
class Inflection:
    _PARAMS = {
        'C': 1.0,
        'window': 3,
        'cross_features': 2,
        'classifier': 'mono',
        'C_replace': 0.0,
        'C_insert': 0.0
    }

    def __init__(self, feature_type, **params):
        self.feature_type = feature_type
        self.a = Aligner(method='lcs')
        if feature_type == "sparse":
            self.get_features = self.get_sparse_features
        elif feature_type == "sparse2":
            self.get_features = self.get_sparse2_features
        elif feature_type == "onehot":
            self.get_features = self.get_positional_features
        self.old_window = None
        self.sample_weight = None
        self._reset()
        self.set_params(**params)

    def _reset(self):
        for k, v in self._PARAMS.items():
            setattr(self, k, v)

    def set_params(self, **params):
        for k, v in params.items():
            if k in self._PARAMS:
                setattr(self, k, v)

    def vectorize(self, lem, tag, wf=None):
        if self.old_window != self.window:
            self.old_window = self.window
        self.features = []
        self.labels = []
        for i, (l, t, w) in enumerate(zip(lem, tag, wf)):
            alignments = self.a.align(l, w)
            alignments = [[('<', '<')] + x + [('>', '>')] for x in alignments]
            for j, a in enumerate(alignments):
                if j > 0:
                    # in case there are multiple alignments take only the first
                    break
                li, wi = 0, 0
                for k, (lc, wc) in enumerate(a):
                    self.features.append(
                        self.get_features('<' + l + '>', '<' + w[:wi], t, li,
                                          window=(self.window, self.window)))
                    if lc == '':
                        action = 'insert:' + wc
                        wi += 1
                    elif lc == wc:
                        action = 'copy:'
                        li += 1
                        wi += 1
                    elif wc == '':
                        action = 'delete:'
                        li += 1
                    else:
                        action = 'replace:' + wc
                        li += 1
                        wi += 1
                    self.labels.append(action)
        if self.feature_type.startswith('sparse'):
            self.vec = TfidfVectorizer(sublinear_tf=True,
                                       analyzer=lambda x: x)
            self.x = self.vec.fit_transform(self.features)
        else:
            self.x = np.array(self.features)

    def fit(self, wf, lem, tag):
        print("vectorize....", file=sys.stderr)
        self.vectorize(lem, tag, wf)
        print(self.x.shape, file=sys.stderr)
        print("fit....", file=sys.stderr)
        if self.classifier == 'twostep':
            action = [s.split(':')[0] for s in self.labels]
            self.clf = LinearSVC(C=self.C, class_weight='balanced',
                                 max_iter=1000)
            self.clf.fit(self.x, action, sample_weight=self.sample_weight)
            replace_i = [i for i in range(len(self.labels))
                         if self.labels[i].startswith('replace')]
            sw = None
            if len(replace_i):
                x = self.x[replace_i, :]
                y = np.array(self.labels)[replace_i]
                if self.C_replace == 0.0:
                    self.C_replace = self.C
                if len(set(y)) == 1:
                    self.clf_replace = DummyClassifier()
                else:
                    self.clf_replace = LinearSVC(C=self.C_replace,
                                                 class_weight='balanced',
                                                 max_iter=50000)
                self.clf_replace.fit(x, y)
            else:
                self.clf_replace = DummyClassifier()
            insert_i = [i for i in range(len(self.labels))
                        if self.labels[i].startswith('insert')]
            if len(insert_i):
                x = self.x[insert_i, :]
                y = np.array(self.labels)[insert_i]
                if self.C_insert == 0.0:
                    self.C_insert = self.C
                if len(set(y)) == 1:
                    self.clf_insert = DummyClassifier()
                else:
                    self.clf_insert = LinearSVC(C=self.C_insert,
                                                class_weight='balanced',
                                                max_iter=50000)
                self.clf_insert.fit(x, y)
            else:
                self.clf_insert = DummyClassifier()
        else:
            self.clf = LinearSVC(C=self.C, class_weight='balanced',
                                 max_iter=50000)
            self.clf.fit(self.x, self.labels,
                         sample_weight=self.sample_weight)

    def predict(self, x):
        if self.classifier == 'twostep':
            action = str(self.clf.predict(x)[0])
            ch = ''
            if action == 'insert':
                if self.clf_insert is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_insert.predict(x)[0]).split(':', 1)[1]
            elif action == 'replace':
                if self.clf_replace is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_replace.predict(x)[0]).split(':', 1)[1]
            return action, ch
        else:
            return str(self.clf.predict(x)[0]).split(':', 1)
    def decode(self, lemma, tags, max_len=30):
        w_prefix = ''
        li = 0
        while li < len(lemma):
            feat = self.get_features(lemma, w_prefix, tags, li)
            if self.feature_type.startswith('sparse'):
                x = self.vec.transform([feat])
            else:
                x = np.array([feat])
            act, arg = self.predict(x)
            if act == 'copy':
                w_prefix += lemma[li]
                li += 1
            elif act == 'replace':
                w_prefix += arg
                li += 1
            elif act == 'insert':
                w_prefix += arg
            elif act == 'delete':
                li += 1
            if len(w_prefix) > max_len or w_prefix and w_prefix[-1] == '>':
                break
        return w_prefix

    def get_sparse_features(self, lemma, word_prefix, tags, idx,
                            window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = ["tag:{}".format(t) for t in tags]
        if cross >= 2:
            tag_feat += ["tag2:{}-{}".format(*t)
                         for t in itertools.product(tags, tags)]
        ch_feat = ["ch:{}".format(lemma[idx])]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append('lprefix:{}'.format(lemma[idx - i:idx]))
            if i <= len(word_prefix):
                wpfx_feat.append('wprefix:{}'.format(word_prefix[-i:]))
        for i in range(idx + 1, idx + window[1]):
            if i <= len(lemma):
                sfx_feat.append('lsuffix:{}'.format(lemma[idx:i]))
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        if cross > 3:
            cross = ["&".join((x, y))
                     for x, y in itertools.product(pfx_feat, sfx_feat)]
            cross = ["&".join((x, y))
                     for x, y in itertools.product(wpfx_feat, cross)]
            cross = ["&".join((x, y))
                     for x, y in itertools.product(ch_feat, cross)]
            str_feat += cross
        else:
            cross = ["&".join((x, y))
                     for x, y in itertools.product(ch_feat, str_feat)]
            str_feat += cross
        return str_feat + tag_feat + [
            "&".join((x, y))
            for x, y in itertools.product(tag_feat, str_feat)
        ]

    def get_sparse2_features(self, lemma, word_prefix, tags, idx,
                             window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = [{"t:{}".format(t)} for t in tags]
        ch_feat = [{"l0:{}".format(lemma[idx])}]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append({'l-{}:{}'.format(i, lemma[idx - i])})
            if i <= len(word_prefix):
                wpfx_feat.append({'w-{}:{}'.format(i, word_prefix[-i])})
        for i in range(1, window[1] + 1):
            if (idx + i) < len(lemma):
                sfx_feat.append({'l+{}:{}'.format(i, lemma[idx + i])})
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        feat = str_feat + tag_feat
        feat_cross = feat
        for i in range(cross):
            feat_cross = [x | y
                          for x, y in itertools.product(feat, feat_cross)]
        return ['&'.join(sorted(f)) for f in feat_cross]

    def get_positional_features(self, lemma, word_prefix, tags, idx,
                                window=(3, 3)):
        chars = [lemma[idx]]
        tag_enc = self.data.te
        ch_enc = self.data.ce
        for i in range(idx - (window[0] + 1), idx - 1):
            if i >= 0:
                chars.append(lemma[i])
                chars.append(word_prefix[i])
            else:
                chars.append(ch_enc.pad)
        for i in range(idx + 1, idx + window[1] + 1):
            if i < len(lemma):
                chars.append(lemma[i])
            else:
                chars.append(ch_enc.pad)
        feat = np.array(ch_enc.encode(chars, onehot=True)).flatten()
        feat = np.concatenate((feat, tag_enc.transform([tags])[0]))
        return feat

    def evaluate(self, wf, lemmas, tags):
        acc = 0
        med = 0
        for i, word in enumerate(wf):
            tag = tags[i]
            lem = lemmas[i]
            pred = self.decode(lem, tag)
            # print(word, pred, file=sys.stderr)
            acc += int(pred == word)
            med += editdistance.eval(pred, word)
        med = med / len(wf)
        acc = acc / len(wf)
        print(acc, med, file=sys.stderr)
        return (acc, med)
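# Hedged illustration (not part of the class above): how the action labels
# that vectorize() produces ("copy:", "replace:x", "insert:x", "delete:")
# rebuild a word form from a lemma, mirroring the control flow of decode().
# The function and example below are editorial additions.
def apply_actions(lemma, actions):
    out, li = '', 0
    for act in actions:
        name, _, arg = act.partition(':')
        if name == 'copy':
            out += lemma[li]
            li += 1
        elif name == 'replace':
            out += arg
            li += 1
        elif name == 'insert':
            out += arg
        elif name == 'delete':
            li += 1
    return out

# e.g. apply_actions("walk", ["copy:", "copy:", "copy:", "copy:",
#                             "insert:e", "insert:d"]) -> "walked"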
from __future__ import print_function
from aligner import Aligner

a = Aligner()

x = "TCGAACTGAAAA"
y = "AACTGA"

trace = a.align(x, y)
print(x)
print(y)
for t in trace:
    print(t)
in alignment time as the length of reads, length of genome, and number of
edits increases.
"""
import sys
from aligner import Aligner
import random
import time

string = "ACTCTGCTTTAG"
a = Aligner(string)

# test exact match
pos, edits = a.align("TCTGC")
assert pos == 2
#print pos, edits

# test insertion
pos, edits = a.align("ACTTGC")
#print pos, edits
assert pos == 0

# test replacement
pos, edits = a.align("TACTT")
#print pos, edits
assert pos == 4

# test deletion
pos, edits = a.align("TCTTT")
def combine_records(forward_record, reverse_record, reference_sequences,
                    min_overlap=-1, max_overlap=-1, max_length_delta=1e30,
                    reference_scoring_ranges=None):
    '''
    Computes the alignments of both forward and reverse reads to the
    reference sequences. Synthesizes those alignments, using the
    better-quality read in the case of a conflict.

    Returns (index, sequence, quality) where `index` is the index of the
    reference sequence used, `sequence` is the combined DNA sequence, and
    `quality` is the quality of each base in the combined sequence.

    The optional parameters min_overlap and max_overlap correspond to the
    overlap constraints on the alignment between the forward and reverse
    reads.
    '''
    aligner = Aligner()
    forward_str = str(forward_record.seq)
    reverse_str = str(reverse_record.seq.reverse_complement())

    # Align forward to references
    reference_index, forward_offset, forward_score = aligner.best_alignment(
        forward_str, reference_sequences, unidirectional=True,
        min_overlap=len(forward_str),
        candidate_scoring_ranges=reference_scoring_ranges)

    # Align forward to reverse
    reverse_offset, _ = aligner.align(forward_str, reverse_str,
                                      unidirectional=True, reverse=True,
                                      min_overlap=min_overlap,
                                      max_overlap=max_overlap)

    reference = reference_sequences[reference_index]
    reference_scoring_range = (reference_scoring_ranges[reference_index]
                               if reference_scoring_ranges is not None
                               else None)

    # Align reverse to reference
    reverse_offset_to_ref, reverse_score = aligner.align(
        reference, reverse_str, unidirectional=True, reverse=True,
        min_overlap=15, scoring_ranges=(reference_scoring_range, None))

    # Compare the pairwise scores of obeying the forward and obeying the
    # reverse alignments to reference, and adjust the alignment offsets
    # accordingly.
    if reverse_score > forward_score:
        forward_offset = reverse_offset_to_ref - reverse_offset
        reverse_offset = reverse_offset_to_ref
    else:
        reverse_offset += forward_offset

    combined_sequence = ""
    combined_quality = []
    alignment_set = [(reference, 0), (forward_str, forward_offset),
                     (reverse_str, reverse_offset)]
    # Uncomment to print the resulting alignments
    # print('\n'.join(aligner.format_multiple(*alignment_set)))

    # Discard the read if total length is too different from reference length
    if max_length_delta <= len(reference):
        if math.fabs(aligner.length(*alignment_set) -
                     len(reference)) > max_length_delta:
            sc.counter(1, STAT_DELETIONS_KEY, STAT_EXCESS_LENGTH_KEY)
            return -1, None, None

    # Combine the reads to produce the overall sequence.
    # The aligner will enumerate the aligned characters or elements of each
    # iterable we give it. Zipping generators for both the sequence and the
    # quality allows us to enumerate them together.
    sequence_generator = aligner.enumerate_multiple(*alignment_set)
    quality_generator = aligner.enumerate_multiple(
        ([None for i in xrange(len(reference))], 0),
        (forward_record.letter_annotations[SEQUENCE_QUALITY_KEY],
         forward_offset),
        (reverse_record.letter_annotations[SEQUENCE_QUALITY_KEY],
         reverse_offset))

    for bases, qualities in izip(sequence_generator, quality_generator):
        _, forward_base, reverse_base = bases
        _, forward_quality, reverse_quality = qualities
        if forward_base is None and reverse_base is None:
            combined_sequence += UNSPECIFIED_BASE
            combined_quality.append(0)
        elif forward_base is None:
            combined_sequence += reverse_base
            combined_quality.append(reverse_quality)
        elif reverse_base is None:
            combined_sequence += forward_base
            combined_quality.append(forward_quality)
        else:
            base, quality = max([(forward_base, forward_quality),
                                 (reverse_base, reverse_quality)],
                                key=lambda x: x[1])
            combined_sequence += base
            combined_quality.append(quality)

    return reference_index, combined_sequence, combined_quality
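# Hedged sketch (not the library API): the per-position merge rule used above,
# shown on plain lists of (base, quality) pairs. Where both reads cover a
# position, the base with the higher quality wins; where only one covers it,
# that base is kept. The function and the example call are editorial.
def merge_bases(forward, reverse):
    merged_seq, merged_qual = "", []
    for f, r in zip(forward, reverse):
        if f is None and r is None:
            merged_seq += 'N'
            merged_qual.append(0)
        elif r is None or (f is not None and f[1] >= r[1]):
            merged_seq += f[0]
            merged_qual.append(f[1])
        else:
            merged_seq += r[0]
            merged_qual.append(r[1])
    return merged_seq, merged_qual

# merge_bases([('A', 30), ('C', 12), None],
#             [('A', 20), ('G', 35), ('T', 18)]) -> ('AGT', [30, 35, 18])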
def main():
    # Get command line arguments
    if len(sys.argv) < 3:
        printInfo()
        exit(1)

    ref = ""
    refLen = 0
    refFile = sys.argv[1]
    READFILE = sys.argv[2].strip().split(',')
    with open(refFile) as fi:
        ref = fi.read().strip()
        refLen = len(ref)

    INTERVAL = [refLen]
    MINREADS = [10]
    TRIGGERPOINT = [10]
    CONFIDENCE = [50]

    # Convert args into a list of args
    if len(sys.argv) > 3:
        INTERVAL = [int(i) for i in sys.argv[3].strip().split(',')]
    if len(sys.argv) > 4:
        MINREADS = [int(i) for i in sys.argv[4].strip().split(',')]
    if len(sys.argv) > 5:
        TRIGGERPOINT = [int(i) for i in sys.argv[5].strip().split(',')]
    if len(sys.argv) > 6:
        CONFIDENCE = [int(i) for i in sys.argv[6].strip().split(',')]

    print("time \t Read File \t num changes \t interval \t minReads \t "
          "triggerPoint \t confidence")

    # Iterate through all combinations of parameters
    for readFile in READFILE:
        for interval in INTERVAL:
            for minReads in MINREADS:
                for triggerPoint in TRIGGERPOINT:
                    for confidence in CONFIDENCE:
                        # Times the block with alignment
                        start_time = timeit.default_timer()

                        # Init the alignment tracker and aligner
                        a = Aligner(ref)
                        rt = hashRangeTracker()
                        rt.setRefLen(refLen)
                        # Split genome into this many blocks
                        rt.setInterval(interval)
                        # Minimum times a read should overlap a position
                        rt.setMinReads(minReads)
                        # How many times to hit a region before reporting
                        rt.setTrigger(triggerPoint)
                        rt.setConfidence(confidence)

                        allChanges = []
                        with open(readFile) as readsFi:
                            for read in readsFi:
                                read = read.strip()
                                if read[0] == '#':  # Comment line
                                    continue
                                elapsed_time = timeit.default_timer() - start_time
                                aligned = a.align(read)
                                start_time = timeit.default_timer()
                                changes = rt.addAlignment(read, aligned[1],
                                                          aligned[0])
                                if len(changes) > 0:
                                    # We have some changes to make
                                    allChanges += changes
                                    for c in changes:
                                        makeChange(a, c)
                                        # ref = ref[:c[0]] + c[1] + ref[c[0] + 1:]  # to update ref w/out the index
                                        # a = Aligner(ref)

                        # Get the remaining updates
                        changes = rt.flush()
                        if len(changes) > 0:
                            allChanges += changes
                            for c in changes:
                                makeChange(a, c)
                                # ref = ref[:c[0]] + c[1] + ref[c[0] + 1:]  # to update ref w/out the index

                        elapsed_time += timeit.default_timer() - start_time
                        ref = a.getRef()
                        print(str(elapsed_time) + " \t " + readFile + " \t " +
                              str(len(allChanges)) + " \t " + str(interval) +
                              " \t " + str(minReads) + " \t " +
                              str(triggerPoint) + " \t " + str(confidence))

    print(ref)
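# Hedged sketch (editorial, not from this script): the plain-string reference
# update that the commented-out line above performs, applied to a list of
# (position, new_base) changes. makeChange() presumably does the same while
# also keeping the aligner's index in sync.
def apply_changes(ref, changes):
    for pos, base in changes:
        ref = ref[:pos] + base + ref[pos + 1:]
    return ref

# apply_changes("ACTCTG", [(1, 'G'), (4, 'A')]) -> "AGTCAG"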