示例#1
0
def _get_sentences_and_offsets(txt_handle, ss_handle):
    """
    Align every line of *ss_handle* (one sentence per line) against the text
    read from *txt_handle* and return a list of ``(offset, Sentence)`` pairs.

    The offset is the number of ``txt_handle.read(1)`` calls that had been
    made when the alignment of that sentence (re)started; whenever the
    aligner raises ``MisalignedError`` the start is moved to just after the
    offending character.

    Raises ``AssertionError`` if *txt_handle* is exhausted before a sentence
    could be aligned.
    """
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))

        started_at_read = txt_handle_reads
        while True:
            # NOTE(review): read(1) on a byte stream yields a single byte, and
            # decoding one byte of a multi-byte UTF-8 character fails --
            # confirm the inputs are ASCII or that txt_handle already decodes.
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                # Raised explicitly: an `assert` statement is stripped under
                # `python -O`, which would leave this loop spinning at EOF.
                raise AssertionError(
                    ('could not align all sentences for: '
                     '"{}" and "{}" stopped at the sentence: "{}" '
                     'aligner in state: {}').format(
                         txt_handle.name, ss_handle.name, s_text,
                         repr(aligner)))
            try:
                aligned = aligner.align(t_char)
            except MisalignedError:
                # Restart the sentence right after the character that broke
                # the alignment.
                started_at_read = txt_handle_reads
                continue

            if aligned:
                # We are aligned!
                s_starts_and_sentences.append(
                    (started_at_read, Sentence(_str(aligner), [])))
                break

    return s_starts_and_sentences
示例#2
0
def _get_sentences_and_offsets(txt_handle, ss_handle):
    """
    Align every line of *ss_handle* (one sentence per line) against the text
    read from *txt_handle* and return a list of ``(offset, Sentence)`` pairs.

    The offset is the number of ``txt_handle.read(1)`` calls that had been
    made when the alignment of that sentence (re)started; whenever the
    aligner raises ``MisalignedError`` the start is moved to just after the
    offending character.

    Raises ``AssertionError`` if *txt_handle* is exhausted before a sentence
    could be aligned.
    """
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))

        started_at_read = txt_handle_reads
        while True:
            # NOTE(review): read(1) on a byte stream yields a single byte, and
            # decoding one byte of a multi-byte UTF-8 character fails --
            # confirm the inputs are ASCII or that txt_handle already decodes.
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                # Raised explicitly: an `assert` statement is stripped under
                # `python -O`, which would leave this loop spinning at EOF.
                raise AssertionError(
                    ('could not align all sentences for: '
                     '"{}" and "{}" stopped at the sentence: "{}" '
                     'aligner in state: {}').format(
                         txt_handle.name, ss_handle.name, s_text,
                         repr(aligner)))
            try:
                aligned = aligner.align(t_char)
            except MisalignedError:
                # Restart the sentence right after the character that broke
                # the alignment.
                started_at_read = txt_handle_reads
                continue

            if aligned:
                # We are aligned!
                s_starts_and_sentences.append(
                    (started_at_read, Sentence(_str(aligner), [])))
                break

    return s_starts_and_sentences
示例#3
0
文件: variant.py 项目: msr2009/Enrich
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing
    variants. Implements core functionality for assessing variants, either
    coding or noncoding. Subclasses must evaluate the variant DNA sequences
    that are being counted.
    """
    def __init__(self, config, parent=True):
        """
        Set up the wild type sequence, the optional aligner and the reference
        offset from the *config* dictionary.

        *config* must contain a ``'wild type'`` entry holding ``'sequence'``
        and ``'coding'``; ``'reference offset'`` (inside ``'wild type'``) and
        ``'align variants'`` are optional. If *parent* is ``True``,
        :py:class:`SeqLib` is initialized first.

        Raises :py:class:`EnrichError` if a required value is missing or the
        reference offset cannot be converted to an integer.
        """
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            # The field is named, so the value must be passed by keyword.
            # args[0] is the bare key; str(KeyError) would add its own quotes.
            raise EnrichError(
                "Missing required config value '{key}'".format(
                    key=key.args[0]),
                self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None

    def is_coding(self):
        """
        Return ``True`` if a wild type protein sequence has been set, i.e.
        the wild type was configured as coding.
        """
        return self.wt_protein is not None

    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if
        *coding* is ``True``. The *sequence* may only contain ``ACGT``, but
        may contain whitespace (which will be removed). If *coding*,
        *sequence* must be in-frame.

        Raises :py:class:`EnrichError` on unexpected characters or, for
        coding sequences, a length that is not a multiple of three.
        """
        sequence = "".join(sequence.split())  # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError("WT DNA sequence contains unexpected "
                              "characters", self.name)
        if coding and len(sequence) % 3 != 0:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = "".join(
                codon_table[self.wt_dna[i:i + 3]]
                for i in xrange(0, len(self.wt_dna), 3))
        else:
            self.wt_protein = None

    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align
        the *variant_dna* to the wild type sequence. Returns a list of
        ``(position, change)`` tuples, where *position* is the wild type
        index reported by the aligner and *change* is the mutation text
        without its HGVS prefix.

        Aligned variants are stored in a local dictionary to avoid
        recomputing alignments. This dictionary should be cleared after all
        variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner`
            dramatically increases runtime.
        """
        # Membership test on the dict itself: `.keys()` builds a list on
        # Python 2, turning every lookup into a linear scan.
        if variant_dna in self.aligner_cache:
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        # NOTE(review): a category other than the four handled below would
        # reuse the previous `mut` (or fail if none is bound yet); presumably
        # the aligner never emits one -- confirm against Aligner.align.
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x],
                                            post=variant_dna[y])
            elif cat == "insertion":
                inserted = variant_dna[y:y + length]
                # An insertion repeating the bases just before it is reported
                # as a duplication.
                if y > length and inserted == variant_dna[y - length:y]:
                    mut = "dup{seq}".format(seq=inserted)
                else:
                    mut = "_{pos}ins{seq}".format(pos=x + 2, seq=inserted)
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations

    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if
        there are an excess of mismatches (indicating a possible indel),
        local alignment is performed using :py:meth:`align_variant` if this
        option has been selected in the configuration.

        Each variant is stored as a comma-separated string of mutations in
        HGVS format, and *copies* is added to its count. Returns that variant
        string (``WILD_TYPE_VARIANT`` if no mutations were found). Returns
        ``None`` if the variant was discarded: different length without an
        aligner, or too many mutations (after alignment, if enabled).

        *include_indels* is accepted but not used by this method.

        Raises :py:class:`EnrichError` if *variant_dna* contains characters
        other than ``ACGTNX`` (case-insensitive).
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError("Variant DNA sequence contains unexpected "
                              "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append(
                        (i, "{pre}>{post}".format(pre=self.wt_dna[i],
                                                  post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError:  # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                # `//` keeps the codon index an integer regardless of the
                # division mode in effect.
                codon = pos // 3
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) // 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[codon]],
                        pos=ref_pro_pos)
                elif variant_protein[codon] == self.wt_protein[codon]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[codon]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[codon]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if mutation_strings:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string

    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels*
        is ``False``, all mutations in a variant that contains an
        insertion/deletion/duplication will not be counted. For coding
        sequences, amino acid substitutions are counted independently of the
        corresponding nucleotide change.

        The results are stored as data frames in
        ``self.df_dict['mutations_nt']`` and, for coding sequences,
        ``self.df_dict['mutations_aa']``.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        coding = self.is_coding()
        mutations_nt = dict()
        mutations_aa = dict()

        if include_indels:
            variant_data = self.df_dict['variants']
        else:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask

        for variant, row in variant_data.iterrows():
            count = row['count']  # get the element from the Series
            for m in variant.split(", "):
                if coding:
                    # keep just the nucleotide change, dropping " (p....)"
                    m = m.split(" (")[0]
                mutations_nt[m] = mutations_nt.get(m, 0) + count
            if coding:
                # amino acid substitutions only (three-letter codes)
                aa_changes = re.findall(
                    r"p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]", variant)
                for a in aa_changes:
                    mutations_aa[a] = mutations_aa.get(a, 0) + count

        self.df_dict['mutations_nt'] = pd.DataFrame.from_dict(
            mutations_nt, orient="index", dtype="int32")
        if coding:
            self.df_dict['mutations_aa'] = pd.DataFrame.from_dict(
                mutations_aa, orient="index", dtype="int32")
示例#4
0
文件: variant.py 项目: msr2009/Enrich
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing
    variants. Implements core functionality for assessing variants, either
    coding or noncoding. Subclasses must evaluate the variant DNA sequences
    that are being counted.
    """
    def __init__(self, config, parent=True):
        """
        Set up the wild type sequence, the optional aligner and the reference
        offset from the *config* dictionary.

        *config* must contain a ``'wild type'`` entry holding ``'sequence'``
        and ``'coding'``; ``'reference offset'`` (inside ``'wild type'``) and
        ``'align variants'`` are optional. If *parent* is ``True``,
        :py:class:`SeqLib` is initialized first.

        Raises :py:class:`EnrichError` if a required value is missing or the
        reference offset cannot be converted to an integer.
        """
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            # The field is named, so the value must be passed by keyword.
            # args[0] is the bare key; str(KeyError) would add its own quotes.
            raise EnrichError(
                "Missing required config value '{key}'".format(
                    key=key.args[0]),
                self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None

    def is_coding(self):
        """
        Return ``True`` if a wild type protein sequence has been set, i.e.
        the wild type was configured as coding.
        """
        return self.wt_protein is not None

    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if
        *coding* is ``True``. The *sequence* may only contain ``ACGT``, but
        may contain whitespace (which will be removed). If *coding*,
        *sequence* must be in-frame.

        Raises :py:class:`EnrichError` on unexpected characters or, for
        coding sequences, a length that is not a multiple of three.
        """
        sequence = "".join(sequence.split())  # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError("WT DNA sequence contains unexpected "
                              "characters", self.name)
        if coding and len(sequence) % 3 != 0:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = "".join(
                codon_table[self.wt_dna[i:i + 3]]
                for i in xrange(0, len(self.wt_dna), 3))
        else:
            self.wt_protein = None

    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align
        the *variant_dna* to the wild type sequence. Returns a list of
        ``(position, change)`` tuples, where *position* is the wild type
        index reported by the aligner and *change* is the mutation text
        without its HGVS prefix.

        Aligned variants are stored in a local dictionary to avoid
        recomputing alignments. This dictionary should be cleared after all
        variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner`
            dramatically increases runtime.
        """
        # Membership test on the dict itself: `.keys()` builds a list on
        # Python 2, turning every lookup into a linear scan.
        if variant_dna in self.aligner_cache:
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        # NOTE(review): a category other than the four handled below would
        # reuse the previous `mut` (or fail if none is bound yet); presumably
        # the aligner never emits one -- confirm against Aligner.align.
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x],
                                            post=variant_dna[y])
            elif cat == "insertion":
                inserted = variant_dna[y:y + length]
                # An insertion repeating the bases just before it is reported
                # as a duplication.
                if y > length and inserted == variant_dna[y - length:y]:
                    mut = "dup{seq}".format(seq=inserted)
                else:
                    mut = "_{pos}ins{seq}".format(pos=x + 2, seq=inserted)
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations

    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if
        there are an excess of mismatches (indicating a possible indel),
        local alignment is performed using :py:meth:`align_variant` if this
        option has been selected in the configuration.

        Each variant is stored as a comma-separated string of mutations in
        HGVS format, and *copies* is added to its count. Returns that variant
        string (``WILD_TYPE_VARIANT`` if no mutations were found). Returns
        ``None`` if the variant was discarded: different length without an
        aligner, or too many mutations (after alignment, if enabled).

        *include_indels* is accepted but not used by this method.

        Raises :py:class:`EnrichError` if *variant_dna* contains characters
        other than ``ACGTNX`` (case-insensitive).
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError("Variant DNA sequence contains unexpected "
                              "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append(
                        (i, "{pre}>{post}".format(pre=self.wt_dna[i],
                                                  post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError:  # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                # `//` keeps the codon index an integer regardless of the
                # division mode in effect.
                codon = pos // 3
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) // 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[codon]],
                        pos=ref_pro_pos)
                elif variant_protein[codon] == self.wt_protein[codon]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[codon]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[codon]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if mutation_strings:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string

    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels*
        is ``False``, all mutations in a variant that contains an
        insertion/deletion/duplication will not be counted. For coding
        sequences, amino acid substitutions are counted independently of the
        corresponding nucleotide change.

        The results are stored as data frames in
        ``self.df_dict['mutations_nt']`` and, for coding sequences,
        ``self.df_dict['mutations_aa']``.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        coding = self.is_coding()
        mutations_nt = dict()
        mutations_aa = dict()

        if include_indels:
            variant_data = self.df_dict['variants']
        else:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask

        for variant, row in variant_data.iterrows():
            count = row['count']  # get the element from the Series
            for m in variant.split(", "):
                if coding:
                    # keep just the nucleotide change, dropping " (p....)"
                    m = m.split(" (")[0]
                mutations_nt[m] = mutations_nt.get(m, 0) + count
            if coding:
                # amino acid substitutions only (three-letter codes)
                aa_changes = re.findall(
                    r"p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]", variant)
                for a in aa_changes:
                    mutations_aa[a] = mutations_aa.get(a, 0) + count

        self.df_dict['mutations_nt'] = pd.DataFrame.from_dict(
            mutations_nt, orient="index", dtype="int32")
        if coding:
            self.df_dict['mutations_aa'] = pd.DataFrame.from_dict(
                mutations_aa, orient="index", dtype="int32")
示例#5
0
# -*- coding: utf-8 -*-

from text import *
from lexicon import *
from aligner import Aligner

# Locations of the data files used by this script, relative to the working
# directory it is launched from.
TEXT_PATH = "../data/77b.txt"
LEXICON_PATH = "../data/arapaho_lexicon.json"
# NOTE(review): presumably the files Aligner.align writes its result and log
# to -- confirm against Aligner.align.
TEST_TEXT_PATH = "../data/new_test_text_file.txt"
TEST_LOG_PATH = "../data/test_log_file.txt"

# Build the empty containers first, then load each one from disk.
text = Text()
lexicon = Lexicon()
text.parse(TEXT_PATH)
lexicon.parse(LEXICON_PATH)

# Align using the parsed text and lexicon.
aligner = Aligner(text, lexicon)
aligner.align(TEST_TEXT_PATH, TEST_LOG_PATH)
示例#6
0
class Inflection:
    """
    Character-level inflection model.

    Lemma/word-form pairs are aligned, every alignment step is turned into
    an edit action (``copy``, ``insert``, ``delete`` or ``replace``), and a
    classifier is trained to predict those actions from features of the
    lemma, the word prefix produced so far and the tags.
    """
    # Default hyper-parameters. `_reset` copies every entry onto the instance
    # as an attribute, and `set_params` only accepts these keys.
    _PARAMS = {
        'C': 1.0,  # C passed to the main LinearSVC in `fit`
        'window': 3,  # context window passed to `get_features` in `vectorize`
        'cross_features': 2,  # feature-crossing level read by get_sparse_features
        'classifier': 'mono',  # 'twostep' selects the two-stage model in fit/predict
        'C_replace': 0.0,  # 0.0 is replaced by `C` during `fit`
        'C_insert': 0.0  # 0.0 is replaced by `C` during `fit`
    }

    def __init__(self, feature_type, **params):
        """
        Create a model using the feature extractor named by *feature_type*
        (``"sparse"``, ``"sparse2"`` or ``"onehot"``).

        Any other *feature_type* leaves ``get_features`` unset. Keyword
        *params* override the defaults in ``_PARAMS``; unknown names are
        ignored.
        """
        self.feature_type = feature_type
        self.a = Aligner(method='lcs')

        # Feature type -> name of the method bound as `get_features`. Names
        # are resolved lazily so only the selected extractor is looked up.
        extractors = (
            ("sparse", "get_sparse_features"),
            ("sparse2", "get_sparse2_features"),
            ("onehot", "get_positional_features"),
        )
        for type_name, method_name in extractors:
            if feature_type == type_name:
                self.get_features = getattr(self, method_name)
                break

        self.old_window = None
        self.sample_weight = None
        self._reset()
        self.set_params(**params)

    def _reset(self):
        for k, v in self._PARAMS.items():
            setattr(self, k, v)

    def set_params(self, **params):
        for k, v in params.items():
            if k in self._PARAMS:
                setattr(self, k, v)

    def vectorize(self, lem, tag, wf=None):
        if self.old_window != self.window:
            self.old_window = self.window
            self.features = []
            self.labels = []
            for i, (l, t, w) in enumerate(zip(lem, tag, wf)):
                alignments = self.a.align(l, w)
                alignments = [[('<', '<')] + x + [('>', '>')]
                              for x in alignments]

                for j, a in enumerate(alignments):
                    if j > 0:
                        break  # in case there are multiple alignments take only the first
                    li, wi = 0, 0
                    for k, (lc, wc) in enumerate(a):
                        self.features.append(
                            self.get_features('<' + l + '>',
                                              '<' + w[:wi],
                                              t,
                                              li,
                                              window=(self.window,
                                                      self.window)))
                        if lc == '':
                            action = 'insert:' + wc
                            wi += 1
                        elif lc == wc:
                            action = 'copy:'
                            li += 1
                            wi += 1
                        elif wc == '':
                            action = 'delete:'
                            li += 1
                        else:
                            action = 'replace:' + wc
                            li += 1
                            wi += 1
                        self.labels.append(action)

            if self.feature_type.startswith('sparse'):
                self.vec = TfidfVectorizer(sublinear_tf=True,
                                           analyzer=lambda x: x)
                self.x = self.vec.fit_transform(self.features)
            else:
                self.x = np.array(self.features)

    def fit(self, wf, lem, tag):
        """
        Vectorize the training data and train the action classifier(s).

        *wf*, *lem* and *tag* are parallel sequences of word forms, lemmas
        and tags. With ``classifier == 'twostep'`` one classifier predicts
        the bare action and two further ones (``clf_replace``,
        ``clf_insert``) predict the full ``replace:<ch>`` / ``insert:<ch>``
        label; any other setting trains a single classifier on the full
        labels. Progress is printed to stderr.
        """
        print("vecorize....", file=sys.stderr)
        self.vectorize(lem, tag, wf)
        print(self.x.shape, file=sys.stderr)

        print("fit....", file=sys.stderr)
        if self.classifier != 'twostep':
            self.clf = LinearSVC(C=self.C,
                                 class_weight='balanced',
                                 max_iter=50000)
            self.clf.fit(self.x, self.labels, sample_weight=self.sample_weight)
            return

        # Stage one: predict only the action name.
        action = [s.split(':')[0] for s in self.labels]
        self.clf = LinearSVC(C=self.C,
                             class_weight='balanced',
                             max_iter=1000)
        self.clf.fit(self.x, action, sample_weight=self.sample_weight)

        # Stage two: one classifier per action that carries a character.
        # A C of 0.0 means "use the main C"; it is only resolved when there
        # is data to train on.
        replace_i = self._label_rows('replace')
        if replace_i and self.C_replace == 0.0:
            self.C_replace = self.C
        self.clf_replace = self._fit_argument_classifier(replace_i,
                                                         self.C_replace)

        insert_i = self._label_rows('insert')
        if insert_i and self.C_insert == 0.0:
            self.C_insert = self.C
        # The insert classifier gets its own attribute and its own C.
        self.clf_insert = self._fit_argument_classifier(insert_i,
                                                        self.C_insert)

    def _label_rows(self, action):
        """Return the indices of the labels that start with *action*."""
        return [i for i, label in enumerate(self.labels)
                if label.startswith(action)]

    def _fit_argument_classifier(self, rows, C):
        """
        Train and return a classifier for the labels at *rows*.

        With no rows an unfitted ``DummyClassifier`` is returned; with a
        single distinct label a fitted ``DummyClassifier``; otherwise a
        ``LinearSVC`` with regularization *C*.
        """
        if not rows:
            return DummyClassifier()
        y = np.array(self.labels)[rows]
        if len(set(y)) == 1:
            # LinearSVC cannot be trained on a single class.
            clf = DummyClassifier()
        else:
            clf = LinearSVC(C=C, class_weight='balanced', max_iter=50000)
        clf.fit(self.x[rows, :], y)
        return clf

    def predict(self, x):
        if self.classifier == 'twostep':
            action = str(self.clf.predict(x)[0])
            ch = ''
            if action == 'insert':
                if self.clf_insert is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_insert.predict(x)[0]).split(':', 1)[1]
            elif action == 'replace':
                if self.clf_replace is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_replace.predict(x)[0]).split(':', 1)[1]
            return action, ch
        else:
            return str(self.clf.predict(x)[0]).split(':', 1)

    def decode(self, lemma, tags, max_len=30):
        w_prefix = ''
        li = 0
        while li < len(lemma):
            feat = self.get_features(lemma, w_prefix, tags, li)
            if self.feature_type.startswith('sparse'):
                x = self.vec.transform([feat])
            else:
                x = np.array([feat])
            act, arg = self.predict(x)
            if act == 'copy':
                w_prefix += lemma[li]
                li += 1
            elif act == 'replace':
                w_prefix += arg
                li += 1
            elif act == 'insert':
                w_prefix += arg
            elif act == 'delete':
                li += 1
            if len(w_prefix) > max_len or w_prefix and w_prefix[-1] == '>':
                break
        return w_prefix

    def get_sparse_features(self,
                            lemma,
                            word_prefix,
                            tags,
                            idx,
                            window=(10, 10)):
        """Build the list of string features for position ``idx`` of ``lemma``.

        Base features:

        * ``tag:<t>`` for every tag (plus ``tag2:`` pair features when
          ``self.cross_features >= 2``)
        * ``ch:<c>`` for the character at ``idx``
        * ``lprefix:<s>`` for lemma substrings ending just before ``idx``,
          lengths 1..``window[0]``
        * ``wprefix:<s>`` for trailing substrings of ``word_prefix``,
          lengths 1..``window[0]``
        * ``lsuffix:<s>`` for lemma substrings starting at ``idx`` and ending
          before ``idx + window[1]``

        Conjunctions (joined with ``'&'``) are then added: for
        ``cross_features > 3`` every ch x wprefix x lprefix x lsuffix
        combination, otherwise ch paired with each string feature.  Finally
        every tag feature is paired with every string feature.

        Returns the string features, then the tag features, then the
        tag x string conjunctions.
        """
        order = self.cross_features

        tag_feat = ["tag:{}".format(t) for t in tags]
        if order >= 2:
            # NOTE(review): each `pair` is a 2-tuple and is formatted whole
            # into both slots, e.g. "tag2:('a', 'b')-('a', 'b')".  Kept as is
            # because the strings are feature names.
            tag_feat.extend("tag2:{}-{}".format(pair, pair)
                            for pair in itertools.product(tags, tags))

        ch_feat = ["ch:{}".format(lemma[idx])]
        lengths = range(1, window[0] + 1)
        pfx_feat = ['lprefix:{}'.format(lemma[idx - n:idx])
                    for n in lengths if n <= idx]
        wpfx_feat = ['wprefix:{}'.format(word_prefix[-n:])
                     for n in lengths if n <= len(word_prefix)]
        sfx_feat = ['lsuffix:{}'.format(lemma[idx:end])
                    for end in range(idx + 1, idx + window[1])
                    if end <= len(lemma)]

        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        if order > 3:
            # Fold the groups from the inside out so that each conjunction
            # reads ch & wprefix & lprefix & lsuffix.
            joined = sfx_feat
            for outer in (pfx_feat, wpfx_feat, ch_feat):
                joined = ["&".join(pair)
                          for pair in itertools.product(outer, joined)]
        else:
            joined = ["&".join(pair)
                      for pair in itertools.product(ch_feat, str_feat)]
        str_feat = str_feat + joined

        tag_cross = ["&".join(pair)
                     for pair in itertools.product(tag_feat, str_feat)]
        return str_feat + tag_feat + tag_cross

    def get_sparse2_features(self,
                             lemma,
                             word_prefix,
                             tags,
                             idx,
                             window=(10, 10)):
        """Build single-symbol features and their crossed combinations.

        Every base feature is a one-element set holding one string:

        * ``l0:<c>``   - lemma character at ``idx``
        * ``l-<i>:<c>`` - lemma character ``i`` positions to the left
        * ``l+<i>:<c>`` - lemma character ``i`` positions to the right
        * ``w-<i>:<c>`` - ``i``-th character from the end of ``word_prefix``
        * ``t:<tag>``  - one per tag

        The base list is crossed with itself ``self.cross_features`` times by
        set union, and each resulting set is rendered as its sorted members
        joined with ``'&'``.  Duplicates are not removed.
        """
        depth = self.cross_features

        tag_feat = [{"t:{}".format(t)} for t in tags]
        ch_feat = [{"l0:{}".format(lemma[idx])}]

        left = range(1, window[0] + 1)
        pfx_feat = [{'l-{}:{}'.format(k, lemma[idx - k])}
                    for k in left if k <= idx]
        wpfx_feat = [{'w-{}:{}'.format(k, word_prefix[-k])}
                     for k in left if k <= len(word_prefix)]
        sfx_feat = [{'l+{}:{}'.format(k, lemma[idx + k])}
                    for k in range(1, window[1] + 1)
                    if (idx + k) < len(lemma)]

        base = ch_feat + pfx_feat + sfx_feat + wpfx_feat + tag_feat
        crossed = base
        for _ in range(depth):
            crossed = [a | b for a in base for b in crossed]
        return ['&'.join(sorted(group)) for group in crossed]

    def get_positional_features(self,
                                lemma,
                                word_prefix,
                                tags,
                                idx,
                                window=(3, 3)):
        """Encode a fixed-size character window around ``idx`` plus the tags.

        The symbol list holds, in order: the lemma character at ``idx``; for
        each left position a (lemma character, ``word_prefix`` character)
        pair; and for each right position one lemma character.  Positions
        that fall outside a string are filled with ``self.data.ce.pad`` so
        the list always has ``1 + 2 * window[0] + window[1]`` entries.

        The symbols are one-hot encoded with ``self.data.ce``, flattened, and
        concatenated with ``self.data.te.transform([tags])[0]``.

        Returns the concatenated numpy array.
        """
        tag_enc = self.data.te
        ch_enc = self.data.ce
        pad = ch_enc.pad

        chars = [lemma[idx]]
        # NOTE(review): this range covers idx-window[0]-1 .. idx-2, so the
        # character directly before `idx` is never included.  Left unchanged;
        # confirm whether range(idx - window[0], idx) was intended.
        for i in range(idx - (window[0] + 1), idx - 1):
            if i >= 0:
                chars.append(lemma[i])
                # word_prefix may be shorter than the lemma position (after
                # deletions, say); pad instead of raising IndexError.
                chars.append(word_prefix[i] if i < len(word_prefix) else pad)
            else:
                # Two slots per left position, so pad both to keep the
                # vector length independent of `idx`.
                chars.extend((pad, pad))
        for i in range(idx + 1, idx + window[1] + 1):
            chars.append(lemma[i] if i < len(lemma) else pad)

        feat = np.array(ch_enc.encode(chars, onehot=True)).flatten()
        return np.concatenate((feat, tag_enc.transform([tags])[0]))

    def evaluate(self, wf, lemmas, tags):
        """Decode every lemma and score the predictions against ``wf``.

        ``wf``, ``lemmas`` and ``tags`` are indexed in parallel.  For each
        item the prediction from ``self.decode`` is compared with the
        expected word form.

        Prints and returns ``(acc, med)``: the share of exact matches and
        the mean of ``editdistance.eval(pred, word)`` over ``wf``.
        """
        n_exact = 0
        dist_sum = 0
        for pos, gold in enumerate(wf):
            item_tags = tags[pos]
            item_lemma = lemmas[pos]
            guess = self.decode(item_lemma, item_tags)
            if guess == gold:
                n_exact += 1
            dist_sum += editdistance.eval(guess, gold)

        med = dist_sum / len(wf)
        acc = n_exact / len(wf)
        print(acc, med, file=sys.stderr)
        return (acc, med)
示例#7
0
from __future__  import print_function
from aligner import Aligner

# Demo: align two short sequences and print both of them followed by every
# entry of the trace returned by the aligner.
x = "TCGAACTGAAAA"
y = "AACTGA"

a = Aligner()
trace = a.align(x, y)

print(x, y, sep="\n")
for t in trace:
    print(t)
in alignment time as the length of reads, length of genome, and number
of edits increase.
"""

import sys
from aligner import Aligner
import random
import time


string = "ACTCTGCTTTAG"
a = Aligner(string)

# Each read must be reported at the expected position in `string`.
for read, expected_pos in (
        ("TCTGC", 2),   # exact match
        ("ACTTGC", 0),  # insertion
        ("TACTT", 4),   # replacement
):
    pos, edits = a.align(read)
    assert pos == expected_pos

# Deletion case: aligned here, but its position is not asserted.
pos, edits = a.align("TCTTT")
def combine_records(forward_record,
                    reverse_record,
                    reference_sequences,
                    min_overlap=-1,
                    max_overlap=-1,
                    max_length_delta=1e30,
                    reference_scoring_ranges=None):
    '''
    Computes the alignments of both forward and reverse reads to the reference
    sequences. Synthesizes those alignments, using the better-quality read in
    the case of a conflict (the forward read wins ties). Returns
    (index, sequence, quality) where `index` is the index of the reference
    sequence used, `sequence` is the combined DNA sequence, and `quality` is
    the quality of each base in the combined sequence.

    The optional parameters min_overlap and max_overlap correspond to the overlap
    constraints on the alignment between the forward and reverse reads.

    If max_length_delta is no larger than the reference length and the aligned
    span differs from the reference length by more than max_length_delta, the
    read is discarded and (-1, None, None) is returned.

    reference_scoring_ranges, when given, is indexed like reference_sequences
    and is passed through to the aligner.
    '''
    aligner = Aligner()

    forward_str = str(forward_record.seq)
    reverse_str = str(reverse_record.seq.reverse_complement())

    # Align forward to references
    reference_index, forward_offset, forward_score = aligner.best_alignment(
        forward_str,
        reference_sequences,
        unidirectional=True,
        min_overlap=len(forward_str),
        candidate_scoring_ranges=reference_scoring_ranges)

    # Align forward to reverse
    reverse_offset, _ = aligner.align(forward_str,
                                      reverse_str,
                                      unidirectional=True,
                                      reverse=True,
                                      min_overlap=min_overlap,
                                      max_overlap=max_overlap)

    reference = reference_sequences[reference_index]
    reference_scoring_range = reference_scoring_ranges[
        reference_index] if reference_scoring_ranges is not None else None

    # Align reverse to reference
    reverse_offset_to_ref, reverse_score = aligner.align(
        reference,
        reverse_str,
        unidirectional=True,
        reverse=True,
        min_overlap=15,
        scoring_ranges=(reference_scoring_range, None))

    # Compare the pairwise scores of obeying the forward and obeying the reverse alignments to reference,
    # and adjust the alignment offsets accordingly.
    if reverse_score > forward_score:
        forward_offset = reverse_offset_to_ref - reverse_offset
        reverse_offset = reverse_offset_to_ref
    else:
        reverse_offset += forward_offset

    alignment_set = [(reference, 0), (forward_str, forward_offset),
                     (reverse_str, reverse_offset)]
    # Uncomment to print the resulting alignments
    # print('\n'.join(aligner.format_multiple(*alignment_set)))

    # Discard the read if total length is too different from reference length
    if max_length_delta <= len(reference):
        if math.fabs(aligner.length(*alignment_set) -
                     len(reference)) > max_length_delta:
            sc.counter(1, STAT_DELETIONS_KEY, STAT_EXCESS_LENGTH_KEY)
            return -1, None, None

    forward_qualities = forward_record.letter_annotations[SEQUENCE_QUALITY_KEY]
    # reverse_str is the reverse complement of the reverse read, so its i-th
    # base is the read's (n-1-i)-th base. The per-base qualities are stored in
    # read order and must be reversed to line up with reverse_str below.
    reverse_qualities = reverse_record.letter_annotations[
        SEQUENCE_QUALITY_KEY][::-1]

    # Combine the reads to produce the overall sequence.
    # The aligner will enumerate the aligned characters or elements of each iterable we give it.
    # Zipping generators for both the sequence and the quality allows us to enumerate them together.
    sequence_generator = aligner.enumerate_multiple(*alignment_set)
    quality_generator = aligner.enumerate_multiple(
        ([None] * len(reference), 0),
        (forward_qualities, forward_offset),
        (reverse_qualities, reverse_offset))

    # Collect bases in a list and join once instead of repeated string +=.
    combined_bases = []
    combined_quality = []
    for bases, qualities in izip(sequence_generator, quality_generator):
        _, forward_base, reverse_base = bases
        _, forward_quality, reverse_quality = qualities

        if forward_base is None and reverse_base is None:
            # Neither read covers this position.
            base, quality = UNSPECIFIED_BASE, 0
        elif forward_base is None:
            base, quality = reverse_base, reverse_quality
        elif reverse_base is None:
            base, quality = forward_base, forward_quality
        else:
            # Both reads cover the position: keep the higher-quality call.
            # max() returns the first maximal item, so forward wins ties.
            base, quality = max([(forward_base, forward_quality),
                                 (reverse_base, reverse_quality)],
                                key=lambda x: x[1])
        combined_bases.append(base)
        combined_quality.append(quality)

    combined_sequence = "".join(combined_bases)
    return reference_index, combined_sequence, combined_quality
示例#10
0
def main():
    """Benchmark alignment with reference updates over a parameter grid.

    Command line::

        <refFile> <readFiles> [intervals] [minReads] [triggerPoints] [confidences]

    ``refFile`` holds the reference string.  Every later argument is a
    comma-separated list; the optional ones are lists of integers.  Each
    combination of values is run once.  For every run one tab-separated
    result line is printed, and the final reference is printed at the end.
    The reference returned by the aligner after one run is the starting
    reference of the next.

    In a reads file, blank lines and lines starting with ``#`` are skipped.
    """
    import itertools  # local import: only this entry point needs it

    # Get command line arguments
    if len(sys.argv) < 3:
        printInfo()
        sys.exit(1)

    refFile = sys.argv[1]
    READFILE = sys.argv[2].strip().split(',')
    with open(refFile) as fi:
        ref = fi.read().strip()
    refLen = len(ref)

    def int_list(position, default):
        # Parse the optional comma-separated integer list at argv[position].
        if len(sys.argv) > position:
            return [int(i) for i in sys.argv[position].strip().split(',')]
        return default

    INTERVAL = int_list(3, [refLen])
    MINREADS = int_list(4, [10])
    TRIGGERPOINT = int_list(5, [10])
    CONFIDENCE = int_list(6, [50])

    print("time \t Read File \t num changes \t interval \t minReads \t triggerPoint \t confidence")

    # Iterate through all combinations of parameters (same order as nested
    # loops with readFile outermost and confidence innermost).
    for readFile, interval, minReads, triggerPoint, confidence in itertools.product(
            READFILE, INTERVAL, MINREADS, TRIGGERPOINT, CONFIDENCE):
        start_time = timeit.default_timer()
        # Init the alignment tracker and aligner
        a = Aligner(ref)
        rt = hashRangeTracker()
        rt.setRefLen(refLen)
        rt.setInterval(interval)  # Split genome into this many blocks
        rt.setMinReads(minReads)  # Minimum times a read should overlap a position
        rt.setTrigger(triggerPoint)  # How many times to hit a region before reporting
        rt.setConfidence(confidence)

        allChanges = []
        # Initialised here so the total below is defined even when the file
        # contains no usable reads.
        elapsed_time = 0.0
        with open(readFile) as readsFi:
            for read in readsFi:
                read = read.strip()
                if not read or read[0] == '#':
                    # Blank line or comment line
                    continue

                # NOTE(review): elapsed_time is overwritten (not accumulated)
                # on every read, and the align() call itself falls outside
                # the timed span.  Kept as in the original; confirm intent.
                elapsed_time = timeit.default_timer() - start_time
                aligned = a.align(read)
                start_time = timeit.default_timer()

                changes = rt.addAlignment(read, aligned[1], aligned[0])
                if len(changes) > 0:
                    # We have some changes to make
                    allChanges += changes
                    for c in changes:
                        makeChange(a, c)

            # Get the remaining updates
            changes = rt.flush()
            if len(changes) > 0:
                allChanges += changes
                for c in changes:
                    makeChange(a, c)

        elapsed_time += timeit.default_timer() - start_time
        ref = a.getRef()

        print(" \t ".join([str(elapsed_time), readFile, str(len(allChanges)),
                           str(interval), str(minReads), str(triggerPoint),
                           str(confidence)]))
    print(ref)