Example #1
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            raise EnrichError(
                "Missing required config value '{key}'".format(key), self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None
Example #2
def align_file(fin, fout, em_model, hmm_model):
    pairs = Aligner.readAMR(fin)
    print("Aligning")
    algs = Aligner.alignPairs(pairs, em_model, hmm_model)
    print("Writing alignments to file")
    Aligner.printAlignments(algs, pairs, fout)
    print("done")
Example #3
    def __init__(self, feature_type, **params):
        self.feature_type = feature_type
        self.a = Aligner(method='lcs')
        if feature_type == "sparse":
            self.get_features = self.get_sparse_features
        elif feature_type == "sparse2":
            self.get_features = self.get_sparse2_features
        elif feature_type == "onehot":
            self.get_features = self.get_positional_features
        self.old_window = None
        self.sample_weight = None
        self._reset()
        self.set_params(**params)
Example #4
def similarity_stats(all_sequences,
                     task,
                     out_dir,
                     out_prefix,
                     num_processes=15,
                     current_level=0):
    '''
    Writes to out_dir the distribution of similarities between the top k
    (determined by `task`) sequences in all_sequences and all other sequences
    in the list.
    '''
    if current_level == 0:
        sc.reset()
    _, level, k, scoring_ranges = task

    pool = multiprocessing.Pool(processes=num_processes)
    aligner = Aligner()

    for i in xrange(min(len(all_sequences), k)):
        seq = all_sequences[i]
        root = get_root_item(seq)[0]
        if scoring_ranges is not None:
            scoring_map = aligner.scoring_map(len(root), scoring_ranges)
        else:
            scoring_map = None

        if level == current_level:
            processor = partial(similarity_stats_processor, task, scoring_map,
                                seq)
            indexes = ((j, all_sequences[j])
                       for j in xrange(len((all_sequences))) if j != i)
            for index, result in pool.imap(processor, indexes, chunksize=1000):
                if result is not None:
                    if level == 0:
                        sc.counter(1, STAT_SIMILARITY_KEY, STAT_INDIVIDUAL_KEY,
                                   (i, result))
                    sc.counter(1, STAT_SIMILARITY_KEY, STAT_TOTAL_KEY, result)

        elif level > current_level:
            similarity_stats(seq[root],
                             task,
                             out_dir,
                             out_prefix,
                             num_processes=num_processes,
                             current_level=current_level + 1)

    pool.close()
    pool.join()
    if current_level == 0:
        sc.write(out_dir, prefix=out_prefix)
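The pool/partial pattern above can be illustrated with nothing but the standard library. The sketch below is a simplified stand-in rather than the project's code: a plain per-position identity score replaces Aligner.score, the stat-counter (sc) bookkeeping is dropped, and all names are illustrative.

from functools import partial
from multiprocessing import Pool


def identity_score(query, item):
    # Worker: score one (index, sequence) pair against the query sequence.
    index, other = item
    matches = sum(a == b for a, b in zip(query, other))
    return index, matches / float(max(len(query), len(other)))


def top_k_similarities(sequences, k=2, processes=4):
    # For each of the first k sequences, score it against every other sequence.
    results = {}
    pool = Pool(processes=processes)
    for i in range(min(len(sequences), k)):
        scorer = partial(identity_score, sequences[i])
        items = ((j, sequences[j]) for j in range(len(sequences)) if j != i)
        results[i] = dict(pool.imap(scorer, items, chunksize=100))
    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    print(top_k_similarities(["ACGTACGT", "ACGTACGA", "TTGTACGT", "ACGAACGT"], k=1))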
Example #5
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'], 
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            raise EnrichError("Missing required config value '{key}'".format(key), 
                              self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(config['wild type']
                                                  ['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None
Example #6
def compare_sequence_dictionaries(source, new, task):
    '''
    Returns True if the two sequence dictionaries should be merged.
    '''
    aligner = Aligner(different_score=0)
    source_key = get_root_item(source)[0]
    new_key = get_root_item(new)[0]

    scoring_maps = None
    max_score = len(source_key)
    _, _, allowed_mutations, scoring_ranges = task
    if scoring_ranges is not None:
        score_map = aligner.scoring_map(len(source_key), scoring_ranges)
        scoring_maps = (score_map, score_map)
        max_score = sum(score_map)

    return aligner.score(
        source_key, new_key, 0,
        scoring_maps=scoring_maps) >= max_score - allowed_mutations
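As a rough illustration of the merge test above (and only that; the real Aligner.score also handles offsets and per-position scoring maps), the decision reduces to keeping the number of mismatching positions within the allowed_mutations budget:

def should_merge(source_key, new_key, allowed_mutations):
    # Treat the score as (length - mismatches): merge when the mismatch count
    # stays within the allowed budget. Purely illustrative stand-in.
    if len(source_key) != len(new_key):
        return False
    mismatches = sum(a != b for a, b in zip(source_key, new_key))
    return mismatches <= allowed_mutations


assert should_merge("ACGTACGT", "ACGAACGT", allowed_mutations=1)
assert not should_merge("ACGTACGT", "TTTTACGT", allowed_mutations=1)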
Example #7
def _get_sentences_and_offsets(txt_handle, ss_handle):
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))

        t_char = None
        started_at = txt_handle.tell()
        started_at_read = txt_handle_reads
        while True:
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                assert False, ('could not align all sentences for: '
                               '"{}" and "{}" stopped at the sentence: "{}" '
                               'aligner in state: {}').format(
                                   txt_handle.name, ss_handle.name, s_text,
                                   aligner.__repr__())
            try:
                if aligner.align(t_char):
                    source_text = _str(aligner)

                    # We are aligned!
                    s_starts_and_sentences.append((
                        #txt_handle.tell() - len(source_text),
                        #started_at,
                        started_at_read,
                        Sentence(source_text, [])))
                    #last_end += aligner.char_cnt
                    break
            except MisalignedError:
                started_at = txt_handle.tell()
                started_at_read = txt_handle_reads
                pass

    #s_starts_and_sentences.sort()
    return s_starts_and_sentences
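A much-simplified illustration of the same idea with in-memory strings: recover the character offset of each pre-split sentence in the raw text. The helper above works character by character on file handles and tolerates collapsed whitespace, which this str.find sketch does not attempt; all names here are illustrative.

def sentence_offsets(raw_text, sentences):
    # Walk through the raw text, locating each sentence after the previous one.
    offsets = []
    cursor = 0
    for sentence in sentences:
        start = raw_text.find(sentence, cursor)
        if start == -1:
            raise ValueError('could not align sentence: {!r}'.format(sentence))
        offsets.append((start, sentence))
        cursor = start + len(sentence)
    return offsets


raw = "First sentence. Second one follows.\nAnd a third."
print(sentence_offsets(raw, ["First sentence.", "Second one follows.", "And a third."]))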
Example #8
def _get_sentences_and_offsets(txt_handle, ss_handle):
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'), ignore_mult=set((' ', )))

        t_char = None
        started_at = txt_handle.tell()
        started_at_read = txt_handle_reads
        while True:
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                assert False, ('could not align all sentences for: '
                        '"{}" and "{}" stopped at the sentence: "{}" '
                        'aligner in state: {}'
                        ).format(txt_handle.name, ss_handle.name,
                                s_text, aligner.__repr__())
            try:
                if aligner.align(t_char):
                    source_text = _str(aligner)

                    # We are aligned!
                    s_starts_and_sentences.append((
                            #txt_handle.tell() - len(source_text),
                            #started_at,
                            started_at_read,
                            Sentence(source_text, [])))
                    #last_end += aligner.char_cnt
                    break
            except MisalignedError:
                started_at = txt_handle.tell()
                started_at_read = txt_handle_reads
                pass
    
    #s_starts_and_sentences.sort()
    return s_starts_and_sentences
Example #9
def train_models(fnames, emiter, hmmiter, model_name):
    if not fnames:
        sys.exit("No file provided")

    print("Reading AMR files")
    pairs = []
    for fname in fnames:
        f = open(fname, "r")
        pairs += Aligner.readAMR(f)
        f.close()
    sentences = [Aligner.tokenize(pair[0]) for pair in pairs]
    graphs = [AMRGraph(pair[1], False) for pair in pairs]

    emprobs = EM.train(sentences, graphs, model_name + ".em", emiter)
    #emprobs = EM.load_model(model_name + ".em")
    print("Initializing rule-based alignments")
    n = len(sentences)
    initalgs = [{}] * n
    for i in range(n):
        initalgs[i] = Aligner.initalign(graphs[i].ref, sentences[i])
        if (i+1) % 1000 == 0:
            print(str(i+1) + "/" + str(n))
    hmmprobs = HMM.train(sentences, graphs, emprobs, model_name + ".hmm", hmmiter, initalgs)
    print("Done")
Example #10
def main():
	# Get command line arguments
	if len(sys.argv) < 3:
		printInfo()
		exit(1)


	ref = ""
	refLen = 0
	refFile = sys.argv[1]
	READFILE = sys.argv[2].strip().split(',')
	with open(refFile) as fi:
		ref = fi.read().strip()
		refLen = len(ref)

	INTERVAL = [refLen]
	MINREADS = [10]
	TRIGGERPOINT = [10]
	CONFIDENCE = [50]

	# Convert args into a list of args
	if len(sys.argv) > 3:
		INTERVAL = [int(i) for i in sys.argv[3].strip().split(',')]
	if len(sys.argv) > 4:
		MINREADS = [int(i) for i in sys.argv[4].strip().split(',')]
	if len(sys.argv) > 5:
		TRIGGERPOINT = [int(i) for i in sys.argv[5].strip().split(',')]
	if len(sys.argv) > 6:
		CONFIDENCE = [int(i) for i in sys.argv[6].strip().split(',')]

	print("time \t Read File \t num changes \t interval \t minReads \t triggerPoint \t confidence")

	# Iterate through all combinations of parameters
	for readFile in READFILE:
		for interval in INTERVAL:
			for minReads in MINREADS:
				for triggerPoint in TRIGGERPOINT:
					for confidence in CONFIDENCE:
						start_time = timeit.default_timer() # Times the block with alignment
						# Init the alignment tracker and aligner
						a = Aligner(ref)
						rt = hashRangeTracker()
						rt.setRefLen(refLen)
						rt.setInterval(interval) # Split genome into this many blocks
						rt.setMinReads(minReads) # Minimum times a read should overlap a position
						rt.setTrigger(triggerPoint) # How many times to hit a region before reporting
						rt.setConfidence(confidence)

						allChanges = []
						with open(readFile) as readsFi:
							for read in readsFi:
								read = read.strip()
								if read[0] == '#':
									# Comment line
									continue

								elapsed_time = timeit.default_timer() - start_time
								aligned = a.align(read)
								start_time = timeit.default_timer()

								changes = rt.addAlignment(read, aligned[1], aligned[0])
								if len(changes) > 0:
									# We have some changes to make
									allChanges += changes
									for c in changes:
										makeChange(a, c)
										# ref = ref[:c[0]] + c[1] + ref[c[0] + 1:] # to update ref w/out the index
									#a = Aligner(ref)

							# Get the remaining updates
							changes = rt.flush()
							if len(changes) > 0:
								allChanges += changes
								for c in changes:
									makeChange(a, c)
									# ref = ref[:c[0]] + c[1] + ref[c[0] + 1:] # to update ref w/out the index

						elapsed_time += timeit.default_timer() - start_time
						ref = a.getRef()

						print(str(elapsed_time) + " \t " + readFile + " \t " + str(len(allChanges)) + " \t " + str(interval) + " \t " + str(minReads) + " \t " + str(triggerPoint) + " \t " + str(confidence))
	print(ref)
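The sweep above is driven purely by positional command-line arguments (reference file, comma-separated read files, then optional comma-separated lists for interval, minReads, triggerPoint and confidence). A hypothetical invocation, with the script name and values invented for illustration, might look like:

    python align_sweep.py ref.txt reads1.txt,reads2.txt 50,100 5,10 10 50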
Example #11
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing variants. Implements core functionality for assessing variants, either coding
    or noncoding. Subclasses must evaluate the variant DNA sequences that are being counted.
    """
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            raise EnrichError(
                "Missing required config value '{key}'".format(key), self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None

    def is_coding(self):
        return self.wt_protein is not None

    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if *coding* 
        is ``True``. The *sequence* may only contain ``ACGT``, but may 
        contain whitespace (which will be removed). If *coding*, *sequence* must be in-frame.
        """
        sequence = "".join(sequence.split())  # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError(
                "WT DNA sequence contains unexpected "
                "characters", self.name)
        if len(sequence) % 3 != 0 and coding:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = ""
            for i in xrange(0, len(self.wt_dna), 3):
                self.wt_protein += codon_table[self.wt_dna[i:i + 3]]
        else:
            self.wt_protein = None

    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align the *variant_dna* to the 
        wild type sequence. Returns a list of HGVS variant strings.

        Aligned variants are stored in a local dictionary to avoid recomputing alignments. This 
        dictionary should be cleared after all variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner` dramatically increases runtime.
        """
        if variant_dna in self.aligner_cache.keys():
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x],
                                            post=variant_dna[y])
            elif cat == "insertion":
                if y > length:
                    dup = variant_dna[y:y + length]
                    if dup == variant_dna[y - length:y]:
                        mut = "dup{seq}".format(seq=dup)
                    else:
                        mut = "_{pos}ins{seq}".format(post=x + 2, seq=dup)
                else:
                    mut = "_{pos}ins{seq}".format(pos=x + 2,
                                                  seq=variant_dna[y:y +
                                                                  length])
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations

    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if there
        are an excess of mismatches (indicating a possible indel), local
        alignment is performed using :py:meth:`align_variant` if this option 
        has been selected in the configuration.

        Each variant is stored as a tab-delimited string of mutations in HGVS 
        format. Returns a list of HGVS variant strings. Returns an empty list
        if the variant is wild type. Returns None if the variant was discarded
        due to excess mismatches.
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError(
                "Variant DNA sequence contains unexpected "
                "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append(
                        (i, "{pre}>{post}".format(pre=self.wt_dna[i],
                                                  post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError:  # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) / 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos)
                elif variant_protein[pos / 3] == self.wt_protein[pos / 3]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[pos / 3]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if len(mutation_strings) > 0:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string

    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels* is ``False``, all mutations in a variant that contains 
        an insertion/deletion/duplication will not be counted. For coding sequences, amino acid substitutions are counted
        independently of the corresponding nucleotide change.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        # create new dictionaries
        self.df_dict['mutations_nt'] = dict()
        if self.is_coding():
            self.df_dict['mutations_aa'] = dict()

        if not include_indels:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask
        else:
            variant_data = self.df_dict['variants']
        if self.is_coding():
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                # get just the nucleotide changes
                for m in mutations:
                    m = m.split(" (")[0]
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count
                # get the amino acid changes
                aa_changes = re.findall("p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]",
                                        variant)
                for a in aa_changes:
                    try:
                        self.df_dict['mutations_aa'][a] += count
                    except KeyError:
                        self.df_dict['mutations_aa'][a] = count
        else:
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                for m in mutations:
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count

        self.df_dict['mutations_nt'] = \
                pd.DataFrame.from_dict(self.df_dict['mutations_nt'],
                                       orient="index", dtype="int32")
        if self.is_coding():
            self.df_dict['mutations_aa'] = \
                    pd.DataFrame.from_dict(self.df_dict['mutations_aa'],
                                           orient="index", dtype="int32")
Example #12
            return int(filename)
        except ValueError as e:
            return 0


# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

parser = argparse.ArgumentParser(description="Stack several image files to create digital long exposure photographs")
parser.add_argument("--align", action="store_true", help="run only the aligner, do not compress")
parser.add_argument("--transform", action="store_true", help="run only the aligner and transform, do not compress")
parser.add_argument("--stitch", action="store_true", help="stitch images for panoramic formats")
args = parser.parse_args()

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

aligner = Aligner()
stitcher = Stitcher()
stacker = Stacker(aligner)
input_images_aligner = []
input_images_stitcher = []
input_images_stacker = []

# transform to absolute paths
BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# init aligner

if args.align or args.transform:
Example #13
# -*- coding: utf-8 -*-

from text import *
from lexicon import *
from aligner import Aligner

text = Text()
lexicon = Lexicon()

text.parse("../data/77b.txt")
lexicon.parse("../data/arapaho_lexicon.json")

aligner = Aligner(text, lexicon)

aligner.align("../data/new_test_text_file.txt", "../data/test_log_file.txt")
Example #14
import numpy
import cv2

from aligner import Aligner
from model import autoencoder_A
from model import autoencoder_B
from model import encoder, decoder_A, decoder_B

encoder.load_weights("models/encoder.h5")
decoder_A.load_weights("models/decoder_A.h5")
decoder_B.load_weights("models/decoder_B.h5")

autoencoder = autoencoder_B

# landmark file can be found in http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# unzip it in the same folder as the main scripts
aligner = Aligner("shape_predictor_68_face_landmarks.dat",
                  "mmod_human_face_detector.dat")


def convert_one_image(image):
    assert image.shape == (256, 256, 3)
    crop = slice(48, 208)
    face = image[crop, crop]
    face = cv2.resize(face, (64, 64))
    face = numpy.expand_dims(face, 0)
    new_face = autoencoder.predict(face / 255.0)[0]
    new_face = numpy.clip(new_face * 255, 0, 255).astype(image.dtype)
    new_face = cv2.resize(new_face, (160, 160))
    result = aligner.align(image.copy(), new_face)
    if result is None:
        return superpose(image, new_face, crop)
    else:
Example #15
  gpu         = params["gpu"]
  queue_name  = params["queue_name"]
else:
  max_disp    = options.max_disp
  model_name  = options.model_name
  net_crop    = options.crop
  mip_range   = (options.net_mip_low, options.net_mip_high)
  patch_size  = options.patch_size
  xy_offset   = (options.x_offset, options.y_offset)
  xy_size     = (options.x_size, options.y_size)
  source_img  = options.source_img
  dest_img    = options.dest_img

  stack_start = options.stack_start
  stack_end   = options.stack_end
  move_anchor = options.move_anchor
  gpu         = options.gpu
  queue_name  = options.queue_name


model_path = 'model_repository/' + model_name + '.pt'
high_mip_chunk = (patch_size, patch_size)

a = Aligner(model_path, max_disp, net_crop, mip_range, high_mip_chunk,
		        source_img, dest_img, queue_name=queue_name, gpu=gpu)

bbox = BoundingBox(xy_offset[0], xy_offset[0]+xy_size[0],
                   xy_offset[1], xy_offset[1]+xy_size[1], mip=0, max_mip=9)

a.align_ng_stack(stack_start, stack_end, bbox, move_anchor=move_anchor)
Example #16
  stack_start = params["stack_start"]
  stack_end   = params["stack_end"]
  move_anchor = params["move_anchor"]
  gpu         = params["gpu"]
  queue_name  = params["queue_name"]
else:
  max_disp    = options.max_disp
  model_name  = options.model_name
  net_crop    = options.crop
  mip_range   = (options.net_mip_low, options.net_mip_high)
  patch_size  = options.patch_size
  xy_offset   = (options.x_offset, options.y_offset)
  xy_size     = (options.x_size, options.y_size)
  source_img  = options.source_img
  dest_img    = options.dest_img

  stack_start = options.stack_start
  stack_end   = options.stack_end
  move_anchor = options.move_anchor
  gpu         = options.gpu
  queue_name  = options.queue_name



model_path = 'model_repository/' + model_name + '.pt'
high_mip_chunk = (patch_size, patch_size)

a = Aligner(model_path, max_disp, net_crop, mip_range, high_mip_chunk,
		        source_img, dest_img, queue_name=queue_name, gpu=gpu)
a.listen_for_tasks()
Example #17
from aligner import Aligner
from rollback_pyramid import RollbackPyramid

aligners = {}
pyramid = RollbackPyramid()

for m in [8, 9, 10]:
    aligners[m] = Aligner(fms=[2, 16, 16, 16, 16, 2], k=7).cuda()
    aligners[m].load_state_dict(torch.load('./checkpoints/barak_aligner_mip{}.pth.tar'.format(m)))
    pyramid.set_mip_processor(aligners[m], m)
Example #18
Gwen Hoffmann
David Gong

Tests for correct alignment of reads. Runs experiments to test the change
in alignment time as the length of reads, length of genome, and number
of edits increase.
"""

import sys
from aligner import Aligner
import random
import time


string = "ACTCTGCTTTAG"
a = Aligner(string)


#test exact match
pos, edits = a.align("TCTGC")
assert pos == 2
#print pos, edits

#test insertion
pos, edits = a.align("ACTTGC")
#print pos, edits
assert pos == 0

#test replacement
pos, edits = a.align("TACTT")
#print pos, edits
Example #19
and reading these variants in a way which simulated sequencing
These variants have been written to a pickled dictionary object, and the reads written out to
a range of FastQ files, an R1 and R2 for each input gene

The condenser process aggregates all the R1 and R2 files across all these input genes into a 
single pair of FastQs, ready for alignment
"""
file_condenser = Condenser(geneset, run_number)
file_condenser.run()
"""
From this point, the rest of the process involves running a couple of template commands:
- Aligning the reads to a reference
- Calling variants
- Annotating those variants (minimal, HGVS and gene name only)
"""
aligner = Aligner(sam_directory, output_name, reference, run_number)
bam_filename = aligner.run()
bam_location = os.path.join("fastQs", bam_filename)

"""
This section is for the variant calling on the aligned files.
Due to some aspect of the read generation, Platypus is unable to generate variant calls from the aligned input data.
The SAMtools mpileup feature, combined with the bcftools call function, is used for the two-step variant calling.
"""
temp_bcf = os.path.join("VCFs", "temp.bcf")
vcf_location = os.path.join("VCFs", vcf_name)
variant_filled = variant_call_string % (reference, bam_location)
filled_filter = var_call_filter_string % temp_bcf


print variant_filled
Example #20
    #unpickle
    with open("clusteredGerman.dat", "rb") as f:
        germanClusterList = pickle.load(f)

    with open("clusteredEnglish.dat", "rb") as f:
        englishClusterList = pickle.load(f)

    with open("intersection.dat", "rb") as f:
        intersection = pickle.load(f)
    print("done unpickling")
    #for cluster in germanClusterList:
    #print(cluster.predicates)
    entitySetLength = len(intersection) + 1

    #alignment of clusters begins here
    a = Aligner()
    print("begin aligning: ", datetime.datetime.now())
    clusterTupleList = a.alignClustersNew(germanClusterList,
                                          englishClusterList, entitySetLength,
                                          intersection,
                                          "alignmentOutputWithcosineSim.txt")
    print("done aligning: ", datetime.datetime.now())

    #pickling of final list
    with open("alignedListNoPmi.dat", "wb") as f:
        pickle.dump(clusterTupleList, f)

    #for clusterTupel in clusterTupleList:
    #    clusterTupel[0].printClusterPredicates()
    #    clusterTupel[1].printClusterPredicates()
    #    print(clusterTupel[2])
Example #21
    parser.add_argument("--do_test", default=True, type=util.str2bool)
    parser.add_argument("--cache_dataset", default=False, type=util.str2bool)
    parser.add_argument("--cache_path", default="", type=str)
    ############################################################################
    parser.add_argument("--default_save_path", default="./", type=str)
    parser.add_argument("--gradient_clip_val", default=0, type=float)
    parser.add_argument("--num_nodes", default=1, type=int)
    parser.add_argument("--gpus", default=None, type=int)
    parser.add_argument("--overfit_batches", default=0.0, type=float)
    parser.add_argument("--track_grad_norm", default=-1, type=int)
    parser.add_argument("--check_val_every_n_epoch", default=1, type=int)
    parser.add_argument("--fast_dev_run", default=False, type=util.str2bool)
    parser.add_argument("--accumulate_grad_batches", default=1, type=int)
    parser.add_argument("--max_epochs", default=1000, type=int)
    parser.add_argument("--min_epochs", default=1, type=int)
    parser.add_argument("--max_steps", default=None, type=int)
    parser.add_argument("--min_steps", default=None, type=int)
    parser.add_argument("--val_check_interval", default=1.0, type=float)
    parser.add_argument("--log_every_n_steps", default=10, type=int)
    parser.add_argument("--distributed_backend", default=None, type=str)
    parser.add_argument("--precision", default=32, type=int)
    parser.add_argument("--resume_from_checkpoint", default=None, type=str)
    ############################################################################
    parser = Model.add_model_specific_args(parser)
    parser = Tagger.add_model_specific_args(parser)
    parser = Classifier.add_model_specific_args(parser)
    parser = DependencyParser.add_model_specific_args(parser)
    parser = Aligner.add_model_specific_args(parser)
    hparams = parser.parse_args()
    main(hparams)
Example #22
def combine_records(forward_record,
                    reverse_record,
                    reference_sequences,
                    min_overlap=-1,
                    max_overlap=-1,
                    max_length_delta=1e30,
                    reference_scoring_ranges=None):
    '''
    Computes the alignments of both forward and reverse reads to the reference
    sequences. Synthesizes those alignments, using the better-quality read in
    the case of a conflict. Returns (index, sequence, quality) where `index` is
    the index of the reference sequence used, `sequence` is the combined DNA
    sequence, and `quality` is the quality of each base in the combined sequence.

    The optional parameters min_overlap and max_overlap correspond to the overlap
    constraints on the alignment between the forward and reverse reads.
    '''
    aligner = Aligner()

    forward_str = str(forward_record.seq)
    reverse_str = str(reverse_record.seq.reverse_complement())

    # Align forward to references
    reference_index, forward_offset, forward_score = aligner.best_alignment(
        forward_str,
        reference_sequences,
        unidirectional=True,
        min_overlap=len(forward_str),
        candidate_scoring_ranges=reference_scoring_ranges)

    # Align forward to reverse
    reverse_offset, _ = aligner.align(forward_str,
                                      reverse_str,
                                      unidirectional=True,
                                      reverse=True,
                                      min_overlap=min_overlap,
                                      max_overlap=max_overlap)

    reference = reference_sequences[reference_index]
    reference_scoring_range = reference_scoring_ranges[
        reference_index] if reference_scoring_ranges is not None else None

    # Align reverse to reference
    reverse_offset_to_ref, reverse_score = aligner.align(
        reference,
        reverse_str,
        unidirectional=True,
        reverse=True,
        min_overlap=15,
        scoring_ranges=(reference_scoring_range, None))

    # Compare the pairwise scores of obeying the forward and obeying the reverse alignments to reference,
    # and adjust the alignment offsets accordingly.
    if reverse_score > forward_score:
        forward_offset = reverse_offset_to_ref - reverse_offset
        reverse_offset = reverse_offset_to_ref
    else:
        reverse_offset += forward_offset

    combined_sequence = ""
    combined_quality = []

    alignment_set = [(reference, 0), (forward_str, forward_offset),
                     (reverse_str, reverse_offset)]
    # Uncomment to print the resulting alignments
    # print('\n'.join(aligner.format_multiple(*alignment_set)))

    # Discard the read if total length is too different from reference length
    if max_length_delta <= len(reference):
        if math.fabs(aligner.length(*alignment_set) -
                     len(reference)) > max_length_delta:
            sc.counter(1, STAT_DELETIONS_KEY, STAT_EXCESS_LENGTH_KEY)
            return -1, None, None

    # Combine the reads to produce the overall sequence.
    # The aligner will enumerate the aligned characters or elements of each iterable we give it.
    # Zipping generators for both the sequence and the quality allows us to enumerate them together.
    sequence_generator = aligner.enumerate_multiple(*alignment_set)
    quality_generator = aligner.enumerate_multiple(
        ([None for i in xrange(len(reference))], 0),
        (forward_record.letter_annotations[SEQUENCE_QUALITY_KEY],
         forward_offset),
        (reverse_record.letter_annotations[SEQUENCE_QUALITY_KEY],
         reverse_offset))
    for bases, qualities in izip(sequence_generator, quality_generator):
        _, forward_base, reverse_base = bases
        _, forward_quality, reverse_quality = qualities

        if forward_base is None and reverse_base is None:
            combined_sequence += UNSPECIFIED_BASE
            combined_quality.append(0)
        elif forward_base is None:
            combined_sequence += reverse_base
            combined_quality.append(reverse_quality)
        elif reverse_base is None:
            combined_sequence += forward_base
            combined_quality.append(forward_quality)
        else:
            base, quality = max([(forward_base, forward_quality),
                                 (reverse_base, reverse_quality)],
                                key=lambda x: x[1])
            combined_sequence += base
            combined_quality.append(quality)

    return reference_index, combined_sequence, combined_quality
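The read-combination step can be reduced to a small, self-contained routine: wherever the (already reverse-complemented) reverse read overlaps the forward read, keep the higher-quality base. The offsets, total length, and the 'N' placeholder here are assumptions of this sketch; the real function above derives the offsets from the Aligner and carries the reference sequence along as well.

def merge_reads(forward, forward_qual, reverse, reverse_qual, reverse_offset, total_length):
    # Build the combined sequence position by position, preferring the base
    # with the higher quality score wherever the two reads overlap.
    sequence, quality = [], []
    for pos in range(total_length):
        f = (forward[pos], forward_qual[pos]) if pos < len(forward) else None
        r_idx = pos - reverse_offset
        r = (reverse[r_idx], reverse_qual[r_idx]) if 0 <= r_idx < len(reverse) else None
        if f is None and r is None:
            sequence.append('N')   # position covered by neither read
            quality.append(0)
        else:
            base, q = max([c for c in (f, r) if c is not None], key=lambda c: c[1])
            sequence.append(base)
            quality.append(q)
    return ''.join(sequence), quality


seq, qual = merge_reads("ACGTAC", [30, 30, 30, 20, 20, 20],
                        "TACGGA", [40, 40, 40, 40, 40, 40],
                        reverse_offset=3, total_length=9)
print(seq, qual)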
Example #23
    description=
    "Stack several image files to create digital long exposure photographies")
parser.add_argument("--align",
                    action="store_true",
                    help="run only the aligner, do not compress")
parser.add_argument("--transform",
                    action="store_true",
                    help="run only the aligner and transform, do not compress")
parser.add_argument("--stitch",
                    action="store_true",
                    help="stitch images for panoramic formats")
args = parser.parse_args()

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

aligner = Aligner()
stitcher = Stitcher()
stacker = Stacker(aligner)
input_images_aligner = []
input_images_stitcher = []
input_images_stacker = []

# transform to absolute paths
BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# init aligner

if args.align or args.transform:
Example #24
    for cluster in new_hierarchy:
        #print(cluster)
        yield cluster


### Similarity stats


def similarity_stats_processor(task, scoring_map, seq, (index, other_seq)):
    '''
    Convenience function for multiprocessing that calls the merge_function and
    returns part of the input.
    '''
    root = get_root_item(seq)[0]
    other_root = get_root_item(other_seq)[0]
    aligner = Aligner(different_score=0)
    return index, aligner.score(root,
                                other_root,
                                0,
                                scoring_maps=(scoring_map, scoring_map))


def similarity_stats(all_sequences,
                     task,
                     out_dir,
                     out_prefix,
                     num_processes=15,
                     current_level=0):
    '''
    Writes to out_dir the distribution of similarities between the top k
    (determined by `task`) sequences in all_sequences and all other sequences
    in the list.
Example #25
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing variants. Implements core functionality for assessing variants, either coding
    or noncoding. Subclasses must evaluate the variant DNA sequences that are being counted.
    """
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'], 
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            raise EnrichError("Missing required config value '{key}'".format(key), 
                              self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(config['wild type']
                                                  ['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None


    def is_coding(self):
        return self.wt_protein is not None


    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if *coding* 
        is ``True``. The *sequence* may only contain ``ACGT``, but may 
        contain whitespace (which will be removed). If *coding*, *sequence* must be in-frame.
        """
        sequence = "".join(sequence.split()) # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError("WT DNA sequence contains unexpected "
                              "characters", self.name)
        if len(sequence) % 3 != 0 and coding:
            raise EnrichError("WT DNA sequence contains incomplete codons", 
                              self.name)
        
        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = ""
            for i in xrange(0, len(self.wt_dna), 3):
                self.wt_protein += codon_table[self.wt_dna[i:i + 3]]
        else:
            self.wt_protein = None


    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align the *variant_dna* to the 
        wild type sequence. Returns a list of HGVS variant strings.

        Aligned variants are stored in a local dictionary to avoid recomputing alignments. This 
        dictionary should be cleared after all variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner` dramatically increases runtime.
        """
        if variant_dna in self.aligner_cache.keys():
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x], post=variant_dna[y])
            elif cat == "insertion":
                if y > length:
                    dup = variant_dna[y:y + length]
                    if dup == variant_dna[y - length:y]:
                        mut = "dup{seq}".format(seq=dup)
                    else:
                        mut = "_{pos}ins{seq}".format(post=x + 2, seq=dup)
                else:                                    
                    mut = "_{pos}ins{seq}".format(pos=x + 2, seq=variant_dna[y:y + length])
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations


    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if there
        are an excess of mismatches (indicating a possible indel), local
        alignment is performed using :py:meth:`align_variant` if this option 
        has been selected in the configuration.

        Each variant is stored as a tab-delimited string of mutations in HGVS 
        format. Returns a list of HGVS variant strings. Returns an empty list
        if the variant is wild type. Returns None if the variant was discarded
        due to excess mismatches.
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError("Variant DNA sequence contains unexpected "
                              "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append((i, "{pre}>{post}".format(pre=self.wt_dna[i], post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError: # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) / 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(pre=aa_codes[self.wt_protein[pos / 3]], pos=ref_pro_pos)
                elif variant_protein[pos / 3] == self.wt_protein[pos / 3]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(pre=aa_codes[self.wt_protein[pos / 3]], pos=ref_pro_pos,
                             post=aa_codes[variant_protein[pos / 3]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if len(mutation_strings) > 0:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string


    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels* is ``False``, all mutations in a variant that contains 
        an insertion/deletion/duplication will not be counted. For coding sequences, amino acid substitutions are counted
        independently of the corresponding nucleotide change.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        # create new dictionaries
        self.df_dict['mutations_nt'] = dict()
        if self.is_coding():
            self.df_dict['mutations_aa'] = dict()

        if not include_indels:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask
        else:
            variant_data = self.df_dict['variants']
        if self.is_coding():
            for variant, count in variant_data.iterrows():
                count = count['count'] # get the element from the Series
                mutations = variant.split(", ")
                # get just the nucleotide changes
                for m in mutations:
                    m = m.split(" (")[0]
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count
                # get the amino acid changes
                aa_changes = re.findall("p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]", variant)
                for a in aa_changes:
                    try:
                        self.df_dict['mutations_aa'][a] += count
                    except KeyError:
                        self.df_dict['mutations_aa'][a] = count
        else:
            for variant, count in variant_data.iterrows():
                count = count['count'] # get the element from the Series
                mutations = variant.split(", ")
                for m in mutations:
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count

        self.df_dict['mutations_nt'] = \
                pd.DataFrame.from_dict(self.df_dict['mutations_nt'], 
                                       orient="index", dtype="int32")
        if self.is_coding():
            self.df_dict['mutations_aa'] = \
                    pd.DataFrame.from_dict(self.df_dict['mutations_aa'], 
                                           orient="index", dtype="int32")
Example #26
class Inflection:
    _PARAMS = {
        'C': 1.0,
        'window': 3,
        'cross_features': 2,
        'classifier': 'mono',
        'C_replace': 0.0,
        'C_insert': 0.0
    }

    def __init__(self, feature_type, **params):
        self.feature_type = feature_type
        self.a = Aligner(method='lcs')
        if feature_type == "sparse":
            self.get_features = self.get_sparse_features
        elif feature_type == "sparse2":
            self.get_features = self.get_sparse2_features
        elif feature_type == "onehot":
            self.get_features = self.get_positional_features
        self.old_window = None
        self.sample_weight = None
        self._reset()
        self.set_params(**params)

    def _reset(self):
        for k, v in self._PARAMS.items():
            setattr(self, k, v)

    def set_params(self, **params):
        for k, v in params.items():
            if k in self._PARAMS:
                setattr(self, k, v)

    def vectorize(self, lem, tag, wf=None):
        if self.old_window != self.window:
            self.old_window = self.window
            self.features = []
            self.labels = []
            for i, (l, t, w) in enumerate(zip(lem, tag, wf)):
                alignments = self.a.align(l, w)
                alignments = [[('<', '<')] + x + [('>', '>')]
                              for x in alignments]

                for j, a in enumerate(alignments):
                    if j > 0:
                        break  # in case there are multiple alignments take only the first
                    li, wi = 0, 0
                    for k, (lc, wc) in enumerate(a):
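                        # Each aligned (lemma char, word char) pair is turned
                        # into one training example: an empty lemma char means
                        # insert, identical chars mean copy, an empty word
                        # char means delete, anything else means replace.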
                        self.features.append(
                            self.get_features('<' + l + '>',
                                              '<' + w[:wi],
                                              t,
                                              li,
                                              window=(self.window,
                                                      self.window)))
                        if lc == '':
                            action = 'insert:' + wc
                            wi += 1
                        elif lc == wc:
                            action = 'copy:'
                            li += 1
                            wi += 1
                        elif wc == '':
                            action = 'delete:'
                            li += 1
                        else:
                            action = 'replace:' + wc
                            li += 1
                            wi += 1
                        self.labels.append(action)

            if self.feature_type.startswith('sparse'):
                self.vec = TfidfVectorizer(sublinear_tf=True,
                                           analyzer=lambda x: x)
                self.x = self.vec.fit_transform(self.features)
            else:
                self.x = np.array(self.features)

    def fit(self, wf, lem, tag):
        print("vecorize....", file=sys.stderr)
        self.vectorize(lem, tag, wf)
        print(self.x.shape, file=sys.stderr)

        print("fit....", file=sys.stderr)
        if self.classifier == 'twostep':
            action = [s.split(':')[0] for s in self.labels]
            self.clf = LinearSVC(C=self.C,
                                 class_weight='balanced',
                                 max_iter=1000)
            self.clf.fit(self.x, action, sample_weight=self.sample_weight)

            replace_i = [i for i in range(len(self.labels))\
                    if self.labels[i].startswith('replace')]
            sw = None
            if len(replace_i):
                x = self.x[replace_i, :]
                y = np.array(self.labels)[replace_i]
                if self.C_replace == 0.0: self.C_replace = self.C
                if len(set(y)) == 1:
                    self.clf_replace = DummyClassifier()
                else:
                    self.clf_replace = LinearSVC(C=self.C_replace,
                                                 class_weight='balanced',
                                                 max_iter=50000)
                self.clf_replace.fit(x, y)
            else:
                self.clf_replace = DummyClassifier()

            insert_i = [i for i in range(len(self.labels))\
                    if self.labels[i].startswith('insert')]
            if len(insert_i):
                x = self.x[insert_i, :]
                y = np.array(self.labels)[insert_i]
                if self.C_insert == 0.0: self.C_insert = self.C
                if len(set(y)) == 1:
                    self.clf_insert = DummyClassifier()
                else:
                    self.clf_insert = LinearSVC(C=self.C_insert,
                                                class_weight='balanced',
                                                max_iter=50000)
                self.clf_insert.fit(x, y)
            else:
                self.clf_insert = DummyClassifier()
        else:
            self.clf = LinearSVC(C=self.C,
                                 class_weight='balanced',
                                 max_iter=50000)
            self.clf.fit(self.x, self.labels, sample_weight=self.sample_weight)

    def predict(self, x):
        if self.classifier == 'twostep':
            action = str(self.clf.predict(x)[0])
            ch = ''
            if action == 'insert':
                if self.clf_insert is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_insert.predict(x)[0]).split(':', 1)[1]
            elif action == 'replace':
                if self.clf_replace is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_replace.predict(x)[0]).split(':', 1)[1]
            return action, ch
        else:
            return str(self.clf.predict(x)[0]).split(':', 1)

    def decode(self, lemma, tags, max_len=30):
        w_prefix = ''
        li = 0
        while li < len(lemma):
            feat = self.get_features(lemma, w_prefix, tags, li)
            if self.feature_type.startswith('sparse'):
                x = self.vec.transform([feat])
            else:
                x = np.array([feat])
            act, arg = self.predict(x)
            if act == 'copy':
                w_prefix += lemma[li]
                li += 1
            elif act == 'replace':
                w_prefix += arg
                li += 1
            elif act == 'insert':
                w_prefix += arg
            elif act == 'delete':
                li += 1
            if len(w_prefix) > max_len or w_prefix and w_prefix[-1] == '>':
                break
        return w_prefix

    def get_sparse_features(self,
                            lemma,
                            word_prefix,
                            tags,
                            idx,
                            window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = ["tag:{}".format(t) for t in tags]
        if cross >= 2:
            tag_feat += [
                "tag2:{}-{}".format(t, t)
                for t in itertools.product(tags, tags)
            ]
        ch_feat = ["ch:{}".format(lemma[idx])]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append('lprefix:{}'.format(lemma[idx - i:idx]))
            if i <= len(word_prefix):
                wpfx_feat.append('wprefix:{}'.format(word_prefix[-i:]))
        for i in range(idx + 1, idx + window[1] + 1):
            if i <= len(lemma):
                sfx_feat.append('lsuffix:{}'.format(lemma[idx:i]))
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        if cross > 3:
            cross = [
                "&".join((x, y))
                for x, y in itertools.product(pfx_feat, sfx_feat)
            ]
            cross = [
                "&".join((x, y))
                for x, y in itertools.product(wpfx_feat, cross)
            ]
            cross = [
                "&".join((x, y)) for x, y in itertools.product(ch_feat, cross)
            ]
            str_feat += cross
        else:
            cross = [
                "&".join((x, y))
                for x, y in itertools.product(ch_feat, str_feat)
            ]
            str_feat += cross
        return str_feat + tag_feat + [
            "&".join((x, y)) for x, y in itertools.product(tag_feat, str_feat)
        ]

    def get_sparse2_features(self,
                             lemma,
                             word_prefix,
                             tags,
                             idx,
                             window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = [{"t:{}".format(t)} for t in tags]
        ch_feat = [{"l0:{}".format(lemma[idx])}]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append({'l-{}:{}'.format(i, lemma[idx - i])})
            if i <= len(word_prefix):
                wpfx_feat.append({'w-{}:{}'.format(i, word_prefix[-i])})
        for i in range(1, window[1] + 1):
            if (idx + i) < len(lemma):
                sfx_feat.append({'l+{}:{}'.format(i, lemma[idx + i])})
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        feat = str_feat + tag_feat
        feat_cross = feat
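        # Conjoin atomic feature sets `cross` times: each pass unions every
        # atomic set with every set from the previous pass, yielding
        # higher-order feature combinations.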
        for i in range(cross):
            feat_cross = [
                x | y for x, y in itertools.product(feat, feat_cross)
            ]
        return ['&'.join(sorted(f)) for f in feat_cross]

    def get_positional_features(self,
                                lemma,
                                word_prefix,
                                tags,
                                idx,
                                window=(3, 3)):
        chars = [lemma[idx]]
        tag_enc = self.data.te
        ch_enc = self.data.ce
        for i in range(idx - window[0], idx):
            if i >= 0:
                chars.append(lemma[i])
                # word_prefix can be shorter than the consumed part of the lemma
                chars.append(word_prefix[i] if i < len(word_prefix) else ch_enc.pad)
            else:
                # pad both the lemma and word-prefix slots so the vector length stays fixed
                chars.append(ch_enc.pad)
                chars.append(ch_enc.pad)
        for i in range(idx + 1, idx + window[1] + 1):
            if i < len(lemma):
                chars.append(lemma[i])
            else:
                chars.append(ch_enc.pad)
        feat = np.array(ch_enc.encode(chars, onehot=True)).flatten()
        feat = np.concatenate((feat, tag_enc.transform([tags])[0]))
        return feat

    def evaluate(self, wf, lemmas, tags):
        acc = 0
        med = 0
        for i, word in enumerate(wf):
            tag = tags[i]
            lem = lemmas[i]
            pred = self.decode(lem, tag)
            #            print(word, pred, file=sys.stderr)
            acc += int(pred == word)
            med += editdistance.eval(pred, word)
        med = med / len(wf)
        acc = acc / len(wf)
        print(acc, med, file=sys.stderr)
        return (acc, med)
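A minimal usage sketch for the classifier defined above (hedged: the class
name EditModel, the constructor keywords, and the toy data are placeholders,
and the '>' end-of-word marker in the strings is only a guess based on the
stopping condition in decode):

model = EditModel(feature_type='sparse', classifier='twostep', C=1.0)
# parallel lists of surface word forms, lemmas, and morphological tag lists
wf = ['dogs>', 'cats>']
lemmas = ['dog>', 'cat>']
tags = [['N', 'PL'], ['N', 'PL']]
model.fit(wf, lemmas, tags)
print(model.decode('fox>', ['N', 'PL']))
acc, med = model.evaluate(wf, lemmas, tags)  # accuracy and mean edit distance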
Пример #27
from __future__ import print_function
from aligner import Aligner

a = Aligner()

x = "TCGAACTGAAAA"
y = "AACTGA"

trace = a.align(x, y)
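# 'trace' presumably holds the sequence of alignment operations (the traceback)
# between x and y; each step is printed on its own line below.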

print(x)
print(y)
for t in trace:
    print(t)
def cluster_sequences(all_sequences, task, num_processes=15, current_level=0):
    '''
    Performs the given clustering task. The task should contain the number of
    mutations to allow for sequences to be clustered, the hierarchical level at
    which the sequences are clustered, and optionally a list of ranges to use
    for scoring.

    The algorithm works as follows: let n be the length of all_sequences, k be
    the number of bases to score, and t be the number of mutations allowed. For
    every k-choose-t combination of bases in each sequence, the sequence string
    is hashed into a dictionary where that set of bases is replaced by a neutral
    character. If a hashing collision occurs, then that sequence is marked as
    belonging in a cluster with the originally hashed item. Once all of the
    sequences have been hashed, the clusters are merged together (using the
    sequence that appears first in the input list as the consensus) and
    yielded. Altogether, this algorithm
    requires O(nk^t) running time as well as O(nk^t) space complexity.
    '''
    level = task[1]

    if level == current_level and len(all_sequences) == 1:
        yield all_sequences[0]
        return

    if level == current_level:
        # Get a scoring map to know which bases to include in the hashes
        _, _, allowed_mutations, scoring_ranges = task
        if scoring_ranges is not None and len(scoring_ranges) > 0:
            aligner = Aligner(different_score=0)
            score_map = aligner.scoring_map(scoring_ranges[-1][1],
                                            scoring_ranges)
        else:
            score_map = None

        # Build hashes corresponding to the sequences with the specified
        # allowed_mutations # of bases excluded
        if level == 0:
            print("Hashing and clustering...")
        hashes = {}
        clusters = {}
        for i, seq in enumerate(all_sequences):
            root = get_root_item(seq)[0]
            for key_1, key_2 in sequence_hash_excluding_bases(
                    root, score_map, allowed_mutations):
                if key_1 not in hashes:
                    hashes[key_1] = {}
                relevant_dict = hashes[key_1]

                if key_2 in relevant_dict:
                    # Found an overlap - create a cluster
                    marker_index = relevant_dict[key_2]
                    if marker_index not in clusters:
                        clusters[marker_index] = set()
                    clusters[marker_index].add(i)
                else:
                    relevant_dict[key_2] = i

        # Merge the found clusters and yield them in decreasing order of frequency
        if level == 0:
            print("Merging and returning clusters...")
        visited_indexes = set()
        for i, seq in enumerate(all_sequences):
            if i in visited_indexes:
                continue
            merged_seq = seq
            root = get_root_item(merged_seq)[0]
            visited_indexes.add(i)

            if i in clusters:
                for other_index in sorted(list(clusters[i])):
                    other_seq = all_sequences[other_index]
                    other_root = get_root_item(other_seq)[0]
                    merged_seq[root] = merge_sequence_info(
                        merged_seq[root], other_seq[other_root])
                    visited_indexes.add(other_index)

            yield merged_seq
            if len(visited_indexes) == len(all_sequences):
                break
    else:
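        # Not yet at the target level: recurse into each sequence's children
        # and rebuild the hierarchy from the clustered results.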
        num_seqs = len(all_sequences)
        for i in xrange(num_seqs):
            if current_level == 0 and i % 1000 == 0:
                print("Clustering sequence {} of {}...".format(i, num_seqs))
            seq = all_sequences[i]
            root = get_root_item(seq)[0]
            merged_seq = {root: []}

            for result in cluster_sequences(seq[root], task, num_processes,
                                            current_level + 1):
                merged_seq[root].append(result)
            yield merged_seq
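The hashing helper referenced above, sequence_hash_excluding_bases, is defined
elsewhere in this codebase. A minimal sketch of the idea described in the
docstring might look like the following (the '.' wildcard, the two-part key
split, and the assumption that score_map is an iterable of scored positions
are guesses, not the actual implementation):

from itertools import combinations

def sequence_hash_excluding_bases_sketch(seq, score_map, allowed_mutations):
    # Positions eligible for masking; with no score_map, every base is scored.
    positions = list(score_map) if score_map is not None else list(range(len(seq)))
    for excluded in combinations(positions, allowed_mutations):
        masked = list(seq)
        for pos in excluded:
            if pos < len(masked):
                masked[pos] = '.'  # neutral character standing in for any base
        masked = ''.join(masked)
        mid = len(masked) // 2
        # Two-part key (key_1, key_2) mirrors the nested dict used above and
        # keeps each inner dictionary smaller.
        yield masked[:mid], masked[mid:]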