def align_file(fin, fout, em_model, hmm_model):
    pairs = Aligner.readAMR(fin)
    print("Aligning")
    algs = Aligner.alignPairs(pairs, em_model, hmm_model)
    print("Writing alignments to file")
    Aligner.printAlignments(algs, pairs, fout)
    print("done")
def similarity_stats(all_sequences, task, out_dir, out_prefix,
                     num_processes=15, current_level=0):
    '''
    Writes the distribution of similarities of the top k (determined by `task`)
    sequences in all_sequences to all other sequences in the list to out_dir.
    '''
    if current_level == 0:
        sc.reset()
    _, level, k, scoring_ranges = task
    pool = multiprocessing.Pool(processes=num_processes)
    aligner = Aligner()
    for i in xrange(min(len(all_sequences), k)):
        seq = all_sequences[i]
        root = get_root_item(seq)[0]
        if scoring_ranges is not None:
            scoring_map = aligner.scoring_map(len(root), scoring_ranges)
        else:
            scoring_map = None
        if level == current_level:
            processor = partial(similarity_stats_processor, task, scoring_map, seq)
            indexes = ((j, all_sequences[j])
                       for j in xrange(len(all_sequences)) if j != i)
            for index, result in pool.imap(processor, indexes, chunksize=1000):
                if result is not None:
                    if level == 0:
                        sc.counter(1, STAT_SIMILARITY_KEY, STAT_INDIVIDUAL_KEY,
                                   (i, result))
                    sc.counter(1, STAT_SIMILARITY_KEY, STAT_TOTAL_KEY, result)
        elif level > current_level:
            similarity_stats(seq[root], task, out_dir, out_prefix,
                             num_processes=num_processes,
                             current_level=current_level + 1)
    pool.close()
    pool.join()
    if current_level == 0:
        sc.write(out_dir, prefix=out_prefix)
def compare_sequence_dictionaries(source, new, task):
    '''
    Returns True if the two sequence dictionaries should be merged.
    '''
    aligner = Aligner(different_score=0)
    source_key = get_root_item(source)[0]
    new_key = get_root_item(new)[0]
    scoring_maps = None
    max_score = len(source_key)
    _, _, allowed_mutations, scoring_ranges = task
    if scoring_ranges is not None:
        score_map = aligner.scoring_map(len(source_key), scoring_ranges)
        scoring_maps = (score_map, score_map)
        max_score = sum(score_map)
    return aligner.score(source_key, new_key, 0,
                         scoring_maps=scoring_maps) >= max_score - allowed_mutations
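A short usage sketch for compare_sequence_dictionaries follows. It assumes the
four-element task layout unpacked above and that a "sequence dictionary" is
keyed by its root sequence (get_root_item is not part of this excerpt, so that
shape is an assumption); the sequences themselves are placeholders.

# Allow up to 2 mismatches over the full length; no scoring ranges.
task = (None, 0, 2, None)
source = {'ACGTACGTACGT': []}
new = {'ACGAACGTACGT': []}
if compare_sequence_dictionaries(source, new, task):
    print('merge these two clusters')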
def _get_sentences_and_offsets(txt_handle, ss_handle):
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))
        t_char = None
        started_at = txt_handle.tell()
        started_at_read = txt_handle_reads
        while True:
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                assert False, ('could not align all sentences for: '
                               '"{}" and "{}" stopped at the sentence: "{}" '
                               'aligner in state: {}').format(
                                   txt_handle.name, ss_handle.name,
                                   s_text, aligner.__repr__())
            try:
                if aligner.align(t_char):
                    source_text = _str(aligner)
                    # We are aligned!
                    s_starts_and_sentences.append((
                        #txt_handle.tell() - len(source_text),
                        #started_at,
                        started_at_read,
                        Sentence(source_text, [])))
                    #last_end += aligner.char_cnt
                    break
            except MisalignedError:
                started_at = txt_handle.tell()
                started_at_read = txt_handle_reads
    #s_starts_and_sentences.sort()
    return s_starts_and_sentences
def train_models(fnames, emiter, hmmiter, model_name):
    if not fnames:
        sys.exit("No file provided")
    print("Reading AMR files")
    pairs = []
    for fname in fnames:
        f = open(fname, "r")
        pairs += Aligner.readAMR(f)
        f.close()
    sentences = [Aligner.tokenize(pair[0]) for pair in pairs]
    graphs = [AMRGraph(pair[1], False) for pair in pairs]
    emprobs = EM.train(sentences, graphs, model_name + ".em", emiter)
    #emprobs = EM.load_model(model_name + ".em")
    print("Initializing rule-based alignments")
    n = len(sentences)
    initalgs = [{}] * n
    for i in range(n):
        initalgs[i] = Aligner.initalign(graphs[i].ref, sentences[i])
        if (i + 1) % 1000 == 0:
            print(str(i + 1) + "/" + str(n))
    hmmprobs = HMM.train(sentences, graphs, emprobs, model_name + ".hmm",
                         hmmiter, initalgs)
    print("Done")
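A hedged example call for train_models; the AMR file names, iteration counts,
and model name below are placeholders, and the EM/HMM modules are assumed to be
the same ones used by the surrounding module.

train_models(["training_1.amr", "training_2.amr"],
             emiter=10, hmmiter=5, model_name="amr_align")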
def main():
    # Get command line arguments
    if len(sys.argv) < 3:
        printInfo()
        exit(1)

    ref = ""
    refLen = 0
    refFile = sys.argv[1]
    READFILE = sys.argv[2].strip().split(',')
    with open(refFile) as fi:
        ref = fi.read().strip()
        refLen = len(ref)

    INTERVAL = [refLen]
    MINREADS = [10]
    TRIGGERPOINT = [10]
    CONFIDENCE = [50]

    # Convert args into a list of args
    if len(sys.argv) > 3:
        INTERVAL = [int(i) for i in sys.argv[3].strip().split(',')]
    if len(sys.argv) > 4:
        MINREADS = [int(i) for i in sys.argv[4].strip().split(',')]
    if len(sys.argv) > 5:
        TRIGGERPOINT = [int(i) for i in sys.argv[5].strip().split(',')]
    if len(sys.argv) > 6:
        CONFIDENCE = [int(i) for i in sys.argv[6].strip().split(',')]

    print("time \t Read File \t num changes \t interval \t minReads \t triggerPoint \t confidence")

    # Iterate through all combinations of parameters
    for readFile in READFILE:
        for interval in INTERVAL:
            for minReads in MINREADS:
                for triggerPoint in TRIGGERPOINT:
                    for confidence in CONFIDENCE:
                        start_time = timeit.default_timer()  # Times the block with alignment
                        # Init the alignment tracker and aligner
                        a = Aligner(ref)
                        rt = hashRangeTracker()
                        rt.setRefLen(refLen)
                        rt.setInterval(interval)      # Split genome into this many blocks
                        rt.setMinReads(minReads)      # Minimum times a read should overlap a position
                        rt.setTrigger(triggerPoint)   # How many times to hit a region before reporting
                        rt.setConfidence(confidence)
                        allChanges = []
                        with open(readFile) as readsFi:
                            for read in readsFi:
                                read = read.strip()
                                if read[0] == '#':  # Comment line
                                    continue
                                elapsed_time = timeit.default_timer() - start_time
                                aligned = a.align(read)
                                start_time = timeit.default_timer()
                                changes = rt.addAlignment(read, aligned[1], aligned[0])
                                if len(changes) > 0:
                                    # We have some changes to make
                                    allChanges += changes
                                    for c in changes:
                                        makeChange(a, c)
                                        # ref = ref[:c[0]] + c[1] + ref[c[0] + 1:]  # to update ref w/out the index
                                        #a = Aligner(ref)
                        # Get the remaining updates
                        changes = rt.flush()
                        if len(changes) > 0:
                            allChanges += changes
                            for c in changes:
                                makeChange(a, c)
                                # ref = ref[:c[0]] + c[1] + ref[c[0] + 1:]  # to update ref w/out the index
                        elapsed_time += timeit.default_timer() - start_time
                        ref = a.getRef()
                        print(str(elapsed_time) + " \t " + readFile + " \t " +
                              str(len(allChanges)) + " \t " + str(interval) + " \t " +
                              str(minReads) + " \t " + str(triggerPoint) + " \t " +
                              str(confidence))
    print(ref)
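makeChange is not defined in this excerpt. The sketch below shows what it is
assumed to do, based only on the commented-out fallback above: splice a reported
change c = (position, base) into the aligner's reference. The setRef name is a
placeholder; the real helper presumably updates the aligner's index in place
rather than rebuilding it.

def makeChange(aligner, c):
    ref = aligner.getRef()
    # Replace the base at position c[0] with the reported base c[1].
    aligner.setRef(ref[:c[0]] + c[1] + ref[c[0] + 1:])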
class VariantSeqLib(SeqLib):
    """
    Abstract :py:class:`SeqLib` class for Enrich libraries containing variants.
    Implements core functionality for assessing variants, either coding or
    noncoding. Subclasses must evaluate the variant DNA sequences that are
    being counted.
    """
    def __init__(self, config, parent=True):
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None
        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()
        except KeyError as key:
            raise EnrichError(
                "Missing required config value '{key}'".format(key=key),
                self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None

    def is_coding(self):
        return self.wt_protein is not None

    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*. The *sequence* is translated if
        *coding* is ``True``. The *sequence* may only contain ``ACGT``, but
        may contain whitespace (which will be removed). If *coding*,
        *sequence* must be in-frame.
        """
        sequence = "".join(sequence.split())  # remove whitespace
        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError("WT DNA sequence contains unexpected "
                              "characters", self.name)
        if len(sequence) % 3 != 0 and coding:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            self.wt_protein = ""
            for i in xrange(0, len(self.wt_dna), 3):
                self.wt_protein += codon_table[self.wt_dna[i:i + 3]]
        else:
            self.wt_protein = None

    def align_variant(self, variant_dna):
        """
        Use the local :py:class:`~seqlib.aligner.Aligner` instance to align
        the *variant_dna* to the wild type sequence. Returns a list of HGVS
        variant strings.

        Aligned variants are stored in a local dictionary to avoid
        recomputing alignments. This dictionary should be cleared after all
        variants are counted, to save memory.

        .. warning:: Using the :py:class:`~seqlib.aligner.Aligner`
           dramatically increases runtime.
        """
        if variant_dna in self.aligner_cache:
            return self.aligner_cache[variant_dna]

        mutations = list()
        traceback = self.aligner.align(self.wt_dna, variant_dna)
        for x, y, cat, length in traceback:
            if cat == "match":
                continue
            elif cat == "mismatch":
                mut = "{pre}>{post}".format(pre=self.wt_dna[x],
                                            post=variant_dna[y])
            elif cat == "insertion":
                if y > length:
                    dup = variant_dna[y:y + length]
                    if dup == variant_dna[y - length:y]:
                        mut = "dup{seq}".format(seq=dup)
                    else:
                        mut = "_{pos}ins{seq}".format(pos=x + 2, seq=dup)
                else:
                    mut = "_{pos}ins{seq}".format(pos=x + 2,
                                                  seq=variant_dna[y:y + length])
            elif cat == "deletion":
                mut = "_{pos}del".format(pos=x + length)
            mutations.append((x, mut))

        self.aligner_cache[variant_dna] = mutations
        return mutations

    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.
        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if
        there are an excess of mismatches (indicating a possible indel),
        local alignment is performed using :py:meth:`align_variant` if this
        option has been selected in the configuration.

        Each variant is stored as a tab-delimited string of mutations in HGVS
        format. Returns a list of HGVS variant strings. Returns an empty list
        if the variant is wild type. Returns None if the variant was
        discarded due to excess mismatches.
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError("Variant DNA sequence contains unexpected "
                              "characters", self.name)
        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            for i in xrange(len(variant_dna)):
                if variant_dna[i] != self.wt_dna[i]:
                    mutations.append((i, "{pre}>{post}".format(
                        pre=self.wt_dna[i], post=variant_dna[i])))
                    if len(mutations) > self.filters['max mutations']:
                        if self.aligner is not None:
                            mutations = self.align_variant(variant_dna)
                            if len(mutations) > self.filters['max mutations']:
                                # too many mutations post-alignment
                                return None
                            else:
                                # stop looping over this variant
                                break
                        else:
                            # too many mutations and not using aligner
                            return None

        mutation_strings = list()
        if self.is_coding():
            variant_protein = ""
            for i in xrange(0, len(variant_dna), 3):
                try:
                    variant_protein += codon_table[variant_dna[i:i + 3]]
                except KeyError:  # garbage codon due to indel
                    variant_protein += '?'

            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) / 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos)
                elif variant_protein[pos / 3] == self.wt_protein[pos / 3]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[pos / 3]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[pos / 3]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if len(mutation_strings) > 0:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string

    def count_mutations(self, include_indels=False):
        """
        Count the individual mutations in all variants. If *include_indels*
        is ``False``, all mutations in a variant that contains an
        insertion/deletion/duplication will not be counted. For coding
        sequences, amino acid substitutions are counted independently of the
        corresponding nucleotide change.
        """
        # restore the counts if they were saved to disk
        if self.df_dict['variants'] is None:
            self.load_counts(keys=['variants'])

        # create new dictionaries
        self.df_dict['mutations_nt'] = dict()
        if self.is_coding():
            self.df_dict['mutations_aa'] = dict()

        if not include_indels:
            mask = self.df_dict['variants'].index.map(has_indel)
            variant_data = self.df_dict['variants'][np.invert(mask)]
            del mask
        else:
            variant_data = self.df_dict['variants']

        if self.is_coding():
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                # get just the nucleotide changes
                for m in mutations:
                    m = m.split(" (")[0]
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count
                # get the amino acid changes
                aa_changes = re.findall("p\.[A-Z][a-z][a-z]\d+[A-Z][a-z][a-z]",
                                        variant)
                for a in aa_changes:
                    try:
                        self.df_dict['mutations_aa'][a] += count
                    except KeyError:
                        self.df_dict['mutations_aa'][a] = count
        else:
            for variant, count in variant_data.iterrows():
                count = count['count']  # get the element from the Series
                mutations = variant.split(", ")
                for m in mutations:
                    try:
                        self.df_dict['mutations_nt'][m] += count
                    except KeyError:
                        self.df_dict['mutations_nt'][m] = count

        self.df_dict['mutations_nt'] = \
            pd.DataFrame.from_dict(self.df_dict['mutations_nt'],
                                   orient="index", dtype="int32")
        if self.is_coding():
            self.df_dict['mutations_aa'] = \
                pd.DataFrame.from_dict(self.df_dict['mutations_aa'],
                                       orient="index", dtype="int32")
        return int(filename)
    except ValueError as e:
        return 0

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

parser = argparse.ArgumentParser(
    description="Stack several image files to create digital long exposure photographs")
parser.add_argument("--align", action="store_true",
                    help="run only the aligner, do not compress")
parser.add_argument("--transform", action="store_true",
                    help="run only the aligner and transform, do not compress")
parser.add_argument("--stitch", action="store_true",
                    help="stitch images for panoramic formats")
args = parser.parse_args()

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

aligner = Aligner()
stitcher = Stitcher()
stacker = Stacker(aligner)

input_images_aligner = []
input_images_stitcher = []
input_images_stacker = []

# transform to absolute paths
BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# init aligner
if args.align or args.transform:
# -*- coding: utf-8 -*-
from text import *
from lexicon import *
from aligner import Aligner

text = Text()
lexicon = Lexicon()
text.parse("../data/77b.txt")
lexicon.parse("../data/arapaho_lexicon.json")
aligner = Aligner(text, lexicon)
aligner.align("../data/new_test_text_file.txt", "../data/test_log_file.txt")
import cv2
import numpy
from aligner import Aligner
from model import autoencoder_A
from model import autoencoder_B
from model import encoder, decoder_A, decoder_B

encoder.load_weights("models/encoder.h5")
decoder_A.load_weights("models/decoder_A.h5")
decoder_B.load_weights("models/decoder_B.h5")

autoencoder = autoencoder_B

# landmark file can be found in http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# unzip it in the same folder as the main scripts
aligner = Aligner("shape_predictor_68_face_landmarks.dat",
                  "mmod_human_face_detector.dat")

def convert_one_image(image):
    assert image.shape == (256, 256, 3)
    crop = slice(48, 208)
    face = image[crop, crop]
    face = cv2.resize(face, (64, 64))
    face = numpy.expand_dims(face, 0)
    new_face = autoencoder.predict(face / 255.0)[0]
    new_face = numpy.clip(new_face * 255, 0, 255).astype(image.dtype)
    new_face = cv2.resize(new_face, (160, 160))
    result = aligner.align(image.copy(), new_face)
    if result is None:
        return superpose(image, new_face, crop)
    else:
    gpu = params["gpu"]
    queue_name = params["queue_name"]
else:
    max_disp = options.max_disp
    model_name = options.model_name
    net_crop = options.crop
    mip_range = (options.net_mip_low, options.net_mip_high)
    patch_size = options.patch_size
    xy_offset = (options.x_offset, options.y_offset)
    xy_size = (options.x_size, options.y_size)
    source_img = options.source_img
    dest_img = options.dest_img
    stack_start = options.stack_start
    stack_end = options.stack_end
    move_anchor = options.move_anchor
    gpu = options.gpu
    queue_name = options.queue_name

model_path = 'model_repository/' + model_name + '.pt'
high_mip_chunk = (patch_size, patch_size)
a = Aligner(model_path, max_disp, net_crop, mip_range, high_mip_chunk,
            source_img, dest_img, queue_name=queue_name, gpu=gpu)
bbox = BoundingBox(xy_offset[0], xy_offset[0] + xy_size[0],
                   xy_offset[1], xy_offset[1] + xy_size[1],
                   mip=0, max_mip=9)
a.align_ng_stack(stack_start, stack_end, bbox, move_anchor=move_anchor)
    stack_start = params["stack_start"]
    stack_end = params["stack_end"]
    move_anchor = params["move_anchor"]
    gpu = params["gpu"]
    queue_name = params["queue_name"]
else:
    max_disp = options.max_disp
    model_name = options.model_name
    net_crop = options.crop
    mip_range = (options.net_mip_low, options.net_mip_high)
    patch_size = options.patch_size
    xy_offset = (options.x_offset, options.y_offset)
    xy_size = (options.x_size, options.y_size)
    source_img = options.source_img
    dest_img = options.dest_img
    stack_start = options.stack_start
    stack_end = options.stack_end
    move_anchor = options.move_anchor
    gpu = options.gpu
    queue_name = options.queue_name

model_path = 'model_repository/' + model_name + '.pt'
high_mip_chunk = (patch_size, patch_size)
a = Aligner(model_path, max_disp, net_crop, mip_range, high_mip_chunk,
            source_img, dest_img, queue_name=queue_name, gpu=gpu)
a.listen_for_tasks()
import torch

from aligner import Aligner
from rollback_pyramid import RollbackPyramid

aligners = {}
pyramid = RollbackPyramid()

for m in [8, 9, 10]:
    aligners[m] = Aligner(fms=[2, 16, 16, 16, 16, 2], k=7).cuda()
    aligners[m].load_state_dict(
        torch.load('./checkpoints/barak_aligner_mip{}.pth.tar'.format(m)))
    pyramid.set_mip_processor(aligners[m], m)
Gwen Hoffmann
David Gong

Tests for correct alignment of reads.
Runs experiments to test change in alignment time as the length of reads,
length of genome, and number of edits increases.
"""

import sys
from aligner import Aligner
import random
import time

string = "ACTCTGCTTTAG"
a = Aligner(string)

# test exact match
pos, edits = a.align("TCTGC")
assert pos == 2
#print pos, edits

# test insertion
pos, edits = a.align("ACTTGC")
#print pos, edits
assert pos == 0

# test replacement
pos, edits = a.align("TACTT")
#print pos, edits
and reading these variants in a way which simulated sequencing

These variants have been written to a pickled dictionary object, and the reads
written out to a range of FastQ files, an R1 and R2 for each input gene

The condenser process aggregates all the R1 and R2 files across all these
input genes into a single pair of FastQs, ready for alignment
"""
file_condenser = Condenser(geneset, run_number)
file_condenser.run()

"""
From this point the process involves running a couple of template commands
for the rest of the process

- Aligning the reads to a reference
- Calling variants
- Annotation of those variants (minimal, HGVS and gene name only)
"""
aligner = Aligner(sam_directory, output_name, reference, run_number)
bam_filename = aligner.run()
bam_location = os.path.join("fastQs", bam_filename)

"""
This section is for the variant calling on the aligned files.

Due to some aspect of the read generation, Platypus is unable to generate
variant calls from the aligned input data. The SAMtools mpileup feature,
combined with the bcftools call function, is used for the two-step variant
calling.
"""
temp_bcf = os.path.join("VCFs", "temp.bcf")
vcf_location = os.path.join("VCFs", vcf_name)

variant_filled = variant_call_string % (reference, bam_location)
filled_filter = var_call_filter_string % temp_bcf
print variant_filled
#unpickle
with open("clusteredGerman.dat", "rb") as f:
    germanClusterList = pickle.load(f)
with open("clusteredEnglish.dat", "rb") as f:
    englishClusterList = pickle.load(f)
with open("intersection.dat", "rb") as f:
    intersection = pickle.load(f)
print("done unpickling")

#for cluster in germanClusterList:
#    print(cluster.predicates)

entitySetLength = len(intersection) + 1

#alignment of clusters begins here
a = Aligner()
print("begin aligning: ", datetime.datetime.now())
clusterTupleList = a.alignClustersNew(germanClusterList, englishClusterList,
                                      entitySetLength, intersection,
                                      "alignmentOutputWithcosineSim.txt")
print("done aligning: ", datetime.datetime.now())

#pickling of final list
with open("alignedListNoPmi.dat", "wb") as f:
    pickle.dump(clusterTupleList, f)

#for clusterTupel in clusterTupleList:
#    clusterTupel[0].printClusterPredicates()
#    clusterTupel[1].printClusterPredicates()
#    print(clusterTupel[2])
parser.add_argument("--do_test", default=True, type=util.str2bool)
parser.add_argument("--cache_dataset", default=False, type=util.str2bool)
parser.add_argument("--cache_path", default="", type=str)
############################################################################
parser.add_argument("--default_save_path", default="./", type=str)
parser.add_argument("--gradient_clip_val", default=0, type=float)
parser.add_argument("--num_nodes", default=1, type=int)
parser.add_argument("--gpus", default=None, type=int)
parser.add_argument("--overfit_batches", default=0.0, type=float)
parser.add_argument("--track_grad_norm", default=-1, type=int)
parser.add_argument("--check_val_every_n_epoch", default=1, type=int)
parser.add_argument("--fast_dev_run", default=False, type=util.str2bool)
parser.add_argument("--accumulate_grad_batches", default=1, type=int)
parser.add_argument("--max_epochs", default=1000, type=int)
parser.add_argument("--min_epochs", default=1, type=int)
parser.add_argument("--max_steps", default=None, type=int)
parser.add_argument("--min_steps", default=None, type=int)
parser.add_argument("--val_check_interval", default=1.0, type=float)
parser.add_argument("--log_every_n_steps", default=10, type=int)
parser.add_argument("--distributed_backend", default=None, type=str)
parser.add_argument("--precision", default=32, type=int)
parser.add_argument("--resume_from_checkpoint", default=None, type=str)
############################################################################
parser = Model.add_model_specific_args(parser)
parser = Tagger.add_model_specific_args(parser)
parser = Classifier.add_model_specific_args(parser)
parser = DependencyParser.add_model_specific_args(parser)
parser = Aligner.add_model_specific_args(parser)

hparams = parser.parse_args()
main(hparams)
def combine_records(forward_record, reverse_record, reference_sequences,
                    min_overlap=-1, max_overlap=-1, max_length_delta=1e30,
                    reference_scoring_ranges=None):
    '''
    Computes the alignments of both forward and reverse reads to the reference
    sequences. Synthesizes those alignments, using the better-quality read in
    the case of a conflict.

    Returns (index, sequence, quality) where `index` is the index of the
    reference sequence used, `sequence` is the combined DNA sequence, and
    `quality` is the quality of each base in the combined sequence.

    The optional parameters min_overlap and max_overlap correspond to the
    overlap constraints on the alignment between the forward and reverse reads.
    '''
    aligner = Aligner()
    forward_str = str(forward_record.seq)
    reverse_str = str(reverse_record.seq.reverse_complement())

    # Align forward to references
    reference_index, forward_offset, forward_score = aligner.best_alignment(
        forward_str, reference_sequences, unidirectional=True,
        min_overlap=len(forward_str),
        candidate_scoring_ranges=reference_scoring_ranges)

    # Align forward to reverse
    reverse_offset, _ = aligner.align(forward_str, reverse_str,
                                      unidirectional=True, reverse=True,
                                      min_overlap=min_overlap,
                                      max_overlap=max_overlap)

    reference = reference_sequences[reference_index]
    reference_scoring_range = (reference_scoring_ranges[reference_index]
                               if reference_scoring_ranges is not None else None)

    # Align reverse to reference
    reverse_offset_to_ref, reverse_score = aligner.align(
        reference, reverse_str, unidirectional=True, reverse=True,
        min_overlap=15, scoring_ranges=(reference_scoring_range, None))

    # Compare the pairwise scores of obeying the forward and obeying the
    # reverse alignments to reference, and adjust the alignment offsets
    # accordingly.
    if reverse_score > forward_score:
        forward_offset = reverse_offset_to_ref - reverse_offset
        reverse_offset = reverse_offset_to_ref
    else:
        reverse_offset += forward_offset

    combined_sequence = ""
    combined_quality = []
    alignment_set = [(reference, 0), (forward_str, forward_offset),
                     (reverse_str, reverse_offset)]
    # Uncomment to print the resulting alignments
    # print('\n'.join(aligner.format_multiple(*alignment_set)))

    # Discard the read if total length is too different from reference length
    if max_length_delta <= len(reference):
        if math.fabs(aligner.length(*alignment_set) - len(reference)) > max_length_delta:
            sc.counter(1, STAT_DELETIONS_KEY, STAT_EXCESS_LENGTH_KEY)
            return -1, None, None

    # Combine the reads to produce the overall sequence.
    # The aligner will enumerate the aligned characters or elements of each
    # iterable we give it. Zipping generators for both the sequence and the
    # quality allows us to enumerate them together.
    sequence_generator = aligner.enumerate_multiple(*alignment_set)
    quality_generator = aligner.enumerate_multiple(
        ([None for i in xrange(len(reference))], 0),
        (forward_record.letter_annotations[SEQUENCE_QUALITY_KEY], forward_offset),
        (reverse_record.letter_annotations[SEQUENCE_QUALITY_KEY], reverse_offset))

    for bases, qualities in izip(sequence_generator, quality_generator):
        _, forward_base, reverse_base = bases
        _, forward_quality, reverse_quality = qualities
        if forward_base is None and reverse_base is None:
            combined_sequence += UNSPECIFIED_BASE
            combined_quality.append(0)
        elif forward_base is None:
            combined_sequence += reverse_base
            combined_quality.append(reverse_quality)
        elif reverse_base is None:
            combined_sequence += forward_base
            combined_quality.append(forward_quality)
        else:
            base, quality = max([(forward_base, forward_quality),
                                 (reverse_base, reverse_quality)],
                                key=lambda x: x[1])
            combined_sequence += base
            combined_quality.append(quality)

    return reference_index, combined_sequence, combined_quality
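A minimal usage sketch for combine_records. It assumes the records come from
Bio.SeqIO FASTQ parsing (which provides .seq, .reverse_complement() and the
'phred_quality' letter annotations that SEQUENCE_QUALITY_KEY is presumed to
name); the file names, reference string, and thresholds are placeholders.

from Bio import SeqIO

forward_reads = SeqIO.parse("sample_R1.fastq", "fastq")
reverse_reads = SeqIO.parse("sample_R2.fastq", "fastq")
references = ["ACGTACGTACGTACGTACGT"]  # placeholder reference sequence(s)

for fwd, rev in zip(forward_reads, reverse_reads):
    ref_index, sequence, quality = combine_records(
        fwd, rev, references, min_overlap=15, max_length_delta=10)
    if ref_index < 0:
        continue  # pair discarded, e.g. combined length too far from the reference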
    for cluster in new_hierarchy:
        #print(cluster)
        yield cluster


### Similarity stats

def similarity_stats_processor(task, scoring_map, seq, (index, other_seq)):
    '''
    Convenience function for multiprocessing that calls the merge_function and
    returns part of the input.
    '''
    root = get_root_item(seq)[0]
    other_root = get_root_item(other_seq)[0]
    aligner = Aligner(different_score=0)
    return index, aligner.score(root, other_root, 0,
                                scoring_maps=(scoring_map, scoring_map))


def similarity_stats(all_sequences, task, out_dir, out_prefix,
                     num_processes=15, current_level=0):
    '''
    Writes the distribution of similarities of the top k (determined by `task`)
    sequences in all_sequences to all other sequences in the list to out_dir.
class Inflection:
    _PARAMS = {
        'C': 1.0,
        'window': 3,
        'cross_features': 2,
        'classifier': 'mono',
        'C_replace': 0.0,
        'C_insert': 0.0
    }

    def __init__(self, feature_type, **params):
        self.feature_type = feature_type
        self.a = Aligner(method='lcs')
        if feature_type == "sparse":
            self.get_features = self.get_sparse_features
        elif feature_type == "sparse2":
            self.get_features = self.get_sparse2_features
        elif feature_type == "onehot":
            self.get_features = self.get_positional_features
        self.old_window = None
        self.sample_weight = None
        self._reset()
        self.set_params(**params)

    def _reset(self):
        for k, v in self._PARAMS.items():
            setattr(self, k, v)

    def set_params(self, **params):
        for k, v in params.items():
            if k in self._PARAMS:
                setattr(self, k, v)

    def vectorize(self, lem, tag, wf=None):
        if self.old_window != self.window:
            self.old_window = self.window
        self.features = []
        self.labels = []
        for i, (l, t, w) in enumerate(zip(lem, tag, wf)):
            alignments = self.a.align(l, w)
            alignments = [[('<', '<')] + x + [('>', '>')] for x in alignments]
            for j, a in enumerate(alignments):
                if j > 0:
                    break  # in case there are multiple alignments take only the first
                li, wi = 0, 0
                for k, (lc, wc) in enumerate(a):
                    self.features.append(
                        self.get_features('<' + l + '>', '<' + w[:wi], t, li,
                                          window=(self.window, self.window)))
                    if lc == '':
                        action = 'insert:' + wc
                        wi += 1
                    elif lc == wc:
                        action = 'copy:'
                        li += 1
                        wi += 1
                    elif wc == '':
                        action = 'delete:'
                        li += 1
                    else:
                        action = 'replace:' + wc
                        li += 1
                        wi += 1
                    self.labels.append(action)
        if self.feature_type.startswith('sparse'):
            self.vec = TfidfVectorizer(sublinear_tf=True, analyzer=lambda x: x)
            self.x = self.vec.fit_transform(self.features)
        else:
            self.x = np.array(self.features)

    def fit(self, wf, lem, tag):
        print("vectorize....", file=sys.stderr)
        self.vectorize(lem, tag, wf)
        print(self.x.shape, file=sys.stderr)
        print("fit....", file=sys.stderr)
        if self.classifier == 'twostep':
            action = [s.split(':')[0] for s in self.labels]
            self.clf = LinearSVC(C=self.C, class_weight='balanced',
                                 max_iter=1000)
            self.clf.fit(self.x, action, sample_weight=self.sample_weight)

            replace_i = [i for i in range(len(self.labels))
                         if self.labels[i].startswith('replace')]
            sw = None
            if len(replace_i):
                x = self.x[replace_i, :]
                y = np.array(self.labels)[replace_i]
                if self.C_replace == 0.0:
                    self.C_replace = self.C
                if len(set(y)) == 1:
                    self.clf_replace = DummyClassifier()
                else:
                    self.clf_replace = LinearSVC(C=self.C_replace,
                                                 class_weight='balanced',
                                                 max_iter=50000)
                self.clf_replace.fit(x, y)
            else:
                self.clf_replace = DummyClassifier()

            insert_i = [i for i in range(len(self.labels))
                        if self.labels[i].startswith('insert')]
            if len(insert_i):
                x = self.x[insert_i, :]
                y = np.array(self.labels)[insert_i]
                if self.C_insert == 0.0:
                    self.C_insert = self.C
                if len(set(y)) == 1:
                    self.clf_insert = DummyClassifier()
                else:
                    self.clf_insert = LinearSVC(C=self.C_insert,
                                                class_weight='balanced',
                                                max_iter=50000)
                self.clf_insert.fit(x, y)
            else:
                self.clf_insert = DummyClassifier()
        else:
            self.clf = LinearSVC(C=self.C, class_weight='balanced',
                                 max_iter=50000)
            self.clf.fit(self.x, self.labels,
                         sample_weight=self.sample_weight)

    def predict(self, x):
        if self.classifier == 'twostep':
            action = str(self.clf.predict(x)[0])
            ch = ''
            if action == 'insert':
                if self.clf_insert is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_insert.predict(x)[0]).split(':', 1)[1]
            elif action == 'replace':
                if self.clf_replace is None:
                    action = 'copy'
                else:
                    ch = str(self.clf_replace.predict(x)[0]).split(':', 1)[1]
            return action, ch
        else:
            return str(self.clf.predict(x)[0]).split(':', 1)
    def decode(self, lemma, tags, max_len=30):
        w_prefix = ''
        li = 0
        while li < len(lemma):
            feat = self.get_features(lemma, w_prefix, tags, li)
            if self.feature_type.startswith('sparse'):
                x = self.vec.transform([feat])
            else:
                x = np.array([feat])
            act, arg = self.predict(x)
            if act == 'copy':
                w_prefix += lemma[li]
                li += 1
            elif act == 'replace':
                w_prefix += arg
                li += 1
            elif act == 'insert':
                w_prefix += arg
            elif act == 'delete':
                li += 1
            if len(w_prefix) > max_len or w_prefix and w_prefix[-1] == '>':
                break
        return w_prefix

    def get_sparse_features(self, lemma, word_prefix, tags, idx,
                            window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = ["tag:{}".format(t) for t in tags]
        if cross >= 2:
            tag_feat += ["tag2:{}-{}".format(t, t)
                         for t in itertools.product(tags, tags)]
        ch_feat = ["ch:{}".format(lemma[idx])]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append('lprefix:{}'.format(lemma[idx - i:idx]))
            if i <= len(word_prefix):
                wpfx_feat.append('wprefix:{}'.format(word_prefix[-i:]))
        for i in range(idx + 1, idx + window[1]):
            if i <= len(lemma):
                sfx_feat.append('lsuffix:{}'.format(lemma[idx:i]))
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        if cross > 3:
            cross = ["&".join((x, y))
                     for x, y in itertools.product(pfx_feat, sfx_feat)]
            cross = ["&".join((x, y))
                     for x, y in itertools.product(wpfx_feat, cross)]
            cross = ["&".join((x, y))
                     for x, y in itertools.product(ch_feat, cross)]
            str_feat += cross
        else:
            cross = ["&".join((x, y))
                     for x, y in itertools.product(ch_feat, str_feat)]
            str_feat += cross
        return str_feat + tag_feat + [
            "&".join((x, y))
            for x, y in itertools.product(tag_feat, str_feat)
        ]

    def get_sparse2_features(self, lemma, word_prefix, tags, idx,
                             window=(10, 10)):
        cross = self.cross_features
        pfx_feat, sfx_feat, wpfx_feat = [], [], []
        tag_feat = [{"t:{}".format(t)} for t in tags]
        ch_feat = [{"l0:{}".format(lemma[idx])}]
        for i in range(1, window[0] + 1):
            if i <= idx:
                pfx_feat.append({'l-{}:{}'.format(i, lemma[idx - i])})
            if i <= len(word_prefix):
                wpfx_feat.append({'w-{}:{}'.format(i, word_prefix[-i])})
        for i in range(1, window[1] + 1):
            if (idx + i) < len(lemma):
                sfx_feat.append({'l+{}:{}'.format(i, lemma[idx + i])})
        str_feat = ch_feat + pfx_feat + sfx_feat + wpfx_feat
        feat = str_feat + tag_feat
        feat_cross = feat
        for i in range(cross):
            feat_cross = [x | y for x, y in itertools.product(feat, feat_cross)]
        return ['&'.join(sorted(f)) for f in feat_cross]

    def get_positional_features(self, lemma, word_prefix, tags, idx,
                                window=(3, 3)):
        chars = [lemma[idx]]
        tag_enc = self.data.te
        ch_enc = self.data.ce
        for i in range(idx - (window[0] + 1), idx - 1):
            if i >= 0:
                chars.append(lemma[i])
                chars.append(word_prefix[i])
            else:
                chars.append(ch_enc.pad)
        for i in range(idx + 1, idx + window[1] + 1):
            if i < len(lemma):
                chars.append(lemma[i])
            else:
                chars.append(ch_enc.pad)
        feat = np.array(ch_enc.encode(chars, onehot=True)).flatten()
        feat = np.concatenate((feat, tag_enc.transform([tags])[0]))
        return feat

    def evaluate(self, wf, lemmas, tags):
        acc = 0
        med = 0
        for i, word in enumerate(wf):
            tag = tags[i]
            lem = lemmas[i]
            pred = self.decode(lem, tag)
            # print(word, pred, file=sys.stderr)
            acc += int(pred == word)
            med += editdistance.eval(pred, word)
        med = med / len(wf)
        acc = acc / len(wf)
        print(acc, med, file=sys.stderr)
        return (acc, med)
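A minimal usage sketch for the Inflection class, assuming training data arrives
as parallel lists of word forms, lemmas, and tag sequences (as the fit/evaluate
signatures above suggest); the toy data below is a placeholder, not from any
real dataset.

model = Inflection("sparse", window=3, classifier="mono")
word_forms = ["walked", "talked", "walking"]
lemmas = ["walk", "talk", "walk"]
tags = [("V", "PST"), ("V", "PST"), ("V", "PRS")]
model.fit(word_forms, lemmas, tags)
print(model.decode("jump", ("V", "PST")))  # predicted surface form for an unseen lemma
model.evaluate(word_forms, lemmas, tags)   # accuracy and mean edit distance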
from __future__ import print_function

from aligner import Aligner

a = Aligner()
x = "TCGAACTGAAAA"
y = "AACTGA"
trace = a.align(x, y)
print(x)
print(y)
for t in trace:
    print(t)
def cluster_sequences(all_sequences, task, num_processes=15, current_level=0):
    '''
    Performs the given clustering task. The task should contain the number of
    mutations to allow for sequences to be clustered, the hierarchical level
    at which the sequences are clustered, and optionally a list of ranges to
    use for scoring.

    The algorithm works as follows: let n be the length of all_sequences, k be
    the number of bases to score, and t be the number of mutations allowed.
    For every k-choose-t combination of bases in each sequence, the sequence
    string is hashed into a dictionary where that set of bases is replaced by
    a neutral character. If a hashing collision occurs, then that sequence is
    marked as belonging in a cluster with the originally hashed item. Once all
    of the sequences have been hashed, the clusters are merged together (using
    the first-read sequence as the consensus) and yielded. Altogether, this
    algorithm requires O(nk^t) running time as well as O(nk^t) space
    complexity.
    '''
    level = task[1]
    if level == current_level and len(all_sequences) == 1:
        yield all_sequences[0]
        return

    if level == current_level:
        # Get a scoring map to know which bases to include in the hashes
        _, _, allowed_mutations, scoring_ranges = task
        if scoring_ranges is not None and len(scoring_ranges) > 0:
            aligner = Aligner(different_score=0)
            score_map = aligner.scoring_map(scoring_ranges[-1][1], scoring_ranges)
        else:
            score_map = None

        # Build hashes corresponding to the sequences with the specified
        # allowed_mutations # of bases excluded
        if level == 0:
            print("Hashing and clustering...")
        hashes = {}
        clusters = {}
        for i, seq in enumerate(all_sequences):
            root = get_root_item(seq)[0]
            for key_1, key_2 in sequence_hash_excluding_bases(
                    root, score_map, allowed_mutations):
                if key_1 not in hashes:
                    hashes[key_1] = {}
                relevant_dict = hashes[key_1]
                if key_2 in relevant_dict:
                    # Found an overlap - create a cluster
                    marker_index = relevant_dict[key_2]
                    if marker_index not in clusters:
                        clusters[marker_index] = set()
                    clusters[marker_index].add(i)
                else:
                    relevant_dict[key_2] = i

        # Merge the found clusters and yield them in decreasing order of frequency
        if level == 0:
            print("Merging and returning clusters...")
        visited_indexes = set()
        for i, seq in enumerate(all_sequences):
            if i in visited_indexes:
                continue
            merged_seq = seq
            root = get_root_item(merged_seq)[0]
            visited_indexes.add(i)
            if i in clusters:
                for other_index in sorted(list(clusters[i])):
                    other_seq = all_sequences[other_index]
                    other_root = get_root_item(other_seq)[0]
                    merged_seq[root] = merge_sequence_info(
                        merged_seq[root], other_seq[other_root])
                    visited_indexes.add(other_index)
            yield merged_seq
            if len(visited_indexes) == len(all_sequences):
                break
    else:
        num_seqs = len(all_sequences)
        for i in xrange(num_seqs):
            if current_level == 0 and i % 1000 == 0:
                print("Clustering sequence {} of {}...".format(i, num_seqs))
            seq = all_sequences[i]
            root = get_root_item(seq)[0]
            merged_seq = {root: []}
            for result in cluster_sequences(seq[root], task, num_processes,
                                            current_level + 1):
                merged_seq[root].append(result)
            yield merged_seq
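sequence_hash_excluding_bases is not part of this excerpt. The sketch below
illustrates the masking idea the docstring describes, reduced to a single hash
key per combination (the real helper yields two-level (key_1, key_2) pairs and
respects the scoring map).

from itertools import combinations

def masked_keys(seq, scored_positions, t, neutral='.'):
    # For each combination of t scored positions, replace those bases with a
    # neutral character; two equal-length sequences that differ at no more
    # than t of the scored positions then share at least one masked key.
    for excluded in combinations(scored_positions, t):
        yield ''.join(neutral if i in excluded else base
                      for i, base in enumerate(seq))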