def count_kmers(self):
    """Count k-mers of every configured size and store them as primers.

    For each ``k`` from ``self.min_size`` to ``self.max_size`` (inclusive)
    the foreground and background genomes are counted with
    ``swga.kmers.count_kmers``. Every foreground sequence that is not
    present in the optional exclusion file is converted with
    ``primer_dict`` and the non-empty results are written with
    ``Primers.add(..., add_revcomp=True)``.

    Any primers already stored are removed first. Unless ``self.force`` is
    set the user is asked to confirm; the prompt is issued with
    ``abort=True``.

    Raises:
        AssertionError: if ``self.exclude_fp`` is set but does not point
            to an existing file.
    """
    # We need to clear all previous primers each time due to uniqueness
    # constraints
    if Primer.select().count() > 0:
        if not self.force:
            click.confirm(
                "Remove all previously-found primers and re-count?",
                abort=True)
        self.workspace.reset_primers()

    mkdirp(output_dir)

    for k in range(self.min_size, self.max_size + 1):
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

        if self.exclude_fp:
            # Raised explicitly (rather than via `assert`) so the check
            # still runs under `python -O`; the exception type is kept.
            if not os.path.isfile(self.exclude_fp):
                raise AssertionError(
                    "Exclusion file not found: {}".format(self.exclude_fp))
            ex = swga.kmers.count_kmers(
                k, self.exclude_fp, output_dir, self.exclude_threshold)
        else:
            ex = {}

        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        candidates = (
            primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                        self.max_dimer_bp)
            for seq in fg
            if seq not in ex)
        # primer_dict results equal to {} are dropped (presumably these are
        # sequences that failed the binding/dimer limits -- TODO confirm).
        kmers = [entry for entry in candidates if entry != {}]

        nkmers = len(kmers)

        # NOTE(review): chunk_size is only reported in the message below; it
        # is not passed to Primers.add. Presumably it mirrors the batch size
        # used there -- confirm.
        chunk_size = 199
        # Twice the count because the reverse complements are added as well.
        message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=nkmers * 2, k=k, cs=chunk_size))
        Primers.add(kmers, add_revcomp=True)

    message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
def count_kmers(self):
    """Count k-mers of every configured size and store them as primers.

    For each ``k`` from ``self.min_size`` to ``self.max_size`` (inclusive)
    the foreground and background genomes are counted with
    ``swga.kmers.count_kmers``. Every foreground sequence that is not
    present in the optional exclusion file is converted with
    ``primer_dict`` and the non-empty results are written with
    ``Primers.add(..., add_revcomp=True)``.

    Any primers already stored are removed first. Unless ``self.force`` is
    set the user is asked to confirm; the prompt is issued with
    ``abort=True``.

    Raises:
        AssertionError: if ``self.exclude_fp`` is set but does not point
            to an existing file.
    """
    # We need to clear all previous primers each time due to uniqueness
    # constraints
    if Primer.select().count() > 0:
        if not self.force:
            click.confirm(
                "Remove all previously-found primers and re-count?",
                abort=True)
        self.workspace.reset_primers()

    mkdirp(output_dir)

    for k in range(self.min_size, self.max_size + 1):
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

        if self.exclude_fp:
            # Raised explicitly (rather than via `assert`) so the check
            # still runs under `python -O`; the exception type is kept.
            if not os.path.isfile(self.exclude_fp):
                raise AssertionError(
                    "Exclusion file not found: {}".format(self.exclude_fp))
            ex = swga.kmers.count_kmers(
                k, self.exclude_fp, output_dir, self.exclude_threshold)
        else:
            ex = {}

        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        candidates = (
            primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                        self.max_dimer_bp)
            for seq in fg
            if seq not in ex)
        # primer_dict results equal to {} are dropped (presumably these are
        # sequences that failed the binding/dimer limits -- TODO confirm).
        kmers = [entry for entry in candidates if entry != {}]

        nkmers = len(kmers)

        # NOTE(review): chunk_size is only reported in the message below; it
        # is not passed to Primers.add. Presumably it mirrors the batch size
        # used there -- confirm.
        chunk_size = 199
        # Twice the count because the reverse complements are added as well.
        message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=nkmers * 2, k=k, cs=chunk_size))
        Primers.add(kmers, add_revcomp=True)

    message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
def count_specific_kmers(self, kmers):
    """Count the given sequences in both genomes and store them as primers.

    Sequences reported by ``Primers.select_by_seqs`` as already stored are
    announced and skipped. The remaining sequences are grouped by length so
    that ``swga.kmers.count_kmers`` runs once per length for the foreground
    and once for the background genome. Each sequence is then converted
    with ``primer_dict`` and the batch is written with
    ``Primers.add(..., add_revcomp=False)``.

    A sequence for which ``primer_dict`` raises ``KeyError`` is reported as
    missing from the foreground genome and left out.
    """
    try:
        # Announce and drop anything that is already stored. The whole
        # lookup is kept inside the try, as in the original, since the
        # failure may only surface once the result is actually consumed.
        already_stored = Primers.select_by_seqs(kmers)
        for found in already_stored:
            message("{} already exists in db, skipping...".format(found))
        kmers = [seq for seq in kmers if seq not in already_stored]
    except OperationalError:
        # An OperationalError here most likely means the database tables
        # have not been created yet.
        error(
            "It doesn't appear that the workspace has been initialized: "
            "run `swga init' first.")

    mkdirp(output_dir)

    # Bucket the sequences by length so each k is counted only once.
    seqs_by_size = defaultdict(list)
    for seq in kmers:
        seqs_by_size[len(seq)].append(seq)

    missing_fmt = "{} does not exist in foreground genome, skipping..."
    writing_fmt = "Writing {n} {k}-mers into db in blocks of {cs}..."

    for k, seqs in seqs_by_size.items():
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)

        primers = []
        for seq in seqs:
            try:
                # 0 / INF / INF are passed as the limits, presumably so no
                # requested sequence is rejected on binding or dimer grounds.
                entry = primer_dict(seq, fg, bg, 0, INF, INF)
            except KeyError:
                message(missing_fmt.format(seq))
                continue
            primers.append(entry)

        # NOTE: entries that primer_dict returns empty are not filtered out
        # here; everything collected above is handed to Primers.add.
        block_size = 199
        message(writing_fmt.format(n=len(primers), k=k, cs=block_size))
        Primers.add(primers, add_revcomp=False)
def count_specific_kmers(self, kmers):
    """Count the given sequences in both genomes and store them as primers.

    Sequences reported by ``Primers.select_by_seqs`` as already stored are
    announced and skipped. The remaining sequences are grouped by length so
    that ``swga.kmers.count_kmers`` runs once per length for the foreground
    and once for the background genome. Each sequence is then converted
    with ``primer_dict`` and the batch is written with
    ``Primers.add(..., add_revcomp=False)``.

    A sequence for which ``primer_dict`` raises ``KeyError`` is reported as
    missing from the foreground genome and left out.
    """
    try:
        # Announce and drop anything that is already stored. The whole
        # lookup is kept inside the try, as in the original, since the
        # failure may only surface once the result is actually consumed.
        already_stored = Primers.select_by_seqs(kmers)
        for found in already_stored:
            message("{} already exists in db, skipping...".format(found))
        kmers = [seq for seq in kmers if seq not in already_stored]
    except OperationalError:
        # An OperationalError here most likely means the database tables
        # have not been created yet.
        error(
            "It doesn't appear that the workspace has been initialized: "
            "run `swga init' first.")

    mkdirp(output_dir)

    # Bucket the sequences by length so each k is counted only once.
    seqs_by_size = defaultdict(list)
    for seq in kmers:
        seqs_by_size[len(seq)].append(seq)

    missing_fmt = "{} does not exist in foreground genome, skipping..."
    writing_fmt = "Writing {n} {k}-mers into db in blocks of {cs}..."

    for k, seqs in seqs_by_size.items():
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)

        primers = []
        for seq in seqs:
            try:
                # 0 / INF / INF are passed as the limits, presumably so no
                # requested sequence is rejected on binding or dimer grounds.
                entry = primer_dict(seq, fg, bg, 0, INF, INF)
            except KeyError:
                message(missing_fmt.format(seq))
                continue
            primers.append(entry)

        # NOTE: entries that primer_dict returns empty are not filtered out
        # here; everything collected above is handed to Primers.add.
        block_size = 199
        message(writing_fmt.format(n=len(primers), k=k, cs=block_size))
        Primers.add(primers, add_revcomp=False)
def test_add_primers(self):
    """Adding a primer with add_revcomp=True must also store its reverse complement."""
    forward_only = [{'seq': "AAAA"}]
    Primers.add(forward_only, add_revcomp=True)

    # "TTTT" is the reverse complement of "AAAA"; exactly one row is expected.
    revcomp_rows = Primer.select().where(Primer.seq == "TTTT")
    assert revcomp_rows.count() == 1