Exemplo n.º 1
0
    def count_kmers(self):
        """Count all k-mers in the configured size range and store them.

        Any primers already present are removed first; unless ``self.force``
        is set the user is asked to confirm, and ``click.confirm`` is called
        with ``abort=True`` so declining stops the command.

        For every ``k`` from ``self.min_size`` to ``self.max_size`` inclusive,
        k-mers are counted in the foreground and background genomes. Each
        foreground k-mer that is absent from the optional exclusion counts is
        passed to ``primer_dict``; the non-empty results are handed to
        ``Primers.add`` with ``add_revcomp=True``.

        Raises:
            IOError: if ``self.exclude_fp`` is set but is not an existing file.
        """
        # We need to clear all previous primers each time due to uniqueness
        # constraints
        if Primer.select().count() > 0:
            if not self.force:
                click.confirm(
                    "Remove all previously-found primers and re-count?",
                    abort=True)
            self.workspace.reset_primers()

        mkdirp(output_dir)

        for k in xrange(self.min_size, self.max_size + 1):
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

            if self.exclude_fp:
                # Explicit check rather than `assert`: assertions are removed
                # when Python runs with -O, which would skip the validation.
                if not os.path.isfile(self.exclude_fp):
                    raise IOError(
                        "Exclusion file does not exist: {fp}".format(
                            fp=self.exclude_fp))
                ex = swga.kmers.count_kmers(k, self.exclude_fp, output_dir,
                                            self.exclude_threshold)
            else:
                ex = {}

            # Keep kmers found in foreground, merging bg binding values, and
            # excluding those found in the excluded fasta
            candidates = (
                primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                            self.max_dimer_bp)
                for seq in fg if seq not in ex)

            # primer_dict signals a rejected k-mer with an empty dict
            kmers = [primer for primer in candidates if primer != {}]

            # chunk_size only appears in the log message here; presumably it
            # mirrors the block size Primers.add uses -- TODO confirm.
            chunk_size = 199
            # The reported count is doubled, presumably because every k-mer
            # is stored together with its reverse complement.
            message("Writing {n} {k}-mers into db in blocks of {cs}...".format(
                n=len(kmers) * 2, k=k, cs=chunk_size))
            Primers.add(kmers, add_revcomp=True)

        message("Counted kmers in range %d-%d" %
                (self.min_size, self.max_size))
Exemplo n.º 2
0
    def count_kmers(self):
        """Count all k-mers in the configured size range and store them.

        Any primers already present are removed first; unless ``self.force``
        is set the user is asked to confirm, and ``click.confirm`` is called
        with ``abort=True`` so declining stops the command.

        For every ``k`` from ``self.min_size`` to ``self.max_size`` inclusive,
        k-mers are counted in the foreground and background genomes. Each
        foreground k-mer that is absent from the optional exclusion counts is
        passed to ``primer_dict``; the non-empty results are handed to
        ``Primers.add`` with ``add_revcomp=True``.

        Raises:
            IOError: if ``self.exclude_fp`` is set but is not an existing file.
        """
        # We need to clear all previous primers each time due to uniqueness
        # constraints
        if Primer.select().count() > 0:
            if not self.force:
                click.confirm(
                    "Remove all previously-found primers and re-count?",
                    abort=True)
            self.workspace.reset_primers()

        mkdirp(output_dir)

        for k in xrange(self.min_size, self.max_size + 1):
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

            if self.exclude_fp:
                # Explicit check rather than `assert`: assertions are removed
                # when Python runs with -O, which would skip the validation.
                if not os.path.isfile(self.exclude_fp):
                    raise IOError(
                        "Exclusion file does not exist: {fp}".format(
                            fp=self.exclude_fp))
                ex = swga.kmers.count_kmers(
                    k, self.exclude_fp, output_dir, self.exclude_threshold)
            else:
                ex = {}

            # Keep kmers found in foreground, merging bg binding values, and
            # excluding those found in the excluded fasta
            candidates = (
                primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                            self.max_dimer_bp)
                for seq in fg if seq not in ex)

            # primer_dict signals a rejected k-mer with an empty dict
            kmers = [primer for primer in candidates if primer != {}]

            # chunk_size only appears in the log message here; presumably it
            # mirrors the block size Primers.add uses -- TODO confirm.
            chunk_size = 199
            # The reported count is doubled, presumably because every k-mer
            # is stored together with its reverse complement.
            message(
                "Writing {n} {k}-mers into db in blocks of {cs}..."
                .format(n=len(kmers) * 2, k=k, cs=chunk_size))
            Primers.add(kmers, add_revcomp=True)

        message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
Exemplo n.º 3
0
    def count_specific_kmers(self, kmers):
        """Count an explicit list of k-mers and add them to the primer table.

        Sequences that ``Primers.select_by_seqs`` reports as already stored
        are announced and dropped. The remaining sequences are grouped by
        length so each genome is counted once per length, converted with
        ``primer_dict`` and written with ``Primers.add`` (no reverse
        complements). Sequences that raise ``KeyError`` in ``primer_dict``
        are reported as missing from the foreground genome and skipped.

        Args:
            kmers: iterable of k-mer sequences to count.
        """
        try:
            # Warn about, then drop, every sequence that is already stored
            already_stored = Primers.select_by_seqs(kmers)
            for stored in already_stored:
                message("{} already exists in db, skipping...".format(stored))
            kmers = [seq for seq in kmers if seq not in already_stored]
        except OperationalError:
            # An OperationalError here most likely means the database tables
            # were never created.
            error(
                "It doesn't appear that the workspace has been initialized: "
                "run `swga init' first.")
        mkdirp(output_dir)

        # Bucket the sequences by length: one counting pass per distinct
        # length instead of one per sequence
        by_size = defaultdict(list)
        for seq in kmers:
            by_size[len(seq)].append(seq)

        for size, seqs in by_size.items():
            fg_counts = swga.kmers.count_kmers(
                size, self.fg_genome_fp, output_dir, 1)
            bg_counts = swga.kmers.count_kmers(
                size, self.bg_genome_fp, output_dir, 1)

            rows = []
            for seq in seqs:
                try:
                    row = primer_dict(seq, fg_counts, bg_counts, 0, INF, INF)
                except KeyError:
                    message(
                        "{} does not exist in foreground genome, skipping..."
                        .format(seq))
                else:
                    rows.append(row)

            # NOTE(review): empty primer dicts are NOT filtered out here;
            # presumably the 0/INF/INF thresholds mean primer_dict never
            # returns one -- TODO confirm.
            block_len = 199
            message(
                "Writing {n} {k}-mers into db in blocks of {cs}..."
                .format(n=len(rows), k=size, cs=block_len))
            Primers.add(rows, add_revcomp=False)
Exemplo n.º 4
0
    def count_specific_kmers(self, kmers):
        """Count an explicit list of k-mers and add them to the primer table.

        Sequences that ``Primers.select_by_seqs`` reports as already stored
        are announced and dropped. The remaining sequences are grouped by
        length so each genome is counted once per length, converted with
        ``primer_dict`` and written with ``Primers.add`` (no reverse
        complements). Sequences that raise ``KeyError`` in ``primer_dict``
        are reported as missing from the foreground genome and skipped.

        Args:
            kmers: iterable of k-mer sequences to count.
        """
        try:
            # Warn about, then drop, every sequence that is already stored
            already_stored = Primers.select_by_seqs(kmers)
            for stored in already_stored:
                message("{} already exists in db, skipping...".format(stored))
            kmers = [seq for seq in kmers if seq not in already_stored]
        except OperationalError:
            # An OperationalError here most likely means the database tables
            # were never created.
            error("It doesn't appear that the workspace has been initialized: "
                  "run `swga init' first.")
        mkdirp(output_dir)

        # Bucket the sequences by length: one counting pass per distinct
        # length instead of one per sequence
        by_size = defaultdict(list)
        for seq in kmers:
            by_size[len(seq)].append(seq)

        for size, seqs in by_size.items():
            fg_counts = swga.kmers.count_kmers(
                size, self.fg_genome_fp, output_dir, 1)
            bg_counts = swga.kmers.count_kmers(
                size, self.bg_genome_fp, output_dir, 1)

            rows = []
            for seq in seqs:
                try:
                    row = primer_dict(seq, fg_counts, bg_counts, 0, INF, INF)
                except KeyError:
                    message(
                        "{} does not exist in foreground genome, skipping...".
                        format(seq))
                else:
                    rows.append(row)

            # NOTE(review): empty primer dicts are NOT filtered out here;
            # presumably the 0/INF/INF thresholds mean primer_dict never
            # returns one -- TODO confirm.
            block_len = 199
            message("Writing {n} {k}-mers into db in blocks of {cs}...".format(
                n=len(rows), k=size, cs=block_len))
            Primers.add(rows, add_revcomp=False)
Exemplo n.º 5
0
 def test_add_primers(self):
     '''Adding with add_revcomp=True must also store the reverse complement.'''
     forward_only = [{'seq': "AAAA"}]
     Primers.add(forward_only, add_revcomp=True)
     # TTTT is the reverse complement of AAAA; exactly one such row must exist
     revcomp_rows = Primer.select().where(Primer.seq == "TTTT")
     assert revcomp_rows.count() == 1