def export_sparse_features(sigmers, sample, indir, outfile):
    """Write per-sample signature k-mer counts to ``outfile`` in sparse form.

    One line is written per entry of ``sample``: the label, a space, then a
    space-separated ``index:count`` pair for every signature k-mer whose
    count in that sample's Jellyfish database is greater than zero.
    ``index`` is the 1-based position of the k-mer in the iteration order of
    ``sigmers``.

    Args:
        sigmers: Iterable of k-mer strings; each is canonicalised before
            being looked up.
        sample: Sequence of ``(name, label)`` pairs. ``len(sample)`` is used
            for the progress message.
        indir: Prefix prepended to ``name + "_count.jf"`` to locate each
            sample's Jellyfish database (no path separator is inserted).
        outfile: Path of the text file to (over)write.
    """
    # The context manager closes the output even when a Jellyfish database
    # fails to open half-way through the sample list.
    with open(outfile, 'w') as outfh:
        for i, (s, l) in enumerate(sample, 1):
            if i % 50 == 0:
                echo("\t\t ... Completed %f" % (float(i) / float(len(sample))))

            qf = jellyfish.QueryMerFile(indir + s + "_count.jf")

            outfh.write("%s " % (l))
            for j, mer in enumerate(sigmers, 1):
                jmer = jellyfish.MerDNA(mer)
                jmer.canonicalize()

                # Query the database once and reuse the value for both the
                # test and the output.
                count = qf[jmer]
                if count > 0:
                    outfh.write("%d:%d " % (j, count))

            outfh.write("\n")
def pickle_profiles(file_lists, resource_path, kmer_size=31):
    """Build the MLST profile / allele k-mer lookup and pickle it.

    For every species the result holds:

    * ``"ST"``: allele profile (allele numbers joined by ``":"``) -> sequence
      type, read from the first file in the species' list whose name
      contains ``'.txt'`` (tab separated, first row is the header).
    * ``"GENE_ORDER"``: the gene columns of that header row (stays ``None``
      when the table has no data rows).
    * ``"GENES"``: gene name -> allele number -> set of canonical k-mer
      strings, read from every ``.tfa`` FASTA file in the list.

    The dictionary is written to ``<resource_path>/mlst_profiles.pkl``.

    Args:
        file_lists: Mapping of species name -> list of file names located in
            ``<resource_path>/<species>/``.
        resource_path: Directory holding the per-species sub-directories.
        kmer_size: k-mer length; also set as Jellyfish's global mer length.

    Raises:
        IndexError: If a species has no file whose name contains ``'.txt'``.
    """
    jellyfish.MerDNA_k(kmer_size)
    mlst_profiles_dict = OrderedDict()
    for species, file_list in file_lists.items():

        if species not in mlst_profiles_dict:
            mlst_profiles_dict[species] = {
                "ST": OrderedDict(),
                "GENES": OrderedDict(),
                "GENE_ORDER": None
            }
        species_entry = mlst_profiles_dict[species]

        # All files but one (the profile table) are counted as gene files.
        number_of_genes = len(file_list) - 1
        profile_table = [
            os.path.join(resource_path, species, f) for f in file_list
            if '.txt' in f
        ][0]
        # Close the profile table deterministically instead of leaking the
        # handle until garbage collection.
        with open(profile_table) as profile_fh:
            for i, l in enumerate(profile_fh):
                line = l.strip().split("\t")
                if i == 0:
                    gene_list = line[1:number_of_genes + 1]
                else:
                    profile = ":".join(line[1:number_of_genes + 1])
                    species_entry["ST"][profile] = line[0]
                    species_entry["GENE_ORDER"] = gene_list

        genes = species_entry["GENES"]
        for _file in [f for f in file_list if f[-4:] == '.tfa']:
            # Reset per file so that an empty FASTA neither raises NameError
            # nor reports the gene of the previous file.
            gene_name = None
            for seq_record in SeqIO.parse(
                    os.path.join(resource_path, species, _file), 'fasta'):
                normalised = seq_record.name.replace("-", "_")
                seq_num = normalised.split("_")[-1]
                gene_name = "_".join(
                    seq_record.name.replace("__", "_").replace(
                        "-", "_").split("_")[:-1])

                # A repeated allele number replaces the earlier k-mer set.
                kmers = set()
                genes.setdefault(gene_name, {})[seq_num] = kmers
                for j in range(0, len(seq_record.seq) - kmer_size + 1):
                    mer = jellyfish.MerDNA(str(seq_record.seq[j:j + kmer_size]))
                    mer.canonicalize()
                    kmers.add(str(mer))
            sys.stderr.write("\tparsing: {0} : {1}\n".format(
                species, gene_name if gene_name is not None else _file))

    pickle_path = os.path.join(resource_path, "mlst_profiles.pkl")
    with open(pickle_path, "wb") as pickle_fh:
        pickle.dump(mlst_profiles_dict, pickle_fh)
    return
示例#3
0
 def test_all_mers(self):
     # Each mer yielded by string_mers must equal the MerDNA built from the
     # window of self.str starting at the same offset, and the number of
     # mers yielded must be len(self.str) - self.k + 1.
     seen = 0
     all_match = True
     for offset, mer in enumerate(jellyfish.string_mers(self.str)):
         window = jellyfish.MerDNA(self.str[offset:offset + self.k])
         all_match = all_match and mer == window
         seen = offset + 1
     self.assertTrue(all_match)
     self.assertEqual(len(self.str) - self.k + 1, seen)
示例#4
0
 def test_canonical_mers(self):
     # Each mer yielded by string_canonicals must be either the window of
     # self.str at the same offset or its reverse complement, and must not
     # compare greater than either of them. The final assertion checks the
     # index of the last mer yielded.
     ok = True
     for idx, canon in enumerate(jellyfish.string_canonicals(self.str)):
         forward = jellyfish.MerDNA(self.str[idx:idx + self.k])
         reverse = forward.get_reverse_complement()
         if ok:
             ok = ((canon == forward or canon == reverse)
                   and (not (canon > forward)) and (not (canon > reverse)))
     self.assertTrue(ok)
     self.assertEqual(len(self.str) - self.k, idx)
示例#5
0
def compare_ard(so, kmer_size=31):
    """Record the sample's count for every k-mer of every resistance gene.

    Reads ``categories.txt``, ``AROtags.txt`` and ``ARmeta-genes.fa`` from a
    hard-coded resource directory and fills ``so.ard`` in place. For each
    FASTA record at least ``kmer_size`` long, ``so.ard[record_id]`` becomes
    a tuple ``(counts, species, category, aro_names)`` where ``counts`` holds
    ``so.qf[mer]`` for each canonical k-mer of the record in sequence order.
    Counts are appended if the id is already present in ``so.ard``.

    Progress is reported on stderr.

    Args:
        so: Object exposing a dict ``ard`` and a k-mer count lookup ``qf``.
        kmer_size: Length of the k-mers cut from each gene sequence.
    """
    from Bio import SeqIO
    import jellyfish
    # NOTE(review): the resource directory is hard-coded. The original also
    # assigned "/home/ksimmon/reference/ard/" first, but that value was
    # immediately overwritten and never used.
    _p = "/Users/ksimmon/Box Sync/ARUP/strainTypeMer_resources/ard/"
    sys.stderr.write("Retrieving antibiotic resistance genes\n")

    # Key is column 0 with its last dot-separated component removed; the
    # value is the whole tab-split row.
    descriptions = {}
    with open(_p + "categories.txt") as categories_fh:
        for row in categories_fh:
            v = row.strip().split("\t")
            name = ".".join(v[0].split(".")[:-1])
            descriptions[name] = v

    # Maps column 2 of each row to column 1.
    aro_tags = {}
    with open(_p + "AROtags.txt") as tags_fh:
        for row in tags_fh:
            v = row.strip().split("\t")
            aro_tags[v[2]] = v[1]

    fasta_path = _p + "ARmeta-genes.fa"
    # First pass only counts the records so progress can be shown as "x of y".
    num_of_sequences = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
    for count, s in enumerate(SeqIO.parse(fasta_path, "fasta"), 1):
        sys.stderr.write(
            "\rAnalyzed {0} of {1} antibiotic resistant genes".format(
                count, num_of_sequences))
        if count != num_of_sequences:
            sys.stderr.flush()
        else:
            sys.stderr.write("\n")

        seq_id = s.description.split(" ")[0]
        species = s.description[s.description.rfind("[") +
                                1:s.description.rfind("]")]
        aro_tag = [
            part.split(" ")[0] for part in s.description.split(". ")
            if "ARO:" in part and "ARO:1000001" not in part
        ]
        for j in range(0, len(s.seq) - kmer_size + 1):
            mer = jellyfish.MerDNA(str(s.seq[j:j + kmer_size]))
            mer.canonicalize()
            kmer_count = so.qf[mer]

            if seq_id in so.ard:
                so.ard[seq_id][0].append(kmer_count)
            else:
                so.ard[seq_id] = ([kmer_count], species,
                                  descriptions[seq_id][1],
                                  [aro_tags[tag] for tag in aro_tag])
    return
示例#6
0
def kmercount(k, fname):
    """Return doubled, pseudo-counted k-mer counts from a Jellyfish database.

    The result is a ``uint16`` vector with ``4**k`` slots. Every slot starts
    at 1 (a pseudo-count of 0.5 expressed in doubled units so that an
    integer vector can be used) and receives twice the database count of the
    k-mer produced at the same position by ``allkmers(k)``.

    A ``RuntimeError`` raised while opening ``fname`` propagates unchanged.
    """
    qf = jellyfish.QueryMerFile(fname)
    counts = np.ones(1 << (2 * k), dtype=np.uint16)
    for slot, letters in enumerate(allkmers(k)):
        counts[slot] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
示例#7
0
def get_kmer_freq_v(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf',
                    k=5):
    """Return the relative frequency of every k-mer as a 1 x 4**k array.

    Counts are looked up in the Jellyfish database ``jfdb`` for all strings
    of length ``k`` over A, C, G, T (in ``itertools.product`` order) and
    divided by their sum. The result is a ``float64`` row vector.

    A ``RuntimeError`` raised while opening ``jfdb`` propagates unchanged.
    """
    qf = jellyfish.QueryMerFile(jfdb)
    counts = [
        qf[jellyfish.MerDNA(''.join(letters))]
        for letters in itertools.product(('A', 'C', 'G', 'T'), repeat=k)
    ]
    row = np.array([counts], dtype=np.float64)
    return row / np.sum(row)
示例#8
0
def test07(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf', k=5):
    """Print the normalised k-mer frequency row vector of a Jellyfish database.

    Counts for all strings of length ``k`` over A, C, G, T (in
    ``itertools.product`` order) are read from ``jfdb``, divided by their
    sum and printed as a 1 x 4**k ``float64`` array.

    Raises:
        RuntimeError: Re-raised, after printing a notice, when the database
            cannot be opened.
    """
    try:
        qf = jellyfish.QueryMerFile(jfdb)
    except RuntimeError:
        # print() with a single argument behaves the same on Python 2 and 3,
        # unlike the print statement, which is a SyntaxError on Python 3.
        print('jellyfish runtime error')
        raise

    alph = ('A', 'C', 'G', 'T')
    freq_l = [
        qf[jellyfish.MerDNA(''.join(km))]
        for km in itertools.product(alph, repeat=k)
    ]
    a = np.array([freq_l], dtype=np.float64)
    a /= np.sum(a)
    print(a)
    return
示例#9
0
 def test_add(self):
     # Randomised check of the hash API over 1000 rounds. Each round draws a
     # random mer and value and expects add() to return a truthy result. In
     # rounds where step % 3 is 1, a second add() must return a falsy result;
     # where it is 2, update_add() must return a truthy one. In both cases
     # the stored value is expected to be the sum of the two draws. The loop
     # stops at the first failed expectation.
     mer = jellyfish.MerDNA()
     ok = True
     for step in range(1000):
         mer.randomize()
         expected = random.randrange(1000)
         ok = self.hash.add(mer, expected)
         if not ok:
             break
         phase = step % 3
         if phase != 0:
             extra = random.randrange(1000)
             expected += extra
             if phase == 1:
                 ok = not self.hash.add(mer, extra)
             else:
                 ok = self.hash.update_add(mer, extra)
             if not ok:
                 break
         ok = (expected == self.hash.get(mer)) and (expected
                                                    == self.hash[mer])
         if not ok:
             break
     self.assertTrue(ok)
示例#10
0
def ttest_kmer(positive_qfs, negative_qfs, positive_factor, negative_factor,
               kmer_candidates, outfile):
    """Run Welch's t-test on normalised counts of each candidate k-mer.

    For every line of ``kmer_candidates`` the canonical k-mer is looked up in
    each positive and each negative count database, each count is divided by
    the matching normalisation factor, and one tab-separated line is written
    to ``outfile``::

        kmer  positive_mean  negative_mean  t_statistic  p_value

    The t-test (unequal variances) is only run when both means are non-zero;
    otherwise the statistic and p-value are written as NaN.

    Args:
        positive_qfs: Sequence of count lookups for the positive group.
        negative_qfs: Sequence of count lookups for the negative group.
        positive_factor: Normalisation factor for each entry of
            ``positive_qfs`` (same order).
        negative_factor: Normalisation factor for each entry of
            ``negative_qfs`` (same order).
        kmer_candidates: Path of a text file with one k-mer per line.
        outfile: Path of the result file to (over)write.
    """
    nan = float('nan')
    # Context managers close both files even if a lookup or the test raises.
    with open(kmer_candidates, 'r') as kmer_fh, open(outfile, 'w') as outfh:
        for i, line in enumerate(kmer_fh, 1):
            mer = jellyfish.MerDNA(line.rstrip())
            mer.canonicalize()

            positive = [
                float(p_qfs[mer]) / float(positive_factor[x])
                for x, p_qfs in enumerate(positive_qfs)
            ]
            negative = [
                float(n_qfs[mer]) / float(negative_factor[j])
                for j, n_qfs in enumerate(negative_qfs)
            ]

            p_mean = numpy.mean(positive)
            n_mean = numpy.mean(negative)

            # Reset per k-mer: previously a skipped test either raised
            # NameError (first line) or silently reused the statistics of
            # the preceding k-mer.
            t_stat, p_val = nan, nan
            if p_mean != 0 and n_mean != 0:
                t_stat, p_val = stats.ttest_ind(positive,
                                                negative,
                                                equal_var=False)
            outfh.write(
                "%s\t%E\t%E\t%f\t%E\n" %
                (mer, Decimal(p_mean), Decimal(n_mean), t_stat,
                 Decimal(p_val)))

            # The original test `i % 1 == 1000` could never be true, so no
            # progress was ever reported.
            if i % 1000 == 0:
                echo("------ completed %d" % (i))
示例#11
0
def kmercount(k, pos, chr=21,
              fname_head='/data/yt/GRCh38.p2.ch21/GRCh38.p2'):
    """Return doubled, pseudo-counted k-mer counts as a column vector.

    The Jellyfish database is located at
    ``<fname_head>.ch<chr>.<pos>.fasta.<k>.jf``. The result is a ``uint16``
    array of shape ``(4**k, 1)``. Every row starts at 1 (a pseudo-count of
    0.5 expressed in doubled units so that an integer vector can be used)
    and receives twice the database count of the k-mer produced at the same
    position by ``allkmers(k)``.

    A ``RuntimeError`` raised while opening the database propagates unchanged.
    """
    fname = '{head}.ch{chr}.{pos}.fasta.{k}.jf'.format(
        head=fname_head, chr=chr, pos=pos, k=k)
    qf = jellyfish.QueryMerFile(fname)

    counts = np.ones((1 << (2 * k), 1), dtype=np.uint16)
    for row, letters in enumerate(allkmers(k)):
        counts[row, 0] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
示例#12
0
    def mlst_profiles(self, mlst_profiles):
        """
        Report the MLST sequence types compatible with this sample's k-mers.

        For every species, an allele of a gene matches when none of its
        k-mers has a count of zero in ``self.qf``. Every combination of one
        matching allele per gene (in ``GENE_ORDER``) yields one result line
        with the sequence type found in ``ST`` for that combination, or
        ``NONE`` when the combination is not listed.

        :param mlst_profiles: mapping of species to a dict with the keys
            ``"GENE_ORDER"``, ``"GENES"`` and ``"ST"``, or None
        :return: list of tab-separated result lines, or a single-item list
            holding a message when nothing was loaded or nothing matched
        """
        if mlst_profiles is None:
            return ["no profiles loaded"]

        def _all_kmers_present(kmers):
            # Stops at the first k-mer that is absent from the sample.
            for kmer in kmers:
                mer = jellyfish.MerDNA(kmer)
                mer.canonicalize()
                if self.qf[mer] == 0:
                    return False
            return True

        results = []
        for species, _d in mlst_profiles.items():
            alleles_by_gene = OrderedDict()
            for gene in _d["GENE_ORDER"]:
                alleles_by_gene[gene] = [
                    allele for allele, kmers in _d["GENES"][gene].items()
                    if _all_kmers_present(kmers)
                ]

            gene_order = ":".join(_d["GENE_ORDER"])
            for combination in itertools.product(*alleles_by_gene.values()):
                key = ":".join(combination)
                st = _d["ST"][key] if key in _d["ST"] else 'NONE'
                results.append("{0}\tST: {2}\tprofile: {1} [{3}]".format(
                    species, key, st, gene_order))

        if not results:
            return ["no matching profiles found"]
        return results
示例#13
0
文件: kmer.py 项目: iric-soft/nektar
 def get_count_jf(self, jf):
     """Return the value ``jf`` holds for the canonical form of ``self._seq``."""
     canonical_mer = jellyfish.MerDNA(self._seq)
     canonical_mer.canonicalize()
     count = jf[canonical_mer]
     return count
示例#14
0
 def query(self, seq):
     """Return the value ``self.jf`` holds for ``seq``.

     The mer is canonicalised before the lookup when ``self.canonical`` is
     truthy; otherwise it is looked up as given.
     """
     mer = jellyfish.MerDNA(seq)
     if not self.canonical:
         return self.jf[mer]
     mer.canonicalize()
     return self.jf[mer]
示例#15
0
    def compare_to_and_filter(self,
                              strain,
                              complexity_cutoff=12,
                              coverage_cutoff=3,
                              reference_set=None,
                              inverse=False,
                              filtering_cutoff=85,
                              verbose=False):
        """
        Compares the strains using a pairwise filter

        :param strain:
        :param complexity_cutoff:
        :param coverage_cutoff:
        :param reference_set:
        :param inverse:
        :param filtering_cutoff:
        :param verbose:
        :return:
        """
        # USE THE ARCHIVE SET IF RAPID_MODE=True

        if self.rapid_mode:
            strain_1_kmer_set = self.kmer_archive
            strain_2_kmer_set = strain.kmer_archive
        else:
            strain_1_kmer_set = self.kmer_set
            strain_2_kmer_set = strain.kmer_set

        # FILTER IN OR OUT THE REFERENCE SET
        if reference_set is None:
            strain_1 = strain_1_kmer_set
            strain_2 = strain_2_kmer_set
        else:
            if inverse:
                strain_1 = strain_1_kmer_set.difference(reference_set)
                strain_2 = strain_2_kmer_set.difference(reference_set)
            else:
                strain_1 = strain_1_kmer_set.intersection(reference_set)
                strain_2 = strain_2_kmer_set.intersection(reference_set)

        intersection = float(len(strain_1.intersection(strain_2)))
        denom = ((len(strain_1) - intersection) +
                 (len(strain_2) - intersection)) + intersection

        total = intersection / denom * 100.0
        smallest_count = float(len(strain_1))
        strain_1_smallest = True
        if len(strain_2) < smallest_count:
            smallest_count = len(strain_2)
            strain_1_smallest = False

        #catching a divide by zero error and returning 0
        try:
            rescue_numerator = float(len(strain_1.intersection(strain_2)))
            rescue = rescue_numerator / smallest_count * 100.0
        except ZeroDivisionError as e:
            print(
                "###############\nWARNING:\tSample {} or {} does not have sufficient coverage."
                "\n###############".format(self.name, strain.name))
            rescue = 0

        # return self.name, strain.name, total, rescue, denom, smallest_count
        if total < filtering_cutoff or self.do_not_filter or strain.do_not_filter:
            return self.name, strain.name, total, rescue, denom, smallest_count

        # get the difference kmers
        differences_1 = strain_1.difference(strain_2)
        differences_2 = strain_2.difference(strain_1)
        differences = strain_1.symmetric_difference(strain_2)
        # differences = differences_1.union(differences_2) # combined

        complexity_count = 0
        within_1_strain_1 = 0
        within_2_strain_1 = 0
        within_3_strain_1 = 0
        within_1_strain_2 = 0
        within_2_strain_2 = 0
        within_3_strain_2 = 0
        counter_not_filtered = 0
        counter_filtered = 0
        coverage_100 = 0
        kept_1 = 0
        kept_2 = 0
        nucleotide_skew = 0
        filtered_1 = 0
        filtered_2 = 0
        below_cutoff_1 = 0
        below_cutoff_2 = 0

        for i, kmer in enumerate(differences):
            is_filtered_kmer = False
            mer = jellyfish.MerDNA(kmer)
            mer.canonicalize()
            s1_count = int(self.qf_filtered[mer])
            s2_count = int(strain.qf_filtered[mer])

            # s1_out = "{0}:n={1} [cutoff={2}]".format(self.name, s1_count, self.kmer_cutoff)
            # s2_out = "{0}:n={1} [cutoff={2}]".format(strain.name, s2_count, strain.kmer_cutoff)

            # if s1_count > self.coverage * coverage_cutoff:
            #     coverage_100 += 1
            #     is_filtered_kmer = True
            # elif s2_count > strain.coverage * coverage_cutoff:
            #     coverage_100 += 1
            #     is_filtered_kmer = True

            if s2_count == 0:
                if s1_count - int(self.kmer_cutoff) == 1:
                    within_1_strain_1 += 1
                    #within_2_strain_1 += 1
                    #within_3_strain_1 += 1
                    is_filtered_kmer = True
                elif s1_count - int(self.kmer_cutoff) == 2:
                    # within_1 += 1
                    within_2_strain_1 += 1
                    #within_3_strain_1 += 1
                    is_filtered_kmer = True
                elif s1_count - int(self.kmer_cutoff) == 3:
                    # within_1 += 1
                    # within_2 += 1
                    within_3_strain_1 += 1
                    is_filtered_kmer = True
            else:
                if s2_count - int(strain.kmer_cutoff) == 1:
                    within_1_strain_2 += 1
                    #within_2_strain_2 += 1
                    #within_3_strain_2 += 1
                    is_filtered_kmer = True
                elif s2_count - int(strain.kmer_cutoff) == 2:
                    # within_1 += 1
                    within_2_strain_2 += 1
                    #within_3_strain_2 += 1
                    is_filtered_kmer = True
                elif s2_count - int(strain.kmer_cutoff) == 3:
                    # within_1 += 1
                    # within_2 += 1
                    within_3_strain_2 += 1
                    is_filtered_kmer = True

            complexity = [[k, len(list(g))] for k, g in groupby(kmer)]
            complexity = sorted(complexity, key=lambda l: l[1], reverse=True)
            complexity = sum([v for g, v in complexity[:3]])

            complexity_char = sorted(Counter(kmer).values(), reverse=True)
            if complexity_char[0] > (31.0 / 2):
                nucleotide_skew += 1
                is_filtered_kmer = True

            if complexity > complexity_cutoff:
                is_filtered_kmer = True
                complexity_count += 1

            if is_filtered_kmer is False:  # is_filtered_kmer is False:
                # counter_not_filtered += 1
                if s1_count == 0:
                    if self.qf[mer] == 0:
                        kept_2 += 1
                        if verbose:
                            print("strain: 2\tcount: {0}\t{1}|{2}\t{3}".format(
                                s2_count, kmer, rc(kmer), self.qf[mer]))

                    else:
                        below_cutoff_1 += 1
                        is_filtered_kmer = True

                else:
                    if strain.qf[mer] == 0:
                        kept_1 += 1
                        if verbose:
                            print("strain: 1\tcount: {0}\t{1}|{2}\t{3}".format(
                                s1_count, kmer, rc(kmer), strain.qf[mer]))

                    else:
                        below_cutoff_2 += 1
                        is_filtered_kmer = True

            # count updating
            if is_filtered_kmer:
                counter_filtered += 1
                if s1_count == 0:
                    filtered_2 += 1
                else:
                    filtered_1 += 1
            else:
                counter_not_filtered += 1

                # sys.stdout.write(">{0}\t{1}\t{2}\tcomplexity:{3}\n{4}\n".format(
                #    counter_not_filtered, s1_out, s2_out, complexity, kmer))
        s = "\n"
        s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
        s += "{0:.1f} X\tcoverage '{1}' [kmer cutoff = {2}]\n".format(
            self.coverage, self.name, self.kmer_cutoff)
        s += "{0:.1f} X\tcoverage '{1}' [kmer cutoff = {2}]\n".format(
            strain.coverage, strain.name, strain.kmer_cutoff)
        s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
        s += "{0}\tkmers found in '{1}' but not '{2}'\n".format(
            len(differences_1), self.name, strain.name)
        s += "{0}\tkmers found in '{1}' but not '{2}' [AFTER FILTERING]\n".format(
            kept_1, self.name, strain.name)
        s += "{0}\tkmers found in '{1}' but not '{2}'\n".format(
            len(differences_2), strain.name, self.name)
        s += "{0}\tkmers found in '{1}' but not '{2}' [AFTER FILTERING]\n".format(
            kept_2, strain.name, self.name)
        s += "~~~~~~~~~ FILTER ATTRS ~~~~~~~~~~~\n"
        s += "{1}\tHomopolymer runs summing >= {0} [sum of 3 homopolymer runs]\n".format(
            complexity_cutoff, complexity_count)
        s += "{0}\tHalf of the kmer contains a single base\n".format(
            nucleotide_skew)
        s += "{0} : {1} : {2}\twithin 1:2:3 count of cutoff [strain_1] " \
             "(e.g the kmer is near the histogram tail)\n".format(
            within_1_strain_1, within_2_strain_1, within_3_strain_1)
        s += "{0} : {1} : {2}\tWithin 1:2:3 count of cutoff [strain_2] " \
             "(e.g the kmer is near the histogram tail)\n".format(
            within_1_strain_2, within_2_strain_2, within_3_strain_2)
        s += "{1}\tkmers with excessive coverage [{0}X]\n".format(
            coverage_cutoff, coverage_100)
        s += "{0}\tkmer found below initial cutoff [strain_1]\n".format(
            below_cutoff_1)
        s += "{0}\tkmer found below initial cutoff [strain_2]\n".format(
            below_cutoff_2)
        s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
        s += "{0}\tkmers filtered\n{1}\tkmers retained\n".format(
            counter_filtered, counter_not_filtered)
        s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
        s += "\n"

        denom -= counter_filtered
        if strain_1_smallest:
            smallest_count -= filtered_1
        else:
            smallest_count -= filtered_2
        total = intersection / denom * 100.0
        rescue = rescue_numerator / smallest_count * 100.0
        return self.name, strain.name, total, rescue, denom, smallest_count
示例#16
0
#! /usr/bin/env python

import jellyfish
import sys

# Usage: <script> <jellyfish database> <mer> [<mer> ...]
# Prints "<mer> <count>" for every mer given on the command line. Mers are
# looked up exactly as typed; they are not canonicalised.
qf = jellyfish.QueryMerFile(sys.argv[1])
# The loop variable was renamed from `str`, which shadowed the builtin for
# the rest of the module.
for mer_string in sys.argv[2:]:
    print("%s %d" % (mer_string, qf[jellyfish.MerDNA(mer_string)]))

示例#17
0
def get_count(kmer, jf):
    """Return the value ``jf`` holds for the canonical form of ``kmer.seq``."""
    canonical_mer = jellyfish.MerDNA(kmer.seq)
    canonical_mer.canonicalize()
    count = jf[canonical_mer]
    return count
示例#18
0
def genquery(genomeFile, jellyFile, totedits, medindel, insprob, delprob,
             queryfreq, querycount, outputFile):
    """Write a random stream of genome edits interleaved with k-mer queries.

    Edits are written to ``outputFile`` as ``I <pos> <seq>`` (insertion),
    ``D <start> <end>`` (deletion) or ``S <pos> <base>`` (SNP). After every
    ``queryfreq`` edits, ``querycount`` lines ``Q <kmer> <flag> <count>`` are
    written, where ``count`` is looked up in the Jellyfish database and
    ``flag`` is ``I`` for a k-mer taken from the edit-affected set or ``N``
    for a k-mer taken from a random genome position.

    Both ``genomeFile`` and ``outputFile`` are closed by this function.

    Args:
        genomeFile: Open file whose first line is the genome sequence.
        jellyFile: Path of the Jellyfish database to query.
        totedits: Total number of edits to make.
        medindel: Median (mean) indel size, passed to the indel helpers.
        insprob: Probability of an insertion.
        delprob: Probability of a deletion.
        queryfreq: Number of edits between two batches of queries.
        querycount: Number of queries per batch.
        outputFile: Open, writable file receiving edits and queries.

    Raises:
        ValueError: If ``insprob + delprob`` exceeds 1.0.
    """
    if delprob + insprob > 1.0:
        # Raising a bare string is itself a TypeError; raise a real
        # exception carrying the intended message.
        raise ValueError("Error, delprob = {} and insprob = {}. "
                         "The sum is {} > 1.0".format(
                             delprob, insprob, delprob + insprob))

    genome = genomeFile.readline()
    genomeFile.close()
    qf = jellyfish.QueryMerFile(jellyFile)
    # Drop the last character of the line (presumably the newline).
    numbases = len(genome) - 1
    genome = genome[0:numbases]

    snpProb = 1.0 - (insprob + delprob)
    editTypes = (['S'] * int(snpProb * totedits)) +\
                (['D'] * int(delprob * totedits)) +\
                (['I'] * int(insprob * totedits))

    random.shuffle(editTypes)
    qcount = 0
    effectedkmers = set()
    for val in editTypes:
        qcount += 1
        if val == 'I':
            p, s, seq = random_insertion(numbases, medindel)
            numbases += s
            outputFile.write('I %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, seq)
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K])

        elif val == 'D':
            p, s = random_deletion(numbases, medindel)
            numbases -= s
            outputFile.write('D %d %d\n' % (p, p + s - 1))

        else:
            p, seq = random_snp(numbases)
            outputFile.write('S %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K - 1])

        # if it's time to output some queries
        if qcount == queryfreq:
            qcount = 0
            for _ in range(querycount):
                dart = random.random()
                # Fall back to a genome query while no k-mer has been
                # affected yet; sampling from an empty set would raise.
                if dart <= EDIT_QUERY_PROB and effectedkmers:
                    # random.sample() rejects sets on Python 3.11+, so a
                    # list copy is sampled instead.
                    kmer = random.sample(list(effectedkmers), 1)[0]
                    editflag = 'I'
                else:
                    p = random.randrange(K * 2, numbases - K * 2)
                    kmer = genome[p:p + K].upper()
                    editflag = 'N'

                kcount = int(qf[jellyfish.MerDNA(kmer)])
                outputFile.write('Q %s %s %d\n' % (kmer, editflag, kcount))

    outputFile.close()
示例#19
0
 def get_kmer_count(self, kmer):
     """Return the value ``self.qf`` holds for the canonical form of ``kmer``.

     ``kmer`` is converted with ``str()`` before the mer is built.
     """
     canonical_mer = jellyfish.MerDNA(str(kmer))
     canonical_mer.canonicalize()
     count = self.qf[canonical_mer]
     return count