예제 #1
0
    def set_jf_file(self, path):
        """
        Open ``path`` with both jellyfish readers and keep them on the instance.

        ``self.qf_filtered`` receives a ``jellyfish.QueryMerFile`` and
        ``self.rf`` a ``jellyfish.ReadMerFile``, both created from ``path``.

        :param path: path handed to both jellyfish readers
        :return: None
        """
        # The query reader is stored before the sequential reader is created,
        # so a failure in the second call leaves the first one in place.
        query_reader = jellyfish.QueryMerFile(path)
        self.qf_filtered = query_reader
        sequential_reader = jellyfish.ReadMerFile(path)
        self.rf = sequential_reader
예제 #2
0
    def __filter_jf_file(self):
        """
        Prepare the k-mer file to work on and open the jellyfish readers on it.

        When ``self.do_not_filter`` is set, ``self.path`` is used as is and
        ``self.filtered_jf_file`` is pointed at it. Otherwise ``jellyfish
        merge`` is run with ``-L kmer_cutoff + 1`` on ``self.path`` plus the
        packaged ``dummy_A.jf`` file, writing to ``self.filtered_jf_file``.

        In both cases ``self.qf_filtered`` and ``self.rf`` are then opened on
        the resulting file and ``self.__create_set()`` is called.

        :return: None
        """
        if not self.do_not_filter:
            dummy_jf_file = pkg_resources.resource_filename(
                'straintypemer', 'data/dummy_A.jf')
            lower_count = str(int(self.kmer_cutoff) + 1)
            merge_command = [
                "jellyfish", "merge", "-L", lower_count,
                "-o", self.filtered_jf_file,
                self.path, dummy_jf_file,
            ]
            # check_call raises CalledProcessError on a non-zero exit status.
            subprocess.check_call(merge_command)
            jf_source = self.filtered_jf_file
        else:
            self.filtered_jf_file = self.path
            jf_source = self.path

        self.qf_filtered = jellyfish.QueryMerFile(jf_source)
        self.rf = jellyfish.ReadMerFile(jf_source)
        self.__create_set()
예제 #3
0
 def setUp(self):
     """Open the ``sequence.jf`` file under ``data`` as a ReadMerFile on ``self.mf``."""
     jf_path = os.path.join(data, "sequence.jf")
     self.mf = jellyfish.ReadMerFile(jf_path)
예제 #4
0
    print "If jellyfish indexes are not built for folder with data, please run smash.sh"
    sys.exit()
# Second command-line argument: the k value used in the index/log file names.
k = int(sys.argv[2])

# Both logs are named by prefixing the first argument directly (no separator
# is inserted), so the argument is expected to end with a path separator.
cosineFile = open("%scosinek%d.log" % (sys.argv[1], k), 'w')
jaccardFile = open("%sjaccardk%d.log" % (sys.argv[1], k), 'w')

# Collect every regular file in the data folder whose name ends in 'k<k>.jf';
# these are the files / genomes to compare.
files = []
for f in listdir(sys.argv[1]):
    if f.endswith('k%d.jf' % (k)) and isfile(join(sys.argv[1], f)):
        files.append(sys.argv[1] + f)

#print files
for idx, jfi_1 in enumerate(files[:-1]):
    for jfi_2 in files[idx + 1:]:
        jfi1_RFile = jellyfish.ReadMerFile(jfi_1)
        jfi1_QFile = jellyfish.QueryMerFile(jfi_1)
        jfi2_RFile = jellyfish.ReadMerFile(jfi_2)
        jfi2_QFile = jellyfish.QueryMerFile(jfi_2)
        t1 = []
        t2 = []
        notUnion = 0
        for mer, count1 in jfi1_RFile:
            #print count1
            count2 = jfi2_QFile[mer]
            if count2 == 0:
                notUnion += 1
            t1.append(int(count1))
            t2.append(int(count2))
        for mer, count2 in jfi2_RFile:
            if jfi1_QFile[mer] == 0:
예제 #5
0
def _percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.0
    return part * 100.0 / whole


def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique 101-mer analysis of assemblies against a reference.

    For every assembly in ``contigs_fpaths`` four report fields are filled in:
    ``KMER_COMPLETENESS`` (share of reference k-mers found in the assembly) and
    ``KMER_SCAFFOLDS_ONE_CHROM`` / ``_MULTI_CHROM`` / ``_NONE_CHROM`` (share of
    assembly length whose sequences carry markers from exactly one, several,
    or fewer than ``MIN_MARKERS`` reference sequences).

    Assemblies for which ``check_jf_successful_check`` succeeds and whose
    ``<label>.stat`` file has at least four lines are served from that file
    instead of being recomputed.

    :param output_dir: directory holding the ``.stat`` files and the temporary
        ``.jf`` files produced by jellyfish
    :param ref_fpath: path to the reference FASTA file
    :param contigs_fpaths: paths to the assembly FASTA files
    :param logger: logger providing ``print_timestamp``, ``main_info``,
        ``info`` and ``warning``
    :return: None. The function logs a warning and returns early when
        jellyfish cannot be prepared or the reference yields no k-mers.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        # Pick a reload implementation that exists on the running interpreter:
        # importlib.reload (Python 3.4+), imp.reload (older Python 3; the imp
        # module was removed in 3.12) or the Python 2 builtin.
        try:
            from importlib import reload as reload_module
        except ImportError:
            try:
                from imp import reload as reload_module
            except ImportError:
                reload_module = reload  # noqa: F821 -- Python 2 builtin
        # NOTE(review): the reload presumably picks up a module that
        # compile_jellyfish has just built -- confirm.
        reload_module(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except Exception:
        # Best effort: any failure to build/import jellyfish disables the
        # analysis, but Ctrl-C and SystemExit are no longer swallowed.
        logger.warning('Failed unique 101-mers analysis.')
        return

    cached_fields = (
        reporting.Fields.KMER_COMPLETENESS,
        reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
        reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
        reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
    )
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if not check_jf_successful_check(output_dir, contigs_fpath,
                                         contigs_fpaths, ref_fpath):
            continue
        jf_stats_fpath = join(output_dir, label + '.stat')
        with open(jf_stats_fpath) as stats_file:
            stats_content = stats_file.read().split('\n')
        if len(stats_content) < 4:
            # Incomplete stats file: fall through to a fresh computation.
            continue
        logger.info('  Using existing results for ' + label + '... ')
        report = reporting.get(contigs_fpath)
        # The first four lines hold "<name>: <value>" for the four fields,
        # in the order of cached_fields.
        for field, line in zip(cached_fields, stats_content):
            report.add_field(
                field, '%.2f' % float(line.strip().split(': ')[-1]))
        checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if not contigs_fpaths:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    # NOTE(review): '-m 101' is hard-coded here while the Python bindings are
    # configured with KMERS_LEN above -- confirm the two always agree.
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    # NOTE(review): the database is deleted while the reader is still used
    # below; this relies on the reader keeping the file open -- confirm on
    # platforms that refuse to remove open files.
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    assemblies_num = len(contigs_fpaths)
    for kmer, _count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx, assembly_kmers in enumerate(contigs_kmers):
            if assembly_kmers[kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == assemblies_num:
            # Keep only every 100th k-mer present in all assemblies as a
            # marker.
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    if not unique_kmers:
        # Nothing to compare against; previously this crashed with a
        # ZeroDivisionError when computing completeness.
        logger.warning('Failed unique 101-mers analysis: '
                       'no k-mers were found in the reference.')
        return

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    # Map each marker k-mer to the reference sequence it occurs in (the last
    # one seen wins if it occurs in several).
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        for kmer in jellyfish.string_mers(seq):
            kmer_str = str(kmer)
            if kmer_str in shared_kmers:
                shared_kmers_by_chrom[kmer_str] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            chrom_markers = []
            for kmer in jellyfish.string_mers(seq):
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom_markers.append(shared_kmers_by_chrom[kmer_str])
            # Sequences with too few markers count as mapped to no chromosome.
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        # _percent_of yields 0.0 for an empty assembly instead of raising
        # ZeroDivisionError.
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % _percent_of(len_map_to_one_chrom, total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % _percent_of(len_map_to_multi_chrom, total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % _percent_of(len_map_to_none_chrom, total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
예제 #6
0
#! /usr/bin/env python

import jellyfish
import sys

# Dump every (mer, count) pair of the jellyfish file named by the first
# command-line argument, one "<mer> <count>" line per entry.
mf = jellyfish.ReadMerFile(sys.argv[1])
for mer, count in mf:
    sys.stdout.write("%s %d\n" % (mer, count))
def main(argv=None):
    """Convert a jellyfish k-mer count file into an HDF5 file.

    Reads ``opt.data_dir``, ``opt.data_file``, ``opt.save_dir``,
    ``opt.save_file`` and ``opt.min`` from ``parse_args(argv)``. Every k-mer
    whose count is at least ``opt.min`` is encoded as a row of integers
    (A=0, C=1, G=2, T=3, followed by the count) and handed to ``add_data``
    in batches.

    :param argv: argument list forwarded to ``parse_args``
    :return: None
    """
    opt = parse_args(argv)

    data_dir = opt.data_dir
    data_file = opt.data_file
    save_dir = opt.save_dir
    save_file = opt.save_file
    min_count = opt.min

    if save_dir is None:
        save_dir = data_dir

    if save_file is None:
        # Same name, different extension: drops the last three characters
        # (e.g. ".jf") before appending ".hdf5".
        save_file = data_file[:-3] + '.hdf5'

    data_file = os.path.join(data_dir, data_file)
    save_file = os.path.join(save_dir, save_file)

    print("We will process {}, keep the kmer that has a count >= {}, and save it in {}".format(
        data_file, min_count, save_file))

    start = time.time()
    mf = jellyfish.ReadMerFile(data_file)
    kmers = []
    kept_kmer = 0
    all_kmer = 0
    fmy = h5py.File(save_file, "w")

    # try/finally guarantees the HDF5 file is closed even if reading or
    # encoding fails part-way through.
    try:
        for i, (mer, count) in enumerate(mf):

            all_kmer += 1

            if count < min_count:
                continue

            kept_kmer += 1

            # Encode the nucleotides as digits so the row can be cast to int.
            mer = str(mer)
            mer = mer.replace('A', '0').replace('C', '1')
            mer = mer.replace('G', '2').replace('T', '3')

            sample = list(mer)
            sample.append(int(count))
            sample = np.array(sample).astype(int)
            kmers.append(sample)

            # Periodic progress report and flush of the accumulated batch.
            if i % 1000000 == 0:
                print("Done {} in {} seconds ".format(i, int(time.time() - start)))
                print("{} {} {}".format(i, mer, count))
                add_data(fmy, np.array(kmers))
                kmers = []

        # Flush whatever was collected after the last periodic flush;
        # without this the tail of the kept k-mers was never written.
        if kmers:
            add_data(fmy, np.array(kmers))
            kmers = []

        print("Keeping {}/{} kmers".format(kept_kmer, all_kmer))

        # TODO, add a tissue and patient group. Right now is only one single thing.
    finally:
        fmy.close()
    print("Done!")
예제 #8
0
 def setUp(self):
     """Open the ``swig_python.jf`` file under ``data`` as a ReadMerFile on ``self.mf``."""
     jf_path = os.path.join(data, "swig_python.jf")
     self.mf = jellyfish.ReadMerFile(jf_path)
예제 #9
0
 def stream_kmers(self):
     """Lazily yield ``(str(kmer), count)`` for each entry read from ``self.path``."""
     # Keep a named reference to the reader for the lifetime of the generator.
     reader = jellyfish.ReadMerFile(self.path)
     for entry_kmer, entry_count in reader:
         kmer_text = str(entry_kmer)
         yield kmer_text, entry_count