def set_jf_file(self, path):
    """
    Point this object at a Jellyfish database file.

    Opens ``path`` twice: once as a ``jellyfish.QueryMerFile`` (stored in
    ``self.qf_filtered``) and once as a ``jellyfish.ReadMerFile`` (stored in
    ``self.rf``).

    :param path: path to the Jellyfish file to open
    :return: None
    """
    # The query handle is opened first, then the read handle.
    self.qf_filtered = jellyfish.QueryMerFile(path)
    self.rf = jellyfish.ReadMerFile(path)
def __filter_jf_file(self):
    """
    Prepare the (optionally filtered) k-mer count file and open it.

    When ``self.do_not_filter`` is set, the raw file at ``self.path`` is used
    as-is and ``self.filtered_jf_file`` is pointed at it. Otherwise
    ``jellyfish merge`` is run with ``-L kmer_cutoff + 1`` over ``self.path``
    and the packaged ``data/dummy_A.jf`` file, writing the result to
    ``self.filtered_jf_file``.

    In both cases the resulting file is opened as ``self.qf_filtered``
    (QueryMerFile) and ``self.rf`` (ReadMerFile), and ``__create_set`` is
    called afterwards.

    :return: None
    """
    if not self.do_not_filter:
        # NOTE(review): the dummy file is presumably there because
        # "jellyfish merge" wants more than one input -- confirm.
        dummy_jf_file = pkg_resources.resource_filename(
            'straintypemer', 'data/dummy_A.jf')
        merge_command = [
            "jellyfish", "merge",
            "-L", str(int(self.kmer_cutoff) + 1),
            "-o", self.filtered_jf_file,
            self.path,
            dummy_jf_file,
        ]
        subprocess.check_call(merge_command)
    else:
        # No filtering requested: the "filtered" file is simply the raw one.
        self.filtered_jf_file = self.path

    self.qf_filtered = jellyfish.QueryMerFile(self.filtered_jf_file)
    self.rf = jellyfish.ReadMerFile(self.filtered_jf_file)
    self.__create_set()
    return
def setUp(self):
    """Open the ``sequence.jf`` fixture as a ReadMerFile for each test."""
    fixture_path = os.path.join(data, "sequence.jf")
    self.mf = jellyfish.ReadMerFile(fixture_path)
print "If jellyfish indexes are not built for folder with data, please run smash.sh" sys.exit() k = int(sys.argv[2]) cosineFile = open("%scosinek%d.log" % (sys.argv[1], k), 'w') jaccardFile = open("%sjaccardk%d.log" % (sys.argv[1], k), 'w') #build our list of files / genomes to compare files = [ sys.argv[1] + f for f in listdir(sys.argv[1]) if isfile(join(sys.argv[1], f)) and f.endswith('k%d.jf' % (k)) ] #print files for idx, jfi_1 in enumerate(files[:-1]): for jfi_2 in files[idx + 1:]: jfi1_RFile = jellyfish.ReadMerFile(jfi_1) jfi1_QFile = jellyfish.QueryMerFile(jfi_1) jfi2_RFile = jellyfish.ReadMerFile(jfi_2) jfi2_QFile = jellyfish.QueryMerFile(jfi_2) t1 = [] t2 = [] notUnion = 0 for mer, count1 in jfi1_RFile: #print count1 count2 = jfi2_QFile[mer] if count2 == 0: notUnion += 1 t1.append(int(count1)) t2.append(int(count2)) for mer, count2 in jfi2_RFile: if jfi1_QFile[mer] == 0:
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique 101-mer analysis of assemblies against a reference.

    For every assembly the function reports, through ``reporting``:

    * KMER_COMPLETENESS -- percentage of reference k-mers found in the
      assembly;
    * KMER_SCAFFOLDS_ONE_CHROM / MULTI_CHROM / NONE_CHROM -- percentage of
      assembly length in sequences whose marker k-mers map to exactly one
      reference sequence, to several, or that carry fewer than
      ``MIN_MARKERS`` markers.

    Assemblies for which ``check_jf_successful_check`` succeeds and whose
    ``<label>.stat`` file has at least four lines are filled in from that
    file and are not recomputed.

    :param output_dir: directory for Jellyfish outputs and ``.stat`` files
    :param ref_fpath: path to the reference FASTA file
    :param contigs_fpaths: list of assembly FASTA paths
    :param logger: logger providing ``print_timestamp``, ``main_info``,
        ``info`` and ``warning``
    :return: None
    """

    def percent(part, whole):
        # An empty reference or assembly yields 0% instead of a
        # ZeroDivisionError.
        return part * 100.0 / whole if whole else 0.0

    def run_jellyfish_count(fasta_fpath):
        # Count k-mers of one FASTA file; returns the path of the .jf output.
        # NOTE(review): '-m' is hard-coded to 101 while MerDNA.k() uses
        # KMERS_LEN -- confirm the two always agree.
        jf_fpath = join(output_dir, basename(fasta_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1',
            '-s', str(getsize(fasta_fpath)), '-o', jf_fpath,
            '-t', str(qconfig.max_threads), fasta_fpath
        ])
        return jf_fpath

    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        # Pick a reload function that exists on this interpreter: "imp" was
        # removed in Python 3.12 and the builtin reload() only exists on
        # Python 2.
        try:
            from importlib import reload as reload_module
        except ImportError:
            try:
                from imp import reload as reload_module
            except ImportError:
                reload_module = reload  # noqa: F821 (Python 2 builtin)
        reload_module(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        # Deliberately broad: any failure here only disables this analysis.
        logger.warning('Failed unique 101-mers analysis.')
        return

    # Reuse results cached in .stat files from a previous run.
    cached_fields = (
        reporting.Fields.KMER_COMPLETENESS,
        reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
        reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
        reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
    )
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            with open(jf_stats_fpath) as stats_file:
                stats_content = stats_file.read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # Each of the first four lines ends with ": <value>".
            for field, line in zip(cached_fields, stats_content):
                report.add_field(field, '%.2f' % float(line.strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if not contigs_fpaths:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = run_jellyfish_count(ref_fpath)
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    # The .jf file is removed as soon as it has been opened.
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = run_jellyfish_count(contigs_fpath)
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, _count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx, assembly_kmers in enumerate(contigs_kmers):
            if assembly_kmers[kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            # Keep every 100th k-mer present in all assemblies as a marker.
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = percent(matched_kmers[idx], unique_kmers)
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)

    # Map each marker k-mer to the reference sequence it occurs in.
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        for kmer in jellyfish.string_mers(seq):
            kmer_str = str(kmer)
            if kmer_str in shared_kmers:
                shared_kmers_by_chrom[kmer_str] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            chrom_markers = []
            for kmer in jellyfish.string_mers(seq):
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom_markers.append(shared_kmers_by_chrom[kmer_str])
            # Sequences with too few markers count as "none".
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)
        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom

        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % percent(len_map_to_one_chrom, total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % percent(len_map_to_multi_chrom, total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % percent(len_map_to_none_chrom, total_len))
        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
#! /usr/bin/env python
import jellyfish
import sys

# Dump every (mer, count) record of the Jellyfish file named by the first
# command-line argument, one "<mer> <count>" pair per line on stdout.
mf = jellyfish.ReadMerFile(sys.argv[1])
for mer, count in mf:
    sys.stdout.write("%s %d\n" % (mer, count))
def main(argv=None):
    """Convert a Jellyfish k-mer file into an HDF5 file of encoded k-mers.

    Reads every (mer, count) record from the Jellyfish file, drops records
    whose count is below ``opt.min``, encodes each base as a digit
    (A=0, C=1, G=2, T=3), appends the count, and hands the rows to
    ``add_data`` in batches.

    The output directory defaults to the data directory. The output file name
    defaults to the data file name with its last three characters replaced by
    ``.hdf5``.

    :param argv: argument list passed to ``parse_args`` (None by default)
    :return: None
    """
    opt = parse_args(argv)
    data_dir = opt.data_dir
    data_file = opt.data_file
    save_dir = opt.save_dir
    save_file = opt.save_file
    min_count = opt.min

    if save_dir is None:
        save_dir = data_dir
    if save_file is None:
        # Same name, different extension (drops the last 3 characters,
        # e.g. ".jf").
        save_file = data_file[:-3] + '.hdf5'

    data_file = os.path.join(data_dir, data_file)
    save_file = os.path.join(save_dir, save_file)

    print("We will process {}, keep the kmer that has a count >= {}, and save it in {}".format(
        data_file, min_count, save_file))

    start = time.time()
    mf = jellyfish.ReadMerFile(data_file)
    kmers = []
    kept_kmer = 0
    all_kmer = 0

    fmy = h5py.File(save_file, "w")
    try:
        for i, (mer, count) in enumerate(mf):
            all_kmer += 1
            if count < min_count:
                continue
            kept_kmer += 1

            # Encode the bases as digits and append the count as last column.
            mer = str(mer)
            mer = mer.replace('A', '0').replace('C', '1').replace('G', '2').replace('T', '3')
            sample = list(mer)
            sample.append(int(count))
            kmers.append(np.array(sample).astype(int))

            # Flush the batch when a kept record lands on a multiple of one
            # million; records skipped above never trigger a flush.
            if i % 1000000 == 0:
                print("Done {} in {} seconds ".format(i, int(time.time() - start)))
                print("{} {} {}".format(i, mer, count))
                add_data(fmy, np.array(kmers))
                kmers = []

        # Write the k-mers collected since the last periodic flush; without
        # this the final partial batch would never reach the file.
        if kmers:
            add_data(fmy, np.array(kmers))
            kmers = []

        print("Keeping {}/{} kmers".format(kept_kmer, all_kmer))
        # TODO, add a tissue and patient group. Right now is only one single thing.
    finally:
        # Close the HDF5 file even if reading or writing fails part-way.
        fmy.close()

    print("Done!")
def setUp(self):
    """Open the ``swig_python.jf`` fixture as a ReadMerFile for each test."""
    fixture_path = os.path.join(data, "swig_python.jf")
    self.mf = jellyfish.ReadMerFile(fixture_path)
def stream_kmers(self):
    """Lazily yield ``(kmer_string, count)`` pairs from ``self.path``.

    The Jellyfish file is opened with ``jellyfish.ReadMerFile`` when the
    generator is first advanced; each mer is converted with ``str`` before
    being yielded.
    """
    for mer, occurrences in jellyfish.ReadMerFile(self.path):
        yield str(mer), occurrences