def test_read_bundler():
    """ReadBundle keeps raw sequences and exposes N-substituted cleaned ones."""
    infile = utils.get_test_data('unclean-reads.fastq')
    records = list(clean_input_reads(screed.open(infile)))
    bundle = khmer.utils.ReadBundle(*records)

    raw_seqs = (
        'GGTTGACGGGGNNNAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGANNNCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )
    cleaned_seqs = (
        'GGTTGACGGGGAAAAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGAAAACCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    assert bundle.num_reads == 2
    assert bundle.total_length == 200

    # each bundled read retains its raw sequence and carries a cleaned copy
    for record, expected_raw, expected_clean in zip(bundle.reads, raw_seqs,
                                                    cleaned_seqs):
        assert record.sequence == expected_raw
        assert record.cleaned_seq == expected_clean
def verbose_loader(filename):
    """Screed iterator that additionally prints progress info to stderr."""
    reads = clean_input_reads(screed.open(filename))
    for count, record in enumerate(reads):
        # emit a progress line every 100k records (including the first)
        if count % 100000 == 0:
            log_info('... filtering {num}', num=count)
        yield record
def test_read_bundler_empty_file():
    """An empty input file produces a bundle containing zero reads."""
    infile = utils.get_test_data('empty-file')
    cleaned = list(clean_input_reads(screed.open(infile)))
    bundle = khmer.utils.ReadBundle(*cleaned)
    assert bundle.num_reads == 0
def test_read_bundler_single_read():
    """A single already-clean read has cleaned_seq identical to sequence."""
    infile = utils.get_test_data('single-read.fq')
    cleaned = list(clean_input_reads(screed.open(infile)))
    bundle = khmer.utils.ReadBundle(*cleaned)

    assert bundle.num_reads == 1
    only_read = bundle.reads[0]
    assert only_read.sequence == only_read.cleaned_seq
def main():
    """Classify reads against a k-mer -> gene-family database, report fusions.

    Loads a BBHash minimal-perfect-hash function plus pickled lookup arrays
    (k-mer hash, cDBG node id, family ids per cDBG node) and then scans every
    input read.  Reads whose left and right ends map to disjoint gene
    families are written out as candidate fusion reads; properly-paired reads
    whose two mates map to disjoint families are written out as candidate
    fusion pairs.  Several tab-separated report files and FASTA files are
    produced next to the database.

    NOTE(review): the counters n_unmatched, n_same, n_amb_same,
    n_clear_fusion, n_ambig_fusion and n_mutli_fusion are declared ``global``
    in readFusion() and printed at the end, so they must be initialised at
    module level (not visible in this chunk) -- confirm.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    args = parser.parse_args()

    force_single = args.force_single

    #if args.reads == '-':
    #    args.reads = sys.stdin

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create object of Nodetable in Khmer to use its k-mer hashing
    # (table sizes of 1 -- only get_kmer_hashes() is used, not storage)
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database: <database>.mphf holds the BBHash MPHF,
    # <database>.arr holds the pickled lookup arrays
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        """Return the set of family ids for a k-mer hash.

        Returns an empty set when the k-mer is not in the database; the MPHF
        lookup is verified against the stored k-mer hash to reject alien
        k-mers that hash into the MPHF by accident.
        """
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        """Classify one read; return (flag, families, shared_kmers, gaps).

        flag is one of "unmatched", "unique", "ambiguous", "clear_fusion",
        "ambig_fusion" or "multi_fusion".  families is the list of candidate
        family-id sets encountered left-to-right along the read;
        shared_kmers counts the matching k-mers supporting each family set;
        gaps records runs of unmatched k-mers between family switches.
        Updates the module-level classification counters.
        """
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, n_ambig_fusion, n_mutli_fusion
        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            # no k-mer in the read matched the database at all
            #print('no single match')
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            # only the very last k-mer matched -- single-reference read
            #print('same, only last kmer matched')
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 & idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                # only one (non-last) k-mer matched -- single-reference read
                #print('same, only one non-last kmer matched ')
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                # both ends matched: intersect the two candidate sets
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    # both ends agree on at least one family -- not a fusion
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:  # fusion to be resolved
                    # walk the interior k-mers, narrowing the current family
                    # set; every time it becomes disjoint, record a family
                    # switch along with its support and the gap before it
                    shared_kmer = 1
                    gap_size = 0
                    gap = False  # NOTE(review): assigned but never used
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                # same family run continues; narrow it
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                # family switch: close out the previous run
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            # unmatched k-mer inside the read
                            gap_size += 1
                        idx += 1
                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    # a fusion read must have seen at least two family runs
                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_mutli_fusion += 1
                        flag = "multi_fusion"

        #if len(families) == 0:
        #    families = "-"

        #if len(shared_kmers) == 0:
        #    shared_kmers = "-"

        return flag, families, shared_kmers, gaps

    # open output files: FASTA + info + calc for single-read fusions,
    # and the same trio for paired-read fusions
    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "gene_families", "shared_kmers", "gaps", file=fusionInfo_fp,
          sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "familiy_A", "familiy_B", "no_families", "len_families",
          "shared_kmers", "gaps", "sorted_keys", file=fusionCalc_fp, sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "R1_family", "R2_family",
          file=fusPairInfo_fp, sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "familiy_A", "familiy_B",
          "len_families", "sorted_keys", file=fusPairCalc_fp, sep='\t')

    corrupt_files = []
    # invert family_ids (name -> id) into id -> name for report output
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    # flag groups: reads mapping to one reference vs. fusion candidates
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")
    for filename, require_paired in files:
        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp, fusionCalc_fp,
                             fusPair_fp, fusPairInfo_fp, fusPairCalc_fp,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)
            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)
                    #if n > 5000:
                    #    break

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                # unpaired read that looks like a fusion: report it
                if not is_paired and flag0 in fusion:
                    #families_names0 = []
                    #for gp in families0:
                    #    gp_names = []
                    #    for family_id in gp:
                    #        family_name = family_names[family_id]
                    #        gp_names.append(family_name)
                    #    families_names0.append(gp_names)

                    print(filename, r_index, "single", flag0, families0,
                          shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t')
                    write_record(read0, fusion_fp)

                    #i = 1
                    #while i < len(families0):
                    #    for g1 in families0[i-1]:
                    #        for g2 in families0[i]:
                    #            print(filename, r_index, "single", flag0, sorted([g1,g2]), len(families0), len(families0[i-1]), len(families0[i]),
                    #                  shared_kmers0, gaps0, file=fusionCalc_fp, sep='\t')
                    #    i += 1

                    # emit one calc row per (first-family, last-family) pair
                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename, r_index, "single", flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name), len(families0),
                                  [len(f) for f in families0],
                                  shared_kmers0, gaps0,
                                  sorted([g1, g2]), file=fusionCalc_fp,
                                  sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    # either mate is itself a fusion candidate: report both
                    if flag0 in fusion or flag1 in fusion:
                        print(filename, r_index, "Read_1", flag0, families0,
                              shared_kmers0, gaps0, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)

                        print(filename, r_index, "Read_2", flag1, families1,
                              shared_kmers1, gaps1, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_1", flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0, gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_2", flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1, gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                    # both mates map cleanly but to disjoint families:
                    # candidate paired-end fusion
                    elif flag0 in sameRef and flag1 in sameRef:
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename, r_index, fusion_class, families0,
                                  families1, file=fusPairInfo_fp, sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          [len(f) for f in (families0[0],
                                                            families1[0])],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp, sep='\t')

    # final summary of classification counters
    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_mutli_fusion)
    print('paired read fusion:', n_paired_fusion)
def test_clean_input_reads():
    """khmer.Read attributes are read-only, so cleaning must fail on them."""
    reads = [khmer.Read(name='seq1/1', sequence='ACGT')]
    cleaner = clean_input_reads(reads)
    with pytest.raises(AttributeError):
        next(cleaner)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Digitally normalize input reads with a k-mer countgraph.

    Parses CLI args, sanity-checks input files and disk space, loads or
    builds a countgraph, then streams every input file through a Normalizer
    (wrapped in WithDiagnostics for periodic reporting), writing kept reads
    to per-file '.keep' outputs or a single combined output.  Optionally
    saves the countgraph afterwards and reports the estimated false-positive
    rate.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    configure_logging(args.quiet)

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        # the extra unpaired-reads file is never held to the pairing rule
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)
    else:
        # stdin input has no usable basename for a '.keep' file, so it
        # requires an explicit single output file
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm; None records were discarded by the
            # normalizer
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))