def get_parser():
    """Return the parser object for the oxli subcommand handler."""
    parser = argparse.ArgumentParser(
        description='Single entry point script for khmer',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph.py) parsers here
    parser_build_graph = \
        subparsers.add_parser('build-graph',
                              help="Load sequences into the compressible "
                                   "graph format plus optional tagset",
                              description="Load sequences into the "
                                          "compressible graph format plus "
                                          "optional tagset")
    khmer_args.build_nodegraph_args("Load sequences into the compressible "
                                    "graph format plus optional tagset.",
                                    None, parser=parser_build_graph)
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
def get_parser():
    parser = build_nodegraph_args('Takes a partitioned reference file '
                                  'and a list of reads, and sorts reads '
                                  'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir', default='',
                        help='output directory; default is location of '
                             'the fastp file')
    parser.add_argument('--query', dest='query', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db', dest='db', nargs='+',
                        help='Database reads for sweep', required=True)
    return parser
def get_parser():
    epilog = """\
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of :program:`load-graph.py`,
    :program:`partition-graph.py`, :program:`merge-partitions.py`, and
    :program:`annotate-partitions.py` into one script. This is convenient
    but should probably not be used for large data sets, because
    :program:`do-partition.py` doesn't provide save/resume functionality.

    Example::

        do-partition.py -k 20 example tests/test-data/random-20-a.fa
    """
    parser = build_nodegraph_args(
        descr='Load, partition, and annotate FAST[AQ] sequences',
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help="base name for output files")
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    parser = build_nodegraph_args(
        descr="Load sequences into the compressible "
              "graph format plus optional tagset.",
        citations=['graph', 'SeqAn'])
    parser = build_graph.build_parser(parser)
    return parser
def main():
    parser = khmer_args.build_nodegraph_args()
    parser.add_argument("--samples", nargs="+")
    parser.add_argument("--save-prefix")
    parser.add_argument("--print-tree", action="store_true", default=False)
    args = parser.parse_args()

    if not args.save_prefix:
        print("No save prefix specified! Exiting...", file=sys.stderr)
        sys.exit(1)

    factory = NodegraphFactory(args)
    root = sbt.Node(factory)

    for sample_fn in args.samples:
        print("*** Build node for", sample_fn)
        leaf = sbt.Leaf(os.path.basename(sample_fn),
                        os.path.basename(sample_fn),
                        factory.create_nodegraph())
        print("--- Consuming file...")
        leaf.graph.consume_fasta(sample_fn)
        print("--- Adding node to SBT...")
        root.add_node(leaf)
        print("--- Done with", sample_fn)

    if args.print_tree:
        sbt.print_sbt(root)

    print("\n*** Saving to disk")
    fn = sbt.save_sbt(root, args.save_prefix)
    print("--- Save to", fn)
def get_parser():
    parser = build_nodegraph_args(descr="Load sequences into the compressible "
                                        "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    return parser
def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet', action='store_true')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a nodegraph data structure (the parser above supplies
    # nodegraph parameters, so build the matching graph type)
    ht = khmer_args.create_nodegraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_seqfile(inp)

    print('starting sweep.')

    m = 0
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)

    print('swept', m, 'reads into', outfile)
    outfp.close()
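# A quick sketch of the presence test used above. On a nodegraph,
# get_median_count() returns a (median, average, stddev) tuple over a
# sequence's k-mer counts, so a nonzero median means most of the read's
# k-mers are present in the graph. Toy data; parameters are illustrative.
import khmer

ng = khmer.Nodegraph(21, 1e6, 4)
ng.consume('A' * 40)                        # load one toy "contig"
med, avg, dev = ng.get_median_count('A' * 30)
print(med)                                  # 1: k-mers present
print(ng.get_median_count('C' * 30)[0])     # 0: k-mers absent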
def get_parser():
    """Return the parser object for the oxli subcommand handler."""
    parser = argparse.ArgumentParser(
        description='Single entry point script for khmer',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph.py) parsers here
    parser_build_graph = \
        subparsers.add_parser(
            name='build-graph',
            help="Load sequences into the compressible graph format "
                 "plus optional tagset")

    parser_build_graph = build_nodegraph_args(parser=parser_build_graph)
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
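# Minimal sketch of how this entry point would dispatch; hypothetical
# __main__ wiring, not part of the parser above. set_defaults(func=...)
# is the standard argparse pattern: parse_args() picks the subcommand,
# and args.func carries the handler to call.
if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    # e.g. `oxli build-graph ...` ends up in build_graph.main(args);
    # with no subcommand given, args has no func attribute.
    args.func(args)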
def get_parser():
    parser = build_nodegraph_args('Takes a partitioned reference file '
                                  'and a list of reads, and sorts reads '
                                  'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size',
                        type=int, default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer '
                             'before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of '
                             'the fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true',
                          help='separate reads by reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true',
                          help='separate reads by reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups '
                               'of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
def main():
    parser = khmer_args.build_nodegraph_args()
    parser.add_argument('--samples', nargs='+')
    parser.add_argument('--save-prefix')
    parser.add_argument('--print-tree', action='store_true', default=False)
    args = parser.parse_args()

    if not args.save_prefix:
        print('No save prefix specified! Exiting...', file=sys.stderr)
        sys.exit(1)

    factory = NodegraphFactory(args)
    tree = sbt.SBT(factory)

    for sample_fn in args.samples:
        print('*** Build node for', sample_fn)
        leaf = sbt.Leaf(os.path.basename(sample_fn),
                        factory.create_nodegraph())

        fname = os.path.join('.sbt.' + args.save_prefix,
                             ".".join([args.save_prefix,
                                       os.path.basename(sample_fn), 'sbt']))
        if os.path.exists(fname):
            print('--- Loading existing file...')
            leaf.graph.load(fname)
        else:
            print('--- Consuming file...')
            leaf.graph.consume_fasta(sample_fn)

        print('--- Adding node to SBT...')
        tree.add_node(leaf)
        print('--- Done with', sample_fn)

    if args.print_tree:
        tree.print()

    print('\n*** Saving to disk')
    fn = tree.save(args.save_prefix)
    print('--- Save to', fn)
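# Both SBT builders above assume a NodegraphFactory that is not shown.
# A minimal sketch of what such a factory might look like, assuming it
# only needs to stamp out khmer Nodegraphs from the nodegraph parameters
# (ksize / max_tablesize / n_tables) that build_nodegraph_args parses.
# This is a hypothetical reconstruction, not the project's actual class.
import khmer

class NodegraphFactory(object):
    def __init__(self, args):
        self.ksize = args.ksize
        self.starting_size = args.max_tablesize
        self.n_tables = args.n_tables

    def create_nodegraph(self):
        # every internal node and leaf gets an identically-sized graph
        return khmer.Nodegraph(self.ksize, self.starting_size,
                               self.n_tables)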
def main():
    parser = build_nodegraph_args()
    parser.add_argument('readfile1',
                        help='fasta sequence file to be loaded in hashtable, '
                             'use "-" if from stdin')
    parser.add_argument('readfile2',
                        help='fasta readfile to query against hashtable, '
                             'use "-" if from stdin')
    parser.add_argument('--shared', help='shared kmer in readfile 1 and 2')
    parser.add_argument('--uniq2', help='uniq kmer in readfile2')
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()
    print(args)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    readfile1 = args.readfile1
    readfile2 = args.readfile2
    shared = args.shared
    uniq2 = args.uniq2

    if readfile1 == '-' and readfile2 == '-':
        mes = ('*** Only one of readfile1 and readfile2 '
               'can be read from stdin')
        print(mes, file=sys.stderr)
        sys.exit(1)

    fw2 = None
    try:
        if readfile1 == '-':
            fp1 = sys.stdin
        else:
            fp1 = open(readfile1)
        if readfile2 == '-':
            fp2 = sys.stdin
        else:
            fp2 = open(readfile2)
        if uniq2:
            fw2 = open(uniq2, 'w')

        # create a hashbits data structure
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)

        # load contigs, connect into N partitions
        print('loading input reads from {}..'.format(
            os.path.basename(readfile1)), file=sys.stderr)
        # ht.consume_seqfile(readfile1)
        for record in fasta_iter(fp1):
            ht.consume(record.sequence)

        # Change 0.2 only if you really grok it. HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print('fp rate estimated to be {:1.3f}'.format(fp_rate),
              file=sys.stderr)

        if fp_rate > 0.01:
            mes = ('**\n'
                   '** ERROR: the counting hash is too small for\n'
                   '** {}. Increase hashsize/num ht.\n'
                   '**\n'
                   '** Do not use these results!!')
            print(mes.format(os.path.basename(readfile1)), file=sys.stderr)
            sys.exit(-1)

        n_unique1 = ht.n_unique_kmers()

        # create a second hashbits data structure for readfile2
        ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

        n_unique2 = 0
        n_shared = 0
        for n, record in enumerate(fasta_iter(fp2)):
            name = record['name']
            sequence = record['sequence']
            seq_len = len(sequence)
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if not ht2.get(kmer):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    elif fw2:  # only write uniq-to-readfile2 kmers on request
                        mes = '>{}__{} length_{};k_{}\n{}\n'
                        fw2.write(mes.format(name, i, seq_len, K, kmer))
                ht2.count(kmer)

        mes = ('Unique kmer in {}:\t{}\n'
               'Shared kmer:\t{}\n'
               'Unique kmer in {}:\t{}\n')
        print(mes.format(os.path.basename(readfile1), n_unique1,
                         n_shared,
                         os.path.basename(readfile2), n_unique2))
    finally:
        fp1.close()
        fp2.close()
        if fw2:
            fw2.close()
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument(
        'ref', nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    parser.add_argument('--bfout', default='nodetable.bf',
                        help='output bloom filter of ref')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    # positional
    refs = args.ref

    start_time = time.time()
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)

    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)

    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    print('Saving bloom filter to {}..'.format(args.bfout), file=sys.stderr)
    ht.save(args.bfout)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()
    mes = 'Unique kmer:\t{}\n'
    print(mes.format(n_unique1), file=sys.stderr)
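# Sketch of querying the filter saved above, assuming the same
# khmer.load_nodetable() helper that the load-capable variant below
# relies on; 'nodetable.bf' is this script's default --bfout path.
import khmer

ht = khmer.load_nodetable('nodetable.bf')
k = ht.ksize()
# membership test for one k-mer: nonzero means "probably present"
# (a Bloom-filter hit can be a false positive, never a false negative)
print(ht.get('A' * k))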
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')
    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared', dest='output', action='store_const',
                       const='shared', help='output shared kmers')
    group.add_argument('--uniq', dest='output', action='store_const',
                       const='uniq', help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--ref', nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')

    args = parser.parse_args()
    # print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()

    if args.load:
        # load from existing bloom filter
        print('loading bloom filter from {}..'.format(args.load),
              file=sys.stderr)
        ht = khmer.load_nodetable(args.load)
        k = ht.ksize()
        mes = ('*** incompatible ksize ({}) in {} with parameters K on '
               'command line ({})')
        assert k == K, mes.format(k, args.load, K)

        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)
    else:
        # create a hashbits data structure
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref can not both be "-" (read from stdin)',
                  file=sys.stderr)
            sys.exit(1)

        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initiation of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)

        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded within {:.2f} hours ..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)
            try:
                ht.consume_seqfile(filename)
            except OSError as e:
                mes = ('*** Skipping due to OSError (machine or system '
                       'problem): {}\n'
                       '*** Detailed error message:\n'
                       '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

    if args.bfout:
        if args.load:
            mes = '*** Bloom filter exists as {}, NOT saving again as {}..'
            print(mes.format(args.load, args.bfout), file=sys.stderr)
        else:
            print('*** Saving bloom filter to {}..'.format(args.bfout),
                  file=sys.stderr)
            ht.save(args.bfout)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # create a second hashbits data structure to track query kmers
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

    n_unique2 = 0
    n_shared = 0
    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if not ht2.get(kmer):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                ht2.count(kmer)

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if not ht2.get(kmer):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                ht2.count(kmer)

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')
    print(mes.format(os.path.basename(query), n_unique2,
                     n_shared,
                     'refs', n_unique1),
          file=sys.stderr)
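# The sliding-window bookkeeping above is easier to see with exact sets
# in place of the two probabilistic Nodetables. A toy illustration of
# the uniq/shared counting (plain Python, not the khmer API; sequences
# and K are made up for the example).
K = 4
ref_kmers = {'ACGT', 'CGTA'}   # stands in for ht (the reference filter)
seen = set()                   # stands in for ht2 (query kmers seen so far)
n_unique2 = n_shared = 0

query_seq = 'ACGTAC'
for i in range(len(query_seq) + 1 - K):
    kmer = query_seq[i:i + K]
    if kmer not in seen:       # count each distinct query kmer once
        n_unique2 += 1
        if kmer in ref_kmers:
            n_shared += 1      # shared with the reference
    seen.add(kmer)

print(n_unique2, n_shared)     # 3 distinct query kmers, 2 shared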
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref', nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()
    # print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)

    if query == '-' and refs == ['-']:
        print('*** query and ref can not both be "-" (read from stdin)',
              file=sys.stderr)
        sys.exit(1)

    # create a hashbits data structure
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)

    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
    for n, is_pair, r1, r2 in broken_paired_reader(khmer.ReadParser(query),
                                                   require_paired=True):
        total_pair += 1
        share_list = []
        for record in [r1, r2]:
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if ht.get(kmer):
                    share_list.append(1)
                    break
            else:
                share_list.append(0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  # [0, 0]
            other += 1
            # do not print
            continue

        mes = ('>{} {}||uniq_{}\n{}\n'
               '>{} {}||uniq_{}\n{}')
        l1 = r1.name.split(None, 1)
        l2 = r2.name.split(None, 1)
        print(mes.format(l1[0], l1[1], share_list[0], r1.sequence,
                         l2[0], l2[1], share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')
    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)