def test_extract_partitions_output_unassigned():
    """Run extract-partitions.py with ``-U`` and check the files it writes.

    A graph for ``random-20-a.fa`` is built with ``_make_graph`` (with
    partitioning and annotation enabled) and the script is run on the
    ``.part`` file next to it.  The test asserts that the ``.dist``,
    ``group0000`` and ``unassigned`` outputs exist, that the first line of
    the distribution file is ``99 1 1 99``, and that the part file holds
    99 records sharing a single partition label.
    """
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(
        seqfile, do_partition=True, annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['-U', 'extracted', partfile]
    utils.runscript(script, args, in_dir)

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    unassigned_file = os.path.join(in_dir, 'extracted.unassigned.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)
    assert os.path.exists(unassigned_file)

    # Read via a context manager so the handle is closed deterministically.
    with open(distfile) as dist_fp:
        dist = dist_fp.readline()
    assert dist.strip() == '99 1 1 99'

    # The partition label is the second tab-separated field of the name.
    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
def test_make_initial_stoptags():
    """Run make-initial-stoptags.py on inputs produced by load-graph.py.

    The graph inputs are generated on the fly with ``load-graph.py -t``
    instead of being stored as test data, so this test assumes load-graph
    works.  The only assertion is that the ``test-reads.stoptags`` path
    obtained from ``utils.get_temp_filename`` exists after the run.
    """
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # NOTE(review): a reference file 'test-reads.stoptags' is available via
    # utils.get_test_data, but the output is not compared against it here;
    # only its existence is checked.

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
def test_extract_paired_reads_1_fa():
    """Compare extract-paired-reads.py output for ``paired-mixed.fa``.

    The script is run on the FASTA test file; the ``.pe`` and ``.se``
    files it is expected to produce are then compared record by record
    (name and sequence) with the reference copies from the test data.
    """
    # reference input and expected outputs
    source_fa = utils.get_test_data('paired-mixed.fa')
    expected_pe = utils.get_test_data('paired-mixed.fa.pe')
    expected_se = utils.get_test_data('paired-mixed.fa.se')

    # paths where the actual outputs are looked for
    actual_pe = utils.get_temp_filename('paired-mixed.fa.pe')
    workdir = os.path.dirname(actual_pe)
    actual_se = utils.get_temp_filename('paired-mixed.fa.se', workdir)

    utils.runscript(scriptpath('extract-paired-reads.py'), [source_fa],
                    workdir)

    assert os.path.exists(actual_pe), actual_pe
    assert os.path.exists(actual_se), actual_se

    for expected_fn, actual_fn in ((expected_pe, actual_pe),
                                   (expected_se, actual_se)):
        compared = 0
        for want, got in zip(screed.open(expected_fn),
                             screed.open(actual_fn)):
            compared += 1
            assert want.name == got.name
            assert want.sequence == got.sequence
        # at least one record pair must have been compared
        assert compared > 0
def test_feature_extraction_metamark():
    """Run feature_extraction_metamark.py and spot-check its vector file.

    The script is run on a copy of ``example.mito.fasta.shreded.subset``
    with taxid ``12345``.  The output must contain 30 lines; selected
    prefixes and suffixes of the first, second and last lines are
    compared against fixed strings.
    """
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)
    outfile = infile + '.metamark_vector'

    script = scriptpath('feature_extraction_metamark.py')
    # NOTE: both paths below are resolved against the current working
    # directory at the time the test runs.
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print(mmp)
    tmp = os.path.abspath("./")
    print(tmp)
    print(in_dir)
    args = [
        "--input", infile,
        "--outfile", outfile,
        "--tmp", tmp,
        "--mmp", mmp,
        "--taxid", "12345",
    ]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile), outfile
    print(outfile)

    # Close the handle deterministically instead of leaking it.
    with open(outfile) as vector_fp:
        data = [x.strip() for x in vector_fp]
    print(len(data))

    assert len(data) == 30
    # startswith()/endswith() already return booleans; no '== True' needed.
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith("0.018679\t0.016415\t0.016415")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|295582..300582")
    assert data[-1].endswith("0.023325\t0.019296\t0.021310")

    utils.cleanup()
def test_filter_stoptags():
    """Run filter-stoptags.py with a single stop tag and check the result.

    A stop-tag file holding one 18-mer is saved from a hashbits table,
    the script is run on a copy of ``test-abund-read-2.fa`` (passed
    twice), and the ``.stopfilt`` output must contain exactly one unique
    sequence, ``GGTTGACGGGGCTCAGGG``.
    """
    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    stoptag_fn = utils.get_temp_filename('stoptags', workdir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), reads_fn)

    # now, create a file with some stop tags in it --
    K = 18
    table = khmer.new_hashbits(K, 1, 1)
    table.add_stop_tag('GTTGACGGGGCTCAGGGG')
    table.save_stop_tags(stoptag_fn)
    del table

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    cmdline = ['-k', str(K), stoptag_fn, reads_fn, reads_fn]
    utils.runscript(script, cmdline, workdir)

    # verify that the basic output file exists
    filtered_fn = reads_fn + '.stopfilt'
    assert os.path.exists(filtered_fn), filtered_fn

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    unique_seqs = set(rec.sequence for rec in screed.open(filtered_fn))
    assert len(unique_seqs) == 1, unique_seqs
    assert 'GGTTGACGGGGCTCAGGG' in unique_seqs, unique_seqs
def test_extract_partitions_output_unassigned():
    """Run extract-partitions.py with ``-U`` and check the files it writes.

    A graph for ``random-20-a.fa`` is built with ``_make_graph`` (with
    partitioning and annotation enabled) and the script is run on the
    ``.part`` file next to it.  The test asserts that the ``.dist``,
    ``group0000`` and ``unassigned`` outputs exist, that the first line of
    the distribution file is ``99 1 1 99``, and that the part file holds
    99 records sharing a single partition label.
    """
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(seqfile, do_partition=True,
                            annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['-U', 'extracted', partfile]
    utils.runscript(script, args, in_dir)

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    unassigned_file = os.path.join(in_dir, 'extracted.unassigned.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)
    assert os.path.exists(unassigned_file)

    # Read via a context manager so the handle is closed deterministically.
    with open(distfile) as dist_fp:
        dist = dist_fp.readline()
    assert dist.strip() == '99 1 1 99'

    # The partition label is the second tab-separated field of the name.
    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
def test_abund_dist_gz_bigcount():
    """Check that a gzip-compressed counting table loads with big counts.

    A table is created with load-into-counting.py, compressed with gzip
    and loaded back through ``khmer.load_counting_hash``.  The test then
    requires the abundance distribution of the input to have a non-zero
    entry at an index above 255.
    """
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    with open(htfile, 'rb') as table_fp:
        data = table_fp.read()

    # compress the created bigcount table; try/finally guarantees the gzip
    # handle is closed even if the write fails.
    f_out = gzip.open(outfile, 'wb')
    try:
        f_out.write(data)
    finally:
        f_out.close()

    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)

    # calculate abundance distribution for compressed bigcount table
    abundances = counting_hash.abundance_distribution(infile, tracking)

    # check if abundance is > 255
    # if ok gzipped bigcount was loaded correctly
    saw_bigcount = False
    for abundance, count in enumerate(abundances):
        print('%s %s' % (abundance, count))
        if abundance > 255 and count > 0:
            saw_bigcount = True
            break
    assert saw_bigcount
def test_split_paired_reads_2_fq():
    """Compare split-paired-reads.py output for ``paired.fq``.

    The script is run on the FASTQ test file; the ``.1`` and ``.2`` files
    it is expected to produce are compared record by record (name,
    sequence and accuracy) with the reference copies from the test data.
    """
    # reference input and expected outputs
    source_fq = utils.get_test_data('paired.fq')
    expected_1 = utils.get_test_data('paired.fq.1')
    expected_2 = utils.get_test_data('paired.fq.2')

    # paths where the actual outputs are looked for
    actual_1 = utils.get_temp_filename('paired.fq.1')
    workdir = os.path.dirname(actual_1)
    actual_2 = utils.get_temp_filename('paired.fq.2', workdir)

    utils.runscript(scriptpath('split-paired-reads.py'), [source_fq],
                    workdir)

    assert os.path.exists(actual_1), actual_1
    assert os.path.exists(actual_2), actual_2

    for expected_fn, actual_fn in ((expected_1, actual_1),
                                   (expected_2, actual_2)):
        compared = 0
        for want, got in zip(screed.open(expected_fn),
                             screed.open(actual_fn)):
            compared += 1
            assert want.name == got.name
            assert want.sequence == got.sequence
            assert want.accuracy == got.accuracy
        # at least one record pair must have been compared
        assert compared > 0
def test_extract_long_sequences():
    """Run extract-long-sequences.py on FASTQ and FASTA input.

    Each input is a copy of the ``paired-mixed`` test data.  The script is
    run with ``-l 10`` and an explicit ``-o`` output path, and the number
    of lines in the file it writes is checked.

    Earlier versions of this test passed the literal strings
    ``'fq_outfile'`` / ``'fa_outfile'`` to ``-o`` and counted the lines of
    the *input* files, so the script's output was never examined.
    """
    script = scriptpath('extract-long-sequences.py')

    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    # NOTE(review): the expected counts (44 / 22) are carried over from the
    # original assertions, which were made against the input files; this
    # assumes every record passes '-l 10' and is written in the same
    # number of lines -- confirm.
    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fq)

    with open(fq_outfile) as fq_fp:
        countlines = sum(1 for _ in fq_fp)
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    with open(fa_outfile) as fa_fp:
        countlines = sum(1 for _ in fa_fp)
    assert countlines == 22, countlines
def test_partition_graph_nojoin_stoptags():
    """Partition a graph with a stop tag present and expect two partitions.

    A stop tag is added to the graph built from ``random-20-a.fa`` and
    saved; partition-graph.py is run with ``--stoptags`` followed by
    merge-partitions.py.  The merged partition map must then yield
    ``(2, 0)`` from ``count_partitions()``.
    """
    # test with stoptags
    base = _make_graph(utils.get_test_data('random-20-a.fa'))
    presence_fn = base + '.pt'

    # add in some stop tags
    table = khmer.load_hashbits(presence_fn)
    table.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_fn = base + '.stoptags'
    table.save_stop_tags(stoptags_fn)
    del table

    # run script with stoptags option
    utils.runscript(scriptpath('partition-graph.py'),
                    ['--stoptags', stoptags_fn, base])

    utils.runscript(scriptpath('merge-partitions.py'),
                    [base, '-k', str(20)])

    merged_pmap = base + '.pmap.merged'
    assert os.path.exists(merged_pmap)

    table = khmer.load_hashbits(presence_fn)
    table.load_tagset(base + '.tagset')
    table.load_partitionmap(merged_pmap)

    counts = table.count_partitions()
    assert counts == (2, 0), counts  # should be 2 partitions
def test_feature_extraction_kmer():
    """Run feature_extraction_kmer.py and spot-check its vector file.

    The script is run on a copy of ``example.mito.fasta.shreded.subset``
    with taxid ``12345`` and label ``taxid``.  The output must contain 54
    lines; selected prefixes and suffixes of the first, second and last
    lines are compared against fixed strings.
    """
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)
    outfile = infile + '.kmer_vector'

    script = scriptpath('feature_extraction_kmer.py')
    # The 'mmp' and 'tmp' paths that used to be computed and printed here
    # were never passed to the script, so they have been dropped.
    print(in_dir)
    args = [
        "--input", infile,
        "--outfile", outfile,
        "--taxid", "12345",
        "--label", "taxid",
    ]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile), outfile
    print(outfile)

    # Close the handle deterministically instead of leaking it.
    with open(outfile) as vector_fp:
        data = [x.strip() for x in vector_fp]
    print(len(data))

    assert len(data) == 54
    # startswith()/endswith() already return booleans; no '== True' needed.
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith(
        "0.00300180108065\t0.00640384230538\t0.00380228136882")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|304493..309493")
    assert data[-1].endswith(
        "0.00300180108065\t0.00500300180108\t0.00200120072043")

    utils.cleanup()
def test_sample_reads_randomly_S():
    """Run sample-reads-randomly.py with ``-S 3`` and a fixed seed.

    First a run with ``-o`` and two input files is expected to return
    status -1.  Then a run on ``test.fq`` must produce ``.subset.0``,
    ``.subset.1`` and ``.subset.2`` files whose read names match the
    fixed sets below.
    """
    reads_fn = utils.get_temp_filename('test.fq')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), reads_fn)

    script = scriptpath('sample-reads-randomly.py')

    # fix random number seed for reproducibility
    base_args = ['-N', '10', '-R', '1', '-S', '3']

    # this argument combination is expected to fail
    bad_args = base_args + ['-o', 'test', 'test.fq', 'test.fq']
    (status, out, err) = utils.runscript(script, bad_args, workdir,
                                         fail_ok=True)
    assert status == -1, (status, out, err)

    utils.runscript(script, base_args + ['test.fq'], workdir)

    # expected read names, keyed by output suffix, checked in this order
    expected_by_suffix = [
        ('.subset.0', set(['895:1:1:1298:13380', '895:1:1:1347:3237',
                           '895:1:1:1295:6189', '895:1:1:1342:11001',
                           '895:1:1:1252:19493', '895:1:1:1318:10532',
                           '895:1:1:1314:10430', '895:1:1:1347:8723',
                           '895:1:1:1381:4958', '895:1:1:1338:6614'])),
        ('.subset.1', set(['895:1:1:1384:20217', '895:1:1:1347:3237',
                           '895:1:1:1348:18672', '895:1:1:1290:11501',
                           '895:1:1:1386:7536', '895:1:1:1373:13994',
                           '895:1:1:1355:13535', '895:1:1:1303:6251',
                           '895:1:1:1381:4958', '895:1:1:1338:6614'])),
        ('.subset.2', set(['895:1:1:1326:7273', '895:1:1:1384:20217',
                           '895:1:1:1347:3237', '895:1:1:1353:6642',
                           '895:1:1:1340:19387', '895:1:1:1252:19493',
                           '895:1:1:1381:7062', '895:1:1:1383:3089',
                           '895:1:1:1342:20695', '895:1:1:1303:6251'])),
    ]

    for suffix, expected_names in expected_by_suffix:
        subset_fn = reads_fn + suffix
        assert os.path.exists(subset_fn), subset_fn

        names = set([rec.name for rec in screed.open(subset_fn)])
        print(names)
        assert names == expected_names
def test_normalize_by_median_impaired():
    """Run normalize-by-median.py in ``-p`` mode on the 'impaired' reads.

    The script is invoked with ``fail_ok=True`` on a copy of
    ``test-abund-read-impaired.fa``.
    """
    CUTOFF = '1'

    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-abund-read-impaired.fa'),
                    reads_fn)

    script = scriptpath('normalize-by-median.py')
    cmdline = ['-C', CUTOFF, '-p', '-k', '17', reads_fn]

    # NOTE(review): the result of the run is discarded and nothing is
    # asserted; presumably a failure is expected here -- confirm and add
    # a check on status/stderr.
    utils.runscript(script, cmdline, workdir, fail_ok=True)
def test_sample_reads_randomly_S():
    """Run sample-reads-randomly.py with ``-S 3`` and a fixed seed.

    First a run with ``-o`` and two input files is expected to return
    status -1.  Then a run on ``test.fq`` must produce ``.subset.0``,
    ``.subset.1`` and ``.subset.2`` files whose read names match the
    fixed sets below.
    """
    reads_fn = utils.get_temp_filename('test.fq')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), reads_fn)

    script = scriptpath('sample-reads-randomly.py')

    # fix random number seed for reproducibility
    base_args = ['-N', '10', '-R', '1', '-S', '3']

    # this argument combination is expected to fail
    bad_args = base_args + ['-o', 'test', 'test.fq', 'test.fq']
    (status, out, err) = utils.runscript(script, bad_args, workdir,
                                         fail_ok=True)
    assert status == -1, (status, out, err)

    utils.runscript(script, base_args + ['test.fq'], workdir)

    # expected read names, keyed by output suffix, checked in this order
    expected_by_suffix = [
        ('.subset.0', set(['895:1:1:1298:13380', '895:1:1:1347:3237',
                           '895:1:1:1295:6189', '895:1:1:1342:11001',
                           '895:1:1:1252:19493', '895:1:1:1318:10532',
                           '895:1:1:1314:10430', '895:1:1:1347:8723',
                           '895:1:1:1381:4958', '895:1:1:1338:6614'])),
        ('.subset.1', set(['895:1:1:1384:20217', '895:1:1:1347:3237',
                           '895:1:1:1348:18672', '895:1:1:1290:11501',
                           '895:1:1:1386:7536', '895:1:1:1373:13994',
                           '895:1:1:1355:13535', '895:1:1:1303:6251',
                           '895:1:1:1381:4958', '895:1:1:1338:6614'])),
        ('.subset.2', set(['895:1:1:1326:7273', '895:1:1:1384:20217',
                           '895:1:1:1347:3237', '895:1:1:1353:6642',
                           '895:1:1:1340:19387', '895:1:1:1252:19493',
                           '895:1:1:1381:7062', '895:1:1:1383:3089',
                           '895:1:1:1342:20695', '895:1:1:1303:6251'])),
    ]

    for suffix, expected_names in expected_by_suffix:
        subset_fn = reads_fn + suffix
        assert os.path.exists(subset_fn), subset_fn

        names = set([rec.name for rec in screed.open(subset_fn)])
        print(names)
        assert names == expected_names
def test_trim_low_abund_1_duplicate_filename_err():
    """Passing the same input file twice to trim-low-abund.py must fail.

    The test passes when ``utils.runscript`` raises ``AssertionError``;
    if the call returns normally a generic ``Exception`` is raised.
    """
    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), reads_fn)

    cmdline = ["-k", "17", "-x", "1e7", "-N", "2", '-C', '1',
               reads_fn, reads_fn]

    try:
        utils.runscript('trim-low-abund.py', cmdline, workdir, sandbox=True)
    except AssertionError:
        # an error should be raised by passing 'infile' twice.
        return

    raise Exception("should not reach this")
def test_normalize_by_median_empty():
    """normalize-by-median.py on ``test-empty.fa`` must write a .keep file."""
    CUTOFF = '1'

    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-empty.fa'), reads_fn)

    utils.runscript(scriptpath('normalize-by-median.py'),
                    ['-C', CUTOFF, '-k', '17', reads_fn],
                    workdir)

    kept_fn = reads_fn + '.keep'
    assert os.path.exists(kept_fn), kept_fn
def test_trim_low_abund_4_retain_low_abund():
    """Run trim-low-abund.py with ``-V`` and check the trimmed output.

    The ``.abundtrim`` file must exist, hold two unique sequences, and
    include ``GGTTGACGGGGCTCAGGG``.
    """
    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), reads_fn)

    cmdline = ["-k", "17", "-x", "1e7", "-N", "2", '-V', reads_fn]
    utils.runscript('trim-low-abund.py', cmdline, workdir, sandbox=True)

    trimmed_fn = reads_fn + '.abundtrim'
    assert os.path.exists(trimmed_fn), trimmed_fn

    unique_seqs = set(rec.sequence for rec in screed.open(trimmed_fn))
    assert len(unique_seqs) == 2, unique_seqs
    assert 'GGTTGACGGGGCTCAGGG' in unique_seqs
def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
    """Build a counting table with load-into-counting.py and return its path.

    ``SIZE``, ``N`` and ``K`` are passed to the script as ``-x``, ``-N``
    and ``-k``.  When ``BIGCOUNT`` is false, ``-b`` is added as well.  The
    table is written to a temporary ``out.kh`` path, whose existence is
    asserted before it is returned.
    """
    script = scriptpath('load-into-counting.py')

    options = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]
    bigcount_flag = [] if BIGCOUNT else ['-b']

    table_fn = utils.get_temp_filename('out.kh')
    cmdline = options + bigcount_flag + [table_fn, infilename]

    utils.runscript(script, cmdline)

    assert os.path.exists(table_fn)
    return table_fn
def test_sweep_reads_3():
    """Run sweep-reads.py with ``--label-by-group 10`` and check outputs.

    Ten ``test_<i>.fa`` files must be created (each is removed after the
    check), every count in ``test.counts.csv`` must be 9 or 10,
    ``test.dist.txt`` must exist and ``test_multi.fa`` must not.
    """
    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)

    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix', 'test',
            '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in range(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print('%s %s %s' % (p, err, out))
        assert os.path.exists(p), p
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    # Check existence *before* opening: previously this assertion came
    # after the read, where it could never fail.
    assert os.path.exists(counts_fn), counts_fn
    with open(counts_fn) as cfp:
        for line in cfp:
            # the count is everything after the first comma
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
def test_load_graph():
    """Run load-graph.py on ``random-20-a.fa`` and sanity-check the result.

    stderr must report ``Total number of k-mers: 3959``; the ``.pt`` and
    ``.tagset`` files must exist; and a subset partition over the loaded
    table must count as ``(1, 0)``.
    """
    out_base = utils.get_temp_filename('out')
    reads_fn = utils.get_test_data('random-20-a.fa')

    cmdline = ['-x', '1e7', '-N', '2', '-k', '20', '-t', out_base, reads_fn]
    (status, out, err) = utils.runscript(scriptpath('load-graph.py'),
                                         cmdline)

    assert 'Total number of k-mers: 3959' in err, err

    presence_fn = out_base + '.pt'
    assert os.path.exists(presence_fn), presence_fn

    tagset_fn = out_base + '.tagset'
    assert os.path.exists(tagset_fn), tagset_fn

    table = khmer.load_hashbits(presence_fn)
    table.load_tagset(tagset_fn)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = table.do_subset_partition(0, 0)
    counts = table.subset_count_partitions(subset)
    assert counts == (1, 0), counts
def test_normalize_by_median_dumpfrequency():
    """Run normalize-by-median.py with ``-d 2`` over five input files.

    Five copies of ``test-fastq-reads.fq`` are processed.  The test checks
    that ``backup.ct`` exists in the working directory, that the table
    loaded from it has non-zero counts for the first 17-mer of two known
    reads, that stdout contains ``Backup: Saving`` exactly twice, and
    that it contains ``Nothing``.
    """
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                    tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    # Assert the backup table exists before loading it, so a missing file
    # is reported by this check rather than by the loader.
    backup_table = os.path.join(in_dir, 'backup.ct')
    assert os.path.exists(backup_table), backup_table
    test_ht = khmer.load_counting_hash(backup_table)

    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
def test_do_partition():
    """Run do-partition.py and expect a single partition labelled '2'."""
    reads_fn = utils.get_test_data('random-20-a.fa')
    out_base = utils.get_temp_filename('out')
    workdir = os.path.dirname(out_base)

    utils.runscript(scriptpath('do-partition.py'),
                    ["-k", "20", out_base, reads_fn],
                    workdir)

    part_fn = os.path.join(workdir, 'random-20-a.fa.part')

    # the partition label is the second tab-separated field of the name
    labels = set(rec.name.split('\t')[1] for rec in screed.open(part_fn))

    assert '2' in labels
    assert len(labels) == 1
def test_count_overlap():
    """Run count-overlap.py on the two overlap test files.

    A graph is built from the first file with ``_make_graph`` and the
    script is run against the second.  The status must be 0, and the
    output and ``.curve`` files must contain the fixed lines below.
    """
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)

    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize',
            '10000000', htfile + '.pt', seqfile2, outfile]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0

    assert os.path.exists(outfile), outfile
    # Context managers close the handles instead of leaking them.
    with open(outfile) as out_fp:
        data = set(x.strip() for x in out_fp)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data

    assert os.path.exists(curvefile), curvefile
    with open(curvefile) as curve_fp:
        data = set(x.strip() for x in curve_fp)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
def test_normalize_by_median_force():
    """Run normalize-by-median.py with ``-f`` on a corrupt and a good file.

    The test checks that ``<corrupt_infile>.ct.failed`` exists, that the
    table loaded from it has non-zero counts for the first 17-mer of two
    known reads, and that stderr contains ``*** Skipping`` and
    ``** IOErrors``.
    """
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    # Assert the table file exists before loading it, so a missing file
    # is reported by this check rather than by the loader.
    failed_table = corrupt_infile + '.ct.failed'
    assert os.path.exists(failed_table), failed_table
    test_ht = khmer.load_counting_hash(failed_table)

    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert '*** Skipping' in err
    assert '** IOErrors' in err
def test_count_overlap():
    """Run count-overlap.py on the two overlap test files.

    A graph is built from the first file with ``_make_graph`` and the
    script is run against the second.  The status must be 0, and the
    output and ``.curve`` files must contain the fixed lines below.
    """
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)

    script = scriptpath('count-overlap.py')
    args = [
        '--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
        htfile + '.pt', seqfile2, outfile
    ]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0

    assert os.path.exists(outfile), outfile
    # Context managers close the handles instead of leaking them.
    with open(outfile) as out_fp:
        data = set(x.strip() for x in out_fp)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data

    assert os.path.exists(curvefile), curvefile
    with open(curvefile) as curve_fp:
        data = set(x.strip() for x in curve_fp)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
def test_normalize_by_median_force():
    """Run normalize-by-median.py with ``-f`` on a corrupt and a good file.

    The test checks that ``<corrupt_infile>.ct.failed`` exists, that the
    table loaded from it has non-zero counts for the first 17-mer of two
    known reads, and that stderr contains ``*** Skipping`` and
    ``** IOErrors``.
    """
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    # Assert the table file exists before loading it, so a missing file
    # is reported by this check rather than by the loader.
    failed_table = corrupt_infile + '.ct.failed'
    assert os.path.exists(failed_table), failed_table
    test_ht = khmer.load_counting_hash(failed_table)

    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert '*** Skipping' in err
    assert '** IOErrors' in err
def test_normalize_by_median_dumpfrequency():
    """Run normalize-by-median.py with ``-d 2`` over five input files.

    Five copies of ``test-fastq-reads.fq`` are processed.  The test checks
    that ``backup.ct`` exists in the working directory, that the table
    loaded from it has non-zero counts for the first 17-mer of two known
    reads, that stdout contains ``Backup: Saving`` exactly twice, and
    that it contains ``Nothing``.
    """
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                    tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    # Assert the backup table exists before loading it, so a missing file
    # is reported by this check rather than by the loader.
    backup_table = os.path.join(in_dir, 'backup.ct')
    assert os.path.exists(backup_table), backup_table
    test_ht = khmer.load_counting_hash(backup_table)

    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
def test_abundance_dist_single_nobigcount():
    """Run abundance-dist-single.py with ``-b`` and check the first lines.

    The first two lines of the distribution file must be
    ``1 96 96 0.98`` and ``255 2 98 1.0``.
    """
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

    # 'with' closes the handle; next() replaces the Python-2-only
    # fp.next() method and behaves the same on a file iterator.
    with open(outfile) as fp:
        line = next(fp).strip()
        assert line == '1 96 96 0.98', line
        line = next(fp).strip()
        assert line == '255 2 98 1.0', line
def test_normalize_by_median_indent():
    """Run normalize-by-median.py with ``--loadtable`` and expect status 0."""
    reads_fn = utils.get_test_data('paired-mixed.fa.pe')
    table_fn = utils.get_test_data('normC20k20.kh')

    cmdline = ['--loadtable', table_fn, reads_fn]
    (status, out, err) = utils.runscript(
        scriptpath('normalize-by-median.py'), cmdline)

    assert status == 0, (out, err)
    print(out, err)
def test_extract_partitions_no_groups():
    """extract-partitions.py on an empty file must not write a group file."""
    scratch_fn = utils.get_temp_filename('empty-file')
    shutil.copyfile(utils.get_test_data('empty-file'), scratch_fn)
    workdir = os.path.dirname(scratch_fn)

    # ok, now run extract-partitions.
    utils.runscript(scriptpath('extract-partitions.py'),
                    ['extracted', scratch_fn],
                    workdir, fail_ok=True)

    # No group files should be created
    first_group = os.path.join(workdir, 'extracted.group0000.fa')
    assert not os.path.exists(first_group)
def test_filter_abund_2():
    """Run filter-abund.py with the same input given twice.

    The ``.abundfilt`` file must exist, hold two unique sequences, and
    include ``GGTTGACGGGGCTCAGGG``.
    """
    reads_fn = utils.get_temp_filename('test.fa')
    workdir = os.path.dirname(reads_fn)
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), reads_fn)

    table_fn = _make_counting(reads_fn, K=17)

    utils.runscript(scriptpath('filter-abund.py'),
                    ['-C', '1', table_fn, reads_fn, reads_fn],
                    workdir)

    filtered_fn = reads_fn + '.abundfilt'
    assert os.path.exists(filtered_fn), filtered_fn

    unique_seqs = set(rec.sequence for rec in screed.open(filtered_fn))
    assert len(unique_seqs) == 2, unique_seqs
    assert 'GGTTGACGGGGCTCAGGG' in unique_seqs
def test_load_graph_multithread():
    """Invoke load-graph.py with ``-T 8`` on ``test-reads.fa``.

    No explicit assertions are made on the result of the run.
    """
    out_base = utils.get_temp_filename('test')
    reads_fn = utils.get_test_data('test-reads.fa')

    cmdline = ['-N', '4', '-x', '1e9', '-T', '8', out_base, reads_fn]

    (_status, _out, _err) = utils.runscript(scriptpath('load-graph.py'),
                                            cmdline)
def test_count_median():
    """Run count-median.py and compare its output lines to fixed values.

    A counting table with K=8 is built from a copy of
    ``test-abund-read-2.fa``; the ``.counts`` output must consist of
    exactly the two distinct lines asserted below.
    """
    infile = utils.get_temp_filename('test.fa')
    outfile = infile + '.counts'

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=8)

    script = scriptpath('count-median.py')
    args = [counting_ht, infile, outfile]
    utils.runscript(script, args)

    assert os.path.exists(outfile), outfile

    # Context manager closes the handle instead of leaking it.
    with open(outfile) as counts_fp:
        data = set(x.strip() for x in counts_fp)

    assert len(data) == 2, data
    assert 'seq 1001 1001.0 0.0 18' in data
    assert '895:1:37:17593:9954/1 1 103.803741455 303.702941895 114' in data
def test_abundance_dist_nobigcount():
    """Run abundance-dist.py on a table built with ``BIGCOUNT=False``.

    The first two lines of the distribution file must be
    ``1 96 96 0.98`` and ``255 2 98 1.0``.
    """
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    htfile = _make_counting(infile, K=17, BIGCOUNT=False)

    script = scriptpath('abundance-dist.py')
    args = ['-z', htfile, infile, outfile]
    utils.runscript(script, args, in_dir)

    # 'with' closes the handle; next() replaces the Python-2-only
    # fp.next() method and behaves the same on a file iterator.
    with open(outfile) as fp:
        line = next(fp).strip()
        assert line == '1 96 96 0.98', line
        line = next(fp).strip()
        assert line == '255 2 98 1.0', line
def test_load_graph_no_tags():
    """Run load-graph.py with ``-n`` and check that no tagset is written.

    The ``.pt`` file must exist and load to a truthy object via
    ``khmer.load_hashbits``, while the ``.tagset`` file must be absent.
    """
    out_base = utils.get_temp_filename('out')
    reads_fn = utils.get_test_data('random-20-a.fa')

    cmdline = ['-x', '1e7', '-N', '2', '-k', '20', '-n', out_base, reads_fn]
    utils.runscript(scriptpath('load-graph.py'), cmdline)

    presence_fn = out_base + '.pt'
    assert os.path.exists(presence_fn), presence_fn

    tagset_fn = out_base + '.tagset'
    assert not os.path.exists(tagset_fn), tagset_fn

    assert khmer.load_hashbits(presence_fn)