def test_label_tag_correctness_save_load():
    """Check swept labels for reads A-D after a save/load round trip.

    A NodeGraphLabels is populated from ``test-labels.fa``, saved, deleted,
    and a fresh instance is loaded from the saved file before sweeping.
    """
    original = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    original.consume_seqfile_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # persist labels and tags, then drop the populated object
    savepath = utils.get_temp_filename('saved.labels')
    original.save_labels_and_tags(savepath)
    del original

    # the fresh object only knows what the saved file contains
    lb = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    lb.load_labels_and_tags(savepath)

    read_a = ('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
              'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_probe = ('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
                 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = ('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
              'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = ('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
              'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
              'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (with the extra diagnostic output)
    labels = list(lb.sweep_label_neighborhood(read_a))
    print(lb.sweep_tag_neighborhood(tag_probe))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    for read, expected in ((read_b, (0, 1, 2)),
                           (read_c, (1, 2)),
                           (read_d, (3,))):
        labels = list(lb.sweep_label_neighborhood(read))
        print(labels)
        assert len(labels) == len(expected)
        for wanted in expected:
            assert wanted in labels
def test_n_labels():
    """n_labels() reports 4 after consuming ``test-labels.fa``."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    print(labeller.n_labels())
    assert labeller.n_labels() == 4
def test_save_load_corrupted():
    """Every strict prefix of a saved labels file must fail to load.

    Labels are saved from a populated GraphLabels, then each truncation
    ``data[:i]`` (for ``i`` in ``range(len(data))``) is written to a temp
    file and loaded into a fresh GraphLabels; each load must raise OSError.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    lb = GraphLabels(20, 1e7, 4)

    # produce all possible truncated versions of this file; context managers
    # make sure no handle is left open, even if a write fails
    with open(savepath, 'rb') as saved:
        data = saved.read()

    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        with open(truncated, 'wb') as fp:
            fp.write(data[:i])

        try:
            lb.load_labels_and_tags(truncated)
        except OSError as err:
            print('expected failure for', i, ': ', str(err))
        else:
            assert 0, "this should not succeed -- truncated file len %d" % (
                i, )
def test_sweep_tag_neighborhood():
    """Sweeping the probe sequence yields exactly one known tag."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.graph.consume_fasta_and_tag(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_tag_neighborhood(
        'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
    assert len(found) == 1
    assert list(found) == [173473779682]
def test_sweep_tag_neighborhood():
    """Sweeping the probe sequence yields one tag, checked via pop()."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.graph.consume_fasta_and_tag(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_tag_neighborhood(
        'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
    assert len(found) == 1
    assert found.pop() == 173473779682
def test_sweep_label_neighborhood():
    """Sweeping the probe sequence yields the single label 0."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_label_neighborhood(
        'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
    assert len(found) == 1
    assert found.pop() == 0
def test_consume_partitioned_seqfile_and_label(Graphtype):
    """Consuming ``valid-read-testing.fq`` by partition gives 9 labels.

    ``Graphtype`` is called as ``Graphtype(15, *params_1m)`` to build the
    graph that the GraphLabels wraps.
    """
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # build the graph first, then wrap it with the labeller
    labeller = GraphLabels(Graphtype(15, *params_1m))
    labeller.consume_partitioned_fasta_and_tag_with_labels(reads_path)

    assert labeller.n_labels == 9
def test_get_tag_labels():
    """The known tag from ``single-read.fq`` carries exactly label 0."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    known_tag = 173473779682
    found = labeller.get_tag_labels(known_tag)
    assert len(found) == 1
    assert found.pop() == 0
def test_consume_sequence_and_tag_with_labels():
    """A sequence consumed with label 0 sweeps back to only that label."""
    wanted = 0
    seq = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    labeller = GraphLabels(20, 1e6, 4)
    labeller.consume_sequence_and_tag_with_labels(seq, wanted)

    swept = set(labeller.sweep_label_neighborhood(seq))
    assert wanted in swept
    assert len(swept) == 1
def test_consume_sequence_and_tag_with_labels():
    """A sequence consumed with label 0 sweeps back to only that label.

    The return value of ``consume_sequence_and_tag_with_labels`` is not
    checked by this test, so it is no longer bound to an unused local.
    """
    lb = GraphLabels(20, 1e6, 4)
    label = 0
    sequence = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    lb.consume_sequence_and_tag_with_labels(sequence, label)
    labels = set()
    labels.update(lb.sweep_label_neighborhood(sequence))

    assert label in labels
    assert len(labels) == 1
def test_load_wrong_fileversion():
    """Loading ``badversion-k32.tagset`` must raise a version OSError."""
    labeller = GraphLabels(20, 1e7, 4)

    # try to load a tagset from an old version
    bad_file = utils.get_test_data('badversion-k32.tagset')
    try:
        labeller.load_labels_and_tags(bad_file)
    except OSError as err:
        print(str(err))
        assert "Incorrect file format version" in str(err)
    else:
        assert 0, "this should not succeed - bad file type"
def test_get_label_dict():
    """get_label_dict() holds exactly the labels 0-3 for test-labels.fa."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    found = labeller.get_label_dict()
    wanted = [0, 1, 2, 3]

    # containment is checked in both directions
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
def test_get_all_labels():
    """get_all_labels() holds exactly the labels 0-3 for test-labels.fa."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    found = labeller.get_all_labels()
    wanted = [0, 1, 2, 3]

    # containment is checked in both directions
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
def test_save_load_corrupted():
    """Every strict prefix of a saved labels file must fail to load.

    Labels are saved from a populated GraphLabels, then each truncation
    ``data[:i]`` (for ``i`` in ``range(len(data))``) is written to a temp
    file and loaded into a fresh GraphLabels; each load must raise OSError.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    lb = GraphLabels(20, 1e7, 4)

    # produce all possible truncated versions of this file; context managers
    # make sure no handle is left open, even if a write fails
    with open(savepath, 'rb') as saved:
        data = saved.read()

    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        with open(truncated, 'wb') as fp:
            fp.write(data[:i])

        try:
            lb.load_labels_and_tags(truncated)
        except OSError as err:
            print('expected failure for', i, ': ', str(err))
        else:
            assert 0, "this should not succeed -- truncated file len %d" % (i,)
def test_consume_partitioned_fasta_and_tag_with_labels():
    """Partition-labelled reads all sweep back to the single label 2."""
    labeller = GraphLabels(20, 1e7, 4)
    partition_file = utils.get_test_data('real-partition-small.fa')

    # the call returns a pair; neither value is checked here
    _reads, _consumed = \
        labeller.consume_partitioned_fasta_and_tag_with_labels(partition_file)

    swept = set()
    for rec in screed.open(partition_file):
        swept.update(
            labeller.sweep_label_neighborhood(rec.sequence, 0, False, False))

    assert len(swept) == 1
    assert swept.pop() == 2
    assert labeller.n_labels() == 1
def test_n_labels():
    """The n_labels attribute reads 4 after consuming ``test-labels.fa``."""
    labeller = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    labeller.consume_seqfile_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    print(labeller.n_labels)
    assert labeller.n_labels == 4
def test_save_fail_readonly():
    """Saving labels onto a read-only file must raise OSError.

    The target file is created empty and then made read-only with mode
    ``0o444``. The previous literal ``0x444`` was hexadecimal (equal to
    ``0o2104``), which is not the read-only mode the test intends.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # create an empty file to save onto; the handle is closed on exit
    savepath = utils.get_temp_filename('saved.labels')
    with open(savepath, 'w'):
        pass

    # r--r--r--: nobody may write to the file
    os.chmod(savepath, 0o444)

    try:
        lb_pre.save_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "this should fail: read-only file"
def test_link_tag_and_label_using_string():
    """A tag added and linked via its k-mer string carries label 1."""
    labeller = GraphLabels(20, 1, 1)

    # work with the string form of the hash throughout
    kmer_str = labeller.reverse_hash(173473779682)
    labeller.add_tag(kmer_str)
    labeller.link_tag_and_label(kmer_str, 1)

    found = labeller.get_tag_labels(kmer_str)
    assert len(found) == 1
    assert found.pop() == 1
def test_link_tag_and_label_using_string_2():
    """A tag linked via its k-mer string is retrievable via its hash."""
    labeller = GraphLabels(20, 1, 1)
    hashed = 173473779682

    kmer_str = labeller.reverse_hash(hashed)
    labeller.add_tag(kmer_str)
    labeller.link_tag_and_label(kmer_str, 1)

    # look up with the numeric hash rather than the k-mer string
    found = labeller.get_tag_labels(hashed)
    assert len(found) == 1
    assert found.pop() == 1
def test_link_tag_and_label():
    """A tag added and linked by numeric hash carries exactly label 1."""
    labeller = GraphLabels.NodeGraphLabels(20, 1, 1)
    hashed = 173473779682

    labeller.add_tag(hashed)
    labeller.link_tag_and_label(hashed, 1)

    found = list(labeller.get_tag_labels(hashed))
    assert len(found) == 1
    assert found.pop() == 1
def test_consume_sequence_and_tag_with_labels_2():
    """A sequence consumed with a non-zero label sweeps back to only it."""
    wanted = 56  # randomly chosen / non-zero
    seq = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    labeller = GraphLabels.NodeGraphLabels(20, 1e6, 4)
    labeller.consume_sequence_and_tag_with_labels(seq, wanted)

    swept = set(labeller.sweep_label_neighborhood(seq))
    assert wanted in swept
    assert len(swept) == 1
def test_get_labels_save_load_wrong_ksize():
    """Labels saved with k=19 must not load into a k=20 GraphLabels."""
    saver = GraphLabels(19, 1e7, 4)
    saver.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the k=19 labels out and discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    saver.save_labels_and_tags(savepath)
    del saver

    # the loader is built with a different k-mer size
    loader = GraphLabels(20, 1e7, 4)
    try:
        loader.load_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
        assert "Incorrect k-mer size 19" in str(err)
    else:
        assert 0, "this should not succeed - different ksize"
def test_load_wrong_filetype():
    """Loading a tagset or a FASTA file as labels must raise OSError."""
    labeller = GraphLabels(20, 1e7, 4)

    # try to load a tagset
    tagset_path = utils.get_test_data('goodversion-k32.tagset')
    try:
        labeller.load_labels_and_tags(tagset_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file format type" in str(err)
    else:
        assert 0, "this should not succeed - bad file type"

    # try to load a nonsense file
    fasta_path = utils.get_test_data('all-A.fa')
    try:
        labeller.load_labels_and_tags(fasta_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file signature" in str(err)
    else:
        assert 0, "this should not succeed - bad file signature"
def test_label_tag_correctness():
    """Check the labels swept for reads A-D of ``test-labels.fa``."""
    lb = GraphLabels(20, 1e7, 4)
    lb.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    read_a = ('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
              'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_probe = ('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
                 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = ('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
              'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = ('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
              'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
              'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (with the extra diagnostic output)
    labels = lb.sweep_label_neighborhood(read_a)
    print(lb.sweep_tag_neighborhood(tag_probe))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    for read, expected in ((read_b, (0, 1, 2)),
                           (read_c, (1, 2)),
                           (read_d, (3,))):
        labels = lb.sweep_label_neighborhood(read)
        print(labels)
        assert len(labels) == len(expected)
        for wanted in expected:
            assert wanted in labels
def test_get_labels_save_load():
    """Labels 0-3 are recovered by GraphLabels.load() from a saved file."""
    saver = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    saver.consume_seqfile_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write labels out and discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    saver.save_labels_and_tags(savepath)
    del saver

    # load into a labeller built around a brand-new Nodegraph
    fresh_graph = Nodegraph(20, 1e7, 4)
    loaded = GraphLabels.load(savepath, fresh_graph)

    found = list(loaded.labels())
    wanted = [0, 1, 2, 3]

    # containment is checked in both directions
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
def test_consume_seqfile_and_tag_with_labels():
    """Consuming ``test-transcript.fa`` reads 3 records and makes 3 labels.

    Tags and swept neighborhoods are printed along the way as diagnostics.
    """
    lb = GraphLabels(20, 1e7, 4)
    first_read = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    transcripts = utils.get_test_data('test-transcript.fa')

    n_reads, _ = lb.consume_seqfile_and_tag_with_labels(transcripts)

    print("doing get")
    first_kmer = first_read[:20]
    assert lb.get(first_kmer)
    assert n_reads == 3

    print("doing n_labels")
    print(lb.n_labels())
    print("doing all labels")
    print(lb.get_all_labels())

    print("get tagset")
    for tag in lb.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))

    for rec in screed.open(transcripts):
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(rec.sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(rec.sequence, 40))

    assert lb.n_labels() == 3
def test_get_labels_for_sequence():
    """Labels for the single read equal the labels of its known tag."""
    lb = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    read_path = utils.get_test_data('single-read.fq')
    lb.consume_seqfile_and_tag_with_labels(read_path)

    # take the first sequence after reading every record in the file
    sequences = [rec.sequence for rec in screed.open(read_path)]
    by_sequence = list(lb.get_labels_for_sequence(sequences[0]))

    known_tag = 173473779682
    by_tag = list(lb.get_tag_labels(known_tag))

    assert by_sequence == by_tag
    assert len(by_sequence) == 1
    assert by_sequence.pop() == 0
def test_label_tag_correctness():
    """Check the labels swept for reads A-D of ``test-labels.fa``."""
    lb = GraphLabels(20, 1e7, 4)
    lb.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    read_a = ('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
              'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_probe = ('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
                 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = ('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
              'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = ('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
              'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
              'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (with the extra diagnostic output)
    labels = lb.sweep_label_neighborhood(read_a)
    print(lb.sweep_tag_neighborhood(tag_probe))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    for read, expected in ((read_b, (0, 1, 2)),
                           (read_c, (1, 2)),
                           (read_d, (3,))):
        labels = lb.sweep_label_neighborhood(read)
        print(labels)
        assert len(labels) == len(expected)
        for wanted in expected:
            assert wanted in labels
def test_get_label_dict_save_load():
    """get_label_dict() holds labels 0-3 after a save/load round trip."""
    saver = GraphLabels(20, 1e7, 4)
    saver.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write labels out and discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    saver.save_labels_and_tags(savepath)
    del saver

    # a fresh labeller is populated only from the saved file
    loader = GraphLabels(20, 1e7, 4)
    loader.load_labels_and_tags(savepath)

    found = loader.get_label_dict()
    wanted = [0, 1, 2, 3]

    # containment is checked in both directions
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
def test_save_fail_readonly():
    """Saving labels onto a read-only file must raise OSError.

    The target file is created empty and then made read-only with mode
    ``0o444``. The previous literal ``0x444`` was hexadecimal (equal to
    ``0o2104``), which is not the read-only mode the test intends.
    """
    lb_pre = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_seqfile_and_tag_with_labels(filename)

    # create an empty file to save onto; the handle is closed on exit
    savepath = utils.get_temp_filename('saved.labels')
    with open(savepath, 'w'):
        pass

    # r--r--r--: nobody may write to the file
    os.chmod(savepath, 0o444)

    try:
        lb_pre.save_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "this should fail: read-only file"
def test_get_label_dict_save_load_wrong_ksize():
    """Labels saved with k=19 must not load into a k=20 GraphLabels."""
    saver = GraphLabels(19, 1e7, 4)
    saver.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the k=19 labels out and discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    saver.save_labels_and_tags(savepath)
    del saver

    # the loader is built with a different k-mer size
    loader = GraphLabels(20, 1e7, 4)
    try:
        loader.load_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
        assert "Incorrect k-mer size 19" in str(err)
    else:
        assert 0, "this should not succeed - different ksize"
def test_load_wrong_filetype():
    """Loading a tagset or a FASTA file as labels must raise OSError."""
    labeller = GraphLabels.NodeGraphLabels(20, 1e7, 4)

    # try to load a tagset
    tagset_path = utils.get_test_data('goodversion-k32.tagset')
    try:
        labeller.load_labels_and_tags(tagset_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file format type" in str(err)
    else:
        assert 0, "this should not succeed - bad file type"

    # try to load a nonsense file
    fasta_path = utils.get_test_data('all-A.fa')
    try:
        labeller.load_labels_and_tags(fasta_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file signature" in str(err)
    else:
        assert 0, "this should not succeed - bad file signature"
def test_consume_fasta_and_tag_with_labels():
    """Consuming ``test-transcript.fa`` reads 3 records and makes 3 labels.

    Tags and swept neighborhoods are printed along the way as diagnostics.
    """
    lb = GraphLabels(20, 1e7, 4)
    first_read = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    transcripts = utils.get_test_data('test-transcript.fa')

    # the second element of the returned pair is not checked here
    n_reads, _consumed = lb.consume_fasta_and_tag_with_labels(transcripts)

    print("doing get")
    first_kmer = first_read[:20]
    assert lb.graph.get(first_kmer)
    assert n_reads == 3

    print("doing n_labels")
    print(lb.n_labels())
    print("doing label dict")
    print(lb.get_label_dict())

    print("get tagset")
    for tag in lb.graph.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))

    for rec in screed.open(transcripts):
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(rec.sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(rec.sequence, 40))

    assert lb.n_labels() == 3
def main():
    """Label a graph from ``input_fastp`` and sweep reads into label buffers.

    Steps, in order:

    1. Parse arguments, raise ``max_tablesize``/``ksize`` to the module
       constants ``MAX_HSIZE``/``MIN_KSIZE`` when they are below them, and
       check the input files and disk space.
    2. Pick the output extension (``fa``/``fq``) from the first record of the
       first input file.
    3. Consume ``input_fastp`` into a ``NodeGraphLabels``, labelling by
       partition id, by sequence index, or by fixed-size groups (the default,
       which also writes each group to ``{pref}_base_{g}.{ext}``).
    4. Sweep every read of every input file for labels and queue it in the
       ``ReadBufferManager`` under its label, ``'multi'`` or ``'orphaned'``.
    5. Write the label-count distribution and per-label read counts to
       ``{pref}.dist.txt`` and ``{pref}.counts.csv`` in the output directory.

    Changes from the previous revision: timing uses ``time.perf_counter()``
    (``time.clock()`` was removed in Python 3.8; note this is wall-clock
    time), group output files and read files are always closed, the unused
    ``total_t`` was dropped, and status strings no longer carry stray
    backslash-continuation residue.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    # values below the module constants are raised to them
    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = GraphLabels.NodeGraphLabels(K, HT_SIZE, N_HT)
    # NOTE(review): both handlers below print "exiting..." but execution
    # continues; kept as-is so a failed consume does not become a hard stop.
    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            outfp = None
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            # finish the previous group's file before
                            # starting the next one
                            outfp.close()
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(
                        record.sequence, label)
                    write_record(record, outfp)
            except OSError as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...',
                      file=sys.stderr)
            finally:
                # closing an already-closed file is a no-op
                if outfp is not None:
                    outfp.close()
    except OSError as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)

    print('done consuming input sequence. '
          'added {t} tags and {l} labels...'.format(
              t=ht.graph.n_tags, l=ht.n_labels))

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    start_t = time.perf_counter()
    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except OSError as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(fn=read_file),
                  file=sys.stderr)
            continue

        try:
            for n, record in enumerate(read_fp):
                if n % 50000 == 0:
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=n, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t),
                          file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = list(
                        ht.sweep_label_neighborhood(seq, traversal_range))
                except ValueError:
                    # deliberate best effort: a read the sweep rejects is
                    # skipped and counted nowhere
                    # NOTE(review): presumably reads the graph cannot
                    # process (e.g. too short) -- confirm
                    continue

                if hasattr(record, 'quality'):
                    seq_str = fmt_fastq(name, seq, record.quality, labels)
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                label_number_dist.append(len(labels))
                if labels:
                    n_labeled += 1
                    if len(labels) > 1:
                        output_buffer.queue(seq_str, 'multi')
                        n_mlabeled += 1
                        label_dict['multi'] += 1
                    else:
                        output_buffer.queue(seq_str, labels[0])
                        label_dict[labels[0]] += 1
                else:
                    n_orphaned += 1
                    output_buffer.queue(seq_str, 'orphaned')
                    label_dict['orphaned'] += 1
            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
        finally:
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)

    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def test_toobig():
    """Constructing a GraphLabels with table size 1e13 raises MemoryError."""
    try:
        GraphLabels(20, 1e13, 1)
    except MemoryError as err:
        print(str(err))
    else:
        assert 0, "This should fail."
def test_error_create():
    """Constructing NodeGraphLabels from all-None args raises TypeError."""
    try:
        GraphLabels.NodeGraphLabels(None, None, None)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "This should fail."
def test_label_tag_correctness_save_load():
    """Check swept labels for reads A-D after a save/load round trip.

    A GraphLabels is populated from ``test-labels.fa``, saved, deleted, and
    a fresh instance is loaded from the saved file before sweeping.
    """
    original = GraphLabels(20, 1e7, 4)
    original.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # persist labels and tags, then drop the populated object
    savepath = utils.get_temp_filename('saved.labels')
    original.save_labels_and_tags(savepath)
    del original

    # the fresh object only knows what the saved file contains
    lb = GraphLabels(20, 1e7, 4)
    lb.load_labels_and_tags(savepath)

    read_a = ('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
              'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_probe = ('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
                 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = ('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
              'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = ('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
              'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
              'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (with the extra diagnostic output)
    labels = lb.sweep_label_neighborhood(read_a)
    print(lb.sweep_tag_neighborhood(tag_probe))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    for read, expected in ((read_b, (0, 1, 2)),
                           (read_c, (1, 2)),
                           (read_d, (3,))):
        labels = lb.sweep_label_neighborhood(read)
        print(labels)
        assert len(labels) == len(expected)
        for wanted in expected:
            assert wanted in labels