def annotate_fasta():
    """Write the transcriptome as FASTA with annotation-derived headers.

    Reads the GFF3 annotations from ``gff3_fn``; then, for every record
    parsed from ``transcriptome_fn``, selects the annotation rows whose
    ``seqid`` equals the record name, builds a header from them with
    ``generate_sequence_summary`` and writes the record to ``output_fn``.

    ``gff3_fn``, ``transcriptome_fn`` and ``output_fn`` are free variables
    defined outside this function.
    """
    annotations = GFF3Parser(gff3_fn).read()
    with open(output_fn, 'w') as fp:
        # The record index was never used, so iterate the parser directly.
        for record in ReadParser(transcriptome_fn):
            # NOTE(review): the name is interpolated into the query text; a
            # name containing a double quote would break the expression.
            df = annotations.query('seqid == "{0}"'.format(record.name))
            desc = generate_sequence_summary(record.name, record.sequence, df)
            fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))
def test_bzip2_decompression():
    """A bzip2-compressed FASTQ test file should yield exactly 100 reads."""
    parser = ReadParser(utils.get_test_data("100-reads.fq.bz2"))
    # Count the reads by exhausting the parser.
    n_seen = sum(1 for _ in parser)
    assert 100 == n_seen
def test_read_truncated():
    """Iterating a truncated FASTQ file must raise a descriptive IOError."""
    parser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for _ in parser:
            pass
    except IOError as exc:
        message = str(exc)
        assert "Sequence is empty" in message, message
    else:
        # Reaching the end of the file without an error is a failure.
        assert 0, "No exception raised on a truncated file"
def fix():
    """Rewrite FASTA headers and save the original-to-renamed mapping.

    Each record parsed from ``transcriptome_fn`` is written to ``output_fn``
    under the header returned by ``header_func``; the (original, renamed)
    name pairs are then written as CSV to ``names_fn``. All four names are
    free variables defined outside this function.
    """
    renamed_pairs = []
    with open(output_fn, 'w') as out:
        for rec in ReadParser(transcriptome_fn):
            new_header = header_func(rec.name)
            out.write('>{0}\n{1}\n'.format(new_header, rec.sequence))
            renamed_pairs.append((rec.name, new_header))

    mapping = pd.DataFrame(renamed_pairs, columns=['original', 'renamed'])
    mapping.to_csv(names_fn, index=False)
def test_iternext():
    """Pair iteration over a non-FASTA file should raise an IOError."""
    rparser = ReadParser(utils.get_test_data("fakelump.fa.stoptags.txt"))
    read_pairs = []
    try:
        for read_1, read_2 in rparser.iter_read_pairs():
            # list.append accepts a single argument, so store each pair as
            # one item instead of passing the two reads separately.
            read_pairs.append([read_1, read_2])
        assert 0, "Shouldn't be able to iterate over non FASTA file"
    except IOError as err:
        # ``except ... as`` and print() are valid on Python 2.6+ and 3.
        print(str(err))
def test_error_badly_formatted_file():
    """Opening a file without sequence formatting must raise OSError."""
    bad_path = utils.get_temp_filename('badly-formatted.fa')
    with open(bad_path, 'w') as handle:
        handle.write("not-sequence")

    # The error is expected from construction itself, before any iteration.
    with pytest.raises(OSError) as excinfo:
        ReadParser(bad_path)

    assert excinfo.match("contains badly formatted sequence")
def test_num_reads():
    """Check that ReadParser.num_reads matches the number of reads iterated."""
    parser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    # Exhaust the parser first; num_reads is compared afterwards.
    n_seen = sum(1 for _ in parser)
    assert n_seen == 100
    assert parser.num_reads == 100
def test_gzip_decompression_truncated():
    """Iterating a truncated gzip FASTQ file should raise an IOError."""
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        # ``except ... as`` and print() are valid on Python 2.6+ and 3.
        print(str(err))
def test_bzip2_decompression_truncated_pairiter():
    """Pair iteration over a truncated bzip2 file should raise an IOError."""
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except IOError as err:
        # ``except ... as`` and print() are valid on Python 2.6+ and 3.
        print(str(err))
def test_with_zero_threads():
    """Constructing a ReadParser with zero threads must raise ValueError."""
    zero_threads = 0
    try:
        ReadParser(utils.get_test_data("test-reads.fq.bz2"), zero_threads)
    except ValueError as exc:
        expected = 'Invalid thread number, must be integer greater than zero.'
        assert str(exc) == expected
    else:
        # Construction succeeding with zero threads is the failure case.
        assert 0, "should fail"
def test_constructor():
    """The ReadParser constructor should reject a non-integer thread count."""
    # Note: Using a data file with only one read.
    try:
        # The parser object itself is not needed; only the error matters.
        ReadParser(utils.get_test_data("single-read.fq"), "a")
        assert 0, ("ReadParser's constructor shouldn't accept a character for "
                   "the number of threads")
    except TypeError as err:
        # ``except ... as`` and print() are valid on Python 2.6+ and 3.
        print(str(err))
def test_badbzip2():
    """Opening or iterating an empty bzip2 file should raise an error.

    Either IOError or ValueError is accepted; the message is printed.
    """
    try:
        rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
        for read in rparser:
            pass
        assert 0, "this should fail"
    except (IOError, ValueError) as err:
        # Both handlers did the same thing, so they share one clause; the
        # ``as`` form and print() are valid on Python 2.6+ and 3.
        print(str(err))
def test_consume_seqfile_reads_parser(AnyTabletype):
    """consume_seqfile with a ReadParser should match per-record consume.

    One table consumes the FASTQ file through a ReadParser; a second table
    consumes each screed record's sequence. Both must report the same count
    for the k-mer 'CCGGC'.
    """
    via_parser = AnyTabletype(5)
    parser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))
    via_parser.consume_seqfile(parser)

    via_screed = AnyTabletype(5)
    for rec in screed.open(utils.get_test_data('test-fastq-reads.fq')):
        via_screed.consume(rec.sequence)

    probe_kmer = 'CCGGC'
    assert via_parser.get(probe_kmer) == via_screed.get(probe_kmer)
def test_read_properties():
    """Check name, sequence, annotations and accuracy of a parsed FASTQ read."""
    # Note: the data file contains only one read.
    parser = ReadParser(utils.get_test_data("single-read.fq"))

    expected_name = "895:1:1:1246:14654 1:N:0:NNNNN"
    expected_sequence = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    expected_accuracy = """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""

    # Verify the properties of every read in the data set.
    for rec in parser:
        assert rec.name == expected_name
        assert rec.sequence == expected_sequence
        assert rec.annotations == ""
        assert rec.accuracy == expected_accuracy
def test_bzip2_decompression_truncated():
    """Iterating a truncated bzip2 FASTQ file should raise an error.

    Either OSError or ValueError is accepted; the message is printed.
    """
    parser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for _ in parser:
            pass
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
def test_gzip_decompression_truncated_pairiter():
    """Pair iteration over a truncated gzip FASTQ file should raise an error.

    Either OSError or ValueError is accepted; the message is printed.
    """
    parser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for _ in parser.iter_read_pairs():
            pass
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
def test_num_reads_truncated():
    """num_reads should be 1 after iterating the truncated FASTQ file.

    An IOError during iteration is tolerated as long as its message
    mentions the empty sequence.
    """
    parser = ReadParser(utils.get_test_data("truncated.fq"))
    seen = 0
    try:
        # enumerate keeps ``seen`` at the count of reads yielded so far,
        # even if the parser raises part-way through.
        for seen, _ in enumerate(parser, 1):
            pass
    except IOError as exc:
        message = str(exc)
        assert "Sequence is empty" in message, message

    failure_note = "%d valid reads in file, got %d" % (seen, parser.num_reads)
    assert parser.num_reads == 1, failure_note
def test_read_properties_fa():
    """Check name and sequence of a parsed FASTA read, and that it has no quality."""
    # Note: the data file contains only one read.
    parser = ReadParser(utils.get_test_data("single-read.fa"))

    expected_name = "895:1:1:1246:14654 1:N:0:NNNNN"
    expected_sequence = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"

    # Verify the properties of every read in the data set.
    for rec in parser:
        print(rec.name)
        assert rec.name == expected_name
        assert rec.sequence == expected_sequence
        # An empty attribute should not exist on the read at all.
        assert not hasattr(rec, 'quality')
def test_read_pair_iterator_in_ignore_mode():
    """Pair iteration in ignore-unpaired mode should yield two matching pairs."""
    # NOTE(review): this unconditional failure stops the test before any of
    # the code below runs; presumably the test is disabled or marked as an
    # expected failure outside this view - confirm before removing.
    assert 0

    parser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))
    collected = []
    pair_iter = parser.iter_read_pairs(ReadParser.PAIR_MODE_IGNORE_UNPAIRED)
    for first, second in pair_iter:
        collected.append([first, second])
        # Both mates must share the first 19 characters of their names.
        assert first.name[:19] == second.name[:19]
    assert 2 == len(collected)
def test_read_properties():
    """Check name, sequence and quality of a parsed FASTQ read.

    The read must also lack an ``annotations`` attribute.
    """
    # Note: the data file contains only one read.
    parser = ReadParser(utils.get_test_data("single-read.fq"))

    expected_name = "895:1:1:1246:14654 1:N:0:NNNNN"
    expected_sequence = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    expected_quality = """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""

    # Verify the properties of every read in the data set.
    for rec in parser:
        assert rec.name == expected_name
        assert rec.sequence == expected_sequence
        # An empty attribute should not exist on the read at all.
        assert not hasattr(rec, 'annotations')
        assert rec.quality == expected_quality
def main():
    """Trim reads by k-mer abundance and write the trimmed records.

    Parses the command line, loads the countgraph named by the arguments,
    then passes every read of every input file through ``trim_record`` and
    writes the non-empty results. Output goes to the single output file
    when one was given, otherwise to ``<basename of input>.abundfilt`` in
    the current directory, one file per input.

    Exits with status 1 when input is read from stdin ('-' or
    '/dev/stdin') and no single output file was given.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)

        # Handles opened here for this input only; the shared single output
        # file is never closed by this loop.
        opened_fp = None
        writer = None
        try:
            if not args.single_output_file:
                outfile = os.path.basename(infile) + '.abundfilt'
                opened_fp = open(outfile, 'wb')
                writer = get_file_writer(opened_fp, args.gzip, args.bzip)
                outfp = writer

            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=ksize,
                                               force_single=True)

            for _, is_pair, read1, read2 in paired_iter:
                # force_single=True was requested, so no pairs are expected.
                assert not is_pair
                assert read2 is None

                trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                                args.variable_coverage,
                                                args.normalize_to)
                if trimmed_record:
                    write_record(trimmed_record, outfp)
        finally:
            # Close the per-input output so buffered data is flushed and the
            # descriptor is released before the next input is processed.
            # NOTE(review): assumes get_file_writer returns either the handle
            # it was given or a wrapper exposing close() - confirm.
            try:
                if writer is not None and writer is not opened_fp:
                    writer.close()
            finally:
                if opened_fp is not None:
                    opened_fp.close()

        log_info('output in {outfile}', outfile=outfile)
def test_read_pair_iterator_in_error_mode_xfail():
    """Default pair iteration over the impaired file must raise an IOError."""
    parser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))
    try:
        for _ in parser.iter_read_pairs():
            pass
    except IOError:
        raised = True
    else:
        # Iteration completed without an error.
        raised = False
    assert raised
def test_abund_dist_A_readparser(tabletype):
    """The abundance distribution of the all-A file should total 1.

    The zero-abundance bin must be empty.
    """
    all_a_path = utils.get_test_data('all-A.fa')
    # NOTE(review): the parser is constructed but the filename, not the
    # parser, is passed to abundance_distribution below - confirm whether
    # the parser was meant to be used here.
    rparser = ReadParser(all_a_path)

    table = tabletype(4, PRIMES_1m)
    tracker = khmer._Nodetable(4, PRIMES_1m)

    table.consume_seqfile(all_a_path)
    histogram = table.abundance_distribution(all_a_path, tracker)

    print(histogram[:10])
    assert sum(histogram) == 1
    assert histogram[0] == 0
def test_read_pair_iterator_in_error_mode_xfail():
    """Default pair iteration over the impaired file must raise ValueError.

    The error message has to mention the invalid read pair.
    """
    parser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))
    try:
        for _ in parser.iter_read_pairs():
            pass
    except ValueError as err:
        raised = True
        message = str(err)
        assert "Invalid read pair" in message, message
    else:
        # Iteration completed without an error.
        raised = False
    assert raised
def test_abund_dist_A_readparser(AnyTabletype):
    """The abundance distribution read through a ReadParser should total 1.

    The zero-abundance bin must be empty.
    """
    all_a_path = utils.get_test_data('all-A.fa')
    parser = ReadParser(all_a_path)

    table = AnyTabletype(4)
    tracker = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    # The table is filled from the filename; the distribution is then
    # computed from the parser.
    table.consume_seqfile(all_a_path)
    histogram = table.abundance_distribution(parser, tracker)

    print(histogram[:10])
    assert sum(histogram) == 1
    assert histogram[0] == 0
def test_consume_absentfasta_with_reads_parser():
    """consume_fasta_with_reads_parser must reject bad input.

    Calling it with no argument should raise TypeError; calling it with a
    parser over the empty file should raise OSError or ValueError.
    """
    table = khmer._Hashbits(31, [1])

    try:
        table.consume_fasta_with_reads_parser()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        table.consume_fasta_with_reads_parser(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
def test_consume_absentfasta():
    """Nodegraph.consume_seqfile must reject bad input.

    Calling it with no argument should raise TypeError; calling it with a
    parser over the empty file should raise OSError or ValueError.
    """
    graph = khmer.Nodegraph(31, 1, 1)

    try:
        graph.consume_seqfile()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
def test_with_default_arguments():
    """Sorted integer read names should equal their positions 0, 1, 2, ..."""
    # Note: Using a data file where read names are just integers on [0,99).
    parser = ReadParser(utils.get_test_data("random-20-a.fa"))

    # "Derandomize" by sorting the numeric names.
    ordered_names = sorted(int(rec.name) for rec in parser)

    # Each read number should match the corresponding name.
    for position, name in enumerate(ordered_names):
        assert position == name
def test_consume_absentfasta_with_reads_parser():
    """consume_fasta_with_reads_parser must reject bad input.

    Calling it with no argument should raise TypeError; calling it with a
    parser over the empty file should raise IOError or ValueError.
    """
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        # print() with one argument behaves the same on Python 2 and 3.
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countingtable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except (IOError, ValueError) as err:
        # Both handlers did the same thing, so they share one clause.
        print(str(err))
def test_consume_absentfasta_with_reads_parser():
    """consume_seqfile_with_reads_parser must reject bad input.

    Calling it with no argument should raise TypeError; calling it with a
    parser over the empty file should raise OSError or ValueError.
    """
    graph = khmer.Countgraph(4, 4**4, 4)

    try:
        graph.consume_seqfile_with_reads_parser()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile_with_reads_parser(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"