예제 #1
0
 def annotate_fasta():
     """Write the transcriptome as FASTA with annotation-derived headers.

     Reads annotations from ``gff3_fn`` with ``GFF3Parser``, then for every
     record yielded by ``ReadParser(transcriptome_fn)`` selects the
     annotation rows whose ``seqid`` equals the record name, turns them into
     a header with ``generate_sequence_summary`` and writes the record to
     ``output_fn``.

     ``gff3_fn``, ``transcriptome_fn`` and ``output_fn`` are free variables
     resolved from the surrounding scope.
     """
     annotations = GFF3Parser(gff3_fn).read()
     with open(output_fn, 'w') as fp:
         # The record index was never used, so iterate the parser directly.
         for record in ReadParser(transcriptome_fn):
             # NOTE(review): the record name is interpolated into the query
             # expression; a name containing a double quote would produce
             # an invalid query -- confirm names cannot contain one.
             df = annotations.query('seqid == "{0}"'.format(record.name))
             desc = generate_sequence_summary(record.name, record.sequence,
                                              df)
             # Strip surrounding whitespace so the header stays on one line.
             fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))
예제 #2
0
def test_bzip2_decompression():
    """Parse 100-reads.fq.bz2 and expect exactly 100 reads."""
    bz2_parser = ReadParser(utils.get_test_data("100-reads.fq.bz2"))

    # Exhaust the parser, counting one per read yielded.
    reads_count = sum(1 for _ in bz2_parser)

    assert 100 == reads_count
예제 #3
0
def test_read_truncated():
    """Iterating truncated.fq must raise IOError mentioning an empty sequence."""
    truncated_parser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for _ in truncated_parser:
            pass
    except IOError as exc:
        # The error text is shown on failure to ease debugging.
        assert "Sequence is empty" in str(exc), str(exc)
    else:
        # Reaching the end of the file without an error is a failure.
        assert 0, "No exception raised on a truncated file"
예제 #4
0
 def fix():
     """Rewrite the transcriptome with new headers and record the mapping.

     Each record from ``ReadParser(transcriptome_fn)`` is written to
     ``output_fn`` under the header returned by ``header_func``; the
     (original, renamed) pairs are then saved as CSV to ``names_fn``.
     All of these names come from the surrounding scope.
     """
     renamed_pairs = []
     with open(output_fn, 'w') as out:
         for rec in ReadParser(transcriptome_fn):
             new_header = header_func(rec.name)
             out.write('>{0}\n{1}\n'.format(new_header, rec.sequence))
             renamed_pairs.append((rec.name, new_header))
     mapping = pd.DataFrame(renamed_pairs, columns=['original', 'renamed'])
     mapping.to_csv(names_fn, index=False)
예제 #5
0
def test_iternext():
    """Pair iteration over a stoptags text file is expected to raise IOError."""
    rparser = ReadParser(utils.get_test_data("fakelump.fa.stoptags.txt"))
    read_pairs = []
    try:
        for read_1, read_2 in rparser.iter_read_pairs():
            # list.append() accepts a single argument, so store the pair as
            # one tuple instead of passing two positional arguments.
            read_pairs.append((read_1, read_2))
        assert 0, "Shouldn't be able to iterate over non FASTA file"
    except IOError as err:
        # "except ... as" and print() work on both Python 2.6+ and 3.
        print(str(err))
예제 #6
0
def test_error_badly_formatted_file():
    """Opening a file whose content is not sequence data raises OSError."""
    bad_path = utils.get_temp_filename('badly-formatted.fa')
    with open(bad_path, 'w') as handle:
        handle.write("not-sequence")

    # The constructor itself is expected to reject the file.
    with pytest.raises(OSError) as excinfo:
        ReadParser(bad_path)

    assert excinfo.match("contains badly formatted sequence")
예제 #7
0
def test_num_reads():
    """Check that ReadParser.num_reads matches the number of reads iterated."""
    gz_parser = ReadParser(utils.get_test_data("100-reads.fq.gz"))

    # Consume the whole file, counting one per read yielded.
    reads_count = sum(1 for _ in gz_parser)

    assert reads_count == 100
    assert gz_parser.num_reads == 100
예제 #8
0
def test_gzip_decompression_truncated():
    """Iterating a truncated gzip read file is expected to raise IOError."""

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        # "except ... as" and print() are valid on Python 2.6+ and 3; the
        # previous comma/statement forms are a SyntaxError on Python 3.
        print(str(err))
예제 #9
0
def test_bzip2_decompression_truncated_pairiter():
    """Pair iteration over a truncated bzip2 file is expected to raise IOError."""

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except IOError as err:
        # "except ... as" and print() are valid on Python 2.6+ and 3; the
        # previous comma/statement forms are a SyntaxError on Python 3.
        print(str(err))
예제 #10
0
def test_with_zero_threads():
    """A thread count of zero must be rejected with a specific ValueError."""
    thread_count = 0
    try:
        ReadParser(utils.get_test_data("test-reads.fq.bz2"), thread_count)
    except ValueError as exc:
        # The exact message is part of the contract being tested.
        assert str(exc) == \
            'Invalid thread number, must be integer greater than zero.'
    else:
        assert 0, "should fail"
예제 #11
0
def test_constructor():
    """ReadParser must raise TypeError when the thread count is a string."""

    # Note: Using a data file with only one read.
    try:
        rparser = ReadParser(utils.get_test_data("single-read.fq"), "a")
        assert 0, ("ReadParser's constructor shouldn't accept a character for "
                   "the number of threads")
    except TypeError as err:
        # "except ... as" and print() are valid on Python 2.6+ and 3; the
        # previous comma/statement forms are a SyntaxError on Python 3.
        print(str(err))
예제 #12
0
def test_badbzip2():
    """Reading test-empty.fa.bz2 is expected to raise IOError or ValueError."""
    try:
        rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
        for read in rparser:
            pass
        assert 0, "this should fail"
    # Both handlers now use "except ... as" and print(), which are valid on
    # Python 2.6+ and 3 (the comma/statement forms fail to parse on 3).
    except IOError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
예제 #13
0
def test_consume_seqfile_reads_parser(AnyTabletype):
    """Consuming via a ReadParser must count k-mers like consuming via screed."""
    # Table fed in one call from a ReadParser.
    via_parser = AnyTabletype(5)
    parser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))
    via_parser.consume_seqfile(parser)

    # Reference table fed one sequence at a time from screed.
    via_screed = AnyTabletype(5)
    for rec in screed.open(utils.get_test_data('test-fastq-reads.fq')):
        via_screed.consume(rec.sequence)

    assert via_parser.get('CCGGC') == via_screed.get('CCGGC')
예제 #14
0
def test_read_properties():
    """Check name, sequence, annotations and accuracy of the read in single-read.fq."""

    # The data file contains only one read.
    single_read_parser = ReadParser(utils.get_test_data("single-read.fq"))

    # Every read yielded must carry exactly these attribute values.
    for rec in single_read_parser:
        assert rec.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert rec.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        assert rec.annotations == ""
        assert rec.accuracy == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""
def test_bzip2_decompression_truncated():
    """Iterating a truncated bzip2 file must raise OSError or ValueError."""
    truncated_parser = ReadParser(
        utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for _ in truncated_parser:
            pass
    except (OSError, ValueError) as exc:
        # Either error type is accepted; show the message for the log.
        print(str(exc))
    else:
        assert 0, "this should fail"
def test_gzip_decompression_truncated_pairiter():
    """Pair iteration over a truncated gzip file must raise OSError or ValueError."""
    truncated_parser = ReadParser(
        utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for _ in truncated_parser.iter_read_pairs():
            pass
    except (OSError, ValueError) as exc:
        # Either error type is accepted; show the message for the log.
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #17
0
def test_num_reads_truncated():
    """After a truncated file errors out, num_reads must report one read."""
    iterated = 0
    truncated_parser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        # Count with an explicit loop so the tally survives the exception.
        for _ in truncated_parser:
            iterated += 1
    except IOError as exc:
        assert "Sequence is empty" in str(exc), str(exc)
    assert truncated_parser.num_reads == 1, (
        "%d valid reads in file, got %d" % (iterated,
                                            truncated_parser.num_reads))
예제 #18
0
def test_read_properties_fa():
    """Check the read in single-read.fa: name, sequence, and no quality."""

    # The data file contains only one read.
    fasta_parser = ReadParser(utils.get_test_data("single-read.fa"))

    for rec in fasta_parser:
        print(rec.name)
        assert rec.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert rec.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # An attribute with no value must be absent rather than empty.
        assert not hasattr(rec, 'quality')
예제 #19
0
def test_read_pair_iterator_in_ignore_mode():
    """Pair iteration that ignores unpaired reads should yield two pairs."""
    # NOTE(review): this unconditional failure makes everything below
    # unreachable, so the test always fails -- presumably it was disabled
    # on purpose; confirm before removing.
    assert 0

    impaired_parser = ReadParser(
        utils.get_test_data("test-abund-read-impaired.fa"))
    pair_iter = impaired_parser.iter_read_pairs(
        ReadParser.PAIR_MODE_IGNORE_UNPAIRED)

    read_pairs = []
    for first, second in pair_iter:
        read_pairs.append([first, second])
        # Mates must share the first 19 characters of their names.
        assert first.name[:19] == second.name[:19]
    assert 2 == len(read_pairs)
예제 #20
0
def test_read_properties():
    """Check the read in single-read.fq: name, sequence, quality, no annotations."""

    # The data file contains only one read.
    single_read_parser = ReadParser(utils.get_test_data("single-read.fq"))

    for rec in single_read_parser:
        assert rec.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert rec.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # An attribute with no value must be absent rather than empty.
        assert not hasattr(rec, 'annotations')
        assert rec.quality == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""
예제 #21
0
def main():
    """Entry point for filter-abund.py: trim reads against a countgraph.

    Parses the command line, loads the countgraph named by
    ``args.input_graph``, then passes every read of every input file
    through ``trim_record`` and writes the records that come back
    non-empty.  Output goes to the single output file given on the command
    line when there is one; otherwise each input gets its own
    ``<basename>.abundfilt`` file in the current directory.

    Exits with status 1 when an input is stdin and no single output file
    was provided.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # Writer shared by all inputs; stays None in per-input output mode.
    shared_writer = None
    if args.single_output_file:
        outfile = args.single_output_file.name
        shared_writer = get_file_writer(args.single_output_file, args.gzip,
                                        args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        outfp = shared_writer
        # Raw handle opened for this input only; None in single-output mode.
        raw_fp = None
        try:
            if not args.single_output_file:
                outfile = os.path.basename(infile) + '.abundfilt'
                raw_fp = open(outfile, 'wb')
                outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=ksize,
                                               force_single=True)

            for _, is_pair, read1, read2 in paired_iter:
                # force_single=True is passed above, so pairs are not
                # expected here.
                assert not is_pair
                assert read2 is None

                trimmed_record, _ = trim_record(countgraph, read1,
                                                args.cutoff,
                                                args.variable_coverage,
                                                args.normalize_to)
                if trimmed_record:
                    write_record(trimmed_record, outfp)
        finally:
            # Per-input files were previously left open until interpreter
            # exit; close them as soon as the input is done.  The single
            # output handle belongs to the caller and is left untouched.
            if raw_fp is not None:
                try:
                    # NOTE(review): assumes closing the object returned by
                    # get_file_writer is what finalizes any compression
                    # layer -- confirm against its implementation.
                    if outfp is not None and outfp is not raw_fp:
                        outfp.close()
                finally:
                    raw_fp.close()

        log_info('output in {outfile}', outfile=outfile)
예제 #22
0
def test_read_pair_iterator_in_error_mode_xfail():
    """Default pair iteration over an impaired file must raise IOError."""
    impaired_parser = ReadParser(
        utils.get_test_data("test-abund-read-impaired.fa"))

    try:
        for _ in impaired_parser.iter_read_pairs():
            pass
    except IOError:
        failed = True
    else:
        # Iteration finished without an error, which is not expected.
        failed = False
    assert failed
예제 #23
0
def test_abund_dist_A_readparser(tabletype):
    """Abundance distribution of all-A.fa must have one entry and none at 0."""
    all_a_path = utils.get_test_data('all-A.fa')
    # NOTE(review): this parser is constructed but never used; the
    # distribution below is computed from the filename instead, despite
    # the test's name -- confirm whether the parser was meant to be passed.
    rparser = ReadParser(all_a_path)

    counts = tabletype(4, PRIMES_1m)
    seen = khmer._Nodetable(4, PRIMES_1m)

    counts.consume_seqfile(all_a_path)
    dist = counts.abundance_distribution(all_a_path, seen)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0
def test_read_pair_iterator_in_error_mode_xfail():
    """Default pair iteration over an impaired file must raise ValueError."""
    impaired_parser = ReadParser(
        utils.get_test_data("test-abund-read-impaired.fa"))

    try:
        for _ in impaired_parser.iter_read_pairs():
            pass
    except ValueError as err:
        failed = True
        # The error must identify the problem as an invalid pair.
        assert "Invalid read pair" in str(err), str(err)
    else:
        failed = False
    assert failed
예제 #25
0
def test_abund_dist_A_readparser(AnyTabletype):
    """Abundance distribution computed from a ReadParser over all-A.fa."""
    all_a_path = utils.get_test_data('all-A.fa')
    parser = ReadParser(all_a_path)

    counts = AnyTabletype(4)
    seen = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    # Load counts from the file, then read the distribution via the parser.
    counts.consume_seqfile(all_a_path)
    dist = counts.abundance_distribution(parser, seen)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0
예제 #26
0
def test_consume_absentfasta_with_reads_parser():
    """Consuming with no parser, or a parser on an empty file, must fail."""
    table = khmer._Hashbits(31, [1])

    # Calling without the parser argument is a TypeError.
    try:
        table.consume_fasta_with_reads_parser()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    # An empty input file must raise OSError or ValueError.
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        table.consume_fasta_with_reads_parser(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #27
0
def test_consume_absentfasta():
    """consume_seqfile must fail with no argument and on an empty file."""
    graph = khmer.Nodegraph(31, 1, 1)

    # Calling without any argument is a TypeError.
    try:
        graph.consume_seqfile()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    # An empty input file must raise OSError or ValueError.
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #28
0
def test_with_default_arguments():
    """Read names in random-20-a.fa, once sorted, must equal their index."""
    # The data file uses plain integers as read names.
    parser = ReadParser(utils.get_test_data("random-20-a.fa"))

    # Sorting removes any dependence on the order reads are yielded in.
    ordered_names = sorted(int(rec.name) for rec in parser)

    # Each name must match its position in the sorted list.
    for position, name in enumerate(ordered_names):
        assert position == name
예제 #29
0
def test_consume_absentfasta_with_reads_parser():
    """Consuming with no parser, or a parser on an empty file, must fail."""
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    # print() replaces the Python 2 print statement throughout; it is valid
    # on both Python 2 and 3, whereas the statement form fails to parse on 3.
    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countingtable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
예제 #30
0
def test_consume_absentfasta_with_reads_parser():
    """Consuming with no parser, or a parser on an empty file, must fail."""
    graph = khmer.Countgraph(4, 4**4, 4)

    # Calling without the parser argument is a TypeError.
    try:
        graph.consume_seqfile_with_reads_parser()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    # An empty input file must raise OSError or ValueError.
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile_with_reads_parser(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"