Exemplo n.º 1
0
def main():
    """Filter FASTA files with a saved counting hash, one output per input.

    Command line: ``<counting_ht> <infile> [<infile> ...]``.

    The counting hash saved in ``sys.argv[1]`` is loaded once.  Each
    remaining argument is then streamed through ``process_fn`` and the
    result is written to ``<infile>.abundfilt`` (next to the input file).
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) calls produce the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    # The filter only depends on ht and K, so it is defined once instead
    # of being rebuilt for every input file.
    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, 2)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = infile + '.abundfilt'

        # NOTE(review): closing the file here relies on tsp.start()
        # returning only after all output has been written -- confirm
        # against ThreadedSequenceProcessor.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 2
0
def main():
    """Trim FASTA files with ``trim_on_sodd`` using a saved hashbits table.

    Command line: ``<htfile> <infile> [<infile> ...]``.

    For every input file the filtered reads are written to
    ``<basename of infile>.sodd`` in the current working directory.
    """
    htfile = sys.argv[1]
    # These are the files that get read; the originals called them
    # "outfiles", which was misleading.
    infiles = sys.argv[2:]

    print('loading hashbits')
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)

        if trim_at >= ht.ksize():
            return name, trim_seq

        return None, None

    for filename in infiles:
        outpath = os.path.basename(filename) + '.sodd'

        # NOTE(review): assumes tsp.start() blocks until the writer is
        # done, so the file can be closed as soon as it returns.
        with open(outpath, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
Exemplo n.º 3
0
def main():
    """Apply ``trim_below_abundance`` to FASTA files using a counting hash.

    Command line: ``<counting_ht> <infile> [<infile> ...]``.

    The counting hash is loaded from ``sys.argv[1]`` and its k-mer size is
    used as the minimum accepted trim position.  Output for each input goes
    to ``<basename of infile>.below`` in the current working directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Written as single-argument print(...) so the output is identical on
    # Python 2 and Python 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    # Loop-invariant: the filter needs only ht, K and CUTOFF.
    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.below'

        # NOTE(review): relies on tsp.start() returning only once all
        # records have been written -- confirm.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 4
0
def main():
    """Apply ``trim_below_abundance`` to FASTA files using a countgraph.

    Command line: ``<countgraph file> <infile> [<infile> ...]``.

    The countgraph is loaded from ``sys.argv[1]``; its ``ksize()`` is the
    minimum trim position a read must reach to be kept.  Each input is
    written to ``<basename of infile>.below`` in the current directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    # Defined once: nothing in the filter depends on the current file.
    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        # The context manager closes the output even if processing fails.
        # NOTE(review): assumes tsp.start() blocks until writing is done.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 5
0
def main():
    """Run the ``trim_on_sodd`` filter over one or more FASTA files.

    Command line: ``<htfile> <infile> [<infile> ...]``.

    ``sys.argv[1]`` names a saved hashbits table; every following argument
    is an input file whose filtered reads are written to
    ``<basename>.sodd`` in the current working directory.
    """
    htfile = sys.argv[1]
    input_paths = sys.argv[2:]  # inputs, despite the old name "outfiles"

    print('loading hashbits')
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
        if trim_at < ht.ksize():
            return None, None

        return record['name'], trim_seq

    for filename in input_paths:
        outpath = os.path.basename(filename) + '.sodd'

        # NOTE(review): the file is closed when the block exits, which
        # presumes tsp.start() has finished writing by then -- confirm.
        with open(outpath, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
Exemplo n.º 6
0
def main():
    """Keep only reads that ``trim_on_density_explosion`` leaves untrimmed.

    Command line: ``<repfile> [<infile> [<outfile>]]``.

    * ``repfile`` is consumed into a fresh hashbits table.
    * ``infile`` defaults to ``repfile``.
    * ``outfile`` defaults to ``<basename of infile>.loess``.
    """
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.loess'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls give identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('file with representative artifacts: %s' % repfile)
    print('input file to degree filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % (K,))
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('N THREADS %s' % (WORKER_THREADS,))
    print('RADIUS %s' % (RADIUS,))
    print('MAX DENSITY %s' % (MAX_VOLUME / RADIUS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for untrimmed reads, else (None, None)."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        # Only the trim position is needed; the trimmed sequence is unused.
        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        # A read is kept only when nothing was trimmed from it.
        if trim_at == len(seq):
            return name, seq

        return None, None

    # The output is opened before the representative file is consumed,
    # matching the original order of side effects.
    # NOTE(review): assumes tsp.start() blocks until writing is complete.
    with open(outfile, 'w') as outfp:
        print('eating %s' % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 7
0
def main():
    """Filter reads by density explosion around representative artifacts.

    Command line: ``<repfile> [<infile> [<outfile>]]``.

    The k-mers of ``repfile`` are loaded into a new hashbits table.  Reads
    from ``infile`` (default: ``repfile``) are written to ``outfile``
    (default: ``<basename of infile>.loess``) when
    ``trim_on_density_explosion`` reports a trim position equal to the
    read length.
    """
    argv = sys.argv
    repfile = argv[1]
    infile = argv[2] if len(argv) >= 3 else repfile
    outfile = argv[3] if len(argv) >= 4 else (
        os.path.basename(infile) + '.loess')

    # Each print takes one pre-formatted string, so the text is the same
    # on Python 2 and Python 3.
    print('file with representative artifacts: %s' % repfile)
    print('input file to degree filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % (K,))
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('N THREADS %s' % (WORKER_THREADS,))
    print('RADIUS %s' % (RADIUS,))
    print('MAX DENSITY %s' % (MAX_VOLUME / RADIUS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for untrimmed reads, else (None, None)."""
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        # The trimmed sequence itself is not used, only the position.
        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)
        if trim_at != len(seq):
            return None, None

        return record['name'], seq

    # NOTE(review): leaving the with-block closes the output; this
    # presumes tsp.start() does not return before writing has finished.
    with open(outfile, 'w') as outfp:
        print('eating %s' % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 8
0
def main():
    """Write out the reads that survive the density-explosion trim intact.

    Command line: ``<repfile> [<infile> [<outfile>]]``.

    ``infile`` falls back to ``repfile`` and ``outfile`` falls back to
    ``<basename of infile>.loess`` when they are not given.
    """
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + ".loess"
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # One formatted string per print: valid, and identical in output, on
    # both Python 2 and Python 3.
    print("file with representative artifacts: %s" % repfile)
    print("input file to degree filter: %s" % infile)
    print("filtering to output: %s" % outfile)
    print("-- settings:")
    print("K %s" % (K,))
    print("HASHTABLE SIZE %g" % HASHTABLE_SIZE)
    print("N HASHTABLES %d" % N_HT)
    print("N THREADS %s" % (WORKER_THREADS,))
    print("RADIUS %s" % (RADIUS,))
    print("MAX DENSITY %s" % (MAX_VOLUME / RADIUS,))
    print("--")

    print("making hashtable")
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for untrimmed reads, else (None, None)."""
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        # The first return value (trimmed sequence) is intentionally unused.
        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        if trim_at == len(seq):
            return name, seq

        return None, None

    # The output file is still opened before repfile is consumed, as in
    # the original, but is now closed deterministically.
    # NOTE(review): assumes tsp.start() returns after the last write.
    with open(outfile, "w") as outfp:
        print("eating %s" % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 9
0
def main():
    """Filter a FASTA file against a set of stop tags.

    Command line: ``<stoptags> <infile> [<outfile>]``.

    ``outfile`` defaults to ``<basename of infile>.stopkeep``.  For each
    read without ``N``, ``trim_on_stoptags`` supplies a trim position:
    reads whose position is below ``K`` are written unchanged, otherwise
    the part of the read from that position onward is written (if any).
    """
    stoptags = sys.argv[1]
    infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.stopkeep'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('file with stop tags: %s' % stoptags)
    print('input file to filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % (K,))
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, 1, 1)

    ht.load_stop_tags(stoptags)

    def process_fn(record, ht=ht):
        """Return (name, kept sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        # Only the trim position is used; the trimmed prefix is discarded.
        _, trim_at = ht.trim_on_stoptags(seq)

        if trim_at < K:
            return name, seq

        # Keep what follows the trim position, if anything is left.
        seq = seq[trim_at:]
        if seq:
            return name, seq

        return None, None

    # NOTE(review): closing on block exit presumes tsp.start() blocks
    # until every record has been written -- confirm.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 10
0
def main():
    """Keep the stop-tag-free remainder of each read in a FASTA file.

    Command line: ``<stoptags> <infile> [<outfile>]``.

    Stop tags are loaded from ``sys.argv[1]`` into a new hashbits table.
    Output goes to ``sys.argv[3]`` when given, otherwise to
    ``<basename of infile>.stopkeep`` in the current directory.
    """
    argv = sys.argv
    stoptags = argv[1]
    infile = argv[2]
    outfile = argv[3] if len(argv) >= 4 else (
        os.path.basename(infile) + '.stopkeep')

    # One formatted string per print keeps the output identical on
    # Python 2 and Python 3.
    print('file with stop tags: %s' % stoptags)
    print('input file to filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % (K,))
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record, ht=ht):
        """Return (name, kept sequence), or (None, None) to reject."""
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        name = record['name']
        # The trimmed sequence returned first is not needed here.
        _, trim_at = ht.trim_on_stoptags(seq)
        if trim_at < K:
            return name, seq

        remainder = seq[trim_at:]
        if remainder:
            return name, remainder

        return None, None

    # NOTE(review): assumes tsp.start() has finished writing when it
    # returns, so the file may be closed immediately afterwards.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 11
0
def main():
    """Truncate reads at the first k-mer with a high hamming-1 count.

    Command line: ``<counting_ht> <infile> [<infile> ...]``.

    Each read without ``N`` is scanned k-mer by k-mer.  At the first
    k-mer for which ``ht.max_hamming1_count`` exceeds 2000 the read is cut
    to ``pos + K - 1`` bases.  Reads that still have at least ``K`` bases
    are written to ``<basename of infile>.ham1filt``.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return (name, possibly truncated sequence) or (None, None)."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        # A read of length L contains L - K + 1 k-mers.  The original
        # range(len(seq) - K) stopped one short and never examined the
        # last k-mer (and examined none for reads of exactly K bases).
        for pos in range(len(seq) - K + 1):
            kmer = seq[pos:pos + K]
            if ht.max_hamming1_count(kmer) > 2000:
                # Drop the final base of the offending k-mer and the rest.
                seq = seq[:pos + K - 1]
                break

        if len(seq) >= K:
            return name, seq

        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.ham1filt'

        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 12
0
def main():
    """Apply the hamming-1 count filter to one or more FASTA files.

    Command line: ``<counting_ht> <infile> [<infile> ...]``.

    The counting hash is loaded from ``sys.argv[1]``.  Every input file
    produces ``<basename of infile>.ham1filt`` in the current directory,
    containing the reads (possibly truncated) that are still at least
    ``K`` bases long after filtering.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Pre-formatted single strings print identically on Python 2 and 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    def process_fn(record, ht=ht):
        """Return (name, possibly truncated sequence) or (None, None)."""
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        # Visit every k-mer start, including the last one at
        # len(seq) - K; the original loop bound skipped that final k-mer.
        n_kmers = len(seq) - K + 1
        for start in range(n_kmers):
            if ht.max_hamming1_count(seq[start:start + K]) > 2000:
                # Cut just before the last base of the offending k-mer.
                seq = seq[:start + K - 1]
                break

        if len(seq) < K:
            return None, None

        return record['name'], seq

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.ham1filt'

        # NOTE(review): the with-block closes the file on exit, which
        # presumes tsp.start() blocks until writing has finished.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 13
0
def main():
    """Two-pass filter: keep reads whose minimum k-mer count is >= 2.

    Command line: ``<infile> [<infile> ...]``.

    Pass 1 consumes every input file into one counting hash.  Pass 2
    re-reads each file and writes the accepted reads to
    ``<basename of infile>.f2``; inputs whose output file already exists
    are skipped.
    """
    filenames = sys.argv[1:]

    # Single-argument print(...) yields the same text on Python 2 and 3.
    print('-- settings:')
    print('K %s' % (K,))
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    for filename in filenames:
        print('consuming input %s' % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for accepted reads, else (None, None)."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    for filename in filenames:
        print('*** %s' % filename)
        outfile = os.path.basename(filename) + '.f2'
        if os.path.exists(outfile):
            # Two spaces before "--" reproduce the original output.
            print('SKIPPING %s  -- already exists' % outfile)
            continue

        # NOTE(review): assumes tsp.start() returns only after the last
        # record has been written -- confirm.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
Exemplo n.º 14
0
def main():
    """Load all inputs into a counting hash, then filter each against it.

    Command line: ``<infile> [<infile> ...]``.

    A read is written to ``<basename of infile>.f2`` when it contains no
    ``N``, is at least ``K`` bases long and ``ht.get_min_count`` returns
    2 or more for it.  Existing output files are left untouched.
    """
    inputs = sys.argv[1:]

    # Each print takes a single formatted string so that the output is
    # the same under Python 2 and Python 3.
    print('-- settings:')
    print('K %s' % (K,))
    print('N THREADS %s' % (WORKER_THREADS,))
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    for filename in inputs:
        print('consuming input %s' % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for accepted reads, else (None, None)."""
        seq = record['sequence']
        # Checks are short-circuited in the original order: N, length,
        # then the count lookup.
        if 'N' in seq or len(seq) < K or ht.get_min_count(seq) < 2:
            return None, None

        return record['name'], seq

    for filename in inputs:
        print('*** %s' % filename)
        outfile = os.path.basename(filename) + '.f2'
        if os.path.exists(outfile):
            # The double space before "--" matches the original output.
            print('SKIPPING %s  -- already exists' % outfile)
            continue

        # NOTE(review): closing here presumes tsp.start() blocks until
        # all output is written.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
Exemplo n.º 15
0
def main():
    """Filter reads by minimum k-mer count across all given FASTA files.

    Command line: ``<infile> [<infile> ...]``.

    All inputs are first consumed into a single counting hash.  Each input
    is then filtered into ``<basename of infile>.f2`` unless that file
    already exists, in which case the input is skipped.
    """
    filenames = sys.argv[1:]

    # Single-argument print(...) is valid and identical in output on both
    # Python 2 and Python 3.
    print("-- settings:")
    print("K %s" % (K,))
    print("N THREADS %s" % (WORKER_THREADS,))
    print("--")

    print("making hashtable")
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    for filename in filenames:
        print("consuming input %s" % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for accepted reads, else (None, None)."""
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    for filename in filenames:
        print("*** %s" % filename)
        outfile = os.path.basename(filename) + ".f2"
        if os.path.exists(outfile):
            # Two spaces before "--" reproduce the original message.
            print("SKIPPING %s  -- already exists" % outfile)
            continue

        # NOTE(review): relies on tsp.start() not returning until the
        # writer has finished -- confirm.
        with open(outfile, "w") as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
Exemplo n.º 16
0
def main():
    """Keep reads whose connected graph size reaches ``THRESHOLD``.

    Command line: ``<infile> [<outfile>]``.

    The input is consumed into a new ``khmer.Nodegraph``.  It is then read
    again; for each record the first ``K`` bases are passed to
    ``calc_connected_graph_size`` and the record is written out when the
    result is at least ``THRESHOLD``.  ``outfile`` defaults to
    ``<basename of infile>.graphsize`` and is only overridden when exactly
    one extra argument is given.
    """
    infile = sys.argv[1]
    outfile = os.path.basename(infile) + '.graphsize'
    if len(sys.argv) == 3:
        outfile = sys.argv[2]

    print('input file to graphsize filter: %s' % infile)
    print('filtering to output:', outfile)
    print('-- settings:')
    print('K', K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('THRESHOLD', THRESHOLD)
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('creating ht')
    ht = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
    print('eating fa', infile)
    # The counts returned by consume_fasta were never used.
    ht.consume_fasta(infile)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for kept records, else (None, None)."""
        kmer = record['sequence'][:K]
        size = ht.calc_connected_graph_size(kmer, THRESHOLD)
        if size >= THRESHOLD:
            return record['name'], record['sequence']

        return None, None

    # NOTE(review): the file is closed when the block exits; this
    # presumes tsp.start() blocks until all output has been written.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
Exemplo n.º 17
0
def main():
    """Filter a FASTA file by connected graph size using a hashbits table.

    Command line: ``<infile> [<outfile>]``.

    ``infile`` is first consumed into a table made by
    ``khmer.new_hashbits`` and then filtered: a record is written when
    ``calc_connected_graph_size`` of its first ``K`` bases is at least
    ``THRESHOLD``.  Output goes to ``sys.argv[2]`` when exactly one extra
    argument is present, otherwise to ``<basename of infile>.graphsize``.
    """
    infile = sys.argv[1]
    if len(sys.argv) == 3:
        outfile = sys.argv[2]
    else:
        outfile = os.path.basename(infile) + '.graphsize'

    print('input file to graphsize filter: %s' % infile)
    print('filtering to output:', outfile)
    print('-- settings:')
    print('K', K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('THRESHOLD', THRESHOLD)
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('creating ht')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    print('eating fa', infile)
    # The return value was unpacked into two unused names before.
    ht.consume_fasta(infile)

    def process_fn(record, ht=ht):
        """Return (name, sequence) for kept records, else (None, None)."""
        sequence = record['sequence']
        size = ht.calc_connected_graph_size(sequence[:K], THRESHOLD)
        if size < THRESHOLD:
            return None, None

        return record['name'], sequence

    # NOTE(review): assumes tsp.start() returns only after the writer has
    # finished, so closing the file on block exit is safe -- confirm.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)