示例#1
0
def get_data_set_size(data_set):
    """Count the sequences and bases in the FASTA file of a data set.

    The file name is looked up as ``fasta_filenames[data_set]`` (a
    module-level name defined outside this function) and the file is parsed
    with corebio's ``reduced_nucleic_alphabet``.

    @param data_set: Index/key into ``fasta_filenames``.
    @return: # seqs, # bases.
    """
    import corebio.seq
    import corebio.seq_io.fasta_io as F
    alphabet = corebio.seq.reduced_nucleic_alphabet
    num_bases = num_seqs = 0
    # Use a context manager so the file is closed as soon as counting is
    # done (also on a parse error) rather than whenever the handle happens
    # to be garbage collected.
    with open(fasta_filenames[data_set], 'r') as fasta_file:
        for seq in F.iterseq(fasta_file, alphabet):
            num_seqs += 1
            num_bases += len(seq)
    return num_seqs, num_bases
示例#2
0
文件: head.py 项目: JohnReid/biopsy
import sys
from itertools import islice
from optparse import OptionParser

# The statements below use sys, F and corebio.seq; import them explicitly so
# the script does not depend on names it never brought into scope.
import corebio.seq
import corebio.seq_io.fasta_io as F

#
# Parse the options
#
option_parser = OptionParser()
option_parser.add_option(
    "-n",
    "--num-seqs",
    dest="num_seqs",
    default=10,
    type='int',
    help="Number of sequences to output."
)
options, args = option_parser.parse_args()

#
# Check args
#
if 1 != len(args):
    # sys.stderr.write emits the same text as the print statement did and
    # is valid on both Python 2 and Python 3.
    sys.stderr.write('USAGE: %s <fasta file>\n' % __file__)
    sys.exit(-1)
fasta = args[0]
# '-' means read the FASTA data from standard input.
if '-' == fasta:
    fasta_file = sys.stdin
else:
    fasta_file = open(fasta, 'r')

#
# Write the first num_seqs sequences to stdout
#
try:
    # islice streams: each sequence is written as soon as it is parsed and
    # nothing past the requested count is read.  A negative count outputs
    # nothing, as the original xrange-based loop did.
    sequences = F.iterseq(fasta_file, corebio.seq.dna_alphabet)
    for seq in islice(sequences, max(0, options.num_seqs)):
        F.writeseq(sys.stdout, seq)
finally:
    # Close files we opened ourselves; leave stdin alone.
    if fasta_file is not sys.stdin:
        fasta_file.close()
示例#3
0
"""
Code that reads in sequences from a FASTA file and counts occurrences of characters.
"""


import sys, corebio.seq_io.fasta_io as F, corebio.seq, numpy

for fasta in sys.argv[1:]:
    #
    # Tally the sequences
    #
    alphabet = corebio.seq.reduced_nucleic_alphabet
    # One counter per alphabet character, accumulated over all sequences.
    tally = numpy.zeros(len(alphabet), dtype=int)
    num_seqs = 0
    # Close each input as soon as it has been tallied so that a long list of
    # files on the command line does not accumulate open handles.
    with open(fasta, 'r') as fasta_file:
        for seq in F.iterseq(fasta_file, alphabet):
            tally += seq.tally()
            num_seqs += 1

    #
    # Get number of known bases
    #
    # NOTE(review): assumes the first four alphabet characters are the
    # unambiguous bases - confirm against reduced_nucleic_alphabet.
    num_known = tally[:4].sum()
    # NOTE(review): percentages is not used in the lines visible here; it is
    # kept in case later code reads it.  If num_known is 0 this division
    # produces nan/inf entries.
    percentages = tally * 100. / num_known

    #
    # Print the tally out
    #
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and also parses on Python 3.
    print(fasta)
    print("    %8d bp in %d sequences" % (tally.sum(), num_seqs))
示例#4
0
def yield_fasta_sequences(filename):
    """Yield the sequences from a FASTA file.

    @param filename: Path of the FASTA file to read.
    @return: Yield the sequences from a FASTA file, parsed with corebio's
        ``dna_alphabet``.
    """
    from corebio.seq_io.fasta_io import iterseq, dna_alphabet
    # The with-block closes the file when the generator is exhausted, closed
    # or fails, instead of leaving the handle open indefinitely.
    with open(filename, 'r') as fasta_file:
        for s in iterseq(fasta_file, dna_alphabet):
            yield s
示例#5
0
"""
Code that reads in 2 sets of sequences and outputs those in the first that are not in the second.
"""

import sys, corebio.seq_io.fasta_io as F, corebio.seq
from optparse import OptionParser

#
# Parse the options
#
option_parser = OptionParser()
options, args = option_parser.parse_args()

#
# Check args
#
if 2 != len(args):
    # sys.stderr.write emits the same text as the print statement did and
    # is valid on both Python 2 and Python 3.
    sys.stderr.write('USAGE: %s <fasta file> <fasta file>\n' % __file__)
    sys.exit(-1)
fasta1, fasta2 = args


def _description_key(seq):
    "@return: The sequence's description, stripped and lower-cased, for matching."
    return seq.description.strip().lower()


# read in second fasta: collect the normalised descriptions to exclude
with open(fasta2, 'r') as subtract_file:
    to_subtract = set(
        _description_key(seq)
        for seq in F.iterseq(subtract_file, corebio.seq.dna_alphabet))

# read in first fasta: write out every sequence whose description is not in
# the second file
with open(fasta1, 'r') as source_file:
    for seq in F.iterseq(source_file, corebio.seq.dna_alphabet):
        if _description_key(seq) not in to_subtract:
            F.writeseq(sys.stdout, seq)
示例#6
0
    '--max-sequences',
    dest='max_seqs',
    type='int',
    default=-1,
    help="Set a limit on the number of sequences output."
)
options, args = option_parser.parse_args()

#
# Check args
#
if 2 != len(args):
    # sys.stderr.write emits the same text as the print statement did and
    # is valid on both Python 2 and Python 3.
    sys.stderr.write(
        'USAGE: %s <fasta file> <max sequence length>\n' % __file__)
    sys.exit(-1)

fasta = args[0]
# NOTE(review): length is not referenced again in the visible lines;
# presumably shorten() reads it as a module-level name - confirm.
length = int(args[1])
# '-' means read the FASTA data from standard input.
if '-' == fasta:
    input = sys.stdin
else:
    input = open(fasta, 'r')

#
# Read the sequences
#
alphabet = corebio.seq.reduced_nucleic_alphabet
try:
    for i, seq in enumerate(F.iterseq(input, alphabet)):
        # The default max_seqs of -1 never equals an index, i.e. no limit.
        if options.max_seqs == i:
            break
        F.writeseq(sys.stdout, shorten(seq))
finally:
    # Close files we opened ourselves (also on error); leave stdin alone.
    if input is not sys.stdin:
        input.close()
示例#7
0
def yield_fasta_sequences(filename):
    """Yield the sequences from a FASTA file.

    The file is opened for reading and parsed lazily with corebio's
    ``iterseq`` using ``dna_alphabet``.

    @param filename: Path of the FASTA file to read.
    @return: Yield the sequences from a FASTA file.
    """
    from corebio.seq_io.fasta_io import iterseq, dna_alphabet
    # Manage the handle with a context manager: it is released once the
    # caller finishes (or abandons and closes) the generator, rather than
    # staying open for an unspecified time.
    with open(filename, 'r') as handle:
        for s in iterseq(handle, dna_alphabet):
            yield s
示例#8
0
option_parser.add_option('-m',
                         '--max-sequences',
                         dest='max_seqs',
                         type='int',
                         default=-1,
                         help="Set a limit on the number of sequences output.")
options, args = option_parser.parse_args()

#
# Check args
#
if 2 != len(args):
    # Same message as before, written in a way that works on Python 2 and 3.
    sys.stderr.write(
        'USAGE: %s <fasta file> <max sequence length>\n' % __file__)
    sys.exit(-1)

fasta = args[0]
# NOTE(review): length is not used in the visible lines; presumably the
# shorten() helper defined elsewhere reads it - confirm.
length = int(args[1])
# A file name of '-' selects standard input.
if '-' == fasta:
    input = sys.stdin
else:
    input = open(fasta, 'r')

#
# Read the sequences
#
alphabet = corebio.seq.reduced_nucleic_alphabet
try:
    for i, seq in enumerate(F.iterseq(input, alphabet)):
        # With the default of -1 this test is never true, so every sequence
        # is written.
        if options.max_seqs == i:
            break
        F.writeseq(sys.stdout, shorten(seq))
finally:
    # Release the file we opened, even if parsing or writing fails; stdin is
    # not ours to close.
    if input is not sys.stdin:
        input.close()
示例#9
0
    shuffled.name = '%s (shuffled)' % seq.name,
    shuffled.description = '%s (shuffled)' % seq.description,
    shuffled.alphabet = seq.alphabet
    return shuffled


#
# Parse the options
#
option_parser = OptionParser()
options, args = option_parser.parse_args()


#
# Check args
#
if 1 != len(args):
    # sys.stderr.write emits the same text as the print statement did and
    # is valid on both Python 2 and Python 3.
    sys.stderr.write('USAGE: %s <fasta file>\n' % __file__)
    sys.exit(-1)
fasta = args[0]


#
# Shuffle the sequences
#
# Each sequence is passed through shuffle() and written to stdout as it is
# read; the with-block closes the input file afterwards (also on error).
with open(fasta, 'r') as fasta_file:
    for seq in F.iterseq(fasta_file, corebio.seq.dna_alphabet):
        F.writeseq(sys.stdout, shuffle(seq))
示例#10
0
"""
Code that reads in 2 sets of sequences and outputs those in the first that are not in the second.
"""


import sys, corebio.seq_io.fasta_io as F, corebio.seq
from optparse import OptionParser

#
# Parse the options
#
option_parser = OptionParser()
options, args = option_parser.parse_args()

#
# Check args
#
if 2 != len(args):
    # Same message as before, written in a way that works on Python 2 and 3.
    sys.stderr.write('USAGE: %s <fasta file> <fasta file>\n' % __file__)
    sys.exit(-1)
fasta1, fasta2 = args

# read in second fasta
# Sequences are matched on their description, stripped and lower-cased.
with open(fasta2, 'r') as second_file:
    to_subtract = set(
        seq.description.strip().lower()
        for seq in F.iterseq(second_file, corebio.seq.dna_alphabet))

# read in first fasta
# Write every sequence whose normalised description was not seen above.  The
# with-blocks close both inputs instead of leaking the handles.
with open(fasta1, 'r') as first_file:
    for seq in F.iterseq(first_file, corebio.seq.dna_alphabet):
        if seq.description.strip().lower() not in to_subtract:
            F.writeseq(sys.stdout, seq)
示例#11
0
import sys
from itertools import islice
from optparse import OptionParser

import corebio.seq
import corebio.seq_io.fasta_io as F

#
# Parse the options
#
option_parser = OptionParser()
option_parser.add_option("-n",
                         "--num-seqs",
                         dest="num_seqs",
                         default=10,
                         type='int',
                         help="Number of sequences to output.")
options, args = option_parser.parse_args()

#
# Check args
#
if 1 != len(args):
    # sys.stderr.write emits the same text as the print statement did and
    # is valid on both Python 2 and Python 3.
    sys.stderr.write('USAGE: %s <fasta file>\n' % __file__)
    sys.exit(-1)
fasta = args[0]
# A file name of '-' selects standard input.
if '-' == fasta:
    fasta_file = sys.stdin
else:
    fasta_file = open(fasta, 'r')

#
# Output the leading sequences
#
try:
    # islice yields at most num_seqs sequences lazily, so each one is written
    # as soon as it is parsed; zip() on Python 2 would first build the whole
    # list in memory.  Negative counts output nothing, as xrange() did.
    num_to_write = max(0, options.num_seqs)
    for seq in islice(F.iterseq(fasta_file, corebio.seq.dna_alphabet),
                      num_to_write):
        F.writeseq(sys.stdout, seq)
finally:
    # Close files we opened ourselves; leave stdin alone.
    if fasta_file is not sys.stdin:
        fasta_file.close()