def parse_file(file_name, type='DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file.
    """
    if type.upper() == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif type.upper() == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif type.upper() == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(align_file, parser)

    cur_align = iterator.next()
    while cur_align:
        fasta_align.add_sequence(cur_align.title, cur_align.sequence)
        cur_align = iterator.next()

    return fasta_align
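# A minimal usage sketch for parse_file above (not part of the original
# source). It assumes the imports the function relies on (Fasta, IUPAC,
# Alphabet, FastaAlignment) are in scope in the same module, and that
# 'ls_orchid.fasta' is a hypothetical FASTA file on disk.
if __name__ == '__main__':
    align = parse_file('ls_orchid.fasta', type='DNA')
    # printing relies on the alignment class providing a __str__ method
    print align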
def test_schema_representation(self):
    """Convert sequences into schema representations.
    """
    # get a set of schemas we want to code the sequence in
    schema_bank = self._load_schema_repository()
    top_schemas = schema_bank.get_top(25)
    schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

    # get the sequences one at a time, and encode them
    fasta_handle = open(self.test_file, 'r')
    seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
    iterator = Fasta.Iterator(fasta_handle, seq_parser)
    while 1:
        seq_record = iterator.next()
        if seq_record is None:
            break
        schema_values = schema_coder.representation(seq_record.seq)
        if VERBOSE:
            print "Schema values:", schema_values
    fasta_handle.close()
def setUp(self):
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    for file, records in ((test_file, self.test_records),
                          (diff_file, self.diff_records)):
        handle = open(file, 'r')
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(handle, seq_parser)
        while 1:
            seq_record = iterator.next()
            if seq_record is None:
                break
            records.append(seq_record)
        handle.close()

    self.motif_finder = Motif.MotifFinder()
def setUp(self):
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    for file, records in ((test_file, self.test_records),
                          (diff_file, self.diff_records)):
        handle = open(file, 'r')
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(handle, seq_parser)
        while 1:
            seq_record = iterator.next()
            if seq_record is None:
                break
            records.append(seq_record)
        handle.close()

    self.num_schemas = 2
    schema_ga = Schema.GeneticAlgorithmFinder()
    schema_ga.min_generations = 1
    self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                      schema_finder=schema_ga)
def test_record_iterator(self):
    """Test the iterator with a Record Parser.
    """
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(self.test_handle, parser)
    for rec in iter(iterator):
        assert isinstance(rec, Fasta.Record)
def test_sequence_iterator(self):
    """Test the iterator with a Sequence Parser.
    """
    parser = Fasta.SequenceParser()
    iterator = Fasta.Iterator(self.test_handle, parser)
    for rec in iter(iterator):
        assert isinstance(rec, SeqRecord.SeqRecord)
def ReadFile(self):
    self.parser = Fasta.RecordParser()
    self.iter = Fasta.Iterator(handle=open(self.file), parser=self.parser)
    while 1:
        rec = self.iter.next()
        if not rec:
            break
        self.header = rec.title.split()[0].split(',')[0]
        self.HandleRecord(rec)
def read_fasta_file(self, file):
    genes = []
    iter = Fasta.Iterator(handle=open(file), parser=Fasta.RecordParser())
    while 1:
        rec = iter.next()
        if not rec:
            break
        genes.append((rec.sequence, rec.title))
    return genes
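# Hedged usage sketch for read_fasta_file (all names below are
# placeholders; `reader` stands for an instance of the class that defines
# the method, which is not shown here, and 'genes.fasta' is hypothetical):
#
#     genes = reader.read_fasta_file('genes.fasta')
#     for sequence, title in genes:
#         print title, len(sequence)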
def test_parsing_comments(self):
    """Parse FASTA files with # style comment lines in them.
    """
    handle = open(os.path.join("Fasta", "f003"))
    iterator = Fasta.Iterator(handle, Fasta.RecordParser())
    num_recs = 0
    for rec in iter(iterator):
        num_recs += 1
    assert num_recs == 2
def runDisEMBLpipeline():
    try:
        smooth_frame = 8
        peak_frame = 8
        join_frame = 4
        fold_coils = 1.2
        fold_hotloops = 1.4
        fold_rem465 = 1.2
        mode = 'scores'
        try:
            file = open(sys.argv[1], 'r')
        except:
            mode = 'default'
    except:
        print '\nDisEMBL.py sequence_file \n'
        print 'A default run would be: ./DisEMBL.py fasta_file'
        raise SystemExit
    #db = sys.stdin
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(file, parser)
    while 1:
        try:
            cur_record = iterator.next()
            sequence = upper(cur_record.sequence)
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)
            sys.stdout.write('> ' + cur_record.title + '\n')
            sys.stdout.write('# COILS ')
            reportSlicesTXT(getSlices(COILS_smooth, fold_coils, join_frame,
                                      peak_frame, 0.43), sequence)
            sys.stdout.write('# REM465 ')
            reportSlicesTXT(getSlices(REM465_smooth, fold_rem465, join_frame,
                                      peak_frame, 0.50), sequence)
            sys.stdout.write('# HOTLOOPS ')
            reportSlicesTXT(getSlices(HOTLOOPS_smooth, fold_hotloops,
                                      join_frame, peak_frame, 0.086),
                            sequence)
            sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
            for i in range(len(REM465_smooth)):
                sys.stdout.write(sequence[i] + '\t' +
                                 fpformat.fix(COILS_smooth[i], 5) + '\t' +
                                 fpformat.fix(REM465_smooth[i], 5) + '\t' +
                                 fpformat.fix(HOTLOOPS_smooth[i], 5) + '\n')
        except AttributeError:
            break
    file.close()
    return
def test_sequence_alphabet(self):
    """Setting the alphabet for the Sequence Parser.
    """
    parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
    rec = parser.parse(self.handles[0])
    assert rec.seq.alphabet == IUPAC.unambiguous_dna
def test_new_iterator(self):
    """Ensure the Fasta iterator works like a Python 2.2 iterator.
    """
    n = 0
    iterator = Fasta.Iterator(self.test_handle)
    for rec in iter(iterator):
        n += 1
    assert n == 3
def test_record_parser(self):
    """Basic operation of the Record Parser.
    """
    parser = Fasta.RecordParser()
    for index in range(len(self.handles)):
        handle = self.handles[index]
        rec = parser.parse(handle)
        assert isinstance(rec, Fasta.Record)
        assert len(rec.title) == self.lengths[index][0]
        assert len(rec.sequence) == self.lengths[index][1]
def test_sequence_title_convert(self):
    """Test title conversion for the Sequence Parser.
    """
    def test_title2ids(title):
        return "id", "name", "description"

    parser = Fasta.SequenceParser(title2ids=test_title2ids)
    rec = parser.parse(self.handles[0])
    assert rec.id == "id"
    assert rec.name == "name"
    assert rec.description == "description"
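# A more realistic title2ids than the stub in the test above -- a sketch
# only, not part of the original tests. It assumes NCBI-style titles such
# as 'gi|1348912|gb|G26680|G26680 description' (the format seen in
# test_basic_iterator below), where the accession sits in pipe field 3 and
# the name in field 4.
def gb_title2ids(title):
    fields = title.split(None, 1)
    pipe_parts = fields[0].split('|')
    rec_id = pipe_parts[3]
    rec_name = pipe_parts[4]
    if len(fields) > 1:
        description = fields[1]
    else:
        description = ""
    return rec_id, rec_name, description

# it would then be hooked up the same way as the test stub:
#     parser = Fasta.SequenceParser(title2ids=gb_title2ids)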
def runDisEMBLpipeline():
    try:
        smooth_frame = int(sys.argv[1])
        peak_frame = int(sys.argv[2])
        join_frame = int(sys.argv[3])
        fold_coils = float(sys.argv[4])
        fold_hotloops = float(sys.argv[5])
        fold_rem465 = float(sys.argv[6])
        file = str(sys.argv[7])
    except:
        print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file\n'
        print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2 fasta_file'
        raise SystemExit
    db = open(file, 'r')
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(db, parser)
    while 1:
        try:
            cur_record = iterator.next()
            sequence = upper(cur_record.sequence)
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)
            sys.stdout.write('> ' + cur_record.title + '_COILS ')
            reportSlicesTXT(getSlices(COILS_smooth, fold_coils, join_frame,
                                      peak_frame, 0.43), sequence)
            sys.stdout.write('> ' + cur_record.title + '_REM465 ')
            reportSlicesTXT(getSlices(REM465_smooth, fold_rem465, join_frame,
                                      peak_frame, 0.50), sequence)
            sys.stdout.write('> ' + cur_record.title + '_HOTLOOPS ')
            reportSlicesTXT(getSlices(HOTLOOPS_smooth, fold_hotloops,
                                      join_frame, peak_frame, 0.086),
                            sequence)
            sys.stdout.write('\n')
        except AttributeError:
            break
    return
def test_record_basic(self):
    """Basic test on Record
    """
    def pbool(b):
        if b:
            return 1
        return 0

    r = Fasta.Record()
    assert pbool(type(r.title) is StringType)     # StringType
    assert pbool(type(r.sequence) is StringType)  # StringType
def _load_schema_repository(self):
    """Helper function to load a schema repository from a file.

    This also caches a schema bank, to prevent having to do this time
    consuming operation multiple times.
    """
    # if we already have a cached repository, return it
    if self.schema_bank is not None:
        return self.schema_bank

    # otherwise, we'll read in a new schema bank

    # read in all of the motif records
    motif_handle = open(self.test_file, 'r')
    seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
    iterator = Fasta.Iterator(motif_handle, seq_parser)
    seq_records = []
    while 1:
        seq_record = iterator.next()
        if seq_record is None:
            break
        seq_records.append(seq_record)
    motif_handle.close()

    # find motifs from the file
    motif_finder = Motif.MotifFinder()
    motif_size = 9
    motif_bank = motif_finder.find(seq_records, motif_size)

    schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

    # cache the repository
    self.schema_bank = schema_bank

    return schema_bank
def setUp(self):
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

    self.test_records = []

    # load the records
    handle = open(test_file, 'r')
    seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
    iterator = Fasta.Iterator(handle, seq_parser)
    while 1:
        seq_record = iterator.next()
        if seq_record is None:
            break
        self.test_records.append(seq_record)
    handle.close()

    self.sig_finder = Signature.SignatureFinder()
def main():
    # create a substitution matrix
    sub_matrix = SubstitutionMatrix('blosum50')

    # set up for alignment
    aligner = NWAlign(sub_matrix)

    print "Testing a simple alignment..."
    seq1 = "HEAGAWGHEE"
    seq2 = "PAWHEAE"
    aligner.align(seq1, seq2)
    align1, align2 = aligner.get_optimal_alignment()
    score = aligner.get_optimal_score()
    print "Alignment Score:", score
    print align1.data
    print align2.data

    print "Testing a more complex alignment..."
    test_file = "PEPCarboxylase.fasta"
    print "Getting sequences from the file PEPCarboxylase.fasta..."
    seq_list = []
    scanner = Fasta._Scanner()
    handler = FASTAHandler(seq_list)
    file = open(test_file, 'r')
    # each feed call consumes one record, so feed twice to collect the
    # two sequences we align below
    scanner.feed(file, handler)
    scanner.feed(file, handler)
    #print seq_list

    print "Aligning sequences..."
    aligner = NWAlign(sub_matrix)
    aligner.align(seq_list[0][0:150], seq_list[1][0:150])
    align1, align2 = aligner.get_optimal_alignment()
    score = aligner.get_optimal_score()
    print "Alignment Score:", score

    line_width = 25
    current_position = 0
    current_position = current_position + line_width

    # pretty print the alignment
    while current_position < len(align1):
        print ""
        print align1.data[current_position - line_width:current_position]
        print align2.data[current_position - line_width:current_position]
        current_position = current_position + line_width

    # print whatever is left
    print ""
    print align1.data[current_position - line_width:len(align1) - 1]
    print align2.data[current_position - line_width:len(align2) - 1]
def __str__(self):
    """Print out a fasta version of the alignment info."""
    return_string = ''
    for item in self._records:
        new_f_record = Fasta.Record()
        new_f_record.title = item.description
        new_f_record.sequence = item.seq.data
        return_string = (return_string + str(new_f_record) +
                         os.linesep + os.linesep)

    # this leaves an extra newline, so strip two off and add one back
    # before returning
    return return_string.rstrip() + os.linesep
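# A sketch of how the __str__ above might be exercised, assuming `align`
# is an instance of the alignment class this method belongs to (the class
# definition is not shown in this snippet):
#
#     print align                               # one FASTA record per sequence
#     open('out.fasta', 'w').write(str(align))  # same text, written to a file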
def extract_organisms(file, num_records):
    scanner = Fasta._Scanner()
    consumer = SpeciesExtractor()

    file_to_parse = UndoHandle(open(file, "r"))

    for fasta_record in range(num_records):
        scanner.feed(file_to_parse, consumer)

    file_to_parse.close()

    return consumer.species_list
def test_sequence_parser(self):
    """Basic operation of the Sequence Parser.
    """
    parser = Fasta.SequenceParser()
    for index in range(len(self.handles)):
        handle = self.handles[index]
        rec = parser.parse(handle)
        assert isinstance(rec, SeqRecord.SeqRecord)
        assert isinstance(rec.seq, Seq.Seq)
        assert rec.seq.alphabet == Alphabet.generic_alphabet
        assert len(rec.seq) == self.lengths[index][1]
        assert len(rec.description) == self.lengths[index][0]
def extract_organisms(file_to_parse):
    # set up the parser and iterator
    parser = Fasta.RecordParser()
    file = open(file_to_parse, 'r')
    iterator = Fasta.Iterator(file, parser)

    all_species = []

    while 1:
        cur_record = iterator.next()
        if cur_record is None:
            break

        # extract the info from the title
        new_species = cur_record.title.split()[1]

        # append the new species to the list if it isn't there
        if new_species not in all_species:
            all_species.append(new_species)

    return all_species
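# A minimal driver for extract_organisms above -- an illustrative sketch
# only, assuming Fasta is imported in the same module; the input file
# name is hypothetical.
if __name__ == '__main__':
    species = extract_organisms('ls_orchid.fasta')
    print 'Found %i species:' % len(species)
    for name in species:
        print name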
def get_seqs(blastRootDirectory):
    if len(sys.argv) >= 2:
        numSeqs = int(sys.argv[1])
        if numSeqs < 0 or numSeqs > 100000:
            print 'requested number of sequences is outside allowable range (1-100000). Using default (10)'
            numSeqs = 10
    else:
        numSeqs = 10
    print 'requesting', numSeqs, 'query sequences from the server'
    seqs = phamServer.request_seqs(server, numSeqs, client)

    # Build the file to be blasted from the sequences given
    f = open(os.path.join(blastRootDirectory, 'filetoblast.txt'), 'w')
    print seqs
    # Take the new set of sequences and check if they exist in the local
    # database; if so, write the sequence id and translation to a separate
    # FASTA formatted input file to be passed to the BLASTALL executable
    for GeneID in seqs:
        parser = Fasta.RecordParser()
        infile = open(os.path.join(blastRootDirectory, 'blastDB.fasta'))
        iterator = Fasta.Iterator(infile, parser)
        while 1:
            record = iterator.next()
            if not record:
                break
            record_id = record.title
            if GeneID == record_id:
                f.write('>' + record.title + '\n' + record.sequence + '\n')
    f.close()
    return len(seqs)
def test_basic_iterator(self):
    """Ensure the Fasta iterator works returning text.
    """
    i = Fasta.Iterator(self.test_handle)
    rec_info = {0: ">gi|1348912|gb|G26680|G26680",
                1: ">gi|1348917|gb|G26685|G26685",
                2: ">gi|1592936|gb|G29385|G29385"}
    for rec_num in range(3):
        rec = i.next()
        lines = rec.split("\n")
        title_part = lines[0].split()
        assert title_part[0] == rec_info[rec_num]

    # make sure we keep getting None when the iterator is done
    assert i.next() is None
    assert i.next() is None
def __init__(self, **kwargs):
    self.db_dir = DEFAULT_DB_DIR
    self.index_filename = DEFAULT_INDEX_FILENAME
    self.seqres_filename = DEFAULT_SEQRES_FILENAME
    for key, value in kwargs.items():
        if key in ['db_dir']:
            self.db_dir = value
        elif key in ['index_filename']:
            self.index_filename = value
        elif key in ['seqres_filename']:
            self.seqres_filename = value
    self.full_index_filename = os.path.join(self.db_dir,
                                            self.index_filename)
    self.full_seqres_filename = os.path.join(self.db_dir,
                                             self.seqres_filename)
    self.offsets = {}
    self.namesByPdbid = {}
    self.seqres_file = file(self.full_seqres_filename)
    self.fasta_parser = Fasta.RecordParser()
    self.load_index_file()
def main(blast_file):
    db_dir = os.path.join(os.getcwd(), "db")
    cur_dbs = get_available_dbs(db_dir)
    length_cutoff = 0.2
    blast_clusters, all_lengths = get_blast_clusters(blast_file,
                                                     length_cutoff)
    filter_clusters = filter_by_organism(blast_clusters, org_includes,
                                         cur_dbs)
    length_plot(all_lengths, blast_file)
    cluster_grouper = SimilarityClusterGrouper(2, 200, [(0.9, 10)])
    all_groups = cluster_grouper.get_final_groups(filter_clusters)
    base, ext = os.path.splitext(blast_file)
    cluster_file = base + "-bcluster%s.txt"
    for gindex, group in enumerate(all_groups):
        print '-----------'
        with open(cluster_file % gindex, "w") as out_handle:
            for gitem in group:
                db_rec = get_db_rec(gitem, cur_dbs)
                print gitem, db_rec["org_scientific_name"]
                rec = Fasta.Record()
                rec.title = gitem
                rec.sequence = db_rec["seq"]
                out_handle.write(str(rec) + "\n")
# and builds an index as a set of files on disc in the sub-directory
# my_orchid_dict.idx
# Note that the alphabet is explicitly defined for the sequences.
import os
from Bio import Fasta
from Bio.Alphabet import IUPAC


def get_accession_num(fasta_record):
    title_atoms = fasta_record.title.split()
    accession_atoms = title_atoms[0].split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]


if not os.path.isdir("my_orchid_dict.idx"):
    # Build a new index
    Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx",
                     get_accession_num)
else:
    print "Reusing existing index"

dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna)
orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser)

for id_num in orchid_dict.keys():
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq
    else:
        return open(file_name, 'r')


if __name__ == "__main__":
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'hs:t:')
    if not opts or len(args) != 1:
        usage()
        sys.exit('Error usage')
    fasta_file = open(args[0])
    parser = Fasta.RecordParser()
    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit(0)
        elif o == '-s':
            sieve = get_sieve(get_input_handle(a))
            iterator = FastaSelectiveIterator(sieve, fasta_file, parser)
            for record in iterator:
                print record
        elif o == '-t':
            translator = FastaTranslator(get_input_handle(a), reverse=True)
            iterator = Fasta.Iterator(fasta_file, parser)
            for record in iterator:
                print translator(record)
#!/usr/bin/env python
"""Example showing how to deal with internet BLAST from Biopython.

This code is described in great detail in the BLAST section of the
Biopython documentation.
"""
# standard library
import cStringIO

# biopython
from Bio.Blast import NCBIWWW
from Bio import Fasta

# first get the sequence we want to parse from a FASTA file
file_for_blast = open('m_cold.fasta', 'r')
f_iterator = Fasta.Iterator(file_for_blast)
f_record = f_iterator.next()

print 'Doing the BLAST and retrieving the results...'
result_handle = NCBIWWW.qblast('blastn', 'nr', f_record)

# save the results for later, in case we want to look at it
save_file = open('m_cold_blast.out', 'w')
blast_results = result_handle.read()
save_file.write(blast_results)
save_file.close()

print 'Parsing the results and extracting info...'
b_parser = NCBIWWW.BlastParser()
def main(ipr_number, num_clusters, out_dir):
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
            ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(
        info_array, nclusters=num_clusters, npass=50)
    #, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
    out_seq_handle = open(out_seq_file, "w")
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            distance = networkx.dijkstra_path_length(tax_graph,
                                                     'Mus musculus', org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            charge_plot_img = calc_charge_plot(u, cur_db[u], charge_window,
                                               out_dir)
            base, ext = os.path.splitext(charge_plot_img)
            disorder_plot_img = "%s-idr%s" % (base, ext)
            rec = Fasta.Record()
            rec.title = u
            rec.sequence = cur_db[u]["seq"]
            out_seq_handle.write(str(rec) + "\n")
            members.append(dict(
                organism=o,
                uniprot_id=get_uniprot_links([u]),
                alt_names=get_alt_names(cur_db[u]),
                alt_ids=get_uniprot_links(cur_db[u].get("uniref_children",
                                                        [])),
                charge=cur_db[u]["charge"],
                charge_region="%0.2f" % cur_db[u]["charge_region"],
                charge_plot_img=charge_plot_img,
                disorder_plot_img=disorder_plot_img,
                domains=len(cur_db[u].get("db_refs", [])),
                interactions=get_string_link(u,
                    max(len(cur_db[u].get("string_interactors", [])) - 1,
                        0)),
                description=cur_db[u].get("function_descr", " "),
                c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
            ))
        with open(os.path.join(out_dir, "%s-cluster%s.html"
                               % (ipr_number, index)), "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
def initialize():
    """
    Parse command line options, and read input Fasta file.

    Construct a dictionary containing the following fields:
        sequences         a list of dictionary objects having 'title',
                          'sequence', and 'motif_position' attributes (see
                          also the docstring of gibbs.Gibbs.__init__)
        width             width of motif to find
        weight            weight to use for pseudocounts
        iterations        number of non-improving iterations before stopping
        shifts            maximum phase shifts to detect
        ps_freq           frequency of detecting phase shifts
        init_occurrences  number of base occurrences to use for initial
                          motif positions heuristic
        init_width        width of patterns to use for initial motif
                          positions heuristic

    Return the constructed dictionary.
    """
    parser = OptionParser(usage="usage: %prog -i FILE -w WIDTH [-h] "
                                "[options]",
                          version="PyMotif %s (%s)" % (VERSION, DATE),
                          description="PyMotif is an implementation of the "
                          "Gibbs sampling algorithm for finding local "
                          "alignments of DNA sequences. "
                          "See the accompanied README file for usage "
                          "instructions and the documentation directory for "
                          "implementation details.")
    parser.add_option("-i", "--input", dest="input", metavar="FILE",
                      help="read FILE in Fasta format")
    parser.add_option("-w", "--width", dest="width", metavar="WIDTH",
                      type="int", help="find motif of width WIDTH")
    parser.add_option("-t", "--iterations", dest="iterations",
                      metavar="ITERATIONS", default=ITERATIONS_DEFAULT,
                      type="int", help="number of non-improving iterations "
                      "(default " + str(ITERATIONS_DEFAULT) + ")")
    parser.add_option("-p", "--pseudo", dest="pseudo", metavar="WEIGHT",
                      default=PSEUDOCOUNTS_WEIGHT_DEFAULT, type="float",
                      help="use WEIGHT for weight of pseudocounts (default "
                      + str(PSEUDOCOUNTS_WEIGHT_DEFAULT) + ")")
    parser.add_option("-s", "--phase-shifts", dest="shifts",
                      metavar="SHIFTS", default=PHASE_SHIFTS_DEFAULT,
                      type="int", help="detect phase shifts of width SHIFTS "
                      "(default " + str(PHASE_SHIFTS_DEFAULT) + ")")
    parser.add_option("-f", "--ps-frequency", dest="frequency",
                      metavar="FREQ", default=PS_FREQUENCY_DEFAULT,
                      type="int", help="if SHIFTS>0, detect phase shifts "
                      "every FREQ iterations (default "
                      + str(PS_FREQUENCY_DEFAULT) + ")")
    parser.add_option("-n", "--init-num-occurrences", dest="initoccurrences",
                      metavar="OCCURRENCES",
                      default=INIT_NUM_OCCURRENCES_DEFAULT, type="int",
                      help="number of base occurrences to use for initial "
                      "positions heuristic (default "
                      + str(INIT_NUM_OCCURRENCES_DEFAULT) + ")")
    parser.add_option("-v", "--init-pattern-width", dest="initwidth",
                      metavar="WIDTH", default=INIT_PATTERN_WIDTH_DEFAULT,
                      type="int", help="if OCCURRENCES>0, width of pattern "
                      "to use for initial positions heuristic (defaults to "
                      "value of --width)")
    parser.add_option("-c", "--cow", action="store_true", dest="cow",
                      default=False, help="display cow (not recommended)")

    (options, args) = parser.parse_args()

    if options.cow:
        s = ""
        for _ in range(10):
            s += choice("ATCG")
        # Created with the cowsay program
        print """ ____________
< %s >
 ------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\\
                ||----w |
                ||     ||""" % s
        sys.exit(0)

    if not options.input:
        parser.error("input file required")

    if not options.width:
        parser.error("width argument required")

    if options.width < 2:
        parser.error("please use a sane motif width")

    # Read contents of Fasta file
    try:
        file = open(options.input)
    except IOError:
        parser.error("could not read file %s" % options.input)

    fasta_parser = Fasta.RecordParser()

    # Iterator for sample data
    fasta_iterator = Fasta.Iterator(file, fasta_parser)

    # A list containing a dictionary object for each sequence
    sequences = [{'title': record.title,
                  'sequence': record.sequence,
                  'motif_position': 0} for record in fasta_iterator]

    # We could do some more error checking on the input file here, like
    # checking there's only ATCG and at least a few of them, but for now
    # this is enough
    if len(sequences) < 2:
        parser.error("found %i sequences in input file %s"
                     % (len(sequences), options.input))

    return {'sequences': sequences,
            'width': options.width,
            'weight': options.pseudo,
            'iterations': options.iterations,
            'shifts': options.shifts,
            'ps_freq': options.frequency,
            'init_occurrences': options.initoccurrences,
            'init_width': options.initwidth}
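# Example invocations of the PyMotif script above (a sketch only: the
# script and FASTA file names are hypothetical, but the options match the
# OptionParser definitions in initialize):
#
#     ./pymotif.py -i sequences.fasta -w 8
#     ./pymotif.py -i sequences.fasta -w 8 -t 50 -s 1 -f 15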
#! /usr/bin/env python
import sys, os
import time
from Bio import Fasta

DEFAULT_DICT_FILE = '/project1/structure/mliang/pdb/derived_data/pdb_seqres.idx'
DEFAULT_OUTFH = sys.stdout

dict_file = DEFAULT_DICT_FILE
outfh = DEFAULT_OUTFH

start_time = time.time()
fdict = Fasta.Dictionary(dict_file)
elapse_time = time.time() - start_time
print >> sys.stderr, "Time to load dictionary:", elapse_time

start_time = time.time()
chainmap = {}
for key in fdict.keys():
    chainmap.setdefault(key[:4], []).append(key)
elapse_time = time.time() - start_time
print >> sys.stderr, "Time to build chain map:", elapse_time

start_time = time.time()
args = sys.argv[1:]
if not args:
    args = sys.stdin
for field in args:
    fields = field.strip().split()