def test_blast_record_set(self):
    """End-to-end check: build a nucleotide DB, blast multiple queries,
    and verify each query yields a perfect (100%) self-match."""
    # prepare database
    seqfile_ops.write_fasta(self.db_file, self.db_records)
    db_records_list = seqfile_ops.load_multifasta(self.db_file)
    for index, record in enumerate(db_records_list):
        self.assertEqual(record.id, self.db_records[index].id)
        self.assertEqual(str(record.seq), str(self.db_records[index].seq))
    # make database
    self.dbfile_path, db_report = blasting.make_blastDB(self.temp_dir,
                                                        self.db_name,
                                                        self.db_file,
                                                        'nucl')
    # assertEqual, not assertIs: identity of small ints/strings is a
    # CPython interning detail, not a contract
    self.assertEqual(db_report['status'], 0)
    self.assertEqual(db_report['message'], 'database exists')
    # run local blast batch (with multiple queries)
    matches_multi = blasting.blast_record_set(self.dbfile_path,
                                              self.multi_records,
                                              self.prefs)
    self.assertEqual(len(matches_multi), 3)
    for index, record in enumerate(self.multi_records):
        self.assertEqual(matches_multi[record.id][0]['contig_id'],
                         self.multi_records[index].id)
        self.assertEqual(
            matches_multi[record.id][0]['details']['match_p100'], 100)
def test_local_blastn(self):
    """End-to-end check: single-query local blastn against a fresh DB,
    then parse the tabular (outfmt 6) output."""
    # prepare query
    seqfile_ops.write_fasta(self.single_q_file, self.single_record)
    query_record = seqfile_ops.load_fasta(self.single_q_file)
    self.assertEqual(query_record.id, self.record_1.id)
    self.assertEqual(str(query_record.seq), str(self.record_1.seq))
    # prepare database
    seqfile_ops.write_fasta(self.db_file, self.db_records)
    records_list = seqfile_ops.load_multifasta(self.db_file)
    for index, record in enumerate(records_list):
        self.assertEqual(record.id, self.db_records[index].id)
        self.assertEqual(str(record.seq), str(self.db_records[index].seq))
    # make database
    self.dbfile_path, db_report = blasting.make_blastDB(self.temp_dir,
                                                        self.db_name,
                                                        self.db_file,
                                                        'nucl')
    # assertEqual, not assertIs/assertEquals: value comparison is the
    # intent, and assertEquals is a deprecated alias
    self.assertEqual(db_report['status'], 0)
    self.assertEqual(db_report['message'], 'database exists')
    # run local blast with single query
    self.status = blasting.local_blastn(self.single_q_file,
                                        self.single_out_file,
                                        self.dbfile_path, self.prefs)
    self.assertEqual(self.status['output'], '')
    self.assertIsNone(self.status['error'])
    # parse blast output
    matches_single = blasting.parse_blast_out6(self.single_out_file,
                                               self.prefs)
    self.assertEqual(len(matches_single), 1)
    self.assertEqual(matches_single[0]['contig_id'], self.single_record.id)
    self.assertEqual(matches_single[0]['details']['match_p100'], 100)
def test_seq_subset_load_from_chop_by_size(self):
    """Chop a single sequence into exact 5-base segments via 'size' mode."""
    seqfile_ops.write_fasta(self.single_q_file, self.single_record)
    subset_mode = 'size'
    subset_args = {'size': 5, 'chop_mode': 'exact_size'}
    subset, subset_file = dataset_load.seq_subset_load(self.single_q_file,
                                                       subset_mode,
                                                       subset_args)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(len(subset), 10)
    self.assertEqual(subset[0].id, 'temp_1_0-5')
def seq_subset_load(infile, subset_mode, subset_args): """Load a subset of sequence segments from a sequence file.""" from analysis.sequence_ops import feat_collect, feature_coords, \ coord_chop, get_seq_subset_by_coords from analysis.seqfile_ops import load_multifasta, surefmt_load, \ write_fasta from analysis.text_manipulation import adaptive_list_load if subset_mode is 'flatfile': # in this case the sequence file MUST be multifasta try: subset = load_multifasta(infile) except: raise else: print "set of", len(subset), "sequence segments" subset_file = infile else: # load the query single sequence file (convert format if necessary) try: seq_record = surefmt_load(infile, 'fasta', 'generic_dna') except: raise else: print "query sequence loaded from", infile # load or generate coordinate pairs for target segments if subset_mode is 'coordinates': try: coords_file = subset_args['file'] header = subset_args['header'] columns = subset_args['columns'] coords_list = adaptive_list_load(coords_file, header, columns) except: raise else: print len(coords_list), "segments loaded from", infile elif subset_mode is 'features': try: feat_mode = subset_args features = feat_collect(infile, feat_mode) coords_list = feature_coords(features) print coords_list except: raise else: print len(coords_list),"features loaded from", infile elif subset_mode is 'size': try: size = subset_args['size'] chop_mode = subset_args['chop_mode'] coords_list = coord_chop(len(seq_record.seq), size, chop_mode) except: raise else: print len(coords_list), "segments generated to fit", size else: print "ERROR: A mode MUST be specified." 
coords_list = None # collect subset of sequence segments using resulting coords_list try: subset = get_seq_subset_by_coords(seq_record, coords_list) except: raise else: print "subset of", len(subset), "sequence segments" # save subset to multifasta file for later use or reference subset_file = seq_record.id+'_subset.fas' try: write_fasta(subset_file, subset) except: raise else: print "subset written to fasta file", subset_file return subset, subset_file
def test_seq_subset_load_from_multifasta(self):
    """'flatfile' mode should load every record and return the input path."""
    seqfile_ops.write_fasta(self.multi_q_file, self.multi_records)
    subset_mode = 'flatfile'
    subset_args = None
    subset, subset_file = dataset_load.seq_subset_load(self.multi_q_file,
                                                       subset_mode,
                                                       subset_args)
    # assertEqual, not assertIs: compare values, not object identity
    self.assertEqual(len(subset), 5)
    for index, record in enumerate(subset):
        self.assertEqual(record.id, self.multi_records[index].id)
    # flatfile mode returns the input file path unchanged
    self.assertEqual(subset_file, self.multi_q_file)
def test_seq_subset_load_from_coords(self):
    """'coordinates' mode should cut segments listed in a coords file."""
    seqfile_ops.write_fasta(self.single_q_file, self.single_record)
    with open(self.coords_file, 'w') as temp_file:
        temp_file.write(self.str_contents)
    subset_mode = 'coordinates'
    subset_args = {'file': self.coords_file, 'header': 1, 'columns': (1, 2)}
    subset, subset_file = dataset_load.seq_subset_load(self.single_q_file,
                                                       subset_mode,
                                                       subset_args)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(len(subset), 3)
    self.assertEqual(subset[0].id, 'temp_1_0-10')
    self.assertEqual(str(subset[2].seq), 'TTTGGCGCTCGCGGCGGG')
def test_write_and_load_multifasta(self):
    """Round-trip three records through write_fasta/load_multifasta."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.three_records)
    self.assertEqual(count, 3)
    fas_records = seqfile_ops.load_multifasta(self.fas_filename)
    # BUGFIX: range(0, 2) only covered indices 0 and 1, silently
    # skipping the third record -- check all three
    for index in range(3):
        self.assertEqual(fas_records[index].id, self.three_records[index].id)
def blast_record_set(dbfile_path, fasta_records, blast_prefs):
    """Loop through fasta entries and blast each against the database.

    dbfile_path -- path to the blast database
    fasta_records -- iterable of sequence records to use as queries
    blast_prefs -- preferences passed through to local_blastn and
        parse_blast_out6

    Returns a dict mapping each query record id to its parsed matches.
    """
    import os
    from analysis.seqfile_ops import write_fasta
    from analysis.blasting import local_blastn
    matches = {}
    for query_record in fasta_records:
        query_file = 'temp.fas'
        out_file = 'temp.blast'
        write_fasta(query_file, query_record)
        try:
            # return status of local_blastn is intentionally ignored;
            # a failure surfaces as an exception
            local_blastn(query_file, out_file, dbfile_path, blast_prefs)
            matches[query_record.id] = parse_blast_out6(out_file,
                                                        blast_prefs)
        finally:
            # clean up temp files; tolerate a missing out_file (e.g. when
            # local_blastn failed before writing output) so the removal
            # does not mask the original exception
            for temp_path in (query_file, out_file):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass
    return matches
def setUp(self):
    """Create the temp directory, three genome fasta files, and the
    tab-separated genome list file used by the tests."""
    # create temp directory
    self.temp_dir = "tests/temp_data/"
    os.mkdir(self.temp_dir)
    # assign database directory (but don't create it yet)
    self.db_path = "tests/temp_data/temp_db/"
    # define file names
    self.gen_filename_1 = "temp_gen1.fas"
    self.gen_filename_2 = "temp_gen2.fas"
    self.gen_filename_3 = "temp_gen3.fas"
    self.genomes_list = self.temp_dir+"temp_genomes.txt"
    # create content for the genomes file
    self.header_contents = ['genome_name', 'file']
    self.line_1_contents = ['genome_1', self.gen_filename_1]
    self.line_2_contents = ['genome_2', self.gen_filename_2]
    self.line_3_contents = ['genome_3', self.gen_filename_3]
    self.raw_contents = [self.header_contents, self.line_1_contents,
                         self.line_2_contents, self.line_3_contents]
    # transform into a single tab-separated, newline-terminated string
    self.str_contents = ""
    for line_set in self.raw_contents:
        self.str_contents += "\t".join(line_set)+"\n"
    # create some sequence records
    self.seq_1 = Seq('AATTTAATGGCGCAGGCTAGGAGAGAGATTTTTGGCGCTCGCGGCGGGG')
    self.seq_2 = Seq('GGATTATACCAAAGGCTTAAACTATAGGCTAGGAGAGATAGACG')
    self.seq_3 = Seq('GGAATATACCTTAGGCTTAAACTATAGGCTAGGAGAGGCTCG')
    self.seq_4 = Seq('GGGGATTACAGCCATAGTAACCAGATATTAaGACG')
    self.seq_5 = Seq('GGAACCGCTGATACATGATTATAGATCTATAGGGTCTAAAACATCG')
    self.seq_6 = Seq('AGGTCATGTACGATGCAGAATTTGTCGTACGATGTTAGTACGATGGTA')
    self.seq_7 = Seq('TTTTTCGCGCGCTTAGACCCAAAATATATTGTCGCTATAGGTCCCTCT')
    self.seq_8 = Seq('ACCGTGTGGCATTTATATTACACCACACACAGATTGGGTGTGCCAATCAG')
    self.seq_9 = Seq('ACCGTACGTACCATATTATTATATAGGATAGATATTTAGAGGATTTAGAT')
    self.record_1 = SeqRecord(self.seq_1, id='temp_1')
    self.record_2 = SeqRecord(self.seq_2, id='temp_2')
    self.record_3 = SeqRecord(self.seq_3, id='temp_3')
    self.record_4 = SeqRecord(self.seq_4, id='temp_4')
    self.record_5 = SeqRecord(self.seq_5, id='temp_5')
    # BUGFIX: records 6-9 previously all reused seq_5 (copy-paste error),
    # leaving seq_6..seq_9 defined but unused; each record now carries
    # its own sequence
    self.record_6 = SeqRecord(self.seq_6, id='temp_6')
    self.record_7 = SeqRecord(self.seq_7, id='temp_7')
    self.record_8 = SeqRecord(self.seq_8, id='temp_8')
    self.record_9 = SeqRecord(self.seq_9, id='temp_9')
    # create record sets
    self.gen1_records = [self.record_1, self.record_2, self.record_3]
    self.gen2_records = [self.record_4, self.record_5, self.record_6]
    self.gen3_records = [self.record_7, self.record_8, self.record_9]
    # create all primary files
    seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_1,
                            self.gen1_records)
    seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_2,
                            self.gen2_records)
    seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_3,
                            self.gen3_records)
    # 'with' guarantees the file handle is closed even on write failure
    with open(self.genomes_list, 'w') as seq_file:
        seq_file.write(self.str_contents)
def test_surefmt_load_fas2fas(self):
    """surefmt_load of a fasta file as fasta should preserve the record id."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.record)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(count, 1)
    fas_record = seqfile_ops.surefmt_load(self.fas_filename, 'fasta',
                                          generic_dna)
    self.assertEqual(fas_record.id, self.record.id)
def test_seqfile_format_fas(self):
    """seqfile_format should recognize a fasta file."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.record)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(count, 1)
    # renamed local from 'format' to avoid shadowing the builtin
    file_format, name = seqfile_ops.seqfile_format(self.fas_filename)
    self.assertEqual(file_format, 'fasta')
def test_load_agnostic_fas(self):
    """load_agnostic should load a fasta file and report its type."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.record)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(count, 1)
    # renamed local from 'type' to avoid shadowing the builtin
    fas_record, file_type = seqfile_ops.load_agnostic(self.fas_filename)
    self.assertEqual(fas_record.id, self.record.id)
    self.assertEqual(file_type, 'fasta')
def test_write_and_load_single_fasta(self):
    """Round-trip a single record through write_fasta/load_fasta."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.record)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(count, 1)
    fas_record = seqfile_ops.load_fasta(self.fas_filename)
    self.assertEqual(fas_record.id, self.record.id)
def test_surefmt_load_fas2gbk(self):
    """surefmt_load should convert a fasta file to genbank, keeping the id."""
    count = seqfile_ops.write_fasta(self.fas_filename, self.record)
    # assertEqual, not assertIs: int identity relies on CPython interning
    self.assertEqual(count, 1)
    gbk_record = seqfile_ops.surefmt_load(self.fas_filename, 'genbank',
                                          generic_dna)
    self.assertEqual(gbk_record.id, self.record.id)