def parse_blast_out6(out_file, blast_prefs):
    """Parse BLAST results in tabular format (-outfmt 6).

    out_file -- path to the BLAST tabular output file
    blast_prefs -- dict with minimum thresholds:
        'score'  -- minimum bitscore for a hit to be kept
        'length' -- minimum alignment length for a hit to be kept

    Returns a list of dicts, one per retained hit:
        {'contig_id': qseqid, 'details': {...per-hit metrics...}}
    """
    from analysis.text_manipulation import adaptive_list_load
    score_pref = blast_prefs['score']
    len_pref = blast_prefs['length']
    query_matches = []
    # default blast outfmt 6 columns:
    # 0=qseqid 1=sseqid 2=pident 3=length 4=mismatch 5=gapopen
    # 6=qstart 7=qend 8=sstart 9=send 10=evalue 11=bitscore
    results = adaptive_list_load(out_file, 0, (1, 2, 3, 6, 7, 8, 9, 10, 11))
    for line in results:
        contig_ID = line[0]
        match_p100 = float(line[1])
        length = int(line[2])
        q_start = int(line[3])
        q_end = int(line[4])
        s_start = int(line[5])
        s_end = int(line[6])
        evalue = line[7]  # kept as string (scientific notation from BLAST)
        bitscore = float(line[8])
        # Guard clause replaces the original pair of empty 'pass' branches:
        # skip any hit below either quality threshold.
        if bitscore < score_pref or length < len_pref:
            continue
        # query coords reversed implies a reverse-strand match
        if q_start > q_end:
            m_orient = '-'
        else:
            m_orient = '+'
        match_details = {'match_p100': match_p100, 'length': length,
                         'm_orient': m_orient,
                         'q_start': q_start, 'q_end': q_end,
                         's_start': s_start, 's_end': s_end,
                         'evalue': evalue, 'bitscore': bitscore}
        match = {'contig_id': contig_ID, 'details': match_details}
        query_matches.append(match)
    return query_matches
def genome_sets_load(genomes_path, input_file, input_prefs, db_path): """Load genome datasets listed in an input file.""" import os, sys from classes.analysis_obj import GenomeSet from analysis.seqfile_ops import ensure_fasta from analysis.text_manipulation import adaptive_list_load from analysis.blasting import make_blastDB header = input_prefs['header'] columns = input_prefs['columns'] genomes_list = adaptive_list_load(input_file, header, columns) print "prepping BLAST databases" genome_sets = [] for line in genomes_list: genome_name = line[0] seq_file = os.path.join(genomes_path, line[1]) try: db_infile = ensure_fasta(seq_file) except: raise else: print "genome FASTA sequence available in", db_infile dbfile_path, DB_report = make_blastDB(db_path, genome_name, seq_file, 'nucl') if DB_report['status'] is 1: print genome_name, ":", DB_report['message']['error'] sys.exit() elif DB_report['status'] is 0: print genome_name, ":", DB_report['message'] new_genome_set = GenomeSet(db_infile, genome_name) genome_sets.append(new_genome_set) print " ", len(genome_sets),"databases ready to search" return genome_sets
def seq_subset_load(infile, subset_mode, subset_args): """Load a subset of sequence segments from a sequence file.""" from analysis.sequence_ops import feat_collect, feature_coords, \ coord_chop, get_seq_subset_by_coords from analysis.seqfile_ops import load_multifasta, surefmt_load, \ write_fasta from analysis.text_manipulation import adaptive_list_load if subset_mode is 'flatfile': # in this case the sequence file MUST be multifasta try: subset = load_multifasta(infile) except: raise else: print "set of", len(subset), "sequence segments" subset_file = infile else: # load the query single sequence file (convert format if necessary) try: seq_record = surefmt_load(infile, 'fasta', 'generic_dna') except: raise else: print "query sequence loaded from", infile # load or generate coordinate pairs for target segments if subset_mode is 'coordinates': try: coords_file = subset_args['file'] header = subset_args['header'] columns = subset_args['columns'] coords_list = adaptive_list_load(coords_file, header, columns) except: raise else: print len(coords_list), "segments loaded from", infile elif subset_mode is 'features': try: feat_mode = subset_args features = feat_collect(infile, feat_mode) coords_list = feature_coords(features) print coords_list except: raise else: print len(coords_list),"features loaded from", infile elif subset_mode is 'size': try: size = subset_args['size'] chop_mode = subset_args['chop_mode'] coords_list = coord_chop(len(seq_record.seq), size, chop_mode) except: raise else: print len(coords_list), "segments generated to fit", size else: print "ERROR: A mode MUST be specified." 
coords_list = None # collect subset of sequence segments using resulting coords_list try: subset = get_seq_subset_by_coords(seq_record, coords_list) except: raise else: print "subset of", len(subset), "sequence segments" # save subset to multifasta file for later use or reference subset_file = seq_record.id+'_subset.fas' try: write_fasta(subset_file, subset) except: raise else: print "subset written to fasta file", subset_file return subset, subset_file
def test_adaptive_list_load(self):
    """adaptive_list_load keeps only the requested columns of each row."""
    loaded = text_manipulation.adaptive_list_load(
        self.filename, 0, (1, 3))
    # fixture file yields four data rows
    self.assertEqual(len(loaded), 4)
    # second kept column of row 1 maps back to source column 3
    self.assertEqual(loaded[1][1], self.line_1_contents[3])