def test_min_scores_filter(self):
    '''We can keep only the hits whose score passes the given threshold.

    Two ``score_threshold`` configurations are run against ``blast.xml``:
    one bounding the ``expect`` score from above (``max_score``) and one
    bounding the ``similarity`` score from below (``min_score``). For each
    one the per-query match summary is compared with the expected counts.
    '''
    # Each case pairs a filter configuration with the expected match summary.
    cases = [
        # with evalue
        ([{'kind': 'score_threshold',
           'score_key': 'expect',
           'max_score': 1e-34}],
         {'cCL1Contig2': 2, 'cCL1Contig3': 0,
          'cCL1Contig4': 2, 'cCL1Contig5': 2}),
        # with similarity
        ([{'kind': 'score_threshold',
           'score_key': 'similarity',
           'min_score': 92}],
         {'cCL1Contig2': 0, 'cCL1Contig3': 0,
          'cCL1Contig4': 1, 'cCL1Contig5': 2}),
    ]
    # The filtered alignments are consumed inside the with block, so the
    # file handle is still open while they are read and is always closed
    # afterwards, even when a check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        for filters, expected in cases:
            # The same handle is handed to a fresh parser for every case,
            # exactly as the test did before.
            blasts = BlastParser(fhand=blast_file)
            filtered_blasts = filter_alignments(blasts, config=filters)
            match_summary = _summarize_matches(filtered_blasts)
            _check_match_summary(match_summary, expected)
def similar_sequences_for_blast(blast_fhand, filters=None):
    """Look for the sequences similar to the query in a blast result.

    Only the first alignment that survives the filters is inspected.

    Parameters:
        blast_fhand: blast output handed to the "blast+" alignment parser.
        filters: filter configuration passed to ``filter_alignments``. When
            None, matches need a similarity of at least 90 and at least 100
            residues measured in the query.

    Returns:
        A list with one dict per match of that first alignment, holding the
        subject name and the subject/query start and end positions. The list
        is empty when no alignment passes the filters.
    """
    # now we parse the blast
    blast_parser = get_alignment_parser("blast+")
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [
            {"kind": "score_threshold", "score_key": "similarity", "min_score": 90},
            {"kind": "min_length", "min_num_residues": 100, "length_in_query": True},
        ]
    alignments = filter_alignments(blast_result, config=filters)

    # The builtin next() works with both Python 2 (>= 2.6) and Python 3
    # iterators, unlike the Python 2 only ``.next()`` method.
    try:
        alignment = next(alignments)
    except StopIteration:
        return []

    # to which sequences is our query similar?
    return [
        {
            "name": match["subject"].name,
            "subject_start": match["subject_start"],
            "subject_end": match["subject_end"],
            "query_start": match["start"],
            "query_end": match["end"],
        }
        for match in alignment["matches"]
    ]
def similar_sequences_for_blast(blast_fhand, filters=None):
    '''It looks for similar sequences in a blast result.

    The blast output in blast_fhand is parsed with the 'blast+' alignment
    parser and filtered with the given filters. When filters is None the
    matches must have a similarity of at least 90 and span at least 100
    residues of the query.

    Only the first alignment that passes the filters is used. It returns a
    list with a dict for every match in that alignment (subject name plus the
    subject and query limits), or an empty list if nothing passes the
    filters.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [{'kind': 'score_threshold',
                    'score_key': 'similarity',
                    'min_score': 90},
                   {'kind': 'min_length',
                    'min_num_residues': 100,
                    'length_in_query': True}]
    alignments = filter_alignments(blast_result, config=filters)

    # next() is the portable spelling: the .next() method only exists in
    # Python 2 iterators.
    try:
        alignment = next(alignments)
    except StopIteration:
        return []

    #to which sequences is our query similar?
    return [{'name': match['subject'].name,
             'subject_start': match['subject_start'],
             'subject_end': match['subject_end'],
             'query_start': match['start'],
             'query_end': match['end']}
            for match in alignment['matches']]
def get_hit_pairs_fom_blast(blast_fhand, sub_def_as_acc=None, filters=None):
    '''It yields (query, subject) tuples for the hits found in the blast.

    The blast is parsed with BlastParser and filtered with the given filters.
    When filters is None the best_scores filter on the expect score is used
    (max_score 1e-20, score_tolerance 10).

    Both names are taken from the id attribute of the sequence, falling back
    to its name attribute, and cut at the first whitespace.
    '''
    def short_name(identifier):
        'It returns the first whitespace separated word'
        return identifier.split()[0]

    blasts = BlastParser(fhand=blast_fhand,
                         subj_def_as_accesion=sub_def_as_acc)
    if filters is None:
        best_scores = {'kind': 'best_scores',
                       'score_key': 'expect',
                       'max_score': 1e-20,
                       'score_tolerance': 10}
        filters = [best_scores]

    for alignment in filter_alignments(blasts, config=filters):
        # the query name is resolved once per alignment, before its hits
        try:
            query = alignment['query'].id
        except AttributeError:
            query = alignment['query'].name
        query = short_name(query)

        for hit in alignment['matches']:
            try:
                subject = hit['subject'].id
            except AttributeError:
                subject = hit['subject'].name
            yield query, short_name(subject)
def unique_contiguous_region_filter(sequence):
    '''It flags the snvs located in repeated or discontiguous regions.

    For every snv feature of the sequence that has no stored
    'uniq_contiguous' result for the current distance, the fragment that
    spans distance residues to each side of the snv is blasted. The stored
    result is:
        - False when no alignment passes the match filters (the region is
          assumed to be unique).
        - True when the first alignment has more than one match.
        - For a single match, True only if introns are inferred for the
          fragment.

    It returns the same sequence it was given, or None for a None sequence.
    distance, blast_runner, blast_parser, match_filters, genomic_seqs_index
    and genomic_db are not parameters; they are taken from the enclosing
    scope.
    '''
    if sequence is None:
        return None

    for snv in sequence.get_features(kind='snv'):
        # Check if it is already done
        previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                             threshold=distance)
        if previous_result is not None:
            continue

        #we make a blast with the sequence around the snv
        location = snv.location.start.position
        # the fragment can not start before the beginning of the sequence
        start = max(location - distance, 0)
        end = location + distance
        seq_fragment = sequence[start:end]
        blast_fhand = blast_runner(seq_fragment)['blastn']

        # The finally clause guarantees that the blast output is closed even
        # if the parsing or the intron inference fail.
        try:
            #now we parse the blast
            blast_result = blast_parser(blast_fhand)
            alignments = filter_alignments(blast_result,
                                           config=match_filters)
            #are there any similar sequences?
            # next() is used instead of the Python 2 only .next() method
            try:
                alignment = next(alignments)
            except StopIteration:
                #if there is no similar sequence we assume that is unique
                result = False
            else:
                #how many matches, it should be only one
                if len(alignment['matches']) > 1:
                    result = True
                else:
                    #how many match parts have the first match?
                    #we could do it with the blast result, but blast is not
                    #very good aligning, so the introns are inferred from a
                    #new alignment (est2genome according to the original
                    #author's note; not verifiable from this function)
                    blast_fhand.seek(0)
                    sim_seqs = similar_sequences_for_blast(blast_fhand)
                    sim_seq = sim_seqs[0] if sim_seqs else None
                    introns = infer_introns_for_cdna(
                                        sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                    result = bool(introns)
        finally:
            blast_fhand.close()

        _add_filter_result(snv, 'uniq_contiguous', result, distance)
    return sequence
def do_alignment(self, query):
    '''It aligns the query against every stored subject.

    Each subject in self._subjects is aligned with sw_align. The alignments
    are returned as a list when self._filters is None; otherwise the result
    of filter_alignments for those filters is returned.
    '''
    alignments = [sw_align(query, subject) for subject in self._subjects]
    if self._filters is None:
        return alignments
    return filter_alignments(alignments, config=self._filters)
def do_alignment(self, query):
    '''It runs the aligner with the query and returns the parsed alignments.

    The output for self._program is taken from the aligner result and parsed
    with self._parser. The parsed alignments are returned as they are when
    self._filters is None; otherwise the result of filter_alignments for
    those filters is returned.
    '''
    aligner_output = self._aligner(query)[self._program]

    # We need to parse the result
    parsed = self._parser(aligner_output)

    # Without filters the parsed alignments are handed back untouched
    if self._filters is None:
        return parsed
    return filter_alignments(parsed, config=self._filters)
def get_hit_pairs_from_blast(blast_fhand, sub_def_as_acc=None, filters=None):
    """Return the query-subject pairs for the hits found in a blast.

    The blast is parsed with BlastParser, filtered, and the filtered
    alignments are handed to get_pairs_from_alignments, whose result is
    returned unchanged.

    When filters is None the best_scores filter on the expect score is used
    (max_score 1e-20, score_tolerance 10).
    """
    parser = BlastParser(fhand=blast_fhand, subj_def_as_accesion=sub_def_as_acc)
    if filters is None:
        best_scores = {
            "kind": "best_scores",
            "score_key": "expect",
            "max_score": 1e-20,
            "score_tolerance": 10,
        }
        filters = [best_scores]
    return get_pairs_from_alignments(filter_alignments(parser, config=filters))
def test_min_length_filter(self):
    '''We can keep only the hits whose length passes the given minimum.

    Three ``min_length`` configurations are run against ``blast.xml``: a
    minimum number of residues, a minimum percentage measured in the query
    and a minimum percentage measured in the subject. For each one the
    per-query match summary is compared with the expected counts.
    '''
    # Each case pairs a filter configuration with the expected match summary.
    cases = [
        # with the min length given in base pairs
        ([{'kind': 'min_length',
           'min_num_residues': 500,
           'length_in_query': True}],
         {'cCL1Contig2': 3, 'cCL1Contig3': 0,
          'cCL1Contig4': 1, 'cCL1Contig5': 1}),
        # with the min length given in query %
        ([{'kind': 'min_length',
           'min_percentage': 70,
           'length_in_query': True}],
         {'cCL1Contig2': 0, 'cCL1Contig3': 0,
          'cCL1Contig4': 2, 'cCL1Contig5': 0}),
        # with the min length given in subject %
        ([{'kind': 'min_length',
           'min_percentage': 0.002,
           'length_in_query': False}],
         {'cCL1Contig2': 3, 'cCL1Contig3': 0,
          'cCL1Contig4': 1, 'cCL1Contig5': 2}),
    ]
    # The filtered alignments are consumed inside the with block, so the
    # file handle is still open while they are read and is always closed
    # afterwards, even when a check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        for filters, expected in cases:
            # The same handle is handed to a fresh parser for every case,
            # exactly as the test did before.
            blasts = BlastParser(fhand=blast_file)
            filtered_blasts = filter_alignments(blasts, config=filters)
            match_summary = _summarize_matches(filtered_blasts)
            _check_match_summary(match_summary, expected)
def test_best_scores_filter(self):
    '''We can keep the hits with the best expects.

    The ``best_scores`` filter is run on the ``expect`` score of
    ``blast.xml`` (max_score 1e-4, score_tolerance 10) and the per-query
    match summary is compared with the expected counts.
    '''
    filters = [{'kind': 'best_scores',
                'score_key': 'expect',
                'max_score': 1e-4,
                'score_tolerance': 10}]
    expected = {'cCL1Contig2': 2, 'cCL1Contig3': 1,
                'cCL1Contig4': 1, 'cCL1Contig5': 2}
    # The filtered alignments are consumed inside the with block, so the
    # file handle is still open while they are read and is always closed
    # afterwards, even when the check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)
def _filter(sequence):
    '''Given a sequence it returns True or False depending on the alignment.

    The sequence is aligned with run_align_for_seq, the output for
    aligner_cmd is parsed and the alignments are filtered with
    match_filters. It returns True when at least one alignment passes the
    filters and False when none does or when the sequence is None.

    run_align_for_seq, aligner_cmd, parser and match_filters are not
    parameters; they are taken from the enclosing scope.
    '''
    if sequence is None:
        return False

    source_result = run_align_for_seq(sequence)[aligner_cmd]
    results = parser(source_result)
    filtered_results = filter_alignments(results, config=match_filters)
    try:
        #only one sequence -> only one result
        # next() is used instead of the Python 2 only .next() method
        next(filtered_results)
    except StopIteration:
        #there was no result for this sequence
        return False
    return True
def test_blast_no_result(self):
    '''It tests that the xml output can be an empty string.

    An empty temporary file is parsed and filtered; asking the filtered
    result for its first alignment must raise StopIteration.
    '''
    filters = [{'kind': 'best_scores',
                'score_key': 'expect',
                'max_score': 1e-4,
                'score_tolerance': 10}]
    # The with block closes (and so removes) the temporary file once the
    # check is done, whatever its outcome.
    with NamedTemporaryFile() as blast_file:
        blasts = BlastParser(fhand=blast_file)
        filt_b = filter_alignments(blasts, config=filters)
        try:
            # next() is used instead of the Python 2 only .next() method
            next(filt_b)
        except StopIteration:
            pass
        else:
            self.fail('An empty blast should not yield any alignment')
def _get_descriptions_from_blasts(blasts):
    '''It gets a description for every query from a list of blast outputs.

    Every item in blasts is indexed with the 'blast' key to get the blast
    output and can carry a 'modifier' key with a function that is applied to
    the description (the original note says it is used to remove trash from
    the descriptions found in the blast xml).

    The blasts are looked at in the given order: the description of a query
    is taken from the first blast in which it has one, later blasts do not
    overwrite it. Only the first match of each filtered alignment is used
    and the "<unknown description>" descriptions are ignored.

    It returns a dict indexed by query with dicts that hold the description,
    the db_name and the subj_name. An ExpatError raised while the filtered
    alignments are read is raised again with the blast file name added.
    '''
    best_scores = [{'kind': 'best_scores',
                    'score_key': 'expect',
                    'max_score': 1e-20,
                    'score_tolerance': 10}]
    annotations = {}
    for blast_spec in blasts:
        blast_fhand = blast_spec['blast']
        modifier = blast_spec['modifier'] if 'modifier' in blast_spec else None
        blast_fhand = get_fhand(blast_fhand)

        parser = BlastParser(fhand=blast_fhand)
        alignments = filter_alignments(parser, config=best_scores)
        # the db name is asked for before the alignments are consumed
        db_name = parser.db_name
        try:
            for alignment in alignments:
                try:
                    query = alignment['query'].id
                except AttributeError:
                    query = alignment['query'].name
                # an earlier blast already described this query
                if query in annotations:
                    continue

                first_hit = alignment['matches'][0]
                description = first_hit['subject'].description
                subject_name = first_hit['subject'].name
                if modifier is not None:
                    description = modifier(description)
                if description == "<unknown description>":
                    continue
                annotations[query] = {'description': description.strip(),
                                      'db_name': db_name,
                                      'subj_name': subject_name}
        except ExpatError as error:
            msg = str(error) + ':%s' % blast_fhand.name
            raise ExpatError(msg)
    return annotations
def similar_sequences_for_blast(blast_fhand, filters):
    '''It looks for similar sequences in a blast result.

    The blast output in blast_fhand is parsed with the 'blast+' alignment
    parser and filtered with the given filters.

    It yields a dict for every match of every filtered alignment with the
    subject name and description, the query name, the subject and query
    limits and the expect, identity and similarity scores. Each score is
    given as a string, or as None when the match does not have it.
    '''
    def score_as_str(scores, key):
        'It returns the score as a string or None if it is not available'
        return str(scores[key]) if key in scores else None

    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    alignments = filter_alignments(blast_result, config=filters)
    for alignment in alignments:
        query_name = alignment['query'].name
        for match in alignment['matches']:
            # The leftover debug print of every match was removed: it wrote
            # to stdout for each hit and is not valid Python 3 syntax.
            #to which sequence our query is similar?
            subject = match['subject']
            scores = match['scores']
            yield {'name': subject.name,
                   'subject_description': subject.description,
                   'query_name': query_name,
                   'subject_start': match['subject_start'],
                   'subject_end': match['subject_end'],
                   'query_start': match['start'],
                   'query_end': match['end'],
                   'evalue': score_as_str(scores, 'expect'),
                   'identity': score_as_str(scores, 'identity'),
                   'similarity': score_as_str(scores, 'similarity')}
def test_min_score_mapper(self):
    '''We keep the matches with the scores above the threshold.

    A score_threshold filter with a min_score of 100 is applied to two
    alignments. Only the first one is expected back, holding just its first
    match with its two best match parts and the limits shrunk accordingly.
    '''
    def scored_part(score, start, end):
        'A match part with the same limits in the query and in the subject'
        return {'scores': {'score': score},
                'query_start': start, 'query_end': end,
                'subject_start': start, 'subject_end': end}

    def single_part_match(score):
        'A match without limits whose only part has the same score'
        return {'scores': {'score': score},
                'match_parts': [{'scores': {'score': score}}]}

    score_filter = {'kind': 'score_threshold',
                    'score_key': 'score',
                    'min_score': 100}

    big_match = {'scores': {'score': 400},
                 'start': 0, 'end': 100,
                 'subject_start': 0, 'subject_end': 100,
                 'match_parts': [scored_part(400, 0, 10),
                                 scored_part(300, 30, 40),
                                 scored_part(50, 50, 60),
                                 scored_part(40, 80, 100)]}
    first_alignment = {'matches': [big_match,
                                   single_part_match(20),
                                   single_part_match(90)]}
    second_alignment = {'matches': [single_part_match(20)]}

    filtered = list(filter_alignments([first_alignment, second_alignment],
                                      config=[score_filter]))

    # the expected result is built from scratch, it shares nothing with the
    # input alignments
    expected = {'matches': [{'scores': {'score': 400},
                             'start': 0, 'end': 40,
                             'subject_start': 0, 'subject_end': 40,
                             'match_parts': [scored_part(400, 0, 10),
                                             scored_part(300, 30, 40)]}]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1
def test_max_score_mapper(self):
    '''We keep the matches with the expects up to the max score.

    A best_scores filter on the expect with a max_score of 1e-3 is applied
    to two alignments. Only the first one is expected back, holding its
    first two matches; the first match keeps three match parts and its
    limits are shrunk accordingly.
    '''
    def expect_part(expect, start, end):
        'A match part with the same limits in the query and in the subject'
        return {'scores': {'expect': expect},
                'query_start': start, 'query_end': end,
                'subject_start': start, 'subject_end': end}

    def single_part_match(expect):
        'A match without limits whose only part has the same expect'
        return {'scores': {'expect': expect},
                'match_parts': [{'scores': {'expect': expect}}]}

    expect_filter = {'kind': 'best_scores',
                     'score_key': 'expect',
                     'max_score': 1e-3}

    big_match = {'scores': {'expect': 1e-4},
                 'start': 0, 'end': 100,
                 'subject_start': 0, 'subject_end': 100,
                 'match_parts': [expect_part(1e-4, 0, 10),
                                 expect_part(5e-4, 30, 40),
                                 expect_part(1e-3, 50, 60),
                                 expect_part(1e-2, 80, 100)]}
    first_alignment = {'matches': [big_match,
                                   single_part_match(1e-3),
                                   single_part_match(1e-2)]}
    second_alignment = {'matches': [single_part_match(1e-2)]}

    filtered = list(filter_alignments([first_alignment, second_alignment],
                                      config=[expect_filter]))

    # the expected result is built from scratch, it shares nothing with the
    # input alignments
    expected = {'matches': [{'scores': {'expect': 1e-4},
                             'start': 0, 'end': 60,
                             'subject_start': 0, 'subject_end': 60,
                             'match_parts': [expect_part(1e-4, 0, 10),
                                             expect_part(5e-4, 30, 40),
                                             expect_part(1e-3, 50, 60)]},
                            single_part_match(1e-3)]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1
def test_min_length_mapper(self):
    '''We can filter the matches according to their length.

    The min_length filter is checked in four ways: with a minimum number of
    residues measured in the query and in the subject, and with a minimum
    percentage measured in the query and in the subject.
    '''
    def part(query_end, subject_end):
        'A match part that starts at 0 in both the query and the subject'
        return {'query_start': 0, 'query_end': query_end,
                'subject_start': 0, 'subject_end': subject_end}

    def limits(end, subject_end):
        'The limits expected for a match that starts at 0'
        return {'start': 0, 'end': end,
                'subject_start': 0, 'subject_end': subject_end}

    # minimum number of residues, measured in the query
    length_filter = {'kind': 'min_length',
                     'min_num_residues': 100,
                     'length_in_query': True}
    alignments = [{'matches': [{'match_parts': [part(100, 100)]},
                               {'match_parts': [part(50, 100)]}]}]
    filtered = list(filter_alignments(alignments, config=[length_filter]))
    expected = {'matches': [limits(100, 100)]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1

    # minimum number of residues, measured in the subject; the very same
    # alignments used in the previous check are filtered again
    length_filter = {'kind': 'min_length',
                     'min_num_residues': 100,
                     'length_in_query': False}
    filtered = list(filter_alignments(alignments, config=[length_filter]))
    expected = {'matches': [limits(100, 100), limits(50, 100)]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1

    # minimum percentage, measured in the query
    length_filter = {'kind': 'min_length',
                     'min_percentage': 90,
                     'length_in_query': True}
    alignments = [{'query': UnknownSeq(100),
                   'matches': [{'match_parts': [part(90, 100)]},
                               {'match_parts': [part(50, 100)]}]}]
    filtered = list(filter_alignments(alignments, config=[length_filter]))
    expected = {'matches': [limits(90, 100)]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1

    # minimum percentage, measured in the subject
    length_filter = {'kind': 'min_length',
                     'min_percentage': 90,
                     'length_in_query': False}
    alignments = [{'matches': [{'subject': UnknownSeq(100),
                                'match_parts': [part(100, 90)]},
                               {'subject': UnknownSeq(100),
                                'match_parts': [part(100, 89)]}]}]
    filtered = list(filter_alignments(alignments, config=[length_filter]))
    expected = {'matches': [limits(100, 90)]}
    _check_blast(filtered[0], expected)
    assert len(filtered) == 1