def predict(self):
    """Populate ``self.h2_prediction`` from the BLAST results for H2.

    The BLAST results are obtained through
    ``self.get_antigen_gene_blast_results`` using ``FLJB_FASTA_PATH``.
    When the result is flagged missing, or no hit survives the filters
    below, ``h2`` is set to ``'-'``.  Otherwise ``h2`` is derived from the
    ``qseqid`` of the selected top result via ``get_antigen_name``.

    For results that are not a perfect match, the top result is re-selected:

    * a top hit of length <= 600 with either < 88.0 %ID or no truncation
      flag is treated as missing;
    * remaining hits must have <= 50 mismatches and length >= 700;
    * hits with <= 5 mismatches and length >= 1000 are preferred (fewest
      mismatches first), otherwise the highest bitscore wins.
    """
    self.h2_prediction = self.get_antigen_gene_blast_results(
        self.h2_prediction, FLJB_FASTA_PATH)
    prediction = self.h2_prediction

    # Nothing was found: report H2 as '-'.
    if prediction.is_missing:
        prediction.h2 = '-'
        return

    if not prediction.is_perfect_match:
        initial_hit = prediction.top_result
        aln_length = initial_hit['length']
        percent_identity = initial_hit['pident']

        # Short matches are treated as missing ('-') for H2 when they are
        # either low %ID or not flagged as truncated.
        if aln_length <= 600 and (
                percent_identity < 88.0 or not prediction.is_trunc):
            prediction.h2 = '-'
            prediction.is_missing = True
            return

        all_hits = pd.DataFrame(prediction.blast_results)
        acceptable = (all_hits['mismatch'] <= 50) & (all_hits['length'] >= 700)
        candidates = all_hits[acceptable]

        if candidates.empty:
            prediction.is_missing = True
            prediction.top_result = None
            prediction.h2 = '-'
            return

        near_exact = (candidates['mismatch'] <= 5) & (candidates['length'] >= 1000)
        long_hits = candidates[near_exact]

        if long_hits.empty:
            ranked = candidates.sort_values(by='bitscore', ascending=False)
        else:
            ranked = long_hits.sort_values(by='mismatch')

        best = BlastReader.df_first_row_to_dict(ranked)
        coord_keys = ('qstart', 'qend', 'sstart', 'send', 'qlen', 'slen')
        truncated = BlastReader.is_blast_result_trunc(
            **{key: best[key] for key in coord_keys})

        prediction.top_result = best
        prediction.is_trunc = truncated

    prediction.h2 = get_antigen_name(prediction.top_result['qseqid'])
def test_BlastReader_is_blast_result_trunc():
    """Check ``BlastReader.is_blast_result_trunc`` on a 100 bp query
    against a 1000 bp subject for complete and truncated placements."""
    is_trunc = BlastReader.is_blast_result_trunc
    query_len = 100
    subject_len = 1000

    # (qstart, qend, sstart, send) for hits expected NOT to be truncated.
    not_truncated = [
        # match found in the middle of the subject sequence
        (1, 100, 101, 200),
        # shorter match (-10bp) found in the middle of the subject sequence
        (1, 90, 101, 190),
        # shorter match (-20bp) found in the middle of the subject sequence
        (1, 80, 101, 180),
    ]
    for q_from, q_to, s_from, s_to in not_truncated:
        assert not is_trunc(qstart=q_from, qend=q_to,
                            sstart=s_from, send=s_to,
                            qlen=query_len, slen=subject_len)

    # (qstart, qend, sstart, send) for hits expected to be truncated.
    truncated = [
        # truncated at the start of the subject
        (51, 100, 1, 50),
        # truncated at the end of the subject
        (51, 100, 951, 1000),
    ]
    for q_from, q_to, s_from, s_to in truncated:
        assert is_trunc(qstart=q_from, qend=q_to,
                        sstart=s_from, send=s_to,
                        qlen=query_len, slen=subject_len)
def predict(self, filter=None):
    """Populate ``self.h1_prediction`` from the BLAST results for H1.

    The BLAST results are obtained through
    ``self.get_antigen_gene_blast_results`` using ``FLIC_FASTA_PATH``.
    Nothing further is done when the result is flagged missing or has no
    top result.  Otherwise ``h1`` is derived from the ``qseqid`` of the
    selected top result via ``get_antigen_name``.

    For results that are not a perfect match, the top result is re-selected:

    * hits must have <= 25 mismatches and length >= 700; if none qualify,
      hits with 0 mismatches and length >= 400 are accepted instead;
    * if still nothing qualifies the prediction is marked missing and both
      ``top_result`` and ``h1`` are set to ``None``;
    * hits with <= 5 mismatches and length >= 1000 are preferred (fewest
      mismatches first), otherwise the highest bitscore wins.

    Args:
        filter: Value forwarded as the third argument of
            ``get_antigen_gene_blast_results``.  Defaults to ``['N/A']``
            (a fresh list per call).  The name shadows the builtin but is
            kept so existing keyword callers continue to work.
    """
    # Build the default per call instead of sharing one mutable list
    # object between every invocation.
    if filter is None:
        filter = ['N/A']

    self.h1_prediction = self.get_antigen_gene_blast_results(
        self.h1_prediction, FLIC_FASTA_PATH, filter)
    prediction = self.h1_prediction

    if prediction.is_missing or prediction.top_result is None:
        return

    if not prediction.is_perfect_match:
        all_hits = pd.DataFrame(prediction.blast_results)
        candidates = all_hits[
            (all_hits['mismatch'] <= 25) & (all_hits['length'] >= 700)]

        if candidates.empty:
            # Fall back to shorter hits, but only if they match exactly.
            candidates = all_hits[
                (all_hits['mismatch'] <= 0) & (all_hits['length'] >= 400)]
            if candidates.empty:
                prediction.is_missing = True
                prediction.top_result = None
                prediction.h1 = None
                return

        long_hits = candidates[
            (candidates['mismatch'] <= 5) & (candidates['length'] >= 1000)]

        if long_hits.empty:
            ranked = candidates.sort_values(by='bitscore', ascending=False)
        else:
            ranked = long_hits.sort_values(by='mismatch')

        best = BlastReader.df_first_row_to_dict(ranked)
        truncated = BlastReader.is_blast_result_trunc(
            qstart=best['qstart'],
            qend=best['qend'],
            sstart=best['sstart'],
            send=best['send'],
            qlen=best['qlen'],
            slen=best['slen'])

        prediction.top_result = best
        prediction.is_trunc = truncated

    prediction.h1 = get_antigen_name(prediction.top_result['qseqid'])