def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna, region, feature_count, results, features, feature_list, removed_results, line, file_loc):
    '''
    Record a hit that lies at a known position, or set it aside.

    The orientation ('F' or 'R') and the start/end of the region are
    derived by comparing y_L with x_R. Two features built by
    createFeature are appended to genbank.features, then
    check_seq_between is asked about the sequence between start and end.

    Outcome, keyed by 'region_<region>':
      * non-empty seq_results whose first two values are both >= 80:
        a row labelled 'Known' ('Known?' when file_loc contains
        'unpaired') is stored in results;
      * non-empty seq_results below that threshold: a row labelled
        'Possible releated IS' ('Possible related IS?' when file_loc
        contains 'unpaired') is stored in results;
      * empty seq_results: the stripped input line plus file_loc is
        stored in removed_results and nothing is added to results.

    genbank, results and removed_results are modified in place; the
    function returns None. feature_count is accepted but not used here.
    '''
    # Orientation decides which pair of coordinates bounds the region
    if y_L < x_R:
        start, end, orient = y_L, x_R, 'F'
    else:
        start, end, orient = y_R, x_L, 'R'

    # Annotate both flanks on the genbank record
    left_feature, right_feature = createFeature([x_L, y_L, x_R, y_R], orient, 'Known hit')
    genbank.features.append(left_feature)
    genbank.features.append(right_feature)

    # Check whether the sequence between the flanks belongs to the IS query
    region_key = 'region_' + str(region)
    seq_results = check_seq_between(ref, seq, start, end, region_key, temp)
    has_match = len(seq_results) != 0
    # Presumably the two values are %ID and coverage - confirm against
    # check_seq_between
    confident = has_match and seq_results[0] >= 80 and seq_results[1] >= 80

    # Flanking genes are looked up whichever way the hit is classified
    gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna)

    if confident:
        # Same gene on both sides: the gene is interrupted by the known
        # site, so drop the trailing character (described upstream as the
        # +/- sign) from both distance strings
        if gene_left[0] == gene_right[0]:
            gene_right[1] = gene_right[1][:-1]
            gene_left[1] = gene_left[1][:-1]
        call = 'Known?' if 'unpaired' in file_loc else 'Known'
    else:
        # NOTE(review): 'releated' looks like a typo, but the label is part
        # of the emitted table, so it is kept exactly as it was
        call = 'Possible related IS?' if 'unpaired' in file_loc else 'Possible releated IS'
        if not has_match:
            # Nothing matched at all: keep the raw line for the removed table
            removed_results[region_key] = line.strip() + '\t' + file_loc + '\n'
            return

    # The functional prediction column is always left empty
    func_pred = ''
    results[region_key] = [
        orient, str(start), str(end), gap, call,
        str(seq_results[0]), '%.2f' % seq_results[1],
        gene_left[-1][:-1], gene_left[-1][-1], gene_left[1],
        gene_right[-1][:-1], gene_right[-1][-1], gene_right[1],
        func_pred,
    ]
def novel_hit(x_L, y_L, x_R, y_R, x, y, genbank, ref, cds, trna, rrna, gap, orient, feature_count, region, results, features, feature_list, unpaired=False, star=False):
    '''
    Get flanking gene information for a novel hit and store its table row.

    Two features built by createFeature (annotated with a note describing
    the hit) are appended to genbank.features, the genes flanking x and y
    are looked up with get_flanking_genes, and a row is written to
    results under the key 'region_<region>'.

    unpaired and star are treated as truthy/falsy flags:
      * star     -> call 'Novel*' (takes precedence over unpaired)
      * unpaired -> call 'Novel?'
      * neither  -> call 'Novel'

    genbank and results are modified in place; the function returns None.
    ref and feature_count are accepted but not used here.
    '''
    # Build the genbank note describing this hit
    note = 'Novel hit'
    if unpaired:
        note += ' , unpaired hit'
    if star:
        note += ' , imprecise hit'

    # Add a feature for each flank to the genbank record
    left_feature, right_feature = createFeature([x_L, y_L, x_R, y_R], orient, note)
    genbank.features.append(left_feature)
    genbank.features.append(right_feature)

    # Get the genes flanking the left and right ends
    gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna)

    # The functional prediction column is always emitted empty. The earlier
    # 'Gene interrupted' detection was overwritten before it was ever used,
    # so that dead comparison is not performed any more.
    func_pred = ''

    # Every combination of flags yields a call, so 'call' is never unbound
    if star:
        # Imprecise hit: gap size is larger than expected
        call = 'Novel*'
    elif unpaired:
        # Paired with a low coverage end, so an unconfident hit
        call = 'Novel?'
    else:
        # Confident hit
        call = 'Novel'

    # Store all information for final table output
    results['region_' + str(region)] = [
        orient, str(x), str(y), gap, call, '', '',
        gene_left[-1][:-1], gene_left[-1][-1], gene_left[1],
        gene_right[-1][:-1], gene_right[-1][-1], gene_right[1],
        func_pred,
    ]
def add_known(x_L, x_R, y_L, y_R, gap, genbank, ref, seq, temp, cds, trna, rrna, region, feature_count, results, features, feature_list, removed_results, line, file_loc):
    '''
    Record a hit that lies at a known position, or set it aside.

    The orientation ('F' or 'R') and the start/end of the region are
    derived by comparing y_L with x_R. Two features built by
    createFeature are appended to genbank.features, then
    check_seq_between is asked about the sequence between start and end.

    Outcome, keyed by 'region_<region>':
      * non-empty seq_results whose first two values are both >= 80:
        a row labelled 'Known' ('Known?' when file_loc contains
        'unpaired') is stored in results;
      * non-empty seq_results below that threshold: a row labelled
        'Possible releated IS' ('Possible related IS?' when file_loc
        contains 'unpaired') is stored in results;
      * empty seq_results: the stripped input line plus file_loc is
        stored in removed_results and nothing is added to results.

    genbank, results and removed_results are modified in place; the
    function returns None. feature_count is accepted but not used here.
    '''
    # Orientation decides which pair of coordinates bounds the region
    if y_L < x_R:
        start, end, orient = y_L, x_R, 'F'
    else:
        start, end, orient = y_R, x_L, 'R'

    # Annotate both flanks on the genbank record
    left_feature, right_feature = createFeature([x_L, y_L, x_R, y_R], orient, 'Known hit')
    genbank.features.append(left_feature)
    genbank.features.append(right_feature)

    # Check whether the sequence between the flanks belongs to the IS query
    region_key = 'region_' + str(region)
    seq_results = check_seq_between(ref, seq, start, end, region_key, temp)
    has_match = len(seq_results) != 0
    # Presumably the two values are %ID and coverage - confirm against
    # check_seq_between
    confident = has_match and seq_results[0] >= 80 and seq_results[1] >= 80

    # Flanking genes are looked up whichever way the hit is classified
    gene_left, gene_right = get_flanking_genes(features, feature_list, start, end, cds, trna, rrna)

    if confident:
        # Same gene on both sides: the gene is interrupted by the known
        # site, so drop the trailing character (described upstream as the
        # +/- sign) from both distance strings
        if gene_left[0] == gene_right[0]:
            gene_right[1] = gene_right[1][:-1]
            gene_left[1] = gene_left[1][:-1]
        call = 'Known?' if 'unpaired' in file_loc else 'Known'
    else:
        # NOTE(review): 'releated' looks like a typo, but the label is part
        # of the emitted table, so it is kept exactly as it was
        call = 'Possible related IS?' if 'unpaired' in file_loc else 'Possible releated IS'
        if not has_match:
            # Nothing matched at all: keep the raw line for the removed table
            removed_results[region_key] = line.strip() + '\t' + file_loc + '\n'
            return

    # The functional prediction column is always left empty
    func_pred = ''
    results[region_key] = [
        orient, str(start), str(end), gap, call,
        str(seq_results[0]), '%.2f' % seq_results[1],
        gene_left[-1][:-1], gene_left[-1][-1], gene_left[1],
        gene_right[-1][:-1], gene_right[-1][-1], gene_right[1],
        func_pred,
    ]
def novel_hit(x_L, y_L, x_R, y_R, x, y, genbank, ref, cds, trna, rrna, gap, orient, feature_count, region, results, features, feature_list, unpaired=False, star=False):
    '''
    Get flanking gene information for a novel hit and store its table row.

    Two features built by createFeature (annotated with a note describing
    the hit) are appended to genbank.features, the genes flanking x and y
    are looked up with get_flanking_genes, and a row is written to
    results under the key 'region_<region>'.

    unpaired and star are treated as truthy/falsy flags:
      * star     -> call 'Novel*' (takes precedence over unpaired)
      * unpaired -> call 'Novel?'
      * neither  -> call 'Novel'

    genbank and results are modified in place; the function returns None.
    ref and feature_count are accepted but not used here.
    '''
    # Build the genbank note describing this hit
    note = 'Novel hit'
    if unpaired:
        note += ' , unpaired hit'
    if star:
        note += ' , imprecise hit'

    # Add a feature for each flank to the genbank record
    left_feature, right_feature = createFeature([x_L, y_L, x_R, y_R], orient, note)
    genbank.features.append(left_feature)
    genbank.features.append(right_feature)

    # Get the genes flanking the left and right ends
    gene_left, gene_right = get_flanking_genes(features, feature_list, x, y, cds, trna, rrna)

    # The functional prediction column is always emitted empty. The earlier
    # 'Gene interrupted' detection was overwritten before it was ever used,
    # so that dead comparison is not performed any more.
    func_pred = ''

    # Every combination of flags yields a call, so 'call' is never unbound
    if star:
        # Imprecise hit: gap size is larger than expected
        call = 'Novel*'
    elif unpaired:
        # Paired with a low coverage end, so an unconfident hit
        call = 'Novel?'
    else:
        # Confident hit
        call = 'Novel'

    # Store all information for final table output
    results['region_' + str(region)] = [
        orient, str(x), str(y), gap, call, '', '',
        gene_left[-1][:-1], gene_left[-1][-1], gene_left[1],
        gene_right[-1][:-1], gene_right[-1][-1], gene_right[1],
        func_pred,
    ]