def annotate_blast_hits(blast_results_xml, gb_records, annotation_type,
                        qualifiers_dict=None):
    """Add one feature per BLAST HSP to the matching sequence records.

    The BLAST XML file is parsed with ``NCBIXML.parse``.  For every
    alignment, the second space-separated token of the alignment title is
    used as the key into the records mapping; alignments whose key is not
    present are skipped.  Each HSP of a matched alignment becomes a
    ``SeqFeature`` appended to that record's ``features`` list.

    Args:
        blast_results_xml: Path to a BLAST results file in XML format.
        gb_records: Either a mapping of key -> record, or a path (string)
            to a GenBank file.  A path is loaded through
            ``krbioio.read_sequence_file`` with ``ret_type='dict'`` and
            ``key='gi'``.
        annotation_type: Value used as the ``type`` of every new feature.
        qualifiers_dict: Optional extra qualifiers.  They are merged over
            the default ``query_start`` / ``query_end`` / ``label``
            qualifiers, so entries here win on key collisions.

    Returns:
        The records mapping (the same object that was passed in when a
        mapping was given), with the new features appended in place.
    """
    from Bio.Blast import NCBIXML
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from krpy import krbioio

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # path-vs-mapping check works on either interpreter.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    gb_records_dict = gb_records
    if isinstance(gb_records_dict, string_types):
        gb_records_dict = krbioio.read_sequence_file(
            file_path=gb_records,
            file_format='gb',
            ret_type='dict',
            key='gi')

    # The context manager guarantees the handle is closed even when parsing
    # or feature construction raises part-way through.
    with open(blast_results_xml) as blast_results_handle:
        for blast_record in NCBIXML.parse(blast_results_handle):
            for alignment in blast_record.alignments:
                # The record key is the second space-separated token of the
                # alignment title.
                record_key = str(alignment.title.split(' ')[1])
                if record_key not in gb_records_dict:
                    continue

                gb_record = gb_records_dict[record_key]
                for hsp in alignment.hsps:
                    # Convert the 1-based inclusive subject range to a
                    # 0-based half-open one.  The coordinates are ordered
                    # first because BLAST can report them in descending
                    # order for minus-strand hits, and a location must have
                    # start <= end.  Ascending input is unaffected.
                    f_start = min(hsp.sbjct_start, hsp.sbjct_end) - 1
                    f_end = max(hsp.sbjct_start, hsp.sbjct_end)
                    f_strand = -1 if hsp.frame[1] < 0 else 1

                    qualifiers = {
                        'query_start': hsp.query_start,
                        'query_end': hsp.query_end,
                        'label': blast_record.query,
                    }
                    if qualifiers_dict:
                        # Caller-supplied qualifiers override the defaults.
                        qualifiers.update(qualifiers_dict)

                    alignment_feature = SeqFeature(
                        FeatureLocation(f_start, f_end),
                        strand=f_strand,
                        type=annotation_type,
                        qualifiers=qualifiers)
                    gb_record.features.append(alignment_feature)

                # Kept from the original: store the (mutated) record back
                # under its key.
                gb_records_dict[record_key] = gb_record

    return gb_records_dict
# Resolve the command-line options.  Options that are missing or empty fall
# back to None (file paths) or 0 (minimum taxa count).
args = parser.parse_args()
input_file = args.input_file or None
output_file = args.output_file or None
min_taxa = args.min_taxa or 0

# Only proceed when all three options carry a usable (truthy) value.
if input_file and output_file and min_taxa:

    records = krbioio.read_sequence_file(input_file, 'gb', ret_type='list')

    # Qualifier names collected for exclusion; not consumed within the
    # visible part of this script.
    excluded_qualifiers = [
        'translation',
        'db_xref',
        'exception',
        'rpt_unit_seq',
        'gene_synonym',
        'rpt_type',
        'satellite',
        'transl_table',
        'replace',
        'rpt_unit_range',
        'protein_id',
        'codon_recognized',
        'EC_number',
        'function',
        'estimated_length',
        'mobile_element_type',
        'codon_start',
        'transl_except',
        'number',
        'standard_name',
        'allele',
        'inference',
    ]

    feature_dict = {}
    taxa_dict = {}

    for record in records:
        # Look up the NCBI taxonomy id for this record.
        txid = krncbi.get_ncbi_tax_id_for_record(record)
def merge_record_features(gb_records, annotation_type_to_merge,
                          annotation_type_merged, merged_label,
                          qualifiers_dict=None):
    """Collapse nearby features of one type into merged features.

    For every record, the features whose ``type`` equals
    ``annotation_type_to_merge`` are sorted by start position and walked in
    order.  A feature extends the current run while
    ``krother.in_range(start, run_start, run_end, 100)`` is true; otherwise
    the run is closed and a new one begins.  Each run is appended to the
    record as a new feature of type ``annotation_type_merged`` spanning the
    run.  The original features are left in place.

    Args:
        gb_records: Either a mapping of key -> record, or a path (string)
            to a GenBank file.  A path is loaded through
            ``krbioio.read_sequence_file`` with ``ret_type='dict'`` and
            ``key='gi'``.
        annotation_type_to_merge: Feature type to look for and merge.
        annotation_type_merged: Feature type given to the merged features.
        merged_label: Currently unused; kept for interface compatibility.
        qualifiers_dict: Currently unused; kept for interface compatibility.

    Returns:
        The records mapping, with merged features appended in place and
        every record stored under its ``annotations['gi']`` value.

    Raises:
        KeyError: If a record has no ``'gi'`` entry in its annotations.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from krpy import krbioio
    from krpy import krother

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # path-vs-mapping check works on either interpreter.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    gb_records_dict = gb_records
    if isinstance(gb_records_dict, string_types):
        gb_records_dict = krbioio.read_sequence_file(
            file_path=gb_records,
            file_format='gb',
            ret_type='dict',
            key='gi')

    # Iterate over a snapshot: the loop body assigns into the mapping, which
    # must not happen while iterating a live dictionary view.
    for gb_record in list(gb_records_dict.values()):

        features = [feature for feature in gb_record.features
                    if feature.type == annotation_type_to_merge]
        features.sort(key=lambda x: x.location.start)

        # Each entry is [run_start, run_end, strand_of_last_feature_in_run].
        merged_features = []
        prev_range = [-1, -1]  # -1 marks "no run started yet"
        start = -1
        strand = None

        for feature in features:
            feature_start = int(feature.location.nofuzzy_start)
            feature_end = int(feature.location.nofuzzy_end)

            if prev_range[0] == -1:
                # First feature: open the first run.
                start = feature_start
                prev_range = [feature_start, feature_end]
                if feature.strand:
                    strand = int(feature.strand)

            # NOTE(review): the last argument (100) is presumably a distance
            # tolerance -- confirm against krother.in_range.
            if not krother.in_range(feature_start, prev_range[0],
                                    prev_range[1], 100):
                # Close the current run and open a new one at this feature.
                merged_features.append([start, prev_range[1], strand])
                start = feature_start

            # The run end only ever grows, so a feature nested inside an
            # earlier, longer one cannot shrink the run.
            prev_range = [feature_start, max(prev_range[1], feature_end)]

            if feature.strand:
                strand = int(feature.strand)

        if features:
            # Close the final run.
            merged_features.append([start, prev_range[1], strand])

        for merged_feature in merged_features:
            # The tracked strand (merged_feature[2]) is intentionally not
            # applied; merged features are always created on strand 1, as in
            # the original implementation.
            alignment_feature = SeqFeature(
                FeatureLocation(merged_feature[0], merged_feature[1]),
                strand=1,
                type=annotation_type_merged)
            gb_record.features.append(alignment_feature)

        gb_records_dict[gb_record.annotations['gi']] = gb_record

    return gb_records_dict