Пример #1
0
def annotate_blast_hits(blast_results_xml, gb_records, annotation_type, qualifiers_dict=None):
    """Add one feature per BLAST HSP to the matching sequence records.

    The BLAST XML file is parsed record by record. For every alignment, the
    second space-separated token of the alignment title is used as the
    record identifier; alignments whose identifier is not a key of the
    records dictionary are ignored. For each HSP of a matching alignment a
    ``SeqFeature`` covering the subject coordinates is appended to the
    record's ``features`` list.

    Args:
        blast_results_xml: Path to a BLAST results file in XML format.
        gb_records: Either a dictionary mapping identifiers to records, or
            a path (string) to a GenBank file. A path is loaded with
            ``krbioio.read_sequence_file`` using ``file_format='gb'``,
            ``ret_type='dict'`` and ``key='gi'``.
        annotation_type: Value used as the ``type`` of every new feature.
        qualifiers_dict: Optional mapping of extra qualifiers. Its entries
            are merged over the default qualifiers (``query_start``,
            ``query_end``, ``label``) and win on key clashes.

    Returns:
        The dictionary of records. Records are modified in place, so when a
        dictionary was passed in, the same object is returned.
    """
    from Bio.Blast import NCBIXML
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    from krpy import krbioio

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # path-vs-dictionary check also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    gb_records_dict = gb_records
    if isinstance(gb_records_dict, string_types):
        gb_records_dict = krbioio.read_sequence_file(
            file_path=gb_records,
            file_format='gb',
            ret_type='dict',
            key='gi')

    # The context manager closes the handle even if parsing fails part-way.
    with open(blast_results_xml) as blast_results_handle:
        for blast_record in NCBIXML.parse(blast_results_handle):
            for alignment in blast_record.alignments:
                alignment_id = str(alignment.title.split(' ')[1])
                if alignment_id not in gb_records_dict:
                    continue
                gb_record = gb_records_dict[alignment_id]

                for hsp in alignment.hsps:
                    # Minus-strand hits can be reported with
                    # sbjct_start > sbjct_end. A location must run from the
                    # lower to the higher coordinate, so normalise here and
                    # carry the orientation in the strand instead.
                    # BLAST coordinates are 1-based inclusive; the location
                    # is 0-based, end-exclusive.
                    f_start = min(hsp.sbjct_start, hsp.sbjct_end) - 1
                    f_end = max(hsp.sbjct_start, hsp.sbjct_end)
                    f_strand = -1 if hsp.frame[1] < 0 else 1

                    qualifiers = {
                        'query_start': hsp.query_start,
                        'query_end': hsp.query_end,
                        'label': blast_record.query,
                    }
                    if qualifiers_dict:
                        qualifiers.update(qualifiers_dict)

                    # Strand belongs to the location; the ``strand``
                    # argument of SeqFeature itself is deprecated.
                    alignment_feature = SeqFeature(
                        FeatureLocation(f_start, f_end, strand=f_strand),
                        type=annotation_type,
                        qualifiers=qualifiers)
                    gb_record.features.append(alignment_feature)

    return gb_records_dict
Пример #2
0
    # Fallback values used when the corresponding command-line option is
    # missing or falsy.
    input_file = None
    output_file = None
    min_taxa = 0

    # NOTE(review): ``parser`` is defined outside this excerpt.
    args = parser.parse_args()

    # Copy over only the options that were actually supplied.
    if args.input_file:
        input_file = args.input_file
    if args.output_file:
        output_file = args.output_file
    if args.min_taxa:
        min_taxa = args.min_taxa

    # All three values must be truthy to proceed; in particular a min_taxa
    # of 0 (the default) skips processing entirely.
    if input_file and output_file and min_taxa:
        # Load the GenBank-format input as a list of records.
        records = krbioio.read_sequence_file(input_file, 'gb', ret_type='list')

        # print('Found', len(records), 'in', input_file)

        # Qualifier names to leave out.
        # NOTE(review): the code consuming this list is not visible in this
        # excerpt - confirm how the exclusion is applied.
        excluded_qualifiers = ['translation', 'db_xref', 'exception',
        'rpt_unit_seq', 'gene_synonym', 'rpt_type', 'satellite', 'transl_table',
        'replace', 'rpt_unit_range', 'protein_id', 'codon_recognized',
        'EC_number', 'function', 'estimated_length', 'mobile_element_type',
        'codon_start', 'transl_except', 'number', 'standard_name', 'allele',
        'inference']

        # Accumulators filled by the loop below (its body continues past the
        # end of this excerpt).
        feature_dict = dict()
        taxa_dict = dict()

        for record in records:
            # Look up the NCBI taxonomy id associated with this record.
            txid = krncbi.get_ncbi_tax_id_for_record(record)
Пример #3
0
def merge_record_features(gb_records, annotation_type_to_merge, annotation_type_merged, merged_label, qualifiers_dict=None):
    """Collapse neighbouring features of one type into merged features.

    For every record, the features whose ``type`` equals
    ``annotation_type_to_merge`` are sorted by start position and walked in
    order. Consecutive features stay in the same group for as long as
    ``krother.in_range(start, previous_start, running_end, 100)`` is true
    for the next feature's start; otherwise the current group is closed and
    a new one begins. Each group yields one new feature of type
    ``annotation_type_merged`` spanning from the group's first start to its
    greatest end. The original features are left in place.

    Args:
        gb_records: Either a dictionary mapping identifiers to records, or
            a path (string) to a GenBank file. A path is loaded with
            ``krbioio.read_sequence_file`` using ``file_format='gb'``,
            ``ret_type='dict'`` and ``key='gi'``.
        annotation_type_to_merge: Feature type to look for.
        annotation_type_merged: Feature type assigned to the new features.
        merged_label: Currently unused; kept for interface compatibility.
        qualifiers_dict: Currently unused; kept for interface compatibility.

    Returns:
        The dictionary of records. Records are modified in place, so when a
        dictionary was passed in, the same object is returned.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from krpy import krbioio
    from krpy import krother

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # path-vs-dictionary check also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    gb_records_dict = gb_records
    if isinstance(gb_records_dict, string_types):
        gb_records_dict = krbioio.read_sequence_file(
            file_path=gb_records,
            file_format='gb',
            ret_type='dict',
            key='gi')

    for gb_record in gb_records_dict.values():
        # (start, end) of every feature to merge, ordered by start.
        # int(position) yields the same value as the older nofuzzy_start /
        # nofuzzy_end accessors.
        spans = sorted(
            ((int(feature.location.start), int(feature.location.end))
             for feature in gb_record.features
             if feature.type == annotation_type_to_merge),
            key=lambda span: span[0])

        merged_spans = []
        group_start = None  # start of the group currently being built
        prev_start = None   # start of the previously visited feature
        running_end = None  # greatest end seen so far

        for f_start, f_end in spans:
            if group_start is None:
                # First feature of the record opens the first group.
                group_start, prev_start, running_end = f_start, f_start, f_end

            if not krother.in_range(f_start, prev_start, running_end, 100):
                # Too far from the previous feature: close the current
                # group and open a new one at this feature.
                merged_spans.append((group_start, running_end))
                group_start = f_start

            prev_start = f_start
            running_end = max(running_end, f_end)

        if spans:
            # Close the group that was still open when the loop ended.
            merged_spans.append((group_start, running_end))

        # Records are mutated in place, so nothing has to be written back
        # into the dictionary. That also avoids requiring a 'gi' annotation
        # and touching the dictionary while it is being iterated.
        for span_start, span_end in merged_spans:
            # Merged features are always placed on the forward strand.
            gb_record.features.append(SeqFeature(
                FeatureLocation(span_start, span_end, strand=1),
                type=annotation_type_merged))

    return gb_records_dict