def post_process_protein(params, protein):
    """
    Combine the per-program annotations stored on `protein` into a final
    localization category plus a list of supporting details.

    Args:
        params: configuration dict. Looks up 'helix_programs',
            'internal_exposed_loop_min', 'bomp_clearly_cutoff',
            'tmbhunt_clearly_cutoff' and, when a signal peptide is
            present, 'bomp_maybe_cutoff' / 'tmbhunt_maybe_cutoff'.
        protein: annotation dict for a single protein. Modified in place:
            'details' and 'category' are written, and
            chop_nterminal_peptide() is applied when a lipoprotein or
            signal peptide cleavage position is available.

    Returns:
        (details, category): `details` is a list of evidence strings
        (["."] when there is none); `category` is one of 'OM(barrel)',
        'IM' (optionally suffixed with '+peri' and/or '+cyto'),
        'LIPOPROTEIN(IM)', 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
        'CYTOPLASM'.
    """

    def any_helix_predicted(entry):
        # True as soon as one configured helix predictor reports helices.
        return any(dict_get(entry, '%s_helices' % program)
                   for program in params['helix_programs'])

    # Looked up before any annotation is inspected, so a missing setting
    # fails immediately.
    min_loop_length = params['internal_exposed_loop_min']

    # Detects whether a TM-containing IM protein has a large loop /
    # terminal region in the periplasm ('_outer_loops') or the cytoplasm
    # ('_inner_loops') that may be accessible / inaccessible in spheroplast
    # shaving experiments. Every annotation whose key contains
    # `key_fragment` is scanned; loop length is taken as end - start.
    def has_loop_at_least(entry, key_fragment, min_length):
        return any(span[1] - span[0] >= min_length
                   for key in entry
                   if key_fragment in key
                   for span in entry[key])

    notes = []
    category = "UNKNOWN"

    hmm_hits = dict_get(protein, 'hmmsearch')
    signalp_hit = dict_get(protein, 'is_signalp')
    tat_hit = dict_get(protein, 'is_tatfind')
    lipop_hit = dict_get(protein, 'is_lipop')

    # In terms of most sublocalization logic, a Tat signal is similar to a
    # Sec (signalp) signal; has_signal_pept denotes that either is present.
    has_signal_pept = bool(
        signalp_hit
        or tat_hit
        or ('hmmsearch' in protein
            and "Tat_PS51318" in protein['hmmsearch']))

    # Annotate the barrels: high scoring hits don't require a signal
    # peptide, low scoring ones do.
    barrel_found = False

    bomp_score = dict_get(protein, 'bomp')
    if (bomp_score >= params['bomp_clearly_cutoff']
            or (has_signal_pept
                and bomp_score >= params['bomp_maybe_cutoff'])):
        notes.append('bomp(%i)' % bomp_score)
        barrel_found = True

    tmbhunt_prob = dict_get(protein, 'tmbhunt_prob')
    if (tmbhunt_prob >= params['tmbhunt_clearly_cutoff']
            or (has_signal_pept
                and tmbhunt_prob >= params['tmbhunt_maybe_cutoff'])):
        notes.append('tmbhunt(%.2f)' % tmbhunt_prob)
        barrel_found = True

    if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
        notes.append('tmbetadisc-rbf')
        barrel_found = True

    if barrel_found:
        category = 'OM(barrel)'
        # Record the number of predicted OM barrel strands, if any.
        if dict_get(protein, 'tmbeta_strands'):
            notes.append(
                'tmbeta_strands(%i)' % len(protein['tmbeta_strands']))

    # The N-terminal chops below must run before the helices / loops are
    # inspected further down, as in the original statement order.
    if (has_signal_pept and not lipop_hit
            and dict_get(protein, 'signalp_cleave_position')):
        # The SignalP signal peptidase cleavage site is used for Tat
        # signals as well.
        chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

    if tat_hit:
        notes.append("tatfind")

    if signalp_hit:
        notes.append("signalp")

    if lipop_hit:
        notes.append("lipop")
        chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

    if hmm_hits:
        notes.append("hmm(%s)" % "|".join(protein['hmmsearch']))

    helix_found = any_helix_predicted(protein)
    if barrel_found:
        # A barrel call takes precedence; category is already 'OM(barrel)'.
        pass
    elif helix_found:
        for program in params['helix_programs']:
            helix_count = len(protein['%s_helices' % program])
            notes.append(program + "(%d)" % helix_count)
        category = "IM"
        if has_loop_at_least(protein, '_outer_loops', min_loop_length):
            category += "+peri"
        if has_loop_at_least(protein, '_inner_loops', min_loop_length):
            category += "+cyto"
    elif lipop_hit:
        if dict_get(protein, 'lipop_im_retention_signal'):
            category = "LIPOPROTEIN(IM)"
        else:
            category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
        category = "PERIPLASMIC/SECRETED"
    else:
        category = "CYTOPLASM"

    if not notes:
        notes = ["."]

    protein['details'] = notes
    protein['category'] = category

    return notes, category
def post_process_protein(params, protein):
    """
    Main analysis of a single protein.

    The protein dictionary should already contain all the necessary
    information from the annotation steps, so that this function can
    determine the final classification.

    Args:
        params: configuration dict. Looks up 'helix_programs',
            'terminal_exposed_loop_min' and 'internal_exposed_loop_min'.
        protein: annotation dict for one protein; 'sequence_length' is
            read whenever loops are evaluated or the protein is a
            lipoprotein. Modified in place: 'details', 'category' and
            'loop_extent' are written, and chop_nterminal_peptide() is
            applied at the lipoprotein cleavage position, or else the
            SignalP cleavage position.

    Returns:
        (details, category): `details` is a list of evidence strings
        (["."] when there is none); `category` is one of 'PSE-Cellwall',
        'PSE-Membrane', 'MEMBRANE(non-PSE)', 'PSE-Lipoprotein',
        'LIPOPROTEIN(non-PSE)', 'SECRETED' or 'CYTOPLASM(non-PSE)'.

    Raises:
        KeyError: if a required params entry is missing, or if
            'lipop_cleave_position' / 'signalp_cleave_position' is absent
            although the matching is_lipop / is_signalp flag is set.
    """

    def has_helix_annotation(protein, program):
        # A configured helix program may not have annotated this protein.
        return (program + '_helices') in protein

    def has_tm_helix(protein):
        for program in params['helix_programs']:
            if dict_get(protein, '%s_helices' % program):
                return True
        return False

    def has_surface_exposed_loop(protein):
        for program in params['helix_programs']:
            # Skip programs without annotations instead of raising
            # KeyError, consistent with exposed_loop_extent() below.
            if not has_helix_annotation(protein, program):
                continue
            if eval_surface_exposed_loop(
                    protein['sequence_length'],
                    len(protein['%s_helices' % (program)]),
                    protein['%s_outer_loops' % (program)],
                    params['terminal_exposed_loop_min'],
                    params['internal_exposed_loop_min']):
                return True
        return False

    def exposed_loop_extent(protein):
        # Largest value reported by max_exposed_loop() over all helix
        # programs that annotated this protein; 0 when none did.
        extents = [
            max_exposed_loop(
                protein['sequence_length'],
                len(protein['%s_helices' % (program)]),
                protein['%s_outer_loops' % (program)],
                params['terminal_exposed_loop_min'],
                params['internal_exposed_loop_min'])
            for program in params['helix_programs']
            if has_helix_annotation(protein, program)]
        return max(extents) if extents else 0

    terminal_exposed_loop_min = params['terminal_exposed_loop_min']

    is_hmm_profile_match = dict_get(protein, 'hmmsearch')
    is_lipop = dict_get(protein, 'is_lipop')
    is_signalp = dict_get(protein, 'is_signalp')

    details = []
    if is_hmm_profile_match:
        details.append("hmm(%s)" % "|".join(protein['hmmsearch']))
    if is_lipop:
        details.append("lipop")
    if is_signalp:
        details.append("signalp")

    # Helix counts are reported as annotated, i.e. before the N-terminal
    # peptide is chopped off below. The has_tm_helix() check does not
    # depend on the loop variable, so it is evaluated once.
    if has_tm_helix(protein):
        for program in params['helix_programs']:
            if has_helix_annotation(protein, program):
                n = len(protein['%s_helices' % program])
                details.append(program + "(%d)" % n)

    # The lipoprotein cleavage site takes precedence over the SignalP one.
    if is_lipop:
        chop_nterminal_peptide(protein, protein['lipop_cleave_position'])
    elif is_signalp:
        chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

    if is_hmm_profile_match:
        category = "PSE-Cellwall"
    elif has_tm_helix(protein):
        if has_surface_exposed_loop(protein):
            category = "PSE-Membrane"
        else:
            category = "MEMBRANE(non-PSE)"
    elif is_lipop:
        # whole protein considered outer terminal loop
        if protein['sequence_length'] < terminal_exposed_loop_min:
            category = "LIPOPROTEIN(non-PSE)"
        else:
            category = "PSE-Lipoprotein"
    elif is_signalp:
        category = "SECRETED"
    else:
        category = "CYTOPLASM(non-PSE)"

    if not details:
        details = ["."]

    protein['details'] = details
    protein['category'] = category
    # A loop extent is only computed for categories other than the
    # cytoplasmic and secreted ones; those get the "." placeholder.
    if 'CYTOPLASM' not in category and 'SECRETED' not in category:
        protein['loop_extent'] = exposed_loop_extent(protein)
    else:
        protein['loop_extent'] = "."

    return details, category
def post_process_protein(params, protein):
    """
    Combine the per-program annotations stored on `protein` into a final
    localization category plus a list of supporting details.

    Args:
        params: configuration dict. Looks up 'helix_programs',
            'internal_exposed_loop_min', 'bomp_clearly_cutoff' and, when
            a signal peptide is present, 'bomp_maybe_cutoff'.
        protein: annotation dict for a single protein. Modified in place:
            'details' and 'category' are written, and
            chop_nterminal_peptide() is applied when a lipoprotein or
            signal peptide cleavage position is available.

    Returns:
        (details, category): `details` is a list of evidence strings
        (["."] when there is none); `category` is one of 'OM(barrel)',
        'IM' (optionally suffixed with '+peri' and/or '+cyto'),
        'LIPOPROTEIN(IM)', 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
        'CYTOPLASM'.
    """

    def any_helix_predicted(entry):
        # True as soon as one configured helix predictor reports helices.
        return any(dict_get(entry, '%s_helices' % program)
                   for program in params['helix_programs'])

    # Looked up before any annotation is inspected, so a missing setting
    # fails immediately.
    min_loop_length = params['internal_exposed_loop_min']

    # Detects whether a TM-containing IM protein has a large loop /
    # terminal region in the periplasm ('_outer_loops') or the cytoplasm
    # ('_inner_loops') that may be accessible / inaccessible in spheroplast
    # shaving experiments. Every annotation whose key contains
    # `key_fragment` is scanned; loop length is taken as end - start.
    def has_loop_at_least(entry, key_fragment, min_length):
        return any(span[1] - span[0] >= min_length
                   for key in entry
                   if key_fragment in key
                   for span in entry[key])

    notes = []
    category = "UNKNOWN"

    hmm_hits = dict_get(protein, 'hmmsearch')
    signalp_hit = dict_get(protein, 'is_signalp')
    tat_hit = dict_get(protein, 'is_tatfind')
    lipop_hit = dict_get(protein, 'is_lipop')

    # In terms of most sublocalization logic, a Tat signal is similar to a
    # Sec (signalp) signal; has_signal_pept denotes that either is present.
    has_signal_pept = bool(
        signalp_hit
        or tat_hit
        or ('hmmsearch' in protein
            and "Tat_PS51318" in protein['hmmsearch']))

    # Annotate the barrels: high scoring bomp hits don't require a signal
    # peptide, low scoring ones do.
    barrel_found = False

    bomp_score = dict_get(protein, 'bomp')
    if (bomp_score >= params['bomp_clearly_cutoff']
            or (has_signal_pept
                and bomp_score >= params['bomp_maybe_cutoff'])):
        notes.append('bomp(%i)' % bomp_score)
        barrel_found = True

    # DEPRECATED: the TMB-HUNT server is permanently offline, so its
    # 'tmbhunt_prob' annotation is no longer consulted here.

    if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
        notes.append('tmbetadisc-rbf')
        barrel_found = True

    if barrel_found:
        category = 'OM(barrel)'
        # Record the number of predicted OM barrel strands, if any.
        if dict_get(protein, 'tmbeta_strands'):
            notes.append(
                'tmbeta_strands(%i)' % len(protein['tmbeta_strands']))

    # The N-terminal chops below must run before the helices / loops are
    # inspected further down, as in the original statement order.
    if (has_signal_pept and not lipop_hit
            and dict_get(protein, 'signalp_cleave_position')):
        # The SignalP signal peptidase cleavage site is used for Tat
        # signals as well.
        chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

    if tat_hit:
        notes.append("tatfind")

    if signalp_hit:
        notes.append("signalp")

    if lipop_hit:
        notes.append("lipop")
        chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

    if hmm_hits:
        notes.append("hmm(%s)" % "|".join(protein['hmmsearch']))

    helix_found = any_helix_predicted(protein)
    if barrel_found:
        # A barrel call takes precedence; category is already 'OM(barrel)'.
        pass
    elif helix_found:
        for program in params['helix_programs']:
            helix_count = len(protein['%s_helices' % program])
            notes.append(program + "(%d)" % helix_count)
        category = "IM"
        if has_loop_at_least(protein, '_outer_loops', min_loop_length):
            category += "+peri"
        if has_loop_at_least(protein, '_inner_loops', min_loop_length):
            category += "+cyto"
    elif lipop_hit:
        if dict_get(protein, 'lipop_im_retention_signal'):
            category = "LIPOPROTEIN(IM)"
        else:
            category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
        category = "PERIPLASMIC/SECRETED"
    else:
        category = "CYTOPLASM"

    if not notes:
        notes = ["."]

    protein['details'] = notes
    protein['category'] = category

    return notes, category