Exemplo n.º 1
0
def post_process_protein(params, protein):
  """
  Assign a localization category to one annotated protein.

  Reads the prediction annotations already stored in `protein`
  (signal peptide, Tat, lipoprotein, beta-barrel and TM-helix results),
  collects short evidence tags and chooses a category string. Both are
  written back to protein['details'] and protein['category'].

  Args:
    params: mapping of settings. Keys read here: 'helix_programs',
      'internal_exposed_loop_min', 'bomp_clearly_cutoff',
      'bomp_maybe_cutoff', 'tmbhunt_clearly_cutoff' and
      'tmbhunt_maybe_cutoff'.
    protein: mapping of annotations for a single protein. Modified in
      place: 'details' and 'category' are set, and
      chop_nterminal_peptide() is called on it when a signal peptide
      or lipoprotein cleavage position applies.

  Returns:
    (details, category). `details` is a list of strings (["."] when no
    evidence tag was produced). `category` is one of 'OM(barrel)',
    'IM' (optionally suffixed with '+peri' and/or '+cyto'),
    'LIPOPROTEIN(IM)', 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
    'CYTOPLASM'.
  """
  # Read eagerly so a missing setting fails before any work is done.
  min_loop_length = params['internal_exposed_loop_min']

  def has_tm_helix():
    # True if at least one helix predictor reported a non-empty result.
    return any(dict_get(protein, '%s_helices' % program)
               for program in params['helix_programs'])

  def helix_count(program):
    # Only one predictor is guaranteed to have helices, so the others
    # may have no '<program>_helices' entry at all; count those as 0
    # instead of raising KeyError.
    return len(dict_get(protein, '%s_helices' % program) or [])

  # These helpers detect whether TM-containing IM proteins have large
  # loops / terminal regions in the periplasm or cytoplasm that may be
  # accessible / inaccessible in spheroplast shaving experiments.
  def has_long_loops(loop_str):
    # Any annotation whose key contains `loop_str` is scanned; each loop
    # is indexed as loop[0] / loop[1] and its length is their difference.
    for annot in protein:
      if loop_str in annot:
        for loop in protein[annot]:
          if loop[1] - loop[0] >= min_loop_length:
            return True
    return False

  def long_in_periplasm():
    return has_long_loops('_outer_loops')

  def long_in_cytoplasm():
    return has_long_loops('_inner_loops')

  details = []
  category = "UNKNOWN"
  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_signalp = dict_get(protein, 'is_signalp')
  is_tatfind = dict_get(protein, 'is_tatfind')
  is_lipop = dict_get(protein, 'is_lipop')

  # In terms of most sublocalization logic, a Tat signal is similar to a
  # Sec (signalp) signal. has_signal_pept denotes that either is present.
  has_signal_pept = bool(
      is_signalp or is_tatfind or
      (('hmmsearch' in protein) and "Tat_PS51318" in protein['hmmsearch']))

  # Annotate the barrels - high scoring bomp hits don't require a
  # signal peptide, low scoring ones do.
  # NOTE(review): when 'bomp' / 'tmbhunt_prob' are absent this relies on
  # dict_get() returning something comparable with the numeric cutoffs
  # -- confirm against dict_get's definition.
  has_barrel = False
  bomp_score = dict_get(protein, 'bomp')
  if (bomp_score >= params['bomp_clearly_cutoff']) or \
     (has_signal_pept and bomp_score >= params['bomp_maybe_cutoff']):
    details += ['bomp(%i)' % (bomp_score)]
    has_barrel = True

  tmbhunt_prob = dict_get(protein, 'tmbhunt_prob')
  if (tmbhunt_prob >= params['tmbhunt_clearly_cutoff']) or \
     (has_signal_pept and tmbhunt_prob >= params['tmbhunt_maybe_cutoff']):
    details += ['tmbhunt(%.2f)' % (tmbhunt_prob)]
    has_barrel = True

  if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
    details += ['tmbetadisc-rbf']
    has_barrel = True

  if has_barrel:
    category = 'OM(barrel)'

  # NOTE: a stricter rule -- accept a barrel prediction only when a
  # signal peptide is also present (TODO: and num_tms <= 1) -- was
  # commented out in the original code and is not applied here.

  # Record the number of predicted OM barrel strands in details.
  if has_barrel and dict_get(protein, 'tmbeta_strands'):
    num_strands = len(protein['tmbeta_strands'])
    details += ['tmbeta_strands(%i)' % (num_strands)]

  if has_signal_pept and not is_lipop and \
     dict_get(protein, 'signalp_cleave_position'):
    # We use the SignalP signal peptidase cleavage site for Tat signals.
    chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

  if is_tatfind:
    details += ["tatfind"]

  if is_signalp:
    details += ["signalp"]

  if is_lipop:
    details += ["lipop"]
    chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]

  # Helix checks run after the N-terminal peptide has been chopped, so
  # they see the protein as chop_nterminal_peptide() left it.
  if has_tm_helix() and not has_barrel:
    for program in params['helix_programs']:
      details += [program + "(%d)" % helix_count(program)]

    category = "IM"
    if long_in_periplasm():
      category += "+peri"
    if long_in_cytoplasm():
      category += "+cyto"
  elif not has_barrel:
    if is_lipop:
      if dict_get(protein, 'lipop_im_retention_signal'):
        category = "LIPOPROTEIN(IM)"
      else:
        category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
      category = "PERIPLASMIC/SECRETED"
    else:
      category = "CYTOPLASM"

  if details == []:
    details = ["."]

  protein['details'] = details
  protein['category'] = category

  return details, category
Exemplo n.º 2
0
def post_process_protein(params, protein):
  """
  Main analysis of a protein.

  The protein dictionary should already contain all the necessary
  information from the annotations, so that post_process_protein can
  determine the final category.

  Args:
    params: mapping of settings. Keys read here: 'helix_programs',
      'terminal_exposed_loop_min' and 'internal_exposed_loop_min'.
    protein: mapping of annotations for a single protein. Modified in
      place: 'details', 'category' and 'loop_extent' are set, and
      chop_nterminal_peptide() is called on it when a lipoprotein or
      signal peptide cleavage position applies.

  Returns:
    (details, category). `details` is a list of strings (["."] when no
    evidence tag was produced). `category` is one of 'PSE-Cellwall',
    'PSE-Membrane', 'MEMBRANE(non-PSE)', 'LIPOPROTEIN(non-PSE)',
    'PSE-Lipoprotein', 'SECRETED' or 'CYTOPLASM(non-PSE)'.
  """

  def has_tm_helix(protein):
    # True if at least one helix predictor reported a non-empty result.
    return any(dict_get(protein, '%s_helices' % program)
               for program in params['helix_programs'])

  def loop_args(program):
    # Positional arguments shared by eval_surface_exposed_loop() and
    # max_exposed_loop() for one helix predictor.
    return (
        protein['sequence_length'],
        len(protein['%s_helices' % program]),
        protein['%s_outer_loops' % program],
        params['terminal_exposed_loop_min'],
        params['internal_exposed_loop_min'])

  def has_surface_exposed_loop(protein):
    for program in params['helix_programs']:
      # Skip predictors with no result for this protein, the same guard
      # exposed_loop_extent() uses; without it a missing key raised
      # KeyError here.
      if program + '_helices' not in protein:
        continue
      if eval_surface_exposed_loop(*loop_args(program)):
        return True
    return False

  def exposed_loop_extent(protein):
    extents = [max_exposed_loop(*loop_args(program))
               for program in params['helix_programs']
               if program + '_helices' in protein]
    return max(extents) if extents else 0

  terminal_exposed_loop_min = params['terminal_exposed_loop_min']

  # Cleavage positions are read before anything is modified, so a
  # missing position fails early.
  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_lipop = dict_get(protein, 'is_lipop')
  if is_lipop:
    i_lipop_cut = protein['lipop_cleave_position']
  is_signalp = dict_get(protein, 'is_signalp')
  if is_signalp:
    i_signalp_cut = protein['signalp_cleave_position']

  details = []
  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]
  if is_lipop:
    details += ["lipop"]
  if is_signalp:
    details += ["signalp"]
  # Helix counts are recorded before the N-terminal peptide is chopped.
  # The presence check does not depend on the program, so it is done
  # once; predictors without a '<program>_helices' entry count as 0.
  if has_tm_helix(protein):
    for program in params['helix_programs']:
      n = len(dict_get(protein, '%s_helices' % program) or [])
      details += [program + "(%d)" % n]

  # The lipoprotein cleavage site takes precedence over the SignalP one.
  if is_lipop:
    chop_nterminal_peptide(protein, i_lipop_cut)
  elif is_signalp:
    chop_nterminal_peptide(protein, i_signalp_cut)

  if is_hmm_profile_match:
    category = "PSE-Cellwall"
  elif has_tm_helix(protein):
    if has_surface_exposed_loop(protein):
      category = "PSE-Membrane"
    else:
      category = "MEMBRANE(non-PSE)"
  elif is_lipop:
    # whole protein considered outer terminal loop
    if protein['sequence_length'] < terminal_exposed_loop_min:
      category = "LIPOPROTEIN(non-PSE)"
    else:
      category = "PSE-Lipoprotein"
  elif is_signalp:
    category = "SECRETED"
  else:
    category = "CYTOPLASM(non-PSE)"

  if details == []:
    details = ["."]

  protein['details'] = details
  protein['category'] = category
  # A loop extent is only computed for categories that are neither
  # cytoplasmic nor secreted; the others get the "." placeholder.
  if 'CYTOPLASM' not in category and 'SECRETED' not in category:
    protein['loop_extent'] = exposed_loop_extent(protein)
  else:
    protein['loop_extent'] = "."

  return details, category
Exemplo n.º 3
0
def post_process_protein(params, protein):
  """
  Assign a localization category to one annotated protein.

  Reads the prediction annotations already stored in `protein`
  (signal peptide, Tat, lipoprotein, beta-barrel and TM-helix results),
  collects short evidence tags and chooses a category string. Both are
  written back to protein['details'] and protein['category'].

  Args:
    params: mapping of settings. Keys read here: 'helix_programs',
      'internal_exposed_loop_min', 'bomp_clearly_cutoff' and
      'bomp_maybe_cutoff'.
    protein: mapping of annotations for a single protein. Modified in
      place: 'details' and 'category' are set, and
      chop_nterminal_peptide() is called on it when a signal peptide
      or lipoprotein cleavage position applies.

  Returns:
    (details, category). `details` is a list of strings (["."] when no
    evidence tag was produced). `category` is one of 'OM(barrel)',
    'IM' (optionally suffixed with '+peri' and/or '+cyto'),
    'LIPOPROTEIN(IM)', 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
    'CYTOPLASM'.
  """
  # Read eagerly so a missing setting fails before any work is done.
  min_loop_length = params['internal_exposed_loop_min']

  def has_tm_helix():
    # True if at least one helix predictor reported a non-empty result.
    return any(dict_get(protein, '%s_helices' % program)
               for program in params['helix_programs'])

  def helix_count(program):
    # Only one predictor is guaranteed to have helices, so the others
    # may have no '<program>_helices' entry at all; count those as 0
    # instead of raising KeyError.
    return len(dict_get(protein, '%s_helices' % program) or [])

  # These helpers detect whether TM-containing IM proteins have large
  # loops / terminal regions in the periplasm or cytoplasm that may be
  # accessible / inaccessible in spheroplast shaving experiments.
  def has_long_loops(loop_str):
    # Any annotation whose key contains `loop_str` is scanned; each loop
    # is indexed as loop[0] / loop[1] and its length is their difference.
    for annot in protein:
      if loop_str in annot:
        for loop in protein[annot]:
          if loop[1] - loop[0] >= min_loop_length:
            return True
    return False

  def long_in_periplasm():
    return has_long_loops('_outer_loops')

  def long_in_cytoplasm():
    return has_long_loops('_inner_loops')

  details = []
  category = "UNKNOWN"
  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_signalp = dict_get(protein, 'is_signalp')
  is_tatfind = dict_get(protein, 'is_tatfind')
  is_lipop = dict_get(protein, 'is_lipop')

  # In terms of most sublocalization logic, a Tat signal is similar to a
  # Sec (signalp) signal. has_signal_pept denotes that either is present.
  has_signal_pept = bool(
      is_signalp or is_tatfind or
      (('hmmsearch' in protein) and "Tat_PS51318" in protein['hmmsearch']))

  # Annotate the barrels - high scoring bomp hits don't require a
  # signal peptide, low scoring ones do.
  # NOTE(review): when 'bomp' is absent this relies on dict_get()
  # returning something comparable with the numeric cutoffs -- confirm
  # against dict_get's definition.
  has_barrel = False
  bomp_score = dict_get(protein, 'bomp')
  if (bomp_score >= params['bomp_clearly_cutoff']) or \
     (has_signal_pept and bomp_score >= params['bomp_maybe_cutoff']):
    details += ['bomp(%i)' % (bomp_score)]
    has_barrel = True

  # DEPRECATED: TMB-HUNT server is permanently offline
  #tmbhunt_prob = dict_get(protein, 'tmbhunt_prob')
  #if (tmbhunt_prob >= params['tmbhunt_clearly_cutoff']) or \
  #   (has_signal_pept and tmbhunt_prob >= params['tmbhunt_maybe_cutoff']):
  #  details += ['tmbhunt(%.2f)' % (tmbhunt_prob)]
  #  has_barrel = True

  if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
    details += ['tmbetadisc-rbf']
    has_barrel = True

  if has_barrel:
    category = 'OM(barrel)'

  # NOTE: a stricter rule -- accept a barrel prediction only when a
  # signal peptide is also present (TODO: and num_tms <= 1) -- was
  # commented out in the original code and is not applied here.

  # Record the number of predicted OM barrel strands in details.
  if has_barrel and dict_get(protein, 'tmbeta_strands'):
    num_strands = len(protein['tmbeta_strands'])
    details += ['tmbeta_strands(%i)' % (num_strands)]

  if has_signal_pept and not is_lipop and \
     dict_get(protein, 'signalp_cleave_position'):
    # We use the SignalP signal peptidase cleavage site for Tat signals.
    chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

  if is_tatfind:
    details += ["tatfind"]

  if is_signalp:
    details += ["signalp"]

  if is_lipop:
    details += ["lipop"]
    chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]

  # Helix checks run after the N-terminal peptide has been chopped, so
  # they see the protein as chop_nterminal_peptide() left it.
  if has_tm_helix() and not has_barrel:
    for program in params['helix_programs']:
      details += [program + "(%d)" % helix_count(program)]

    category = "IM"
    if long_in_periplasm():
      category += "+peri"
    if long_in_cytoplasm():
      category += "+cyto"
  elif not has_barrel:
    if is_lipop:
      if dict_get(protein, 'lipop_im_retention_signal'):
        category = "LIPOPROTEIN(IM)"
      else:
        category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
      category = "PERIPLASMIC/SECRETED"
    else:
      category = "CYTOPLASM"

  if details == []:
    details = ["."]

  protein['details'] = details
  protein['category'] = category

  return details, category