Пример #1
0
def annotate(params, proteins):
    """
    Runs TMHMM and parses the output file. Takes a standard 'inmembrane'
    params dictionary and a global proteins dictionary which it populates with
    results.

    TMHMM is invoked once on the FASTA file named by params['fasta'], using
    the executable named by params['tmhmm_bin']. The name of the output file
    ('tmhmm.out') is handed to run(); its text is then passed to
    parse_tmhmm() together with 'proteins'.

    Returns whatever parse_tmhmm() returns.

    According to the original documentation of this plugin, these keys are
    added to the proteins dictionary (the work is done inside parse_tmhmm(),
    which is not visible here):
      - 'tmhmm_helices', a list of tuples describing the first and last residue
        number of each transmembrane segment;

      - 'tmhmm_scores', a list of confidence scores (floats) for each predicted
        tm segment;

      - 'tmhmm_inner_loops', a list of tuples describing the first and last residue
         number of each predicted internal loop segment;

      - 'tmhmm_outer_loops', a list of tuples describing the first and last residue
         number of each predicted outer loop segment;
    """

    tmhmm_out = 'tmhmm.out'
    run('%(tmhmm_bin)s %(fasta)s' % params, tmhmm_out)

    # Read back through the same name that was given to run(), and close the
    # handle deterministically instead of leaving it to the garbage collector.
    with open(tmhmm_out) as tmhmm_file:
        tmhmm_text = tmhmm_file.read()

    return parse_tmhmm(tmhmm_text, proteins)
Пример #2
0
def annotate(params, proteins):
  """
  Runs TMHMM and parses the output file. Takes a standard 'inmembrane'
  params dictionary and a global proteins dictionary which it populates with
  results.

  TMHMM is invoked once on the FASTA file named by params['fasta'], using
  the executable named by params['tmhmm_bin']. The name of the output file
  ('tmhmm.out') is handed to run(); its text is then passed to
  parse_tmhmm() together with 'proteins'.

  Returns whatever parse_tmhmm() returns.

  According to the original documentation of this plugin, these keys are
  added to the proteins dictionary (the work is done inside parse_tmhmm(),
  which is not visible here):
    - 'tmhmm_helices', a list of tuples describing the first and last residue
      number of each transmembrane segment;

    - 'tmhmm_scores', a list of confidence scores (floats) for each predicted
      tm segment;

    - 'tmhmm_inner_loops', a list of tuples describing the first and last residue
       number of each predicted internal loop segment;

    - 'tmhmm_outer_loops', a list of tuples describing the first and last residue
       number of each predicted outer loop segment;
  """

  tmhmm_out = 'tmhmm.out'
  run('%(tmhmm_bin)s %(fasta)s' % params, tmhmm_out)

  # Read back through the same name that was given to run(), and close the
  # handle deterministically instead of leaving it to the garbage collector.
  with open(tmhmm_out) as tmhmm_file:
    tmhmm_text = tmhmm_file.read()

  return parse_tmhmm(tmhmm_text, proteins)
Пример #3
0
def annotate(params, proteins):
    """
    Annotate 'proteins' with SignalP results.

    Every protein first receives the defaults 'is_signalp' = False and
    'signalp_cleave_position' = None. SignalP is then run using the
    'signalp4_bin', 'signalp4_organism' and 'fasta' entries of 'params',
    with 'signalp.out' given to run() as the output file name. That file is
    opened and handed to parse_signalp() together with 'proteins'.

    Returns whatever parse_signalp() returns.
    """
    # Reset the SignalP fields so every protein has them, hit or not.
    for seqid in proteins:
        entry = proteins[seqid]
        entry['is_signalp'] = False
        entry['signalp_cleave_position'] = None

    output_name = 'signalp.out'
    run('%(signalp4_bin)s -t %(signalp4_organism)s  %(fasta)s' % params,
        output_name)

    with open(output_name) as output_handle:
        return parse_signalp(output_handle, proteins)
Пример #4
0
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename (without the extension) of the
         .hmm files found in the directory indicated by the
         'hmm_profiles_dir' field of 'params'.

    A motif is recorded each time a "conditional E-value" line in the
    hmmsearch output has an E-value <= params['hmm_evalue_max'] and a score
    >= params['hmm_score_min'], so a motif name may appear more than once
    for the same protein.

    Side effect: params['hmm_profile'] is overwritten with the path of each
    profile in turn.
    """

    log_stderr("# Searching for HMMER profiles in " +
               params['hmm_profiles_dir'])

    # Give every protein an 'hmmsearch' list up front so the field exists
    # even when the profile directory contains no .hmm files.
    for seqid in proteins:
        proteins[seqid].setdefault('hmmsearch', [])

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        # The command template below reads the profile path from params.
        params['hmm_profile'] = hmm_profile

        # Strip only the trailing extension; str.replace('.hmm', '') would
        # also remove any '.hmm' that occurs inside the file name.
        hmm_name = os.path.splitext(os.path.basename(hmm_profile))[0]
        hmmsearch3_out = 'hmm.%s.out' % hmm_name

        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)

        # parse the hmmsearch output file
        seqid = None
        with open(hmmsearch3_out) as hmmsearch3_text:
            for line in hmmsearch3_text:
                # A '>>' line starts the section for a new target sequence.
                if line.startswith(">>"):
                    seqid = parse_fasta_header(line[3:])[0]
                    continue

                # Ignore everything before the first sequence section.
                if seqid is None:
                    continue

                if 'conditional E-value' in line:
                    # E-value is the last token, score the fifth from the end.
                    words = line.split()
                    evalue = float(words[-1])
                    score = float(words[-5])
                    if evalue <= params['hmm_evalue_max'] and \
                            score >= params['hmm_score_min']:
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
Пример #5
0
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename (without the extension) of the
         .hmm files found in the directory indicated by the
         'hmm_profiles_dir' field of 'params'.

    A motif is recorded each time a "conditional E-value" line in the
    hmmsearch output has an E-value <= params['hmm_evalue_max'] and a score
    >= params['hmm_score_min'], so a motif name may appear more than once
    for the same protein.

    Side effect: params['hmm_profile'] is overwritten with the path of each
    profile in turn.
    """

    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])

    # Give every protein an 'hmmsearch' list up front so the field exists
    # even when the profile directory contains no .hmm files.
    for seqid in proteins:
        proteins[seqid].setdefault('hmmsearch', [])

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        # The command template below reads the profile path from params.
        params['hmm_profile'] = hmm_profile

        # Strip only the trailing extension; str.replace('.hmm', '') would
        # also remove any '.hmm' that occurs inside the file name.
        hmm_name = os.path.splitext(os.path.basename(hmm_profile))[0]
        hmmsearch3_out = 'hmm.%s.out' % hmm_name

        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)

        # parse the hmmsearch output file
        seqid = None
        with open(hmmsearch3_out) as hmmsearch3_text:
            for line in hmmsearch3_text:
                # A '>>' line starts the section for a new target sequence.
                if line.startswith(">>"):
                    seqid = parse_fasta_header(line[3:])[0]
                    continue

                # Ignore everything before the first sequence section.
                if seqid is None:
                    continue

                if 'conditional E-value' in line:
                    # E-value is the last token, score the fifth from the end.
                    words = line.split()
                    evalue = float(words[-1])
                    score = float(words[-5])
                    if evalue <= params['hmm_evalue_max'] and \
                            score >= params['hmm_score_min']:
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
Пример #6
0
def annotate(params, proteins):
    """
    Annotate 'proteins' with SignalP results.

    Every protein first receives the defaults 'is_signalp' = False and
    'signalp_cleave_position' = None. SignalP is then run using the
    'signalp4_bin', 'signalp4_organism' and 'fasta' entries of 'params',
    with 'signalp.out' given to run() as the output file name. That file is
    opened and handed to parse_signalp() together with 'proteins'.

    Returns whatever parse_signalp() returns.
    """
    # Reset the SignalP fields so every protein has them, hit or not.
    for seqid in proteins:
        entry = proteins[seqid]
        entry['is_signalp'] = False
        entry['signalp_cleave_position'] = None

    output_name = 'signalp.out'
    run('%(signalp4_bin)s -t %(signalp4_organism)s  %(fasta)s' % params,
        output_name)

    with open(output_name) as output_handle:
        return parse_signalp(output_handle, proteins)
Пример #7
0
def annotate(params, proteins):
  """
  Uses LipoP to identify lipo-attachment signals in the protein.

  LipoP is run with the executable params['lipop1_bin'] on the FASTA file
  params['fasta'], with 'lipop.out' given to run() as the output file name.
  The text of that file is handed to parse_lipop() together with 'proteins'.

  According to the original documentation of this plugin, the 'proteins'
  dictionary is annotated by adding two fields (the work is done inside
  parse_lipop(), which is not visible here):
    - 'is_lipop' is a boolean indicating whether a signal is found or not
    - 'lipop_cleave_position' gives the position where the signal
        is cleaved and the protein is attached to a lipid

  Returns whatever parse_lipop() returns (presumably a reference to the
  proteins data structure).
  """

  lipop1_out = 'lipop.out'
  run('%(lipop1_bin)s %(fasta)s' % params, lipop1_out)

  # Close the output file as soon as its text has been read.
  with open(lipop1_out) as lipop1_file:
    lipop1_text = lipop1_file.read()

  return parse_lipop(lipop1_text, proteins)
Пример #8
0
def annotate(params, proteins):
    """
    Runs MEMSAT3 and parses the output files. Takes a standard 'inmembrane'
    params dictionary and a global proteins dictionary which it populates with
    results.

    In the current implementation, each sequence is written to its own FASTA
    file (named from seqid_to_filename(seqid); an existing file is reused)
    and MEMSAT3 (params['memsat3_bin']) is run on it, one sequence at a time.
    The '.memsat' output is only parsed when has_transmembrane_in_globmem()
    accepts the matching '.globmem' file (presumably a by-product of the
    MEMSAT3 run - confirm against the MEMSAT3 wrapper).

    These keys are added to the proteins dictionary (initialised to empty
    lists here; presumably filled in by parse_memsat()):
      - 'memsat3_helices', a list of tuples describing the first and last residue
        number of each transmembrane segment;

      - 'memsat3_scores', a list of confidence scores (floats) for each predicted
        tm segment;

      - 'memsat3_inner_loops', a list of tuples describing the first and last residue
         number of each predicted internal loop segment;

      - 'memsat3_outer_loops', a list of tuples describing the first and last residue
         number of each predicted outer loop segment;

    Returns a reference to the proteins data structure.
    """

    for seqid in proteins:
        protein = proteins[seqid]

        # initialize the protein data structure
        protein.update({
            'memsat3_scores': [],
            'memsat3_helices': [],
            'memsat3_inner_loops': [],
            'memsat3_outer_loops': []
        })

        # Build all per-sequence file names from one base name so that only
        # the extension differs; str.replace('fasta', ...) would also rewrite
        # any 'fasta' substring that is part of the sequence id.
        base_name = seqid_to_filename(seqid)
        single_fasta = base_name + '.fasta'
        memsat_out = base_name + '.memsat'
        globmem_out = base_name + '.globmem'

        # write seq to single fasta file
        if not os.path.isfile(single_fasta):
            write_proteins_fasta(single_fasta, proteins, [seqid])

        run('%s %s' % (params['memsat3_bin'], single_fasta), memsat_out)

        if has_transmembrane_in_globmem(globmem_out):
            parse_memsat(protein, memsat_out)

    # Return the populated structure, as the other annotate() plugins do.
    return proteins
Пример #9
0
def annotate(params, proteins):
  """
  Runs MEMSAT3 and parses the output files. Takes a standard 'inmembrane'
  params dictionary and a global proteins dictionary which it populates with
  results.

  In the current implementation, each sequence is written to its own FASTA
  file (named from seqid_to_filename(seqid); an existing file is reused)
  and MEMSAT3 (params['memsat3_bin']) is run on it, one sequence at a time.
  The '.memsat' output is only parsed when has_transmembrane_in_globmem()
  accepts the matching '.globmem' file (presumably a by-product of the
  MEMSAT3 run - confirm against the MEMSAT3 wrapper).

  These keys are added to the proteins dictionary (initialised to empty
  lists here; presumably filled in by parse_memsat()):
    - 'memsat3_helices', a list of tuples describing the first and last residue
      number of each transmembrane segment;

    - 'memsat3_scores', a list of confidence scores (floats) for each predicted
      tm segment;

    - 'memsat3_inner_loops', a list of tuples describing the first and last residue
       number of each predicted internal loop segment;

    - 'memsat3_outer_loops', a list of tuples describing the first and last residue
       number of each predicted outer loop segment;

  Returns a reference to the proteins data structure.
  """

  for seqid in proteins:
    protein = proteins[seqid]

    # initialize the protein data structure
    protein.update({
      'memsat3_scores': [],
      'memsat3_helices': [],
      'memsat3_inner_loops': [],
      'memsat3_outer_loops': []
    })

    # Build all per-sequence file names from one base name so that only
    # the extension differs; str.replace('fasta', ...) would also rewrite
    # any 'fasta' substring that is part of the sequence id.
    base_name = seqid_to_filename(seqid)
    single_fasta = base_name + '.fasta'
    memsat_out = base_name + '.memsat'
    globmem_out = base_name + '.globmem'

    # write seq to single fasta file
    if not os.path.isfile(single_fasta):
      write_proteins_fasta(single_fasta, proteins, [seqid])

    run('%s %s' % (params['memsat3_bin'], single_fasta), memsat_out)

    if has_transmembrane_in_globmem(globmem_out):
      parse_memsat(protein, memsat_out)

  # Return the populated structure, as the other annotate() plugins do.
  return proteins
Пример #10
0
def annotate(params, proteins):
  """
  Annotate 'proteins' with SignalP results.

  Every protein first receives the defaults 'is_signalp' = False and
  'signalp_cleave_position' = None. SignalP is then run using the
  'signalp4_bin', 'signalp4_organism' and 'fasta' entries of 'params',
  with 'signalp.out' given to run() as the output file name.

  Each non-comment, non-blank line of that file is split on whitespace:
    - column 0 is passed through parse_fasta_header() to obtain the seqid;
    - column 4 is stored as the integer 'signalp_cleave_position';
    - column 9 equal to "Y" sets 'is_signalp' to True.

  Returns a reference to the proteins data structure.
  """
  for seqid in proteins:
    proteins[seqid]['is_signalp'] = False
    proteins[seqid]['signalp_cleave_position'] = None

  signalp4_out = 'signalp.out'
  cmd = '%(signalp4_bin)s -t %(signalp4_organism)s  %(fasta)s' % \
             params
  run(cmd, signalp4_out)

  # Use a context manager so the output file is closed after parsing.
  with open(signalp4_out) as signalp4_text:
    for line in signalp4_text:
      if line.startswith("#"):
        continue
      words = line.split()
      # A blank (or whitespace-only) line has no columns to index.
      if not words:
        continue
      seqid = parse_fasta_header(words[0])[0]
      proteins[seqid]['signalp_cleave_position'] = int(words[4])
      if words[9] == "Y":
        proteins[seqid]['is_signalp'] = True

  return proteins