Пример #1
0
def run (args=(), params=None, out=sys.stdout) :
  assert (params is not None)
  seq_files = params.muscle.seq_file
  output_file = params.muscle.output_file
  if (output_file is None) or (output_file == "") :
    output_file = os.path.join(os.getcwd(), "muscle.aln")
  from iotbx import file_reader
  from iotbx.bioinformatics import any_sequence_format, sequence
  seqs = []
  for file_name in seq_files :
    if (file_name.endswith(".pdb") or file_name.endswith(".ent") or
        file_name.endswith(".pdb.gz") or file_name.endswith(".ent.gz")) :
      pdb_in = file_reader.any_file(file_name, force_type="pdb").file_object
      hierarchy = pdb_in.hierarchy
      first_model = hierarchy.models()[0]
      found_protein = False
      for chain in first_model.chains() :
        if chain.is_protein() :
          chain_seq = chain.as_padded_sequence()
          base_name = os.path.basename(file_name)
          seq_name = "%s_%s" % (os.path.splitext(base_name)[0], chain.id)
          seqs.append(sequence(chain_seq, seq_name))
          found_protein = True
      if (not found_protein) :
        raise Sorry(("The PDB file %s does not contain any recognizable "+
          "protein chains.") % file_name)
    else :
      try :
        seq_objects, non_compliant = any_sequence_format(file_name,
          assign_name_if_not_defined=True)
        seqs.extend(seq_objects)
      except Exception, e :
        raise Sorry(("Error parsing '%s' - not a recognizable sequence "+
          "format.  (Original message: %s)") % (file_name, str(e)))
Пример #2
0
def run (args=(), params=None, out=sys.stdout) :
  assert (params is not None)
  seq_files = params.muscle.seq_file
  output_file = params.muscle.output_file
  if (output_file is None) or (output_file == "") :
    output_file = os.path.join(os.getcwd(), "muscle.aln")
  from iotbx import file_reader
  from iotbx.bioinformatics import any_sequence_format, sequence
  seqs = []
  for file_name in seq_files :
    if (file_name.endswith(".pdb") or file_name.endswith(".ent") or
        file_name.endswith(".pdb.gz") or file_name.endswith(".ent.gz")) :
      pdb_in = file_reader.any_file(file_name, force_type="pdb").file_object
      hierarchy = pdb_in.hierarchy
      first_model = hierarchy.models()[0]
      found_protein = False
      for chain in first_model.chains() :
        if chain.is_protein() :
          chain_seq = chain.as_padded_sequence()
          base_name = os.path.basename(file_name)
          seq_name = "%s_%s" % (os.path.splitext(base_name)[0], chain.id)
          seqs.append(sequence(chain_seq, seq_name))
          found_protein = True
      if (not found_protein) :
        raise Sorry(("The PDB file %s does not contain any recognizable "+
          "protein chains.") % file_name)
    else :
      try :
        seq_objects, non_compliant = any_sequence_format(file_name,
          assign_name_if_not_defined=True)
        seqs.extend(seq_objects)
      except Exception, e :
        raise Sorry(("Error parsing '%s' - not a recognizable sequence "+
          "format.  (Original message: %s)") % (file_name, str(e)))
Пример #3
0
def get_muscle_alignment_ordered(sequences, out = None):

  from iotbx import bioinformatics

  name_for = {}

  for ( i, seq ) in enumerate( sequences, start = 1 ):
    name = name_for.get( seq, "Chain_%d" % i )
    name_for[ seq ] = name

  alignment = get_muscle_alignment(
    fasta_sequences = "\n".join(
      str( bioinformatics.sequence( name = name, sequence = seq.sequence ) )
      for ( seq, name ) in name_for.items()
      ),
    out = out,
    )

  lookup = dict( zip( alignment.names, alignment.alignments ) )
  assert all( n in lookup for n in name_for.values() )

  return bioinformatics.clustal_alignment(
    names = [ seq.name for seq in sequences ],
    alignments = [ lookup[ name_for[ seq ] ] for seq in sequences ],
    program = alignment.program
    )
Пример #4
0
def get_muscle_alignment_ordered(sequences, out = None):

  from iotbx import bioinformatics
  from iotbx.pdb.amino_acid_codes import validate_sequence

  name_for = {}

  for ( i, seq ) in enumerate( sequences, start = 1 ):
    name = name_for.get( seq, "Chain_%d" % i )
    name_for[ seq ] = name

  alignment, errors = get_muscle_alignment(
    fasta_sequences = "\n".join(
      str( bioinformatics.sequence( name = name, sequence = seq.sequence ) )
      for ( seq, name ) in name_for.items()
      ),
    out = out,
    )

  # check for errors and handle:
  #   invalid characters in sequences
  if (len(errors) > 0):
    for error in errors:
      error = error.strip()
      if ('Invalid character' in error):
        for seq in name_for.keys():
          invalid = validate_sequence(
            seq.sequence, protein=True, strict_protein=False,
            nucleic_acid=True, strict_nucleic_acid=False)
          if (len(invalid) > 0):
            name_for.pop(seq)
        sequences = name_for.keys()
      elif (len(error) > 0):
        raise Sorry(error)

  lookup = dict( zip( alignment.names, alignment.alignments ) )
  assert all( n in lookup for n in name_for.values() )

  return bioinformatics.clustal_alignment(
    names = [ seq.name for seq in sequences ],
    alignments = [ lookup[ name_for[ seq ] ] for seq in sequences ],
    program = alignment.program
    )