def run(args=(), params=None, out=sys.stdout):
    """Gather the input sequences described by ``params.muscle``.

    Every entry of ``params.muscle.seq_file`` is read in turn:

    * names ending in ``.pdb``, ``.ent``, ``.pdb.gz`` or ``.ent.gz`` are
      loaded as PDB files; one sequence is taken from each protein chain of
      the first model and named ``<base name minus last extension>_<chain id>``;
    * any other file is passed to ``any_sequence_format``.

    When ``params.muscle.output_file`` is None or empty, the output path
    defaults to ``muscle.aln`` in the current working directory.

    NOTE(review): as visible here the function ends once the sequences have
    been collected; ``args``, ``out``, ``output_file`` and ``seqs`` are not
    used afterwards. The rest of the routine may be missing - confirm.

    Raises:
        AssertionError: if ``params`` is None.
        Sorry: if a PDB file contains no protein chain, or a sequence file
            cannot be parsed.
    """
    assert (params is not None)
    seq_files = params.muscle.seq_file
    output_file = params.muscle.output_file
    if (output_file is None) or (output_file == ""):
        output_file = os.path.join(os.getcwd(), "muscle.aln")

    from iotbx import file_reader
    from iotbx.bioinformatics import any_sequence_format, sequence

    pdb_suffixes = (".pdb", ".ent", ".pdb.gz", ".ent.gz")
    seqs = []
    for file_name in seq_files:
        if file_name.endswith(pdb_suffixes):
            pdb_in = file_reader.any_file(
                file_name, force_type="pdb").file_object
            hierarchy = pdb_in.hierarchy
            # Only the first model is inspected.
            first_model = hierarchy.models()[0]
            found_protein = False
            for chain in first_model.chains():
                if chain.is_protein():
                    chain_seq = chain.as_padded_sequence()
                    base_name = os.path.basename(file_name)
                    seq_name = "%s_%s" % (
                        os.path.splitext(base_name)[0], chain.id)
                    seqs.append(sequence(chain_seq, seq_name))
                    found_protein = True
            if not found_protein:
                raise Sorry(
                    ("The PDB file %s does not contain any recognizable " +
                     "protein chains.") % file_name)
        else:
            try:
                seq_objects, _non_compliant = any_sequence_format(
                    file_name, assign_name_if_not_defined=True)
                # Kept inside the try block on purpose: a result that cannot
                # be iterated is reported as a parsing failure as well.
                seqs.extend(seq_objects)
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3, whereas the
                # former "except Exception, e" is a SyntaxError on Python 3.
                raise Sorry(
                    ("Error parsing '%s' - not a recognizable sequence " +
                     "format. (Original message: %s)") % (file_name, str(e)))
def get_muscle_alignment_ordered(sequences, out=None):
    """Align ``sequences`` and return the rows in the caller's order.

    Each distinct sequence object is given a temporary label ``Chain_<n>``,
    where ``n`` is the 1-based position of its first occurrence. The labelled
    sequences are joined with newlines and handed to ``get_muscle_alignment``.
    The aligned rows are then looked up by label and emitted in the order of
    ``sequences``, carrying the original ``seq.name`` values.

    Args:
        sequences: iterable of hashable objects exposing ``.sequence`` and
            ``.name``; it is traversed more than once.
        out: forwarded unchanged to ``get_muscle_alignment``.

    Returns:
        A ``bioinformatics.clustal_alignment`` built from the original names,
        the reordered alignment rows and ``alignment.program``.
    """
    from iotbx import bioinformatics

    # First occurrence of a sequence object decides its label; repeats reuse it.
    label_of = {}
    for position, seq in enumerate(sequences, start=1):
        label_of.setdefault(seq, "Chain_%d" % position)

    fasta_text = "\n".join(
        str(bioinformatics.sequence(name=label, sequence=seq.sequence))
        for (seq, label) in label_of.items()
    )
    alignment = get_muscle_alignment(fasta_sequences=fasta_text, out=out)

    row_by_label = dict(zip(alignment.names, alignment.alignments))
    assert all(label in row_by_label for label in label_of.values())

    ordered_names = [seq.name for seq in sequences]
    ordered_rows = [row_by_label[label_of[seq]] for seq in sequences]
    return bioinformatics.clustal_alignment(
        names=ordered_names,
        alignments=ordered_rows,
        program=alignment.program,
    )
def get_muscle_alignment_ordered(sequences, out=None):
    """Align ``sequences`` and return the rows in the caller's order.

    Each distinct sequence object is given a temporary label ``Chain_<n>``,
    where ``n`` is the 1-based position of its first occurrence. The labelled
    sequences are handed to ``get_muscle_alignment``, which is unpacked as an
    ``(alignment, errors)`` pair.

    Error handling:

    * a message containing ``'Invalid character'`` makes every sequence for
      which ``validate_sequence`` reports invalid residues get dropped; the
      surviving sequences keep their original relative order;
    * any other non-empty message is raised as ``Sorry``.

    NOTE(review): the alignment is not recomputed after sequences are
    dropped; the assertion below only checks that the remaining labels are
    present in the alignment that was returned - confirm this is intended.

    Args:
        sequences: iterable of hashable objects exposing ``.sequence`` and
            ``.name``; it is traversed more than once.
        out: forwarded unchanged to ``get_muscle_alignment``.

    Returns:
        A ``bioinformatics.clustal_alignment`` built from the original names
        of the retained sequences, their alignment rows and
        ``alignment.program``.

    Raises:
        Sorry: for any non-empty error that is not an invalid-character one.
    """
    from iotbx import bioinformatics
    from iotbx.pdb.amino_acid_codes import validate_sequence

    # First occurrence of a sequence object decides its label; repeats reuse it.
    name_for = {}
    for (i, seq) in enumerate(sequences, start=1):
        name_for.setdefault(seq, "Chain_%d" % i)

    alignment, errors = get_muscle_alignment(
        fasta_sequences="\n".join(
            str(bioinformatics.sequence(name=name, sequence=seq.sequence))
            for (seq, name) in name_for.items()
        ),
        out=out,
    )

    # check for errors and handle:
    #   invalid characters in sequences
    for error in errors:
        error = error.strip()
        if "Invalid character" in error:
            dropped = False
            # Iterate over a snapshot of the keys: removing entries while
            # walking the live key view raises RuntimeError on Python 3.
            for seq in list(name_for):
                invalid = validate_sequence(
                    seq.sequence,
                    protein=True,
                    strict_protein=False,
                    nucleic_acid=True,
                    strict_nucleic_acid=False)
                if len(invalid) > 0:
                    name_for.pop(seq)
                    dropped = True
            if dropped:
                # Filter the caller's list instead of taking the dict keys,
                # so the input order (and repeated entries) is retained.
                sequences = [seq for seq in sequences if seq in name_for]
        elif len(error) > 0:
            raise Sorry(error)

    lookup = dict(zip(alignment.names, alignment.alignments))
    assert all(n in lookup for n in name_for.values())
    return bioinformatics.clustal_alignment(
        names=[seq.name for seq in sequences],
        alignments=[lookup[name_for[seq]] for seq in sequences],
        program=alignment.program,
    )