def toByteArray(structure, compressed):
    '''Encode a structure as an MMTF byte array, gzipping it on request.

    The structure is run through ``MMTFEncoder.encode_data``, packed with
    ``msgpack.packb`` and wrapped in a ``bytearray``.

    Args:
        structure: object accepted by ``MMTFEncoder.encode_data``.
        compressed: when truthy, the packed bytes are passed through
            ``gzip.compress`` before being returned.

    Returns:
        MMTF encoded and optionally gzipped structure data
    '''
    encoded = MMTFEncoder.encode_data(structure)
    packed = bytearray(msgpack.packb(encoded))
    return gzip.compress(packed) if compressed else packed
def _to_byte_array(structure, compressed):
    '''Encode a structure as an MMTF byte array, gzipping it on request.

    An ``MmtfStructure`` whose ``alt_loc_set`` flag is falsy is first replaced
    by the result of its ``set_alt_loc_list()`` call; any other input is
    encoded as given.

    Parameters
    ----------
    structure
        Object accepted by ``MMTFEncoder.encode_data``.
    compressed
        When truthy, the packed bytes are passed through ``gzip.compress``.

    Returns
    -------
    MMTF encoded and optionally gzipped structure data
    '''
    # Exact type match: subclasses of MmtfStructure are not normalised here.
    if type(structure) is MmtfStructure and not structure.alt_loc_set:
        structure = structure.set_alt_loc_list()

    encoded = MMTFEncoder.encode_data(structure)
    packed = bytearray(msgpack.packb(encoded, use_bin_type=True))
    return gzip.compress(packed) if compressed else packed
def __call__(self, t):
    '''Build one ``MMTFEncoder`` per bioassembly of the structure in ``t[1]``.

    For every entry of ``structure.bio_assembly`` the method first counts the
    atoms, bonds, groups and chains selected by the entry's transformations,
    then replays the structure chain by chain, writing each selected chain
    once per transformation with its coordinates multiplied by that
    transformation's 4x4 matrix.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the loop structure against the
    upstream file before relying on it.

    Parameters
    ----------
    t
        Indexable whose element ``1`` is the structure; ``t[0]`` is not used.

    Returns
    -------
    list
        ``(structureId, encoder)`` tuples, one per bioassembly, where
        ``structureId`` is ``structure_id + '-BioAssembly' + <assembly name>``.
    '''
    structure = t[1]
    structure = structure.set_alt_loc_list()
    # print(structure.group_type_list)
    bioassemblies = structure.bio_assembly
    numBioassembly = len(bioassemblies)
    resList = list()

    for i in range(numBioassembly):
        bioAssembly = MMTFEncoder()
        # Assembly dictionaries are keyed by bytes; the name is decoded to str.
        structureId = structure.structure_id + '-BioAssembly' + \
            bioassemblies[i][b'name'].decode('utf-8')
        totAtoms = 0
        totBonds = 0
        totGroups = 0
        totChains = 0
        totModels = structure.num_models
        numTrans = len(bioassemblies[i][b'transformList'])

        # Placeholders only: every slot is overwritten in the loop below, so
        # the shared inner list created by ``[[]] * n`` is never mutated.
        bioChainList = [[]] * numTrans
        transMatrix = [[]] * numTrans

        # Pass 1: collect the transformations and pre-compute the totals
        # handed to init_structure.
        for ii in range(numTrans):
            bioChainList[ii] = bioassemblies[i][b'transformList'][ii][
                b'chainIndexList']
            transMatrix[ii] = bioassemblies[i][b'transformList'][ii][
                b'matrix']

            for j in range(totModels):
                totChains = totChains + len(bioChainList[ii])
                groupCounter = 0
                # NOTE(review): groupCounter restarts at 0 for every model and
                # groups_per_chain is indexed by the per-model chain number k;
                # confirm this is intended for multi-model structures.
                for k in range(structure.chains_per_model[j]):
                    adding = False
                    for currChain in bioChainList[ii]:
                        if currChain == k:
                            adding = True
                    if adding:
                        totGroups = totGroups + \
                            structure.groups_per_chain[k]
                    for h in range(structure.groups_per_chain[k]):
                        if adding:
                            groupIndex = structure.group_type_list[
                                groupCounter]
                            totAtoms = totAtoms + \
                                len(structure.group_list[groupIndex]
                                    ['atomNameList'])
                            totBonds = totBonds + \
                                len(structure.group_list[groupIndex]
                                    ['bondOrderList'])
                        # Advanced for skipped chains too, so it keeps
                        # pointing at the current group of the source.
                        groupCounter = groupCounter + 1

        # Set header
        bioAssembly.init_structure(totBonds, totAtoms, totGroups, totChains,
                                   totModels, structureId)
        decoder_utils.add_xtalographic_info(structure, bioAssembly)
        decoder_utils.add_header_info(structure, bioAssembly)

        # Cursors into the flat per-model/chain/group/atom lists of the source
        # structure; chainCounter numbers the chains written to the output.
        modelIndex = 0
        chainIndex = 0
        groupIndex = 0
        atomIndex = 0
        chainCounter = 0

        # Pass 2: write the selected chains, once per transformation.
        for ii in range(totModels):
            numChainsPerModel = structure.chains_per_model[
                modelIndex] * numTrans
            bioAssembly.set_model_info(modelIndex, numChainsPerModel)
            chainToEntityIndex = self._getChainToEntityIndex(structure)

            for j in range(structure.chains_per_model[modelIndex]):
                # Remember where this chain starts so each transformation can
                # rewind to it.
                currGroupIndex = groupIndex
                currAtomIndex = atomIndex

                for k in range(numTrans):
                    currChainList = bioChainList[k]
                    currMatrix = transMatrix[k]
                    addThisChain = False
                    for currChain in currChainList:
                        if currChain == j:
                            addThisChain = True

                    # Rewind to the start of the chain for this transformation.
                    groupIndex = currGroupIndex
                    atomIndex = currAtomIndex
                    xCoords = structure.x_coord_list
                    yCoords = structure.y_coord_list
                    zCoords = structure.z_coord_list
                    # The flat matrix is viewed as 4x4 (presumably numpy
                    # reshape/matrix — confirm against the file's imports).
                    m = reshape(matrix(currMatrix), (4, 4))

                    if addThisChain:
                        entityToChainIndex = chainToEntityIndex[chainIndex]
                        bioAssembly.set_entity_info(
                            [chainCounter],
                            structure.entity_list[entityToChainIndex]
                            ['sequence'],
                            structure.entity_list[entityToChainIndex]
                            ['description'],
                            structure.entity_list[entityToChainIndex]
                            ['type'])
                        bioAssembly.set_chain_info(
                            structure.chain_id_list[chainIndex],
                            structure.chain_name_list[chainIndex],
                            structure.groups_per_chain[chainIndex])
                        chainCounter = chainCounter + 1

                    # The groups are walked even when the chain is skipped so
                    # that groupIndex/atomIndex end up past this chain.
                    for jj in range(
                            structure.groups_per_chain[chainIndex]):
                        # print(structure.group_type_list)
                        currgroup = structure.group_type_list[groupIndex]
                        # if ii == 0 and j == 0 and jj < 10:
                        #     print(currgroup)
                        if addThisChain:
                            bioAssembly.set_group_info(
                                structure.group_list[currgroup]
                                ['groupName'],
                                structure.group_id_list[groupIndex],
                                structure.ins_code_list[groupIndex],
                                structure.group_list[currgroup]
                                ['chemCompType'],
                                len(structure.group_list[currgroup]
                                    ['atomNameList']),
                                len(structure.group_list[currgroup]
                                    ['bondOrderList']),
                                structure.group_list[currgroup]
                                ['singleLetterCode'],
                                structure.sequence_index_list[groupIndex],
                                structure.sec_struct_list[groupIndex])

                        for kk in range(
                                len(structure.group_list[currgroup]
                                    ['atomNameList'])):
                            if addThisChain:
                                # [x, y, z, 1] row vector multiplied by the
                                # 4x4 matrix; the first three components of
                                # the product become the new coordinates.
                                p1 = array([
                                    xCoords[atomIndex], yCoords[atomIndex],
                                    zCoords[atomIndex], 1
                                ])
                                p2 = matmul(p1, m)
                                bioAssembly.set_atom_info(
                                    structure.group_list[currgroup]
                                    ['atomNameList'][kk],
                                    structure.atom_id_list[atomIndex],
                                    structure.alt_loc_list[atomIndex],
                                    p2.item(0),
                                    p2.item(1),
                                    p2.item(2),
                                    structure.occupancy_list[atomIndex],
                                    structure.b_factor_list[atomIndex],
                                    structure.group_list[currgroup]
                                    ['elementList'][kk],
                                    structure.group_list[currgroup]
                                    ['formalChargeList'][kk],
                                )
                            atomIndex = atomIndex + 1

                        # Intra-group bonds are appended directly to the
                        # encoder's current group rather than through
                        # set_group_bond (see the commented-out call below).
                        if addThisChain:
                            for l in range(
                                    len(structure.group_list[currgroup]
                                        ['bondOrderList'])):
                                bondIndOne = structure.group_list[
                                    currgroup]['bondAtomList'][l * 2]
                                bondIndTwo = structure.group_list[
                                    currgroup]['bondAtomList'][l * 2 + 1]
                                bondOrder = structure.group_list[
                                    currgroup]['bondOrderList'][l]
                                #newChain.set_group_bond(bondIndOne, bondIndTwo, bondOrder)
                                bioAssembly.current_group.bond_atom_list += [
                                    bondIndOne, bondIndTwo
                                ]
                                bioAssembly.current_group.bond_order_list.append(
                                    bondOrder)
                        groupIndex = groupIndex + 1
                chainIndex = chainIndex + 1
            modelIndex = modelIndex + 1

        # print(type(currMatrix))
        bioAssembly.finalize_structure()
        resList.append((structureId, bioAssembly))
    return resList
def __call__(self, t):
    '''Split the structure in ``t[1]`` into one encoder per polymer chain.

    Only the chains of the first model (``chains_per_model[0]``) are visited.
    Chains whose entity type is not ``"polymer"`` are skipped, but their
    groups and atoms are still counted so the running indices stay aligned
    with the flat per-group and per-atom lists of the source structure.

    Parameters
    ----------
    t
        Indexable whose element ``1`` is the structure; ``t[0]`` is not used.

    Returns
    -------
    list
        ``(key, encoder)`` tuples where ``key`` is
        ``structure_id + "." + chain name`` (or chain id when
        ``self.useChainIdInsteadOfChainName`` is truthy). When
        ``self.excludeDuplicates`` is truthy, only the first chain of each
        entity is returned.
    '''
    structure = t[1]

    # Precalculate indices
    numChains = structure.chains_per_model[0]
    chainToEntityIndex = self._get_chain_to_entity_index(structure)
    atomsPerChain, bondsPerChain = self._get_num_atoms_and_bonds(structure)

    chainList = []
    seenEntities = set()
    groupCounter = 0
    atomCounter = 0

    for i in range(numChains):
        polymerChain = MMTFEncoder()
        entityIndex = chainToEntityIndex[i]
        entity = structure.entity_list[entityIndex]
        polymer = entity['type'] == "polymer"

        if polymer:
            # To avoid information loss, add chainName/IDs and entity id.
            # This is required by some queries.
            structureId = structure.structure_id + '.' + \
                structure.chain_name_list[i] + '.' + \
                structure.chain_id_list[i] + '.' + \
                str(entityIndex + 1)

            # Set header
            polymerChain.init_structure(bondsPerChain[i], atomsPerChain[i],
                                        structure.groups_per_chain[i], 1, 1,
                                        structureId)
            decoder_utils.add_xtalographic_info(structure, polymerChain)
            decoder_utils.add_header_info(structure, polymerChain)

            # Set model info (only one model: 0)
            polymerChain.set_model_info(0, 1)

            # Set entity and chain info
            polymerChain.set_entity_info([0], entity['sequence'],
                                         entity['description'],
                                         entity['type'])
            polymerChain.set_chain_info(structure.chain_id_list[i],
                                        structure.chain_name_list[i],
                                        structure.groups_per_chain[i])

        for _ in range(structure.groups_per_chain[i]):
            group = structure.group_list[
                structure.group_type_list[groupCounter]]

            if polymer:
                # Set group info
                polymerChain.set_group_info(
                    group['groupName'],
                    structure.group_id_list[groupCounter],
                    structure.ins_code_list[groupCounter],
                    group['chemCompType'],
                    len(group['atomNameList']),
                    len(group['bondOrderList']),
                    group['singleLetterCode'],
                    structure.sequence_index_list[groupCounter],
                    structure.sec_struct_list[groupCounter])

            for k, atomName in enumerate(group['atomNameList']):
                if polymer:
                    polymerChain.set_atom_info(
                        atomName,
                        structure.atom_id_list[atomCounter],
                        structure.alt_loc_list[atomCounter],
                        structure.x_coord_list[atomCounter],
                        structure.y_coord_list[atomCounter],
                        structure.z_coord_list[atomCounter],
                        structure.occupancy_list[atomCounter],
                        structure.b_factor_list[atomCounter],
                        group['elementList'][k],
                        group['formalChargeList'][k],
                    )
                # Counted for skipped chains too: atomCounter indexes the
                # structure-wide atom lists.
                atomCounter += 1

            if polymer:
                # Add intra-group bond info; bondAtomList stores the two atom
                # indices of bond l at positions 2*l and 2*l + 1.
                for l, bondOrder in enumerate(group['bondOrderList']):
                    polymerChain.set_group_bond(
                        group['bondAtomList'][l * 2],
                        group['bondAtomList'][l * 2 + 1],
                        bondOrder)
            groupCounter += 1

        if not polymer:
            continue

        # TODO skipping adding inter group bond info for now
        polymerChain.finalize_structure()

        if self.useChainIdInsteadOfChainName:
            chId = structure.chain_id_list[i]
        else:
            chId = structure.chain_name_list[i]

        if self.excludeDuplicates:
            if entityIndex in seenEntities:
                continue
            seenEntities.add(entityIndex)

        chainList.append((structure.structure_id + "." + chId, polymerChain))

    return chainList
def _save_structure(self, filepath, select):
    """Encode ``self.structure`` with ``MMTFEncoder`` and write it to a file.

    The encoder is initialised with zero counts because the real totals are
    only known after the ``select`` filter has been applied; the counts and
    the per-model / per-chain tallies are assigned on the encoder just before
    ``finalize_structure()`` is called.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the loop structure against the
    upstream file before relying on it.

    :param filepath: passed unchanged to ``encoder.write_file``.
    :param select: object providing ``accept_model``, ``accept_chain``,
        ``accept_residue`` and ``accept_atom``; anything it rejects is left
        out of the output.
    """
    count_models, count_chains, count_groups, count_atoms = 0, 0, 0, 0

    # If atom serials are missing, renumber atoms starting from 1
    atom_serials = [a.serial_number for a in self.structure.get_atoms()]
    renumber_atoms = None in atom_serials

    encoder = MMTFEncoder()
    # The counts are set to 0 here and changed later once we have the values
    encoder.init_structure(total_num_bonds=0,
                           total_num_atoms=0,
                           total_num_groups=0,
                           total_num_chains=0,
                           total_num_models=0,
                           structure_id=self.structure.id)

    encoder.set_xtal_info(space_group="", unit_cell=None)

    # The header information is missing for some structure objects;
    # defaultdict(str) makes every absent key read as "".
    header_dict = defaultdict(str, self.structure.header)
    if header_dict["resolution"] == "":
        header_dict["resolution"] = None
    if header_dict["structure_method"] == "":
        header_dict["structure_method"] = []
    else:
        # A present method is wrapped in a one-element list.
        header_dict["structure_method"] = [header_dict["structure_method"]]

    encoder.set_header_info(
        r_free=None,
        r_work=None,
        resolution=header_dict["resolution"],
        title=header_dict["name"],
        deposition_date=header_dict["deposition_date"],
        release_date=header_dict["release_date"],
        experimental_methods=header_dict["structure_method"])

    # Tracks values to replace them at the end
    chains_per_model = []
    groups_per_chain = []

    for mi, model in enumerate(self.structure.get_models()):
        if not select.accept_model(model):
            continue

        # A fresh chain-id iterator is obtained for every accepted model.
        chain_id_iterator = self._chain_id_iterator()

        count_models += 1
        encoder.set_model_info(
            model_id=mi,  # According to mmtf-python this is meaningless
            chain_count=0  # Set to 0 here and changed later
        )
        for chain in model.get_chains():
            if not select.accept_chain(chain):
                continue

            # One sequence string is collected per output chain started
            # within this input chain.
            seqs = []
            seq = ""
            prev_residue_type = ""
            prev_resname = ""
            first_chain = True

            for residue in chain.get_unpacked_list():
                if not select.accept_residue(residue):
                    continue

                count_groups += 1
                hetfield, resseq, icode = residue.get_id()
                if hetfield == " ":
                    residue_type = "ATOM"
                    entity_type = "polymer"
                elif hetfield == "W":
                    residue_type = "HETATM"
                    entity_type = "water"
                else:
                    residue_type = "HETATM"
                    entity_type = "non-polymer"
                resname = residue.get_resname()

                # Check if the molecule changes within the chain
                # This will always increment for the first residue in a
                # chain due to the starting values above
                # Checking for similar entities is non-trivial from the
                # structure object so we treat each molecule as a separate
                # entity
                if residue_type != prev_residue_type or (
                        residue_type == "HETATM"
                        and resname != prev_resname):
                    encoder.set_entity_info(
                        chain_indices=[count_chains],
                        sequence="",  # Set to empty here and changed later
                        description="",
                        entity_type=entity_type)
                    encoder.set_chain_info(
                        chain_id=next(chain_id_iterator),
                        chain_name="\x00"
                        if len(chain.get_id().strip()) == 0 else
                        chain.get_id(),
                        num_groups=0  # Set to 0 here and changed later
                    )
                    # Close the previous output chain: its group count is
                    # everything counted so far minus the chains already
                    # recorded, minus the current residue (already counted
                    # above but belonging to the new chain).
                    if count_chains > 0:
                        groups_per_chain.append(count_groups -
                                                sum(groups_per_chain) - 1)
                    if not first_chain:
                        seqs.append(seq)
                    first_chain = False
                    count_chains += 1
                    seq = ""

                if entity_type == "polymer":
                    seq += seq1(resname, custom_map=protein_letters_3to1)

                prev_residue_type = residue_type
                prev_resname = resname

                encoder.set_group_info(
                    group_name=resname,
                    group_number=residue.id[1],
                    insertion_code="\x00"
                    if residue.id[2] == " " else residue.id[2],
                    group_type=
                    "",  # Value in the chemcomp dictionary, which is unknown here
                    atom_count=sum(1 for a in residue.get_unpacked_list()
                                   if select.accept_atom(a)),
                    bond_count=0,
                    single_letter_code=seq1(
                        resname, custom_map=protein_letters_3to1),
                    # Index of the letter just appended to seq for polymer
                    # residues, -1 otherwise.
                    sequence_index=len(seq) -
                    1 if entity_type == "polymer" else -1,
                    secondary_structure_type=-1)

                for atom in residue.get_unpacked_list():
                    if select.accept_atom(atom):
                        count_atoms += 1
                        encoder.set_atom_info(
                            atom_name=atom.name,
                            # The running count doubles as the serial when
                            # any atom lacks one.
                            serial_number=count_atoms
                            if renumber_atoms else atom.serial_number,
                            alternative_location_id="\x00"
                            if atom.altloc == " " else atom.altloc,
                            x=atom.coord[0],
                            y=atom.coord[1],
                            z=atom.coord[2],
                            occupancy=atom.occupancy,
                            temperature_factor=atom.bfactor,
                            element=atom.element,
                            charge=0)

            seqs.append(seq)
            # Now that we have the sequences, edit the entities to add them;
            # the entities for this input chain are the last len(seqs)
            # entries of encoder.entity_list.
            start_ind = len(encoder.entity_list) - len(seqs)
            for i, seq in enumerate(seqs):
                encoder.entity_list[start_ind + i]["sequence"] = seq

        chains_per_model.append(count_chains - sum(chains_per_model))

    # Group count of the last output chain, which no later chain closes.
    groups_per_chain.append(count_groups - sum(groups_per_chain))

    # Replace the placeholder values given to the encoder above.
    encoder.chains_per_model = chains_per_model
    encoder.groups_per_chain = groups_per_chain
    encoder.num_atoms = count_atoms
    encoder.num_groups = count_groups
    encoder.num_chains = count_chains
    encoder.num_models = count_models

    encoder.finalize_structure()
    encoder.write_file(filepath)
def _combine_chains(self, s1, s2):
    '''Combine the first chain of ``s1`` and of ``s2`` into one structure.

    The result has a single model with two chains: chain 0 is copied from
    ``s1`` and chain 1 from ``s2``. Header and crystallographic information
    are taken from ``s1`` only. Only intra-group bonds are copied.

    Parameters
    ----------
    s1, s2
        Source structures. If ``alt_loc_set`` is falsy on either, it is
        replaced by the result of its ``set_alt_loc_list()`` call first.

    Returns
    -------
    tuple
        ``(structureId, encoder)`` where ``structureId`` is
        ``s1.structure_id + "_append_" + s2.structure_id`` and ``encoder`` is
        the finalized ``MMTFEncoder``.
    '''
    if not s1.alt_loc_set:
        s1 = s1.set_alt_loc_list()
    if not s2.alt_loc_set:
        s2 = s2.set_alt_loc_list()

    structureId = s1.structure_id + "_append_" + s2.structure_id
    combinedStructure = MMTFEncoder()

    # Set header
    combinedStructure.init_structure(s1.num_bonds + s2.num_bonds,
                                     s1.num_atoms + s2.num_atoms,
                                     s1.num_groups + s2.num_groups, 2, 1,
                                     structureId)
    decoder_utils.add_xtalographic_info(s1, combinedStructure)
    decoder_utils.add_header_info(s1, combinedStructure)

    # Set model info (only one model: 0)
    combinedStructure.set_model_info(0, 2)

    def _append_first_chain(source, chainSlot):
        # Copy chain 0 of ``source`` into the combined structure as chain
        # number ``chainSlot``.
        entity = source.entity_list[
            self._get_chain_to_entity_index(source)[0]]

        # Set entity and chain info
        combinedStructure.set_entity_info([chainSlot], entity['sequence'],
                                          entity['description'],
                                          entity['type'])
        combinedStructure.set_chain_info(source.chain_id_list[0],
                                         source.chain_name_list[0],
                                         source.groups_per_chain[0])

        # Chain 0 starts at group 0 / atom 0 of the source's flat lists.
        atomCounter = 0
        for groupCounter in range(source.groups_per_chain[0]):
            group = source.group_list[source.group_type_list[groupCounter]]

            # Set group info
            combinedStructure.set_group_info(
                group['groupName'],
                source.group_id_list[groupCounter],
                source.ins_code_list[groupCounter],
                group['chemCompType'],
                len(group['atomNameList']),
                len(group['bondOrderList']),
                group['singleLetterCode'],
                source.sequence_index_list[groupCounter],
                source.sec_struct_list[groupCounter])

            for j, atomName in enumerate(group['atomNameList']):
                combinedStructure.set_atom_info(
                    atomName,
                    source.atom_id_list[atomCounter],
                    source.alt_loc_list[atomCounter],
                    source.x_coord_list[atomCounter],
                    source.y_coord_list[atomCounter],
                    source.z_coord_list[atomCounter],
                    source.occupancy_list[atomCounter],
                    source.b_factor_list[atomCounter],
                    group['elementList'][j],
                    group['formalChargeList'][j])
                atomCounter += 1

            # TODO not sure if we should add bonds like this
            # TODO bondAtomList == getGroupBondIndices?
            # bondAtomList stores the two atom indices of bond k at
            # positions 2*k and 2*k + 1.
            for k, bondOrder in enumerate(group['bondOrderList']):
                combinedStructure.set_group_bond(
                    group['bondAtomList'][k * 2],
                    group['bondAtomList'][k * 2 + 1],
                    bondOrder)

    _append_first_chain(s1, 0)
    _append_first_chain(s2, 1)

    combinedStructure.finalize_structure()
    return (structureId, combinedStructure)
def _split_to_chains(self, s):
    '''Split a structure into per-chain encoders and keep the peptide chains.

    Each chain of the first model (``chains_per_model[0]``) is copied into
    its own ``MMTFEncoder``. A chain is returned only if its entity type is
    ``"polymer"`` and every group in it has a ``chemCompType`` of
    ``"L-PEPTIDE LINKING"`` or ``"PEPTIDE LINKING"``. Only intra-group bonds
    are copied.

    Parameters
    ----------
    s
        Source structure.

    Returns
    -------
    list
        Finalized ``MMTFEncoder`` objects, one per retained chain, in chain
        order.
    '''
    peptideTypes = ("L-PEPTIDE LINKING", "PEPTIDE LINKING")

    chains = []
    numChains = s.chains_per_model[0]
    chainToEntityIndex = self._get_chain_to_entity_index(s)
    atomsPerChain, bondsPerChain = self._get_num_atoms_and_bonds(s)

    # Running positions in the structure-wide per-group / per-atom lists.
    groupCounter = 0
    atomCounter = 0

    for i in range(numChains):
        newChain = MMTFEncoder()
        entityToChainIndex = chainToEntityIndex[i]
        entity = s.entity_list[entityToChainIndex]

        structureId = s.structure_id + '.' + \
            s.chain_name_list[i] + '.' + \
            s.chain_id_list[i] + '.' + \
            str(entityToChainIndex + 1)

        # Set header
        newChain.init_structure(bondsPerChain[i], atomsPerChain[i],
                                s.groups_per_chain[i], 1, 1, structureId)
        decoder_utils.add_xtalographic_info(s, newChain)
        decoder_utils.add_header_info(s, newChain)

        # Set model info (only one model: 0)
        newChain.set_model_info(0, 1)

        # Set entity and chain info
        newChain.set_entity_info([0], entity['sequence'],
                                 entity['description'], entity['type'])
        newChain.set_chain_info(s.chain_id_list[i], s.chain_name_list[i],
                                s.groups_per_chain[i])

        for _ in range(s.groups_per_chain[i]):
            group = s.group_list[s.group_type_list[groupCounter]]

            # Set group info
            newChain.set_group_info(
                group['groupName'],
                s.group_id_list[groupCounter],
                s.ins_code_list[groupCounter],
                group['chemCompType'],
                len(group['atomNameList']),
                len(group['bondOrderList']),
                group['singleLetterCode'],
                s.sequence_index_list[groupCounter],
                s.sec_struct_list[groupCounter])

            for k, atomName in enumerate(group['atomNameList']):
                newChain.set_atom_info(
                    atomName,
                    s.atom_id_list[atomCounter],
                    s.alt_loc_list[atomCounter],
                    s.x_coord_list[atomCounter],
                    s.y_coord_list[atomCounter],
                    s.z_coord_list[atomCounter],
                    s.occupancy_list[atomCounter],
                    s.b_factor_list[atomCounter],
                    group['elementList'][k],
                    group['formalChargeList'][k])
                atomCounter += 1

            # Bonds are appended directly to the encoder's current group
            # instead of going through set_group_bond; bondAtomList stores
            # the two atom indices of bond l at positions 2*l and 2*l + 1.
            for l, bondOrder in enumerate(group['bondOrderList']):
                newChain.current_group.bond_atom_list += [
                    group['bondAtomList'][l * 2],
                    group['bondAtomList'][l * 2 + 1]
                ]
                newChain.current_group.bond_order_list.append(bondOrder)
            groupCounter += 1

        # TODO skipping adding inter group bond info for now
        newChain.finalize_structure()

        # TODO double check if just getting from chain 0
        if "polymer" not in [e['type'] for e in newChain.entity_list]:
            continue

        # Stops at the first non-peptide group; a chain with no groups
        # counts as a match.
        isPeptide = all(
            newChain.group_list[newChain.group_type_list[j]]["chemCompType"]
            in peptideTypes
            for j in range(newChain.groups_per_chain[0]))
        if isPeptide:
            chains.append(newChain)

    return chains