Exemplo n.º 1
0
def test_adjacency_matrix(cell_size, threshold, periodic):
    """
    Check that the adjacency matrix obtained from a cell list equals
    the one derived from a brute-force all-against-all distance matrix.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    if periodic:
        # Use the coordinate extent along each axis
        # as the edge lengths of an orthorhombic box
        upper = np.max(atoms.coord, axis=-2)
        lower = np.min(atoms.coord, axis=-2)
        atoms.box = np.diag(upper - lower)
    cells = struc.CellList(atoms, cell_size=cell_size, periodic=periodic)
    matrix = cells.create_adjacency_matrix(threshold)

    # Reference: distances between all pairs of atoms
    # Convert to float64 to avoid an erroneous warning
    # https://github.com/ContinuumIO/anaconda-issues/issues/9129
    atoms.coord = atoms.coord.astype(np.float64)
    n_atoms = atoms.array_length()
    atom_indices = np.arange(n_atoms)
    # Every (i, j) index combination, in row-major order
    index_pairs = np.stack(
        [np.repeat(atom_indices, n_atoms), np.tile(atom_indices, n_atoms)],
        axis=-1,
    )
    distances = struc.index_distance(atoms, index_pairs, periodic)
    distances = np.reshape(distances, (n_atoms, n_atoms))
    expected_matrix = distances <= threshold

    # The cell list result must agree with the reference
    assert np.array_equal(matrix, expected_matrix)
Exemplo n.º 2
0
def test_get_atoms(cell_size):
    """
    Verify `CellList.get_atoms()` on five atoms placed along the z-axis,
    for which the expected neighbor indices are known.
    """
    atoms = struc.AtomArray(length=5)
    atoms.coord = np.array([[0, 0, z] for z in range(5)])
    cells = struc.CellList(atoms, cell_size=cell_size)

    # Single position, single radius: (position, radius, expected)
    single_cases = [
        ([0, 0, 0.1], 1, [0, 1]),
        ([0, 0, 1.1], 1, [1, 2]),
        ([0, 0, 1.1], 2, [0, 1, 2, 3]),
    ]
    for position, radius, expected in single_cases:
        found = cells.get_atoms(np.array(position), radius)
        assert found.tolist() == expected

    positions = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]])

    # Multiple positions, one shared radius
    # (entries equal to -1 are filtered out before comparison)
    found = cells.get_atoms(positions, 2)
    assert found[found != -1].tolist() == [0, 1, 2, 0, 1, 2, 3, 3, 4]

    # Multiple positions, one radius per position
    radii = np.array([1.0, 2.0, 3.0])
    found = cells.get_atoms(positions, radii)
    assert found[found != -1].tolist() == [0, 1, 0, 1, 2, 3, 2, 3, 4]
Exemplo n.º 3
0
    def __getitem__(self, index):
        """
        Load the structure file at position `index` and return the
        adjacency matrix of its atoms named "CA".

        Parameters
        ----------
        index : int
            Position in `self.files` of the file to load from
            `self.fileDir`.

        Returns
        -------
        adj_matrix : ndarray, dtype=float64
            Square matrix with 1.0 where two "CA" atoms are within
            `self.threshold` of each other and 0.0 otherwise.
            If the number of "CA" atoms is odd, one all-zero row and
            column are appended so that the side length becomes even.
        """
        path = os.path.join(self.fileDir, self.files[index])
        # Debug output retained from the original implementation
        print(path)
        array = strucio.load_structure(path)
        # A multi-model file is loaded as a stack: use its first model.
        # 'isinstance' also covers subclasses of 'AtomArrayStack'
        if isinstance(array, biotite.structure.AtomArrayStack):
            array = array[0]

        ca = array[array.atom_name == "CA"]
        cell_list = struc.CellList(ca, cell_size=self.threshold)
        adj_matrix = cell_list.create_adjacency_matrix(
            self.threshold).astype(int)

        if adj_matrix.shape[0] % 2 != 0:
            print(adj_matrix.shape)
            # Append a single zero row and zero column at the end
            adj_matrix = np.pad(adj_matrix, ((0, 1), (0, 1)),
                                mode="constant", constant_values=0)
            print(adj_matrix.shape)

        return adj_matrix.astype('double')
Exemplo n.º 4
0
def test_outside_location():
    """
    A query position far away from every cell must yield no atoms.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    atoms = atoms[struc.filter_amino_acids(atoms)]
    cells = struc.CellList(atoms, cell_size=5)
    # Move 100 units below the minimum corner of the structure
    far_away = np.min(atoms.coord, axis=0) - 100
    neighbors = cells.get_atoms(far_away, 5)
    # No atom may be reported
    assert len(neighbors) == 0
Exemplo n.º 5
0
def detect_disulfide_bonds(structure,
                           distance=2.05,
                           distance_tol=0.05,
                           dihedral=90,
                           dihedral_tol=10):
    """
    Detect disulfide bonds between the "SG" atoms of "CYS" residues.

    Two "SG" atoms are reported as bonded when their distance lies
    strictly within ``distance +/- distance_tol`` and the absolute
    CB-SG-SG-CB dihedral angle (in degrees) lies strictly within
    ``dihedral +/- dihedral_tol``.

    Parameters
    ----------
    structure : AtomArray
        The structure to search in.
    distance : float, optional
        Expected SG-SG distance.
    distance_tol : float, optional
        Accepted deviation from `distance`.
    dihedral : float, optional
        Expected absolute dihedral angle in degrees.
    dihedral_tol : float, optional
        Accepted deviation from `dihedral` in degrees.

    Returns
    -------
    bonds : ndarray, dtype=int, shape=(k,2)
        Atom index pairs of the bonded "SG" atoms, lower index first.
        Each bond appears once. The shape is ``(0, 2)`` if no bond
        was found.
    """
    # List where detected disulfide bonds are stored
    disulfide_bonds = []
    # A mask that selects only S-gamma atoms of cysteines
    sulfide_mask = (structure.res_name == "CYS") & \
                   (structure.atom_name == "SG")
    # Sulfides in adjacency to other sulfides are detected in an
    # efficient manner via a cell list
    cell_list = struc.CellList(structure,
                               cell_size=distance + distance_tol,
                               selection=sulfide_mask)
    # The C-beta mask does not depend on the atom pair
    cb_mask = structure.atom_name == "CB"

    def beta_carbon(sg):
        # C-beta atom(s) in the same chain and residue as 'sg'
        return structure[(structure.chain_id == sg.chain_id)
                         & (structure.res_id == sg.res_id) & cb_mask]

    # Iterate over every index corresponding to an S-gamma atom
    for sulfide_i in np.where(sulfide_mask)[0]:
        # These lookups are identical for all partners of 'sulfide_i',
        # so they are done once per outer iteration
        sg1 = structure[sulfide_i]
        cb1 = beta_carbon(sg1)
        # The faster 'get_atoms_in_cells()' is used instead of
        # 'get_atoms()', as the precise distance is measured below anyway
        potential_bond_partner_indices = cell_list.get_atoms_in_cells(
            coord=structure.coord[sulfide_i])
        for sulfide_j in potential_bond_partner_indices:
            if sulfide_i == sulfide_j:
                # A sulfide cannot create a bond with itself
                continue
            sg2 = structure[sulfide_j]
            # For dihedral angle measurement the corresponding
            # C-beta atoms are required, too
            cb2 = beta_carbon(sg2)
            # Measure distance and dihedral angle and check criteria
            bond_dist = struc.distance(sg1, sg2)
            bond_dihed = np.abs(np.rad2deg(struc.dihedral(cb1, sg1, sg2, cb2)))
            if bond_dist  > distance - distance_tol and \
               bond_dist  < distance + distance_tol and \
               bond_dihed > dihedral - dihedral_tol and \
               bond_dihed < dihedral + dihedral_tol:
                # Criteria are met -> store the pair with the lower
                # index first, but each bond only once
                bond_pair = sorted((sulfide_i, sulfide_j))
                if bond_pair not in disulfide_bonds:
                    disulfide_bonds.append(bond_pair)
    # The reshape keeps the (k, 2) layout even for an empty result
    return np.array(disulfide_bonds, dtype=int).reshape(-1, 2)
Exemplo n.º 6
0
def find_leaflets(structure,
                  head_atom_mask,
                  cutoff_distance=15.0,
                  periodic=False):
    """
    Group lipid molecules into the bilayer leaflets they belong to.

    Head group atoms closer than `cutoff_distance` are linked to each
    other; every connected group of more than one head atom is
    considered a leaflet.

    Parameters
    ----------
    structure : AtomArray, shape=(n,)
        The structure containing the membrane.
        It may contain further molecules, e.g. water or an embedded
        protein.
    head_atom_mask : ndarray, dtype=bool, shape=(n,)
        Boolean mask selecting the atoms of `structure` that
        represent lipid head groups.
    cutoff_distance : float, optional
        Two head groups farther apart than this value are not
        (directly) connected in the same leaflet.
    periodic : bool, optional
        If true, periodic boundary conditions are taken into account.
        This requires that `structure` has an associated `box`.

    Returns
    -------
    leaflets : ndarray, dtype=bool, shape=(m,n)
        One boolean mask per identified leaflet.
        Each mask marks the atoms of `structure` that are part of
        the leaflet.
    """
    head_cells = struc.CellList(
        structure,
        cell_size=cutoff_distance,
        selection=head_atom_mask,
        periodic=periodic,
    )
    head_graph = nx.Graph(head_cells.create_adjacency_matrix(cutoff_distance))

    # Each connected component is a candidate leaflet.
    # A single lipid cannot form a leaflet; dropping singletons
    # also removes the entries of atoms not in 'head_atom_mask'
    head_leaflets = []
    for component in nx.connected_components(head_graph):
        if len(component) > 1:
            head_leaflets.append(sorted(component))

    # 'head_leaflets' holds head atom indices:
    # expand every head atom to all atoms of its residue
    leaflet_masks = np.empty(
        (len(head_leaflets), structure.array_length()), dtype=bool
    )
    for row, head_indices in enumerate(head_leaflets):
        residue_masks = struc.get_residue_masks(structure, head_indices)
        leaflet_masks[row] = residue_masks.any(axis=0)
    return leaflet_masks
Exemplo n.º 7
0
def test_selection():
    """
    Test whether the `selection` parameter in the constructor works.
    This is tested by comparing the selection done prior to cell list
    creation with the selection done in the cell list construction.
    """
    array = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    # Select every second atom, starting with the second one.
    # Deriving the mask from the atom indices guarantees that its
    # length equals the atom count, also for an odd number of atoms
    selection = np.arange(array.array_length()) % 2 == 1

    # Selection prior to cell list creation
    selected = array[selection]
    cell_list = struc.CellList(selected, cell_size=10)
    ref_near_atoms = selected[cell_list.get_atoms(array.coord[0], 20.0)]

    # Selection in cell list creation
    cell_list = struc.CellList(array, cell_size=10, selection=selection)
    test_near_atoms = array[cell_list.get_atoms(array.coord[0], 20.0)]

    assert test_near_atoms == ref_near_atoms
Exemplo n.º 8
0
def water_in_prox(atoms, sele, cutoff):
    """
    Get the atom indices of water oxygen atoms that are in vicinity of
    the selected atoms.

    Parameters
    ----------
    atoms : AtomArray
        The structure; water oxygen atoms are those named "OW".
    sele : index, slice or boolean mask
        Selects the atoms of `atoms` around which water is searched.
    cutoff : float
        Search radius around each selected atom.

    Returns
    -------
    indices : ndarray, dtype=int
        Sorted, unique indices into `atoms` of the "OW" atoms found
        within `cutoff` of at least one selected atom.
    """
    cell_list = struct.CellList(atoms, cell_size=5,
                                selection=atoms.atom_name == "OW")
    adjacent_atoms = cell_list.get_atoms(atoms[sele].coord, cutoff)
    adjacent_atoms = np.unique(adjacent_atoms.flatten())
    # Only -1 is a padding value; index 0 is a valid atom index
    # and must not be discarded
    adjacent_atoms = adjacent_atoms[adjacent_atoms != -1]
    return adjacent_atoms
Exemplo n.º 9
0
 def get_matrices(array):
     """
     Create a periodic and non-periodic adjacency matrix.
     """
     nonlocal CUTOFF
     if isinstance(array, struc.AtomArray):
         matrix     = struc.CellList(array, CUTOFF, periodic=False) \
                     .create_adjacency_matrix(CUTOFF)
         matrix_pbc = struc.CellList(array, CUTOFF, periodic=True) \
                     .create_adjacency_matrix(CUTOFF)
     elif isinstance(array, struc.AtomArrayStack):
         matrix = np.array([
             struc.CellList(model, CUTOFF,
                            periodic=False).create_adjacency_matrix(CUTOFF)
             for model in array
         ])
         matrix_pbc = np.array([
             struc.CellList(model, CUTOFF,
                            periodic=True).create_adjacency_matrix(CUTOFF)
             for model in array
         ])
     return matrix, matrix_pbc
Exemplo n.º 10
0
def test_adjacency_matrix(cell_size, threshold, periodic, use_selection):
    """
    Check that the adjacency matrix obtained from a cell list equals
    the one derived from a brute-force all-against-all distance
    matrix, optionally restricted to a random atom selection.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))

    if periodic:
        # Use the coordinate extent along each axis
        # as the edge lengths of an orthorhombic box
        extent = np.max(atoms.coord, axis=-2) - np.min(atoms.coord, axis=-2)
        atoms.box = np.diag(extent)

    selection = None
    if use_selection:
        # Fixed seed for a reproducible random mask
        np.random.seed(0)
        selection = np.random.choice((False, True), atoms.array_length())

    test_matrix = struc.CellList(
        atoms, cell_size=cell_size, periodic=periodic, selection=selection
    ).create_adjacency_matrix(threshold)

    # Reference: distances between all pairs of atoms
    n_atoms = atoms.array_length()
    atom_indices = np.arange(n_atoms)
    index_pairs = np.stack(
        [np.repeat(atom_indices, n_atoms), np.tile(atom_indices, n_atoms)],
        axis=-1,
    )
    distances = struc.index_distance(atoms, index_pairs, periodic)
    exp_matrix = np.reshape(distances, (n_atoms, n_atoms)) <= threshold
    if use_selection:
        # Atoms that were filtered out have no adjacency at all
        excluded = ~selection
        exp_matrix[excluded, :] = False
        exp_matrix[:, excluded] = False

    # The cell list result must agree with the reference
    assert np.array_equal(test_matrix, exp_matrix)
Exemplo n.º 11
0
def test_adjacency_matrix(cell_size, threshold):
    """
    Check that the adjacency matrix obtained from a cell list equals
    the one derived from an explicitly computed distance matrix.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    atoms = atoms[struc.filter_amino_acids(atoms)]
    cells = struc.CellList(atoms, cell_size=cell_size)
    matrix = cells.create_adjacency_matrix(threshold)

    # Pairwise displacement vectors via broadcasting
    positions = atoms.coord
    displacement = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
    # Convert to float64 to avoid an erroneous warning
    # https://github.com/ContinuumIO/anaconda-issues/issues/9129
    displacement = displacement.astype(np.float64)
    distances = np.sqrt(np.sum(displacement**2, axis=-1))
    expected_matrix = distances <= threshold

    # The cell list result must agree with the reference
    assert matrix.tolist() == expected_matrix.tolist()
Exemplo n.º 12
0
# Script excerpt: find the residues of two protein chains that are in
# contact with DNA. 'mmtf', 'mmtf_file' and 'THRESHOLD_DISTANCE' are
# defined outside of this excerpt.
structure = mmtf.get_structure(mmtf_file, model=1)

# Separate structure into the DNA (chains A and B) and the two protein
# chains (L and R); hetero atoms are excluded in all three cases
dna = structure[np.isin(structure.chain_id, ["A", "B"])
                & (structure.hetero == False)]
protein_l = structure[(structure.chain_id == "L")
                      & (structure.hetero == False)]
protein_r = structure[(structure.chain_id == "R")
                      & (structure.hetero == False)]
# Quick sanity check that the two protein chains match
# NOTE(review): if 'get_residues()' returns an (ids, names) tuple,
# 'len()' compares the tuple lengths, not the residue counts - confirm
assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r))

# Fast identification of contacts via a cell list:
# The cell list is initialized with the coordinates of the DNA
# and later queried with the atom coordinates of the two protein chains
cell_list = struc.CellList(dna, cell_size=THRESHOLD_DISTANCE)

# Sets to store the residue IDs of contact residues
# for each protein chain
id_set_l = set()
id_set_r = set()

for protein, res_id_set in zip((protein_l, protein_r), (id_set_l, id_set_r)):
    # For each atom in the protein chain,
    # find all atoms in the DNA that are in contact with it
    contacts = cell_list.get_atoms(protein.coord, radius=THRESHOLD_DISTANCE)
    # Only retain atoms in the protein with contact
    # to at least one atom of the DNA
    # (entries equal to -1 do not count as a contact)
    contact_indices = np.where((contacts != -1).any(axis=1))[0]
    # Get residue IDs for the atoms in the protein
    # NOTE(review): 'res_id_set' is not filled within this excerpt;
    # the loop body presumably continues beyond it
    contact_res_ids = protein.res_id[contact_indices]
Exemplo n.º 13
0
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into a graph with one node per atom.

    Two atoms are connected by an edge if they are within
    `cfg.threshold` of each other. Each atom pair is stored once,
    with the lower atom index first.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file; also used as the output file name.
    saveDir : str or False, optional
        If truthy, the graph is saved into this directory.

    Returns
    -------
    data : Data
        Graph whose node features are, in this order: coordinates,
        residue ID, B-factor, hetero flag, occupancy and the entries
        of `atomsDict`, `residualesDict` and `ssesTypeDict` for the
        atom name, residue name and secondary structure element.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    # Unique chain IDs in order of first appearance
    chainIdUnique = list(dict.fromkeys(array.chain_id))

    # Secondary structure via the DSSP algorithm
    sse = dssp.DsspApp.annotate_sse(array)

    # Chain and residue IDs of the CA atoms
    caMask = array.atom_name == 'CA'
    chainMask = array.chain_id[caMask]
    resMask = array.res_id[caMask]

    # If 'sse' is shorter than the masks, fill it up
    missing = resMask.shape[0] - sse.shape[0]
    if missing > 0:
        sse = np.append(sse, ['Null'] * missing)

    # Secondary structure element for each residue of each chain
    sseMaskDict = {chain: {} for chain in chainIdUnique}
    for chainId, resId, sseId in zip(chainMask, resMask, sse):
        sseMaskDict[chainId][resId] = sseId

    # Adjacency matrix
    cell_list = struc.CellList(array, cell_size=cfg.threshold)
    adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

    # Adjacency matrix -> COO format: 'np.nonzero' lists the entries
    # of the strict upper triangle (i < j) in row-major order
    edge_index = np.stack(np.nonzero(np.triu(adj_matrix, k=1)))

    # One feature vector per atom
    nodeFeatures = [
        list(array.coord[i]) + [
            array.res_id[i], array.b_factor[i],
            float(array.hetero[i]), array.occupancy[i]
        ] + atomsDict.get(array.atom_name[i], atomsDict['Null']) +
        residualesDict.get(array.res_name[i], residualesDict['Null']) +
        ssesTypeDict.get(
            sseMaskDict[array.chain_id[i]].get(array.res_id[i], 'Null'),
            ssesTypeDict['Null'])
        for i in range(array.shape[0])
    ]

    # Graph format
    data = Data(x=torch.tensor(nodeFeatures, dtype=torch.float),
                edge_index=torch.tensor(edge_index, dtype=torch.long))

    if saveDir:
        torch.save(data, os.path.join(saveDir, fileName))

    return data
Exemplo n.º 14
0
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into a graph with one node per atom.

    Two atoms are connected by an edge if they are within
    `cfg.threshold` of each other. Each atom pair is stored once,
    with the lower atom index first.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file; also used as the output file name.
    saveDir : str or False, optional
        If truthy, the graph is saved into this directory.

    Returns
    -------
    data : Data
        Graph whose node features are, in this order: coordinates,
        `atomsDict` entry, `elementsDict` entry, residue ID,
        `residualesDict` entry, hetero flag, occupancy, B-factor and
        `ssesTypeDict` entry of the secondary structure element.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    # Unique chain IDs in order of first appearance
    chain_id = list(dict.fromkeys(array.chain_id))

    # Secondary structure annotation, computed separately per chain
    sseDict = {chain: struc.annotate_sse(array, chain_id=chain)
               for chain in chain_id}

    # Secondary structure element for each residue of each chain
    caMask = array.atom_name == 'CA'
    sseMaskDict = {}
    for key, value in sseDict.items():
        resIds = array.res_id[(array.chain_id == key) & caMask]
        # If the annotation is shorter than the CA atoms, fill it up
        missing = resIds.shape[0] - value.shape[0]
        if missing > 0:
            value = np.append(value, ['Null'] * missing)
        sseMaskDict[key] = dict(zip(resIds, value))

    # Adjacency matrix
    cell_list = struc.CellList(array, cell_size=cfg.threshold)
    adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

    # Adjacency matrix -> COO format: 'np.nonzero' lists the entries
    # of the strict upper triangle (i < j) in row-major order
    edge_index = np.stack(np.nonzero(np.triu(adj_matrix, k=1)))

    # One feature vector per atom
    nodeFeatures = [
        list(array.coord[i]) +
        [atomsDict.get(array.atom_name[i], atomsDict['Null'])] +
        [elementsDict.get(array.element[i], elementsDict['Null'])] +
        [array.res_id[i]] +
        [residualesDict.get(array.res_name[i], residualesDict['Null'])] +
        [float(array.hetero[i])] + [array.occupancy[i]] +
        [array.b_factor[i]] + [
            ssesTypeDict.get(
                sseMaskDict[array.chain_id[i]].get(
                    array.res_id[i], 'Null'), ssesTypeDict['Null'])
        ]
        for i in range(array.shape[0])
    ]

    nodeFeaturesT = torch.tensor(nodeFeatures, dtype=torch.float)
    edge_indexT = torch.tensor(edge_index, dtype=torch.long)
    data = Data(x=nodeFeaturesT, edge_index=edge_indexT)

    if saveDir:
        torch.save(data, os.path.join(saveDir, fileName))

    return data
Exemplo n.º 15
0
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one CA-atom graph per chain.

    For every chain with at least 5 non-hetero "CA" atoms a graph is
    built whose nodes are these atoms. Two nodes are connected if
    the atoms are within `cfg.threshold` of each other; each pair is
    stored once, with the lower index first.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved into this directory. The chain
        ID is appended to the part of `fileName` before the first dot.

    Returns
    -------
    data : dict
        Maps chain ID to the `Data` graph of that chain. Node
        features are the centered and scaled coordinates followed by
        the `residualesDict` and `ssesTypeDict` entries.
    """
    array = strucio.load_structure(os.path.join(dirName, fileName), model=1)

    data = {}
    # For each unique chain
    for chain in np.unique(array.chain_id):
        # Take the current chain without hetero atoms
        oneChainArray = array[(array.chain_id == chain)
                              & np.logical_not(array.hetero)]

        # CA atoms only
        backbone = oneChainArray[oneChainArray.atom_name == 'CA']
        backboneShp = backbone.shape[0]
        # Skip the chain if it has no (or too few) CA atoms
        if backboneShp < 5:
            continue

        # Secondary structure via the DSSP algorithm
        sse = dssp.DsspApp.annotate_sse(oneChainArray)

        # If 'sse' is shorter than the backbone, fill it up
        missing = backboneShp - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['C'] * missing)

        # Secondary structure element for each residue
        sseMaskDict = dict(zip(backbone.res_id, sse))

        # Adjacency matrix, computed before the coordinates are scaled
        cellList = struc.CellList(backbone, cell_size=cfg.threshold)
        adjMatrix = cellList.create_adjacency_matrix(cfg.threshold)

        # Subtract the centroid to move the center to (0, 0, 0)
        # (feature normalization)
        backbone.coord -= backbone.coord.mean(axis=0)

        # Scale by the length of the longest position vector
        # (feature normalization)
        maxNorm = np.linalg.norm(backbone.coord, axis=1).max()
        if maxNorm != 0:
            backbone.coord /= maxNorm

        # Adjacency matrix -> COO format: 'np.nonzero' lists the
        # entries of the strict upper triangle (i < j) in row-major order
        edgeIndex = np.stack(np.nonzero(np.triu(adjMatrix, k=1)))

        # One feature vector per CA atom
        nodeFeatures = [
            list(backbone.coord[i]) + residualesDict.get(
                backbone.res_name[i], residualesDict['Null']) +
            ssesTypeDict.get(sseMaskDict.get(backbone.res_id[i], 'C'))
            for i in range(backboneShp)
        ]

        # Graph format
        data[chain] = Data(x=torch.tensor(nodeFeatures, dtype=torch.float),
                           edge_index=torch.tensor(edgeIndex,
                                                   dtype=torch.long))

    # Save each graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            fileNameSplit = fileName.split('.')
            # Append the chain ID to the file name
            fileNameSplit[0] += chain
            torch.save(graph, os.path.join(saveDir, '.'.join(fileNameSplit)))

    return data
Exemplo n.º 16
0
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one atom-level graph per chain.

    A graph is built for every chain that contains at least one "CA"
    atom. Its nodes are all atoms of the chain; two nodes are
    connected if the atoms are within `cfg.threshold` of each other.
    Each pair is stored once, with the lower index first.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved into this directory. The chain
        ID is appended to the part of `fileName` before the first dot.

    Returns
    -------
    data : dict
        Maps chain ID to the `Data` graph of that chain. Node
        features are, in this order: coordinates, residue ID,
        B-factor, hetero flag, occupancy and the entries of
        `atomsDict`, `residualesDict` and `ssesTypeDict`.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    # Unique chain IDs in order of first appearance
    chainIdUnique = list(dict.fromkeys(array.chain_id))

    # Secondary structure via the DSSP algorithm for each chain;
    # chains without CA atoms are left out
    caMask = array.atom_name == 'CA'
    sseChainDict = {
        chain: dssp.DsspApp.annotate_sse(array[array.chain_id == chain])
        for chain in chainIdUnique
        if array[(array.chain_id == chain) & caMask].shape[0] != 0
    }

    data = {}
    for chain, sse in sseChainDict.items():
        chainMask = array.chain_id == chain
        # Residue IDs of the CA atoms
        resMask = array.res_id[chainMask & caMask]

        # If 'sse' is shorter than the mask, fill it up
        missing = resMask.shape[0] - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['Null'] * missing)

        # Secondary structure element for each residue of this chain
        sseByResidue = dict(zip(resMask, sse))

        oneChainArray = array[chainMask]

        # Adjacency matrix
        cell_list = struc.CellList(oneChainArray, cell_size=cfg.threshold)
        adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

        # Adjacency matrix -> COO format: 'np.nonzero' lists the
        # entries of the strict upper triangle (i < j) in row-major order
        edge_index = np.stack(np.nonzero(np.triu(adj_matrix, k=1)))

        # One feature vector per atom; all atoms belong to 'chain',
        # so its residue mapping is used directly
        nodeFeatures = [
            list(oneChainArray.coord[i]) + [
                oneChainArray.res_id[i], oneChainArray.b_factor[i],
                float(oneChainArray.hetero[i]), oneChainArray.occupancy[i]
            ] +
            atomsDict.get(oneChainArray.atom_name[i], atomsDict['Null']) +
            residualesDict.get(oneChainArray.res_name[i],
                               residualesDict['Null']) +
            ssesTypeDict.get(
                sseByResidue.get(oneChainArray.res_id[i], 'Null'),
                ssesTypeDict['Null'])
            for i in range(oneChainArray.shape[0])
        ]

        # Graph format
        data[chain] = Data(x=torch.tensor(nodeFeatures, dtype=torch.float),
                           edge_index=torch.tensor(edge_index,
                                                   dtype=torch.long))

    # Save each graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            fileNameSplit = fileName.split('.')
            # Append the chain ID to the file name
            fileNameSplit[0] += chain
            torch.save(graph, os.path.join(saveDir, '.'.join(fileNameSplit)))

    return data
Exemplo n.º 17
0
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Plot the CA adjacency matrix of the structure with PDB ID 1AKI
file_name = rcsb.fetch("1aki", "mmtf", biotite.temp_dir())
array = strucio.load_structure(file_name)
# We only consider CA atoms
ca = array[array.atom_name == "CA"]
# 7 Angstrom adjacency threshold
threshold = 7
# Create cell list of the CA atom array
# for efficient measurement of adjacency
cell_list = struc.CellList(ca, cell_size=threshold)
adjacency_matrix = cell_list.create_adjacency_matrix(threshold)

figure = plt.figure()
ax = figure.add_subplot(111)
# Two colors: white for 'not adjacent', green for 'adjacent'
cmap = ListedColormap(["white", biotite.colors["dimgreen"]])
ax.pcolormesh(ca.res_id, ca.res_id, adjacency_matrix, cmap=cmap)
ax.set_aspect("equal")
# Both axes show residue numbers
ax.set_xlabel("Residue number")
ax.set_ylabel("Residue number")
ax.set_title("Adjacency matrix of the lysozyme crystal structure")
figure.tight_layout()
plt.show()
Exemplo n.º 18
0
# Script excerpt: 'pymol_obj', 'aptamer' and 'PNG_SIZE' are defined
# before this excerpt starts.
# Set the color of the object and the stick/sphere display settings
pymol_obj.color("black")
ammolite.cmd.set("stick_color", "red")
ammolite.cmd.set("stick_radius", 0.5)
ammolite.cmd.set("sphere_scale", 1.0)
ammolite.cmd.set("sphere_quality", 4)

# Adjust camera
pymol_obj.orient()
pymol_obj.zoom(buffer=10)
ammolite.cmd.rotate("z", 90)
ammolite.show(PNG_SIZE)

########################################################################

# Distance cutoff, used both as cell size and as adjacency threshold
CUTOFF = 13

# Find contacts within cutoff distance
adjacency_matrix = struc.CellList(aptamer, CUTOFF) \
                   .create_adjacency_matrix(CUTOFF)
# 'np.where' yields the row and column index of every True entry,
# so a pair that is set in both directions is visited as (i, j)
# and as (j, i)
for i, j in zip(*np.where(adjacency_matrix)):
    pymol_obj.distance("", i, j, show_label=False, gap=0)

ammolite.cmd.set("dash_color", "firebrick")

# Add black outlines
ammolite.cmd.bg_color("white")
ammolite.cmd.set("ray_trace_mode", 1)
ammolite.cmd.set("ray_trace_disco_factor", 0.5)

ammolite.show(PNG_SIZE)
# sphinx_gallery_thumbnail_number = 2
Exemplo n.º 19
0
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one atom-level graph per chain.

    A graph is built for every chain with at least 5 non-hetero "CA"
    atoms. Its nodes are all atoms of the chain; two nodes are
    connected if the atoms are within `cfg.threshold` of each other.
    Each pair is stored once, with the lower index first.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved into this directory. The chain
        ID is appended to the part of `fileName` before the first dot.

    Returns
    -------
    data : dict
        Maps chain ID to the `Data` graph of that chain. Node
        features are, in this order: centered and scaled coordinates,
        scaled B-factor, hetero flag, occupancy and the entries of
        `atomsDict`, `residualesDict` and `ssesTypeDict`.
    """
    array = strucio.load_structure(os.path.join(dirName, fileName),
                                   extra_fields=['b_factor', 'occupancy'],
                                   model=1)

    data = {}
    # For each unique chain
    for chain in np.unique(array.chain_id):
        # Take the current chain
        oneChainArray = array[array.chain_id == chain]

        # Exclude hetero atoms for the secondary structure annotation
        notHeatemChain = oneChainArray[np.logical_not(oneChainArray.hetero)]

        # Residue IDs of the (non-hetero) CA atoms
        resMask = notHeatemChain.res_id[notHeatemChain.atom_name == 'CA']

        # Skip the chain if it has no (or too few) CA atoms
        if resMask.shape[0] < 5:
            continue

        # Secondary structure via the DSSP algorithm
        sse = dssp.DsspApp.annotate_sse(notHeatemChain)

        # If 'sse' is shorter than the mask, fill it up
        missing = resMask.shape[0] - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['Null'] * missing)

        # Secondary structure element for each residue of this chain
        sseByResidue = dict(zip(resMask, sse))

        # Adjacency matrix, computed before the coordinates are scaled
        cellList = struc.CellList(oneChainArray, cell_size=cfg.threshold)
        adjMatrix = cellList.create_adjacency_matrix(cfg.threshold)

        # Subtract the centroid to move the center to (0, 0, 0)
        # (feature normalization)
        oneChainArray.coord -= oneChainArray.coord.mean(axis=0)

        # Scale by the length of the longest position vector
        # (feature normalization)
        maxNorm = max(np.linalg.norm(point) for point in oneChainArray.coord)
        if maxNorm != 0:
            oneChainArray.coord /= maxNorm

        # Scale by the maximum B-factor (feature normalization)
        maxBFactor = oneChainArray.b_factor.max()
        if maxBFactor != 0:
            oneChainArray.b_factor /= maxBFactor

        # Adjacency matrix -> COO format: 'np.nonzero' lists the
        # entries of the strict upper triangle (i < j) in row-major order
        edgeIndex = np.stack(np.nonzero(np.triu(adjMatrix, k=1)))

        # One feature vector per atom; all atoms belong to 'chain',
        # so its residue mapping is used directly
        nodeFeatures = [
            list(oneChainArray.coord[i]) +
            [oneChainArray.b_factor[i],
             float(oneChainArray.hetero[i]),
             oneChainArray.occupancy[i]] +
            atomsDict.get(oneChainArray.atom_name[i], atomsDict['Null']) +
            residualesDict.get(oneChainArray.res_name[i],
                               residualesDict['Null']) +
            ssesTypeDict.get(sseByResidue.get(oneChainArray.res_id[i], 'Null'),
                             ssesTypeDict['Null'])
            for i in range(oneChainArray.shape[0])
        ]

        # Graph format
        data[chain] = Data(x=torch.tensor(nodeFeatures, dtype=torch.float),
                           edge_index=torch.tensor(edgeIndex, dtype=torch.long))

    # Save each graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            fileNameSplit = fileName.split('.')
            # Append the chain ID to the file name
            fileNameSplit[0] += chain
            torch.save(graph, os.path.join(saveDir, '.'.join(fileNameSplit)))

    return data