Пример #1
0
    def setUp(self):
        """Start a local Spark session and load the 1STP test structure.

        Stores the session in ``self.spark``, the downloaded data in
        ``self.pdb`` and the columnar view of the first structure in
        ``self.cs``.
        """
        builder = SparkSession.builder.master("local[*]")
        self.spark = builder.appName("columnarStructure").getOrCreate()

        # Fetch the single entry every test in this case works on
        self.pdb = mmtfReader.download_mmtf_files(['1STP'])

        # Wrap the first structure of the download in a ColumnarStructure
        first_structure = self.pdb.values().first()
        self.cs = ColumnarStructure(first_structure, True)
Пример #2
0
    def setUp(self):
        """Create a local SparkContext and load the 1STP test structure.

        Stores the context in ``self.sc``, the downloaded data in
        ``self.pdb`` and the columnar view of the first structure in
        ``self.cs``.
        """
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('columnarStructure')
        self.sc = SparkContext(conf=spark_conf)

        # Fetch the single entry used by the tests through this context
        self.pdb = mmtfReader.download_mmtf_files(['1STP'], self.sc)

        # Wrap the first structure of the download in a ColumnarStructure
        first_structure = self.pdb.values().first()
        self.cs = ColumnarStructure(first_structure, True)
Пример #3
0
    def test1(self):
        """Check q3..q6 of CoordinateGeometry for the site centered on 459.

        The center and six neighbor coordinates are looked up through
        ``self.get_coords`` and each order parameter is compared with its
        reference value using an absolute tolerance of 1e-4.
        """
        first_structure = self.pdb.values().first()
        columnar = ColumnarStructure(first_structure, True)

        center = self.get_coords(columnar, 459)  # ZN A.101.ZN

        # Indices of the neighbors, in the order they are handed over
        neighbor_indices = (
            28,   # CYS A.7.SG
            44,   # CYS A.10.SG
            223,  # HIS A.31.ND1
            245,  # CYS A.34.SG
            45,   # CYS A.10.N
            220,  # HIS A.31.O
        )
        neighbors = [self.get_coords(columnar, idx)
                     for idx in neighbor_indices]

        geom = CoordinateGeometry(center, neighbors)

        # (order parameter, reference value) pairs, evaluated q3 -> q6
        expectations = (
            (geom.q3, 0.9730115379131878),
            (geom.q4, 0.9691494056145086),
            (geom.q5, 0.5126001729084566),
            (geom.q6, 0.2723305441457363),
        )
        for order_parameter, reference in expectations:
            self.assertTrue(
                isclose(order_parameter(), reference, abs_tol=1e-4))
Пример #4
0
class ColumnarStructureTest(unittest.TestCase):
    """Tests for the ColumnarStructure accessors, run against entry 1STP.

    Every test indexes into one of the per-atom arrays returned by the
    ColumnarStructure built in ``setUp`` and compares a single element with
    its expected value. ``assertEqual`` is used instead of
    ``assertTrue(a == b)`` so that a failure reports both values.
    """

    def setUp(self):
        """Start a local Spark session and load the 1STP test structure."""
        self.spark = SparkSession.builder.master("local[*]") \
                                 .appName("columnarStructure") \
                                 .getOrCreate()
        self.pdb = mmtfReader.download_mmtf_files(['1STP'])

        structure = self.pdb.values().first()
        self.cs = ColumnarStructure(structure, True)

    def test_get_x_coords(self):
        self.assertEqual(self.cs.get_x_coords()[20], 26.260)

    def test_get_elements(self):
        self.assertEqual(self.cs.get_elements()[20], "C")

    def test_get_atom_names(self):
        self.assertEqual(self.cs.get_atom_names()[900], "CG2")

    def test_get_group_names(self):
        self.assertEqual(self.cs.get_group_names()[900], "VAL")

    def test_is_polymer(self):
        polymer = self.cs.is_polymer()
        self.assertTrue(polymer[100])
        self.assertFalse(polymer[901])
        self.assertFalse(polymer[917])

    def test_get_group_numbers(self):
        # NOTE(review): this only checks that the entry is truthy; the
        # expected group number is not stated anywhere in this test.
        self.assertTrue(self.cs.get_group_numbers()[877])

    def test_get_chain_ids(self):
        chain_ids = self.cs.get_chain_ids()
        self.assertEqual(chain_ids[100], 'A')
        self.assertEqual(chain_ids[901], 'B')
        self.assertEqual(chain_ids[917], 'C')

    def test_get_chem_comp_types(self):
        chem_comp_types = self.cs.get_chem_comp_types()
        self.assertEqual(chem_comp_types[100], 'PEPTIDE LINKING')
        self.assertEqual(chem_comp_types[901], 'NON-POLYMER')
        self.assertEqual(chem_comp_types[917], 'NON-POLYMER')

    def test_get_entity_types(self):
        entity_types = self.cs.get_entity_types()
        self.assertEqual(entity_types[100], 'PRO')
        self.assertEqual(entity_types[901], 'LGO')
        self.assertEqual(entity_types[917], 'WAT')

    def tearDown(self):
        """Stop the Spark session created in ``setUp``."""
        self.spark.stop()
    def get_interactions(self, structureId, structure):
        """Collect polymer chain-chain interaction rows for one structure.

        Polymer atoms that pass the interaction filter are added to one
        DistanceBox per chain. For every pair of chains, the atoms in the
        intersection of the two boxes are compared pairwise; atom pairs
        within the filter's distance cutoff that satisfy the target/query
        criteria are recorded for the chain that holds the target atom.

        Args:
            structureId: structure identifier; used as prefix of the first
                Row value (``structureId + '.' + chain``).
            structure: structure passed to ColumnarStructure; its
                ``entity_list`` provides the sequence stored in each row.

        Returns:
            list of Row. For each chain pair up to two rows are appended
            (one per chain with at least ``get_min_interactions()``
            interacting positions), holding: structure-chain id, partner
            chain, chain, sorted group numbers, sorted sequence indices and
            the entity sequence.
        """
        rows = []

        cutoffDistance = self.filter.get_distance_cutoff()
        cutoffDistanceSquared = cutoffDistance**2
        minInteractions = self.filter.get_min_interactions()
        arrays = ColumnarStructure(structure, True)

        chainNames = arrays.get_chain_names()
        groupNames = arrays.get_group_names()
        groupNumbers = arrays.get_group_numbers()
        atomNames = arrays.get_atom_names()
        entityIndices = arrays.get_entity_indices()
        elements = arrays.get_elements()
        polymer = arrays.is_polymer()

        sequenceMapIndices = arrays.get_sequence_positions()
        x = arrays.get_x_coords()
        y = arrays.get_y_coords()
        z = arrays.get_z_coords()

        def is_target(k):
            # atom k satisfies the target group, atom name and element filter
            return self.filter.is_target_group(groupNames[k]) \
                and self.filter.is_target_atom_name(atomNames[k]) \
                and self.filter.is_target_element(elements[k])

        def is_query(k):
            # atom k satisfies the query group, atom name and element filter
            return self.filter.is_query_group(groupNames[k]) \
                and self.filter.is_query_atom_name(atomNames[k]) \
                and self.filter.is_query_element(elements[k])

        # create a distance box per chain for quick lookup of interactions
        # between polymer atoms of the specified groups/atom names/elements
        boxes = {}
        for i in range(arrays.get_num_atoms()):
            if not polymer[i]:
                continue
            if not (self.filter.is_target_group(groupNames[i])
                    or self.filter.is_query_group(groupNames[i])):
                continue
            if not (self.filter.is_target_atom_name(atomNames[i])
                    or self.filter.is_query_atom_name(atomNames[i])):
                continue
            # the original called the nonexistent attribute
            # self.filter_is_query_element_name here
            if not (self.filter.is_target_element(elements[i])
                    or self.filter.is_query_element(elements[i])):
                continue
            if self.filter.is_prohibited_target_group(groupNames[i]):
                continue

            if chainNames[i] not in boxes:
                boxes[chainNames[i]] = DistanceBox(cutoffDistance)

            boxes[chainNames[i]].add_point(np.array([x[i], y[i], z[i]]), i)

        chainBoxes = list(boxes.items())

        # loop over all pairwise polymer chain interactions
        for i, (chainI, boxI) in enumerate(chainBoxes):
            for chainJ, boxJ in chainBoxes[i + 1:]:

                intersectionI = boxI.getIntersection(boxJ)
                intersectionJ = boxJ.getIntersection(boxI)

                # maps to store sequence indices mapped to group numbers
                indicesI = {}
                indicesJ = {}

                entityIndexI = -1
                entityIndexJ = -1

                # loop over pairs of atom interactions and check if
                # they satisfy the interaction filter criteria
                for n in intersectionI:
                    for m in intersectionJ:

                        dx = x[n] - x[m]
                        dy = y[n] - y[m]
                        dz = z[n] - z[m]
                        dSq = dx * dx + dy * dy + dz * dz

                        if dSq <= cutoffDistanceSquared:
                            # n is the target atom, m the query atom
                            if is_target(n) and is_query(m):
                                entityIndexI = entityIndices[n]
                                indicesI[
                                    sequenceMapIndices[n]] = groupNumbers[n]

                            # m is the target atom, n the query atom
                            if is_target(m) and is_query(n):
                                entityIndexJ = entityIndices[m]
                                indicesJ[
                                    sequenceMapIndices[m]] = groupNumbers[m]

                # Rows are emitted per chain pair, i.e. inside the inner
                # chain loop; the maps above are reset for every pair, so
                # emitting after the loop would keep only the last pair.
                if len(indicesI) >= minInteractions:
                    sequenceIndiciesI = sorted(int(k) for k in indicesI)
                    groupNumbersI = sorted(indicesI.values())

                    rows.append(Row(
                        structureId + '.' + chainI, chainJ, chainI,
                        groupNumbersI, sequenceIndiciesI,
                        structure.entity_list[entityIndexI]['sequence']))

                if len(indicesJ) >= minInteractions:
                    sequenceIndiciesJ = sorted(int(k) for k in indicesJ)
                    groupNumbersJ = sorted(indicesJ.values())

                    rows.append(Row(
                        structureId + '.' + chainJ, chainI, chainJ,
                        groupNumbersJ, sequenceIndiciesJ,
                        structure.entity_list[entityIndexJ]['sequence']))

        return rows
    def get_interactions(self, structureId, structure):
        """Collect interactions of non-polymer groups with polymer chains.

        Polymer atoms that pass the target filter are added to a single
        DistanceBox. Every non-polymer group that passes the query group
        filter is then checked atom by atom for polymer atoms within the
        filter's distance cutoff, and the hits are reported per polymer
        chain.

        Args:
            structureId: structure identifier; used as prefix of the first
                Row value (``structureId + "." + chain``).
            structure: structure passed to ColumnarStructure; its
                ``entity_list`` provides the sequence stored in each row.

        Returns:
            list of Row, one per (query group, interacting chain), holding:
            structure-chain id, query group name, query group number, query
            chain name, target chain name, sorted target group numbers (as
            int), sorted target sequence indices, the target sequence and
            the number of chains found in the group's neighborhood.
        """
        rows = []

        cutoffDistance = self.filter.get_distance_cutoff()
        cutoffDistanceSquared = cutoffDistance ** 2
        arrays = ColumnarStructure(structure, True)

        chainNames = arrays.get_chain_names()
        groupNames = arrays.get_group_names()
        groupNumbers = arrays.get_group_numbers()
        atomNames = arrays.get_atom_names()
        entityIndices = arrays.get_entity_indices()
        elements = arrays.get_elements()
        polymer = arrays.is_polymer()

        sequenceMapIndices = arrays.get_sequence_positions()
        x = arrays.get_x_coords()
        y = arrays.get_y_coords()
        z = arrays.get_z_coords()

        # create a distance box for quick lookup interactions of polymer atoms
        # of the specified elements
        box = DistanceBox(cutoffDistance)
        for i in range(arrays.get_num_atoms()):

            if polymer[i] \
                and self.filter.is_target_group(groupNames[i]) \
                and self.filter.is_target_atom_name(atomNames[i]) \
                and self.filter.is_target_element(elements[i]) \
                and not self.filter.is_prohibited_target_group(groupNames[i]):

                box.add_point(np.array([x[i], y[i], z[i]]), i)

        groupToAtomIndices = arrays.get_group_to_atom_indices()

        for g in range(arrays.get_num_groups()):

            # position of first and last atom +1 in group
            start = groupToAtomIndices[g]
            end = groupToAtomIndices[g + 1]

            # skip polymer groups
            if polymer[start]:
                continue

            # skip groups that do not match the query filter (some groups
            # may be excluded, e.g. water)
            if not self.filter.is_query_group(groupNames[start]):
                continue

            # create list of atoms that interact within the cutoff distance
            neighbors = []
            for a in range(start, end):

                if self.filter.is_query_atom_name(atomNames[a]) \
                    and self.filter.is_query_element(elements[a]):

                    p = np.array([x[a], y[a], z[a]])

                    # the box returns candidate neighbors; the exact
                    # distance check is done here
                    for j in box.get_neighbors(p):
                        dx = x[j] - x[a]
                        dy = y[j] - y[a]
                        dz = z[j] - z[a]
                        dSq = dx * dx + dy * dy + dz * dz

                        if dSq <= cutoffDistanceSquared:
                            neighbors.append(j)

            if not neighbors:
                continue

            # chain name -> list of [sequence position, group number,
            # entity index] of the interacting polymer atoms
            interactions2 = {}
            for neighbor in neighbors:

                chainInteractions = interactions2.setdefault(
                    chainNames[neighbor], [])

                # keep track of which group is interacting
                seqPos = sequenceMapIndices[neighbor]

                # non-polymer groups have a negative index and are excluded
                # here; position 0 is kept (the original ``> 0`` test
                # dropped it, contradicting this rule)
                if seqPos >= 0:
                    chainInteractions.append(
                        [seqPos, groupNumbers[neighbor],
                         entityIndices[neighbor]])

            for key, val in interactions2.items():

                sequenceIndices = set()
                residueNames = set()
                sequence = None

                for v in val:
                    sequenceIndices.add(int(v[0]))
                    residueNames.add(int(v[1]))
                    if sequence is None:
                        sequence = structure.entity_list[v[2]]['sequence']

                if sequenceIndices:
                    rows.append(Row(
                        structureId + "." + key, groupNames[start],
                        groupNumbers[start], chainNames[start],
                        key, sorted(residueNames),
                        sorted(sequenceIndices), sequence,
                        len(interactions2)))
        return rows
    def __call__(self, t):
        """Find interactions between the chains of one bio assembly.

        The structure is converted to a pandas dataframe, the query and
        target expressions are applied, and for every pair of bio assembly
        transformations the transformed query and target chain coordinates
        are handed to ``_calc_interactions``.

        Args:
            t: pair ``(structure_id, structure)``.

        Returns:
            list with the rows returned by ``_calc_interactions`` for all
            processed pairs. An empty list is returned when the requested
            bio assembly does not exist, when no dataframe is available, or
            when the query or target selection is empty.

        Raises:
            ValueError: if ``self.bio`` is smaller than 1.
        """
        structure_id = t[0]

        if self.bio < 1:
            # format the value into the message instead of passing it as a
            # second exception argument
            raise ValueError(
                'bio assembly number must be >= 1, was: {}'.format(self.bio))

        # if the specified bio assembly does not exist, return an empty list
        if len(t[1].bio_assembly) < self.bio:
            return []

        structure = ColumnarStructure(t[1])

        # Get a pandas dataframe representation of the structure
        df = structure.to_pandas()
        if df is None:
            return []

        # Apply query filter
        query_df = df if self.query is None else df.query(self.query)
        if query_df is None or query_df.shape[0] == 0:
            return []

        # Apply target filter
        if self.target is None:
            target_df = df
        elif self.target == self.query:
            # if query and target are identical, reuse the query dataframe
            target_df = query_df
        else:
            target_df = df.query(self.target)

        if target_df is None or target_df.shape[0] == 0:
            return []

        # Group by chain ids
        q_chains = query_df.groupby('chain_id')
        t_chains = target_df.groupby('chain_id')
        q_groups = q_chains.groups
        t_groups = t_chains.groups

        def transformed_coords(atoms, transform):
            # Stack the coordinates into an nx3 array and apply the bio
            # assembly transformation stored in transform[2].
            coords = np.column_stack(
                (atoms['x'].values, atoms['y'].values, atoms['z'].values))
            matrix = np.array(transform[2]).reshape((4, 4))
            # NOTE(review): the rotation is applied as coords @ matrix[:3, :3]
            # and the translation is read from the 4th row — confirm this
            # matches the layout produced by get_transforms.
            moved = np.matmul(coords, matrix[0:3, 0:3])
            moved += matrix[3, 0:3]
            return moved

        rows = []

        # Find interactions between pairs of chains in bio assembly
        transforms = self.get_transforms(structure)
        for q_transform in transforms:
            qindex = q_transform[0]  # transformation id
            qchain = q_transform[1]  # chain id

            if qchain not in q_groups:
                continue

            qt = q_chains.get_group(qchain).reset_index(drop=True)
            cqt = transformed_coords(qt, q_transform)

            for t_transform in transforms:
                tindex = t_transform[0]
                tchain = t_transform[1]

                # intra: same transformation and same chain id;
                # everything else counts as inter
                is_intra = qindex == tindex and qchain == tchain

                if is_intra and not self.intra:
                    continue

                # The original tested ``qindex != tindex and qchain !=
                # tchain``, which let same-transformation/different-chain
                # and different-transformation/same-chain pairs through
                # when inter was disabled.
                if not is_intra and not self.inter:
                    continue

                if tchain not in t_groups:
                    continue

                tt = t_chains.get_group(tchain).reset_index(drop=True)
                ctt = transformed_coords(tt, t_transform)

                rows += _calc_interactions(structure_id, qt, tt, cqt, ctt,
                                           self.level, self.distance_cutoff,
                                           self.bio, qindex, tindex)

        return rows
    def __call__(self, t):
        """Find interactions between pairs of chains of one structure.

        Args:
            t: pair ``(structure_id, structure)``.

        Returns:
            list with the rows returned by ``_calc_interactions`` for every
            processed (query chain, target chain) pair; an empty list when
            no dataframe is available or a selection is empty.
        """
        structure_id = t[0]

        # Pandas dataframe representation of the structure
        columnar = ColumnarStructure(t[1])
        df = columnar.to_pandas()
        if df is None:
            return []

        # Query selection
        query_atoms = df if self.query is None else df.query(self.query)
        if query_atoms is None or query_atoms.shape[0] == 0:
            return []

        # Target selection; an identical expression reuses the query result
        if self.target is None:
            target_atoms = df
        elif self.target == self.query:
            target_atoms = query_atoms
        else:
            target_atoms = df.query(self.target)

        if target_atoms is None or target_atoms.shape[0] == 0:
            return []

        # One group of atoms per chain id
        query_by_chain = query_atoms.groupby('chain_id')
        target_by_chain = target_atoms.groupby('chain_id')

        rows = []

        for q_chain in query_by_chain.groups.keys():
            qt = query_by_chain.get_group(q_chain).reset_index(drop=True)

            for t_chain in target_by_chain.groups.keys():

                # same chain id: only kept when intra is enabled
                if not self.intra and q_chain == t_chain:
                    continue

                # different chain ids: only kept when inter is enabled
                if not self.inter and q_chain != t_chain:
                    continue

                tt = target_by_chain.get_group(t_chain).reset_index(drop=True)

                # nx3 coordinate arrays, built fresh for every call
                query_xyz = np.column_stack(
                    (qt['x'].values, qt['y'].values, qt['z'].values)).copy()
                target_xyz = np.column_stack(
                    (tt['x'].values, tt['y'].values, tt['z'].values)).copy()

                rows.extend(
                    _calc_interactions(structure_id, qt, tt,
                                       query_xyz, target_xyz,
                                       self.level, self.distance_cutoff,
                                       None, -1, -1))

        return rows
Пример #9
0
    def __init__(self, structure, firstModelOnly=True):
        """Initialize the ColumnarStructure base and reset b-factor fields.

        Args:
            structure: forwarded unchanged to ``ColumnarStructure.__init__``.
            firstModelOnly: forwarded unchanged to
                ``ColumnarStructure.__init__``.
        """
        ColumnarStructure.__init__(self, structure, firstModelOnly)

        # Both attributes start out as None
        self.normalizedbFactors = self.clampedNormalizedbFactor = None
Пример #10
0
    def __call__(self, t):
        """Find polymer-polymer interactions in one structure.

        Polymer atoms matching the query filter and polymer atoms matching
        the target filter are placed in two KD-trees; all target/query atom
        pairs within the filter's distance cutoff are turned into rows at
        the configured level ('chain', 'group' or 'atom').

        Args:
            t: pair ``(structure_id, structure)``.

        Returns:
            set of unique Row objects. An empty list is returned on the
            early exits (single chain with inter-only search, or no atom
            matching the query/target filters).
        """
        structure_id = t[0]
        structure = t[1]

        # if there is only a single chain, there are no intermolecular
        # interactions; checked first so the columnar arrays are not built
        # for structures that are skipped anyway
        if structure.num_chains == 1 and self.inter and not self.intra:
            return []

        arrays = ColumnarStructure(structure, True)

        # Apply query filter
        group_names = arrays.get_group_names()
        qg = self.filter.is_query_group_np(group_names)
        if np.count_nonzero(qg) == 0:
            return []

        elements = arrays.get_elements()
        qe = self.filter.is_query_element_np(elements)
        if np.count_nonzero(qe) == 0:
            return []

        atom_names = arrays.get_atom_names()
        qa = self.filter.is_query_atom_name_np(atom_names)
        if np.count_nonzero(qa) == 0:
            return []

        # Create mask for polymer atoms
        polymer = arrays.is_polymer()

        # Apply query filter to polymer
        polyq = polymer & qg & qe & qa
        if np.count_nonzero(polyq) == 0:
            return []

        # Apply target filter to polymer atoms
        tg = self.filter.is_target_group_np(group_names)
        te = self.filter.is_target_element_np(elements)
        ta = self.filter.is_target_atom_name_np(atom_names)

        polyt = polymer & tg & te & ta
        if np.count_nonzero(polyt) == 0:
            return []

        chain_names = arrays.get_chain_names()
        group_numbers = arrays.get_group_numbers()
        entity_indices = arrays.get_entity_indices()
        sequence_positions = arrays.get_sequence_positions()

        # Stack coordinates into an nx3 array
        # TODO add this to ColumnarStructure
        c = np.stack((arrays.get_x_coords(), arrays.get_y_coords(),
                      arrays.get_z_coords()), axis=-1)

        # Apply mask for query atoms
        cpq = c[polyq]
        pgq = group_names[polyq]
        pnq = group_numbers[polyq]
        paq = atom_names[polyq]
        pcq = chain_names[polyq]

        # Apply mask for target atoms
        cpt = c[polyt]
        pgt = group_names[polyt]
        pnt = group_numbers[polyt]
        pat = atom_names[polyt]
        pct = chain_names[polyt]
        pet = entity_indices[polyt]
        pst = sequence_positions[polyt]

        # Calculate distances between the two atom sets; the keys of the
        # result are (index into target tree, index into query tree)
        tree_t = cKDTree(cpt)
        tree_q = cKDTree(cpq)
        distance_cutoff = self.filter.get_distance_cutoff()
        sparse_dm = tree_t.sparse_distance_matrix(
            tree_q, max_distance=distance_cutoff, output_type='dict')

        # Add interactions to rows.
        # There are redundant interactions when aggregating the results at
        # the 'group' level, since multiple atoms in a group may be involved
        # in interactions. Therefore we use a set of rows to store only
        # unique interactions.
        rows = set()
        for (i, j), dis in sparse_dm.items():
            # i: polymer target atom index, j: polymer query atom index

            # handle intra vs inter-chain interactions
            if pcq[j] == pct[i]:
                # cases with interactions in the same chain
                if not self.intra:
                    # exclude intrachain interactions
                    continue

                elif pnq[j] == pnt[i]:
                    # exclude interactions within the same chain and group
                    continue

            else:
                # case with interactions in different chains
                if not self.inter:
                    # exclude inter-chain interactions
                    continue

            # exclude self interactions (this can happen if the query and
            # target criteria overlap)
            if dis < 0.001:
                continue

            if self.level == 'chain':
                rows.add(Row(structure_id + "." + pct[i],  # structureChainId
                             pgq[j],  # queryGroupId
                             pcq[j],  # queryChainId
                             pnq[j],  # queryGroupNumber
                             pct[i]  # targetChainId
                             ))
            elif self.level == 'group':
                rows.add(Row(structure_id + "." + pct[i],  # structureChainId
                             pgq[j],  # queryGroupId
                             pcq[j],  # queryChainId
                             pnq[j],  # queryGroupNumber
                             pgt[i],  # targetGroupId
                             pct[i],  # targetChainId
                             pnt[i],  # targetGroupNumber
                             pst[i].item(),  # sequenceIndex
                             structure.entity_list[pet[i]]['sequence']  # sequence
                             ))
            elif self.level == 'atom':
                rows.add(Row(structure_id + "." + pct[i],  # structureChainId
                             pgq[j],  # queryGroupId
                             pcq[j],  # queryChainId
                             pnq[j],  # queryGroupNumber
                             paq[j],  # queryAtomName
                             pgt[i],  # targetGroupId
                             pct[i],  # targetChainId
                             pnt[i],  # targetGroupNumber
                             pat[i],  # targetAtomName
                             dis,  # distance
                             pst[i].item(),  # sequenceIndex
                             structure.entity_list[pet[i]]['sequence']  # sequence
                             ))

        return rows
Пример #11
0
    def __call__(self, t):
        """Find ligand-polymer interactions in one structure.

        Non-polymer atoms matching the query filter (ligand) and polymer
        atoms matching the target filter are placed in two KD-trees; all
        polymer/ligand atom pairs within the filter's distance cutoff are
        turned into rows at the configured level ('chain', 'group' or
        'atom').

        Args:
            t: pair ``(structure_id, structure)``.

        Returns:
            set of unique Row objects, or an empty list when no atom
            matches the query or target filters.
        """
        structure_id, structure = t[0], t[1]

        arrays = ColumnarStructure(structure, True)

        # Query (ligand) filter, bailing out as soon as one criterion
        # matches nothing
        group_names = arrays.get_group_names()
        qg = self.filter.is_query_group_np(group_names)
        if not np.count_nonzero(qg):
            return []

        elements = arrays.get_elements()
        qe = self.filter.is_query_element_np(elements)
        if not np.count_nonzero(qe):
            return []

        atom_names = arrays.get_atom_names()
        qa = self.filter.is_query_atom_name_np(atom_names)
        if not np.count_nonzero(qa):
            return []

        ### filter prohibited groups??

        # Mask of polymer atoms
        polymer = arrays.is_polymer()

        # Ligand atoms: non-polymer atoms that pass the query filter
        lig = ~polymer & qg & qe & qa
        if not np.count_nonzero(lig):
            return []

        # Target atoms: polymer atoms that pass the target filter
        tg = self.filter.is_target_group_np(group_names)
        te = self.filter.is_target_element_np(elements)
        ta = self.filter.is_target_atom_name_np(atom_names)

        poly = polymer & tg & te & ta
        if not np.count_nonzero(poly):
            return []

        chain_names = arrays.get_chain_names()
        group_numbers = arrays.get_group_numbers()
        entity_indices = arrays.get_entity_indices()
        sequence_positions = arrays.get_sequence_positions()

        # Coordinates as an nx3 array
        # TODO add this to ColumnarStructure
        xyz = np.stack(
            (arrays.get_x_coords(), arrays.get_y_coords(),
             arrays.get_z_coords()),
            axis=-1)

        # Ligand-side columns
        ligand_xyz = xyz[lig]
        lg = group_names[lig]
        ln = group_numbers[lig]
        la = atom_names[lig]
        lc = chain_names[lig]

        # Polymer-side columns
        polymer_xyz = xyz[poly]
        pg = group_names[poly]
        pn = group_numbers[poly]
        pa = atom_names[poly]
        pc = chain_names[poly]
        pt = entity_indices[poly]
        ps = sequence_positions[poly]

        # Distances between polymer and ligand atoms within the cutoff
        poly_tree = cKDTree(polymer_xyz)
        lig_tree = cKDTree(ligand_xyz)
        distance_cutoff = self.filter.get_distance_cutoff()
        sparse_dm = poly_tree.sparse_distance_matrix(
            lig_tree, max_distance=distance_cutoff, output_type='dict')

        # Several atoms of one group may take part in interactions, which
        # yields identical rows at the 'chain' and 'group' level; a set
        # keeps only the unique ones.
        rows = set([])
        for (i, j), dis in sparse_dm.items():
            # i indexes the polymer arrays (first tree),
            # j indexes the ligand arrays (second tree)
            if self.level == 'chain':
                rows.add(Row(
                    structure_id + "." + pc[i],  # structureChainId
                    lg[j],  # queryLigandId
                    lc[j],  # queryLigandChainId
                    ln[j],  # queryLigandNumber
                    pc[i],  # targetChainId
                ))
            elif self.level == 'group':
                rows.add(Row(
                    structure_id + "." + pc[i],  # structureChainId
                    lg[j],  # queryLigandId
                    lc[j],  # queryLigandChainId
                    ln[j],  # queryLigandNumber
                    pg[i],  # targetGroupId
                    pc[i],  # targetChainId
                    pn[i],  # targetGroupNumber
                    ps[i].item(),  # sequenceIndex
                    structure.entity_list[pt[i]]['sequence'],  # sequence
                ))
            elif self.level == 'atom':
                rows.add(Row(
                    structure_id + "." + pc[i],  # structureChainId
                    lg[j],  # queryLigandId
                    lc[j],  # queryLigandChainId
                    ln[j],  # queryLigandNumber
                    la[j],  # queryAtomName
                    pg[i],  # targetGroupId
                    pc[i],  # targetChainId
                    pn[i],  # targetGroupNumber
                    pa[i],  # targetAtomName
                    dis,  # distance
                    ps[i].item(),  # sequenceIndex
                    structure.entity_list[pt[i]]['sequence'],  # sequence
                ))

        return rows