def test_plot_leafs(self, small_tree):
    r"""Propagate random secondary-structure profiles and plot the root's.

    Each leaf of the tree in ``small_tree`` receives a property built from
    a random 42-character DSSP sequence. The properties are propagated up
    the tree and the root's 'ss' property is plotted with the 'leafs'
    option.
    """
    tree = small_tree['tree']
    # One random sequence per leaf, drawn lazily so that each draw happens
    # right before the corresponding property is instantiated
    sequences = (
        ''.join(random.sample(1000 * SSP.dssp_codes, 42))
        for _ in range(tree.nleafs)
    )
    ss_props = [SSP().from_dssp_sequence(sequence) for sequence in sequences]
    ps.propagator_size_weighted_sum(ss_props, tree)
    tree.root['ss'].plot('leafs')
def sans_fit(sans_benchmark):
    r"""Build a synthetic SANS "experiment" whose optimal fit is known.

    The SANS profiles in ``sans_benchmark`` are propagated up a copy of the
    benchmark tree. The profiles of the nodes at a fixed depth are then
    combined linearly (evaluated at the midpoints of the root's q-grid)
    and a flat background is added to produce the experimental profile.

    Parameters
    ----------
    sans_benchmark : :function:`~pytest.fixture`

    Returns
    -------
    dict
        A dictionary containing the following key, value pairs:
        tree: :class:`~idpflex.cnextend.Tree`
            A hierarchical tree with random distances among leafs, and
            endowed with a :class:`~idpflex.properties.SansProperty`.
        property_name: str
            Just the name of the property
        depth: int
            Tree depth resulting in the best fit to experiment_property
        coefficients: :py:`dict`
            weights of each node at Tree depth resulting in best fit.
            (key, val) pair is (node ID, weight).
        background : float
            Flat background added to the profile at depth for optimal fit
        experiment_property: :class:`~idpflex.properties.SansProperty`
            Experimental profile from a linear combination of the profiles
            at depth for optimal fit using `coefficients` and `background`.
    """
    # Work on a copy so that the benchmark tree stays free of properties
    tree = deepcopy(sans_benchmark['tree_with_no_property'])
    leaf_properties = sans_benchmark['property_list']
    name = leaf_properties[0].name  # property name
    idprop.propagator_size_weighted_sum(leaf_properties, tree)

    # The "experiment" is a linear combination of the cluster profiles
    # found at one particular depth of the tree
    depth = 4
    coeffs = (0.45, 0.00, 0.07, 0.25, 0.23)  # they must add to one
    nodes = tree.nodes_at_depth(depth)
    n_nodes = 1 + depth  # depth=0 corresponds to the root node (nclusters=1)

    # Evaluate everything at the midpoints of consecutive q-values
    root_q = tree.root[name].x
    q_values = (root_q[:-1] + root_q[1:]) / 2

    coefficients = dict()
    profile = np.zeros(len(q_values))
    for index in range(n_nodes):
        node, weight = nodes[index], coeffs[index]
        coefficients[node.id] = weight
        node_y = node[name].y
        profile += weight * (node_y[:-1] + node_y[1:]) / 2

    background = 0.05 * max(profile)  # flat background
    profile += background

    experiment_property = idprop.SansProperty(
        name=name,
        qvalues=q_values,
        profile=profile,
        errors=0.1 * profile,
    )
    return {
        'tree': tree,
        'property_name': name,
        'depth': depth,
        'coefficients': coefficients,
        'background': background,
        'experiment_property': experiment_property,
    }
def test_propagator_size_weighted_sum(self, sans_benchmark):
    r"""Check the size-weighted propagation of SANS profiles.

    The leaf profiles in ``sans_benchmark['property_list']`` are propagated
    up the tree. For one randomly chosen non-leaf node, the stored profile
    must equal ``w * left + (1 - w) * right``, where ``w`` is the fraction
    of the node's count that belongs to its left child.

    Parameters
    ----------
    sans_benchmark : dict
        Fixture providing the keys 'tree_with_no_property' and
        'property_list'.
    """
    # Local import so the block does not rely on a module-level import
    from copy import deepcopy

    # Propagate on a copy: attaching properties to the fixture's own tree
    # would leave 'tree_with_no_property' carrying properties for any
    # other consumer of the fixture
    tree = deepcopy(sans_benchmark['tree_with_no_property'])
    values = sans_benchmark['property_list']
    ps.propagator_size_weighted_sum(values, tree)

    # Test the propagation of the profiles for a node randomly picked
    node_id = np.random.randint(tree.nleafs, len(tree))  # exclude leafs
    node = tree[node_id]
    ln = node.left
    rn = node.right

    # Weight of the left child according to the sizes of the two children
    w = float(ln.count) / (ln.count + rn.count)
    lnp = ln['sans']  # profile of the "left" sibling node
    rnp = rn['sans']  # profile of the "right" sibling node
    y = w * lnp.y + (1 - w) * rnp.y
    assert np.array_equal(y, node['sans'].y)
def benchmark():
    r"""Load the benchmark linkage matrix and build a tree with properties.

    Returns
    -------
    dict
        A dictionary containing the following key, value pairs:
        z: linkage matrix loaded from the 'linkage_matrix' data file
        tree: tree built from ``z``, with the scalar property 'sc'
            propagated from the leafs up the tree
        nnodes: int
            Hard-coded number of nodes (equals ``2 * nleafs - 1``)
        nleafs: int
            Hard-coded number of leafs
        simple_property: list
            One ``SimpleProperty`` per leaf, instantiated with the leaf
            index
    """
    n_leafs = 22379
    linkage_path = os.path.join(data_dir, 'linkage_matrix')
    z = np.loadtxt(linkage_path)
    t = cnextend.Tree(z)

    # One normally distributed scalar per leaf, wrapped as a property and
    # then propagated up the tree
    samples = np.random.normal(loc=100.0, size=n_leafs)
    scalar_props = [
        idprop.ScalarProperty(name='sc', y=sample) for sample in samples
    ]
    idprop.propagator_size_weighted_sum(scalar_props, t)

    simple_props = [SimpleProperty(index) for index in range(n_leafs)]
    return {
        'z': z,
        'tree': t,
        'nnodes': 44757,
        'nleafs': n_leafs,
        'simple_property': simple_props,
    }
def test_propagator_size_weighted_sum(self, small_tree):
    r"""Check that the root profile is the average of the leaf profiles.

    Random secondary-structure sequences are created by sampling
    ``SSP.n_codes`` codes from ``SSP.dssp_codes`` and assigned to the
    leafs of the tree. The profiles are then propagated up the tree
    hierarchy and the profile of the root is compared with the mean of
    the leaf profiles.
    """
    tree = small_tree['tree']
    # Sequences are drawn lazily, one right before each property is built
    sequences = (
        ''.join(random.sample(SSP.dssp_codes, SSP.n_codes))
        for _ in range(tree.nleafs)
    )
    ss_props = [SSP().from_dssp_sequence(sequence) for sequence in sequences]
    ps.propagator_size_weighted_sum(ss_props, tree)

    # Manually calculate the average profile over all the leafs
    leaf_profiles = np.asarray([prop.y for prop in ss_props])
    expected_profile = np.mean(leaf_profiles, axis=0)
    np.testing.assert_array_almost_equal(expected_profile,
                                         tree.root['ss'].y,
                                         decimal=12)
def cluster_with_properties(a_universe, pcls, p_names=None,
                            selection='not name H*', segment_length=1000,
                            n_representatives=1000):
    r"""Cluster a set of representative structures by structural similarity
    (RMSD) and by a set of properties

    The simulated trajectory is divided into segments, and hierarchical
    clustering is performed on each segment to yield a limited number of
    representative structures (the centroids). Properties are calculated
    for each centroid, thus each centroid is described by a property
    vector. The dimensionality of the vector is related to the number of
    properties and the dimensionality of each property.
    The distances between any two centroids is calculated as the
    Euclidean distance between their respective vector properties.
    The distance matrix containing distances between all possible
    centroid pairs is employed as the similarity measure to generate
    the hierarchical tree of centroids.

    The properties calculated for the centroids are stored in the
    leaf nodes of the hierarchical tree. Properties are then propagated
    up to the tree's root node.

    Parameters
    ----------
    a_universe : :class:`~MDAnalysis.core.universe.Universe`
        Topology and trajectory.
    pcls : list
        Property classes, such as :class:`~idpflex.properties.Asphericity`
        or :class:`~idpflex.properties.SaSa`
    p_names : list
        Property names. If None, then default property names are used
    selection : str
        atoms for which to calculate RMSD. See the
        `selections page <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_
        for atom selection syntax.
    segment_length: int
        divide trajectory into segments of this length
    n_representatives : int
        Desired total number of representative structures. The final number
        may be close but not equal to the desired number.

    Returns
    -------
    :class:`~idpflex.cluster.ClusterTrove`
        Hierarchical clustering tree of the centroids

    Raises
    ------
    ValueError
        If fewer property names than property classes are passed.
    """  # noqa: E501
    # Create names if not passed
    if p_names is None:
        p_names = [pcl.default_name for pcl in pcls]
    elif len(p_names) < len(pcls):
        # zip() below would silently drop the unnamed classes, leaving
        # all-zero rows in the property matrix that turn into NaN after
        # z-score normalization. Fail early, before the expensive search
        # for centroids, with a message that names the actual problem.
        raise ValueError(
            'Received {} property names for {} property classes'.format(
                len(p_names), len(pcls)))

    rep_ifr = trajectory_centroids(a_universe, selection=selection,
                                   segment_length=segment_length,
                                   n_representatives=n_representatives)
    n_centroids = len(rep_ifr)  # can be different than n_representatives

    # Calculate properties for each centroid
    l_prop = list()
    for p_name, pcl in zip(p_names, pcls):
        l_prop.append([
            pcl(name=p_name).from_universe(a_universe, index=i_frame)
            for i_frame in tqdm(rep_ifr)
        ])

    # Calculate distances between pair of centroids. Each row of xyz holds
    # the values of one property over all the centroids.
    xyz = np.zeros((len(pcls), n_centroids))
    for i_prop, prop in enumerate(l_prop):
        xyz[i_prop] = [p.y for p in prop]
    # zero mean and unity variance for each property
    xyz = np.transpose(zscore(xyz, axis=1))
    # squareform() condenses the square matrix into the vector form that
    # is passed on to hierarchy.linkage()
    distance_matrix = squareform(scipy.spatial.distance_matrix(xyz, xyz))

    # Cluster the representative structures
    tree = Tree(z=hierarchy.linkage(distance_matrix, method='complete'))
    for i_leaf, leaf in enumerate(tree.leafs):
        leaf.add_property(ScalarProperty(name='iframe', y=rep_ifr[i_leaf]))

    # Propagate the properties up the tree
    for prop in l_prop:
        propagator_size_weighted_sum(prop, tree)

    return ClusterTrove(rep_ifr, distance_matrix, tree)