def find_temporal_adjacency_matrix(min_abundance, phylo_column, full_svd, n_weeks=11):
    """
    Find the adjacency matrix among clusters of bacteria from week to week,
    assuming the interaction between clusters is changing.

    For every snapshot returned by prepare_DMD_matrices() and every pair of
    consecutive weeks (t, t+1), the single column of week t (X) and the single
    column of week t+1 (Y) are normalized with normalize(..., axis=0) and the
    mapping A = Y . pinv(X) is stored.

    The pseudo-inverse is built from the leading factors of the SVD of X, with
    singular values below 1e-15 * max(s) treated as zero (the same default
    cut-off np.linalg.pinv uses). Compared with inverting the individual SVD
    factors this
      * does not raise when X has a zero singular value (e.g. an all-zero
        column), where inverting diag(s) would fail;
      * gives the same result for the full and the reduced SVD, so full_svd
        only selects which decomposition numpy computes;
      * returns real matrices for real input on both paths.

    :param min_abundance: ignore the bacteria if their abundance is always
        below the min_abundance. None is replaced by 0.
    :param phylo_column: the data is clustered based on the phylo_column.
        None is replaced by 'family'.
    :param full_svd: the method of singular value decomposition. Any truthy
        value requests the full SVD, any falsy value the reduced SVD. None is
        replaced by False.
    :param n_weeks: number of leading weekly columns to use; n_weeks - 1
        transitions are computed per snapshot. Defaults to the 11 sampling
        weeks.
    :return: tuple (linear_mappings, nodes_list). Both are dictionaries keyed
        by the snapshot key extended with ('Week <t>',), t starting at 1.
        linear_mappings holds the matrix A of that transition, nodes_list the
        list of row labels (df.index) of the snapshot.
    :raises ValueError: if n_weeks is smaller than 2.
    """
    # Default values
    if min_abundance is None:
        min_abundance = 0
    if phylo_column is None:
        phylo_column = 'family'
    if full_svd is None:
        full_svd = False
    if n_weeks < 2:
        raise ValueError('n_weeks must be at least 2 to form a transition')

    # snapshots of samples over the sampling weeks
    # TODO: rename prepare_DMD_matrices; capitals in names are conventionally
    # kept for classes.
    snapshots = prepare_DMD_matrices(min_abundance, phylo_column, oxygen='all', debug=False)

    linear_mappings = {}
    nodes_list = {}
    for descriptive_tuple, df in snapshots.items():
        data = df.values
        for week in range(n_weeks - 1):
            # Preprocess the abundance data (one column per side)
            X = normalize(data[:, week:week + 1], axis=0)
            Y = normalize(data[:, week + 1:week + 2], axis=0)

            U, s, Vh = np.linalg.svd(X, full_matrices=bool(full_svd))

            # Reciprocal of the significant singular values only; the rest
            # stay zero so rank-deficient X does not blow up or raise.
            rank = len(s)
            cutoff = 1e-15 * s.max() if rank else 0.0
            inv_s = np.zeros_like(s)
            significant = s > cutoff
            inv_s[significant] = 1.0 / s[significant]

            # pinv(X) = V . diag(1/s) . U^H using the first `rank` singular
            # vectors, which are the same for the full and the reduced SVD.
            pseu_inv_x = np.dot(Vh[:rank].conj().T * inv_s, U[:, :rank].conj().T)

            # Adjacency matrix between clusters
            A = np.dot(Y, pseu_inv_x)

            key = descriptive_tuple + ('Week ' + str(week + 1),)
            linear_mappings[key] = A
            nodes_list[key] = list(df.index)

    return linear_mappings, nodes_list
def find_fixed_adjacency_matrix(min_abundance=0.0, phylo_column='order', full_svd=True, n_weeks=11):
    """
    Find the adjacency matrix among clusters of bacteria over the sampling
    weeks, assuming the interaction between clusters is fixed.

    prepare_DMD_matrices() supplies a dictionary that maps descriptive tuples
    to dataframes whose rows are taxa ("bacteria") and whose columns are
    weeks. According to the original author's notes there are 8 keys such as
    ("High", 2): High/Low oxygen times 4 replicates (the second element is
    presumably the replicate number - confirm against prepare_DMD_matrices).

    Unlike find_temporal_adjacency_matrix(), only one predictive matrix is
    computed per key; it represents all n_weeks - 1 transitions between
    sampling points at once: with X the first n_weeks - 1 columns and Y the
    columns 1 .. n_weeks - 1 (both normalized with normalize(..., axis=0)),
    A = Y . pinv(X).

    The pseudo-inverse is built from the leading factors of the SVD of X, with
    singular values below 1e-15 * max(s) treated as zero (the same default
    cut-off np.linalg.pinv uses). Compared with inverting the individual SVD
    factors this
      * works when there are fewer taxa than time columns, where the SVD
        factors are not square/compatible and could not be inverted;
      * does not raise when X is rank deficient;
      * gives the same result for the full and the reduced SVD, so full_svd
        only selects which decomposition numpy computes.

    :param min_abundance: minimum abundance to look for in original data.
        None is replaced by 0.
    :param phylo_column: most detailed phylogenetic column to consider.
        None is replaced by 'order'.
    :param full_svd: any truthy value runs the full SVD, any falsy value a
        faster reduced SVD. NOTE: an explicit None is replaced by False even
        though the default of the parameter is True.
    :param n_weeks: number of leading weekly columns to use. Defaults to the
        11 sampling weeks (10 transitions).
    :return: tuple (linear_mappings, nodes_list). Both are dictionaries with
        the same keys as the snapshots. linear_mappings holds the interaction
        matrix A, nodes_list the list of row labels (df.index).
    :raises ValueError: if n_weeks is smaller than 2.
    """
    # Default values
    if min_abundance is None:
        min_abundance = 0
    if phylo_column is None:
        phylo_column = 'order'
    if full_svd is None:
        full_svd = False
    if n_weeks < 2:
        raise ValueError('n_weeks must be at least 2 to form a transition')

    # snapshots of samples over the sampling weeks
    snapshots = prepare_DMD_matrices(min_abundance, phylo_column, oxygen='all', debug=False)

    linear_mappings = {}
    nodes_list = {}
    for descriptive_tuple, df in snapshots.items():
        data = df.values

        # Preprocess the abundance data: X holds every week but the last,
        # Y the same weeks shifted forward by one.
        X = normalize(data[:, 0:n_weeks - 1], axis=0)
        Y = normalize(data[:, 1:n_weeks], axis=0)

        U, s, Vh = np.linalg.svd(X, full_matrices=bool(full_svd))

        # Reciprocal of the significant singular values only; the rest stay
        # zero so rank-deficient X does not blow up or raise.
        rank = len(s)
        cutoff = 1e-15 * s.max() if rank else 0.0
        inv_s = np.zeros_like(s)
        significant = s > cutoff
        inv_s[significant] = 1.0 / s[significant]

        # pinv(X) = V . diag(1/s) . U^H using the first `rank` singular
        # vectors, which are the same for the full and the reduced SVD.
        pseu_inv_x = np.dot(Vh[:rank].conj().T * inv_s, U[:, :rank].conj().T)

        # Adjacency matrix between clusters
        A = np.dot(Y, pseu_inv_x)

        linear_mappings[descriptive_tuple] = A
        nodes_list[descriptive_tuple] = list(df.index)

    return linear_mappings, nodes_list