def extract_feature(self):
    seed(self.seed)
    print_info_nn(" >>> Adding D1 surface atoms shape distribution for {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                print_info("{0}".format(protein.name))
                pdb_file_name = self._database.directory + pdb_directory + protein.name + '.pdb'
                surface, normals = get_surface_atoms(pdb_file_name)
                distributions = np.zeros((len(protein.residues), 2 * (self.number_of_bins + 1)))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    distributions[i, :] = self.get_distributions(residue.center, surface, normals)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
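# Note on the caching pattern shared by the extract_feature variants in this
# section: np.save(path, arr) writes to "path.npy", which is why the existence
# check and the reload spell out the ".npy" suffix while the save call does not.
# A minimal standalone sketch of the same pattern (compute_features is a
# hypothetical stand-in for the real per-protein computation):
import os
import numpy as np

def cached_features(path, compute_features):
    if not os.path.exists(path + ".npy"):
        np.save(path, compute_features())  # np.save appends ".npy" itself
    return np.load(path + ".npy")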
def calc_self_collision_matrix(self, combis, d=0.05, d2=0.0, num_rnd_tries=1000):
    # TODO computationally expensive because of too many collision checks
    print(u'calculating self collision matrix')
    seed(1337)
    always = set()

    # find meaningless self-collisions
    for link_a, link_b in combis:
        if self.joint_id_to_info[link_a].parent_index == link_b or \
                self.joint_id_to_info[link_b].parent_index == link_a:
            always.add((link_a, link_b))
    rest = combis.difference(always)
    always = always.union(self._check_all_collisions(rest, d, self.get_zero_joint_state()))
    rest = rest.difference(always)

    # find meaningful self-collisions
    sometimes = self._check_all_collisions(rest, d2, self.get_min_joint_state())
    rest = rest.difference(sometimes)
    sometimes2 = self._check_all_collisions(rest, d2, self.get_max_joint_state())
    rest = rest.difference(sometimes2)
    sometimes = sometimes.union(sometimes2)
    for i in range(num_rnd_tries):
        sometimes2 = self._check_all_collisions(rest, d2, self.get_rnd_joint_state())
        if len(sometimes2) > 0:
            rest = rest.difference(sometimes2)
            sometimes = sometimes.union(sometimes2)
    return sometimes
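# The function above whittles a candidate set down with set algebra: pairs that
# collide in the zero pose are "always" colliding (uninformative), and pairs
# that collide in min/max/random joint states are "sometimes" colliding (worth
# checking at runtime). A minimal sketch of the same pruning loop with stub
# inputs (check_pose, poses, and random_pose are hypothetical stand-ins):
def prune_pairs(pairs, check_pose, poses, num_random_tries, random_pose):
    sometimes = set()
    rest = set(pairs)
    for pose in poses:                 # e.g. min and max joint states
        hits = check_pose(rest, pose)  # returns the subset of pairs in collision
        rest -= hits
        sometimes |= hits
    for _ in range(num_random_tries):  # random sampling of the joint space
        hits = check_pose(rest, random_pose())
        rest -= hits
        sometimes |= hits
    return sometimes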
def extract_feature(self):
    seed(self.seed)
    counter = 0
    print_info_nn(" >>> Adding D2 category based shape distribution for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                atoms = protein.atoms
                neighbour_search = NeighborSearch(atoms)
                distributions = np.zeros((len(protein.residues), self.number_of_bins))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                    distributions[i, :] = self._compute_distribution(nearby_residues)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D2_CATEGORY_SHAPE_DISTRIBUTION, distributions[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def test_randint_117(self):
    # GH 14189
    random.seed(0)
    expected = np.array([2357136044, 2546248239, 3071714933, 3626093760,
                         2588848963, 3684848379, 2340255427, 3638918503,
                         1819583497, 2678185683], dtype='int64')
    actual = random.randint(2**32, size=10)
    assert_array_equal(actual, expected)
def test_shuffle_mixed_dimension(self):
    # Test for trac ticket #2074
    for t in [[1, 2, 3, None],
              [(1, 1), (2, 2), (3, 3), None],
              [1, (2, 2), (3, 3), None],
              [(1, 1), 2, 3, None]]:
        random.seed(12345)
        shuffled = list(t)
        random.shuffle(shuffled)
        assert_array_equal(shuffled, [t[0], t[3], t[1], t[2]])
def test_call_within_randomstate(self):
    # Check that custom RandomState does not call into global state
    m = random.RandomState()
    res = np.array([0, 8, 7, 2, 1, 9, 4, 7, 0, 3])
    for i in range(3):
        random.seed(i)
        m.seed(4321)
        # If m.state is not honored, the result will change
        assert_array_equal(m.choice(10, size=10, p=np.ones(10) / 10.), res)
def extract_feature(self):
    seed(self.seed)
    print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                atoms = protein.atoms
                neighbour_search = NeighborSearch(atoms)
                distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    nearby_residues = [protein.biopython_residues[i]]
                    temp_nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                    for nearby_residue in temp_nearby_residues:
                        if nearby_residue not in protein.biopython_residues:
                            continue
                        residues_index = protein.biopython_residues.index(nearby_residue)
                        # use a distinct name here: rebinding `residue` would make the
                        # `residue.center` below refer to the last neighbour, not residue i
                        candidate_residue = protein.residues[residues_index]
                        if candidate_residue.get_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA) >= self.rASA_threshold:
                            nearby_residues.append(nearby_residue)
                    distributions[i, :] = self._compute_distribution(nearby_residues, residue.center)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def test_choice_sum_of_probs_tolerance(self):
    # The sum of probs should be 1.0 with some tolerance.
    # For low precision dtypes the tolerance was too tight.
    # See numpy github issue 6123.
    random.seed(1234)
    a = [1, 2, 3]
    counts = [4, 4, 2]
    for dt in np.float16, np.float32, np.float64:
        probs = np.array(counts, dtype=dt) / sum(counts)
        c = random.choice(a, p=probs)
        assert_(c in a)
        assert_raises(ValueError, random.choice, a, p=probs * 0.9)
def test_shuffle_of_array_of_objects(self):
    # Test that permuting an array of objects will not cause
    # a segfault on garbage collection.
    # See gh-7719
    random.seed(1234)
    a = np.array([np.arange(1), np.arange(4)])

    for _ in range(1000):
        random.shuffle(a)

    # Force Garbage Collection - should not segfault.
    import gc
    gc.collect()
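# Aside: NumPy >= 1.24 rejects the ragged np.array([...]) construction above
# unless the object dtype is explicit, so a version-proof setup would be:
a = np.empty(2, dtype=object)
a[0], a[1] = np.arange(1), np.arange(4)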
def test_shuffle_of_array_of_different_length_strings(self):
    # Test that permuting an array of different length strings
    # will not cause a segfault on garbage collection
    # Tests gh-7710
    random.seed(1234)
    a = np.array(['a', 'a' * 1000])

    for _ in range(100):
        random.shuffle(a)

    # Force Garbage Collection - should not segfault.
    import gc
    gc.collect()
def test_logseries_convergence(self):
    # Test for ticket #923
    N = 1000
    random.seed(0)
    rvsn = random.logseries(0.8, size=N)
    # these two frequency counts should be close to theoretical
    # numbers with this large sample
    # theoretical large N result is 0.49706795
    freq = np.sum(rvsn == 1) / float(N)
    msg = "Frequency was %f, should be > 0.45" % freq
    assert_(freq > 0.45, msg)
    # theoretical large N result is 0.19882718
    freq = np.sum(rvsn == 2) / float(N)
    msg = "Frequency was %f, should be < 0.23" % freq
    assert_(freq < 0.23, msg)
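# Where those theoretical numbers come from: the log-series distribution has
# pmf P(K = k) = -p**k / (k * log(1 - p)); with p = 0.8 this reproduces the two
# constants quoted in the comments above:
import numpy as np
p = 0.8
pmf = lambda k: -p**k / (k * np.log(1 - p))
print(pmf(1))  # 0.49706795..., hence the > 0.45 bound
print(pmf(2))  # 0.19882718..., hence the < 0.23 bound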
def create_membership_matrix(self):
    """
    Create a random membership matrix.

    @return: random array of shape length of data to cluster times number
             of clusters
    @rtype: array('f')
    """
    ## default signature has changed oldnumeric->numpy
    if self.seedx == 0 or self.seedy == 0:
        R.seed()
    else:
        R.seed((self.seedx, self.seedy))

    r = R.random_sample((self.npoints, self.n_cluster))
    return N0.transpose(r / N0.sum(r))
def gibbs_sampling(self, n_topics, alpha, n_iterations):
    seed(0)

    # randomly assign topics to words
    self.word_topic_map = {w: randint(0, n_topics - 1) for w in self.vocab}
    n_dt = [{t: 0 for t in range(n_topics)} for _ in range(len(self.corpus))]
    n_tw = [{w: 0 for w in self.vocab} for _ in range(n_topics)]
    for d_index in range(len(self.corpus)):
        d = self.corpus[d_index]
        for w in d:
            t = self.word_topic_map[w]
            n_dt[d_index][t] += 1
            n_tw[t][w] += 1

    for i in range(n_iterations):
        print("Iteration %d/%d (%f%%)..." % (i, n_iterations, 100 * i / float(n_iterations)))
        for d_index in range(len(self.corpus)):
            print("Document %d/%d (%f%%)..." % (d_index, len(self.corpus), 100 * d_index / float(len(self.corpus))))
            d = self.corpus[d_index]
            for w in d:
                # i. remove current word from counts
                old_topic = self.word_topic_map[w]
                # TODO
                if n_dt[d_index][old_topic] == 0:
                    print("oops dt", d_index, old_topic)
                else:
                    n_dt[d_index][old_topic] -= 1
                n_tw[old_topic][w] -= 1

                # ii. estimate probabilities using 5.6, 5.7
                word_topic_probs = []
                for t in range(n_topics):
                    p_t_d = float(n_dt[d_index][t] + alpha) / (sum(n_dt[d_index].values()) + n_topics * alpha)
                    p_w_t = float(n_tw[t][w] + alpha) / (sum(n_tw[t].values()) + len(self.vocab) * alpha)
                    word_topic_probs.append(p_w_t * p_t_d)

                # iii. assign w to a topic randomly
                word_topic_probs = [float(p) / sum(word_topic_probs) for p in word_topic_probs]
                self.word_topic_map[w] = choice(range(n_topics), p=word_topic_probs)

                # iv. increment counts accordingly
                topic = self.word_topic_map[w]
                n_tw[topic][w] += 1
                n_dt[d_index][topic] += 1
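# The estimate in step ii above is the usual collapsed-Gibbs update for LDA
# (the "5.6, 5.7" numbering presumably refers to the text this snippet follows):
#
#   P(t | d) = (n_dt[d][t] + alpha) / (sum_t' n_dt[d][t'] + T * alpha)
#   P(w | t) = (n_tw[t][w] + alpha) / (sum_w' n_tw[t][w'] + V * alpha)
#
# with T topics and a vocabulary of size V; the new topic is drawn from the
# normalized product P(w | t) * P(t | d). Note the snippet smooths both sides
# with a single symmetric alpha, where the LDA literature usually reserves a
# separate beta/eta for the topic-word term.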
def extract_feature(self):
    seed(self.seed)
    counter = 0
    overall_time = datetime.now()
    print_info_nn(" >>> Adding D2 shape distribution for database {0} ... ".format(self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                atoms = protein.atoms
                neighbour_search = NeighborSearch(atoms)
                distributions = np.zeros((len(protein.residues), self.number_of_bins))
                # distributions = np.zeros((len(protein.residues), self.number_of_bins + 2))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                    distributions[i, :] = self._compute_distribution(nearby_residues)
                    # distributions[i:, -1] = len(nearby_residues)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D2_PLAIN_SHAPE_DISTRIBUTION, distributions[i, :])
                # protein.residues[i].add_feature(Features.NUMBER_OF_NEIGHBOURS, distributions[i, -1])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def test_permutation_subclass(self):
    class N(np.ndarray):
        pass

    random.seed(1)
    orig = np.arange(3).view(N)
    perm = random.permutation(orig)
    assert_array_equal(perm, np.array([0, 2, 1]))
    assert_array_equal(orig, np.arange(3).view(N))

    class M(object):
        a = np.arange(5)

        def __array__(self):
            return self.a

    random.seed(1)
    m = M()
    perm = random.permutation(m)
    assert_array_equal(perm, np.array([2, 1, 4, 0, 3]))
    assert_array_equal(m.__array__(), np.arange(5))
def seed(x=0, y=0):
    if x == 0 or y == 0:
        mt.seed()
    else:
        mt.seed((x, y))
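# A minimal usage sketch for the wrapper above, assuming `mt` is the legacy
# numpy.random.mtrand module: a zero in either slot falls back to OS entropy,
# while two nonzero values give a reproducible stream keyed on the pair.
seed(12, 34)
first = mt.random_sample(3)
seed(12, 34)
assert (mt.random_sample(3) == first).all()  # same (x, y) pair, same stream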
def main():
    print_info("Starting the experiment")
    start_time = datetime.now()
    seed = 1
    # number_of_samples = 5000
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=-1, seed=seed)
    mtrand.seed(seed)
    feature_sets = [
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D2_PLAIN_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D1_PLAIN_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     Features.D2_SURFACE_SHAPE_DISTRIBUTION
        # ],
        [
            Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
            Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
            Features.D1_SURFACE_SHAPE_DISTRIBUTION
        ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D2_CATEGORY_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     # Features.PROTRUSION_INDEX,
        #     # Features.B_VALUE,
        #     Features.HALF_SPHERE_EXPOSURE,
        #     Features.SECONDARY_STRUCTURE,
        #     Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #     Features.POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #     Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     # # Features.PHI,
        #     # # Features.PSI,
        #     # Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     Features.D2_SURFACE_SHAPE_DISTRIBUTION,
        #     # Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        #     # Features.D2_PLAIN_SHAPE_DISTRIBUTION,
        #     # Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        #     Features.RESIDUE_DEPTH
        # ]
    ]
    results = []
    for feature_set in feature_sets:
        print_special("Feature set {0}".format(feature_set))
        e = Experiment(feature_set, dbd4, Classifier.SVM)
        e.run(number_of_bins=20, radius=15, number_of_samples=-1, seed=seed,
              gamma=0.5, save=True, folds=5, rASA=.5)
        results.append(e.pyml_result)
    print_info("Took {0} seconds.".format((datetime.now() - start_time).seconds))
    save_results(number_of_samples, results, feature_sets)
import random

import numpy as np
from numpy.random.mtrand import seed
import matplotlib.pyplot as plt

mean = [2, 3]
sigma = np.array([[1, 1.5], [1.5, 30]])
seed(10)
data = np.random.multivariate_normal(mean, sigma, 100)
# print(np.corrcoef(np.power(data[:, 0], 2), np.power(data[:, 1], 2)))


def variation(sigma, mu=[2, 3], power=2):
    y_var = [random.randint(2, 100) for i in range(30)]
    # print(y_variation)
    y_var = sorted(y_var)
    # print(y_variation)
    multiple_corr = []
    for i in y_var:
        # note: slicing an ndarray ([:]) returns a view, so copy() is needed
        # to avoid mutating the caller's sigma in place
        new_sigma = sigma.copy()
        new_sigma[1][1] = i
        seed(1)
        new_data = np.random.multivariate_normal(mu, new_sigma, 100)
        multiple_corr.append(np.corrcoef(np.power(new_data[:, 0], power),
                                         np.power(new_data[:, 1], power))[1][0])
    return y_var, multiple_corr
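# A hypothetical usage sketch for variation(): sweep the y-variance and plot
# how the correlation of the powered marginals responds (labels illustrative):
y_var, multiple_corr = variation(sigma)
plt.plot(y_var, multiple_corr)
plt.xlabel("Var(Y)")
plt.ylabel("corr(X**power, Y**power)")
plt.show()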
def test_beta_small_parameters(self):
    # Test that beta with small a and b parameters does not produce
    # NaNs due to roundoff errors causing 0 / 0, gh-5851
    random.seed(1234567890)
    x = random.beta(0.0001, 0.0001, size=100)
    assert_(not np.any(np.isnan(x)), 'Nans in random.beta')
def autoencode(pipe: Pipe, layer_config: List[Dict], from_file: str, store_model: str,
               loss: str, optimiser: str, epochs: int, batch_size: int, shuffle: bool,
               validation_split: float, adjust_weights: float, mode: str):
    """Build and train an autoencoder."""
    import keras
    from keras import regularizers, Sequential, Input, Model
    from keras.callbacks import EarlyStopping, TensorBoard
    from keras.engine import InputLayer
    from keras.engine.saving import model_from_yaml, model_from_json
    from keras.layers import Dense
    from numpy.random.mtrand import seed
    from tensorflow import set_random_seed

    from lyner.keras_extras import SignalHandler

    seed(1)
    set_random_seed(2)

    matrix = pipe.matrix.copy()
    if matrix.isnull().values.any():
        LOGGER.warning("Dropping rows containing nan values")
        matrix.dropna(how='any', inplace=True)

    def parse_layout(layer_conf):
        get_layer_type = lambda t: getattr(keras.layers, t, None)
        regdict = {'l1_l2': regularizers.l1_l2, 'l1': regularizers.l1, 'l2': regularizers.l2}
        lc = layer_conf.copy()
        layer_type = lc.get('type', None)
        if layer_type:
            lc['type'] = get_layer_type(layer_type)
        # TODO parse regularizers
        kernel_reg_type = lc.get('kernel_regularizer', None)
        if kernel_reg_type:
            if '(' in kernel_reg_type and ')' in kernel_reg_type:
                params = kernel_reg_type[kernel_reg_type.index('(') + 1:kernel_reg_type.index(')')]
                if '+' in params:
                    params = params.split('+')
                else:
                    params = [params]
                params = [float(p) for p in params]
                kernel_reg_type = kernel_reg_type[:kernel_reg_type.index('(')]
            lc['kernel_regularizer'] = regdict[kernel_reg_type](*params)
        return lc.pop('type'), int(lc.pop('n')), lc

    layout = [parse_layout(layer_conf) for layer_conf in layer_config]
    labels = matrix.columns.values.tolist()
    data = matrix.values
    shape = (data.shape[0],)
    data = data.transpose()
    if layout:
        encoding_dim = layout[-1][1]
        encoder = Sequential(name="encoder")
        encoder.add(InputLayer(shape, name="encoder_input"))
        for layer_num, (Layer, n_nodes, extra_args) in enumerate(layout):
            encoder.add(Layer(n_nodes, name=f"encoder_{layer_num}_{n_nodes}", **extra_args))
            # kernel_regularizer=regularizers.l1_l2(0.001, 0.001),
            # kernel_regularizer=regularizers.l1(0.0001),

        decoder = Sequential(name="decoder")
        decoder.add(InputLayer((encoding_dim,), name="decoder_input"))
        for layer_num, (Layer, n_nodes, _) in enumerate(layout[::-1][1:]):
            decoder.add(Layer(n_nodes, name=f"decoder_{layer_num}_{n_nodes}"))
        decoder.add(Dense(shape[0], activation='linear', name="decoder_output"))

        input_layer = Input(shape=shape, name="autoencoder_input")
        encode_layer = encoder(input_layer)
        decode_layer = decoder(encode_layer)
        autoencoder = Model(input_layer, decode_layer)
        if store_model:
            if store_model.endswith('.yaml'):
                model_string = autoencoder.to_yaml()
            elif store_model.endswith('.json'):
                model_string = autoencoder.to_json()
            else:
                model_string = autoencoder.to_yaml()
            with open(store_model, 'wt') as writer:
                writer.write(model_string)
    elif from_file:
        with open(from_file, 'rt') as reader:
            model_string = '\n'.join(reader.readlines())
        if from_file.endswith('.yaml'):
            autoencoder = model_from_yaml(model_string)
        elif from_file.endswith('.json'):
            autoencoder = model_from_json(model_string)
        # TODO set encoder and decoder correctly
    else:
        raise ValueError("No model specified. Use either of --layer-config or --from-file.")

    # from pprint import pprint
    # pprint(autoencoder.get_config())
    autoencoder.compile(optimizer=optimiser, loss=loss, metrics=['mse'])
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0000001, patience=50)
    sh = SignalHandler()
    autoencoder.fit(np.vsplit(data, 1), np.vsplit(data, 1),
                    callbacks=[TensorBoard(log_dir='/tmp/autoencoder'), sh, early_stopping],
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=validation_split,
                    shuffle=shuffle)
    sh.uninit()

    class Autoencoder:
        def __init__(self, encoder=None, decoder=None):
            self._encoder = encoder
            self._decoder = decoder

        def inverse_transform(self, data):
            return self._decoder.predict(data).transpose()

        def transform(self, data):
            return self._encoder.predict(data).transpose()

    pipe.decomposition = Autoencoder(encoder, decoder)
    encoded_data = pipe.decomposition.transform(data)
    decoded_data = pipe.decomposition.inverse_transform(encoded_data.T)
    pre_error = ((data.T - decoded_data) ** 2).mean(axis=None)
    print(f"MSE: {pre_error}")
    pipe._index = pipe.matrix.index
    pipe._columns = pipe.matrix.columns
    if adjust_weights:
        quant = float(adjust_weights)
        for i, layer in enumerate(encoder.layers):
            W, b = layer.get_weights()
            low, median, high = np.quantile(W.flatten(), [quant, 0.5, 1 - quant])
            W_low = W * (W < low)
            W_high = W * (W > high)
            selected_weights = W_low + W_high
            # oplot([Histogram(x=W.flatten()), Histogram(x=W[W < low].flatten()), Histogram(x=W[W > high].flatten())])
            layer.set_weights([selected_weights, b])
            break
        encoded_data = pipe.decomposition.transform(data)
        decoded_data = pipe.decomposition.inverse_transform(encoded_data.T)
        post_error = ((data.T - decoded_data) ** 2).mean(axis=None)
        print(f"MSE: {post_error}")
    if 'weights' == mode:
        layer = 0
        layer_weights = encoder.layers[layer].get_weights()
        layer = encoder.layers[layer]
        if len(layer_weights) == 0:
            layer_weights = encoder.layers[0].get_weights()
        if len(layer_weights) >= 2:
            layer_weights = layer_weights[:-1]  # last one is bias
        new_data = layer_weights[0]
        index = [f'Weight_{i}' for i in range(new_data.shape[0])]
        num_nodes = new_data.shape[1]
        columns = [f"{layer.name}_{i}" for i in range(num_nodes)]
    elif 'nodes' == mode:
        new_data = encoder.predict(np.vsplit(data, 1)).transpose()
        columns = labels
        index = [f"{mode}_{i}" for i in range(encoding_dim)]
    elif 'discard' == mode:
        W, b = encoder.layers[0].get_weights()
        W = np.sum(np.abs(W), axis=1)
        W[W != 0] = 1
        print(f"Kept {np.sum(W)} weights")
        v: np.array = pipe.matrix.values
        new_data = (v.T * W).T
        columns = pipe.matrix.columns
        index = pipe.matrix.index
    else:
        raise ValueError(f"Unknown mode {mode}")
    pipe.matrix = pd.DataFrame(data=new_data, columns=columns, index=index)
    return
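# Footnote on the fit call above: np.vsplit(data, 1) splits the matrix into one
# piece, i.e. it is just [data], so input and reconstruction target are the
# same matrix -- the usual autoencoder setup. A quick standalone check:
import numpy as np
x = np.arange(6).reshape(3, 2)
assert (np.vsplit(x, 1)[0] == x).all()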
def test_permutation_longs(self):
    random.seed(1234)
    a = random.permutation(12)
    random.seed(1234)
    b = random.permutation(long(12))  # `long` is Python 2's integer type (aliased to int on Python 3 via numpy.compat)
    assert_array_equal(a, b)