def extract_feature(self):
        seed(self.seed)
        print_info_nn(
            " >>> Adding D1 surface atoms shape distribution for {0} ... ".
            format(self._database.name))
        overall_time = datetime.now()
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [
                protein_complex.unbound_formation.ligand,
                protein_complex.unbound_formation.receptor
            ]
            for protein in proteins:
                shape_dist_file = self._get_dir_name() + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    print_info("{0}".format(protein.name))
                    pdb_file_name = self._database.directory + pdb_directory + protein.name + '.pdb'
                    surface, normals = get_surface_atoms(pdb_file_name)
                    distributions = np.zeros(
                        (len(protein.residues), 2 * (self.number_of_bins + 1)))

                    for i in range(len(protein.residues)):
                        residue = protein.residues[i]
                        distributions[i, :] = self.get_distributions(
                            residue.center, surface, normals)
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i in range(len(protein.residues)):
                    protein.residues[i].add_feature(
                        Features.D1_SURFACE_SHAPE_DISTRIBUTION,
                        distributions[i, :])
        print_info("took {0} seconds.".format(
            (datetime.now() - overall_time).seconds))
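
get_surface_atoms and get_distributions are not shown in this excerpt. As a hedged sketch only: the 2 * (self.number_of_bins + 1) row width suggests that get_distributions concatenates two normalized histograms per residue, for example distances from the residue center to the surface atoms plus the orientation of the matching surface normals. The helper below is hypothetical (including the max_distance cutoff) and only illustrates that shape.

import numpy as np

def d1_surface_distribution_sketch(center, surface, normals,
                                   number_of_bins=20, max_distance=30.0):
    # hypothetical stand-in for get_distributions: `surface` is an (N, 3) array
    # of surface-atom coordinates, `normals` the matching unit normals
    center = np.asarray(center, dtype=float)
    diffs = np.asarray(surface, dtype=float) - center
    distances = np.linalg.norm(diffs, axis=1)
    dist_hist, _ = np.histogram(distances, bins=number_of_bins + 1,
                                range=(0.0, max_distance))
    # cosine between the center-to-atom direction and the surface normal
    directions = diffs / np.maximum(distances[:, None], 1e-12)
    cosines = np.einsum('ij,ij->i', directions, np.asarray(normals, dtype=float))
    cos_hist, _ = np.histogram(cosines, bins=number_of_bins + 1, range=(-1.0, 1.0))
    dist_hist = dist_hist / max(dist_hist.sum(), 1)
    cos_hist = cos_hist / max(cos_hist.sum(), 1)
    return np.concatenate([dist_hist, cos_hist])  # length 2 * (number_of_bins + 1)
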
Example No. 2
    def calc_self_collision_matrix(self,
                                   combis,
                                   d=0.05,
                                   d2=0.0,
                                   num_rnd_tries=1000):
        # TODO computationally expensive because of too many collision checks
        print(u'calculating self collision matrix')
        seed(1337)
        always = set()

        # find meaningless self-collisions
        for link_a, link_b in combis:
            if self.joint_id_to_info[link_a].parent_index == link_b or \
                    self.joint_id_to_info[link_b].parent_index == link_a:
                always.add((link_a, link_b))
        rest = combis.difference(always)
        always = always.union(
            self._check_all_collisions(rest, d, self.get_zero_joint_state()))
        rest = rest.difference(always)

        # find meaningful self-collisions
        sometimes = self._check_all_collisions(rest, d2,
                                               self.get_min_joint_state())
        rest = rest.difference(sometimes)
        sometimes2 = self._check_all_collisions(rest, d2,
                                                self.get_max_joint_state())
        rest = rest.difference(sometimes2)
        sometimes = sometimes.union(sometimes2)
        for i in range(num_rnd_tries):
            sometimes2 = self._check_all_collisions(rest, d2,
                                                    self.get_rnd_joint_state())
            if len(sometimes2) > 0:
                rest = rest.difference(sometimes2)
                sometimes = sometimes.union(sometimes2)
        return sometimes
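
_check_all_collisions and the joint-state getters are not part of this excerpt. For context, here is a self-contained toy version of the partition performed above, with a hypothetical distance-based check standing in for the real collision query: pairs already colliding in the zero pose (with the larger margin d) are recorded as "always" and dropped, pairs that collide in any sampled pose (with margin d2) become "sometimes", and everything left over is treated as never colliding.

from itertools import combinations
import numpy as np

def check_all_collisions(pairs, margin, positions):
    # hypothetical stand-in: a pair "collides" if its links are closer than margin
    return {(a, b) for a, b in pairs
            if np.linalg.norm(positions[a] - positions[b]) < margin}

links = list(range(5))
combis = set(combinations(links, 2))
zero_state = {l: np.array([float(l), 0.0, 0.0]) for l in links}
always = check_all_collisions(combis, 1.5, zero_state)   # meaningless contacts
rest = combis.difference(always)
sometimes = set()
for _ in range(1000):                                     # random joint states
    rnd_state = {l: np.random.uniform(-2.0, 2.0, size=3) for l in links}
    sometimes |= check_all_collisions(rest.difference(sometimes), 0.5, rnd_state)
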
 def extract_feature(self):
     seed(self.seed)
     counter = 0
     print_info_nn(" >>> Adding D2 category based shape distribution for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             shape_dist_file = self._get_dir_name() + protein.name
             if not os.path.exists(shape_dist_file + ".npy"):
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 atoms = protein.atoms
                 neighbour_search = NeighborSearch(atoms)
                 distributions = np.zeros((len(protein.residues), self.number_of_bins))
                 for i in range(len(protein.residues)):
                     residue = protein.residues[i]
                     nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                     distributions[i, :] = self._compute_distribution(nearby_residues)
                 np.save(shape_dist_file, distributions)
             distributions = np.load(shape_dist_file + ".npy")
             for i in range(len(protein.residues)):
                 protein.residues[i].add_feature(Features.D2_CATEGORY_SHAPE_DISTRIBUTION, distributions[i, :])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def test_randint_117(self):
     # GH 14189
     random.seed(0)
     expected = np.array([2357136044, 2546248239, 3071714933, 3626093760,
                          2588848963, 3684848379, 2340255427, 3638918503,
                          1819583497, 2678185683], dtype='int64')
     actual = random.randint(2**32, size=10)
     assert_array_equal(actual, expected)
 def test_shuffle_mixed_dimension(self):
     # Test for trac ticket #2074
     for t in [[1, 2, 3, None], [(1, 1), (2, 2), (3, 3), None],
               [1, (2, 2), (3, 3), None], [(1, 1), 2, 3, None]]:
         random.seed(12345)
         shuffled = list(t)
         random.shuffle(shuffled)
         assert_array_equal(shuffled, [t[0], t[3], t[1], t[2]])
 def test_call_within_randomstate(self):
     # Check that custom RandomState does not call into global state
     m = random.RandomState()
     res = np.array([0, 8, 7, 2, 1, 9, 4, 7, 0, 3])
     for i in range(3):
         random.seed(i)
         m.seed(4321)
         # If m.state is not honored, the result will change
         assert_array_equal(m.choice(10, size=10, p=np.ones(10) / 10.), res)
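
As a side note (verified against current numpy, not part of the excerpted test suite), the same isolation holds for the newer Generator API: an explicitly constructed Generator keeps its own state no matter how the legacy global stream is reseeded.

import numpy as np

np.random.seed(0)                      # legacy global stream
rng = np.random.default_rng(4321)      # independent, explicitly seeded Generator
expected = rng.integers(10, size=10)
rng = np.random.default_rng(4321)
np.random.seed(99)                     # reseeding the global stream...
assert np.array_equal(rng.integers(10, size=10), expected)  # ...leaves rng untouched
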
    def extract_feature(self):
        seed(self.seed)
        print_info_nn(
            " >>> Adding D1 surface shape distribution for database {0} ... ".
            format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [
                protein_complex.unbound_formation.ligand,
                protein_complex.unbound_formation.receptor
            ]
            for protein in proteins:
                shape_dist_file = self._get_dir_name() + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    atoms = protein.atoms
                    neighbour_search = NeighborSearch(atoms)
                    distributions = np.zeros(
                        (len(protein.residues), self.number_of_bins + 1))
                    for i in range(len(protein.residues)):
                        residue = protein.residues[i]
                        nearby_residues = [protein.biopython_residues[i]]
                        temp_nearby_residues = neighbour_search.search(
                            residue.center, self.radius, "R")
                        for nearby_residue in temp_nearby_residues:
                            if nearby_residue not in protein.biopython_residues:
                                continue
                            residues_index = protein.biopython_residues.index(
                                nearby_residue)
                            # separate name so `residue` keeps referring to residue i,
                            # whose center is used for the distribution below
                            candidate = protein.residues[residues_index]

                            if candidate.get_feature(
                                    Features.RELATIVE_ACCESSIBLE_SURFACE_AREA
                            ) >= self.rASA_threshold:
                                nearby_residues.append(nearby_residue)
                        distributions[i, :] = self._compute_distribution(
                            nearby_residues, residue.center)
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i in range(len(protein.residues)):
                    protein.residues[i].add_feature(
                        Features.D1_SURFACE_SHAPE_DISTRIBUTION,
                        distributions[i, :])
        print_info("took {0} seconds.".format(
            (datetime.now() - overall_time).seconds))
 def test_choice_sum_of_probs_tolerance(self):
     # The sum of probs should be 1.0 with some tolerance.
     # For low precision dtypes the tolerance was too tight.
     # See numpy github issue 6123.
     random.seed(1234)
     a = [1, 2, 3]
     counts = [4, 4, 2]
     for dt in np.float16, np.float32, np.float64:
         probs = np.array(counts, dtype=dt) / sum(counts)
         c = random.choice(a, p=probs)
         assert_(c in a)
         assert_raises(ValueError, random.choice, a, p=probs * 0.9)
    def test_shuffle_of_array_of_objects(self):
        # Test that permuting an array of objects will not cause
        # a segfault on garbage collection.
        # See gh-7719
        random.seed(1234)
        a = np.array([np.arange(1), np.arange(4)])

        for _ in range(1000):
            random.shuffle(a)

        # Force Garbage Collection - should not segfault.
        import gc
        gc.collect()
    def test_shuffle_of_array_of_different_length_strings(self):
        # Test that permuting an array of different length strings
        # will not cause a segfault on garbage collection
        # Tests gh-7710
        random.seed(1234)

        a = np.array(['a', 'a' * 1000])

        for _ in range(100):
            random.shuffle(a)

        # Force Garbage Collection - should not segfault.
        import gc
        gc.collect()
 def test_logseries_convergence(self):
     # Test for ticket #923
     N = 1000
     random.seed(0)
     rvsn = random.logseries(0.8, size=N)
     # these two frequency counts should be close to theoretical
     # numbers with this large sample
     # theoretical large N result is 0.49706795
     freq = np.sum(rvsn == 1) / float(N)
     msg = "Frequency was %f, should be > 0.45" % freq
     assert_(freq > 0.45, msg)
     # theoretical large N result is 0.19882718
     freq = np.sum(rvsn == 2) / float(N)
     msg = "Frequency was %f, should be < 0.23" % freq
     assert_(freq < 0.23, msg)
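
The "theoretical large N" values quoted in the comments follow directly from the log-series pmf, P(k) = -p**k / (k * ln(1 - p)) with p = 0.8:

import numpy as np

p = 0.8

def logseries_pmf(k):
    return -p**k / (k * np.log(1 - p))

print(logseries_pmf(1))   # ~0.49706795
print(logseries_pmf(2))   # ~0.19882718
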
Example No. 12
    def create_membership_matrix(self):
        """
        Create a random membership matrix.

        @return: random array of shape length of data to
                 cluster times number of clusters
        @rtype: array('f')
        """
        ## default signature has changed oldnumeric->numpy
        if (self.seedx == 0 or self.seedy == 0):
            R.seed()
        else:
            R.seed((self.seedx, self.seedy))

        r = R.random_sample((self.npoints, self.n_cluster))
        return N0.transpose(r / N0.sum(r))
Example No. 14
    def gibbs_sampling(self, n_topics, alpha, n_iterations):
        seed(0)

        # randomly assign topics to words
        self.word_topic_map = {w: randint(0, n_topics) for w in self.vocab}  # numpy's randint excludes the upper bound
        n_dt = [{t: 0 for t in range(n_topics)} for _ in range(len(self.corpus))]
        n_tw = [{w: 0 for w in self.vocab} for _ in range(n_topics)]

        for d_index in range(len(self.corpus)):
            d = self.corpus[d_index]
            for w in d:
                t = self.word_topic_map[w]
                n_dt[d_index][t] += 1
                n_tw[t][w] += 1

        for i in range(n_iterations):
            print("Iteration %d/%d (%f%%)..." % (i, n_iterations, 100 * i / float(n_iterations)))
            for d_index in range(len(self.corpus)):
                print("Document %d/%d (%f%%)..." % (d_index, len(self.corpus), 100 * d_index / float(len(self.corpus))))
                d = self.corpus[d_index]
                for w in d:
                    # i. remove current word from counts
                    old_topic = self.word_topic_map[w]
                    # TODO
                    if n_dt[d_index][old_topic] == 0:
                        print("oops dt", d_index, old_topic)
                    else:
                        n_dt[d_index][old_topic] -= 1
                    n_tw[old_topic][w] -= 1

                    # ii. estimate probabilities using 5.6, 5.7
                    word_topic_probs = []
                    for t in range(n_topics):
                        p_t_d = float(n_dt[d_index][t] + alpha) / (sum(n_dt[d_index].values()) + n_topics * alpha)
                        p_w_t = float(n_tw[t][w] + alpha) / (sum(n_tw[t].values()) + len(self.vocab) * alpha)
                        word_topic_probs.append(p_w_t * p_t_d)

                    # iii. assign w to a topic randomly
                    word_topic_probs = [float(p) / sum(word_topic_probs) for p in word_topic_probs]
                    self.word_topic_map[w] = choice(range(n_topics), p=word_topic_probs)

                    # iv. increment counts accordingly
                    topic = self.word_topic_map[w]
                    n_tw[topic][w] += 1
                    n_dt[d_index][topic] += 1
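
Step ii above computes the usual smoothed count ratios (the snippet cites equations 5.6 and 5.7 of its source text, which is not reproduced here). With T topics, vocabulary size V and symmetric smoothing parameter \alpha, the loop corresponds to

\[
p(t \mid d) = \frac{n_{d,t} + \alpha}{\sum_{t'} n_{d,t'} + T\alpha},
\qquad
p(w \mid t) = \frac{n_{t,w} + \alpha}{\sum_{w'} n_{t,w'} + V\alpha},
\]

and step iii then draws the new topic with probability proportional to p(w | t) * p(t | d), normalized over all topics.
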
 def extract_feature(self):
     seed(self.seed)
     counter = 0
     overall_time = datetime.now()
     print_info_nn(
         " >>> Adding D2 shape distribution for database {0} ... ".format(
             self._database.name))
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor
         ]
         for protein in proteins:
             shape_dist_file = self._get_dir_name() + protein.name
             if not os.path.exists(shape_dist_file + ".npy"):
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 atoms = protein.atoms
                 neighbour_search = NeighborSearch(atoms)
                 distributions = np.zeros(
                     (len(protein.residues), self.number_of_bins))
                 # distributions = np.zeros((len(protein.residues), self.number_of_bins+2))
                 for i in range(len(protein.residues)):
                     residue = protein.residues[i]
                     nearby_residues = neighbour_search.search(
                         residue.center, self.radius, "R")
                     distributions[i, :] = self._compute_distribution(
                         nearby_residues)
                     # distributions[i:, -1] = len(nearby_residues)
                 np.save(shape_dist_file, distributions)
             distributions = np.load(shape_dist_file + ".npy")
             for i in range(len(protein.residues)):
                 protein.residues[i].add_feature(
                     Features.D2_PLAIN_SHAPE_DISTRIBUTION,
                     distributions[i, :])
                 # protein.residues[i].add_feature(Features.NUMBER_OF_NEIGHBOURS, distributions[i, -1])
     print_info("took {0} seconds.".format(
         (datetime.now() - overall_time).seconds))
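
_compute_distribution is not part of this excerpt. A D2 shape distribution is conventionally a histogram of distances between pairs of sampled points, so a hypothetical stand-in could look like the sketch below (assuming the nearby residues are Bio.PDB residues whose atoms expose get_coord(); the real method likely samples point pairs at random, which would explain the seed(self.seed) call, and uses its own bin range).

from itertools import combinations
import numpy as np

def d2_distribution_sketch(nearby_residues, number_of_bins=20, max_distance=30.0):
    # gather the atom coordinates of all nearby residues (Bio.PDB interface)
    coords = np.array([atom.get_coord()
                       for res in nearby_residues for atom in res.get_atoms()])
    if len(coords) < 2:
        return np.zeros(number_of_bins)
    # D2: histogram of pairwise distances between the collected points
    dists = [np.linalg.norm(a - b) for a, b in combinations(coords, 2)]
    hist, _ = np.histogram(dists, bins=number_of_bins, range=(0.0, max_distance))
    return hist / max(hist.sum(), 1)
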
Example No. 16
def variation(sigma, mu=[2, 3], power=2):
    y_var = [random.randint(2, 100) for i in range(30)]
    # print(y_variation)
    y_var = sorted(y_var)
    # print(y_variation)
    multiple_corr = []

    for i in y_var:
        new_sigma = sigma.copy()  # true copy: slicing a numpy array returns a view
        new_sigma[1][1] = i

        seed(1)
        new_data = np.random.multivariate_normal(mu, new_sigma, 100)
        multiple_corr.append((np.corrcoef(np.power(new_data[:, 0], power),
                                          np.power(new_data[:, 1],
                                                   power)))[1][0])

    return y_var, multiple_corr
    def test_permutation_subclass(self):
        class N(np.ndarray):
            pass

        random.seed(1)
        orig = np.arange(3).view(N)
        perm = random.permutation(orig)
        assert_array_equal(perm, np.array([0, 2, 1]))
        assert_array_equal(orig, np.arange(3).view(N))

        class M(object):
            a = np.arange(5)

            def __array__(self):
                return self.a

        random.seed(1)
        m = M()
        perm = random.permutation(m)
        assert_array_equal(perm, np.array([2, 1, 4, 0, 3]))
        assert_array_equal(m.__array__(), np.arange(5))
Example No. 20
def seed(x=0, y=0):
    if (x == 0 or y == 0):
        mt.seed()
    else:
        mt.seed((x, y))
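
A quick reproducibility check for the wrapper above, assuming mt is numpy.random.mtrand (numpy's legacy global RandomState): its seed() accepts a sequence of ints, so the same (x, y) pair reproduces the stream, while x == 0 or y == 0 falls back to entropy-based seeding.

import numpy as np
from numpy.random import mtrand as mt

mt.seed((3, 7))
a = np.random.random_sample(5)
mt.seed((3, 7))
b = np.random.random_sample(5)
assert np.array_equal(a, b)   # identical (x, y) pair -> identical draws
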
Example No. 21
def main():
    print_info("Starting the experiment")
    start_time = datetime.now()
    seed = 1
    #number_of_samples = 5000
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=-1, seed=seed)
    mtrand.seed(seed)
    feature_sets = [
        #[
        #    Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #    Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D2_PLAIN_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D1_PLAIN_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     Features.D2_SURFACE_SHAPE_DISTRIBUTION
        # ],
         [
             Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
             Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
             Features.D1_SURFACE_SHAPE_DISTRIBUTION
         ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.D2_CATEGORY_SHAPE_DISTRIBUTION
        # ],
        # [
        #     Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #     # Features.PROTRUSION_INDEX,
        #     # Features.B_VALUE,
        #     Features.HALF_SPHERE_EXPOSURE,
        #     Features.SECONDARY_STRUCTURE,
        #     Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #     Features.POSITION_SPECIFIC_SCORING_MATRIX,
        #     Features.POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #     Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     # # Features.PHI,
        #     # # Features.PSI,
        #     # Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #     Features.D2_SURFACE_SHAPE_DISTRIBUTION,
        #     # Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        #     # Features.D2_PLAIN_SHAPE_DISTRIBUTION,
        #     # Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        #     Features.RESIDUE_DEPTH
        # ]
    ]
    results = []
    for feature_set in feature_sets:
        print_special("Feature set {0}".format(feature_set))
        e = Experiment(feature_set, dbd4, Classifier.SVM)
        e.run(number_of_bins=20, radius=15, number_of_samples=-1, seed=seed, gamma=0.5, save=True, folds=5, rASA=.5)
        results.append(e.pyml_result)
        print_info("Took {0} seconds.".format((datetime.now() - start_time).seconds))
    save_results(number_of_samples, results, feature_sets)
Example No. 22
import random
import numpy as np
from numpy.random.mtrand import seed
import matplotlib.pyplot as plt

mean = [2, 3]
sigma = np.array([[1, 1.5], [1.5, 30]])
seed(10)
data = np.random.multivariate_normal(mean, sigma, 100)

# print(np.corrcoef(np.power(data[:, 0], 2), np.power(data[:, 1], 2)))


def variation(sigma, mu=[2, 3], power=2):
    y_var = [random.randint(2, 100) for i in range(30)]
    # print(y_variation)
    y_var = sorted(y_var)
    # print(y_variation)
    multiple_corr = []

    for i in y_var:
        new_sigma = sigma.copy()  # true copy: slicing a numpy array returns a view
        new_sigma[1][1] = i

        seed(1)
        new_data = np.random.multivariate_normal(mu, new_sigma, 100)
        multiple_corr.append((np.corrcoef(np.power(new_data[:, 0], power),
                                          np.power(new_data[:, 1],
                                                   power)))[1][0])

    return y_var, multiple_corr
 def test_beta_small_parameters(self):
     # Test that beta with small a and b parameters does not produce
     # NaNs due to roundoff errors causing 0 / 0, gh-5851
     random.seed(1234567890)
     x = random.beta(0.0001, 0.0001, size=100)
     assert_(not np.any(np.isnan(x)), 'Nans in random.beta')
Example No. 24
File: keras.py  Project: tedil/lyner
def autoencode(pipe: Pipe,
               layer_config: List[Dict],
               from_file: str,
               store_model: str,
               loss: str,
               optimiser: str,
               epochs: int,
               batch_size: int,
               shuffle: bool,
               validation_split: float,
               adjust_weights: float,
               mode: str):
    """Build and train an autoencoder."""
    import keras
    from keras import regularizers, Sequential, Input, Model
    from keras.callbacks import EarlyStopping, TensorBoard
    from keras.engine import InputLayer
    from keras.engine.saving import model_from_yaml, model_from_json
    from keras.layers import Dense
    from numpy.random.mtrand import seed
    from tensorflow import set_random_seed
    from lyner.keras_extras import SignalHandler
    seed(1)
    set_random_seed(2)
    matrix = pipe.matrix.copy()
    if matrix.isnull().values.any():
        LOGGER.warning("Dropping rows containing nan values")
        matrix.dropna(how='any', inplace=True)

    def parse_layout(layer_conf):
        get_layer_type = lambda t: getattr(keras.layers, t, None)
        regdict = {'l1_l2': regularizers.l1_l2, 'l1': regularizers.l1, 'l2': regularizers.l2}
        lc = layer_conf.copy()
        layer_type = lc.get('type', None)
        if layer_type:
            lc['type'] = get_layer_type(layer_type)

        # TODO parse regularizers
        kernel_reg_type = lc.get('kernel_regularizer', None)
        if kernel_reg_type:
            params = []  # no '(...)' part -> construct the regularizer with defaults
            if '(' in kernel_reg_type and ')' in kernel_reg_type:
                params = kernel_reg_type[kernel_reg_type.index('(') + 1:kernel_reg_type.index(')')]
                if '+' in params:
                    params = params.split('+')
                else:
                    params = [params]
                params = [float(p) for p in params]
                kernel_reg_type = kernel_reg_type[:kernel_reg_type.index('(')]
            lc['kernel_regularizer'] = regdict[kernel_reg_type](*params)
        return lc.pop('type'), int(lc.pop('n')), lc

    layout = [parse_layout(layer_conf) for layer_conf in layer_config]
    labels = matrix.columns.values.tolist()
    data = matrix.values
    shape = (data.shape[0],)
    data = data.transpose()
    if layout:
        encoding_dim = layout[-1][1]
        encoder = Sequential(name="encoder")
        encoder.add(InputLayer(shape, name="encoder_input"))
        for layer_num, (Layer, n_nodes, extra_args) in enumerate(layout):
            encoder.add(Layer(n_nodes, name=f"encoder_{layer_num}_{n_nodes}", **extra_args))
            # kernel_regularizer=regularizers.l1_l2(0.001, 0.001),
            # kernel_regularizer=regularizers.l1(0.0001),

        decoder = Sequential(name="decoder")
        decoder.add(InputLayer((encoding_dim,), name="decoder_input"))
        for layer_num, (Layer, n_nodes, _) in enumerate(layout[::-1][1:]):
            decoder.add(Layer(n_nodes, name=f"decoder_{layer_num}_{n_nodes}"))
        decoder.add(Dense(shape[0], activation='linear', name="decoder_output"))

        input_layer = Input(shape=shape, name="autoencoder_input")
        encode_layer = encoder(input_layer)
        decode_layer = decoder(encode_layer)

        autoencoder = Model(input_layer, decode_layer)
        if store_model:
            if store_model.endswith('.yaml'):
                model_string = autoencoder.to_yaml()
            elif store_model.endswith('.json'):
                model_string = autoencoder.to_json()
            else:
                model_string = autoencoder.to_yaml()
            with open(store_model, 'wt') as writer:
                writer.write(model_string)
    elif from_file:
        with open(from_file, 'rt') as reader:
            model_string = '\n'.join(reader.readlines())
        if from_file.endswith('.yaml'):
            autoencoder = model_from_yaml(model_string)
        elif from_file.endswith('.json'):
            autoencoder = model_from_json(model_string)
        # TODO set encoder and decoder correctly
    else:
        raise ValueError("No model specified. Use either of --layer-config or --from-file.")
    # from pprint import pprint
    # pprint(autoencoder.get_config())
    autoencoder.compile(optimizer=optimiser, loss=loss, metrics=['mse'], )

    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0000001, patience=50)

    sh = SignalHandler()
    autoencoder.fit(np.vsplit(data, 1), np.vsplit(data, 1),
                    callbacks=[TensorBoard(log_dir='/tmp/autoencoder'), sh, early_stopping],
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=validation_split,
                    shuffle=shuffle
                    )
    sh.uninit()

    class Autoencoder:
        def __init__(self, encoder=None, decoder=None):
            self._encoder = encoder
            self._decoder = decoder

        def inverse_transform(self, data):
            return self._decoder.predict(data).transpose()

        def transform(self, data):
            return self._encoder.predict(data).transpose()

    pipe.decomposition = Autoencoder(encoder, decoder)

    encoded_data = pipe.decomposition.transform(data)
    decoded_data = pipe.decomposition.inverse_transform(encoded_data.T)
    pre_error = ((data.T - decoded_data) ** 2).mean(axis=None)
    print(f"MSE: {pre_error}")

    pipe._index = pipe.matrix.index
    pipe._columns = pipe.matrix.columns
    if adjust_weights:
        quant = float(adjust_weights)
        for i, layer in enumerate(encoder.layers):
            W, b = layer.get_weights()
            low, median, high = np.quantile(W.flatten(), [quant, 0.5, 1 - quant])
            W_low = W * (W < low)
            W_high = W * (W > high)
            selected_weights = W_low + W_high
            # oplot([Histogram(x=W.flatten()), Histogram(x=W[W < low].flatten()), Histogram(x=W[W > high].flatten())])
            layer.set_weights([selected_weights, b])
            break
        encoded_data = pipe.decomposition.transform(data)
        decoded_data = pipe.decomposition.inverse_transform(encoded_data.T)
        post_error = ((data.T - decoded_data) ** 2).mean(axis=None)
        print(f"MSE: {post_error}")
    if 'weights' == mode:
        layer = 0
        layer_weights = encoder.layers[layer].get_weights()
        layer = encoder.layers[layer]
        if len(layer_weights) == 0:
            layer_weights = encoder.layers[0].get_weights()
        if len(layer_weights) >= 2:
            layer_weights = layer_weights[:-1]  # last one is bias
        new_data = layer_weights[0]
        index = [f'Weight_{i}' for i in range(new_data.shape[0])]
        num_nodes = new_data.shape[1]
        columns = [f"{layer.name}_{i}" for i in range(num_nodes)]
    elif 'nodes' == mode:
        new_data = encoder.predict(np.vsplit(data, 1)).transpose()
        columns = labels
        index = [f"{mode}_{i}" for i in range(encoding_dim)]
    elif 'discard' == mode:
        W, b = encoder.layers[0].get_weights()
        W = np.sum(np.abs(W), axis=1)
        W[W != 0] = 1
        print(f"Kept {np.sum(W)} weights")
        v: np.array = pipe.matrix.values
        new_data = (v.T * W).T
        columns = pipe.matrix.columns
        index = pipe.matrix.index
    else:
        raise ValueError(f"Unknown mode {mode}")
    pipe.matrix = pd.DataFrame(data=new_data,
                               columns=columns,
                               index=index,
                               )
    return
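
The adjust_weights branch above keeps only the most extreme encoder weights. A small numpy-only illustration of that quantile masking (standalone, not tied to Keras; the array and quantile value are made up):

import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(8, 4))
quant = 0.1
low, high = np.quantile(W.flatten(), [quant, 1 - quant])
pruned = W * (W < low) + W * (W > high)   # same masking as in the snippet
print(f"kept {np.count_nonzero(pruned)} of {W.size} weights")
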
 def test_permutation_longs(self):
     random.seed(1234)
     a = random.permutation(12)
     random.seed(1234)
     b = random.permutation(long(12))
     assert_array_equal(a, b)