Пример #1
0
    def get_features(self, scale=False):
        """Load the train/test fingerprints, clean them and return them.

        The 'fingerprint' and 'target' dataframes are loaded through
        ``self.Workflow.load_dataframe`` and their 'id' columns are split
        off. The remaining values are passed to ``clean_features``; the
        rows it reports as bad are then removed from ``self.train_ids``,
        ``self.targets`` and ``self.test_ids``.

        Parameters
        ----------
        scale : bool
            Forwarded unchanged to ``clean_features``.

        Returns
        -------
        The first element returned by ``clean_features`` for the
        ``{'train': ..., 'test': ...}`` input.

        Side effects
        ------------
        Overwrites ``self.targets``, ``self.train_ids`` and
        ``self.test_ids``.
        """
        workflow = self.Workflow

        fingerprints_train = workflow.load_dataframe('fingerprint',
                                                     ids=self.train_ids)
        train_id_column = fingerprints_train.pop('id')

        target_frame = workflow.load_dataframe('target', ids=self.train_ids)
        target_id_column = target_frame.pop('id')

        fingerprints_test = workflow.load_dataframe('fingerprint',
                                                    ids=self.test_ids)
        test_id_column = fingerprints_test.pop('id')

        # Sanity checks: features and targets must describe the same
        # structures, and the test rows must match the requested ids.
        assert np.all(train_id_column == target_id_column)
        assert np.all(test_id_column == self.test_ids)
        self.targets = target_frame.values

        raw_features = {'train': fingerprints_train.values,
                        'test': fingerprints_test.values}
        cleaned, dropped = clean_features(raw_features, scale=scale)

        # np.delete is called without an axis, so the results are
        # flattened copies with the dropped entries removed.
        dropped_train = dropped['train']
        self.train_ids = np.delete(self.train_ids, dropped_train)
        self.targets = np.delete(self.targets, dropped_train)
        self.test_ids = np.delete(self.test_ids, dropped['test'])

        return cleaned
Пример #2
0
    def plot_fingerprint_variation(self):
        """Plot the cleaned, scaled fingerprints of the initial structures.

        Fingerprints for the ids returned by
        ``self.DB.get_initial_structure_ids()`` are loaded, passed through
        ``clean_features`` with ``scale=True`` and drawn with one line per
        structure. The feature columns are ordered by their maximum value
        over all structures.
        """
        structure_ids = self.DB.get_initial_structure_ids()
        frame = self.DB.load_dataframe('fingerprint', ids=structure_ids)
        # NOTE(review): the frame values are used as-is; if load_dataframe
        # returns an 'id' column it is treated as a feature here - confirm.
        cleaned, _ = clean_features({'train': frame.values, 'test': None},
                                    scale=True)
        fingerprints = cleaned['train']

        # Order the feature columns by their largest value.
        column_order = np.argsort(np.max(fingerprints, axis=0))
        p.plot(fingerprints[:, column_order].T)

        p.xlabel('Feature id')
        p.ylabel('Standardized feature value')
        p.title('Fingerprints for {} structures'.format(len(fingerprints)))
        p.show()
Пример #3
0
    def run_ml_ga_optimization(self,
                               master_parameters=None,
                               optimize_lattice=True,
                               optimize_angles=True,
                               optimize_wyckoffs=True,
                               use_fitness_sharing=False,
                               batch_size=6,
                               max_candidates=1,
                               debug=False):
        """
        ML-accelerated genetic algorithm optimization
        for free wyckoff coordinates and lattice angles.

        A random pool of candidate parameter sets is generated. In every
        iteration a batch of candidates is built, its lattice constants
        are optimized and its fitness is evaluated. The evaluated
        candidates train a regression model, new candidates are added to
        the pool by crossover and mutation of the best evaluated ones, and
        the next batch is picked from the pool with two acquisition
        functions (prediction + 0.1 / 0.5 * uncertainty).

        Parameters
        ----------
        master_parameters : dict or None
            Cell parameters that override ``self.initial_guess()``.
        optimize_lattice : bool
            Accepted for interface compatibility; not used in this method.
        optimize_angles : bool
            Include ``self.angle_variables`` in the search space.
        optimize_wyckoffs : bool
            Include ``self.coor_variables`` in the search space.
        use_fitness_sharing : bool
            Accepted for interface compatibility; not used in this method.
        batch_size : int
            Number of candidates evaluated per iteration.
        max_candidates : int
            Upper bound on the number of structures considered for return.
        debug : bool
            If true, plot predictions and uncertainties every iteration.

        Returns
        -------
        list
            Atoms objects of the selected best candidates.

        Side effects
        ------------
        Sets ``self.min_distances``.
        """

        cell_parameters = self.initial_guess()
        if master_parameters:
            cell_parameters.update(master_parameters)

        feature_variables = []
        if optimize_wyckoffs:
            feature_variables += self.coor_variables

        if optimize_angles:
            feature_variables += self.angle_variables

        n_parameters = len(feature_variables)
        population_size = min([5000, n_parameters * 100])

        # Random initial pool: `population` holds the parameter dicts and
        # `test_features` the matching feature rows (same order).
        population = []
        test_features = []
        for _ in range(population_size):
            candidate = {}
            row = []
            if optimize_wyckoffs:
                for name in self.coor_variables:
                    val = rand(1)[0] * 0.99
                    candidate[name] = val
                    row.append(val)

            if optimize_angles:
                for name in self.angle_variables:
                    # angles are drawn from the interval 90 +/- 15
                    val = (rand(1)[0] - 0.5) * 30 + 90
                    candidate[name] = val
                    row.append(val)
            population.append(candidate)
            test_features.append(row)

        atoms = self.construct_atoms()
        covalent_radii = np.array([cradii[n] for n in atoms.numbers])
        M = covalent_radii * np.ones([len(atoms), len(atoms)])
        # pairwise sums of covalent radii
        self.min_distances = (M + M.T)

        primitive_voronoi = len(atoms) > 64

        test_features = np.array(test_features)
        batch_indices = np.random.randint(len(test_features), size=batch_size)

        train_features = None

        # `fitness`, `train_features`, `train_population` and
        # `all_structures` are parallel: entry k of each describes the
        # same successfully evaluated candidate.
        fitness = np.array([])
        all_structures = []
        train_population = []

        converged = False
        iter_id = 1
        while not converged:
            failed_indices = []
            # Remember every index of this batch (failed ones included) so
            # all of them can be removed from the pool in one step below.
            evaluated_indices = np.array(batch_indices, dtype=int)
            for i in batch_indices:
                pop = population[i]
                cell_parameters.update(pop)
                atoms = self.construct_atoms(cell_parameters)
                atoms = self.optimize_lattice_constants(
                    atoms, proximity=0.9, optimize_wyckoffs=False)
                if atoms is None:
                    failed_indices += [i]
                    continue
                # Only successfully evaluated candidates are recorded, so
                # that train_population stays aligned with fitness.
                train_population += [pop]
                cellpar = cell_to_cellpar(atoms.get_cell())

                if primitive_voronoi:
                    # Use primitive cell for voronoi analysis for
                    # large systems
                    for k, param_name in enumerate(
                            ['a', 'b', 'c', 'alpha', 'beta', 'gamma']):
                        if param_name in cell_parameters:
                            cell_parameters.update({param_name: cellpar[k]})

                    atoms = self.construct_atoms(cell_parameters,
                                                 primitive_cell=True)

                fit = get_fitness(atoms)

                connections = None
                if fit > -2:
                    connections = get_connections(atoms, decimals=1)

                fitness = np.append(fitness, fit)
                all_structures += [{
                    # Snapshot: cell_parameters is mutated for every
                    # candidate, so storing the dict itself would make all
                    # entries share the last evaluated parameters.
                    'parameters': dict(cell_parameters),
                    'atoms': atoms.copy(),
                    'fitness': fit,
                    'graph': connections
                }]

            best_fitness = np.max(fitness)
            if self.verbose:
                print('  {}'.format(best_fitness.round(2)))

            good_indices = np.array(
                [idx for idx in batch_indices if idx not in failed_indices],
                dtype=int)
            if train_features is None:
                train_features = test_features[good_indices]
            else:
                train_features = np.append(train_features,
                                           test_features[good_indices],
                                           axis=0)

            # Remove every evaluated candidate from the pool with a single
            # delete per array; deleting in two passes would apply stale
            # indices, and the pool arrays must stay aligned.
            test_features = np.delete(test_features, evaluated_indices,
                                      axis=0)
            population = np.delete(population, evaluated_indices, axis=0)

            # Genetic step: breed new candidates from the 10 fittest
            # evaluated parameter sets.
            indices = np.argsort(fitness)[::-1]
            ga_survived = np.array(train_population)[indices][:10]
            new_population = self.crossover(ga_survived, n_children=50) + \
                self.mutation(ga_survived, n_children=10)

            for pop in new_population:
                val = np.expand_dims([pop[v] for v in feature_variables],
                                     axis=0)

                test_features = np.append(test_features, val, axis=0)
                population = np.append(population, pop)

            features, bad_indices = \
                clean_features({'train': train_features,
                                'test': test_features})

            # Drop the rows rejected by clean_features from every
            # train-side container so they stay parallel to fitness.
            kept_rows = np.delete(np.arange(len(fitness)),
                                  bad_indices['train'])
            fitness = fitness[kept_rows]
            train_features = train_features[kept_rows]
            train_population = [train_population[k] for k in kept_rows]
            all_structures = [all_structures[k] for k in kept_rows]

            # The pool (population) is parallel to test_features, so it is
            # trimmed with the test indices.
            test_features = np.delete(test_features,
                                      bad_indices['test'],
                                      axis=0)
            population = np.delete(population, bad_indices['test'])

            try:
                Model = get_regression_model('catlearn')(
                    features['train'],
                    np.array(fitness),
                    optimize_hyperparameters=True,
                    kernel_width=1,
                )
            except Exception:
                # Best-effort fallback: retry with fixed hyperparameters
                # when the optimized fit fails.
                Model = get_regression_model('catlearn')(
                    features['train'],
                    np.array(fitness),
                    optimize_hyperparameters=False,
                    kernel_width=3)

            result = Model.predict(features['test'])
            predictions = result['prediction']
            unc = result['uncertainty']

            # Acquisition function mix two acquisition functions
            AQU1 = predictions + 0.1 * unc
            AQU2 = predictions + 0.5 * unc

            indices1 = np.argsort(AQU1)[::-1][:batch_size]
            indices2 = np.argsort(AQU2)[::-1][:batch_size]

            # keep only the second-function picks not already in the first
            indices2 = np.array(
                [int(aqu_i) for aqu_i in indices2 if aqu_i not in indices1])

            bs1 = batch_size // 2
            bs2 = batch_size - bs1
            batch_indices = np.array(np.append(indices1[:bs1], indices2[:bs2]),
                                     dtype=int)

            iter_id += 1

            if debug:
                import pylab as p
                idx = np.argsort(predictions)
                p.plot(range(len(predictions)), predictions[idx])
                p.plot(range(len(predictions)), predictions[idx] + unc[idx],
                       '--')
                p.plot(range(len(predictions)),
                       predictions[idx] * 0 + best_fitness, '--')
                p.show()

            # Stop after 30 iterations, when the pool is nearly exhausted,
            # or (after 5 iterations) when no candidate is expected to
            # beat the best fitness found so far.
            if iter_id > 30 or len(predictions) < 7:
                converged = True
            elif not np.max(AQU1) > best_fitness and iter_id > 5:
                converged = True

        indices = np.argsort(fitness)[::-1][:max_candidates]
        fitness = fitness[indices]

        all_structures = np.array(all_structures)[indices]
        all_graphs = np.array([s['graph'] for s in all_structures])
        f_max = max(fitness)

        if fitness[0] < 0.8:
            indices = [0]
        else:
            # NOTE(review): the membership test below is presumably meant
            # to skip structures whose graph duplicates an earlier one;
            # its behavior depends on the type returned by
            # get_connections - confirm.
            indices = [
                i for i, f in enumerate(fitness) if f > 0.8 *
                f_max and not np.any(all_graphs[i] in all_graphs[:i])
            ]

        if primitive_voronoi:
            # generate conventional structure
            all_atoms = [
                self.construct_atoms(all_structures[i]['parameters'])
                for i in indices
            ]
        else:
            all_atoms = [all_structures[i]['atoms'] for i in indices]
        return all_atoms