Пример #1
0
    def save(model, features=None, path="", label="ml4chem"):
        """Write a model's weights and parameters to disk

        The weights are stored in ``<path><label>.ml4c`` and a JSON
        description of the model (plus the features' parameters, when given)
        is stored in ``<path><label>.params``.

        Parameters
        ----------
        model : obj
            The model to be saved.
        features : obj
            Features object. Its ``params`` attribute is added to the
            .params file under the "fingerprints" key.
        path : str
            Prefix prepended verbatim to ``label`` to build the file names.
        label : str
            Name of files. Default ml4chem.
        """
        model_name = model.name()
        prefix = path + label
        weights_file = prefix + ".ml4c"

        if model_name not in Potentials.svm_models:
            # Neural-network-like model: describe its architecture and let
            # torch serialize the state dictionary.
            description = {
                "name": model_name,
                "hiddenlayers": model.hiddenlayers,
                "activation": model.activation,
                "type": "nn",
                "input_dimension": model.input_dimension,
            }
            params = {"model": description}
            torch.save(model.state_dict(), weights_file)
        else:
            # Kernel models already carry their own parameters dictionary.
            params = {"model": model.params}
            dump(model.weights, weights_file)

        if model_name == "AutoEncoder":
            params["model"].update(
                {"output_dimension": model.output_dimension})

        if features is not None:
            # Adding fingerprints to .params json file.
            params.update({"fingerprints": features.params})

        # The file is opened in binary mode, so the JSON text is encoded
        # through a UTF-8 stream writer.
        utf8_writer = codecs.getwriter("utf-8")
        with open(prefix + ".params", "wb") as json_file:
            json.dump(params,
                      utf8_writer(json_file),
                      ensure_ascii=False,
                      indent=4)
Пример #2
0
def autoencode():
    """Train an AutoEncoder on Gaussian features of ``cu_training.traj``.

    The features are used both as inputs and as targets of the model. After
    training, the latent space is dumped to ``cu_training.latent`` and the
    model is saved with ``Potentials.save``.

    Returns
    -------
    latent_space, energy_targets, data_handler
        The latent space returned by the trained autoencoder, the energy
        targets returned by the data handler, and the data handler itself.
    """
    purpose = "training"

    # Load the images with ASE
    images = Trajectory("cu_training.traj")

    # Data structure preparation
    data_handler = DataSet(images, purpose=purpose)
    training_set, energy_targets = data_handler.get_images(purpose=purpose)

    # Let's create the targets of the model
    fingerprints = Gaussian(
        cutoff=6.5,
        normalized=True,
        save_preprocessor="cu_training.scaler",
    )
    targets = fingerprints.calculate_features(
        training_set,
        data=data_handler,
        purpose=purpose,
        svm=False,
    )

    # The feature vector of the first atom of the first image fixes the
    # dimension used for both ends of the autoencoder.
    first_atom = list(targets.values())[0][0]
    output_dimension = len(first_atom[1])

    # Building AutoEncoder
    autoencoder = AutoEncoder(
        hiddenlayers={"encoder": (20, 10, 4), "decoder": (4, 10, 20)},
        activation="tanh",
    )

    data_handler.get_unique_element_symbols(images, purpose=purpose)
    autoencoder.prepare_model(output_dimension,
                              output_dimension,
                              data=data_handler)

    # Training: the autoencoder is asked to reproduce its own inputs.
    train(
        targets,
        targets,
        model=autoencoder,
        data=data_handler,
        optimizer=("lbfgs", {"lr": 1e-0}),
        regularization=None,
        epochs=2000,
        convergence={"rmse": 5e-2},
        lossfxn=None,
        device="cpu",
    )

    latent_space = autoencoder.get_latent_space(targets, svm=True)
    dump(latent_space, filename="cu_training.latent")

    Potentials.save(autoencoder)

    return latent_space, energy_targets, data_handler
Пример #3
0
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Calculate the features per atom in an atoms objects

        When ``self.filename`` exists and ``self.overwrite`` is False the
        features are loaded from that file. Otherwise one atomic fingerprint
        is scheduled per atom, everything is computed with dask, optionally
        preprocessed, and regrouped per image.

        Parameters
        ----------
        images : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models. Returned as the second
            element of a tuple when purpose is 'training' and svm is True, or
            when a cached file holding both spaces is loaded.
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        # FIXME the block below should become a function.
        # self.filename may be None (see the checks before dump() below) and
        # os.path.isfile(None) raises TypeError, so None is tested first.
        if (self.filename is not None and os.path.isfile(self.filename)
                and self.overwrite is False):
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]

            # The loaded content is kept apart from the `data` argument: when
            # nothing below returns, the data handler is still needed.
            cached = load(self.filename)

            cached_hashes = list(cached.keys())
            image_hashes = list(images.keys())

            if image_hashes == cached_hashes:
                # Check if both lists are the same.
                return cached
            elif image_hashes and all(key in cached for key in image_hashes):
                # Every requested image is cached: return just that subset.
                # A partial overlap falls through and is recomputed instead
                # of failing with a KeyError.
                return {key: cached[key] for key in image_hashes}

            if svm_keys == cached_hashes:
                return cached[svm_keys[0]], cached[svm_keys[1]]

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))
        elif isinstance(data.unique_element_symbols, dict):
            # The data handler already knows the symbols, keyed by purpose.
            unique_element_symbols = data.unique_element_symbols[purpose]
        else:
            # NOTE(review): presumably already the collection of symbols for
            # this purpose -- confirm against the data handler.
            unique_element_symbols = data.unique_element_symbols

        # we make the features
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                'Using parameters from file to create symmetry functions...\n')

        self.print_fingerprint_params(self.GP)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic fingerprints.
        logger.info("")
        logger.info("Adding atomic feature calculations to scheduler...")

        ini = end = 0

        computations = []
        # This list is used to reconstruct images from atoms.
        atoms_index_map = []

        for image in images.values():
            n_atoms = len(image)
            end = ini + n_atoms
            atoms_index_map.append(list(range(ini, end)))
            ini = end

            if n_atoms == 0:
                # Nothing to fingerprint for this image.
                continue

            # The neighbor list and the symbols array depend only on the
            # image, so they are built once per image and not once per atom.
            nl = get_neighborlist(image, cutoff=self.cutoff)
            chemical_symbols = np.array(image.get_chemical_symbols())

            for atom in image:
                index = atom.index
                symbol = atom.symbol
                # n_indices: neighbor indices for central atom_i.
                # n_offsets: neighbor offsets for central atom_i.
                n_indices, n_offsets = nl[index]

                n_symbols = chemical_symbols[n_indices]
                neighborpositions = image.positions[n_indices] + np.dot(
                    n_offsets, image.get_cell())

                afp = self.get_atomic_fingerprint(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    self.preprocessor,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                computations.append(afp)

        scheduler_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        # In this block we compute the fingerprints.
        logger.info("")
        logger.info("Computing fingerprints...")

        stacked_features = dask.compute(*computations,
                                        scheduler=self.scheduler)

        if self.preprocessor is not None:
            stacked_features = np.array(stacked_features)

        # Clean
        del computations

        if purpose == "training":
            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            client = dask.distributed.get_client()

            if self.preprocessor is not None:
                dim = stacked_features.shape
                stacked_features = dask.array.from_array(stacked_features,
                                                         chunks=dim)
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)

            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]

            # One future per image, holding the features of its atoms.
            per_image_features = [
                client.submit(self.stack_features,
                              *(indices, stacked_features))
                for indices in atoms_index_map
            ]

            if self.preprocessor is not None:
                scaled_feature_space = per_image_features
            else:
                feature_space = per_image_features

            del stacked_features
            computations = []

            # More data processing depending on the method used.
            if svm:
                reference_space = []

                # NOTE(review): this path reads scaled_feature_space, which
                # only exists when a preprocessor is set; svm=True without a
                # preprocessor fails here with UnboundLocalError, as before.
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(
                            i,
                            image,
                            scaled_feature_space=scaled_feature_space,
                            svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, scaled_feature_space))

                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                # Pick the keyword explicitly instead of relying on an
                # UnboundLocalError to detect the missing scaled space.
                if self.preprocessor is not None:
                    restack_kwargs = {
                        "scaled_feature_space": scaled_feature_space
                    }
                else:
                    restack_kwargs = {"feature_space": feature_space}

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i,
                                           image,
                                           svm=svm,
                                           **restack_kwargs))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)
            feature_space = OrderedDict(feature_space)
            del computations

            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)
            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            if svm:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    payload = {"feature_space": feature_space}
                    payload.update({"reference_space": reference_space})
                    dump(payload, filename=self.filename)
                return feature_space, reference_space
            else:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    dump(feature_space, filename=self.filename)
                return feature_space

        elif purpose == "inference":
            feature_space = OrderedDict()
            scaled_feature_space = preprocessor.transform(stacked_features)

            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                # NOTE(review): `index` restarts at 0 for every image while
                # scaled_feature_space stacks the atoms of all images; with
                # more than one image this looks like it reuses the first
                # rows -- confirm before relying on multi-image inference.
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)

            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            return feature_space
Пример #4
0
    def save(model=None, features=None, path=None, label="ml4chem"):
        """Save a model and/or the parameters of its features

        Up to two files named after ``label`` are written inside ``path``:
        ``<label>.ml4c`` with the model weights (only when ``model`` is
        given) and ``<label>.params`` with a JSON description of the model
        and the features.

        Parameters
        ----------
        model : obj
            The model to be saved. When None, no weights file is written and
            the .params file only holds the features' parameters, if any.
        features : obj
            Features object. Its ``params`` attribute is stored under the
            "features" key of the .params file.
        path : str
            The directory where to save the model. It is created when it
            does not exist. None or an empty string mean the current
            directory.
        label : str
            Name of files. Default ml4chem.
        """

        # An empty string used to reach os.makedirs("") and crash; treat it
        # like None.
        if not path:
            path = "."

        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(path, exist_ok=True)

        # os.path.join copes with paths given with or without a trailing
        # separator.
        path = os.path.join(path, label)

        if model is not None:
            model_name = model.name()
            if model_name in Potentials.svm_models:
                params = {"model": model.params}

                # Save model weights to file
                dump(model.weights, path + ".ml4c")
            else:
                # FIXME a global class to save params?
                params = {
                    "model": {
                        "name": model_name,
                        "class_name": model.__class__.__name__,
                        "hiddenlayers": model.hiddenlayers,
                        "activation": model.activation,
                        "type": "nn",
                        "input_dimension": model.input_dimension,
                    }
                }

                torch.save(model.state_dict(), path + ".ml4c")

                if model_name in Potentials.autoencoders:
                    # Autoencoders need extra attributes in the description.
                    params["model"].update({
                        "output_dimension": model.output_dimension,
                        "variant": model.variant,
                        "one_for_all": model.one_for_all,
                    })
        else:
            params = {}

        if features is not None:
            # Adding features to .params json file. The `features` argument
            # is deliberately not rebound to a different object here.
            params["features"] = features.params

        # Save parameters to file. The file is binary, so the JSON text is
        # encoded through a UTF-8 stream writer.
        with open(path + ".params", "wb") as json_file:
            json.dump(
                params,
                codecs.getwriter("utf-8")(json_file),
                ensure_ascii=False,
                indent=4,
            )
Пример #5
0
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Return features per atom in an atoms objects

        Features are loaded from ``self.filename`` when that file exists and
        ``self.overwrite`` is False. Otherwise they are collected atom by
        atom with ``self.get_atomic_features``, optionally preprocessed, and
        dumped to ``self.filename`` before being returned.

        Parameters
        ----------
        images : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}

        Notes
        -----
        When the features are loaded from a file whose keys are exactly
        ``feature_space`` and ``reference_space``, a tuple
        ``(feature_space, reference_space)`` is returned instead.
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        # Reuse previously saved features unless overwriting was requested.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            # NOTE: this rebinds the `data` argument; both branches below
            # return, so the data handler is not needed afterwards.
            data = load(self.filename)

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space
            else:
                return data

        initial_time = time.time()

        # Verify that we know the unique element symbols
        # NOTE(review): unique_element_symbols is only logged; it is not
        # read again in this method.
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations with delayed functions to operate
        # with dask's scheduler. These computations get cartesian coordinates.
        computations = []

        for image in images.items():
            key, image = image

            # One list of atomic features per image. It is appended to
            # `computations` first and filled in place by the loop below.
            feature_vectors = []

            computations.append(feature_vectors)

            for atom in image:
                if self.preprocessor is not None:
                    # In this case we will preprocess data and need numpy
                    # arrays to operate with sklearn.
                    afp = self.get_atomic_features(atom, svm=True)
                    # Only the second element of the result is kept here.
                    feature_vectors.append(afp[1])
                else:
                    afp = self.get_atomic_features(atom, svm=svm)
                    feature_vectors.append(afp)

        # In this block we compute the delayed functions in computations.
        feature_space = dask.compute(*computations, scheduler=self.scheduler)

        hashes = list(images.keys())

        if self.preprocessor is not None and purpose == "training":
            feature_space = np.array(feature_space)
            dim = feature_space.shape

            if len(dim) > 1:
                # The array is unpacked as three-dimensional, flattened over
                # its first two axes to fit the preprocessor, and restored
                # to its original shape afterwards.
                d1, d2, d3 = dim
                feature_space = feature_space.reshape(d1 * d2, d3)
                feature_space = preprocessor.fit(feature_space,
                                                 scheduler=self.scheduler)
                feature_space = feature_space.reshape(d1, d2, d3)
            else:
                # One-dimensional array: the per-image entries are stacked
                # into a single array by hand.
                # NOTE(review): preprocessor.fit is not called in this
                # branch, and atoms_index_map and d2 are not read again in
                # this method -- confirm whether scaling is missing here.
                atoms_index_map = []
                stack = []

                d1 = ini = end = 0

                for i in feature_space:
                    end = ini + len(i)
                    atoms_map = list(range(ini, end))
                    atoms_index_map.append(atoms_map)
                    ini = end

                    for j in i:
                        stack.append(j)
                        d1 += 1

                feature_space = np.array(stack)

                d2 = len(stack[0])
                del stack

            # More data processing depending on the method used.
            computations = []

            if svm:
                reference_space = []

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, feature_space))

                # NOTE(review): reference_space is computed but neither
                # returned nor dumped below -- confirm this is intended.
                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)

            feature_space = OrderedDict(feature_space)

            # Save preprocessor.
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        elif self.preprocessor is not None and purpose == "inference":
            # We take stacked features and preprocess them
            stacked_features = np.array(feature_space)
            d1, d2, d3 = stacked_features.shape
            stacked_features = stacked_features.reshape(d1 * d2, d3)
            feature_space = OrderedDict()
            scaled_feature_space = preprocessor.transform(stacked_features)

            # Once preprocessed, they are wrapped as a dictionary.
            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                # NOTE(review): `index` restarts at 0 for every image while
                # scaled_feature_space holds the rows of all images -- verify
                # the indexing when more than one image is passed.
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))
        else:

            # No preprocessing: pair every image hash with its features.
            feature_space = OrderedDict(zip(hashes, feature_space))

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Fingerprinting finished in {} hours {} minutes {:.2f} "
                    "seconds.\n".format(h, m, s))

        # The features are always written to self.filename before returning.
        if svm:
            data = {"feature_space": feature_space}
            dump(data, filename=self.filename)
        else:
            dump(feature_space, filename=self.filename)

        return feature_space
Пример #6
0
def hybrid():
    """Train an AutoEncoder merged with a NeuralNetwork potential.

    Cartesian features of ``cu_training.traj`` feed the autoencoder, whose
    ``get_latent_space`` method is passed as the input of the neural
    network. Both models are trained through ``ModelMerger``, saved with
    ``Potentials.save``, and the latent space of the first merged model is
    dumped to ``checkme.latent``.
    """
    purpose = "training"
    latent_dimension = 32
    activation = "tanh"

    # Load the images with ASE, and prepare data handler
    images = Trajectory("cu_training.traj")
    data_handler = Data(images, purpose=purpose)
    data_handler.get_unique_element_symbols(images, purpose=purpose)
    training_set, energy_targets = data_handler.get_data(purpose=purpose)

    # Preparing the input
    features = Cartesian(
        preprocessor=("MinMaxScaler", {"feature_range": (-1, 1)}),
        save_preprocessor="cartesian.scaler",
    )
    _inputs = features.calculate(training_set, data=data_handler)

    # Building AutoEncoder Model1
    autoencoder = AutoEncoder(
        hiddenlayers={
            "encoder": (144, 72, latent_dimension),
            "decoder": (latent_dimension, 72, 144),
        },
        activation=activation,
    )
    autoencoder.prepare_model(3, 3, data=data_handler)

    # Building the ml potential model
    nodes = 40
    nn = NeuralNetwork(hiddenlayers=(nodes, nodes), activation=activation)
    nn.prepare_model(latent_dimension, data=data_handler)

    merged = ModelMerger([autoencoder, nn])

    # One loss function per merged model; EncoderMapLoss was considered as
    # an alternative for the autoencoder.
    losses = [MSELoss, AtomicMSELoss]

    # The second model is fed by the autoencoder's latent space.
    inputs = [_inputs, autoencoder.get_latent_space]
    targets = [_inputs, energy_targets]

    # Optimizer
    optimizer = (
        "adam",
        {"lr": 1e-4, "weight_decay": 1e-5, "amsgrad": True},
    )

    merged.train(
        inputs=inputs,
        targets=targets,
        data=data_handler,
        regularization=None,
        convergence={"rmse": [1.5e-1, 1.0e-1]},
        optimizer=optimizer,
        device="cpu",
        batch_size=2,
        lr_scheduler=None,
        lossfxn=losses,
        independent_loss=True,
    )

    # Each model is saved under a label made of its position and its name.
    for position, trained in enumerate(merged.models):
        Potentials.save(trained,
                        label="{}_{}".format(position, trained.name()))

    latent = merged.models[0].get_latent_space(inputs[0])
    dump(latent, filename="checkme.latent")
Пример #7
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        image : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))
        logger.info(f"Module name: {self.name()}.")

        # FIXME the block below should become a function.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning(f"Loading features from {self.filename}.")
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check if any of the elem
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(f"Getting unique element symbols for {purpose}")

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        # we make the features
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                "Using parameters from file to create symmetry functions...\n")

        self.print_features_params(self.GP)

        symbol = data.unique_element_symbols[purpose][0]
        sample = np.zeros(len(self.GP[symbol]))

        self.dimension = len(sample)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_index_map = [
        ]  # This list is used to reconstruct images from atoms.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        ini = end = 0
        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                _, image = image
                end = ini + len(image)
                atoms_index_map.append(list(range(ini, end)))
                ini = end
                for atom in image:
                    index = atom.index
                    symbol = atom.symbol

                    cutoff_keys = ["radial", "angular"]
                    n_symbols, neighborpositions = {}, {}

                    if isinstance(self.cutoff, dict):
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(
                                image, cutoff=self.cutoff[cutoff_key])
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_
                    else:
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(image, cutoff=self.cutoff)
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_

                    afp = self.get_atomic_features(
                        atom,
                        index,
                        symbol,
                        n_symbols,
                        neighborpositions,
                        image_molecule=image,
                        weighted=self.weighted,
                        n_indices=n_indices,
                    )

                    intermediate.append(afp)

            intermediate = client.persist(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        scheduler_time = time.time() - initial_time

        dask.distributed.wait(stacked_features)

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        logger.info("")

        if self.preprocessor is not None:

            scaled_feature_space = []

            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            logger.info("Converting features to dask array...")
            stacked_features = [
                da.from_delayed(lazy, dtype=float, shape=sample.shape)
                for lazy in stacked_features
            ]
            layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
            # stacked_features = dask.array.stack(stacked_features, axis=0).rechunk(layout)
            stacked_features = da.stack(stacked_features,
                                        axis=0).rechunk(layout)

            logger.info("Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks))

            # Note that dask_ml by default convert the output of .fit
            # in a concrete value.
            if purpose == "training":
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
            else:
                stacked_features = preprocessor.transform(stacked_features)

            atoms_index_map = [
                client.scatter(indices) for indices in atoms_index_map
            ]
            # stacked_features = [client.scatter(features) for features in stacked_features]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            logger.info("Stacking features using atoms index map...")

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))

                # features = self.stack_features(indices, stacked_features)

                scaled_feature_space.append(features)

        else:
            scaled_feature_space = []
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images
        feature_space = []

        if svm and purpose == "training":
            logger.info("Building array with reference space.")
            reference_space = []

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    restacked_atom = client.submit(
                        self.restack_atom, *(i, atom, scaled_feature_space))
                    reference_space.append(restacked_atom)

                feature_space.append(restacked)

            reference_space = client.gather(reference_space)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)
        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
Пример #8
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models. Only returned (as the
            second element of a tuple) when ``svm`` is truthy and ``purpose``
            is 'training', or when a cached SVM file is loaded.

        Raises
        ------
        NotImplementedError
            If ``self.preprocessor`` is not None; preprocessing is not
            supported by this featurizer.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))

        # FIXME the block below should become a function.
        # ``self.filename`` may be None (see the saving logic at the end of
        # this method), and os.path.isfile(None) raises TypeError, so guard it.
        if (self.filename is not None and os.path.isfile(self.filename)
                and self.overwrite is False):
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            # Keep the loaded content in its own name: the ``data`` argument
            # is still needed below when the cache cannot be used.
            cached = load(self.filename)

            cached_hashes = list(cached.keys())
            image_hashes = list(images.keys())

            if image_hashes == cached_hashes:
                # The file holds exactly the requested images.
                return cached
            elif image_hashes and all(key in cached for key in image_hashes):
                # The file holds a superset of the requested images; return
                # only the requested ones, in the order of ``images``.
                return {key: cached[key] for key in image_hashes}

            if svm_keys == cached_hashes:
                return cached[svm_keys[0]], cached[svm_keys[1]]

            # Otherwise the cache does not cover the request: recompute.

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        # we make the features
        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # Fail fast: there is no point in computing every feature only to
        # discover afterwards that they cannot be preprocessed.
        if self.preprocessor is not None:
            raise NotImplementedError

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        # This list is used to reconstruct images from atoms.
        atoms_symbols_map = []

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        for chunk in chunks:
            intermediate = []

            for image in OrderedDict(chunk).values():
                atoms_symbols_map.append(image.get_chemical_symbols())
                # Use .create() class method from dscribe.
                intermediate.append(dask.delayed(self.create)(image))

            stacked_features += client.compute(intermediate,
                                               scheduler=self.scheduler)
            del intermediate

        logger.info("")

        # Ship the per-image symbols and the computed features to the
        # workers, then stack the features image by image.
        atoms_symbols_map = [
            client.scatter(chunk) for chunk in atoms_symbols_map
        ]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        scaled_feature_space = [
            client.submit(self.stack_features,
                          *(symbols, image_index, stacked_features))
            for image_index, symbols in enumerate(atoms_symbols_map)
        ]
        scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images. The same call is issued for every combination of
        # ``svm`` and ``purpose``, so a single loop is enough.
        feature_space = [
            client.submit(self.restack_image,
                          *(i, image, scaled_feature_space, svm))
            for i, image in enumerate(images.items())
        ]
        feature_space = client.gather(feature_space)

        if svm and purpose == "training":
            # FIXME This might need to be improved
            logger.info("Building array with reference space.")
            # feature_space is a list of (hash, features) pairs; flatten the
            # features of all images into one list.
            reference_space = list(
                itertools.chain.from_iterable(
                    features for _, features in feature_space))
            logger.info("Finished reference space.")

        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        # Always expose the result on the instance, regardless of whether it
        # is also written to disk.
        self.feature_space = feature_space

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            self.reference_space = reference_space

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(
                    {
                        "feature_space": feature_space,
                        "reference_space": reference_space,
                    },
                    filename=self.filename,
                )

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(feature_space, filename=self.filename)

            return self.feature_space
        else:
            return self.feature_space