def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
    """Load a model

    Parameters
    ----------
    model : str
        The path to load the model from the .ml4c file for inference.
    params : str
        The path to load .params file with users' inputs.
    preprocessor : str
        The path to load the file with the sklearn preprocessor object.
    **kwargs
        Extra keyword arguments forwarded to ``Cls``. The keys
        ``ml4chem_path`` and ``preprocessor`` are always overwritten with
        ``model`` and ``preprocessor`` respectively.

    Returns
    -------
    calc : object
        ``Cls(fingerprints=..., model=..., **kwargs)``.
    """
    kwargs["ml4chem_path"] = model
    kwargs["preprocessor"] = preprocessor

    with open(params) as ml4chem_params:
        ml4chem_params = json.load(ml4chem_params)

    model_params = ml4chem_params["model"]
    model_type = model_params.get("type")

    # "name" and "type" only describe the model; they are not constructor
    # arguments, so they are removed before instantiation.
    del model_params["name"]
    del model_params["type"]

    if model_type == "svm":
        from ml4chem.models.kernelridge import KernelRidge

        # NOTE(review): this call is expected to resolve to the module-level
        # deserialization helper named ``load``, not to this function.
        weights = load(model)
        # TODO remove after de/serialization is fixed.
        # Keys may come back as bytes or already as str; decode only bytes so
        # that str keys no longer raise AttributeError.
        weights = {
            key.decode("utf-8") if isinstance(key, bytes) else key: value
            for key, value in weights.items()
        }
        model_params.update({"weights": weights})
        model = KernelRidge(**model_params)
    else:
        # Instantiate the model class
        from ml4chem.models.neuralnetwork import NeuralNetwork

        model = NeuralNetwork(**model_params)

    # Instantiation of fingerprint class
    fingerprint_params = ml4chem_params.get("fingerprints", None)

    if fingerprint_params is None:
        fingerprints = None
    else:
        name = fingerprint_params.get("name")
        del fingerprint_params["name"]

        fingerprints = dynamic_import(name, "ml4chem.fingerprints")
        fingerprints = fingerprints(**fingerprint_params)

    calc = Cls(fingerprints=fingerprints, model=model, **kwargs)

    return calc
def autoencode():
    """Check that recomputed latent features match the stored latent space.

    The latent vectors stored in ``cu_training.latent`` are flattened and
    compared with the ones produced by ``LatentFeatures.calculate`` for the
    images in ``cu_training.traj``; the two must agree within
    ``np.allclose`` tolerances.
    """
    # Reference values: latent vectors previously written to disk.
    stored_latent = load("cu_training.latent")
    latent_load = np.array(
        [vector for entries in stored_latent.values() for _, vector in entries]
    ).flatten()

    # Load the images with ASE
    trajectory = Trajectory("cu_training.traj")
    purpose = "training"

    data_handler = Data(trajectory, purpose=purpose)
    images, _energies = data_handler.get_data(purpose=purpose)

    # Arguments for fingerprinting the images
    gaussian_settings = {
        "cutoff": 6.5,
        "normalized": True,
        "preprocessor": ("MinMaxScaler", {"feature_range": (-1, 1)}),
        "save_preprocessor": "inference.scaler",
    }

    latent_features = LatentFeatures(
        features=("Gaussian", gaussian_settings),
        encoder={"model": "ml4chem.ml4c", "params": "ml4chem.params"},
        preprocessor=None,
        save_preprocessor="latent_space_min_max.scaler",
    )
    computed = latent_features.calculate(
        images, purpose=purpose, data=data_handler, svm=True
    )

    latent_svm = np.array(
        [vector for entries in computed.values() for _, vector in entries]
    ).flatten()

    assert np.allclose(latent_load, latent_svm)
def autoencode():
    """Print the stored latent space next to a freshly computed one.

    The content of ``cu_training.latent`` is printed first, followed by the
    result of ``LatentFeatures.calculate_features`` for the images in
    ``cu_training.traj``.
    """
    stored_latent = load("cu_training.latent")
    print("Latent space from file")
    print(stored_latent)

    # Load the images with ASE
    trajectory = Trajectory("cu_training.traj")
    purpose = "training"

    data_handler = DataSet(trajectory, purpose=purpose)
    images, _energies = data_handler.get_images(purpose=purpose)

    # Arguments for fingerprinting the images
    gaussian_settings = {
        "cutoff": 6.5,
        "normalized": True,
        "save_preprocessor": "inference.scaler",
    }

    latent_features = LatentFeatures(
        features=("Gaussian", gaussian_settings),
        encoder={"model": "model.ml4c", "params": "model.params"},
        preprocessor=("MinMaxScaler", {"feature_range": (-1, 1)}),
        save_preprocessor="latent_space_min_max.scaler",
    )
    computed = latent_features.calculate_features(
        images, purpose=purpose, data=data_handler, svm=False
    )

    print("Latent space from LatentFeatures class")
    print(computed)
def calculate(self, atoms, properties, system_changes):
    """Calculate things

    Features are computed for ``atoms`` with a copy of the loaded
    fingerprint object and, when ``"energy"`` is requested, the energy
    predicted by the loaded model is stored in ``self.results["energy"]``.

    Parameters
    ----------
    atoms : object, list
        List if images in ASE format.
    properties : list
        Names of the properties to compute. Only ``"energy"`` is handled
        here.
    system_changes : list
        Forwarded unchanged to ``Calculator.calculate``.

    Raises
    ------
    RuntimeError
        If the reference space needed by a kernel (svm) model cannot be
        loaded.
    """
    purpose = "inference"
    Calculator.calculate(self, atoms, properties, system_changes)
    model_name = self.model.name()

    # We convert the atoms in atomic fingerprints
    data_handler = DataSet([atoms], purpose=purpose)
    atoms = data_handler.get_data(purpose=purpose)

    # We copy the loaded fingerprint class
    fingerprints = copy.deepcopy(self.fingerprints)

    kwargs = {"data": data_handler, "purpose": purpose}
    is_svm_model = model_name in Potentials.svm_models

    if is_svm_model:
        kwargs.update({"svm": True})

    # Every feature class except LatentFeatures gets the calculator's
    # preprocessor before featurization.
    if fingerprints.name() != "LatentFeatures":
        fingerprints.preprocessor = self.preprocessor

    fingerprints = fingerprints.calculate_features(atoms, **kwargs)

    if "energy" in properties:
        logger.info("Computing energy...")

        if is_svm_model:
            try:
                reference_space = load(self.reference_space)
            except Exception as error:
                # The original raised a bare string, which is itself a
                # TypeError in Python 3; raise a real exception and keep
                # the underlying cause attached.
                raise RuntimeError("This is not a database...") from error
            energy = self.model.get_potential_energy(
                fingerprints, reference_space
            )
        else:
            input_dimension = len(list(fingerprints.values())[0][0][-1])
            model = copy.deepcopy(self.model)
            model.prepare_model(
                input_dimension, data=data_handler, purpose=purpose
            )

            # Read the checkpoint once; it is reused by the non-strict
            # fallback below.
            state_dict = torch.load(self.ml4chem_path)
            try:
                model.load_state_dict(state_dict, strict=True)
            except RuntimeError:
                logger.warning(
                    'Your image does not have some atoms present in the loaded model.\n'
                )
                model.load_state_dict(state_dict, strict=False)

            model.eval()
            energy = model(fingerprints).item()

        # Populate ASE's self.results dict
        self.results["energy"] = energy
def calculate_features(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the DataSet class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models. Only returned (as the
        second element of a tuple) when ``svm`` is true during training, or
        when a cached file holding both spaces is loaded.
    """

    logger.info(" ")
    logger.info("Fingerprinting")
    logger.info("==============")

    # FIXME the block below should become a function.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        # Keep the cached content in its own name: ``data`` is the data
        # handler and is still needed if the cache cannot be used.
        cached = load(self.filename)

        cached_hashes = list(cached.keys())
        image_hashes = list(images.keys())

        if image_hashes == cached_hashes:
            # Check if both lists are the same.
            return cached
        elif any(i in image_hashes for i in cached_hashes):
            # At least one requested image is cached.
            # NOTE(review): a requested hash missing from the file raises
            # KeyError here, exactly as before.
            return {image_hash: cached[image_hash] for image_hash in image_hashes}

        if svm_keys == cached_hashes:
            feature_space = cached[svm_keys[0]]
            reference_space = cached[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info("Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info("Unique chemical elements: {}".format(unique_element_symbols))
    elif isinstance(data.unique_element_symbols, dict):
        # The handler already knows the symbols; without this branch the
        # name below would be unbound.
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info("Unique chemical elements: {}".format(unique_element_symbols))

    # we make the features
    self.GP = self.custom.get("GP", None)

    if self.GP is None:
        custom = self.custom.get("user_input", None)
        self.GP = self.make_symmetry_functions(
            unique_element_symbols, custom=custom, angular_type=self.angular_type
        )
        self.custom.update({"GP": self.GP})
    else:
        logger.info('Using parameters from file to create symmetry functions...\n')

    # NOTE(review): assumed to run for both branches above — confirm.
    self.print_fingerprint_params(self.GP)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic fingerprints.
    logger.info("")
    logger.info("Adding atomic feature calculations to scheduler...")

    ini = end = 0

    computations = []
    atoms_index_map = []  # This list is used to reconstruct images from atoms.

    for image in images.items():
        key, image = image
        end = ini + len(image)
        atoms_index_map.append(list(range(ini, end)))
        ini = end
        for atom in image:
            index = atom.index
            symbol = atom.symbol
            nl = get_neighborlist(image, cutoff=self.cutoff)
            # n_indices: neighbor indices for central atom_i.
            # n_offsets: neighbor offsets for central atom_i.
            n_indices, n_offsets = nl[atom.index]

            n_symbols = np.array(image.get_chemical_symbols())[n_indices]
            neighborpositions = image.positions[n_indices] + np.dot(
                n_offsets, image.get_cell()
            )

            afp = self.get_atomic_fingerprint(
                atom,
                index,
                symbol,
                n_symbols,
                neighborpositions,
                self.preprocessor,
                image_molecule=image,
                weighted=self.weighted,
                n_indices=n_indices,
            )

            computations.append(afp)

    scheduler_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info(
        "... finished in {} hours {} minutes {:.2f}" " seconds.".format(h, m, s)
    )

    # In this block we compute the fingerprints.
    logger.info("")
    logger.info("Computing fingerprints...")

    stacked_features = dask.compute(*computations, scheduler=self.scheduler)

    if self.preprocessor is not None:
        stacked_features = np.array(stacked_features)

    # Clean
    del computations

    if purpose == "training":
        # To take advantage of dask_ml we need to convert our numpy array
        # into a dask array.
        client = dask.distributed.get_client()

        if self.preprocessor is not None:
            scaled_feature_space = []
            dim = stacked_features.shape
            stacked_features = dask.array.from_array(stacked_features, chunks=dim)
            stacked_features = preprocessor.fit(
                stacked_features, scheduler=self.scheduler
            )
            atoms_index_map = [client.scatter(chunk) for chunk in atoms_index_map]

            for indices in atoms_index_map:
                features = client.submit(
                    self.stack_features, *(indices, stacked_features)
                )
                scaled_feature_space.append(features)

        # More data processing depending on the method used.
        else:
            feature_space = []
            atoms_index_map = [client.scatter(chunk) for chunk in atoms_index_map]

            for indices in atoms_index_map:
                features = client.submit(
                    self.stack_features, *(indices, stacked_features)
                )
                feature_space.append(features)

        del stacked_features
        computations = []

        if svm:
            # NOTE(review): this branch needs scaled_feature_space, which
            # only exists when a preprocessor is set (unchanged behavior).
            reference_space = []

            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(
                        i, image, scaled_feature_space=scaled_feature_space, svm=svm
                    )
                )

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    reference_space.append(
                        self.restack_atom(i, atom, scaled_feature_space)
                    )

            reference_space = dask.compute(
                *reference_space, scheduler=self.scheduler
            )
        elif self.preprocessor is not None:
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(
                        i,
                        image,
                        scaled_feature_space=scaled_feature_space,
                        svm=svm,
                    )
                )
        else:
            # scaled_feature_space does not exist without a preprocessor.
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(
                        i, image, feature_space=feature_space, svm=svm
                    )
                )

        feature_space = dask.compute(*computations, scheduler=self.scheduler)
        feature_space = OrderedDict(feature_space)

        del computations

        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info(
            "Fingerprinting finished in {} hours {} minutes {:.2f}"
            " seconds.".format(h, m, s)
        )

        if svm:
            if self.filename is not None:
                logger.info("Fingerprints saved to {}.".format(self.filename))
                payload = {"feature_space": feature_space}
                payload.update({"reference_space": reference_space})
                dump(payload, filename=self.filename)
            return feature_space, reference_space
        else:
            if self.filename is not None:
                logger.info("Fingerprints saved to {}.".format(self.filename))
                dump(feature_space, filename=self.filename)
            return feature_space

    elif purpose == "inference":
        feature_space = OrderedDict()
        scaled_feature_space = preprocessor.transform(stacked_features)

        # Rows of scaled_feature_space follow the order in which atoms were
        # scheduled above (image by image), so a running row counter is
        # needed; the per-image atom index is only right for the first
        # image.
        row = 0

        # TODO this has to be parallelized.
        for key, image in images.items():
            if key not in feature_space.keys():
                feature_space[key] = []
            for atom in image:
                symbol = atom.symbol

                if svm:
                    scaled = scaled_feature_space[row]
                    # TODO change this to something more elegant later
                    if not hasattr(self, "reference_space"):
                        # If self.reference does not exist it means that
                        # reference_space is being loaded by Messagepack.
                        symbol = symbol.encode("utf-8")
                else:
                    scaled = torch.tensor(
                        scaled_feature_space[row],
                        requires_grad=False,
                        dtype=torch.float,
                    )

                feature_space[key].append((symbol, scaled))
                row += 1

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info(
            "Fingerprinting finished in {} hours {} minutes {:.2f}"
            " seconds.".format(h, m, s)
        )

        return feature_space
def plot_atomic_features(
    latent_space,
    method="PCA",
    dimensions=2,
    backend="seaborn",
    data_only=False,
    preprocessor=None,
    backend_kwargs=None,
    **kwargs,
):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

    $ ml4chem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features of path to database file.
    method : str, optional
        Dimensionality reduction method to employed, by default "PCA".
        Supported are: "PCA" and "TSNE".
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2.
    backend : str, optional
        Select the backend to plot features. Supported are "plotly" and
        "seaborn", by default "seaborn". The seaborn backend is 2D only.
    data_only : bool
        If set to True, nothing is shown and only ``(df, dim_reduction)``
        is returned.
    preprocessor : obj
        One of the preprocessors supported by sklearn e.g.:
        StandardScaler(), Normalizer().
    backend_kwargs : dict
        Dictionary with extra keyword arguments passed to the sklearn
        estimator (``PCA`` or ``TSNE``). For more information see:

        - https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
    **kwargs
        ``dot_size`` (default 2) sets the marker size of plotly figures.

    Returns
    -------
    tuple
        ``(df, dim_reduction)`` when ``data_only`` is true, otherwise
        ``(plt, df, dim_reduction)`` where ``plt`` is the plotly figure or
        the ``matplotlib.pyplot`` module (``None`` for an unknown backend).

    Raises
    ------
    NotImplementedError
        For an unsupported ``method``, for ``dimensions > 3`` and for 3D
        plots with the seaborn backend.
    """
    if backend_kwargs is None:
        backend_kwargs = {}

    method = method.lower()
    backend = backend.lower()
    dot_size = kwargs.get("dot_size", 2)

    # Validate everything up front so that no expensive fit is performed
    # for a request that cannot be plotted.
    if method not in ("pca", "tsne"):
        raise NotImplementedError
    if dimensions > 3:
        raise NotImplementedError
    if dimensions == 3 and backend == "seaborn":
        raise NotImplementedError("This backend is for 2D visualization")

    figure = None
    if backend == "seaborn":
        # This hack is needed because it seems plotly import overwrite
        # everything.
        import matplotlib.pyplot as plt

        figure = plt

    axis = ["x", "y"] if dimensions == 2 else ["x", "y", "z"]

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space:
        latent_space = latent_space[b"feature_space"]

    full_ls = []
    full_symbols = []

    for feature_space in latent_space.values():
        for symbol, feature_vector in feature_space:
            if isinstance(symbol, bytes):
                symbol = symbol.decode("utf-8")

            if not isinstance(feature_vector, np.ndarray):
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    # Both methods share everything but the estimator and the axis labels.
    if method == "pca":
        from sklearn.decomposition import PCA

        label_prefix = "PCA"
        dim_reduction = PCA(n_components=dimensions, **backend_kwargs)
    else:
        from sklearn import manifold

        label_prefix = "t-SNE"
        dim_reduction = manifold.TSNE(n_components=dimensions, **backend_kwargs)

    labels = {
        name: "{}-{}".format(label_prefix, i + 1) for i, name in enumerate(axis)
    }

    if preprocessor is not None:
        logger.info(
            f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
        )
        dim_reduction = make_pipeline(preprocessor, dim_reduction)

    reduced = dim_reduction.fit_transform(full_ls)

    to_pandas = [
        [full_symbols[i], *(element[d] for d in range(dimensions))]
        for i, element in enumerate(reduced)
    ]

    columns = ["Symbol"] + [labels[key] for key in axis]
    args = {key: labels[key] for key in axis}

    df = pd.DataFrame(to_pandas, columns=columns)

    if backend == "plotly" and dimensions in (2, 3):
        args["color"] = "Symbol"
        scatter = px.scatter_3d if dimensions == 3 else px.scatter
        figure = scatter(df, **args)
        figure.update_traces(marker=dict(size=dot_size))
    elif backend == "seaborn" and dimensions == 2:
        sns.scatterplot(**labels, data=df, hue="Symbol")

    if data_only:
        return df, dim_reduction

    if figure is not None:
        # Showing the figure is best effort (e.g. headless sessions).
        try:
            figure.show()
        except Exception:
            logger.debug("Could not display the figure.", exc_info=True)

    return figure, df, dim_reduction
def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
    """Load ML4Chem models

    Parameters
    ----------
    model : str
        The path to load the model from the .ml4c file for inference.
    params : str
        The path to load .params file with users' inputs.
    preprocessor : str
        The path to load the file with the sklearn preprocessor object.
    **kwargs
        Extra keyword arguments forwarded to ``Cls``. The keys
        ``ml4chem_path`` and ``preprocessor`` are always overwritten with
        ``model`` and ``preprocessor`` respectively.

    Returns
    -------
    calc : object
        ``Cls(features=..., model=..., **kwargs)``.
    """
    kwargs["ml4chem_path"] = model
    kwargs["preprocessor"] = preprocessor

    with open(params, "rb") as ml4chem_params:
        ml4chem_params = json.load(ml4chem_params)

    model_params = ml4chem_params["model"]
    model_type = model_params.get("type")

    class_name = model_params["class_name"]
    module_name = Potentials.module_names[model_params["name"]]

    model_class = dynamic_import(
        class_name, "ml4chem.atomistic.models", alt_name=module_name
    )

    # delete unneeded (key, value) pairs: they describe the model but are
    # not constructor arguments.
    for param in ("name", "type", "class_name"):
        del model_params[param]

    if model_type == "svm":
        # NOTE(review): this call is expected to resolve to the module-level
        # deserialization helper named ``load``, not to this function.
        weights = load(model)
        # TODO remove after de/serialization is fixed.
        # Decode key by key so that a mix of bytes and str keys is handled
        # too, instead of giving up on decoding at the first str key.
        weights = {
            key.decode("utf-8") if isinstance(key, bytes) else key: value
            for key, value in weights.items()
        }
        model_params.update({"weights": weights})

    # Instantiate the model class
    model = model_class(**model_params)

    # Instantiation of fingerprint class
    fingerprint_params = ml4chem_params.get("features", None)

    if fingerprint_params is None:
        features = None
    else:
        # Nested "kwargs" are flattened into the constructor arguments.
        if "kwargs" in fingerprint_params:
            update_dict_with = fingerprint_params.pop("kwargs")
            fingerprint_params.update(update_dict_with)

        name = fingerprint_params.get("name")
        del fingerprint_params["name"]

        features = dynamic_import(name, "ml4chem.atomistic.features")
        features = features(**fingerprint_params)

    calc = Cls(features=features, model=model, **kwargs)

    return calc
def calculate_features(self, images=None, purpose="training", data=None, svm=False):
    """Return features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the DataSet class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}

        When features are loaded from an existing file that holds both a
        feature space and a reference space, the tuple
        ``(feature_space, reference_space)`` is returned instead.
    """

    logger.info(" ")
    logger.info("Fingerprinting")
    logger.info("==============")

    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        cached = load(self.filename)

        if svm_keys == list(cached.keys()):
            feature_space = cached[svm_keys[0]]
            reference_space = cached[svm_keys[1]]
            return feature_space, reference_space
        else:
            return cached

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info("Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info("Unique chemical elements: {}".format(unique_element_symbols))

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations with delayed functions to operate
    # with dask's scheduler. These computations get cartesian coordinates.
    computations = []

    for image in images.items():
        key, image = image

        feature_vectors = []

        computations.append(feature_vectors)

        for atom in image:
            if self.preprocessor is not None:
                # In this case we will preprocess data and need numpy
                # arrays to operate with sklearn.
                afp = self.get_atomic_features(atom, svm=True)
                feature_vectors.append(afp[1])
            else:
                afp = self.get_atomic_features(atom, svm=svm)
                feature_vectors.append(afp)

    # In this block we compute the delayed functions in computations.
    feature_space = dask.compute(*computations, scheduler=self.scheduler)

    hashes = list(images.keys())

    if self.preprocessor is not None and purpose == "training":
        feature_space = np.array(feature_space)
        dim = feature_space.shape

        if len(dim) > 1:
            d1, d2, d3 = dim
            feature_space = feature_space.reshape(d1 * d2, d3)
            feature_space = preprocessor.fit(
                feature_space, scheduler=self.scheduler
            )
            feature_space = feature_space.reshape(d1, d2, d3)
        else:
            # NOTE(review): in this branch the atoms are only stacked; the
            # preprocessor is not fitted on them — confirm this is intended.
            atoms_index_map = []
            stack = []

            d1 = ini = end = 0

            for i in feature_space:
                end = ini + len(i)
                atoms_map = list(range(ini, end))
                atoms_index_map.append(atoms_map)
                ini = end

                for j in i:
                    stack.append(j)
                    d1 += 1

            feature_space = np.array(stack)

            d2 = len(stack[0])
            del stack

        # More data processing depending on the method used.
        computations = []

        if svm:
            # NOTE(review): reference_space is computed here but neither
            # returned nor written to file.
            reference_space = []

            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(i, image, feature_space, svm=svm)
                )

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    reference_space.append(
                        self.restack_atom(i, atom, feature_space)
                    )

            reference_space = dask.compute(
                *reference_space, scheduler=self.scheduler
            )
        else:
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(i, image, feature_space, svm=svm)
                )

        feature_space = dask.compute(*computations, scheduler=self.scheduler)

        feature_space = OrderedDict(feature_space)

        # Save preprocessor.
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

    elif self.preprocessor is not None and purpose == "inference":
        # We take stacked features and preprocess them
        stacked_features = np.array(feature_space)
        d1, d2, d3 = stacked_features.shape
        stacked_features = stacked_features.reshape(d1 * d2, d3)
        feature_space = OrderedDict()
        scaled_feature_space = preprocessor.transform(stacked_features)

        # Once preprocessed, they are wrapped as a dictionary.
        # The reshape above stacks the atoms image after image, so the row
        # of an atom is a running counter across images; the per-image atom
        # index is only correct for the first image.
        row = 0

        # TODO this has to be parallelized.
        for key, image in images.items():
            if key not in feature_space.keys():
                feature_space[key] = []
            for atom in image:
                symbol = atom.symbol

                if svm:
                    scaled = scaled_feature_space[row]
                    # TODO change this to something more elegant later
                    if not hasattr(self, "reference_space"):
                        # If self.reference does not exist it means that
                        # reference_space is being loaded by Messagepack.
                        symbol = symbol.encode("utf-8")
                else:
                    scaled = torch.tensor(
                        scaled_feature_space[row],
                        requires_grad=False,
                        dtype=torch.float,
                    )

                feature_space[key].append((symbol, scaled))
                row += 1
    else:
        feature_space = OrderedDict(zip(hashes, feature_space))

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)

    logger.info(
        "Fingerprinting finished in {} hours {} minutes {:.2f} "
        "seconds.\n".format(h, m, s)
    )

    if svm:
        payload = {"feature_space": feature_space}
        dump(payload, filename=self.filename)
    else:
        dump(feature_space, filename=self.filename)

    return feature_space
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models. Only returned (as the
        second element of a tuple) when ``svm`` is true during training, or
        when a cached file holding both spaces is loaded.
    """

    client = dask.distributed.get_client()
    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info("Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S")))
    logger.info(f"Module name: {self.name()}.")

    # FIXME the block below should become a function.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning(f"Loading features from {self.filename}.")
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        # Keep the cached content in its own name: ``data`` is the data
        # handler and is still needed if the cache cannot be used.
        cached = load(self.filename)

        cached_hashes = list(cached.keys())
        image_hashes = list(images.keys())

        if image_hashes == cached_hashes:
            # Check if both lists are the same.
            return cached
        elif any(i in image_hashes for i in cached_hashes):
            # At least one requested image is cached.
            # NOTE(review): a requested hash missing from the file raises
            # KeyError here, exactly as before.
            return {image_hash: cached[image_hash] for image_hash in image_hashes}

        if svm_keys == cached_hashes:
            feature_space = cached[svm_keys[0]]
            reference_space = cached[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(f"Getting unique element symbols for {purpose}")

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )

        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    # we make the features
    self.GP = self.custom.get("GP", None)

    if self.GP is None:
        custom = self.custom.get("user_input", None)
        self.GP = self.make_symmetry_functions(
            unique_element_symbols, custom=custom, angular_type=self.angular_type
        )
        self.custom.update({"GP": self.GP})
    else:
        logger.info("Using parameters from file to create symmetry functions...\n")

    # NOTE(review): assumed to run for both branches above — confirm.
    self.print_features_params(self.GP)

    # The number of symmetry functions of the first element fixes the
    # length of every atomic feature vector.
    symbol = data.unique_element_symbols[purpose][0]
    sample = np.zeros(len(self.GP[symbol]))

    self.dimension = len(sample)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info("Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    atoms_index_map = []  # This list is used to reconstruct images from atoms.

    if self.batch_size is None:
        self.batch_size = data.get_total_number_atoms()

    chunks = get_chunks(images, self.batch_size, svm=svm)

    cutoff_keys = ["radial", "angular"]

    ini = end = 0
    for chunk in chunks:
        images_ = OrderedDict(chunk)
        intermediate = []

        for image in images_.items():
            _, image = image
            end = ini + len(image)
            atoms_index_map.append(list(range(ini, end)))
            ini = end
            for atom in image:
                index = atom.index
                symbol = atom.symbol

                n_symbols, neighborpositions = {}, {}

                for cutoff_key in cutoff_keys:
                    # A dict carries one cutoff per key; a scalar cutoff is
                    # shared by the radial and the angular part.
                    if isinstance(self.cutoff, dict):
                        cutoff = self.cutoff[cutoff_key]
                    else:
                        cutoff = self.cutoff

                    nl = get_neighborlist(image, cutoff=cutoff)
                    # n_indices: neighbor indices for central atom_i.
                    # n_offsets: neighbor offsets for central atom_i.
                    n_indices, n_offsets = nl[atom.index]

                    n_symbols_ = np.array(image.get_chemical_symbols())[n_indices]
                    n_symbols[cutoff_key] = n_symbols_

                    neighborpositions_ = image.positions[n_indices] + np.dot(
                        n_offsets, image.get_cell()
                    )
                    neighborpositions[cutoff_key] = neighborpositions_

                # n_indices is the one of the last cutoff key ("angular").
                afp = self.get_atomic_features(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                intermediate.append(afp)

        intermediate = client.persist(intermediate, scheduler=self.scheduler)
        stacked_features += intermediate
        del intermediate

    scheduler_time = time.time() - initial_time

    dask.distributed.wait(stacked_features)

    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info(
        "... finished in {} hours {} minutes {:.2f}" " seconds.".format(h, m, s)
    )

    logger.info("")

    if self.preprocessor is not None:
        scaled_feature_space = []

        # To take advantage of dask_ml we need to convert our numpy array
        # into a dask array.
        logger.info("Converting features to dask array...")
        stacked_features = [
            da.from_delayed(lazy, dtype=float, shape=sample.shape)
            for lazy in stacked_features
        ]
        # One chunk per image along the atom axis, a single chunk along the
        # feature axis.
        layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
        stacked_features = da.stack(stacked_features, axis=0).rechunk(layout)

        logger.info(
            "Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks
            )
        )

        # Note that dask_ml by default convert the output of .fit
        # in a concrete value.
        if purpose == "training":
            stacked_features = preprocessor.fit(
                stacked_features, scheduler=self.scheduler
            )
        else:
            stacked_features = preprocessor.transform(stacked_features)

        atoms_index_map = [client.scatter(indices) for indices in atoms_index_map]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        logger.info("Stacking features using atoms index map...")

        for indices in atoms_index_map:
            features = client.submit(
                self.stack_features, *(indices, stacked_features)
            )
            scaled_feature_space.append(features)

    else:
        scaled_feature_space = []
        atoms_index_map = [client.scatter(chunk) for chunk in atoms_index_map]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        for indices in atoms_index_map:
            features = client.submit(
                self.stack_features, *(indices, stacked_features)
            )
            scaled_feature_space.append(features)

    scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images
    feature_space = []

    if svm and purpose == "training":
        logger.info("Building array with reference space.")
        reference_space = []

        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )

            # image = (hash, ase_image) -> tuple
            for atom in image[1]:
                restacked_atom = client.submit(
                    self.restack_atom, *(i, atom, scaled_feature_space)
                )
                reference_space.append(restacked_atom)

            feature_space.append(restacked)

        reference_space = client.gather(reference_space)

    else:
        # scaled_feature_space is always bound at this point, so a single
        # loop covers training without svm as well as inference.
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )
            feature_space.append(restacked)

    feature_space = client.gather(feature_space)
    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)

    logger.info(
        "Featurization finished in {} hours {} minutes {:.2f}"
        " seconds.".format(h, m, s)
    )

    if svm and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            payload = {"feature_space": feature_space}
            payload.update({"reference_space": reference_space})
            dump(payload, filename=self.filename)

        self.feature_space = feature_space
        self.reference_space = reference_space

        return self.feature_space, self.reference_space

    elif svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            dump(feature_space, filename=self.filename)

        self.feature_space = feature_space

        return self.feature_space
    else:
        self.feature_space = feature_space
        return self.feature_space
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms objects

    Features already stored in ``self.filename`` are reused when
    ``self.overwrite`` is False and the stored content matches the requested
    images; otherwise they are computed with dask and, for training,
    written to ``self.filename``.

    Parameters
    ----------
    images : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector])]}
    reference_space : list
        A reference space useful for SVM models. Only returned (as the
        second element of a tuple) when ``svm`` is truthy and ``purpose``
        is 'training', or when a stored SVM database is loaded.

    Raises
    ------
    NotImplementedError
        If ``self.preprocessor`` is not None.
    """
    client = dask.distributed.get_client()

    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info(
        "Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S"))
    )

    # FIXME the block below should become a function.
    # ``self.filename`` may be None (see the checks before dumping below), and
    # os.path.isfile(None) raises TypeError, so guard it explicitly.
    if (
        self.filename is not None
        and os.path.isfile(self.filename)
        and self.overwrite is False
    ):
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]

        # Kept under its own name: rebinding ``data`` here would break the
        # recomputation below whenever the stored file turns out unusable.
        cached = load(self.filename)
        cached_keys = list(cached.keys())
        known_keys = set(cached_keys)
        image_hashes = list(images.keys())

        if image_hashes == cached_keys:
            # The stored features are exactly the requested ones.
            return cached

        if image_hashes and all(key in known_keys for key in image_hashes):
            # The requested images are a subset of the stored ones. A partial
            # overlap is not enough: missing hashes have to be recomputed.
            return {key: cached[key] for key in image_hashes}

        if cached_keys == svm_keys:
            # Database written by a previous SVM training run.
            return cached[svm_keys[0]], cached[svm_keys[1]]

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info("Getting unique element symbols for {}".format(purpose))
        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )
        unique_element_symbols = unique_element_symbols[purpose]
        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols)
        )
    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]
        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols)
        )

    # we make the features
    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info("Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    # This list is used to reconstruct images from atoms.
    atoms_symbols_map = []

    if self.batch_size is None:
        self.batch_size = data.get_total_number_atoms()

    for chunk in get_chunks(images, self.batch_size, svm=svm):
        intermediate = []
        for image in OrderedDict(chunk).values():
            atoms_symbols_map.append(image.get_chemical_symbols())
            # Use .create() class method from dscribe.
            intermediate.append(dask.delayed(self.create)(image))
        stacked_features += client.compute(
            intermediate, scheduler=self.scheduler
        )

    logger.info("")

    if self.preprocessor is not None:
        raise NotImplementedError

    atoms_symbols_map = [client.scatter(chunk) for chunk in atoms_symbols_map]
    stacked_features = client.scatter(stacked_features, broadcast=True)

    scaled_feature_space = [
        client.submit(
            self.stack_features, *(symbols, image_index, stacked_features)
        )
        for image_index, symbols in enumerate(atoms_symbols_map)
    ]
    scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images. The same task is submitted for every purpose and
    # ``scaled_feature_space`` is always bound at this point, so one loop
    # replaces the former identical branches and their unreachable fallback.
    feature_space = [
        client.submit(
            self.restack_image, *(i, image, scaled_feature_space, svm)
        )
        for i, image in enumerate(images.items())
    ]
    feature_space = client.gather(feature_space)

    svm_training = bool(svm and purpose == "training")

    if svm_training:
        # FIXME This might need to be improved
        logger.info("Building array with reference space.")
        # Each gathered item is a (hash, atomic_features) pair; iterating the
        # pairs directly also works when there are no images at all.
        reference_space = list(
            itertools.chain.from_iterable(
                atomic_features for _, atomic_features in feature_space
            )
        )
        logger.info("Finished reference space.")

    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(fp_time)
    logger.info(
        "Featurization finished in {} hours {} minutes {:.2f}"
        " seconds.".format(h, m, s)
    )

    if svm_training:
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            dump(
                {
                    "feature_space": feature_space,
                    "reference_space": reference_space,
                },
                filename=self.filename,
            )

        # Always publish the results on the instance, whether or not they
        # were written to disk, so the return below cannot hit stale state.
        self.feature_space = feature_space
        self.reference_space = reference_space
        return self.feature_space, self.reference_space

    if svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            dump(feature_space, filename=self.filename)

    self.feature_space = feature_space
    return self.feature_space
def plot_atomic_features(latent_space, method="PCA", dimensions=2):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

    $ mlchem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features or path to database file.
    method : str, optional
        Dimensionality reduction method to employ, by default "PCA".
        Supported are: "PCA" and "TSNE" (case insensitive).
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2. Only the first two components are plotted.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported methods.
    """
    requested_method = method
    method = method.lower()

    # Fail before loading or reducing anything; an unknown method used to
    # fall through silently and show an empty figure.
    if method not in ("pca", "tsne"):
        raise ValueError(
            "Unsupported method {!r}; supported are 'PCA' and 'TSNE'.".format(
                requested_method
            )
        )

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    # This is needed if you are passing an atomic feature database. Keys are
    # bytes when the database was loaded from file and str when the
    # dictionary was built in memory.
    for database_key in (b"feature_space", "feature_space"):
        if database_key in latent_space:
            latent_space = latent_space[database_key]
            break

    full_ls = []
    full_symbols = []

    for feature_space in latent_space.values():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                # Already a str.
                pass

            if not isinstance(feature_vector, np.ndarray):
                # Anything that is not an ndarray is converted through its
                # ``.numpy()`` method.
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    if method == "pca":
        from sklearn.decomposition import PCA

        prefix = "PCA"
        reducer = PCA(n_components=dimensions)
    else:
        from sklearn import manifold

        prefix = "t-SNE"
        reducer = manifold.TSNE(n_components=dimensions)

    x_label = "{}-1".format(prefix)
    y_label = "{}-2".format(prefix)

    reduced = reducer.fit_transform(full_ls)
    rows = [
        [symbol, point[0], point[1]]
        for symbol, point in zip(full_symbols, reduced)
    ]
    df = pd.DataFrame(rows, columns=["Symbol", x_label, y_label])

    sns.scatterplot(x=x_label, y=y_label, data=df, hue="Symbol")
    plt.show()