def load(self, file_path, verbose=True):
    """Load internal parameters for each element.

    Only the root MPI process loads parameters.

    Args:
        file_path (~pathlib.Path): File path to load parameters from.
        verbose (bool, optional): Print log to stdout.
    """
    if MPI.rank == 0:
        # ``allow_pickle=True`` was added for issue-265:
        # it enables loading with numpy 1.16.3 or later.
        ndarray = np.load(file_path, allow_pickle=True)
        self._elements = ndarray['elements'].item()
        self._n_components = ndarray['n_components'].item()
        self._mean = {
            element: ndarray[f'mean:{element}']
            for element in self._elements
            }
        self._transform = {
            element: ndarray[f'transform:{element}']
            for element in self._elements
            }
        if verbose:
            pprint(f'Loaded PCA parameters from {file_path}.')
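
# A self-contained round-trip sketch (not hdnnpy library code) of the .npz
# layout the loader above appears to expect: ``elements`` stored as a 0-d
# object array (which is why ``allow_pickle=True`` is needed on numpy
# >= 1.16.3), plus one ``mean:<element>`` / ``transform:<element>`` array
# per element. Element symbols and array shapes are illustrative assumptions.
import io

import numpy as np

elements = np.empty((), dtype=object)
elements[()] = ['Si', 'O']                    # .item() gives back this list
arrays = {f'mean:{e}': np.zeros(10) for e in elements.item()}
arrays.update({f'transform:{e}': np.eye(10, 5) for e in elements.item()})

buffer = io.BytesIO()
np.savez(buffer, elements=elements, n_components=5, **arrays)
buffer.seek(0)

loaded = np.load(buffer, allow_pickle=True)   # object dtype requires pickle
assert loaded['elements'].item() == ['Si', 'O']
assert loaded['n_components'].item() == 5
assert loaded['transform:Si'].shape == (10, 5)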
def dump_for_lammps(self, preprocesses, master_nnp):
    dc = self.dataset_config
    potential_file = self.load_dir / 'lammps.nnp'
    with potential_file.open('w') as f:
        # information
        now = datetime.datetime.now()
        machine = socket.gethostname()
        pprint(f'''
        # Created by hdnnpy {__version__} ({now}).
        # All parameters are read from [{machine}] {self.load_dir}.
        # Ref: https://github.com/ogura-edu/HDNNP
        ''', stream=f)

        # descriptor
        pprint(f'''
        # {dc.descriptor} parameters
        {len(dc.parameters)}
        ''', stream=f)
        for name, params in dc.parameters.items():
            params_str = ('\n' + ' ' * 16).join(
                [' '.join(map(str, row)) for row in params])
            pprint(f'''
            {name} {len(params)}
                {params_str}
            ''', stream=f)

        # preprocess
        pprint(f'''
        # pre-processing parameters
        {len(preprocesses)}
        ''', stream=f)
        for preprocess in preprocesses:
            pprint(f'''
            {preprocess.name}
            {textwrap.indent(
                textwrap.dedent(preprocess.dump_params()),
                ' '*16)}
            ''', stream=f)

        # model
        pprint(f'''
        # neural network parameters
        {len(master_nnp[0])}
        {textwrap.indent(
            textwrap.dedent(master_nnp.dump_params()),
            ' '*12)}
        ''', stream=f)
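
# The ``pprint(..., stream=f)`` calls above rely on a project helper that
# dedents the triple-quoted blocks before writing them, which is why the
# indented f-strings still produce a flat ``lammps.nnp`` file. Below is a
# minimal compatible sketch; it is an assumption about the helper's
# behaviour, not the actual hdnnpy implementation.
import textwrap


def pprint(data='', stream=None, flush=True, **options):
    """Dedent a (possibly multi-line) string and print it to ``stream``."""
    print(textwrap.dedent(data).strip(), file=stream, flush=flush, **options)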
def load(self, file_path, verbose=True, remake=False):
    """Load dataset from a .npz format file.

    Only the root MPI process loads the dataset.

    It validates the following compatibility between the loaded dataset
    and the atomic structures given at initialization:

    * length of data
    * elemental composition
    * elements
    * tag

    It also validates that the loaded dataset satisfies this requirement:

    * order

    Args:
        file_path (~pathlib.Path): File path to load dataset from.
        verbose (bool, optional): Print log to stdout.
        remake (bool, optional): If the loaded dataset lacks any
            property, recalculate the dataset from scratch and
            overwrite ``file_path``. Otherwise, raise ValueError.

    Raises:
        AssertionError: If the loaded dataset is incompatible with the
            atomic structures given at initialization.
        ValueError: If the loaded dataset lacks any property and
            ``remake=False``.
    """
    # validate compatibility between my structures and loaded dataset
    ndarray = np.load(file_path)
    assert list(ndarray['elemental_composition']) \
        == self._elemental_composition
    assert list(ndarray['elements']) == self._elements
    assert ndarray['tag'].item() == self._tag
    assert len(ndarray[self._properties[0]]) == len(self)

    # validate lacking properties
    lacking_properties = set(self._properties) - set(ndarray)
    if lacking_properties:
        if verbose:
            lacking = ('\n' + ' ' * 20).join(sorted(lacking_properties))
            pprint(f'''
            The following properties are missing in {file_path}.
                {lacking}
            ''')
        if remake:
            if verbose:
                pprint('Start to recalculate dataset from scratch.')
            self.make(verbose=verbose)
            self.save(file_path, verbose=verbose)
            return
        else:
            raise ValueError('Please recalculate dataset from scratch.')

    # load dataset as much as needed (up to ``self._order``)
    if MPI.rank == 0:
        for i in range(self._order + 1):
            self._dataset.append(ndarray[self._properties[i]])

    if verbose:
        pprint(f'Successfully loaded & made needed {self.name} dataset'
               f' from {file_path}')
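
# A usage sketch of the load/remake contract above. The registry key and
# constructor arguments are illustrative assumptions following the pattern
# used in ``construct_datasets`` below:
#
#     descriptor = DESCRIPTOR_DATASET['symmetry_function'](1, structures)
#     try:
#         descriptor.load(Path('symmetry_function.npz'), remake=False)
#     except ValueError:
#         # the cached file lacks a required property (e.g. first-order
#         # derivatives); rebuild it from scratch and overwrite the cache
#         descriptor.load(Path('symmetry_function.npz'), remake=True)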
def construct_datasets(self, tag_xyz_map):
    dc = self.dataset_config
    mc = self.model_config
    tc = self.training_config

    # prepare preprocesses, loading fitted parameters when resuming
    preprocess_dir = tc.out_dir / 'preprocess'
    preprocess_dir.mkdir(parents=True, exist_ok=True)
    preprocesses = []
    for (name, args, kwargs) in dc.preprocesses:
        preprocess = PREPROCESS[name](*args, **kwargs)
        if self.is_resume:
            preprocess.load(
                preprocess_dir / f'{name}.npz', verbose=self.verbose)
        preprocesses.append(preprocess)

    datasets = []
    for pattern in tc.tags:
        for tag in fnmatch.filter(tag_xyz_map, pattern):
            if self.verbose:
                pprint(f'Construct sub dataset tagged as "{tag}"')
            tagged_xyz = tag_xyz_map.pop(tag)
            structures = AtomicStructure.read_xyz(tagged_xyz)

            # prepare descriptor dataset
            descriptor = DESCRIPTOR_DATASET[dc.descriptor](
                self.loss_function.order['descriptor'],
                structures, **dc.parameters)
            descriptor_npz = tagged_xyz.with_name(f'{dc.descriptor}.npz')
            if descriptor_npz.exists():
                descriptor.load(
                    descriptor_npz, verbose=self.verbose, remake=dc.remake)
            else:
                descriptor.make(verbose=self.verbose)
                descriptor.save(descriptor_npz, verbose=self.verbose)

            # prepare property dataset
            property_ = PROPERTY_DATASET[dc.property_](
                self.loss_function.order['property'], structures)
            property_npz = tagged_xyz.with_name(f'{dc.property_}.npz')
            if property_npz.exists():
                property_.load(
                    property_npz, verbose=self.verbose, remake=dc.remake)
            else:
                property_.make(verbose=self.verbose)
                property_.save(property_npz, verbose=self.verbose)

            # construct HDNNP dataset from descriptor & property datasets
            dataset = HDNNPDataset(descriptor, property_)
            dataset.construct(
                all_elements=tc.elements, preprocesses=preprocesses,
                shuffle=True, verbose=self.verbose)
            dataset.scatter()
            datasets.append(dataset)
            dc.n_sample += dataset.total_size
            mc.n_input = dataset.n_input
            mc.n_output = dataset.n_label

    # save fitted preprocess parameters for later reuse
    for preprocess in preprocesses:
        preprocess.save(
            preprocess_dir / f'{preprocess.name}.npz',
            verbose=self.verbose)

    return datasets
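
# A usage sketch of how the sub datasets returned here were originally
# combined and split, taken from the commented-out code path kept in
# ``start`` below:
#
#     datasets = self.construct_datasets(tag_xyz_map)
#     dataset = DatasetGenerator(*datasets).holdout(tc.train_test_ratio)
#     result = self.train(dataset)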
def start(self):
    tc = self.training_config
    tc.out_dir.mkdir(parents=True, exist_ok=True)
    if not self.is_resume:
        shutil.copy(self.config_file, tc.out_dir / self.config_file.name)
    tag_xyz_map, tc.elements = parse_xyz(
        tc.data_file, verbose=self.verbose)

    # 2020/3/26: make the hold-out split *before* PCA.
    # Split the xyz data into train and test sets, then construct the
    # datasets. The procedure is:
    #   1. Temporarily save train.xyz and test.xyz for each tag and
    #      map the tag to each file.
    #   2. Construct the symmetry function data and fit PCA on the
    #      training data only.
    #   3. Construct the symmetry function data for the test set and
    #      apply the PCA preprocess obtained in step 2.

    # hold-out procedure
    tag_training_xyz_map = {}
    tag_test_xyz_map = {}
    train_descriptor_npz = []
    test_descriptor_npz = []
    for pattern in tc.tags:
        for tag in fnmatch.filter(tag_xyz_map, pattern):
            if self.verbose:
                pprint(f'Hold out xyz data tagged as "{tag}"')
            tagged_xyz = tag_xyz_map.get(tag)
            train_descriptor_npz.append(
                tagged_xyz.with_name(
                    f'{self.dataset_config.descriptor}.npz').exists())
            test_descriptor_npz.append(
                tagged_xyz.with_name(
                    f'{self.dataset_config.descriptor}-test.npz').exists())
            xyz_data = ase.io.read(
                str(tagged_xyz), index=':', format='xyz')
            random.shuffle(xyz_data)
            s = int(len(xyz_data) * tc.train_test_ratio)
            train = xyz_data[:s]
            test = xyz_data[s:]
            assert len(train) > 0
            assert len(test) > 0
            # write the split into the per-tag directory next to the data file
            tag_dir = tc.data_file.with_name(tag)
            ase.io.write(str(tag_dir / 'train.xyz'), train, format='xyz')
            ase.io.write(str(tag_dir / 'test.xyz'), test, format='xyz')
            tag_training_xyz_map[tag] = tag_dir / 'train.xyz'
            tag_test_xyz_map[tag] = tag_dir / 'test.xyz'

    # Decide whether to reuse saved descriptor data:
    # load the .npz files only if one exists for every tag.
    load_descriptor = False
    if all(train_descriptor_npz) and all(test_descriptor_npz):
        load_descriptor = True
        pprint('Reuse the saved descriptor data;'
               ' train.xyz and test.xyz are not used.')

    train_datasets = self.construct_training_datasets(
        tag_training_xyz_map, load_descriptor)
    test_datasets = self.construct_test_datasets(
        tag_test_xyz_map, load_descriptor)

    # pair each training sub dataset with the test sub dataset of the same tag
    dataset = []
    for train in train_datasets:
        test_dataset = None
        for test in test_datasets:
            if test.tag == train.tag:
                test_dataset = test
        dataset.append((train, test_dataset))

    # Original dataset generation; in that case PCA used all data to
    # construct the transform matrix:
    #     datasets = self.construct_datasets(tag_xyz_map)
    #     dataset = DatasetGenerator(*datasets).holdout(tc.train_test_ratio)

    # stop the process here if the no_train flag is set
    if tc.no_train:
        print('Process is stopped by no_train flag')
        sys.exit()

    result = self.train(dataset)
    if MPI.rank == 0:
        self.dump_result(result)
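
# A self-contained sketch of the per-tag hold-out split performed in
# ``start``, with a plain list standing in for the structures returned by
# ase.io.read (the ratio is an illustrative assumption). It also shows why
# the two ``assert`` statements in ``start`` can trip: a single-structure
# tag would leave ``train`` empty.
import random

xyz_data = list(range(100))       # stand-in for one tag's structures
train_test_ratio = 0.9
random.shuffle(xyz_data)
s = int(len(xyz_data) * train_test_ratio)
train, test = xyz_data[:s], xyz_data[s:]
assert len(train) == 90 and len(test) == 10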