Example #1
    def load(self, file_path, verbose=True):
        """Load internal parameters for each element.

        Only the root MPI process loads the parameters.

        Args:
            file_path (~pathlib.Path): File path to load parameters.
            verbose (bool, optional): Print log to stdout.
        """
        if MPI.rank == 0:
            # allow_pickle=True is required for numpy 1.16.3 or later (issue-265)
            ndarray = np.load(file_path, allow_pickle=True)
            self._elements = ndarray['elements'].item()
            self._n_components = ndarray['n_components'].item()
            self._mean = {
                element: ndarray[f'mean:{element}']
                for element in self._elements
            }
            self._transform = {
                element: ndarray[f'transform:{element}']
                for element in self._elements
            }
        if verbose:
            pprint(f'Loaded PCA parameters from {file_path}.')
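A sketch of what the matching save side might look like, inferred only from the keys this load() reads ('elements', 'n_components', 'mean:<element>', 'transform:<element>'); the function name and argument layout below are assumptions, not hdnnpy's actual API:

import numpy as np

def save_pca_params(file_path, elements, n_components, mean, transform):
    # Hypothetical counterpart of the load() above: write the same .npz layout.
    info = {'elements': elements, 'n_components': n_components}
    for element in elements:
        info[f'mean:{element}'] = mean[element]            # per-element mean vector
        info[f'transform:{element}'] = transform[element]  # per-element projection matrix
    np.savez(file_path, **info)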
Example #2
    def dump_for_lammps(self, preprocesses, master_nnp):
        dc = self.dataset_config
        potential_file = self.load_dir / 'lammps.nnp'
        with potential_file.open('w') as f:
            # information
            now = datetime.datetime.now()
            machine = socket.gethostname()
            pprint(f'''
            # Created by hdnnpy {__version__} ({now}).
            # All parameters are read from [{machine}] {self.load_dir}.
            # Ref: https://github.com/ogura-edu/HDNNP
            ''',
                   stream=f)

            # descriptor
            pprint(f'''
            # {dc.descriptor} parameters
            {len(dc.parameters)}
            ''',
                   stream=f)
            for name, params in dc.parameters.items():
                params_str = ('\n' + ' ' * 16).join(
                    [' '.join(map(str, row)) for row in params])
                pprint(f'''
                {name} {len(params)}
                {params_str}
                ''',
                       stream=f)

            # preprocess
            pprint(f'''
            # pre-processing parameters
            {len(preprocesses)}
            ''',
                   stream=f)
            for preprocess in preprocesses:
                pprint(f'''
                {preprocess.name}

                {textwrap.indent(
                    textwrap.dedent(preprocess.dump_params()), ' '*16)}
                ''',
                       stream=f)

            # model
            pprint(f'''
            # neural network parameters
            {len(master_nnp[0])}

            {textwrap.indent(
                textwrap.dedent(master_nnp.dump_params()), ' '*12)}
            ''',
                   stream=f)
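These snippets pass multi-line, indented f-strings to pprint together with a stream argument, which implies a helper that dedents before writing. A minimal sketch of such a helper under that assumption (the real hdnnpy.utils.pprint may differ in details):

import sys
import textwrap

def pprint(data='', stream=sys.stdout):
    # Remove the common leading whitespace of the triple-quoted string and
    # its surrounding blank lines, then write the result to the given stream.
    print(textwrap.dedent(data).strip(), file=stream, flush=True)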
Example #3
    def load(self, file_path, verbose=True, remake=False):
        """Load dataset from .npz format file.

        Only the root MPI process loads the dataset.

        It validates the following compatibility between the loaded
        dataset and the atomic structures given at initialization.

            * length of data
            * elemental composition
            * elements
            * tag

        It also validates that the loaded dataset satisfies the
        following requirements.

            * order

        Args:
            file_path (~pathlib.Path): File path to load dataset.
            verbose (bool, optional): Print log to stdout.
            remake (bool, optional): If the loaded dataset lacks any
                required property, recalculate the dataset from scratch
                and overwrite ``file_path`` with it. Otherwise, a
                ValueError is raised.

        Raises:
            AssertionError: If the loaded dataset is incompatible with
                the atomic structures given at initialization.
            ValueError: If the loaded dataset lacks any required
                property and ``remake=False``.
        """
        # validate compatibility between my structures and loaded dataset
        ndarray = np.load(file_path)
        assert list(ndarray['elemental_composition']) \
               == self._elemental_composition
        assert list(ndarray['elements']) == self._elements
        assert ndarray['tag'].item() == self._tag
        assert len(ndarray[self._properties[0]]) == len(self)

        # validate lacking properties
        lacking_properties = set(self._properties) - set(ndarray)
        if lacking_properties:
            if verbose:
                lacking = ('\n'+' '*20).join(sorted(lacking_properties))
                pprint(f'''
                The following properties are missing in {file_path}:
                    {lacking}
                ''')
            if remake:
                if verbose:
                    pprint('Start to recalculate dataset from scratch.')
                self.make(verbose=verbose)
                self.save(file_path, verbose=verbose)
                return
            else:
                raise ValueError('Please recalculate dataset from scratch.')

        # load dataset as much as needed
        if MPI.rank == 0:
            for i in range(self._order + 1):
                self._dataset.append(ndarray[self._properties[i]])

        if verbose:
            pprint(f'Successfully loaded & made needed {self.name} dataset'
                   f' from {file_path}')
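The "lacking properties" check near the top of this method is a plain set difference over the .npz keys; a small self-contained sketch of that step (the file name and property names are toy values, not hdnnpy's real ones):

import numpy as np

# Toy cache that stores only two of the three properties we require.
np.savez('toy.npz', energy=np.zeros(4), force=np.zeros((4, 8, 3)))

required = ['energy', 'force', 'harmonic']
with np.load('toy.npz') as ndarray:
    lacking = set(required) - set(ndarray)
print(sorted(lacking))  # ['harmonic'] -> load() would remake or raise ValueError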
Example #4
    def construct_datasets(self, tag_xyz_map):
        dc = self.dataset_config
        mc = self.model_config
        tc = self.training_config

        preprocess_dir = tc.out_dir / 'preprocess'
        preprocess_dir.mkdir(parents=True, exist_ok=True)
        preprocesses = []
        for (name, args, kwargs) in dc.preprocesses:
            preprocess = PREPROCESS[name](*args, **kwargs)
            if self.is_resume:
                preprocess.load(
                    preprocess_dir / f'{name}.npz', verbose=self.verbose)
            preprocesses.append(preprocess)

        datasets = []
        for pattern in tc.tags:
            for tag in fnmatch.filter(tag_xyz_map, pattern):
                if self.verbose:
                    pprint(f'Construct sub dataset tagged as "{tag}"')
                tagged_xyz = tag_xyz_map.pop(tag)
                structures = AtomicStructure.read_xyz(tagged_xyz)

                # prepare descriptor dataset
                descriptor = DESCRIPTOR_DATASET[dc.descriptor](
                    self.loss_function.order['descriptor'],
                    structures, **dc.parameters)
                descriptor_npz = tagged_xyz.with_name(f'{dc.descriptor}.npz')
                if descriptor_npz.exists():
                    descriptor.load(
                        descriptor_npz, verbose=self.verbose, remake=dc.remake)
                else:
                    descriptor.make(verbose=self.verbose)
                    descriptor.save(descriptor_npz, verbose=self.verbose)

                # prepare property dataset
                property_ = PROPERTY_DATASET[dc.property_](
                    self.loss_function.order['property'], structures)
                property_npz = tagged_xyz.with_name(f'{dc.property_}.npz')
                if property_npz.exists():
                    property_.load(
                        property_npz, verbose=self.verbose, remake=dc.remake)
                else:
                    property_.make(verbose=self.verbose)
                    property_.save(property_npz, verbose=self.verbose)

                # construct HDNNP dataset from descriptor & property datasets
                dataset = HDNNPDataset(descriptor, property_)
                dataset.construct(
                    all_elements=tc.elements, preprocesses=preprocesses,
                    shuffle=True, verbose=self.verbose)
                dataset.scatter()
                datasets.append(dataset)
                dc.n_sample += dataset.total_size
                mc.n_input = dataset.n_input
                mc.n_output = dataset.n_label

        for preprocess in preprocesses:
            preprocess.save(
                preprocess_dir / f'{preprocess.name}.npz',
                verbose=self.verbose)

        return datasets
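The tag selection here (and in start() of the next example) uses shell-style fnmatch patterns taken from the training config; a small self-contained sketch of that filtering step (the tags, pattern, and paths are made up):

import fnmatch

# hypothetical tag -> xyz-file map as produced by parse_xyz()
tag_xyz_map = {
    'CrystalGa2N2': 'data/CrystalGa2N2/Atomic.xyz',
    'CrystalGa16N16': 'data/CrystalGa16N16/Atomic.xyz',
    'LiquidGa32N32': 'data/LiquidGa32N32/Atomic.xyz',
}

for pattern in ['Crystal*']:  # plays the role of tc.tags
    for tag in fnmatch.filter(tag_xyz_map, pattern):
        print(tag, '->', tag_xyz_map[tag])
# CrystalGa2N2 -> data/CrystalGa2N2/Atomic.xyz
# CrystalGa16N16 -> data/CrystalGa16N16/Atomic.xyz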
Example #5
    def start(self):

        tc = self.training_config
        tc.out_dir.mkdir(parents=True, exist_ok=True)
        if not self.is_resume:
            shutil.copy(self.config_file, tc.out_dir / self.config_file.name)
        tag_xyz_map, tc.elements = parse_xyz(tc.data_file,
                                             verbose=self.verbose)

        # 2020/3/26: perform the hold-out split before PCA.
        # Split the xyz data into train and test sets, then construct the
        # datasets. The procedure is:
        #   1. temporarily save train.xyz and test.xyz for each tag and
        #      map the tag to each file
        #   2. construct the symmetry function data and fit the PCA on the
        #      training data
        #   3. construct the symmetry functions for the test data and apply
        #      the PCA preprocess obtained in step 2
        # (a standalone sketch of the split in step 1 follows this example)

        # hold-out procedure: split each tag's xyz data into train/test files
        tag_training_xyz_map = {}
        tag_test_xyz_map = {}
        train_descriptor_npz = []
        test_descriptor_npz = []

        for pattern in tc.tags:
            for tag in fnmatch.filter(tag_xyz_map, pattern):
                if self.verbose:
                    pprint(f'holdout xyz data tagged as "{tag}"')
                tagged_xyz = tag_xyz_map.get(tag)

                train_descriptor_npz.append(
                    tagged_xyz.with_name(
                        f'{self.dataset_config.descriptor}.npz').exists())
                test_descriptor_npz.append(
                    tagged_xyz.with_name(
                        f'{self.dataset_config.descriptor}-test.npz').exists())
                xyz_data = ase.io.read(str(tagged_xyz),
                                       index=':',
                                       format='xyz')

                random.shuffle(xyz_data)
                s = int(len(xyz_data) * tc.train_test_ratio)
                train = xyz_data[:s]
                test = xyz_data[s:]
                assert len(train) > 0
                assert len(test) > 0

                ase.io.write(str(tc.data_file.with_name(tag) / 'train.xyz'),
                             train,
                             format='xyz')
                ase.io.write(str(tc.data_file.with_name(tag) / 'test.xyz'),
                             test,
                             format='xyz')

                tag_training_xyz_map[tag] = (tc.data_file.with_name(tag) /
                                             'train.xyz')
                tag_test_xyz_map[tag] = (tc.data_file.with_name(tag) /
                                         'test.xyz')

        # decide whether to load the saved descriptor .npz files:
        # if a descriptor .npz is found for every tag, load them instead of recomputing
        load_descriptor = False
        if (all(train_descriptor_npz) and all(test_descriptor_npz)):
            load_descriptor = True
            pprint('Reusing the saved descriptor data; '
                   'train.xyz and test.xyz are not used.')

        train_datasets = self.construct_training_datasets(
            tag_training_xyz_map, load_descriptor)
        test_datasets = self.construct_test_datasets(tag_test_xyz_map,
                                                     load_descriptor)

        # reshape the datasets into (train, test) pairs matched by tag
        dataset = []
        for train in train_datasets:
            test_dataset = None
            for test in test_datasets:
                if test.tag == train.tag:
                    test_dataset = test
                    break
            dataset.append((train, test_dataset))

        # original dataset generation
        # (in this case, PCA uses all data to construct the transform matrix)
        '''
        datasets = self.construct_datasets(tag_xyz_map)
        dataset = DatasetGenerator(*datasets).holdout(tc.train_test_ratio)
        '''

        ## Stop process here if no_train flag is set
        if tc.no_train:
            print('Process is stopped by no_train flag')
            sys.exit()
        ## End of stopping process

        result = self.train(dataset)
        if MPI.rank == 0:
            self.dump_result(result)
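As referenced in the procedure comments above, the per-tag hold-out split can be shown in isolation; a minimal sketch, with a function name and an optional seed added here for illustration (the original calls random.shuffle directly on the list returned by ase.io.read):

import random

def holdout_split(structures, train_test_ratio, seed=None):
    # Shuffle one tag's structures and split them into train/test,
    # mirroring the hold-out step inside start() above.
    structures = list(structures)
    random.Random(seed).shuffle(structures)
    split = int(len(structures) * train_test_ratio)
    train, test = structures[:split], structures[split:]
    assert train and test, 'train_test_ratio leaves one side empty'
    return train, test

# toy usage with placeholder items instead of ase.Atoms objects
train, test = holdout_split(range(10), train_test_ratio=0.8, seed=0)
print(len(train), len(test))  # 8 2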