def evaluate(self, test_structures, ref_energies, ref_forces, ref_stresses):
    """
    Predict with the fitted model and pair the result with reference data.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.

    Returns:
        (df_orig, df_predict): the DataFrame converted from the reference
        pool, and the DataFrame converted from the bare structures whose
        'y_orig' column is overwritten with its 'n' column multiplied by
        the model output.
    """
    # Reference side: structures together with their DFT targets.
    reference_docs = pool_from(test_structures, ref_energies,
                               ref_forces, ref_stresses)
    _, df_orig = convert_docs(reference_docs)

    # Prediction side: same structures, no targets attached.
    bare_docs = pool_from(test_structures)
    _, df_predict = convert_docs(bare_docs)

    model_output = self.model.predict(inputs=test_structures, override=True)
    # presumably the model output is per-atom, hence the scaling by 'n'
    # -- confirm against the training routine.
    df_predict['y_orig'] = df_predict['n'] * model_output

    return df_orig, df_predict
def evaluate(self, test_structures, ref_energies=None, ref_forces=None,
             ref_stresses=None, predict_energies=True, predict_forces=True,
             predict_stress=False):
    """
    Evaluate energies, forces and stresses of structures with trained
    interatomic potential by running the external ``quip`` executable in
    a scratch directory.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.
        predict_energies (bool): Whether to predict energies of
            configurations.
        predict_forces (bool): Whether to predict forces of
            configurations.
        predict_stress (bool): Whether to predict virial stress of
            configurations.

    Returns:
        (df_orig, df_predict): DataFrames read back from the reference
        configuration file and from the quip output file.

    Raises:
        RuntimeError: If ``quip`` is not found, or if it exits with a
            non-zero return code.
    """
    if not which('quip'):
        # Adjacent literals are concatenated into ONE message; passing
        # them as separate arguments renders the error as a tuple.
        raise RuntimeError(
            "quip has not been found.\n"
            "Please refer to https://github.com/libAtoms/QUIP for "
            "further detail.")

    xml_file = 'predict.xml'
    original_file = 'original.xyz'
    predict_file = 'predict.xyz'
    predict_pool = pool_from(test_structures, ref_energies,
                             ref_forces, ref_stresses)

    with ScratchDir('.'):
        self.write_param(xml_file)
        original_file = self.write_cfgs(original_file, cfg_pool=predict_pool)
        _, df_orig = self.read_cfgs(original_file)

        exe_command = [
            "quip",
            "atoms_filename={}".format(original_file),
            "param_filename={}".format(xml_file),
        ]
        if predict_energies:
            exe_command.append("energy=T")
        if predict_forces:
            exe_command.append("forces=T")
        if predict_stress:
            exe_command.append("virial=T")

        # The context manager guarantees the output file is closed (and
        # flushed) before it is parsed below.
        with open(predict_file, 'w') as out_f:
            p = subprocess.Popen(exe_command, stdout=out_f)
            p.communicate()
        rc = p.returncode
        if rc != 0:
            # stdout went to the prediction file, so only the code is
            # available for the message.
            raise RuntimeError('QUIP exited with return code %d' % rc)

        _, df_predict = self.read_cfgs(predict_file, predict=True)

    return df_orig, df_predict
def evaluate(self, test_structures, ref_energies, ref_forces, ref_stresses):
    """
    Predict forces for each structure with the describer/predictor pair
    and return them next to the reference data.

    Only forces are predicted here: the energy entry of every prediction
    is set to 0 and the virial stress entry to six zeros.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.

    Returns:
        (df_orig, df_pred): DataFrames converted from the reference pool
        and from the predicted pool.
    """
    reference_docs = pool_from(test_structures, ref_energies,
                               ref_forces, ref_stresses)
    _, df_orig = convert_docs(reference_docs)

    predicted_docs = []
    for structure in test_structures:
        structure_dict = structure.as_dict()
        n_atoms = len(structure)
        descriptors = self.describer.describe(structure)
        predicted = self.predictor.predict(descriptors.values)
        predicted_docs.append({
            'outputs': {
                'energy': 0,
                # Flat predictor output folded into one row per atom.
                'forces': predicted.reshape((-1, 3)),
                'virial_stress': [0., 0., 0., 0., 0., 0.],
            },
            'structure': structure_dict,
            'num_atoms': n_atoms,
        })

    _, df_pred = convert_docs(predicted_docs)
    return df_orig, df_pred
def train(self, train_structures, energies, forces, stresses=None, **kwargs):
    """
    Fit the model on the training structures.

    The fitting target is the 'y_orig' column of the converted training
    pool divided by its 'n' column.

    Args:
        train_structures ([Structure]): The list of Pymatgen Structure
            object.
        energies ([float]): List of total energies of each structure in
            structures list.
        forces ([np.array]): List of (m, 3) forces array of each structure
            with m atoms in structures list. m can be varied with each
            single structure case.
        stresses (list): List of (6, ) virial stresses of each structure
            in structures list.
        kwargs: Forwarded unchanged to ``self.model.fit``.
    """
    training_docs = pool_from(train_structures, energies, forces, stresses)
    _, frame = convert_docs(training_docs)
    targets = frame['y_orig'] / frame['n']

    self.model.fit(inputs=train_structures, outputs=targets, **kwargs)

    # Remember the first symbol of the first training structure.
    first_symbol = train_structures[0].symbol_set[0]
    self.specie = Element(first_symbol)
def describe(self, structure):
    """
    Returns data for one input structure.

    Runs the external ``RuNNer`` executable in a scratch directory and
    parses the 'function.data' file found there afterwards.

    Args:
        structure (Structure): Input structure.

    Returns:
        pd.DataFrame: Feature rows indexed by ('point_index', row), one
        group of rows per block found in 'function.data'.

    Raises:
        RuntimeError: If ``RuNNer`` or ``RuNNerMakesym`` is not found.
    """
    if not which('RuNNer'):
        raise RuntimeError("RuNNer has not been found.")
    if not which("RuNNerMakesym"):
        raise RuntimeError("RuNNerMakesym has not been found.")

    def read_functions_data(filename):
        """
        Read structure features from file.

        Args:
            filename (str): The functions file to be read.
        """
        with zopen(filename, 'rt') as f:
            lines = f.read()

        block_pattern = re.compile(
            r'(\n\s+\d+\n|^\s+\d+\n)(.+?)(?=\n\s+\d+\n|$)', re.S)
        # Scan the text once; the block count is reused for the index.
        blocks = block_pattern.findall(lines)
        points_features = [
            pd.DataFrame(
                # First token of each row is dropped, the rest are features.
                [feature.split()[1:] for feature in block.split('\n')[:-1]],
                dtype=np.float32)
            for _, block in blocks
        ]
        return pd.concat(points_features,
                         keys=range(len(blocks)),
                         names=['point_index', None])

    # NOTE(review): a per-structure minimum distance used to be computed
    # here but was never used; the configured self.dmin is what is passed
    # to generate_eta -- confirm that this is the intended value.
    r_etas = self.operator.generate_eta(dmin=self.dmin,
                                        r_cut=self.cutoff,
                                        num_symm2=self.num_symm2)
    atoms_filename = 'input.data'
    mode_output = 'mode.out'

    with ScratchDir('.'):
        self.operator.write_cfgs(filename=atoms_filename,
                                 cfg_pool=pool_from([structure]))
        self.operator.write_input(mode=1, r_cut=self.cutoff,
                                  r_etas=r_etas, a_etas=self.a_etas,
                                  scale_feature=False)
        # Close the log file deterministically instead of leaking it.
        with open(mode_output, 'w') as log_f:
            p = subprocess.Popen(['RuNNer'], stdout=log_f)
            p.communicate()

        descriptors = read_functions_data('function.data')

    return pd.DataFrame(descriptors)
def test_pool_from(self):
    """pool_from must reproduce the outputs stored in the fixture pool."""
    built_pool = pool_from(self.test_structures, self.test_energies,
                           self.test_forces, self.test_stresses)
    for built, expected in zip(built_pool, self.test_pool):
        # Same three comparisons, in the same order, for every entry.
        for key in ('energy', 'forces', 'virial_stress'):
            self.assertEqual(built['outputs'][key],
                             expected['outputs'][key])
def evaluate(self, test_structures, ref_energies, ref_forces, ref_stresses):
    """
    Evaluate energies, forces and stresses of structures with trained
    interatomic potential by running the external ``nnp-predict``
    executable once per structure in a scratch directory.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.

    Returns:
        (df_orig, df_predict): the DataFrame read from the reference
        configuration file, and the concatenation (index reset) of the
        DataFrames read from each per-structure prediction file.

    Raises:
        RuntimeError: If ``nnp-predict`` is not found, or if any run
            exits with a non-zero return code.
    """
    if not which('nnp-predict'):
        raise RuntimeError("NNP Predictor has not been found.")

    original_file = 'input.data'
    predict_file = 'output.data'
    predict_pool = pool_from(test_structures, ref_energies,
                             ref_forces, ref_stresses)

    with ScratchDir('.'):
        _, _ = self.write_param()
        original_file = self.write_cfgs(original_file, cfg_pool=predict_pool)
        # Read the references before the loop overwrites this file.
        _, df_orig = self.read_cfgs(original_file)

        input_filename = self.write_input()

        dfs = []
        for data in predict_pool:
            # One structure per run: the input file is rewritten each time.
            self.write_cfgs(original_file, cfg_pool=[data])
            p = subprocess.Popen(['nnp-predict', input_filename],
                                 stdout=subprocess.PIPE)
            stdout = p.communicate()[0]
            rc = p.returncode
            if rc != 0:
                error_msg = 'RuNNer exited with return code %d' % rc
                msg = stdout.decode("utf-8").split('\n')[:-1]
                error_rows = [i for i, m in enumerate(msg)
                              if m.startswith('ERROR')]
                if error_rows:
                    error_msg += ', '.join(msg[error_rows[0]:])
                elif msg:
                    # No ERROR marker: fall back to the last output line.
                    error_msg += msg[-1]
                # With no output at all only the return code is reported
                # (indexing msg[-1] here used to raise IndexError).
                raise RuntimeError(error_msg)
            _, df = self.read_cfgs(predict_file)
            dfs.append(df)
        df_predict = pd.concat(dfs, ignore_index=True)

    return df_orig, df_predict
def evaluate(self, test_structures, ref_energies=None, ref_forces=None,
             ref_stresses=None, **kwargs):
    """
    Evaluate energies, forces and stresses of structures with trained
    interatomic potential by running ``mlp run mlip.ini`` in a scratch
    directory.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.
        kwargs: Parameters of write_param method.

    Returns:
        (df_orig, df_predict): DataFrames read back from the reference
        configuration file and from the prediction file.

    Raises:
        RuntimeError: If ``mlp`` is not found, or if it exits with a
            non-zero return code.
    """
    if not which('mlp'):
        # Adjacent literals are concatenated into ONE message; passing
        # them as separate arguments renders the error as a tuple.
        raise RuntimeError(
            "mlp has not been found.\n"
            "Please refer to http://gitlab.skoltech.ru/shapeev/mlip "
            "for further detail.")

    fitted_mtp = 'fitted.mtp'
    original_file = 'original.cfgs'
    predict_file = 'predict.cfgs'
    predict_pool = pool_from(test_structures, ref_energies,
                             ref_forces, ref_stresses)

    # The first symbol of the first structure is handed to read_cfgs.
    dataset = predict_pool[0]
    if isinstance(dataset['structure'], dict):
        structure = Structure.from_dict(dataset['structure'])
    else:
        structure = dataset['structure']
    symbol = structure.symbol_set[0]

    with ScratchDir('.'):
        self.write_param(fitted_mtp=fitted_mtp, Abinitio=0, Driver=1,
                         Calculate_EFS=True, Write_cfgs=predict_file,
                         Database_filename=original_file, **kwargs)
        original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
        _, df_orig = self.read_cfgs(original_file, symbol=symbol)

        p = subprocess.Popen(['mlp', 'run', 'mlip.ini'],
                             stdout=subprocess.PIPE)
        stdout = p.communicate()[0]
        rc = p.returncode
        if rc != 0:
            error_msg = 'MLP exited with return code %d' % rc
            msg = stdout.decode("utf-8").split('\n')[:-1]
            error_rows = [i for i, m in enumerate(msg)
                          if m.startswith('ERROR')]
            if error_rows:
                error_msg += ', '.join(msg[error_rows[0]:])
            elif msg:
                # No ERROR marker: fall back to the last output line.
                error_msg += msg[-1]
            # With no output at all only the return code is reported
            # (indexing msg[-1] here used to raise IndexError).
            raise RuntimeError(error_msg)

        _, df_predict = self.read_cfgs(predict_file, symbol=symbol)

    return df_orig, df_predict
def evaluate2(self, test_structures, ref_energies=None, ref_forces=None,
              ref_stresses=None):
    """
    Evaluate structures through an ``EnergyForceStress`` calculator built
    on this potential and return the predictions next to the references.

    Args:
        test_structures ([Structure]): List of Pymatgen Structure Objects.
        ref_energies ([float]): List of DFT-calculated total energies of
            each structure in structures list.
        ref_forces ([np.array]): List of DFT-calculated (m, 3) forces of
            each structure with m atoms in structures list. m can be
            varied with each single structure case.
        ref_stresses (list): List of DFT-calculated (6, ) virial stresses
            of each structure in structures list.

    Returns:
        (df_orig, df_pred): DataFrames converted from the reference pool
        and from the calculated pool.
    """
    reference_docs = pool_from(test_structures, ref_energies,
                               ref_forces, ref_stresses)
    _, df_orig = convert_docs(reference_docs)

    calculator = EnergyForceStress(ff_settings=self)
    results = calculator.calculate(test_structures)
    # One (energy, forces, stresses) triple is expected per structure.
    assert len(test_structures) == len(results)

    calculated_docs = [
        {
            'outputs': {
                'energy': energy,
                'forces': forces,
                'virial_stress': stresses,
            },
            'structure': structure.as_dict(),
            'num_atoms': len(structure),
        }
        for structure, (energy, forces, stresses)
        in zip(test_structures, results)
    ]

    _, df_pred = convert_docs(calculated_docs)
    return df_orig, df_pred
def train(self, train_structures, energies=None, forces=None, stresses=None,
          **kwargs):
    """
    Training data with agni method.

    Samples features and targets from the training pool and fits on them.

    Args:
        train_structures ([Structure]): The list of Pymatgen Structure
            object.
        energies ([float]): List of total energies of each structure in
            structures list.
        forces ([np.array]): List of (m, 3) forces array of each structure
            with m atoms in structures list. m can be varied with each
            single structure case.
        stresses (list): List of (6, ) virial stresses of each structure
            in structures list.
        kwargs: Forwarded unchanged to ``self.sample``.

    Returns:
        int: Always 0.
    """
    training_docs = pool_from(train_structures, energies, forces, stresses)
    # Only the last two of the four sampled values are needed here.
    _, _, sampled_features, sampled_targets = self.sample(training_docs,
                                                          **kwargs)
    self.fit(sampled_features, sampled_targets)
    return 0
def train(self, train_structures, energies=None, forces=None, stresses=None,
          default_sigma=[0.0005, 0.1, 0.05, 0.01], use_energies=True,
          use_forces=True, use_stress=False, **kwargs):
    """
    Training data with gaussian process regression by running the
    external ``teach_sparse`` executable in a scratch directory.

    Args:
        train_structures ([Structure]): The list of Pymatgen Structure
            object.
        energies ([float]): List of total energies of each structure in
            structures list.
        forces ([np.array]): List of (m, 3) forces array of each structure
            with m atoms in structures list. m can be varied with each
            single structure case.
        stresses (list): List of (6, ) virial stresses of each structure
            in structures list.
        default_sigma (list): Error criteria in energies, forces, stress
            and hessian. Should have 4 numbers. It is only read, never
            mutated.
        use_energies (bool): Whether to use dft total energies for
            training. Default to True.
        use_forces (bool): Whether to use dft atomic forces for training.
            Default to True.
        use_stress (bool): Whether to use dft virial stress for training.
            Default to False.

        kwargs:
            l_max (int): Parameter to configure GAP. The band limit of
                spherical harmonics basis function. Default to 12.
            n_max (int): Parameter to configure GAP. The number of radial
                basis function. Default to 10.
            atom_sigma (float): Parameter to configure GAP. The width of
                gaussian atomic density. Default to 0.5.
            zeta (float): Present when covariance function type is dot
                product. Default to 4.
            cutoff (float): Parameter to configure GAP. The cutoff radius.
                Default to 4.0.
            cutoff_transition_width (float): Parameter to configure GAP.
                The transition width of cutoff radial. Default to 0.5.
            delta (float): Parameter to configure Sparsification. The
                signal variance of noise. Default to 1.
            f0 (float): Parameter to configure Sparsification. The signal
                mean of noise. Default to 0.0.
            n_sparse (int): Parameter to configure Sparsification. Number
                of sparse points.
            covariance_type (str): Parameter to configure Sparsification.
                The type of covariance function. Default to dot_product.
            sparse_method (str): Method to perform clustering in
                sparsification. Default to 'cur_points'.
            sparse_jitter (float): Intrinsic error of atomic/bond energy,
                used to regularise the sparse covariance matrix.
                Default to 1e-8.
            e0 (float): Atomic energy value to be subtracted from energies
                before fitting. Default to 0.0.
            e0_offset (float): Offset of baseline. If zero, the offset is
                the average atomic energy of the input data or the e0
                specified manually. Default to 0.0.

    Returns:
        int: Return code of ``teach_sparse`` (0 when this method returns).

    Raises:
        RuntimeError: If ``teach_sparse`` is not found, or if it exits
            with a non-zero return code.
        ValueError: If ``default_sigma`` does not have 4 entries.
    """
    if not which('teach_sparse'):
        # Adjacent literals are concatenated into ONE message; passing
        # them as separate arguments renders the error as a tuple.
        raise RuntimeError(
            "teach_sparse has not been found.\n"
            "Please refer to https://github.com/libAtoms/QUIP for "
            "further detail.")

    def resolve(param_name):
        """Return the caller's value, or the soap_params default if unset."""
        value = kwargs.get(param_name)
        # Test for None rather than truthiness so that an explicit 0 / 0.0
        # is honoured instead of silently replaced by the default.
        if value is None:
            value = soap_params.get(param_name)
        return value

    def get_xml(xml_file):
        """Parse the potential xml and load the sparse points it names."""
        tree = ET.parse(xml_file)
        root = tree.getroot()
        potential_label = root.tag
        gpcoordinates = list(root.iter('gpCoordinates'))[0]
        param_file = gpcoordinates.get('sparseX_filename')
        param = np.loadtxt(param_file)
        return tree, param, potential_label

    atoms_filename = 'train.xyz'
    xml_filename = 'train.xml'
    train_pool = pool_from(train_structures, energies, forces, stresses)

    gap_configure_params = [
        'l_max', 'n_max', 'atom_sigma', 'zeta', 'cutoff',
        'cutoff_transition_width', 'delta', 'f0', 'n_sparse',
        'covariance_type', 'sparse_method'
    ]
    preprocess_params = ['sparse_jitter', 'e0', 'e0_offset']
    if len(default_sigma) != 4:
        raise ValueError(
            "The default sigma is supposed to have 4 numbers.")

    exe_command = ["teach_sparse", 'at_file={}'.format(atoms_filename)]

    gap_command = ['soap']
    gap_command.extend('{}={}'.format(name, resolve(name))
                       for name in gap_configure_params)
    exe_command.append("gap={" + ' '.join(gap_command) + "}")

    exe_command.extend('{}={}'.format(name, resolve(name))
                       for name in preprocess_params)

    sigma_strs = [str(f) for f in default_sigma]
    exe_command.append("default_sigma={%s}" % (' '.join(sigma_strs)))

    if use_energies:
        exe_command.append('energy_parameter_name=dft_energy')
    if use_forces:
        exe_command.append('force_parameter_name=dft_force')
    if use_stress:
        exe_command.append('virial_parameter_name=dft_virial')
    exe_command.append('gp_file={}'.format(xml_filename))

    with ScratchDir('.'):
        self.write_cfgs(filename=atoms_filename, cfg_pool=train_pool)

        p = subprocess.Popen(exe_command, stdout=subprocess.PIPE)
        stdout = p.communicate()[0]
        rc = p.returncode
        if rc != 0:
            error_msg = 'QUIP exited with return code %d' % rc
            msg = stdout.decode("utf-8").split('\n')[:-1]
            error_rows = [i for i, m in enumerate(msg)
                          if m.startswith('ERROR')]
            if error_rows:
                error_msg += ', '.join(msg[error_rows[0]:])
            elif msg:
                # No ERROR marker: fall back to the last output line.
                error_msg += msg[-1]
            # With no output at all only the return code is reported
            # (indexing msg[-1] here used to raise IndexError).
            raise RuntimeError(error_msg)

        # Must run inside the scratch directory, where the files live.
        tree, param, potential_label = get_xml(xml_filename)
        self.param['xml'] = tree
        self.param['param'] = param
        self.param['potential_label'] = potential_label

    return rc
def train(self, train_structures, energies=None, forces=None, stresses=None,
          **kwargs):
    """
    Train by running the external ``nnp-scaling`` and ``nnp-train``
    executables in a scratch directory, then load the RMSE history, the
    network weights/biases and the scaling data from the files found
    there.

    Args:
        train_structures ([Structure]): The list of Pymatgen Structure
            object.
        energies ([float]): List of total energies of each structure in
            structures list.
        forces ([np.array]): List of (m, 3) forces array of each structure
            with m atoms in structures list. m can be varied with each
            single structure case.
        stresses (list): List of (6, ) virial stresses of each structure
            in structures list.
        kwargs: Parameters in write_input method.

    Returns:
        int: Return code of ``nnp-train`` (0 when this method returns).

    Raises:
        RuntimeError: If ``nnp-train`` is not found, or if it exits with a
            non-zero return code.
    """
    if not which('nnp-train'):
        raise RuntimeError("NNP Trainer has not been found.")

    train_pool = pool_from(train_structures, energies, forces, stresses)
    atoms_filename = 'input.data'

    with ScratchDir('.'):
        atoms_filename = self.write_cfgs(filename=atoms_filename,
                                         cfg_pool=train_pool)
        output = 'training_output'
        input_filename = self.write_input(**kwargs)

        p_scaling = subprocess.Popen(['nnp-scaling', input_filename])
        p_scaling.communicate()

        # The log is parsed below, so it must be closed before reading.
        with open(output, 'w') as log_f:
            p_train = subprocess.Popen(['nnp-train', input_filename],
                                       stdout=log_f)
            p_train.communicate()
        rc = p_train.returncode
        if rc != 0:
            error_msg = 'RuNNer exited with return code %d' % rc
            # stdout was redirected to the log file, so communicate()
            # returns None for it; take the message from the file.
            with open(output) as f:
                msg = f.read().split('\n')[:-1]
            error_rows = [i for i, m in enumerate(msg)
                          if m.startswith('ERROR')]
            if error_rows:
                error_msg += ', '.join(msg[error_rows[0]:])
            elif msg:
                error_msg += msg[-1]
            raise RuntimeError(error_msg)

        with zopen(output, 'rt') as f:
            error_lines = f.read()

        energy_rmse_pattern = re.compile(
            r'ENERGY\s*\S*\s*(\S*)\s*(\S*).*?\n')
        forces_rmse_pattern = re.compile(
            r'FORCES\s*\S*\s*(\S*)\s*(\S*).*?\n')
        # Builtin float is what the removed np.float alias pointed to.
        self.train_energy_rmse, self.validation_energy_rmse = \
            np.array(energy_rmse_pattern.findall(error_lines),
                     dtype=float).T
        self.train_forces_rmse, self.validation_forces_rmse = \
            np.array(forces_rmse_pattern.findall(error_lines),
                     dtype=float).T

        weights_filename_pattern = 'weights*{}.out'.format(self.epochs)
        weights_filename = glob.glob(weights_filename_pattern)[0]
        self.suffix = weights_filename.split('.')[1]

        with open(weights_filename) as f:
            weights_lines = f.readlines()

        params = pd.DataFrame(
            [line.split() for line in weights_lines if "#" not in line])
        params.columns = [
            'value', 'type', 'index', 'start_layer', 'start_neuron',
            'end_layer', 'end_neuron'
        ]
        self.params = params

        for layer_index in range(1, len(self.layer_sizes)):
            # Values are still strings here, hence the str() comparisons.
            weights_group = params[
                (params['start_layer'] == str(layer_index - 1))
                & (params['end_layer'] == str(layer_index))]
            weights = np.reshape(
                np.array(weights_group['value'], dtype=float),
                (self.layer_sizes[layer_index - 1],
                 self.layer_sizes[layer_index]))
            self.weights.append(weights)

            bs_group = params[(params['type'] == 'b')
                              & (params['start_layer'] == str(layer_index))]
            bs = np.array(bs_group['value'], dtype=float)
            self.bs.append(bs)

        with open('scaling.data') as f:
            scaling_lines = f.readlines()
        scaling_params = pd.DataFrame(
            [line.split() for line in scaling_lines if '#' not in line])
        # NOTE(review): this sets a plain attribute named 'column'; it
        # does NOT rename the DataFrame columns (that would be
        # '.columns'). Left as-is because consumers of
        # self.scaling_params may rely on the integer column labels --
        # confirm before changing.
        scaling_params.column = [
            'e_index', 'sf_index', 'sf_min', 'sf_max', 'sf_mean',
            'sf_sigma'
        ]
        self.scaling_params = scaling_params

    return rc
def train(self, train_structures, energies=None, forces=None, stresses=None,
          unfitted_mtp=None, **kwargs):
    """
    Training data with moment tensor method by running
    ``mlp run mlip.ini`` in a scratch directory, then load the fitted
    potential file into ``self.param``.

    Args:
        train_structures ([Structure]): The list of Pymatgen Structure
            object.
        energies ([float]): List of total energies of each structure in
            structures list.
        forces ([np.array]): List of (m, 3) forces array of each structure
            with m atoms in structures list. m can be varied with each
            single structure case.
        stresses (list): List of (6, ) virial stresses of each structure
            in structures list.
        unfitted_mtp (str): Define the initial mtp file. Default to the
            mtp file stored in .params directory.
        kwargs: Parameters in write_ini method.

    Returns:
        int: Return code of ``mlp`` (0 when this method returns).

    Raises:
        RuntimeError: If ``mlp`` is not found, or if it exits with a
            non-zero return code.
    """
    if not which('mlp'):
        # Adjacent literals are concatenated into ONE message; passing
        # them as separate arguments renders the error as a tuple.
        raise RuntimeError(
            "mlp has not been found.\n"
            "Please refer to http://gitlab.skoltech.ru/shapeev/mlip "
            "for further detail.")

    train_pool = pool_from(train_structures, energies, forces, stresses)
    atoms_filename = 'train.cfgs'

    with ScratchDir('.'):
        atoms_filename = self.write_cfg(filename=atoms_filename,
                                        cfg_pool=train_pool)

        if not unfitted_mtp:
            unfitted_mtp = 'MTP.mtp'
            shutil.copyfile(MTP_file_path,
                            os.path.join(os.getcwd(), unfitted_mtp))
        # splitext only splits off the final extension, so names with
        # directories, extra dots or no extension are handled correctly.
        stem, extension = os.path.splitext(unfitted_mtp)
        save_fitted_mtp = stem + '_fitted' + extension

        self.write_ini(Abinitio=1, MLIP=unfitted_mtp, Driver=1, Fit=True,
                       Save=save_fitted_mtp,
                       Database_filename=atoms_filename, **kwargs)
        p = subprocess.Popen(['mlp', 'run', 'mlip.ini'],
                             stdout=subprocess.PIPE)
        stdout = p.communicate()[0]
        rc = p.returncode
        if rc != 0:
            error_msg = 'MLP exited with return code %d' % rc
            msg = stdout.decode("utf-8").split('\n')[:-1]
            error_rows = [i for i, m in enumerate(msg)
                          if m.startswith('ERROR')]
            if error_rows:
                error_msg += ', '.join(msg[error_rows[0]:])
            elif msg:
                # No ERROR marker: fall back to the last output line.
                error_msg += msg[-1]
            # With no output at all only the return code is reported
            # (indexing msg[-1] here used to raise IndexError).
            raise RuntimeError(error_msg)

        param = OrderedDict()
        with open(save_fitted_mtp, 'r') as f:
            lines = f.readlines()
        # Everything but the last two lines is kept verbatim.
        param['safe'] = [line.rstrip() for line in lines[:-2]]
        # The last two lines are 'key = {..}' entries; braces are mapped
        # to brackets so the value parses as a JSON list.
        for line in lines[-2:]:
            fields = line.rstrip().split(' = ')
            key = fields[0]
            value = json.loads(
                fields[1].replace('{', '[').replace('}', ']'))
            param[key] = value
        self.param = param

    return rc
def describe(self, structure):
    """
    Returns data for one input structure.

    Runs the external ``quip`` executable with a ``soap`` descriptor
    string in a scratch directory and parses every output line that
    carries a 'DESC' marker.

    Args:
        structure (Structure): Input structure.

    Returns:
        pd.DataFrame: One row of floats per 'DESC' line of the output.

    Raises:
        RuntimeError: If ``quip`` is not found, or if it exits with a
            non-zero return code.
    """
    if not which('quip'):
        # Adjacent literals are concatenated into ONE message; passing
        # them as separate arguments renders the error as a tuple.
        raise RuntimeError(
            "quip has not been found.\n"
            "Please refer to https://github.com/libAtoms/QUIP for "
            "further detail.")

    atoms_filename = 'structure.xyz'

    atomic_numbers = [
        str(num) for num in np.unique(structure.atomic_numbers)
    ]
    # The same species list is passed both as Z and as species_Z.
    n_species = len(atomic_numbers)
    species = '{' + ' '.join(atomic_numbers) + '}'

    descriptor_command = [
        'soap',
        "cutoff={}".format(self.cutoff),
        "l_max={}".format(self.l_max),
        "n_max={}".format(self.n_max),
        "atom_sigma={}".format(self.atom_sigma),
        "n_Z={}".format(n_species),
        "Z=" + species,
        "n_species={}".format(n_species),
        "species_Z=" + species,
    ]
    exe_command = [
        'quip',
        'atoms_filename={}'.format(atoms_filename),
        "descriptor_str={" + ' '.join(descriptor_command) + "}",
    ]

    with ScratchDir('.'):
        atoms_filename = self.operator.write_cfgs(
            filename=atoms_filename, cfg_pool=pool_from([structure]))

        descriptor_output = 'output'
        # The output is parsed below, so it must be closed before reading.
        with open(descriptor_output, 'w') as out_f:
            p = subprocess.Popen(exe_command, stdout=out_f)
            p.communicate()
        rc = p.returncode
        if rc != 0:
            error_msg = 'QUIP exited with return code %d' % rc
            # stdout was redirected to the output file, so communicate()
            # returns None for it; take the message from the file.
            with open(descriptor_output) as f:
                msg = f.read().split('\n')[:-1]
            error_rows = [i for i, m in enumerate(msg)
                          if m.startswith('ERROR')]
            if error_rows:
                error_msg += ', '.join(msg[error_rows[0]:])
            elif msg:
                error_msg += msg[-1]
            raise RuntimeError(error_msg)

        with zopen(descriptor_output, 'rt') as f:
            lines = f.read()

        descriptor_pattern = re.compile('DESC(.*?)\n', re.S)
        # Builtin float is what the removed np.float alias pointed to.
        descriptors = pd.DataFrame([
            np.array(c.split(), dtype=float)
            for c in descriptor_pattern.findall(lines)
        ])

    return descriptors