def get_run_parameters(SB):  # GET MISCELLANEOUS RUN PARAMETERS
    SB['src_path'] = path.dirname(path.realpath(__file__))
    SB['run_path'] = writer.run_path
    SB['start_time'] = writer.start_time
    writer.log("RUN PARAMETERS:")
    writer.log_dict(SB)
def read_pot_file(SB):
    if (SB['pot_type'] == "NN"):
        file_path = SB['pot_file']
        writer.log("READING NEURAL NETWORK FILE:")
        if (path.exists(file_path)):
            lines = []
            with open(file_path, "r") as input_file:
                for line in input_file:
                    parts = line.strip().split()  # strip trailing newline and tokenize
                    if (len(parts) == 1):
                        parts = parts[0]  # don't save single numbers as arrays
                    if (parts != []):
                        lines.append(parts)
            pot = neural.NN(lines, SB)  # send lines to the NN class to create the NN object
        else:
            raise ValueError("NN_FILE=" + str(file_path) + " DOESN'T EXIST")
        writer.log_dict(pot.info)
        SB.update(pot.info)
        SB['nn'] = pot
def read_input(SB):
    # READ DEFAULT FILE
    file_path = SB['src_path'] + '/defaults.json'
    if (path.exists(file_path)):
        writer.log(["READING DEFAULT PARAMETERS USING FILE:", file_path])
        with open(file_path, "r") as read_file:
            input_data = load(read_file)
        writer.log_dict(input_data)
        SB.update(input_data)
    else:
        raise ValueError("DEFAULT_FILE=" + str(file_path) + " DOESN'T EXIST")

    # READ INPUT FILE
    file_path = SB['input_file']
    if (path.exists(file_path)):
        writer.log(["OVERWRITING SELECT DEFAULTS USING INPUT FILE:", SB['input_file']])
        with open(file_path, "r") as read_file:  # READ USER INPUT FILE
            input_data = load(read_file)
        writer.log_dict(input_data)
        SB.update(input_data)
    else:
        raise ValueError("INPUT_FILE=" + str(file_path) + " DOESN'T EXIST")

    if (SB['use_cuda'] and not cuda.is_available()):
        writer.log("NOTE: CUDA IS NOT AVAILABLE (RE-SETTING)")
        writer.log(" use_cuda : False")
        SB['use_cuda'] = False

    if ('pot_type' not in SB.keys() or 'pot_file' not in SB.keys()
            or 'dataset_path' not in SB.keys()):
        raise ValueError("INPUT FILE MUST CONTAIN THE FOLLOWING KEYS: pot_type, pot_file, dataset_path")

    if (SB['pot_type'] != "NN"):
        raise ValueError("REQUESTED POT_TYPE=" + str(SB['pot_type']) + " IS NOT AVAILABLE")
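# A minimal sketch (hypothetical file names and values) of an input file that would
# pass the checks in read_input() above: pot_type, pot_file and dataset_path are
# required, pot_type must be "NN", and any key that is omitted keeps the value read
# from defaults.json. Illustration only; not called by the fitting code.
def write_example_input(file_name="input_example.json"):
    import json
    example = {
        "pot_type": "NN",                      # only "NN" is accepted above
        "pot_file": "potentials/example.nn",   # hypothetical path to the NN file
        "dataset_path": "data/example.dat",    # hypothetical path to the dataset
        "use_cuda": False                      # optional override of defaults.json
    }
    with open(file_name, "w") as f:
        json.dump(example, f, indent=4)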
def read_database(SB):
    file_path = SB['dataset_path']
    writer.log("READING DATASET FILE:")
    full_set = data.Dataset("full", SB)  # INITIALIZE FULL DATASET OBJECT
    SID = 0
    new_structure = True
    if (path.exists(file_path)):
        with open(file_path, "r") as input_file:
            for line in input_file:
                if (new_structure):
                    lines = []
                    new_structure = False
                    counter = 1
                    full_set.Ns += 1
                else:
                    counter += 1
                parts = line.strip().split()
                if (len(parts) == 1):
                    parts = parts[0]  # don't save single numbers as arrays
                lines.append(parts)
                # TODO: THIS IS HARDCODED FOR THE CURRENT POSCAR FORMAT (NEEDS TO BE GENERALIZED)
                if (counter == 6):
                    Natom = int(parts)
                if (counter > 6):
                    if (counter == 6 + 1 + Natom + 1):  # see dataset examples for format
                        full_set.Na += Natom
                        full_set.structures[SID] = data.Structure(lines, SID, SB)  # create structure object
                        GID = str(full_set.structures[SID].gid)
                        if (GID not in full_set.group_sids.keys()):
                            full_set.group_sids[GID] = []
                        full_set.group_sids[GID].append([full_set.structures[SID].v, SID])
                        new_structure = True
                        SID += 1
    else:
        raise ValueError("DATABASE_FILE=" + str(file_path) + " DOESN'T EXIST")
    full_set.sort_group_sids()
    writer.log([" TOTAL NUMBER OF STRUCTURES:", full_set.Ns])
    writer.log([" TOTAL NUMBER OF ATOMS:     ", full_set.Na])
    SB['full_set'] = full_set
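# Sketch of the block bookkeeping in read_database() above. With the current
# hard-coded POSCAR-like layout, line 6 of each block carries the atom count and a
# block is complete after 6 + 1 + Natom + 1 lines (see the dataset examples for what
# the individual lines contain). The helper below only illustrates that arithmetic.
def expected_block_length(Natom):
    return 6 + 1 + Natom + 1  # e.g. a 4-atom structure occupies 12 lines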
def add_neurons(self):  # add N neurons to each hidden layer
    if (self.info['activation'] != 1):
        raise ValueError("ERROR: CAN ONLY ADD NEURONS TO SHIFTED SIGMOID FUNCTION")

    # START FRESH EVERY TIME
    start_fresh = self.info['start_fresh']
    if (start_fresh):
        self.randomize()
        max_rand_wb = self.info['max_rand_wb']
    else:
        max_rand_wb = 1.0

    self.unset_grad()
    new_nfit = 0
    N_neuron_2_add = 2

    writer.log("ADDING " + str(N_neuron_2_add) + " NEURONS TO EACH LAYER")
    writer.log([" original num_fit_param =", self.info['num_fit_param']])

    for layer_add in range(1, len(self.info['nn_layers']) - 1):
        for neurons in range(0, N_neuron_2_add):
            for i in range(0, len(self.submatrices)):
                if (layer_add == (i + 2.0) / 2):  # i is the weight matrix feeding hidden layer layer_add
                    # ADD ROW (WEIGHT MATRIX)
                    shp2 = self.submatrices[i].shape[1]
                    TMP = max_rand_wb * torch.empty(1, shp2).uniform_(-1.0, 1.0)
                    self.submatrices[i] = torch.cat((self.submatrices[i], TMP))
                    # ADD BIAS
                    shp2 = self.submatrices[i + 1].shape[1]
                    TMP = max_rand_wb * torch.empty(1, shp2).uniform_(-1.0, 1.0)
                    self.submatrices[i + 1] = torch.cat((self.submatrices[i + 1], TMP))
                    # ADD COL (NEXT WEIGHT MATRIX)
                    shp1 = self.submatrices[i + 2].shape[0]
                    TMP = max_rand_wb * torch.empty(shp1, 1).uniform_(-1.0, 1.0)
                    self.submatrices[i + 2] = torch.cat((self.submatrices[i + 2], TMP), 1)
                    self.info['nn_layers'][layer_add] = self.info['nn_layers'][layer_add] + 1

    # COUNT NFIT
    for i in range(0, len(self.submatrices)):
        new_nfit += self.submatrices[i].shape[0] * self.submatrices[i].shape[1]
    self.info['num_fit_param'] = new_nfit
    writer.log([" new num_fit_param =", new_nfit])
    self.set_grad()
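# Standalone shape check (illustration only, with made-up layer sizes) for the
# concatenation pattern used in add_neurons() above: growing a hidden layer by one
# neuron appends a row to its weight matrix and bias and a column to the next
# weight matrix.
def _demo_add_neuron_shapes():
    import torch
    W1 = torch.zeros(20, 60)  # weights into a 20-neuron hidden layer
    b1 = torch.zeros(20, 1)   # its bias column
    W2 = torch.zeros(1, 20)   # weights out of that layer
    W1 = torch.cat((W1, torch.empty(1, W1.shape[1]).uniform_(-1.0, 1.0)))     # add row
    b1 = torch.cat((b1, torch.empty(1, b1.shape[1]).uniform_(-1.0, 1.0)))     # add bias entry
    W2 = torch.cat((W2, torch.empty(W2.shape[0], 1).uniform_(-1.0, 1.0)), 1)  # add column
    assert W1.shape == (21, 60) and b1.shape == (21, 1) and W2.shape == (1, 21)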
def __init__(self, lines, SB):
    info = {}
    info['lsp_type'] = int(lines[0][0])
    info['pot_type'] = str(SB['pot_type'])
    info['lsp_shift'] = float(lines[0][1])
    info['activation'] = int(lines[0][2])
    info['num_species'] = int(lines[1][0])
    info['species'] = str(lines[2][0])
    info['atomic_weight'] = float(lines[2][1])
    info['randomize_nn'] = bool(int(lines[3][0]))
    info['max_rand_wb'] = float(lines[3][1])
    info['cutoff_dist'] = float(lines[3][2])
    info['cutoff_range'] = float(lines[3][3])
    info['lsp_sigma'] = float(lines[3][4])
    info['N_lg_poly'] = int(lines[4][0])
    info['lsp_lg_poly'] = list(map(int, lines[4][1:]))   # map converts str list to int list
    info['N_ro_val'] = int(lines[5][0])
    info['lsp_ro_val'] = list(map(float, lines[5][1:]))  # map converts str list to float list
    info['ibaseline'] = bool(int(lines[6][0]))
    info['bop_param'] = list(map(float, lines[6][1:]))
    info['nn_layers'] = list(map(int, lines[7][1:]))

    info['cnst_final_bias'] = SB['cnst_final_bias']
    info['final_bias'] = SB['final_bias']
    info['start_fresh'] = SB['start_fresh']
    info['constrain_WB'] = SB['constrain_WB']

    # DETERMINE NUMBER OF FITTING PARAMETERS AND RANDOMIZE IF NEEDED
    nfit = 0
    layers = info['nn_layers']
    for i in range(1, len(layers)):
        nfit = nfit + layers[i - 1] * layers[i] + layers[i]
    info['num_fit_param'] = nfit

    self.info = info
    self.normalize_by_ro = SB["normalize_by_ro"]
    self.dtype = torch.FloatTensor
    if (SB['use_cuda']):
        self.dtype = torch.cuda.FloatTensor

    if (info['randomize_nn'] or SB['re_randomize']):
        writer.log([" RANDOMIZING NN MIN/MAX =", info['max_rand_wb']])
        self.randomize()
    else:
        # always do LR ramp up when re-starting
        SB['ramp_LR'] = True
        WB = np.array(lines[8:]).astype(float)[:, 0]  # np.float is deprecated; use float
        if (len(WB) != info['num_fit_param']):
            raise ValueError("INCORRECT NUMBER OF FITTING PARAMETERS")
        self.submatrices = self.extract_submatrices(WB)

    # SOME ERROR CHECKS
    if (info['num_species'] != 1):
        raise ValueError("NUM_SPECIES != 1 IN NEURAL NETWORK FILE")
    if (len(info['nn_layers']) != int(lines[7][0])):
        raise ValueError("NUMBER OF LAYERS IN NEURAL NETWORK FILE IS INCORRECT")
    if (int(lines[0][0]) not in [5, 6, 7, 20]):
        raise ValueError("REQUESTED LSP_TYPE=" + str(int(lines[0][0])) + " NOT AVAILABLE")
    if (info['pot_type'] == 'PINN_BOP' and info['nn_layers'][-1] != 8):
        raise ValueError("ERROR: NN OUTPUT DIMENSION INCORRECT")
    if (info['pot_type'] == 'NN' and info['nn_layers'][-1] != 1):
        raise ValueError("ERROR: NN OUTPUT DIMENSION INCORRECT")
    if (info['N_ro_val'] != len(info['lsp_ro_val'])):
        raise ValueError("ERROR: N_ro_val != len(lsp_ro_val)")
    if (info['N_lg_poly'] != len(info['lsp_lg_poly'])):
        raise ValueError("ERROR: N_lg_poly != len(lsp_lg_poly)")
    if (info['nn_layers'][0] != len(info['lsp_ro_val']) * len(info['lsp_lg_poly'])):
        raise ValueError("ERROR: NN INPUT DIMENSION INCORRECT FOR Gi CHOICE")
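# Worked example (hypothetical layer sizes) of the num_fit_param count in __init__
# above: layer i contributes nn_layers[i-1]*nn_layers[i] weights plus nn_layers[i]
# biases.
def _count_fit_param(nn_layers):
    nfit = 0
    for i in range(1, len(nn_layers)):
        nfit += nn_layers[i - 1] * nn_layers[i] + nn_layers[i]
    return nfit

# _count_fit_param([60, 20, 20, 1]) == (60*20 + 20) + (20*20 + 20) + (20*1 + 1) == 1661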
optimizer = optim.LBFGS(SB['nn'].submatrices,
                        max_iter=SB['lbfgs_max_iter'],
                        lr=SB['LR_f'])
set_optim()

def closure():
    global loss, OBE1, OBL1, OBLP, OB_DU, rmse, OBT
    optimizer.zero_grad()
    loss = 0.0
    [rmse, OBE1, OB_DU, OBL1, OBLP] = training_set.compute_objective(SB)
    loss = OBE1 + OB_DU + OBL1 + OBLP
    loss.backward()
    OBE1 = OBE1.item(); OB_DU = OB_DU.item(); OBLP = OBLP.item()
    OBL1 = OBL1.item(); OBT = loss.item()
    return loss

# OPTIMIZATION LOOP
start = time()
writer.log('STARTING FITTING LOOP:')
writer.log([" INITIAL LR:", '%10.7s' % str(optimizer.param_groups[0]['lr'])])
N_TRY = 1
while (t < max_iter):
    optimizer.step(closure)
    if (SB['ramp_LR']):
        scheduler.step()  # ADJUST LR

    # CHECK CONVERGENCE
    if (str(OBE1) == 'nan' or rmse > 1000000):  # START OVER
        writer.log("NOTE: THE OBJ FUNCTION BLEW UP (STARTING OVER; MAYBE TRY A SMALLER LR)")
        SB['nn'].unset_grad(); SB['nn'].randomize(); set_optim(); N_TRY = N_TRY + 1

    delta1 = abs(rmse_m1 - rmse)  # convergence deltas against stored rmse values
    delta2 = abs(rmse_m2 - rmse)
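# The loop above assumes set_optim() also builds `scheduler` whenever SB['ramp_LR']
# is set; that definition is not shown in this fragment. A stand-in using PyTorch's
# built-in schedulers could look like the sketch below (ExponentialLR and the gamma
# value are assumptions, not the code's actual settings; gamma > 1 ramps the LR up).
def _example_make_scheduler(optimizer):
    from torch.optim import lr_scheduler
    return lr_scheduler.ExponentialLR(optimizer, gamma=1.05)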
def partition_data(SB):
    test_set = data.Dataset("test", SB)            # INITIALIZE DATASET OBJECT
    training_set = data.Dataset("train", SB)       # INITIALIZE DATASET OBJECT
    validation_set = data.Dataset("validate", SB)  # INITIALIZE DATASET OBJECT
    no_dft_set = data.Dataset("no_dft", SB)        # INITIALIZE DATASET OBJECT

    writer.log("PARTITIONING DATA:")
    writer.log([" TOTAL NUMBER OF GROUPS=", len(SB['full_set'].group_sids.keys())])

    fraction_train = SB['fraction_train']
    train_edges = SB['train_edges']

    # ERROR CHECKS
    if (fraction_train == 0):
        raise ValueError("FRACTION_TRAIN=0 (CAN'T TRAIN WITHOUT TRAINING DATA)")
    if (fraction_train < 0 or fraction_train > 1):
        raise ValueError("BAD VALUE FOR FRACTION_TRAIN (I.E. FRACTION_TRAIN<0 OR FRACTION_TRAIN>1)")
    if (SB['n_rand_GIDS'] >= len(SB['full_set'].group_sids.keys())):
        ERR = "N_RAND_GIDS IS LARGER THAN TOTAL NUMBER OF GIDS: USE N_RAND_GIDS<" \
              + str(len(SB['full_set'].group_sids.keys()))
        raise ValueError(ERR)

    # -------------------------------------
    # TEST SET (EXTRAPOLATION)
    # -------------------------------------
    if (SB['fix_rand_seed']):
        random.seed(a=412122, version=2)  # SAME RANDOM TEST SET EVERY TIME

    SB['test_set_gids'] = []

    # COLLECT GIDS FOR TEST SET
    if (SB['n_rand_GIDS'] != 0):
        k = 1
        while (k <= SB['n_rand_GIDS']):
            rand_GID = random.choice(list(SB['full_set'].group_sids.keys()))
            keep = True  # skip GIDs matching any exclude_from_test tag
            for i1 in SB['exclude_from_test']:
                if (i1 in rand_GID):
                    keep = False
                    break
            if (rand_GID not in SB['test_set_gids'] and keep and rand_GID != "NO_DFT"):
                SB['test_set_gids'].append(rand_GID)
                k = k + 1

    for key in SB['test_set_tags']:
        for GID in SB['full_set'].group_sids.keys():
            keep = True  # skip GIDs matching any exclude_from_test tag
            for i1 in SB['exclude_from_test']:
                if (i1 in GID):
                    keep = False
                    break
            if (key in SB['test_set_gids']):
                keep = False
            if (key in GID and keep and GID != "NO_DFT"):
                SB['test_set_gids'].append(GID)

    writer.log(" TEST SET (UNTRAINED):")

    # COLLECT STRUCTURES FOR TEST SET
    for GID in SB['full_set'].group_sids.keys():
        if (GID in SB['test_set_gids']):
            writer.log([" GID : ", GID])
            for SID in SB['full_set'].group_sids[GID]:
                test_set.structures[SID] = SB['full_set'].structures[SID]
                test_set.Ns += 1
                test_set.Na += SB['full_set'].structures[SID].N

    # EXTRAPOLATION SET (STRUCTURES WITHOUT DFT DATA)
    if ("NO_DFT" in SB['full_set'].group_sids.keys()):
        for SID in SB['full_set'].group_sids["NO_DFT"]:
            no_dft_set.structures[SID] = SB['full_set'].structures[SID]
            no_dft_set.Ns += 1
            no_dft_set.Na += SB['full_set'].structures[SID].N

    # COLLECT WHAT'S LEFT (used for train + validation)
    remainder = []
    for SID in SB['full_set'].structures.keys():
        if (SID not in test_set.structures.keys()
                and SID not in no_dft_set.structures.keys()):
            remainder.append(SID)

    # -------------------------------------
    # TRAIN + VALIDATION SET (TRAINING + INTERPOLATION)
    # -------------------------------------
    # TRAINING SIDS (LIST OF DICTIONARY KEYS)
    train_indices = np.random.choice(len(remainder),
                                     int(fraction_train * len(remainder)),
                                     replace=False).tolist()  # indices of training structures
    for i in train_indices:
        training_set.structures[remainder[i]] = SB['full_set'].structures[remainder[i]]
        training_set.Ns += 1
        training_set.Na += SB['full_set'].structures[remainder[i]].N

    # ADD MIN/MAX VOLUME STRUCTURES IN EACH GROUP TO TRAINING SET
    if (train_edges):
        sid_2_add = []
        for i in SB['full_set'].group_sids.values():
            if (len(i) > 4):  # already sorted by volume
                sid_2_add.append(i[0]); sid_2_add.append(i[1])
                sid_2_add.append(i[-2]); sid_2_add.append(i[-1])
        for SID in sid_2_add:
            if (SID not in training_set.structures.keys()
                    and SID not in test_set.structures.keys()
                    and SID not in no_dft_set.structures.keys()):
                training_set.structures[SID] = SB['full_set'].structures[SID]
                training_set.Ns += 1
                training_set.Na += SB['full_set'].structures[SID].N

    # VALIDATION SIDS
    for SID in remainder:
        if (SID not in training_set.structures.keys()):
            validation_set.structures[SID] = SB['full_set'].structures[SID]
            validation_set.Ns += 1
            validation_set.Na += SB['full_set'].structures[SID].N

    writer.log([" N_train_structures : ", training_set.Ns])
    writer.log([" N_val_structures   : ", validation_set.Ns])
    writer.log([" N_test_structures  : ", test_set.Ns])
    writer.log([" N_combined         : ", training_set.Ns + test_set.Ns + validation_set.Ns])

    test_set.build_arrays(SB)
    training_set.build_arrays(SB)
    validation_set.build_arrays(SB)

    SB['training_set'] = training_set
    SB['datasets'] = ['training_set']
    if (validation_set.Ns > 0):
        SB['validation_set'] = validation_set
        SB['datasets'].append('validation_set')
    if (test_set.Ns > 0):
        SB['test_set'] = test_set
        SB['datasets'].append('test_set')
    if ("NO_DFT" in SB['full_set'].group_sids.keys()):
        no_dft_set.build_arrays(SB)
        SB['no_dft_set'] = no_dft_set
        SB['datasets'].append('no_dft_set')
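# Consistency check (sketch, restoring the intent of a sanity check that was
# commented out in the original source): every structure in the full set should land
# in exactly one of the four partitions built by partition_data().
def _check_partition(full_set, training_set, validation_set, test_set, no_dft_set):
    n_split = training_set.Ns + validation_set.Ns + test_set.Ns + no_dft_set.Ns
    if (full_set.Ns != n_split):
        raise ValueError("LOST A STRUCTURE IN DATA PARTITIONING")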
def compute_all_lsps(SB):
    writer.log(["COMPUTING LOCAL STRUCTURE PARAMETERS (LSP):"])
    start = time.time()
    for structure in SB['full_set'].structures.values():
        structure.compute_lsp(SB)
    writer.log([" LSP CONSTRUCTION TIME (SEC) =", time.time() - start])
def compute_all_nbls(SB):
    writer.log(["COMPUTING NEIGHBOR LIST (NBL):"])
    start = time.time()
    for structure in SB['full_set'].structures.values():
        structure.compute_nbl(SB)
    writer.log([" NBL CONSTRUCTION TIME (SEC) =", time.time() - start])