import pickle

from sklearn.metrics import mean_squared_error


def model_evaluation(model_file, data_file, file_plot_path, file_data_path):
    # Load the pickled model
    with open(model_file, 'rb') as model_pkl:
        model = pickle.load(model_pkl)

    # try:
    #     model.pca
    #     pca = PCA(n_components=params['n_components'])
    #     del params['n_components']
    #     X = pca.fit(X).transform(X)
    # except AttributeError:
    #     pass

    # Load data
    n_cols, weeks, y, X = load_data(filename=data_file)

    # Get stats from the prediction
    scores, mean, std_dev = stats(X, y, model)

    # Predict and save results and metrics
    y_true, y_pred = y, model.predict(X)
    mse = mean_squared_error(y_true, y_pred)
    print("The MSE of the prediction is {}".format(mse))

    my_plotter = PlotData()
    my_plotter.generate_curve_plot(weeks, y_true, y_pred)
    my_plotter.save_plot_result(file_plot_path)
    save_data(file_data_path, weeks, y_true, y_pred)
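# The `stats` helper is not shown here. Its (scores, mean, std_dev) return
# value suggests cross-validation, so below is a minimal sketch under that
# assumption; the signature, the `cv` parameter, and the use of
# cross_val_score are guesses, not the project's actual implementation.
from sklearn.model_selection import cross_val_score

def stats(X, y, model, cv=5):
    # Score the model with k-fold cross-validation and summarize the folds
    scores = cross_val_score(model, X, y, cv=cv)
    return scores, scores.mean(), scores.std()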
def compress(name, k=8, force_update=False):
    """Front-code a sorted dictionary in blocks of k terms."""
    result = []
    block_list = []
    elements_file_name = name + '.pickle'
    compressed_elements_file_name = name + '_compressed.pickle'

    if not force_update and os.path.exists(compressed_elements_file_name):
        return load_data(compressed_elements_file_name)

    elements = load_data(elements_file_name)
    for i in range(0, len(elements), k):
        # The first term of each block is stored in full
        block_list.append(
            (0,
             elements[i].term,
             elements[i].count,
             elements[i].posting_lists)
        )
        last_index = len(elements) - i
        for bi in range(1, min(k, last_index)):
            # Every following term stores only the suffix left after the
            # prefix it shares with the previous term
            c = compare(elements[i + bi - 1].term, elements[i + bi].term)
            block_list.append(
                (c,
                 elements[i + bi].term[c:],
                 elements[i + bi].count,
                 elements[i + bi].posting_lists)
            )
        result.append(block_list)
        block_list = []

    save_data(result, compressed_elements_file_name)
    return result
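# `compare` is not defined in this snippet. The front-coding scheme above
# needs the length of the longest common prefix of two adjacent terms, so a
# minimal sketch consistent with that usage (an assumption, not the project's
# actual helper) would be:
def compare(a, b):
    # Walk both strings until the first mismatching character
    n = min(len(a), len(b))
    for i in range(n):
        if a[i] != b[i]:
            return i
    return n

# e.g. compare("automate", "automatic") == 7, so "automatic" is stored as
# (7, "ic", count, posting_lists) inside its block.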
def get_index(folder_name, force_update=False):
    index_file_name = folder_name + '.pickle'
    if not force_update and os.path.exists(index_file_name):
        return load_data(index_file_name)

    # Tokenize every document and tag each token with its document id
    elements = []
    documents = get_file_list(folder_name)
    for doc_id in range(len(documents)):
        elements += map(lambda x: Element(x, doc_id),
                        get_tokens(documents[doc_id]))
    elements.sort()

    # Merge runs of equal terms into a single index entry
    result = []
    for el in elements:
        if result and result[-1] == el:
            result[-1].update(el)
        else:
            result.append(el)

    save_data(result, name=index_file_name)
    return result
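# Minimal sketch of the `Element` class that get_index and compress rely on.
# The real class is not shown; the attribute names, comparison semantics, and
# merge behaviour below are assumptions inferred from how the two functions
# use it.
class Element:
    def __init__(self, term, doc_id):
        self.term = term
        self.count = 1
        self.posting_lists = [doc_id]

    def __lt__(self, other):
        # Sorting by term groups equal terms together for the merge pass
        return self.term < other.term

    def __eq__(self, other):
        return self.term == other.term

    def update(self, other):
        # Merge an equal term: accumulate its count, append unseen doc ids
        self.count += other.count
        for doc_id in other.posting_lists:
            if not self.posting_lists or self.posting_lists[-1] != doc_id:
                self.posting_lists.append(doc_id)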
def train_model(text):
    # CLI version, kept disabled in favour of the hard-coded dict below:
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--max-epochs', type=int, default=10)
    # parser.add_argument('--batch-size', type=int, default=256)
    # parser.add_argument('--sequence-length', type=int, default=4)
    # args = parser.parse_args()
    args = {
        'max_epochs': 5,
        'batch_size': 256,
        'sequence_length': 5,
        'max_len': 100,
    }

    tokenized = text.apply(tokenize_text)
    dataset = Dataset(tokenized, **args)
    model = Model(dataset)
    train(dataset, model, args)

    # model.index_to_word = dataset.index_to_word
    # model.word_to_index = dataset.word_to_index
    from config import DATA_DIR
    save_data(model, DATA_DIR / 'models' / 'massage_model.pickle')
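# Hypothetical driver for train_model: the .apply(tokenize_text) call above
# implies `text` is a pandas Series of raw strings. The sample sentences are
# placeholders, and tokenize_text/Dataset/Model/train are assumed importable.
import pandas as pd

messages = pd.Series(["hello there", "how are you", "see you soon"])
train_model(messages)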
"office_sqm_1000", "trc_sqm_1000", "cafe_count_1000_price_high", "mosque_count_1000", "cafe_count_1500_price_high", "mosque_count_1500", "cafe_count_2000_price_high", 'hospital_beds_raion' ]) data = data.fillna(-1) data = utils.convert_data_to_numeric(data) data = dimensionality_reduction.principal_components_analysis(6, data) return data if __name__ == '__main__': train_data = utils.load_data('../files/train.csv') test_data = utils.load_data('../files/test.csv') print('====================[TRAIN DATA]====================') train_data = first_iteration(train_data) count_na_values(train_data) print(train_data.describe()) print('====================[TEST DATA]====================') test_data = first_iteration(test_data) count_na_values(test_data) print(test_data.describe()) # print(train_data.head()) utils.save_data(train_data, 'clean_train.csv') utils.save_data(test_data, 'clean_test.csv') # graph_outliers(data)
def train(args, encoder, decoder, loader, epoch, optimizer_encoder,
          optimizer_decoder, outpath, is_train, device):
    epoch_total_loss = 0
    labels = []
    gen_imgs = []
    if args.compareFigs:
        original = []

    if is_train:
        encoder.train()
        decoder.train()
    # else:
    #     encoder.eval()
    #     decoder.eval()

    for i, batch in enumerate(loader, 0):
        X, Y = batch[0].to(device), batch[1]
        batch_gen_imgs = decoder(encoder(X), args)

        loss = ChamferLoss(device)
        batch_loss = loss(batch_gen_imgs, X)
        epoch_total_loss += batch_loss.item()

        # True if batch_loss has at least one NaN value
        if (batch_loss != batch_loss).any():
            raise RuntimeError('Batch loss is NaN!')

        # back prop
        if is_train:
            optimizer_encoder.zero_grad()
            optimizer_decoder.zero_grad()
            batch_loss.backward()
            optimizer_encoder.step()
            optimizer_decoder.step()
        #     print(f"epoch {epoch+1}, batch {i+1}/{len(loader)}, train_loss={batch_loss.item()}", end='\r', flush=True)
        # else:
        #     print(f"epoch {epoch+1}, batch {i+1}/{len(loader)}, valid_loss={batch_loss.item()}", end='\r', flush=True)

        # Save all generated images
        if args.save_figs and args.save_allFigs:
            labels.append(Y.cpu())
            gen_imgs.append(torch.tanh(batch_gen_imgs).cpu())
            if args.compareFigs:
                original.append(X.cpu())
        # Save only the last batch
        elif args.save_figs:
            if i == len(loader) - 1:
                labels.append(Y.cpu())
                gen_imgs.append(torch.tanh(batch_gen_imgs).cpu())
                if args.compareFigs:
                    original.append(X.cpu())

    # Save model
    if is_train:
        make_dir(f'{outpath}/weights_encoder')
        make_dir(f'{outpath}/weights_decoder')
        torch.save(encoder.state_dict(),
                   f"{outpath}/weights_encoder/epoch_{epoch+1}_encoder_weights.pth")
        torch.save(decoder.state_dict(),
                   f"{outpath}/weights_decoder/epoch_{epoch+1}_decoder_weights.pth")

    # Compute average loss
    epoch_avg_loss = epoch_total_loss / len(loader)
    save_data(epoch_avg_loss, "loss", epoch, is_train, outpath)

    for i in range(len(gen_imgs)):
        if args.compareFigs:
            save_gen_imgs(gen_imgs[i], labels[i], epoch, is_train, outpath,
                          originals=original[i].cpu())
        else:
            save_gen_imgs(gen_imgs[i], labels[i], epoch, is_train, outpath)

    return epoch_avg_loss, gen_imgs
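# The ChamferLoss class is defined elsewhere in the project. Assuming the
# encoder/decoder operate on point clouds of shape (batch, n_points, dims),
# which is the usual setting for a Chamfer loss, a minimal sketch of a
# symmetric Chamfer distance would look like this; it is an illustration,
# not the project's actual implementation.
import torch

class ChamferLoss(torch.nn.Module):
    def __init__(self, device):
        super().__init__()
        self.device = device

    def forward(self, p, q):
        # Pairwise squared distances between all points: (batch, n_p, n_q)
        diff = p.unsqueeze(2) - q.unsqueeze(1)
        dist = (diff ** 2).sum(-1)
        # Nearest-neighbour terms in both directions, averaged over the batch
        return dist.min(2).values.mean() + dist.min(1).values.mean()

# Note the (batch, n_p, n_q) distance tensor is O(n^2) in memory, which is
# fine for small clouds but worth replacing with a chunked or KD-tree-based
# nearest-neighbour search for large ones.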
def train_loop(args, encoder, decoder, train_loader, valid_loader,
               optimizer_encoder, optimizer_decoder, outpath, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    assert args.save_dir is not None, "Please specify the directory for saving the models!"
    make_dir(args.save_dir)

    train_avg_losses = []
    train_dts = []
    valid_avg_losses = []
    valid_dts = []

    for ep in range(args.num_epochs):
        if args.load_toTrain:
            epoch = args.load_epoch + ep + 1
        else:
            epoch = ep

        # Training
        start = time.time()
        train_avg_loss, train_gen_imgs = train(args, encoder, decoder, train_loader,
                                               epoch, optimizer_encoder, optimizer_decoder,
                                               outpath, is_train=True, device=device)
        train_dt = time.time() - start
        train_avg_losses.append(train_avg_loss)
        train_dts.append(train_dt)
        save_data(data=train_avg_loss, data_name="loss", epoch=epoch,
                  outpath=outpath, is_train=True)
        save_data(data=train_dt, data_name="dt", epoch=epoch,
                  outpath=outpath, is_train=True)

        # Validation
        start = time.time()
        valid_avg_loss, valid_gen_imgs = test(args, encoder, decoder, valid_loader,
                                              epoch, optimizer_encoder, optimizer_decoder,
                                              outpath, device=device)
        valid_dt = time.time() - start
        valid_avg_losses.append(valid_avg_loss)
        valid_dts.append(valid_dt)
        save_data(data=valid_avg_loss, data_name="loss", epoch=epoch,
                  outpath=outpath, is_train=False)
        save_data(data=valid_dt, data_name="dt", epoch=epoch,
                  outpath=outpath, is_train=False)

        total_epochs = args.num_epochs + (args.load_epoch if args.load_toTrain else 0)
        print(f'epoch={epoch+1}/{total_epochs}, '
              f'train_loss={train_avg_loss}, valid_loss={valid_avg_loss}, '
              f'dt={train_dt+valid_dt}')

        if (epoch > 0) and ((epoch + 1) % 10 == 0):
            plot_eval_results(args, (train_avg_losses, valid_avg_losses),
                              f"losses to {epoch+1}", outpath, global_data=False)

    # Save global data
    save_data(data=train_avg_losses, data_name="losses", epoch="global",
              outpath=outpath, is_train=True, global_data=True)
    save_data(data=train_dts, data_name="dts", epoch="global",
              outpath=outpath, is_train=True, global_data=True)
    save_data(data=valid_avg_losses, data_name="losses", epoch="global",
              outpath=outpath, is_train=False, global_data=True)
    save_data(data=valid_dts, data_name="dts", epoch="global",
              outpath=outpath, is_train=False, global_data=True)

    return train_avg_losses, valid_avg_losses, train_dts, valid_dts
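# Hypothetical driver for train_loop. The attribute names mirror the args.*
# accesses in train() and train_loop() above, but the values are placeholders,
# and encoder, decoder, train_loader, and valid_loader are assumed to be
# constructed already.
from types import SimpleNamespace
import torch

args = SimpleNamespace(
    num_epochs=20, save_dir='./results', load_toTrain=False, load_epoch=0,
    save_figs=True, save_allFigs=False, compareFigs=False,
)
optimizer_encoder = torch.optim.Adam(encoder.parameters(), lr=1e-4)
optimizer_decoder = torch.optim.Adam(decoder.parameters(), lr=1e-4)
train_loop(args, encoder, decoder, train_loader, valid_loader,
           optimizer_encoder, optimizer_decoder, outpath='./results')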
def save_output(self, output):
    save_data(output, self.output().path)
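# The self.output().path call reads like part of a Luigi task; a minimal
# sketch of the surrounding class is shown below as an assumption about the
# context, with a hypothetical target path.
import luigi

class ExampleTask(luigi.Task):
    def output(self):
        return luigi.LocalTarget('output.pickle')

    def save_output(self, output):
        # Persist the result at the path Luigi uses to check task completion
        save_data(output, self.output().path)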