def init_dataset(cls, goal_object):
    """
    :return: a DatasetBuilder declaring the per-step variables (positions,
        angles, wheel target speeds, scanner readings, goal metrics) and
        their coordinates.
    """
    return DatasetBuilder(
        {
            "name": (),
            "initial_position": ("axis"),
            "initial_angle": (),
            "goal_position": ("axis"),
            "goal_angle": (),
            "position": ("axis"),
            "angle": (),
            "wheel_target_speeds": ("wheel"),
            "scanner_distances": ("scanner_angle"),
            "scanner_image": ("scanner_angle", "channel"),
            "goal_reached": (),
            "goal_position_distance": (),
            "goal_angle_distance": ()
        },
        coords={
            # TODO: run and step might be converted to a MultiIndex, making it
            # possible to directly use them for indexing
            "run": (),
            "step": (),
            "axis": (..., ["x", "y"]),
            "channel": (..., ["r", "g", "b"]),
            "wheel": (..., ["l", "r"]),
            "scanner_angle": (..., np.linspace(-np.pi, np.pi, 180))
        },
        attrs={"goal_object": goal_object})
def get_training_dataset(audio_params, audio_adapter, audio_path):
    """ Builds training dataset.

    :param audio_params: Audio parameters.
    :param audio_adapter: Adapter to load audio from.
    :param audio_path: Path of directory containing audio.
    :returns: Built dataset.
    """
    builder = DatasetBuilder(
        audio_params,
        audio_adapter,
        audio_path,
        chunk_duration=audio_params.get('chunk_duration', 20.0),
        random_seed=audio_params.get('random_seed', 0))
    return builder.build(
        audio_params.get('train_csv'),
        cache_directory=audio_params.get('training_cache'),
        batch_size=audio_params.get('batch_size'),
        n_chunks_per_song=audio_params.get('n_chunks_per_song', 1),
        random_data_augmentation=False,
        convert_to_uint=True,
        wait_for_cache=False)
def get_validation_dataset(audio_params, audio_adapter, audio_path):
    """ Builds validation dataset.

    :param audio_params: Audio parameters.
    :param audio_adapter: Adapter to load audio from.
    :param audio_path: Path of directory containing audio.
    :returns: Built dataset.
    """
    builder = DatasetBuilder(
        audio_params,
        audio_adapter,
        audio_path,
        chunk_duration=20.0)
    return builder.build(
        audio_params.get('validation_csv'),
        batch_size=100,
        cache_directory=audio_params.get('validation_cache'),
        convert_to_uint=True,
        infinite_generator=False,
        n_chunks_per_song=1,
        # should not perform data augmentation for eval:
        random_data_augmentation=False,
        random_time_crop=False,
        shuffle=False,
    )
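# Hedged usage sketch (not from the original source): how the two builders above
# might be wired together. The config loading, the `load_datasets` name, and the
# assumption that `audio_params` comes from a JSON file are illustrative only.
import json

def load_datasets(config_path, audio_adapter, audio_path):
    # Load audio parameters from a JSON config (hypothetical file layout),
    # then build the training and validation datasets with the helpers above.
    with open(config_path) as stream:
        audio_params = json.load(stream)
    train_ds = get_training_dataset(audio_params, audio_adapter, audio_path)
    val_ds = get_validation_dataset(audio_params, audio_adapter, audio_path)
    return train_ds, val_ds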
def _buildItems(self):
    DatasetBuilder().buildDataset(configuration.itemdir)

def _buildDialogs(self):
    DatasetBuilder().buildDataset(configuration.dialogdir)

def _buildScripts(self):
    DatasetBuilder().buildDataset(configuration.scriptdir)

def _buildTilesets(self):
    DatasetBuilder().buildDataset(configuration.tilesetdir)

def _buildSpritesets(self):
    DatasetBuilder().buildDataset(configuration.spritesetdir)
import pandas as pd
import numpy as np
from dataset import DatasetBuilder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import seaborn as sns
from matplotlib import pyplot as plt
import json

# predict price variations on an altcoin based on btc, and some other altcoins
#
if __name__ == '__main__':
    db = DatasetBuilder()
    target = 'ETH'
    symbols = {
        'ADA': "ADA",
        'BCH': "BCH",
        'BNB': "BNB",
        'BTC': "BTC",
        'BTG': "BTG",
        'DASH': "DASH",
        'DOGE': "DOGE",
        'EOS': "EOS",
        'ETC': "ETC",
        'ETH': "ETH",
        'IOT': "MIOTA",
        'LINK': "LINK",
        'LTC': "LTC",
        'NEO': "NEO",
        'QTUM': "QTUM",
        'TRX': "TRX",
        'USDT': "USDT",
                    help='Image width, this parameter will affect the output '
                         'shape of the model, default is 100, so this model '
                         'can only predict up to 24 characters.')
parser.add_argument('-b', '--batch_size', type=int, default=256,
                    help='Batch size.')
parser.add_argument('-m', '--model', type=str, required=True,
                    help='The saved model.')
parser.add_argument('--img_channels', type=int, default=1,
                    help='0: Use the number of channels in the image, '
                         '1: Grayscale image, 3: RGB image')
parser.add_argument('--ignore_case', action='store_true',
                    help='Whether to ignore case (default: false).')
args = parser.parse_args()

dataset_builder = DatasetBuilder(args.table_path, args.img_width,
                                 args.img_channels, args.ignore_case)
eval_ds, size = dataset_builder.build(args.ann_paths, False, args.batch_size)
print('Num of eval samples: {}'.format(size))
model = keras.models.load_model(args.model, compile=False)
model.compile(loss=CTCLoss(), metrics=[WordAccuracy()])
model.evaluate(eval_ds)
def train(dataset, args):
    on_gpu = torch.cuda.is_available()
    if on_gpu:
        print("Using gpu")

    # Loading dataset
    time_cutoff = None if args.time_cutoff == "None" else int(args.time_cutoff)
    dataset_builder = DatasetBuilder(dataset,
                                     only_binary=args.only_binary,
                                     features_to_consider=args.features,
                                     time_cutoff=time_cutoff,
                                     seed=args.seed)
    datasets = dataset_builder.create_dataset(
        standardize_features=args.standardize,
        on_gpu=on_gpu,
        oversampling_ratio=args.oversampling_ratio)

    train_data_loader = torch_geometric.data.DataLoader(
        datasets["train"], batch_size=args.batch_size, shuffle=True)
    val_data_loader = torch_geometric.data.DataLoader(
        datasets["val"], batch_size=args.batch_size, shuffle=True)
    test_data_loader = torch_geometric.data.DataLoader(
        datasets["test"], batch_size=args.batch_size, shuffle=True)

    print("Number of node features", dataset_builder.num_node_features)
    print("Dimension of hidden space", args.hidden_dim)

    # Setting up model
    model = GNNStack(dataset_builder.num_node_features, args.hidden_dim,
                     dataset_builder.num_classes, args)
    # model = GNNStack(dataset.num_node_features, 32, dataset.num_classes, args)
    if on_gpu:
        model.cuda()

    # Tensorboard logging
    log_dir = os.path.join("logs", args.exp_name)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    train_writer = SummaryWriter(os.path.join(log_dir, "train"))
    val_writer = SummaryWriter(os.path.join(log_dir, "val"))
    test_writer = SummaryWriter(os.path.join(log_dir, "test"))

    # CSV logging
    csv_logging = []

    # Checkpoints
    checkpoint_dir = os.path.join("checkpoints", args.exp_name)
    checkpoint_path = os.path.join(checkpoint_dir, "model.pt")
    if args.exp_name == "default" or not os.path.isfile(checkpoint_path):
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        epoch_ckp = 0
        global_step = 0
        best_val_acc = 0
    else:
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model_state_dict"])
        epoch_ckp = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        best_val_acc = checkpoint["best_val_acc"]
        print("Restoring previous model at epoch", epoch_ckp)

    # Training phase
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr,
                                  weight_decay=5e-4)
    for epoch in range(epoch_ckp, epoch_ckp + args.num_epochs):
        model.train()
        epoch_loss = 0
        for batch in train_data_loader:
            # print(batch)
            # import pdb; pdb.set_trace()
            optimizer.zero_grad()
            out = model(batch)
            loss = F.nll_loss(out, batch.y)
            epoch_loss += loss.sum().item()

            # Optimization
            loss.backward()
            optimizer.step()

            # TFBoard logging
            train_writer.add_scalar("loss", loss.mean(), global_step)
            global_step += 1

        print("epoch", epoch, "loss:", epoch_loss / len(train_data_loader))

        if epoch % 1 == 0:
            # Evaluation on the training set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in train_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            train_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            train_writer.add_scalar("Accuracy", train_acc, epoch)
            for i in range(dataset_builder.num_classes):
train_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) train_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Training accuracy: {:.4f}'.format(train_acc)) # Evaluation on the validation set model.eval() correct = 0 n_samples = 0 samples_per_label = np.zeros(dataset_builder.num_classes) pred_per_label = np.zeros(dataset_builder.num_classes) correct_per_label = np.zeros(dataset_builder.num_classes) with torch.no_grad(): for batch in val_data_loader: _, pred = model(batch).max(dim=1) correct += float(pred.eq(batch.y).sum().item()) for i in range(dataset_builder.num_classes): batch_i = batch.y.eq(i) pred_i = pred.eq(i) samples_per_label[i] += batch_i.sum().item() pred_per_label[i] += pred_i.sum().item() correct_per_label[i] += (batch_i * pred_i).sum().item() n_samples += len(batch.y) val_acc = correct / n_samples acc_per_label = correct_per_label / samples_per_label rec_per_label = correct_per_label / pred_per_label val_writer.add_scalar("Accuracy", val_acc, epoch) for i in range(dataset_builder.num_classes): val_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) val_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Validation accuracy: {:.4f}'.format(val_acc)) # Evaluation on the test set model.eval() correct = 0 n_samples = 0 samples_per_label = np.zeros(dataset_builder.num_classes) pred_per_label = np.zeros(dataset_builder.num_classes) correct_per_label = np.zeros(dataset_builder.num_classes) with torch.no_grad(): for batch in test_data_loader: _, pred = model(batch).max(dim=1) correct += float(pred.eq(batch.y).sum().item()) for i in range(dataset_builder.num_classes): batch_i = batch.y.eq(i) pred_i = pred.eq(i) samples_per_label[i] += batch_i.sum().item() pred_per_label[i] += pred_i.sum().item() correct_per_label[i] += (batch_i * pred_i).sum().item() n_samples += len(batch.y) test_acc = correct / n_samples acc_per_label = correct_per_label / samples_per_label rec_per_label = correct_per_label / pred_per_label test_writer.add_scalar("Accuracy", test_acc, epoch) for i in range(dataset_builder.num_classes): test_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) test_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Test accuracy: {:.4f}'.format(test_acc)) if val_acc > best_val_acc: best_val_acc = val_acc # Saving model if model is better checkpoint = { "epoch": epoch, "model_state_dict": model.state_dict(), "epoch_loss": epoch_loss / len(train_data_loader), "global_step": global_step, "best_val_acc": best_val_acc } torch.save(checkpoint, checkpoint_path) dict_logging = vars(args).copy() dict_logging["train_acc"] = train_acc dict_logging["val_acc"] = val_acc dict_logging["test_acc"] = test_acc csv_logging.append(dict_logging) csv_exists = os.path.exists("results.csv") header = dict_logging.keys() with open("results.csv", "a") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=header) if not csv_exists: writer.writeheader() for dict_ in csv_logging: writer.writerow(dict_) return
        dst_root_folder, retweet_user_size, seed
    )
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

    # ----------------------------------------------
    # Load dataset
    # ----------------------------------------------
    # load raw dataset
    raw_dataset = load_raw_dataset(DATA_ROOT_PATH)

    # build dataset with preprocessing
    # parameter setting
    data_builder = DatasetBuilder(raw_dataset, retweet_user_size)
    dataset, topic_index = data_builder.create_dataset()
    print('Topics in dataset: {}'.format(topic_index.keys()))
    print('Dataset size: {}'.format(len(dataset)))

    # raw_task: [[0:[t_ids],1:[t_ids]],...]
    raw_tasks = [topic_index[topic] for topic in topic_index.keys()]
    task_sizes = []
    print("scaled task distribution:")
    for task in raw_tasks:
        print([len(task[key]) for key in task.keys()])
        task_sizes.append(sum([len(task[key]) for key in task.keys()]))
    task_sizes = np.array(task_sizes)

    # split tasks in the dataset for training and testing
    idxs = np.arange(0, len(raw_tasks))
# "epoch_loss": args.batch_size * epoch_loss / len(train_loader), # "global_step": global_step # } # torch.save(checkpoint, checkpoint_path) print("epoch", epoch, "loss:", epoch_loss / len(train_loader)) return max_running_mean if __name__ == "__main__": args = parser.parse_args() # Loading dataset dataset_builder = DatasetBuilder(args.dataset, only_binary=True, time_cutoff=1500) full_dataset = dataset_builder.create_dataset(dataset_type="sequential", standardize_features=False) val_dataset = full_dataset['val'] if args.debug: train_dataset = val_dataset else: train_dataset = full_dataset['train'] train_dataset = seq_data_to_dataset(train_dataset, cap_len=args.cap_len, num_features=11, standardize=True) val_dataset = seq_data_to_dataset(val_dataset,
    # task = sample_task_from_raw_task(raw_task, support_shots, query_shots)
    # tasks.append(task)
    print("Tasks len:", len(raw_tasks))

    if os.path.exists(DATA_DIR + "/" + datasource +
                      "/id_index/processed_dataset_{}.txt".format(retweet_user_size)):
        with open(DATA_DIR + "/" + datasource +
                  "/id_index/processed_dataset_{}.txt".format(retweet_user_size),
                  "r") as f:
            dataset = f.read()
            dataset = eval(dataset)
        vocab_size = get_vocab_size(datasource)
    else:
        data_builder = DatasetBuilder(datasource, time_cutoff=None,
                                      only_binary=True)
        dataset = data_builder.create_dataset(dataset_type="id_index",
                                              standardize_features=True)
        vocab_size = data_builder.get_vocab_size()
        np.set_printoptions(threshold=1e6)
        with open(DATA_DIR + "/" + datasource +
                  "/id_index/processed_dataset_{}.txt".format(retweet_user_size),
                  "w") as f:
            f.write(str(dataset))

    print("dataset size: {}".format(len(dataset)))
    # print("task ids shape:\n{}".format(tasks_ids.shape))

    # split dataset for training and testing
    idxs = np.arange(0, len(raw_tasks))
    train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed)
        except ValueError:
            pass
        if (i + 1) % 5 == 0:
            time_spent = time.time() - start
            progress = 100. * (i + 1) / total
            print(f"{progress:.2f} % DONE, in {time_spent:.2f} seconds. "
                  f"Total would be {time_spent * 100 / (progress * 60):.2f} mins")

    df = pd.DataFrame(data=df_data, columns=df_columns)
    df.label = df.label.astype('category')
    df.to_csv(f"seiz_dataset_{name}.csv", index=False)


if __name__ == "__main__":
    dataset_selected = 'twitter16'

    # Building a SEIZ dataset
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False,
                                     time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw",
                                                  standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')

    dataset_selected = 'twitter15'
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False,
                                     time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw",
                                                  standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')
    def on_batch_begin(self, batch, logs=None):
        lr = cosine_decay_with_warmup(
            global_step=self.global_step,
            learning_rate_base=self.learning_rate_base,
            total_steps=self.total_steps,
            warmup_learning_rate=self.warmup_learning_rate,
            warmup_steps=self.warmup_steps,
            hold_base_rate_steps=self.hold_base_rate_steps)
        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('\nBatch %05d: setting learning '
                  'rate to %s.' % (self.global_step + 1, lr))


dataset_builder = DatasetBuilder(args.charset, args.img_width,
                                 args.img_channels, args.ignore_case)
train_ds, train_size = dataset_builder.build(args.train_ann_paths, True,
                                             args.batch_size)
print('Num of training samples: {}'.format(train_size))
print("num of label", dataset_builder.num_classes)
saved_model_prefix = '{epoch:03d}_{word_accuracy:.4f}'
if args.val_ann_paths:
    val_ds, val_size = dataset_builder.build(args.val_ann_paths, False,
                                             args.batch_size)
    print('Num of val samples: {}'.format(val_size))
    saved_model_prefix = saved_model_prefix + '_{val_word_accuracy:.4f}'
else:
    val_ds = None
saved_model_path = ('saved_models/{}/'.format(localtime) +
                    saved_model_prefix + '.h5')
train_files = files[0:int(sample_len * 0.9)]
train_labels = labels[0:int(sample_len * 0.9)]
val_files = files[int(sample_len * 0.9):]
val_labels = labels[int(sample_len * 0.9):]


def preprocess(x, y):
    img = tf.io.read_file(x)
    img = tf.io.decode_jpeg(img, channels=1)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (32, 100))
    return img, y


dataset_builder = DatasetBuilder('./table_path.txt', 100, 1, ignore_case=False)
train_ds, train_size = dataset_builder.build(train_files, train_labels, True,
                                             batch_size)
val_ds, val_size = dataset_builder.build(val_files, val_labels, True,
                                         batch_size)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipnorm=5)
Epochs = 60
model = build_model(11, channels=1)
model.summary()
model.compile(optimizer=keras.optimizers.Adam(0.0001),
              loss=CTCLoss(),
              metrics=[WordAccuracy()])
localtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
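# Hedged usage sketch (not in the original snippet): with the datasets built and
# the model compiled above, training would typically be launched with Keras `fit`.
# The absence of callbacks here is an assumption; the original code may attach
# checkpointing (e.g. using `saved_model_path`) or a learning-rate schedule.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=Epochs)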