def run_dcrnn(args):
    # Pick the GPU to use. Activate the tensorflow_gpu conda env.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_instance
    with open(args.config_filename) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    # Get the adjacency matrix path from the yaml file.
    graph_pkl_filename = config['data']['graph_pkl_filename']
    _, _, adj_mx = load_graph_data(graph_pkl_filename)
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **config)
        # Load the trained model; the checkpoint path comes from the yaml file.
        supervisor.load(sess, config['train']['model_filename'])
        # Evaluate / predict.
        outputs = supervisor.evaluate(sess)
        np.savez_compressed(args.output_filename, **outputs)
        print('Predictions saved as {}.'.format(args.output_filename))
def main(args):
    # Pick the GPU to use. Activate the tensorflow_gpu conda env.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_instance
    # Open the model parameter file.
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    # Load the adjacency matrix; see lib/utils.py for load_graph_data.
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    # Instantiate the DCRNN supervisor and start training.
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
        supervisor.train(sess=sess)
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    # Load the structural connectivity matrix.
    SC_mx = load_graph_data(supervisor_config)
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=SC_mx, **supervisor_config)
        supervisor.train(sess=sess)
        if args.save_predictions:
            outputs, _ = supervisor.evaluate(sess=sess)
            print('Saving outputs in:', supervisor._log_dir)
            np.savez(supervisor._log_dir + '/outputs',
                     predictions=outputs['predictions'],
                     groundtruth=outputs['groundtruth'])
            plot_predictions(log_dir=supervisor._log_dir,
                             dataset_dir=supervisor_config['data']['dataset_dir'])
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    # Load the structural connectivity matrix.
    SC_mx = load_graph_data(supervisor_config)
    if args.test_dataset:
        # For evaluating the model on a different dataset.
        supervisor_config['data']['dataset_dir'] = args.test_dataset
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=SC_mx, **supervisor_config)
        # Restore the trained model.
        supervisor.load(sess, supervisor_config['train']['model_filename'])
        if args.save_predictions:
            outputs, _ = supervisor.evaluate(sess=sess)
            print('Saving outputs in:', supervisor._log_dir)
            np.savez(supervisor._log_dir + '/' + args.output_name,
                     predictions=outputs['predictions'],
                     groundtruth=outputs['groundtruth'])
            plot_predictions(log_dir=supervisor._log_dir,
                             output_name=args.output_name,
                             dataset_dir=supervisor_config['data']['dataset_dir'])
def main(config):
    logger = config.get_logger('train')
    graph_pkl_filename = 'data/sensor_graph/adj_mx_unix.pkl'
    _, _, adj_mat = utils.load_graph_data(graph_pkl_filename)
    data = utils.load_dataset(dataset_dir='data/METR-LA',
                              batch_size=config["arch"]["args"]["batch_size"],
                              test_batch_size=config["arch"]["args"]["batch_size"])
    for k, v in data.items():
        if hasattr(v, 'shape'):
            print((k, v.shape))
    train_data_loader = data['train_loader']
    val_data_loader = data['val_loader']
    num_train_sample = data['x_train'].shape[0]
    num_val_sample = data['x_val'].shape[0]
    # Number of iterations per epoch, used for the progress bar.
    num_train_iteration_per_epoch = math.ceil(
        num_train_sample / config["arch"]["args"]["batch_size"])
    num_val_iteration_per_epoch = math.ceil(
        num_val_sample / config["arch"]["args"]["batch_size"])
    # Set up data_loader instances:
    # data_loader = config.initialize('data_loader', module_data)
    # valid_data_loader = data_loader.split_validation()

    # Build the model architecture, then print it to the console.
    adj_arg = {"adj_mat": adj_mat}
    model = config.initialize('arch', module_arch, **adj_arg)
    # model = getattr(module_arch, config['arch']['type'])(config['arch']['args'], adj_arg)
    logger.info(model)
    # Get function handles for the loss and the metrics.
    loss = config.initialize('loss', module_metric, **{"scaler": data['scaler']})
    metrics = [getattr(module_metric, met) for met in config['metrics']]
    # Build the optimizer and learning-rate scheduler. To disable the scheduler,
    # delete every line that mentions lr_scheduler.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.initialize('optimizer', torch.optim, trainable_params)
    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer)
    trainer = DCRNNTrainer(model, loss, metrics, optimizer,
                           config=config,
                           data_loader=train_data_loader,
                           valid_data_loader=val_data_loader,
                           lr_scheduler=lr_scheduler,
                           len_epoch=num_train_iteration_per_epoch,
                           val_len_epoch=num_val_iteration_per_epoch)
    trainer.train()
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    split_into_subgraphs = bool(supervisor_config['data'].get('split_into_subgraphs'))
    if split_into_subgraphs:
        assert args.subgraph_id is not None, 'Enter a subgraph_id as a python argument'
        subgraph_id = str(args.subgraph_id)
        print('Splitting into sub-graphs: True')
        print('Current sub-graph ID: ' + subgraph_id)
        adj_mx = partition_into_n_subgraphs(
            graph_pkl_filename, subgraph_id,
            int(supervisor_config['data'].get('number_of_subgraphs')))
        # Choose the dataset directory for the current subgraph.
        supervisor_config['data']['dataset_dir'] = (
            supervisor_config['data'].get('dataset_dir') + subgraph_id)
        # Choose the number of nodes for the current subgraph.
        listofnodesizes = supervisor_config['model'].get('num_nodes').split(',')
        supervisor_config['model']['num_nodes'] = int(listofnodesizes[int(subgraph_id)])
    else:
        subgraph_id = str(args.subgraph_id)
    currentCuda.init()
    currentCuda.dcrnn_cudadevice = torch.device(
        "cuda:" + str(args.current_cuda_id) if torch.cuda.is_available() else "cpu")
    # Import here, since the global variable for the cuda device is set above.
    import model.pytorch.dcrnn_supervisor as dcrnn_supervisor
    supervisor = dcrnn_supervisor.DCRNNSupervisor(adj_mx=adj_mx,
                                                  subgraph_id=subgraph_id,
                                                  **supervisor_config)
    # supervisor.train(subgraph_identifier=subgraph_id)
    # Load the previously trained model.
    supervisor.load_model(subgraph_id=subgraph_id)
    # Evaluate the model and store the results.
    mean_score, outputs = supervisor.evaluate('test')
    output_filename = (supervisor_config.get('predictions_dir') +
                       '/final_predictions' + subgraph_id + '.npz')
    np.savez_compressed(output_filename, **outputs)
    print("MAE : {}".format(mean_score))
    print('Predictions saved as {}.'.format(output_filename))
def read_adj(args):
    adj_mat_filename = args.paths['adj_mat_filename']
    if Path(adj_mat_filename).suffix in ['.pkl']:
        sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(adj_mat_filename)
    elif Path(adj_mat_filename).suffix in ['.csv']:
        adj_mx = np.loadtxt(adj_mat_filename, dtype=np.float32, delimiter=',')
    else:
        adj_mx = np.loadtxt(adj_mat_filename, dtype=np.float32, delimiter=' ')
    return adj_mx
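# A minimal usage sketch for read_adj above, exercising only the .csv branch so
# that nothing beyond numpy is required. The shape of `args` (an object with a
# `paths` mapping) is an assumption inferred from the function body.
import numpy as np
from types import SimpleNamespace

np.savetxt('toy_adj.csv', np.eye(3, dtype=np.float32), delimiter=',')  # toy 3-node graph
args = SimpleNamespace(paths={'adj_mat_filename': 'toy_adj.csv'})
adj_mx = read_adj(args)  # dispatches on the file suffix
print(adj_mx.shape)      # (3, 3)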
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    supervisor.train()
def run_dcrnn(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    mean_score, outputs = supervisor.evaluate('test')
    np.savez_compressed(args.output_filename, **outputs)
    print("MAE : {}".format(mean_score))
    print('Predictions saved as {}.'.format(args.output_filename))
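# Hedged sketch: reading back the .npz archive written by run_dcrnn above. The
# key names ('prediction', 'truth', ...) differ between repos, so we simply list
# whatever the archive contains; 'dcrnn_predictions.npz' is a placeholder path.
import numpy as np

with np.load('dcrnn_predictions.npz') as outputs:
    for key in outputs.files:
        print(key, outputs[key].shape)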
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
        supervisor.train(sess=sess)
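# The main(args) runners above are typically driven by an argparse entry point
# along these lines; the exact flags and defaults vary per repo, so treat this
# as a sketch rather than any one project's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_filename', default='data/dcrnn_config.yaml',
                        type=str, help='Configuration yaml file.')
    parser.add_argument('--use_cpu_only', action='store_true',
                        help='Run TensorFlow on CPU only.')
    args = parser.parse_args()
    main(args)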
def run_dcrnn(args):
    with open(args.config_filename) as f:
        config = yaml.safe_load(f)
    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    graph_pkl_filename = config['data']['graph_pkl_filename']
    _, _, adj_mx = load_graph_data(graph_pkl_filename)
    with tf.compat.v1.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **config)
        supervisor.load(sess, config['train']['model_filename'])
        outputs = supervisor.print_datastream(sess)
        np.savez_compressed(args.output_filename + '.input.npz', **outputs)
        print('Evaluating...')
        supervisor.evaluate(sess)
def run_dcrnn(args):
    graph_pkl_filename = 'data/sensor_graph/adj_mx.pkl'
    with open(args.config_filename) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    _, _, adj_mx = load_graph_data(graph_pkl_filename)
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **config)
        supervisor.load(sess, config['train']['model_filename'])
        outputs = supervisor.evaluate(sess)
        np.savez_compressed(args.output_filename, **outputs)
        print('Predictions saved as {}.'.format(args.output_filename))
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get(
        'graph_pkl_filename', 'data/sensor_graph/adj_mx_bay.pkl')
    # graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename',
    #     'C:/Users/Administrator/Desktop/DCRNN_PyTorch-memoryefficiency/data/sensor_graph/adj_mx.pkl')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    # if args.use_cpu_only:
    #     tf_config = tf.ConfigProto(device_count={'GPU': 0})
    # with tf.Session(config=tf_config) as sess:
    supervisor = GARNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    supervisor.train()
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
        supervisor.train(sess=sess)
def main(args):
    print('main started with args: {}'.format(args))
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    add_prefix(args.train_local, supervisor_config, 'base_dir')
    add_prefix(args.data_local, supervisor_config['data'], 'dataset_dir')
    add_prefix(args.data_local, supervisor_config['data'], 'graph_pkl_filename')
    add_prefix(args.data_local, supervisor_config['train'], 'load_model_dir')
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    mean_score, outputs = supervisor.evaluate('test')
    np.savez_compressed(args.output_filename, **outputs)
    print("MAE : {}".format(mean_score))
    print('Predictions saved as {}.'.format(args.output_filename))
def main(args):
    print('main started with args: {}'.format(args))
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    add_prefix(args.train_local, supervisor_config, 'base_dir')
    add_prefix(args.data_local, supervisor_config['data'], 'dataset_dir')
    add_prefix(args.data_local, supervisor_config['data'], 'graph_pkl_filename')
    print('using supervisor_config: {}'.format(supervisor_config))
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(
        os.path.join(args.data_local, graph_pkl_filename))
    supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    supervisor.train()
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    # 'bay' or 'la', parsed from the config file name.
    data_type = args.config_filename.split('/')[-1].split('.')[0].split('_')[-1]
    supervisor = DCRNNSupervisor(data_type=data_type,
                                 LOAD_INITIAL=args.LOAD_INITIAL,
                                 adj_mx=adj_mx,
                                 **supervisor_config)
    if args.TEST_ONLY:
        supervisor.evaluate_test()
    else:
        supervisor.train()
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    if args.rep:
        supervisor_config['param']['rep'] = args.rep
        print('overwriting the rep parameter with the command-line argument')
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    # Namespace the model and dataset directories by the run identifier.
    id_str = search_id(supervisor_config['alg'], supervisor_config['param'])
    model_dir = supervisor_config['train']['model_dir']
    supervisor_config['train']['model_dir'] = os.path.join(model_dir, id_str)
    dset_dir = supervisor_config['data']['dataset_dir']
    supervisor_config['data']['dataset_dir'] = os.path.join(dset_dir, id_str)
    supervisor = DCRNNSupervisor(adj_mx=adj_mx, **supervisor_config)
    supervisor.train()
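# search_id is defined elsewhere in that repo; a plausible stand-in (purely an
# assumption, not the repo's implementation) hashes the algorithm name and the
# parameter dict into a stable run identifier for namespacing directories.
import hashlib
import json

def make_run_id(alg, param):
    blob = json.dumps(param, sort_keys=True)  # canonical form so equal configs hash equally
    return '{}_{}'.format(alg, hashlib.md5(blob.encode()).hexdigest()[:8])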
def main(args):
    tf.reset_default_graph()
    with open(args.config_filename) as f:
        with tf.Graph().as_default() as g:
            supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
            graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
            if supervisor_config['data']['data_type'] == 'npz':
                sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
            elif supervisor_config['data']['data_type'] == 'csv':
                adj_mx = load_graph_data_from_csv(supervisor_config['data'].get('dataset_dir'))
            tf_config = tf.ConfigProto()
            if args.use_cpu_only:
                tf_config = tf.ConfigProto(device_count={'GPU': 0})
            tf_config.gpu_options.allow_growth = True
            # tf_config.gpu_options.per_process_gpu_memory_fraction = 1
            with tf.Session(config=tf_config) as sess:
                supervisor = DCRNNSupervisor(args=args, adj_mx=adj_mx, **supervisor_config)
                supervisor.train(sess=sess)
def predict(config_filename='data/model/dcrnn_highway_flask.yaml',
            current_cuda_id=0,
            use_cpu_only=False,
            subgraph_id=0):
    # Get the sensor data from the request; it must carry 'x' and 'y' arrays.
    data = request.get_json()
    sensor_data = data["sensor_data"]
    with open(config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    split_into_subgraphs = bool(supervisor_config['data'].get('split_into_subgraphs'))
    if split_into_subgraphs:
        assert subgraph_id is not None, 'Enter a subgraph_id as a python argument'
        subgraph_id = str(subgraph_id)
        print('Splitting into sub-graphs: True')
        print('Current sub-graph ID: ' + subgraph_id)
        adj_mx = partition_into_n_subgraphs(
            graph_pkl_filename, subgraph_id,
            int(supervisor_config['data'].get('number_of_subgraphs')))
        # Choose the dataset directory for the current subgraph.
        supervisor_config['data']['dataset_dir'] = (
            supervisor_config['data'].get('dataset_dir') + subgraph_id)
        # Choose the number of nodes for the current subgraph.
        listofnodesizes = supervisor_config['model'].get('num_nodes').split(',')
        supervisor_config['model']['num_nodes'] = int(listofnodesizes[int(subgraph_id)])
    else:
        subgraph_id = str(subgraph_id)
    currentCuda.init()
    currentCuda.dcrnn_cudadevice = torch.device(
        "cuda:" + str(current_cuda_id) if torch.cuda.is_available() else "cpu")
    # Save the JSON test data as npz files in the dataset dir. Writing train.npz
    # and val.npz as well bypasses the requirement for actual train and val
    # datasets: they only exist to satisfy asserts and are never used.
    if not split_into_subgraphs:
        if not os.path.exists(supervisor_config['data'].get('dataset_dir')):
            os.makedirs(supervisor_config['data'].get('dataset_dir'))
        for split in ('test', 'train', 'val'):
            np.savez_compressed(
                supervisor_config['data'].get('dataset_dir') + '/' + split + '.npz',
                x=sensor_data['x'], y=sensor_data['y'],
                x_offset=None, y_offset=None)
    # Import here, since the global variable for the cuda device is set above.
    import model.pytorch.dcrnn_supervisor as dcrnn_supervisor
    supervisor = dcrnn_supervisor.DCRNNSupervisor(adj_mx=adj_mx,
                                                  subgraph_id=subgraph_id,
                                                  **supervisor_config)
    # supervisor.train(subgraph_identifier=subgraph_id)
    # Load the previously trained model.
    supervisor.load_model(subgraph_id=subgraph_id)
    # Evaluate the model and store the results.
    mean_score, outputs = supervisor.evaluate('test')
    output_filename = (supervisor_config.get('predictions_dir') +
                       '/final_predictions' + subgraph_id + '.npz')
    np.savez_compressed(output_filename, **outputs)
    print("MAE : {}".format(mean_score))
    print('Predictions saved as {}.'.format(output_filename))
    predictions = outputs['prediction']
    return jsonify({"prediction": np.asarray(predictions).tolist()})
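# Hedged sketch of a client call to the predict endpoint above. The route and
# host/port are assumptions (they depend on how the Flask app registers the
# view); the payload must carry 'x' and 'y' arrays under 'sensor_data'.
import requests

payload = {'sensor_data': {'x': [[0.0, 0.0]], 'y': [[0.0, 0.0]]}}
resp = requests.post('http://localhost:5000/predict', json=payload)
print(resp.json()['prediction'])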
def main(args):
    cfg = read_cfg_file(args.config_filename)
    log_dir = _get_log_dir(cfg)
    log_level = cfg.get('log_level', 'INFO')
    logger = utils.get_logger(log_dir, __name__, 'info.log', level=log_level)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # All edge_index in the same dataset are identical.
    # edge_index = adjacency_to_edge_index(adj_mx)  # already added self-loop
    logger.info(cfg)
    batch_size = cfg['data']['batch_size']
    test_batch_size = cfg['data']['test_batch_size']
    # edge_index = utils.load_pickle(cfg['data']['edge_index_pkl_filename'])
    hz = cfg['data'].get('name', 'nothz') == 'hz'
    adj_mx_list = []
    graph_pkl_filename = cfg['data']['graph_pkl_filename']
    if not isinstance(graph_pkl_filename, list):
        graph_pkl_filename = [graph_pkl_filename]
    src = []
    dst = []
    for g in graph_pkl_filename:
        if hz:
            adj_mx = utils.load_graph_data_hz(g)
        else:
            _, _, adj_mx = utils.load_graph_data(g)
        for i in range(len(adj_mx)):
            adj_mx[i, i] = 0
        adj_mx_list.append(adj_mx)
    adj_mx = np.stack(adj_mx_list, axis=-1)
    if cfg['model'].get('norm', False):
        print('row normalization')
        adj_mx = adj_mx / (adj_mx.sum(axis=0) + 1e-18)
    src, dst = adj_mx.sum(axis=-1).nonzero()
    edge_index = torch.tensor([src, dst], dtype=torch.long, device=device)
    edge_attr = torch.tensor(adj_mx[adj_mx.sum(axis=-1) != 0],
                             dtype=torch.float,
                             device=device)
    output_dim = cfg['model']['output_dim']
    for i in range(adj_mx.shape[-1]):
        logger.info(adj_mx[..., i])
    # print(adj_mx.shape)  # (207, 207)
    if hz:
        dataset = utils.load_dataset_hz(**cfg['data'], scaler_axis=(0, 1, 2, 3))
    else:
        dataset = utils.load_dataset(**cfg['data'])
    for k, v in dataset.items():
        if hasattr(v, 'shape'):
            logger.info((k, v.shape))
    scaler = dataset['scaler']
    scaler_torch = utils.StandardScaler_Torch(scaler.mean, scaler.std, device=device)
    logger.info('scaler.mean:{}, scaler.std:{}'.format(scaler.mean, scaler.std))
    model = Net(cfg).to(device)
    # model.apply(init_weights)
    criterion = nn.L1Loss(reduction='mean')
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg['train']['base_lr'],
                           eps=cfg['train']['epsilon'])
    scheduler = StepLR2(optimizer=optimizer,
                        milestones=cfg['train']['steps'],
                        gamma=cfg['train']['lr_decay_ratio'],
                        min_lr=cfg['train']['min_learning_rate'])
    max_grad_norm = cfg['train']['max_grad_norm']
    train_patience = cfg['train']['patience']
    val_steady_count = 0
    last_val_mae = 1e6
    horizon = cfg['model']['horizon']

    for epoch in range(cfg['train']['epochs']):
        total_loss = 0
        i = 0
        begin_time = time.perf_counter()
        train_iterator = dataset['train_loader'].get_iterator()
        model.train()
        for _, (x, y, xtime, ytime) in enumerate(train_iterator):
            optimizer.zero_grad()
            y = y[:, :horizon, :, :output_dim]
            sequences, y = collate_wrapper(x=x, y=y,
                                           edge_index=edge_index,
                                           edge_attr=edge_attr,
                                           device=device)
            y_pred = model(sequences)
            y_pred = scaler_torch.inverse_transform(y_pred)
            y = scaler_torch.inverse_transform(y)
            loss = criterion(y_pred, y)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            total_loss += loss.item()
            i += 1
        val_result = evaluate(model=model, dataset=dataset, dataset_type='val',
                              edge_index=edge_index, edge_attr=edge_attr,
                              device=device, output_dim=output_dim,
                              logger=logger, detail=False, cfg=cfg)
        val_mae, _, _ = val_result
        time_elapsed = time.perf_counter() - begin_time
        logger.info(('Epoch:{}, train_mae:{:.2f}, val_mae:{}, '
                     'r_loss={:.2f}, lr={}, time_elapsed:{}').format(
                         epoch, total_loss / i, val_mae, 0,
                         str(scheduler.get_lr()), time_elapsed))
        if last_val_mae > val_mae:
            logger.info('val_mae decreased from {:.2f} to {:.2f}'.format(
                last_val_mae, val_mae))
            last_val_mae = val_mae
            val_steady_count = 0
        else:
            val_steady_count += 1
        # After each epoch, run evaluation on the test dataset.
        if (epoch + 1) % cfg['train']['test_every_n_epochs'] == 0:
            evaluate(model=model, dataset=dataset, dataset_type='test',
                     edge_index=edge_index, edge_attr=edge_attr,
                     device=device, output_dim=output_dim,
                     logger=logger, cfg=cfg)
        if (epoch + 1) % cfg['train']['save_every_n_epochs'] == 0:
            save_dir = log_dir
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            config_path = os.path.join(save_dir, 'config-{}.yaml'.format(epoch + 1))
            epoch_path = os.path.join(save_dir, 'epoch-{}.pt'.format(epoch + 1))
            torch.save(model.state_dict(), epoch_path)
            with open(config_path, 'w') as f:
                from copy import deepcopy
                save_cfg = deepcopy(cfg)
                save_cfg['model']['save_path'] = epoch_path
                f.write(yaml.dump(save_cfg, Dumper=Dumper))
        if train_patience <= val_steady_count:
            logger.info('early stopping.')
            break
        scheduler.step()
def __init__(self):
    with open('data/dcrnn_la.yaml') as f_la, open('data/dcrnn_bay.yaml') as f_bay:
        config_la = yaml.load(f_la, Loader=yaml.FullLoader)
        config_bay = yaml.load(f_bay, Loader=yaml.FullLoader)
    sensor_ids1, sensor_id_to_ind1, adj_mx_la = load_graph_data(
        config_la['data'].get('graph_pkl_filename'))
    sensor_ids2, sensor_id_to_ind2, adj_mx_bay = load_graph_data(
        config_bay['data'].get('graph_pkl_filename'))
    self._kwargs = config_la
    self._data_kwargs = config_la.get('data')
    self._model_kwargs = config_la.get('model')
    self._data_kwargs2 = config_bay.get('data')
    self._model_kwargs2 = config_bay.get('model')
    self._train_kwargs = config_la.get('train')
    self.max_grad_norm = self._train_kwargs.get('max_grad_norm', 1.)
    # Logging.
    self._log_dir = self._get_log_dir(config_la)
    self._writer = SummaryWriter('runs/' + self._log_dir)
    log_level = self._kwargs.get('log_level', 'INFO')
    self._logger = utils.get_logger(self._log_dir, __name__, 'info.log', level=log_level)
    # Data sets.
    self._data = utils.load_dataset(**self._data_kwargs)
    self._data2 = utils.load_dataset(**self._data_kwargs2)
    self.standard_scaler = self._data['scaler']
    self.standard_scaler2 = self._data2['scaler']
    self._logger.info('Setting: {}'.format(args.setting))
    self._logger.info("Party A trn samples: {}".format(self._data['train_loader'].size))
    self._logger.info("Party A vld samples: {}".format(self._data['val_loader'].size))
    self._logger.info("Party A tst samples: {}".format(self._data['test_loader'].size))
    self._logger.info("Party B trn samples: {}".format(self._data2['train_loader'].size))
    self._logger.info("Party B vld samples: {}".format(self._data2['val_loader'].size))
    self._logger.info("Party B tst samples: {}".format(self._data2['test_loader'].size))
    self.num_nodes = int(self._model_kwargs.get('num_nodes', 1))
    self.num_nodes2 = int(self._model_kwargs2.get('num_nodes', 1))
    self._logger.info("num_nodes: {}".format(self.num_nodes))
    self._logger.info("num_nodes2: {}".format(self.num_nodes2))
    self.input_dim = int(self._model_kwargs.get('input_dim', 1))
    self.seq_len = int(self._model_kwargs.get('seq_len'))  # for the encoder
    self.output_dim = int(self._model_kwargs.get('output_dim', 1))
    self.use_curriculum_learning = bool(
        self._model_kwargs.get('use_curriculum_learning', False))
    self.horizon = int(self._model_kwargs.get('horizon', 1))  # for the decoder

    # Set up the models.
    dcrnn_model = DCRNNModel(adj_mx_la, self._logger, **self._model_kwargs)
    dcrnn_model2 = DCRNNModel(adj_mx_bay, self._logger, **self._model_kwargs2)
    if torch.cuda.is_available():
        # dcrnn_model = nn.DataParallel(dcrnn_model)
        # dcrnn_model2 = nn.DataParallel(dcrnn_model2)
        self.dcrnn_model = dcrnn_model.cuda()
        self.dcrnn_model2 = dcrnn_model2.cuda()
    else:
        self.dcrnn_model = dcrnn_model
        self.dcrnn_model2 = dcrnn_model2
    self._logger.info("Models created")
    self._logger.info('Local epochs:' + str(args.local_epochs))
    self._epoch_num = self._train_kwargs.get('epoch', 0)
    if self._epoch_num > 0:
        self.load_model(self._epoch_num)

    # Use PySyft for SPDZ.
    if args.setting == 'fedavg' and args.spdz:
        import syft as sy
        self._logger.info('Using SPDZ for FedAvg')
        hook = sy.TorchHook(torch)
        self.party_workers = [sy.VirtualWorker(hook, id="party{:d}".format(i))
                              for i in range(2)]
        self.crypto = sy.VirtualWorker(hook, id="crypto")

    # Differential privacy.
    if args.dp:

        class HiddenPrints:
            def __enter__(self):
                self._original_stdout = sys.stdout
                sys.stdout = open(os.devnull, 'w')

            def __exit__(self, exc_type, exc_val, exc_tb):
                sys.stdout.close()
                sys.stdout = self._original_stdout

        def find_sigma(eps, batches_per_lot, dataset_size):
            lotSize = batches_per_lot * args.batch_size  # L
            N = dataset_size
            delta = min(10**(-5), 1 / N)
            lotsPerEpoch = N / lotSize
            q = lotSize / N  # Sampling ratio.
            T = args.epochs * lotsPerEpoch  # Total number of lots.

            def compute_dp_sgd_wrapper(_sigma):
                with HiddenPrints():
                    return compute_dp_sgd_privacy.compute_dp_sgd_privacy(
                        n=N, batch_size=lotSize, noise_multiplier=_sigma,
                        epochs=args.epochs, delta=delta)[0] - args.epsilon

            sigma = newton(compute_dp_sgd_wrapper, x0=0.5, tol=1e-4)  # adjust x0 to avoid error
            with HiddenPrints():
                actual_eps = compute_dp_sgd_privacy.compute_dp_sgd_privacy(
                    n=N, batch_size=lotSize, noise_multiplier=sigma,
                    epochs=args.epochs, delta=delta)[0]
            # print('Batches_per_lot={}, q={}, T={}, sigma={}'.format(batches_per_lot, q, T, sigma))
            # print('actual epsilon = {}'.format(actual_eps))
            return sigma

        self._logger.info('Epsilon: ' + str(args.epsilon))
        self._logger.info('Lotsize_scaler: ' + str(args.lotsize_scaler))
        lotsizes = [N**.5 * args.lotsize_scaler
                    for N in [self._data['train_loader'].size,
                              self._data2['train_loader'].size]]
        batches_per_lot_list = list(
            map(lambda lotsize: max(round(lotsize / args.batch_size), 1), lotsizes))
        batches_per_lot_list = [
            min(bpl, loader_len)
            for bpl, loader_len in zip(batches_per_lot_list,
                                       [self._data['train_loader'].num_batch,
                                        self._data2['train_loader'].num_batch])]
        self._logger.info('Batches per lot: ' + str(batches_per_lot_list))
        sigma_list = [find_sigma(args.epsilon, bpl, N)
                      for bpl, N in zip(batches_per_lot_list,
                                        [self._data['train_loader'].size,
                                         self._data2['train_loader'].size])]
        self._logger.info('Sigma: ' + str(sigma_list))
        for mod, bpl, sig in zip([self.dcrnn_model, self.dcrnn_model2],
                                 batches_per_lot_list, sigma_list):
            mod.batch_per_lot = bpl
            mod.sigma = sig
        self.dcrnn_model.batch_per_lot = batches_per_lot_list[0]
        self.dcrnn_model.sigma = sigma_list[0]
        self.dcrnn_model2.batch_per_lot = batches_per_lot_list[1]
        self.dcrnn_model2.sigma = sigma_list[1]
    self._lastNoiseShape = None
    self._noiseToAdd = None
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)
    graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
    sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
    supervisor_config['model']['num_nodes'] = num_nodes = len(sensor_ids)

    # Data preprocessing.
    traffic_df_filename = supervisor_config['data']['hdf_filename']
    df_data = pd.read_hdf(traffic_df_filename)
    # df_data = df_data.iloc[int(df_data.shape[0] / 3):, :]
    validation_ratio = supervisor_config.get('data').get('validation_ratio')
    test_ratio = supervisor_config.get('data').get('test_ratio')
    df_train, df_val, df_test = train_val_test_split(df_data,
                                                     val_ratio=validation_ratio,
                                                     test_ratio=test_ratio)
    batch_size = supervisor_config.get('data').get('batch_size')
    val_batch_size = supervisor_config.get('data').get('val_batch_size')
    test_batch_size = supervisor_config.get('data').get('test_batch_size')
    horizon = supervisor_config.get('model').get('horizon')
    seq_len = supervisor_config.get('model').get('seq_len')
    scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
    data_train = generate_seq2seq_data(df_train, batch_size, seq_len, horizon,
                                       num_nodes, 'train', scaler)
    data_val = generate_seq2seq_data(df_val, val_batch_size, seq_len, horizon,
                                     num_nodes, 'val', scaler)
    data_train.update(data_val)
    # data_train['scaler'] = scaler
    data_test = generate_seq2seq_data(df_test, test_batch_size, seq_len, horizon,
                                      num_nodes, 'test', scaler)
    # data_test['scaler'] = scaler

    tf_config = tf.ConfigProto()
    if args.use_cpu_only:
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        supervisor = DCRNNSupervisor(adj_mx, data_train, supervisor_config)
        data_tag = supervisor_config.get('data').get('dataset_dir')
        folder = data_tag + '/model/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        # Train.
        supervisor.train(sess=sess)
        # Test: pick the most recently written model config.
        yaml_files = glob.glob('%s/model/*/*.yaml' % data_tag, recursive=True)
        yaml_files.sort(key=os.path.getmtime)
        config_filename = yaml_files[-1]  # 'config_%d.yaml' % config_id
        with open(config_filename) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        # Load the model and evaluate.
        supervisor.load(sess, config['train']['model_filename'])
        y_preds = supervisor.evaluate(sess, data_test)
        n_test_samples = data_test['y_test'].shape[0]
        folder = data_tag + '/results/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        for horizon_i in range(data_test['y_test'].shape[1]):
            y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])
            eval_dfs = df_test[seq_len + horizon_i:seq_len + horizon_i + n_test_samples]
            df = pd.DataFrame(y_pred, index=eval_dfs.index, columns=eval_dfs.columns)
            # df = pd.DataFrame(y_pred, columns=df_test.columns)
            filename = os.path.join('%s/results/' % data_tag,
                                    'dcrnn_speed_prediction_%s.h5' % str(horizon_i + 1))
            df.to_hdf(filename, 'results')
        print('Predictions saved as %s/results/dcrnn_prediction_[1-12].h5...' % data_tag)
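# train_val_test_split is defined elsewhere in that repo; a minimal sketch of
# the chronological split it presumably performs (an assumption, since a traffic
# time series must not be shuffled across the split boundaries).
def train_val_test_split(df, val_ratio=0.1, test_ratio=0.2):
    n = df.shape[0]
    n_test = int(round(n * test_ratio))
    n_val = int(round(n * val_ratio))
    n_train = n - n_val - n_test
    return (df.iloc[:n_train],
            df.iloc[n_train:n_train + n_val],
            df.iloc[n_train + n_val:])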
def __init__(self, is_training, batch_size, scaler, adj_matrix_file, **model_kwargs):
    # Scaler for data normalization.
    self._scaler = scaler
    # Train and loss.
    self._loss = None
    self._mae = None
    self._train_op = None
    max_diffusion_step = int(model_kwargs.get('max_diffusion_step', 0))
    cl_decay_steps = int(model_kwargs.get('cl_decay_steps', 1000))
    filter_type = model_kwargs.get('filter_type', 'laplacian')
    networkType = model_kwargs.get('network', 'gconv')  # fc/gconv
    matrixType = model_kwargs.get('weightMatrix')  # a/d
    attention = model_kwargs.get('attention')
    horizon = int(model_kwargs.get('horizon', 1))
    max_grad_norm = float(model_kwargs.get('max_grad_norm', 5.0))
    num_nodes = int(model_kwargs.get('num_nodes', 1))
    num_rnn_layers = int(model_kwargs.get('num_rnn_layers', 1))
    rnn_units = int(model_kwargs.get('rnn_units'))
    seq_len = int(model_kwargs.get('seq_len'))
    use_curriculum_learning = bool(model_kwargs.get('use_curriculum_learning', False))
    input_dim = int(model_kwargs.get('input_dim', 1))
    output_dim = int(model_kwargs.get('output_dim', 1))
    aux_dim = input_dim - output_dim
    _, _, adj_mx = load_graph_data(adj_matrix_file)
    graphEmbedFile = None
    if networkType == 'fc':
        graphEmbedFile = model_kwargs.get('graphEmbedFile')

    # Input: (batch_size, timesteps, num_sensor, input_dim), e.g. (64, 12, 207, 2).
    # The batch size is the number of training examples used in one iteration.
    self._inputs = tf.placeholder(tf.float32,
                                  shape=(batch_size, seq_len, num_nodes, input_dim),
                                  name='inputs')
    # Labels (the ground truth): (batch_size, timesteps, num_sensor, input_dim),
    # the same format as the input except for the temporal dimension.
    self._labels = tf.placeholder(tf.float32,
                                  shape=(batch_size, horizon, num_nodes, input_dim),
                                  name='labels')
    # GO_SYMBOL = tf.zeros(shape=(batch_size, num_nodes * input_dim))
    GO_SYMBOL = tf.zeros(shape=(batch_size, num_nodes * output_dim))

    cell = DCGRUCell(rnn_units, adj_mx,
                     max_diffusion_step=max_diffusion_step,
                     num_nodes=num_nodes,
                     network_type=networkType,
                     graphEmbedFile=graphEmbedFile,
                     filter_type=filter_type)
    cell_with_projection = DCGRUCell(rnn_units, adj_mx,
                                     max_diffusion_step=max_diffusion_step,
                                     num_nodes=num_nodes,
                                     network_type=networkType,
                                     graphEmbedFile=graphEmbedFile,
                                     num_proj=output_dim,
                                     filter_type=filter_type)
    encoding_cells = [cell] * num_rnn_layers
    # The projection is applied only at the last step of decoding.
    decoding_cells = [cell] * (num_rnn_layers - 1) + [cell_with_projection]
    encoding_cells = tf.contrib.rnn.MultiRNNCell(encoding_cells, state_is_tuple=True)
    decoding_cells = tf.contrib.rnn.MultiRNNCell(decoding_cells, state_is_tuple=True)

    global_step = tf.train.get_or_create_global_step()
    # Outputs: (batch_size, timesteps, num_nodes, output_dim)
    with tf.variable_scope('DCRNN_SEQ'):
        # Flatten the node and feature dimensions; input_dim is e.g. 2, output_dim 1.
        inputs = tf.unstack(tf.reshape(self._inputs,
                                       (batch_size, seq_len, num_nodes * input_dim)),
                            axis=1)
        labels = tf.unstack(tf.reshape(self._labels[..., :output_dim],
                                       (batch_size, horizon, num_nodes * output_dim)),
                            axis=1)
        if aux_dim > 0:
            aux_info = tf.unstack(self._labels[..., output_dim:], axis=1)
            aux_info.insert(0, None)
        labels.insert(0, GO_SYMBOL)

        def _loop_function(prev, i):
            if is_training:
                # In training, feed back either the model's prediction or the
                # previous ground truth (scheduled sampling).
                if use_curriculum_learning:
                    c = tf.random_uniform((), minval=0, maxval=1.)
                    threshold = self._compute_sampling_threshold(global_step, cl_decay_steps)
                    result = tf.cond(tf.less(c, threshold),
                                     lambda: labels[i],
                                     lambda: prev)
                else:
                    result = labels[i]
            else:
                # In testing, always feed back the model's prediction,
                # e.g. of shape (64, 207).
                result = prev
            if False and aux_dim > 0:
                result = tf.reshape(result, (batch_size, num_nodes, output_dim))  # (64, 207, 1)
                result = tf.concat([result, aux_info[i]], axis=-1)  # (64, 207, 2)
                result = tf.reshape(result, (batch_size, num_nodes * input_dim))  # (64, 414)
            return result

        # tf.contrib.rnn.static_rnn creates a recurrent neural network from the
        # given RNNCell; DCGRUCell._gconv is called several times in this step.
        _, enc_state = tf.contrib.rnn.static_rnn(encoding_cells, inputs, dtype=tf.float32)
        # tf.contrib.legacy_seq2seq.rnn_decoder is the RNN decoder for the
        # sequence-to-sequence model; its loop function propagates through the
        # RNN structure, again calling _gconv several times. `outputs` is a list
        # of horizon + 1 tensors of shape (64, 207); `final_state` holds one
        # (64, 13248) tensor per layer.
        outputs, final_state = legacy_seq2seq.rnn_decoder(labels, enc_state,
                                                          decoding_cells,
                                                          loop_function=_loop_function)

    # Stack the decoder outputs, dropping the last element: because the GO symbol
    # was prepended to the labels, the decoder emits one extra step whose output
    # has no corresponding label.
    outputs = tf.stack(outputs[:-1], axis=1)
    self._outputs = tf.reshape(outputs,
                               (batch_size, horizon, num_nodes, output_dim),
                               name='outputs')
    self._merged = tf.summary.merge_all()
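# _compute_sampling_threshold above is not shown; in DCRNN-style curriculum
# learning it is an inverse-sigmoid decay, so the probability of feeding the
# ground truth back into the decoder starts near 1 and decays with the global
# step. A NumPy sketch (the TF version returns a tensor instead):
import numpy as np

def compute_sampling_threshold(global_step, k):
    # k = cl_decay_steps; a larger k keeps ground-truth feeding for longer.
    return k / (k + np.exp(global_step / k))

print(compute_sampling_threshold(0, 1000))      # ~1.0: almost always ground truth
print(compute_sampling_threshold(10000, 1000))  # ~0.04: mostly model predictions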