def get_data_loader(args):
    base_path = args['base_path']
    origin_folder = args['origin_folder']
    core_folder = args.get('core_folder', None)
    nfeature_folder = args.get('nfeature_folder', None)
    node_file = args['node_file']
    has_cuda = args['has_cuda']

    # Read the full node list; its order defines the row/column order of all adjacency matrices.
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_list = nodes_set['node'].tolist()
    node_num = nodes_set.shape[0]

    # Resolve optional data folders; a missing folder stays None.
    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder)) if origin_folder else None
    core_base_path = os.path.abspath(os.path.join(base_path, core_folder)) if core_folder else None
    node_feature_path = os.path.abspath(os.path.join(base_path, nfeature_folder)) if nfeature_folder else None

    # The number of snapshots equals the number of files in the origin (or core) folder.
    max_time_num = len(os.listdir(origin_base_path)) if origin_base_path else len(os.listdir(core_base_path))
    assert max_time_num > 0

    data_loader = DataLoader(node_list, max_time_num, has_cuda=has_cuda)
    # Cache derived paths and sizes back into args so downstream code can reuse them.
    args['origin_base_path'] = origin_base_path
    args['core_base_path'] = core_base_path
    args['nfeature_path'] = node_feature_path
    args['node_num'] = node_num
    return data_loader
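# Hypothetical usage sketch (not from the source): the keys below mirror what
# get_data_loader() reads from `args`; the concrete folder and file names are
# placeholder assumptions for illustration only.
example_args = {
    'base_path': 'data/enron',           # assumed dataset root
    'origin_folder': '1.format',         # assumed folder of per-snapshot graph files
    'core_folder': None,
    'nfeature_folder': None,
    'node_file': 'nodes_set/nodes.csv',  # assumed node-list file
    'has_cuda': False,
}
loader = get_data_loader(example_args)
print(example_args['node_num'], example_args['origin_base_path'])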
def train(self, path, epochs=50, log_step=10, resume=False):
    '''
    Trains the fall detection model on provided data.

    Args:
        path     : Path to dataset directory
        epochs   : Number of training epochs
        log_step : Run validation every `log_step` epochs
        resume   : Restore the latest checkpoint before training if True
    '''
    print("Beginning the training process...")
    # Reuse a cached dataset if one exists, otherwise build and cache it.
    if os.path.isfile('loaded_dataset.pkl'):
        data = pickle.load(open('loaded_dataset.pkl', 'rb'))
    else:
        data = DataLoader(path)
        pickle.dump(data, open('loaded_dataset.pkl', 'wb'))

    # Load the minimum validation loss seen so far, if any.
    # `model_dir` and `TB_DIR` are assumed to be defined at module scope.
    valid_file = os.path.join(model_dir, "min_valid_loss.txt")
    if os.path.isfile(valid_file):
        min_valid_loss = float(open(valid_file).read().strip())
    else:
        min_valid_loss = 1000000.0

    with tf.Session() as session:
        writer = tf.summary.FileWriter(TB_DIR, session.graph)
        resumed = False
        if resume:
            try:
                self.saver.restore(session, self.model_file)
                resumed = True
            except Exception:
                print("No previous checkpoint file found, restarting training...")
        if not resumed:
            session.run(tf.global_variables_initializer())

        for e in range(epochs):
            avg_loss = []
            for batch_x, batch_y in data.next_batch(self.batch_size, training=True):
                batch_loss, _, tb_op = session.run(
                    [self.loss, self.train_step, self.tensorboard_op],
                    feed_dict={
                        self.x: batch_x,
                        self.y: batch_y,
                        self.is_training: True
                    })
                avg_loss.append(batch_loss)
            print("Average Loss for epoch {} = {}.".format(e, sum(avg_loss) / len(avg_loss)))

            if e % log_step == 0:
                # Evaluate on the validation set.
                avg_loss = []
                avg_accuracy = []
                for batch_x, batch_y in data.next_batch(self.batch_size, training=False, validation=True):
                    batch_loss, batch_acc = session.run(
                        [self.loss, self.accuracy],
                        feed_dict={
                            self.x: batch_x,
                            self.y: batch_y,
                            self.is_training: False
                        })
                    avg_loss.append(batch_loss)
                    avg_accuracy.append(batch_acc)
                avg_accuracy = sum(avg_accuracy) / len(avg_accuracy)
                avg_loss = sum(avg_loss) / len(avg_loss)
                # Checkpoint whenever the validation loss improves.
                if avg_loss < min_valid_loss:
                    min_valid_loss = avg_loss
                    with open(valid_file, 'w') as f:
                        f.write(str(avg_loss))
                    self.save_session(session)
                print("Validation Error: {}. Validation Accuracy: {}".format(avg_loss, avg_accuracy))
                writer.add_summary(tb_op, e)

        writer.close()
    print("Training complete!")
    print(self.evaluate(data))
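# Hypothetical usage sketch (not from the source): the class that owns this
# train() method is not shown, so both the class name and the dataset path
# below are assumptions for illustration only.
# model = FallDetector(batch_size=32)          # assumed model class
# model.train('datasets/fall_data', epochs=100, log_step=5, resume=True)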
torch.cuda.set_device(args.gpu)

logging.info('generate config')
# Pickle files are opened in binary mode.
pretrained_embedding = pkl.load(open(args.emb_file, 'rb'))
config = Config(
    vocab_size=pretrained_embedding.shape[0],
    embedding_dim=pretrained_embedding.shape[1],
    position_size=500,
    position_dim=50,
    word_input_size=100,
    sent_input_size=2 * args.hidden,
    word_GRU_hidden_units=args.hidden,
    sent_GRU_hidden_units=args.hidden,
    pretrained_embedding=pretrained_embedding)

word2id = pkl.load(open('../data/word2id.pkl', 'rb'))

logging.info('loading test dataset')
test_dataset = pkl.load(open(args.test_file, 'rb'))
test_loader = DataLoader(test_dataset, shuffle=False)

# Restore the trained SummaRuNNer model and run it over the test documents.
net = SummaRuNNer(config).cuda()
net.load_state_dict(torch.load(args.model_file))

for index, docs in enumerate(test_loader):
    doc = docs[0]
    x, y = prepare_data(doc, word2id)
    sents = Variable(torch.from_numpy(x)).cuda()
    outputs = net(sents)
    hyp, gold, predict = test(doc, outputs.data.tolist(), index)
pretrained_embedding = pkl.load(open(args.emb_file, 'rb'))
config = Config(
    vocab_size=pretrained_embedding.shape[0],
    embedding_dim=pretrained_embedding.shape[1],
    position_size=500,
    position_dim=50,
    word_input_size=100,
    sent_input_size=2 * args.hidden,
    word_GRU_hidden_units=args.hidden,
    sent_GRU_hidden_units=args.hidden,
    pretrained_embedding=pretrained_embedding)

word2id = pkl.load(open('../data/word2id.pkl', 'rb'))

logging.info('loading train dataset')
train_dataset = pkl.load(open(args.train_file, 'rb'))
train_loader = DataLoader(train_dataset)

logging.info('loading validation dataset')
validation_dataset = pkl.load(open(args.validation_file, 'rb'))
validation_loader = DataLoader(validation_dataset, shuffle=False)

net = SummaRuNNer(config)
net.cuda()

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)

# Training bookkeeping
loss_sum = 0
min_eval_loss = float('Inf')
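# Hedged sketch (not from the source): one plausible shape of the epoch loop
# that would follow the setup above, mirroring the test script's use of
# prepare_data(). It assumes an `args.epochs` field and a local checkpoint
# path; the real training script may differ.
for epoch in range(args.epochs):                        # assumed epoch count
    net.train()
    for docs in train_loader:
        doc = docs[0]
        x, y = prepare_data(doc, word2id)
        sents = Variable(torch.from_numpy(x)).cuda()
        labels = Variable(torch.from_numpy(y).float()).cuda()
        probs = net(sents)
        loss = criterion(probs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.data[0]                        # old-style PyTorch scalar access

    # Track the best model on the validation split.
    net.eval()
    eval_loss = 0.0
    for docs in validation_loader:
        doc = docs[0]
        x, y = prepare_data(doc, word2id)
        sents = Variable(torch.from_numpy(x)).cuda()
        labels = Variable(torch.from_numpy(y).float()).cuda()
        eval_loss += criterion(net(sents), labels).data[0]
    if eval_loss < min_eval_loss:
        min_eval_loss = eval_loss
        torch.save(net.state_dict(), 'best_model.pt')   # assumed checkpoint path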
def dyngem_embedding(method, args):
    assert method in ['DynGEM', 'DynAE', 'DynRNN', 'DynAERNN']
    # The DynAE import is assumed to follow the baseline.* pattern below; it provides
    # DynAE and the dynGraph2vec loss and batch utilities used further down.
    from baseline.dynAE import DynAE, DynGraph2VecLoss, BatchGenerator, BatchPredictor
    from baseline.dynRNN import DynRNN
    from baseline.dynAERNN import DynAERNN
    from baseline.dynGEM import DynGEM, DynGEMLoss, DynGEMBatchGenerator, DynGEMBatchPredictor

    model_dict = {'DynGEM': DynGEM, 'DynAE': DynAE, 'DynRNN': DynRNN, 'DynAERNN': DynAERNN}

    # Common params for DynGEM, DynAE, DynRNN, DynAERNN
    base_path = args['base_path']
    origin_folder = args['origin_folder']
    embedding_folder = args['embed_folder']
    model_folder = args['model_folder']
    model_file = args['model_file']
    node_file = args['node_file']
    file_sep = args['file_sep']
    start_idx = args['start_idx']
    end_idx = args['end_idx']
    duration = args['duration']
    embed_dim = args['embed_dim']
    has_cuda = args['has_cuda']
    epoch = args['epoch']
    lr = args['lr']
    batch_size = args['batch_size']
    load_model = args['load_model']
    shuffle = args['shuffle']
    export = args['export']
    record_time = args['record_time']

    # Model-specific params for DynGEM, DynAE, DynRNN, DynAERNN
    n_units, ae_units, rnn_units = [], [], []
    look_back, alpha = 0, 0
    if method in ['DynGEM', 'DynAE', 'DynRNN']:
        n_units = args['n_units']
    else:  # DynAERNN
        ae_units = args['ae_units']
        rnn_units = args['rnn_units']
    if method in ['DynAE', 'DynRNN', 'DynAERNN']:
        look_back = args['look_back']
        assert look_back > 0
    else:  # DynGEM
        alpha = args['alpha']
    beta = args['beta']
    nu1 = args['nu1']
    nu2 = args['nu2']
    bias = args['bias']

    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    max_time_num = len(os.listdir(origin_base_path))
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_num = nodes_set.shape[0]
    node_list = nodes_set['node'].tolist()
    data_loader = DataLoader(node_list, max_time_num, has_cuda=has_cuda)

    # Negative indices count back from the last snapshot; the original time range
    # [start_idx, end_idx] includes both endpoints, hence the +1 on end_idx.
    if start_idx < 0:
        start_idx = max_time_num + start_idx
    if end_idx < 0:
        end_idx = max_time_num + end_idx + 1
    else:
        end_idx = end_idx + 1

    if method == 'DynGEM':
        assert duration == 1
    assert start_idx + 1 - duration >= 0
    assert duration > look_back

    t1 = time.time()
    time_list = []
    print('start ' + method + ' embedding!')
    for idx in range(start_idx, end_idx):
        print('idx = ', idx)
        # DynGEM, DynAE, DynRNN and DynAERNN take the original adjacency matrices as
        # input, so normalization is not needed (normalize=False, add_eye=False).
        adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx - duration + 1, duration=duration,
                                                 sep=file_sep, normalize=False, add_eye=False, data_type='matrix')
        adj_list = [adj.tolil() for adj in adj_list]
        model = model_dict[method](input_dim=node_num, output_dim=embed_dim, look_back=look_back,
                                   n_units=n_units, ae_units=ae_units, rnn_units=rnn_units, bias=bias)
        if method == 'DynGEM':
            loss = DynGEMLoss(alpha=alpha, beta=beta, nu1=nu1, nu2=nu2)
            batch_generator = DynGEMBatchGenerator(node_list=node_list, batch_size=batch_size, beta=beta,
                                                   shuffle=shuffle, has_cuda=has_cuda)
            batch_predictor = DynGEMBatchPredictor(node_list=node_list, batch_size=batch_size, has_cuda=has_cuda)
        else:  # DynAE, DynRNN, DynAERNN share the dynGraph2vec loss and batch utilities
            loss = DynGraph2VecLoss(beta=beta, nu1=nu1, nu2=nu2)
            batch_generator = BatchGenerator(node_list=node_list, batch_size=batch_size, look_back=look_back,
                                             beta=beta, shuffle=shuffle, has_cuda=has_cuda)
            batch_predictor = BatchPredictor(node_list=node_list, batch_size=batch_size, has_cuda=has_cuda)
        trainer = DynamicEmbedding(base_path=base_path, origin_folder=origin_folder, embedding_folder=embedding_folder,
                                   node_list=nodes_set['node'].tolist(), model=model, loss=loss,
                                   batch_generator=batch_generator, batch_predictor=batch_predictor,
                                   model_folder=model_folder, has_cuda=has_cuda)
        cost_time = trainer.learn_embedding(adj_list, epoch=epoch, lr=lr, idx=idx, model_file=model_file,
                                            load_model=load_model, export=export)
        time_list.append(cost_time)

    # Record the time cost of DynGEM, DynAE, DynRNN, DynAERNN
    if record_time:
        df_output = pd.DataFrame({'time': time_list})
        df_output.to_csv(os.path.join(base_path, method + '_time.csv'), sep=',', index=False)
    t2 = time.time()
    print('finish ' + method + ' embedding! cost time: ', t2 - t1, ' seconds!')
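# Hypothetical usage sketch (not from the source): every key below is read by
# dyngem_embedding() above, but the concrete values (paths, layer sizes,
# hyper-parameters) are placeholder assumptions for illustration only.
example_args = {
    'base_path': 'data/enron',             # assumed dataset root
    'origin_folder': '1.format',           # assumed snapshot folder
    'embed_folder': '2.embedding/DynAE',
    'model_folder': 'model',
    'model_file': 'dyn_ae',
    'node_file': 'nodes_set/nodes.csv',
    'file_sep': '\t',
    'start_idx': 1, 'end_idx': -1, 'duration': 2,
    'embed_dim': 128, 'has_cuda': False,
    'epoch': 50, 'lr': 1e-3, 'batch_size': 256,
    'load_model': False, 'shuffle': True, 'export': True, 'record_time': True,
    'n_units': [500, 300],                 # encoder layer sizes for DynAE
    'look_back': 1,
    'beta': 5, 'nu1': 1e-4, 'nu2': 1e-4, 'bias': True,
}
dyngem_embedding('DynAE', example_args)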