Example #1
def get_data_loader(args):
    # Resolve the dataset paths given in the args dict and build a DataLoader over the node list.
    base_path = args['base_path']
    origin_folder = args['origin_folder']
    core_folder = args.get('core_folder', None)
    nfeature_folder = args.get('nfeature_folder', None)
    node_file = args['node_file']
    has_cuda = args['has_cuda']

    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_list = nodes_set['node'].tolist()
    node_num = nodes_set.shape[0]

    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder)) if origin_folder else None
    core_base_path = os.path.abspath(os.path.join(base_path, core_folder)) if core_folder else None
    node_feature_path = os.path.abspath(os.path.join(base_path, nfeature_folder)) if nfeature_folder else None
    # one snapshot file per time step, so the folder size gives the number of time steps
    max_time_num = len(os.listdir(origin_base_path)) if origin_base_path else len(os.listdir(core_base_path))
    assert max_time_num > 0

    data_loader = DataLoader(node_list, max_time_num, has_cuda=has_cuda)
    # write the resolved paths and node count back into args so the caller can reuse them
    args['origin_base_path'] = origin_base_path
    args['core_base_path'] = core_base_path
    args['nfeature_path'] = node_feature_path
    args['node_num'] = node_num
    return data_loader
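For reference, a minimal, hypothetical call to get_data_loader could look like the sketch below; only the dictionary keys are taken from the function above, while the folder and file names are placeholders.

args = {
    'base_path': 'data/my_graph',        # hypothetical dataset root
    'origin_folder': '1.format',         # folder with one snapshot file per time step
    'node_file': 'nodes_set/nodes.csv',  # CSV listing one node id per line
    'has_cuda': False,
}
data_loader = get_data_loader(args)
print(args['node_num'], args['origin_base_path'])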
Example #2
    def train(self, path, epochs=50, log_step=10, resume=False):
        '''
            Trains the fall detection model on the provided data.
            Args:
                path     : Path to the dataset directory
                epochs   : Number of training epochs
                log_step : Interval (in epochs) between validation runs
                resume   : Whether to resume from the last saved checkpoint
        '''

        print("Beginning the training process...")

        # Reuse a previously pickled dataset if one exists; otherwise load it and cache it.
        if os.path.isfile('loaded_dataset.pkl'):
            data = pickle.load(open('loaded_dataset.pkl', 'rb'))
        else:
            data = DataLoader(path)
            pickle.dump(data, open('loaded_dataset.pkl', 'wb'))

        # load min validation loss
        valid_file = os.path.join(model_dir, "min_valid_loss.txt")
        if os.path.isfile(valid_file):
            min_valid_loss = float(open(valid_file).read().strip())
        else:
            min_valid_loss = 1000000.0

        with tf.Session() as session:
            writer = tf.summary.FileWriter(TB_DIR, session.graph)

            resumed = False
            if resume:
                try:
                    self.saver.restore(session, self.model_file)
                    resumed = True
                except Exception:
                    print(
                        "No previous checkpoint file found, restarting training..."
                    )

            if not resumed:
                session.run(tf.global_variables_initializer())

            for e in range(epochs):
                avg_loss = []
                for batch_x, batch_y in data.next_batch(self.batch_size,
                                                        training=True):
                    batch_loss, _, tb_op = session.run(
                        [self.loss, self.train_step, self.tensorboard_op],
                        feed_dict={
                            self.x: batch_x,
                            self.y: batch_y,
                            self.is_training: True
                        })
                    avg_loss.append(batch_loss)

                print("Average Loss for epoch {} = {}.".format(
                    e,
                    sum(avg_loss) / len(avg_loss)))

                if e % log_step == 0:
                    # Run for validation set
                    avg_loss = []
                    avg_accuracy = []
                    for batch_x, batch_y in data.next_batch(self.batch_size,
                                                            training=False,
                                                            validation=True):
                        batch_loss, batch_acc = session.run(
                            [self.loss, self.accuracy],
                            feed_dict={
                                self.x: batch_x,
                                self.y: batch_y,
                                self.is_training: False
                            })
                        avg_loss.append(batch_loss)
                        avg_accuracy.append(batch_acc)

                    avg_accuracy = sum(avg_accuracy) / len(avg_accuracy)
                    avg_loss = sum(avg_loss) / len(avg_loss)

                    if avg_loss < min_valid_loss:
                        min_valid_loss = avg_loss
                        with open(valid_file, 'w') as f:
                            f.write(str(avg_loss))
                        self.save_session(session)

                    print(
                        "Validation Error: {}. Validation Accuracy: {}".format(
                            avg_loss, avg_accuracy))

                writer.add_summary(tb_op, e)

            writer.close()

        print("Training complete!")

        print(self.evaluate(data))
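Note that the dataset cache above is written to a fixed file name, so calling train() with a different dataset path would silently reuse the old pickle. A minimal, hypothetical path-aware variant (the helper name and hashing scheme are assumptions, not part of the original code):

import hashlib
import os
import pickle

def load_or_cache_dataset(path, cache_dir='.'):
    # Derive the cache file name from the dataset path so that different
    # datasets do not overwrite or reuse each other's cache.
    key = hashlib.md5(path.encode('utf-8')).hexdigest()[:8]
    cache_file = os.path.join(cache_dir, 'loaded_dataset_{}.pkl'.format(key))
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    data = DataLoader(path)  # same DataLoader as used by train() above
    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)
    return data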
Example #3
torch.cuda.set_device(args.gpu)

logging.info('generate config')

pretrained_embedding = pkl.load(open(args.emb_file, 'rb'))
config = Config(vocab_size=pretrained_embedding.shape[0],
                embedding_dim=pretrained_embedding.shape[1],
                position_size=500,
                position_dim=50,
                word_input_size=100,
                sent_input_size=2 * args.hidden,
                word_GRU_hidden_units=args.hidden,
                sent_GRU_hidden_units=args.hidden,
                pretrained_embedding=pretrained_embedding)

word2id = pkl.load(open('../data/word2id.pkl', 'rb'))

logging.info('loading test dataset')
test_dataset = pkl.load(open(args.test_file, 'rb'))
test_loader = DataLoader(test_dataset, shuffle=False)

net = SummaRuNNer(config).cuda()
net.load_state_dict(torch.load(args.model_file))

for index, docs in enumerate(test_loader):
    doc = docs[0]
    x, y = prepare_data(doc, word2id)
    sents = Variable(torch.from_numpy(x)).cuda()
    outputs = net(sents)
    hyp, gold, predict = test(doc, outputs.data.tolist(), index)
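Since the loop above only runs inference, it usually helps to switch the network to evaluation mode and disable gradient tracking. A sketch assuming a PyTorch version that provides torch.no_grad() (the original snippet uses the older Variable API):

net.eval()  # disable dropout / use inference statistics in batch norm
with torch.no_grad():  # skip autograd bookkeeping while testing
    for index, docs in enumerate(test_loader):
        doc = docs[0]
        x, y = prepare_data(doc, word2id)
        sents = torch.from_numpy(x).cuda()
        outputs = net(sents)
        hyp, gold, predict = test(doc, outputs.data.tolist(), index)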
Example #4
pretrained_embedding = pkl.load(open(args.emb_file, 'rb'))
config = Config(vocab_size=pretrained_embedding.shape[0],
                embedding_dim=pretrained_embedding.shape[1],
                position_size=500,
                position_dim=50,
                word_input_size=100,
                sent_input_size=2 * args.hidden,
                word_GRU_hidden_units=args.hidden,
                sent_GRU_hidden_units=args.hidden,
                pretrained_embedding=pretrained_embedding)

word2id = pkl.load(open('../data/word2id.pkl', 'rb'))

logging.info('loading train dataset')
train_dataset = pkl.load(open(args.train_file, 'rb'))
train_loader = DataLoader(train_dataset)

logging.info('loading validation dataset')
validation_dataset = pkl.load(open(args.validation_file, 'rb'))
validation_loader = DataLoader(validation_dataset, shuffle=False)

net = SummaRuNNer(config)
net.cuda()

# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)

# training
loss_sum = 0
min_eval_loss = float('Inf')
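The excerpt stops right after the optimizer is created. Purely as an illustration of how the objects above are usually wired together, and not the original training loop, a rough sketch might be (the epoch count, prepare_data usage and label handling are assumptions):

for epoch in range(10):  # hypothetical epoch count
    for step, docs in enumerate(train_loader):
        doc = docs[0]
        x, y = prepare_data(doc, word2id)  # assumed helper, as in the test script
        sents = Variable(torch.from_numpy(x)).cuda()
        labels = Variable(torch.FloatTensor(y)).cuda()
        probs = net(sents)
        loss = criterion(probs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += float(loss.data)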
Example #5
def dyngem_embedding(method, args):
    assert method in ['DynGEM', 'DynAE', 'DynRNN', 'DynAERNN']
    from baseline.dynRNN import DynRNN
    from baseline.dynAERNN import DynAERNN
    from baseline.dynGEM import DynGEM, DynGEMLoss, DynGEMBatchGenerator, DynGEMBatchPredictor
    # DynAE (used in model_dict below) and the DynGraph2Vec helpers used further down are
    # assumed to live in the same baseline package:
    from baseline.dynAE import DynAE, DynGraph2VecLoss, BatchGenerator, BatchPredictor
    model_dict = {'DynGEM': DynGEM, 'DynAE': DynAE, 'DynRNN': DynRNN, 'DynAERNN': DynAERNN}

    # DynGEM, DynAE, DynRNN, DynAERNN common params
    base_path = args['base_path']
    origin_folder = args['origin_folder']
    embedding_folder = args['embed_folder']
    model_folder = args['model_folder']
    model_file = args['model_file']
    node_file = args['node_file']
    file_sep = args['file_sep']
    start_idx = args['start_idx']
    end_idx = args['end_idx']
    duration = args['duration']
    embed_dim = args['embed_dim']
    has_cuda = args['has_cuda']
    epoch = args['epoch']
    lr = args['lr']
    batch_size = args['batch_size']
    load_model = args['load_model']
    shuffle = args['shuffle']
    export = args['export']
    record_time = args['record_time']

    # DynGEM, DynAE, DynRNN, DynAERNN model params
    n_units, ae_units, rnn_units = [], [], []
    look_back, alpha = 0, 0
    if method in ['DynGEM', 'DynAE', 'DynRNN']:
        n_units = args['n_units']
    else:  # DynAERNN
        ae_units = args['ae_units']
        rnn_units = args['rnn_units']
    if method in ['DynAE', 'DynRNN', 'DynAERNN']:
        look_back = args['look_back']
        assert look_back > 0
    else:  # DynGEM
        alpha = args['alpha']
    beta = args['beta']
    nu1 = args['nu1']
    nu2 = args['nu2']
    bias = args['bias']

    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    max_time_num = len(os.listdir(origin_base_path))
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_num = nodes_set.shape[0]
    node_list = nodes_set['node'].tolist()
    data_loader = DataLoader(node_list, max_time_num, has_cuda=has_cuda)

    if start_idx < 0:
        start_idx = max_time_num + start_idx
    if end_idx < 0:  # the original time range [start_idx, end_idx] is inclusive of both endpoints
        end_idx = max_time_num + end_idx + 1
    else:
        end_idx = end_idx + 1

    if method == 'DynGEM':
        assert duration == 1
    assert start_idx + 1 - duration >= 0
    assert duration > look_back

    t1 = time.time()
    time_list = []

    print('start ' + method + ' embedding!')
    for idx in range(start_idx, end_idx):
        print('idx = ', idx)
        # DynGEM, DynAE, DynRNN and DynAERNN take the original adjacency matrices as input,
        # so no normalization is needed (normalize=False, add_eye=False).
        adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx - duration + 1, duration=duration, sep=file_sep, normalize=False, add_eye=False, data_type='matrix')
        adj_list = [adj.tolil() for adj in adj_list]
        model = model_dict[method](input_dim=node_num, output_dim=embed_dim, look_back=look_back, n_units=n_units, ae_units=ae_units, rnn_units=rnn_units, bias=bias)
        if method == 'DynGEM':
            loss = DynGEMLoss(alpha=alpha, beta=beta, nu1=nu1, nu2=nu2)
            batch_generator = DynGEMBatchGenerator(node_list=node_list, batch_size=batch_size, beta=beta, shuffle=shuffle, has_cuda=has_cuda)
            batch_predictor = DynGEMBatchPredictor(node_list=node_list, batch_size=batch_size, has_cuda=has_cuda)
        else:
            loss = DynGraph2VecLoss(beta=beta, nu1=nu1, nu2=nu2)
            batch_generator = BatchGenerator(node_list=node_list, batch_size=batch_size, look_back=look_back, beta=beta, shuffle=shuffle, has_cuda=has_cuda)
            batch_predictor = BatchPredictor(node_list=node_list, batch_size=batch_size, has_cuda=has_cuda)
        trainer = DynamicEmbedding(base_path=base_path, origin_folder=origin_folder, embedding_folder=embedding_folder, node_list=nodes_set['node'].tolist(), model=model, loss=loss,
                                             batch_generator=batch_generator, batch_predictor=batch_predictor, model_folder=model_folder, has_cuda=has_cuda)
        cost_time = trainer.learn_embedding(adj_list, epoch=epoch, lr=lr, idx=idx, model_file=model_file, load_model=load_model, export=export)
        time_list.append(cost_time)

    # record time cost of DynGEM, DynAE, DynRNN, DynAERNN
    if record_time:
        df_output = pd.DataFrame({'time': time_list})
        df_output.to_csv(os.path.join(base_path, method + '_time.csv'), sep=',', index=False)
    t2 = time.time()
    print('finish ' + method + ' embedding! cost time: ', t2 - t1, ' seconds!')
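For orientation, a hypothetical args dict for dyngem_embedding is sketched below; only the keys are taken from the function above, while the folder names and hyper-parameter values are placeholders.

args = {
    'base_path': 'data/my_graph', 'origin_folder': '1.format', 'embed_folder': '2.embedding',
    'model_folder': 'model', 'model_file': 'dynAE_model.pt', 'node_file': 'nodes_set/nodes.csv',
    'file_sep': '\t', 'start_idx': 1, 'end_idx': -1, 'duration': 2, 'embed_dim': 128,
    'has_cuda': False, 'epoch': 50, 'lr': 1e-3, 'batch_size': 256, 'load_model': False,
    'shuffle': True, 'export': True, 'record_time': True,
    # DynAE-specific hyper-parameters (placeholder values)
    'n_units': [500, 300], 'look_back': 1, 'beta': 5, 'nu1': 1e-5, 'nu2': 1e-4, 'bias': True,
}
dyngem_embedding('DynAE', args)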