import numpy as np
# Note: `datasets` and `utils` below refer to the surrounding project's own
# modules (simulated-data factory and dataset loaders); they are assumed importable.

def load_dataset(dataset):
    '''
    Returns a tuple (x, y, strata) in which:
        x : data matrix of shape (num_examples, num_covariates)
        y : two-column array holding the censor and event-time variables for each row of x
        strata : cross-validation strata (None unless requested inside format_to_optunity)
    '''
    ## Define internal functions
    def format_to_optunity(dataset, strata=False):
        '''
        Formats a dataset dictionary containing survival data with keys: 
            { 
                'x' : baseline data
                'e' : censor
                't' : event time
            }
        to a format that Optunity can use to run hyper-parameter searches on.
        '''
        x = dataset['x']
        e = dataset['e']
        t = dataset['t']
        y = np.column_stack((e, t))
        # Take the indices of censored entries as strata
        if strata:
            strata = [np.nonzero(np.logical_not(e).astype(np.int32))[0].tolist()]
        else:
            strata = None
        return (x, y, strata)

    def load_simulated_dataset(dataset):
        # Default values
        NUM_EXAMPLES = 5000
        treatment_group = False
        hr_ratio = 5

        # Check if experiment is treatment group        
        if dataset == 'treatment':
            hr_ratio = 10
            treatment_group = True
            dataset = 'gaussian'

        # Generate Data
        factory = datasets.SimulatedData(hr_ratio=hr_ratio,
                average_death=5,
                censor_mode='observed_p', observed_p=0.5,
                num_features=10, num_var=2,
                treatment_group=treatment_group)
        ds = factory.generate_data(NUM_EXAMPLES, method=dataset)

        return ds

    # Simulated Data Experiments
    if dataset in ['linear', 'gaussian', 'treatment']:
        ds = load_simulated_dataset(dataset)
    else:
        # If not a simulated dataset, load the dataset
        ds = utils.load_datasets(dataset)['train']
    
    return format_to_optunity(ds)
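
A minimal usage sketch; the shapes follow from the defaults above (NUM_EXAMPLES=5000, num_features=10) and the 'linear' method comes from the branch in the snippet:

x, y, strata = load_dataset('linear')
print(x.shape)  # expected: (5000, 10)
print(y.shape)  # expected: (5000, 2) -- column 0 is the censor flag, column 1 the event time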
Example #2
 def __init__(self, splits):
     self.error_margin = 3.0
     self.splits = splits
     self.gt = {}
     self.instr_ids = []
     self.scans = []
     for item in load_datasets(splits):
         self.gt[item['path_id']] = item
         self.scans.append(item['scan'])
         self.instr_ids += ['%d_%d' % (item['path_id'],i) for i in range(3)]
     self.scans = set(self.scans)
     self.instr_ids = set(self.instr_ids)
     self.graphs = load_nav_graphs(self.scans)
     self.distances = {}
     for scan, G in self.graphs.items(): # compute all shortest paths
         self.distances[scan] = dict(nx.all_pairs_dijkstra_path_length(G))
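
The ground-truth table and precomputed distances above are typically consumed by a scoring routine; a hedged sketch (the method name `_score_item` and the trajectory layout are assumptions, not this repo's exact code):

 def _score_item(self, instr_id, path):
     # instr_id is 'path_id_i'; path is assumed to be a list of (viewpoint, heading, elevation)
     gt = self.gt[int(instr_id.split('_')[0])]
     goal = gt['path'][-1]
     final_viewpoint = path[-1][0]
     dist = self.distances[gt['scan']][final_viewpoint][goal]
     return dist <= self.error_margin  # success within the 3.0 m margin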
Example #3
 def __init__(self, feature_store, batch_size=100, seed=10, splits=['train'], tokenizer=None):
     self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
     self.data = []
     self.scans = []
     for item in load_datasets(splits):  
         # Split multiple instructions into separate entries
         for j,instr in enumerate(item['instructions']):
             self.scans.append(item['scan'])
             new_item = dict(item)
             new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
             new_item['instructions'] = instr
             if tokenizer:
                 new_item['instr_encoding'] = tokenizer.encode_sentence(instr)
             self.data.append(new_item)
     self.scans = set(self.scans)
     self.splits = splits
     self.seed = seed
     random.seed(self.seed)
     random.shuffle(self.data)
     self.ix = 0
     self.batch_size = batch_size
     self._load_nav_graphs()
     print('R2RBatch loaded with %d instructions, using splits: %s' % (len(self.data), ",".join(splits)))
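
The `self.ix` cursor initialized above usually drives a wrap-around minibatch sampler; a sketch of the common pattern (the method name `_next_minibatch` is an assumption):

 def _next_minibatch(self):
     batch = self.data[self.ix:self.ix + self.batch_size]
     if len(batch) < self.batch_size:
         # wrapped past the end of an epoch: reshuffle and take the remainder
         random.shuffle(self.data)
         self.ix = self.batch_size - len(batch)
         batch += self.data[:self.ix]
     else:
         self.ix += self.batch_size
     self.batch = batch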
Example #4
def main():
    with open(params_path + 'params_' + args.dataset + '.json') as f:
        params = json.load(f)
    print(params)

 

    CI_list = np.zeros(n_runs)
    loss_list = np.zeros(n_runs)
    tic = time.time()
    for i in range(n_runs):
        dataset_name = dataset_path+'/'+dataset_type+'/'+str(i)+raw_datafile
        print(dataset_name)
        datasets = utils.load_datasets(dataset_name)  # load the per-run file assembled above
        train_data = datasets['train']
        norm_vals = {
                'mean' : datasets['train']['x'].mean(axis=0),
                'std'  : datasets['train']['x'].std(axis=0)
            }
        valid_data = datasets['valid']  # default to the raw splits; otherwise undefined when standardize is off
        test_data = datasets['test']
        print(params['standardize'])
        if params['standardize']:
            train_data = utils.standardize_dataset(datasets['train'], norm_vals['mean'], norm_vals['std'])
            valid_data = utils.standardize_dataset(datasets['valid'], norm_vals['mean'], norm_vals['std'])
            test_data = utils.standardize_dataset(datasets['test'], norm_vals['mean'], norm_vals['std'])

        print('dataset shape: \n')
        print('train x: {}, valid x: {}, test x: {}'.format(train_data['x'].shape, valid_data['x'].shape, test_data['x'].shape))
        print('train e: {}, valid e: {}, test e: {}'.format(train_data['e'].shape, valid_data['e'].shape, test_data['e'].shape))
        print('train t: {}, valid t: {}, test t: {}'.format(train_data['t'].shape, valid_data['t'].shape, test_data['t'].shape))
     
        train_X = train_data['x']
        train_y = {'e': train_data['e'], 't': train_data['t']}
        valid_X = valid_data['x']
        valid_y = {'e': valid_data['e'], 't': valid_data['t']}
        test_X = test_data['x']
        test_y = {'e': test_data['e'], 't': test_data['t']}
        input_nodes = train_X.shape[1]
        output_nodes = 1


        test_data = {}
        valid_data = {}
        '''
        test_data['X'], test_data['E'], \
            test_data['T'], test_data['failures'], \
            test_data['atrisk'], test_data['ties'] = utils.parse_data(test_X, test_y)
        '''
        test_data['X'], test_data['E'], \
                test_data['T'] = utils.prepare_data(test_X, test_y)
        test_data['ties']='noties'

        valid_data['X'], valid_data['E'], \
                valid_data['T'] = utils.prepare_data(valid_X, valid_y)
        valid_data['ties']='noties'

        test_label = {'t': test_data['T'], 'e': test_data['E']}
        valid_label = {'t': valid_data['T'], 'e': valid_data['E']}



        model = L2DeepSurv(train_X, train_y, valid_X, valid_y,
                           input_nodes, output_nodes, **params)

        output_dir = save_dir + '/run_' + str(i)
        if not os.path.exists(output_dir) and not args.debug:
            os.makedirs(output_dir)

        with open(os.path.join(output_dir, 'params.json'), 'w') as outfile:
            json.dump(params, outfile)

        # Plot curve of loss and CI on train data
        model.train(num_epoch=2000, iteration=100,
                    plot_loss=True, plot_CI=True, plot_gate=True, output_dir=output_dir)
        test_CI, test_loss = model.eval(test_data['X'], test_label)
        fin_val_CI, fin_val_loss = model.eval(valid_data['X'], valid_label)
        CI_list[i] = test_CI
        loss_list[i] = test_loss
    np.save(os.path.join(output_dir, 'CI_list'), CI_list)
    np.save(os.path.join(output_dir, 'loss_list'), loss_list)
    print("final valid CI: {}".format(fin_val_CI))
    print("final valid loss: {}".format(fin_val_loss))
    print("average test CI over {} runs:  {}".format(n_runs, CI_list.mean()))
    print("average test loss over {} runs:  {}".format(n_runs, loss_list.mean()))
    tac = time.time()
    print("Time: {}".format(tac - tic))
Example #5
File: env.py  Project: yangsikai/cvdn
    def __init__(self,
                 feature_store,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 path_type='planner_path',
                 history='target'):
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
        datasets = load_datasets(splits)
        self.data = []
        self.scans = [item['scan'] for item in datasets]
        self._load_nav_graphs()
        for item in datasets:
            # For every dialog history, stitch together a single instruction string.
            new_item = dict(item)
            new_item['inst_idx'] = item['inst_idx']
            if history == 'target' or len(
                    item['dialog_history']
            ) == 0:  # Have to use target only if no dialog history.
                tar = item['target']
                new_item['instructions'] = '<TAR> ' + tar
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(
                        [tar], seps=['<TAR>'])
            elif history == 'oracle_ans':
                ora_a = item['dialog_history'][-1][
                    'message']  # i.e., the last oracle utterance.
                tar = item['target']
                new_item['instructions'] = '<ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(
                        [ora_a, tar], seps=['<ORA>', '<TAR>'])
            elif history == 'nav_q_oracle_ans':
                nav_q = item['dialog_history'][-2]['message']
                ora_a = item['dialog_history'][-1]['message']
                tar = item['target']
                new_item[
                    'instructions'] = '<NAV> ' + nav_q + ' <ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    qa_enc = tokenizer.encode_sentence(
                        [nav_q, ora_a, tar], seps=['<NAV>', '<ORA>', '<TAR>'])
                    new_item['instr_encoding'] = qa_enc
            elif history == 'all':
                dia_inst = ''
                sentences = []
                seps = []
                for turn in item['dialog_history']:
                    sentences.append(turn['message'])
                    sep = '<NAV>' if turn['role'] == 'navigator' else '<ORA>'
                    seps.append(sep)
                    dia_inst += sep + ' ' + turn['message'] + ' '
                sentences.append(item['target'])
                seps.append('<TAR>')
                dia_inst += '<TAR> ' + item['target']
                new_item['instructions'] = dia_inst
                if tokenizer:
                    dia_enc = tokenizer.encode_sentence(sentences, seps=seps)
                    new_item['instr_encoding'] = dia_enc

            # TODO: add a flag to enable/disable nav history
            new_item['nav_hist_encoding'] = []
            for nav_idx, pano in enumerate(item['nav_history']):
                view_index = 0
                if nav_idx + 1 < len(item['nav_history']):
                    next_pano = item['nav_history'][nav_idx + 1]
                    pos = self.graphs[item['scan']].node[pano]['position']
                    next_pos = self.graphs[
                        item['scan']].node[next_pano]['position']
                    target_rel = next_pos - pos
                    target_heading = math.pi / 2.0 - math.atan2(
                        target_rel[1], target_rel[0])
                    if target_heading < 0:
                        target_heading += math.pi * 2
                    view_index = int(
                        round(target_heading / HEADING_INCREMENT) + 12)

                feature = self.env.features[item['scan'] + '_' +
                                            pano][view_index, :]
                new_item['nav_hist_encoding'].append(pano + '_' +
                                                     str(view_index))

            new_item['nav_hist_encoding'].reverse()
            new_item['nav_hist_encoding'] = tokenizer.pad_or_trunc_sequence(
                new_item['nav_hist_encoding'], NAV_HIST_SEQ_LENGTH, "")

            self.data.append(new_item)
        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self.path_type = path_type
        print('R2RBatch loaded with %d instructions, using splits: %s' % (len(
            self.data), ",".join(splits)))
Example #6
    output_file = os.path.join(output_dir,
                               '_'.join(['rsf', TIMESTRING, 'rec_surv.pdf']))
    viz.plot_survival_curves(experiment_name='RSF',
                             output_file=output_file,
                             **rec_dict)


if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    args = parse_args()
    print("Arguments:", args)

    # Load Dataset
    print("Loading datasets: " + args.dataset)
    datasets = utils.load_datasets(args.dataset)

    # Train CPH model
    print("Training CPH Model")
    train_df = utils.format_dataset_to_df(datasets['train'], DURATION_COL,
                                          EVENT_COL)
    cf = CoxPHFitter()
    results = cf.fit(train_df,
                     duration_col=DURATION_COL,
                     event_col=EVENT_COL,
                     include_likelihood=True)
    cf.print_summary()
    print("Train Likelihood: " + str(cf._log_likelihood))

    if 'valid' in datasets:
        metrics = evaluate_model(cf, datasets['valid'])
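
`evaluate_model` is not shown in this excerpt; a plausible minimal version using lifelines' concordance index (the dataset dict layout follows the other examples; treat this as an assumption, not the repo's actual code):

from lifelines.utils import concordance_index

def evaluate_model(cf, dataset):
    df = utils.format_dataset_to_df(dataset, DURATION_COL, EVENT_COL)
    # a higher partial hazard means shorter expected survival, hence the negation
    ci = concordance_index(df[DURATION_COL],
                           -cf.predict_partial_hazard(df).values.ravel(),
                           df[EVENT_COL])
    return {'c_index': ci}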
Example #7
    def train(self, epoch, train_env, tb_logger=None):
        batch_time = AverageMeter()
        losses = AverageMeter()
        dists = AverageMeter()
        movements = AverageMeter()
        val_losses = AverageMeter()
        val_acces = AverageMeter()

        print('Training on {} env ...'.format(train_env.splits[0]))
        # switch to train mode
        self.agent.env = train_env
        self.agent.encoder.train()
        self.agent.model.train()

        if self.opts.second_training:
            self.agent.model.first_stage_model.training = False

        self.agent.feedback = self.opts.feedback_training
        self.agent.value_loss = None
        self.agent.val_acc = None

        # load dataset path for computing ground truth distance
        self.agent.gt = {}
        for item in load_datasets(train_env.splits, self.opts):
            self.agent.gt[item['path_id']] = item

        end = time.time()
        for iter in range(1, self.train_iters_epoch + 1):
            # rollout the agent
            if self.opts.arch == 'self-monitoring':
                loss, traj = self.agent.rollout_monitor()
            elif self.opts.arch == 'speaker-baseline':
                loss, traj = self.agent.rollout()
            else:
                raise NotImplementedError()

            dist_from_goal = np.mean(self.agent.dist_from_goal)
            movement = np.mean(self.agent.traj_length)

            losses.update(loss.item(), self.opts.batch_size)
            dists.update(dist_from_goal, self.opts.batch_size)
            movements.update(movement, self.opts.batch_size)

            if self.agent.value_loss is not None:
                val_losses.update(self.agent.value_loss.item(),
                                  self.opts.batch_size)

            if self.agent.val_acc is not None:
                val_acces.update(np.mean(self.agent.val_acc),
                                 self.opts.batch_size)

            # zero the gradients before backward pass
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if tb_logger and iter % 10 == 0:
                current_iter = iter + (epoch - 1) * self.train_iters_epoch
                tb_logger.add_scalar('train/loss_train', loss, current_iter)
                tb_logger.add_scalar('train/dist_from_goal', dist_from_goal,
                                     current_iter)
                tb_logger.add_scalar('train/movements', movement, current_iter)
                if self.agent.value_loss is not None:
                    tb_logger.add_scalar('train/value_loss',
                                         self.agent.value_loss, current_iter)

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      epoch,
                      iter,
                      self.train_iters_epoch,
                      batch_time=batch_time,
                      loss=losses))

        if tb_logger:
            tb_logger.add_scalar('epoch/learning_rate',
                                 self.optimizer.param_groups[0]['lr'], epoch)
            tb_logger.add_scalar('epoch/train/loss', losses.avg, epoch)
            tb_logger.add_scalar('epoch/train/dist_from_goal', dists.avg,
                                 epoch)
            tb_logger.add_scalar('epoch/train/movements', movements.avg, epoch)
            if self.agent.value_loss is not None:
                tb_logger.add_scalar('epoch/train/val_loss', val_losses.avg,
                                     epoch)
            if self.agent.val_acc is not None:
                tb_logger.add_scalar('epoch/train/val_acc', val_acces.avg,
                                     epoch)
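
`AverageMeter` here behaves like the classic PyTorch-example helper (an `.update(val, n)` plus `.val`/`.avg` fields); a minimal version consistent with that usage:

class AverageMeter(object):
    '''Tracks the latest value and a running average.'''
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count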
Example #8
'''This repo contains the original code for the WACV 2021 paper
"Same Same But DifferNet: Semi-Supervised Defect Detection with Normalizing Flows"
by Marco Rudolph, Bastian Wandt and Bodo Rosenhahn.
For further information contact Marco Rudolph ([email protected])'''

import config as c
from train import *
from utils import load_datasets, make_dataloaders
import time
import gc
import json

_, _, test_set = load_datasets(c.dataset_path, c.class_name, test=True)
_, _, test_loader = make_dataloaders(None, None, test_set, test=True)

model = torch.load("models/" + c.modelname, map_location=torch.device('cpu'))

with open('models/' + c.modelname + '.json') as jsonfile:
    model_parameters = json.load(jsonfile)

time_start = time.time()
test(model, model_parameters, test_loader)
time_end = time.time()
time_c = time_end - time_start  # elapsed time

print("test time cost: {:f} s".format(time_c))
Example #9
    def train(self, epoch, train_env, tb_logger=None):
        batch_time = AverageMeter()
        losses = AverageMeter()
        dists = AverageMeter()
        movements = AverageMeter()
        val_losses = AverageMeter()
        val_acces = AverageMeter()
        aux_losses = AverageMeter()
        accuracy = AverageMeter()

        print('Training on {} env ...'.format(train_env.splits[0]))
        # switch to train mode
        self.agent.env = train_env
        self.agent.encoder.train()
        self.agent.model.train()

        self.agent.feedback = self.opts.feedback_training
        self.agent.value_loss = None
        self.agent.val_acc = None
        self.agent.rollback_attn = None

        # load dataset path for computing ground truth distance
        self.agent.gt = {}
        for item in load_datasets(train_env.splits, self.opts):
            self.agent.gt[item['path_id']] = item

        success_count, rollback_success_count, rollback_count, oscillating_success_count, oscillating_count = 0, 0, 0, 0, 0
        end = time.time()
        for iter in range(1, self.train_iters_epoch + 1):
            # roll out the agent
            if self.opts.arch == 'regretful':
                loss, traj, categories = self.agent.rollout_regret()
            elif self.opts.arch == 'self-monitoring':
                loss, traj, categories = self.agent.rollout_monitor()
            elif self.opts.arch == 'speaker-baseline':
                loss, traj = self.agent.rollout()
            else:
                raise NotImplementedError()

            # Calculate aux task accuracy
            acc = []
            for gt, predicted in zip(categories[0], categories[1]):
                max_index = predicted.max(dim=1)[1]
                acc.append(
                    ((max_index == gt).sum().item()) / (max_index.size()[0]))
            acc = np.mean(acc) * 100

            dist_from_goal = np.mean(self.agent.dist_from_goal)
            movement = np.mean(self.agent.traj_length)

            accuracy.update(acc, self.opts.batch_size)
            losses.update(loss.item(), self.opts.batch_size)
            dists.update(dist_from_goal, self.opts.batch_size)
            movements.update(movement, self.opts.batch_size)

            if self.agent.value_loss is not None:
                val_losses.update(self.agent.value_loss.item(),
                                  self.opts.batch_size)

            if self.agent.val_acc is not None:
                val_acces.update(np.mean(self.agent.val_acc),
                                 self.opts.batch_size)

            if self.agent.aux_loss is not None:
                aux_losses.update(self.agent.aux_loss.item(),
                                  self.opts.batch_size)

            # zero the gradients before backward pass
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if self.agent.rollback_attn is not None:
                if iter == 1:
                    rollback_attn = self.agent.rollback_attn
                else:
                    rollback_attn = np.concatenate(
                        (rollback_attn, self.agent.rollback_attn), axis=1)

            if tb_logger and iter % 10 == 0:
                current_iter = iter + (epoch - 1) * self.train_iters_epoch
                tb_logger.add_scalar('train/loss_train', loss, current_iter)
                tb_logger.add_scalar('train/dist_from_goal', dist_from_goal,
                                     current_iter)
                tb_logger.add_scalar('train/movements', movement, current_iter)
                tb_logger.add_scalar('train/aux_accuracy', acc, current_iter)
                if self.agent.value_loss is not None:
                    tb_logger.add_scalar('train/value_loss',
                                         self.agent.value_loss, current_iter)

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      epoch,
                      iter,
                      self.train_iters_epoch,
                      batch_time=batch_time,
                      loss=losses))

            success_count, rollback_success_count, rollback_count, oscillating_success_count, oscillating_count = \
                count_rollback_success(success_count, rollback_success_count, rollback_count, oscillating_success_count,
                                       oscillating_count, traj)

        if tb_logger:
            tb_logger.add_scalar('epoch/learning_rate',
                                 self.optimizer.param_groups[0]['lr'], epoch)
            tb_logger.add_scalar('epoch/train/loss', losses.avg, epoch)
            tb_logger.add_scalar('epoch/train/dist_from_goal', dists.avg,
                                 epoch)
            tb_logger.add_scalar('epoch/train/movements', movements.avg, epoch)
            tb_logger.add_scalar('epoch/train/aux_accuracy', accuracy.avg,
                                 epoch)
            if self.agent.value_loss is not None:
                tb_logger.add_scalar('epoch/train/val_loss', val_losses.avg,
                                     epoch)
            if self.agent.val_acc is not None:
                tb_logger.add_scalar('epoch/train/val_acc', val_acces.avg,
                                     epoch)
            if self.agent.rollback_attn is not None:
                for step in range(self.opts.max_episode_len):
                    tb_logger.add_histogram(
                        'epoch_train/rollback_attn_{}'.format(step),
                        rollback_attn[step], epoch)
            if self.agent.aux_loss is not None:
                tb_logger.add_scalar('epoch/train/aux_loss', aux_losses.avg,
                                     epoch)
            tb_logger.add_scalar('rollback_oscillation/train/rollback',
                                 rollback_count / len(train_env.data), epoch)
            tb_logger.add_scalar('rollback_oscillation/train/rollback_SR',
                                 rollback_success_count / len(train_env.data),
                                 epoch)
            tb_logger.add_scalar('rollback_oscillation/train/oscillating',
                                 oscillating_count / len(train_env.data),
                                 epoch)
            tb_logger.add_scalar(
                'rollback_oscillation/train/oscillating_SR',
                oscillating_success_count / len(train_env.data), epoch)
Example #10
    POST_POWER_DICT[key] = p_dic[key]['power'] if 'power' in p_dic[key] else 1

with open(CONFIG['inflect_templates_path'], 'rb') as f:
    inflect_templates = pickle.load(f)

with open(CONFIG['tags_path'], 'rb') as f:
    tpl_cls_dict = pickle.load(f)

lemma_cls_dict = {}
for lemma_tpl in inflect_templates:
    lemma_id = tpl_cls_dict[lemma_tpl]['i']
    for tpl in inflect_templates[lemma_tpl]:
        lemma_cls_dict[tpl_cls_dict[tpl]['i']] = lemma_id

lemma_dict = {}
for item in load_datasets('inflect', 'test', 'train', 'valid'):
    if item['id'] not in lemma_dict:
        lemma_dict[item['id']] = (item['x_src'], item['x_cls'])

ad_tags_dict = {}
with open(VECT_PATH, 'rb') as f:
    vec_words = pickle.load(f)
    for word in vec_words:
        item = vec_words[word]
        for form in item['forms']:
            lexeme_id_key = 'inflect_id' if 'inflect_id' in form else 'id'
            lexeme_id = form[lexeme_id_key]

            if 'ad_tags' not in form:
                continue
Example #11
    clf = make_pipeline(SMOTE(random_state=0), LinearSVC(random_state=0))
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[1], '(b)')
    fig.tight_layout()

    plt.savefig(join(analysis_path, 'resampling_decision_function.pdf'),
                bbox_inches='tight',
                pad_inches=0)


if __name__ == '__main__':

    data_path, results_path, analysis_path = generate_paths()

    # load datasets
    datasets = load_datasets(data_dir=data_path)

    # load results
    results = []
    for name in RESULTS_NAMES:
        file_path = join(results_path, f'{name}.pkl')
        results.append(pd.read_pickle(file_path))

    # combine and select results
    results = combine_results(*results)
    results = select_results(results,
                             oversamplers_names=OVRS_NAMES,
                             classifiers_names=CLFS_NAMES)

    # datasets description
    summarize_multiclass_datasets(datasets).to_csv(join(
Example #12
    rec_dict = utils.calculate_recs_and_antirecs(rec_trt, true_trt=trt_idx, dataset=dataset)
    
    output_file = os.path.join(output_dir, '_'.join(['deepsurv',TIMESTRING, 'rec_surv.pdf']))
    print(output_file)
    viz.plot_survival_curves(experiment_name='DeepSurv', output_file=output_file, **rec_dict)

def save_model(model, output_file):
    model.save_weights(output_file)

if __name__ == '__main__':
    args = parse_args()
    print("Arguments:",args)

    # Load Dataset
    print("Loading datasets: " + args.dataset)
    datasets = utils.load_datasets(args.dataset)
    norm_vals = {
            'mean' : datasets['train']['x'].mean(axis=0),
            'std'  : datasets['train']['x'].std(axis=0)
        }

    # Train Model

    # TODO standardize location of logs + results => have them go into same directory with same UUID of experiment
    tensor_log_dir = "/shared/data/logs/tensorboard_" + str(args.dataset) + "_" + str(uuid.uuid4())
    logger = TensorboardLogger("experiments.deep_surv", tensor_log_dir, update_freq=10)
    model = deep_surv.load_model_from_json(args.model, args.weights)
    if 'valid' in datasets:
        valid_data = datasets['valid']
    else:
        valid_data = None
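
`utils.standardize_dataset` (also used in Example #4) presumably applies train-set statistics to a dataset dict; a minimal stand-in under that assumption:

def standardize_dataset(dataset, offset, scale):
    norm_ds = dict(dataset)
    norm_ds['x'] = (dataset['x'] - offset) / scale
    return norm_ds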
Example #13
if create_features:
    result = subprocess.run('python create_features.py', shell=True)
    if result.returncode != 0:
        print('ERROR: create_features.py')
        quit()

# Get the list of features produced by create_features.py (currently unused)
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='features.json')
options = parser.parse_args()
feat_dict = json.load(open(options.config))
use_features = feat_dict['features']

# loading
path = cwd.replace(this_folder, '/features')
train_df, test_df = load_datasets(path, is_debug)

# Handle missing values
train_df, test_df = removeMissingColumns(train_df, test_df, 0.5)
logging.debug("Train shape: {}, test shape: {}".format(train_df.shape,
                                                       test_df.shape))

# model
"""
models, model_params, feature_importance_df, train_preds, test_preds, scores, model_name = kfold_lightgbm_without_outliers(
    train_df, test_df, target_col=target_col, model_loss=loss_type,
    num_folds=folds, feats_exclude=feats_exclude, stratified=False, use_gpu=use_GPU)
"""
models, model_params, feature_importance_df, train_preds, test_preds, scores, model_name = kfold_lightgbm(
    train_df,
    test_df,
Example #14
def pre_train(args):

    writer = SummaryWriter()

    model_G = Generator()
    model_G = nn.DataParallel(model_G)
    model_G = model_G.to(device)

    optim_G = torch.optim.Adam(model_G.parameters(),
                               lr=0.0002,
                               betas=(0.5, 0.999))

    loss_G = nn.L1Loss().to(device)

    result = {}
    result["train/mae_loss"] = []
    result["valid/mae_loss"] = []

    train_dataset = load_datasets(args.train_data, args.batch_size,
                                  args.patch_size, True)
    valid_dataset = load_datasets(args.valid_data, args.batch_size,
                                  args.patch_size, True)

    for i in range(args.pre_epochs):
        mae_loss = []

        for (real_color, input_gray, hint_color), _ in tqdm(train_dataset):

            real_color = real_color.to(device)
            input_gray = input_gray.to(device)
            hint_color = hint_color.to(device)

            optim_G.zero_grad()

            fake_color = model_G(input_gray, hint_color)

            g_loss = loss_G(fake_color, real_color)
            g_loss.backward()

            optim_G.step()

            mae_loss.append(g_loss.item())

        result["train/mae_loss"].append(statistics.mean(mae_loss))

        writer.add_scalar("pre_train/train/mae_loss",
                          result["train/mae_loss"][-1], i + 1)

        if (i + 1) % 1 == 0 or (i + 1) == args.epochs or i == 0:
            with torch.no_grad():
                mae_loss = []
                for (real_color, input_gray,
                     hint_color), _ in tqdm(valid_dataset):
                    batch_len = len(real_color)

                    real_color = real_color.to(device)
                    input_gray = input_gray.to(device)
                    hint_color = hint_color.to(device)

                    fake_color = model_G(input_gray, hint_color)

                    g_loss = loss_G(fake_color, real_color)

                    mae_loss.append(g_loss.item())

                result["valid/mae_loss"].append(statistics.mean(mae_loss))

                writer.add_scalar("pre_train/valid/mae_loss",
                                  result["valid/mae_loss"][-1], i + 1)

                torchvision.utils.save_image(real_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.pre_image_path,
                                                 f"real_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)
                torchvision.utils.save_image(fake_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.pre_image_path,
                                                 f"fake_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)
                torchvision.utils.save_image(hint_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.pre_image_path,
                                                 f"hint_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)

                torch.save(
                    model_G.state_dict(),
                    os.path.join(args.pre_model_path, f"gen_{i + 1:03}.pt"))

    writer.close()
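
A hedged driver for `pre_train`, with flag names inferred from the attributes the function reads; every flag and default below is an assumption:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_data', required=True)
    parser.add_argument('--valid_data', required=True)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--patch_size', type=int, default=128)
    parser.add_argument('--pre_epochs', type=int, default=20)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--pre_image_path', default='images/pre')
    parser.add_argument('--pre_model_path', default='models/pre')
    pre_train(parser.parse_args())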
Example #15
def train(args):

    writer = SummaryWriter()

    model_name = get_model_name(args.pre_model_path)
    print("loading...{}".format(model_name))

    model_G, model_D = Generator(), Discriminator(args.patch_size)
    model_G, model_D = nn.DataParallel(model_G), nn.DataParallel(model_D)
    model_G.load_state_dict(torch.load(model_name))
    model_G, model_D = model_G.to(device), model_D.to(device)

    optim_G = torch.optim.Adam(model_G.parameters(),
                               lr=0.0002,
                               betas=(0.5, 0.999))
    optim_D = torch.optim.Adam(model_D.parameters(),
                               lr=0.0002,
                               betas=(0.5, 0.999))

    loss_bce = nn.BCEWithLogitsLoss()
    loss_mse = nn.MSELoss()

    ones = torch.ones(512, 1, args.patch_size // 16,
                      args.patch_size // 16).to(device)
    zeros = torch.zeros(512, 1, args.patch_size // 16,
                        args.patch_size // 16).to(device)

    result = {}
    result["train/total_loss_G"] = []
    result["train/total_loss_D"] = []
    result["valid/total_loss_G"] = []
    result["valid/total_loss_D"] = []

    train_dataset = load_datasets(args.train_data, args.batch_size,
                                  args.patch_size, True)
    valid_dataset = load_datasets(args.valid_data, args.batch_size,
                                  args.patch_size, True)

    for i in range(args.epochs):
        total_loss_G, total_loss_D = [], []

        for (real_color, input_gray, hint_color), _ in tqdm(train_dataset):
            batch_len = len(real_color)

            real_color = real_color.to(device)
            input_gray = input_gray.to(device)
            hint_color = hint_color.to(device)

            optim_D.zero_grad()
            optim_G.zero_grad()

            fake_color = model_G(input_gray, hint_color)
            fake_color_tensor = fake_color.detach()

            fake_D = model_D(fake_color)

            g_mse_loss = loss_mse(real_color, fake_color)
            g_bce_loss = loss_bce(fake_D, ones[:batch_len])
            g_loss = args.LAMBDA * g_mse_loss + g_bce_loss
            g_loss.backward(retain_graph=True)
            optim_G.step()

            total_loss_G.append(g_loss.item())

            real_D_out = model_D(real_color)
            fake_D_out = model_D(fake_color_tensor)

            d_real_loss = loss_bce(real_D_out, ones[:batch_len])
            d_fake_loss = loss_bce(fake_D_out, zeros[:batch_len])
            d_loss = d_real_loss + d_fake_loss

            d_loss.backward()
            optim_D.step()

            total_loss_D.append(d_loss.item())

        result["train/total_loss_G"].append(statistics.mean(total_loss_G))
        result["train/total_loss_D"].append(statistics.mean(total_loss_D))

        writer.add_scalar('train/loss_G', result['train/total_loss_G'][-1],
                          i + 1)
        writer.add_scalar('train/loss_D', result['train/total_loss_D'][-1],
                          i + 1)

        if (i + 1) % 1 == 0 or (i + 1) == args.epochs or i == 0:

            with torch.no_grad():
                total_loss_G, total_loss_D = [], []
                for (real_color, input_gray,
                     hint_color), _ in tqdm(valid_dataset):
                    batch_len = len(real_color)

                    real_color = real_color.to(device)
                    input_gray = input_gray.to(device)
                    hint_color = hint_color.to(device)

                    fake_color = model_G(input_gray, hint_color)
                    fake_color_tensor = fake_color.detach()

                    fake_D = model_D(fake_color)

                    g_mse_loss = loss_mse(real_color, fake_color)
                    g_bce_loss = loss_bce(fake_D, ones[:batch_len])
                    g_loss = args.LAMBDA * g_mse_loss + g_bce_loss

                    total_loss_G.append(g_loss.item())

                    real_D_out = model_D(real_color)
                    fake_D_out = model_D(fake_color_tensor)

                    d_real_loss = loss_bce(real_D_out, ones[:batch_len])
                    d_fake_loss = loss_bce(fake_D_out, zeros[:batch_len])
                    d_loss = d_real_loss + d_fake_loss

                    total_loss_D.append(d_loss.item())

                result["valid/total_loss_G"].append(
                    statistics.mean(total_loss_G))
                result["valid/total_loss_D"].append(
                    statistics.mean(total_loss_D))

                writer.add_scalar('valid/loss_G',
                                  result['valid/total_loss_G'][-1], i + 1)
                writer.add_scalar('valid/loss_D',
                                  result['valid/total_loss_D'][-1], i + 1)

                torchvision.utils.save_image(real_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.image_path,
                                                 f"real_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)
                torchvision.utils.save_image(fake_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.image_path,
                                                 f"fake_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)
                torchvision.utils.save_image(hint_color[:min(batch_len, 50)],
                                             os.path.join(
                                                 args.image_path,
                                                 f"hint_epoch_{i + 1:03}.png"),
                                             nrow=5,
                                             range=(-1.0, 1.0),
                                             normalize=True)

                torch.save(model_G.state_dict(),
                           os.path.join(args.model_path, f"gen_{i + 1:03}.pt"))
                torch.save(model_D.state_dict(),
                           os.path.join(args.model_path, f"dis_{i + 1:03}.pt"))

    writer.close()
Example #16
import torch
from utils import load_datasets
from runner import Runner
import numpy as np

train_dataset, val_dataset, test_dataset = load_datasets()

encoder_file = input("Encoder file: ")
decoder_file = input("Decoder file: ")


encoder = torch.load("./{}".format(encoder_file))
decoder = torch.load("./{}".format(decoder_file))

runner = Runner(encoder, decoder, train_dataset, val_dataset, test_dataset)

bleu1, bleu4, test_loss = runner.test()
print("\n --- Test scores ---  \nBleu 1:     {} \nBleu4:      {} \nLoss:       {} \nPerplexity: {}"
              .format(bleu1, bleu4, test_loss, np.exp(test_loss)))
Example #17
    def __init__(self,
                 opts,
                 features,
                 img_spec,
                 batch_size=64,
                 seed=10,
                 splits=['train'],
                 tokenizer=None):
        self.env = PanoEnvBatch(features, img_spec, batch_size=batch_size)
        self.data = []
        self.scans = []
        self.opts = opts

        print('Loading {} dataset'.format(splits[0]))

        json_data = load_datasets(splits)
        total_length = len(json_data)

        # iteratively load data into system memory
        for i, item in enumerate(json_data):

            if not is_experiment() and i >= 20: break

            # Split multiple instructions into separate entries
            for j, instr in enumerate(item['instructions']):
                self.scans.append(item['scan'])
                new_item = dict(item)
                new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                new_item['instructions'] = instr
                if tokenizer:
                    if 'instr_encoding' not in item:  # we may already include 'instr_encoding' when generating synthetic instructions
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    else:
                        new_item['instr_encoding'] = item['instr_encoding']
                    if 'divide' in opts.lang_embed:
                        if opts.divide_method == 'kevin':
                            new_item[
                                'divid_instr_encoding'] = tokenizer.divide_instr_kevin(
                                    instr, opts.max_sentence_segs)
                        else:
                            new_item[
                                'divid_instr_encoding'] = tokenizer.divide_instr_victor(
                                    instr, opts.max_sentence_segs)
                self.data.append(new_item)
            print_progress(i + 1,
                           total_length,
                           prefix='Progress:',
                           suffix='Complete',
                           bar_length=50)

        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
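
`print_progress` is the usual console progress-bar helper; a minimal version matching the call signature above (a sketch, not the repo's exact code):

import sys

def print_progress(iteration, total, prefix='', suffix='', bar_length=50):
    filled = int(bar_length * iteration // total)
    bar = '#' * filled + '-' * (bar_length - filled)
    percent = 100.0 * iteration / total
    sys.stdout.write('\r{} |{}| {:.1f}% {}'.format(prefix, bar, percent, suffix))
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()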
Example #18
now = dt.datetime.now()
handler2 = logging.FileHandler(
    filename="./logs/sub_{0:%Y%m%d%H%M%S}.log".format(now))
logger.addHandler(handler2)
IDNAME = config["ID_name"] if "ID_name" in config else "id"
RANDOM_STATE = 0
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

features = config["features"]
logger.info(features)

target_name = config["target_name"]

logger.info("load datasets")
X_train_all, X_test, dims = load_datasets(features)

indexes = [
    f"{str_func(k)}{i}" if v > 1 else str_func(k) for k, v in dims.items()
    for i in range(v)
]

y_train_all = load_target(target_name)
logger.info(X_train_all.shape)

fmeasures = []
y_preds = []

params = config["params"]
model_name = config["model_name"]
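
To make the `indexes` comprehension above concrete, a toy run (`str_func` is assumed to be some feature-name normalizer; `str.lower` stands in for it):

dims = {'age': 1, 'embed': 3}
str_func = str.lower  # stand-in for the real normalizer
indexes = [f"{str_func(k)}{i}" if v > 1 else str_func(k)
           for k, v in dims.items() for i in range(v)]
# -> ['age', 'embed0', 'embed1', 'embed2']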
Example #19
    def __init__(self,
                 feature_store,
                 nav_graphs,
                 panoramic,
                 action_space,
                 encoder_type,
                 beam_size=1,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 att_ctx_merge='None',
                 min_n_sentences=-1):  # , subgoal
        self.env = EnvBatch(feature_store=feature_store,
                            batch_size=batch_size,
                            beam_size=beam_size)

        self.data = []
        self.scans = []
        self.panoramic = panoramic
        self.nav_graphs = nav_graphs
        self.action_space = action_space
        #self.ctrl_feature = ctrl_feature
        self.ctrl_feature = None
        self.att_ctx_merge = att_ctx_merge
        self.traj_n_sents = {
        }  # for EncoderMultiLSTM and trajectories that have more than 3 instructions
        if tokenizer:
            tokname = tokenizer.__class__.__name__

        data_dict = {
        }  # to merge instructions for same path scattered in different splits
        for item in load_datasets(splits, encoder_type):
            self.att_ctx_merge = att_ctx_merge
            if self.att_ctx_merge == 'None' or self.att_ctx_merge == 'Eval':
                j_offset = 0
                while '%s_%d' % (item['path_id'], j_offset) in data_dict:
                    j_offset += 1
                for j, instr in enumerate(item['instructions']):
                    self.scans.append(item['scan'])
                    new_item = dict(item)
                    new_item['instr_id'] = '%s_%d' % (item['path_id'],
                                                      j + j_offset)
                    if self.att_ctx_merge == 'None':
                        new_item['instructions'] = instr
                    elif self.att_ctx_merge == 'Eval':
                        new_item['instructions'] = [instr]

                    if tokenizer:
                        if self.att_ctx_merge == 'None':
                            new_item[
                                'instr_encoding'] = tokenizer.encode_sentence(
                                    instr)
                        elif self.att_ctx_merge == 'Eval':
                            new_item['instr_encoding'] = [
                                tokenizer.encode_sentence(instr)
                            ]

                    #self.data.append(new_item)
                    data_dict[new_item['instr_id']] = new_item
                    self.traj_n_sents[item['path_id']] = j + j_offset + 1
            else:
                self.scans.append(item['scan'])
                if tokenizer:
                    instr_encoding = [
                        tokenizer.encode_sentence(instr)
                        for instr in item['instructions']
                    ]

                if str(item['path_id']) not in data_dict:
                    new_item = dict(item)
                    new_item['instr_id'] = str(item['path_id'])
                    data_dict[new_item['instr_id']] = new_item
                    self.traj_n_sents[new_item['instr_id']] = 0
                    if tokenizer: new_item['instr_encoding'] = []
                else:
                    new_item = data_dict[str(item['path_id'])]
                    new_item['instructions'].extend(item['instructions'])

                if tokenizer: new_item['instr_encoding'].extend(instr_encoding)
                self.traj_n_sents[new_item['instr_id']] += len(
                    item['instructions'])

        sent_max_len = 0
        for traj in data_dict.values():
            if len(traj['instr_encoding']) < min_n_sentences:
                print('ignore path_id', traj['path_id'], 'with only',
                      len(traj['instr_encoding']), 'instructions')
                continue

            #sent_max_len = max(sent_max_len, max([len(tokenizer.split_sentence(instr)) for instr in traj['instructions']]))
            sent_max_len = max(
                sent_max_len,
                max([len(instr.split(' ')) for instr in traj['instructions']]))

            if min_n_sentences <= 0 or self.att_ctx_merge == 'None':
                self.data.append(traj)
            else:
                # add permutations to get more instruction groups
                for id_ix, id_perm in enumerate(
                        list(
                            combinations(
                                list(range(len(traj['instr_encoding']))),
                                min_n_sentences))):
                    new_traj = dict()
                    new_traj['instr_id'] = traj['instr_id'] + '_' + str(id_ix)
                    new_traj['instructions'] = list(
                        itemgetter(*id_perm)(traj['instructions']))
                    new_traj['instr_encoding'] = list(
                        itemgetter(*id_perm)(traj['instr_encoding']))
                    new_traj['distance'] = traj['distance']
                    new_traj['scan'] = traj['scan']
                    new_traj['path_id'] = traj['path_id']
                    new_traj['path'] = traj['path']
                    new_traj['heading'] = traj['heading']
                    self.data.append(new_traj)

                    if len(self.data) % 100000 == 0:
                        print('%d instructions' % (len(self.data)))

        print(
            'Average n_sentences:',
            sum([i for i in self.traj_n_sents.values()]) /
            len(self.traj_n_sents))
        print('Max sentence length:', sent_max_len)

        self.scans = set(self.scans)
        self.splits = splits
        if seed != 'resume':
            self.seed = seed
            random.seed(self.seed)
            random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        self.epo_inc = False  # jolin: middle of an epoch

        #self.env.pre_loadSims(self.data) # debug
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
Example #20
#prepare the log file
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
sc = logging.StreamHandler()
logger.addHandler(sc)
fh = logging.FileHandler(f'logs/log_{now}.log')
logger.addHandler(fh)
logger.debug(f'logs/log_{now}.log')
logger.debug(config_path)

#load in datasets and target
feats = config['feats']
target_name = config['target_name']
train, test = load_datasets(feats)
target = load_target(target_name)
molecule_name = feather.read_dataframe(
    './data/input/train.feather')['molecule_name'].values

if is_debug_mode:
    print("Debug mode is ON!")
    train = train.iloc[:10000]
    test = test.iloc[:1000]
    target = target.iloc[:10000]
    molecule_name = molecule_name[:10000]

train_type = train['type'].values
test_type = test['type'].values
logger.debug(feats)
Example #21
 def load_data(self):
     train = pd.read_csv("./data/raw/train.csv")
     X_train, X_test = load_datasets(self.features)
     y_train = train["target"].to_frame()
     return X_train, X_test, y_train
Example #22
'''This repo contains the original code for the WACV 2021 paper
"Same Same But DifferNet: Semi-Supervised Defect Detection with Normalizing Flows"
by Marco Rudolph, Bastian Wandt and Bodo Rosenhahn.
For further information contact Marco Rudolph ([email protected])'''

import config as c
from train import *
from utils import load_datasets, make_dataloaders
import time
import gc

train_set, validate_set, _ = load_datasets(c.dataset_path, c.class_name)
train_loader, validate_loader, _ = make_dataloaders(train_set, validate_set,
                                                    None)

time_start = time.time()
model, model_parameters = train(train_loader, validate_loader)
#model, model_config = train(train_loader, None)
time_end = time.time()
time_c = time_end - time_start  # elapsed time
print("train time cost: {:f} s".format(time_c))

# free memory
del train_set
del validate_set
del train_loader
del validate_loader

gc.collect()
torch.cuda.empty_cache()
Example #23
    def eval(self, epoch, val_env, tb_logger=None):
        batch_time = AverageMeter()
        losses = AverageMeter()
        dists = AverageMeter()
        movements = AverageMeter()
        val_losses = AverageMeter()
        val_acces = AverageMeter()
        aux_losses = AverageMeter()
        accuracy = AverageMeter()

        env_name, (env, evaluator) = val_env

        print('Evaluating on {} env ...'.format(env_name))

        self.agent.env = env
        self.agent.env.reset_epoch()
        self.agent.model.eval()
        self.agent.encoder.eval()
        self.agent.feedback = self.opts.feedback
        self.agent.value_loss = None
        self.agent.val_acc = None
        self.agent.rollback_attn = None

        # load dataset path for computing ground truth distance
        self.agent.gt = {}
        for item in load_datasets([env_name]):
            self.agent.gt[item['path_id']] = item
        val_iters_epoch = math.ceil(len(env.data) / self.opts.batch_size)
        self.agent.results = {}
        looped = False
        iter = 1
        success_count, rollback_success_count, rollback_count, oscillating_success_count, oscillating_count = 0, 0, 0, 0, 0

        with torch.no_grad():
            end = time.time()
            while True:

                # roll out the agent
                if self.opts.arch == 'regretful':
                    loss, traj, categories = self.agent.rollout_regret()
                elif self.opts.arch == 'self-monitoring':
                    loss, traj, categories = self.agent.rollout_monitor()
                elif self.opts.arch == 'speaker-baseline':
                    loss, traj = self.agent.rollout()
                else:
                    raise NotImplementedError()

                # Calculate accuracy
                acc = []
                for gt, predicted in zip(categories[0], categories[1]):
                    max_index = predicted.max(dim=1)[1]
                    acc.append(((max_index == gt).sum().item()) /
                               (max_index.size()[0]))
                acc = np.mean(acc) * 100

                dist_from_goal = np.mean(self.agent.dist_from_goal)
                movement = np.mean(self.agent.traj_length)

                accuracy.update(acc, self.opts.batch_size)
                losses.update(loss.item(), self.opts.batch_size)
                dists.update(dist_from_goal, self.opts.batch_size)
                movements.update(movement, self.opts.batch_size)
                if self.agent.value_loss is not None:
                    val_losses.update(self.agent.value_loss.item(),
                                      self.opts.batch_size)
                if self.agent.val_acc is not None:
                    val_acces.update(np.mean(self.agent.val_acc),
                                     self.opts.batch_size)
                if self.agent.aux_loss is not None:
                    aux_losses.update(self.agent.aux_loss.item(),
                                      self.opts.batch_size)

                if tb_logger and iter % 10 == 0:
                    current_iter = iter + (epoch - 1) * val_iters_epoch
                    tb_logger.add_scalar('{}/loss'.format(env_name), loss,
                                         current_iter)
                    tb_logger.add_scalar('{}/dist_from_goal'.format(env_name),
                                         dist_from_goal, current_iter)
                    tb_logger.add_scalar('{}/movements'.format(env_name),
                                         movement, current_iter)
                    tb_logger.add_scalar('{}/aux_acc'.format(env_name), acc,
                                         current_iter)
                    if self.agent.value_loss is not None:
                        tb_logger.add_scalar('{}/val_loss'.format(env_name),
                                             self.agent.value_loss,
                                             current_iter)

                success_count, rollback_success_count, rollback_count, oscillating_success_count, oscillating_count = \
                    count_rollback_success(success_count, rollback_success_count, rollback_count, oscillating_success_count, oscillating_count, traj)

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if self.agent.rollback_attn is not None:
                    if iter == 1:
                        rollback_attn = self.agent.rollback_attn
                    else:
                        rollback_attn = np.concatenate(
                            (rollback_attn, self.agent.rollback_attn), axis=1)

                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          epoch,
                          iter,
                          val_iters_epoch,
                          batch_time=batch_time,
                          loss=losses))

                # write into results
                for traj_ in traj:
                    if traj_['instr_id'] in self.agent.results:
                        looped = True
                    else:
                        result = {
                            'path': traj_['path'],
                            'distance': traj_['distance'],
                            'img_attn': traj_['img_attn'],
                            'ctx_attn': traj_['ctx_attn'],
                            'rollback_forward_attn':
                            traj_['rollback_forward_attn'],
                            'value': traj_['value'],
                            'viewpoint_idx': traj_['viewpoint_idx'],
                            'navigable_idx': traj_['navigable_idx']
                        }
                        self.agent.results[traj_['instr_id']] = result
                if looped:
                    break
                iter += 1

        print('============================')
        print('success rate: {}'.format(success_count / len(env.data)))
        print('rollback rate: {}'.format(rollback_count / len(env.data)))
        print('rollback success rate: {}'.format(rollback_success_count /
                                                 len(env.data)))
        print('oscillating rate: {}'.format(oscillating_count / len(env.data)))
        print('oscillating success rate: {}'.format(oscillating_success_count /
                                                    len(env.data)))
        print('============================')

        if tb_logger:
            tb_logger.add_scalar('epoch/{}/loss'.format(env_name), losses.avg,
                                 epoch)
            tb_logger.add_scalar('epoch/{}/dist_from_goal'.format(env_name),
                                 dists.avg, epoch)
            tb_logger.add_scalar('epoch/{}/movements'.format(env_name),
                                 movements.avg, epoch)
            tb_logger.add_scalar('epoch/{}/aux_accuracy'.format(env_name),
                                 accuracy.avg, epoch)
            if self.agent.value_loss is not None:
                tb_logger.add_scalar('epoch/{}/val_loss'.format(env_name),
                                     val_losses.avg, epoch)
            if self.agent.val_acc is not None:
                tb_logger.add_scalar('epoch/{}/val_acc'.format(env_name),
                                     val_acces.avg, epoch)
            if self.agent.rollback_attn is not None:
                for step in range(self.opts.max_episode_len):
                    tb_logger.add_histogram(
                        'epoch_{}/rollback_attn_{}'.format(env_name, step),
                        rollback_attn[step], epoch)
            if self.agent.aux_loss is not None:
                tb_logger.add_scalar('epoch/{}/aux_loss'.format(env_name),
                                     aux_losses.avg, epoch)
            tb_logger.add_scalar(
                'rollback_oscillation/{}/rollback'.format(env_name),
                rollback_count / len(env.data), epoch)
            tb_logger.add_scalar(
                'rollback_oscillation/{}/rollback_SR'.format(env_name),
                rollback_success_count / len(env.data), epoch)
            tb_logger.add_scalar(
                'rollback_oscillation/{}/oscillating'.format(env_name),
                oscillating_count / len(env.data), epoch)
            tb_logger.add_scalar(
                'rollback_oscillation/{}/oscillating_SR'.format(env_name),
                oscillating_success_count / len(env.data), epoch)

        # dump into JSON file
        self.agent.results_path = '{}{}_{}_epoch_{}.json'.format(
            self.opts.results_dir, self.opts.exp_name, env_name, epoch)
        self.agent.write_results()
        score_summary, _ = evaluator.score(self.agent.results_path)
        result_str = ''
        success_rate = 0.0
        for metric, val in score_summary.items():
            result_str += '| {}: {} '.format(metric, val)
            if metric in ['success_rate']:
                success_rate = val
            if tb_logger:
                tb_logger.add_scalar('score/{}/{}'.format(env_name, metric),
                                     val, epoch)
        print(result_str)

        return success_rate
Example #24
    def __init__(self,
                 feature_store,
                 obj_d_feat=None,
                 obj_s_feat=None,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        if obj_d_feat or obj_s_feat:
            self.env = ObjEnvBatch(feature_store=feature_store,
                                   obj_d_feat=obj_d_feat,
                                   obj_s_feat=obj_s_feat,
                                   batch_size=batch_size)
        else:
            self.env = EnvBatch(feature_store=feature_store,
                                batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        self.data = []
        if tokenizer:
            self.tok = tokenizer
        scans = []
        for split in splits:
            for item in load_datasets([split]):
                # Split multiple instructions into separate entries
                for j, instr in enumerate(item['instructions']):
                    if item['scan'] not in self.env.featurized_scans:  # For fast training
                        continue
                    new_item = dict(item)
                    new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        # Filter out items whose instructions failed to encode
                        self.data.append(new_item)
                        scans.append(item['scan'])
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name

        self.scans = set(scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature = utils.get_all_point_angle_feature()
        self.angle_avg_feature = utils.get_avg_point_angle_feature()
        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is identical to the real data
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
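`self.ix` and `self.batch_size` set up a minibatch cursor over `self.data`, but the batch-advance method itself falls outside this excerpt. A plausible sketch of how such a cursor wraps around an epoch boundary (hypothetical `_next_minibatch` helper, assuming only the fields initialized above):

    def _next_minibatch(self):
        batch = self.data[self.ix:self.ix + self.batch_size]
        if len(batch) < self.batch_size:
            # epoch boundary: reshuffle, then top the batch up from the start
            random.shuffle(self.data)
            self.ix = self.batch_size - len(batch)
            batch += self.data[:self.ix]
        else:
            self.ix += self.batch_size
        self.batch = batch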
Example #25
    5,
    'n_runs': 3,
    'rnd_seed': 0,
    'n_jobs': -1
}

if __name__ == '__main__':

    # Extract paths
    data_path, results_path, _ = generate_paths()

    # Load lucas dataset
    datasets = load_datasets(data_path=data_path, data_type='csv')

    # Extract oversamplers
    oversamplers = CONFIG['oversamplers']

    # Generate oversamplers
    for oversampler in oversamplers:

        # Define and fit experiment
        experiment = ImbalancedExperiment(
            oversamplers=[oversampler],
            classifiers=CONFIG['classifiers'],
            scoring=CONFIG['scoring'],
            n_splits=CONFIG['n_splits'],
            n_runs=CONFIG['n_runs'],
            random_state=CONFIG['rnd_seed'],
Example #26
    def __init__(self,
                 feature_store,
                 candidate_store,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store,
                            candidate_store=candidate_store,
                            batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        self.data = []
        if tokenizer:
            self.tok = tokenizer
        scans = []
        for split in splits:
            for item in load_datasets([split]):
                # Split multiple instructions into separate entries
                for j, instr in enumerate(item['instructions']):
                    if item['scan'] not in self.env.featurized_scans:  # For fast training
                        continue
                    new_item = dict(item)
                    new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        # Filter out items whose instructions failed to encode
                        self.data.append(new_item)
                        scans.append(item['scan'])
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name

        self.scans = set(scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        if args.filter != "":
            filter_name, percent = args.filter.split("_")
            percent = int(percent) / 100
            scan_list = list(self.scans)
            scan_list = sorted(scan_list)
            scan_num = len(scan_list)
            scan_num_in_use = int(scan_num * percent)
            scan_in_use = set(scan_list[:scan_num_in_use])
            data_in_use = [
                datum for datum in self.data if datum['scan'] in scan_in_use
            ]
            data_num_in_use = len(data_in_use)
            if self.name == 'train':
                if filter_name == 'env':
                    print("With the top %d scans and %d data" %
                          (scan_num_in_use, data_num_in_use))
                    print("With percent %0.4f and %0.4f" %
                          (scan_num_in_use / len(self.scans),
                           data_num_in_use / len(self.data)))
                    print(scan_in_use)
                    self.scans = scan_in_use
                    self.data = data_in_use
                    assert len(self.data) == data_num_in_use
                elif filter_name == 'data':
                    print("With the all %d scans and %d data" %
                          (len(self.scans), data_num_in_use))
                    self.data = self.data[:data_num_in_use]
                    for datum in self.data[:5]:
                        print(datum['instr_id'])
                    assert len(self.data) == data_num_in_use
            # elif self.name == 'aug':
            #     if filter_name == 'env':
            #         print("With the top %d scans and %d data" % (scan_num_in_use, data_num_in_use))
            #         print("With percent %0.4f and %0.4f" % (scan_num_in_use / len(self.scans), data_num_in_use / len(self.data)))
            #         print(scan_in_use)
            #         self.scans = scan_in_use
            #         self.data = data_in_use
            #         assert len(self.data) == data_num_in_use
            #     elif filter_name == 'data':
            #         print("With the all %d scans and %d data" % (len(self.scans), len(self.data)))

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature = utils.get_all_point_angle_feature()
        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is identical to the real data
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
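The `--filter` flag above packs a mode and a percentage into one underscore-separated string; a small illustration of the parsing convention (illustrative value, not from an original run):

# args.filter = 'env_50': keep the first 50% of the sorted scans and all of
# their data; 'data_50' would keep every scan but only the first 50% of items.
filter_name, percent = 'env_50'.split('_')   # -> ('env', '50')
percent = int(percent) / 100                 # -> 0.5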
Example #27
    def eval(self, epoch, val_env, tb_logger=None):
        batch_time = AverageMeter()
        losses = AverageMeter()
        dists = AverageMeter()
        movements = AverageMeter()
        val_losses = AverageMeter()
        val_acces = AverageMeter()

        env_name, (env, evaluator) = val_env

        print('Evaluating on {} env ...'.format(env_name))

        self.agent.env = env
        self.agent.env.reset_epoch()
        self.agent.model.eval()
        self.agent.encoder.eval()
        self.agent.feedback = self.opts.feedback
        self.agent.value_loss = None
        self.agent.val_acc = None

        # load dataset path for computing ground truth distance
        self.agent.gt = {}
        for item in load_datasets([env_name]):
            self.agent.gt[item['path_id']] = item
        val_iters_epoch = math.ceil(len(env.data) / self.opts.batch_size)
        self.agent.results = {}
        looped = False
        iter = 1

        with torch.no_grad():
            end = time.time()
            while True:

                if self.opts.progress_inference:
                    traj = self.agent.sample_progress_inference(
                        self.opts.beam_size)
                elif self.opts.eval_beam:
                    traj = self.agent.sample_beam(self.opts.beam_size)
                else:
                    # rollout the agent
                    if self.opts.arch == 'self-monitoring':
                        loss, traj = self.agent.rollout_monitor()
                    elif self.opts.arch == 'speaker-baseline':
                        loss, traj = self.agent.rollout()
                    else:
                        raise NotImplementedError()

                    dist_from_goal = np.mean(self.agent.dist_from_goal)
                    movement = np.mean(self.agent.traj_length)

                    losses.update(loss.item(), self.opts.batch_size)
                    dists.update(dist_from_goal, self.opts.batch_size)
                    movements.update(movement, self.opts.batch_size)
                    if self.agent.value_loss is not None:
                        val_losses.update(self.agent.value_loss.item(),
                                          self.opts.batch_size)
                    if self.agent.val_acc is not None:
                        val_acces.update(np.mean(self.agent.val_acc),
                                         self.opts.batch_size)

                    if tb_logger and iter % 10 == 0:
                        current_iter = iter + (epoch - 1) * val_iters_epoch
                        tb_logger.add_scalar('{}/loss'.format(env_name), loss,
                                             current_iter)
                        tb_logger.add_scalar(
                            '{}/dist_from_goal'.format(env_name),
                            dist_from_goal, current_iter)
                        tb_logger.add_scalar('{}/movements'.format(env_name),
                                             movement, current_iter)
                        if self.agent.value_loss is not None:
                            tb_logger.add_scalar(
                                '{}/val_loss'.format(env_name),
                                self.agent.value_loss, current_iter)

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          epoch,
                          iter,
                          val_iters_epoch,
                          batch_time=batch_time,
                          loss=losses))

                # write into results
                for traj_ in traj:
                    if traj_['instr_id'] in self.agent.results:
                        looped = True
                    else:
                        result = {
                            'path': traj_['path'],
                            'distance': traj_['distance'],
                            'img_attn': traj_['img_attn'],
                            'ctx_attn': traj_['ctx_attn'],
                            'value': traj_['value'],
                            'viewpoint_idx': traj_['viewpoint_idx'],
                            'navigable_idx': traj_['navigable_idx']
                        }
                        self.agent.results[traj_['instr_id']] = result
                if looped:
                    break
                iter += 1

        if tb_logger:
            tb_logger.add_scalar('epoch/{}/loss'.format(env_name), losses.avg,
                                 epoch)
            tb_logger.add_scalar('epoch/{}/dist_from_goal'.format(env_name),
                                 dists.avg, epoch)
            tb_logger.add_scalar('epoch/{}/movements'.format(env_name),
                                 movements.avg, epoch)
            if self.agent.value_loss is not None:
                tb_logger.add_scalar('epoch/{}/val_loss'.format(env_name),
                                     val_losses.avg, epoch)
            if self.agent.val_acc is not None:
                tb_logger.add_scalar('epoch/{}/val_acc'.format(env_name),
                                     val_acces.avg, epoch)

        # dump into JSON file
        if self.opts.eval_beam:
            self.agent.results_path = '{}{}-beam_{}_{}_epoch_{}.json'.format(
                self.opts.results_dir, self.opts.exp_name, self.opts.beam_size,
                env_name, epoch)
        else:
            self.agent.results_path = '{}{}_{}_epoch_{}.json'.format(
                self.opts.results_dir, self.opts.exp_name, env_name, epoch)
        self.agent.write_results()
        score_summary, _ = evaluator.score(self.agent.results_path)
        result_str = ''
        success_rate = 0.0
        for metric, val in score_summary.items():
            result_str += '| {}: {} '.format(metric, val)
            if metric in ['success_rate']:
                success_rate = val
            if tb_logger:
                tb_logger.add_scalar('score/{}/{}'.format(env_name, metric),
                                     val, epoch)
        print(result_str)

        return success_rate
Example #28
def main():
    # load train and test datasets
    train_x, train_y, test_x, test_y = load_datasets()

    # load model hyperparams
    epochs, batch_size, learning_rate, model_replica_path, dropout_rate = \
        load_hyperparams()

    # init dropout params
    train_dropout_rate = dropout_rate
    test_dropout_rate = 0.0

    # remove previous weights, bias, inputs, etc..
    tf.reset_default_graph()

    # init inputs
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3), name='x')
    y = tf.placeholder(tf.float32, shape=(None, 10), name='y')

    dropout_rate = tf.placeholder(tf.float32, name='dropout_rate')

    # init layer biases and weights
    weights, biases = get_layer_weights(), get_layer_biases()

    # init model
    model = net(x, biases=biases, weights=weights, dropout_rate=dropout_rate)

    # init optimization function
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=y))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost)

    # accuracy function
    correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # create tensorflow session
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        train_loss, test_loss, train_accuracy, test_accuracy = [], [], [], []

        summary_writer = tf.summary.FileWriter('./output', session.graph)

        print('Training')

        for i in range(epochs):

            print('Epoch {} :'.format(i + 1))
            for batch in range(len(train_x) // batch_size):
                batch_x = train_x[batch *
                                  batch_size:min((batch + 1) *
                                                 batch_size, len(train_x))]
                batch_y = train_y[batch *
                                  batch_size:min((batch + 1) *
                                                 batch_size, len(train_y))]

                # start training
                session.run(optimizer,
                            feed_dict={
                                x: batch_x,
                                y: batch_y,
                                dropout_rate: train_dropout_rate
                            })

                # compute metrics
                train_loss_batch, train_accuracy_batch = session.run(
                    [cost, accuracy],
                    feed_dict={
                        x: batch_x,
                        y: batch_y,
                        dropout_rate: train_dropout_rate
                    })
                print('Batch range:{} - {}  Loss: {:>10.4f}  Accuracy: {:.6f}'.
                      format(batch * batch_size,
                             min((batch + 1) * batch_size, len(train_x)),
                             train_loss_batch, train_accuracy_batch))

            test_accuracy_batch, test_loss_batch = session.run(
                [accuracy, cost],
                feed_dict={
                    x: test_x,
                    y: test_y,
                    dropout_rate: test_dropout_rate
                })
            print(
                'Epoch {} finished, Loss: {:>10.4f} Validation Accuracy: {:.6f}'
                .format((i + 1), test_loss_batch, test_accuracy_batch))

            train_loss.append(train_loss_batch)
            test_loss.append(test_loss_batch)
            train_accuracy.append(train_accuracy_batch)
            test_accuracy.append(test_accuracy_batch)

        save_model(session, model_replica_path)

        # draw plots
        loss_plot(train_loss, test_loss)
        accuracy_plot(train_accuracy, test_accuracy)

        summary_writer.close()
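`loss_plot` and `accuracy_plot` are defined elsewhere in that project; a minimal matplotlib sketch of what such helpers typically do (an assumption, not the original code):

import matplotlib.pyplot as plt

def loss_plot(train_loss, test_loss):
    # One point per epoch for each curve.
    plt.plot(train_loss, label='train loss')
    plt.plot(test_loss, label='test loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()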
Example #29
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/default.json')
options = parser.parse_args()
config = json.load(open(options.config))

now = datetime.datetime.now()
filename = 'log_lgbm_{0:%Y%m%d%H%M%S}.log'.format(now)
logging.basicConfig(filename='./logs/' + filename, level=logging.DEBUG)
logging.debug('./logs/' + filename)

feats = config['features']
logging.debug(feats)

target_name = config['target_name']

X_train_all, X_test = load_datasets(feats)
y_train_all = load_target(target_name)
logging.debug(X_train_all.shape)

# old_oof = pd.read_feather("./features/lgbm_train.feather")
# drop_idx = np.where((old_oof.lgbm_pred.values >= 0.1) & (old_oof.lgbm_pred.values < 0.13))[0]

# print("=== remove several rows===")
# logging.debug("=== remove several rows===")
# X_train_all, y_train_all = remove_row(X_train_all, y_train_all, drop_idx)

print(X_train_all.shape)

y_preds = []
models = []
oof = np.zeros(len(X_train_all))
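`oof` is sized to hold one out-of-fold prediction per training row, but the cross-validation loop itself is cut off here. A typical continuation, sketched under the assumption of a scikit-learn KFold and one fitted model per fold (compare the loop in Example #31 below):

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, valid_idx in kf.split(X_train_all):
    X_tr, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[valid_idx]
    y_tr, y_val = y_train_all[train_idx], y_train_all[valid_idx]
    # ... fit a model on (X_tr, y_tr), then:
    # oof[valid_idx] = model.predict(X_val)
    # y_preds.append(model.predict(X_test))
    # models.append(model)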
Example #30
    def __init__(self, feature_store, nav_graphs, panoramic, action_space, beam_size=1, batch_size=100, seed=10, splits=['train'], tokenizer=None, path_type=None, history=None, blind=False):  # , subgoal
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size, beam_size=beam_size, blind=blind)

        self.data = []
        self.scans = []
        self.panoramic = panoramic
        self.nav_graphs = nav_graphs
        self.action_space = action_space
        self.ctrl_feature = None
        if tokenizer:
            tokname = tokenizer.__class__.__name__

        longest_inst = list()
        longest_ep_len = list()
        for item in load_datasets(splits):

            # For every dialog history, stitch together a single instruction string.
            self.scans.append(item['scan'])
            new_item = dict(item)
            new_item['inst_idx'] = item['inst_idx']
            if history == 'none':  # no language input at all
                new_item['instructions'] = ''
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence('')
            elif history == 'target' or len(item['dialog_history']) == 0:  # Have to use target only if no dialog history.
                tar = item['target']
                new_item['instructions'] = '<TAR> ' + tar
                if tokenizer:
                    if tokname == 'Tokenizer':
                        new_item['instr_encoding'] = tokenizer.encode_sentence([tar], seps=['<TAR>'])
                    else:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(tar)

            elif history == 'oracle_ans':
                ora_a = item['dialog_history'][-1]['message']  # i.e., the last oracle utterance.
                tar = item['target']
                new_item['instructions'] = '<ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    if tokname == 'Tokenizer':
                        new_item['instr_encoding'] = tokenizer.encode_sentence([ora_a, tar], seps=['<ORA>', '<TAR>'])
                    else:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(new_item['instructions'])

            elif history == 'nav_q_oracle_ans':
                nav_q = item['dialog_history'][-2]['message']
                ora_a = item['dialog_history'][-1]['message']
                tar = item['target']
                new_item['instructions'] = '<NAV> ' + nav_q + ' <ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    if tokname == 'Tokenizer':
                        qa_enc = tokenizer.encode_sentence([nav_q, ora_a, tar], seps=['<NAV>', '<ORA>', '<TAR>'])
                    else:
                        qa_enc = tokenizer.encode_sentence(new_item['instructions'])
                    new_item['instr_encoding'] = qa_enc
            elif history == 'all':
                dia_inst = ''
                sentences = []
                seps = []
                for turn in item['dialog_history']:
                    sentences.append(turn['message'])
                    sep = '<NAV>' if turn['role'] == 'navigator' else '<ORA>'
                    seps.append(sep)
                    dia_inst += sep + ' ' + turn['message'] + ' '
                sentences.append(item['target'])
                seps.append('<TAR>')
                dia_inst += '<TAR> ' + item['target']
                new_item['instructions'] = dia_inst
                if tokenizer:
                    if tokname == "Tokenizer":
                        dia_enc = tokenizer.encode_sentence(sentences, seps=seps)
                    else:
                        dia_enc = tokenizer.encode_sentence(dia_inst)
                    new_item['instr_encoding'] = dia_enc

            # If evaluating against 'trusted_path', we need to calculate the trusted path and instantiate it.
            if path_type == 'trusted_path':
                # The trusted path is either the planner_path or the player_path depending on whether the player_path
                # contains the planner_path goal (e.g., stricter planner oracle success of player_path
                # indicates we can 'trust' it, otherwise we fall back to the planner path for supervision).
                # Hypothesize that this will combine the strengths of good human exploration with the known good, if
                # short, routes the planner uses.
                planner_goal = item['planner_path'][-1]  # this could be length 1 if "plan" is to not move at all.
                if planner_goal in item['player_path'][1:]:  # player walked through planner goal (did not start on it)
                    new_item['trusted_path'] = item['player_path'][:]  # trust the player.
                else:
                    new_item['trusted_path'] = item['planner_path'][:]  # trust the planner.
					
            longest_ep_len.append(len(new_item[path_type]))
            longest_inst.append(len(new_item['instructions'].split()))

            self.data.append(new_item)


        self.scans = set(self.scans)
        self.splits = splits
        if seed != 'resume':
            self.seed = seed
            random.seed(self.seed)
            random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        self.epo_inc = False  # jolin: middle of an epoch
        self.path_type = path_type

        #self.env.pre_loadSims(self.data) # debug
        print('R2RBatch loaded with %d instructions, using splits: %s' % (len(self.data), ",".join(splits)))
        print('Instructions avg length %d, max length %d, using splits: %s' % (np.mean(longest_inst), np.max(longest_inst), ",".join(splits)))
        print('Path avg length %d, max length %d, using splits: %s' % (np.mean(longest_ep_len), np.max(longest_ep_len), ",".join(splits)))
Example #31
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/default_v02.json')
options = parser.parse_args()
config = json.load(open(options.config))

now = datetime.datetime.now()
logging.basicConfig(filename='./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now),
                    level=logging.DEBUG)
logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

feats = config['features']
logging.debug(feats)

target_name = config['target_name']

X_train_all, X_test = load_datasets(feats, target_name)
y_train_all = load_target(target_name)
logging.debug(X_train_all.shape)

y_preds = []
models = []

lgbm_params = config['lgbm_params']

kf = KFold(n_splits=10, shuffle=True, random_state=0)  # random_state only takes effect with shuffle=True
for train_index, valid_index in kf.split(X_train_all):
    X_train, X_valid = (X_train_all.iloc[train_index, :],
                        X_train_all.iloc[valid_index, :])
    y_train, y_valid = y_train_all[train_index], y_train_all[valid_index]

    # Run LightGBM
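    # (sketch) The training call itself is elided in this excerpt; with
    # `import lightgbm as lgb` at the top of the script, the fold body would
    # typically continue along these lines:
    #
    #     lgb_train = lgb.Dataset(X_train, y_train)
    #     lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    #     model = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_valid])
    #     y_preds.append(model.predict(X_test))
    #     models.append(model)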
Example #32
    def __init__(self,
                 feature_store,
                 pano_caffee=None,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        self.data = []
        self.configs = {}

        self.motion_indicator = {}
        self.landmark = {}

        if not name:
            configs = np.load(args.configpath + "configs_" + splits[0] +
                              ".npy",
                              allow_pickle=True).item()
            self.configs.update(configs)

        if tokenizer:
            self.tok = tokenizer
        scans = []
        for item in tqdm(load_datasets(splits)):
            # Split multiple instructions into separate entries
            for j, instr in enumerate(item['instructions']):
                if item['scan'] not in self.env.featurized_scans:  # For fast training
                    continue
                new_item = dict(item)
                new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                #new_item['instr_id'] = str(item['path_id'])
                if args.configuration and not name:
                    each_configuration_list = self.configs[str(
                        new_item['instr_id'])]

                    # each_configuration_list = get_configurations(instr)
                    # self.configs[str(new_item['instr_id'])] = each_configuration_list
                    for config_id, each_c in enumerate(
                            each_configuration_list):
                        #self.motion_indicator[str(new_item['instr_id']) + "_" + str(config_id)] = get_motion_indicator(each_c)
                        self.landmark[str(new_item['instr_id']) + "_" +
                                      str(config_id)] = get_landmark(
                                          each_c, whether_root=True)

                    new_item['configurations'] = each_configuration_list
                    configuration_length = len(each_configuration_list)
                    tmp_str = " Quan ".join(each_configuration_list) + " Quan"
                    new_item['instructions'] = tmp_str
                    if configuration_length:
                        self.data.append(
                            (len(new_item['configurations']), new_item))

                    if tokenizer:
                        if 'instr_encoding' not in item:  # we may already include 'instr_encoding' when generating synthetic instructions
                            new_item[
                                'instr_encoding'] = tokenizer.encode_sentence(
                                    tmp_str)

                else:
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        # Filter out items whose instructions failed to encode
                        self.data.append(new_item)
                scans.append(item['scan'])

        np.save(
            f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components3/landmarks/landmark_{splits[0]}.npy",
            self.landmark)
        '''
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/configs/configs_{splits[0]}.npy", self.configs)
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/motion_indicator/motion_indicator_{splits[0]}.npy", self.motion_indicator)
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/landmarks/landmark_{splits[0]}.npy", self.landmark)
        '''
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name
        self.pano_caffee = pano_caffee

        self.scans = set(scans)
        self.splits = splits

        if args.configuration and not name:
            #     self.data.sort(key=lambda x: x[0])
            self.data = list(map(lambda item: item[1], self.data))
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature, self.pano_angles = list(
            zip(*utils.get_all_point_angle_feature()))

        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is identical to the real data
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
Example #33
def benchmark_trvae(dataset, log_name, cfg, **kwargs):
    ds = dataset
    n_genes = min(ds.X.shape[1], cfg.n_genes)

    scvai_genes, scvai_batches_ind, scvai_labels_ind = get_high_variance_genes(
        ds.X,
        ds.batch_indices,
        ds.labels,
        n_genes = n_genes,
        argmax=False
    )
    cfg.count_classes = int(np.max(ds.batch_indices) + 1)
    cfg.count_labels = int(np.max(ds.labels) + 1)
    cfg.input_dim = int(scvai_genes.shape[1])


    data = load_datasets(cfg, True, True,
                         (scvai_genes, scvai_batches_ind, scvai_labels_ind),
                         0.9)
    dataloader_train = data[0]
    dataloader_val = data[1]
    dataloader_test = data[2]
    annot_train = data[3]
    annot_test = data[4]
    x, batch_ind, celltype = annot_train.dataset.tensors
    batch_ind = batch_ind.argmax(dim=1)
    celltype = celltype.argmax(dim=1)

    anndata_train = make_anndata(x.cpu().numpy(),
                                 batch_ind.cpu().numpy(),
                                 'condition',
                                 celltype.cpu().numpy(),
                                 'cell_type')
    x_test, batch_ind_test, celltype_test = annot_test.dataset.tensors
    batch_ind_test = batch_ind_test.argmax(dim=1)
    celltype_test = celltype_test.argmax(dim=1)
    anndata_test = make_anndata(x_test.cpu().numpy(), batch_ind_test.cpu().numpy(),
                                'condition', celltype_test.cpu().numpy(), 'cell_type')
    sc.pp.normalize_per_cell(anndata_train)
    sc.pp.normalize_per_cell(anndata_test)
    sc.pp.log1p(anndata_train)
    sc.pp.log1p(anndata_test)

    n_conditions = anndata_train.obs["condition"].unique().shape[0]
    x_test = anndata_test.X
    batch_ind_test_tmp = anndata_test.obs['condition']
    batch_ind_test = zeros(batch_ind_test_tmp.shape[0], cfg.count_classes)
    batch_ind_test = batch_ind_test.scatter(1, LongTensor(batch_ind_test_tmp.astype('uint16')).view(-1, 1), 1).numpy()
    celltype_test_tmp = anndata_test.obs['cell_type']
    celltype_test = zeros(celltype_test_tmp.shape[0], cfg.count_labels)
    celltype_test = celltype_test.scatter(1, LongTensor(celltype_test_tmp.astype('uint16')).view(-1, 1), 1).numpy()

    model = trVAE(x.shape[1],
                  num_classes=n_conditions,
                  encoder_layer_sizes=[128, 32],
                  decoder_layer_sizes=[32, 128],
                  latent_dim=cfg.bottleneck,
                  alpha=0.0001,
                 )
    trainer = trvaep.Trainer(model, anndata_train)

    print('Training...')
    trainer.train_trvae(cfg.epochs, 512, 50)  # n_epochs, batch_size, early_patience

    print('Tests...')
    print('Dataset:', log_name)
    res = test(cfg,
                model, None,
                annot_train,
                x_test,
                batch_ind_test,
                celltype_test
    )
    res['n_genes'] = n_genes

    metrics_path = Path(cfg.metrics_dir) / 'trVAE'
    metrics_path.mkdir(parents=True, exist_ok=True)
    with open(metrics_path / (log_name + '.json'), 'w') as file:
        json.dump(res, file, indent=4)

    del ds
    del model
    del data
    del dataloader_train, dataloader_val, dataloader_test
    del annot_train, annot_test
    del scvai_genes, scvai_batches_ind, scvai_labels_ind
    cuda.empty_cache()
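The scatter-based one-hot construction above (with `zeros` and `LongTensor` presumably imported from torch) has a more direct equivalent in torch.nn.functional.one_hot; a self-contained sketch of the equivalence:

import torch
import torch.nn.functional as F

codes = torch.LongTensor([0, 2, 1])        # e.g. integer batch indices
one_hot = F.one_hot(codes, num_classes=3)  # rows are one-hot, shape (3, 3)
# the scatter idiom used above produces the same matrix:
manual = torch.zeros(codes.shape[0], 3).scatter(1, codes.view(-1, 1), 1)
assert torch.equal(one_hot.float(), manual)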
Example #34
File: env.py Project: yangsikai/cvdn
    def __init__(self,
                 feature_store,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 path_type='planner_path',
                 history='target',
                 blind=False):
        self.env = EnvBatch(feature_store=feature_store,
                            batch_size=batch_size,
                            blind=blind)
        self.data = []
        self.scans = []
        for item in load_datasets(splits):

            # For every dialog history, stitch together a single instruction string.
            self.scans.append(item['scan'])
            new_item = dict(item)
            new_item['inst_idx'] = item['inst_idx']
            if history == 'none':  # no language input at all
                new_item['instructions'] = ''
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence('')
            elif history == 'target' or len(
                    item['dialog_history']
            ) == 0:  # Have to use target only if no dialog history.
                tar = item['target']
                new_item['instructions'] = '<TAR> ' + tar
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(
                        [tar], seps=['<TAR>'])
            elif history == 'oracle_ans':
                ora_a = item['dialog_history'][-1][
                    'message']  # i.e., the last oracle utterance.
                tar = item['target']
                new_item['instructions'] = '<ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence(
                        [ora_a, tar], seps=['<ORA>', '<TAR>'])
            elif history == 'nav_q_oracle_ans':
                nav_q = item['dialog_history'][-2]['message']
                ora_a = item['dialog_history'][-1]['message']
                tar = item['target']
                new_item[
                    'instructions'] = '<NAV> ' + nav_q + ' <ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    qa_enc = tokenizer.encode_sentence(
                        [nav_q, ora_a, tar], seps=['<NAV>', '<ORA>', '<TAR>'])
                    new_item['instr_encoding'] = qa_enc
            elif history == 'all':
                dia_inst = ''
                sentences = []
                seps = []
                for turn in item['dialog_history']:
                    sentences.append(turn['message'])
                    sep = '<NAV>' if turn['role'] == 'navigator' else '<ORA>'
                    seps.append(sep)
                    dia_inst += sep + ' ' + turn['message'] + ' '
                sentences.append(item['target'])
                seps.append('<TAR>')
                dia_inst += '<TAR> ' + item['target']
                new_item['instructions'] = dia_inst
                if tokenizer:
                    dia_enc = tokenizer.encode_sentence(sentences, seps=seps)
                    new_item['instr_encoding'] = dia_enc

            # If evaluating against 'trusted_path', we need to calculate the trusted path and instantiate it.
            if path_type == 'trusted_path':
                # The trusted path is either the planner_path or the player_path depending on whether the player_path
                # contains the planner_path goal (e.g., stricter planner oracle success of player_path
                # indicates we can 'trust' it, otherwise we fall back to the planner path for supervision).
                # Hypothesize that this will combine the strengths of good human exploration with the known good, if
                # short, routes the planner uses.
                planner_goal = item['planner_path'][
                    -1]  # this could be length 1 if "plan" is to not move at all.
                if planner_goal in item['player_path'][
                        1:]:  # player walked through planner goal (did not start on it)
                    new_item['trusted_path'] = item[
                        'player_path'][:]  # trust the player.
                else:
                    new_item['trusted_path'] = item[
                        'planner_path'][:]  # trust the planner.

            self.data.append(new_item)
        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        self.path_type = path_type
        print 'R2RBatch loaded with %d instructions, using splits: %s' % (len(
            self.data), ",".join(splits))