Пример #1
0
    def __init__(self, model_path, gpu_id=0):
        """Restore a trained model from a checkpoint and prepare evaluation.

        Selects ``cuda:<gpu_id>`` when ``gpu_id`` is an int and CUDA is
        available (CPU otherwise), loads the checkpoint onto that device and
        rebuilds the model, converter and metric from the config stored in
        the checkpoint.  The model is put in eval mode.  When the module-level
        ``args.img_path`` is set, it replaces the validation ``data_path``
        before the validation loader is created.

        :param model_path: checkpoint file; must hold ``'config'`` and
                           ``'state_dict'`` entries
        :param gpu_id: CUDA device index, or None to run on CPU
        """
        from model import get_model, get_loss, get_converter
        from data_loader import get_dataloader
        from metric import get_metric

        self.gpu_id = gpu_id
        wants_gpu = self.gpu_id is not None and isinstance(self.gpu_id, int)
        if wants_gpu and torch.cuda.is_available():
            device_name = "cuda:%s" % self.gpu_id
        else:
            device_name = "cpu"
        self.device = torch.device(device_name)
        print('device:', self.device)

        ckpt = torch.load(model_path, map_location=self.device)
        cfg = ckpt['config']
        self.config = cfg

        self.model = get_model(cfg['arch'])
        self.converter = get_converter(cfg['converter'])
        train_dataset_args = cfg['dataset']['train']['dataset']['args']
        self.img_mode = train_dataset_args['img_mode']

        # weights first, then device placement, then inference mode
        self.model.load_state_dict(ckpt['state_dict'])
        self.model.to(self.device)
        self.model.eval()
        self.metric = get_metric(cfg['metric'])

        # a command-line image path overrides the dataset stored in the config
        image_dir = args.img_path
        val_cfg = cfg['dataset']['validate']
        if image_dir is not None:
            val_cfg['dataset']['args']['data_path'] = [image_dir]
        self.validate_loader = get_dataloader(val_cfg, cfg['distributed'])
Пример #2
0
def eval_fn(hparams):
    """Run inference and score the predictions against the latest checkpoint.

    Sets ``hparams.tgt_sos_id`` / ``hparams.tgt_eos_id`` as a side effect,
    builds the evaluation runner, collects every prediction it yields and
    hands them to ``metric.get_metric`` together with the global step parsed
    from the newest checkpoint file name in ``hparams.out_dir``.

    :param hparams: hyper-parameter object; must expose ``out_dir``
    :return: the value returned by ``metric.get_metric``
    :raises ValueError: if ``hparams.out_dir`` contains no checkpoint, or the
                        checkpoint name does not end in ``-<integer step>``
    """
    hparams.tgt_sos_id, hparams.tgt_eos_id = _get_tgt_sos_eos_id(hparams)
    model_fn = make_model_fn(hparams)
    eval_runner = create_eval_runner_and_build_graph(hparams, model_fn)
    predictions = list(eval_runner.predict())

    checkpoint_path = tf.train.latest_checkpoint(hparams.out_dir)
    if checkpoint_path is None:
        # latest_checkpoint signals "nothing found" with None; fail with a
        # readable message instead of a TypeError inside os.path.basename
        raise ValueError(
            "no checkpoint found in %r" % (hparams.out_dir,))

    # The step is the suffix after the LAST hyphen ("model.ckpt-1000"), so a
    # hyphen inside the checkpoint prefix does not break the parse.
    step_text = os.path.basename(checkpoint_path).rpartition("-")[2]
    current_step = int(step_text)
    return metric.get_metric(hparams, predictions, current_step)
Пример #3
0
def add_task():
    r = metric.get_redis()
    pos = 1
    end = 28010000
    #end = 100000
    limit = 10000
    while pos <= end:
        cnt = int(r.scard(task_key))
        if cnt < limit:
            print 'add tasks', pos
            pipeline = r.pipeline()
            for i in xrange(pos, pos + limit):
                exits = metric.get_metric(
                    i, 'answer') or metric.get_metric(i) or metric.get_metric(
                        i, '404')
                if not exits:
                    pipeline.sadd(task_key, i)
            pipeline.execute()
            pos += limit
        time.sleep(0.3)
Пример #4
0
def main(config):
    """Build data loaders, loss, model and trainer from ``config`` and train.

    Enables distributed training (NCCL, ``env://``) when more than one CUDA
    device is visible; the process rank is taken from the module-level
    ``args.local_rank``.  ``config`` is mutated: ``'distributed'`` and
    ``'local_rank'`` are set, and for recognition ``arch.num_class`` is set
    to the converter's character count.

    :param config: nested configuration dict; ``config['arch']['algorithm']``
                   must be ``'rec'`` or ``'det'``
    :raises ValueError: if the configured algorithm is not supported
    """
    import torch
    from model import get_model, get_loss, get_converter, get_post_processing
    from metric import get_metric
    from data_loader import get_dataloader
    from tools.rec_trainer import RecTrainer as rec
    from tools.det_trainer import DetTrainer as det

    # Explicit dispatch instead of eval() on a config string: the config can
    # only ever select one of the two trainers imported above.
    trainers = {'rec': rec, 'det': det}
    algorithm = config['arch']['algorithm']
    if algorithm not in trainers:
        raise ValueError(
            "unsupported algorithm %r, expected one of %s" %
            (algorithm, sorted(trainers)))

    if torch.cuda.device_count() > 1:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
            world_size=torch.cuda.device_count(),
            rank=args.local_rank)
        config['distributed'] = True
    else:
        config['distributed'] = False
    config['local_rank'] = args.local_rank

    train_loader = get_dataloader(config['dataset']['train'],
                                  config['distributed'])
    assert train_loader is not None
    # validation is never sharded across processes
    if 'validate' in config['dataset']:
        validate_loader = get_dataloader(config['dataset']['validate'], False)
    else:
        validate_loader = None

    criterion = get_loss(config['loss']).cuda()

    if config.get('post_processing', None):
        post_p = get_post_processing(config['post_processing'])
    else:
        post_p = None

    metric = get_metric(config['metric'])

    if algorithm == 'rec':
        # the number of output classes must be known before the model is built
        converter = get_converter(config['converter'])
        config['arch']['num_class'] = len(converter.character)
    else:
        converter = None
    model = get_model(config['arch'])

    trainer = trainers[algorithm](
        config=config,
        model=model,
        criterion=criterion,
        train_loader=train_loader,
        post_process=post_p,
        metric=metric,
        validate_loader=validate_loader,
        converter=converter)
    trainer.train()
Пример #5
0
    def __init__(self, model_path, gpu_id=0):
        """Restore a trained model from a checkpoint and prepare evaluation.

        The checkpoint is always deserialised onto the CPU and the model is
        then moved to the selected device.  When the module-level
        ``args.img_path`` is set, it replaces the validation ``data_path``
        stored in the checkpoint config.

        :param model_path: checkpoint file; must hold ``'config'`` and
                           ``'state_dict'`` entries
        :param gpu_id: CUDA device index; None (or no CUDA available)
                       selects the CPU
        """
        from model import get_model, get_post_processing
        from data_loader import get_dataloader
        from metric import get_metric

        # Only build a CUDA device when one was requested and exists;
        # formatting None into "cuda:%s" would produce an invalid device.
        if gpu_id is not None and torch.cuda.is_available():
            self.device = torch.device("cuda:%s" % gpu_id)
            torch.backends.cudnn.benchmark = True
        else:
            self.device = torch.device("cpu")

        checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
        config = checkpoint['config']
        # weights come from the checkpoint, so skip pretrained initialisation
        config['arch']['args']['pretrained'] = False
        if args.img_path is not None:
            config['dataset']['validate']['dataset']['args']['data_path'] = [args.img_path]
        self.validate_loader = get_dataloader(config['dataset']['validate'])

        self.model = get_model(config['arch'])
        self.model.load_state_dict(checkpoint['state_dict'])
        self.model.to(self.device)

        self.post_process = get_post_processing(config['post_processing'])
        self.metric_cls = get_metric(config['metric'])
Пример #6
0
    def fit(self,
            X,
            y,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        Train the boosted-tree ensemble on (X, y).

        One tree is fitted per round on a row/column sub-sample of the
        training set; afterwards the predictions, gradients and hessians of
        the whole training set are refreshed.  When both ``eval_metric`` and
        validation data are given, the validation metric is tracked and
        training stops once it has failed to improve for more than
        ``early_stopping_rounds`` consecutive rounds.

        Note: ``X`` and ``y`` (and ``val_X`` / ``val_y`` when given) have
        their index reset in place.

        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param validation_data: tuple (val_X, val_y); (None, None) disables validation
        :param early_stopping_rounds: tolerated number of consecutive rounds
                                      without validation improvement
        :param maximize: True if a larger eval_metric is better
        :param eval_metric: evaluation metric, provided: "accuracy"
        :param loss: loss object
                     logisticloss,squareloss, or customize loss
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param scale_pos_weight: sample weight applied to rows with label == 1
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when spliting each tree node,
                                  the number of features = total_features*colsample_bytree*colsample_bylevel
        :param min_child_weight: passed through to Tree.fit
        :param min_sample_split: min number of samples in a leaf node
        :param reg_lambda: lambda
        :param gamma: gamma
        :param num_thread: number of threads to parallel
        :raises TypeError: if validation_data is malformed
        :raises NotImplementedError: if loss or eval_metric is not supported
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        # X and Y are concatenated column-wise below, so both need the same
        # 0..n-1 index
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        # initial loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(reg_lambda)
            self.first_round_pred = y.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss, reg_lambda)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss','squareloss', or customize loss function"
                )

        # to evaluate on validation set and conduct early stopping
        # we should get (val_X,val_y)
        if not isinstance(validation_data, tuple):
            raise TypeError("validation_data should be (val_X, val_y)")

        val_X, val_y = validation_data
        do_validation = val_X is not None and val_y is not None
        if do_validation:
            # type check
            if not isinstance(val_X, pd.core.frame.DataFrame):
                raise TypeError("val_X should be 'pd.core.frame.DataFrame'")
            if not isinstance(val_y, pd.core.series.Series):
                raise TypeError("val_y should be 'pd.core.series.Series'")
            val_X.reset_index(drop=True, inplace=True)
            val_y.reset_index(drop=True, inplace=True)
            val_Y = pd.DataFrame(val_y.values, columns=['label'])
            val_Y['y_pred'] = self.first_round_pred

        # early-stopping bookkeeping
        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0
        if maximize:
            stop_message = "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric}"
        else:
            stop_message = "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}"

        # resolve the metric once, before any tree is trained
        metric_func = None
        if self.eval_metric is not None:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError(
                    "The given eval_metric is not provided")

        # Y stores: label, y_pred, grad, hess, sample_weight
        Y = pd.DataFrame(y.values, columns=['label'])
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
        Y['sample_weight'] = 1.0
        Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

        for i in range(self.num_boost_round):
            # weighted grad and hess (both are recomputed unweighted at the
            # end of every round, so the weight is applied exactly once)
            Y['grad'] = Y['grad'] * Y['sample_weight']
            Y['hess'] = Y['hess'] * Y['sample_weight']
            # row and column sample before training the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(
                ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     min_child_weight=self.min_child_weight,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole trainset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            # update feature importance
            for k in tree.feature_importance:
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)

            # print training information
            if metric_func is None:
                print("TGBoost round {iteration}".format(iteration=i))
                continue

            train_metric = metric_func(
                self.loss.transform(Y.y_pred.values), Y.label.values)

            if not do_validation:
                print("TGBoost round {iteration}, train-{eval_metric} is {train_metric}".format(
                    iteration=i,
                    eval_metric=self.eval_metric,
                    train_metric=train_metric))
                continue

            val_Y['y_pred'] += self.eta * tree.predict(val_X)
            val_metric = metric_func(
                self.loss.transform(val_Y.y_pred.values),
                val_Y.label.values)
            print("TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                iteration=i,
                eval_metric=self.eval_metric,
                train_metric=train_metric,
                val_metric=val_metric))

            # check if to early stop
            if maximize:
                improved = val_metric > best_val_metric
            else:
                improved = val_metric < best_val_metric
            if improved:
                best_val_metric = val_metric
                best_round = i
                become_worse_round = 0
            else:
                become_worse_round += 1
            if become_worse_round > early_stopping_rounds:
                print(stop_message.format(
                    best_round=best_round,
                    eval_metric=eval_metric,
                    best_val_metric=best_val_metric))
                break
Пример #7
0
    def fit(self,
            train_data,
            validation_data,
            early_stopping_rounds=np.inf,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_round=1000,
            max_depth=6,
            pool_size=1,
            min_instances_byleaf=1,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=1,
            reg_lambda=1.0,
            gamma=0):
        """
        Train the boosted-tree ensemble on ``train_data``.

        One tree is fitted per round on a random subset of rows and factors;
        afterwards the prediction, gradient and hessian of every training row
        are refreshed.  When ``eval_metric`` is given, the validation metric
        is tracked and training stops once it has failed to decrease for more
        than ``early_stopping_rounds`` consecutive rounds.

        The row dicts returned by ``getData()`` are updated in place
        ('yPred', 'grad', 'hess', and 'weight' for positive rows).

        :param train_data: Data object, train data
        :param validation_data: Data object, validation data (must not be empty)
        :param early_stopping_rounds: tolerated number of consecutive rounds
                                      without validation improvement
        :param eval_metric: evaluation metric, provided: "accuracy"
        :param loss: loss object
                     logisticloss,squareloss
        :param eta: learning rate
        :param num_round: number of boosting round
        :param max_depth: max depth of each tree
        :param pool_size: the num of processes
        :param min_instances_byleaf: min number of samples in a leaf node
        :param scale_pos_weight: weight applied to rows with label == 1.0
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param min_child_weight: passed through to Tree.fit
        :param reg_lambda: lambda
        :param gamma: gamma
        :raises ValueError: if train_data or validation_data is empty
        :raises NotImplementedError: if loss or eval_metric is not supported
        """
        self.eta = eta
        self.num_round = num_round
        self.max_depth = max_depth
        self.pool_size = pool_size
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_instances_byleaf = min_instances_byleaf
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        # initial loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(self.reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(self.reg_lambda)
            self.first_round_pred = train_data.getLabelMean()
        else:
            raise NotImplementedError(
                "loss should be 'logisticloss' or 'squareloss'")

        # validation data is mandatory for this trainer
        valData = validation_data.getData()
        if not valData:
            raise ValueError("validation_data is empty !")

        valIdxList = []  # save a fixed order
        valLabels = []
        for idx in valData:
            # start from the same initial prediction as the training data
            valData[idx]['yPred'] = self.first_round_pred
            valIdxList.append(idx)
            valLabels.append(valData[idx]['label'])

        # NOTE(review): the validation metric is always minimised here, which
        # looks wrong for a metric such as "accuracy" — confirm intent.
        best_val_metric = np.inf
        best_round = 0
        become_worse_round = 0

        data = train_data.getData()
        # check the extracted rows, not the Data wrapper object
        if not data:
            raise ValueError("train_data is empty !")
        idxList = []  # save a fixed order
        labels = []
        for idx in data:
            row = data[idx]
            row['yPred'] = self.first_round_pred
            # derivatives are taken at the current prediction
            row['grad'] = self.loss.grad(row['yPred'], row['label'])
            row['hess'] = self.loss.hess(row['yPred'], row['label'])
            # rows with any other label keep the 'weight' supplied by the
            # Data object — presumably 1.0; TODO confirm
            if row['label'] == 1.0:
                row['weight'] = self.scale_pos_weight
            idxList.append(idx)
            labels.append(row['label'])
        labels = np.array(labels)

        # resolve the metric once, before any tree is trained
        metric_func = None
        if self.eval_metric is not None:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError(
                    "The given eval_metric is not provided")

        for i in range(self.num_round):
            # weighted grad and hess
            for idx in data:
                row = data[idx]
                row['grad'] = row['grad'] * row['weight']
                row['hess'] = row['hess'] * row['weight']

            # row and column sample before training the current tree
            factors = train_data.getFactors()
            factorTypes = train_data.getFeatureTypes()
            sampledFactors = random.sample(
                factors, int(len(factors) * self.colsample_bytree))
            sampledData = {}
            for idx in random.sample(idxList,
                                     int(len(idxList) * self.subsample)):
                sampledData[idx] = data[idx]

            # train current tree
            tree = Tree()
            tree.fit(sampledData,
                     sampledFactors,
                     factorTypes,
                     max_depth=self.max_depth,
                     pool_size=self.pool_size,
                     min_child_weight=self.min_child_weight,
                     min_instances_byleaf=self.min_instances_byleaf,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma)

            # Predict the WHOLE trainset and update yPred, grad, hess.  Rows
            # left out of this round's sample must be refreshed too: otherwise
            # their yPred misses this tree and their stale grad/hess would be
            # multiplied by the weight again next round.
            preds = tree.predict(data)
            for idx in data:
                row = data[idx]
                row['yPred'] += self.eta * preds[idx]
                row['grad'] = self.loss.grad(row["yPred"], row["label"])
                row['hess'] = self.loss.hess(row["yPred"], row["label"])

            # update feature importance
            for k in tree.feature_importance:
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)

            # print training information
            if metric_func is None:
                print("Apollo round {iteration}".format(iteration=i))
                continue

            curPreds = np.array([data[idx]["yPred"] for idx in idxList])
            train_metric = metric_func(self.loss.transform(curPreds), labels)

            valPreds = tree.predict(valData)
            for idx in valData:
                valData[idx]['yPred'] += self.eta * valPreds[idx]
            curValPreds = [valData[idx]['yPred'] for idx in valIdxList]
            assert len(curValPreds) == len(valLabels)
            val_metric = metric_func(
                self.loss.transform(np.array(curValPreds)),
                np.array(valLabels))
            print("Apollo round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                iteration=i,
                eval_metric=self.eval_metric,
                train_metric=train_metric,
                val_metric=val_metric))

            # check if to early stop
            if val_metric < best_val_metric:
                best_val_metric = val_metric
                best_round = i
                become_worse_round = 0
            else:
                become_worse_round += 1
            if become_worse_round > early_stopping_rounds:
                print("Apollo training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                    best_round=best_round,
                    eval_metric=eval_metric,
                    best_val_metric=best_val_metric))
                break
Пример #8
0
def evaluate(config):
    """Evaluate a trained keypoint model and dump the results as JSON.

    Loads the weights named in ``config`` from ``experiment/<dir>/``, runs
    the model over the validation dataloader and writes the aggregated
    metrics to ``experiment/<dir>/<dataset>_valuation.json``.

    For real data (``config['is_real']`` truthy) only the 2D per-keypoint
    error is reported, averaged over the number of times each keypoint was
    present (-1 for keypoints that never appeared).  Otherwise 2D and 3D
    ROC/AUC figures and keypoint errors are averaged over the batches.

    :param config: dict with at least 'dir', 'weights', 'dataset', 'cuda'
                   and 'is_real'
    :raises ValueError: if the validation dataloader yields no batches
    """
    model = get_model(config)
    weight_file = 'experiment/' + config['dir'] + '/' + config['weights']
    model.load_state_dict(torch.load(weight_file)['model'])
    model.eval()
    n_kpoints = 21

    is_real = bool(config['is_real'])

    cuda = config['cuda']
    path_dir = 'experiment/' + config['dir'] + '/'
    path_val_file = path_dir + config['dataset'] + '_valuation.json'

    dataloader = get_dataloader(config, scope='val')
    roc_auc_2d = get_metric('roc_auc_2d')
    roc_auc_3d = get_metric('roc_auc_3d')
    keypoint_error = get_metric('keypoint_error')

    area_2d = 0
    error_keypoint_2d = np.zeros(n_kpoints)
    error_keypoint_total_2d = 0

    area_3d = 0
    error_keypoint_3d = np.zeros(n_kpoints)
    error_keypoint_total_3d = 0

    present_counter = np.zeros(n_kpoints)

    thres_range = np.arange(0, 1, 0.01)
    acc_range_2d = np.zeros(len(thres_range))
    acc_range_3d = np.zeros(len(thres_range))

    num = 0  # stays 0 when the dataloader is empty
    # inference only: no autograd graph is needed
    with torch.no_grad():
        for num, sample in enumerate(dataloader, 1):

            sample['image'] = sample['image'].cuda(cuda)
            output = model(sample)

            if is_real:
                batch_avg_keypoint_error_2d = keypoint_error(
                    sample, output, 'vector_2d', True)
                error_keypoint_2d += batch_avg_keypoint_error_2d[
                    'keypoint_error']
                present_counter += batch_avg_keypoint_error_2d[
                    'present_counter']

            else:

                batch_avg_metric_2d = roc_auc_2d(sample, output)
                area_2d += batch_avg_metric_2d['area']
                acc_range_2d += batch_avg_metric_2d['acc_range']

                batch_avg_keypoint_error_2d = keypoint_error(
                    sample, output, 'vector_2d', False)
                error_keypoint_2d += batch_avg_keypoint_error_2d[
                    'keypoint_error']
                error_keypoint_total_2d += batch_avg_keypoint_error_2d[
                    'total_error']

                batch_avg_metric_3d = roc_auc_3d(sample, output)
                area_3d += batch_avg_metric_3d['area']
                acc_range_3d += batch_avg_metric_3d['acc_range']

                batch_avg_keypoint_error_3d = keypoint_error(
                    sample, output, 'vector_3d', False)
                error_keypoint_3d += batch_avg_keypoint_error_3d[
                    'keypoint_error']
                error_keypoint_total_3d += batch_avg_keypoint_error_3d[
                    'total_error']

            if num % 50 == 0:
                print('Evaluation done for {} batches'.format(num))

    if num == 0:
        raise ValueError('validation dataloader yielded no batches')

    if is_real:

        # keypoints that were never present keep the sentinel value -1
        error_keypoint_2d_avg = np.full(n_kpoints, -1.)
        seen = present_counter != 0
        error_keypoint_2d_avg[seen] = (
            error_keypoint_2d[seen] / present_counter[seen])

        error_total_2d_avg = np.sum(error_keypoint_2d) / np.sum(
            present_counter)

        val_dict = {
            'error_keypoint_2d': list(error_keypoint_2d_avg),
            'error_total_2d': error_total_2d_avg
        }

    else:
        val_dict = {
            'thres_range': list(thres_range),
            'area_2d': area_2d / num,
            'acc_range_2d': list(acc_range_2d / num),
            'area_3d': area_3d / num,
            'acc_range_3d': list(acc_range_3d / num),
            'error_keypoint_2d': list(error_keypoint_2d / num),
            'error_total_2d': error_keypoint_total_2d / num,
            'error_keypoint_3d': list(error_keypoint_3d / num),
            'error_total_3d': error_keypoint_total_3d / num
        }

    # saving data
    with open(path_val_file, 'w') as fp:
        json.dump(val_dict, fp)
Пример #9
0
def test(args):
    """Run the network over the test split, log metrics and report timing.

    Builds the test dataset and network from ``args``, optionally restores
    weights from ``args.pretrain``, evaluates every sample with batch size 1
    and records the metrics through the summary writer.  Prints the total
    and average forward-pass time at the end.

    :param args: parsed options; this function reads ``num_threads``,
                 ``pretrain``, ``save_dir``, ``save_image`` and ``epochs``
    :raises KeyError: if the checkpoint lacks parameters the network needs
    :raises ValueError: if the test dataset is empty
    """
    # Prepare dataset
    data = get_data(args)

    data_test = data(args, 'test')

    loader_test = DataLoader(dataset=data_test,
                             batch_size=1,
                             shuffle=False,
                             num_workers=args.num_threads)

    # Network
    model = get_model(args)
    net = model(args)
    net.cuda()

    if args.pretrain is not None:
        assert os.path.exists(args.pretrain), \
            "file not found: {}".format(args.pretrain)

        checkpoint = torch.load(args.pretrain)
        # non-strict load: extra keys are tolerated, missing keys are fatal
        key_m, key_u = net.load_state_dict(checkpoint['net'], strict=False)

        if key_u:
            print('Unexpected keys :')
            print(key_u)

        if key_m:
            print('Missing keys :')
            print(key_m)
            raise KeyError(
                'missing keys in checkpoint: {}'.format(key_m))

    net = nn.DataParallel(net)

    metric = get_metric(args)
    metric = metric(args)
    summary = get_summary(args)

    # best effort: a failure here surfaces when the writer opens its files
    try:
        os.makedirs(args.save_dir, exist_ok=True)
        os.makedirs(args.save_dir + '/test', exist_ok=True)
    except OSError:
        pass

    writer_test = summary(args.save_dir, 'test', args, None,
                          metric.metric_name)

    net.eval()

    num_sample = len(loader_test) * loader_test.batch_size
    if num_sample == 0:
        # nothing to evaluate; the summary update and the average below
        # both need at least one processed sample
        raise ValueError('test dataset is empty')

    pbar = tqdm(total=num_sample)

    t_total = 0

    # inference only: no autograd graph is needed
    with torch.no_grad():
        for batch, sample in enumerate(loader_test):
            sample = {
                key: val.cuda()
                for key, val in sample.items() if val is not None
            }

            # CUDA kernels run asynchronously; synchronise on both sides so
            # the measured interval covers the actual forward pass
            torch.cuda.synchronize()
            t0 = time.time()
            output = net(sample)
            torch.cuda.synchronize()
            t1 = time.time()

            t_total += (t1 - t0)

            # NOTE(review): mode 'train' is passed in the test loop —
            # confirm this is what metric.evaluate expects here
            metric_val = metric.evaluate(sample, output, 'train')

            writer_test.add(None, metric_val)

            # Save data for analysis
            if args.save_image:
                writer_test.save(args.epochs, batch, sample, output)

            current_time = time.strftime('%y%m%d@%H:%M:%S')
            error_str = '{} | Test'.format(current_time)
            pbar.set_description(error_str)
            pbar.update(loader_test.batch_size)

    pbar.close()

    # the writer receives the last processed sample/output pair
    writer_test.update(args.epochs, sample, output)

    t_avg = t_total / num_sample
    print('Elapsed time : {} sec, '
          'Average processing time : {} sec'.format(t_total, t_avg))
Пример #10
0
    def fit(self,
            X,
            y,
            eta=0.01,
            num_boost_round=1000,
            max_depth=5,
            rowsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_sample_split=10,
            loss="logisticloss",
            l2_regularization=1.0,
            gamma=0.1,
            num_thread=-1,
            eval_metric=None):
        """
        Train the boosted-tree ensemble on (X, y).

        One tree is fitted per round on a row/column sub-sample; afterwards
        the predictions, gradients and hessians of the whole dataset are
        refreshed.  The initial prediction is the mean of ``y``.

        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param rowsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when spliting each tree node,
                                  the number of features = total_features*colsample_bytree*colsample_bylevel
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss object
                     logisticloss,squareloss, or customize loss
        :param l2_regularization: lambda
        :param gamma: gamma
        :param num_thread: number of thread to parallel
        :param eval_metric: evaluation metric, provided: "accuracy"
        :raises NotImplementedError: if loss or eval_metric is not supported
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.rowsample = rowsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.l2_regularization = l2_regularization
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric

        if loss == "logisticloss":
            self.loss = LogisticLoss(l2_regularization)
        elif loss == "squareloss":
            self.loss = SquareLoss(l2_regularization)
        else:
            try:
                self.loss = CustomizeLoss(loss, l2_regularization)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss','squareloss', or customize loss function"
                )

        # resolve the metric once, before any tree is trained
        metric_func = None
        if self.eval_metric is not None:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError(
                    "The given eval_metric is not provided")

        self.first_round_pred = y.mean()

        # Y stores label, y_pred, grad, hess
        # NOTE(review): Y gets a fresh 0..n-1 index while X keeps its own;
        # the concat below aligns on index, so this presumably expects X to
        # carry a default index — confirm against callers.
        Y = pd.DataFrame(y.values,
                         columns=['label'])  # only one column "label"
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        for i in range(self.num_boost_round):
            # sample samples and features to train current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.rowsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(['label', 'y_pred', 'grad', 'hess'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     l2_regularization=self.l2_regularization,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole dataset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            if metric_func is not None:
                metric_value = metric_func(
                    self.loss.transform(Y.y_pred.values), Y.label.values)
                print("TGBoost round {iteration}, {eval_metric} is {metric_value}".format(
                    iteration=i,
                    eval_metric=self.eval_metric,
                    metric_value=metric_value))
            else:
                # fill in the round number instead of printing the raw
                # "{iteration}" placeholder
                print("TGBoost round {iteration}".format(iteration=i))

            # update feature importance
            for k in tree.feature_importance:
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)
Пример #11
0
def train(gpu, args):
    """Train and validate the network on one GPU of a distributed job.

    The function joins an NCCL process group, builds the train/val data
    loaders, the network, loss, optimizer and scheduler, wraps the network
    for mixed precision and distributed training, and then runs
    ``args.epochs`` epochs of training followed by validation. Logging,
    progress bars, summaries and checkpoint writing happen only in the
    process whose ``gpu`` is 0.

    Args:
        gpu: Integer used both as the distributed rank and as the CUDA
            device index of this process.
        args: Configuration object. Attributes read directly here are
            ``num_gpus``, ``batch_size``, ``num_threads``, ``pretrain``,
            ``resume``, ``opt_level``, ``save_dir``, ``save_full``,
            ``warm_up`` and ``epochs``. It is also forwarded to the
            ``get_*`` factories and dumped to ``args.json``.

    Returns:
        None. Checkpoints are written to ``args.save_dir`` as a side effect.
    """
    # Initialize workers
    # NOTE : the worker with gpu=0 will do logging
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.num_gpus,
                            rank=gpu)
    torch.cuda.set_device(gpu)

    # Prepare dataset
    # get_data returns a callable that is invoked once per split.
    data = get_data(args)

    data_train = data(args, 'train')
    data_val = data(args, 'val')

    sampler_train = DistributedSampler(data_train,
                                       num_replicas=args.num_gpus,
                                       rank=gpu)
    sampler_val = DistributedSampler(data_val,
                                     num_replicas=args.num_gpus,
                                     rank=gpu)

    # args.batch_size is the global batch size; each process gets an equal
    # integer share of it.
    batch_size = args.batch_size // args.num_gpus

    loader_train = DataLoader(dataset=data_train,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=args.num_threads,
                              pin_memory=True,
                              sampler=sampler_train,
                              drop_last=True)
    loader_val = DataLoader(dataset=data_val,
                            batch_size=1,
                            shuffle=False,
                            num_workers=args.num_threads,
                            pin_memory=True,
                            sampler=sampler_val,
                            drop_last=False)

    # Network
    # get_model returns a class/factory that is instantiated with args.
    model = get_model(args)
    net = model(args)
    net.cuda(gpu)

    # Pretrained weights are loaded only in the gpu == 0 process.
    # NOTE(review): presumably the DDP wrapper applied later propagates the
    # weights to the other ranks - confirm for the DDP implementation in use.
    if gpu == 0:
        if args.pretrain is not None:
            assert os.path.exists(args.pretrain), \
                "file not found: {}".format(args.pretrain)

            # NOTE(review): no map_location is given - TODO confirm the
            # checkpoint tensors end up on the intended device.
            checkpoint = torch.load(args.pretrain)
            net.load_state_dict(checkpoint['net'])

            print('Load network parameters from : {}'.format(args.pretrain))

    # Loss
    # Same factory pattern as the model: get_loss returns a callable.
    loss = get_loss(args)
    loss = loss(args)
    loss.cuda(gpu)

    # Optimizer
    optimizer, scheduler = utility.make_optimizer_scheduler(args, net)

    # Convert to synchronized batch norm and set up apex mixed precision
    # before the network is wrapped in DDP below.
    net = apex.parallel.convert_syncbn_model(net)
    net, optimizer = amp.initialize(net,
                                    optimizer,
                                    opt_level=args.opt_level,
                                    verbosity=0)

    # Optionally restore optimizer/scheduler/amp state from the checkpoint
    # loaded above (gpu == 0 only). A checkpoint without those keys only
    # triggers a message.
    if gpu == 0:
        if args.pretrain is not None:
            if args.resume:
                try:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                    scheduler.load_state_dict(checkpoint['scheduler'])
                    amp.load_state_dict(checkpoint['amp'])

                    print('Resume optimizer, scheduler and amp '
                          'from : {}'.format(args.pretrain))
                except KeyError:
                    print('State dicts for resume are not saved. '
                          'Use --save_full argument')

            # Drop the reference to the loaded checkpoint.
            del checkpoint

    net = DDP(net)

    metric = get_metric(args)
    metric = metric(args)
    summary = get_summary(args)

    if gpu == 0:
        # NOTE(review): the backup runs before save_dir is created below;
        # presumably backup_source_code creates its own target - confirm.
        utility.backup_source_code(args.save_dir + '/code')
        try:
            os.makedirs(args.save_dir, exist_ok=True)
            os.makedirs(args.save_dir + '/train', exist_ok=True)
            os.makedirs(args.save_dir + '/val', exist_ok=True)
        except OSError:
            # Directory creation failures are deliberately ignored here.
            pass

    if gpu == 0:
        writer_train = summary(args.save_dir, 'train', args, loss.loss_name,
                               metric.metric_name)
        writer_val = summary(args.save_dir, 'val', args, loss.loss_name,
                             metric.metric_name)

        # Persist the run configuration next to the outputs.
        with open(args.save_dir + '/args.json', 'w') as args_json:
            json.dump(args.__dict__, args_json, indent=4)

    # Warm-up counters: the learning rate ramps linearly over the batches
    # of the first epoch (see the training loop below).
    if args.warm_up:
        warm_up_cnt = 0.0
        warm_up_max_cnt = len(loader_train) + 1.0

    for epoch in range(1, args.epochs + 1):
        # Train
        net.train()

        # Passing the epoch to the sampler is what varies its ordering
        # between epochs.
        sampler_train.set_epoch(epoch)

        if gpu == 0:
            current_time = time.strftime('%y%m%d@%H:%M:%S')

            list_lr = []
            for g in optimizer.param_groups:
                list_lr.append(g['lr'])

            print('=== Epoch {:5d} / {:5d} | Lr : {} | {} | {} ==='.format(
                epoch, args.epochs, list_lr, current_time, args.save_dir))

        # Progress-bar total: batches per process x per-process batch size
        # x number of processes.
        num_sample = len(
            loader_train) * loader_train.batch_size * args.num_gpus

        if gpu == 0:
            pbar = tqdm(total=num_sample)
            log_cnt = 0.0
            log_loss = 0.0

        for batch, sample in enumerate(loader_train):
            # Move every non-None entry to this GPU; None entries are
            # dropped from the sample dict.
            sample = {
                key: val.cuda(gpu)
                for key, val in sample.items() if val is not None
            }

            # First-epoch warm-up: lr = initial_lr * step / (num_batches + 1).
            if epoch == 1 and args.warm_up:
                warm_up_cnt += 1

                for param_group in optimizer.param_groups:
                    lr_warm_up = param_group['initial_lr'] \
                                 * warm_up_cnt / warm_up_max_cnt
                    param_group['lr'] = lr_warm_up

            optimizer.zero_grad()

            output = net(sample)

            loss_sum, loss_val = loss(sample, output)

            # Divide by batch size
            loss_sum = loss_sum / loader_train.batch_size
            loss_val = loss_val / loader_train.batch_size

            # Backpropagate through the amp-scaled loss.
            with amp.scale_loss(loss_sum, optimizer) as scaled_loss:
                scaled_loss.backward()

            optimizer.step()

            if gpu == 0:
                metric_val = metric.evaluate(sample, output, 'train')
                writer_train.add(loss_val, metric_val)

                # Running mean of the loss shown in the progress bar.
                log_cnt += 1
                log_loss += loss_sum.item()

                current_time = time.strftime('%y%m%d@%H:%M:%S')
                error_str = '{:<10s}| {} | Loss = {:.4f}'.format(
                    'Train', current_time, log_loss / log_cnt)

                if epoch == 1 and args.warm_up:
                    list_lr = []
                    for g in optimizer.param_groups:
                        list_lr.append(round(g['lr'], 6))
                    error_str = '{} | Lr Warm Up : {}'.format(
                        error_str, list_lr)

                pbar.set_description(error_str)
                pbar.update(loader_train.batch_size * args.num_gpus)

        if gpu == 0:
            pbar.close()

            # sample/output here are those of the last training batch.
            writer_train.update(epoch, sample, output)

            # Full state (optimizer, scheduler, amp) is saved when requested
            # or on the final epoch; otherwise only weights and args.
            if args.save_full or epoch == args.epochs:
                state = {
                    'net': net.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'amp': amp.state_dict(),
                    'args': args
                }
            else:
                state = {'net': net.module.state_dict(), 'args': args}

            torch.save(state,
                       '{}/model_{:05d}.pt'.format(args.save_dir, epoch))

        # Val
        # Gradients are disabled for the whole validation pass and
        # re-enabled at the end of the epoch.
        torch.set_grad_enabled(False)
        net.eval()

        num_sample = len(loader_val) * loader_val.batch_size * args.num_gpus

        if gpu == 0:
            pbar = tqdm(total=num_sample)
            log_cnt = 0.0
            log_loss = 0.0

        for batch, sample in enumerate(loader_val):
            sample = {
                key: val.cuda(gpu)
                for key, val in sample.items() if val is not None
            }

            output = net(sample)

            loss_sum, loss_val = loss(sample, output)

            # Divide by batch size
            loss_sum = loss_sum / loader_val.batch_size
            loss_val = loss_val / loader_val.batch_size

            if gpu == 0:
                # NOTE(review): the mode string is 'train' although this is
                # the validation loop - confirm whether 'val' was intended.
                metric_val = metric.evaluate(sample, output, 'train')
                writer_val.add(loss_val, metric_val)

                log_cnt += 1
                log_loss += loss_sum.item()

                current_time = time.strftime('%y%m%d@%H:%M:%S')
                error_str = '{:<10s}| {} | Loss = {:.4f}'.format(
                    'Val', current_time, log_loss / log_cnt)
                pbar.set_description(error_str)
                pbar.update(loader_val.batch_size * args.num_gpus)

        if gpu == 0:
            pbar.close()

            writer_val.update(epoch, sample, output)
            print('')

            # batch/sample/output carry over from the final loop iteration.
            writer_val.save(epoch, batch, sample, output)

        torch.set_grad_enabled(True)

        # The scheduler advances once per epoch.
        scheduler.step()
Пример #12
0
    def fit(self,
            features,
            label,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        Train the boosted trees, optionally with validation-based early stopping.

        One tree is fitted per round and appended to ``self.trees``. The
        validation set is only evaluated (and early stopping only applied)
        when ``eval_metric`` is given and both entries of
        ``validation_data`` are not None.

        :param features: np.array
        :param label: np.array
        :param validation_data: tuple ``(val_features, val_label)``;
                     ``(None, None)`` disables validation
        :param early_stopping_rounds: stop once the validation metric has
                     failed to improve for more than this many rounds
        :param maximize: True if a larger validation metric is better
        :param eval_metric: evaluation metric name resolved via ``get_metric``
        :param loss: loss object
                     logisticloss,squareloss, or customize loss
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param scale_pos_weight: stored on ``self``; not used in this method
        :param subsample: row sample rate when building a tree
        :param colsample: column sample rate when building a tree
        :param min_child_weight: forwarded to each ``Tree``
        :param min_sample_split: min number of samples in a leaf node
        :param reg_lambda: lambda
        :param gamma: gamma
        :param num_thread: number of threads, forwarded to each ``Tree``
        :raises TypeError: if ``validation_data`` is not a tuple
        :raises NotImplementedError: if the loss or the metric cannot be
                     resolved
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample = colsample
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0

        # initial loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss()
        elif loss == "squareloss":
            self.loss = SquareLoss()
            # square loss starts from the label mean instead of 0
            self.first_round_pred = label.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss)
            except Exception:
                # ``except Exception`` (not a bare except) so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                raise NotImplementedError(
                    "loss should be 'logisticloss','squareloss', or customize loss function"
                )

        # initialize row_sampler, col_sampler, bin_structure, attribute_list, class_list
        row_sampler = RowSampler(features.shape[0], self.subsample)
        col_sampler = ColumnSampler(features.shape[1], self.colsample)
        bin_structure = BinStructure(features)
        attribute_list = AttributeList(features, bin_structure)
        class_list = ClassList(label)
        class_list.initialize_pred(self.first_round_pred)
        class_list.update_grad_hess(self.loss)

        # to evaluate on validation set and conduct early stopping
        # we should get (val_features,val_label)
        # and set some variable to check when to stop
        if not isinstance(validation_data, tuple):
            raise TypeError(
                "validation_data should be (val_features, val_label)")

        val_features, val_label = validation_data
        do_validation = val_features is not None and val_label is not None
        val_pred = None
        if do_validation:
            val_pred = np.ones(val_label.shape) * self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # The two directions log slightly different messages; both texts
        # are kept exactly as they were.
        if maximize:
            stop_message = "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric:.4f}"
        else:
            stop_message = "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}"

        # Resolve the metric once, before any tree is built: the lookup is
        # loop-invariant and an unknown metric should fail fast.
        metric_func = None
        if self.eval_metric is not None:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError(
                    "The given eval_metric is not provided")

        # start learning
        logging.info("TGBoost start training")
        for i in range(self.num_boost_round):
            t0 = time()
            # train current tree
            tree = Tree(self.min_sample_split, self.min_child_weight,
                        self.max_depth, self.colsample, self.subsample,
                        self.reg_lambda, self.gamma, self.num_thread)
            tree.fit(attribute_list, class_list, row_sampler, col_sampler,
                     bin_structure)

            # when finish building this tree, update the class_list.pred, grad, hess
            class_list.update_pred(self.eta)
            class_list.update_grad_hess(self.loss)
            # save this tree
            self.trees.append(tree)

            t1 = time()

            # print training information
            if metric_func is None:
                logging.info("TGBoost round {iteration}".format(iteration=i))
                continue

            train_metric = metric_func(
                self.loss.transform(class_list.pred), label)

            if not do_validation:
                logging.info(
                    "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s"
                    .format(iteration=i,
                            eval_metric=self.eval_metric,
                            train_metric=train_metric,
                            tc=t1 - t0))
                continue

            # accumulate this tree's contribution on the validation set
            val_pred += self.eta * tree.predict(val_features)
            val_metric = metric_func(self.loss.transform(val_pred),
                                     val_label)
            logging.info(
                "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s"
                .format(iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        val_metric=val_metric,
                        tc=t1 - t0))

            # check whether to early stop
            if maximize:
                improved = val_metric > best_val_metric
            else:
                improved = val_metric < best_val_metric

            if improved:
                best_val_metric = val_metric
                best_round = i
                become_worse_round = 0
            else:
                become_worse_round += 1

            if become_worse_round > early_stopping_rounds:
                logging.info(
                    stop_message.format(best_round=best_round,
                                        eval_metric=eval_metric,
                                        best_val_metric=best_val_metric))
                break
Пример #13
0
import json
from nltk.util import ngrams
import metric

# Load the dumped evaluation lists. The context manager closes the file
# handle, which the previous bare open() never did.
with open('test_ppl.json') as f:
    lines = json.load(f)

# Positional layout of the dump.
# NOTE(review): presumably [query, keyword, multi_ref, candidate,
# candidate_s, ..., expose] - confirm against the code that writes the file.
c = lines[4]
multi_ref = lines[2]
keyword = lines[1]
query = lines[0]
expose = lines[7]

# Per-example union of query and keyword tokens, duplicates removed.
keyword_ng = []
for q_tokens, k_tokens in zip(query, keyword):
    keyword_ng.append(list(set(q_tokens + k_tokens)))
log_path = './'

bleu, recall_q, recall_k, recall_qk, dist1, dist2 = metric.get_metric(
    c, multi_ref, keyword, query, keyword_ng, expose, log_path)
print(' bleu{}, recall_q{}, recall_k{}, recall_qk{}, dist1{}, dist2{} \n'.format(bleu, recall_q, recall_k, recall_qk, dist1, dist2))
  def train_and_predict(self):
    """Run compilation, then training, infeed and per-epoch evaluation.

    After running ``self.compile_op``, two background threads are started:
    one runs ``self.train_eval_op`` and one feeds the train/eval enqueue
    ops once per epoch. When ``self.eval_steps`` is positive, the calling
    thread scores the predictions of each epoch and stops early once the
    score reaches ``self.hparams.target_bleu``.

    Returns:
      ``(score, current_step)`` of the last evaluated epoch when
      ``self.eval_steps > 0``, otherwise ``(None, None)``.
    """
    self.sess.run([self.compile_op])

    # Train and eval thread.
    def train_eval_thread_fn(sess, train_eval_op):
      tf.logging.info("train_eval_op start")
      sess.run([train_eval_op])

    train_eval_thread = threading.Thread(
        target=train_eval_thread_fn,
        args=(self.sess, self.train_eval_op),
    )
    train_eval_thread.start()

    # Infeed thread.
    def infeed_thread_fn(sess, train_enqueue_ops, eval_enqueue_ops, eval_init):
      """Start the infeed."""
      # Fixed delay before any infeed work begins.
      time.sleep(300)

      mlp_log.mlperf_print("init_stop", None)
      mlp_log.mlperf_print("run_start", None)
      for epoch_index in range(self.hparams.max_train_epochs):
        epoch_num = epoch_index + 1
        tf.logging.info("Infeed for epoch: %d", epoch_num)
        mlp_log.mlperf_print(
            "block_start",
            None,
            metadata={"first_epoch_num": epoch_num, "epoch_count": 1})
        mlp_log.mlperf_print(
            "epoch_start", None, metadata={"epoch_num": epoch_num})
        # Re-initialize the eval dataset, then feed train and eval data.
        sess.run(eval_init)
        sess.run([train_enqueue_ops])
        sess.run([eval_enqueue_ops])

    infeed_args = (self.sess, self.enqueue_ops, self.eval_enqueue_ops,
                   self.eval_dataset_initializer)
    infeed_thread = threading.Thread(target=infeed_thread_fn, args=infeed_args)
    infeed_thread.start()

    if self.eval_steps > 0:
      score = 0.0

      for epoch in range(self.hparams.max_train_epochs):
        epoch_num = epoch + 1
        predictions = list(self.predict())
        mlp_log.mlperf_print(
            "eval_start", None, metadata={"epoch_num": epoch_num})
        current_step = epoch * self.iterations

        score = metric.get_metric(self.hparams, predictions, current_step)
        # The log line uses the 0-based epoch index.
        tf.logging.info("Score after epoch %d: %f", epoch, score)
        mlp_log.mlperf_print(
            "eval_accuracy", score, metadata={"epoch_num": epoch_num})
        mlp_log.mlperf_print(
            "eval_stop", None, metadata={"epoch_num": epoch_num})
        # block_stop reports the 0-based index, unlike block_start.
        mlp_log.mlperf_print(
            "block_stop",
            None,
            metadata={"first_epoch_num": epoch, "epoch_count": 1})

        if score >= self.hparams.target_bleu:
          mlp_log.mlperf_print(
              "run_stop", None, metadata={"status": "success"})
          break
      else:
        # Loop finished without reaching the target score.
        mlp_log.mlperf_print("run_stop", None, metadata={"status": "abort"})

    infeed_thread.join()
    train_eval_thread.join()

    if self.eval_steps > 0:
      return score, current_step
    return None, None
Пример #15
0
def eval(model_sample, model_generate, vocab, dataloader_k, dataloader_qk,
         epoch, updates):
    """Generate candidates on both loaders, log the results, return a BLEU.

    For every batch of ``dataloader_k`` and ``dataloader_qk`` the sampler
    model is run twice (``sample=False`` and ``sample=True``) and the
    generator decodes each result. References, queries, keywords and the
    four candidate sets are then scored, written to CSV, JSON and text
    logs, and evaluated with ``metric.get_metric``.

    Relies on the module-level names ``args``, ``log_path``, ``utils``,
    ``metric``, ``logging``, ``logging_csv``, ``tqdm`` and ``json``.

    Returns:
        The BLEU value ``metric.get_metric`` reports for the ``'sample'``
        candidate set.
    """
    model_sample.eval()
    model_generate.eval()

    def to_sentences(id_sequences):
        # Decode each id sequence with the vocabulary.
        return [vocab.id2sent(ids) for ids in id_sequences]

    multi_ref = []
    query = []
    keyword = []
    keyword_ng = []
    candidate = []
    candidate_s = []
    candidate_qk = []
    candidate_qk_s = []
    expose_sum = []

    for batch in tqdm(dataloader_k):
        # Pass without sampling.
        node_idx, adj, adj_weight, _, tgt, word_type, query_ids, keyword_ids = model_sample(
            batch,
            train_type=args.train_type,
            sample=False,
            sample_type='eval')
        expose_sum.extend([int(e) for e in batch.expose])

        keyword_sents = to_sentences(keyword_ids)
        query_sents = to_sentences(query_ids)
        # The first token of every target is skipped.
        references = [vocab.id2sent(t[1:]) for t in tgt]

        multi_ref.extend(references)
        keyword.extend(keyword_sents)
        query.extend(query_sents)

        generated = model_generate.sample(node_idx, adj, adj_weight,
                                          word_type)
        candidate.extend(to_sentences(generated))

        # Pass with sampling enabled.
        node_idx, adj, adj_weight, _, tgt, word_type, _, _ = model_sample(
            batch, train_type=args.train_type, sample=True, sample_type='eval')
        generated = model_generate.sample(node_idx, adj, adj_weight,
                                          word_type)
        candidate_s.extend(to_sentences(generated))

    for batch in tqdm(dataloader_qk):
        # Pass without sampling.
        node_idx, adj, adj_weight, _, tgt, word_type, _, keyword_ids = model_sample(
            batch,
            train_type=args.train_type,
            sample=False,
            sample_type='eval')
        keyword_ng.extend(to_sentences(keyword_ids))

        generated = model_generate.sample(node_idx, adj, adj_weight,
                                          word_type)
        candidate_qk.extend(to_sentences(generated))

        # Pass with sampling enabled.
        node_idx, adj, adj_weight, _, tgt, word_type, _, _ = model_sample(
            batch, train_type=args.train_type, sample=True, sample_type='eval')
        generated = model_generate.sample(node_idx, adj, adj_weight,
                                          word_type)
        candidate_qk_s.extend(to_sentences(generated))

    # Only the text summaries of these four scorings are used below.
    text_result, _ = utils.eval_bleu(multi_ref, candidate, log_path)
    text_result_s, _ = utils.eval_bleu(multi_ref, candidate_s, log_path)
    text_result_qk, _ = utils.eval_bleu(multi_ref, candidate_qk, log_path)
    text_result_qk_s, _ = utils.eval_bleu(multi_ref, candidate_qk_s,
                                          log_path)

    logging_csv([
        epoch, updates, text_result, text_result_s, text_result_qk,
        text_result_qk_s
    ])

    result_columns = [
        query, keyword, multi_ref, candidate, candidate_s, candidate_qk,
        candidate_qk_s
    ]

    # The JSON dump carries the expose values as an extra last column.
    with open(log_path + "test_ppl.json", "w") as f:
        json.dump(result_columns + [expose_sum], f)
    utils.write_result_to_file(result_columns, log_path)

    named_candidates = [
        ('ori', multi_ref),
        ('nosample', candidate),
        ('sample', candidate_s),
        ('nosample+q', candidate_qk),
        ('sample+q', candidate_qk_s),
    ]
    bleu_target = 0
    for tag, cands in named_candidates:
        bleu, recall_q, recall_k, recall_qk, dist1, dist2 = metric.get_metric(
            cands, multi_ref, keyword, query, keyword_ng, expose_sum,
            log_path)
        logging(
            '{}: bleu {}, recall_q {}, recall_k {}, recall_qk {}, dist1 {}, dist2 {} \n'
            .format(tag, bleu, recall_q, recall_k, recall_qk, dist1, dist2))
        logging_csv([
            epoch, updates, tag, bleu, recall_q, recall_k, recall_qk, dist1,
            dist2
        ])
        if tag == 'sample':
            bleu_target = bleu

    return bleu_target
Пример #16
0
    def fit(self,
            X,
            y,
            validation_data=(None, None),
            early_stopping_rounds=10,
            maximize=True,
            eval_metric=None,
            loss='logisticloss',
            eta=0.3,
            num_boost_round=1000,
            max_depth=5,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1,
            pred_cutoff=0.5):
        '''
        Train the boosted trees, optionally with validation-based early stopping.

        One tree is fitted per round on a row/column sample of the training
        data and appended to self.trees. Validation metrics and early
        stopping are only active when eval_metric is given and both entries
        of validation_data are not None. X, y, val_X and val_y have their
        index reset in place.

        X: pandas.core.frame.DataFrame
        y: pandas.core.series.Series
        validation_data: tuple (val_X, val_y); (None, None) disables validation
        early_stopping_rounds: stop once the validation metric has failed to
            improve for more than this many rounds
        maximize: True if a larger validation metric is better
        eval_metric: metric name resolved via get_metric
        loss: loss function to optimize, 'logisticloss' or 'squareloss'
        eta: learning rate applied to each tree's predictions
        num_boost_round: number of boosting rounds
        max_depth: max depth of a tree
        scale_pos_weight: weight for samples whose label is 1
        subsample: row sample rate when building a tree
        colsample_bytree: column sample rate when building a tree
        colsample_bylevel: column sample rate when splitting each tree node;
            when splitting, the number of features is
            total_features * colsample_bytree * colsample_bylevel
        min_child_weight, reg_lambda, gamma, num_thread: forwarded to Tree.fit
        min_sample_split: min number of samples in a leaf node
        pred_cutoff: stored on self; not used in this method

        Raises Exception for an unknown loss name or malformed
        validation_data, and NotImplementedError when eval_metric cannot be
        resolved.
        '''
        self.eval_metric = eval_metric
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.first_round_pred = 0.0
        self.subsample = subsample
        self.max_depth = max_depth
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.min_sample_split = min_sample_split
        self.gamma = gamma
        self.num_thread = num_thread
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.pred_cutoff = pred_cutoff

        # Re-index X and y in place so rows can be addressed by a 0-based
        # integer position.
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        # Compare against the requested loss name. The previous condition
        # was the bare string 'logisticloss', which is always true, so
        # 'squareloss' and the error branch could never be reached.
        if 'logisticloss' == loss:
            self.loss = LogisticLoss(self.reg_lambda)
        elif 'squareloss' == loss:
            self.loss = SquareLoss(self.reg_lambda)
        else:
            raise Exception('No find match loss')

        if not isinstance(validation_data, tuple):
            raise Exception('validation_data must be tuple')

        val_X, val_y = validation_data

        do_val = True
        if val_X is None or val_y is None:
            do_val = False
        else:
            if not isinstance(val_X, pd.core.frame.DataFrame):
                raise Exception('val_X must be pd.core.frame.DataFrame')

            if not isinstance(val_y, pd.core.series.Series):
                raise Exception('val_y must be pd.core.series.Series')

            val_X.reset_index(drop=True, inplace=True)
            val_y.reset_index(drop=True, inplace=True)

            val_Y = pd.DataFrame(val_y.values, columns=['label'])

            # set default pred value
            val_Y['y_pred'] = self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # Resolve the metric once: the lookup does not change between
        # rounds, and an unknown metric should fail before training starts.
        evaluate = self.eval_metric is not None and do_val
        metric_func = None
        if evaluate:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                # ``except Exception`` (not a bare except) so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                raise NotImplementedError(
                    "The given eval_metric is not provided")

        Y = pd.DataFrame(y.values, columns=['label'])
        Y['y_pred'] = self.first_round_pred
        # NOTE(review): these initial grad/hess values are not multiplied by
        # sample_weight, unlike the per-round update below - confirm whether
        # the first tree is meant to ignore scale_pos_weight.
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        Y['sample_weight'] = 1.0
        # adjust the weight of positive samples
        Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

        for i in range(self.num_boost_round):
            # row and column sample before training the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)  # column sample
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)  # row sample

            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(
                ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

            # fit a tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     min_child_weight=self.min_child_weight,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole trainset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values,
                                       Y.label.values) * Y.sample_weight
            Y['hess'] = self.loss.hess(Y.y_pred.values,
                                       Y.label.values) * Y.sample_weight

            # update feature importance
            for k in tree.feature_importance:
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)

            # print training information
            # (single-argument print(...) behaves the same on Python 2 and 3)
            if not evaluate:
                print("GBoost round {iteration}".format(iteration=i))
                continue

            # evaluate on the training and validation data
            train_metric = metric_func(
                self.loss.transform(Y.y_pred.values), Y.label.values)

            # val_Y holds the columns label and y_pred
            val_Y['y_pred'] += self.eta * tree.predict(val_X)

            # evaluate the current validation predictions
            val_metric = metric_func(
                self.loss.transform(val_Y.y_pred.values),
                val_Y.label.values)

            print("GBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                iteration=i,
                eval_metric=self.eval_metric,
                train_metric=train_metric,
                val_metric=val_metric))

            # check if to early stop
            if maximize:
                improved = val_metric > best_val_metric
            else:
                improved = val_metric < best_val_metric

            if improved:
                best_val_metric = val_metric
                best_round = i
                become_worse_round = 0
            else:
                become_worse_round += 1

            # stop building trees once the validation result has failed to
            # improve for more than early_stopping_rounds rounds
            if become_worse_round > early_stopping_rounds:
                if maximize:
                    print("training early Stop, best round is {best_round}, best {eval_metric} is {best_val_metric}".format(
                        best_round=best_round,
                        eval_metric=eval_metric,
                        best_val_metric=best_val_metric))
                else:
                    print("training early Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                        best_round=best_round,
                        eval_metric=eval_metric,
                        best_val_metric=best_val_metric))
                break
Пример #17
0
    # for quick test use python .\main.py --layers 3 --features 5 --end_features 10 --iterations 101 --batch_size 6 --model pixelcnn

    if conf.model == 'graph':
        data = Dataset(conf)
        test_data = data.get_plain_test_values()
        with tf.Session() as sess:
            samples = []
            for _ in range(data.total_test_batches):
                X, _ = sess.run(test_data)
                samples.append(X)
            X = np.concatenate(samples)
            print(X.shape)
        X_noncausal_graph = NonCausal(conf, data).get_test_samples_graph()
        tf.reset_default_graph()
        get_metric(X, X_noncausal_graph)
        tf.reset_default_graph()
    elif conf.model == 'evaluate':
        data = Dataset(conf)
        test_data = data.get_plain_test_values()
        with tf.Session() as sess:
            samples = []
            for _ in range(data.total_test_batches):
                X, _ = sess.run(test_data)
                samples.append(X)
            X = np.concatenate(samples)
            print(X.shape)
        X_denoising = PixelCNN(conf, data, True).get_test_samples()
        tf.reset_default_graph()
        X_noncausal = NonCausal(conf, data).get_test_samples()
        tf.reset_default_graph()