Example #1
def etf():
    data_loader = data.Yahoo(
        data.Config('data',
                    normalize=True).add_param('symbols', ["SPY"]).add_param(
                        'start', "2011-01-03").add_param('end', "2015-04-14"))
    data_loader.load_data()
    return data_loader
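Every example on this page builds a data.Config through a fluent add_param chain. Below is a minimal sketch of the interface those calls imply; the class is hypothetical and assumes only what the usage demonstrates (a directory argument, arbitrary keyword options, and an add_param that returns self so calls can chain):

class Config:
    def __init__(self, dir='data', **options):
        # base directory for the data files, plus keyword options such as
        # normalize, x_slice, y_slice, uniqueness_threshold
        self.dir = dir
        self.params = dict(options)

    def add_param(self, key, value):
        # store one extra parameter and return self to allow chaining
        self.params[key] = value
        return self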
Example #2
def sin():
    data_loader = data.TfGenerator(
        data.Config(dir='data').add_param('samples', 10000).add_param(
            "op_factory", SinusoidFactory("normal")).add_param(
                "x", np.reshape(np.random.uniform(-1.5, 1.5, 10000), (-1, 1))))
    data_loader.load_data()
    return data_loader
Example #3
def inv_sin_t_noise():
    data_loader = data.TrendingSinusoid(
        data.Config('data',
                    normalize=True).add_param("noise",
                                              "standard_t").add_param("df", 3))
    data_loader.load_data()
    return data_loader
Example #4
def main(config='config/finetune/agnews/train.json'):

    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    dataset = TaskDataset(
        cfg_data.data_file[cfg.mode],
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset,
                           batch_size=cfg_optim.batch_size,
                           shuffle=True)

    classifier = models.Classifier4Transformer(cfg_model,
                                               len(TaskDataset.labels))
    optimizer = optim.optim4GPU(cfg_optim, classifier)

    train_loop = trainer.TrainLoop(cfg_optim, classifier, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, cfg.pretrain_file)
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
Example #5
def uci_parkinsons_joint():
    data_loader = data.UCI(
        data.Config(dir='data',
                    normalize=True,
                    x_slice=slice(0),
                    y_slice=slice(None),
                    uniqueness_threshold=0.05).add_param(
                        'file', "parkinsons_updrs_processed.data").add_param(
                            'delimiter', ','))
    data_loader.load_data()
    return data_loader
Example #6
def uci_whitewine_joint():
    data_loader = data.UCI(
        data.Config(dir='data',
                    normalize=True,
                    x_slice=slice(0),
                    y_slice=slice(None),
                    uniqueness_threshold=0.05).add_param(
                        'file',
                        "winequality-white.csv").add_param('delimiter', ';'))
    data_loader.load_data()
    return data_loader
Example #7
def uci_redwine():
    data_loader = data.UCI(
        data.Config(dir='data',
                    normalize=True,
                    x_slice=slice(None, -2),
                    y_slice=slice(-2, None),
                    uniqueness_threshold=0.05).add_param(
                        'file',
                        "winequality-red.csv").add_param('delimiter', ';'))
    data_loader.load_data()
    return data_loader
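The x_slice/y_slice pair used in Examples 5 through 8 splits each data row into features and targets with ordinary Python slice semantics, independently of the data.Config internals. A small worked illustration:

import numpy as np

row = np.arange(12.0)        # toy stand-in for one 12-column CSV row
x = row[slice(None, -2)]     # row[:-2] -> first 10 columns as features
y = row[slice(-2, None)]     # row[-2:] -> last 2 columns as targets
# slice(0) is empty, so x_slice=slice(0) with y_slice=slice(None)
# (the "joint" loaders above) puts every column into y and none into x.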
Example #8
def etf2d():
    data_loader = data.Yahoo(
        data.Config('data',
                    normalize=True,
                    x_slice=slice(None, -2),
                    y_slice=slice(-2, None)).add_param(
                        'symbols', ["SPY", "DIA"]).add_param(
                            'start',
                            "2011-01-03").add_param('end', "2015-04-14"))
    data_loader.load_data()
    return data_loader
Example #9
def fx_all_predicted():
    data_loader = data.Fxcm(
        data.Config('data',
                    normalize=True,
                    x_slice=slice(None, -8),
                    y_slice=slice(-8, None)).add_param(
                        'symbols',
                        FX_SYMBOLS).add_param('start', '2015-01-05').add_param(
                            'end', '2015-01-30').add_param('ar_terms', 2))
    data_loader.load_data()
    return data_loader
Example #10
def mpg():
    data_loader = data.TfGenerator(
        data.Config(dir='data',
                    x_slice=slice(None, -2),
                    y_slice=slice(-2, None)).add_param(
                        'samples',
                        10000).add_param("op_factory", MPGFactory()).add_param(
                            "x",
                            np.reshape(np.random.uniform(-10, 10, 10000),
                                       (-1, 1))))
    data_loader.load_data()
    return data_loader
Example #11
def fx_eur_predicted():
    data_loader = data.Fxcm(
        data.Config('data',
                    normalize=True,
                    x_slice=slice(None, -1),
                    y_slice=slice(-1, None)).add_param(
                        'symbols',
                        FX_SYMBOLS).add_param('start', '2015-01-05').add_param(
                            'end',
                            '2015-01-30').add_param('ar_terms', 10).add_param(
                                'predicted_idx', [FX_SYMBOLS.index('EURUSD')]))
    data_loader.load_data()
    return data_loader
Example #12
def build_config(args):
    config = data.Config()

    if args.config:
        logger.debug('Loading file from --config')
        fp = args.config
    else:  #pragma: no cover
        logger.debug('Looking for a config file')
        fp = find_config_file()

    if fp:
        logger.debug('Found a config file. Loading.')
        try:
            config.from_file(fp)
        except data.InvalidConfiguration as ex:
            logger.warning(str(ex))  # Python 3 exceptions have no .message

    logger.debug('Overwriting config params with command line args.')
    config.from_args(args)
    logger.debug('Running validation against config')
    config.validate()
    return config
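build_config layers its sources in increasing precedence: a config file (explicit or discovered), then command-line arguments, then a final validation pass. A hypothetical invocation, assuming an argparse parser whose option names line up with the Config fields:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config', help='explicit path to a config file')
parser.add_argument('--verbose', action='store_true')
args = parser.parse_args(['--config', 'app.cfg'])

config = build_config(args)  # file values load first, args override them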
Example #13
def main(config='config/finetune/agnews/train.json'):

    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    ### Prepare Dataset and Preprocessing ###

    TaskDataset = data.get_class(cfg_data.task) # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode], pipelines=[
        data.RemoveSymbols('\\'),
        data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        data.AddSpecialTokensWithTruncation(cfg_data.max_len),
        data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                           TaskDataset.labels,
                           cfg_data.max_len)
    ], n_data=None)
    tensors = TensorDataset(*dataset.get_tensors()) # To Tensors
    data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Fetch Teacher's output and put it into the dataset ###

    def fetch_logits(model):
        def get_logits(model, batch):
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            return 0.0, logits

        train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, None, None, get_device())
        results = torch.cat(train_loop.eval(get_logits, cfg.model_file))
        return results


    if cfg.mode == "train":
        print("Fetching teacher's output...")
        teacher = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
        teacher.load_state_dict(torch.load(cfg.model_file)) # use trained model
        with torch.no_grad():
            teacher_logits = fetch_logits(teacher)

        tensors = TensorDataset(teacher_logits, *dataset.get_tensors()) # To Tensors
        data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Models ###

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(
        cfg_optim, model, data_iter, optimizer, cfg.save_dir, get_device()
    )

    def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
        teacher_logits, input_ids, segment_ids, input_mask, label_id = batch
        T = 1.0
        logits = model(input_ids, segment_ids, input_mask)
        loss = 0.1*nn.CrossEntropyLoss()(logits, label_id)
        loss += 0.9*nn.KLDivLoss()(
            F.log_softmax(logits/T, dim=1),
            F.softmax(teacher_logits/T, dim=1)
        )
        #loss = 0.9*nn.MSELoss()(logits, teacher_logits)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float() #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, None, None)  # do not use model or pretrain files
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
Example #14
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    dataset = TaskDataset(
        args.dataset_location,
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    class Bert_DataLoader(object):
        def __init__(self,
                     loader=None,
                     model_type=None,
                     device='cpu',
                     batch_size=1):
            self.loader = loader
            self.model_type = model_type
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }

                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            def __init__(self, size, shape):
                self.len = size
                self.input_ids = torch.randint(low=0,
                                               high=30522,
                                               size=(size, shape),
                                               dtype=torch.int64)
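                # note: torch.randint's high bound is exclusive, so high=1
                # below yields all-zero segment_ids and input_mask; for a
                # pure timing benchmark only the tensor shapes matter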
                self.segment_ids = torch.randint(low=0,
                                                 high=1,
                                                 size=(size, shape),
                                                 dtype=torch.int64)
                self.input_mask = torch.randint(low=0,
                                                high=1,
                                                size=(size, shape),
                                                dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index],
                        self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size,
                                 shuffle=True)

        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))

    def eval_func(model):
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter,
                                          batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file,
                         None)  # do not use a pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")

        if args.tune:
            import lpot
            from lpot import common
            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter,
                                              batch_size=args.batch_size)

            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)

        elif args.int8:
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)

        else:
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
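Both benchmark and eval_func above derive a per-sample latency from the aggregate throughput measured after the first args.warmup batches. The arithmetic, isolated with illustrative numbers:

total_samples = 4872      # samples timed after the warmup batches
total_time = 6.5          # seconds spent inside the timed forward passes
throughput = total_samples / total_time   # ~749.538 samples/sec
latency_ms = 1.0 / throughput * 1000.0    # ~1.334 ms per sample
print('Latency: %.3f ms' % latency_ms)
print('Throughput: %.3f samples/sec' % throughput)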
Example #15
        the lr and the start epoch are read from the training file
    '''
    if args.continue_exprm:
        path = utils.make_dir([args.save, args.continue_exprm])
        configfile = os.path.join(path, 'config.json')
        trainingfile = os.path.join(path, 'training.json')
        if not utils.check_file(trainingfile):
            utils.save_kvstore({'epoch': 0, 'lr': args.lr}, trainingfile)
            logging.info("trainingfile not found, create a new one.")

        try:
            args = data.Config(
                utils.read_kvstore(
                    configfile, {
                        'continue_exprm': args.continue_exprm,
                        'predict_only': args.predict_only,
                        'seed': args.seed,
                        'early_stop': args.early_stop,
                        'debug': args.debug
                    }))
            training_states = utils.read_kvstore(trainingfile)
            args.lr = float(training_states['lr'])
            start_epoch = training_states['epoch'] + 1
        except FileNotFoundError:
            raise

    else:
        # By default, use argparse for configuration
        if args.tied and args.model == 'StandardRNN':
            args.hid_size = args.emb_size
        path = utils.make_dir(
Example #16
def inv_sin():
    data_loader = data.TrendingSinusoid(data.Config('data', normalize=True))
    data_loader.load_data()
    return data_loader
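All of the factory functions on this page follow the same three-step pattern: build a data.Config (optionally chaining add_param calls), wrap it in the appropriate loader class, then call load_data() and return the loader. A hedged template for adding a new dataset in the same style, assuming a CSV file placed in the data directory (the file name here is hypothetical):

def my_dataset():
    config = data.Config(dir='data', normalize=True)
    config = config.add_param('file', 'my_dataset.csv')
    config = config.add_param('delimiter', ',')
    data_loader = data.UCI(config)
    data_loader.load_data()
    return data_loader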