Example #1
def prepare_and_run(params):
    loo = LeaveOneOut()
    plates, classes = get_train_exemplars()
    cv_monitor = MetricsMonitorCV()
    fold_num = 0

    for train_index, val_index in loo.split(plates):
        train_paths, val_paths = plates[train_index], plates[val_index]
        train_classes, val_classes = classes[train_index], classes[val_index]

        train_dataset = get_dataset(list(zip(train_paths, train_classes)),
                                    train_transforms)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=params["batch_size"],
                                      shuffle=True)
        val_dataset = get_dataset(list(zip(val_paths, val_classes)),
                                  val_test_transforms)
        # we are using leave-one-out validation, so batch_size should be 1
        val_dataloader = DataLoader(val_dataset, batch_size=1)

        train_monitor, validation_monitor = train(train_dataloader,
                                                  val_dataloader, fold_num,
                                                  params)

        cv_monitor.add_train_monitor(train_monitor).add_val_monitor(
            validation_monitor)

        fold_num += 1

    return cv_monitor
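Note: the snippet above chains calls on a project-specific MetricsMonitorCV aggregator that is not shown. A minimal sketch of what such an aggregator could look like, inferred only from the calls above (the class and method bodies here are assumptions, not the original implementation):

class MetricsMonitorCV:
    """Collects per-fold train/validation monitors for later aggregation."""

    def __init__(self):
        self.train_monitors = []
        self.val_monitors = []

    def add_train_monitor(self, monitor):
        self.train_monitors.append(monitor)
        return self  # returning self allows the chained calls used above

    def add_val_monitor(self, monitor):
        self.val_monitors.append(monitor)
        return self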
Example #2
    def test_udd_ofr_nids(self):
        """test one of haed and tail entity ID is larger than number of entities
        """
        extra_train = ['7\t0\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_train = ['0\t0\t6\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_valid = ['7\t0\t1\n']
        files = gen_udd_files(extra_valid=extra_valid)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_test = ['7\t0\t1\n']
        files = gen_udd_files(extra_test=extra_test)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)
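Note: Examples #2, #3, #4, #10 and #13 drive DGL-KE's user-defined-data (udd_hrt) loader through two fixtures, gen_udd_files and cleanup, that are not shown. A rough sketch of such helpers, assuming the loader expects an entity file, a relation file and tab-separated head/relation/tail triples (the file names, counts and contents here are illustrative assumptions, not the original test code):

import os

def gen_udd_files(extra_train=None, extra_valid=None, extra_test=None):
    """Write tiny entity/relation/triple fixture files and return their names."""
    files = ['entities.tsv', 'relations.tsv', 'train.tsv', 'valid.tsv', 'test.tsv']
    base = ['0\t0\t1\n', '2\t1\t3\n']  # well-formed (head, relation, tail) triples
    with open('entities.tsv', 'w') as f:
        f.writelines('{}\n'.format(i) for i in range(6))  # entity IDs 0..5
    with open('relations.tsv', 'w') as f:
        f.writelines('{}\n'.format(i) for i in range(2))  # relation IDs 0..1
    with open('train.tsv', 'w') as f:
        f.writelines(base + (extra_train or []))
    with open('valid.tsv', 'w') as f:
        f.writelines(base + (extra_valid or []))
    with open('test.tsv', 'w') as f:
        f.writelines(base + (extra_test or []))
    return files

def cleanup(files):
    """Remove the generated fixture files."""
    for name in files:
        if os.path.exists(name):
            os.remove(name)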
Example #3
    def test_udd_error_nids(self):
        """test one of head and tail entity ID < 0
        """
        extra_train = ['-1\t0\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_train = ['0\t0\t-1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_valid = ['-1\t0\t1\n']
        files = gen_udd_files(extra_valid=extra_valid)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_test = ['-1\t0\t1\n']
        files = gen_udd_files(extra_test=extra_test)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)
Example #4
    def test_udd_noint_triplets(self):
        """test one of (h, r, t) is not an integer
        """
        extra_train = ['deadbeaf\t0\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(ValueError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_train = ['0\tdeadbeaf\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(ValueError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_train = ['0\t0\tdeadbeaf\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(ValueError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_valid = ['deadbeaf\t0\t1\n']
        files = gen_udd_files(extra_valid=extra_valid)
        with self.assertRaises(ValueError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_test = ['deadbeaf\t0\t1\n']
        files = gen_udd_files(extra_test=extra_test)
        with self.assertRaises(ValueError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)
Example #5
def generate_datasets(DATASET_PATH, batch_size):
    '''
    Load the train and test datasets and wrap them in dataloaders.
    :param DATASET_PATH: path to the dataset root
    :param batch_size: batch size for the training dataloader
    :return: test_loader, train_loader, vis_train_loader
    '''
    # Load training data
    train_dataset = get_dataset("train", dataset_path=DATASET_PATH)
    # Load test data
    test_dataset = get_dataset("test", dataset_path=DATASET_PATH)
    # Create the dataloader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=2,
                                               shuffle=True,
                                               collate_fn=collate_remove_none,
                                               worker_init_fn=worker_init_fn)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=1,
                                              num_workers=2,
                                              shuffle=False,
                                              collate_fn=collate_remove_none,
                                              worker_init_fn=worker_init_fn)
    vis_train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=1,
        num_workers=2,
        shuffle=False,
        collate_fn=collate_remove_none,
        worker_init_fn=worker_init_fn)
    return test_loader, train_loader, vis_train_loader
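Note: generate_datasets relies on a worker_init_fn and a collate_remove_none helper that are defined elsewhere. A common pattern for such a worker init function, shown here as an illustrative assumption rather than the project's actual code, is to derive a distinct NumPy seed for every DataLoader worker:

import numpy as np
import torch

def worker_init_fn(worker_id):
    # Give each DataLoader worker its own NumPy seed derived from torch's base seed.
    base_seed = torch.initial_seed() % 2**32
    np.random.seed((base_seed + worker_id) % 2**32)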
Example #6
    def gen_data(ckpt_path, flag, dataset_config, dataloader_config, njobs):
        ckpt_dir = os.path.join(ckpt_path, '')
        ckpt_dir_flag = os.path.join(ckpt_dir, flag)
        prefix = os.path.basename(os.path.dirname(dataset_config.indexes_path))
        train_pkl = os.path.join(ckpt_dir, f'{prefix}_train.pkl')
        dev_pkl = os.path.join(ckpt_dir, f'{prefix}_dev.pkl')
        os.makedirs(ckpt_dir, exist_ok=True)
        with open(os.path.join(ckpt_dir, 'dirtype'), 'w') as f:
            f.write('ckpt_dir')
        os.makedirs(ckpt_dir_flag, exist_ok=True)
        with open(os.path.join(ckpt_dir_flag, 'dirtype'), 'w') as f:
            f.write('ckpt_dir_flag')
        file_handler = logging.FileHandler(os.path.join(ckpt_dir, 'train.log'))
        logging.getLogger().addHandler(file_handler)
        if os.path.exists(train_pkl):
            logger.info(f'=> load train, dev data from {ckpt_dir}')
            train_data = pk.load(open(train_pkl, 'rb'))
            # train_data is a list of 1600 dicts, each with keys 'mel'
            # (a mel-spectrogram np array of shape (80, n_frames)) and
            # 'speaker' (a speaker ID string such as 'p225').
            dev_data = pk.load(open(dev_pkl, 'rb'))
            train_set = get_dataset(dset='train',
                                    dataset_config=dataset_config,
                                    njobs=njobs,
                                    metadata=train_data)
            # train_set is a dataloader.vctk.Dataset instance. Its .data attribute
            # is a list of 1600 features, so train_set.data[i] has shape (80, n_frames).
            dev_set = get_dataset(dset='dev',
                                  dataset_config=dataset_config,
                                  njobs=njobs,
                                  metadata=dev_data)
        else:
            train_set = get_dataset(dset='train',
                                    dataset_config=dataset_config,
                                    njobs=njobs)
            dev_set = get_dataset(dset='dev',
                                  dataset_config=dataset_config,
                                  njobs=njobs)
            pk.dump(train_set.data, open(train_pkl, 'wb'))
            pk.dump(dev_set.data, open(dev_pkl, 'wb'))
        train_loader = get_dataloader(dset='train',
                                      dataloader_config=dataloader_config,
                                      dataset=train_set)
        dev_loader = get_dataloader(dset='dev',
                                    dataloader_config=dataloader_config,
                                    dataset=dev_set)
        # print(f"**********************type is: {type(train_loader)}********************")

        return ckpt_dir_flag, train_set, dev_set, train_loader, dev_loader
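Note: gen_data caches the preprocessed train/dev metadata as pickle files so later runs can skip feature extraction. The same load-or-build idea, reduced to a generic helper (the name load_or_build is illustrative, not from the original project):

import os
import pickle as pk

def load_or_build(pkl_path, build_fn):
    """Return cached data if the pickle exists; otherwise build it and cache it."""
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            return pk.load(f)
    data = build_fn()
    with open(pkl_path, 'wb') as f:
        pk.dump(data, f)
    return data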
Example #7
    def load_data(ckpt_path, dataset_config, dataloader_config, njobs):
        if os.path.isdir(ckpt_path):
            d = os.path.join(ckpt_path, '')
            with open(os.path.join(d, 'dirtype'), 'r') as f:
                dirtype = f.read().strip()
            if dirtype == 'ckpt_dir':
                logger.warning(
                    f'The ckpt_path is {ckpt_path}, but no flag was specified.'
                )
                logger.warning('Using the "default" flag.')
                ckpt_dir = d
                flag = 'default'
                ckpt_dir_flag = os.path.join(ckpt_dir, flag)
                ckpt_path = ckpt_dir_flag
            elif dirtype == 'ckpt_dir_flag':
                ckpt_dir_flag = os.path.dirname(d)
                ckpt_dir = os.path.dirname(ckpt_dir_flag)
                flag = os.path.basename(ckpt_dir_flag)
            else:
                raise NotImplementedError(
                    f'Wrong dirtype: {dirtype} from {d}.')
        else:
            ckpt_dir_flag = os.path.dirname(ckpt_path)
            ckpt_dir = os.path.dirname(ckpt_dir_flag)
            flag = os.path.basename(ckpt_dir_flag)

        file_handler = logging.FileHandler(os.path.join(ckpt_dir, 'train.log'))
        logging.getLogger().addHandler(file_handler)
        logger.info(f'=> load train, dev data from {ckpt_dir}')
        prefix = os.path.basename(os.path.dirname(dataset_config.indexes_path))
        train_data = pk.load(
            open(os.path.join(ckpt_dir, f'{prefix}_train.pkl'), 'rb'))
        dev_data = pk.load(
            open(os.path.join(ckpt_dir, f'{prefix}_dev.pkl'), 'rb'))

        train_set = get_dataset(dset='train',
                                dataset_config=dataset_config,
                                njobs=njobs,
                                metadata=train_data)
        dev_set = get_dataset(dset='dev',
                              dataset_config=dataset_config,
                              njobs=njobs,
                              metadata=dev_data)
        train_loader = get_dataloader(dset='train',
                                      dataloader_config=dataloader_config,
                                      dataset=train_set)
        dev_loader = get_dataloader(dset='dev',
                                    dataloader_config=dataloader_config,
                                    dataset=dev_set)

        return ckpt_dir_flag, train_set, dev_set, train_loader, dev_loader
Example #8
def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    datasets_raw = {}
    logger.info("Loading training data")
    datasets_raw['train'] = get_dataset(tokenizer, args.dataset_cache,
                                        args.dataset_path, 'train')
    logger.info("Loading validation data")
    datasets_raw['valid'] = get_dataset(tokenizer, args.dataset_cache,
                                        args.dataset_path, 'dev')

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}

    for dataset_name, dataset in datasets_raw.items():
        for data_point in dataset:
            instance, _ = build_input_from_segments(data_point, tokenizer)
            for input_name, input_array in instance.items():
                datasets[dataset_name][input_name].append(input_array)

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset,
                              padding=tokenizer.convert_tokens_to_ids(
                                  SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(
        *tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=args.train_batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset,
                              sampler=valid_sampler,
                              batch_size=args.valid_batch_size,
                              shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(
        train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(
        valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
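Note: get_data_loaders pads every input list to a common length before building the TensorDatasets, but pad_dataset itself is not shown. A minimal sketch of the usual approach, assuming each value in the dict is a plain Python list of token-ID lists (this is an assumption about its behaviour, not the original implementation):

def pad_dataset(dataset, padding=0):
    """Pad all sequences in a dict of lists of lists to the longest length."""
    max_len = max(len(seq) for name in dataset for seq in dataset[name])
    return {
        name: [seq + [padding] * (max_len - len(seq)) for seq in dataset[name]]
        for name in dataset
    }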
Example #9
def load_data():
    BATCH_SIZE = 2

    train_video_dataset = dataloader.get_dataset(dataloader.TRAIN_JSON_PATH, dataloader.SINGLE_FRAME)
    train_audio_dataset = dataloader.AudioDataset()
    train_loader = dataloader.AVDataLoader(train_video_dataset, train_audio_dataset, batch_size=BATCH_SIZE, shuffle=False, single_frame=True)
    return train_loader
Example #10
    def test_udd_error_rids(self):
        """Test that a negative relation ID is rejected.
        """
        extra_train = ['0\t-1\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)
Example #11
    def gen_data(ckpt_path, flag, dataset_config, dataloader_config, njobs):
        ckpt_dir = os.path.join(ckpt_path, '')
        ckpt_dir_flag = os.path.join(ckpt_dir, flag)
        prefix = os.path.basename(os.path.dirname(dataset_config.indexes_path))
        train_pkl = os.path.join(ckpt_dir, f'{prefix}_train.pkl')
        dev_pkl = os.path.join(ckpt_dir, f'{prefix}_dev.pkl')
        os.makedirs(ckpt_dir, exist_ok=True)
        with open(os.path.join(ckpt_dir, 'dirtype'), 'w') as f:
            f.write('ckpt_dir')
        os.makedirs(ckpt_dir_flag, exist_ok=True)
        with open(os.path.join(ckpt_dir_flag, 'dirtype'), 'w') as f:
            f.write('ckpt_dir_flag')
        file_handler = logging.FileHandler(os.path.join(ckpt_dir, 'train.log'))
        logging.getLogger().addHandler(file_handler)
        if os.path.exists(train_pkl):
            logger.info(f'=> load train, dev data from {ckpt_dir}')
            train_data = pk.load(open(train_pkl, 'rb'))
            dev_data = pk.load(open(dev_pkl, 'rb'))
            train_set = get_dataset(dset='train',
                                    dataset_config=dataset_config,
                                    njobs=njobs,
                                    metadata=train_data)
            dev_set = get_dataset(dset='dev',
                                  dataset_config=dataset_config,
                                  njobs=njobs,
                                  metadata=dev_data)
        else:
            train_set = get_dataset(dset='train',
                                    dataset_config=dataset_config,
                                    njobs=njobs)
            dev_set = get_dataset(dset='dev',
                                  dataset_config=dataset_config,
                                  njobs=njobs)
            pk.dump(train_set.data, open(train_pkl, 'wb'))
            pk.dump(dev_set.data, open(dev_pkl, 'wb'))
        train_loader = get_dataloader(dset='train',
                                      dataloader_config=dataloader_config,
                                      dataset=train_set)
        dev_loader = get_dataloader(dset='dev',
                                    dataloader_config=dataloader_config,
                                    dataset=dev_set)

        return ckpt_dir_flag, train_set, dev_set, train_loader, dev_loader
Example #12
def main(config):
    trainset = get_dataset(config.train_data_dir, config)
    blockSampler = RandomBatchSampler(trainset,
                                      config.batch_size,
                                      drop_last=True)
    train_loader = DataLoader(dataset=trainset,
                              batch_sampler=blockSampler,
                              collate_fn=CropCollate(0),
                              num_workers=config.num_workers)
    solver = Solver(train_loader, config)
    solver.train()
Example #13
    def test_udd_ofr_rids(self):
        """test one of relation ID is larger than number of relations
        """
        extra_train = ['0\t2\t1\n']
        files = gen_udd_files(extra_train=extra_train)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_valid = ['0\t2\t1\n']
        files = gen_udd_files(extra_valid=extra_valid)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)

        extra_test = ['0\t2\t1\n']
        files = gen_udd_files(extra_test=extra_test)
        with self.assertRaises(AssertionError):
            dataset = get_dataset('./', 'udd_test', 'udd_hrt', '\t', files)
        cleanup(files)
Example #14
def main():
    """
    Main predict function for the wikikg90m
    """
    args = ArgParser().parse_args()
    config = load_model_config(
        os.path.join(args.model_path, 'model_config.json'))
    args = use_config_replace_args(args, config)
    dataset = get_dataset(args, args.data_path, args.dataset, args.format,
                          args.delimiter, args.data_files,
                          args.has_edge_importance)
    print("Load the dataset done.")
    eval_dataset = EvalDataset(dataset, args)

    model = BaseKEModel(
        args=args,
        n_entities=dataset.n_entities,
        n_relations=dataset.n_relations,
        model_name=args.model_name,
        hidden_size=args.hidden_dim,
        entity_feat_dim=dataset.entity_feat.shape[1],
        relation_feat_dim=dataset.relation_feat.shape[1],
        gamma=args.gamma,
        double_entity_emb=args.double_ent,
        cpu_emb=args.cpu_emb,
        relation_times=args.ote_size,
        scale_type=args.scale_type)

    print("Create the model done.")
    model.entity_feat = dataset.entity_feat
    model.relation_feat = dataset.relation_feat
    load_model_from_checkpoint(model, args.model_path)
    print("The model load the checkpoint done.")

    if args.infer_valid:
        valid_sampler_tail = eval_dataset.create_sampler(
            'valid',
            args.batch_size_eval,
            mode='tail',
            num_workers=args.num_workers,
            rank=0,
            ranks=1)
        infer(args, model, config, 0, [valid_sampler_tail], "valid")

    if args.infer_test:
        test_sampler_tail = eval_dataset.create_sampler(
            'test',
            args.batch_size_eval,
            mode='tail',
            num_workers=args.num_workers,
            rank=0,
            ranks=1)
        infer(args, model, config, 0, [test_sampler_tail], "test")
Example #15
def main():
    print('===> Loading datasets')
    train_set = get_dataset(args.patch_size,
                            args.upscale_factor,
                            phase='train')
    train_loader = DataLoader(dataset=train_set,
                              batch_size=args.bs,
                              shuffle=True)

    test_set = get_dataset(0, args.upscale_factor,
                           phase='test')  # patch_size is unused in the test phase, so 0 is a placeholder
    test_loader = DataLoader(dataset=test_set, batch_size=1, shuffle=False)

    if args.model == 'srcnn':
        model = SRCNNTrainer(args, train_loader, test_loader)
    elif args.model == 'srdcn':
        model = SRDCNTrainer(args, train_loader, test_loader)
    else:
        raise ValueError('unknown model: {}'.format(args.model))

    model.run()
Example #16
def main(config):
    dataset = get_dataset(config.data_dir, config)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.num_workers,
                            drop_last=False)
    evaluator = Evaluation(config)
    if not os.path.exists(config.post_dir):
        os.makedirs(config.post_dir)
    evaluator.extract_post_to_hardisk(
        dataloader,
        'ark,scp:{0}/posts.ark,{0}/posts.scp'.format(config.post_dir))
Example #17
def pretrain(args):
    tf = get_transform(args, 'none')
    ds = get_dataset(args, tf, 'none')

    args, model, ckpt_available = get_model_ckpt(args)

    if ckpt_available:
        print("loaded checkpoint {} in pretraining stage".format(args.ckpt_name))
        loss_fn = get_loss(args)
        sub_optimizer = get_sub_optimizer(args, model)
        optimizer = get_optimizer(args, sub_optimizer)
        scheduler = get_scheduler(args, optimizer)

        # setup nvidia/apex amp
        # model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, num_losses=1)
        # model = idist.auto_model(model)

        trainer = get_trainer(args, model, loss_fn, optimizer, scheduler)
Example #18
def pretrain(args):
    tf = get_transform(args, 'none')
    ds = get_dataset(args, tf, 'none')

    args, model, ckpt_available = get_model_ckpt(args)

    if ckpt_available:
        print("loaded checkpoint {} in pretraining stage".format(
            args.ckpt_name))
    loss_fn = get_loss(args)
    sub_optimizer = get_sub_optimizer(args, model)
    optimizer = get_optimizer(args, sub_optimizer)
    scheduler = get_scheduler(args, optimizer)

    # setup nvidia/apex amp
    # model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, num_losses=1)
    # model = idist.auto_model(model)

    trainer = get_trainer(args, model, loss_fn, optimizer, scheduler)

    metrics = get_metrics(args)
    logger = get_logger(args)

    @trainer.on(Events.STARTED)
    def on_training_started(engine):
        print("Begin Pretraining")

    # batch-wise
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_iter_results(engine):
        log_results(logger, 'pretrain/iter', engine.state,
                    engine.state.iteration)

    # epoch-wise (ckpt)
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_epoch(engine):
        log_results(logger, 'pretrain/epoch', engine.state, engine.state.epoch)
        log_results_cmd(logger, 'pretrain/epoch', engine.state,
                        engine.state.epoch)
        save_ckpt(args, engine.state.epoch, engine.state.metrics['loss'],
                  model)

    trainer.run(ds, max_epochs=args.epoch)
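Note: the trainer here is driven with @trainer.on(Events.ITERATION_COMPLETED) and similar handlers, which matches the pytorch-ignite Engine API. Assuming that is indeed the library in use, a self-contained toy illustration of the event mechanism (unrelated to the pretraining code above) looks like this:

from ignite.engine import Engine, Events

def process_batch(engine, batch):
    # A real step would run the model on the batch and return the loss.
    return sum(batch)

toy_engine = Engine(process_batch)

@toy_engine.on(Events.EPOCH_COMPLETED)
def report(engine):
    print('epoch', engine.state.epoch, 'last output', engine.state.output)

toy_engine.run([[1, 2], [3, 4]], max_epochs=2)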
Example #19
def main():
    parser = argparse.ArgumentParser(description='Partition a knowledge graph')
    parser.add_argument('--data_path', type=str, default='data',
                        help='root path of all dataset')
    parser.add_argument('--dataset', type=str, default='FB15k',
                        help='dataset name, under data_path')
    parser.add_argument('--data_files', type=str, default=None, nargs='+',
                        help='a list of data files, e.g. entity relation train valid test')
    parser.add_argument('--format', type=str, default='built_in',
                        help='the format of the dataset, it can be built_in, '
                             'raw_udd_{htr} and udd_{htr}')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    args = parser.parse_args()
    num_parts = args.num_parts

    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
            shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)

    part_dict = dgl.transform.metis_partition(g, num_parts, 1)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format(
              part_id, part.number_of_nodes(), part.number_of_edges(),
              num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges

        part.copy_from_parent()
        save_graphs(args.data_path + '/part_' + str(part_id) + '.dgl', [part])
    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
Example #20
def eval_linear(pretrain_args, args):
    # get pretrained model
    pt_args, pt_model, ckpt_available = get_model_ckpt(pretrain_args)
    
    tf = get_transform(args, 'train')
    ds = get_dataset(args, tf, 'train')

    if ckpt_available:
        print("loaded pretrained model {} in eval linear".format(args.ckpt_name))

    model = get_linear(args, pt_model, args.num_classes)
    loss_fn = get_loss(args)
    optimizer = get_sub_optimizer(args, model)
    scheduler = get_scheduler(args, optimizer)

    trainer = get_trainer(args, model, loss_fn, optimizer, scheduler)
    evaluator = get_evaluator(args, model, loss_fn)

    # metrics = get_metrics(args)
    logger = get_logger(args)
    trainer.run(ds, max_epochs=args.epoch)
Example #21
def main():
    args = get_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_ids
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(args.tensorboard_dir):
        os.makedirs(args.tensorboard_dir)
    logging = get_logging()
    seed_everything(args.seed)

    transforms_train = Compose([
        RandomResizedCrop(args.img_size, args.img_size),
        Transpose(p=0.5),
        HorizontalFlip(p=0.5),
        VerticalFlip(p=0.5),
        ShiftScaleRotate(p=0.5),
        HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
        RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        CoarseDropout(p=0.5),
        ToTensorV2(p=1.0),
    ], p=1.)
    transforms_val = Compose([
        CenterCrop(args.img_size, args.img_size, p=1.),
        Resize(args.img_size, args.img_size),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        ToTensorV2(p=1.0),
    ], p=1.)

    train_loader, val_loader = get_dataset(args.data_path, args.batch_size, args.batch_size_val, transforms_train,
                                           transforms_val)
    net = get_model(True, device)

    trainer = Trainer(net, train_loader, val_loader, args, device, logging)
    trainer.train()
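Note: the Compose pipelines above follow the albumentations API, where a composed transform is called with keyword arguments and returns a dict. Assuming that is the library in use, applying transforms_train to a single image would look roughly like this (example.jpg is a hypothetical local file):

import cv2

image = cv2.imread('example.jpg')                # HWC, BGR, uint8
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)   # albumentations expects RGB
augmented = transforms_train(image=image)        # transforms_train from the example above
tensor = augmented['image']                      # CHW float tensor after Normalize + ToTensorV2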
Example #22
def main(args):
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format)
    args.pickle_graph = False
    args.train = False
    args.valid = False
    args.test = True
    args.batch_size_eval = args.batch_size

    logger = get_logger(args)
    # Here we want to use the regular negative sampler because we need to ensure that
    # all positive edges are excluded.
    eval_dataset = EvalDataset(dataset, args)
    args.neg_sample_size_test = args.neg_sample_size
    if args.neg_sample_size < 0:
        args.neg_sample_size_test = args.neg_sample_size = eval_dataset.g.number_of_nodes(
        )
    if args.num_proc > 1:
        test_sampler_tails = []
        test_sampler_heads = []
        for i in range(args.num_proc):
            test_sampler_head = eval_dataset.create_sampler(
                'test',
                args.batch_size,
                args.neg_sample_size,
                mode='PBG-head',
                num_workers=args.num_worker,
                rank=i,
                ranks=args.num_proc)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size,
                args.neg_sample_size,
                mode='PBG-tail',
                num_workers=args.num_worker,
                rank=i,
                ranks=args.num_proc)
            test_sampler_heads.append(test_sampler_head)
            test_sampler_tails.append(test_sampler_tail)
    else:
        test_sampler_head = eval_dataset.create_sampler(
            'test',
            args.batch_size,
            args.neg_sample_size,
            mode='PBG-head',
            num_workers=args.num_worker,
            rank=0,
            ranks=1)
        test_sampler_tail = eval_dataset.create_sampler(
            'test',
            args.batch_size,
            args.neg_sample_size,
            mode='PBG-tail',
            num_workers=args.num_worker,
            rank=0,
            ranks=1)

    # load model
    n_entities = dataset.n_entities
    n_relations = dataset.n_relations
    ckpt_path = args.model_path
    model = load_model_from_checkpoint(logger, args, n_entities, n_relations,
                                       ckpt_path)

    if args.num_proc > 1:
        model.share_memory()
    # test
    args.step = 0
    args.max_step = 0
    if args.num_proc > 1:
        procs = []
        for i in range(args.num_proc):
            proc = mp.Process(target=test,
                              args=(args, model, [
                                  test_sampler_heads[i], test_sampler_tails[i]
                              ]))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        test(args, model, [test_sampler_head, test_sampler_tail])
Example #23
def main(args):
    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample:
        assert not args.eval_filter, "if negative sampling is based on degree, we can't filter positive edges."

    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format)
    args.pickle_graph = False
    args.train = False
    args.valid = False
    args.test = True
    args.batch_size_eval = args.batch_size

    logger = get_logger(args)
    # Here we want to use the regular negative sampler because we need to ensure that
    # all positive edges are excluded.
    eval_dataset = EvalDataset(dataset, args)

    args.neg_sample_size_test = args.neg_sample_size
    args.neg_deg_sample_eval = args.neg_deg_sample
    if args.neg_sample_size < 0:
        args.neg_sample_size_test = args.neg_sample_size = eval_dataset.g.number_of_nodes(
        )
    if args.neg_chunk_size < 0:
        args.neg_chunk_size = args.neg_sample_size

    num_workers = args.num_worker
    # for multiprocessing evaluation, we don't need to sample multiple batches at a time
    # in each process.
    if args.num_proc > 1:
        num_workers = 1
    if args.num_proc > 1:
        test_sampler_tails = []
        test_sampler_heads = []
        for i in range(args.num_proc):
            test_sampler_head = eval_dataset.create_sampler(
                'test',
                args.batch_size,
                args.neg_sample_size,
                args.neg_chunk_size,
                args.eval_filter,
                mode='chunk-head',
                num_workers=num_workers,
                rank=i,
                ranks=args.num_proc)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size,
                args.neg_sample_size,
                args.neg_chunk_size,
                args.eval_filter,
                mode='chunk-tail',
                num_workers=num_workers,
                rank=i,
                ranks=args.num_proc)
            test_sampler_heads.append(test_sampler_head)
            test_sampler_tails.append(test_sampler_tail)
    else:
        test_sampler_head = eval_dataset.create_sampler(
            'test',
            args.batch_size,
            args.neg_sample_size,
            args.neg_chunk_size,
            args.eval_filter,
            mode='chunk-head',
            num_workers=num_workers,
            rank=0,
            ranks=1)
        test_sampler_tail = eval_dataset.create_sampler(
            'test',
            args.batch_size,
            args.neg_sample_size,
            args.neg_chunk_size,
            args.eval_filter,
            mode='chunk-tail',
            num_workers=num_workers,
            rank=0,
            ranks=1)

    # load model
    n_entities = dataset.n_entities
    n_relations = dataset.n_relations
    ckpt_path = args.model_path
    model = load_model_from_checkpoint(logger, args, n_entities, n_relations,
                                       ckpt_path)

    if args.num_proc > 1:
        model.share_memory()
    # test
    args.step = 0
    args.max_step = 0
    start = time.time()
    if args.num_proc > 1:
        queue = mp.Queue(args.num_proc)
        procs = []
        for i in range(args.num_proc):
            proc = mp.Process(target=test,
                              args=(args, model, [
                                  test_sampler_heads[i], test_sampler_tails[i]
                              ], 'Test', queue))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()

        total_metrics = {}
        for i in range(args.num_proc):
            metrics = queue.get()
            for k, v in metrics.items():
                if i == 0:
                    total_metrics[k] = v / args.num_proc
                else:
                    total_metrics[k] += v / args.num_proc
        for k, v in total_metrics.items():
            print('Test average {} at [{}/{}]: {}'.format(
                k, args.step, args.max_step, v))
    else:
        test(args, model, [test_sampler_head, test_sampler_tail])
    print('Test takes {:.3f} seconds'.format(time.time() - start))
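Note: several of these runners collect per-process metrics through an mp.Queue and then average them, reading from the queue before joining the workers. Stripped of the knowledge-graph specifics, the pattern looks like this (a generic illustration, not the project's code):

import multiprocessing as mp

def worker(rank, queue):
    # Each process reports its own metrics dict.
    queue.put({'mrr': 0.5 + 0.01 * rank, 'hits@10': 0.8})

if __name__ == '__main__':
    num_proc = 4
    queue = mp.Queue(num_proc)
    procs = [mp.Process(target=worker, args=(i, queue)) for i in range(num_proc)]
    for p in procs:
        p.start()
    totals = {}
    for _ in range(num_proc):          # drain the queue before joining
        for k, v in queue.get().items():
            totals[k] = totals.get(k, 0.0) + v / num_proc
    for p in procs:
        p.join()
    print(totals)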
Example #24
def run(args, logger):
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format)
    n_entities = dataset.n_entities
    n_relations = dataset.n_relations
    if args.neg_sample_size_test < 0:
        args.neg_sample_size_test = n_entities

    train_data = TrainDataset(dataset, args, ranks=args.num_proc)
    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            train_sampler_head = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                mode='PBG-head',
                num_workers=args.num_worker,
                shuffle=True,
                exclude_positive=True,
                rank=i)
            train_sampler_tail = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                mode='PBG-tail',
                num_workers=args.num_worker,
                shuffle=True,
                exclude_positive=True,
                rank=i)
            train_samplers.append(
                NewBidirectionalOneShotIterator(train_sampler_head,
                                                train_sampler_tail, True,
                                                n_entities))
    else:
        train_sampler_head = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            mode='PBG-head',
            num_workers=args.num_worker,
            shuffle=True,
            exclude_positive=True)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            mode='PBG-tail',
            num_workers=args.num_worker,
            shuffle=True,
            exclude_positive=True)
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, True, n_entities)

    if args.valid or args.test:
        eval_dataset = EvalDataset(dataset, args)
    if args.valid:
        # Here we want to use the regular negative sampler because we need to ensure that
        # all positive edges are excluded.
        if args.num_proc > 1:
            valid_sampler_heads = []
            valid_sampler_tails = []
            for i in range(args.num_proc):
                valid_sampler_head = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_valid,
                    mode='PBG-head',
                    num_workers=args.num_worker,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_tail = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_valid,
                    mode='PBG-tail',
                    num_workers=args.num_worker,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_heads.append(valid_sampler_head)
                valid_sampler_tails.append(valid_sampler_tail)
        else:
            valid_sampler_head = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_valid,
                mode='PBG-head',
                num_workers=args.num_worker,
                rank=0,
                ranks=1)
            valid_sampler_tail = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_valid,
                mode='PBG-tail',
                num_workers=args.num_worker,
                rank=0,
                ranks=1)
    if args.test:
        # Here we want to use the regular negative sampler because we need to ensure that
        # all positive edges are excluded.
        if args.num_proc > 1:
            test_sampler_tails = []
            test_sampler_heads = []
            for i in range(args.num_proc):
                test_sampler_head = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    mode='PBG-head',
                    num_workers=args.num_worker,
                    rank=i,
                    ranks=args.num_proc)
                test_sampler_tail = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    mode='PBG-tail',
                    num_workers=args.num_worker,
                    rank=i,
                    ranks=args.num_proc)
                test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)
        else:
            test_sampler_head = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_test,
                mode='PBG-head',
                num_workers=args.num_worker,
                rank=0,
                ranks=1)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_test,
                mode='PBG-tail',
                num_workers=args.num_worker,
                rank=0,
                ranks=1)

    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None
    # load model
    model = load_model(logger, args, n_entities, n_relations)

    if args.num_proc > 1:
        model.share_memory()

    # train
    start = time.time()
    if args.num_proc > 1:
        procs = []
        for i in range(args.num_proc):
            valid_samplers = [valid_sampler_heads[i], valid_sampler_tails[i]
                              ] if args.valid else None
            proc = mp.Process(target=train,
                              args=(args, model, train_samplers[i],
                                    valid_samplers))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_head, valid_sampler_tail
                          ] if args.valid else None
        train(args, model, train_sampler, valid_samplers)
    print('training takes {} seconds'.format(time.time() - start))

    if args.save_emb is not None:
        if not os.path.exists(args.save_emb):
            os.mkdir(args.save_emb)
        model.save_emb(args.save_emb, args.dataset)

    # test
    if args.test:
        if args.num_proc > 1:
            procs = []
            for i in range(args.num_proc):
                proc = mp.Process(target=test,
                                  args=(args, model, [
                                      test_sampler_heads[i],
                                      test_sampler_tails[i]
                                  ]))
                procs.append(proc)
                proc.start()
            for proc in procs:
                proc.join()
        else:
            test(args, model, [test_sampler_head, test_sampler_tail])
Example #25
from dataloader import get_dataset, transform
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from matplotlib import pyplot as plt
from torchvision import transforms

train_iter = get_dataset('train', transform=transform, batch_size=1)

for idx, item in enumerate(train_iter):
    # print(item)
    data, annotation = item
    data, _ = data
    annotation, _ = annotation
    if idx % 100 == 0:
        print(idx)

print('finished')
Example #26
def run(args, logger):
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format)
    n_entities = dataset.n_entities
    n_relations = dataset.n_relations
    if args.neg_sample_size_test < 0:
        args.neg_sample_size_test = n_entities
    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, "if negative sampling is based on degree, we can't filter positive edges."

    # When we generate a batch of negative edges from a set of positive edges,
    # we first divide the positive edges into chunks and corrupt the edges in a chunk
    # together. By default, the chunk size is equal to the negative sample size.
    # Usually, this works well. But we also allow users to specify the chunk size themselves.
    if args.neg_chunk_size < 0:
        args.neg_chunk_size = args.neg_sample_size
    if args.neg_chunk_size_valid < 0:
        args.neg_chunk_size_valid = args.neg_sample_size_valid
    if args.neg_chunk_size_test < 0:
        args.neg_chunk_size_test = args.neg_sample_size_test

    train_data = TrainDataset(dataset, args, ranks=args.num_proc)
    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            train_sampler_head = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_chunk_size,
                mode='chunk-head',
                num_workers=args.num_worker,
                shuffle=True,
                exclude_positive=True,
                rank=i)
            train_sampler_tail = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_chunk_size,
                mode='chunk-tail',
                num_workers=args.num_worker,
                shuffle=True,
                exclude_positive=True,
                rank=i)
            train_samplers.append(
                NewBidirectionalOneShotIterator(train_sampler_head,
                                                train_sampler_tail,
                                                args.neg_chunk_size, True,
                                                n_entities))
    else:
        train_sampler_head = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_chunk_size,
            mode='chunk-head',
            num_workers=args.num_worker,
            shuffle=True,
            exclude_positive=True)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_chunk_size,
            mode='chunk-tail',
            num_workers=args.num_worker,
            shuffle=True,
            exclude_positive=True)
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_chunk_size, True,
            n_entities)

    # for multiprocessing evaluation, we don't need to sample multiple batches at a time
    # in each process.
    num_workers = args.num_worker
    if args.num_proc > 1:
        num_workers = 1
    if args.valid or args.test:
        eval_dataset = EvalDataset(dataset, args)
    if args.valid:
        # Here we want to use the regular negative sampler because we need to ensure that
        # all positive edges are excluded.
        if args.num_proc > 1:
            valid_sampler_heads = []
            valid_sampler_tails = []
            for i in range(args.num_proc):
                valid_sampler_head = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_valid,
                    args.neg_chunk_size_valid,
                    args.eval_filter,
                    mode='chunk-head',
                    num_workers=num_workers,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_tail = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_valid,
                    args.neg_chunk_size_valid,
                    args.eval_filter,
                    mode='chunk-tail',
                    num_workers=num_workers,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_heads.append(valid_sampler_head)
                valid_sampler_tails.append(valid_sampler_tail)
        else:
            valid_sampler_head = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_valid,
                args.neg_chunk_size_valid,
                args.eval_filter,
                mode='chunk-head',
                num_workers=num_workers,
                rank=0,
                ranks=1)
            valid_sampler_tail = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_valid,
                args.neg_chunk_size_valid,
                args.eval_filter,
                mode='chunk-tail',
                num_workers=num_workers,
                rank=0,
                ranks=1)
    if args.test:
        # Here we want to use the regular negative sampler because we need to ensure that
        # all positive edges are excluded.
        if args.num_proc > 1:
            test_sampler_tails = []
            test_sampler_heads = []
            for i in range(args.num_proc):
                test_sampler_head = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    args.neg_chunk_size_test,
                    args.eval_filter,
                    mode='chunk-head',
                    num_workers=num_workers,
                    rank=i,
                    ranks=args.num_proc)
                test_sampler_tail = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    args.neg_chunk_size_test,
                    args.eval_filter,
                    mode='chunk-tail',
                    num_workers=num_workers,
                    rank=i,
                    ranks=args.num_proc)
                test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)
        else:
            test_sampler_head = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_test,
                args.neg_chunk_size_test,
                args.eval_filter,
                mode='chunk-head',
                num_workers=num_workers,
                rank=0,
                ranks=1)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_test,
                args.neg_chunk_size_test,
                args.eval_filter,
                mode='chunk-tail',
                num_workers=num_workers,
                rank=0,
                ranks=1)

    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None
    # load model
    model = load_model(logger, args, n_entities, n_relations)

    if args.num_proc > 1:
        model.share_memory()

    # train
    start = time.time()
    if args.num_proc > 1:
        procs = []
        for i in range(args.num_proc):
            rel_parts = train_data.rel_parts if args.rel_part else None
            valid_samplers = [valid_sampler_heads[i], valid_sampler_tails[i]
                              ] if args.valid else None
            proc = mp.Process(target=train,
                              args=(args, model, train_samplers[i], i,
                                    rel_parts, valid_samplers))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_head, valid_sampler_tail
                          ] if args.valid else None
        train(args, model, train_sampler, valid_samplers)
    print('training takes {} seconds'.format(time.time() - start))

    if args.save_emb is not None:
        if not os.path.exists(args.save_emb):
            os.mkdir(args.save_emb)
        model.save_emb(args.save_emb, args.dataset)

    # test
    if args.test:
        start = time.time()
        if args.num_proc > 1:
            queue = mp.Queue(args.num_proc)
            procs = []
            for i in range(args.num_proc):
                proc = mp.Process(target=test,
                                  args=(args, model, [
                                      test_sampler_heads[i],
                                      test_sampler_tails[i]
                                  ], i, 'Test', queue))
                procs.append(proc)
                proc.start()

            total_metrics = {}
            for i in range(args.num_proc):
                metrics = queue.get()
                for k, v in metrics.items():
                    if i == 0:
                        total_metrics[k] = v / args.num_proc
                    else:
                        total_metrics[k] += v / args.num_proc
            for k, v in total_metrics.items():
                print('Test average {} at [{}/{}]: {}'.format(
                    k, args.step, args.max_step, v))

            for proc in procs:
                proc.join()
        else:
            test(args, model, [test_sampler_head, test_sampler_tail])
        print('test:', time.time() - start)
Example #27
def dist_train_test(args,
                    model,
                    train_sampler,
                    entity_pb,
                    relation_pb,
                    l2g,
                    rank=0,
                    rel_parts=None,
                    cross_rels=None,
                    barrier=None):
    if args.num_proc > 1:
        th.set_num_threads(args.num_thread)

    client = connect_to_kvstore(args, entity_pb, relation_pb, l2g)
    client.barrier()
    train_time_start = time.time()
    train(args, model, train_sampler, None, rank, rel_parts, cross_rels,
          barrier, client)
    client.barrier()
    print('Total train time {:.3f} seconds'.format(time.time() -
                                                   train_time_start))

    model = None

    if client.get_id() % args.num_client == 0:  # pull full model from kvstore

        args.num_test_proc = args.num_client
        dataset_full = get_dataset(args.data_path, args.dataset, args.format)

        print('Full data n_entities: ' + str(dataset_full.n_entities))
        print("Full data n_relations: " + str(dataset_full.n_relations))

        model_test = load_model(None, args, dataset_full.n_entities,
                                dataset_full.n_relations)
        eval_dataset = EvalDataset(dataset_full, args)

        if args.test:
            model_test.share_memory()

        if args.neg_sample_size_test < 0:
            args.neg_sample_size_test = dataset_full.n_entities
        args.eval_filter = not args.no_eval_filter
        if args.neg_deg_sample_eval:
            assert not args.eval_filter, "if negative sampling is based on degree, we can't filter positive edges."

        if args.neg_chunk_size_valid < 0:
            args.neg_chunk_size_valid = args.neg_sample_size_valid
        if args.neg_chunk_size_test < 0:
            args.neg_chunk_size_test = args.neg_sample_size_test

        print("Pull relation_emb ...")
        relation_id = F.arange(0, model_test.n_relations)
        relation_data = client.pull(name='relation_emb', id_tensor=relation_id)
        model_test.relation_emb.emb[relation_id] = relation_data

        print("Pull entity_emb ... ")
        # split model into 100 small parts
        start = 0
        percent = 0
        entity_id = F.arange(0, model_test.n_entities)
        count = int(model_test.n_entities / 100)
        end = start + count
        while True:
            print("Pull %d / 100 ..." % percent)
            if end >= model_test.n_entities:
                end = -1
            tmp_id = entity_id[start:end]
            entity_data = client.pull(name='entity_emb', id_tensor=tmp_id)
            model_test.entity_emb.emb[tmp_id] = entity_data
            if end == -1:
                break
            start = end
            end += count
            percent += 1

        if args.save_emb is not None:
            if not os.path.exists(args.save_emb):
                os.mkdir(args.save_emb)
            model_test.save_emb(args.save_emb, args.dataset)

        if args.test:
            args.num_thread = 1
            test_sampler_tails = []
            test_sampler_heads = []
            for i in range(args.num_test_proc):
                test_sampler_head = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    args.neg_chunk_size_test,
                    args.eval_filter,
                    mode='chunk-head',
                    num_workers=args.num_thread,
                    rank=i,
                    ranks=args.num_test_proc)
                test_sampler_tail = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_test,
                    args.neg_chunk_size_test,
                    args.eval_filter,
                    mode='chunk-tail',
                    num_workers=args.num_thread,
                    rank=i,
                    ranks=args.num_test_proc)
                test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)

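            # drop dataset references so they are not duplicated when the test processes fork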
            eval_dataset = None
            dataset_full = None

            print("Run test, test processes: %d" % args.num_test_proc)

            queue = mp.Queue(args.num_test_proc)
            procs = []
            for i in range(args.num_test_proc):
                proc = mp.Process(target=test_mp,
                                  args=(args, model_test, [
                                      test_sampler_heads[i],
                                      test_sampler_tails[i]
                                  ], i, 'Test', queue))
                procs.append(proc)
                proc.start()

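            # collect the per-process logs and report the mean of each metric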
            total_metrics = {}
            metrics = {}
            logs = []
            for i in range(args.num_test_proc):
                log = queue.get()
                logs = logs + log

            for metric in logs[0].keys():
                metrics[metric] = sum([log[metric]
                                       for log in logs]) / len(logs)
            for k, v in metrics.items():
                print('Test average {} at [{}/{}]: {}'.format(
                    k, args.step, args.max_step, v))

            for proc in procs:
                proc.join()

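        # client 0 is responsible for shutting down the kvstore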
        if client.get_id() == 0:
            client.shut_down()
Exemplo n.º 28
0
parser = argparse.ArgumentParser()
parser.add_argument('--dataroot',
                    required=True,
                    help='path to the dataset root directory')
parser.add_argument('--kernel',
                    required=True,
                    help='kernel : linear | polynomial | gaussian')
parser.add_argument('--q',
                    type=int,
                    default=2,
                    help='parameter for polynomial kernel')
parser.add_argument('--sigma',
                    type=float,
                    default=1,
                    help='parameter for gaussian kernel')
parser.add_argument('--normalize',
                    action='store_true',
                    help='toggle for normalizing the input data')
opt = parser.parse_args()

train, test_x = get_dataset(root=opt.dataroot, normalize=opt.normalize)
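# the last column of the training matrix holds the labels; the remaining columns are the features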
train_x, train_y = train[:, :-1], train[:, -1]

cross_valid = StratifiedKFold(n_splits=5)

tr_accuracies = []
va_accuracies = []
tr_times = []

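# run 5-fold stratified cross-validation, recording per-fold training/validation accuracy and training time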
for tr, va in cross_valid.split(train_x, train_y):

    print('Started cross validation split: {0}'.format(len(tr_accuracies) + 1))
    print('Ratio: {0}/{1} :: TR/VA'.format(tr.shape[0], va.shape[0]))

    tr_x, va_x = train_x[tr], train_x[va]
    tr_y, va_y = train_y[tr], train_y[va]
Exemplo n.º 29
0
def run(args, logger):
    init_time_start = time.time()
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format,
                          args.data_files)

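    # a negative eval sample size means "use all entities as negatives"; batch sizes are then adjusted to stay compatible with the negative sample sizes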
    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size,
                                                args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval,
                                                     args.neg_sample_size_eval)

    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, "if negative sampling is based on degree, we cannot filter positive edges."

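    # partition the training triples across args.num_proc workers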
    train_data = TrainDataset(dataset, args, ranks=args.num_proc)
    # if there is no cross-partition relation, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and not train_data.cross_part
    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and train_data.cross_part
    args.num_workers = 8  # fix num_workers at 8

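    # each training process gets its own bidirectional (head/tail corruption) one-shot iterator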
    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            train_sampler_head = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_sample_size,
                mode='head',
                num_workers=args.num_workers,
                shuffle=True,
                exclude_positive=False,
                rank=i)
            train_sampler_tail = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_sample_size,
                mode='tail',
                num_workers=args.num_workers,
                shuffle=True,
                exclude_positive=False,
                rank=i)
            train_samplers.append(
                NewBidirectionalOneShotIterator(train_sampler_head,
                                                train_sampler_tail,
                                                args.neg_sample_size,
                                                args.neg_sample_size, True,
                                                dataset.n_entities))

        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities)
    else:  # single-process path, mainly used for debugging
        train_sampler_head = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='head',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='tail',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False)
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities)

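    # cap the number of test processes at the number of GPUs when more than one GPU is given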
    if args.valid or args.test:
        if len(args.gpu) > 1:
            args.num_test_proc = args.num_proc if args.num_proc < len(
                args.gpu) else len(args.gpu)
        else:
            args.num_test_proc = args.num_proc
        eval_dataset = EvalDataset(dataset, args)

    if args.valid:
        if args.num_proc > 1:
            valid_sampler_heads = []
            valid_sampler_tails = []
            for i in range(args.num_proc):
                valid_sampler_head = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='chunk-head',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_tail = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='chunk-tail',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_proc)
                valid_sampler_heads.append(valid_sampler_head)
                valid_sampler_tails.append(valid_sampler_tail)
        else:  # single-process path, mainly used for debugging
            valid_sampler_head = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                args.neg_sample_size_eval,
                args.eval_filter,
                mode='chunk-head',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)
            valid_sampler_tail = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                args.neg_sample_size_eval,
                args.eval_filter,
                mode='chunk-tail',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)
    if args.test:
        if args.num_test_proc > 1:
            test_sampler_tails = []
            test_sampler_heads = []
            for i in range(args.num_test_proc):
                test_sampler_head = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='chunk-head',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_test_proc)
                test_sampler_tail = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='chunk-tail',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_test_proc)
                test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)
        else:
            test_sampler_head = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                args.neg_sample_size_eval,
                args.eval_filter,
                mode='chunk-head',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                args.neg_sample_size_eval,
                args.eval_filter,
                mode='chunk-tail',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)

    # load model
    model = load_model(logger, args, dataset.n_entities, dataset.n_relations)
    if args.num_proc > 1 or args.async_update:
        model.share_memory()

    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None

    print('Total initialization time {:.3f} seconds'.format(time.time() - init_time_start))

    # train
    start = time.time()
    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None
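    # in multi-process mode each worker trains on its own sampler and synchronizes through the barrier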
    if args.num_proc > 1:
        procs = []
        barrier = mp.Barrier(args.num_proc)
        for i in range(args.num_proc):
            valid_sampler = [valid_sampler_heads[i], valid_sampler_tails[i]
                             ] if args.valid else None
            proc = mp.Process(target=train_mp,
                              args=(args, model, train_samplers[i],
                                    valid_sampler, i, rel_parts, cross_rels,
                                    barrier))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_head, valid_sampler_tail
                          ] if args.valid else None
        train(args, model, train_sampler, valid_samplers, rel_parts=rel_parts)

    print('training takes {:.3f} seconds'.format(time.time() - start))

    if args.save_emb is not None:
        if not os.path.exists(args.save_emb):
            os.mkdir(args.save_emb)
        model.save_emb(args.save_emb, args.dataset)

        # We need to save the model configurations as well.
        conf_file = os.path.join(args.save_emb, 'config.json')
        with open(conf_file, 'w') as outfile:
            json.dump(
                {
                    'dataset': args.dataset,
                    'model': args.model_name,
                    'emb_size': args.hidden_dim,
                    'max_train_step': args.max_step,
                    'batch_size': args.batch_size,
                    'neg_sample_size': args.neg_sample_size,
                    'lr': args.lr,
                    'gamma': args.gamma,
                    'double_ent': args.double_ent,
                    'double_rel': args.double_rel,
                    'neg_adversarial_sampling': args.neg_adversarial_sampling,
                    'adversarial_temperature': args.adversarial_temperature,
                    'regularization_coef': args.regularization_coef,
                    'regularization_norm': args.regularization_norm
                },
                outfile,
                indent=4)

    # test
    if args.test:
        start = time.time()
        if args.num_test_proc > 1:
            queue = mp.Queue(args.num_test_proc)
            procs = []
            for i in range(args.num_test_proc):
                proc = mp.Process(target=test_mp,
                                  args=(args, model, [
                                      test_sampler_heads[i],
                                      test_sampler_tails[i]
                                  ], i, 'Test', queue))
                procs.append(proc)
                proc.start()

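            # gather logs from every test worker and average each metric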
            total_metrics = {}
            metrics = {}
            logs = []
            for i in range(args.num_test_proc):
                log = queue.get()
                logs = logs + log

            for metric in logs[0].keys():
                metrics[metric] = sum([log[metric]
                                       for log in logs]) / len(logs)
            for k, v in metrics.items():
                print('Test average {} : {}'.format(k, v))

            for proc in procs:
                proc.join()
        else:
            test(args, model, [test_sampler_head, test_sampler_tail])
        print('testing takes {:.3f} seconds'.format(time.time() - start))
Exemplo n.º 30
0
    d = {
        "costs": costs,
        "Y_prediction_train": Y_prediction_train,
        "Y_prediction_test": Y_prediction_test,
        "w": w,
        "b": b,
        "epoch": epoch,
        "learning_rate": learning_rate
    }

    return d


if __name__ == "__main__":
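    # load the raw image data, flatten each sample into a column vector, and scale pixel values to [0, 1]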
    train_x_orig, train_y, test_x_orig, test_y, classes = get_dataset()

    # Reshape train/test samples
    train_x_flatten = train_x_orig.reshape(train_x_orig.shape[0], -1).T
    test_x_flatten = test_x_orig.reshape(test_x_orig.shape[0], -1).T
    # # Example of a picture
    # index = 2
    # plt.imshow(train_x_orig[index])
    # plt.show()
    # print("y = " + str(train_y[:, index]) + ", it's a '" + classes[np.squeeze(train_y[:, index])].decode(
    #     "utf-8") + "' picture.")
    # print(train_x_orig.shape)
    # print(train_x_flatten.shape)
    # normalize the image data to the [0, 1] range
    train_x = train_x_flatten / 255
    test_x = test_x_flatten / 255