Example No. 1
    def train_epoch_cae(model, optimizer, dataset, mode, args, device):
        seed_everything()
        loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
        total_loss = 0
        total_images = 0

        if mode == TrainType.CV_TRAIN:
            model.train()
        else:
            model.eval()

        # iterate over mini-batches (training or evaluation)
        for batch in loader:
            images, _ = batch
            images = images.to(device)
            _, output = model(images)
            loss = F.mse_loss(images, output)
            if mode == TrainType.CV_TRAIN:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # collect statistics
            total_images += len(images)
            total_loss += loss.detach().cpu().item()
        return total_loss, total_images, images, output
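Every example on this page calls seed_everything, whose definition is not shown. Below is a plausible minimal sketch reconstructed only from the call sites in these examples; the exact defaults and the precise meaning of target and local_seed are assumptions.

    import os
    import random

    import numpy as np
    import torch

    def seed_everything(seed=1234, target="all", local_seed=None):
        """Hypothetical reconstruction: make runs reproducible by seeding
        Python, NumPy and (optionally) PyTorch RNGs."""
        if local_seed is not None:
            seed = seed + local_seed  # per-call offset, as in the transform/init examples
        os.environ["PYTHONHASHSEED"] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        if target == "all":
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True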
Example No. 2
 def __call__(self, img):
     img = np.asarray(img)
     img = img / 255  # scale values from [0, 255] into [0, 1]
     seed_everything(local_seed=self.cnt_seed)
     self.cnt_seed += 1
     img = self.transform(image=img)["image"]  # shape changes from (h, w, c) to (c, h, w)
     return img
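The transform(image=img)["image"] pattern matches albumentations.Compose, so this __call__ reads like a method of a transform wrapper built on albumentations. A sketch of the surrounding class follows; the class name, __init__ signature, and pipeline contents are illustrative assumptions, not from the source.

    import albumentations as A
    from albumentations.pytorch import ToTensorV2

    class SeededTransform:  # hypothetical wrapper, not from the source
        def __init__(self, base_seed=0):
            self.cnt_seed = base_seed  # incremented on every call, as shown above
            self.transform = A.Compose([
                A.HorizontalFlip(p=0.5),
                ToTensorV2(),  # (h, w, c) numpy array -> (c, h, w) tensor
            ])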
Example No. 3
 def __initialize_weight(self):
     for idx, m in enumerate(self.modules()):
         seed_everything(local_seed=idx)
         if isinstance(m, (nn.Conv2d, nn.Linear)):
             nn.init.normal_(m.weight.data, 0.0, 0.02)
         elif isinstance(m, nn.BatchNorm2d):
             nn.init.normal_(m.weight.data, 1.0, 0.02)
             # the loop already reseeds per module, and constant_ is
             # deterministic, so a second reseed here would be redundant
             nn.init.constant_(m.bias.data, 0)
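Such a private initializer is typically invoked once at the end of the model's __init__. A sketch of that pattern, where the class name and layers are hypothetical and the method body stands in for the one shown above:

    import torch.nn as nn

    class MyConvNet(nn.Module):  # hypothetical model using the initializer above
        def __init__(self):
            super().__init__()
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, padding=1),
                nn.BatchNorm2d(64),
            )
            self.__initialize_weight()

        def __initialize_weight(self):
            ...  # body as in the example above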
Example No. 4
def main():
    # get args
    args = parse_arguments()

    # make output directory
    out_dir = args.out_dir
    out_dir_path = path.normpath(path.join(getcwd(), out_dir))
    makedirs(out_dir_path, exist_ok=True)
    # save the parameter
    with open(path.join(out_dir_path, 'params.json'), mode="w") as f:
        json.dump(args.__dict__, f, indent=4)

    # setup
    seed_everything(args.seed)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # load mp_ids
    data_dir = path.normpath(path.join(getcwd(), args.data_path))
    mp_ids = np.load(path.join(data_dir, 'mp_ids.npy'))

    # split
    train_ids, test_ids = train_test_split(mp_ids, test_size=args.test_ratio)
    # setup data loader
    train_dataset = CellImageDataset(train_ids, data_dir)
    test_dataset = CellImageDataset(test_ids, data_dir)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
    loaders = {'train': train_loader, 'valid': valid_loader}

    model = CellAutoEncoder(z_size=args.z_size)
    model = model.to(device)
    optimizer = Adam(model.parameters(), lr=args.learning_rate)
    criterion = Reconstruction()
    scheduler = None

    # runner
    runner = AutoEncoderRunner(device=device)
    # model training
    runner.train(model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler,
                 loaders=loaders, logdir=args.out_dir, num_epochs=args.epochs)

    # encoding
    all_dataset = CellImageDataset(mp_ids, data_dir, encode=True)
    all_loader = DataLoader(all_dataset, batch_size=args.batch_size, shuffle=False)
    resume = path.join(args.out_dir, 'best_model.pth')
    preds, image_names = runner.predict_loader(model, all_loader, resume)

    # save encoded cell image
    out_encode_dir_path = path.join(data_dir, 'cell_image_encode')
    makedirs(out_encode_dir_path, exist_ok=True)
    for pred, image_name in zip(preds, image_names):
        encode_name = '{}.npy'.format(image_name)
        save_path = path.join(out_encode_dir_path, encode_name)
        np.save(save_path, pred)
Example No. 5
def main():
    # get args
    args = parse_arguments()

    # make output directory
    out_dir = args.out_dir
    out_dir_path = path.normpath(path.join(getcwd(), out_dir))
    makedirs(out_dir_path, exist_ok=True)
    # save the parameter
    with open(path.join(out_dir_path, 'params.json'), mode="w") as f:
        json.dump(args.__dict__, f, indent=4)

    # setup
    seed_everything(args.seed)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # load mp_ids
    data_dir = path.normpath(path.join(getcwd(), args.data_path))
    raw_data_dir = path.normpath(path.join(getcwd(), args.raw_data_path))
    mp_ids = np.load(path.join(data_dir, 'mp_ids.npy'))

    # split
    train_ids, test_ids = train_test_split(mp_ids, test_size=args.test_ratio)
    # setup data loader
    train_dataset = MaterialsGeneratorDataset(train_ids, data_dir,
                                              raw_data_dir)
    test_dataset = MaterialsGeneratorDataset(test_ids, data_dir, raw_data_dir)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    valid_loader = DataLoader(test_dataset,
                              batch_size=args.batch_size,
                              shuffle=False)
    loaders = {'train': train_loader, 'valid': valid_loader}

    model = MaterialGenerator(z_size=args.z_size)
    model = model.to(device)
    optimizer = Adam(model.parameters(), lr=args.learning_rate)
    criterion = VAELoss(coef_kl=args.reg_kl, coef_classify=args.reg_classify)
    scheduler = None

    # runner
    runner = MaterialsGeneratorRunner(device=device)
    # model training
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=args.out_dir,
                 num_epochs=args.epochs)
Example No. 6
    def _train_epoch(self, cur_fold, cur_epoch, num_folds, model, optimizer,
                     dataset, mode: TrainType, es=None):
        """
        Train CNN for a single epoch.

        model: instance of CNN
        dataset: instance of a subclass of AbstractCIFAR10
        mode: TrainType.CV_TRAIN or TrainType.CV_VALID
        es: instance of EarlyStopping
        """
        seed_everything()
        loader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        total_loss = 0
        preds = []
        gt_labels = []
        if mode == TrainType.CV_TRAIN:
            model.train()
        else:
            model.eval()

        # iterate over mini-batches (training or evaluation)
        for batch in loader:
            images, labels = batch
            images, labels = images.to(self.device), labels.long().to(self.device)
            output = model(images)  # shape: (data_num, class_num)
            output = F.log_softmax(output, dim=1)
            loss = F.nll_loss(output, labels)
            if mode == TrainType.CV_TRAIN:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # collect statistics
            total_loss += loss.detach().cpu().item()
            _, predicted = torch.max(output.detach().cpu(), 1)
            preds.extend(predicted.tolist())
            gt_labels.extend(labels.detach().cpu().tolist())

        if mode == TrainType.CV_VALID:
            # logging statistics
            mean_loss, stats = self.stat_collector.calc_stat_cnn(total_loss, np.array(preds), np.array(gt_labels))
            self.stat_collector.logging_stat_cnn(mode=mode.value, cur_fold=cur_fold, cur_epoch=cur_epoch,
                                                 mean_loss=mean_loss, stats=stats, num_folds=num_folds)
            # record score for early stopping
            es.set_stop_flg(mean_loss, stats["accuracy"])
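The EarlyStopping instance is only exercised here through es.set_stop_flg(mean_loss, accuracy). A minimal sketch of such a class, inferred from that call site alone; the patience logic and attribute names are assumptions:

    class EarlyStopping:
        def __init__(self, patience=5):
            self.patience = patience
            self.best_loss = float("inf")
            self.counter = 0
            self.stop_flg = False

        def set_stop_flg(self, mean_loss, accuracy):
            # track the best validation loss; raise the stop flag after
            # `patience` epochs without improvement (accuracy is passed
            # by the caller but unused in this sketch)
            if mean_loss < self.best_loss:
                self.best_loss = mean_loss
                self.counter = 0
            else:
                self.counter += 1
                self.stop_flg = self.counter >= self.patience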
Example No. 7
    def __get_idx_folds(self):
        """ Split data into n-patterns of training and validation data in the stratified manner """
        train_idx_list = []
        valid_idx_list = []
        seed_everything(target="random")
        skf = StratifiedKFold(n_splits=self.args.num_folds, shuffle=True)
        for train_idx, valid_idx in skf.split(self.data, self.targets):
            train_idx_list.append(train_idx)
            valid_idx_list.append(valid_idx)

        # assert that no indices are shared between the train and valid splits
        for t_idx, v_idx in zip(train_idx_list, valid_idx_list):
            res = np.intersect1d(np.array(t_idx), np.array(v_idx))
            assert len(res) == 0, \
                f"Images are shared between train and valid dataset: {res}."

        return train_idx_list, valid_idx_list
Example No. 8
def main():
    # get args
    args = parse_arguments()

    # make output directory
    out_dir = args.out_dir
    out_dir_path = path.normpath(path.join(getcwd(), out_dir))
    makedirs(out_dir_path, exist_ok=True)

    # load raw dataset
    csv_path = path.normpath(path.join(getcwd(), args.csv_path))
    table_data = pd.read_csv(csv_path, index_col=False)
    structure_path = path.normpath(path.join(getcwd(), args.structure_path))
    structure_data = h5py.File(structure_path, "r")

    # loop
    mp_ids = []
    for mp_id, formula in tqdm(
            zip(table_data['material_id'], table_data['pretty_formula'])):
        if isinstance(formula, str):
            crystal = Structure.from_str(structure_data[mp_id].value, args.fmt)
            n_sites = len(crystal.sites)
            max_lattice_length = max(crystal.lattice.lengths)
            n_elements = len(Composition(formula).elements)
            # https://github.com/kaist-amsg/imatgen/issues/2 + up to 5 elements
            if n_sites <= 20 and max_lattice_length <= 10 and n_elements <= 5:
                mp_ids += [mp_id]

    # save
    print('Candidate MP ids : ', len(mp_ids))
    mp_ids = np.array(mp_ids)
    if len(mp_ids) > args.size:
        seed_everything(seed=1234)
        index = random.sample(range(len(mp_ids)), args.size)
        mp_ids = mp_ids[index]
    save_path = path.join(out_dir_path, 'mp_ids.npy')
    np.save(save_path, mp_ids)
    print('Selected MP ids : ', len(mp_ids))

    return True
Example No. 9
def main():
    # get args
    args = parse_arguments()

    # make output directory
    out_dir = args.out_dir
    out_dir_path = path.normpath(path.join(getcwd(), out_dir))
    makedirs(out_dir_path, exist_ok=True)
    # save the parameter
    with open(path.join(out_dir_path, 'params.json'), mode="w") as f:
        json.dump(args.__dict__, f, indent=4)

    # setup
    seed_everything(args.seed)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    materials_generator_path = path.normpath(
        path.join(getcwd(), args.materials_generator_path))
    cell_ae_path = path.normpath(path.join(getcwd(), args.cell_ae_path))
    basis_ae_path = path.normpath(path.join(getcwd(), args.basis_ae_path))

    # sampling: draw latent vectors from a standard normal distribution
    # (only 'random' sampling is implemented; any other --sampling value
    # falls back to the same normal sampling)
    size = 500 * args.sampling_size
    sampling = np.random.normal(size=size).reshape((args.sampling_size, 500))

    # create dataset
    dataset = TensorDataset(torch.FloatTensor(sampling))
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    # generate
    structure_generator = StructureGenerator(device, args.cell_z_size,
                                             args.basis_z_size, args.z_size)
    structure_generator.load_pretrained_weight(cell_ae_path, basis_ae_path,
                                               materials_generator_path)
    structure_generator.generate(loader, out_dir_path)
Example No. 10
    def __train_epoch_cae_cnn(model, optimizer, dataset, mode, args, device):
        """
        Train CAE and CNN for a single epoch.
        model: instance of Classifier
        """
        seed_everything()
        loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
        total_loss_cnn = 0
        total_loss_cae = 0
        preds = []
        gt_labels = []
        if mode == TrainType.CV_TRAIN:
            model.train()
        else:
            model.eval()

        # iterate over mini-batches (training or evaluation)
        for batch in loader:
            images, labels = batch
            images, labels = images.to(device), labels.long().to(device)
            op_cae, op_cnn = model(images)

            # loss
            loss_cae = F.mse_loss(images, op_cae)
            op_cnn = F.log_softmax(op_cnn, dim=1)
            loss_cnn = F.nll_loss(op_cnn, labels)
            loss = loss_cae + loss_cnn

            if mode == TrainType.CV_TRAIN:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # collect statistics
            total_loss_cnn += loss_cnn.detach().cpu().item()
            _, predicted = torch.max(op_cnn.detach().cpu(), 1)
            preds.extend(predicted.tolist())
            gt_labels.extend(labels.detach().cpu().tolist())
            total_loss_cae += loss_cae.detach().cpu().item()
        return total_loss_cae, total_loss_cnn, preds, gt_labels
Example No. 11
    def __regulate_data_num(self, reg_map: dict) -> "tuple[np.ndarray, list]":
        """
        Regulate the number of images kept per class.
        reg_map: dict
            {class name to regulate: number of images to keep, ...};
            the special key "all" applies one cap to every class.
        """
        keep_idx = []
        for idx, (class_nm, class_no) in enumerate(self.class_to_idx.items()):
            idx_array = np.where(class_no == np.array(self.targets))[0]
            if self.args.is_local == 1:
                # cap each class at 250 images when running locally during development
                seed_everything(target="random")
                idx_array = np.random.choice(idx_array, 250, replace=False)
            elif "all" in reg_map.keys():
                seed_everything(target="random", local_seed=idx)
                idx_array = np.random.choice(idx_array,
                                             reg_map["all"],
                                             replace=False)
            elif class_nm in reg_map.keys():
                seed_everything(target="random", local_seed=idx)
                idx_array = np.random.choice(idx_array,
                                             reg_map[class_nm],
                                             replace=False)
            keep_idx.extend(list(idx_array))

        data = self.data[keep_idx, :, :, :]  # shape is (batch, h, w, ch)
        targets = list(np.array(self.targets)[keep_idx])
        return data, targets
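As the docstring describes, reg_map maps class names to target counts, with "all" acting as a wildcard for every class. For example (illustrative values):

    reg_map = {"airplane": 500, "cat": 500}  # cap only these classes at 500 images
    reg_map = {"all": 1000}                  # or cap every class at 1000 images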