test_graphs = list()
test_targets = list()
print('preprocess test molecules ...')
for mole in test_moles:
    test_graphs.append(Graph(structures_groups.get_group(mole), list_atoms))
    test_targets.append(test_gp.get_group(mole))

# In[10]:

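# Sanity check: show the target rows and the structure rows for the first
# validation molecule.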
display(valid_gp.get_group(valid_moles[0]))
display(structures_groups.get_group(valid_moles[0]))

# In[7]:

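# Wrap the per-molecule graphs and target tables into dictionary-style
# datasets for the Chainer iterators.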
train_dataset = DictDataset(graphs=train_graphs, targets=train_targets)
valid_dataset = DictDataset(graphs=valid_graphs, targets=valid_targets)
test_dataset = DictDataset(graphs=test_graphs, targets=test_targets)

# In[8]:


class SchNetUpdateBN(SchNetUpdate):
    def __init__(self, *args, **kwargs):
        super(SchNetUpdateBN, self).__init__(*args, **kwargs)
        with self.init_scope():
            self.bn = GraphBatchNormalization(args[0])

    def __call__(self, h, adj, **kwargs):
        v = self.linear[0](h)
        v = self.cfconv(v, adj)
        # The original cell is truncated after the line above; the rest of the
        # update is an assumed completion that mirrors the parent SchNetUpdate
        # and applies batch normalization before the residual connection.
        v = self.linear[1](v)
        v = chainer.functions.softplus(v)
        v = self.linear[2](v)
        v = self.bn(v)
        return h + v


def run(dir_dataset: Path, batch_size: int, epochs: int, alpha: float,
        seed: int, debug: bool):
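    """Train one seed of the TReNDS model and write a submission.

    Builds DictDatasets from the loading, FNC, network-type and spatial-map
    features, trains an EdgeUpdateNet on GPU 0, and writes
    submit_seed{seed}.csv whenever the validation score reaches a new minimum.
    """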

    tic = time.time()

    logger = getLogger('root')

    np.random.seed(seed)
    random.seed(seed)

    model = EdgeUpdateNet()
    model.to_gpu(device=0)

    train_ids, valid_ids, test_ids = load_dataset(dir_dataset)

    logger.info(f'train_ids: {train_ids[:5]} ... {train_ids[-5:]}')
    logger.info(f'valid_ids: {valid_ids[:5]} ... {valid_ids[-5:]}')
    logger.info(f' test_ids: {test_ids[:5]} ... {test_ids[-5:]}')

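    # Five regression targets per subject; the test targets are unknown, so
    # zeros are used as placeholders.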
    train_scores = pd.read_csv(dir_dataset / 'train_scores.csv')
    train_scores.index = train_scores['Id']

    target_cols = [
        'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
    ]
    train_target = train_scores.loc[train_ids][target_cols].values.astype(
        np.float32)
    valid_target = train_scores.loc[valid_ids][target_cols].values.astype(
        np.float32)
    test_target = np.zeros((len(test_ids), len(target_cols)), dtype=np.float32)

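    # Subject-level loading features; the first column (Id) is dropped.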
    loading = pd.read_csv(dir_dataset / 'loading.csv')
    loading.index = loading['Id']

    loading_train = loading.loc[train_ids].iloc[:, 1:].values.astype(
        np.float32)
    loading_valid = loading.loc[valid_ids].iloc[:, 1:].values.astype(
        np.float32)
    loading_test = loading.loc[test_ids].iloc[:, 1:].values.astype(np.float32)

    fnc_train, fnc_valid, fnc_test = get_fnc(dir_dataset, train_ids, valid_ids,
                                             test_ids, alpha)

    logger.info(f'fnc train: {fnc_train.shape}')
    logger.info(f'fnc valid: {fnc_valid.shape}')
    logger.info(f'fnc  test: {fnc_test.shape}')

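    # One-hot encode the network type of each of the 53 ICN components and
    # tile the resulting matrix once per subject.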
    icn_numbers = pd.read_csv('../../input/ICN_numbers.csv')
    feature = np.zeros((53, len(icn_numbers['net_type'].unique())),
                       dtype=np.float32)
    feature[range(len(feature)), icn_numbers['net_type_code']] = 1.0

    net_type_train = np.tile(np.expand_dims(feature, 0),
                             (len(train_ids), 1, 1))
    net_type_valid = np.tile(np.expand_dims(feature, 0),
                             (len(valid_ids), 1, 1))
    net_type_test = np.tile(np.expand_dims(feature, 0), (len(test_ids), 1, 1))

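    # Pre-computed spatial-map features: train/valid come from
    # load_spatial_map, the test split from a saved numpy array.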
    spatial_map_train, spatial_map_valid = load_spatial_map(
        train_ids, valid_ids)
    spatial_map_test = np.load('../../input/spatial_map_test.npy')

    train_dataset = DictDataset(loading=loading_train,
                                fnc=fnc_train,
                                net_type=net_type_train,
                                spatial_map=spatial_map_train,
                                targets=train_target,
                                Id=train_ids)

    valid_dataset = DictDataset(loading=loading_valid,
                                fnc=fnc_valid,
                                net_type=net_type_valid,
                                spatial_map=spatial_map_valid,
                                targets=valid_target,
                                Id=valid_ids)

    test_dataset = DictDataset(loading=loading_test,
                               fnc=fnc_test,
                               net_type=net_type_test,
                               spatial_map=spatial_map_test,
                               targets=test_target,
                               Id=test_ids)

    train_iter = chainer.iterators.SerialIterator(train_dataset,
                                                  batch_size,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid_dataset,
                                                  batch_size,
                                                  shuffle=False,
                                                  repeat=False)
    test_iter = chainer.iterators.SerialIterator(test_dataset,
                                                 batch_size,
                                                 shuffle=False,
                                                 repeat=False)

    optimizer = optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)

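    # Single-GPU training loop; the learning rate (alpha) decays slowly via
    # ExponentialShift and is logged through observe_value below.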
    updater = training.StandardUpdater(train_iter, optimizer, device=0)
    trainer = training.Trainer(updater, (epochs, 'epoch'), out="result")

    trainer.extend(training.extensions.LogReport(filename=f'seed{seed}.log'))

    trainer.extend(training.extensions.ExponentialShift('alpha', 0.99999))
    trainer.extend(
        training.extensions.observe_value(
            'alpha', lambda tr: tr.updater.get_optimizer('main').alpha))

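    # Extension factory: once the trigger fires (here, after one epoch),
    # chainer.config.train is switched off, which changes the behaviour of
    # layers such as batch normalization and dropout.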
    def stop_train_mode(trigger):
        @make_extension(trigger=trigger)
        def _stop_train_mode(_):
            logger.debug('turn off training mode')
            chainer.config.train = False

        return _stop_train_mode

    trainer.extend(stop_train_mode(trigger=(1, 'epoch')))

    trainer.extend(
        training.extensions.PrintReport(
            ['epoch', 'elapsed_time', 'main/loss', 'valid/main/All', 'alpha']))

    trainer.extend(
        TreNDSEvaluator(iterator=valid_iter,
                        target=model,
                        name='valid',
                        device=0,
                        is_validate=True))

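    # Generate a submission from the test iterator whenever the validation
    # score reaches a new minimum.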
    trainer.extend(TreNDSEvaluator(iterator=test_iter,
                                   target=model,
                                   name='test',
                                   device=0,
                                   is_submit=True,
                                   submission_name=f'submit_seed{seed}.csv'),
                   trigger=triggers.MinValueTrigger('valid/main/All'))

    chainer.config.train = True
    trainer.run()

    trained_result = pd.DataFrame(trainer.get_extension('LogReport').log)
    best_score = np.min(trained_result['valid/main/All'])
    logger.info(f'validation score: {best_score: .4f} (seed: {seed})')

    elapsed_time = time.time() - tic
    logger.info(f'elapsed time: {elapsed_time / 60.0: .1f} [min]')
Example #3
def main():
    #%% Load datasets
    train, valid, test, train_moles, valid_moles, test_moles = load_dataset(
        CTYPE)

    train_gp = train.groupby('molecule_name')
    valid_gp = valid.groupby('molecule_name')
    test_gp = test.groupby('molecule_name')

    #%%
    structures = pd.read_csv(DATA_PATH / 'structures.csv')

    giba_features = pd.read_csv(DATA_PATH / 'unified-features' /
                                'giba_features.csv',
                                index_col=0)
    structures = pd.merge(structures,
                          giba_features.drop(['atom_name', 'x', 'y', 'z'],
                                             axis=1),
                          on=['molecule_name', 'atom_index'])
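    # z-score normalize the per-atom features; zero-variance columns become
    # NaN after the division and are filled with 0.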
    norm_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']
    ]
    structures[norm_col] = (structures[norm_col] - structures[norm_col].mean()
                            ) / structures[norm_col].std()
    structures = structures.fillna(0)
    structures_groups = structures.groupby('molecule_name')

    #%%
    if CTYPE != 'all':
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' /
                                   'kuma_dataset' / 'kuma_dataset' / 'train' /
                                   '{}_full.csv'.format(CTYPE),
                                   index_col=0)
    else:
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' /
                                   'kuma_dataset' / 'kuma_dataset' /
                                   'train_all.csv',
                                   index_col=0)
    train_couple = reduce_mem_usage(train_couple)
    train_couple = train_couple.drop(
        ['id', 'scalar_coupling_constant', 'type'], axis=1)
    if CTYPE != 'all':
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' /
                                  'kuma_dataset' / 'kuma_dataset' / 'test' /
                                  '{}_full.csv'.format(CTYPE),
                                  index_col=0)
    else:
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' /
                                  'kuma_dataset' / 'kuma_dataset' /
                                  'test_all.csv',
                                  index_col=0)
    test_couple = reduce_mem_usage(test_couple)
    test_couple = test_couple.drop(['id', 'type'], axis=1)

    couples = pd.concat([train_couple, test_couple])

    del train_couple, test_couple

    couples_norm_col = [
        col for col in couples.columns if col not in
        ['atom_index_0', 'atom_index_1', 'molecule_name', 'type']
    ]

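    # One-hot encode object-dtype coupling features and z-score normalize the
    # numeric ones.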
    for col in couples_norm_col:
        if couples[col].dtype == np.dtype('O'):
            couples = pd.get_dummies(couples, columns=[col])
        else:
            couples[col] = (couples[col] -
                            couples[col].mean()) / couples[col].std()

    couples = couples.fillna(0)
    couples = couples.replace(np.inf, 0)
    couples = couples.replace(-np.inf, 0)
    couples_groups = couples.groupby('molecule_name')

    #%% Make graphs
    feature_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom']
    ]

    list_atoms = list(set(structures['atom']))
    print('list of atoms')
    print(list_atoms)

    train_graphs = list()
    train_targets = list()
    train_couples = list()
    print('preprocess training molecules ...')
    for mole in tqdm(train_moles):
        train_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        train_targets.append(train_gp.get_group(mole))
        train_couples.append(couples_groups.get_group(mole))

    valid_graphs = list()
    valid_targets = list()
    valid_couples = list()
    print('preprocess validation molecules ...')
    for mole in tqdm(valid_moles):
        valid_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        valid_targets.append(valid_gp.get_group(mole))
        valid_couples.append(couples_groups.get_group(mole))

    test_graphs = list()
    test_targets = list()
    test_couples = list()
    print('preprocess test molecules ...')
    for mole in tqdm(test_moles):
        test_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        test_targets.append(test_gp.get_group(mole))
        test_couples.append(couples_groups.get_group(mole))

    #%% Make datasets
    train_dataset = DictDataset(graphs=train_graphs,
                                targets=train_targets,
                                couples=train_couples)
    valid_dataset = DictDataset(graphs=valid_graphs,
                                targets=valid_targets,
                                couples=valid_couples)
    test_dataset = DictDataset(graphs=test_graphs,
                               targets=test_targets,
                               couples=test_couples)

    #%% Build Model
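    # SchNet graph network with NUM_LAYER update layers, placed on GPU 0.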
    model = SchNet(num_layer=NUM_LAYER)
    model.to_gpu(device=0)

    #%% Sampler
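    # SameSizeSampler batches molecules of equal atom count together so the
    # graphs in a batch can be stacked; use_remainder=True appears to keep the
    # final partial batch for the non-shuffled valid/test splits.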
    train_sampler = SameSizeSampler(structures_groups, train_moles, BATCH_SIZE)
    valid_sampler = SameSizeSampler(structures_groups,
                                    valid_moles,
                                    BATCH_SIZE,
                                    use_remainder=True)
    test_sampler = SameSizeSampler(structures_groups,
                                   test_moles,
                                   BATCH_SIZE,
                                   use_remainder=True)

    #%% Iterator, Optimizer
    train_iter = chainer.iterators.SerialIterator(train_dataset,
                                                  BATCH_SIZE,
                                                  order_sampler=train_sampler)

    valid_iter = chainer.iterators.SerialIterator(valid_dataset,
                                                  BATCH_SIZE,
                                                  repeat=False,
                                                  order_sampler=valid_sampler)

    test_iter = chainer.iterators.SerialIterator(test_dataset,
                                                 BATCH_SIZE,
                                                 repeat=False,
                                                 order_sampler=test_sampler)

    optimizer = optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)

    #%% Updater
    if opt.multi_gpu:
        updater = training.updaters.ParallelUpdater(
            train_iter,
            optimizer,
            # The device of the name 'main' is used as a "master", while others are
            # used as slaves. Names other than 'main' are arbitrary.
            devices={
                'main': 0,
                'sub1': 1,
                'sub2': 2,
                'sub3': 3
            },
        )
    else:
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=coupling_converter,
                                           device=0)

    # early_stopping
    stop_trigger = triggers.EarlyStoppingTrigger(
        patients=EARLY_STOPPING_ROUNDS,
        monitor='valid/main/ALL_LogMAE',
        max_trigger=(EPOCH, 'epoch'))
    trainer = training.Trainer(updater, stop_trigger, out=RESULT_PATH)
    # trainer = training.Trainer(updater, (100, 'epoch'), out=RESULT_PATH)

    #%% Evaluator
    trainer.extend(
        TypeWiseEvaluator(iterator=valid_iter,
                          target=model,
                          converter=coupling_converter,
                          name='valid',
                          device=0,
                          is_validate=True))
    trainer.extend(
        TypeWiseEvaluator(iterator=test_iter,
                          target=model,
                          converter=coupling_converter,
                          name='test',
                          device=0,
                          is_submit=True))

    #%% Other extensions
    trainer.extend(training.extensions.ExponentialShift('alpha', 0.99999))

    trainer.extend(stop_train_mode(trigger=(1, 'epoch')))

    trainer.extend(
        training.extensions.observe_value(
            'alpha', lambda tr: tr.updater.get_optimizer('main').alpha))

    trainer.extend(training.extensions.LogReport(log_name=f'log_{CTYPE}'))
    trainer.extend(
        training.extensions.PrintReport([
            'epoch', 'elapsed_time', 'main/loss', 'valid/main/ALL_LogMAE',
            'alpha'
        ]))

    # trainer.extend(extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(SaveRestore(filename=f'best_epoch_{CTYPE}'),
                   trigger=triggers.MinValueTrigger('valid/main/ALL_LogMAE'))

    #%% Train
    if not opt.test:
        chainer.config.train = True
        trainer.run()
    else:
        chainer.config.train = False
        snapshot_path = f'results/chainer/best_epoch_{CTYPE}'
        chainer.serializers.npz.load_npz(snapshot_path, model,
                                         'updater/model:main/')
        oof = predict_iter(valid_iter, model)
        oof.to_csv(f'schnet_{CTYPE}_oof.csv', index=False)

    #%% Final Evaluation
    chainer.config.train = False
    prediction = predict_iter(test_iter, model)
    prediction.to_csv(f'schnet_{CTYPE}.csv', index=False)
def main():
    #%% Load datasets
    train, valid, test, train_moles, valid_moles, test_moles = load_dataset(CTYPE)

    train_gp = train.groupby('molecule_name')
    valid_gp = valid.groupby('molecule_name')
    test_gp = test.groupby('molecule_name')

    #%%
    structures = pd.read_csv(DATA_PATH/'structures.csv')

    giba_features = pd.read_csv(DATA_PATH/'unified-features'/'giba_features.csv', index_col=0)
    structures = pd.merge(structures,giba_features.drop(['atom_name','x','y','z'],axis=1),on=['molecule_name','atom_index'])
    norm_col = [col for col in structures.columns if col not in ['molecule_name','atom_index','atom','x','y','z']]
    structures[norm_col] = (structures[norm_col]-structures[norm_col].mean())/structures[norm_col].std()
    structures = structures.fillna(0)
    structures_groups = structures.groupby('molecule_name')

    #%%
    if CTYPE != 'all':
        train_couple = pd.read_csv(DATA_PATH/'typewise-dataset'/'kuma_dataset'/'kuma_dataset'/'train'/'{}_full.csv'.format(CTYPE),index_col=0)
    else:
        train_couple = pd.read_csv(DATA_PATH/'typewise-dataset'/'kuma_dataset'/'kuma_dataset'/'train_all.csv',index_col=0)
    train_couple = reduce_mem_usage(train_couple)
    train_couple = train_couple.drop(['id','scalar_coupling_constant','type'],axis=1)
    if CTYPE != 'all':
        test_couple = pd.read_csv(DATA_PATH/'typewise-dataset'/'kuma_dataset'/'kuma_dataset'/'test'/'{}_full.csv'.format(CTYPE),index_col=0)
    else:
        test_couple = pd.read_csv(DATA_PATH/'typewise-dataset'/'kuma_dataset'/'kuma_dataset'/'test_all.csv',index_col=0)
    test_couple = reduce_mem_usage(test_couple)
    test_couple = test_couple.drop(['id','type'],axis=1)

    couples = pd.concat([train_couple, test_couple])

    del train_couple, test_couple

    couples_norm_col = [col for col in couples.columns if col not in ['atom_index_0','atom_index_1','molecule_name','type']]

    for col in couples_norm_col:
        if couples[col].dtype==np.dtype('O'):
            couples = pd.get_dummies(couples,columns=[col])
        else:
            couples[col] = (couples[col]-couples[col].mean())/couples[col].std()

    couples = couples.fillna(0)
    couples = couples.replace(np.inf, 0)
    couples = couples.replace(-np.inf, 0)
    couples_groups = couples.groupby('molecule_name')


    #%% Make graphs
    feature_col = [col for col in structures.columns if col not in ['molecule_name','atom_index','atom']]

    list_atoms = list(set(structures['atom']))
    print('list of atoms')
    print(list_atoms)

    train_graphs = list()
    train_targets = list()
    train_couples = list()
    print('preprocess training molecules ...')
    for mole in tqdm(train_moles):
        train_graphs.append(Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        train_targets.append(train_gp.get_group(mole))
        train_couples.append(couples_groups.get_group(mole))

    valid_graphs = list()
    valid_targets = list()
    valid_couples = list()
    print('preprocess validation molecules ...')
    for mole in tqdm(valid_moles):
        valid_graphs.append(Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        valid_targets.append(valid_gp.get_group(mole))
        valid_couples.append(couples_groups.get_group(mole))

    test_graphs = list()
    test_targets = list()
    test_couples = list()
    print('preprocess test molecules ...')
    for mole in tqdm(test_moles):
        test_graphs.append(Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        test_targets.append(test_gp.get_group(mole))
        test_couples.append(couples_groups.get_group(mole))


    #%% Make datasets
    train_dataset = DictDataset(graphs=train_graphs, targets=train_targets, couples=train_couples)
    valid_dataset = DictDataset(graphs=valid_graphs, targets=valid_targets, couples=valid_couples)
    test_dataset = DictDataset(graphs=test_graphs, targets=test_targets, couples=test_couples)


    #%% Build Model
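    # Same pipeline as the previous example, but with a WeaveNet model in
    # place of SchNet.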
    model = WeaveNet(n_sub_layer=3)
    model.to_gpu(device=0)


    #%% Sampler
    train_sampler = SameSizeSampler(structures_groups, train_moles, BATCH_SIZE)
    valid_sampler = SameSizeSampler(structures_groups, valid_moles, BATCH_SIZE,
                                    use_remainder=True)
    test_sampler = SameSizeSampler(structures_groups, test_moles, BATCH_SIZE,
                                   use_remainder=True)


    #%% Iterator, Optimizer
    train_iter = chainer.iterators.SerialIterator(
        train_dataset, BATCH_SIZE, order_sampler=train_sampler)

    valid_iter = chainer.iterators.SerialIterator(
        valid_dataset, BATCH_SIZE, repeat=False, order_sampler=valid_sampler)

    test_iter = chainer.iterators.SerialIterator(
        test_dataset, BATCH_SIZE, repeat=False, order_sampler=test_sampler)

    #%% Predict
    chainer.config.train = False