Example #1
    def read_hasc(self,
                  data_path,
                  n_channels,
                  n_steps,
                  target,
                  types,
                  n_class,
                  issample=True):

        X = np.empty((0, n_steps, n_channels))
        Y = list()
        for index in range(self.process_num):

            store_data_file = "%s/split_data%d.npy" % (data_path, index)
            store_label_file = "%s/split_label%d.npy" % (data_path, index)

            temp_data = np.load(store_data_file)
            X = np.vstack([X, temp_data])
            temp_label = np.load(store_label_file)
            temp_label = self.map2id(temp_label, target, types)
            Y.extend(temp_label)

        Y = np.array(Y)
        if target == 'binary':
            if issample:
                X, Y = sample(X, Y)

        elif target == 'one-class':
            X, Y = one_class_sample(X, Y)

        Y = np.asarray(pd.get_dummies(Y), dtype=np.int8)  # one-hot encode

        logging.info("Labels cover %d classes" % Y.shape[1])

        assert Y.shape[1] == n_class

        return X, Y
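The `sample` and `one_class_sample` helpers called above are not defined in this snippet. A minimal sketch of what `sample(X, Y)` plausibly does in the binary case, assuming it balances the labels by undersampling the majority class (the actual helper may differ; note the later examples use a different `sample` that splits indices into train/validation/test fractions):

import numpy as np

def sample(X, Y):
    # Assumption: balance a binary label set by undersampling the
    # majority class so both labels are equally represented.
    rng = np.random.default_rng(0)
    classes, counts = np.unique(Y, return_counts=True)
    n_min = counts.min()
    keep = np.concatenate([
        rng.choice(np.flatnonzero(Y == c), n_min, replace=False)
        for c in classes
    ])
    rng.shuffle(keep)
    return X[keep], Y[keep]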
Example #2
    def batch_gen(self, batch_size):
        while True:
            if self.record_point == 0:
                self.entity1 = np.zeros(self.record_num, dtype=np.int32)
                self.entity2 = np.zeros(self.record_num, dtype=np.int32)
                self.entity3 = np.zeros(self.record_num, dtype=np.int32)
                self.entity4 = np.zeros(self.record_num, dtype=np.int32)
                self.relation = np.zeros(self.record_num, dtype=np.int32)
                # range() is immutable in Python 3; shuffle a list copy.
                index = list(range(self.record_num))
                random.shuffle(index)
                kneg_ind = 0
                if self.kneg:
                    smp_ind = sample(2 * self.record_num, self.w_cnt)
                for i in index:
                    if self.kneg:
                        j = smp_ind[kneg_ind % (2 * self.record_num)]
                        kneg_ind += 1
                    else:
                        j = random.randint(0, self.entity_num - 1)
                    if self.version == 'bern':
                        pr = self.r_p[self.fb_r[i]]
                    else:
                        pr = 0.5
                    tmp_r = self.fb_r[i]
                    if random.random() < pr:
                        if self.center and random.random() < 0.4:
                            j = j % len(self.r_e_t[tmp_r])
                            j = self.r_e_t[tmp_r][j]
                        elif self.asym and random.random() < 0.05:
                            j = self.fb_h[i]
                        while (self.fb_h[i], self.fb_r[i], j) in self.ok:
                            if self.center and random.random() < 0.4:
                                j = random.randint(0,
                                                   len(self.r_e_t[tmp_r]) - 1)
                                j = self.r_e_t[tmp_r][j]
                            else:
                                if self.kneg:
                                    j = smp_ind[kneg_ind %
                                                (2 * self.record_num)]
                                    kneg_ind += 1
                                else:
                                    j = random.randint(0, self.entity_num - 1)
                        self.entity1[i] = self.fb_h[i]
                        self.entity2[i] = self.fb_l[i]
                        self.entity3[i] = self.fb_h[i]
                        self.entity4[i] = j
                        self.relation[i] = self.fb_r[i]
                    else:
                        if self.center and random.random() < 0.4:
                            j = j % len(self.r_e_h[tmp_r])
                            j = self.r_e_h[tmp_r][j]
                        elif self.asym and random.random() < 0.05:
                            j = self.fb_l[i]
                        while (j, self.fb_r[i], self.fb_l[i]) in self.ok:
                            if self.center and random.random() < 0.4:
                                j = random.randint(0,
                                                   len(self.r_e_h[tmp_r]) - 1)
                                j = self.r_e_h[tmp_r][j]
                            else:
                                if self.kneg:
                                    j = smp_ind[kneg_ind %
                                                (2 * self.record_num)]
                                    kneg_ind += 1
                                else:
                                    j = random.randint(0, self.entity_num - 1)
                        self.entity1[i] = self.fb_h[i]
                        self.entity2[i] = self.fb_l[i]
                        self.entity3[i] = j
                        self.entity4[i] = self.fb_l[i]
                        self.relation[i] = self.fb_r[i]

            end = self.record_point + batch_size
            # Previous wrap-around variant, kept for reference:
            # if end < self.record_num:
            #     ind = range(self.record_point, end)
            #     self.record_point += batch_size
            # else:
            #     ind = range(self.record_point, self.record_num)
            #     ind.extend(range(end - self.record_num))
            #     self.record_point = 0
            ind = range(self.record_point, end)
            self.record_point += batch_size
            # Reset one batch early so the next slice never runs past
            # record_num; any tail records of the epoch are skipped.
            if end + batch_size > self.record_num:
                self.record_point = 0
            entity1 = self.entity1[ind]
            entity2 = self.entity2[ind]
            entity3 = self.entity3[ind]
            entity4 = self.entity4[ind]
            relation = self.relation[ind]

            yield entity1, entity2, entity3, entity4, relation
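For reference, a self-contained sketch of the corruption rule this generator implements: with a per-relation probability the tail of a known triple is replaced (the 'bern' heuristic; a constant 0.5 recovers the 'unif' branch), and resampling repeats until the corrupted triple is absent from the known-triple set. All names below are illustrative, not taken from the original class:

import random

def corrupt(h, r, t, r_p, entity_num, ok):
    """Return one corrupted copy of (h, r, t) that is absent from `ok`."""
    pr = r_p.get(r, 0.5)                 # per-relation tail-corruption prob.
    corrupt_tail = random.random() < pr
    while True:
        j = random.randint(0, entity_num - 1)
        cand = (h, r, j) if corrupt_tail else (j, r, t)
        if cand not in ok:
            return cand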
Example #3
def train_tox21(seed: int = 19700101,
                limit: int = -1,
                use_cuda: bool = True,
                use_tqdm=True,
                force_save=False,
                special_config: dict = None,
                position_encoder_path: str = 'net/pe.pt',
                dataset='TOX21',
                tag='std'):
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    if dataset == 'TOX21':
        smiles, info_list, properties = load_tox21(limit)
        graph_path = TOX21_GRAPH_PATH
    else:
        assert False, "Unknown dataset: {}.".format(dataset)
    n_label = properties.shape[-1]
    is_nan = np.isnan(properties)
    properties[is_nan] = 0.0
    # np.float / np.int were removed from NumPy; use explicit dtypes.
    not_nan = np.logical_not(is_nan).astype(np.float32)
    not_nan_mask = not_nan.astype(np.int64)
    not_nan = torch.tensor(not_nan, dtype=torch.float32)
    properties = torch.tensor(properties, dtype=torch.float32)
    if use_cuda:
        not_nan = not_nan.cuda()
        properties = properties.cuda()
    molecules = [
        HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
        for info in info_list
    ]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)

    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'],
                                                  cfg['VALIDATE_PER'],
                                                  cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask, validate_mask, test_mask)
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))

    if position_encoder_path and os.path.exists(position_encoder_path):
        position_encoder = torch.load(position_encoder_path)
        position_encoder.eval()
    else:
        print('NO POSITION ENCODER IS BEING USED!!!')
        position_encoder = None
    model = AMPNN(n_dim=n_dim,
                  e_dim=e_dim,
                  config=cfg,
                  position_encoder=position_encoder,
                  use_cuda=use_cuda)
    classifier = MLP(cfg['F_DIM'],
                     n_label,
                     h_dims=cfg['MLP_DIMS'],
                     dropout=cfg['DROPOUT'],
                     activation='sigmoid')
    if use_cuda:
        model.cuda()
        classifier.cuda()
    params = list(chain(model.parameters(), classifier.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params, lr=cfg['LR'], weight_decay=cfg['DECAY'])
    current_lr = cfg['LR']
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    loss_func = BCELoss()
    logs = []

    def forward(mask: list,
                name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()

        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name,
                                              use_cuda)

        embeddings, _ = model(nfs, efs, us, vs, mm_tuple, name,
                              [smiles[i] for i in mask])
        std_loss = 0
        logits = classifier(embeddings) * not_nan[mask, :]
        target = properties[mask, :]
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        model.train()
        classifier.train()
        u_losses = []

        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_func(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            loss.backward()
            optimizer.step()
            nonlocal current_lr
            current_lr *= 1 - cfg['DECAY']

        print('\t\tSemi-supervised loss: {:.4f}'.format(np.average(u_losses)))
        logs[-1].update({'on_train_loss': np.average(u_losses)})

    def calc_masked_roc(logits, target, mask) -> float:
        rocs = []
        for i in range(n_label):
            l = logits[:, i]
            t = target[:, i]
            # nnm = not_nan_mask[mask, i]
            # l = l[nnm == 1]
            # t = t[nnm == 1]
            rocs.append(roc_auc_score(t, l))
            # print(l.shape)
        # print(rocs)
        return np.average(rocs)

    def evaluate(mask_list: list, name=None):
        model.eval()
        classifier.eval()
        losses = []
        mask = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_func(logits, target)
            losses.append(loss.cpu().item())

            mask.extend(m)
            logits_list.append(logits.cpu().detach().numpy())
            target_list.append(target.cpu().detach().numpy())

        all_logits = np.vstack(logits_list)
        all_target = np.vstack(target_list)
        # print(all_logits[: 10])
        # print(all_target[: 10])
        roc = calc_masked_roc(all_logits, all_target, mask)
        print('\t\tLoss: {:.3f}'.format(np.average(losses)))
        print('\t\tROC: {:.3f}'.format(roc))
        logs[-1].update({'{}_loss'.format(name): np.average(losses)})
        logs[-1].update({'{}_metric'.format(name): roc})

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
        d = {'metric': 'Multi-ROC', 'logs': logs}
        with open('{}{}.json'.format(LOG_PATH, tag), 'w+',
                  encoding='utf-8') as fp:
            json.dump(d, fp)
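Note that `calc_masked_roc` above leaves the not-nan masking commented out, so the 0.0 values padded into missing Tox21 labels leak into the metric. A sketch of the masked variant, assuming a 0/1 `nan_mask` aligned with the logits (`masked_mean_roc` is a hypothetical helper, not part of the original code):

import numpy as np
from sklearn.metrics import roc_auc_score

def masked_mean_roc(logits, target, nan_mask):
    """Average per-task ROC-AUC, skipping entries flagged as missing."""
    rocs = []
    for i in range(logits.shape[1]):
        keep = nan_mask[:, i] == 1
        # roc_auc_score needs both classes present among the kept entries.
        if keep.any() and len(np.unique(target[keep, i])) == 2:
            rocs.append(roc_auc_score(target[keep, i], logits[keep, i]))
    return float(np.mean(rocs)) if rocs else float('nan')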
Example #4
def train_lipop(seed: int = 19700101,
                limit: int = -1,
                use_cuda: bool = True,
                use_tqdm=True,
                force_save=False,
                special_config: dict = None,
                position_encoder_path: str = 'net/pe.pt',
                tag='std',
                dataset='Lipop'):
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    if dataset == 'FreeSolv':
        smiles, info_list, properties = load_freesolv(limit,
                                                      force_save=force_save)
    elif dataset == 'ESOL':
        smiles, info_list, properties = load_esol(limit, force_save=force_save)
    else:
        smiles, info_list, properties = load_lipop(limit,
                                                   force_save=force_save)
    molecules = [
        HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
        for info in info_list
    ]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)

    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'],
                                                  cfg['VALIDATE_PER'],
                                                  cfg['TEST_PER'])
    # min() can never lift n_seg above zero; use max(1, .) so the
    # extended-slice stride is never zero on tiny datasets.
    n_seg = max(1, int(len(train_mask) / (cfg['BATCH'] + 1)))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = max(1, int(len(validate_mask) / (cfg['BATCH'] + 1)))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = max(1, int(len(test_mask) / (cfg['BATCH'] + 1)))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask[0], validate_mask[0], test_mask[0])
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))

    t_properties = properties[train_mask, :]
    prop_mean = np.mean(t_properties, axis=0)
    print('mean:', prop_mean)
    prop_std = np.std(t_properties.tolist(), axis=0, ddof=1)
    print('std:', prop_std)
    prop_mad = robust.mad(t_properties.tolist(), axis=0)
    print('mad:', prop_mad)
    norm_properties = (properties - prop_mean) / prop_std

    if position_encoder_path and os.path.exists(position_encoder_path):
        position_encoder = torch.load(position_encoder_path)
        position_encoder.eval()
    else:
        print('NO POSITION ENCODER IS BEING USED!!!')
        position_encoder = None
    model = AMPNN(n_dim=n_dim,
                  e_dim=e_dim,
                  config=cfg,
                  position_encoder=position_encoder,
                  use_cuda=use_cuda)
    regression = MLP(cfg['F_DIM'],
                     1,
                     h_dims=cfg['MLP_DIMS'],
                     dropout=cfg['DROPOUT'])
    if use_cuda:
        model.cuda()
        regression.cuda()
    for name, param in chain(model.named_parameters(),
                             regression.named_parameters()):
        if param.requires_grad:
            print(name, ":", param.shape)
    optimizer = optim.Adam(filter(
        lambda x: x.requires_grad,
        chain(model.parameters(), regression.parameters())),
                           lr=cfg['LR'],
                           weight_decay=cfg['DECAY'])
    scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                          step_size=1,
                                          gamma=cfg['GAMMA'])
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    loss_func = MSELoss()
    logs = []

    def forward(mask: list,
                name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()

        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name,
                                              use_cuda)

        embeddings, _ = model(nfs, efs, us, vs, mm_tuple, name,
                              [smiles[i] for i in mask])
        std_loss = 0
        logits = regression(embeddings)
        target = norm_properties[mask, :]
        target = torch.tensor(target.astype(np.float32), dtype=torch.float32)
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        model.train()
        regression.train()
        u_losses = []
        losses = []

        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_func(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            # loss.backward()
            # optimizer.step()
            losses.append(loss)
            if len(losses) >= cfg['PACK'] or i == len(mask_list) - 1:
                (sum(losses) / len(losses)).backward()
                optimizer.step()
                losses.clear()

        u_loss = np.average(u_losses)
        print('\t\tSemi-supervised loss: {:.4f}'.format(u_loss))
        logs[-1].update({'on_train_loss': u_loss})

    def evaluate(mask_list: list, name=None, visualize=None):
        model.eval()
        regression.eval()
        losses = []
        masks = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_func(logits, target)
            losses.append(loss.cpu().item())

            if visualize:
                masks.extend(m)
                logits_list.append(logits.cpu().detach().numpy())
                target_list.append(target.cpu().detach().numpy())

        mse_loss = np.average(losses) * (prop_std[0]**2)
        rmse_loss = np.average([loss**0.5 for loss in losses]) * prop_std[0]
        print('\t\tMSE Loss: {:.3f}'.format(mse_loss))
        print('\t\tRMSE Loss: {:.3f}'.format(rmse_loss))
        logs[-1].update({'{}_loss'.format(name): mse_loss})
        logs[-1].update({'{}_metric'.format(name): rmse_loss})

        if visualize:
            all_logits = np.vstack(logits_list)
            all_target = np.vstack(target_list)
            best_ids, best_ds, worst_ids, worst_ds = \
                plt_multiple_scatter(GRAPH_PATH + visualize, masks, all_logits, all_target)
            print('\t\tBest performance on:')
            for i, d in zip(best_ids, best_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))
            print('\t\tWorst performance on:')
            for i, d in zip(worst_ids, worst_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        # Note: passing `epoch=` to step() is deprecated in recent PyTorch.
        scheduler.step(epoch=epoch)
        print('In iteration {}:'.format(epoch + 1))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(
            train_mask_list,
            name='train',
            # visualize='train_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating validation: ')
        evaluate(
            validate_mask_list,
            name='evaluate',
            # visualize='val_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating test: ')
        evaluate(
            test_mask_list,
            name='test',
            # visualize='test' if epoch + 1 == cfg['ITERATION'] else None
        )
        gc.collect()
        d = {'metric': 'RMSE', 'logs': logs}
        with open('{}{}.json'.format(LOG_PATH, tag), 'w+',
                  encoding='utf-8') as fp:
            json.dump(d, fp)
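The `train` closure above accumulates up to `cfg['PACK']` batch losses and backpropagates their mean in one step, i.e. plain gradient accumulation (at the cost of holding the graphs of all pending batches in memory). The same pattern in isolation, with placeholder model and data arguments:

import torch

def train_packed(model, optimizer, loss_fn, batches, pack=4):
    # Accumulate `pack` batch losses, then take one averaged optimizer step.
    buf = []
    optimizer.zero_grad()
    for i, (x, y) in enumerate(batches):
        buf.append(loss_fn(model(x), y))
        if len(buf) >= pack or i == len(batches) - 1:
            (sum(buf) / len(buf)).backward()
            optimizer.step()
            optimizer.zero_grad()
            buf.clear()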
Example #5
def fit_qm9(seed: int = 19700101, limit: int = -1, use_cuda: bool = True, use_tqdm=True, force_save=False,
            special_config: dict = None, model_save_path: str = 'net/pe.pt', tag='std', mpnn_pos_encode=False,
            use_rdkit=False):
    t0 = time.time()
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    smiles, info_list, _ = load_qm9(limit, force_save=force_save)
    mol_atom_pos = load_mol_atom_pos(limit)

    with open('data/gdb9/incons.json') as fp:
        incons = json.load(fp)
        left = list(set(range(len(smiles))) - set(incons))
        print('{} / {}'.format(len(left), len(smiles)))
        smiles = [smiles[i] for i in left]
        info_list = [info_list[i] for i in left]
        mol_atom_pos = [mol_atom_pos[i] for i in left]

    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em']) for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)

    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'],
                                                  cfg['VALIDATE_PER'],
                                                  cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask[0], validate_mask[0], test_mask[0])
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))

    model = PositionEncoder(n_dim=n_dim,
                            e_dim=e_dim,
                            config=cfg,
                            use_cuda=use_cuda,
                            use_mpnn=mpnn_pos_encode,
                            use_rdkit=use_rdkit)

    if use_cuda:
        model.cuda()
    for name, param in model.named_parameters():
        print(name, ":", param.shape)
    optimizer = optim.Adam(model.parameters(), lr=cfg['LR'], weight_decay=cfg['DECAY'])
    current_lr = cfg['LR']
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    best_val = -1e8
    logs = []
    graph_logs = []

    def visualize(smiles_list, pos: torch.Tensor, fit_pos: torch.Tensor, mol_node_matrix: torch.Tensor, vis=range(5)):
        if use_cuda:
            pos = pos.cpu()
            fit_pos = fit_pos.cpu()
            mol_node_matrix = mol_node_matrix.cpu()
        pos = pos.detach()
        fit_pos = fit_pos.detach()
        pos_list = []
        for i in vis:
            node_mask = mol_node_matrix[i] > 0
            pos_i = pos[node_mask == 1, :]
            fit_pos_i = fit_pos[node_mask == 1, :]
            new_pos_i, new_fit_pos_i = kabsch(pos_i, fit_pos_i,
                                              torch.full([1, pos_i.shape[0]], 1, dtype=torch.float32),
                                              use_cuda=False)
            pos_list.append({'smiles': smiles_list[i], 'src': new_pos_i.tolist(), 'tgt': new_fit_pos_i.tolist()})
            # plt_molecule_3d(new_pos_i.numpy(), smiles_list[i],
            #                 title='fit_qm9_{}_{}_{}'.format(tag, epoch, i), d=GRAPH_PATH)
            # plt_molecule_3d(new_fit_pos_i.numpy(), smiles_list[i],
            #                 title='fit_qm9_origin_{}'.format(i), d=GRAPH_PATH)
        graph_logs[-1].update({'pos': pos_list})

    def forward(mask: list, name=None):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        atom_pos = torch.cat([torch.from_numpy(mol_atom_pos[i]).type(torch.float32) for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
            atom_pos = atom_pos.cuda()

        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name, use_cuda)
        mask_smiles = [smiles[i] for i in mask]

        adj3_loss, dis_loss, rmsd_loss, s_loss, c_loss, pos = model.fit(nfs, efs, mask_smiles, us, vs, mm_tuple,
                                                                          atom_pos, print_mode=name == 'test0')
        if name == 'test0':
            visualize([smiles[i] for i in mask], pos, atom_pos, mm_tuple[0])
        return adj3_loss, dis_loss, rmsd_loss, s_loss, c_loss

    def train(mask_list: list, name=None):
        model.train()
        a_losses = []
        d_losses = []
        r_losses = []
        s_losses = []
        c_losses = []

        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            a_loss, d_loss, r_loss, s_loss, c_loss = forward(m, name=name_)
            # Some losses may come back as plain floats; detach tensors only.
            a_losses.append(a_loss.cpu().item() if torch.is_tensor(a_loss) else a_loss)
            d_losses.append(d_loss.cpu().item() if torch.is_tensor(d_loss) else d_loss)
            r_losses.append(r_loss.cpu().item() if torch.is_tensor(r_loss) else r_loss)
            s_losses.append(s_loss.cpu().item() if torch.is_tensor(s_loss) else s_loss)
            c_losses.append(c_loss.cpu().item() if torch.is_tensor(c_loss) else c_loss)
            # loss = a_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            # loss = d_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            loss = r_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss + cfg['GAMMA_A'] * a_loss
            loss.backward()
            optimizer.step()
            nonlocal current_lr
            current_lr *= 1 - cfg['DECAY']

        print('\t\tADJ3 loss: {:.4f}'.format(np.average(a_losses)))
        print('\t\tDistance loss: {:.4f}'.format(np.average(d_losses)))
        print('\t\tRMSD metric: {:.4f}'.format(np.average(r_losses)))
        print('\t\tStationary loss: {:.4f}'.format(np.average(s_losses)))
        print('\t\tCentrality loss: {:.4f}'.format(np.average(c_losses)))
        logs[-1].update({'on_train_loss': np.average(a_losses)})

    def evaluate(mask_list: list, name=None):
        model.eval()
        losses = []
        a_losses = []
        d_losses = []
        r_losses = []

        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            a_loss, d_loss, r_loss, s_loss, c_loss = forward(m, name=name_)
            # loss = a_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            # loss = d_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            loss = r_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss + cfg['GAMMA_A'] * a_loss
            losses.append(loss.cpu().item() if torch.is_tensor(loss) else loss)
            a_losses.append(a_loss.cpu().item() if torch.is_tensor(a_loss) else a_loss)
            d_losses.append(d_loss.cpu().item() if torch.is_tensor(d_loss) else d_loss)
            r_losses.append(r_loss.cpu().item() if torch.is_tensor(r_loss) else r_loss)

        if name == 'evaluate':
            val = -np.average(losses)
            nonlocal best_val
            if val > best_val:
                print('\t\tSaving position encoder...')
                torch.save(model, model_save_path)
                best_val = val
                print('\t\tSaving finished!')
        print('\t\tLoss: {:.5f}'.format(np.average(losses)))
        print('\t\tADJ3 loss: {:.5f}'.format(np.average(a_losses)))
        print('\t\tDistance loss: {:.5f}'.format(np.average(d_losses)))
        print('\t\tRMSD metric: {:.5f}'.format(np.average(r_losses)))
        logs[-1].update({'{}_loss'.format(name): np.average(losses)})
        logs[-1].update({'{}_adj3_metric'.format(name): np.average(a_losses)})
        logs[-1].update({'{}_distance_metric'.format(name): np.average(d_losses)})
        logs[-1].update({'{}_rmsd_metric'.format(name): np.average(r_losses)})

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        graph_logs.append({'epoch': epoch + 1})
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        if not use_rdkit:
            print('\tTraining: ')
            train(train_mask_list, name='train')
            print('\tEvaluating training: ')
            evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
        d = {'metric': 'Distance Loss', 'time': time.time() - t0, 'logs': logs}
        with open('{}{}.json'.format(LOG_PATH, tag), 'w+', encoding='utf-8') as fp:
            json.dump(d, fp)
        gd = graph_logs
        with open('{}{}.json'.format(GRAPH_PATH, tag), 'w+', encoding='utf-8') as fp:
            json.dump(gd, fp)
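The `kabsch` helper used by `visualize` is not shown. It presumably implements the standard Kabsch algorithm, which finds the rigid rotation minimizing the RMSD between two centered 3-D point sets; a minimal unweighted numpy sketch follows (the version above additionally takes a weight row and a `use_cuda` flag):

import numpy as np

def kabsch_align(P, Q):
    """Rigidly align 3-D points P onto Q; returns centered P (rotated) and Q."""
    Pc = P - P.mean(axis=0)
    Qc = Q - Q.mean(axis=0)
    H = Pc.T @ Qc                           # 3x3 cross-covariance
    U, _, Vt = np.linalg.svd(H)
    d = np.sign(np.linalg.det(Vt.T @ U.T))  # avoid an improper rotation
    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
    return Pc @ R.T, Qc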
Example #6
def train_gdb9(seed: int = 19700101,
               limit: int = -1,
               residual: bool = True,
               use_cuda: bool = False,
               prop: list = None):
    # Avoid a mutable default argument; [9] selects the default property.
    if prop is None:
        prop = [9]
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    molecule_set, properties = load_gdb9(limit)
    properties = properties[:, prop]
    molecules, n_dim, e_dim = encode_molecules(molecule_set)
    hidden_dims = [H_DIM] * len(C_DIMS)

    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  TRAIN_PER, VALIDATE_PER,
                                                  TEST_PER)
    print(train_mask, validate_mask, test_mask)
    t_properties = properties[train_mask, :]
    prop_mean = np.mean(t_properties, axis=0)
    print('mean:', prop_mean)
    prop_std = np.std(t_properties.tolist(), axis=0, ddof=1)
    print('std:', prop_std)
    prop_mad = robust.mad(t_properties.tolist(), axis=0)
    print('mad:', prop_mad)
    ratio = (prop_std / prop_mad)**2
    norm_properties = (properties - prop_mean) / prop_std

    model = AMPNN(n_dim,
                  e_dim,
                  H_DIM,
                  C_DIMS,
                  HE_DIM,
                  HEAD_NUM,
                  len(hidden_dims),
                  residual=residual,
                  use_cuda=use_cuda,
                  dropout=DROPOUT)
    if residual:
        r_dim = (n_dim + sum(hidden_dims))
    else:
        r_dim = hidden_dims[-1]
    regression = MLP(int(r_dim * HEAD_NUM),
                     len(prop), [MLP_HIDDEN],
                     dropout=DROPOUT)
    if use_cuda:
        model.cuda()
        regression.cuda()
    params = list(chain(model.parameters(), regression.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params, lr=LR, weight_decay=DECAY)
    loss_func = MSELoss()
    # forward_time = 0.
    bp_time = 0.

    def forward(mask: list,
                show_attention_cnt=0) -> (torch.Tensor, torch.Tensor, list):
        as_ = []
        embeddings = []
        target = norm_properties[mask, :]
        for i in mask:
            hg = molecules[i]
            if use_cuda:
                embedding, a = model(hg.node_features.cuda(),
                                     hg.edge_features.cuda(), hg.us, hg.vs,
                                     hg.edge_mask, GLOBAL_MASK)

            else:
                embedding, a = model(hg.node_features, hg.edge_features, hg.us,
                                     hg.vs, hg.edge_mask, GLOBAL_MASK)
            embeddings.append(embedding)
            as_.append(a)
            if show_attention_cnt:
                print('### For molecule {} ###'.format(i))
                molecule_set.molecules[i].show()
                if use_cuda:
                    a = a.cpu()
                print(a.detach().numpy())
                show_attention_cnt -= 1

        embeddings = torch.stack(embeddings)
        target = torch.tensor(target.astype(np.float32), dtype=torch.float32)
        if use_cuda:
            embeddings = embeddings.cuda()
            target = target.cuda()
        logits = regression(embeddings)
        return logits, target, as_

    def calc_normalized_loss(logits, target):
        losses = []
        for i in range(len(prop)):
            losses.append(loss_func(logits[:, i], target[:, i]) * ratio[i])
        return sum(losses)

    t_losses = []
    v_losses = []
    t_maes = []
    v_maes = []
    for epoch in range(ITERATION):
        optimizer.zero_grad()
        if len(train_mask) > TRN_BATCH:
            temp_train_mask = np.random.permutation(train_mask)[:TRN_BATCH]
        else:
            temp_train_mask = train_mask
        if len(validate_mask) > VAL_BATCH:
            temp_validate_mask = np.random.permutation(
                validate_mask)[:VAL_BATCH]
        else:
            temp_validate_mask = validate_mask

        # t1 = time.time()
        t_logits, t_target, tas = forward(temp_train_mask)
        v_logits, v_target, vas = forward(temp_validate_mask)
        # forward_time += time.time() - t1

        t_loss = calc_normalized_loss(t_logits, t_target)
        # tas_loss = 0.0 * t_loss.cpu().item() * sum([as_.sum(1).norm() for as_ in tas]) / len(tas)
        # total_loss = t_loss + tas_loss
        v_loss = calc_normalized_loss(v_logits, v_target)
        t_mae = torch.abs(t_logits - t_target).mean(dim=0)
        v_mae = torch.abs(v_logits - v_target).mean(dim=0)
        t_losses.append(t_loss.cpu().item())
        v_losses.append(v_loss.cpu().item())
        t_maes.append(t_mae.cpu().detach().numpy())
        v_maes.append(v_mae.cpu().detach().numpy())
        if (epoch + 1) % EVAL == 0:
            print(
                'In iteration {}, training: {:.3f}; validation: {:.3f}'.format(
                    epoch + 1, np.average(t_losses[-EVAL:]),
                    np.average(v_losses[-EVAL:])))
            print('\tFor training:   {}.'.format(
                np.average(t_maes[-EVAL:], axis=0) * prop_std))
            print('\tFor validation: {}.'.format(
                np.average(v_maes[-EVAL:], axis=0) * prop_std))
            # print('\tBias: {}.'.format(regression.linear1.bias.cpu().detach().numpy()))
            # print(tas_loss.cpu().item())
        t1 = time.time()
        t_loss.backward()
        optimizer.step()
        bp_time += time.time() - t1

    if len(test_mask) > TST_BATCH:
        temp_test_mask = np.random.permutation(test_mask)[:TST_BATCH]
    else:
        temp_test_mask = test_mask
    e_logits, e_target, eas = forward(temp_test_mask, show_attention_cnt=10)
    print(e_logits.cpu().detach().numpy() * prop_std + prop_mean)
    print(e_target.cpu().detach().numpy() * prop_std + prop_mean)
    e_loss = calc_normalized_loss(e_logits, e_target)
    print('target MSE:', e_loss.cpu().item())
    e_mae = torch.abs(e_logits - e_target).mean(dim=0)
    print('target MAE:', e_mae.cpu().detach().numpy() * prop_std)
    # print(forward_time)
    print(bp_time)
    print(model.total_forward_time)
    print(model.layer_forward_time)
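For clarity, the target-normalization round trip used above in isolation: targets are standardized with training-set statistics, the model is fit on the normalized values, and the printed MAEs are rescaled by `prop_std` back into the original units. A toy sketch under the same conventions:

import numpy as np

props = np.random.randn(100, 1) * 5.0 + 2.0      # toy property column
mean = props.mean(axis=0)
std = props.std(axis=0, ddof=1)                  # ddof=1, as in the snippet
norm = (props - mean) / std                      # what the model is fit on
pred = norm + 0.1 * np.random.randn(*norm.shape) # stand-in predictions
mae_real_units = np.abs(pred - norm).mean(axis=0) * std
print(mae_real_units)                            # comparable to the printed MAEs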
Example #7
def train_hiv(seed: int = 19700101,
              limit: int = -1,
              use_cuda: bool = True,
              use_tqdm=True,
              use_model='HamGN',
              dataset='HIV'):
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    if dataset == 'HIV':
        smiles, info_list, properties = load_hiv(limit)
        graph_path = HIV_GRAPH_PATH
        default_config = HIVConfig
    elif dataset == 'BBBP':
        smiles, info_list, properties = load_bbbp(limit)
        graph_path = BBBP_GRAPH_PATH
        default_config = BBBPConfig
    else:
        assert False, "Unknown dataset: {}.".format(dataset)
    n_label = properties.max() + 1
    properties = torch.tensor(properties, dtype=torch.int64)
    if use_cuda:
        properties = properties.cuda()
    molecules = [
        HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
        for info in info_list
    ]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)

    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  default_config.TRAIN_PER,
                                                  default_config.VALIDATE_PER,
                                                  default_config.TEST_PER)
    n_seg = int(len(train_mask) / (default_config.BATCH + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (default_config.BATCH + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (default_config.BATCH + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask, validate_mask, test_mask)
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))

    if use_model == 'HamGN':
        model = DynamicGraphEncoder(n_dim=n_dim,
                                    e_dim=e_dim,
                                    default_config=default_config,
                                    use_cuda=use_cuda)
    elif use_model == 'AMPNN':
        model = AMPNN(n_dim=n_dim,
                      e_dim=e_dim,
                      default_config=default_config,
                      use_cuda=use_cuda)
    else:
        raise ValueError('Undefined model: {}!'.format(use_model))
    classifier = MLP(default_config.F_DIM,
                     n_label,
                     h_dims=default_config.H_DIMS,
                     dropout=default_config.DROPOUT,
                     activation='softmax')
    if use_cuda:
        model.cuda()
        classifier.cuda()
    params = list(chain(model.parameters(), classifier.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params,
                           lr=default_config.LR,
                           weight_decay=default_config.DECAY)
    current_lr = default_config.LR
    loss_func = CrossEntropyLoss()
    # forward_time = 0.
    matrix_mask_dicts = {}
    s_losses = []
    c_losses = []
    a_losses = []
    u_losses = []

    def forward(mask: list,
                name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
        ms = []
        us = []
        vs = []
        em = []
        ptr = 0
        for i, m in enumerate(mask):
            nn = molecules[m].node_features.shape[0]
            ms.extend([i] * nn)
            for u in molecules[m].us:
                us.append(u + ptr)
            for v in molecules[m].vs:
                vs.append(v + ptr)
            em.extend(molecules[m].edge_mask)
            ptr += nn

        if name and name in matrix_mask_dicts:
            mm_tuple = matrix_mask_dicts[name]
        else:
            n_node = nfs.shape[0]
            mol_node_matrix, mol_node_mask = \
                AMPNN.produce_node_edge_matrix(max(ms) + 1, ms, ms, [1] * len(ms))
            node_edge_matrix_global, node_edge_mask_global = \
                AMPNN.produce_node_edge_matrix(n_node, us, vs, [1] * len(us))
            if use_cuda:
                mol_node_matrix = mol_node_matrix.cuda()
                mol_node_mask = mol_node_mask.cuda()
                node_edge_matrix_global = node_edge_matrix_global.cuda()
                node_edge_mask_global = node_edge_mask_global.cuda()
            mm_tuple = (
                mol_node_matrix,
                mol_node_mask,
                node_edge_matrix_global,
                node_edge_mask_global,
            )
            if name and len(matrix_mask_dicts) < default_config.MAX_DICT:
                matrix_mask_dicts[name] = mm_tuple

        if use_model == 'HamGN':
            embeddings, s_loss, c_loss, a_loss = model(nfs, efs, us, vs,
                                                       mm_tuple)
            # if np.random.randint(0, 1000) == 0:
            #     print(embeddings.cpu().detach().numpy())
            s_losses.append(s_loss.cpu().item())
            c_losses.append(c_loss.cpu().item())
            a_losses.append(a_loss.cpu().item())
            std_loss = default_config.GAMMA_S * s_loss + \
                       default_config.GAMMA_C * c_loss + \
                       default_config.GAMMA_A * a_loss
        elif use_model == 'AMPNN':
            embeddings, _ = model(nfs, efs, us, vs, mm_tuple)
            std_loss = 0
        else:
            assert False
        logits = classifier(embeddings)
        # print(logits.cpu())
        target = properties[mask]
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        model.train()
        classifier.train()

        s_losses.clear()
        c_losses.clear()
        a_losses.clear()
        u_losses.clear()

        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_func(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            loss.backward()
            optimizer.step()
            nonlocal current_lr
            current_lr *= 1 - default_config.DECAY

        if use_model == 'HamGN':
            print('\t\tStationary loss: {:.4f}'.format(np.average(s_losses)))
            print('\t\tCentrality loss: {:.4f}'.format(np.average(c_losses)))
            print('\t\tAffinity loss: {:.4f}'.format(np.average(a_losses)))
        print('\t\tSemi-supervised loss: {:.4f}'.format(np.average(u_losses)))

    def evaluate(mask_list: list, name=None, visualize=None):
        model.eval()
        classifier.eval()
        losses = []
        masks = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_func(logits, target)
            losses.append(loss.cpu().item())

            logits_list.append(logits.cpu().detach().numpy())
            target_list.append(target.cpu().detach().numpy())
            if visualize:
                masks.extend(m)

        all_logits = np.vstack(logits_list)
        all_target = np.concatenate(target_list)
        print('\t\tLoss: {:.3f}'.format(np.average(losses)))
        print('\t\tROC: {:.3f}'.format(
            roc_auc_score(all_target, all_logits[:, 1])))

        if visualize:
            best_ids, best_ds, worst_ids, worst_ds = \
                plt_multiple_scatter(graph_path + visualize, masks, all_logits, all_target)
            print('\t\tBest performance on:')
            for i, d in zip(best_ids, best_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))
            print('\t\tWorst performance on:')
            for i, d in zip(worst_ids, worst_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))

    for epoch in range(default_config.ITERATION):
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
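The `ptr` loop inside `forward` performs the usual disjoint-union batching of graphs: each molecule's node indices are offset by the number of nodes already packed, so the per-graph edge lists concatenate into a single one. The same idea in isolation (`batch_edges` is illustrative, not from the original code):

import torch

def batch_edges(graphs):
    """graphs: list of (num_nodes, us, vs) triples. Returns batched edge
    endpoints and the per-node graph-id vector (`ms` in the snippet)."""
    us, vs, gid, ptr = [], [], [], 0
    for g, (n, u, v) in enumerate(graphs):
        us.extend(x + ptr for x in u)
        vs.extend(x + ptr for x in v)
        gid.extend([g] * n)
        ptr += n
    return torch.tensor(us), torch.tensor(vs), torch.tensor(gid)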