Example #1
def __init__(self, params):
    self.params = params
    self.postfix = time.strftime('%d_%m_%Y') + '_' + time.strftime(
        '%H:%M:%S')
    self.prj_path = Path(__file__).parent.resolve()
    self.device = torch.device('cpu' if self.params.gpu ==
                               -1 else f'cuda:{params.gpu}')
    if self.params.evaluate:
        self.total_cell, self.num_genes, self.num_classes, self.id2label, self.test_dict, self.map_dict, self.time = load_data(
            params)
    else:
        self.total_cell, self.num_genes, self.num_classes, self.id2label, self.test_dict, self.time = load_data(
            params)
    """
    test_dict = {
        'graph': test_graph_dict,
        'nid': test_index_dict,
        'mask': test_mask_dict
    }
    """
    self.model = GNN(in_feats=params.dense_dim,
                     n_hidden=params.hidden_dim,
                     n_classes=self.num_classes,
                     n_layers=1,
                     gene_num=self.num_genes,
                     activation=F.relu,
                     dropout=params.dropout)
    self.load_model()
    self.num_neighbors = self.total_cell + self.num_genes
    self.model.to(self.device)
Example #2
    def __init__(self, params):
        self.params = params
        self.prj_path = Path(__file__).parent.resolve()
        self.save_path = self.prj_path / 'pretrained' / f'{self.params.species}' / 'models'
        if not self.save_path.exists():
            self.save_path.mkdir(parents=True)
        self.device = torch.device('cpu' if self.params.gpu ==
                                   -1 else f'cuda:{params.gpu}')
        self.num_cells, self.num_genes, self.num_labels, self.graph, self.train_ids, self.test_ids, self.labels = load_data_internal(
            params)
        self.labels = self.labels.to(self.device)
        self.model = GNN(in_feats=self.params.dense_dim,
                         n_hidden=self.params.hidden_dim,
                         n_classes=self.num_labels,
                         n_layers=self.params.n_layers,
                         gene_num=self.num_genes,
                         activation=F.relu,
                         dropout=self.params.dropout).to(self.device)

        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.params.lr,
            weight_decay=self.params.weight_decay)
        self.loss_fn = nn.CrossEntropyLoss(reduction='sum')
        if self.params.num_neighbors == 0:
            self.num_neighbors = self.num_cells + self.num_genes
        else:
            self.num_neighbors = self.params.num_neighbors

        print(
            f"Train Number: {len(self.train_ids)}, Test Number: {len(self.test_ids)}"
        )
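
Note the num_neighbors fallback: when params.num_neighbors is 0, self.num_cells + self.num_genes equals the total number of nodes in the graph, so the sampler's expand_factor covers every neighbor and no subsampling takes place.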
Example #3
# NOTE: GNN, parse_function, layers and class_num are defined elsewhere in this module.
from os import mkdir
from os.path import join, exists

import tensorflow as tf

def main():

  gnn = GNN(7, 96, layers, class_num)
  optimizer = tf.keras.optimizers.Adam(1e-3)
  trainset = tf.data.TFRecordDataset(join('datasets', 'trainset.tfrecord')).repeat(-1).map(parse_function).batch(1).prefetch(tf.data.experimental.AUTOTUNE)
  if not exists('checkpoints'): mkdir('checkpoints')
  checkpoint = tf.train.Checkpoint(model = gnn, optimizer = optimizer)
  checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))
  log = tf.summary.create_file_writer('checkpoints')
  avg_loss = tf.keras.metrics.Mean(name = 'loss', dtype = tf.float32)
  for embeddings, _1_jump_adj, region_types in trainset:
    # embeddings.shape = (1, N, 7), feature vectors of nodes
    # _1_jump_adj.shape = (1, N, N), adjacency matrix
    # region_types.shape = (1, N), class of nodes
    row_sum = tf.math.reduce_sum(_1_jump_adj, axis = -1, keepdims = True)
    _1_jump_adj = _1_jump_adj / row_sum  # row-normalize into a transition matrix
    with tf.GradientTape() as tape:
      features, adjacent = gnn(embeddings)  # features.shape = (1, N, class_num), adjacent.shape = (1, N, N, jumps = 16)
      class_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)(region_types, features)
      def body(i, n_jump_adj, loss):
        loss += tf.keras.losses.MSE(tf.keras.layers.Flatten()(n_jump_adj), tf.keras.layers.Flatten()(adjacent[:,:,:,i]))
        i += 1
        n_jump_adj = tf.linalg.matmul(_1_jump_adj, n_jump_adj)  # n_jump_adj.shape = (1, N, N)
        return i, n_jump_adj, loss
      # loss starts at 0. (a float) so its dtype matches the MSE terms accumulated in body()
      _, _, edge_loss = tf.while_loop(lambda i, n_jump_adj, loss: i < adjacent.shape[-1], body, loop_vars = (1, _1_jump_adj, 0.))
      loss = class_loss + edge_loss
    avg_loss.update_state(loss)
    if tf.equal(optimizer.iterations % 100, 0):
      with log.as_default():
        tf.summary.scalar('loss', avg_loss.result(), step = optimizer.iterations)
      print('Step #%d Loss: %.6f' % (optimizer.iterations, avg_loss.result()))
      if avg_loss.result() < 0.01: break
      avg_loss.reset_states()
    grads = tape.gradient(loss, gnn.trainable_variables)
    optimizer.apply_gradients(zip(grads, gnn.trainable_variables))
    if tf.equal(optimizer.iterations % 2000, 0):
      checkpoint.save(join('checkpoints', 'ckpt'))
  if not exists('model'): mkdir('model')
  gnn.save(join('model', 'gnn.h5'))
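
The edge term supervises the slices of the predicted adjacency tensor against successive powers of the row-normalized 1-jump adjacency, i.e. the k-hop transition matrices, while class_loss handles per-node classification; the two are simply summed into the training loss.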
Example #4
class Trainer:
    def __init__(self, params):
        self.params = params
        self.prj_path = Path(__file__).parent.resolve()
        self.save_path = self.prj_path / 'pretrained' / f'{self.params.species}' / 'models'
        if not self.save_path.exists():
            self.save_path.mkdir(parents=True)
        self.device = torch.device('cpu' if self.params.gpu ==
                                   -1 else f'cuda:{params.gpu}')
        self.num_cells, self.num_genes, self.num_labels, self.graph, self.train_ids, self.test_ids, self.labels = load_data_internal(
            params)
        self.labels = self.labels.to(self.device)
        self.model = GNN(in_feats=self.params.dense_dim,
                         n_hidden=self.params.hidden_dim,
                         n_classes=self.num_labels,
                         n_layers=self.params.n_layers,
                         gene_num=self.num_genes,
                         activation=F.relu,
                         dropout=self.params.dropout).to(self.device)

        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.params.lr,
            weight_decay=self.params.weight_decay)
        self.loss_fn = nn.CrossEntropyLoss(reduction='sum')
        if self.params.num_neighbors == 0:
            self.num_neighbors = self.num_cells + self.num_genes
        else:
            self.num_neighbors = self.params.num_neighbors

        print(
            f"Train Number: {len(self.train_ids)}, Test Number: {len(self.test_ids)}"
        )

    def fit(self):
        max_test_acc, _train_acc, _epoch = 0, 0, 0
        for epoch in range(self.params.n_epochs):
            loss = self.train()
            train_correct, train_unsure = self.evaluate(
                self.train_ids, 'train')
            train_acc = train_correct / len(self.train_ids)
            test_correct, test_unsure = self.evaluate(self.test_ids, 'test')
            test_acc = test_correct / len(self.test_ids)
            if max_test_acc <= test_acc:
                final_test_correct_num = test_correct
                final_test_unsure_num = test_unsure
                _train_acc = train_acc
                _epoch = epoch
                max_test_acc = test_acc
                self.save_model()
            print(
                f">>>>Epoch {epoch:04d}: Train Acc {train_acc:.4f}, Loss {loss / len(self.train_ids):.4f}, Test correct {test_correct}, "
                f"Test unsure {test_unsure}, Test Acc {test_acc:.4f}")
            if train_acc == 1:
                break

        print(
            f"---{self.params.species} {self.params.tissue} Best test result:---"
        )
        print(
            f"Epoch {_epoch:04d}, Train Acc {_train_acc:.4f}, Test Correct Num {final_test_correct_num}, Test Total Num {len(self.test_ids)}, Test Unsure Num {final_test_unsure_num}, Test Acc {final_test_correct_num / len(self.test_ids):.4f}"
        )

    def train(self):
        self.model.train()
        total_loss = 0
        for batch, nf in enumerate(
                NeighborSampler(g=self.graph,
                                batch_size=self.params.batch_size,
                                expand_factor=self.num_neighbors,
                                num_hops=self.params.n_layers,
                                neighbor_type='in',
                                shuffle=True,
                                num_workers=8,
                                seed_nodes=self.train_ids)):
            nf.copy_from_parent(
            )  # Copy node/edge features from the parent graph.
            logits = self.model(nf)
            batch_nids = nf.layer_parent_nid(-1).type(
                torch.long).to(device=self.device)
            loss = self.loss_fn(logits, self.labels[batch_nids])
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        return total_loss

    def evaluate(self, ids, type='test'):
        self.model.eval()
        total_correct, total_unsure = 0, 0
        for nf in NeighborSampler(g=self.graph,
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.num_cells +
                                  self.num_genes,
                                  num_hops=self.params.n_layers,
                                  neighbor_type='in',
                                  shuffle=True,
                                  num_workers=8,
                                  seed_nodes=ids):
            nf.copy_from_parent(
            )  # Copy node/edge features from the parent graph.
            with torch.no_grad():
                logits = self.model(nf).cpu()
            batch_nids = nf.layer_parent_nid(-1).type(torch.long)
            logits = nn.functional.softmax(logits, dim=1).numpy()
            label_list = self.labels.cpu()[batch_nids]
            for pred, label in zip(logits, label_list):
                max_prob = pred.max().item()
                if max_prob < self.params.unsure_rate / self.num_labels:
                    total_unsure += 1
                elif pred.argmax().item() == label:
                    total_correct += 1

        return total_correct, total_unsure

    def save_model(self):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }

        torch.save(
            state,
            self.save_path / f"{self.params.species}-{self.params.tissue}.pt")
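
A minimal driver sketch for the Trainer above. This is hypothetical: the original script builds params with argparse, and every value below is only an assumption about a plausible configuration. With unsure_rate=2.0 and, say, 10 labels, evaluate() counts any prediction whose top softmax probability falls below 2.0 / 10 = 0.2 as unsure.

from argparse import Namespace

# Hypothetical values; only the attribute names are dictated by Trainer.
params = Namespace(
    species='mouse', tissue='brain', gpu=-1,  # -1 selects CPU
    dense_dim=400, hidden_dim=200, n_layers=1, dropout=0.1,
    lr=1e-3, weight_decay=5e-4, batch_size=500,
    num_neighbors=0,  # 0 = sample from all neighbors
    n_epochs=300, unsure_rate=2.0)

Trainer(params).fit()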
Example #5
# Yields indices to split data into training, validation and test sets
idx = np.random.permutation(n)
idx_train = idx[:int(0.6 * n)]
idx_val = idx[int(0.6 * n):int(0.8 * n)]
idx_test = idx[int(0.8 * n):]

# Transform the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features)
y = torch.LongTensor(np.argmax(class_labels, axis=1))
adj = torch.FloatTensor(adj)
idx_train = torch.LongTensor(idx_train)
idx_val = torch.LongTensor(idx_val)
idx_test = torch.LongTensor(idx_test)

# Creates the model and specifies the optimizer
model = GNN(features.shape[1], n_hidden_1, n_hidden_2, n_class, dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output, _ = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], y[idx_train])
    acc_train = accuracy(output[idx_train], y[idx_train])
    loss_train.backward()
    optimizer.step()

    model.eval()
    output, _ = model(features, adj)
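
The snippet is cut off after the second forward pass. A plausible continuation under the usual GCN training recipe, reusing the accuracy helper from above (a sketch, not the original code):

    loss_val = F.nll_loss(output[idx_val], y[idx_val])
    acc_val = accuracy(output[idx_val], y[idx_val])
    print(f'Epoch: {epoch + 1:03d}',
          f'loss_train: {loss_train.item():.4f}',
          f'acc_train: {acc_train.item():.4f}',
          f'loss_val: {loss_val.item():.4f}',
          f'acc_val: {acc_val.item():.4f}',
          f'time: {time.time() - t:.4f}s')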
Example #6
n_hidden_2 = 32
n_hidden_3 = 32
learning_rate = 0.01

# Generates synthetic dataset
Gs, y = create_dataset()
n_class = np.unique(y).size

# Splits the dataset into a training and a test set
G_train, G_test, y_train, y_test = train_test_split(Gs, y, test_size=0.1)

N_train = len(G_train)
N_test = len(G_test)

# Initializes model and optimizer
model = GNN(1, n_hidden_1, n_hidden_2, n_hidden_3, n_class, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

# Trains the model
for epoch in range(epochs):
    t = time.time()
    model.train()
    
    train_loss = 0
    correct = 0
    count = 0
    for i in range(0, N_train, batch_size):
        adj_batch = list()
        idx_batch = list()
        y_batch = list()
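
The loop above is cut off right after the per-batch lists are created. A hypothetical continuation (block_diag is assumed to come from scipy.linalg and G_train to hold networkx graphs; neither appears in the snippet) stacks each mini-batch into one block-diagonal adjacency matrix plus a node-to-graph index vector:

        for j in range(i, min(N_train, i + batch_size)):
            n = G_train[j].number_of_nodes()
            adj_batch.append(nx.to_numpy_array(G_train[j]))
            idx_batch.extend([j - i] * n)  # maps every node to its graph within the batch
            y_batch.append(y_train[j])
        adj = torch.FloatTensor(block_diag(*adj_batch)).to(device)
        idx = torch.LongTensor(idx_batch).to(device)
        y_true = torch.LongTensor(y_batch).to(device)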
Example #7
class Runner:
    def __init__(self, params):
        self.params = params
        self.postfix = time.strftime('%d_%m_%Y') + '_' + time.strftime(
            '%H:%M:%S')
        self.prj_path = Path(__file__).parent.resolve()
        self.device = torch.device('cpu' if self.params.gpu ==
                                   -1 else f'cuda:{params.gpu}')
        if self.params.evaluate:
            self.total_cell, self.num_genes, self.num_classes, self.id2label, self.test_dict, self.map_dict, self.time = load_data(
                params)
        else:
            self.total_cell, self.num_genes, self.num_classes, self.id2label, self.test_dict, self.time = load_data(
                params)
        """
        test_dict = {
            'graph': test_graph_dict,
            'nid': test_index_dict,
            'mask': test_mask_dict
        """
        self.model = GNN(in_feats=params.dense_dim,
                         n_hidden=params.hidden_dim,
                         n_classes=self.num_classes,
                         n_layers=1,
                         gene_num=self.num_genes,
                         activation=F.relu,
                         dropout=params.dropout)
        self.load_model()
        self.num_neighbors = self.total_cell + self.num_genes
        self.model.to(self.device)

    def run(self):
        for num in self.params.test_dataset:
            tic = time.time()
            if self.params.evaluate:
                correct, total, unsure, acc, pred = self.evaluate_test(num)
                print(
                    f"{self.params.species}_{self.params.tissue} #{num} Test Acc: {acc:.4f} ({correct}/{total}), Number of Unsure Cells: {unsure}"
                )
            else:
                pred = self.inference(num)
            toc = time.time()
            print(
                f'{self.params.species}_{self.params.tissue} #{num} Time Consumed: {toc - tic + self.time:.2f} seconds.'
            )
            self.save_pred(num, pred)

    def load_model(self):
        model_path = self.prj_path / 'pretrained' / self.params.species / 'models' / f'{self.params.species}-{self.params.tissue}.pt'
        state = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state['model'])

    def inference(self, num):
        self.model.eval()
        new_logits = torch.zeros(
            (self.test_dict['graph'][num].number_of_nodes(), self.num_classes))
        for nf in NeighborSampler(g=self.test_dict['graph'][num],
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.total_cell +
                                  self.num_genes,
                                  num_hops=1,
                                  neighbor_type='in',
                                  shuffle=False,
                                  num_workers=8,
                                  seed_nodes=self.test_dict['nid'][num]):
            nf.copy_from_parent(
            )  # Copy node/edge features from the parent graph.
            with torch.no_grad():
                logits = self.model(nf).cpu()
            batch_nids = nf.layer_parent_nid(-1).type(torch.long)
            new_logits[batch_nids] = logits

        new_logits = new_logits[self.test_dict['mask'][num]]
        new_logits = nn.functional.softmax(new_logits, dim=1).numpy()
        predict_label = []
        for pred in new_logits:
            pred_label = self.id2label[pred.argmax().item()]
            if pred.max().item() < self.params.unsure_rate / self.num_classes:
                # unsure
                predict_label.append('unsure')
            else:
                predict_label.append(pred_label)
        return predict_label

    def evaluate_test(self, num):
        self.model.eval()
        new_logits = torch.zeros(
            (self.test_dict['graph'][num].number_of_nodes(), self.num_classes))
        for nf in NeighborSampler(g=self.test_dict['graph'][num],
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.total_cell +
                                  self.num_genes,
                                  num_hops=1,
                                  neighbor_type='in',
                                  shuffle=False,
                                  num_workers=8,
                                  seed_nodes=self.test_dict['nid'][num]):
            nf.copy_from_parent(
            )  # Copy node/edge features from the parent graph.
            with torch.no_grad():
                logits = self.model(nf).cpu()
            batch_nids = nf.layer_parent_nid(-1).type(torch.long)
            new_logits[batch_nids] = logits

        new_logits = new_logits[self.test_dict['mask'][num]]
        new_logits = nn.functional.softmax(new_logits, dim=1).numpy()
        total = new_logits.shape[0]
        unsure_num, correct = 0, 0
        predict_label = []
        for pred, t_label in zip(new_logits, self.test_dict['label'][num]):
            pred_label = self.id2label[pred.argmax().item()]
            if pred.max().item() < self.params.unsure_rate / self.num_classes:
                # unsure
                unsure_num += 1
                predict_label.append('unsure')
            else:
                if pred_label in self.map_dict[num][t_label]:
                    correct += 1
                predict_label.append(pred_label)
        return correct, total, unsure_num, correct / total, predict_label

    def save_pred(self, num, pred):
        label_map = pd.read_excel(
            './map/celltype2subtype.xlsx',
            sheet_name=self.params.species,
            header=0,
            names=['species', 'old_type', 'new_type', 'new_subtype'])
        label_map = label_map.fillna('N/A', inplace=False)
        oldtype2newtype = {}
        oldtype2newsubtype = {}
        for _, old_type, new_type, new_subtype in label_map.itertuples(
                index=False):
            oldtype2newtype[old_type] = new_type
            oldtype2newsubtype[old_type] = new_subtype

        save_path = self.prj_path / self.params.save_dir
        if not save_path.exists():
            save_path.mkdir()
        if self.params.evaluate:
            df = pd.DataFrame({
                'index':
                self.test_dict['origin_id'][num],
                'original label':
                self.test_dict['label'][num],
                'cell_type': [oldtype2newtype.get(p, p) for p in pred],
                'cell_subtype': [oldtype2newsubtype.get(p, p) for p in pred]
            })
        else:
            df = pd.DataFrame({
                'index':
                self.test_dict['origin_id'][num],
                'cell_type': [oldtype2newtype.get(p, p) for p in pred],
                'cell_subtype': [oldtype2newsubtype.get(p, p) for p in pred]
            })
        df.to_csv(save_path /
                  (self.params.species + f"_{self.params.tissue}_{num}.csv"),
                  index=False)
        print(
            f"output has been stored in {self.params.species}_{self.params.tissue}_{num}.csv"
        )
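
A hypothetical invocation of the Runner class above (in the original script params comes from argparse; the flags mirror those used by the methods above, e.g. species, tissue, test_dataset, batch_size, unsure_rate and save_dir):

runner = Runner(params)
runner.run()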
Example #8
                                     valid_dataloader,
                                     optm=optm,
                                     learning_rate=learning_rate,
                                     patience=5)
    test_result = TestModel(model, test_dataloader, max_speed)
    StoreData(model_name, train_result, test_result, directory, model,
              random_seed, save_model)

    # GNN-6
    importlib.reload(models)
    from models import GNN
    importlib.reload(utils)
    from utils import TrainModel, TestModel
    model_name = 'GNN6'
    print(model_name)
    model = GNN(A, layer=6, gamma=gamma)
    model, train_result = TrainModel(model,
                                     train_dataloader,
                                     valid_dataloader,
                                     optm=optm,
                                     learning_rate=learning_rate,
                                     patience=5)
    test_result = TestModel(model, test_dataloader, max_speed)
    StoreData(model_name, train_result, test_result, directory, model,
              random_seed, save_model)

    # GNN-8
    importlib.reload(models)
    from models import GNN
    importlib.reload(utils)
    from utils import TrainModel, TestModel
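
The repeated importlib.reload(models) / from models import GNN pairs re-execute models.py and utils.py before each variant is constructed, a notebook-style pattern that ensures code edits take effect between runs; each GNN-k block otherwise differs only in model_name and the layer argument.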
Example #9
def main(_run, _config, _log):
    '''
    _config: dictionary; its keys and values are the variables set in the cfg function
    _run: run object defined by Sacred; it can be used to record hashable values and to get information about a run, e.g. its id
    _log: logger object provided by Sacred; it is not very flexible, so we can define loggers ourselves
    '''

    config = dcopy(
        _config
    )  # We need this copy because Sacred does not allow us to change the _config object,
    # but sometimes we need to add key-value pairs to config
    torch.cuda.set_device(config['gpu_id'])

    save_source(_run)  # Source code is saved by running this line
    init_seed(config['seed'])
    logger = init_logger(log_root=_run.observers[0].dir, file_name='log.txt')

    output_folder_path = opjoin(_run.observers[0].dir,
                                config['path']['output_folder_name'])
    os.makedirs(output_folder_path, exist_ok=True)

    best_acc_list = []
    last_acc_list = []
    train_best_list = []
    train_last_list = []

    best_epoch = []

    data = load_data(config=config)
    split_iterator = range(config['data']['random_split']['num_splits']) \
                     if config['data']['random_split']['use'] \
                    else range(1)

    config['adj'] = data[0]

    for i in split_iterator:
        output_folder = opjoin(output_folder_path, str(i))
        os.makedirs(output_folder, exist_ok=True)

        if config['data']['random_split']['use']:
            data = resplit(
                dataset=config['data']['dataset'],
                data=data,
                full_sup=config['data']['full_sup'],
                num_classes=torch.unique(data[2]).shape[0],
                num_nodes=data[1].shape[0],
                num_per_class=config['data']['label_per_class'],
            )
            print(torch.sum(data[3]))

        model = GNN(config=config)

        if i == 0:
            logger.info(model)

        if config['use_gpu']:
            model.cuda()
            data = [
                each.cuda() if hasattr(each, 'cuda') else each for each in data
            ]

        optimizer = init_optimizer(
            params=model.parameters(),
            optim_type=config['optim']['type'],
            lr=config['optim']['lr'],
            weight_decay=config['optim']['weight_decay'],
            momentum=config['optim']['momemtum'])

        criterion = nn.NLLLoss()

        best_model_path = opjoin(output_folder, 'best_model.pth')
        last_model_path = opjoin(output_folder, 'last_model.pth')
        best_dict_path = opjoin(output_folder, 'best_pred_dict.pkl')
        last_dict_path = opjoin(output_folder, 'last_pred_dict.pkl')
        losses_curve_path = opjoin(output_folder, 'losses.pkl')
        accs_curve_path = opjoin(output_folder, 'accs.pkl')
        best_state_path = opjoin(output_folder, 'best_state.pkl')
        grads_path = opjoin(output_folder, 'grads.pkl')

        best_pred_dict, last_pred_dict, train_losses, train_accs, \
        val_losses, val_accs, best_state, grads, model_state = train(best_model_path,
                                                       last_model_path,
                                                       config,
                                                       criterion,
                                                       data,
                                                       logger,
                                                       model,
                                                       optimizer
                                                       )
        last_model_state, best_model_state = model_state

        losses_dict = {'train': train_losses, 'val': val_losses}

        accs_dict = {'train': train_accs, 'val': val_accs}
        logger.info(f'split_seed: {i: 04d}')
        logger.info(f'Test set results on the last model:')
        last_pred_dict = test(
            criterion,
            data,
            last_model_path,
            last_pred_dict,
            logger,
            model,
            last_model_state,
        )

        logger.info(f'Test set results on the best model:')
        if config['fastmode']:
            best_pred_dict = last_pred_dict
        else:
            best_pred_dict = test(
                criterion,
                data,
                best_model_path,
                best_pred_dict,
                logger,
                model,
                best_model_state,
            )

        logger.info('\n')

        check_before_pkl(best_pred_dict)
        with open(best_dict_path, 'wb') as f:
            pkl.dump(best_pred_dict, f)

        check_before_pkl(last_pred_dict)
        with open(last_dict_path, 'wb') as f:
            pkl.dump(last_pred_dict, f)

        check_before_pkl(losses_dict)
        with open(losses_curve_path, 'wb') as f:
            pkl.dump(losses_dict, f)

        check_before_pkl(accs_dict)
        with open(accs_curve_path, 'wb') as f:
            pkl.dump(accs_dict, f)

        check_before_pkl(best_state)
        with open(best_state_path, 'wb') as f:
            pkl.dump(best_state, f)

        check_before_pkl(grads)
        with open(grads_path, 'wb') as f:
            pkl.dump(grads, f)

        best_acc_list.append(best_pred_dict['test acc'].item())
        last_acc_list.append(last_pred_dict['test acc'].item())
        train_best_list.append(best_state['train acc'].item())
        train_last_list.append(train_accs[-1].item())
        best_epoch.append(best_state['epoch'])

    logger.info('********************* STATISTICS *********************')
    np.set_printoptions(precision=4, suppress=True)
    logger.info(f"\n"
                f"Best test acc: {best_acc_list}\n"
                f"Mean: {np.mean(best_acc_list)}\t"
                f"Std: {np.std(best_acc_list)}\n"
                f"Last test acc: {last_acc_list}\n"
                f"Mean: {np.mean(last_acc_list)}\t"
                f"Std: {np.std(last_acc_list)}\n")

    logger.info(f"best epoch: {best_epoch}")