def load_from_file(cls, file_path, evaluation_mode=False, LOG=None):
    """
    Rebuild a model from a checkpoint written to disk.

    :param file_path: path of the checkpoint file, read with ``torch.load``
    :param evaluation_mode: when true, encoder and decoder are switched to
        eval mode after their weights are restored
    :param LOG: optional logger; when given, the encoder and the decoder
        are written to it at info level
    :return: the ``Model`` instance with encoder/decoder state loaded
    """
    # Without CUDA, hand torch.load a map_location that returns each storage
    # as it was deserialized instead of moving it to the device it was saved on.
    load_kwargs = {}
    if not torch.cuda.is_available():
        load_kwargs["map_location"] = lambda storage, loc: storage
    save_dict = torch.load(file_path, **load_kwargs)

    # Encoder/decoder hyper-parameters are optional in the checkpoint.
    encoder_params = save_dict.get("encoder_params", {})
    decoder_params = save_dict.get("decoder_params", {})

    # Constructor arguments, evaluated in the same order as before.
    model_kwargs = {
        "vocabulary": save_dict['vocabulary'],
        "tokenizer": save_dict.get('tokenizer', mv.SMILESTokenizer()),
        "encoder_params": encoder_params,
        "decoder_params": decoder_params,
        "max_sequence_length": save_dict['max_sequence_length'],
    }
    model = Model(**model_kwargs)

    # Restore the trained weights of both sub-networks.
    model.network.encoder.load_state_dict(save_dict["encoder"])
    model.network.decoder.load_state_dict(save_dict["decoder"])

    if evaluation_mode:
        for sub_network in (model.network.encoder, model.network.decoder):
            sub_network.eval()

    if LOG:
        for sub_network in (model.network.encoder, model.network.decoder):
            LOG.info(sub_network)

    return model
def validation_stat(self, dataloader, model, loss_compute, device, vocab):
    """
    Compute the validation loss and exact-match accuracy over a dataloader.

    For every batch the loss is obtained with teacher forcing, and the source
    is additionally decoded greedily so the result can be compared with the
    target sequence.

    :param dataloader: iterable of batches; each batch unpacks into seven items
    :param model: model exposing ``forward(src, trg, src_mask, trg_mask)``
    :param loss_compute: callable invoked as ``loss_compute(out, trg_y, ntokens)``
    :param device: device the batch tensors are moved to
    :param vocab: vocabulary used to decode token ids
    :return: tuple ``(loss_epoch, accuracy)`` where ``loss_epoch`` is the
        accumulated loss divided by the number of non-padding target tokens
        and ``accuracy`` is the fraction of samples whose decoded prediction
        equals the decoded target
    """
    padding_value = cfgd.DATA_DEFAULT['padding_value']
    smiles_tokenizer = mv.SMILESTokenizer()

    loss_sum = 0
    token_count = 0
    exact_matches = 0
    sample_count = 0

    def to_text(token_ids):
        # token ids -> tokens -> string
        return smiles_tokenizer.untokenize(vocab.decode(token_ids.cpu().numpy()))

    for batch in ul.progress_bar(dataloader, total=len(dataloader)):
        src, _, trg, src_mask, trg_mask, _, _ = batch

        # Expected output: target without the start token
        trg_y = trg[:, 1:].to(device)
        # Number of target tokens that are not padding
        ntokens = (trg_y != padding_value).data.sum()

        # Move the inputs to the device; decoder input keeps the start token
        # and drops the last position
        src = src.to(device)
        trg = trg[:, :-1].to(device)
        src_mask = src_mask.to(device)
        trg_mask = trg_mask.to(device)

        # Teacher-forced loss
        out = model.forward(src, trg, src_mask, trg_mask)
        loss_sum += loss_compute(out, trg_y, ntokens)
        token_count += ntokens

        # Greedy decoding up to the configured maximum length
        max_length_target = cfgd.DATA_DEFAULT['max_sequence_length']
        smiles = decode(model, src, src_mask, max_length_target, type='greedy')

        # Exact-match comparison, sample by sample
        batch_size = trg.size()[0]
        for row in range(batch_size):
            predicted_ids = smiles[row, :]
            expected = to_text(trg[row])
            predicted = to_text(predicted_ids)
            if predicted == expected:
                exact_matches += 1

        sample_count += batch_size

    accuracy = exact_matches * 1.0 / sample_count
    loss_epoch = loss_sum / token_count
    return loss_epoch, accuracy
def initialize_dataloader(self, data_path, batch_size, vocab, data_type):
    """
    Build a shuffling dataloader from ``<data_path>/<data_type>.csv``.

    :param data_path: directory containing the csv file
    :param batch_size: batch size handed to the DataLoader
    :param vocab: vocabulary passed to the dataset
    :param data_type: file stem of the csv to read (the code appends '.csv')
    :return: a ``torch.utils.data.DataLoader`` over the dataset
    """
    csv_file = os.path.join(data_path, data_type + '.csv')
    frame = pd.read_csv(csv_file, sep=",")

    dataset = md.Dataset(
        data=frame,
        vocabulary=vocab,
        tokenizer=mv.SMILESTokenizer(),
        prediction_mode=False,
    )

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=md.Dataset.collate_fn,
    )
def get_model(self, opt, vocab, device):
    """
    Create a new model or reload the checkpoint preceding the starting epoch.

    :param opt: options object; ``starting_epoch == 1`` means a fresh model is
        built from the architecture options, any other value reloads
        ``checkpoint/model_<starting_epoch - 1>.pt`` under ``self.save_path``
    :param vocab: vocabulary handed to ``Model.make_model``
    :param device: device the encoder and decoder are moved to
    :return: the model, with encoder and decoder on ``device``
    """
    if opt.starting_epoch == 1:
        # Fresh model built from the command-line architecture options
        max_length = cfgd.DATA_DEFAULT['max_sequence_length']
        model = Model.make_model(
            opt.num_layers,
            opt.layer_size,
            opt.cell_type,
            opt.embedding_layer_size,
            opt.dropout,
            opt.bidirectional,
            opt.bidirect_model,
            opt.attn_model,
            max_length,
            vocab,
            mv.SMILESTokenizer(),
            self.LOG,
        )
    else:
        # Resume from the checkpoint written at the end of the previous epoch
        checkpoint_file = os.path.join(
            self.save_path, f'checkpoint/model_{opt.starting_epoch-1}.pt')
        model = Model.load_from_file(checkpoint_file)

    # Put both sub-networks on the requested device
    for sub_network in (model.network.encoder, model.network.decoder):
        sub_network.to(device)

    return model
def __init__(self, opt):
    """
    Set up the output directory, the module-level logger and the vocabulary.

    :param opt: options object providing ``save_directory``,
        ``test_file_name``, ``epoch`` and ``data_path``
    """
    global LOG

    self.save_path = os.path.join(
        'experiments',
        opt.save_directory,
        opt.test_file_name,
        f'evaluation_{opt.epoch}',
    )

    # The logger is stored in the module-level LOG
    log_file = os.path.join(self.save_path, 'generate.log')
    LOG = ul.get_logger(name="generate", log_path=log_file)
    LOG.info(opt)
    LOG.info("Save directory: {}".format(self.save_path))

    # Vocabulary is unpickled from <data_path>/vocab.pkl
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    vocab_file = os.path.join(opt.data_path, 'vocab.pkl')
    with open(vocab_file, "rb") as input_file:
        self.vocab = pkl.load(input_file)

    self.tokenizer = mv.SMILESTokenizer()
def validation_stat(self, dataloader, model, device, vocab):
    """
    Compute the validation loss and exact-match accuracy over a dataloader.

    The encoder and decoder are put in eval mode. For each batch the loss is
    computed without gradient tracking and the model's greedy samples are
    compared with the decoded targets.

    :param dataloader: iterable of batches; each batch unpacks into seven items
    :param model: model exposing ``network``, ``loss_step`` and ``greedy_sample``
    :param device: device the batch tensors are moved to
    :param vocab: vocabulary used to decode token ids
    :return: tuple ``(loss, accuracy)`` where ``loss`` is the summed loss
        divided by the number of non-padding entries of the decoder output
        and ``accuracy`` is the fraction of samples whose decoded prediction
        equals the decoded target
    """
    padding_value = cfgd.DATA_DEFAULT['padding_value']
    loss_sum = 0
    token_count = 0
    exact_matches = 0
    sample_count = 0
    smiles_tokenizer = mv.SMILESTokenizer()

    def to_text(token_ids):
        # token ids -> tokens -> string
        return smiles_tokenizer.untokenize(vocab.decode(token_ids.cpu().numpy()))

    model.network.encoder.eval()
    model.network.decoder.eval()

    for batch in ul.progress_bar(dataloader, total=len(dataloader)):
        encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch

        # Move the batch to the device
        encoder_input = encoder_input.to(device)
        decoder_output = decoder_output.to(device)
        source_length = source_length.to(device)
        mask = torch.squeeze(mask, 1).to(device)

        # Loss and token accounting, without gradient tracking
        with torch.no_grad():
            batch_loss = model.loss_step(encoder_input, source_length, decoder_output,
                                         mask, max_length_target, device)
            ntokens = (decoder_output != padding_value).data.sum()
            token_count += ntokens
            loss_sum += batch_loss.sum()

        # Greedy sampling; only the sequences are used, not the second result
        predicted_seqs, _ = model.greedy_sample(encoder_input, source_length,
                                                decoder_output, mask, device)
        for row, predicted_ids in enumerate(predicted_seqs):
            expected = to_text(decoder_output[row])
            predicted = to_text(predicted_ids)
            if predicted == expected:
                exact_matches += 1

        sample_count += decoder_output.shape[0]

    accuracy = exact_matches * 1.0 / sample_count
    loss = loss_sum / token_count
    return loss, accuracy
# Collect the property-condition tokens: LogD intervals are used as they are,
# every other property's intervals are prefixed with the property name.
property_condition = []
for property_name in cfgd.PROPERTIES:
    if property_name == 'LogD':
        # The LogD entry is a pair; only its first element is used here
        intervals, _ = property_change_encoder[property_name]
        property_condition.extend(intervals)
    else:
        intervals = property_change_encoder[property_name]
        property_condition.extend(
            ["{}_{}".format(property_name, name) for name in intervals])
LOG.info("Property condition tokens: {}".format(len(property_condition)))

# Write the property-encoded data to file
encoded_file = pdp.save_df_property_encoded(
    args.input_data_path, property_change_encoder, LOG)

# Build the vocabulary from the SMILES strings plus the condition tokens
LOG.info("Building vocabulary")
tokenizer = mv.SMILESTokenizer()
smiles_list = pdp.get_smiles_list(args.input_data_path)
vocabulary = mv.create_vocabulary(
    smiles_list, tokenizer=tokenizer, property_condition=property_condition)
tokens = vocabulary.tokens()
LOG.info("Vocabulary contains %d tokens: %s", len(tokens), tokens)

# Pickle the vocabulary to vocab.pkl under the parent directory returned by
# uf.get_parent_dir for the input data path
parent_path = uf.get_parent_dir(args.input_data_path)
output_file = os.path.join(parent_path, 'vocab.pkl')
with open(output_file, 'wb') as pickled_file:
    pickle.dump(vocabulary, pickled_file)
LOG.info("Save vocabulary to file: {}".format(output_file))

# Split the encoded data into train, validation and test sets
train, validation, test = pdp.split_data(encoded_file, LOG)