def __init__(self, test_query, test_reply):
    """Build the inference pipeline: tokenizer, model, and test DataLoader(s).

    NOTE(review): this method reads a module-level ``opt`` (it is neither a
    parameter nor stored on ``self``), unlike the sibling ``__init__`` methods
    that receive ``opt`` explicitly — confirm the global exists at call time.

    Args:
        test_query: path to a tab-separated query file with columns (id, q1).
        test_reply: path to a tab-separated reply file with columns (id, id_sub, q2).
    """
    # Tokenizer + BERT backbone wrapped by the task-specific model class.
    self.tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
    bert_model = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    self.model = opt.model_class(bert_model, opt).to(opt.device)

    # Load the raw test tables and name their columns.
    read_kwargs = dict(sep='\t', header=None, encoding='utf-8', engine='python')
    query_df = pd.read_csv(test_query, **read_kwargs)
    query_df.columns = ['id', 'q1']
    reply_df = pd.read_csv(test_reply, **read_kwargs)
    reply_df.columns = ['id', 'id_sub', 'q2']
    # Missing replies are filled with a fixed placeholder text.
    reply_df['q2'] = reply_df['q2'].fillna('好的')
    # Left-join replies onto queries on the shared 'id' column.
    merged = query_df.merge(reply_df, how='left')

    # Keep deep copies of the merged frame when pseudo-labelling is on,
    # so later mutations cannot corrupt the originals.
    if opt.add_pseudo_data:
        self.pseudo_groups = merged.loc[:, 'id'].to_numpy()
        self.pseudo_index = np.array(merged.index)
        self.pseudo_data = copy.deepcopy(merged)
    self.submit = copy.deepcopy(reply_df)

    testset = BertSentenceDataset(merged, self.tokenizer, test=True)
    # Dialogue mode needs the custom collate function; otherwise use the default.
    loader_kwargs = dict(dataset=testset, batch_size=opt.eval_batch_size, shuffle=False)
    if opt.dialogue:
        loader_kwargs['collate_fn'] = collate_wrapper
    self.test_dataloader = DataLoader(**loader_kwargs)

    # Optional extra loader with the query/reply columns swapped.
    if opt.datareverse:
        reversed_df = copy.deepcopy(merged[['id', 'q2', 'id_sub', 'q1']])
        testset_reverse = BertSentenceDataset(reversed_df, self.tokenizer, test=True)
        self.test_dataloader_reverse = DataLoader(
            dataset=testset_reverse, batch_size=opt.eval_batch_size, shuffle=False)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """Restore a fine-tuned BERT-based model and prepare the test DataLoader.

    Args:
        opt: namespace carrying model/run configuration (max_length,
             pretrained_bert_name, model_class, model_name, state_dict_path,
             device, dataset_file, polarities_dim, eval_batch_size).

    Side effects:
        Disables autograd globally (inference only) and prints a loading message.
    """
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
    bert_model = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    # Snapshot of the pristine BERT weights, taken before fine-tuned weights load.
    self.pretrained_bert_state_dict = bert_model.state_dict()
    self.model = opt.model_class(bert_model, opt).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    # map_location keeps loading robust when the checkpoint was saved on a
    # different device (e.g. GPU-saved checkpoint restored on a CPU host).
    self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=opt.device))
    # (The model is already on opt.device; load_state_dict copies in place,
    # so no second .to(opt.device) is needed.)
    # Pure inference: turn gradients off globally.
    torch.autograd.set_grad_enabled(False)
    testset = BertSentenceDataset(opt.dataset_file['test'], tokenizer,
                                  target_dim=self.opt.polarities_dim, opt=opt)
    self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.eval_batch_size, shuffle=False)
def __init__(self, opt):
    """Restore an embedding-matrix-based model (non-BERT) for inference.

    Builds (or reloads from cache files under ./embedding/) the tokenizer and
    embedding matrix, instantiates the model, loads fine-tuned weights, moves
    the model to the target device, and disables autograd.

    Args:
        opt: namespace carrying model/run configuration (dataset_file,
             max_length, model_name, dataset, embed_dim, model_class,
             state_dict_path, device).

    Side effects:
        Disables autograd globally and prints a loading message.
    """
    self.opt = opt
    self.tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_length=opt.max_length,
        data_file='./embedding/{0}_{1}_tokenizer.dat'.format(
            opt.model_name, opt.dataset),
    )
    embedding_matrix = build_embedding_matrix(
        vocab=self.tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='./embedding/{0}_{1}d_{2}_embedding_matrix.dat'.format(
            opt.model_name, str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    # map_location avoids deserialization errors when the checkpoint was
    # saved on a different device (e.g. GPU-saved, restored on CPU).
    self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=opt.device))
    self.model = self.model.to(opt.device)
    # Pure inference: turn gradients off globally.
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    """Set up a training run: tokenizer, BERT backbone + model, and the
    train/test DataLoaders.

    Args:
        opt: namespace carrying model/run configuration (max_length,
             pretrained_bert_name, model_class, device, dataset_file,
             polarities_dim, datatype, train_batch_size, eval_batch_size).
    """
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
    bert_model = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    self.model = opt.model_class(bert_model, opt).to(opt.device)

    trainset = BertSentenceDataset(opt.dataset_file['train'], tokenizer,
                                   target_dim=self.opt.polarities_dim, opt=opt)
    testset = BertSentenceDataset(opt.dataset_file['test'], tokenizer,
                                  target_dim=self.opt.polarities_dim, opt=opt)

    # Dialogue-style data needs the custom collate function on both loaders.
    extra = {'collate_fn': collate_wrapper} if opt.datatype == 'diadata' else {}
    self.train_dataloader = DataLoader(dataset=trainset, batch_size=opt.train_batch_size,
                                       shuffle=True, **extra)
    self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.eval_batch_size,
                                      shuffle=False, **extra)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(self.opt.device.index)))
    self._print_args()