def __init__(self, config, n_gpu, vocab, train_loader=None, val_loader=None):
    """Set up the model, optimizer and loss criterion for training.

    Args:
        config: experiment configuration (lr, n_label, ...).
        n_gpu: number of GPUs; > 1 wraps the model in DataParallel.
        vocab: vocabulary object exposing vocab_size().
        train_loader: optional DataLoader for training batches.
        val_loader: optional DataLoader for validation batches.
    """
    self.config = config
    self.vocab = vocab
    self.n_gpu = n_gpu
    self.train_loader = train_loader
    self.val_loader = val_loader

    # Model: text CNN sized to the vocabulary and the label set.
    self.model = CNN_Text(self.config, self.vocab.vocab_size(),
                          self.config.n_label)
    self.model.to(device)
    if self.n_gpu > 1:
        self.model = nn.DataParallel(self.model)

    # Adam with a small L2 weight penalty.
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.config.lr,
                                weight_decay=0.0005)

    # Multi-class classification loss.
    self.criterion = nn.CrossEntropyLoss()
def main_train():
    """Train a CNN_Text classifier on the MR sentiment dataset via torchtext.

    Relies on module-level globals: ``data_path``, ``batch_size``,
    ``channel_dim``, ``embed_dim``, ``output_dim``, ``kernel_sizes``,
    ``dropout_rate``, ``lr``, ``weight_decay``, ``epochs`` and ``device``.
    Side effect: pickles the fitted TEXT field to ``text.field``.
    """

    def clean_str(string):
        """Normalize a raw sentence (Yoon Kim-style CNN preprocessing).

        The paren/question replacements are raw strings so the literal
        backslash the original code inserted (re.sub leaves unknown
        punctuation escapes like ``\\(`` alone in the template) is kept
        byte-for-byte, without Python 3.12's invalid-escape SyntaxWarning.
        The apostrophe templates (e.g. `" \'s"`) must NOT be raw-stringed:
        ``\'`` is a valid Python escape that collapses to a plain quote.
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", r" \( ", string)
        string = re.sub(r"\)", r" \) ", string)
        string = re.sub(r"\?", r" \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string

    # Fields: tokenized, lower-cased text; labels are raw integers.
    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    TEXT.preprocessing = data.Pipeline(clean_str)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

    trainset, valset = MR.splits(data_path,
                                 fields=[("text", TEXT), ("label", LABEL)])
    TEXT.build_vocab(trainset)

    # Persist the fitted field so inference can reuse the same vocabulary.
    with open("text.field", 'wb') as f:
        dill.dump(TEXT, f)

    trainiter = data.BucketIterator(trainset,
                                    batch_size=batch_size,
                                    sort_key=lambda x: len(x.text),
                                    shuffle=True,
                                    device=device)
    valiter = data.BucketIterator(valset,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.text),
                                  shuffle=True,
                                  device=device)

    model = CNN_Text(channel_dim, len(TEXT.vocab), embed_dim, output_dim,
                     kernel_sizes, is_static=False, dropout_rate=dropout_rate)
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr,
                                 weight_decay=weight_decay)

    train_model(epochs, model, trainiter, valiter, optimizer, criterion)
def main(args):
    """Train and evaluate the multi-view CNN text model.

    Args:
        args: parsed CLI namespace; uses ``args.gpu`` (CUDA device index),
            ``args.dir`` (raw-data directory) and ``args.model`` (optional
            path to a pre-trained checkpoint).
    """
    # Device configuration
    device = torch.device(
        'cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')

    # Training hyper-parameters.
    num_epochs = 80
    num_classes = 8
    learning_rate = 0.08
    num_views = 3
    num_layers = 4
    data_path = args.dir

    # Cached pre-processed arrays; regenerate only if any file is missing.
    file_list = [
        './data/train_web_content.npy', './data/train_web_links.npy',
        './data/train_web_title.npy', './data/test_web_content.npy',
        './data/test_web_links.npy', './data/test_web_title.npy',
        './data/train_label.npy', './data/test_label.npy'
    ]
    if not all(map(os.path.exists, file_list)):
        print(
            'Raw data has not been pre-processed! Start pre-processing the raw data.'
        )
        data_loader.preprocess(data_path)
    else:
        print('Loading the existing data set...')

    # Was hard-coded to 8; use num_classes (same value) for consistency.
    train_dataset = data_loader.Load_datasets('train', num_classes)
    train_loader = DataLoader(train_dataset,
                              batch_size=32,
                              shuffle=True,
                              num_workers=4)

    input_dims = np.array(train_dataset.data[0]).shape
    model = CNN_Text(input_dims, [64, 32, 32, 32], [1, 2, 3, 4], num_classes,
                     0.5, num_layers, num_views).to(device)
    model = model.double()
    model.device = device
    model.learning_rate = learning_rate
    model.epoch = 0

    # BUGFIX: was `args.mpodel`, which raised AttributeError whenever a
    # checkpoint path was actually supplied.
    if args.model is not None:
        model.load_state_dict(torch.load(args.model))
        print('Successfully load pre-trained model!')

    # train the model until the model is fully trained
    train_model(model, train_loader, num_epochs)
    print('Finish training process!')
    evaluation(model)
def build_model(args):
    """Build (tokenizer, model) according to ``args.clf_model``.

    Supported values: "cnn" (CNN_Text with a DistilBERT tokenizer),
    "robert" (RoBERTa), "bert" (BERT), anything else falls back to
    DistilBERT. The model is moved to ``args.device`` before returning.
    """
    clf = args.clf_model.lower()
    tok_kwargs = dict(do_lower_case=args.do_lower_case)
    cfg_kwargs = dict(num_labels=args.num_labels,
                      finetuning_task=args.task_name)

    if clf == "cnn":
        # Tokenizer is only borrowed for easy text tokenization.
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, **tok_kwargs)
        model = CNN_Text(args)
    elif clf == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, **tok_kwargs)
        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               **cfg_kwargs)
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        if args.freeze:
            # Freeze the transformer body; only the head stays trainable.
            for param_name, param in model.named_parameters():
                if "bert" in param_name:
                    param.requires_grad = False
    elif clf == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, **tok_kwargs)
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            **cfg_kwargs)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, **tok_kwargs)
        config = DistilBertConfig.from_pretrained(args.model_name_or_path,
                                                  **cfg_kwargs)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)

    model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
# Script-level setup: extend the token list with special symbols and build
# the word -> id lookup used by the embedding layer.
tokens.append('oov')  # out-of-vocabulary placeholder
tokens.append('bos')  # beginning-of-sentence marker
id = 0  # NOTE(review): shadows the builtin `id`
word2id = {}
for word in tokens:
    word2id[word] = id
    id += 1

# Derive model hyper-parameters from the data / CLI arguments.
args.embed_num = len(tokens)
args.class_num = 2  # binary classification
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
#print("\nParameters:")
#for attr, value in sorted(args.__dict__.items()):
#    print("\t{}={}".format(attr.upper(), value))

model = CNN_Text(args)
if torch.cuda.is_available():
    model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
report_interval = 5000

# Training loop; per-epoch counters reset here.
# NOTE(review): the loop body continues beyond this chunk of the file.
for epoch in range(1, args.epochs + 1):
    train_batch_i = 0
    batch_counter = 0
    accumulated_loss = 0
    train_sents_scaned = 0
    train_num_correct = 0
    model.train()
    print('--' * 20)
    start_time = time.time()
# Rebuild the torchtext fields the same way as during training so the saved
# snapshot's vocabulary indices line up at inference time.
text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_data, dev_data = MR.splits(text_field, label_field)
text_field.build_vocab(train_data, dev_data)
label_field.build_vocab(train_data, dev_data)

# Hyper-parameters must match the checkpoint saved at snapshot/best.pt.
args = Args()
args.dropout = 0.5
args.max_norm = 3.0
args.embed_dim = 128
args.kernel_num = 100
args.kernel_sizes = '3,4,5'
args.static = False
args.snapshot = 'snapshot/best.pt'
args.embed_num = len(text_field.vocab)
# -1 drops the implicit <unk> entry torchtext adds to the label vocab.
args.class_num = len(label_field.vocab) - 1
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

# Restore the trained classifier on CPU first, then move to the target device.
model = CNN_Text(args)
model.load_state_dict(torch.load(args.snapshot, map_location='cpu'))
model = model.to(device)


@app.route('/cls/<text>')
def classify_text(text):
    # HTTP endpoint: classify `text` and return the predicted label string.
    # (The function name doubles as the flask endpoint name.)
    app.logger.warning(text)
    result, conf = predict(text, model, text_field, label_field, device)
    app.logger.warning(conf)
    return result
#use CUDA to speed up use_cuda = torch.cuda.is_available() #get data train_loader = Data.DataLoader(dataset=CustomDataset(path="train.json", balance=False), batch_size=BATCH_SIZE, shuffle=True) test_loader = Data.DataLoader(dataset=CustomDataset(path="test.json", balance=False), batch_size=BATCH_SIZE, shuffle=True) #initialize model cnn = CNN_Text() if use_cuda: cnn = cnn.cuda() optimizer = torch.optim.Adam(cnn.parameters(), lr=LR, weight_decay=0.0005) #train for epoch in range(EPOCH): print("epoch :") if epoch % 5 == 0: test(cnn, test_loader, use_cuda) for step, data in enumerate(train_loader): vec, lens, label = data #print(vec.shape) if use_cuda: vec = vec.cuda() label = label.cuda()
def train(self, m_2, m_3, m_4):
    """Train the CNN text classifier.

    Args:
        m_2: training instances (also used to build the vocabularies).
        m_3: development instances (periodic evaluation).
        m_4: test instances (indexed but not used in this loop).
    """
    word_dict, label_dict = self.divide_two_dict(m_2)
    if self.hyperparameter_1.word_embedding:
        # Load pre-trained GloVe vectors for in-vocabulary words and fill
        # the remaining entries with the average vector.
        path = "word2vec/glove.6B.100d.txt"
        print("loading word2vec ")
        word_vecs = self.load_my_vector(path, word_dict.m_list)
        print("new words already in word2vec:" + str(len(word_vecs)))
        print("loading unknow word2vec and convert to list... ")
        word_vecs = self.add_unknow_words_by_average(
            word_vecs, word_dict.m_list, k=self.hyperparameter_1.embed_dim)
        print("unknown word2vec load ! and converted to list...")
        self.hyperparameter_1.pretrained_weight = word_vecs

    # Index every split with the training vocabulary, then shuffle.
    train_example = self.out_example_index(m_2, m_2)
    dev_example = self.out_example_index(m_2, m_3)
    test_example = self.out_example_index(m_2, m_4)
    random.shuffle(train_example)
    random.shuffle(dev_example)
    random.shuffle(test_example)

    self.model = CNN_Text(self.hyperparameter_1)
    optimizer = torch.optim.Adam(self.model.parameters(),
                                 lr=self.hyperparameter_1.lr)

    train_example_idx = self.set_index(train_example)
    random.shuffle(train_example_idx)
    steps = 0
    self.model.train()
    for epoch in range(1, self.hyperparameter_1.epochs + 1):
        batchBlock = self.set_batchBlock(train_example)
        for every_batchBlock in range(batchBlock):
            # Gather this mini-batch's examples via the shuffled index.
            exams = []
            start_pos = every_batchBlock * self.hyperparameter_1.batch_size
            end_pos = (every_batchBlock + 1) * self.hyperparameter_1.batch_size
            if end_pos > len(train_example):
                end_pos = len(train_example)
            for idx in range(start_pos, end_pos):
                exams.append(train_example[train_example_idx[idx]])
            max_len = self.get_max_sentence_len(exams)

            optimizer.zero_grad()
            feat, label = self.batch(exams,
                                     self.hyperparameter_1.batch_size,
                                     max_len)
            label = label.view(len(exams))
            logit = self.model.forward(feat)
            loss = F.cross_entropy(logit, label)
            loss.backward()
            optimizer.step()
            steps += 1

            if steps % self.hyperparameter_1.log_interval == 0:
                train_size = len(train_example)
                # BUGFIX: `loss.data[0]` raises IndexError on 0-dim tensors in
                # PyTorch >= 0.4; use `.item()`. `.sum().item()` makes
                # `corrects` a plain int so accuracy prints as a number.
                corrects = (torch.max(logit, 1)[1].view(
                    label.size()).data == label.data).sum().item()
                accuracy = corrects / self.hyperparameter_1.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.
                    format(steps, train_size, loss.item(), accuracy,
                           corrects, self.hyperparameter_1.batch_size))
            if steps % self.hyperparameter_1.test_interval == 0:
                self.eval(dev_example, self.model)
            if steps % self.hyperparameter_1.save_interval == 0:
                if not os.path.isdir(self.hyperparameter_1.save_dir):
                    os.makedirs(self.hyperparameter_1.save_dir)
                save_prefix = os.path.join(self.hyperparameter_1.save_dir,
                                           'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(self.model, save_path)
class Classifier:
    """End-to-end trainer for CNN_Text over alphabet-indexed sentence data.

    Builds word/label vocabularies, loads GloVe vectors, batches and pads
    examples, trains with Adam, evaluates, and checkpoints snapshots.
    """

    def __init__(self):
        self.hyperparameter_1 = Hyperparameter()
        self.inst = inst()
        self.Read_inst = Read_inst()
        self.aphabet = alphabet()
        self.example = example()

    def divide_two_dict(self, m_2):
        """Return (word_alphabet, label_alphabet) built from instances m_2."""
        all_w = []
        all_l = []
        for inst in m_2:
            for w in inst.m_word:
                all_w.append(w)
        # Reserve entries for the unknown-word and padding symbols.
        all_w.append(self.hyperparameter_1.unknow)
        all_w.append(self.hyperparameter_1.padding)
        word_alphabet = self.aphabet.add_dict(all_w)
        for inst in m_2:
            for w in inst.m_label:
                all_l.append(w)
        label_alphabet = self.aphabet.add_dict(all_l)
        return word_alphabet, label_alphabet

    def load_my_vector(self, path, vocab):
        """Load pre-trained vectors (as float lists) for words in `vocab`.

        BUGFIX: the encoding was spelled 'UTF - 8', which is not a valid
        codec name and made open() raise LookupError.
        """
        word_vecs = {}
        with open(path, encoding='UTF-8') as f:
            # NOTE(review): the first line is skipped as a header; plain
            # GloVe files have no header line — confirm the file format.
            lines = f.readlines()[1:]
            for line in lines:
                values = line.split(' ')
                word = values[0]
                if word in vocab:
                    word_vecs[word] = [float(val) for val in values[1:]]
        return word_vecs

    def add_unknow_words_by_uniform(self, word_vecs, vocab, k=100):
        """Fill missing words with U(-0.25, 0.25) vectors; list in vocab order."""
        list_word2vec = []
        oov = 0
        iov = 0
        for word in vocab:
            if word not in word_vecs:
                oov += 1
                word_vecs[word] = np.random.uniform(-0.25, 0.25,
                                                    k).round(6).tolist()
                list_word2vec.append(word_vecs[word])
            else:
                iov += 1
                list_word2vec.append(word_vecs[word])
        return list_word2vec

    def add_unknow_words_by_average(self, word_vecs, vocab, k=100):
        """Fill missing words with the mean of all known vectors."""
        word_vecs_numpy = []
        for word in vocab:
            if word in word_vecs:
                word_vecs_numpy.append(word_vecs[word])
        # Column-wise sums of the known vectors.
        col = []
        for i in range(k):
            total = 0.0  # renamed from `sum` to stop shadowing the builtin
            for j in range(int(len(word_vecs_numpy))):
                total += word_vecs_numpy[j][i]
            total = round(total, 6)
            col.append(total)
        # Average vector, shared by every unknown word.
        # NOTE(review): raises ZeroDivisionError if no word has a vector.
        zero = []
        for m in range(k):
            avg = col[m] / (len(word_vecs_numpy))
            avg = round(avg, 6)
            zero.append(float(avg))
        list_word2vec = []
        oov = 0
        iov = 0
        for word in vocab:
            if word not in word_vecs:
                oov += 1
                word_vecs[word] = zero
                list_word2vec.append(word_vecs[word])
            else:
                iov += 1
                list_word2vec.append(word_vecs[word])
        return list_word2vec

    def get_max_sentence_len(self, all_example):
        """Length of the longest word-index sequence in `all_example`."""
        max_sentence_len = 0
        for exam in all_example:
            if max_sentence_len < len(exam.m_word_index):
                max_sentence_len = len(exam.m_word_index)
        return max_sentence_len

    def batch(self, examples, batch_size, max_len):
        """Pad `examples` to `max_len` in place and tensorize one mini-batch.

        Returns (LongTensor words, LongTensor labels). The in-loop return
        fires as soon as an exact multiple of `batch_size` examples has been
        collected; the trailing return covers a final partial batch.
        """
        for exam in examples:
            if len(exam.m_word_index) == max_len:
                continue
            for i in range(max_len - len(exam.m_word_index)):
                exam.m_word_index.append(self.hyperparameter_1.padding_id)
        minibatch_word = []
        minibatch_label = []
        for exam in examples:
            minibatch_word.append(exam.m_word_index)
            minibatch_label.append(exam.m_label_index)
            if len(minibatch_word) % batch_size == 0:
                minibatch_word = Variable(torch.LongTensor(minibatch_word))
                minibatch_label = Variable(torch.LongTensor(minibatch_label))
                return minibatch_word, minibatch_label
        if minibatch_word or minibatch_label:
            minibatch_word = Variable(torch.LongTensor(minibatch_word))
            minibatch_label = Variable(torch.LongTensor(minibatch_label))
            return minibatch_word, minibatch_label

    def set_batchBlock(self, examples):
        """Number of mini-batches needed to cover `examples` (ceiling div)."""
        if len(examples) % self.hyperparameter_1.batch_size == 0:
            batchBlock = len(examples) // self.hyperparameter_1.batch_size
        else:
            batchBlock = len(examples) // self.hyperparameter_1.batch_size + 1
        return batchBlock

    def set_index(self, examples):
        """Return the identity index list [0, 1, ..., len(examples) - 1]."""
        index = []
        for i in range(len(examples)):
            index.append(i)
        return index

    def out_example_index(self, m_2, m_3):
        """Index instances m_3 with the vocabulary built from m_2.

        Also records unknow_id / padding_id / vocab_num on the
        hyper-parameter object as a side effect.
        """
        word_dict, label_dict = self.divide_two_dict(m_2)
        all_example = []
        for i in m_3:
            b = example()
            b.m_label_index.append(label_dict.dict[i.m_label])
            for j in i.m_word:
                if j not in word_dict.dict:
                    b.m_word_index.append(
                        word_dict.dict[self.hyperparameter_1.unknow])
                else:
                    b.m_word_index.append(word_dict.dict[j])
            all_example.append(b)
        self.hyperparameter_1.unknow_id = word_dict.dict[
            self.hyperparameter_1.unknow]
        self.hyperparameter_1.padding_id = word_dict.dict[
            self.hyperparameter_1.padding]
        self.hyperparameter_1.vocab_num = len(word_dict.m_list)
        return all_example

    def train(self, m_2, m_3, m_4):
        """Train the model on m_2, evaluating on m_3 at regular intervals."""
        word_dict, label_dict = self.divide_two_dict(m_2)
        if self.hyperparameter_1.word_embedding:
            # Load GloVe vectors; unknown words get the average vector.
            path = "word2vec/glove.6B.100d.txt"
            print("loading word2vec ")
            word_vecs = self.load_my_vector(path, word_dict.m_list)
            print("new words already in word2vec:" + str(len(word_vecs)))
            print("loading unknow word2vec and convert to list... ")
            word_vecs = self.add_unknow_words_by_average(
                word_vecs, word_dict.m_list,
                k=self.hyperparameter_1.embed_dim)
            print("unknown word2vec load ! and converted to list...")
            self.hyperparameter_1.pretrained_weight = word_vecs

        train_example = self.out_example_index(m_2, m_2)
        dev_example = self.out_example_index(m_2, m_3)
        test_example = self.out_example_index(m_2, m_4)
        random.shuffle(train_example)
        random.shuffle(dev_example)
        random.shuffle(test_example)

        self.model = CNN_Text(self.hyperparameter_1)
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.hyperparameter_1.lr)

        train_example_idx = self.set_index(train_example)
        random.shuffle(train_example_idx)
        steps = 0
        self.model.train()
        for epoch in range(1, self.hyperparameter_1.epochs + 1):
            batchBlock = self.set_batchBlock(train_example)
            for every_batchBlock in range(batchBlock):
                # Gather this mini-batch's examples via the shuffled index.
                exams = []
                start_pos = (every_batchBlock *
                             self.hyperparameter_1.batch_size)
                end_pos = ((every_batchBlock + 1) *
                           self.hyperparameter_1.batch_size)
                if end_pos > len(train_example):
                    end_pos = len(train_example)
                for idx in range(start_pos, end_pos):
                    exams.append(train_example[train_example_idx[idx]])
                max_len = self.get_max_sentence_len(exams)

                optimizer.zero_grad()
                feat, label = self.batch(exams,
                                         self.hyperparameter_1.batch_size,
                                         max_len)
                label = label.view(len(exams))
                logit = self.model.forward(feat)
                loss = F.cross_entropy(logit, label)
                loss.backward()
                optimizer.step()
                steps += 1

                if steps % self.hyperparameter_1.log_interval == 0:
                    train_size = len(train_example)
                    # BUGFIX: `loss.data[0]` raises IndexError on 0-dim
                    # tensors in PyTorch >= 0.4; use `.item()`.
                    corrects = (torch.max(logit, 1)[1].view(
                        label.size()).data == label.data).sum().item()
                    accuracy = (corrects /
                                self.hyperparameter_1.batch_size * 100.0)
                    sys.stdout.write(
                        '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.
                        format(steps, train_size, loss.item(), accuracy,
                               corrects, self.hyperparameter_1.batch_size))
                if steps % self.hyperparameter_1.test_interval == 0:
                    self.eval(dev_example, self.model)
                if steps % self.hyperparameter_1.save_interval == 0:
                    if not os.path.isdir(self.hyperparameter_1.save_dir):
                        os.makedirs(self.hyperparameter_1.save_dir)
                    save_prefix = os.path.join(
                        self.hyperparameter_1.save_dir, 'snapshot')
                    save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                    torch.save(self.model, save_path)

    def eval(self, data_example, model):
        """Evaluate on `data_example`; print loss/accuracy; restore train mode."""
        self.model.eval()
        corrects, avg_loss = 0, 0
        data_example_idx = self.set_index(data_example)
        batchBlock = self.set_batchBlock(data_example)
        for every_batchBlock in range(batchBlock):
            exams = []
            start_pos = every_batchBlock * self.hyperparameter_1.batch_size
            end_pos = (every_batchBlock + 1) * self.hyperparameter_1.batch_size
            if end_pos > len(data_example):
                end_pos = len(data_example)
            for idx in range(start_pos, end_pos):
                exams.append(data_example[data_example_idx[idx]])
            max_len = self.get_max_sentence_len(exams)
            feat, label = self.batch(exams,
                                     self.hyperparameter_1.batch_size,
                                     max_len)
            label = label.view(len(exams))
            logit = self.model.forward(feat)
            # `reduction='sum'` replaces the deprecated `size_average=False`.
            loss = F.cross_entropy(logit, label, reduction='sum')
            avg_loss += loss.item()
            corrects += (torch.max(logit, 1)[1].view(
                label.size()).data == label.data).sum().item()
        size = len(data_example)
        # BUGFIX: average the accumulated loss; previously this used only the
        # last batch's loss (`avg_loss = loss.data[0] / size`).
        avg_loss = avg_loss / size
        accuracy = corrects / size * 100.0
        self.model.train()
        print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(
            avg_loss, accuracy, corrects, size))

    def variable(self, example):
        """Tensorize one example as a (1, seq_len) input and a (1,) label."""
        x = Variable(torch.LongTensor(1, len(example.m_word_index)))
        y = Variable(torch.LongTensor(1))
        for i in range(len(example.m_word_index)):
            x.data[0][i] = example.m_word_index[i]
        y.data[0] = example.m_label_index[0]
        return x, y
class Trainer:
    """Training/validation driver for a CNN_Text intent classifier.

    Depends on module-level globals: `device`, `sort_batch`, `ic_metric`,
    `nsml`, `np`, `torch`, `nn`, `optim` and the project `CNN_Text` model.
    """

    def __init__(self, config, n_gpu, vocab, train_loader=None,
                 val_loader=None):
        # Store configuration and data loaders.
        self.config = config
        self.vocab = vocab
        self.n_gpu = n_gpu
        self.train_loader = train_loader
        self.val_loader = val_loader
        # Build model
        vocab_size = self.vocab.vocab_size()
        self.model = CNN_Text(self.config, vocab_size, self.config.n_label)
        self.model.to(device)
        if self.n_gpu > 1:
            # Replicate across GPUs; losses are .mean()-ed in train().
            self.model = nn.DataParallel(self.model)
        # Build optimizer
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.lr,
                                    weight_decay=0.0005)
        # Build criterion
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        """Run the full training loop, validating and checkpointing via nsml."""
        best_f1 = 0.0
        best_acc = 0.0
        global_step = 0
        batch_f1 = []
        batch_acc = []
        for epoch in range(self.config.num_epoch):
            batch_loss = []
            for step, batch in enumerate(self.train_loader):
                self.model.train()
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)
                input_ids, input_lengths, labels = batch
                outputs = self.model(input_ids)
                loss = self.criterion(
                    outputs['logits'].view(-1, self.config.n_label),
                    labels.view(-1))
                f1, acc = ic_metric(labels.cpu(),
                                    outputs['predicted_intents'].cpu())
                if self.n_gpu > 1:
                    # DataParallel returns one loss per GPU; reduce to scalar.
                    loss = loss.mean()
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                global_step += 1
                batch_loss.append(loss.float().item())
                batch_f1.append(f1)
                batch_acc.append(acc)
                if (global_step == 1) or (
                        global_step % self.config.log_interval == 0):
                    mean_loss = np.mean(batch_loss)
                    # NOTE(review): batch_f1/batch_acc are never cleared, so
                    # these means are running averages over all steps, while
                    # the loss mean is per log window — confirm intended.
                    mean_f1 = np.mean(batch_f1)
                    mean_acc = np.mean(batch_acc)
                    batch_loss = []
                    # nsml.report captures locals() by name; keep local
                    # variable names stable.
                    nsml.report(summary=True,
                                scope=locals(),
                                epoch=epoch,
                                train_loss=mean_loss,
                                step=global_step)
                if (global_step > 0) and (
                        global_step % self.config.val_interval == 0):
                    val_loss, val_f1, val_acc = self.evaluation()
                    nsml.report(summary=True,
                                scope=locals(),
                                epoch=epoch,
                                val_loss=val_loss,
                                val_f1=val_f1,
                                val_acc=val_acc,
                                step=global_step)
                    if val_f1 > best_f1:
                        # Checkpoint only on a new best validation F1.
                        best_f1 = val_f1
                        best_acc = val_acc
                        nsml.save(global_step)

    def evaluation(self):
        """Evaluate on the validation loader; return (loss, f1, acc)."""
        self.model.eval()
        total_loss = []
        preds = []
        targets = []
        with torch.no_grad():
            for step, batch in enumerate(self.val_loader):
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)
                input_ids, input_lengths, labels = batch
                outputs = self.model(input_ids)
                loss = self.criterion(
                    outputs['logits'].view(-1, self.config.n_label),
                    labels.view(-1))
                pred = outputs['predicted_intents'].squeeze(
                    -1).cpu().numpy().tolist()
                target = labels.cpu().numpy().tolist()
                preds.extend(pred)
                targets.extend(target)
                total_loss.append(loss.float().item())
        mean_loss = np.mean(total_loss)
        mean_f1, mean_acc = ic_metric(targets, preds)
        return mean_loss, mean_f1, mean_acc