def train(args, labeled, resume_from, ckpt_file):
    batch_size = args["batch_size"]
    lr = 4.0
    momentum = 0.9
    epochs = args["train_epochs"]

    if not os.path.isdir('./.data'):
        os.mkdir('./.data')

    # Download/load AG_NEWS and expose the datasets and model dimensions as
    # globals so that test() and infer() below can reuse them.
    global train_dataset, test_dataset
    train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
        root='./.data', ngrams=args["N_GRAMS"], vocab=None)

    global VOCAB_SIZE, EMBED_DIM, NUM_CLASS
    VOCAB_SIZE = len(train_dataset.get_vocab())
    EMBED_DIM = args["EMBED_DIM"]
    NUM_CLASS = len(train_dataset.get_labels())

    trainloader = DataLoader(train_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=generate_batch)

    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(net.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    # Resume from a previous checkpoint if one was given.
    if resume_from is not None:
        ckpt = torch.load(os.path.join(args["EXPT_DIR"], resume_from))
        net.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
    else:
        getdatasetstate()

    net.train()
    for epoch in tqdm(range(epochs), desc="Training"):
        running_loss = 0.0
        train_acc = 0
        for i, data in enumerate(trainloader):
            text, offsets, cls = data
            text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
            outputs = net(text, offsets)
            loss = criterion(outputs, cls)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_acc += (outputs.argmax(1) == cls).sum().item()
            running_loss += loss.item()
        scheduler.step()

    print("Finished Training. Saving the model as {}".format(ckpt_file))
    print("Training accuracy: {}".format(train_acc / len(train_dataset) * 100))

    ckpt = {"model": net.state_dict(), "optimizer": optimizer.state_dict()}
    torch.save(ckpt, os.path.join(args["EXPT_DIR"], ckpt_file))

    return
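# The snippets in this file assume a TextSentiment module defined elsewhere.
# Below is a minimal sketch in the spirit of the torchtext AG_NEWS example
# (an EmbeddingBag feeding a linear classifier); the repo's actual definition
# may differ -- e.g. train_rating_model further down also passes `vocab` and
# `num_hidden` arguments.
import torch.nn as nn


class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text, offsets):
        # `text` is one long tensor of token ids; `offsets` marks where each
        # sample starts, as produced by generate_batch().
        return self.fc(self.embedding(text, offsets))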
def test(args, ckpt_file):
    batch_size = args["batch_size"]
    testloader = DataLoader(test_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=generate_batch)

    predictions, targets = [], []
    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
    ckpt = torch.load(os.path.join(args["EXPT_DIR"], ckpt_file))
    net.load_state_dict(ckpt["model"])
    net.eval()

    correct, total = 0, 0
    with torch.no_grad():
        for data in tqdm(testloader, desc="Testing"):
            text, offsets, cls = data
            text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
            outputs = net(text, offsets)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy().tolist())
            targets.extend(cls.cpu().numpy().tolist())
            total += cls.size(0)
            correct += (predicted == cls).sum().item()

    return {"predictions": predictions, "labels": targets}
def infer(args, unlabeled, ckpt_file):
    unlabeled = Subset(train_dataset, unlabeled)
    unlabeled_loader = torch.utils.data.DataLoader(unlabeled,
                                                   batch_size=args["batch_size"],
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=generate_batch)

    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
    ckpt = torch.load(os.path.join(args["EXPT_DIR"], ckpt_file))
    net.load_state_dict(ckpt["model"])
    net.eval()

    correct, total, k = 0, 0, 0
    outputs_fin = {}
    with torch.no_grad():
        for i, data in tqdm(enumerate(unlabeled_loader), desc="Inferring"):
            text, offsets, cls = data
            text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
            outputs = net(text, offsets)
            _, predicted = torch.max(outputs.data, 1)
            total += cls.size(0)
            correct += (predicted == cls).sum().item()

            # Key results by a running sample index rather than the in-batch
            # index, so later batches do not overwrite earlier ones.
            for j in range(len(outputs)):
                outputs_fin[k] = {}
                outputs_fin[k]["prediction"] = predicted[j].item()
                outputs_fin[k]["pre_softmax"] = outputs[j].cpu().numpy()
                k += 1

    return {"outputs": outputs_fin}
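# Example of driving the train/test/infer trio above with a single `args`
# dict. The keys match what the functions read; the concrete values below are
# illustrative only.
args = {
    "batch_size": 16,
    "train_epochs": 5,
    "N_GRAMS": 2,
    "EMBED_DIM": 32,
    "EXPT_DIR": "./ckpt",
}
os.makedirs(args["EXPT_DIR"], exist_ok=True)
train(args, labeled=list(range(1000)), resume_from=None, ckpt_file="ckpt_0.pth")
metrics = test(args, ckpt_file="ckpt_0.pth")
preds = infer(args, unlabeled=list(range(1000, 2000)), ckpt_file="ckpt_0.pth")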
def infer(sample):
    train_dataset, test_dataset, mytrainloader, mytestloader = get_loaders()
    classes = ("World", "Sports", "Business", "Sci/Tec")

    VOCAB_SIZE = len(train_dataset.get_vocab())
    EMBED_DIM = 32
    NUM_CLASS = len(train_dataset.get_labels())
    mynet = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
    mycriterion = nn.CrossEntropyLoss().to(device)
    myoptimizer = optim.SGD(mynet.parameters(), lr=4.0)
    myscheduler = torch.optim.lr_scheduler.StepLR(myoptimizer, 1, gamma=0.9)

    # Iterate over the requested sample indices one at a time.
    sampler = SubsetSampler(sample)
    dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=1,
        num_workers=4,
        sampler=sampler,
        collate_fn=generate_batch,
    )

    soft = torch.nn.Softmax(dim=0)
    results = []
    infer_outs = {}
    with torch.no_grad():
        with tqdm(total=len(dataloader), desc="Inferring on unlabeled ...") as tq:
            for r, (text, offsets, cls) in enumerate(dataloader):
                text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
                outputs = mynet(text, offsets)
                _, predicted = torch.max(outputs.data, 1)
                ground_truth = cls.item()
                prediction = predicted.item()
                # Store class probabilities; move to CPU before converting to numpy.
                infer_outs[r] = soft(outputs[0]).cpu().numpy().tolist()
                tq.update(1)
                # results.append([sample[r], classes[ground_truth], classes[prediction], probability[prediction], classwiseprobs])

    return infer_outs
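# SubsetSampler is used above but not defined in this snippet. A minimal
# sketch: unlike torch.utils.data.SubsetRandomSampler, it yields the given
# indices in their original order, so row r of the output maps back to
# sample[r].
from torch.utils.data import Sampler


class SubsetSampler(Sampler):
    def __init__(self, indices):
        self.indices = indices

    def __iter__(self):
        return iter(self.indices)

    def __len__(self):
        return len(self.indices)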
logging.basicConfig(level=getattr(logging, args.logging_level))

start_time = time.time()
logging.info("Loading vocab from: {}".format(args.vocab))
vocab = torch.load(args.vocab)

logging.info("Counting training lines and labels")
num_labels, train_num_lines = count(train_data_path)
logging.info("Counting testing lines and labels")
num_labels, test_num_lines = count(test_data_path)

logging.info("Loading iterable datasets")
train_dataset = Dataset(get_csv_iterator(train_data_path, ngrams, vocab),
                        train_num_lines, num_epochs)
test_dataset = Dataset(get_csv_iterator(test_data_path, ngrams, vocab),
                       test_num_lines, num_epochs)

logging.info("Creating models")
model = TextSentiment(len(vocab), embed_dim, num_labels).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
logging.info("Setup took: {:3.0f}s".format(time.time() - start_time))

logging.info("Starting training")
train(lr, num_epochs, train_dataset)
test(test_dataset)

if args.save_model_path:
    print("Saving model to {}".format(args.save_model_path))
    torch.save(model.to('cpu'), args.save_model_path)
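# `count` and `get_csv_iterator` are assumed above. A minimal sketch of
# `count` under the usual "label,text" CSV layout: one pass over the file,
# returning the number of distinct labels and the number of rows.
import csv


def count(data_path):
    labels, num_lines = set(), 0
    with open(data_path, encoding="utf-8") as f:
        for row in csv.reader(f):
            labels.add(row[0])
            num_lines += 1
    return len(labels), num_lines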
split_ratio = args.split_ratio
# two args for the sentencepiece tokenizer
use_sp_tokenizer = args.use_sp_tokenizer
sp_vocab_size = args.sp_vocab_size

logging.basicConfig(level=getattr(logging, args.logging_level))

if not os.path.exists(data):
    print("Creating directory {}".format(data))
    os.mkdir(data)

import hackson_dataset
train_dataset, test_dataset = hackson_dataset.setup_datasets(
    args.dataset, root='.data', vocab_size=sp_vocab_size)

model = TextSentiment(sp_vocab_size, embed_dim,
                      len(train_dataset.get_labels())).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)

# split train_dataset into train and valid
train_len = int(len(train_dataset) * split_ratio)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

train_and_valid(lr, sub_train_, sub_valid_)
print("Test - Accuracy: {}".format(test(test_dataset)))

if args.save_model_path:
    print("Saving model to {}".format(args.save_model_path))
    torch.save(model.to('cpu'), args.save_model_path)

if args.dictionary is not None:
    # Persist the dataset vocab alongside the model weights.
    print("Save vocab to {}".format(args.dictionary))
    torch.save(train_dataset.get_vocab(), args.dictionary)
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    train_dataset, test_dataset = get_dataset()

    VOCAB_SIZE = len(train_dataset.get_vocab())
    EMBED_DIM = 32
    NUM_CLASS = len(train_dataset.get_labels())
    model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

    BATCH_SIZE = 16
    N_EPOCHS = 5
    min_valid_loss = float('inf')

    criterion = torch.nn.CrossEntropyLoss().to(device)  # multi-class, so use cross-entropy
    optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    # Hold out 5% of the training set for validation.
    train_len = int(len(train_dataset) * 0.95)
    sub_train_, sub_valid_ = \
        random_split(train_dataset, [train_len, len(train_dataset) - train_len])

    train_loader = DataLoader(sub_train_,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=generate_batch)
    valid_loader = DataLoader(sub_valid_,
                              batch_size=BATCH_SIZE,
                              collate_fn=generate_batch)
    test_loader = DataLoader(test_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=generate_batch)

    for epoch in tqdm(range(N_EPOCHS)):
        start_time = time.time()
        train_loss, train_acc = train_fn(dataLoader=train_loader,
                                         model=model,
                                         optimizer=optimizer,
                                         scheduler=scheduler,
                                         criterion=criterion,
                                         device=device)
        valid_loss, valid_acc = evaluate_fn(dataLoader=valid_loader,
                                            model=model,
                                            criterion=criterion,
                                            device=device)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch + 1),
              " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

        # Keep the checkpoint with the lowest validation loss seen so far.
        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(),
                       "../weights/text_news{}.pth".format(valid_loss))
            print(min_valid_loss, "--------->>>>>>>>", valid_loss)
            min_valid_loss = valid_loss

    print('Checking the results of test dataset...')
    test_loss, test_acc = evaluate_fn(dataLoader=test_loader,
                                      model=model,
                                      criterion=criterion,
                                      device=device)
    print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')
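# train_fn/evaluate_fn are assumed above. A minimal sketch of evaluate_fn under
# that assumed signature: it averages loss and accuracy over a DataLoader of
# (text, offsets, cls) batches produced by generate_batch.
def evaluate_fn(dataLoader, model, criterion, device):
    model.eval()
    total_loss, total_correct, n = 0.0, 0, 0
    with torch.no_grad():
        for text, offsets, cls in dataLoader:
            text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
            outputs = model(text, offsets)
            total_loss += criterion(outputs, cls).item() * cls.size(0)
            total_correct += (outputs.argmax(1) == cls).sum().item()
            n += cls.size(0)
    return total_loss / n, total_correct / n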
# train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
#     root='./data/AG/', ngrams=NGRAMS, vocab=None, download=False)
train_dataset, test_dataset = _setup_datasets(root='./data/AG/ag_news_csv',
                                              ngrams=NGRAMS,
                                              vocab=None)

BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32  # 32-dimensional embedding layer
NUM_CLASS = len(train_dataset.get_labels())  # one output per AG_NEWS label (4 classes)
print("VOCAB_SIZE", "NUM_CLASS", VOCAB_SIZE, NUM_CLASS)
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)


def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum of elements along dim,
    # e.g. torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0) -> [1., 3., 6.];
    # here it turns per-sample lengths into start offsets.
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label
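# Quick illustration of what generate_batch produces for a toy batch of
# (label, token-id tensor) pairs; the values are made up for demonstration.
toy_batch = [(0, torch.tensor([3, 7, 2])), (1, torch.tensor([5, 9]))]
toy_text, toy_offsets, toy_label = generate_batch(toy_batch)
# toy_text    -> tensor([3, 7, 2, 5, 9])  (all token ids concatenated)
# toy_offsets -> tensor([0, 3])           (where each sample starts in toy_text)
# toy_label   -> tensor([0, 1])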
    return sum(total_accuracy) / len(total_accuracy)


if __name__ == "__main__":
    logger = log.GetLogger(log.logging.INFO)
    a = ArgParse()
    logger.info("batch size:{}".format(a.batch_size))
    logger.info("device:{}".format(a.device))
    logger.info("data_name:{}".format(a.data_name))
    logger.info("data_dir:{}".format(a.data_dir))

    if not os.path.exists(a.data_dir):
        print("Creating directory {}".format(a.data_dir))
        os.mkdir(a.data_dir)

    train, test = text_classification.DATASETS[a.data_name](root=a.data_dir,
                                                            ngrams=a.ngrams)
    model = TextSentiment(len(train.get_vocab()), a.embed_dim,
                          len(train.get_labels())).to(a.device)

    # Split the training set into train/validation parts.
    train_len = int(len(train) * a.split_ratio)
    train2, valid = random_split(train, [train_len, len(train) - train_len])

    TrainValid(a.num_epochs, a.num_workers, a.device, a.batch_size, a.lr,
               a.lr_gamma, train2, valid, model=model)
    acc = Test(a.batch_size, a.device, test, model)
    logger.info("Test - Accuracy: {}".format(acc))

    if a.save_model_path:
        logger.info(a.save_model_path)
        torch.save(model.to('cpu'), a.save_model_path)

    if a.dictionary is not None:
        print("Save vocab to {}".format(a.dictionary))
def train_rating_model(
    YELP_TRAIN,
    fields,
    criterion,
    N_EPOCHS=20,
    split_ratio=0.9,
    num_hidden=30,
    embed_dim=50,
    actual_embed_dim=50,
):
    SEED = 0
    BATCH_SIZE = 16

    # Load and process data
    train_data = data.TabularDataset(path=YELP_TRAIN, format="json", fields=fields)
    print(YELP_TRAIN)
    print("NUM TRAIN", len(train_data.examples))
    assert len(train_data.examples) > 2
    TEXT = fields["text"][1]
    TEXT.build_vocab(train_data, vectors="glove.6B.%dd" % embed_dim)

    # Load model
    model = TextSentiment(
        vocab_size=len(TEXT.vocab),
        vocab=TEXT.vocab,
        embed_dim=actual_embed_dim,
        num_class=1,
        num_hidden=num_hidden,
    )

    # Define optimizer and loss
    optimizer = optim.Adam(model.parameters())
    # criterion = nn.CrossEntropyLoss()

    # Train the model
    random.seed(0)
    train_data, valid_data = train_data.split(split_ratio=split_ratio,
                                              random_state=random.getstate())
    train_iterator, valid_iterator = data.Iterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        shuffle=True,
    )
    # iterator = data.Iterator(
    #     train_data,
    #     batch_size=BATCH_SIZE,
    #     sort_key=lambda x: len(x.text),
    #     sort_within_batch=True,
    #     shuffle=True)

    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_iterator, optimizer, criterion)
        if epoch % 5 == 0:
            print(f"\tTrain Loss {epoch}: {train_loss:.3f}")
            evaluate(model, valid_iterator, criterion)
    evaluate(model, valid_iterator, criterion)
    return model
import argparse
import pickle

import torch
from torchtext.data.utils import get_tokenizer, ngrams_iterator

from dataset import get_dataset
from model import TextSentiment  # assumed local module defining the classifier

ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

WEIGHT_PATH = "../weights/text_news0.2672930294473966.pth"
vocab = pickle.load(open(".data/save_vocab.p", "rb"))
device = "cuda" if torch.cuda.is_available() else "cpu"

VOCAB_SIZE = 1308844
EMBED_DIM = 32
NUM_CLASS = 4
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS)
checkpoint = torch.load(WEIGHT_PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)
model.to(device)


def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        # Map the tokenized text (plus its n-grams) to vocab ids.
        text = torch.tensor([
            vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        # A single sample, so the only offset is 0; keep inputs on the model's device.
        output = model(text.to(device), torch.tensor([0]).to(device))
        return output.argmax(1).item() + 1
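# Example usage of predict(); the n-gram order must match the one the vocab
# was built with (2 is assumed here), and the sample sentence is illustrative.
if __name__ == "__main__":
    example = "Wall St. shares rally as tech stocks lead the market higher."
    print(ag_news_label[predict(example, model, vocab, ngrams=2)])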