def speech_tagging_test():
    st_time = time.time()
    data = Dataset("pos_tags.txt", "pos_sentences.txt", train_test_split=0.8, seed=0)
    data.train_data = data.train_data[:100]
    data.test_data = data.test_data[:10]
    model = model_training(data.train_data, data.tags)
    tagging = sentence_tagging(data.test_data, model, data.tags)
    total_words = 0
    total_correct = 0
    for i in range(len(tagging)):
        correct, words, accur = accuracy(tagging[i], data.test_data[i].tags)
        total_words += words
        total_correct += correct
        print("accuracy: ", accur)
    print("Your total accuracy: ", total_correct * 1.0 / total_words)
    print("My total accuracy: ", 0.7761904761904762)
    en_time = time.time()
    print("sentence_tagging total time: ", en_time - st_time)
def word2vec(input_file_name, output_file_name, emb_dimension=100, batch_size=100000,
             window_size=5, iteration=5, using_hs=False, using_neg=False, context_size=2):
    dataset = Dataset(input_file_name)
    # Embedding table size is assumed to be the dataset vocabulary size.
    emb_size = len(dataset.idx2word)
    skip_gram_model = skip_gram(emb_size, emb_dimension)
    optimizer = optim.SGD(skip_gram_model.parameters(), lr=0.01)
    pair_count = dataset.pair_count(window_size)
    batch_count = int(iteration * pair_count / batch_size)
    skip_gram_model.save_embedding(dataset.idx2word, 'skip_gram_begin_embedding.txt')
    for i in range(batch_count):
        word_pairs = dataset.get_batch_pairs(batch_size, window_size)
        if using_hs:
            pos_pairs, neg_pairs = dataset.get_pairs_by_hs(word_pairs)
        else:
            pos_pairs, neg_pairs = dataset.get_pairs_by_neg_sampling(word_pairs, 5)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [pair[1] for pair in pos_pairs]
        neg_u = [pair[0] for pair in neg_pairs]
        neg_v = [pair[1] for pair in neg_pairs]
        optimizer.zero_grad()
        loss = skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
        loss.backward()
        optimizer.step()
    skip_gram_model.save_embedding(dataset.idx2word, output_file_name)
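# A minimal usage sketch for the trainer above; the corpus path and the output
# embedding file name are placeholder assumptions, not taken from the original code.
if __name__ == "__main__":
    word2vec("corpus.txt", "skip_gram_embedding.txt",
             emb_dimension=100, window_size=5, iteration=5)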
def run(fold):
    dfx = pd.read_csv(TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    train_dataset = Dataset(tokenizer, df_train.src.values,
                            df_train.dst.values, df_train.label.values)
    train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, num_workers=1)
    train_fn(train_data_loader, model, optimizer, device)
def train(config):
    net = BertForMaskedLM.from_pretrained(config.model)
    lossFunc = KLDivLoss(config)
    if torch.cuda.is_available():
        net = net.cuda()
        lossFunc = lossFunc.cuda()
    if config.dataParallel:
        net = DataParallelModel(net)
        lossFunc = DataParallelCriterion(lossFunc)
    options = optionsLoader(LOG, config.optionFrames, disp=False)
    Tokenizer = BertTokenizer.from_pretrained(config.model)
    prepareFunc = prepare_data

    trainSet = Dataset('train', config.batch_size, lambda x: len(x[0]) + len(x[1]),
                       prepareFunc, Tokenizer, options['dataset'], LOG, 'train')
    validSet = Dataset('valid', config.batch_size, lambda x: len(x[0]) + len(x[1]),
                       prepareFunc, Tokenizer, options['dataset'], LOG, 'valid')
    print(trainSet.__len__())

    Q = []
    best_vloss = 1e99
    counter = 0
    lRate = config.lRate
    prob_src = config.prob_src
    prob_tgt = config.prob_tgt

    num_train_optimization_steps = trainSet.__len__() * options['training']['stopConditions']['max_epoch']
    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=lRate, e=1e-9,
                         t_total=num_train_optimization_steps, warmup=0.0)

    for epoch_idx in range(options['training']['stopConditions']['max_epoch']):
        total_seen = 0
        total_similar = 0
        total_unseen = 0
        total_source = 0

        trainSet.setConfig(config, prob_src, prob_tgt)
        trainLoader = data.DataLoader(dataset=trainSet, batch_size=1, shuffle=True,
                                      num_workers=config.dataLoader_workers, pin_memory=True)
        validSet.setConfig(config, 0.0, prob_tgt)
        validLoader = data.DataLoader(dataset=validSet, batch_size=1, shuffle=False,
                                      num_workers=config.dataLoader_workers, pin_memory=True)

        for batch_idx, batch_data in enumerate(trainLoader):
            if (batch_idx + 1) % 10000 == 0:
                gc.collect()
            start_time = time.time()
            net.train()

            inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data
            inputs = inputs[0].cuda()
            positions = positions[0].cuda()
            token_types = token_types[0].cuda()
            labels = labels[0].cuda()
            masks = masks[0].cuda()
            total_seen += batch_seen
            total_similar += batch_similar
            total_unseen += batch_unseen
            total_source += batch_source

            n_token = int((labels.data != 0).data.sum())
            predicts = net(inputs, positions, token_types, masks)
            loss = lossFunc(predicts, labels, n_token).sum()
            Q.append(float(loss))
            if len(Q) > 200:
                Q.pop(0)
            loss_avg = sum(Q) / len(Q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            LOG.log('Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f' %
                    (epoch_idx + 1, batch_idx + 1, loss, loss_avg, time.time() - start_time))

            # Checkpoints
            idx = epoch_idx * trainSet.__len__() + batch_idx + 1
            if ((idx >= options['training']['checkingPoints']['checkMin']) and
                    (idx % options['training']['checkingPoints']['checkFreq'] == 0)):
                if config.do_eval:
                    vloss = 0
                    total_tokens = 0
                    for bid, batch_data in enumerate(validLoader):
                        inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data
                        inputs = inputs[0].cuda()
                        positions = positions[0].cuda()
                        token_types = token_types[0].cuda()
                        labels = labels[0].cuda()
                        masks = masks[0].cuda()
                        n_token = int((labels.data != config.PAD).data.sum())
                        with torch.no_grad():
                            net.eval()
                            predicts = net(inputs, positions, token_types, masks)
                            vloss += float(lossFunc(predicts, labels).sum())
                        total_tokens += n_token
                    vloss /= total_tokens

                    is_best = vloss < best_vloss
                    best_vloss = min(vloss, best_vloss)
                    LOG.log('CheckPoint: Validation Loss %11.8f, Best Loss %11.8f' % (vloss, best_vloss))
                    if is_best:
                        LOG.log('Best Model Updated')
                        save_check_point(
                            {
                                'epoch': epoch_idx + 1,
                                'batch': batch_idx + 1,
                                'options': options,
                                'config': config,
                                'state_dict': net.state_dict(),
                                'best_vloss': best_vloss
                            },
                            is_best,
                            path=config.save_path,
                            fileName='latest.pth.tar')
                        counter = 0
                    else:
                        counter += options['training']['checkingPoints']['checkFreq']
                        if counter >= options['training']['stopConditions']['rateReduce_bound']:
                            counter = 0
                            for param_group in optimizer.param_groups:
                                lr_ = param_group['lr']
                                param_group['lr'] *= 0.55
                                _lr = param_group['lr']
                                LOG.log('Reduce Learning Rate from %11.8f to %11.8f' % (lr_, _lr))
                        LOG.log('Current Counter = %d' % (counter))
                else:
                    save_check_point(
                        {
                            'epoch': epoch_idx + 1,
                            'batch': batch_idx + 1,
                            'options': options,
                            'config': config,
                            'state_dict': net.state_dict(),
                            'best_vloss': 1e99
                        },
                        False,
                        path=config.save_path,
                        fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '_Batch' + str(batch_idx + 1) + '.pth.tar')
                    LOG.log('CheckPoint Saved!')

        if options['training']['checkingPoints']['everyEpoch']:
            save_check_point(
                {
                    'epoch': epoch_idx + 1,
                    'batch': batch_idx + 1,
                    'options': options,
                    'config': config,
                    'state_dict': net.state_dict(),
                    'best_vloss': 1e99
                },
                False,
                path=config.save_path,
                fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar')

        LOG.log('Epoch Finished.')
        LOG.log('Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.' %
                (total_seen, total_unseen, total_similar, total_source))
        gc.collect()
from data_process import Dataset
import csv
from ast import literal_eval

if __name__ == '__main__':

    class Opts(object):
        def __init__(self):
            self.window_size = 10
            self.vocab_size = 400000

    opts = Opts()
    data = Dataset(opts)
    incomplete = data.next_batch(100)

    with open('train_x.csv', 'w', newline='') as x_file, \
         open('train_y.csv', 'w', newline='') as y_file:
        train_x_writer = csv.writer(x_file, delimiter=',')
        train_y_writer = csv.writer(y_file, delimiter=',')
        incomplete = data.next_batch(1)
        while incomplete:
            train_x_writer.writerow(data.X_train_batch)
            train_y_writer.writerow(data.y_train_batch)
            incomplete = data.next_batch(1)

    with open('train_x.csv', 'r', newline='') as csvfile:
        train_x_reader = csv.reader(csvfile, delimiter=',')
        batch = []
        for i in range(100):
            batch.append(int(next(train_x_reader)[0]))
        print(batch)
import numpy as np
from data_process import Dataset

data = Dataset("pos_tags.txt", "pos_sentences.txt", train_test_split=0.8, seed=0)
train_data = data.words
# obs_dict = {i: j for i, j in zip(data.train_data, range(len(data.train_data)))}
np_array = np.array(data.train_data)
print(data.tags)
print(np_array)
from data_process import get_processed_datas, Dataset

test_imgs, test_labels, test_name_list = get_processed_datas(list_root_test)

# from cnn_finetune import make_model
# model = make_model('resnet18', num_classes=num_class, pretrained=False, input_size=input_sizes).cuda()
model = vgg16_bn(num_classes=num_class, init_weights=False).cuda()
model.load_state_dict(torch.load(model_path))
model.eval()

model2 = MyCapsNet_cam16(A=A, B=B, C=C, D=D, E=num_class, iters=em_iters).cuda()
model2.load_state_dict(torch.load(model_path2))
model2.eval()

test_dataset = Dataset(test_imgs, test_labels, is_train=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0, drop_last=False)

preds0, preds1, preds2 = test()
savepath = path + 'results.csv'
r0 = preds0
save_csv(test_name_list, test_labels, r0, preds1, preds2, savepath)
print('accuracy0:', test_accuracy(preds0, test_labels))
print('accuracy1:', test_accuracy(preds1, test_labels))
print('accuracy2:', test_accuracy(preds2, test_labels))
precision, recall, f1, acc = test_prfa(preds0, test_labels)
print('precision:', precision)
print('recall:', recall)
def main(_):
    cfg_file = 'config.yaml'

    # hparams
    hparams = load_config(cfg_file, section='hparams')

    # logger
    logger = create_logger('textcnn')

    # prepare datasets
    files_path = load_config(cfg_file, section='path')
    trainset = Dataset(files_path['train_data_path'], logger)
    num_classes = trainset.num_classes
    validset = Dataset(files_path['valid_data_path'], logger,
                       dict_class_to_label=trainset.dict_class_to_label)
    logger.info('dict_class_to_label: %s', trainset.dict_class_to_label)
    logger.info('trainset label_stat: %s', trainset.label_stat)
    logger.info('validset label_stat: %s', validset.label_stat)

    # load vocab, embed
    vocab = load_vocab(files_path['vocab_path'])
    word_embed = load_embed(files_path['word_embed_path'])
    hparams.add_hparam('vocab_size', word_embed.shape[0])
    hparams.add_hparam('embed_size', word_embed.shape[1])

    # load model
    logger.info('loading model...')
    graph = tf.Graph()
    with graph.as_default():
        model = TextCNN(hparams=hparams, num_classes=num_classes, logger=logger)

    # train model
    with tf.Session(graph=graph) as sess:
        # debug
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, dump_root='tfdbg')

        # init model
        sess.run(tf.global_variables_initializer())
        logger.info('params initialized')

        # create a saver
        saver = tf.train.Saver()
        save_path = files_path['save_model_path']
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        # performance of the model before training
        loss_valid, acc = evaluate(sess, model, validset, vocab, word_embed)
        logger.info('loss_valid: %.4f\tacc: %.4f', loss_valid, acc)
        best_result = {'loss_valid': loss_valid, 'acc': acc}
        patience = 0

        # train model
        for id_epoch in range(hparams.num_epoch):
            # train epoch
            train_epoch(sess, model, trainset, vocab, word_embed, id_epoch)
            # evaluate
            loss_valid, acc = evaluate(sess, model, validset, vocab, word_embed)
            logger.info('Epoch: %d\tloss_valid: %.4f\tacc: %.4f', id_epoch + 1, loss_valid, acc)
            if loss_valid < best_result['loss_valid']:
                # save model
                saver.save(sess=sess, save_path=save_path)
                logger.info('model saved in %s', save_path)
                best_result = {'loss_valid': loss_valid, 'acc': acc}
                patience = 0
            else:
                # early stopping
                patience += 1
                if patience >= hparams.earlystop_patience:
                    logger.info('earlystop.')
                    logger.info('Best result: loss_valid: %.4f\tacc: %.4f',
                                best_result['loss_valid'], best_result['acc'])
                    break
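# A minimal entry-point sketch, assuming the usual TF1 tf.app.run() pattern and
# a "debug" flag matching the FLAGS.debug lookup above; not part of the original code.
if __name__ == '__main__':
    flags = tf.app.flags
    flags.DEFINE_boolean('debug', False, 'wrap the session in the TFDBG CLI debugger')
    FLAGS = flags.FLAGS
    tf.app.run(main)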
from learner import HawkesProcessLearner
from Config import Config
from data_process import Dataset
import pickle
from data_process import Sample

train_data = Dataset("train")
test_data = Dataset("test")
print("==== dataset read ====")

dim = Config.dim
lam = Config.lam
row = Config.row
beta = Config.beta

learner = HawkesProcessLearner(lam, row, beta, train_data, test_data, dim)
A, mu = learner.train(epoc=100, verbose=True)

with open("A.dat", "wb") as file:
    pickle.dump(A, file)
with open("mu.dat", "wb") as file2:
    pickle.dump(mu, file2)
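# A minimal sketch of reloading the parameters pickled above; it reuses the
# same file names and is purely illustrative.
with open("A.dat", "rb") as f_a:
    A_loaded = pickle.load(f_a)
with open("mu.dat", "rb") as f_mu:
    mu_loaded = pickle.load(f_mu)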
# Fragment: the lines below are the tail of a Checker method whose signature
# and enclosing loop over the sample's points are not included here.
                continue
            low = self.mu[sample.get_point(i)[1]]
            for j in range(i):
                low += self.A[sample.get_point(i)[1]][sample.get_point(j)[1]] * \
                    self.g_func(sample.get_point(i)[0] - sample.get_point(j)[0])
            sum_pii += self.mu[u] / low
        return sum_pii


def g_func(x):
    return math.exp(-Config.beta * x)


if __name__ == '__main__':
    dim = Config.dim
    A = np.random.rand(dim, dim)
    mu = np.random.rand(dim)
    dataset = Dataset('train')
    checker = Checker(A, mu, g_func)
    for i in range(100):
        batch_data = dataset.get_next_batch()
        for u in range(dim):
            print(i, u)
            p1 = checker.pii_func(batch_data, u)
            p2 = checker.force_sum_pii(batch_data, u)
            print(p1 - p2)
            if math.fabs(p1 - p2) > 1e-7:
                print('ERROR')
                print("expected " + str(p2) + " get " + str(p1))
import numpy as np
import torch
from torch import nn, optim
from torchvision import datasets, transforms, models
from collections import OrderedDict
import torchvision.transforms.functional as TF
import matplotlib.pyplot as plt
import time

from model import Model
from data_process import Dataset

md = Model()
dataset = Dataset()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(str(device).upper()))

model_name = 'resnet50'
output_size = 102
hidden_layers = [1000]
model = md.creat_network(model_name, output_size, hidden_layers)
model.to(device)
save_path = "output/"


def train(epochs, model, optimizers,