def main():
    opt = get_args()
    decode_fn = setup_inference(opt)
    device = get_device()
    model = torch.load(open(opt.model, mode='rb'), map_location=device)
    model = model.to(device)

    trg_i2c = {i: c for c, i in model.trg_c2i.items()}
    decode_trg = lambda seq: [trg_i2c[i] for i in seq]

    maybe_mkdir(opt.out_file)
    with open(opt.in_file, 'r', encoding='utf-8') as in_fp, \
            open(opt.out_file, 'w', encoding='utf-8') as out_fp:
        for line in in_fp.readlines():
            toks = line.strip().split('\t')
            if len(toks) < 2 or line[0] == '#':
                # pass through
                out_fp.write(line)
                continue
            # word, lemma, tags = toks[1], toks[2], toks[5]
            word, tags = toks[1], toks[5]
            word, tags = list(word), tags.split(';')
            src = encode(model, word, tags, device)
            pred, _ = decode_fn(model, src)
            pred_out = ''.join(decode_trg(pred))
            # write lemma
            toks[2] = pred_out
            out_fp.write('\t'.join(toks) + '\n')

def main():
    args = parse_args()
    global_dic = runpy.run_path(args.params)
    assert global_dic.get('common_params', None) is not None, \
        'common_params should be in params.py'
    assert global_dic.get('train_params', None) is not None, \
        'train_params should be in params.py'
    assert global_dic.get('eval_params', None) is not None, \
        'eval_params should be in params.py'
    assert args.mode in ['train', 'eval', 'inference'], \
        'mode should be one of [train, eval, inference]'

    common_params = global_dic['common_params']
    train_params = global_dic['train_params']
    eval_params = global_dic['eval_params']

    device = get_device()
    estimator = Estimator(device, common_params)
    loss_func = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)
    optimizer = torch.optim.Adam(estimator.get_model_parameters(),
                                 train_params.learning_rate)

    if args.mode == 'train':
        estimator.train(train_params, loss_func, optimizer)
    elif args.mode == 'eval':
        estimator.eval(eval_params, loss_func)
    else:
        assert args.input is not None, '--input required on inference mode.'
        estimator.inference(args.input, eval_params)

def evaluate(args: argparse.Namespace) -> None:
    device = util.get_device()
    model: MorphemeVectors = torch.load(args.morpheme_vectors)
    model.to(device)
    model.run_testing(batch_size=args.batch_size,
                      start_of_morpheme=args.start_of_morpheme,
                      end_of_morpheme=args.end_of_morpheme)

def main():
    check_params(eval_params)
    device = get_device()
    print(f' Available device is {device}')

    src_tokenizer = eval_params.src_tokenizer()
    tgt_tokenizer = eval_params.tgt_tokenizer()
    checkpoint_path = eval_params.checkpoint_path

    base_dir = os.getcwd()
    dataset_dir = os.path.join(base_dir, 'dataset')
    src_vocab_file_path = os.path.join(dataset_dir, eval_params.src_vocab_filename)
    tgt_vocab_file_path = os.path.join(dataset_dir, eval_params.tgt_vocab_filename)
    src_word_embedding_file_path = os.path.join(dataset_dir, eval_params.src_word_embedding_filename)
    tgt_word_embedding_file_path = os.path.join(dataset_dir, eval_params.tgt_word_embedding_filename)
    src_corpus_file_path = os.path.join(dataset_dir, eval_params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(dataset_dir, eval_params.tgt_corpus_filename)

    src_word2id, src_id2word, src_embedding = check_vocab_embedding(
        src_vocab_file_path, src_word_embedding_file_path)
    tgt_word2id, tgt_id2word, tgt_embedding = check_vocab_embedding(
        tgt_vocab_file_path, tgt_word_embedding_file_path)

    encoder_params.vocab_size = len(src_word2id)
    encoder_params.device = device
    encoder = eval_params.encoder(encoder_params)
    # encoder.init_embedding_weight(src_embedding)

    decoder_params.vocab_size = len(tgt_word2id)
    decoder_params.device = device
    decoder = eval_params.decoder(decoder_params)
    # decoder.init_embedding_weight(tgt_embedding)

    model: nn.Module = Seq2Seq(encoder, decoder)
    loss_func = nn.CrossEntropyLoss()

    checkpoint = torch.load(os.path.join(base_dir, checkpoint_path))
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    dataset = ParallelTextDataSet(src_tokenizer, tgt_tokenizer, src_corpus_file_path,
                                  tgt_corpus_file_path, encoder_params.max_seq_len,
                                  decoder_params.max_seq_len, src_word2id, tgt_word2id)
    data_loader = DataLoader(dataset, eval_params.batch_size, collate_fn=dataset.collate_func)

    # avg_loss, bleu_score = eval_model(model, loss_func, data_loader, device, tgt_id2word)
    avg_loss, bleu_score = eval_model(model, loss_func, data_loader, device, tgt_id2word)

def load_ensemble_from_dirs(model_dirs):
    """Create ensemble classifier from models stored in the specified directories."""
    models = load_models(model_dirs)
    ensemble = Ensemble(models)
    ensemble.to(get_device())
    return ensemble

def load_ensemble_from_checkpoints(checkpoints):
    models = []
    for chk in checkpoints:
        model = get_model()
        model.load_state_dict(chk["model"])
        models.append(model)
    ensemble = Ensemble(models)
    ensemble.to(get_device())
    return ensemble

def post_stream_updates(client, city, hourly_forecast):
    """Post hourly forecast updates to the given city's streams."""
    print("posting stream_updates...")
    try:
        device = util.get_device(client, city)
        timestamps = get_timestamps(hourly_forecast)
        temperature_values = get_temperature_values(hourly_forecast, timestamps)
        humidity_values = get_humidity_values(hourly_forecast, timestamps)
        pressure_values = get_pressure_values(hourly_forecast, timestamps)
        ozone_values = get_ozone_values(hourly_forecast, timestamps)
        precip_values = get_precip_values(hourly_forecast, timestamps)

        for stream_name in STREAMS:
            stream = util.get_stream(device, stream_name, STREAMS[stream_name])
            time.sleep(5)
            try:
                if stream_name == "Temperature":
                    print("posting temperature values:")
                    print(temperature_values)
                    stream.post_values(temperature_values)
                elif stream_name == "Humidity":
                    print("posting humidity values:")
                    print(humidity_values)
                    stream.post_values(humidity_values)
                elif stream_name == "Pressure":
                    print("posting pressure values:")
                    print(pressure_values)
                    stream.post_values(pressure_values)
                elif stream_name == "Ozone":
                    print("posting ozone values:")
                    print(ozone_values)
                    stream.post_values(ozone_values)
                elif stream_name == "Precipitation":
                    print("posting precipitation values:")
                    print(precip_values)
                    stream.post_values(precip_values)
            except Exception as e:
                print("Exception occurred while posting " + stream_name +
                      " values to " + city + ":\n" + str(e))
    except Exception as e:
        print("Exception occurred while getting device for " + city + ":\n" + str(e))

def __init__(self, model, optimizer, loss, train_loader, test_loader, validate_loader):
    self.model = model
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.validate_loader = validate_loader
    self.optimizer = optimizer
    self.loss = loss
    self.device = util.get_device()

    config = util.get_config()
    self.sample_length = int(config['sample_length'])
    self.lower_pitch_limit = int(config['lower_pitch_limit'])
    self.upper_pitch_limit = int(config['upper_pitch_limit'])
    self.classes = [x for x in range(self.lower_pitch_limit, self.upper_pitch_limit)]
    self.current_epoch = 0

def main():
    opt = get_args()
    decode_fn = setup_inference(opt)
    device = get_device()
    model = torch.load(open(opt.model, mode='rb'), map_location=device)
    model = model.to(device)

    trg_i2c = {i: c for c, i in model.trg_c2i.items()}
    decode_trg = lambda seq: [trg_i2c[i] for i in seq]

    maybe_mkdir(opt.out_file)
    with open(opt.out_file, 'w', encoding='utf-8') as fp:
        for lemma, tags in read_file(opt.in_file, opt.lang):
            src = encode(model, lemma, tags, device)
            pred, _ = decode_fn(model, src)
            pred_out = ''.join(decode_trg(pred))
            fp.write(f'{"".join(lemma)}\t{pred_out}\t{";".join(tags[1:])}\n')

def run_baseline(output_dir, num_models=3, max_train_samples=None, epochs=3,
                 finetune=False, uncertainty_strategy='best'):
    started = time.time()
    model_dirs = []
    for i in range(num_models):
        model_dir = os.path.join(output_dir, f'm{i + 1}')
        model_dirs.append(model_dir)

    for model_dir in model_dirs:
        print("=== Training model", model_dir, "===")
        trainer = Trainer(max_train_samples=max_train_samples,
                          epochs=epochs,
                          finetune=finetune,
                          uncertainty_strategy=uncertainty_strategy,
                          output_path=model_dir,
                          arch="resnet",
                          layers=18)
        trainer.train()
        y_true, y_pred = trainer.evaluate()
        plot_roc_auc(y_true, y_pred, save_to_file=True, output_path=model_dir)
        print("=== Completed training of", model_dir, "===")
        display_elapsed_time(started, "Total elapsed")
        print()

    ensemble = load_ensemble_from_dirs(model_dirs)
    results = evaluate(model=ensemble, dataloader=get_val_loader(), device=get_device())
    labels = results['labels']
    preds = results['predictions']
    final_auc = mt.roc_auc_score(labels, preds)
    print("Ensemble Validation AUC Score", final_auc)
    plot_roc_auc(labels, preds, save_to_file=True, output_path=output_dir)
    display_elapsed_time(started, "Total time taken")

def create_init_state(self, batch_size, train=True, gpu=None):
    """Create initial state (hidden layers) filled with zeros."""
    volatile = not train
    state = {}
    with util.get_device(gpu):
        if gpu is None:
            xp = np
        else:
            xp = cuda.cupy
        for layer_num, l in enumerate(self.layers, 1):
            h_data = xp.zeros((batch_size, l), dtype=np.float32)
            h = chainer.Variable(h_data, volatile=volatile)
            state['h' + str(layer_num)] = h
            if self.lstm:
                c_data = xp.zeros((batch_size, l), dtype=np.float32)
                c = chainer.Variable(c_data, volatile=volatile)
                state['c' + str(layer_num)] = c
        assert len(self.layers) > 0
        state['h_last'] = h
    return state

def reinflect(model_source, lemmas, tags, poses, multi=False):
    decode_fn = setup_inference()
    device = get_device()
    model = torch.load(open(model_source, mode='rb'), map_location=device)
    model = model.to(device)

    forms = []
    for i, (lemma, tag) in enumerate(zip(lemmas, tags)):
        pos = poses[i]
        if multi or isinstance(tag, list):
            # we need to inflect many times
            preds = set()
            for t in tag:
                pred_out = reinflect_form(model, device, decode_fn, t, pos, lemma)
                preds.add(pred_out)
            forms.append(list(preds))
        else:
            pred_out = reinflect_form(model, device, decode_fn, tag, pos, lemma)
            forms.append(f"{pred_out}")
    return forms

def train(args: argparse.Namespace) -> None:
    device = util.get_device()
    logging.info(
        f"Training MorphemeVectors on {str(device)} using {args.corpus} as training data"
    )
    model: MorphemeVectors = MorphemeVectors(
        corpus=MorphemeCorpus.load(args.corpus),
        hidden_layer_size=args.hidden_size,
        num_hidden_layers=args.hidden_layers,
        device=device)
    model.run_training(learning_rate=args.learning_rate,
                       epochs=args.num_epochs,
                       batch_size=args.batch_size,
                       logging_frequency=args.print_every)
    logging.info(f"Saving model to {args.output_file}")
    model.to(torch.device("cpu"))
    torch.save(model, args.output_file)

''' all model '''
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from util import get_device, BOS_IDX, EOS_IDX, PAD_IDX

EPSILON = 1e-7
DEVICE = get_device()


class StackedLSTM(nn.Module):
    ''' step-by-step stacked LSTM '''

    def __init__(self, input_siz, rnn_siz, nb_layers, dropout):
        ''' init '''
        super().__init__()
        self.nb_layers = nb_layers
        self.rnn_siz = rnn_siz
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout)
        for _ in range(nb_layers):

def run_multiside(output_dir):
    max_batches = None
    started = time.time()
    sides = ['frontal', 'lateral']

    base_dir = os.path.join(output_dir, 'base')
    base_trainer = Trainer(max_train_samples=max_batches,
                           epochs=1,
                           finetune=True,
                           uncertainty_strategy='best',
                           output_path=base_dir,
                           arch="resnet")
    print("Training base model")
    base_trainer.train()
    y_true, y_pred = base_trainer.evaluate()
    plot_roc_auc(y_true, y_pred, save_to_file=True, output_path=base_dir)
    print("=== Completed training of base model", base_dir, "===")
    display_elapsed_time(started, "Total elapsed")

    print("== Extracting features ==")
    base_model_ws = base_trainer.train_results['checkpoints'].checkpoints[0]['model']
    base_model = get_model(arch='resnet')
    base_model.load_state_dict(base_model_ws)
    print()

    print("Training frontal model")
    frontal_dir = os.path.join(output_dir, 'frontal')
    frontal_trainer = Trainer(max_train_samples=max_batches,
                              epochs=2,
                              model=get_clone(base_model),
                              uncertainty_strategy='best',
                              side='frontal',
                              output_path=frontal_dir)
    frontal_trainer.train()
    y_true, y_pred = frontal_trainer.evaluate()
    plot_roc_auc(y_true, y_pred, save_to_file=True, output_path=frontal_dir)
    print("=== Completed training frontal model", frontal_dir, "===")
    print()

    print("Training lateral model")
    lateral_dir = os.path.join(output_dir, 'lateral')
    lateral_trainer = Trainer(max_train_samples=max_batches,
                              epochs=2,
                              model=get_clone(base_model),
                              uncertainty_strategy='best',
                              side='lateral',
                              output_path=lateral_dir)
    lateral_trainer.train()
    y_true, y_pred = lateral_trainer.evaluate()
    plot_roc_auc(y_true, y_pred, save_to_file=True, output_path=lateral_dir)
    print("=== Completed training lateral model", lateral_dir, "===")
    print()

    frontal = load_ensemble_from_dirs([frontal_dir])
    lateral = load_ensemble_from_dirs([lateral_dir])
    multiside = MultiSide(frontal=frontal, lateral=lateral)
    multiside.to(get_device())

    results = evaluate_multiside(multiside, get_val_loader_for_multiside(), get_device())
    labels = results['labels']
    preds = results['predictions']
    final_auc = mt.roc_auc_score(labels, preds)
    print("Ensemble Validation AUC", final_auc)
    plot_roc_auc(labels, preds, save_to_file=True, output_path=output_dir)
    display_elapsed_time(started, "Total time taken")

def test(args: dict, save_flag: bool, seed_val):
    device = util.get_device(device_no=args.device_no)
    model = torch.load(args.model_path, map_location=device)

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    testfile = args.input_file
    true_label = args.label
    truncation = args.truncation
    n_samples = None
    if "n_samples" in args:
        n_samples = args.n_samples

    # Load the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    max_len = 0
    reviews = []
    labels = []
    with open(testfile, "r") as fin:
        reviews = fin.readlines()
    reviews = [rev.lower() for rev in reviews]

    if n_samples is None:
        n_samples = len(reviews)
    indices = np.random.choice(np.arange(len(reviews)), size=n_samples)
    selected_reviews = [reviews[idx] for idx in indices]
    labels = [0 if true_label == "negative" else 1] * len(selected_reviews)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for rev in selected_reviews:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        input_id = tokenizer.encode(rev, add_special_tokens=True)
        if len(input_id) > 512:
            if truncation == "tail-only":
                # tail-only truncation
                input_id = [tokenizer.cls_token_id] + input_id[-511:]
            elif truncation == "head-and-tail":
                # head-and-tail truncation
                input_id = [tokenizer.cls_token_id] + input_id[1:129] + input_id[-382:] + [tokenizer.sep_token_id]
            else:
                # head-only truncation
                input_id = input_id[:511] + [tokenizer.sep_token_id]
            input_ids.append(torch.tensor(input_id).view(1, -1))
            attention_masks.append(torch.ones([1, len(input_id)], dtype=torch.long))
        else:
            encoded_dict = tokenizer.encode_plus(
                rev,                         # Sentence to encode.
                add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
                max_length=512,              # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,  # Construct attn. masks.
                return_tensors='pt',         # Return pytorch tensors.
            )
            # Add the encoded sentence to the list.
            input_ids.append(encoded_dict['input_ids'])
            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Set the batch size.
    batch_size = 8

    # Create the DataLoader.
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print('DONE.')
    return predictions, true_labels, selected_reviews

parser = argparse.ArgumentParser()
parser.add_argument("--input_dir", help="input directory of images", type=str)
parser.add_argument("--output_dir", help="output directory of images", type=str)
parser.add_argument("--max_epsilon", help="maximum perturbation", type=int)
args = parser.parse_args()

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

script_folder = os.path.dirname(os.path.abspath(__file__))
Config = ConfigParser.ConfigParser()
Config.read(script_folder + '/config.ini')

try:
    gpu = util.get_device(Config.getint('GPU', 'id'))
except:
    gpu = "/gpu:0"

batch_size = Config.getint('input', 'batch_size')
num_classes = Config.getint('input', 'num_classes')
image_height = Config.getint('input', 'image_height')
image_width = Config.getint('input', 'image_width')
try:
    max_number = Config.getint('input', 'max_number')
except:
    max_number = None

loss_func = Config.get('model', 'loss_func')
sigmoid_perturb = Config.getboolean('model', 'sigmoid_perturb')
if sigmoid_perturb:
    lr = Config.getfloat('model', 'learning_rate')
    itr = Config.getint('model', 'iteration')

#====================================================================
# Main program: read devices, then get traffic statistics from each

# Get arguments passed to our application
print 'argv: ', argv
interval = int(argv[1]) if len(argv) >= 2 else 5
count = int(argv[2]) if len(argv) >= 3 else 5
device_ip = argv[3] if len(argv) >= 4 else '10.30.30.1'
interface = argv[4] if len(argv) >= 5 else 'gigabitethernet 0/1'

# Read device information from database, into list of device info lists
devices_from_db = read_devices_db('devices.db')

# Get device information for our device
device = get_device(devices_from_db, device_ip)
if device is None:
    print '!!! Cannot find device in DB!'
    exit()

logfile = 'dev-stats-log'  # set output CSV log file

# Gather traffic data for the devices in the list
gather_traffic_data(logfile, device, interface, interval, count)

dev_stats_log = open(logfile, 'r')
csv_log = csv.reader(dev_stats_log)
log_info_list = [log_info for log_info in csv_log]

# Print log information for our one device

import torch

from util import plotutil
import util

# Config related
config = util.get_config()
base_data_dir = config['base_data_dir']
total_epoch = config['total_epoch']
sample_rate = int(config['sample_rate'])
sample_length = int(config['sample_length'])
batch_size = int(config['batch_size'])
learning_rate = float(config['learning_rate'])
weight_decay = float(config['weight_decay'])
lower_pitch_limit = int(config['lower_pitch_limit'])
upper_pitch_limit = int(config['upper_pitch_limit'])
classes = [x for x in range(lower_pitch_limit, upper_pitch_limit)]
device = util.get_device()


def load_dataset():
    # NSynthDataSet_RawAudio is the project's dataset class, defined elsewhere in this repo.
    train_ds = NSynthDataSet_RawAudio('train')
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_ds = NSynthDataSet_RawAudio('test')
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=True)
    validate_ds = NSynthDataSet_RawAudio('validate')
    validate_loader = torch.utils.data.DataLoader(validate_ds, batch_size=batch_size, shuffle=True)

def get_sentence_sentiment(args: dict, texts):
    seed_val = args.seed_val
    device = util.get_device(device_no=args.device_no)
    model = torch.load(args.model_path, map_location=device)

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Load the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    max_len = 0
    reviews = []

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for rev in texts:
        input_id = tokenizer.encode(rev, add_special_tokens=True)
        if len(input_id) > 512:
            if args.truncation == "tail-only":
                input_id = [tokenizer.cls_token_id] + input_id[-511:]
            elif args.truncation == "head-and-tail":
                input_id = [tokenizer.cls_token_id] + input_id[1:129] + input_id[-382:] + [tokenizer.sep_token_id]
            else:
                input_id = input_id[:511] + [tokenizer.sep_token_id]
            input_ids.append(torch.tensor(input_id).view(1, -1))
            attention_masks.append(torch.ones([1, len(input_id)], dtype=torch.long))
        else:
            encoded_dict = tokenizer.encode_plus(
                rev,                         # Sentence to encode.
                add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
                max_length=512,              # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,  # Construct attn. masks.
                return_tensors='pt',         # Return pytorch tensors.
            )
            # Add the encoded sentence to the list.
            input_ids.append(encoded_dict['input_ids'])
            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    # labels = torch.tensor(labels)

    # Set the batch size.
    batch_size = 8

    # Create the DataLoader.
    prediction_data = TensorDataset(input_ids, attention_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} sentences...'.format(len(input_ids)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits to CPU
        logits = logits.detach().cpu().numpy()
        # label_ids = b_labels.to('cpu').numpy()
        # Store predictions
        predictions.append(logits)
        # true_labels.append(label_ids)

    print(' DONE.')
    # return predictions, true_labels
    # preds, true_labels = test(args, False, seed_val)

    # Combine the results across all batches.
    flat_predictions = np.concatenate(predictions, axis=0)
    # For each sample, pick the label (0 or 1) with the higher score.
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    return flat_predictions

def main():
    check_params(train_params)
    device = get_device()
    # device = 'cpu'
    print(f' Available device is {device}')

    src_tokenizer = train_params.src_tokenizer()
    tgt_tokenizer = train_params.tgt_tokenizer()

    base_dir = os.getcwd()
    dataset_dir = os.path.join(base_dir, 'dataset')
    src_vocab_file_path = os.path.join(dataset_dir, train_params.src_vocab_filename)
    tgt_vocab_file_path = os.path.join(dataset_dir, train_params.tgt_vocab_filename)
    src_word_embedding_file_path = os.path.join(dataset_dir, train_params.src_word_embedding_filename)
    tgt_word_embedding_file_path = os.path.join(dataset_dir, train_params.tgt_word_embedding_filename)
    src_corpus_file_path = os.path.join(dataset_dir, train_params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(dataset_dir, train_params.tgt_corpus_filename)

    src_word2id, src_id2word, src_embed_matrix = ensure_vocab_embedding(
        src_tokenizer, src_vocab_file_path, src_word_embedding_file_path,
        src_corpus_file_path, encoder_params.embedding_dim, "Source")
    tgt_word2id, tgt_id2word, tgt_embed_matrix = ensure_vocab_embedding(
        tgt_tokenizer, tgt_vocab_file_path, tgt_word_embedding_file_path,
        tgt_corpus_file_path, decoder_params.embedding_dim, "Target")

    dataset = ParallelTextDataSet(src_tokenizer, tgt_tokenizer, src_corpus_file_path,
                                  tgt_corpus_file_path, encoder_params.max_seq_len,
                                  decoder_params.max_seq_len, src_word2id, tgt_word2id)
    data_loader = DataLoader(dataset, batch_size=train_params.batch_size, shuffle=True,
                             collate_fn=dataset.collate_func)

    encoder_params.vocab_size = len(src_word2id)
    encoder_params.device = device
    encoder = train_params.encoder(encoder_params)
    # Freeze word embedding weight
    encoder.init_embedding_weight(src_embed_matrix)

    decoder_params.vocab_size = len(tgt_word2id)
    decoder_params.device = device
    decoder = train_params.decoder(decoder_params)
    # Freeze word embedding weight
    decoder.init_embedding_weight(tgt_embed_matrix)

    model: nn.Module = Seq2Seq(encoder, decoder)
    model.to(device)

    loss_func = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.learning_rate)

    epoch = 0
    avg_loss = 0.
    for epoch in range(train_params.n_epochs):
        avg_loss = train_model(model, optimizer, loss_func, data_loader, device,
                               train_params, encoder_params, decoder_params, epoch + 1)

    save_dir_path = os.path.join(train_params.model_save_directory,
                                 get_checkpoint_dir_path(epoch + 1))
    if not os.path.exists(save_dir_path):
        os.makedirs(save_dir_path)

    # save checkpoint for last epoch
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_loss
    }, os.path.join(save_dir_path, 'checkpoint.tar'))

started = time.time()
model_dirs = args.output_path.split(",")

for model_dir in model_dirs:
    print("=== Training model", model_dir, "===")
    trainer = Trainer(max_train_samples=args.max_train_samples,
                      epochs=args.epochs,
                      finetune=args.finetune,
                      uncertainty_strategy=args.uncertainty_strategy,
                      output_path=model_dir)
    trainer.train()
    y_true, y_pred = trainer.evaluate()
    plot_roc_auc(y_true, y_pred, save_to_file=True, output_path=model_dir)
    print("=== Completed training of", model_dir, "===")
    display_elapsed_time(started, "Total elapsed")
    print()

ensemble = load_ensemble_from_dirs(model_dirs)
results = evaluate(model=ensemble, dataloader=get_val_loader(), device=get_device())
labels = results['labels']
preds = results['predictions']
final_auc = mt.roc_auc_score(labels, preds)
print("Ensemble Validation AUC Score", final_auc)
plot_roc_auc(labels, preds, save_to_file=True, output_path="./models/baseline")
display_elapsed_time(started, "Total time taken")

def main():
    check_params(train_params)
    device = get_device()
    print(f' Available device is {device}')

    src_tokenizer = train_params.src_tokenizer()
    tgt_tokenizer = train_params.tgt_tokenizer()

    base_dir = os.getcwd()
    dataset_dir = os.path.join(base_dir, 'dataset')
    src_vocab_file_path = os.path.join(dataset_dir, train_params.src_vocab_filename)
    tgt_vocab_file_path = os.path.join(dataset_dir, train_params.tgt_vocab_filename)
    src_word_embedding_file_path = os.path.join(dataset_dir, train_params.src_word_embedding_filename)
    tgt_word_embedding_file_path = os.path.join(dataset_dir, train_params.tgt_word_embedding_filename)
    src_corpus_file_path = os.path.join(dataset_dir, train_params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(dataset_dir, train_params.tgt_corpus_filename)

    src_word2id, src_id2word, src_embed_matrix = ensure_vocab_embedding(
        src_tokenizer, src_vocab_file_path, src_word_embedding_file_path,
        src_corpus_file_path, encoder_params.embedding_dim, "Source")
    tgt_word2id, tgt_id2word, tgt_embed_matrix = ensure_vocab_embedding(
        tgt_tokenizer, tgt_vocab_file_path, tgt_word_embedding_file_path,
        tgt_corpus_file_path, decoder_params.embedding_dim, "Target")

    dataset = ParallelTextDataSet(src_tokenizer, tgt_tokenizer, src_corpus_file_path,
                                  tgt_corpus_file_path, encoder_params.max_seq_len,
                                  decoder_params.max_seq_len, src_word2id, tgt_word2id)
    data_loader = DataLoader(dataset, batch_size=train_params.batch_size, shuffle=True,
                             collate_fn=dataset.collate_func)

    encoder_params.vocab_size = len(src_word2id)
    encoder_params.device = device
    decoder_params.vocab_size = len(tgt_word2id)
    decoder_params.device = device

    ## Evaluation dataset
    eval_src_tokenizer = eval_params.src_tokenizer()
    eval_tgt_tokenizer = eval_params.tgt_tokenizer()
    eval_src_vocab_file_path = os.path.join(dataset_dir, eval_params.src_vocab_filename)
    eval_tgt_vocab_file_path = os.path.join(dataset_dir, eval_params.tgt_vocab_filename)
    eval_src_word_embedding_file_path = os.path.join(dataset_dir, eval_params.src_word_embedding_filename)
    eval_tgt_word_embedding_file_path = os.path.join(dataset_dir, eval_params.tgt_word_embedding_filename)
    eval_src_corpus_file_path = os.path.join(dataset_dir, eval_params.src_corpus_filename)
    eval_tgt_corpus_file_path = os.path.join(dataset_dir, eval_params.tgt_corpus_filename)

    eval_src_word2id, eval_src_id2word, eval_src_embedding = check_vocab_embedding(
        eval_src_vocab_file_path, eval_src_word_embedding_file_path)
    eval_tgt_word2id, eval_tgt_id2word, eval_tgt_embedding = check_vocab_embedding(
        eval_tgt_vocab_file_path, eval_tgt_word_embedding_file_path)

    eval_dataset = ParallelTextDataSet(eval_src_tokenizer, eval_tgt_tokenizer,
                                       eval_src_corpus_file_path, eval_tgt_corpus_file_path,
                                       encoder_params.max_seq_len, decoder_params.max_seq_len,
                                       eval_src_word2id, eval_tgt_word2id)
    # Build the evaluation loader from the evaluation dataset
    eval_data_loader = DataLoader(eval_dataset, eval_params.batch_size,
                                  collate_fn=eval_dataset.collate_func)

    if train_params['encoder'] == GruEncoder:
        encoder = train_params.encoder(encoder_params)
        # Freeze word embedding weight
        encoder.init_embedding_weight(src_embed_matrix)
        decoder = train_params.decoder(decoder_params)
        # Freeze word embedding weight
        decoder.init_embedding_weight(tgt_embed_matrix)
        model: nn.Module = Seq2Seq(encoder, decoder)
    elif train_params['encoder'] == Transformer:
        encoder = train_params.encoder(encoder_params, decoder_params)
        # Freeze word embedding weight
        encoder.init_src_embedding_weight(src_embed_matrix)
        decoder = train_params.decoder(decoder_params, decoder_params)
        # Freeze word embedding weight
        decoder.init_tgt_embedding_weight(tgt_embed_matrix)
        model: nn.Module = Transformer(encoder_params, decoder_params)

    model.to(device)

    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.learning_rate)

    epoch = 0
    avg_loss = 0.
    best_val_loss = 1e+10
    for epoch in range(train_params.n_epochs):
        avg_loss, val_loss = train_model(model, optimizer, loss_func, data_loader,
                                         eval_data_loader, eval_tgt_id2word, device,
                                         train_params, encoder_params, decoder_params,
                                         epoch + 1)
        if val_loss < best_val_loss:
            save_dir_path = os.path.join(train_params.model_save_directory,
                                         get_checkpoint_dir_path(epoch + 1))
            if not os.path.exists(save_dir_path):
                os.makedirs(save_dir_path)
            print("[Best model Save] train_loss: {}, val_loss: {}".format(avg_loss, val_loss))
            # Convert dtypes so the checkpoint can also be loaded on CPU before saving?
            # save checkpoint for best model
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': avg_loss
                }, os.path.join(save_dir_path, 'checkpoint.tar'))
            best_val_loss = val_loss

def train_model(args: dict, hparams: dict):
    # Code for this function adapted from https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    pos_file = args.pos_file
    neg_file = args.neg_file
    truncation = args.truncation
    n_samples = args.n_samples
    seed_val = hparams["seed_val"]
    device = util.get_device(device_no=args.device_no)

    saves_dir = "saves/"
    Path(saves_dir).mkdir(parents=True, exist_ok=True)
    time = datetime.datetime.now()
    saves_path = os.path.join(saves_dir, util.get_filename(time))
    Path(saves_path).mkdir(parents=True, exist_ok=True)

    log_path = os.path.join(saves_path, "training.log")
    logging.basicConfig(filename=log_path, filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.info("Pos file: " + str(pos_file))
    logger.info("Neg file: " + str(neg_file))
    logger.info("Parameters: " + str(args))
    logger.info("Truncation: " + truncation)

    # Load the BERT tokenizer.
    logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    max_len = 0
    reviews, labels = util.read_samples_new(filename0=neg_file, filename1=pos_file,
                                            seed_val=seed_val, n_samples=n_samples,
                                            sentence_flag=True)
    print(len(reviews), len(labels))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for rev in reviews:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        input_id = tokenizer.encode(rev, add_special_tokens=True)
        if len(input_id) > 512:
            if truncation == "tail-only":
                # tail-only truncation
                input_id = [tokenizer.cls_token_id] + input_id[-511:]
            elif truncation == "head-and-tail":
                # head-and-tail truncation
                input_id = [tokenizer.cls_token_id] + input_id[1:129] + input_id[-382:] + [tokenizer.sep_token_id]
            else:
                # head-only truncation
                input_id = input_id[:511] + [tokenizer.sep_token_id]
            input_ids.append(torch.tensor(input_id).view(1, -1))
            attention_masks.append(torch.ones([1, len(input_id)], dtype=torch.long))
        else:
            encoded_dict = tokenizer.encode_plus(
                rev,                         # Sentence to encode.
                add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
                max_length=512,              # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,  # Construct attn. masks.
                return_tensors='pt',         # Return pytorch tensors.
            )
            # Add the encoded sentence to the list.
            input_ids.append(encoded_dict['input_ids'])
            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Combine the training inputs into a TensorDataset.
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # Create a 90-10 train-validation split.
    # Calculate the number of samples to include in each set.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    # Divide the dataset by randomly selecting samples.
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    logger.info('{:>5,} training samples'.format(train_size))
    logger.info('{:>5,} validation samples'.format(val_size))

    # The DataLoader needs to know our batch size for training, so we specify it here.
    # For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
    batch_size = hparams["batch_size"]

    # Create the DataLoaders for our training and validation sets.
    # We'll take training samples in random order.
    train_dataloader = DataLoader(
        train_dataset,                           # The training samples.
        sampler=RandomSampler(train_dataset),    # Select batches randomly.
        batch_size=batch_size                    # Trains with this batch size.
    )

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloader = DataLoader(
        val_dataset,                             # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size                    # Evaluate with this batch size.
    )

    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=2,                # The number of output labels--2 for binary classification.
                                     # You can increase this for multi-class tasks.
        output_attentions=False,     # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )

    # Tell pytorch to run this model on the GPU.
    model = model.to(device=device)
    # model.cuda(device=device)

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    # I believe the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(
        model.parameters(),
        lr=hparams["learning_rate"],  # args.learning_rate - default is 5e-5, our notebook had 2e-5.
        eps=hparams["adam_epsilon"]   # args.adam_epsilon - default is 1e-8.
    )

    # Number of training epochs. The BERT authors recommend between 2 and 4.
    # We chose to run for 4, but we'll see later that this may be over-fitting the
    # training data.
    epochs = 4

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples.)
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)

    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

    # Set the seed value all over the place to make this reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # We'll store a number of quantities such as training and validation loss,
    # validation accuracy, and timings.
    training_stats = []

    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        logger.info('Training...')

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be misled--the call to `train`
        # just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training vs. test
        # (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Report progress.
                logger.info('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_train_loss += loss.detach().cpu().numpy()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        logger.info("")
        logger.info("  Average training loss: {0:.2f}".format(avg_train_loss))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.
        logger.info("")
        logger.info("Running Validation...")

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        total_eval_accuracy = 0
        total_eval_loss = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                (loss, logits) = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.detach().cpu().numpy()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        logger.info("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        logger.info("  Validation Loss: {0:.2f}".format(avg_val_loss))

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
        })

        model_save_path = os.path.join(saves_path, "model_" + str(epoch_i + 1) + "epochs")
        torch.save(model, model_save_path)

    logger.info("")
    logger.info("Training complete!")

    handlers = logger.handlers[:]
    for handler in handlers:
        handler.close()
        logger.removeHandler(handler)

args = parser.parse_args()

if not args.session_id:
    ids = [int(c.split("_")[1].split(".")[0]) for c in os.listdir("./checkpoints")]
    ids.sort(reverse=True)
    session_id = ids[0]
else:
    session_id = args.session_id

model = load_model(session_id)
model.eval()
model_to_device(model)

if args.image_path is not None:
    image = Image.open(args.image_path).convert("RGB")
    image_tensor = ToTensor()(image).to(get_device())
    accepted_bboxes = evaluate(model, [image_tensor])[0]
    plot_image(image_tensor, accepted_bboxes)
else:
    data_loader = torch.utils.data.DataLoader(
        dataset=GlobalDataset(transforms=Compose([ToTensor()])),
        batch_size=args.batch_size,
        collate_fn=collate_fn
    )
    for image_tensor, _ in data_loader:
        accepted_bbox_lists = evaluate(model, image_tensor)
        for i, accepted_bboxes in enumerate(accepted_bbox_lists):
            plot_image(image_tensor[i], accepted_bboxes)
        break

#====================================================================
# Main program: read devices, then get traffic statistics from each

# Get arguments passed to our application
print('argv: ', argv)
interval = int(argv[1]) if len(argv) >= 2 else 5
count = int(argv[2]) if len(argv) >= 3 else 5
device_ip = argv[3] if len(argv) >= 4 else '10.30.30.1'
interface = argv[4] if len(argv) >= 5 else 'GigabitEthernet 1'

# Read device information from database, into list of device info lists
devices_from_db = read_devices_db('devices.db')

# Get device information for our device
device = get_device(devices_from_db, device_ip)
if device is None:
    print('!!! Cannot find device in DB!')
    exit()

logfile = 'dev-stats-log'  # set output CSV log file

# Gather traffic data for the devices in the list
gather_traffic_data(logfile, device, interface, interval, count)

dev_stats_log = open(logfile, 'r')
csv_log = csv.reader(dev_stats_log)
log_info_list = [log_info for log_info in csv_log]

# Print log information for our one device

def __init__(self):
    self.device = get_device()