def detect_lang(self, text):
    """Classify *text* and return ``(language_name, accuracy)``.

    Builds an evaluation dataset from *text*, runs the model over every
    batch, and accumulates one vote per non-padding character position.
    The winning class is the one with the most votes; *accuracy* is the
    fraction of votes it received (0 when nothing was counted).
    """
    datafile = Dataset(self.params, None,
                       os.path.join('data', self.params.get('corpus_name'),
                                    'train'),
                       text_to_eval=text)
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement.
    guesses = np.zeros(self.train_set.vocab_size()[1], int)
    total = 0
    while not datafile.is_finished():
        batch_xs, _, lengths = datafile.get_batch()
        outs = self.model.eval(self.session, batch_xs, lengths)
        for j in range(len(outs[0])):
            for i in range(len(outs)):
                # renamed from `max`, which shadowed the builtin
                predicted = outs[i][j]
                if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                    break  # padding marks the end of this column
                guesses[predicted] += 1
                total += 1
    best = np.argmax(guesses)
    acc = 0
    if total > 0:
        acc = float(guesses[best]) / float(total)
    return self.langs[datafile.get_target_name(best, type='orig')], acc
def evaluate_string(self, text, print_per_character=False, languages=None):
    """Classify *text* and print ``[language_name, accuracy]``.

    languages: optional iterable of language codes.  When given, the model
    output is masked so only these languages can be predicted; codes missing
    from the target vocabulary are reported and skipped.
    """
    langs_mask = None
    if languages is not None:
        # BUG FIX: np.int was removed in NumPy 1.24; builtin int is equivalent.
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
        for lang in languages:
            # try to find the language id in the target vocabulary
            # (renamed from `id`, which shadowed the builtin)
            lang_id = self.train_set.trg_vocab.get_id(lang)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
            else:
                langs_mask[lang_id] = 1
    datafile = Dataset(self.params, None,
                       "data/" + self.params.get("corpus_name") + "/train",
                       text_to_eval=text)
    guesses = np.zeros(self.train_set.vocab_size()[1], int)
    total = 0
    orig = ""
    classif = ""
    while not datafile.is_finished():
        dev_batch_xs, _, lengths = datafile.get_batch()
        if languages is not None:
            outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                   langs_mask=langs_mask)
        else:
            outs = self.model.eval(self.sess, dev_batch_xs, lengths)
        for j in range(len(outs[0])):
            for i in range(len(outs)):
                # renamed from `max`, which shadowed the builtin
                predicted = outs[i][j]
                if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                    break  # padding marks the end of this column
                guesses[predicted] += 1
                total += 1
    best = np.argmax(guesses)
    if print_per_character:
        # NOTE(review): orig/classif are never populated in the visible code,
        # so this prints two empty lines -- looks like leftover debug output.
        print(orig)
        print(classif)
    accur = 0
    if total > 0:
        accur = float(guesses[best]) / float(total)
    print([datafile.get_target_name(best, type='name'), accur])
def _init_train(self):
    """Initialize loaders for train data: a shuffled loader for optimization
    and a non-shuffled loader for evaluating on the training split."""
    raw_train = self.dataset_reader.read_dataset("train")
    self.train_loader = data.DataLoader(
        Dataset(raw_train),
        batch_size=self.config.batch_size,
        shuffle=True,
        collate_fn=self.my_collate_fn)
    raw_eval_train = self.dataset_reader.read_dataset("train", is_eval=True)
    self.eval_train_loader = data.DataLoader(
        Dataset(raw_eval_train),
        batch_size=self.config.eval_batch_size,
        shuffle=False,
        collate_fn=self.my_collate_fn)
def __init__(self, sess, trained_model=None, params=None, prepare_train_set=True):
    """Set up the session and parameters, load the training dataset and the
    label map, build the model, and optionally restore a saved checkpoint.

    sess: TensorFlow session to run the model in.
    trained_model: checkpoint name; when given, parameters are loaded from it
        and the model weights are restored.
    params: parameter object used when no checkpoint is given.
    prepare_train_set: whether to build vocabularies for the training data.
    """
    started_at = time.time()
    self.session = sess
    self.params = Parameters('PARAMS')
    if trained_model:
        self.params.load_params(trained_model)
        logging.info('Загружается модель {0}'.format(trained_model))
    else:
        self.params = params
    corpus = self.params.get('corpus_name')
    self.train_set = Dataset(self.params,
                             os.path.join('data', corpus, 'train'),
                             only_eval=False)
    # labels file holds one "<label_id> <human readable name>" pair per line
    self.langs = {}
    with open(os.path.join('data', corpus, 'labels'), 'r') as labels_file:
        for raw_line in labels_file.readlines():
            parts = raw_line.strip().split(' ', 1)
            self.langs[parts[0]] = parts[1]
    if prepare_train_set:
        self.train_set.prepare_data(self.params.get('min_count'))
    self.model = Model(self.session, self.params, self.train_set.vocab_size())
    if trained_model:
        self.model.saver.restore(
            self.session, os.path.join('models', corpus, trained_model))
    print('Модель подготовлена за {0} секунд'.format(
        str(int(time.time() - started_at))))
def _init_dev(self):
    """Initialize the (non-shuffled) loader for dev data."""
    raw_dev = self.dataset_reader.read_dataset("dev")
    self.dev_loader = data.DataLoader(
        Dataset(raw_dev),
        batch_size=self.config.eval_batch_size,
        shuffle=False,
        collate_fn=self.my_collate_fn)
def _init_test(self):
    """Initialize the (non-shuffled) loader for test data."""
    raw_test = self.dataset_reader.read_dataset("test")
    self.test_loader = data.DataLoader(
        Dataset(raw_test),
        batch_size=self.config.eval_batch_size,
        shuffle=False,
        collate_fn=self.my_collate_fn)
def get_data(pre_win, post_win):
    """Collect step-count bars around every event of the test subject.

    pre_win / post_win: minutes before / after each event to include.
    Returns (window_minutes, participant_ids, bars).
    """
    settings = setup(dataset='test', data_loc='./data/controlIntervention/',
                     subject_n=3)
    dataset = Dataset(settings, trim=True, check=False,
                      used_data_types=[DATA_TYPES.event, DATA_TYPES.fitbit])
    window = post_win + pre_win
    participant = 0
    bars = [
        dataset.get_steps_after_time(event_time - timedelta(minutes=pre_win),
                                     window, participant)
        for event_time in dataset.subject_data[0].event_data.time
    ]
    # all events belong to the same participant
    pids = [1] * len(bars)
    return window, pids, bars
def get_fake_data(pre_win, post_win, minutes, pids, bars):
    """Build control bars sampled one day earlier and subtract them from *bars*.

    Returns (minutes, pids, diff_bars) where each diff bar is the element-wise
    difference between the real bar and its randomly-offset counterpart.
    """
    settings = setup(dataset='test', data_loc='./data/controlIntervention/',
                     subject_n=3)
    dataset = Dataset(settings, trim=True, check=False,
                      used_data_types=[DATA_TYPES.event, DATA_TYPES.fitbit])
    participant = 0
    # one day earlier gives a random(ish) comparison window
    offset = timedelta(days=1, minutes=pre_win)
    fake_bars = [
        dataset.get_steps_after_time(event_time - offset, minutes, participant)
        for event_time in dataset.subject_data[0].event_data.time
    ]
    diff_bars = [list_subtract(bars[k], fake_bars[k]) for k in range(len(bars))]
    return minutes, pids, diff_bars
def __init__(self, sess, params, trained_model=False, prepare_train_set=True):
    """Build the architecture: training dataset, optional vocabulary
    preparation, the model itself, and an optional checkpoint restore."""
    t0 = time.time()
    self.sess = sess
    self.params = params
    corpus_path = "data/" + self.params.get("corpus_name") + "/train"
    self.train_set = Dataset(self.params, corpus_path, None, only_eval=False)
    if prepare_train_set:
        self.train_set.prepare_data(self.params.get("min_count"))
    self.model = Model(sess, self.params, self.train_set.vocab_size())
    if trained_model:
        self.model.saver.restore(sess, trained_model)
    print("Модель подготовлена за " + str(int(time.time() - t0)) + " секунд.")
def test(model, test_dataloader, device, distance):
    """Evaluate *model* on *test_dataloader* and record the average loss.

    distance: the loss callable (CTC-style: output, targets, input lengths,
    target lengths).  Appends the epoch's average loss to the module-level
    ``test_losses`` list and prints it.
    """
    model.eval()
    average_meter = AverageMeter()
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            spectrograms, targets, input_lens, target_lens, word_wise_target = data
            # Pad the variable-length batch to a common shape.
            spectrograms, targets = Dataset.pad_batch(
                spectrograms=list(spectrograms),
                targets=list(targets)
            )
            spectrograms = spectrograms.to(device)
            # ==== forward ====
            output = model(spectrograms, this_model_train=True)
            output = nn.LogSoftmax(dim=2)(output)
            # adjust word wise targets: flatten the per-sample word indices
            # into one (1, n_words) tensor
            adjusted_targets = []
            for target in word_wise_target:
                for word_index in target:
                    adjusted_targets.append(torch.Tensor([word_index]))
            adjusted_targets = torch.stack(adjusted_targets)
            adjusted_targets.transpose_(1, 0)
            # Zero-pad the model output along time when it is shorter than the
            # target sequence.  NOTE(review): 9896 is presumably the output
            # vocabulary size -- confirm against the model definition.
            tensor_len_delta = adjusted_targets.shape[1] - output.shape[0]
            if tensor_len_delta > 0:
                output = torch.cat((output, torch.zeros(tensor_len_delta, 1, 9896).to(device)))
            loss = distance(output, adjusted_targets, (output.shape[0],), (adjusted_targets.shape[1],))
            # ==== log ====
            # zero losses are skipped so they do not drag the average down
            if loss.item() != 0:
                average_meter.step(loss=loss.item())
    average_loss = average_meter.average()
    test_losses.append(average_loss)
    print(f'Test evaluation: Average loss: {average_loss}')
def train(model, train_dataloader, device, distance, optim, epoch, lr_scheduler, dataset):
    """Train *model* for one epoch over *train_dataloader*.

    distance: CTC loss callable.  lr_scheduler supplies a fresh learning rate
    after every batch (applied manually to the optimizer).  Appends the running
    average loss to the module-level ``train_losses`` list every 200 batches.
    Returns the last learning rate used.

    NOTE(review): the *dataset* parameter is not used in this body.
    """
    model.train()
    average_meter = AverageMeter()
    for i, data in enumerate(train_dataloader):
        spectrograms, targets, input_lens, target_lens, word_wise_target = data
        # Pad the variable-length batch to a common shape.
        spectrograms, targets = Dataset.pad_batch(
            spectrograms=list(spectrograms),
            targets=list(targets)
        )
        spectrograms = spectrograms.to(device)
        targets = targets.to(device)
        # ==== forward ====
        output = model(x=spectrograms, this_model_train=True)
        output = nn.LogSoftmax(dim=2)(output)
        output = output.transpose(0, 1)  # reshape to '(input_sequence_len, batch_size, n_classes)' as described in 'https://pytorch.org/docs/master/generated/torch.nn.CTCLoss.html'
        loss = distance(output, targets, input_lens, target_lens)
        # ==== backward ====
        optim.zero_grad()
        loss.backward()
        optim.step()
        # ==== adjustments ====
        # Apply the scheduler's learning rate to every parameter group by hand.
        lr = lr_scheduler.new_lr()
        for param_group in optim.param_groups:
            param_group['lr'] = lr
        # ==== log ====
        # zero losses are skipped so they do not drag the average down
        if loss.item() != 0:
            average_meter.step(loss=loss.item())
        if i % 200 == 0:
            average_loss = average_meter.average()
            train_losses.append(average_loss)
            print(f'Loss: {average_loss} | Batch: {i} / {len(train_dataloader)} | Epoch: {epoch} | lr: {lr}')
    return lr
def test(self, dataset):
    """Run the model over the test split of *dataset* and return the
    accumulated [correct, total] counts."""
    datafile = Dataset(
        self.params,
        os.path.join('data', dataset, 'test'),
        os.path.join('data', self.params.get('corpus_name'), 'train'))
    datafile.prepare_data(self.params.get('min_count'))
    started = time.time()
    logging.info(
        'Тестирование начато. Датасет для тестирования - {0}.'.format(
            dataset))
    totals = [0, 0]
    while not datafile.is_finished():
        xs, ys, lens = datafile.get_batch()
        keep_all = 1  # dropout of 1 disables dropout during evaluation
        _, batch_counts = self.model.run(self.session, xs, ys, lens, keep_all)
        totals = np.sum([totals, batch_counts], axis=0)
    logging.info('Тестирование закончено за {0} секунд'.format(
        str(int(time.time() - started))))
    return totals
def training(self, eval=None):
    """Run the training loop until a stop condition fires.

    eval: optional callable; when given, checkpoints are scored with it
    instead of running the dev set.  Stop conditions: a stop-file command,
    the configured stop hour, or reaching ``max_iters``.  Progress counters
    live in ``self.params.params`` so training can resume.
    """
    # skip lines already consumed by a previous (resumed) run
    self.train_set.skip_n_lines(self.params.params["trained_lines"])
    dev = Dataset(self.params,
                  "data/" + self.params.get("corpus_name") + "/dev",
                  "data/" + self.params.get("corpus_name") + "/train")
    dev.prepare_data(self.params.get("min_count"))
    start = time.time()  # for counting the time
    cycle_time = time.time()
    logging.info("Training process begun.")
    stop = False
    loss_per_epoch = []
    accuracy_per_epoch = []
    # Keep training until reach max iterations
    while not stop:
        self.params.params["step"] += 1
        batch_xs, batch_ys, lengths = self.train_set.get_batch()
        l, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                              self.params.get("dropout"))
        loss_per_epoch.append(l)
        stop = self.chech_stopfile("STOP_IMMEDIATELY")
        # stop when the wall clock reaches the configured hour
        if time.strftime("%H") == self.params.get("time_stop"):
            stop = True
        if self.params.params["step"] % self.params.get(
                "steps_per_checkpoint") == 0 or stop:
            c_time = time.time()
            corr = [0, 0]
            # score on the dev set unless a custom eval function was given
            while not dev.is_finished() and eval is None:
                dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                dropout = 1  # disable dropout during evaluation
                _, out = self.model.run(self.sess, dev_batch_xs, dev_batch_ys,
                                        lengths, dropout)
                corr = np.sum([corr, out], axis=0)
            if eval is not None:
                logging.info("Not testing on dev but on special function.")
                result = eval()
            else:
                # restart development data
                dev.restart()
                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(corr[0] / corr[1])
            self.params.params[
                "trained_lines"] = self.train_set.get_trained_lines()
            self.model.save(self.sess, self.params.params["step"], result)
            print(
                "Iter {0}, Total correctness: {1} % {2}, time per step: {3} s, total time: {4} min, {5}"
                .format(
                    self.params.params["step"] * self.params.get("batch_size"),
                    result, corr,
                    (c_time - cycle_time) / self.params.get("steps_per_checkpoint"),
                    int((time.time() - start) / 60),
                    time.strftime("%H:%M:%S")))
            # print((c_time - cycle_time) / self.params.get("steps_per_checkpoint"))
            cycle_time = time.time()
            # if it is already True do not change it
            stop = stop or self.chech_stopfile("STOP_MODEL")
            if self.params.params["step"] >= self.params.get("max_iters"):
                stop = True
        # check if the file was not finished and if it was, start over
        if self.train_set.is_finished():
            avg_loss = np.mean(loss_per_epoch)
            avg_test_accuracy = np.mean(accuracy_per_epoch)
            summ = self.sess.run(self.model.performance_summaries,
                                 feed_dict={
                                     self.model.tf_loss_ph: avg_loss,
                                     self.model.tf_accuracy_ph: avg_test_accuracy
                                 })
            self.model.sum_writer.add_summary(summ, self.params.get('epochs'))
            loss_per_epoch.clear()
            accuracy_per_epoch.clear()
            self.params.params["epochs"] += 1
            logging.info(
                "Generator read training file completely and starts over")
            self.train_set.restart()
    print("Training finished in " + str(int(time.time() - start)) + " s")
def main(root, train_url='train-clean-100', test_url='test-clean'):
    """Entry point: build train/test datasets, train the speech model for
    ``n_epochs`` epochs, and checkpoint model + plot data after every epoch.

    root: dataset root directory; train_url/test_url: LibriSpeech-style
    subset names passed to Dataset.
    """
    version = 5
    CONTINUE_TRAINING = False
    TRAIN_SPEECH_MODEL = True
    n_epochs = 20
    hyper_params_speech = {
        # ==== training hyper parameters ====
        'i_lr': 0.0005,
        'n_batches_warmup': 420,
        'batch_size': 15,
        # ==== model hyper parameters ====
        'n_res_cnn_layers': 4,
        'n_bi_gru_layers': 5,
        'bi_gru_dim': 512,
        'n_classes': 29,
        'n_features': 128,
        'dropout_p': 0.2,
        'd_audio_embedding': 128
    }
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # define dataset loaders
    train_dataset = Dataset(root=root, url=train_url, mode='train',
                            n_features=hyper_params_speech['n_features'],
                            download=False)
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=hyper_params_speech['batch_size'],
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn
    )
    test_dataset = Dataset(root=root, url=test_url, mode='test',
                           n_features=hyper_params_speech['n_features'],
                           download=False)
    test_dataloader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=hyper_params_speech['batch_size'],
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn
    )
    # get models
    start_epoch = 1
    # word_distribution = train_dataset.create_word_distribution()
    if TRAIN_SPEECH_MODEL:
        # speech model
        speech_model = SpeechModel(
            n_res_cnn_layers=hyper_params_speech['n_res_cnn_layers'],
            n_bi_gru_layers=hyper_params_speech['n_bi_gru_layers'],
            bi_gru_dim=hyper_params_speech['bi_gru_dim'],
            n_classes=hyper_params_speech['n_classes'],
            n_features=hyper_params_speech['n_features'],
            dropout_p=hyper_params_speech['dropout_p'],
            device=device,
            dataset=train_dataset,
            d_audio_embedding=hyper_params_speech['d_audio_embedding']
        ).to(device)
        # speech_model = speech_model.apply(weights_init)
        # set up optimizer, loss function and learning rate scheduler
        params = [p for p in speech_model.parameters() if p.requires_grad]
        optim = torch.optim.Adam(params=params, lr=hyper_params_speech['i_lr'])  # amsgrad=True ?
        # blank index 28 matches the last of the 29 classes
        distance = nn.CTCLoss(blank=28).to(device)
        n_batches_warmup = hyper_params_speech['n_batches_warmup']
        if CONTINUE_TRAINING:
            # resume model/optimizer state and learning rate from a checkpoint
            speech_model, optim, start_epoch, hyper_params_speech['i_lr'] = load_checkpoint(
                checkpoint_path='models/asr/model_checkpoints/model_checkpoint_1.0.pth',
                model=speech_model,
                optim=optim)
            # n_batches_warmup = 0
        lr_scheduler = CosineLearningRateScheduler(
            i_lr=hyper_params_speech['i_lr'],
            n_batches_warmup=n_batches_warmup,
            n_total_batches=(len(train_dataloader) * n_epochs))
    # train
    for epoch in range(start_epoch, (n_epochs + start_epoch)):
        if TRAIN_SPEECH_MODEL:
            lr = train(model=speech_model, train_dataloader=train_dataloader,
                       device=device, distance=distance, optim=optim,
                       epoch=epoch, lr_scheduler=lr_scheduler,
                       dataset=train_dataset)
            # test(model=speech_model, test_dataloader=test_dataloader, device=device, distance=distance)
            # checkpoint the whole model and the resume-state after each epoch
            torch.save(speech_model,
                       f'models/asr/models/speech_model_{version}.{epoch}.pth')
            torch.save({
                'epoch': n_epochs,
                'model_state_dict': speech_model.state_dict(),
                'optim_state_dict': optim.state_dict(),
                'lr': lr
            }, f'models/asr/model_checkpoints/speech_model_checkpoint_{version}.{epoch}.pth')
        # persist the loss curves for later plotting
        plot_info_data = {
            'train_losses': train_losses,
            'test_losses': test_losses
        }
        with open(f'models/asr/plot_data/plot_data_speech_model_{version}_{epoch}', 'w') as plot_info_file:
            json.dump(plot_info_data, plot_info_file)
    if TRAIN_SPEECH_MODEL:
        # final save under the ".0" suffix
        torch.save(speech_model, f'models/asr/models/speech_model_{version}.0.pth')
        torch.save({
            'epoch': n_epochs,
            'model_state_dict': speech_model.state_dict(),
            'optim_state_dict': optim.state_dict(),
            'lr': lr
        }, f'models/asr/model_checkpoints/speech_model_checkpoint_{version}.0.pth')
# Analysis script (Python 2 -- note the print statement below):
# load the USF mAvatar dataset and prepare it for plotting.
import warnings
import pylab
import pandas

from src.settings import setup, QUALITY_LEVEL, DATA_TYPES
from src.data.mAvatar.Data import DAY_TYPE
from src.data.Dataset import Dataset

settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0)

# Loading the dataset is noisy; silence warnings just for the load.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    data = Dataset(settings, min_quality=QUALITY_LEVEL.acceptable, trim=True,
                   check=True,
                   used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views],
                   avatar_view_freq=60)

UP_TO_DATE = True  # true if software versions are good
# NOTE(review): pandas.version.version is a very old accessor (modern pandas
# uses pandas.__version__); this whole script targets a legacy environment.
if pandas.version.version < '0.12.0':
    UP_TO_DATE = False
    print '\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + pandas.version.version + '\n\n'

###################
### BEGIN plots ###
###################

if UP_TO_DATE:
    # correlation scatterplot
    import src.scatterplot as scatterplot
# Script: load a trained speech model and iterate over a dataset,
# padding each batch and moving it to the device (an HMM is constructed
# but only used when USE_HMM is enabled elsewhere).
USE_HMM = False
model_path = '../../../models/asr/models/speech_model_4.11.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): map_location='cuda' will fail on CPU-only machines even
# though `device` falls back to 'cpu' -- confirm this is intentional.
model = torch.load(model_path, map_location='cuda')
root = 'data'
test_url = 'test-clean'
train_url = 'train-clean-100'
# NOTE(review): url=train_url despite the name `test_dataset` -- evaluation
# runs over the *training* split here; verify this is intentional.
test_dataset = Dataset(root=root, url=train_url, mode='test', n_features=128,
                       download=False)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=1, shuffle=False,
                                              num_workers=0,
                                              collate_fn=collate_fn)
hmm = HMM(root='data/hmm_data', n_states=29)
for i, data in enumerate(test_dataloader):
    spectrograms, targets, input_lens, target_lens, _ = data
    # Pad the variable-length batch to a common shape.
    spectrograms, targets = Dataset.pad_batch(spectrograms=list(spectrograms),
                                              targets=list(targets))
    spectrograms = spectrograms.to(device)
class NNHelper(object):
    """Helper wrapping a language-identification model: construction,
    single/multi-language detection, testing and training."""

    def __init__(self, sess, trained_model=None, params=None, prepare_train_set=True):
        """Set up session and parameters, load the training dataset and the
        label map, build the model and optionally restore a checkpoint.

        sess: TensorFlow session.
        trained_model: checkpoint name; when given, parameters and model
            weights are loaded from it.
        params: parameter object used when no checkpoint is given.
        prepare_train_set: whether to build vocabularies for training data.
        """
        start = time.time()
        self.session = sess
        self.params = Parameters('PARAMS')
        if trained_model:
            self.params.load_params(trained_model)
            logging.info('Загружается модель {0}'.format(trained_model))
        else:
            self.params = params
        self.train_set = Dataset(self.params,
                                 os.path.join('data',
                                              self.params.get('corpus_name'),
                                              'train'),
                                 only_eval=False)
        # labels file holds one "<label_id> <human readable name>" per line
        self.langs = {}
        with open(
                os.path.join('data', self.params.get('corpus_name'),
                             'labels'), 'r') as f:
            for line in f.readlines():
                split = line.strip().split(' ', 1)
                self.langs[split[0]] = split[1]
        if prepare_train_set:
            self.train_set.prepare_data(self.params.get('min_count'))
        self.model = Model(self.session, self.params,
                           self.train_set.vocab_size())
        if trained_model:
            self.model.saver.restore(
                self.session,
                os.path.join('models', self.params.get('corpus_name'),
                             trained_model))
        print('Модель подготовлена за {0} секунд'.format(
            str(int(time.time() - start))))

    def _count_guesses(self, datafile):
        """Run the model over *datafile*; return (per-class votes, total).

        One vote is cast per non-padding character position.
        """
        # BUG FIX: np.int was removed in NumPy 1.24; builtin int is equivalent.
        guesses = np.zeros(self.train_set.vocab_size()[1], int)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()
            outs = self.model.eval(self.session, batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    # renamed from `max`, which shadowed the builtin
                    predicted = outs[i][j]
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break  # padding marks the end of this column
                    guesses[predicted] += 1
                    total += 1
        return guesses, total

    def detect_langs(self, text, count):
        """Return up to *count* detected languages for *text* as a
        {language_name: vote_fraction} dict, best first."""
        datafile = Dataset(self.params, None,
                           os.path.join('data',
                                        self.params.get('corpus_name'),
                                        'train'),
                           text_to_eval=text)
        guesses, total = self._count_guesses(datafile)
        result = {}
        for _ in range(count):
            if all(item == 0 for item in guesses):
                break  # no votes left to distribute
            best = np.argmax(guesses)
            acc = 0
            if total > 0:
                acc = float(guesses[best]) / float(total)
            lang = self.langs[datafile.get_target_name(best, type='orig')]
            guesses[best] = 0  # exclude the winner from the next round
            result[lang] = acc
        return result

    def detect_lang(self, text):
        """Return ``(language_name, accuracy)`` for the single best guess."""
        datafile = Dataset(self.params, None,
                           os.path.join('data',
                                        self.params.get('corpus_name'),
                                        'train'),
                           text_to_eval=text)
        guesses, total = self._count_guesses(datafile)
        best = np.argmax(guesses)
        acc = 0
        if total > 0:
            acc = float(guesses[best]) / float(total)
        return self.langs[datafile.get_target_name(best, type='orig')], acc

    def test(self, dataset):
        """Run the model over the test split of *dataset* and return the
        accumulated [correct, total] counts."""
        datafile = Dataset(
            self.params, os.path.join('data', dataset, 'test'),
            os.path.join('data', self.params.get('corpus_name'), 'train'))
        datafile.prepare_data(self.params.get('min_count'))
        start = time.time()
        logging.info(
            'Тестирование начато. Датасет для тестирования - {0}.'.format(
                dataset))
        corr = [0, 0]
        while not datafile.is_finished():
            batch_xs, batch_ys, lengths = datafile.get_batch()
            dropout = 1  # disable dropout during evaluation
            _, out = self.model.run(self.session, batch_xs, batch_ys,
                                    lengths, dropout)
            corr = np.sum([corr, out], axis=0)
        logging.info('Тестирование закончено за {0} секунд'.format(
            str(int(time.time() - start))))
        return corr

    def train(self):
        """Run the training loop until a stop condition fires (stop file,
        configured stop hour, or max_iters).  Progress counters live in
        ``self.params.params`` so training can resume."""
        self.train_set.skip_n_lines(self.params.get('trained_lines'))
        dev = Dataset(
            self.params,
            os.path.join('data', self.params.get('corpus_name'), 'dev'),
            os.path.join('data', self.params.get('corpus_name'), 'train'))
        dev.prepare_data(self.params.get('min_count'))
        start = time.time()
        cycle_time = time.time()
        logging.info('Процесс обучения запущен')
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []
        while not stop:
            self.params.params['step'] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            l, _ = self.model.run(self.session, batch_xs, batch_ys, lengths,
                                  self.params.get('dropout'))
            loss_per_epoch.append(l)
            stop = self.check_stopfile('STOP_IMMEDIATELY')
            # stop when the wall clock reaches the configured hour
            if time.strftime('%H') == self.params.get('time_stop'):
                stop = True
            if self.params.get('step') % self.params.get(
                    'steps_per_checkpoint') == 0 or stop:
                c_time = time.time()
                corr = [0, 0]
                while not dev.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                    dropout = 1  # disable dropout during evaluation
                    _, out = self.model.run(self.session, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)
                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))
                self.params.params[
                    'trained_lines'] = self.train_set.get_trained_lines()
                self.model.save(self.session, self.params.get('step'), result)
                # BUG FIX: was `self.paramsget('step')` (missing attribute
                # access), which raised AttributeError at the first checkpoint.
                print('''Итерация: {0}, Точность: {1}% {2}, Времени на шаг: {3} секунд Время обучения: {4} минут Время: {5}'''.format(
                    self.params.get('step') * self.params.get('batch_size'),
                    result, corr,
                    (c_time - cycle_time) /
                    self.params.get('steps_per_checkpoint'),
                    int((time.time() - start) / 60),
                    time.strftime('%H:%M:%S')))
                cycle_time = time.time()
                # keep stop True if it is already set
                stop = stop or self.check_stopfile('STOP_MODEL')
                if self.params.get('step') >= self.params.get('max_iters'):
                    stop = True
            # epoch boundary: the training file has been read completely
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)
                # BUG FIX: was `self.sess`, which this class never defines
                # (__init__ stores the session as `self.session`).
                summ = self.session.run(
                    self.model.performance_summaries,
                    feed_dict={
                        self.model.tf_loss_ph: avg_loss,
                        self.model.tf_accuracy_ph: avg_test_accuracy
                    })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))
                loss_per_epoch.clear()
                accuracy_per_epoch.clear()
                self.params.params["epochs"] += 1
                logging.info("Эпоха {0} начата.".format(
                    self.params.get('epochs')))
                self.train_set.restart()
        print("Обучение закончено за " + str(int(time.time() - start)) +
              " секунд")

    def check_stopfile(self, filename):
        """Return True when *filename* contains this corpus name on a line,
        removing that line so the command fires only once."""
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")
                    stop = True
                    break
        if stop:
            # remove command from file (rewritten with context managers so
            # the handles are closed even on write errors)
            with open(filename, "r") as f:
                lines = f.readlines()
            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)
        return stop
def train(self):
    """Run the training loop until a stop condition fires (stop file,
    configured stop hour, or max_iters).

    Evaluates on the dev set at every checkpoint, saves the model, and logs
    per-epoch summaries.  Progress counters live in ``self.params.params``
    so an interrupted run can resume.
    """
    self.train_set.skip_n_lines(self.params.get('trained_lines'))
    dev = Dataset(
        self.params,
        os.path.join('data', self.params.get('corpus_name'), 'dev'),
        os.path.join('data', self.params.get('corpus_name'), 'train'))
    dev.prepare_data(self.params.get('min_count'))
    start = time.time()
    cycle_time = time.time()
    logging.info('Процесс обучения запущен')
    stop = False
    loss_per_epoch = []
    accuracy_per_epoch = []
    while not stop:
        self.params.params['step'] += 1
        batch_xs, batch_ys, lengths = self.train_set.get_batch()
        l, _ = self.model.run(self.session, batch_xs, batch_ys, lengths,
                              self.params.get('dropout'))
        loss_per_epoch.append(l)
        stop = self.check_stopfile('STOP_IMMEDIATELY')
        # stop when the wall clock reaches the configured hour
        if time.strftime('%H') == self.params.get('time_stop'):
            stop = True
        if self.params.get('step') % self.params.get(
                'steps_per_checkpoint') == 0 or stop:
            c_time = time.time()
            corr = [0, 0]
            while not dev.is_finished():
                dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                dropout = 1  # disable dropout during evaluation
                _, out = self.model.run(self.session, dev_batch_xs,
                                        dev_batch_ys, lengths, dropout)
                corr = np.sum([corr, out], axis=0)
            result = (corr[0] / corr[1]) * 100
            accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))
            self.params.params[
                'trained_lines'] = self.train_set.get_trained_lines()
            self.model.save(self.session, self.params.get('step'), result)
            # BUG FIX: was `self.paramsget('step')` (missing attribute
            # access), which raised AttributeError at the first checkpoint.
            print('''Итерация: {0}, Точность: {1}% {2}, Времени на шаг: {3} секунд Время обучения: {4} минут Время: {5}'''.format(
                self.params.get('step') * self.params.get('batch_size'),
                result, corr,
                (c_time - cycle_time) /
                self.params.get('steps_per_checkpoint'),
                int((time.time() - start) / 60),
                time.strftime('%H:%M:%S')))
            cycle_time = time.time()
            # keep stop True if it is already set
            stop = stop or self.check_stopfile('STOP_MODEL')
            if self.params.get('step') >= self.params.get('max_iters'):
                stop = True
        # epoch boundary: the training file has been read completely
        if self.train_set.is_finished():
            avg_loss = np.mean(loss_per_epoch)
            avg_test_accuracy = np.mean(accuracy_per_epoch)
            # BUG FIX: was `self.sess`; the rest of this method (and the
            # surrounding class) uses `self.session` for the TF session.
            summ = self.session.run(
                self.model.performance_summaries,
                feed_dict={
                    self.model.tf_loss_ph: avg_loss,
                    self.model.tf_accuracy_ph: avg_test_accuracy
                })
            self.model.sum_writer.add_summary(summ, self.params.get('epochs'))
            loss_per_epoch.clear()
            accuracy_per_epoch.clear()
            self.params.params["epochs"] += 1
            logging.info("Эпоха {0} начата.".format(
                self.params.get('epochs')))
            self.train_set.restart()
    print("Обучение закончено за " + str(int(time.time() - start)) +
          " секунд")
import pylab import pandas from src.settings import setup, QUALITY_LEVEL, DATA_TYPES from src.data.mAvatar.Data import DAY_TYPE from src.data.Dataset import Dataset settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0) with warnings.catch_warnings(): warnings.simplefilter("ignore") data = Dataset( settings, min_quality=QUALITY_LEVEL.acceptable, trim=True, check=True, used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views], avatar_view_freq=60) UP_TO_DATE = True # true if software versions are good if pandas.version.version < '0.12.0': UP_TO_DATE = False print '\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + pandas.version.version + '\n\n' ################### ### BEGIN plots ### ################### if UP_TO_DATE: # correlation scatterplot
def evaluate(self, files, max_langs_per_file, allowed_langs, output_file,
             threashold, eval_lines=False, eval_blocks=False, smoothing=0,
             unknown=None, separator=",", code_swaps=None):
    """Classify every file in *files* and write ``<name><separator><lang>``
    lines to *output_file*.

    files: entries of [folder, output_name] or [folder, output_name, encoding].
    max_langs_per_file: maximum number of languages reported per file.
    allowed_langs: iso2 codes that may appear in the output.
    threashold: minimum percentage for languages after the first one.
    eval_lines: aggregate votes per input line (majority per line).
    eval_blocks: aggregate votes per batch column.
    smoothing: window size for smoothing per-character predictions.
    unknown: fallback label written when nothing allowed was detected.
    code_swaps: optional mapping renaming iso2 codes in the output.
    """
    # Mask restricting the model output to tagging classes + allowed languages.
    # BUG FIX: np.int was removed in NumPy 1.24; builtin int is equivalent.
    langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
    for allowed in self.train_set.get_tagging_classes():
        langs_mask[allowed] = 1
    for lang in allowed_langs:
        # try to find the language id in the target vocabulary
        lang_id = self.train_set.trg_vocab.get_id(lang)
        if lang_id == Vocabulary.Vocab.UNK_ID:
            print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
        else:
            langs_mask[lang_id] = 1
    datafile = Dataset(self.params, None,
                       "data/" + self.params.get("corpus_name") + "/train",
                       only_eval=True, use_eol=eval_lines)
    if smoothing > 0:
        print("USING SMOOTHING OF {0}".format(smoothing))
    with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
        for filename in files:
            # files has structure: [folder, outputing_name, possible_encoding]
            if len(filename) > 2:
                datafile.restart(filename[0] + filename[1], filename[2])
            else:
                datafile.restart(filename[0] + filename[1])
            guesses = np.zeros(self.train_set.vocab_size()[1], int)
            row = np.zeros(self.train_set.vocab_size()[1], int)
            row_length = 0
            total = 0
            smooth = []
            while not datafile.is_finished():
                dev_batch_xs, dev_batch_ys, lengths = datafile.get_batch()
                outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                       langs_mask=langs_mask)
                for j in range(len(outs[0])):
                    block_guesses = np.zeros(
                        self.train_set.vocab_size()[1], int)
                    for i in range(len(outs)):
                        if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                            break  # padding marks the end of this column
                        total += 1
                        if eval_lines:
                            if dev_batch_xs[i][
                                    j] == datafile.trg_vocab.EOL_ID:
                                # end of line: credit the line's majority
                                # class with the whole line length
                                guesses[np.argmax(row)] += row_length
                                row = np.zeros(
                                    self.train_set.vocab_size()[1], int)
                                row_length = 0
                            else:
                                row[outs[i][j]] += 1
                                row_length += 1
                        elif eval_blocks:
                            block_guesses[outs[i][j]] += 1
                        elif smoothing > 0:
                            smooth.append(outs[i][j])
                        else:
                            guesses[outs[i][j]] += 1
                    if eval_blocks:
                        # NOTE(review): `i` here is the index where the column
                        # ended, used as the column's character count.
                        guesses[np.argmax(block_guesses)] += i
            if smoothing > 0:
                # BUG FIX: the original `for i in range(...)` loop reassigned
                # `i` (`i += smoothing - 1`), which has no effect in a range
                # loop, so the in-between characters were counted twice.
                # A while loop implements the intended skip.
                i = 0
                while i < len(smooth):
                    if i + smoothing < len(smooth) and smooth[i] == smooth[
                            i + smoothing]:
                        # if first and the last are the same, the inbetween
                        # should be too
                        guesses[smooth[i]] += smoothing
                        i += smoothing
                    else:
                        guesses[smooth[i]] += 1
                        i += 1
            langs = 0
            for best in np.argsort(-guesses):
                if guesses[best] == 0 or langs == max_langs_per_file:
                    break
                guess_name = datafile.get_target_name(best, "iso2")
                percent = 100 * guesses[best] / total
                if guess_name in allowed_langs:
                    if code_swaps is not None and guess_name in code_swaps:
                        guess_name = code_swaps[guess_name]
                    # always report at least one language; further languages
                    # must clear the threshold
                    if langs > 0 and percent < threashold:
                        break
                    bal.write(filename[1] + separator + guess_name + "\n")
                    langs += 1
                else:
                    print(filename[1] + ", not allowed lang: " + guess_name)
            if langs == 0 and unknown is not None:
                # no language was outputted
                bal.write(filename[1] + separator + unknown + "\n")
# from src.post_view_event_steps_bars import test_get_avg_list # test_get_avg_list() #knowMe.makePlots(type=PLOT_TYPES.bars, show=True, pre_win=10, post_win=40) #knowMe.makePlots(type=PLOT_TYPES.bars, show=True) if avatar: ### USF mAVATAR DATA LOADING ### settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0) with warnings.catch_warnings(): warnings.simplefilter("ignore") data = Dataset( settings, min_quality=QUALITY_LEVEL.acceptable, trim=True, check=True, used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views], avatar_view_freq=60) UP_TO_DATE = True # true if software versions are good if pandas.version.version < '0.12.0': UP_TO_DATE = False print '\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + pandas.version.version + '\n\n' # comparison of events selected with/without overlap from mAvatar dataset # to demonstrate difference (especially at high time intervals like no-overlap for 3hrs around event) #plot_minutes(data, MINS=12*60, overlap_okay=True, shift=-6*60, edgecolor='none') #pylab.show() plot_minutes(data, MINS=60, overlap_okay=True, shift=-30, edgecolor='none') pylab.show()
class Architecture(object):
    """Glue between the dataset, the TF model and the train/eval loops for the
    character-level language-identification network.

    NOTE(review): assumes ``Dataset``, ``Model`` and ``Vocabulary`` from this
    project are importable at module level — confirm against file header.
    """

    def __init__(self, sess, params, trained_model=False, prepare_train_set=True):
        """Build the training dataset and the model inside the given session.

        Args:
            sess: open TensorFlow session.
            params: parameter store, read via ``params.get(...)`` / ``params.params``.
            trained_model: optional checkpoint path; restored when truthy.
            prepare_train_set: when True, pre-compute vocabularies for the train set.
        """
        start = time.time()
        self.sess = sess
        self.params = params
        self.train_set = Dataset(self.params,
                                 "data/" + self.params.get("corpus_name") + "/train",
                                 None, only_eval=False)
        if prepare_train_set:
            self.train_set.prepare_data(self.params.get("min_count"))
        self.model = Model(sess, self.params, self.train_set.vocab_size())
        if trained_model:
            self.model.saver.restore(sess, trained_model)
        # Russian: "Model prepared in N seconds." (runtime string kept verbatim)
        print("Модель подготовлена за " + str(int(time.time() - start)) + " секунд.")

    def evaluate(self, files, max_langs_per_file, allowed_langs, output_file,
                 threashold, eval_lines=False, eval_blocks=False, smoothing=0,
                 unknown=None, separator=",", code_swaps=None):
        """Classify each file in ``files`` and write per-file language guesses.

        Args:
            files: iterable of [folder, output_name] or [folder, output_name, encoding].
            max_langs_per_file: cap on languages emitted per file.
            allowed_langs: language codes eligible for output.
            output_file: destination path for "name<separator>lang" lines.
            threashold: minimum percentage for a 2nd+ language (param name kept
                as-is, including the misspelling, for caller compatibility).
            eval_lines: aggregate votes per line (EOL-delimited).
            eval_blocks: aggregate votes per batch column.
            smoothing: window size for vote smoothing (0 disables).
            unknown: label written when no allowed language is found.
            separator: field separator in the output file.
            code_swaps: optional mapping to rewrite language codes on output.
        """
        # Mask of languages the model may output: tagging classes + allowed ones.
        # BUG FIX: np.int was removed in NumPy 1.24; builtin int is equivalent.
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
        for allowed in self.train_set.get_tagging_classes():
            langs_mask[allowed] = 1
        for lang in allowed_langs:
            # try to find the language under its original name
            lang_id = self.train_set.trg_vocab.get_id(lang)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
            else:
                langs_mask[lang_id] = 1
        datafile = Dataset(self.params, None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           only_eval=True, use_eol=eval_lines)
        if smoothing > 0:
            print("USING SMOOTHING OF {0}".format(smoothing))
        # buffering=1 → line-buffered so results appear as files finish
        with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
            for filename in files:
                # files has structure: [folder, outputing_name, possible_encoding]
                if len(filename) > 2:
                    datafile.restart(filename[0] + filename[1], filename[2])
                else:
                    datafile.restart(filename[0] + filename[1])
                guesses = np.zeros(self.train_set.vocab_size()[1], int)
                row = np.zeros(self.train_set.vocab_size()[1], int)
                row_length = 0
                total = 0
                smooth = []
                while not datafile.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = datafile.get_batch()
                    outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                           langs_mask=langs_mask)
                    for j in range(len(outs[0])):
                        block_guesses = np.zeros(self.train_set.vocab_size()[1], int)
                        for i in range(len(outs)):
                            if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                                break
                            total += 1
                            if eval_lines:
                                if dev_batch_xs[i][j] == datafile.trg_vocab.EOL_ID:
                                    # end of line: credit the whole line to its majority vote
                                    guesses[np.argmax(row)] += row_length
                                    row = np.zeros(self.train_set.vocab_size()[1], int)
                                    row_length = 0
                                else:
                                    row[outs[i][j]] += 1
                                    row_length += 1
                            elif eval_blocks:
                                block_guesses[outs[i][j]] += 1
                            elif smoothing > 0:
                                smooth.append(outs[i][j])
                            else:
                                guesses[outs[i][j]] += 1
                        if eval_blocks:
                            # NOTE(review): kept as-is — credits the block winner by the
                            # last inner-loop index i (block length); verify intent.
                            guesses[np.argmax(block_guesses)] += i
                if smoothing > 0:
                    # BUG FIX: the original used a for-loop with "i += smoothing - 1",
                    # which never skips in Python; a while-loop implements the
                    # intended window skip.
                    k = 0
                    while k < len(smooth):
                        if k + smoothing < len(smooth) and smooth[k] == smooth[k + smoothing]:
                            # if first and last are the same, the in-between should be too
                            guesses[smooth[k]] += smoothing
                            k += smoothing
                        else:
                            guesses[smooth[k]] += 1
                            k += 1
                langs = 0
                # walk candidate languages from most to least voted
                for cand in np.argsort(-guesses):
                    if guesses[cand] == 0 or langs == max_langs_per_file:
                        break
                    guess_name = datafile.get_target_name(cand, "iso2")
                    percent = 100 * guesses[cand] / total
                    if guess_name in allowed_langs:
                        if code_swaps is not None and guess_name in code_swaps:
                            guess_name = code_swaps[guess_name]
                        # always print at least one language; later ones must pass the threshold
                        if langs > 0 and percent < threashold:
                            break
                        bal.write(filename[1] + separator + guess_name + "\n")
                        langs += 1
                    else:
                        print(filename[1] + ", not allowed lang: " + guess_name)
                if langs == 0 and unknown is not None:
                    # no language was outputted
                    bal.write(filename[1] + separator + unknown + "\n")

    def evaluate_dataset(self, source, allowed_languages=None):
        """Measure accuracy on a file of "<gold_lang> <text>" lines.

        Relies on evaluate_string returning its [name, accuracy] pair.
        """
        correct_all = 0
        total_all = 0
        with open(source, mode='r') as src:
            for line in src:
                if total_all % 1000 == 0:
                    print("processed lines ", total_all)
                entry = line.strip().split(' ', 1)
                if allowed_languages is not None:
                    guess = self.evaluate_string(entry[1], languages=allowed_languages)
                else:
                    guess = self.evaluate_string(entry[1])
                total_all += 1
                if entry[0] == guess[0]:
                    correct_all += 1
        # ROBUSTNESS: guard the division for an empty input file.
        accuracy = correct_all / total_all if total_all > 0 else 0.0
        print("Accuracy all: {0} ({1}/{2})".format(accuracy, correct_all, total_all))

    def evaluate_string(self, text, print_per_character=False, languages=None):
        """Classify ``text`` and return [language_name, accuracy].

        BUG FIX: the original only printed the result and returned None, which
        made evaluate_dataset crash on ``guess[0]``; the result is now returned.

        Args:
            text: raw text to classify.
            print_per_character: debug flag (per-character traces; currently
                the collected strings stay empty).
            languages: optional list restricting the candidate languages.
        """
        if languages is not None:
            langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
            for lang in languages:
                # try to find the language under its original name
                lang_id = self.train_set.trg_vocab.get_id(lang)
                if lang_id == Vocabulary.Vocab.UNK_ID:
                    print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
                else:
                    langs_mask[lang_id] = 1
        datafile = Dataset(self.params, None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           text_to_eval=text)
        guesses = np.zeros(self.train_set.vocab_size()[1], int)
        total = 0
        orig = ""
        classif = ""
        while not datafile.is_finished():
            dev_batch_xs, _, lengths = datafile.get_batch()
            if languages is not None:
                outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                       langs_mask=langs_mask)
            else:
                outs = self.model.eval(self.sess, dev_batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    vote = outs[i][j]
                    if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[vote] += 1
                    total += 1
        best = np.argmax(guesses)
        if print_per_character:
            print(orig)
            print(classif)
        accur = 0
        if total > 0:
            accur = float(guesses[best]) / float(total)
        result = [datafile.get_target_name(best, type='name'), accur]
        print(result)
        return result

    def training(self, eval=None):
        """Run the training loop until a stop condition is met.

        Args:
            eval: optional callable used instead of the dev set for checkpoint
                evaluation (parameter name shadows the builtin but is kept for
                caller compatibility).
        """
        # resume where a previous run left off
        self.train_set.skip_n_lines(self.params.params["trained_lines"])
        dev = Dataset(self.params,
                      "data/" + self.params.get("corpus_name") + "/dev",
                      "data/" + self.params.get("corpus_name") + "/train")
        dev.prepare_data(self.params.get("min_count"))
        start = time.time()  # for counting the time
        cycle_time = time.time()
        logging.info("Training process begun.")
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []
        # Keep training until a stop condition is reached
        while not stop:
            self.params.params["step"] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            loss, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                                     self.params.get("dropout"))
            loss_per_epoch.append(loss)
            stop = self.chech_stopfile("STOP_IMMEDIATELY")
            # optional wall-clock stop hour, e.g. "23"
            if time.strftime("%H") == self.params.get("time_stop"):
                stop = True
            if self.params.params["step"] % self.params.get("steps_per_checkpoint") == 0 or stop:
                c_time = time.time()
                corr = [0, 0]
                # evaluate on dev unless a custom eval function was supplied
                while not dev.is_finished() and eval is None:
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                    dropout = 1  # no dropout during evaluation
                    _, out = self.model.run(self.sess, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)
                if eval is not None:
                    logging.info("Not testing on dev but on special function.")
                    result = eval()
                else:
                    # restart development data
                    dev.restart()
                    result = (corr[0] / corr[1]) * 100
                    accuracy_per_epoch.append(corr[0] / corr[1])
                self.params.params["trained_lines"] = self.train_set.get_trained_lines()
                self.model.save(self.sess, self.params.params["step"], result)
                print(
                    "Iter {0}, Total correctness: {1} % {2}, time per step: {3} s, total time: {4} min, {5}"
                    .format(
                        self.params.params["step"] * self.params.get("batch_size"),
                        result, corr,
                        (c_time - cycle_time) / self.params.get("steps_per_checkpoint"),
                        int((time.time() - start) / 60),
                        time.strftime("%H:%M:%S")))
                cycle_time = time.time()
                # if stop already is True do not change it
                stop = stop or self.chech_stopfile("STOP_MODEL")
                if self.params.params["step"] >= self.params.get("max_iters"):
                    stop = True
            # check if the training file was finished and if so, start over
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)
                summ = self.sess.run(
                    self.model.performance_summaries,
                    feed_dict={
                        self.model.tf_loss_ph: avg_loss,
                        self.model.tf_accuracy_ph: avg_test_accuracy
                    })
                self.model.sum_writer.add_summary(summ, self.params.get('epochs'))
                loss_per_epoch.clear()
                accuracy_per_epoch.clear()
                self.params.params["epochs"] += 1
                logging.info("Generator read training file completely and starts over")
                self.train_set.restart()
        print("Training finished in " + str(int(time.time() - start)) + " s")

    def chech_stopfile(self, filename):
        """Return True if ``filename`` contains a stop command for this corpus.

        A line equal to the corpus name is treated as the command; when found,
        it is removed from the file so the command fires only once.
        (Method name misspelling kept — callers use it.)
        """
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")
                    stop = True
                    break
        if stop:
            # remove command from file; context managers replace the original
            # unclosed-on-exception open/close pairs
            with open(filename, "r") as f:
                lines = f.readlines()
            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)
        return stop