def main():
    """Entry point: set up the experiment dir, optionally train, then
    evaluate the best (or current) model on the test set.

    Fixes over the previous revision: bare ``except:`` clauses (which also
    swallow KeyboardInterrupt/SystemExit) narrowed to ``except Exception``,
    and ``!= None`` comparisons replaced with ``is not None``.
    """
    input_dim = 6
    spatial_dims = [0, 1, 2]
    args = utils.read_args()
    experiment_dir = utils.get_experiment_dir(args.name, args.run)
    utils.initialize_experiment_if_needed(experiment_dir, args.evaluate)
    # Logger will print to stdout and logfile
    utils.initialize_logger(experiment_dir)

    # Optionally restore arguments from previous training
    # Useful if training is interrupted
    if not args.evaluate:
        try:
            args = utils.load_args(experiment_dir)
        except Exception:
            # No saved args yet: initialize bookkeeping fields and persist them
            args.best_tpr = 0.0
            args.nb_epochs_complete = 0  # Track in case training interrupted
            utils.save_args(experiment_dir, args)  # Save initial args

    net = utils.create_or_restore_model(
        experiment_dir, args.nb_hidden, args.nb_layer, input_dim, spatial_dims)
    if torch.cuda.is_available():
        net = net.cuda()
        logging.warning("Training on GPU")
        logging.info("GPU type:\n{}".format(torch.cuda.get_device_name(0)))

    criterion = nn.functional.binary_cross_entropy

    if not args.evaluate:
        assert args.train_file is not None
        assert args.val_file is not None
        train_loader = construct_loader(args.train_file,
                                        args.nb_train,
                                        args.batch_size,
                                        shuffle=True)
        valid_loader = construct_loader(args.val_file,
                                        args.nb_val,
                                        args.batch_size)
        logging.info("Training on {} samples.".format(
            len(train_loader) * args.batch_size))
        logging.info("Validate on {} samples.".format(
            len(valid_loader) * args.batch_size))
        train(net, criterion, args, experiment_dir, train_loader, valid_loader)

    # Perform evaluation over test set
    try:
        net = utils.load_best_model(experiment_dir)
        logging.warning("\nBest model loaded for evaluation on test set.")
    except Exception:
        # Best-effort: fall back to whatever model is currently in memory
        logging.warning(
            "\nCould not load best model for test set. Using current.")
    assert args.test_file is not None
    test_loader = construct_loader(args.test_file,
                                   args.nb_test,
                                   args.batch_size)
    test_stats = evaluate(net, criterion, experiment_dir, args,
                          test_loader, TEST_NAME)
def evaluate(sys_argv):
    """CLI entry point: parse the command line and evaluate the chosen
    model directory at the chosen epoch."""
    from utils import read_args
    cli_opts = read_args(args_as_a_list=sys_argv[1:])
    from utils.train import models_path
    # Delegate to the directory-based evaluation routine.
    evaluate_model_dir_path(
        models_dir_path=models_path,
        model_dir_path=cli_opts.model_path,
        model_epoch_dir_path=cli_opts.model_epoch_path,
    )
def xnlp_experiments(sys_argv):
    """CLI entry point: parse the command line (xnlp mode) and run the
    xnlp experiment on the chosen model directory/epoch."""
    from utils import read_args
    cli_opts = read_args(args_as_a_list=sys_argv[1:], for_xnlp=True)
    from utils.train import models_path
    # Results are unpacked but not used further in this scope
    # (matches the original behavior).
    model, data_dict, id_to_tag, word_to_id, stats_dict = do_xnlp(
        models_dir_path=models_path,
        model_dir_path=cli_opts.model_path,
        model_epoch_dir_path=cli_opts.model_epoch_path,
    )
def start_webapp(sys_argv):
    """Parse CLI args, build the Tornado app, and block on the IO loop.

    Fix: use ``isinstance`` instead of the ``type(...) == int`` anti-pattern
    (the latter also rejects int subclasses).
    """
    from utils import read_args
    opts = read_args(args_as_a_list=sys_argv[1:])
    assert isinstance(opts.port, int)
    print("Creating app object")
    app = make_app(opts)
    print("Listening")
    app.listen(opts.port)
    print("Starting the loop")
    # Blocks until the IO loop is stopped externally.
    tornado.ioloop.IOLoop.current().start()
def predict_from_stdin(sys_argv):
    # Interactive prediction loop: restore a pretrained model, then read
    # lines from stdin until EOF, running each through the model.
    from utils import read_args
    opts = read_args(args_as_a_list=sys_argv[1:])
    from utils.train import models_path
    # NOTE: `opts` is deliberately rebound to the (possibly updated) options
    # returned alongside the restored model.
    model, opts, parameters = initialize_model_with_pretrained_parameters(
        opts.model_path, opts.model_epoch_path, models_path)
    line = sys.stdin.readline()
    while line:  # readline() returns "" at EOF, ending the loop
        # "ali ata bak\ndeneme deneme"
        # NOTE(review): `.decode("utf8")` implies Python 2 byte strings from
        # stdin; under Python 3 readline() returns str and this would raise
        # AttributeError — confirm intended interpreter.
        predict_sentences_given_model(line.decode("utf8"), model)
        line = sys.stdin.readline()
def build_model(self, netpath: str = None):
    """Build (or load) the network and collect its trainable parameters.

    When ``self.args.net == "load"``, the saved configuration (args.txt in
    the checkpoint's directory under ``results/``) is read, checked against
    the current args, and the state dict is loaded; otherwise a fresh net
    is created and its weights initialized.

    :param netpath: path of a saved state dict, relative to ``results/``;
        required (and only used) when ``self.args.net == "load"``.
    """
    if self.outchannel is None:
        # Default the output channel count from the image's channel axis.
        self.outchannel = self.img_.shape[1]
    if self.args.net == "load":
        # args.txt lives next to the checkpoint file, one directory up.
        _args = u.read_args(os.path.join('results', *netpath.split('/')[:-1], "args.txt"))
        # Refuse to load weights into a differently-configured network.
        assert net_args_are_same(self.args, _args)
        self.net = get_net(_args, self.outchannel).type(self.dtype)
        self.net.load_state_dict(torch.load(os.path.join('results', netpath)))
    else:
        self.net = get_net(self.args, self.outchannel).type(self.dtype)
        u.init_weights(self.net, self.args.inittype, self.args.initgain)
    # self.net = self.net.type(self.dtype)
    #
    # if self.args.net != 'load':
    #     u.init_weights(self.net, self.args.inittype, self.args.initgain)
    self.parameters = u.get_params('net', self.net, self.input_)
    # Total number of trainable scalars (for logging/reporting).
    self.num_params = sum(np.prod(list(p.size())) for p in self.net.parameters())
def testar(ntwk, rodada, dados_carregado, largura):
    """Evaluate ``ntwk`` over every sample of the round-*rodada* test base.

    Loads the pickled base for the given round, formats the input/target
    pairs, runs the network on each sample and compares the decoded
    prediction against the expected label.

    :param ntwk: trained network exposing ``tester(x)``.
    :param rodada: round number selecting the pickle file to load.
    :param dados_carregado: preloaded data passed through to the formatter.
    :param largura: sample width passed through to the formatter.
    :return: ``(acertos, erros)`` — hit and miss counts.

    Fixes over the previous revision: removed the dead outer
    ``for c in range(1)`` loop (a single pass, executed once), dropped
    unused locals, and removed redundant parentheses in the comparison.
    """
    with open('base_BF_rod' + str(rodada) + '.pkl', 'rb') as f:
        base = dill.load(f)
    lista, y_train = base[0], base[2]
    args = utils.read_args('configs/default.ast')
    chars = utils.mapeamento_palavra()
    num_classes = len(chars)
    printer = utils.Printer(chars)
    data_x, data_y = formata_padrao_entrada_saida(num_classes, y_train,
                                                  dados_carregado, lista,
                                                  largura)
    acertos = 0
    erros = 0
    for i in range(len(lista)):
        x = data_x[i]
        y = data_y[i]
        _, esperado = printer.yprint2(y)
        saida_obtida, _ = ntwk.tester(x)
        rotulo_pred, retornado2 = printer.rotulo_(saida_obtida)
        resultado = verificar(retornado2)
        if esperado == resultado:
            acertos += 1
        else:
            erros += 1
    return acertos, erros
def main():
    """Driver: read config, build/train the classifier if requested, then
    optionally evaluate the best model and plot a confusion matrix."""
    # Read configuration and make sure output directories exist.
    args = u.read_args()
    u.create_directories(args)

    # Classification model factory.
    classifier = Classifier(args)

    # Training branch: build the model, save its diagram, train + validate.
    if args['train']:
        model = classifier.build()
        plot_model(model,
                   to_file=args['exp_dir'] + 'modelimage' + '.png',
                   show_layer_names=False,
                   show_shapes=False)
        trainer = Train(model, args)
        trainer.train()
        trainer.validate()

    # Test branch: reload data (no model build) and evaluate.
    if args['test']:
        trainer = Train(None, args)
        trainer.validate()
        true, predicted = trainer.test()

        # Confusion matrix over the binary classes.
        class_names = ['0', '1']
        cf = confusion_matrix(true, predicted)
        plt.figure()
        u.plot_confusion_matrix(
            cf,
            classes=class_names,
            normalize=False,
            title='Confusion matrix, without normalization')
# http://rosalind.info/problems/fib/


def rabbit_pairs(n, k):
    """Return the number of rabbit pairs alive after *n* months.

    Each mature pair produces *k* new pairs per month; pairs take two
    months to mature (generalized Fibonacci recurrence).

    :param n: number of months (for n <= 2 the answer is 1).
    :param k: number of offspring pairs one pair produces.
    """
    wabbits = [1, 1]
    while len(wabbits) < n:
        wabbits.append(wabbits[-1] + wabbits[-2] * k)
    return wabbits[-1]


if __name__ == "__main__":
    from utils import read_args

    # n = number of months
    # k = number of offspring pairs one pair produces
    n, k = read_args(2)
    print(rabbit_pairs(n, k))
def run_a_single_configuration_without_fabric(crf, lr_method, dropout,
                                              char_dim, char_lstm_dim,
                                              morpho_tag_dim, morpho_tag_lstm_dim,
                                              morpho_tag_type, morpho_tag_column_index,
                                              word_dim, word_lstm_dim, cap_dim,
                                              separate_bilstms, skip_testing,
                                              max_epochs,
                                              train_filepath, dev_filepath, test_filepath,
                                              embeddings_filepath, reload,
                                              _run):
    # Build a `python train.py ...` command line from the given hyperparameters,
    # run it as a subprocess, and scrape its stdout for per-epoch average cost
    # and dev/test scores, recording them into the sacred run's info dict.
    # Returns the model subpath derived from the parameter set.
    # NOTE(review): Python 2 code (print statements); MongoObserver import
    # below appears unused in this function — confirm.
    from sacred.observers import MongoObserver
    """
    python train.py --pre_emb ../../data/we-300.txt --train dataset/tr.train
    --dev dataset/tr.test --test dataset/tr.test --word_dim 300
    --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100 --crf 1
    --lr_method=sgd-lr_0.01 --maximum-epochs 100 --char_dim 200
    --char_lstm_dim 200 --char_bidirect 1 --morpho_tag_dim 100
    --morpho_tag_lstm_dim 100 --morpho_tag_type char
    --overwrite-mappings 1 --batch-size 5
    """
    execution_part = "python train.py "

    # Pretrained embeddings are only passed when word embeddings are enabled.
    if word_dim == 0:
        embeddings_part = ""
    else:
        embeddings_part = "--pre_emb ../../datasets/%s " % embeddings_filepath

    print(train_filepath, dev_filepath, test_filepath, skip_testing, max_epochs)

    # Arguments that do not vary across the hyperparameter sweep.
    always_constant_part = "-T ../../datasets/%s " \
                           "-d ../../datasets/%s " \
                           "-t ../../datasets/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--maximum-epochs %d " % (train_filepath,
                                                     dev_filepath,
                                                     test_filepath,
                                                     embeddings_part,
                                                     skip_testing,
                                                     max_epochs)

    # Hyperparameter-specific arguments appended to the constant part.
    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d "\
        "--cap_dim %d "\
        "--separate-bilstms %d "\
        "--reload %d" % (crf, lr_method, dropout,
                         char_dim, char_lstm_dim,
                         morpho_tag_dim, morpho_tag_lstm_dim,
                         morpho_tag_type, morpho_tag_column_index,
                         word_dim, word_lstm_dim,
                         cap_dim, separate_bilstms,
                         reload)

    # NOTE(review): assigned but unused in this function.
    tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info
    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    # Re-parse the command line we just built to derive the model subpath.
    opts = read_args(commandline_args.split(" "))
    # print opts
    parameters = form_parameters_dict(opts)
    # print parameters
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    # Metric containers keyed by epoch (as strings) in the sacred run info.
    _run.info['costs'] = dict()
    _run.info['best_performances'] = dict()
    _run.info['starting'] = 1

    dummy_prefix = ""
    print dummy_prefix + execution_part + commandline_args
    # Merge stderr into stdout so a single stream is scraped below.
    process = subprocess.Popen(
        (dummy_prefix + execution_part + commandline_args).split(" "),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    def record_metric(_run, epoch, samples, label, value):
        # A metric may be reported several times per epoch; append each value.
        if str(epoch) in _run.info[label]:
            _run.info[label][str(epoch)].append(value)
        else:
            _run.info[label][str(epoch)] = list()
            _run.info[label][str(epoch)].append(value)

    # Stream the subprocess output, echoing it and extracting metrics.
    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        m = re.match(
            "^Epoch (\d+): (\d+) Samples read. Avg. cost: ([^,]+), Scores on dev: ([^,]+), (.+)$",
            line)
        if m:
            epoch = int(m.group(1))
            samples = int(m.group(2))
            epoch_avg_cost = float(m.group(3))
            # When testing is skipped (or dev == test) the dev score is the
            # reported performance; otherwise use the test-score group.
            if skip_testing == 1 or dev_filepath == test_filepath:
                epoch_performance = float(m.group(4))
            else:
                epoch_performance = float(m.group(5))
            record_metric(_run, epoch, samples, "costs", epoch_avg_cost)
            record_metric(_run, epoch, samples, "best_performances",
                          epoch_performance)
        sys.stdout.flush()

    # for epoch in range(max_epochs):
    #     epoch_cost = subprocess.check_output(("tail -1 %s" % os.path.join("models", model_path, "epoch-%08d" % epoch, "epoch_cost.txt")).split(" "))
    #     best_performances = subprocess.check_output(("cat %s" % os.path.join("models", model_path, "epoch-%08d" % epoch, "best_performances.txt")).split(" "))
    #     print "EPOCHCOST: " + epoch_cost
    #     _run.info['costs'][str(epoch)] = float(epoch_cost.strip())
    #     print "BESTPERF: " + best_performances
    #     if skip_testing == 1 or dev_filepath == test_filepath:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[0])
    #     else:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[1])

    return model_path
# Training-script preamble: read the config(s) given on the command line,
# derive the output/network file names, and select the labeler.
import sys
from datetime import datetime as dt
import editdistance
import numpy as np
import theano as th
import rnn_ctc.neuralnet as nn
# from parscribe import ParScribe as Scribe
from scribe import Scribe
import utils
import telugu as lang
import utils  # NOTE(review): duplicate of the `import utils` above; harmless but removable

############################################ Read Args
args = utils.read_args(sys.argv[1:])
num_samples, num_epochs = args['num_samples'], args['num_epochs']
scribe_args, nnet_args = args['scribe_args'], args['nnet_args']

# Output file name is derived from the config file names (sorted for
# determinism), with path/extension noise stripped.
if len(sys.argv) > 1:
    output_fname = '-'.join(sorted(sys.argv[1:]))
    output_fname = output_fname.replace('.ast', '').replace('/', '').replace('configs', '')
else:
    output_fname = "default"

network_fname = '{}.pkl'.format(output_fname)
# Timestamp the log file so reruns never clobber each other.
output_fname += '_' + dt.now().strftime('%y%m%d_%H%M') + '.txt'
distances, wts = [], []
print("Output will be written to: ", output_fname)

# Initialize Language
lang.select_labeler(args['labeler'])
import dill with open('BLSTM.pkl', 'rb') as pkl_file: layer2, layer1, image = dill.load(pkl_file) rodada = 10 # largura = 60 with open('base_BF_rod' + str(rodada) + '_' + nome_arquivo + '_teste.pkl', 'rb') as f: base = dill.load(f) lista, y_train = base[0], base[2] dados_carregado = cd.Carrega() args = utils.read_args('configs/default.ast') num_epochs, nnet_args = args['num_epochs'], args['nnet_args'] chars = utils.mapeamento_palavra() num_classes = len(chars) num_samples = len(lista) printer = utils.Printer(chars) data_x, data_y = [], [] for indice in range(len(lista)): y = utils.classe(y_train[indice]) # Recupera a palavra y = utils.palavra_indice(y) y1 = utils.insere_blanks(y, num_classes) data_y.append(np.asarray(y1, dtype=np.int32)) _, sinal = aux.deslocamento_amostra(lista[indice], larg=70)
def train_a_single_configuration(
        datasets_root,
        crf, lr_method, batch_size, sparse_updates_enabled, dropout,
        char_dim, char_lstm_dim,
        morpho_tag_dim, morpho_tag_lstm_dim, morpho_tag_type,
        morpho_tag_column_index,
        word_dim, word_lstm_dim, cap_dim,
        skip_testing, max_epochs,
        train_filepath, dev_filepath, test_filepath,
        yuret_train_filepath, yuret_test_filepath,
        train_with_yuret, test_with_yuret,
        use_golden_morpho_analysis_in_word_representation,
        embeddings_filepath,
        integration_mode, active_models, multilayer, shortcut_connections,
        reload,
        dynet_gpu,
        _run):
    # Build a `python main.py --command train ...` command line from the given
    # hyperparameters, run it as a subprocess, and scrape its stdout for
    # per-epoch best dev/test F-scores per task, recording them into the
    # sacred run's info dict. Returns the derived model subpath.
    # NOTE(review): Python 2 code (print statements).
    """
    python train.py --pre_emb ../../data/we-300.txt
    --train dataset/gungor.ner.train.only_consistent
    --dev dataset/gungor.ner.dev.only_consistent
    --test dataset/gungor.ner.test.only_consistent
    --word_dim 300 --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100
    --crf 1 [email protected] --maximum-epochs 50
    --char_dim 200 --char_lstm_dim 200 --char_bidirect 1
    --overwrite-mappings 1 --batch-size 1 --morpho_tag_dim 100
    --integration_mode 2
    """
    execution_part = "python main.py --command train --overwrite-mappings 1 "

    # Optional feature flags appended ahead of the argument string.
    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "
    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "
    if train_with_yuret == 1:
        execution_part += "--train_with_yuret "
    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "

    # Pretrained embeddings only apply when word embeddings are enabled.
    if word_dim == 0:
        embeddings_part = ""
    else:
        if embeddings_filepath:
            embeddings_part = "--pre_emb %s/%s " % (datasets_root, embeddings_filepath)
        else:
            embeddings_part = ""

    print (train_filepath, dev_filepath, test_filepath, skip_testing, max_epochs)

    # Arguments that do not vary across the hyperparameter sweep.
    # NOTE(review): --train_with_yuret may be emitted both here and in
    # execution_part above when train_with_yuret == 1 — confirm intended.
    always_constant_part = "-T %s/%s " \
                           "-d %s/%s " \
                           "-t %s/%s " \
                           "%s" \
                           "%s" \
                           "--yuret_train %s/%s " \
                           "--yuret_test %s/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--maximum-epochs %d " % (datasets_root, train_filepath,
                                                     datasets_root, dev_filepath,
                                                     datasets_root, test_filepath,
                                                     "--train_with_yuret " if train_with_yuret else "",
                                                     "--test_with_yuret " if test_with_yuret else "",
                                                     datasets_root, yuret_train_filepath,
                                                     datasets_root, yuret_test_filepath,
                                                     embeddings_part,
                                                     skip_testing,
                                                     max_epochs)

    # Hyperparameter-specific arguments appended to the constant part.
    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--batch-size %d " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d "\
        "--cap_dim %d "\
        "--integration_mode %d " \
        "--active_models %d " \
        "--multilayer %d " \
        "--shortcut_connections %d " \
        "--reload %d" % (crf, lr_method, batch_size, dropout,
                         char_dim, char_lstm_dim,
                         morpho_tag_dim, morpho_tag_lstm_dim,
                         morpho_tag_type, morpho_tag_column_index,
                         word_dim, word_lstm_dim,
                         cap_dim,
                         integration_mode, active_models,
                         multilayer, shortcut_connections,
                         reload)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info
    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    # Re-parse the command line we just built to derive the model subpath.
    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print opts
    parameters = form_parameters_dict(opts)
    print parameters
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    # Per-task metric containers keyed by epoch (as strings).
    task_names = ["NER", "MORPH", "YURET"]
    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()
    _run.info['starting'] = 1

    dummy_prefix = ""
    full_commandline = dummy_prefix + execution_part + commandline_args
    print full_commandline
    # Merge stderr into stdout so a single stream is scraped below.
    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        # A metric may be reported several times per epoch; append each value.
        if str(epoch) in _run.info[label]:
            _run.info[label][str(epoch)].append(value)
        else:
            _run.info[label][str(epoch)] = list()
            _run.info[label][str(epoch)].append(value)

    def capture_information(line):  # 1
        """
        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf
        """
        for task_name in task_names:
            m = re.match("^%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" % task_name,
                         line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))
                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)

    # Stream the subprocess output, echoing it and extracting metrics.
    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        capture_information(line)
        sys.stdout.flush()

    return model_path
# Evaluation-script preamble: parse CLI options and validate them before
# any expensive dataset/model work starts.
import numpy as np
import tensorflow as tf

from utils import read_args, form_parameters_dict, models_path, eval_script, eval_temp, iobes_iob
import loader
from loader import calculate_global_maxes, update_tag_scheme, \
    word_mapping, augment_with_pretrained, char_mapping, tag_mapping, prepare_dataset
from model_tensorflow import Model

# NOTE(review): `logging` and `os` are used below but not imported in this
# chunk — presumably imported earlier in the file; confirm.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("eval")

# Read parameters from command line
opts = read_args(evaluation=True)

# Parse parameters
parameters = form_parameters_dict(opts)

# Check parameters validity: dataset files must exist and hyperparameters
# must be within their legal ranges.
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['t_s'] in ['iob', 'iobes']
# Pretrained embeddings imply a positive word dim and an existing file.
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])
self.err_best_g = self.err[j] self.update(args) if args.verbose: print("#", i + 1, "\tBest Solution:\t ", self.err_best_g) i += 1 return self.err_best_g, self.pos_best_g if __name__ == "__main__": initial = [] bounds = [] args = read_args() if args.fn == 1: fn = fn1 else: print("ERROR : FUNCTION NOT FOUND") box_limit = [-args.box, args.box] for i in range(args.d): initial.append(args.x0) bounds.append(box_limit) pso = Swarm(args, bounds) start = time.time()
# Demo script: render random text images with the Scribe generator and
# display them in the terminal, one per Enter keypress, until Ctrl-D.
import sys
sys.path.append("..")
from utils import slab_print, read_args
import telugu as language
import scribe

args = read_args(sys.argv[1:], default='../configs/default.ast')
scriber = scribe.Scribe(language, **args['scribe_args'])

try:
    while True:
        image, text, labels = scriber.get_text_image()
        # Print the image as character "slabs" in the terminal.
        slab_print(image)
        print(image.shape)
        print(labels)
        # print("Twist: {:.3f}".format(angle), fp)
        # print(text)
        print(scriber)
        print("Press Enter to continue and Ctrl-D to quit.")
        input()
except (KeyboardInterrupt, EOFError):
    # Normal exit paths for an interactive demo.
    pass
# Training-script preamble: parse CLI options and validate them before
# any expensive dataset/model work starts.
import tensorflow as tf

import loader
from loader import augment_with_pretrained, calculate_global_maxes
from loader import update_tag_scheme, prepare_dataset
from loader import word_mapping, char_mapping, tag_mapping
# from model import Model
from model_tensorflow import Model
from utils import models_path, evaluate, eval_script, eval_temp
from utils import read_args, form_parameters_dict

# NOTE(review): `logging` and `os` are used below but not imported in this
# chunk — presumably imported earlier in the file; confirm.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

# Read parameters from command line
opts = read_args()

# Parse parameters
parameters = form_parameters_dict(opts)

# Check parameters validity: dataset files must exist and hyperparameters
# must be within their legal ranges.
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['t_s'] in ['iob', 'iobes']
# Pretrained embeddings imply a positive word dim and an existing file.
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])
def show_results(res_dir: "Path | str", opts: dict = None, curves: int = 0, savefig=False):
    """Plot the reconstructed output volume for a run, and optionally the
    per-patch training curves.

    :param res_dir: results directory containing ``args.txt`` (and history).
    :param opts: options forwarded to ``u.explode_volume``; ``clipval`` and
        ``save_opts`` entries are filled with defaults when missing.
    :param curves: number of per-patch history curves to plot (0 = none).
    :param savefig: when True, save figures into *res_dir* as well.
    """
    res_dir = Path(res_dir)
    args = u.read_args(res_dir / "args.txt")
    print(args.__dict__)
    inputs = np.load(os.path.join(args.imgdir, args.imgname), allow_pickle=True)

    if opts is None:
        opts = dict()
    if 'clipval' not in opts.keys():
        # Default color clipping at the 98th percentile of input amplitudes.
        opts['clipval'] = u.clim(inputs, 98)
    if 'save_opts' not in opts.keys():
        opts['save_opts'] = {'format': 'png', 'dpi': 150, 'bbox_inches': 'tight'}

    outputs, hist = reconstruct_patches(args, return_history=True, verbose=True)

    if outputs.shape != inputs.shape:
        # Crop inputs so both volumes can be compared element-wise below.
        print("\n\tWarning! Outputs and Inputs have different shape! %s - %s"
              % (outputs.shape, inputs.shape))
        inputs = inputs[:outputs.shape[0], :outputs.shape[1]]
        if inputs.ndim == 3:
            inputs = inputs[:, :, :outputs.shape[2]]

    # plot output volume
    if savefig:
        u.explode_volume(outputs, filename=res_dir / "output", **opts)
    else:
        u.explode_volume(outputs, **opts)

    # plot curves
    if curves > 0:
        # Sample at most `curves` patch histories to keep the figure readable.
        if len(hist) <= curves:
            idx = range(len(hist))
        else:
            idx = sample(range(len(hist)), curves)
            idx.sort()
        fig, axs = plt.subplots(1, 4, figsize=(18, 4))
        for i in idx:
            axs[0].plot(hist[i].loss, label='patch %d' % i)
            axs[1].plot(hist[i].snr, label='patch %d' % i)
            axs[2].plot(hist[i].pcorr, label='patch %d' % i)
            try:
                axs[3].plot(hist[i].lr, label='patch %d' % i)
            except AttributeError:
                # Histories without a recorded learning rate are skipped.
                pass
        try:
            axs[0].set_title('LOSS %s' % args.loss)
        except AttributeError:
            # Older runs have no `loss` attribute in args; assume mae.
            axs[0].set_title('LOSS mae')
        axs[1].set_title('SNR = %.2f dB' % u.snr(outputs, inputs))
        axs[2].set_title('PCORR = %.2f %%' % (u.pcorr(outputs, inputs) * 100))
        axs[3].set_title('Learning Rate')
        for a in axs:
            a.legend()
            a.set_xlim(0, args.epochs)
            a.grid()
        axs[0].set_ylim(0)
        axs[1].set_ylim(0)
        axs[2].set_ylim(0, 1)
        axs[3].set_ylim(0, args.lr * 10)
        plt.suptitle(res_dir)
        plt.tight_layout(pad=.5)
        if savefig:
            plt.savefig(res_dir / f"curves.{opts['save_opts']['format']}",
                        **opts['save_opts'])
        plt.show()
def main(argv=None):  # pylint: disable=unused-argument
    # Evaluation driver (Python 2 / TF1): parse and validate CLI options,
    # load and index the datasets, build the tagger graph, then run the
    # TensorFlow evaluation loop over dev/test buckets.
    # if tf.gfile.Exists(FLAGS.eval_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    # tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Read parameters from command line
    opts = read_args(evaluation=True)

    # Parse parameters
    parameters = form_parameters_dict(opts)

    # Check parameters validity
    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)
    assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
    assert 0. <= parameters['dropout'] < 1.0
    assert parameters['t_s'] in ['iob', 'iobes']
    assert not parameters['all_emb'] or parameters['pre_emb']
    assert not parameters['pre_emb'] or parameters['word_dim'] > 0
    assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

    # Check evaluation script / folders
    if not os.path.isfile(eval_script):
        raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
    if not os.path.exists(eval_temp):
        os.makedirs(eval_temp)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    event_logs_path = os.path.join(eval_temp, "eval_logs")
    # if not os.path.exists(event_logs_path):
    #     os.makedirs(event_logs_path)

    # Initialize model
    model = MainTaggerModel(parameters=parameters, models_path=models_path,
                            overwrite_mappings=opts.overwrite_mappings)
    print "MainTaggerModel location: %s" % model.model_path

    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['t_s']

    max_sentence_lengths = {}
    max_word_lengths = {}

    # Load sentences
    train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
        loader.load_sentences(opts.train, lower, zeros)
    dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = loader.load_sentences(
        opts.dev, lower, zeros)
    test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = loader.load_sentences(
        opts.test, lower, zeros)

    # Global maxima drive the fixed-size padding used by prepare_dataset.
    global_max_sentence_length, global_max_char_length = \
        calculate_global_maxes(max_sentence_lengths, max_word_lengths)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])
            ) if not parameters['all_emb'] else None
        )
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    if opts.overwrite_mappings:
        print 'Saving the mappings to disk...'
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

    model.reload_mappings()

    # Index data
    train_buckets, train_stats, train_unique_words = prepare_dataset(
        train_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )
    dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
        dev_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )
    test_buckets, test_stats, test_unique_words = prepare_dataset(
        test_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )

    # Dataset statistics for logging.
    print "%i / %i / %i sentences in train / dev / test." % (
        len(train_stats), len(dev_stats), len(test_stats))
    print "%i / %i / %i words in train / dev / test." % (
        sum([x[0] for x in train_stats]), sum([x[0] for x in dev_stats]),
        sum([x[0] for x in test_stats]))
    print "%i / %i / %i longest sentences in train / dev / test." % (
        max([x[0] for x in train_stats]), max([x[0] for x in dev_stats]),
        max([x[0] for x in test_stats]))
    print "%i / %i / %i shortest sentences in train / dev / test." % (
        min([x[0] for x in train_stats]), min([x[0] for x in dev_stats]),
        min([x[0] for x in test_stats]))

    for i, label in [[2, 'char']]:
        print "%i / %i / %i total %s in train / dev / test." % (
            sum([sum(x[i]) for x in train_stats]),
            sum([sum(x[i]) for x in dev_stats]),
            sum([sum(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i max. %s lengths in train / dev / test." % (
            max([max(x[i]) for x in train_stats]),
            max([max(x[i]) for x in dev_stats]),
            max([max(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i min. %s lengths in train / dev / test." % (
            min([min(x[i]) for x in train_stats]),
            min([min(x[i]) for x in dev_stats]),
            min([min(x[i]) for x in test_stats]),
            label)

    print "Max. sentence lengths: %s" % max_sentence_lengths
    print "Max. char lengths: %s" % max_word_lengths

    # Rough memory-footprint estimate per split.
    # NOTE(review): `bin_stats` is unpacked but unused; int32_items uses
    # len(train_stats) for every split — confirm whether that is intended.
    for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                             ['dev', dev_stats, dev_unique_words],
                                             ['test', test_stats, test_unique_words]]:
        int32_items = len(train_stats) * (
            max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
        float32_items = n_unique_words * parameters['word_dim']
        total_size = int32_items + float32_items
        logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
        logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
            n_unique_words, label, float32_items))
        logging.info("Total size of the %s dataset is %d" % (label, total_size))

    batch_size = 5

    # Build the model
    cost, train_step, tag_scores, tag_ids, word_ids, \
        crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
            max_sentence_length_scalar=global_max_sentence_length,
            max_word_length_scalar=global_max_char_length,
            batch_size_scalar=batch_size,
            **parameters)

    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                               """Directory where to write event logs.""")
    tf.app.flags.DEFINE_string('eval_data', 'test',
                               """Either 'test' or 'train_eval'.""")
    tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                               """Directory where to read model checkpoints.""")
    tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                                """How often to run the eval.""")
    tf.app.flags.DEFINE_integer('num_examples', 10000,
                                """Number of examples to run.""")
    tf.app.flags.DEFINE_boolean('run_once', False,
                                """Whether to run eval only once.""")

    evaluate(model, dev_buckets, test_buckets, FLAGS, opts,
             id_to_tag, batch_size,
             placeholders, enqueue_op, tag_scores, tag_ids, word_ids,
             crf_transition_params, sentence_lengths,
             FLAGS.eval_dir, tag_scheme)
def prob(a, b):
    """Probability that a child of parents with genotypes *a* and *b*
    carries the allele (counted over the cross product of their alleles)."""
    # return a cross product of the two tuples
    children = list(itertools.product(a, b))
    # list(...) around filter keeps len() valid on Python 3 as well
    # (filter returns an iterator there); identical result on Python 2.
    return 1.0 * len(list(filter(has_allele, children))) / len(children)


def generate_prob_table():
    """Precompute unordered genotype pairing -> probability of the allele."""
    pairings = itertools.combinations_with_replacement([dg, hg, rg], 2)
    probability_table = dict((p, prob(*p)) for p in pairings)
    return probability_table


dg = 1, 1  # homozygous dominant
hg = 1, 0  # heterozygous
rg = 0, 0  # homozygous recessive

prob_table = generate_prob_table()

# up to here we are just precomputing the probabilities.
# its not even necessary honestly, a bit of premature optimization

d, h, r = read_args(3)
population = itertools.chain([dg] * d, [hg] * h, [rg] * r)
# A list comprehension (not a bare map) so len() below works on Python 3 too.
probs = [prob_table[pair] for pair in itertools.combinations(population, 2)]
# since we're already dividing by 4 in prob(), we aren't talking about
# integer numbers, so we take the mean
# generally we can't do this, but the values cancel out and it works out here
print('%.5f' % (sum(probs) / len(probs)))
import pickle import sys import numpy as np import theano as th from nnet.neuralnet import NeuralNet import utils # th.config.optimizer = 'fast_compile' # th.config.exception_verbosity='high' ################################### Main Script ########################### print('Loading the dataset.') with open(sys.argv[1], 'rb') as pkl_file: data = pickle.load(pkl_file) args = utils.read_args(sys.argv[2:]) num_epochs, train_on_fraction = args['num_epochs'], args['train_on_fraction'] scribe_args, nnet_args, = args['scribe_args'], args['nnet_args'], chars = data['chars'] num_classes = len(chars) img_ht = len(data['x'][0]) num_samples = len(data['x']) nTrainSamples = int(num_samples * train_on_fraction) printer = utils.Printer(chars) print('\nInput Dim: {}' '\nNum Classes: {}' '\nNum Samples: {}' '\nNum Epochs: {}' '\nFloatX: {}'
def train_a_single_configuration(
        lang_name, datasets_root,
        crf, lr_method, batch_size, sparse_updates_enabled, dropout,
        char_dim, char_lstm_dim,
        morpho_tag_dim, morpho_tag_lstm_dim, morpho_tag_type,
        morpho_tag_column_index,
        word_dim, word_lstm_dim, cap_dim,
        skip_testing, starting_epoch_no, maximum_epochs,
        file_format, debug,
        ner_train_file, ner_dev_file, ner_test_file,
        md_train_file, md_dev_file, md_test_file,
        use_golden_morpho_analysis_in_word_representation,
        embeddings_filepath,
        integration_mode, active_models, multilayer, shortcut_connections,
        reload, model_path, model_epoch_path,
        dynet_gpu,
        _run):
    # Build a `python main.py --command train ...` command line from the given
    # hyperparameters, run it as a subprocess, and scrape its stdout for
    # per-epoch best dev/test F-scores (per task), average training loss, and
    # the model/epoch directory paths, recording everything into the sacred
    # run's info dict. Returns the derived model subpath.
    """
    python train.py --pre_emb ../../data/we-300.txt
    --train dataset/gungor.ner.train.only_consistent
    --dev dataset/gungor.ner.dev.only_consistent
    --test dataset/gungor.ner.test.only_consistent
    --word_dim 300 --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100
    --crf 1 [email protected] --maximum-epochs 50
    --char_dim 200 --char_lstm_dim 200 --char_bidirect 1
    --overwrite-mappings 1 --batch-size 1 --morpho_tag_dim 100
    --integration_mode 2
    """
    execution_part = "python main.py --command train --overwrite-mappings 1 "

    # Optional feature flags appended ahead of the argument string.
    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "
    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "
    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "
    execution_part += "--debug " + str(debug) + " "

    # Pretrained embeddings only apply when word embeddings are enabled.
    if word_dim == 0:
        embeddings_part = ""
    else:
        if embeddings_filepath:
            embeddings_part = "--pre_emb %s/%s " % (datasets_root, embeddings_filepath)
        else:
            embeddings_part = ""

    # Arguments that do not vary across the hyperparameter sweep. Dev files
    # are optional: the corresponding flag is omitted entirely when unset.
    always_constant_part = "--lang_name %s --file_format %s " \
                           "--ner_train_file %s/%s/%s " \
                           "%s" \
                           "--ner_test_file %s/%s/%s " \
                           "--md_train_file %s/%s/%s " \
                           "%s" \
                           "--md_test_file %s/%s/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--starting-epoch-no %d " \
                           "--maximum-epochs %d " % (lang_name, file_format,
                                                     datasets_root, lang_name, ner_train_file,
                                                     ("--ner_dev_file %s/%s/%s " % (datasets_root, lang_name, ner_dev_file)) if ner_dev_file else "",
                                                     datasets_root, lang_name, ner_test_file,
                                                     datasets_root, lang_name, md_train_file,
                                                     ("--md_dev_file %s/%s/%s " % (datasets_root, lang_name, md_dev_file)) if md_dev_file else "",
                                                     datasets_root, lang_name, md_test_file,
                                                     embeddings_part,
                                                     skip_testing,
                                                     starting_epoch_no,
                                                     maximum_epochs)

    # Reload flags only carry the model paths when actually reloading.
    if reload == 1:
        reload_part = "--reload %d --model_path %s --model_epoch_path %s " % (
            reload, model_path, model_epoch_path)
    else:
        reload_part = "--reload 0 "

    # Hyperparameter-specific arguments appended to the constant part.
    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--batch-size %d " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d "\
        "--cap_dim %d "\
        "--integration_mode %d " \
        "--active_models %d " \
        "--multilayer %d " \
        "--shortcut_connections %d " \
        "%s" % (crf, lr_method, batch_size, dropout,
                char_dim, char_lstm_dim,
                morpho_tag_dim, morpho_tag_lstm_dim,
                morpho_tag_type, morpho_tag_column_index,
                word_dim, word_lstm_dim,
                cap_dim,
                integration_mode, active_models,
                multilayer, shortcut_connections,
                reload_part)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print(_run)
    print(_run.info)
    print(subprocess.check_output(["id"]))
    print(subprocess.check_output(["pwd"]))

    # Re-parse the command line we just built to derive the model subpath.
    # NOTE(review): this rebinds the `model_path` parameter.
    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print(opts)
    parameters = form_parameters_dict(opts)
    print(parameters)
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print(model_path)

    # Per-task metric containers keyed by epoch (as strings).
    task_names = ["NER", "MORPH"]
    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()
    _run.info["avg_loss"] = dict()
    _run.info['starting'] = 1

    dummy_prefix = ""
    full_commandline = dummy_prefix + execution_part + commandline_args
    print(full_commandline)
    # Merge stderr into stdout so a single stream is scraped below.
    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        """
        Each label can have multiple values in an epoch. This is for updates to
        the metric's value. i.e. metrics calculated before an epoch has finished.
        :param epoch:
        :param label:
        :param value:
        :return:
        """
        epoch_str = str(epoch)
        if epoch_str in _run.info[label]:
            _run.info[label][epoch_str].append(value)
        else:
            _run.info[label][epoch_str] = [value]

    def capture_information(line):  # 1
        """
        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf
        """
        # Per-task best dev/test F-scores.
        for task_name in task_names:
            m = re.match(
                "^.*%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" % task_name,
                line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))
                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)
        # Average training loss per epoch.
        m = re.match("^.*Epoch (\d+) Avg. loss over training set: (.+)$", line)
        if m:
            epoch = int(m.group(1))
            avg_loss_over_training_set = float(m.group(2))
            record_metric(epoch, "avg_loss", avg_loss_over_training_set)
        """
        MainTaggerModel location: ./models/model-00000227
        """
        m = re.match("^.*MainTaggerModel location: (.+)$", line)
        if m:
            model_dir_path = m.group(1)
            _run.info["model_dir_path"] = model_dir_path
        """
        LOG: model_epoch_dir_path: {}
        """
        m = re.match("^.*LOG: model_epoch_dir_path: (.+)$", line)
        if m:
            model_epoch_dir_path = m.group(1)
            _run.info["model_epoch_dir_path"] = model_epoch_dir_path

    # Stream the (bytes) subprocess output, echoing it and extracting metrics.
    for line in process.stdout:
        sys.stdout.write(line.decode("utf8"))
        capture_information(line.decode("utf8"))
        sys.stdout.flush()

    return model_path