def __init__(self, mode='chat'):
    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
    if mode == 'chat':
        self.__chat_init()
def main():
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--mode', choices={'train', 'chat'},
    #                     default='train', help="mode. if not specified, it's in the train mode")
    # args = parser.parse_args()
    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
def main(args):
    if not os.path.isdir(config.PROCESSED_PATH):
        data.process_data()
    print('Data is ready!')
    data.make_dir(config.CPT_PATH)
    mode = args[-1]
    if mode == 'train':
        train()
    elif mode == 'test':
        predict()
def main():
    lstm = load_model('model/100211_all/lstm.h5')
    gru = load_model('model/100211_all/gru.h5')
    saes = load_model('model/100211_all/saes.h5')
    models = [lstm, gru, saes]
    models = [lstm]  # note: this overrides the list above, so only the LSTM model is evaluated
    names = ['LSTM', 'GRU', 'SAEs']
    lag = 12
    file1 = 'data/100211data/100211_weekend_train.csv'
    file2 = 'data/100211data/100211_weekend_test.csv'
    _, _, X_test, y_test, scaler = process_data(file1, file2, lag)
    y_test = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(1, -1)[0]

    y_preds = []
    for name, model in zip(names, models):
        if name == 'SAEs':
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))
        else:
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
        file = 'images/' + name + '.png'
        plot_model(model, to_file=file, show_shapes=True)
        predicted = model.predict(X_test)
        predicted = scaler.inverse_transform(predicted.reshape(-1, 1)).reshape(1, -1)[0]
        y_preds.append(predicted[0:288])
        print(name)
        eva_regress(y_test, predicted)

    plot_results(y_test[0:288], y_preds, names)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'}, default='train',
                        help="mode. if not specified, it's in train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data Ready!')
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'},
                        default='train', help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
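# Usage sketch for the entry point above (hedged: the script's filename is not given
# in this snippet; `chatbot.py` is assumed purely for illustration):
#
#   python chatbot.py --mode train   # prepares/processes the corpus if needed, then trains
#   python chatbot.py --mode chat    # same preprocessing check, then drops into chat mode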
def get_data():
    data = None
    if 'pd' not in listdir('{}/processedData/'.format(ROOT)):
        data = process_data()
        # pickled objects must be written and read in binary mode
        pickle.dump(data, open('{}/processedData/pd'.format(ROOT), 'wb'))
    else:
        data = pickle.load(open('{}/processedData/pd'.format(ROOT), 'rb'))
    # print("shape of data:", data.shape)
    return data
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'},
                        default='train', help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print("Data ready, starting application")
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
def main(args):
    mode = None
    if len(args) == 2:
        mode, preload = args
    else:
        raise ValueError('Incorrect number of args.')
    if preload == 'none':
        preload = None
    if mode == 'vis':
        data = get_data()
        model = init_model(
            data=[data['embedding_matrix'], data['len_word_index']])
        return visualizer(model)
    if mode == 'train':
        return runner(50)
    if mode == 'confusion':
        model = init_model(preload=preload, declare=False)
        data = process_data()
        embedding_matrix = data['embedding_matrix']
        len_word_index = data['len_word_index']
        x_train, y_train = data['x_train'], data['y_train']
        x_val, y_val = data['x_val'], data['y_val']
        y_train = np.asarray(y_train, dtype=np.float32)
        x_train_1 = np.zeros((y_train.shape[0], max_claim_len))
        x_train_2 = np.zeros((y_train.shape[0], max_text_len))
        x_val_1 = np.zeros((y_val.shape[0], max_claim_len))
        x_val_2 = np.zeros((y_val.shape[0], max_text_len))
        # print(x_train[1][1])
        # split the (claim, text) pairs into separate input arrays
        for i in range(0, len(x_train)):
            x_train_1[i] = x_train[i][0]
        for i in range(0, len(x_train)):
            x_train_2[i] = x_train[i][1]
        for i in range(0, len(x_val)):
            x_val_1[i] = x_val[i][0]
        for i in range(0, len(x_val)):
            x_val_2[i] = x_val[i][1]
        y_true = data['y_val']
        y_pred = model.predict([x_val_1, x_val_2])
        # convert one-hot labels and model outputs to class indices
        y_t = []
        for i in range(0, len(y_true)):
            for j in range(0, 4):
                if y_true[i][j] == 1:
                    y_t.append(j)
        y_p = []
        for i in range(0, len(y_true)):
            for j in range(0, 4):
                if y_pred[i][j] == max(y_pred[i]):
                    y_p.append(j)
        print(len(y_t))
        print(len(y_p))
        print(get_confusion_matrix(y_t, y_p))
    else:
        raise ValueError('Incorrect mode')
def main():
    start_time = time()
    print("Processing data, please wait...")
    process_data(jungle=False, ml=True)
    print(f"\nData processed in {round(time() - start_time, 3)}s.")
    print(
        "Press Enter to process teams in the ml.yaml file or 'q' followed by Enter to quit."
    )
    while True:
        user_input = input()
        if user_input.lower() in ("q", "quit"):
            break
        try:
            blue, red = get_teams_from_yaml()
            process_teams(blue, red)
        except Exception as e:
            traceback.print_tb(sys.exc_info()[2])
            print(e)
            print("[ERROR] Please check your inputs. See above for more info.")
        print("\nWaiting for next input...")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', choices={'train', 'test', 'translate'},
                        nargs='?',  # without this the positional is required and the default is never used
                        default='train', help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        bleu_scores = test()
    elif args.mode == 'translate':
        translate()
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    """
    if hparams.save is not None:
        checkpoint_callback = ModelCheckpoint(
            filepath=hparams.save + '/{epoch:02d}_{eval_acc:.3f}',
            verbose=True,
            monitor='eval_acc',
            mode='max',
            save_top_k=hparams.save_k if not hparams.debug else 0,
            period=0)
    else:
        checkpoint_callback = None

    if hparams.task == 'conj':
        hparams.train_fp = 'data/ptb-train.labels' if hparams.train_fp is None else hparams.train_fp
        hparams.dev_fp = 'data/ptb-dev.labels' if hparams.dev_fp is None else hparams.dev_fp
        hparams.test_fp = 'data/ptb-test.labels' if hparams.test_fp is None else hparams.test_fp
        if hparams.debug:
            hparams.train_fp = hparams.dev_fp = hparams.test_fp = 'data/debug_conj.labels'
    elif hparams.task == 'oie':
        hparams.train_fp = 'data/openie4_labels' if hparams.train_fp is None else hparams.train_fp
        hparams.dev_fp = 'carb/data/dev.txt' if hparams.dev_fp is None else hparams.dev_fp
        hparams.test_fp = 'carb/data/test.txt' if hparams.test_fp is None else hparams.test_fp
        if hparams.debug:
            hparams.train_fp = hparams.dev_fp = hparams.test_fp = 'data/debug_oie.labels'

    hparams.gradient_clip_val = 5 if hparams.gradient_clip_val is None else float(
        hparams.gradient_clip_val)

    train_dataset, val_dataset, test_dataset, meta_data_vocab, all_sentences = data.process_data(
        hparams)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=hparams.batch_size,
                                  collate_fn=data.pad_data,
                                  shuffle=True,
                                  num_workers=1)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=hparams.batch_size,
                                collate_fn=data.pad_data,
                                num_workers=1)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=hparams.batch_size,
                                 collate_fn=data.pad_data,
                                 num_workers=1)

    for process in hparams.mode.split('_'):
        globals()[process](hparams, checkpoint_callback, meta_data_vocab,
                           train_dataloader, val_dataloader, test_dataloader,
                           all_sentences)
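# Note on the dispatch loop above (an illustrative reading, not additional source code):
# hparams.mode is split on '_' and each token is looked up in globals(), so a value such
# as 'train_test' would call the module-level train(...) and then test(...) with the same
# checkpoint callback, vocabulary and dataloaders.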
def main(): """parser = argparse.ArgumentParser() parser.add_argument('--mode', choices={'train', 'chat'}, default='train', help="mode. if not specified, it's in the train mode") args = parser.parse_args()""" if not os.path.isdir(config.PROCESSED_PATH): data.prepare_raw_data() data.process_data() print('Data ready!') # create checkpoints folder if there isn't one already data.make_dir(config.CPT_PATH) mode = input("Input mode (train|chat): ") """if args.mode == 'train': train() elif args.mode == 'chat': chat()""" if mode == 'train': train() else: chat()
def train(FLAGS):
    """
    Train our embeddings.
    """
    # Get data loaders
    print("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words = process_data(
        data_dir=FLAGS.data_dir,
        data_file=FLAGS.data_file,
        vocab_size=FLAGS.vocab_size,
        window_size=FLAGS.window_size,
        split_ratio=FLAGS.split_ratio,
        batch_size=FLAGS.batch_size,
    )
    print("[COMPLETE]")

    # Initialize model, criterion, loss
    print("==> Initializing model components ... ", end="")
    model = MLP(
        D_in=num_unique_words,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
    )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.lr)
    print("[COMPLETE]")

    # Train the model
    print("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
    )
    print("\n[COMPLETE]")

    # Save the model
    print("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print("\n[COMPLETE]")
def make_kfold():
    '''
    For each of the three exams and each of the three neural network models,
    runs training using stratified k-fold cross-validation. Returns a 3D matrix
    that holds, for each combination of exam and network, a list with four
    values: accuracy and loss on the training set and on the validation set.
    '''
    raw_data = load_data()
    n_folds = 5
    resultado = np.zeros((3, 3, 4))  # [tr_lss, tr_acc, vl_lss, vl_acc]
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:
        print('\n-------------------------------------------')
        print('----------- Rede para ' + name + '------------------')
        print('-------------------------------------------\n')
        x, y = process_data(raw_data, exam_type)
        x = x.to_numpy()
        y = y.to_numpy()
        print("Formato conjunto treinamento: {0} e de teste: {1}\n"
              .format(x.shape, y.shape))
        skf = StratifiedKFold(n_splits=n_folds)
        fold_iter = 1
        for train, val in skf.split(x, y):  # each iteration is a new fold
            # (x_train_f, y_train_f, x_val_f, y_val_f)
            fold_data = create_fold(x, y, train, val)
            for model_n in NN:  # each iteration tests one model
                if model_n == 0:
                    network = NN1(fold_data[0].shape[1])
                elif model_n == 1:
                    network = NN2(fold_data[0].shape[1])
                else:
                    network = NN3(fold_data[0].shape[1])
                optimizer = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)
                criterion = torch.nn.BCELoss()
                valores = train_network(fold_data, network, optimizer,
                                        criterion, MODEL_PATH[exam_type])
                resultado[exam_type][model_n] = np.sum(
                    [resultado[exam_type][model_n], np.asarray(valores)], axis=0)
                print("Fold[{0}] model[{1}] accuracy on validation {2:.2f}% and train {3:.2f}%"
                      .format(fold_iter, model_n + 1, valores[3] * 100, valores[1] * 100))
            fold_iter += 1
    return np.divide(resultado, n_folds)
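# Usage sketch for make_kfold() (hypothetical driver code, not part of the original
# snippet; the index layout follows the docstring and the [tr_lss, tr_acc, vl_lss, vl_acc]
# comment above):
if __name__ == '__main__':
    results = make_kfold()
    for exam_idx, exam_name in enumerate(['IGG', 'IGM', 'PCR']):
        for model_idx in range(3):
            tr_lss, tr_acc, vl_lss, vl_acc = results[exam_idx][model_idx]
            print('{0} NN{1}: train acc {2:.2f}%, val acc {3:.2f}%'.format(
                exam_name, model_idx + 1, tr_acc * 100, vl_acc * 100))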
def main(FLAGS):
    """
    """
    if FLAGS.mode == 'train':
        # Process the data
        train_data, test_data = process_data(
            data_dir=FLAGS.data_dir,
            split_ratio=FLAGS.split_ratio,
        )
        # Sample
        sample(
            data=train_data,
            data_dir=FLAGS.data_dir,
        )
        # Load components
        with open(os.path.join(basedir, FLAGS.data_dir, 'char2index.json'), 'r') as f:
            char2index = json.load(f)
        # Training
        train(
            data_dir=FLAGS.data_dir,
            char2index=char2index,
            train_data=train_data,
            test_data=test_data,
            num_epochs=FLAGS.num_epochs,
            batch_size=FLAGS.batch_size,
            num_filters=FLAGS.num_filters,
            learning_rate=FLAGS.lr,
            decay_rate=FLAGS.decay_rate,
            max_grad_norm=FLAGS.max_grad_norm,
            dropout_p=FLAGS.dropout_p,
        )
    elif FLAGS.mode == 'infer':
        # Inference
        infer(
            data_dir=FLAGS.data_dir,
            model_name=FLAGS.model_name,
            sentence=FLAGS.sentence,
        )
    else:
        raise Exception('Choose --mode train|infer')
def get_stats():
    config = {
        'unknown_freq': 2,
        'gold_ratio': 0.1,
        'inc_option': 'auxiliary',
        'auxiliary_option': 'detection',
        'seed': 66
    }
    dir_path = '/path/to/working/dir'
    set_random_seed(config['seed'])
    train_file = dir_path + '/data/ontonotes.development.ner'
    print('load data')
    train_data = get_data(train_file)
    gold_data, inc_data = split_data(train_data, config)
    print('get vocabulary')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    print('word_to_ix', len(word_to_ix))
    print('process data')
    inc_input_ids, inc_sent_ids, inc_pos_ids, inc_ner_ids = process_data(
        inc_data, word_to_ix, pos_to_ix, ner_to_ix)
    inc_ner_ids = get_incidental_data(inc_sent_ids, inc_input_ids, inc_pos_ids,
                                      inc_ner_ids, config)
    inc_label_counter = Counter()
    for label in inc_ner_ids:
        # if label[0] == 'B' or label[0] == 'I':
        #     label = label[2:]
        inc_label_counter[label] += 1 / len(inc_ner_ids)
    print('inc label counter', inc_label_counter)
    inputs, sent_ids, pos_labels, ner_labels = inc_data
    word_seqs = generate_sent_seqs(inputs, sent_ids)
    pos_seqs = generate_sent_seqs(pos_labels, sent_ids)
    ner_seqs = generate_sent_seqs(ner_labels, sent_ids)
    inc_data = []
    sent_counter = Counter()
    for x in range(len(word_seqs)):
        inc_data.append((word_seqs[x], pos_seqs[x], ner_seqs[x]))
        sent_counter[len(word_seqs[x])] += 1 / len(word_seqs)
    print('average sent length', len(sent_ids) / len(word_seqs))
    print('sent length distribution', sent_counter.items())
def run():
    seed_everything(args.seed)

    df = pd.read_csv(os.path.join(args.data_dir, "train.csv"))
    df = process_data(df, args.subset)
    df_folds = create_folds(df, args.n_folds)

    train_image_ids = df_folds[df_folds["fold"] != args.fold].index.values
    valid_image_ids = df_folds[df_folds["fold"] == args.fold].index.values

    train_loader = get_train_loader(
        args.data_dir,
        df,
        train_image_ids,
        transforms=get_train_augs(args),
        do_cutmix=args.cutmix,
        batch_size=args.bs,
        num_workers=args.num_workers,
    )
    valid_loader = get_valid_loader(
        args.data_dir,
        df,
        valid_image_ids,
        transforms=get_valid_augs(args),
        batch_size=args.bs,
        num_workers=args.num_workers,
    )

    model = get_model(args.model_variant,
                      model_dir=args.model_dir,
                      checkpoint_path=args.load_path).cuda()

    if args.scheduler == "one_cycle":
        args.steps_per_epoch = len(train_image_ids) // args.bs
        scheduler_class, scheduler_params = get_scheduler(args)
    else:
        scheduler_class, scheduler_params = get_scheduler(args)

    learner = Learner(model, scheduler_class, scheduler_params, hparams=args)
    learner.fit(train_loader, valid_loader)
def run_test_experiments(config):
    dir_path = '/path/to/working/dir'
    train_file = dir_path + '/data/ontonotes.development.ner'
    test_file = dir_path + '/data/ontonotes.test.ner'
    model_path = dir_path + '/models/MLPNet_' + config['para_option'] + '.pt'
    print('load data')
    train_data = get_data(train_file)
    test_data = get_data(test_file)
    print('get vocabulary and embeddings')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    vocab_embeddings = get_vocab_embeddings(word_to_ix)
    print('process data')
    test_input_ids, test_sent_ids, test_pos_ids, test_ner_ids = process_data(
        test_data, word_to_ix, pos_to_ix, ner_to_ix)
    print('get test input features')
    test_input_features = get_word_features(test_input_ids, test_sent_ids,
                                            vocab_embeddings)
    test_data = {
        'inputs': test_input_features,
        'sent_ids': test_sent_ids,
        'labels': test_ner_ids,
        'confidences': [1.0] * len(test_input_features)
    }
    print('test words', len(test_input_features))
    print('build model')
    model, loss_function, optimizer = build_model(config)
    print('load model')
    model.load_state_dict(torch.load(model_path))
    print('test model')
    test_accuracy = evaluate(test_data, model, ner_to_ix, config)
    print('test accuracy', test_accuracy)
# -*- coding: utf-8 -*-
import pandas as pd

from bokeh.core.properties import field
from bokeh.io import curdoc
from bokeh.layouts import layout
from bokeh.models import (ColumnDataSource, HoverTool, SingleIntervalTicker,
                          Slider, Button, Label, CategoricalColorMapper)
from bokeh.palettes import Spectral6
from bokeh.plotting import figure

from data import process_data

fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions_list = process_data()

df = pd.concat({'fertility': fertility_df,
                'life': life_expectancy_df,
                'population': population_df_size},
               axis=1)

data = {}
regions_df.rename({'Group': 'region'}, axis='columns', inplace=True)
for year in years:
    df_year = df.iloc[:, df.columns.get_level_values(1) == year]
    df_year.columns = df_year.columns.droplevel(1)
    data[year] = df_year.join(regions_df.region).reset_index().to_dict('series')

source = ColumnDataSource(data=data[years[0]])

plot = figure(x_range=(1, 9), y_range=(20, 100), title='Gapminder Data', plot_height=300)
import pandas as pd  # needed for pd.concat below

from bokeh.core.properties import field
from bokeh.embed import file_html
from bokeh.layouts import column
from bokeh.models import (
    ColumnDataSource, Plot, Circle, Range1d, LinearAxis, HoverTool, Text,
    SingleIntervalTicker, CustomJS, Slider, CategoricalColorMapper, Legend,
    LegendItem,
)
from bokeh.models.annotations import Title
from bokeh.palettes import Spectral6
from bokeh.resources import JSResources
from bokeh.util.browser import view

from data import process_data

fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions_list = process_data()

sources = {}
region_name = regions_df.Group
region_name.name = 'region'

for year in years:
    fertility = fertility_df[year]
    fertility.name = 'fertility'
    life = life_expectancy_df[year]
    life.name = 'life'
    population = population_df_size[year]
    population.name = 'population'
    new_df = pd.concat([fertility, life, population, region_name], axis=1)
    sources['_' + str(year)] = ColumnDataSource(new_df)
from tensorflow.keras.models import load_model

from data import process_data

# method used to compute return based on each model.
if __name__ == "__main__":
    model_type = "LSTM"
    model = load_model('models/{}.h5'.format(model_type))
    x_train, y_train, x_test, y_test = process_data(time_series=True, debug=True)

    total_return = 0
    for x in range(x_test.shape[0]):
        y_pred = model.predict(x_test[x:x + 1])
        for y in range(y_pred.shape[1]):
            if y_pred[0, y] > 0:
                total_return += (100 * y_test[x, y])
            else:
                total_return += (-100 * y_test[x, y])
    print(total_return / x_test.shape[0])
    print('ok')
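# Worked illustration of the return rule above (made-up numbers, for clarity only):
# a positive prediction takes a +100 position and a negative one a -100 position, so the
# contribution of one step is position * y_test[x, y].
#   y_pred = +0.4, y_test = +0.02  ->  +100 * 0.02 = +2.0
#   y_pred = -0.1, y_test = +0.02  ->  -100 * 0.02 = -2.0
# The final print reports the running total divided by the number of test windows.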
""" if alg == 'kNN': clf = KNeighborsClassifier(n_neighbors=3) elif alg == 'SVM': clf = SVC(kernel='linear') elif alg == 'Decision Tree': clf = DecisionTreeClassifier(max_depth=5) elif alg == 'Logistic Regression': clf = LogisticRegression() return clf # Read in dataset df, feature_dict = process_data() feature_names = list(feature_dict.keys()) feature_boxes = CheckboxGroup(labels=feature_names, active=[0, 1, 2, 3, 4]) s = ColumnDataSource(data=dict(x=[], real_y=[], predict_y=[])) pred_ticker = Select(title='Predict:', value='cancer', options=TARGETS, width=sidebar_width) alg_ticker = Select(title='Classifier:', value='kNN', options=CLASSIFIERS, width=sidebar_width) # Plot the classification result p = figure(**plot_params)
def train(FLAGS):
    """
    Train our embeddings.
    """
    # Get data loaders
    print("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words, \
        num_unique_documents, word_to_idx = process_data(
            data_dir=FLAGS.data_dir,
            vocab_size=FLAGS.vocab_size,
            window_size=FLAGS.window_size,
            split_ratio=FLAGS.split_ratio,
            batch_size=FLAGS.batch_size,
        )
    print("[COMPLETE]")

    # Load pretrained GloVe embeddings for our vocab
    embedding_dir = os.path.join(basedir, "../../../../embeddings/glove")
    embedding_dim = 100
    embeddings = get_embeddings(
        embedding_dir=embedding_dir,
        embedding_dim=embedding_dim,
        words=word_to_idx.keys(),
    )

    # Initialize model, criterion, loss
    print("==> Initializing model components ... ", end="")
    model = MLP(
        D_in_words=num_unique_words,
        D_in_documents=num_unique_documents,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
        embeddings=embeddings,
    )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    # Only get the parameters with gradients (we freeze our GloVe embeddings)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=FLAGS.lr)
    print("[COMPLETE]")

    # Train the model
    print("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        log_every=FLAGS.log_every,
    )
    print("\n[COMPLETE]")

    # Save the model
    print("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print("\n[COMPLETE]")
LEARNING_RATE = 0.001
MODEL_PATH = [
    './trained_nn/igG.pt', './trained_nn/igM.pt', './trained_nn/PCR.pt'
]

if __name__ == "__main__":
    print('\nCARREGANDO DADOS...')
    data = load_data()
    start = time.time()
    min_max(data)
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:
        print('\n------------------------\n')
        print('[REDE ' + name + '] SEPARANDO DADOS PARA TREINAMENTO...')
        processed_data = process_data(data, exam_type)
        print('[REDE ' + name + '] ADEQUANDO DADOS PARA TREINAMENTO...')
        splitted_data = split_data(processed_data)
        print('[REDE ' + name + '] MONTANDO MODELO...')
        x_train = splitted_data[0]
        y_train = splitted_data[2]
        print('[REDE ' + name + '] ' + str(len(x_train)) +
              ' ELEMENTOS NO CONJUNTO DE TREINAMENTO...')
        print('[REDE ' + name + '] ' + str(len(y_train)) +
              ' ELEMENTOS NO CONJUNTO DE VALIDAÇÃO...')
        print('[REDE ' + name + '] INSTANCIANDO REDES...')
        network = NN2(x_train.shape[1])
                    default=False,
                    action='store_true')
args = parser.parse_args()

data, label_to_id = load_data(args.train, args.dev, args.labels)
train_data = data['train']
validation_data = data['dev']

vocab = None
model = None
if args.pretrained_model is not None:
    model, model_config, vocab, reverse_vocab = load_model(args.pretrained_model)

print('\nLoading training data...')
train_X, train_Y, vocab, reverse_vocab = process_data(
    train_data,
    label_to_id,
    vocab=vocab,
    vocab_size=args.vocab_size,
    max_tokens=args.sequence_length)
print('Training data loaded.')

print('\nLoading validation data...')
validation_X, validation_Y, _, _ = process_data(
    validation_data,
    label_to_id,
    vocab=vocab,
    max_tokens=args.sequence_length)
print('Validation data loaded.')

print('\nGenerating batches...')
train_batches = generate_batches(train_X, train_Y, args.batch_size)
validation_batches = generate_batches(validation_X, validation_Y, args.batch_size)
print('Batches finished generating.')
def start_training():
    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
    """ Train the bot """
    test_buckets, data_buckets, train_buckets_scale = _get_buckets()
    # in train mode, we need to create the backward path, so forward_only is False
    model = Seq2SeqModel(False, config.BATCH_SIZE)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        print('Running session')
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)

        iteration = model.global_step.eval()
        total_loss = 0
        # Infinite loop
        print('Start training ...')
        train_record_file = open(
            os.path.join(config.PROCESSED_PATH, config.TRAINING_RECORD_FILE), 'a+')
        test_record_file = open(
            os.path.join(config.PROCESSED_PATH, config.TESTING_RECORD_FILE), 'a+')
        while True:
            try:
                skip_step = _get_skip_step(iteration)
                bucket_id = _get_random_bucket(train_buckets_scale)
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    data_buckets[bucket_id], bucket_id, batch_size=config.BATCH_SIZE)
                start = time.time()
                _, step_loss, _ = run_step(sess, model, encoder_inputs,
                                           decoder_inputs, decoder_masks,
                                           bucket_id, False)
                total_loss += step_loss
                iteration += 1

                if iteration % skip_step == 0:
                    _train_info = 'Iter {}: loss {}, time {}'.format(
                        iteration, total_loss / skip_step, time.time() - start)
                    print(_train_info)
                    train_record_file.write(_train_info + '\n')
                    start = time.time()
                    total_loss = 0
                    saver.save(sess, os.path.join(config.CPT_PATH, 'chatbot'),
                               global_step=model.global_step)
                    if iteration % (10 * skip_step) == 0:
                        # Run evals on development set and print their loss
                        _test_info = _eval_test_set(sess, model, test_buckets)
                        for item in _test_info:
                            print(item)
                            test_record_file.write("%s\n" % item)
                        start = time.time()
                    sys.stdout.flush()
            except KeyboardInterrupt:
                print('Interrupted by user at iteration {}'.format(iteration))
                train_record_file.close()
                test_record_file.close()
    k = 0
    try:
        for i in range(0, len(dialogues)):
            dialogue = dialogues[i].text
            k = k + 2
            if len(name_date[k].text.split()) < 4:
                name = name_date[k].text
            else:
                k = k - 1
                name = name_date[k].text
            if name == "Karan Singla":
                mine = mine + 1
            else:
                others = others + 1
            if dialogue.strip() != '':
                out_text_file.write(dialogue + "\n")
    except:
        continue
    # total = total + len(soup.find_all('p'))

out_text_file.close()
# note: out_text_file is a file object here, so this concatenation will fail at
# runtime; the intent appears to be the path of that output file
data.FILENAME = out_text_file + "fb_chat.txt"
data.process_data()
print("#total messages", total)
print("#my_messages", mine)
print("#others messages", others)
            (index + 1), total_correct_predictions / total_predictions))
        generator_tqdm.set_description(description, refresh=False)

    average_loss = total_eval_loss / len(eval_batches)
    eval_accuracy = total_correct_predictions / total_predictions
    print('Final evaluation accuracy: %.4f loss: %.4f' % (eval_accuracy, average_loss))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Script to evaluate a trained model on data.""")
    parser.add_argument('model', help='Path to trained model directory')
    parser.add_argument('--test', help='Path to evaluation data.',
                        default=r'./data/test.csv')
    parser.add_argument('--labels', help='Path to label dictionary.',
                        default=r'./data/answers.json')
    args = parser.parse_args()

    data, label_to_id = load_eval_data(args.test, args.labels)

    print('\nLoading test data...')
    model, model_config, vocab, reverse_vocab = load_model(args.model)
    test_X, test_Y, vocab, reverse_vocab = process_data(
        data, label_to_id, vocab=vocab, vocab_size=model_config['vocab_size'])
    print('Test data loaded.')

    batch_size = 32
    batches = generate_batches(test_X, test_Y, batch_size)
    print('Batches finished generating.')

    train_result = eval(model, batches)
from keras.layers import Convolution1D
from keras.models import Sequential
from keras.layers import Activation, Dense, Flatten, Dropout
from data import process_data
from keras import backend as K
import numpy as np
import os

if not os.listdir('datasets/processed'):
    process_data()

arrhy_data = np.loadtxt(open('datasets/processed/arrhythmia.csv', 'r'), skiprows=1)
malignant_data = np.loadtxt(open('datasets/processed/malignant-ventricular-ectopy.csv', 'r'), skiprows=1)
arrhy_data = arrhy_data[:len(malignant_data)]
arrhy_len = len(arrhy_data) // 500  # integer division so range() below receives an int

i = 0
X_train = []
inter_X_train = []
inter_y_train = []
y_train = []
nb_filters = 32
nb_epoch = 10
batch_size = 8
counter = 0
for _ in range(arrhy_len):
    counter += 1
    if not (counter % batch_size):
        X_train.append(inter_X_train)
        y_train.append(inter_y_train)
    '--fill',
    type=float,
    metavar='F',
    help='Transparency of the filled portion of the graph. If 0 (default), only plots the lines',
    default=0)
cmd = parser.parse_args()

import visualization
import data

if cmd.U:
    data.download_data()

if len(cmd.countries) > 0:
    p, c = data.process_data(*data.load_data())
    row_mask = [cmd.no_daily, cmd.no_cumulative]
    col_mask = [cmd.no_cases, cmd.no_deaths, cmd.no_recoveries, cmd.no_active]
    smooth = {
        'days': cmd.smooth_days,
        'smoothness': cmd.smoothness,
        'type': 'window'
    }
    if cmd.exponential:
        smooth['type'] = 'exponential'  # fixed typo: was `smoot`
    visualization.main_plot_countries(c, cmd.countries, cmd.begin,
def set_log(args):
    set_seeds(args)
    if True:  # args.do_train:
        # preparing embeddings
        tokens_emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
        # preparing train datasets
        assert args.raw_train_file is not None, "--raw_train_file should be set when training!"
        if not os.path.exists(args.train_file):
            process_data(args.raw_train_file, args.train_file, tokens_emb)
        with open(args.train_file, mode="r", encoding="utf-8") as rfp:
            train_ex = json.load(rfp)
        train_dataset = DuReaderDataset(train_ex)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify,
            return_list=True)
        # preparing dev datasets
        assert args.raw_dev_file is not None, "--raw_dev_file should be set when training!"
        if not os.path.exists(args.dev_file):
            process_data(args.raw_dev_file, args.dev_file, tokens_emb)
        # fixed: originally reopened args.train_file here, which reloaded the train split
        with open(args.dev_file, mode="r", encoding="utf-8") as rfp:
            dev_ex = json.load(rfp)
        dev_dataset = DuReaderDataset(dev_ex)
        dev_batch_sampler = paddle.io.DistributedBatchSampler(
            dev_dataset, batch_size=args.dev_batch_size, shuffle=True)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_dataset,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify,
            return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        if paddle.distributed.get_rank() == 0:
            dev_count = paddle.fluid.core.get_cuda_device_count()
            logger.info("Device count: %d" % dev_count)
            logger.info("Num train examples: %d" % len(train_dataset))
            logger.info("Num dev examples: %d" % len(dev_dataset))
            logger.info("Max train steps: %d" % num_training_steps)

        model = DocReader(args)
        model.init_lr_scheduler(args, num_training_steps)
        model.init_optimizer(args)
        model.init_loss(args)

        # Training process
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss = model.update(batch)
                if global_step % args.logging_steps == 0:
                    logger.info("global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                                % (global_step, epoch, step, loss,
                                   args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_file = os.path.join(output_dir + '.ckpt')
                        model.save(model_file)

        model_file = os.path.join(args.output_dir, args.model_name + "-global.ckpt")
        model.save(model_file)

    if args.do_predict:
        # preparing test datasets
        pass
def splitpredict(hparams, checkpoint_callback, meta_data_vocab, train_dataloader,
                 val_dataloader, test_dataloader, all_sentences):
    mapping, conj_word_mapping = {}, {}
    hparams.write_allennlp = True
    if hparams.split_fp == '':
        hparams.task = 'conj'
        hparams.checkpoint = hparams.conj_model
        hparams.model_str = 'bert-base-cased'
        hparams.mode = 'predict'
        model = predict(hparams, None, meta_data_vocab, None, None,
                        test_dataloader, all_sentences)
        conj_predictions = model.all_predictions_conj
        sentences_indices = model.all_sentence_indices_conj
        # conj_predictions = model.predictions
        # sentences_indices = model.all_sentence_indices
        assert len(conj_predictions) == len(sentences_indices)
        all_conj_words = model.all_conjunct_words_conj

        sentences, orig_sentences = [], []
        for i, sentences_str in enumerate(conj_predictions):
            list_sentences = sentences_str.strip('\n').split('\n')
            conj_words = all_conj_words[i]
            if len(list_sentences) == 1:
                orig_sentences.append(list_sentences[0] + ' [unused1] [unused2] [unused3]')
                mapping[list_sentences[0]] = list_sentences[0]
                conj_word_mapping[list_sentences[0]] = conj_words
                sentences.append(list_sentences[0] + ' [unused1] [unused2] [unused3]')
            elif len(list_sentences) > 1:
                orig_sentences.append(list_sentences[0] + ' [unused1] [unused2] [unused3]')
                conj_word_mapping[list_sentences[0]] = conj_words
                for sent in list_sentences[1:]:
                    mapping[sent] = list_sentences[0]
                    sentences.append(sent + ' [unused1] [unused2] [unused3]')
            else:
                assert False
        sentences.append('\n')

        count = 0
        for sentence_indices in sentences_indices:
            if len(sentence_indices) == 0:
                count += 1
            else:
                count += len(sentence_indices)
        assert count == len(sentences) - 1
    else:
        with open(hparams.predict_fp, 'r') as f:
            lines = f.read()
        lines = lines.replace("\\", "")

        sentences = []
        orig_sentences = []
        extra_str = " [unused1] [unused2] [unused3]"
        for line in lines.split('\n\n'):
            if len(line) > 0:
                list_sentences = line.strip().split('\n')
                if len(list_sentences) == 1:
                    mapping[list_sentences[0]] = list_sentences[0]
                    sentences.append(list_sentences[0] + extra_str)
                    orig_sentences.append(list_sentences[0] + extra_str)
                elif len(list_sentences) > 1:
                    orig_sentences.append(list_sentences[0] + extra_str)
                    for sent in list_sentences[1:]:
                        mapping[sent] = list_sentences[0]
                        sentences.append(sent + extra_str)
                else:
                    assert False

    hparams.task = 'oie'
    hparams.checkpoint = hparams.oie_model
    hparams.model_str = 'bert-base-cased'
    _, _, split_test_dataset, meta_data_vocab, _ = data.process_data(hparams, sentences)
    split_test_dataloader = DataLoader(split_test_dataset,
                                       batch_size=hparams.batch_size,
                                       collate_fn=data.pad_data,
                                       num_workers=1)

    model = predict(hparams, None, meta_data_vocab, None, None,
                    split_test_dataloader, mapping=mapping,
                    conj_word_mapping=conj_word_mapping,
                    all_sentences=all_sentences)

    if 'labels' in hparams.type:
        label_lines = get_labels(hparams, model, sentences, orig_sentences,
                                 sentences_indices)
        f = open(hparams.out + '.labels', 'w')
        f.write('\n'.join(label_lines))
        f.close()

    if hparams.rescoring:
        print()
        print("Starting re-scoring ...")
        print()

        sentence_line_nums, prev_line_num, no_extractions = set(), 0, dict()
        for sentence_str in model.all_predictions_oie:
            sentence_str = sentence_str.strip('\n')
            num_extrs = len(sentence_str.split('\n')) - 1
            if num_extrs == 0:
                if curr_line_num not in no_extractions:
                    no_extractions[curr_line_num] = []
                no_extractions[curr_line_num].append(sentence_str)
                continue
            curr_line_num = prev_line_num + num_extrs
            sentence_line_nums.add(curr_line_num)  # check extra empty lines, example with no extractions
            prev_line_num = curr_line_num

        # testing rescoring
        inp_fp = model.predictions_f_allennlp
        rescored = rescore(inp_fp, model_dir=hparams.rescore_model, batch_size=256)

        all_predictions, sentence_str = [], ''
        for line_i, line in enumerate(rescored):
            fields = line.split('\t')
            sentence = fields[0]
            confidence = float(fields[2])

            if line_i == 0:
                sentence_str = f'{sentence}\n'
                exts = []
            if line_i in sentence_line_nums:
                exts = sorted(exts, reverse=True,
                              key=lambda x: float(x.split()[0][:-1]))
                exts = exts[:hparams.num_extractions]
                all_predictions.append(sentence_str + ''.join(exts))
                sentence_str = f'{sentence}\n'
                exts = []
            if line_i in no_extractions:
                for no_extraction_sentence in no_extractions[line_i]:
                    all_predictions.append(f'{no_extraction_sentence}\n')

            arg1 = re.findall("<arg1>.*</arg1>", fields[1])[0].strip('<arg1>').strip('</arg1>').strip()
            rel = re.findall("<rel>.*</rel>", fields[1])[0].strip('<rel>').strip('</rel>').strip()
            arg2 = re.findall("<arg2>.*</arg2>", fields[1])[0].strip('<arg2>').strip('</arg2>').strip()

            extraction = Extraction(pred=rel, head_pred_index=None, sent=sentence,
                                    confidence=math.exp(confidence), index=0)
            extraction.addArg(arg1)
            extraction.addArg(arg2)
            if hparams.type == 'sentences':
                ext_str = data.ext_to_sentence(extraction) + '\n'
            else:
                ext_str = data.ext_to_string(extraction) + '\n'
            exts.append(ext_str)

        exts = sorted(exts, reverse=True, key=lambda x: float(x.split()[0][:-1]))
        exts = exts[:hparams.num_extractions]
        all_predictions.append(sentence_str + ''.join(exts))

        if line_i + 1 in no_extractions:
            for no_extraction_sentence in no_extractions[line_i + 1]:
                all_predictions.append(f'{no_extraction_sentence}\n')

        if hparams.out is not None:
            print('Predictions written to ', hparams.out)
            predictions_f = open(hparams.out, 'w')
            predictions_f.write('\n'.join(all_predictions) + '\n')
            predictions_f.close()

    return