def load_and_run(filename):
    with open(filename, "r") as f:
        print("loading file...")
        contents = f.read()
        print(contents)
        print("building ast...")
        loaded_ast = parse.Parser(contents).parse_expression()
        print(debug.draw_node(loaded_ast))
        print("running...")
        print(evaluation.evaluate(loaded_ast, main.DefaultEnvironment()))
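# Example usage (a minimal sketch): load_and_run expects a file containing a
# single expression in the project's language; "program.expr" is a
# hypothetical path, not one defined by the repo.
if __name__ == "__main__":
    load_and_run("program.expr")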
def multi_step_2D(model, x_test, y_test, mmn, len_closeness, step):
    # model = build_model(external_dim)
    dict_multi_score = {}
    nb_flow = 2
    y_pre = []
    y_test = copy(y_test)
    x_test_now = [copy(e) for e in x_test]

    # inference
    for i in range(1, step + 1):
        y_pre_inference = model.predict(x_test_now)  # 1
        # expand dims: [timeslots, flow, height, width] --> [step, timeslots, flow, height, width]
        y_pre_expand_dims = np.expand_dims(y_pre_inference, axis=0)
        # append in all step
        y_pre.append(y_pre_expand_dims)

        x_test_noremove = x_test_now[0][1:]
        x_test_noremove = x_test_noremove.transpose((1, 0, 2, 3, 4))
        x_test_noremove = x_test_noremove[len_closeness:]
        x_test_noremove = x_test_noremove.transpose((1, 0, 2, 3, 4))

        x_test_remove = x_test_now[0].transpose((1, 0, 2, 3, 4))
        x_test_remove = x_test_remove[:len_closeness]
        # shift the closeness window one slot back; iterate in reverse so each
        # frame is copied before it is overwritten (the original forward loop
        # duplicated frame 0 into every slot), then insert the new prediction
        for j in range(len_closeness - 1, 0, -1):
            x_test_remove[j] = x_test_remove[j - 1]
        x_test_remove[0] = y_pre_expand_dims
        x_test_remove = x_test_remove.transpose((1, 0, 2, 3, 4))
        x_test_remove = x_test_remove[:-1]

        x_test_next = np.concatenate((x_test_remove, x_test_noremove), axis=1)

        # make training data
        x_test_makeData = []
        x_test_makeData.append(x_test_next)
        x_test_makeData.append(x_test[1][i:])
        x_test_now = x_test_makeData

    for i in range(len(y_pre)):
        print(f'Step {i+1}:')
        score = evaluate(y_test[i:], y_pre[i][0], mmn)
        dict_multi_score[i] = score

    return dict_multi_score
def multi_step_2D(model, x_test, y_test, mmn, len_closeness, step):
    # model = build_model(external_dim)
    dict_multi_score = {}
    nb_flow = 2
    y_pre = []
    y_test = copy(y_test)
    x_test_now = [copy(e) for e in x_test[0:-1]]  # x_test[-1] is the external feature array

    # inference
    for i in range(1, step + 1):
        y_pre_inference = model.predict(x_test_now)  # 1
        # expand dims: [timeslots, timestamps, height, width, flow] --> [step, timeslots, timestamps, height, width, flow]
        y_pre_expand_dims = np.expand_dims(y_pre_inference, axis=0)
        # append in all step
        y_pre.append(y_pre_expand_dims)

        x_test_next = x_test_now[0].transpose((1, 0, 2, 3, 4))
        for j in range(len_closeness - 1):
            x_test_next[j] = x_test_next[j + 1]
        # x_test_next[len_closeness - 1] = y_pre_inference
        x_test_next = x_test_next.transpose((1, 0, 2, 3, 4))
        x_test_next = x_test_next[:-1]

        # make training data
        x_test_makeData = []
        x_test_makeData.append(x_test_next)
        x_test_now = x_test_makeData

    for i in range(len(y_pre)):
        print(f'Step {i+1}:')
        score = evaluate(y_test[i:], y_pre[i][0], mmn)
        dict_multi_score[i] = score

    return dict_multi_score
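# A standalone sketch of the rolling closeness-window update that both
# multi_step_2D variants perform: move the per-sample frame axis to the front,
# shift every frame one slot, and splice in the latest prediction. The shapes
# (3 samples, 4 frames, 2 flows, 4x4 grid) are toy assumptions.
import numpy as np

n_samples, n_frames = 3, 4
x = np.arange(n_samples * n_frames * 2 * 4 * 4, dtype=float).reshape(n_samples, n_frames, 2, 4, 4)
y_new = np.zeros((n_samples, 2, 4, 4))        # stand-in for model.predict(...)

frames = x.transpose((1, 0, 2, 3, 4)).copy()  # frame axis first: (frames, samples, ...)
for j in range(n_frames - 1):                 # drop the oldest frame...
    frames[j] = frames[j + 1]
frames[n_frames - 1] = y_new                  # ...and append the newest prediction
x_next = frames.transpose((1, 0, 2, 3, 4))    # restore (samples, frames, ...)
assert x_next.shape == x.shape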
                                 encoding=encoding, max_iter=args.max_iter)
        else:
            train_toks = make_dummy_featuresets(train_corpus.reader)
            model = MajorityTag.train(train_toks)
        print('done', file=sys.stderr)
        with open(args.model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f'* Model saved to {args.model_path}', file=sys.stderr)
        print('* Evaluate on training set:', file=sys.stderr)
        train_featuresets = [fs for fs, _ in train_toks]
        hyp_tags = model.classify_many(train_featuresets)
        ref_tags = [tag for _, tag in train_corpus.reader.tagged_words()]
        result = evaluate(ref_tags, hyp_tags)
        print(pretty_format(result), file=sys.stderr)
    else:
        dev_corpus = CoNLLCorpus(args.corpus)
        out = dev_corpus.summarize().split('\n')
        print('* Loaded dev corpus', file=sys.stderr, end='\n  ')
        print('\n  '.join(out), file=sys.stderr)
        with open(args.model_path, 'rb') as f:
            model = pickle.load(f)
        print(f'* Model loaded from {args.model_path}', file=sys.stderr)
        print('* Evaluate on dev set:', file=sys.stderr)
        if args.model_name == 'memo':
            featuresets = make_word_featuresets(dev_corpus.reader)
        elif args.model_name == 'maxent':
            contexts = args.contexts if args.contexts is not None else range(
def learner(rank, world_size, args):
    """The learner in a distributed RL setting. Updates the network params and
    pushes new network params to actors. Additionally, this function collects
    the transitions in the queue from the actors and manages the replay buffer.

    Params
    ======
    rank:       (int)
    world_size: (int)
    args: (dict) {
        no_actors:      (int)
        train_steps:    (int)
        batch_size:     (int)
        optimizer:      (String)
        policy_net:     (torch.nn)
        policy_config:  (dict) {
            system_size: (int) size of the toric grid
            number_of_actions: (int)
        }
        learning_rate:   (float)
        device:          (String) {"cpu", "cuda"}
        policy_update:   (int)
        discount_factor: (float)
        con_send_weights: (multiprocessing.Connection)
        transition_queue_from_memory:      (multiprocessing.Queue)
        update_priorities_queue_to_memory: (multiprocessing.Queue)
        con_actors:        array of connections (multiprocessing.Pipe), Pipe(duplex=True)
        con_replay_memory: (multiprocessing.Pipe), Pipe(duplex=True)
        eval_freq:  (int)
        update_tb:  (int) frequency of tensorboard updates
        tb_log_dir: (String) tensorboard log dir
        env:        (String) for evaluating the policy
        env_config: (dict) {
            size: (int)
            min_qubit_errors: (int)
            p_error: (float)
        }
    }
    """

    def terminate():
        # prepare replay memory for termination
        msg = "prep_terminate"
        con_replay_memory.send(msg)
        # wait for acknowledgement
        back = con_replay_memory.recv()

        # prepare actors for termination
        msg = ("prep_terminate", None)
        for a in range(world_size - 2):
            con_actors[a].send(msg)
            # wait for acknowledgement
            back = con_actors[a].recv()

        # terminate actors
        msg = ("terminate", None)
        for a in range(world_size - 2):
            con_actors[a].send(msg)
            # wait for acknowledgement
            back = con_actors[a].recv()

        # empty and close queue before termination
        try:
            while True:
                transition_queue_from_memory.get_nowait()
        except Empty:
            pass
        transition_queue_from_memory.close()
        update_priorities_queue_to_memory.close()

        # terminate memory
        msg = "terminate"
        con_replay_memory.send(msg)
        # wait for acknowledgement
        back = con_replay_memory.recv()

    # Tensorboard
    tb = SummaryWriter(log_dir=args["tb_log_dir"] + "_learner", filename_suffix="_learner")
    update_tb = args["update_tb"]

    update_priorities_queue_to_memory = args["update_priorities_queue_to_memory"]
    transition_queue_from_memory = args["transition_queue_from_memory"]
    device = args["device"]
    train_steps = args["train_steps"]
    discount_factor = args["discount_factor"]
    batch_size = args["batch_size"]
    con_actors = args["con_actors"]
    con_replay_memory = args["con_replay_memory"]

    # eval params
    eval_freq = args["eval_freq"]
    env_config = args["env_config"]
    system_size = env_config["size"]
    grid_shift = int(env_config["size"] / 2)

    # Init policy net
    policy_class = args["policy_net"]
    policy_config = args["policy_config"]
    if policy_class == NN_11 or policy_class == NN_17:
        policy_net = policy_class(policy_config["system_size"],
                                  policy_config["number_of_actions"],
                                  args["device"])
        target_net = policy_class(policy_config["system_size"],
                                  policy_config["number_of_actions"],
                                  args["device"])
    else:
        policy_net = policy_class()
        target_net = policy_class()

    policy_net.to(device)
    target_net.to(device)

    # copy policy params to target
    params = parameters_to_vector(policy_net.parameters())
    vector_to_parameters(params, target_net.parameters())

    # Push initial network params
    for actor in range(world_size - 2):
        msg = ("weights", params.detach())
        con_actors[actor].send(msg)

    # define criterion and optimizer
    criterion = nn.MSELoss(reduction='none')
    if args["optimizer"] == 'RMSprop':
        optimizer = optim.RMSprop(policy_net.parameters(), lr=args["learning_rate"])
    elif args["optimizer"] == 'Adam':
        optimizer = optim.Adam(policy_net.parameters(), lr=args["learning_rate"])

    # init counter
    push_new_weights = 0

    # logging
    wait_time = 0
    sum_loss = 0
    sum_wait_time = 0

    print("Learner waiting for replay memory to be filled.")
    # Wait until replay memory has enough transitions for one batch
    while transition_queue_from_memory.empty():
        continue

    # Start training
    for t in range(train_steps):
        print("learner: training step: ", t + 1, " / ", train_steps)

        # wait until there is an item in the queue
        while transition_queue_from_memory.empty():
            wait_time += 1
            continue

        data = transition_queue_from_memory.get()
        batch_state, batch_actions, batch_reward, batch_next_state, \
            batch_terminal, weights, indices = dataToBatch(data, device)

        policy_net.train()
        target_net.eval()

        # compute policy net output
        policy_output = policy_net(batch_state)
        policy_output = policy_output.gather(1, batch_actions.view(-1, 1)).squeeze(1)

        # compute target network output
        # target_output = predictMax(target_net, batch_next_state, len(batch_next_state), grid_shift, system_size, device)
        target_output = predictMaxOptimized(target_net, batch_next_state,
                                            grid_shift, system_size, device)
        target_output = target_output.to(device)

        # compute loss and update replay memory
        y = batch_reward + ((~batch_terminal).type(torch.float) * discount_factor * target_output)
        y = y.clamp(-100, 100)
        loss = criterion(y, policy_output)
        loss = weights * loss

        # Compute priorities
        priorities = np.absolute(loss.cpu().detach().numpy())

        optimizer.zero_grad()
        loss = loss.mean()

        # backpropagate loss
        loss.backward()
        optimizer.step()

        # update priorities in replay buffer
        update_priorities_queue_to_memory.put([*zip(priorities, indices)])

        # update actor weights
        push_new_weights += 1
        if push_new_weights >= args["policy_update"]:
            params = parameters_to_vector(policy_net.parameters())
            # update target network
            vector_to_parameters(params, target_net.parameters())
            target_net.to(device)  # not sure if this is needed
            msg = ("weights", params.detach())
            # send weights to actors
            for actor in range(world_size - 2):
                con_actors[actor].send(msg)
            push_new_weights = 0

        sum_loss += loss.sum()
        sum_wait_time += wait_time
        wait_time = 0

        # eval and write to tensorboard
        if t % eval_freq == 0:
            p_errors = [0.1]
            suc_rate, gr_st, avg_no_steps, mean_q, _ = evaluate(
                policy_net, args["env"], args["env_config"], grid_shift,
                device, p_errors, num_of_episodes=1)
            for i, e in enumerate(p_errors):
                tb.add_scalar("Eval/SuccessRate_{}".format(e), suc_rate[i], t)
                tb.add_scalar("Eval/GroundState_{}".format(e), gr_st[i], t)
                tb.add_scalar("Eval/AvgNoSteps_{}".format(e), avg_no_steps[i], t)
                tb.add_scalar("Eval/MeanQValue_{}".format(e), mean_q[i], t)

        # write to tensorboard
        if t % update_tb == 0:
            tb.add_scalar('Eval/Avg_Over_{}_Loss'.format(update_tb),
                          sum_loss.item() / eval_freq, t)
            tb.add_scalar('Wait/Avg_Over_{}_Wait_Learner_For_New_Transitions'.format(update_tb),
                          sum_wait_time / eval_freq, t)
            sum_loss = 0
            sum_wait_time = 0

    # training done
    torch.save(policy_net.state_dict(),
               "network/Size_{}_{}.pt".format(system_size, type(policy_net).__name__))
    tb.close()
    terminate()
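# Toy check of the clamped TD target computed in the training loop above;
# the tensor values are illustrative, not from the project.
import torch

batch_reward = torch.tensor([1.0, 0.0])
batch_terminal = torch.tensor([False, True])
target_output = torch.tensor([5.0, 7.0])
discount_factor = 0.95

# terminal transitions contribute no bootstrapped future value
y = batch_reward + (~batch_terminal).type(torch.float) * discount_factor * target_output
print(y.clamp(-100, 100))  # tensor([5.7500, 0.0000])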
                                      threshold=1.7, NE_list={})
new_result = []
for line in result:
    if not re.match("\\s+", line):
        new_line = ng.modify_line(statistic_model, line)
        new_result.append(new_line)

# Write result
print('Writing result ...')
# reader.write_file(result, output_path+'corrected_text1')
# reader.write_file(new_result, output_path+'corrected_text2')
reader.write_file(new_result, output_filename)

# Evaluation
if len(sys.argv) == 4 or use_local_file is True:
    print('Evaluation ...')
    result = reader.lines2string(result)
    new_result = reader.lines2string(new_result)
    data = reader.lines2string(data)
    gold = reader.read_file(gold_filename)
    gold = reader.clean_empty_line(gold)
    gold = reader.lines2string(gold)
    print('WER in raw text:', evaluation.evaluate(data, gold))
    print('WER after rule-based system:', evaluation.evaluate(result, gold))
    print('WER after rule-based and statistical system:', evaluation.evaluate(new_result, gold))
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
logger.info("Saving final model to %s", OUTPUT_DIR)
logger.info("Saving log file to %s", OUTPUT_DIR)
with open(os.path.join(OUTPUT_DIR, "logs.json"), 'w') as f:
    json.dump(log_file, f, indent=4)

if run_config.do_eval:
    tokenizer = tokenizer_class.from_pretrained(str(OUTPUT_DIR), do_lower_case=DO_LOWER_CASE)
    model = model_class.from_pretrained(str(OUTPUT_DIR)).to(device)
    result = evaluate(model=model,
                      tokenizer=tokenizer,
                      device=device,
                      file_path=EVAL_FILE,
                      model_type=MODEL_TYPE,
                      output_dir=OUTPUT_DIR,
                      run_config=run_config)
    print("done")

if run_config.do_test:
    tokenizer = tokenizer_class.from_pretrained(str(OUTPUT_DIR), do_lower_case=DO_LOWER_CASE)
    model = model_class.from_pretrained(str(OUTPUT_DIR)).to(device)
    result = predict(model=model,
                     tokenizer=tokenizer,
                     device=device,
                     file_path=TEST_FILE,
                     model_type=MODEL_TYPE,
    'timeit',
    '-n 1 search.knownImageSearch(test_set[0], training_set, cv2.HISTCMP_CHISQR_ALT, 5, HIST_FRAME_SKIP, [1])'
)


# In[ ]:


get_ipython().run_cell_magic(
    'time', '',
    'results = []\n\nfor i, test_segment in enumerate(test_set):\n    print("\\rSearching segment {}/{}".format(i+1, len(test_set)), end=\'\', flush=True)\n\n    results.append(search.knownImageSearch(test_segment, training_set, cv2.HISTCMP_CHISQR_ALT, 5,\n                                           HIST_FRAME_SKIP, [0]))'
)


# ## Evaluate performance

# In[497]:


movie_results, start_frame_dist = evaluate(results, labels)
fractions = (movie_results[0] / movie_results[2] * 100 if movie_results[2] > 0 else 0,
             movie_results[1] / movie_results[0] * 100 if movie_results[0] > 0 else 0)

print("TEST RESULTS\n")
printParams()
print("\nCorrect video: {:d} / {:d} ({:.1f}%)".format(movie_results[0], movie_results[2], fractions[0]))
print("Inside fragment: {:d} / {:d} ({:.1f}%)".format(movie_results[1], movie_results[0], fractions[1]))
print(
def _eval(string, env):
    return evaluation.evaluate(parse.Parser(string).parse_expression(), env)
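# Example usage (a sketch): the expression syntax "1 + 2" is an assumption;
# the environment comes from the project's main module, as elsewhere in this repo.
# result = _eval("1 + 2", main.DefaultEnvironment())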
                    callbacks=[early_stopping, model_checkpoint],
                    verbose=2)
model.save_weights(os.path.join(path_model, '{}.h5'.format(hyperparams_name)), overwrite=True)
pickle.dump((history.history),
            open(os.path.join(path_result, '{}.history.pkl'.format(hyperparams_name)), 'wb'))

print('=' * 10)

# evaluate model
print('evaluating using the model that has the best loss on the valid set')
model.load_weights(fname_param)  # load best weights for current iteration
Y_pred = model.predict(X_test)  # compute predictions
score = evaluate(Y_test, Y_pred, mmn, rmse_factor=1)  # evaluate performance

# save to csv
csv_name = os.path.join('results', 'PredCNN_taxiBJ_results.csv')
if not os.path.isfile(csv_name):
    if os.path.isdir('results') is False:
        os.mkdir('results')
    with open(csv_name, 'a', encoding="utf-8") as file:
        file.write('iteration,'
                   'rmse_in,rmse_out,rmse_tot,'
                   'mape_in,mape_out,mape_tot,'
                   'ape_in,ape_out,ape_tot')
        file.write("\n")
with open(csv_name, 'a', encoding="utf-8") as file:
import test_examples
from src import parse
from src import evaluation
from src import main

for example in test_examples.get_cases():
    example_parser = parse.Parser(example.expr)
    parsed = evaluation.evaluate(example_parser.parse_expression(), main.DefaultEnvironment())
    print(test_examples.format_message(parsed.val, example.evaluation, example.expr))
    if parsed.val != example.evaluation:
        raise Exception('BAD BAD BAD BAD')

print('OK')
def run_experiment(n_steps, grid_size, cone_angle, split_size_range,
                   chain_length, burnin, hpd_values, working_dir,
                   movement_model='rrw', **kwargs):
    """Run a single experiment with the specified parameters.

    Args:
        n_steps (int): Number of steps to simulate.
        grid_size (int): Size of the simulation grid (the exact grid_size is
            adapted to the cone_angle to achieve consistent area/tree size).
        cone_angle (float): Angle of the free cone for the expansion.
        split_size_range (tuple[int, int]): Minimum and maximum area of a taxon.
        chain_length (int): MCMC chain length in BEAST analysis.
        burnin (int): MCMC burnin steps in BEAST analysis.
        hpd_values (list): The values for the HPD coverage statistics.
        working_dir (str): The working directory in which intermediate files
            will be dumped.

    Keyword Args:
        movement_model (str): The movement model to be used in BEAST analysis.
            Options: ['brownian', 'rrw', 'cdrw', 'rdrw']

    Returns:
        dict: Statistics of the experiment (different error values).
    """
    # Paths
    xml_path = working_dir + 'nowhere.xml'

    # Inferred parameters: shrink the grid for wider cones so that
    # cone_angle * grid_size**2 (the covered area) stays roughly constant
    grid_size = int(grid_size / (cone_angle ** 0.5))

    # Run simulation
    p_grow_distr = scipy.stats.beta(1., 1.).rvs
    world, tree_simu, _ = init_cone_simulation(grid_size=(grid_size, grid_size),
                                               p_grow_distr=p_grow_distr,
                                               cone_angle=cone_angle,
                                               split_size_range=split_size_range)
    run_simulation(n_steps, tree_simu, world)
    root = tree_simu.location

    if movement_model == 'tree_statistics':
        results = tree_statistics(tree_simu)
    else:
        # Create an XML file as input for the BEAST analysis
        tree_simu.write_beast_xml(xml_path, chain_length,
                                  movement_model=movement_model,
                                  drift_prior_std=1.)

        # Run phylogeographic reconstruction in BEAST
        run_beast(working_dir=working_dir)

        results = evaluate(working_dir, burnin, hpd_values, root)

    # Add statistics about the simulated tree (to compare between simulation modes)
    results['observed_stdev'] = np.hypot(*np.std(tree_simu.get_leaf_locations(), axis=0))
    leafs_mean = np.mean(tree_simu.get_leaf_locations(), axis=0)
    leafs_mean_offset = leafs_mean - root
    results['observed_drift_x'] = leafs_mean_offset[0]
    results['observed_drift_y'] = leafs_mean_offset[1]
    results['observed_drift_norm'] = np.hypot(*leafs_mean_offset)

    return results
def run_experiment(n_steps, n_expected_leafs, total_drift, total_diffusion,
                   drift_density, p_settle, drift_direction, chain_length,
                   burnin, hpd_values, working_dir, turnover=0.2,
                   clock_rate=1.0, movement_model='rrw', max_fossil_age=0,
                   min_n_fossils=10, **kwargs):
    """Run a single experiment with the specified parameters.

    Args:
        n_steps (int): Number of steps to simulate.
        n_expected_leafs (int): Number of data points to be expected in the end
            (only expected, not an exact value, due to stochasticity).
        total_drift (float): The total distance that every society will travel
            due to drift over the simulated time.
        total_diffusion (float): The expected total distance that every society
            will move away from the root, due to diffusion.
        drift_density (float): Frequency of drift occurring (does not affect
            the total drift).
        p_settle (float): Probability of stopping drift and 'settling' at the
            current location (only diffusion from this point).
        drift_direction (np.array): The direction of drift.
        chain_length (int): MCMC chain length in BEAST analysis.
        burnin (int): MCMC burnin steps in BEAST analysis.
        hpd_values (list): The values for the HPD coverage statistics.
        working_dir (str): The working directory in which intermediate files
            will be dumped.

    Kwargs:
        movement_model (str): The movement model to be used in BEAST analysis
            ('rrw' or 'brownian').
        drop_fossils (bool): Remove extinct taxa from the sampled phylogeny.
        max_fossil_age (float): Remove all fossils older than this.
        min_n_fossils (int): If `max_fossil_age` is set: ensure sampled trees
            have at least this many fossils.

    Returns:
        dict: Statistics of the experiments (different error values).
    """
    # Ensure arrays to be np.array
    root = np.zeros(2)
    drift_direction = np.asarray(drift_direction)
    min_leaves, max_leaves = 0.4 * n_expected_leafs, 2. * n_expected_leafs

    # Paths
    xml_path = working_dir + 'nowhere.xml'

    # Inferred parameters
    drift_direction = normalize(drift_direction)
    step_var = total_diffusion_2_step_var(total_diffusion, n_steps)
    _step_drift = total_drift_2_step_drift(total_drift, n_steps, drift_density=drift_density)
    step_mean = _step_drift * drift_direction

    # Compute birth-/death-rate from n_expected_leafs, n_steps and turnover
    eff_div_rate = np.log(n_expected_leafs) / n_steps
    birth_rate = eff_div_rate / (1 - turnover)
    death_rate = birth_rate * turnover
    # e.g. for turnover = 0.2:
    #   b = e / (4/5) = e * 5/4
    #   d = b * 1/5 = e * 5/4/5 = e/4

    # Check parameter validity
    if True:
        assert 0 < drift_density <= 1
        assert 0 <= turnover < 1
        assert 0 <= death_rate < birth_rate <= 1
        for hpd in hpd_values:
            assert 0 < hpd < 100
        assert burnin < chain_length

    valid_tree = False
    while not valid_tree:
        # Run Simulation
        p0 = np.zeros(2)
        world = VectorWorld()
        tree_simu = VectorState(world, p0, step_mean, step_var, clock_rate,
                                birth_rate, drift_frequency=drift_density,
                                death_rate=death_rate)
        tree_simu, world = run_simulation(n_steps, tree_simu, world, condition_on_root=True)
        tree_simu.drop_fossils(max_fossil_age)

        # Check whether the tree satisfies the criteria:
        # not too small/big & root has two extant subtrees
        n_leafs = len([n for n in tree_simu.iter_leafs() if n.depth == n_steps])
        valid_tree = (min_leaves < n_leafs < max_leaves)

        if n_leafs < min_leaves:
            print('Invalid: Not enough leafs: %i' % n_leafs)
            continue
        elif n_leafs > max_leaves:
            print('Invalid: Too many leafs: %i' % n_leafs)
            continue

        for c in tree_simu.children:
            if not any(n.depth == n_steps for n in c.iter_leafs()):
                valid_tree = False
                print('Invalid: One side of the tree died!')
                break

        if valid_tree and (max_fossil_age > 0):
            if tree_simu.height() < n_steps:
                # This might happen if all languages on one side of the first split go extinct.
                valid_tree = False
                print('Invalid: Tree lost in height!')
            elif tree_simu.n_fossils() < min_n_fossils:
                valid_tree = False
                print('Invalid: Not enough fossils (only %i)' % tree_simu.n_fossils())

    print('Valid tree with %i leaves and %i fossils' % (tree_simu.n_leafs(), tree_simu.n_fossils()))

    if movement_model == 'tree_statistics':
        results = {}
    else:
        # Create an XML file as input for the BEAST analysis
        tree_simu.write_beast_xml(xml_path, chain_length,
                                  movement_model=movement_model,
                                  drift_prior_std=1.)

        # Run phylogeographic reconstruction in BEAST
        run_beast(working_dir=working_dir)

        results = evaluate(working_dir, burnin, hpd_values, root)

    # Add statistics about the simulated tree (to compare between simulation modes)
    results['observed_stdev'] = np.hypot(*np.std(tree_simu.get_leaf_locations(), axis=0))
    leafs_mean = np.mean(tree_simu.get_leaf_locations(), axis=0)
    leafs_mean_offset = leafs_mean - root
    results['observed_drift_x'] = leafs_mean_offset[0]
    results['observed_drift_y'] = leafs_mean_offset[1]
    results['observed_drift_norm'] = np.hypot(*leafs_mean_offset)

    # Always include tree stats
    tree_stats = tree_statistics(tree_simu)
    results.update(tree_stats)

    return results
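# Worked check of the birth-/death-rate arithmetic above (toy values assumed):
# birth_rate - death_rate should recover the effective diversification rate e.
import numpy as np

n_expected_leafs, n_steps, turnover = 100, 1000, 0.2
eff_div_rate = np.log(n_expected_leafs) / n_steps   # e
birth_rate = eff_div_rate / (1 - turnover)          # b = e * 5/4 for turnover 0.2
death_rate = birth_rate * turnover                  # d = e / 4
assert np.isclose(birth_rate - death_rate, eff_div_rate)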
def learner(args):
    start_time = time.time()

    # heartbeat
    heart = time.time()
    heartbeat_interval = 60 * 5  # 5 minutes

    train_steps = args["train_steps"]
    discount_factor = args["discount_factor"]
    batch_size = args["batch_size"]
    device = args["device"]

    # params
    env_config = args["env_config"]
    system_size = env_config["size"]
    grid_shift = int(env_config["size"] / 2)
    policy_update = args["policy_update"]
    save_date = args["save_date"]

    # eval params
    eval_p_errors = args["learner_eval_p_errors"]
    eval_no_episodes = args["learner_eval_no_episodes"]
    eval_freq = args["learner_eval_freq"]
    count_to_eval = 0

    if could_import_tb:
        tb = SummaryWriter("runs/{}/Learner/".format(save_date))

    # Comms
    learner_io_queue = args["learner_io_queue"]
    io_learner_queue = args["io_learner_queue"]
    shared_mem_weights = args["shared_mem_weights"]
    shared_mem_weight_id = args["shared_mem_weight_id"]

    # Init networks
    policy_class = args["model"]
    policy_config = args["model_config"]
    model_no_params = args["model_no_params"]
    checkpoint = args["learner_checkpoint"]
    if policy_class == NN_11 or policy_class == NN_17:
        policy_net = policy_class(policy_config["system_size"],
                                  policy_config["number_of_actions"], device)
        target_net = policy_class(policy_config["system_size"],
                                  policy_config["number_of_actions"], device)
    else:
        policy_net = policy_class()
        target_net = policy_class()

    policy_net.to(device)
    target_net.to(device)

    # define criterion and optimizer
    criterion = nn.MSELoss(reduction='none')
    optimizer = None
    if args["optimizer"] == 'RMSprop':
        optimizer = optim.RMSprop(policy_net.parameters(), lr=args["learning_rate"])
    elif args["optimizer"] == 'Adam':
        optimizer = optim.Adam(policy_net.parameters(), lr=args["learning_rate"])

    # load checkpoint
    if checkpoint is not None:
        # Load checkpoint data (continue training)
        policy_net.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        criterion = checkpoint["loss"]
        params = parameters_to_vector(policy_net.parameters())  # get policy weights
        vector_to_parameters(params, target_net.parameters())   # load policy weights into target
    else:
        # Load initial parameters from shared mem (new start)
        weights = np.empty(model_no_params)
        with shared_mem_weights.get_lock():
            reader = np.frombuffer(shared_mem_weights.get_obj())
            np.copyto(weights, reader)
        vector_to_parameters(from_numpy(weights).type(torch.FloatTensor).to(device),
                             policy_net.parameters())
        vector_to_parameters(from_numpy(weights).type(torch.FloatTensor).to(device),
                             target_net.parameters())

    performance_start = time.time()
    performance_stop = None

    # Start training
    print("Learner: starting training loop.")
    for t in range(train_steps):
        # print("Learner timestep: {}".format(t))

        # Time guard
        if time.time() - start_time > args["job_max_time"]:
            print("Learner: time exceeded, aborting...")
            break

        # update target and update shared memory with new weights
        if t % policy_update == 0 and t != 0:
            performance_stop = time.time()
            performance_elapsed = performance_stop - performance_start
            performance_transitions = policy_update * batch_size
            # print("consuming ", performance_transitions / performance_elapsed, "transitions/s")
            performance_start = time.time()

            params = parameters_to_vector(policy_net.parameters())  # get policy weights
            vector_to_parameters(params, target_net.parameters())   # load policy weights into target
            target_net.to(device)

            # update shared memory with new weights
            with shared_mem_weights.get_lock():
                shared_mem_weights[:] = params.detach().cpu().numpy()
                shared_mem_weight_id.value += 1

        if io_learner_queue.qsize() == 0:
            print("Learner waiting")

        data = io_learner_queue.get()
        batch_state, batch_actions, batch_reward, batch_next_state, \
            batch_terminal, weights, indices = dataToBatch(data, device)

        policy_net.train()
        target_net.eval()

        # compute policy net output
        policy_output = policy_net(batch_state)
        policy_output = policy_output.gather(1, batch_actions.view(-1, 1)).squeeze(1)

        # compute target network output
        # target_output = predictMax(target_net, batch_next_state, len(batch_next_state), grid_shift, system_size, device)
        target_output = predictMaxOptimized(target_net, batch_next_state,
                                            grid_shift, system_size, device)
        target_output = target_output.to(device)

        # compute loss and update replay memory
        y = batch_reward + ((~batch_terminal).type(torch.float) * discount_factor * target_output)
        y = y.clamp(-100, 100)
        loss = criterion(y, policy_output)

        optimizer.zero_grad()
        loss = weights * loss

        # Compute priorities
        priorities = np.absolute(loss.cpu().detach().numpy())

        loss = loss.mean()

        # backpropagate loss
        loss.backward()
        optimizer.step()

        # update priorities in replay_memory
        p_update = (indices, priorities)
        msg = ("priorities", p_update)
        learner_io_queue.put(msg)

        # evaluation of the policy
        count_to_eval += 1
        if eval_freq != -1 and could_import_tb and count_to_eval >= eval_freq:
            count_to_eval = 0
            success_rate, ground_state_rate, _, mean_q_list, _ = evaluate(
                policy_net, 'toric-code-v0', env_config, int(system_size / 2),
                device, eval_p_errors, num_of_episodes=eval_no_episodes,
                epsilon=0.0, num_of_steps=75, plot_one_episode=False,
                minimum_nbr_of_qubit_errors=0)
            for i, p in enumerate(eval_p_errors):
                tb.add_scalar("Network/Mean Q, p error {}".format(p), mean_q_list[i], t)
                tb.add_scalar("Network/Success Rate, p error {}".format(p), success_rate[i], t)
                tb.add_scalar("Network/Ground State Rate, p error {}".format(p), ground_state_rate[i], t)

        # heartbeat to see if the process is alive
        if could_import_tb and time.time() - heart > heartbeat_interval:
            heart = time.time()
            tb.add_scalar("Heartbeat/Learner", 1)

    # close tensorboard writer
    if eval_freq != -1 and could_import_tb:
        tb.close()

    # training done: tell IO to terminate and save the network
    msg = ("terminate", None)
    learner_io_queue.put(msg)
    save_path = "runs/{}/Size_{}_{}_{}.pt".format(save_date, system_size,
                                                  type(policy_net).__name__, save_date)
    torch.save({"model_state_dict": policy_net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": criterion},
               save_path)
    print("Saved network to {}".format(save_path))
    print("Total training steps: {}".format(t))
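# Minimal sketch of the shared-memory weight hand-off used above, with a toy
# 4-parameter model. multiprocessing.Array and torch's parameters_to_vector /
# vector_to_parameters are real APIs; the sizes are assumptions.
import multiprocessing as mp
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import parameters_to_vector, vector_to_parameters

net = nn.Linear(2, 2, bias=False)     # 4 parameters
shared_mem_weights = mp.Array('d', 4)

# publisher side (the learner)
params = parameters_to_vector(net.parameters())
with shared_mem_weights.get_lock():
    shared_mem_weights[:] = params.detach().cpu().numpy()

# consumer side (an actor)
weights = np.frombuffer(shared_mem_weights.get_obj()).copy()
vector_to_parameters(torch.from_numpy(weights).float(), net.parameters())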
hyperparams_name = '3dclost_roma32x32'
fname_param = os.path.join('MODEL', '{}.best.h5'.format(hyperparams_name))

model_checkpoint = ModelCheckpoint(fname_param, monitor='val_rmse', verbose=0,
                                   save_best_only=True, mode='min')

history = model.fit(X_train, Y_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    validation_data=(X_test, Y_test),
                    callbacks=[model_checkpoint],
                    verbose=2)

# predict
Y_pred = model.predict(X_test)  # compute predictions

# evaluate
score = evaluate(Y_test, Y_pred, mmn)  # evaluate performance

# save to csv
csv_name = os.path.join('results', 'roma32x32_results.csv')
save_to_csv(score, csv_name)

## TL without re-training
# load weights
model_fname = 'TaxiBJ.c4.p1.t0.iter7.best.noMeteo.h5'
model.load_weights(os.path.join('../best_models', '3DCLoST', model_fname))

# predict
Y_pred = model.predict(X_test)  # compute predictions

# evaluate
statistic_model = ng.read_ngram_model(model_path,
                                      split_strategy=ng.TOKENIZER,
                                      topN=50,
                                      delta=0.1,
                                      threshold=1.7,
                                      NE_list=possible_name_entity_dict)
new_result = []
for line in result:
    if not re.match("\\s+", line):
        new_line = ng.modify_line(statistic_model, line)
        new_result.append(new_line)

# Write result
print('Writing result ...')
# reader.write_file(result, output_path+'corrected_text1')
# reader.write_file(new_result, output_path+'corrected_text2')
reader.write_file(new_result, output_filename)

# Evaluation
if len(sys.argv) == 4 or use_local_file is True:
    print('Evaluation ...')
    result = reader.lines2string(result)
    new_result = reader.lines2string(new_result)
    data = reader.lines2string(data)
    gold = reader.read_file(gold_filename)
    gold = reader.clean_empty_line(gold)
    gold = reader.lines2string(gold)
    print('WER in raw text:', evaluation.evaluate(data, gold))
    print('WER after rule-based system:', evaluation.evaluate(result, gold))
    print('WER after rule-based and statistical system:', evaluation.evaluate(new_result, gold))
def main(model_config_module):
    model_config = importlib.import_module(model_config_module)

    logger.info(f"Loading data from {RAW_DATA_IN_PATH}")
    raw_dataframe = get_data(RAW_DATA_IN_PATH)

    logger.info(f"Splitting into {config.TRAIN_TEST_SPLIT_RATIO} train and "
                f"{1 - config.TRAIN_TEST_SPLIT_RATIO} test")
    raw_train, raw_test = train_test_split(raw_dataframe, config.TEST_SPLIT_DAYS)

    logger.info(f"Loading metadata from {META_DATA_IN_PATH}")
    meta_dataframe = get_data(META_DATA_IN_PATH)

    logger.info("Processing train dataset")
    processed_train_dataset = preprocess_train_data(raw_train, meta_dataframe)

    initialize_model = model_config.initialize_model
    grid = model_config.GRID

    # set experiment name
    logger.info(f"Starting MLFlow runs in experiment {config.EXPERIMENT_NAME}")
    mlflow.set_experiment(config.EXPERIMENT_NAME)

    logger.info(f"Train model with grid length of {len(grid)}")
    with mlflow.start_run(run_name=f"{model_config.RUN_NAME} grid search parent."):
        for params in grid:
            with mlflow.start_run(run_name=f'{model_config.RUN_NAME}: parameters: {params}',
                                  nested=True):
                logger.info(f"Train model with parameters: {params}.")
                mlflow.log_param("Parameters", params)
                init_model = initialize_model(params)
                model = train_model(init_model, processed_train_dataset)

                logger.info("Adding predictions")
                test_dataframe = add_prediction(test_dataset=raw_test,
                                                base_dataset=processed_train_dataset,
                                                meta_dataframe=meta_dataframe,
                                                model=model,
                                                predict_col_name=config.PREDICT)

                # Metrics
                logger.info("Logging metrics to MLFlow")
                metric = evaluate(test_dataframe)  # TODO: rename from metric to test_metric

                # MLFlow logs
                mlflow.log_metric("Root mean squared error", metric['root_mean_squared_error'])
                mlflow.log_metric("Mean squared error", metric['mean_squared_error'])
                mlflow.log_metric("Mean absolute error", metric['mean_absolute_error'])
                mlflow.log_metric("Mean absolute percentage error",
                                  metric['mean_absolute_percentage_error'])
                mlflow.log_metric("Absolute biggest deviation",
                                  metric['absolute_biggest_deviation'])

                # Plot
                logger.info("Logging time series graph to MLFlow")
                timeserie_plot(test_dataframe, config.DATE_COLUMN, PLOT_ACTUAL_VS_PREDICT_PLOT)

                # Log artifacts (output files)
                mlflow.log_artifact(str(PLOT_ACTUAL_VS_PREDICT_PLOT))

    logger.info(f"Saving model to {MODEL_PATH}")
    save_as_pickle(model, MODEL_PATH)

    logger.info(f"Saving test_dataframe to {TEST_DATAFRAME_PATH}")
    save_as_pickle(test_dataframe, TEST_DATAFRAME_PATH)

    mlflow.end_run()
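# A minimal sketch of an evaluate() producing the metric keys logged above.
# The real project defines evaluate elsewhere; the column names 'actual' and
# 'predict' are assumptions for illustration only.
import numpy as np
import pandas as pd

def evaluate_sketch(df, actual_col='actual', predict_col='predict'):
    err = df[actual_col] - df[predict_col]
    return {
        'root_mean_squared_error': float(np.sqrt(np.mean(err ** 2))),
        'mean_squared_error': float(np.mean(err ** 2)),
        'mean_absolute_error': float(np.mean(np.abs(err))),
        'mean_absolute_percentage_error': float(np.mean(np.abs(err / df[actual_col]))),
        'absolute_biggest_deviation': float(np.max(np.abs(err))),
    }

print(evaluate_sketch(pd.DataFrame({'actual': [10.0, 20.0], 'predict': [12.0, 18.0]})))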
def train_model(lr, batch_size, lstm, lstm_number, save_results=False, i=''):
    # get discrete parameters
    lstm = 350 if lstm < 1 else 500
    lstm_number = int(lstm_number)
    batch_size = 16 if batch_size < 2 else 64
    lr = round(lr, 5)

    # build model
    model = build_model('NY', X_train, Y_train, conv_filt=64, kernel_sz=(2, 3, 3),
                        mask=mask, lstm=lstm, lstm_number=lstm_number,
                        add_external_info=True, lr=lr, save_model_pic=None)
    # model.summary()
    hyperparams_name = 'TaxiNYC{}.c{}.p{}.t{}.lstm_{}.lstmnumber_{}.lr_{}.batchsize_{}'.format(
        i, len_c, len_p, len_t, lstm, lstm_number, lr, batch_size)
    fname_param = os.path.join('MODEL', '{}.best.h5'.format(hyperparams_name))

    early_stopping = EarlyStopping(monitor='val_rmse', patience=25, mode='min')
    # lr_callback = LearningRateScheduler(lrschedule)
    model_checkpoint = ModelCheckpoint(fname_param, monitor='val_rmse', verbose=0,
                                       save_best_only=True, mode='min')

    # train model
    print("training model...")
    ts = time.time()
    if i:
        print(f'Iteration {i}')
        np.random.seed(i * 18)
        tf.random.set_seed(i * 18)
    history = model.fit(X_train_all, Y_train_all,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        validation_data=(X_test, Y_test),
                        # callbacks=[early_stopping, model_checkpoint],
                        # callbacks=[model_checkpoint, lr_callback],
                        callbacks=[model_checkpoint],
                        verbose=2)
    model.save_weights(os.path.join('MODEL', '{}.h5'.format(hyperparams_name)), overwrite=True)
    pickle.dump((history.history),
                open(os.path.join(path_result, '{}.history.pkl'.format(hyperparams_name)), 'wb'))
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

    # evaluate
    model.load_weights(fname_param)
    score = model.evaluate(X_test, Y_test, batch_size=Y_test.shape[0], verbose=0)
    print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
          (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))

    if save_results:
        print('evaluating using the model that has the best loss on the valid set')
        model.load_weights(fname_param)  # load best weights for current iteration
        Y_pred = model.predict(X_test)  # compute predictions
        score = evaluate(Y_test, Y_pred, mmn, rmse_factor=1)  # evaluate performance

        # save to csv
        csv_name = os.path.join('results', '3dclost_taxiNYC_results.csv')
        if not os.path.isfile(csv_name):
            if os.path.isdir('results') is False:
                os.mkdir('results')
            with open(csv_name, 'a', encoding="utf-8") as file:
                file.write('iteration,'
                           'rmse_in,rmse_out,rmse_tot,'
                           'mape_in,mape_out,mape_tot,'
                           'ape_in,ape_out,ape_tot')
                file.write("\n")
        with open(csv_name, 'a', encoding="utf-8") as file:
            file.write(f'{i},{score[0]},{score[1]},{score[2]},{score[3]},'
                       f'{score[4]},{score[5]},{score[6]},{score[7]},{score[8]}')
            file.write("\n")

    K.clear_session()

    # bayes opt is a maximization algorithm; to minimize validation loss, return 1 - loss
    bayes_opt_score = 1.0 - score[1]
    return bayes_opt_score
"number_of_actions": env_config["size"] } #model = NN_11(model_config["system_size"], 3, device) model = ResNet18() model.load_state_dict(torch.load("runs/15_Apr_2020_20_06_29/Size_9_ResNet_15_Apr_2020_20_06_29.pt", map_location=device)["model_state_dict"]) model.to(device) model.eval() p_error = np.linspace(0.06, 0.2, 8, endpoint=True) for p in p_error: success_rate, ground_state_rate, average_number_of_steps_list, mean_q_list, failed_syndroms = evaluate( model, 'toric-code-v0', env_config, int(env_config["size"]/2), device, [p], num_of_episodes=no_episodes, epsilon=0.0, num_of_steps=75, plot_one_episode=False, minimum_nbr_of_qubit_errors=0) tb = SummaryWriter(log_dir='runs/test_size_{}_steps_{}'.format(env_config["size"], no_episodes)) # for i, p in enumerate(p_error): tb.add_scalar("Performance/Ground State Rate", ground_state_rate[0], p*100) tb.add_scalar("Performance/Success Rate", success_rate[0], p*100) tb.add_scalar("Performance/Mean Q", mean_q_list[0], p*100) tb.add_scalar("Performance/Avg No Steps", average_number_of_steps_list[0], p*100)
def train_model(batch_size, encoder_block, filters, save_results=False, i='',
                freeze=True, spatial=False):
    # build model
    model = build_model(len_closeness, len_period, len_trend, nb_flow,
                        map_height, map_width, external_dim=external_dim,
                        encoder_blocks=encoder_block, filters=filters,
                        kernel_size=3, num_res=2)

    if encoder_block == 3:
        if freeze:
            # load weights
            model_fname = 'model3resunit_doppia_attention.TaxiBJ1.c4.p0.t0.encoderblocks_3.kernel_size_3.lr_0.0007.batchsize_16.noMeteo.best.h5'
            model.load_weights(os.path.join('../best_models', 'model3', model_fname))
            if not spatial:
                # freeze all layers except attention
                for layer in model.layers[:-28]:
                    layer.trainable = False
                hyperparams_name = 'Roma_32x32_iterazione{}_trained_attention_accuracy'.format(i)
            else:
                # freeze all layers except spatial attention
                for layer in model.layers[:-13]:
                    layer.trainable = False
                hyperparams_name = 'Roma_32x32_iterazione{}_trained_only_spatial_accuracy'.format(i)
        else:
            hyperparams_name = 'Roma_32x32_iterazione{}_trained_random_weight_accuracy'.format(i)
    else:
        if freeze:
            # load weights
            model_fname = 'model3resunit_doppia_attention.TaxiNYC5.c4.p0.t0.encoderblocks_2.kernel_size_3.lr_0.00086.batchsize_48.best.h5'
            model.load_weights(os.path.join('../best_models', 'model3', model_fname))
            if not spatial:
                # freeze all layers except attention
                for layer in model.layers[:-28]:
                    layer.trainable = False
                hyperparams_name = 'Roma_16x8_iterazione{}_trained_attention_accuracy'.format(i)
            else:
                # freeze all layers except spatial attention
                for layer in model.layers[:-13]:
                    layer.trainable = False
                hyperparams_name = 'Roma_16x8_iterazione{}_trained_only_attention_accuracy'.format(i)
        else:
            hyperparams_name = 'Roma_16x8_iterazione{}_trained_random_weight_accuracy'.format(i)

    fname_param = os.path.join('MODEL', '{}.best.h5'.format(hyperparams_name))
    model_checkpoint = ModelCheckpoint(fname_param, monitor='val_rmse', verbose=0,
                                       save_best_only=True, mode='min')

    # train model
    print("training model...")
    ts = time.time()
    print(f'Iteration {i}')
    np.random.seed(i * 18)
    tf.random.set_seed(i * 18)
    history = model.fit(X_train_all, Y_train_all,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        validation_data=(X_test, Y_test),
                        callbacks=[model_checkpoint],
                        verbose=0)
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))
    tempo = time.time() - ts

    # evaluate
    model.load_weights(fname_param)
    score = model.evaluate(X_test, Y_test, batch_size=128, verbose=0)
    print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
          (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))

    if save_results:
        print('evaluating using the model that has the best loss on the valid set')
        model.load_weights(fname_param)  # load best weights for current iteration
        Y_pred = model.predict(X_test)  # compute predictions
        score = evaluate(Y_test, Y_pred, mmn, rmse_factor=1)  # evaluate performance

        # save h5 file to generate map
        save_map(Y_pred, i, freeze, spatial)

        # save to csv
        if freeze:
            if not spatial:
                csv_name = os.path.join('results',
                                        f'Roma_{map_height}x{map_width}_trained_attention_results.csv')
            else:
                csv_name = os.path.join('results',
                                        f'Roma_{map_height}x{map_width}_trained_only_spatial_results.csv')
        else:
            csv_name = os.path.join('results',
                                    f'Roma_{map_height}x{map_width}_trained_random_weight_results.csv')
        if not os.path.isfile(csv_name):
            if os.path.isdir('results') is False:
                os.mkdir('results')
            with open(csv_name, 'a', encoding="utf-8") as file:
                file.write('iteration,'
                           'rmse_in,rmse_out,rmse_tot,'
                           'mape_in,mape_out,mape_tot,'
                           'ape_in,ape_out,ape_tot,'
                           'tempo_esecuzione')
                file.write("\n")
        with open(csv_name, 'a', encoding="utf-8") as file:
            file.write(f'{i},{score[0]},{score[1]},{score[2]},{score[3]},'
                       f'{score[4]},{score[5]},{score[6]},{score[7]},{score[8]},'
                       f'{tempo}')
            file.write("\n")

    K.clear_session()
def train_model(encoder_blocks, lstm_units, lr, batch_size, save_results=False, i=''):
    # get discrete parameters
    encoder_blocks = int(encoder_blocks)
    lstm_units = 2 ** int(lstm_units)
    batch_size = 16 * int(batch_size)
    filters = [32, 64, 16] if encoder_blocks == 2 else [32, 64, 64, 16]

    # build model
    m = models_dict[model_name]
    model = m.build_model(
        len_closeness, len_period, len_trend, nb_flow, map_height, map_width,
        external_dim=external_dim,
        lr=lr,
        encoder_blocks=encoder_blocks,
        filters=filters,
        lstm_units=lstm_units
        # save_model_pic=f'BikeNYC_{model_name}'
    )
    # model.summary()
    hyperparams_name = '{}.BikeNYC{}.c{}.p{}.t{}.encoderblocks_{}.lstm_{}.lr_{}.batchsize_{}'.format(
        model_name, i, len_closeness, len_period, len_trend, encoder_blocks,
        lstm_units, lr, batch_size)
    fname_param = os.path.join('MODEL', '{}.best.h5'.format(hyperparams_name))

    early_stopping = EarlyStopping(monitor='val_rmse', patience=25, mode='min')
    model_checkpoint = ModelCheckpoint(fname_param, monitor='val_rmse', verbose=0,
                                       save_best_only=True, mode='min')

    # train model
    print("training model...")
    ts = time.time()
    if i:
        np.random.seed(i * 18)
        tf.random.set_seed(i * 18)
    history = model.fit(X_train, Y_train,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        validation_data=(X_test, Y_test),
                        # callbacks=[early_stopping, model_checkpoint],
                        callbacks=[model_checkpoint],
                        verbose=0)
    model.save_weights(os.path.join('MODEL', '{}.h5'.format(hyperparams_name)), overwrite=True)
    pickle.dump((history.history),
                open(os.path.join(path_result, '{}.history.pkl'.format(hyperparams_name)), 'wb'))
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

    # evaluate
    model.load_weights(fname_param)
    score = model.evaluate(X_test, Y_test, batch_size=Y_test.shape[0], verbose=0)
    print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
          (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))

    if save_results:
        print('evaluating using the model that has the best loss on the valid set')
        model.load_weights(fname_param)  # load best weights for current iteration
        Y_pred = model.predict(X_test)  # compute predictions
        score = evaluate(Y_test, Y_pred, mmn, rmse_factor=1)  # evaluate performance

        # save to csv
        csv_name = os.path.join('results', f'{model_name}_bikeNYC_results.csv')
        if not os.path.isfile(csv_name):
            if os.path.isdir('results') is False:
                os.mkdir('results')
            with open(csv_name, 'a', encoding="utf-8") as file:
                file.write('iteration,'
                           'rmse_in,rmse_out,rmse_tot,'
                           'mape_in,mape_out,mape_tot,'
                           'ape_in,ape_out,ape_tot')
                file.write("\n")
        with open(csv_name, 'a', encoding="utf-8") as file:
            file.write(f'{i},{score[0]},{score[1]},{score[2]},{score[3]},'
                       f'{score[4]},{score[5]},{score[6]},{score[7]},{score[8]}')
            file.write("\n")

    K.clear_session()

    # bayes opt is a maximization algorithm; to minimize validation loss, return 1 - loss
    bayes_opt_score = 1.0 - score[1]
    return bayes_opt_score
def train(batch_size=2, epochs=1, show_vizdom=False, run_eval=False):
    '''
    :param batch_size:
    :param epochs:
    :param show_vizdom: if this is True, a visdom server should be running in the background
    :param run_eval:
    :return:
    '''
    data_transforms = {
        'train': transforms.Compose([transforms.ToTensor()]),
        'val': transforms.Compose([transforms.ToTensor()]),
    }
    ds = FileCsvJsonDataset(target_col='', transform=data_transforms['train'])

    if USE_GPU:
        model = FineTuneImageNet(num_classes=1).cuda()
    else:
        model = FineTuneImageNet(num_classes=1)

    run_id = utils.generate_run_id()
    since = time.time()
    best_model_wts = model.state_dict()
    least_loss = 999999.99
    val_loss = 999999.99

    for epoch in range(epochs):
        train_idx, val_idx = ds.train_val_split()
        data_loaders = {
            'train': torch.utils.data.DataLoader(ds, sampler=SubsetRandomSampler(indices=train_idx),
                                                 batch_size=batch_size, num_workers=4),
            'val': torch.utils.data.DataLoader(ds, sampler=SubsetRandomSampler(indices=val_idx),
                                               batch_size=batch_size, num_workers=4)
        }
        dataset_sizes = {'train': len(train_idx), 'val': len(val_idx)}
        # pbar = tqdm_notebook(train_loader, total=len(train_loader))

        print('Epoch {}/{}'.format(epoch + 1, epochs))
        print('-' * 10)
        val_loss, val_acc = train_model(model=model, data_loaders=data_loaders,
                                        dataset_sizes=dataset_sizes, epoch=epoch,
                                        show_vizdom=show_vizdom)

        # get validation metric to check best run
        if val_loss < least_loss:
            least_loss = val_loss
            best_model_wts = model.state_dict()
            # torch.save(model.state_dict(), 'generated/model/{}_{}.pth.tar'.format(run_id, epoch))

    torch.save(best_model_wts, 'generated/models/{}.pth.tar'.format(run_id))
    print("Saved model at generated/models/{}.pth.tar".format(run_id))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val_loss: {}'.format(least_loss))

    if run_eval:
        print("Starting evaluation for: {}".format(run_id))
        eval.evaluate(model=model)
        print("Evaluation ended")
def main(model_config_module):
    model_config = importlib.import_module(model_config_module)
    mlflow.set_experiment(config.EXPERIMENT_NAME)

    logger.info(f"Loading data from {RAW_DATA_IN_PATH}")
    raw_dataframe = get_data(RAW_DATA_IN_PATH)

    logger.info(f"Splitting into {config.TRAIN_TEST_SPLIT_RATIO} train and "
                f"{1 - config.TRAIN_TEST_SPLIT_RATIO} test")
    raw_train, raw_test = train_test_split(raw_dataframe, config.TEST_SPLIT_DAYS)

    logger.info(f"Loading metadata from {META_DATA_IN_PATH}")
    meta_dataframe = get_data(META_DATA_IN_PATH)
    meta_dataframe = meta_dataframe.sort_values('WorkingDate')

    logger.info("Processing train dataset")
    processed_train_dataset = preprocess_train_data(raw_train, meta_dataframe)
    processed_test_dataset = preprocess_test_data(raw_test)

    with mlflow.start_run(run_name=f"{model_config.RUN_NAME}."):
        logger.info("Training model")
        model = train_model(model_config.model, processed_train_dataset)

        logger.info("Adding predictions")
        test_dataframe = add_prediction(test_dataset=processed_test_dataset,
                                        base_dataset=processed_train_dataset,
                                        meta_dataframe=meta_dataframe,
                                        model=model,
                                        predict_col_name=config.PREDICT)

        # Metrics
        logger.info("Logging test metrics to MLFlow")
        metric = evaluate(test_dataframe)  # TODO: rename from metric to test_metric

        # MLFlow logs
        mlflow.log_metric("Test - Root mean squared error", metric['root_mean_squared_error'])
        mlflow.log_metric("Test - Mean squared error", metric['mean_squared_error'])
        mlflow.log_metric("Test - Mean absolute error", metric['mean_absolute_error'])
        mlflow.log_metric("Test - Mean absolute percentage error",
                          metric['mean_absolute_percentage_error'])
        mlflow.log_metric("Test - Absolute biggest deviation",
                          metric['absolute_biggest_deviation'])

        # Plot
        logger.info("Logging time series graph to MLFlow")
        timeserie_plot(test_dataframe, config.DATE_COLUMN,
                       PLOT_ACTUAL_VS_PREDICT_PLOT_TEST, view="Test")

        # Log artifacts (output files)
        mlflow.log_artifact(str(PLOT_ACTUAL_VS_PREDICT_PLOT_TEST))

        logger.info(f"Saving model to {MODEL_PATH}")
        save_as_pickle(model, MODEL_PATH)

        logger.info(f"Saving test_dataframe to {TEST_DATAFRAME_PATH}")
        save_as_pickle(test_dataframe, TEST_DATAFRAME_PATH)

        logger.info("Logging train metrics to MLFlow")
        train_dataframe = add_prediction_to_train_df(processed_train_dataset,
                                                     model=model,
                                                     predict_col_name=config.PREDICT)

        # MLFlow logs
        train_metric = evaluate(train_dataframe)  # TODO: rename from metric to train_metric
        mlflow.log_metric("Train - Root mean squared error", train_metric['root_mean_squared_error'])
        mlflow.log_metric("Train - Mean squared error", train_metric['mean_squared_error'])
        mlflow.log_metric("Train - Mean absolute error", train_metric['mean_absolute_error'])
        mlflow.log_metric("Train - Mean absolute percentage error",
                          train_metric['mean_absolute_percentage_error'])
        mlflow.log_metric("Train - Absolute biggest deviation",
                          train_metric['absolute_biggest_deviation'])

        # Plot
        logger.info("Logging train time series graph to MLFlow")
        timeserie_plot(train_dataframe, config.DATE_COLUMN,
                       PLOT_ACTUAL_VS_PREDICT_PLOT_TRAIN, view="Train")

        # Log artifacts (output files)
        mlflow.log_artifact(str(PLOT_ACTUAL_VS_PREDICT_PLOT_TRAIN))

        # Save as excel file to MLFlow (output files)
        logger.info("Saving dataframe to MLFlow")
        save_as_excel(test_dataframe, train_dataframe, PATH_TO_EXCEL_FILE)
        mlflow.log_artifact(str(PATH_TO_EXCEL_FILE))

    mlflow.end_run()
def main():
    """Load the graph, create the embeddings, evaluate them with link
    prediction and save the results."""
    args = parse_args()

    graph = utils.load_graph(args.weighted, args.directed, args.input)
    utils.print_graph_info(graph, "original graph")
    graph.remove_nodes_from(list(nx.isolates(graph)))
    utils.print_graph_info(graph, "graph without isolates")

    edge_splitter_test = EdgeSplitter(graph)
    graph_test, X_test_edges, y_test = edge_splitter_test.train_test_split(
        p=args.test_percentage, method="global")
    edge_splitter_train = EdgeSplitter(graph_test, graph)
    graph_train, X_edges, y = edge_splitter_train.train_test_split(
        p=args.train_percentage, method="global")
    X_train_edges, X_model_selection_edges, y_train, y_model_selection = train_test_split(
        X_edges, y, train_size=0.75, test_size=0.25)

    logger.info('\nEmbedding algorithm started.')
    start = time.time()
    embedding.create_embedding(args, graph_train)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embeddings = utils.load_embedding(args.output)

    logger.info('\nEmbedding evaluation started.')
    start = time.time()
    results = evaluation.evaluate(args.classifier, embeddings, X_train_edges, y_train,
                                  X_model_selection_edges, y_model_selection)
    time_diff = time.time() - start
    logger.info(f'Embedding evaluation finished in {time_diff:.2f} seconds.')

    best_result = max(results, key=lambda result: result["roc_auc"])
    logger.info(f"\nBest roc_auc_score on train set using "
                f"'{best_result['binary_operator'].__name__}': {best_result['roc_auc']}.")

    logger.info('\nEmbedding algorithm started.')
    start = time.time()
    embedding.create_embedding(args, graph_test)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embedding_test = utils.load_embedding(args.output)
    roc_auc, average_precision, accuracy, f1 = evaluation.evaluate_model(
        best_result["classifier"], embedding_test, best_result["binary_operator"],
        X_test_edges, y_test)

    logger.info(f"Scores on test set using '{best_result['binary_operator'].__name__}'.")
    logger.info(f"roc_auc_score: {roc_auc}")
    logger.info(f"average_precision_score: {average_precision}")
    logger.info(f"accuracy_score: {accuracy}")
    logger.info(f"f1_score: {f1}\n")

    if args.results:
        evaluation.save_evaluation_results(args.dataset, args.method, args.classifier,
                                           (roc_auc, average_precision, accuracy, f1),
                                           args.results)
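# The 'binary_operator' selected above combines two node embeddings into one
# edge feature for the link-prediction classifier. A typical operator set
# (a sketch of the common node2vec-style choices; the project defines its own
# in its evaluation module) could look like:
import numpy as np

def hadamard(u, v):
    return u * v

def average(u, v):
    return (u + v) / 2.0

def weighted_l1(u, v):
    return np.abs(u - v)

def weighted_l2(u, v):
    return (u - v) ** 2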