def main():
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)    # (None, 46, 1)

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train,
                                            X_test, Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
def main(): # Building Phase data = import_data("./dataset/crx_clean.data.txt") X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data) # Operational Phase scorer = make_scorer(f1_score, pos_label='+') print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n") Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test, scorer) print() print() print(Y_pred_grid_search) print() print(Y_test) print() print_scores(Y_test, Y_pred_grid_search)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', help='path to data directory')
    parser.add_argument('--n-jobs', type=int, default=1, help='parallelism')
    args = parser.parse_args()

    x_train, y_train = load_dataset(os.path.join(args.data_dir, 'train.txt'))
    x_valid, y_valid = load_dataset(os.path.join(args.data_dir, 'valid.txt'))
    x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))

    n_train, n_valid = len(x_train), len(x_valid)
    x_train = np.concatenate([x_train, x_valid])
    y_train = np.concatenate([y_train, y_valid])

    vectorizer = TfidfVectorizer(tokenizer=tokenize, max_df=0.5, min_df=5)
    vectorizer.fit(x_train)
    x_train_vectorized = vectorizer.transform(x_train)
    x_test_vectorized = vectorizer.transform(x_test)

    steps = [
        ('decomposer', TruncatedSVD(random_state=42)),
        ('classifier', RandomForestClassifier())
    ]
    pipeline = Pipeline(steps)
    params = {
        'decomposer__n_components': [32, 64, 128, 256],
        'classifier__n_estimators': [64, 128, 256, 512]
    }

    splitter = [list(range(0, n_train))], [list(range(n_train, n_train + n_valid))]
    predictor = GridSearchCV(
        pipeline, params, cv=zip(*splitter), n_jobs=args.n_jobs, verbose=3
    )
    predictor.fit(x_train_vectorized, y_train)

    y_test_pred = predictor.predict(x_test_vectorized)
    print_scores(y_test, y_test_pred)
    print(predictor.best_params_)
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
    clf_entropy = train_using_entropy(X_train, Y_train)

    # Operational Phase
    print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
    Y_pred_entropy = prediction(X_test, clf_entropy)
    print_scores(Y_test, Y_pred_entropy)

    print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
    fold_scores = cv_with_entropy(X, Y)
    print("Cross Validate: ", fold_scores)
    print("Best F1_score: ", max(fold_scores) * 100)

    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train,
                                           X_test, Y_test, scorer)
    print_scores(Y_test, Y_pred_grid_search)
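# The snippets above call a `print_scores(y_true, y_pred)` helper that is not
# shown in this listing. Below is a minimal sketch of what such a helper could
# look like, assuming string labels with '+' as the positive class; this is an
# illustrative assumption, not the original implementation.
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)


def print_scores(y_true, y_pred):
    # Confusion matrix, overall accuracy, and F1 of the positive ('+') class
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Accuracy: ", accuracy_score(y_true, y_pred) * 100)
    print("F1 score ('+'): ", f1_score(y_true, y_pred, pos_label='+') * 100)
    print("Report:\n", classification_report(y_true, y_pred))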
test_loader = cifar_loader(batch_test)

if torch.cuda.is_available():
    # releasing unnecessary memory in GPU
    torch.cuda.empty_cache()

# ----------------- TESTING -----------------
test_losses = 0
precision, recall, f1, accuracy = [], [], [], []

with torch.no_grad():
    for i, data in enumerate(test_loader):
        X, y = data[0].to(device), data[1].to(device)

        outputs = net(X)  # this gets the prediction from the network
        test_losses += criterion(outputs, y)

        predicted_classes = torch.max(outputs, 1)[1]  # get class from network's prediction

        # calculate P/R/F1/A metrics for batch
        for acc, metric in zip(
                (precision, recall, f1, accuracy),
                (precision_score, recall_score, f1_score, accuracy_score)):
            acc.append(calculate_metric(metric, y.cpu(), predicted_classes.cpu()))

# print(f"\nEpoch {epoch + 1}/{num_epochs}, training loss: {epoch / len(train_loader)}, "
#       f"validation loss: {test_losses / len(test_loader)}")
print_scores(precision, recall, f1, accuracy, len(test_loader))
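# Hedged sketches of the two helpers used in the testing loop above; they are
# assumptions for illustration, not the original code. `calculate_metric`
# applies one sklearn metric to a single batch, and `print_scores` averages the
# per-batch lists over the number of batches.
import numpy as np


def calculate_metric(metric_fn, y_true, y_pred):
    # Precision/recall/F1 need an averaging mode for multi-class labels;
    # accuracy_score takes no such argument, hence the fallback.
    try:
        return metric_fn(y_true, y_pred, average="macro")
    except TypeError:
        return metric_fn(y_true, y_pred)


def print_scores(precision, recall, f1, accuracy, n_batches):
    for name, values in zip(("precision", "recall", "F1", "accuracy"),
                            (precision, recall, f1, accuracy)):
        print(f"{name}: {np.sum(values) / n_batches:.4f}")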
def train(self, scratch, game, display):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=display)
    t1 = time.time()

    fname = None
    if not scratch:
        fname = self.load()
    else:
        delete_files(self.DATA_DIREC)

    f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)
    eps_tau = (self.NB_FRAMES - f0) // 8
    scores = []

    while step < self.NB_FRAMES:
        if len(scores) == self.SCORE_FREQ:
            print('States visited:', len(self.Q))
            print_scores(scores, self.SCORE_FREQ)
            scores = []

        p.reset_game()
        state = game.getGameState()
        state_tp = self.discretize(state)
        if state_tp not in self.Q:
            self.Q[state_tp] = [0, 0]
        act = 1
        episode = deque([], self.SIZE_FIFO)
        elig = {}
        gscore = 0
        nb_games += 1

        while not p.game_over():
            step += 1
            if step != 0 and (step % self.SAVE_FREQ) == 0:
                self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_' +
                          str(nb_games) + '.p')
                nb_save += 1
            if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                              self.NB_FRAMES)

            # 1) Observe r, s′
            bare_reward = p.act(ACTIONS[act])
            reward = self.reward_engineering(bare_reward)
            new_state = game.getGameState()
            new_state_tp = self.discretize(new_state)

            # 2) Choose a′ (GLIE actor) using Q
            if new_state_tp not in self.Q:
                self.Q[new_state_tp] = [0, 0]
            qvals = self.get_qvals(new_state)
            new_act = self.greedy_action(qvals, self.epsilon)

            # 3) Temporal difference: δ = r + γ·Q(s′, a′) − Q(s, a)
            delta = (reward + self.GAMMA * self.Q[new_state_tp][new_act] -
                     self.Q[state_tp][act])

            # 4) Update Q
            episode.append((state_tp, act))
            elig[(state_tp, act)] = 1
            for (state_tp_ep, act_ep) in episode:
                self.Q[state_tp_ep][act_ep] += (
                    self.ALPHA * delta * elig[(state_tp_ep, act_ep)])
                elig[(state_tp_ep, act_ep)] *= self.LAMBDA

            # 5) s <- s', a <- a'
            state = new_state
            state_tp = new_state_tp
            act = new_act

            if bare_reward > 0:
                gscore += 1

        scores.append(gscore)

    t2 = time.time()
    # Unicode code point of a: 97
    self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_' +
              str(nb_games) + '.p')
    print()
    print('Number of played games:', nb_games)
    print('Training completed in', (t2 - t1) / 60, 'minutes')
    print()
def train(self, scratch, game, display):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=display)

    fname = None
    if not scratch:
        fname = self.load()
    else:
        delete_files(self.DATA_DIREC)

    f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)
    eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
    scores = []

    while step < self.NB_FRAMES:
        if len(scores) == self.SCORE_FREQ:
            print_scores(scores, self.SCORE_FREQ)
            scores = []

        p.reset_game()
        state = game.getGameState()
        state_arr = self.state_to_arr(state)
        # state_arr = self.scaler.transform(state_arr.reshape(1, -1))
        gscore = 0
        nb_games += 1

        while not p.game_over():
            step += 1
            if step != 0 and (step % self.SAVE_FREQ) == 0:
                self.save(chr(97 + nb_save) + '_' + str(step) + '_' +
                          str(nb_games))
                nb_save += 1
            if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                              self.NB_FRAMES)
                print('WEIGHTS ABS MEAN')
                print(abs(np.mean(self.model.get_weights()[0], axis=1)))

            # 1) In s, choose a (GLIE actor)
            qvals = self.get_qvals(state)
            act = self.greedy_action(qvals, self.epsilon)

            # 2) Observe r, s′
            bare_reward = p.act(ACTIONS[act])
            reward = self.reward_engineering(bare_reward)
            new_state = game.getGameState()
            new_state_arr = self.state_to_arr(new_state)

            # store the raw reward; it is re-engineered when replayed below
            self.replay_memory.append((state_arr, act, bare_reward, new_state_arr))

            if (len(self.replay_memory) == self.BUFFER_SIZE
                    and step % self.TRAIN_FREQ == 0):
                X_train = []
                y_train = []
                # TEST: TRAIN ONLY WITH A SMALL BUFFER BATCH
                replay_memory_copy = list(self.replay_memory)[:]
                random.shuffle(replay_memory_copy)
                for frame in replay_memory_copy[:self.BATCH_SIZE]:
                    s_arr_1, act_x, bare_reward_x, s_arr_2 = frame
                    reward_x = self.reward_engineering(bare_reward_x)
                    old_qval = self.model.predict(s_arr_1, batch_size=1)
                    qval_new = self.model.predict(s_arr_2, batch_size=1)
                    max_qval = np.max(qval_new)
                    # terminal state
                    if bare_reward_x < 0:
                        delta = reward_x
                    else:
                        delta = reward_x + self.GAMMA * max_qval
                    y = np.zeros((1, len(ACTIONS)))
                    y[0][:] = old_qval[0][:]
                    y[0][act_x] = old_qval[0][act_x] + self.ALPHA * delta
                    X_train.append(s_arr_1.reshape(len(STATES), ))
                    y_train.append(y.reshape(len(ACTIONS), ))

                X_train = np.array(X_train)
                y_train = np.array(y_train)
                self.model.fit(X_train, y_train, batch_size=self.BATCH_SIZE,
                               epochs=2, verbose=False)

            # 5) s <- s'
            state = new_state
            state_arr = new_state_arr

            if bare_reward > 0:
                gscore += 1

        scores.append(gscore)

    self.save(chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games))
def train(self, scratch, game, display):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=display)

    fname = None
    if not scratch:
        fname = self.load()
    else:
        delete_files(self.DATA_DIREC)

    f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)
    eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
    scores = []

    while step < self.NB_FRAMES:
        if len(scores) == self.SCORE_FREQ:
            print_scores(scores, self.SCORE_FREQ)
            scores = []

        p.reset_game()
        self.game.getGameState()
        screen = self.process_screen(p.getScreenRGB())
        last_screens_buff = deque([screen] * 4, maxlen=NB_LAST_SCREENS)
        last_screens = np.stack(last_screens_buff, axis=-1)
        # gscore = 0
        nb_games += 1
        score = 0

        while not p.game_over():
            step += 1
            if step != 0 and (step % self.SAVE_FREQ) == 0:
                self.save(chr(97 + nb_save) + '_' + str(step) + '_' +
                          str(nb_games))
                nb_save += 1
            if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                              self.NB_FRAMES)
            # print('WEIGHTS ABS MEAN')
            # print(abs(np.mean(self.model.get_weights()[0], axis=1)))

            # 1) In s, choose a (GLIE actor)
            qvals = self.get_qvals(last_screens)
            act = self.greedy_action(qvals, self.epsilon)

            # 2) Observe r, s′
            bare_reward = p.act(ACTIONS[act])
            if bare_reward > 0:
                score += 1
            reward = self.reward_engineering(bare_reward)
            screen_new = self.process_screen(p.getScreenRGB())

            # update replay_memory
            self.replay_memory.append(screen, act, screen_new, reward)

            if len(self.replay_memory.buff) > self.MIN_REPLAY_MEMORY_SIZE:
                # build minibatch
                ls, actions, ls_new, r, terms = self.replay_memory.minibatch()
                qvals_new = self.model_target.predict(ls_new)
                qvals_new_max = qvals_new.max(1).reshape((self.BATCH_SIZE, 1))
                delta = r + (1 - terms) * self.GAMMA * qvals_new_max
                qvals = self.model.predict(ls)
                qvals[np.arange(self.BATCH_SIZE), actions.ravel()] = delta.ravel()
                self.model.train_on_batch(x=ls, y=qvals)

                if step % self.TARGET_FREQ == 0:
                    self.model.save(filepath=self.DATA_DIREC + 'target.h5')
                    self.model_target = load_model(
                        filepath=self.DATA_DIREC + 'target.h5')

            last_screens_buff.append(screen_new)
            last_screens = np.stack(last_screens_buff, axis=-1)
            screen = screen_new

        scores.append(score)
def test_score(beam_size, encoder, decoder, imgs_path, df_path, vocab,
               return_results=False):
    loader = get_loaders(1, imgs_path, df_path, transform, vocab,
                         test=True, n_workers=8)
    vocab_size = len(vocab)

    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed
        # from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                       (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores
            # (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            # print(top_k_scores, top_k_words)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)],
                             dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != vocab.stoi['<eos>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in
                    {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}
                ], img_caps))  # remove <start>, <end> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            w for w in seq
            if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-1..4 scores
    # bleu4 = corpus_bleu(references, hypotheses)
    b1, b2, b3, b4 = print_scores(references, hypotheses, vocab=vocab)

    if return_results:
        return references, hypotheses
    return b1, b2, b3, b4
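# The beam-search evaluation above relies on a `print_scores(references,
# hypotheses, vocab=...)` helper that is not shown. The sketch below is an
# assumption consistent with the commented-out `corpus_bleu(references,
# hypotheses)` call and the four returned values; the `vocab` argument is kept
# only to match the call signature and is unused here.
from nltk.translate.bleu_score import corpus_bleu


def print_scores(references, hypotheses, vocab=None):
    b1 = corpus_bleu(references, hypotheses, weights=(1.0, 0.0, 0.0, 0.0))
    b2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0.0, 0.0))
    b3 = corpus_bleu(references, hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0.0))
    b4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))
    print("BLEU-1: {:.4f}  BLEU-2: {:.4f}  BLEU-3: {:.4f}  BLEU-4: {:.4f}".format(
        b1, b2, b3, b4))
    return b1, b2, b3, b4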
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', help='path to data directory')
    parser.add_argument('--n-jobs', type=int, default=1, help='parallelism')
    parser.add_argument('-s', '--save-model', action='store_true',
                        help='whether to save model')
    parser.add_argument('-l', '--load-model', action='store_true',
                        help='whether to load model')
    args = parser.parse_args()

    if args.load_model:
        x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))
        vectorizer = joblib.load(
            os.path.join(get_model_path(), 'ML_models/vectorizer.pkl'))
        predictor = joblib.load(
            os.path.join(get_model_path(), 'ML_models/random_forest.pkl'))

        x_test_vectorized = vectorizer.transform(x_test)
        y_test_pred = predictor.predict(x_test_vectorized)
        print_scores(y_test, y_test_pred)
    else:
        x_train, y_train = load_dataset(os.path.join(args.data_dir, 'train.txt'))
        x_valid, y_valid = load_dataset(os.path.join(args.data_dir, 'valid.txt'))
        x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))

        n_train, n_valid = len(x_train), len(x_valid)
        x_train = np.concatenate([x_train, x_valid])
        y_train = np.concatenate([y_train, y_valid])

        vectorizer = TfidfVectorizer(tokenizer=tokenize, max_df=0.5, min_df=5)
        vectorizer.fit(x_train)
        x_train_vectorized = vectorizer.transform(x_train)
        x_test_vectorized = vectorizer.transform(x_test)

        steps = [
            ('decomposer', TruncatedSVD(random_state=42)),
            ('classifier', RandomForestClassifier())
        ]
        pipeline = Pipeline(steps)
        params = {
            'decomposer__n_components': [32, 64, 128, 256],
            'classifier__n_estimators': [64, 128, 256, 512]
        }

        splitter = [list(range(0, n_train))], [list(range(n_train, n_train + n_valid))]
        predictor = GridSearchCV(
            pipeline, params, cv=zip(*splitter), n_jobs=args.n_jobs, verbose=3
        )
        predictor.fit(x_train_vectorized, y_train)

        y_test_pred = predictor.predict(x_test_vectorized)
        print_scores(y_test, y_test_pred)
        print(predictor.best_params_)

        if args.save_model:
            joblib.dump(
                vectorizer,
                os.path.join(get_model_path(), 'ML_models/vectorizer.pkl'),
                compress=1
            )
            joblib.dump(
                predictor.best_estimator_,
                os.path.join(get_model_path(), 'ML_models/random_forest.pkl'),
                compress=1
            )
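# The hand-rolled `splitter` / `cv=zip(*splitter)` construction above encodes a
# single predefined train/validation split: the first n_train samples train,
# the appended n_valid samples validate. A hedged, equivalent alternative uses
# sklearn's PredefinedSplit; the sizes below are placeholders for illustration,
# standing in for len(x_train) and len(x_valid).
import numpy as np
from sklearn.model_selection import PredefinedSplit

n_train, n_valid = 8, 4  # placeholder sizes

# -1 keeps a sample in every training fold; 0 assigns it to the single validation fold
test_fold = np.concatenate([np.full(n_train, -1), np.zeros(n_valid, dtype=int)])
cv = PredefinedSplit(test_fold)

for train_idx, valid_idx in cv.split():
    print(train_idx)  # indices 0 .. n_train-1
    print(valid_idx)  # indices n_train .. n_train+n_valid-1
# This `cv` object can be passed to GridSearchCV in place of zip(*splitter).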
def valid(branch, model_dense, model_res, model_fusion, dataloader, criterion,
          print_freq, classes, cfg, data_transforms):
    torch.set_grad_enabled(False)
    model_dense.eval()
    model_res.eval()
    model_fusion.eval()
    device = torch.device('cuda:0')

    num_tasks = len(cfg.num_classes)
    steps = len(dataloader)
    loss_sum = np.zeros(num_tasks)
    acc_sum = np.zeros(num_tasks)
    predlist = list(x for x in range(len(cfg.num_classes)))
    true_list = list(x for x in range(len(cfg.num_classes)))
    loss_total = 0
    i_batch = 0

    with torch.no_grad():
        for batch_idx, tuple_i in enumerate(dataloader):
            i_batch += 1
            data, target = tuple_i
            data = data.to(device)
            target = target.to(device)

            output_dense, pool_dense = model_dense(data)
            output_res, pool_res = model_res(data)
            output_fusion = model_fusion(pool_dense, pool_res)

            output = []
            if branch == 'step1':
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.3) + (output_res[i] * 0.0) + \
                          (output_fusion[i] * 1.0)
                    output.append(aux)
            elif branch == 'step2':
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.0) + (output_res[i] * 0.3) + \
                          (output_fusion[i] * 1.0)
                    output.append(aux)
            else:
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.3) + (output_res[i] * 0.3) + \
                          (output_fusion[i] * 1.0)
                    output.append(aux)

            loss = 0
            for t in range(len(cfg.num_classes)):
                loss_t, acc_t = get_loss(output, target, t, device, cfg)

                # AUC
                loss += loss_t
                output_tensor = torch.sigmoid(
                    output[t].view(-1)).cpu().detach().numpy()
                target_tensor = target[:, t].view(-1).cpu().detach().numpy()
                if batch_idx == 0:
                    predlist[t] = output_tensor
                    true_list[t] = target_tensor
                else:
                    predlist[t] = np.append(predlist[t], output_tensor)
                    true_list[t] = np.append(true_list[t], target_tensor)

                loss_sum[t] += loss_t.item()
                acc_sum[t] += acc_t.item()

            loss = loss / 14.0
            loss_total += loss

            if (batch_idx + 1) % print_freq == 0:
                print("Batch {}/{}\t Loss {:.6f} ({:.6f})".format(
                    batch_idx + 1, len(dataloader), loss, loss_total / i_batch))

    roc_classes, roc_mean = compute_roc_auc(true_list, predlist, len(classes))
    loss_total = loss_total / steps
    print_scores('auroc', classes, roc_classes, roc_mean, loss_total)
    return loss_total
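# Hedged sketch of the `compute_roc_auc` helper used above (an assumption for
# illustration, not the original): per-task AUROC over the concatenated sigmoid
# outputs and binary targets, plus their mean.
import numpy as np
from sklearn.metrics import roc_auc_score


def compute_roc_auc(true_list, pred_list, n_classes):
    per_class = [roc_auc_score(true_list[t], pred_list[t])
                 for t in range(n_classes)]
    return per_class, float(np.mean(per_class))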
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")
    if not options["features"] and not options["gensim"]:
        abort_clean("Features not specified")
    if not options["classifier"]:
        abort_clean("Classifier not specified")
    if not options["aggregation"]:
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = None
    if not options["gensim"]:
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross-validate results
    if options["cross-validation"]:
        if options["verbosity"]:
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)

        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" + "-siz_" + str(model[0].vector_size) +
                               "-win_" + str(model[0].window) +
                               "-cnt_" + str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(get_features_extr_name(features_extr) + "+" +
                               get_classifier_name(classifier))
            save_scores(scores=scores,
                        output_dir=options["output-dir"],
                        filename=filename,
                        verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")

        if not options["output-dir"]:
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Save the resulting model
        if options["gensim"]:
            filename = "doc2vec+" + get_classifier_name(classifier)
        else:
            filename = str(get_features_extr_name(features_extr) + "+" +
                           get_classifier_name(classifier))
        save_model(pipeline=pipeline,
                   output_dir=options["output-dir"],
                   filename=filename,
                   verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
def evolve(self, population, save=True):
    """
    Evolve agents
    :param population:
    :type population:
    :param save: save agents weights and scores
    :type save: bool
    :return:
    """
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    print("Optimization - started", timestamp)
    agents = population.create_population()

    for i in range(population.max_generations):
        if not self.terminate(population, i):
            try:
                population.agents_weights[i] = np.array(
                    [a.model.get_weights() for a in agents], dtype=np.ndarray)

                for j, agent in enumerate(agents):  # TODO parallelize
                    score = agent.run_agent()
                    population.scores[i][j] = score

                print_scores(i + 1, population.scores[i])

                if save and (i + 1) % 50 == 0:
                    save_results(population.agents_weights[:i],
                                 population.scores[:i], timestamp)

                if i < population.max_generations - 1:
                    self.generate_next_generation(population=population,
                                                  generation=i)
                    for k, a in enumerate(agents):
                        agents[k].model.set_weights(
                            population.agents_weights[i + 1][k])
            except KeyboardInterrupt:
                LOGGER.log(environment=ENVIRONMENT.name,
                           timestamp=timestamp,
                           algorithm=self.__class__.__name__,
                           parameters=vars(self),
                           generations=i,
                           score=np.max(population.scores[i - 1]))
                save_results(population.agents_weights[:i - 1],
                             population.scores[:i - 1], timestamp)
                sys.exit()
        else:
            population.agents_weights = population.agents_weights[:i]
            population.scores = population.scores[:i]
            break

    if save:
        LOGGER.log(environment=ENVIRONMENT.name,
                   timestamp=timestamp,
                   algorithm=self.__class__.__name__,
                   parameters=vars(self),
                   generations=i,
                   score=np.max(population.scores[i]))
        save_results(population.agents_weights, population.scores, timestamp)

    return population.agents_weights, population.scores
def compare(options):
    '''
    Compare a set of specified classifiers on a specified dataset using
    specified features.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of features
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("label type not specified", "expected 'l', 'g' or 'v'")
    if not options["features"]:
        abort_clean("Features not specified")
    if not options["classifier"]:
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers
    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [
        get_classifier(classifier_str=clf, config=None, verbose=False)
        for clf in classifier_str_list
    ]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print(" - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors
    extractors_str_list = options["features"]
    extractors = [
        get_features_extr(features_str_list=extr, verbose=False)
        for extr in extractors_str_list
    ]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print(" - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare the result support structures
    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison
    t0 = time()
    total_iteration = len(classifiers) * len(extractors)
    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop over each features-extractor/classifier pair
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)
        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = (idx_extr) * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name +
                      " | Classifier: " + clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(features_extr=extr,
                                    classifier=clf,
                                    verbose=False)

            # Start training + cross validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except Exception:
                print("some error occurred - the extracted features and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " + str(round(time() - t0_step)) +
                      " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(scores=step_scores,
                        output_dir=individual_scores_dir,
                        filename=extr_name + "+" + clf_name,
                        verbose=False)

            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save the final micro and macro measures and execution times
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")