Example #1
def main():
    # build the model once just to print its architecture
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)  # (None, 46, 1)

    # wrap the Keras model builder for use with scikit-learn utilities
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train, X_test,
                                            Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
Example #2
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test,
                                            scorer)

    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
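
Every example on this page exercises some variant of a `print_scores` helper that the snippets themselves do not define. For the classification examples above (and Example #4 below), a minimal sketch of what such a helper might look like, assuming scikit-learn metrics; the originals may differ:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def print_scores(y_true, y_pred):
    # Hypothetical reconstruction; the repository's actual helper is not shown.
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred) * 100)
    print(classification_report(y_true, y_pred))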
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', help='path to data directory')
    parser.add_argument('--n-jobs', type=int, default=1, help='parallelism')
    args = parser.parse_args()

    x_train, y_train = load_dataset(os.path.join(args.data_dir, 'train.txt'))
    x_valid, y_valid = load_dataset(os.path.join(args.data_dir, 'valid.txt'))
    x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))

    n_train, n_valid = len(x_train), len(x_valid)
    x_train = np.concatenate([x_train, x_valid])
    y_train = np.concatenate([y_train, y_valid])

    vectorizer = TfidfVectorizer(tokenizer=tokenize, max_df=0.5, min_df=5)
    vectorizer.fit(x_train)

    x_train_vectorized = vectorizer.transform(x_train)
    x_test_vectorized = vectorizer.transform(x_test)

    steps = [
        ('decomposer', TruncatedSVD(random_state=42)),
        ('classifier', RandomForestClassifier())
    ]
    pipeline = Pipeline(steps)

    params = {
        'decomposer__n_components': [32, 64, 128, 256],
        'classifier__n_estimators': [64, 128, 256, 512]
    }

    splitter = [list(range(0, n_train))], [list(range(n_train, n_train + n_valid))]
    predictor = GridSearchCV(
        pipeline,
        params,
        cv=zip(*splitter),
        n_jobs=args.n_jobs,
        verbose=3
    )

    predictor.fit(x_train_vectorized, y_train)
    y_test_pred = predictor.predict(x_test_vectorized)
    print_scores(y_test, y_test_pred)
    print(predictor.best_params_)
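
The `splitter`/`zip(*splitter)` construction hands GridSearchCV a single explicit (train, validation) index pair. scikit-learn's PredefinedSplit expresses the same split more directly; a sketch of the equivalent call, under the same n_train/n_valid layout:

from sklearn.model_selection import PredefinedSplit

# -1 keeps a sample in every training fold; 0 assigns it to the one validation fold
test_fold = [-1] * n_train + [0] * n_valid
predictor = GridSearchCV(pipeline, params, cv=PredefinedSplit(test_fold),
                         n_jobs=args.n_jobs, verbose=3)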
Example #4
def main():
	# Building Phase
	data = import_data(
		"./dataset/crx_clean.data.txt"
		)
	X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
	clf_entropy = train_using_entropy(X_train, Y_train)

	# Operational Phase
	print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
	Y_pred_entropy = prediction(X_test, clf_entropy)
	print_scores(Y_test, Y_pred_entropy)

	print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
	fold_scores = cv_with_entropy(X, Y)
	print("Cross Validate: ", fold_scores)
	print("Best F1_score: ", max(fold_scores)*100)

	scorer = make_scorer(f1_score, pos_label='+')
	print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
	Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train, X_test, Y_test, scorer)
	print_scores(Y_test, Y_pred_grid_search)
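
`cv_with_entropy` and the other helpers here are not shown. A plausible sketch of the cross-validation helper, assuming it scores an entropy-criterion decision tree with stratified k-fold (the fold count and random seed are assumptions):

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

def cv_with_entropy(X, Y, n_splits=10):
    # Hypothetical reconstruction of the helper used above.
    clf = DecisionTreeClassifier(criterion="entropy")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)
    scorer = make_scorer(f1_score, pos_label='+')
    return cross_val_score(clf, X, Y, cv=skf, scoring=scorer)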
Example #5
test_loader = cifar_loader(batch_test)

if torch.cuda.is_available():
    # releasing unnecessary memory in GPU
    torch.cuda.empty_cache()
    # ----------------- TESTING  -----------------
    test_losses = 0
    precision, recall, f1, accuracy = [], [], [], []

    with torch.no_grad():
        for i, data in enumerate(test_loader):
            X, y = data[0].to(device), data[1].to(device)

            outputs = net(X)  # this gets the prediction from the network

            test_losses += criterion(outputs, y).item()  # accumulate scalar loss

            predicted_classes = torch.max(
                outputs, 1)[1]  # get class from network's prediction

            # calculate P/R/F1/A metrics for batch
            for acc, metric in zip(
                (precision, recall, f1, accuracy),
                (precision_score, recall_score, f1_score, accuracy_score)):
                acc.append(
                    calculate_metric(metric, y.cpu(), predicted_classes.cpu()))

    # print(
    #     f"\nEpoch {epoch + 1}/{num_epochs}, training loss: {epoch / len(train_loader)}, validation loss: {test_losses / len(test_loader)}")
    print_scores(precision, recall, f1, accuracy, len(test_loader))
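
`calculate_metric` wraps each scikit-learn metric so that one loop can handle all four. A minimal sketch, assuming macro averaging for the three metrics that take an `average` argument (accuracy_score does not):

from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

def calculate_metric(metric_fn, y_true, y_pred):
    # Hypothetical helper; the original implementation is not shown.
    if metric_fn is accuracy_score:
        return metric_fn(y_true, y_pred)
    return metric_fn(y_true, y_pred, average="macro")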
Example #6
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        t1 = time.time()
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // 8

        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print('States visited:', len(self.Q))
                print_scores(scores, self.SCORE_FREQ)
                scores = []
            p.reset_game()
            state = game.getGameState()
            state_tp = self.discretize(state)
            if state_tp not in self.Q:
                self.Q[state_tp] = [0, 0]

            act = 1
            episode = deque([], self.SIZE_FIFO)
            elig = {}
            gscore = 0
            nb_games += 1
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save('Q_' + chr(97 + nb_save) + '_' + str(step) +
                              '_' + str(nb_games) + '.p')
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                # 1) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                reward = self.reward_engineering(bare_reward)
                new_state = game.getGameState()
                new_state_tp = self.discretize(new_state)

                # 2) Choose a′ (GLIE actor) using Q
                if new_state_tp not in self.Q:
                    self.Q[new_state_tp] = [0, 0]
                qvals = self.get_qvals(new_state)
                new_act = self.greedy_action(qvals, self.epsilon)

                # 3) Temporal difference:  δ=r+γQ(s′,a′)−Q(s,a)
                delta = reward + self.GAMMA * self.Q[new_state_tp][
                    new_act] - self.Q[state_tp][act]

                # 4) Update Q
                episode.append((state_tp, act))
                elig[(state_tp, act)] = 1
                for (state_tp_ep, act_ep) in episode:
                    self.Q[state_tp_ep][act_ep] += (
                        self.ALPHA * delta * elig[(state_tp_ep, act_ep)])
                    elig[(state_tp_ep, act_ep)] *= self.LAMBDA

                # 5) s<-s', a<-a'
                state = new_state
                state_tp = new_state_tp
                act = new_act

                if bare_reward > 0:
                    gscore += 1

            scores.append(gscore)

        t2 = time.time()
        # Unicode code point of a: 97
        self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_' +
                  str(nb_games) + '.p')
        print()
        print('Number of played games:', nb_games)
        print('Training completed in', (t2 - t1) / 60, 'minutes')
        print()
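
`update_epsilon` is not shown; the time constant `eps_tau` computed above suggests an exponential exploration-decay schedule. A sketch under that assumption (the real schedule, start value, and floor may differ):

import math

def update_epsilon(step, f0, eps0, eps_tau, nb_frames):
    # Hypothetical exponential decay from frame f0 with time constant eps_tau,
    # floored so the agent keeps a little exploration.
    # nb_frames is accepted only to match the call site above.
    return max(eps0 * math.exp(-(step - f0) / eps_tau), 0.01)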
Example #7
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print_scores(scores, self.SCORE_FREQ)
                scores = []

            p.reset_game()
            state = game.getGameState()
            state_arr = self.state_to_arr(state)
            # state_arr = self.scaler.transform(state_arr.reshape(1, -1))
            gscore = 0
            nb_games += 1
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save(
                        chr(97 + nb_save) + '_' + str(step) + '_' +
                        str(nb_games))
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                    print('WEIGHTS ABS MEAN')
                    print(abs(np.mean(self.model.get_weights()[0], axis=1)))

                # 1) In s, choose a (GLIE actor)
                qvals = self.get_qvals(state)
                act = self.greedy_action(qvals, self.epsilon)

                # 2) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                reward = self.reward_engineering(bare_reward)
                new_state = game.getGameState()
                new_state_arr = self.state_to_arr(new_state)

                # store the raw reward; it is re-engineered when replayed
                self.replay_memory.append(
                    (state_arr, act, bare_reward, new_state_arr))
                if (len(self.replay_memory) == self.BUFFER_SIZE
                        and step % self.TRAIN_FREQ == 0):

                    X_train = []
                    y_train = []

                    # train on a small random sample of the replay buffer
                    replay_memory_copy = list(self.replay_memory)
                    random.shuffle(replay_memory_copy)
                    for frame in replay_memory_copy[:self.BATCH_SIZE]:
                        s_arr_1, act_x, bare_reward_x, s_arr_2 = frame
                        reward_x = self.reward_engineering(bare_reward_x)
                        old_qval = self.model.predict(s_arr_1, batch_size=1)
                        qval_new = self.model.predict(s_arr_2, batch_size=1)
                        max_qval = np.max(qval_new)
                        # TD error; a negative raw reward marks a terminal state
                        if bare_reward_x < 0:
                            delta = reward_x - old_qval[0][act_x]
                        else:
                            delta = (reward_x + self.GAMMA * max_qval -
                                     old_qval[0][act_x])
                        y = np.zeros((1, len(ACTIONS)))
                        y[0][:] = old_qval[0][:]
                        y[0][act_x] = old_qval[0][act_x] + self.ALPHA * delta
                        X_train.append(s_arr_1.reshape(len(STATES), ))
                        y_train.append(y.reshape(len(ACTIONS), ))

                    X_train = np.array(X_train)
                    y_train = np.array(y_train)
                    self.model.fit(X_train,
                                   y_train,
                                   batch_size=self.BATCH_SIZE,
                                   epochs=2,
                                   verbose=False)

                # 5) s <- s'
                state = new_state
                state_arr = new_state_arr

                if bare_reward > 0:
                    gscore += 1
            scores.append(gscore)

        self.save(chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games))
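
`greedy_action` implements the GLIE actor named in the step-1 comment. A minimal sketch of the method's logic as a standalone function, assuming uniform random exploration:

import random

import numpy as np

def greedy_action(qvals, epsilon):
    # Hypothetical reconstruction: explore with probability epsilon,
    # otherwise exploit the highest-valued action.
    q = np.ravel(qvals)
    if random.random() < epsilon:
        return random.randrange(len(q))
    return int(np.argmax(q))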
Example #8
    def train(self, scratch, game, display):
        p = PLE(game,
                fps=30,
                frame_skip=1,
                num_steps=1,
                force_fps=True,
                display_screen=display)
        fname = None
        if not scratch:
            fname = self.load()
        else:
            delete_files(self.DATA_DIREC)
        f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)

        eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
        scores = []
        while step < self.NB_FRAMES:
            if len(scores) == self.SCORE_FREQ:
                print_scores(scores, self.SCORE_FREQ)
                scores = []

            p.reset_game()
            screen = self.process_screen(p.getScreenRGB())
            last_screens_buff = deque([screen] * NB_LAST_SCREENS,
                                      maxlen=NB_LAST_SCREENS)
            last_screens = np.stack(last_screens_buff, axis=-1)

            # gscore = 0
            nb_games += 1
            score = 0
            while not p.game_over():
                step += 1
                if step != 0 and (step % self.SAVE_FREQ) == 0:
                    self.save(
                        chr(97 + nb_save) + '_' + str(step) + '_' +
                        str(nb_games))
                    nb_save += 1
                if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                    self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                                  self.NB_FRAMES)
                    # print('WEIGHTS ABS MEAN')
                    # print(abs(np.mean(self.model.get_weights()[0], axis=1)))

                # 1) In s, choose a (GLIE actor)
                qvals = self.get_qvals(last_screens)
                act = self.greedy_action(qvals, self.epsilon)

                # 2) Observe r, s′
                bare_reward = p.act(ACTIONS[act])
                if bare_reward > 0:
                    score += 1
                reward = self.reward_engineering(bare_reward)
                screen_new = self.process_screen(p.getScreenRGB())

                # update replay_memory
                self.replay_memory.append(screen, act, screen_new, reward)
                if len(self.replay_memory.buff) > self.MIN_REPLAY_MEMORY_SIZE:
                    # build minibatch
                    ls, actions, ls_new, r, terms = self.replay_memory.minibatch(
                    )
                    qvals_new = self.model_target.predict(ls_new)
                    qvals_new_max = qvals_new.max(1).reshape(
                        (self.BATCH_SIZE, 1))
                    delta = r + (1 - terms) * self.GAMMA * qvals_new_max
                    qvals = self.model.predict(ls)
                    qvals[np.arange(self.BATCH_SIZE),
                          actions.ravel()] = delta.ravel()
                    self.model.train_on_batch(x=ls, y=qvals)

                    if step % self.TARGET_FREQ == 0:
                        self.model.save(filepath=self.DATA_DIREC + 'target.h5')
                        self.model_target = load_model(
                            filepath=self.DATA_DIREC + 'target.h5')

                last_screens_buff.append(screen_new)
                last_screens = np.stack(last_screens_buff, axis=-1)
                screen = screen_new
            scores.append(score)
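
`process_screen` is not shown; deep Q-learning pipelines for PLE games typically grayscale and downscale each frame before stacking. A sketch under that assumption (the output size is a guess):

import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def process_screen(screen_rgb):
    # Hypothetical preprocessing: grayscale, resize to 80x80, values in [0, 1].
    gray = rgb2gray(screen_rgb)
    return resize(gray, (80, 80), anti_aliasing=True).astype(np.float32)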
Example #9
def test_score(beam_size,
               encoder,
               decoder,
               imgs_path,
               df_path,
               vocab,
               return_results=False):

    loader = get_loaders(1,
                         imgs_path,
                         df_path,
                         transform,
                         vocab,
                         test=True,
                         n_workers=8)
    vocab_size = len(vocab)

    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(
            image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(
            1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(
            k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out,
                                       h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(
                decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                       (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            #             print(top_k_scores, top_k_words)
            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != vocab.stoi['<eos>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in {
                        vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi[
                            '<pad>']
                    }
                ], img_caps))  # remove <start>, <end>, and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            w for w in seq if w not in
            {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
#     bleu4 = corpus_bleu(references, hypotheses)
    b1, b2, b3, b4 = print_scores(references, hypotheses, vocab=vocab)

    if return_results:
        return references, hypotheses

    return b1, b2, b3, b4
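
Here `print_scores` evidently computes corpus BLEU-1 through BLEU-4 over the collected references and hypotheses; the commented-out `corpus_bleu` call points at NLTK. A sketch under that assumption (the vocab argument is accepted only to match the call site):

from nltk.translate.bleu_score import corpus_bleu

def print_scores(references, hypotheses, vocab=None):
    # Hypothetical reconstruction; weights pick the n-gram orders to average.
    b1 = corpus_bleu(references, hypotheses, weights=(1.0, 0.0, 0.0, 0.0))
    b2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0.0, 0.0))
    b3 = corpus_bleu(references, hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0.0))
    b4 = corpus_bleu(references, hypotheses)
    print("BLEU-1 %.4f | BLEU-2 %.4f | BLEU-3 %.4f | BLEU-4 %.4f" %
          (b1, b2, b3, b4))
    return b1, b2, b3, b4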
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', help='path to data directory')
    parser.add_argument('--n-jobs', type=int, default=1, help='parallelism')
    parser.add_argument('-s', '--save-model', action='store_true', help='whether to save model')
    parser.add_argument('-l', '--load-model', action='store_true', help='whether to load model')
    args = parser.parse_args()

    if args.load_model:
        x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))
        vectorizer = joblib.load(os.path.join(get_model_path(), 'ML_models/vectorizer.pkl'))
        predictor = joblib.load(os.path.join(get_model_path(), 'ML_models/random_forest.pkl'))
        x_test_vectorized = vectorizer.transform(x_test)
        y_test_pred = predictor.predict(x_test_vectorized)
        print_scores(y_test, y_test_pred)

    else:
        x_train, y_train = load_dataset(os.path.join(args.data_dir, 'train.txt'))
        x_valid, y_valid = load_dataset(os.path.join(args.data_dir, 'valid.txt'))
        x_test, y_test = load_dataset(os.path.join(args.data_dir, 'test.txt'))

        n_train, n_valid = len(x_train), len(x_valid)
        x_train = np.concatenate([x_train, x_valid])
        y_train = np.concatenate([y_train, y_valid])

        vectorizer = TfidfVectorizer(tokenizer=tokenize, max_df=0.5, min_df=5)
        vectorizer.fit(x_train)

        x_train_vectorized = vectorizer.transform(x_train)
        x_test_vectorized = vectorizer.transform(x_test)

        steps = [
            ('decomposer', TruncatedSVD(random_state=42)),
            ('classifier', RandomForestClassifier())
        ]
        pipeline = Pipeline(steps)

        params = {
            'decomposer__n_components': [32, 64, 128, 256],
            'classifier__n_estimators': [64, 128, 256, 512]
        }

        splitter = [list(range(0, n_train))], [list(range(n_train, n_train + n_valid))]
        predictor = GridSearchCV(
            pipeline,
            params,
            cv=zip(*splitter),
            n_jobs=args.n_jobs,
            verbose=3
        )

        predictor.fit(x_train_vectorized, y_train)
        y_test_pred = predictor.predict(x_test_vectorized)
        print_scores(y_test, y_test_pred)
        print(predictor.best_params_)

        if args.save_model:
            joblib.dump(
                vectorizer,
                os.path.join(get_model_path(), 'ML_models/vectorizer.pkl'),
                compress=1
            )
            joblib.dump(
                predictor.best_estimator_,
                os.path.join(get_model_path(), 'ML_models/random_forest.pkl'),
                compress=1
            )
Example #11
def valid(branch, model_dense, model_res, model_fusion, dataloader, criterion,
          print_freq, classes, cfg, data_transforms):

    torch.set_grad_enabled(False)
    model_dense.eval()
    model_res.eval()
    model_fusion.eval()

    device = torch.device('cuda:0')
    num_tasks = len(cfg.num_classes)
    steps = len(dataloader)
    loss_sum = np.zeros(num_tasks)
    acc_sum = np.zeros(num_tasks)

    predlist = [None] * num_tasks
    true_list = [None] * num_tasks

    loss_total = 0
    i_batch = 0

    with torch.no_grad():
        for batch_idx, tuple_i in enumerate(dataloader):
            i_batch += 1
            data, target = tuple_i

            data = data.to(device)
            target = target.to(device)

            output_dense, pool_dense = model_dense(data)
            output_res, pool_res = model_res(data)
            output_fusion = model_fusion(pool_dense, pool_res)

            output = []

            if branch == 'step1':
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.3) + (output_res[i] * 0.0) + (
                        output_fusion[i] * 1.0)
                    output.append(aux)
            elif branch == 'step2':
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.0) + (output_res[i] * 0.3) + (
                        output_fusion[i] * 1.0)
                    output.append(aux)
            else:
                for i in range(num_tasks):
                    aux = (output_dense[i] * 0.3) + (output_res[i] * 0.3) + (
                        output_fusion[i] * 1.0)
                    output.append(aux)

            loss = 0

            for t in range(len(cfg.num_classes)):
                loss_t, acc_t = get_loss(output, target, t, device, cfg)
                # AUC

                loss += loss_t

                output_tensor = torch.sigmoid(
                    output[t].view(-1)).cpu().detach().numpy()
                target_tensor = target[:, t].view(-1).cpu().detach().numpy()
                if batch_idx == 0:
                    predlist[t] = output_tensor
                    true_list[t] = target_tensor
                else:
                    predlist[t] = np.append(predlist[t], output_tensor)
                    true_list[t] = np.append(true_list[t], target_tensor)

                loss_sum[t] += loss_t.item()
                acc_sum[t] += acc_t.item()

            loss = loss / num_tasks  # average over the tasks

            loss_total += loss.item()

            if (batch_idx + 1) % print_freq == 0:
                print("Batch {}/{}\t Loss {:.6f} ({:.6f})".format(
                    batch_idx + 1, len(dataloader), loss,
                    loss_total / i_batch))
        roc_classes, roc_mean = compute_roc_auc(true_list, predlist,
                                                len(classes))

        loss_total = loss_total / steps

        print_scores('auroc', classes, roc_classes, roc_mean, loss_total)

    return loss_total
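
`compute_roc_auc` aggregates one AUROC per task; a minimal sketch with scikit-learn, assuming binary targets per task:

import numpy as np
from sklearn.metrics import roc_auc_score

def compute_roc_auc(true_list, pred_list, n_classes):
    # Hypothetical helper: per-task AUROC plus the unweighted mean.
    aucs = [roc_auc_score(true_list[t], pred_list[t]) for t in range(n_classes)]
    return aucs, float(np.mean(aucs))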
Example #12
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified 
    feature extractors.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]) and not (options["gensim"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = None
    if not (options["gensim"]):
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross validate results
    if (options["cross-validation"]):
        if (options["verbosity"]):
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)
        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" + "-siz_" +
                               str(model[0].vector_size) + "-win_" +
                               str(model[0].window) + "-cnt_" +
                               str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(
                    get_features_extr_name(features_extr) + "+" +
                    get_classifier_name(classifier))
            save_scores(scores=scores,
                        output_dir=options["output-dir"],
                        filename=filename,
                        verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")
        if not (options["output-dir"]):
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Save the resulting model
    if options["gensim"]:
        filename = "doc2vec+" + get_classifier_name(classifier)
    else:
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

    save_model(pipeline=pipeline,
               output_dir=options["output-dir"],
               filename=filename,
               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
Example #13
    def evolve(self, population, save=True):
        """
        Evolve agents

        :param population: the population of agents to evolve
        :type population: object exposing create_population(), agents_weights,
            scores and max_generations
        :param save: save agents weights and scores
        :type save: bool
        :return:
        """

        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        print("Optimization - started", timestamp)

        agents = population.create_population()

        for i in range(population.max_generations):
            if not self.terminate(population, i):
                try:
                    population.agents_weights[i] = np.array(
                        [a.model.get_weights() for a in agents],
                        dtype=np.ndarray)

                    for j, agent in enumerate(agents):  # TODO parallelize
                        score = agent.run_agent()
                        population.scores[i][j] = score

                    print_scores(i + 1, population.scores[i])

                    if save and (i + 1) % 50 == 0:
                        save_results(population.agents_weights[:i],
                                     population.scores[:i], timestamp)

                    if i < population.max_generations - 1:
                        self.generate_next_generation(population=population,
                                                      generation=i)

                        for k, a in enumerate(agents):
                            agents[k].model.set_weights(
                                population.agents_weights[i + 1][k])

                except KeyboardInterrupt:
                    LOGGER.log(environment=ENVIRONMENT.name,
                               timestamp=timestamp,
                               algorithm=self.__class__.__name__,
                               parameters=vars(self),
                               generations=i,
                               score=np.max(population.scores[i - 1]))
                    save_results(population.agents_weights[:i - 1],
                                 population.scores[:i - 1], timestamp)
                    sys.exit()

            else:
                population.agents_weights = population.agents_weights[:i]
                population.scores = population.scores[:i]
                break

        if save:
            LOGGER.log(environment=ENVIRONMENT.name,
                       timestamp=timestamp,
                       algorithm=self.__class__.__name__,
                       parameters=vars(self),
                       generations=i,
                       score=np.max(population.scores[i]))
            save_results(population.agents_weights, population.scores,
                         timestamp)

        return population.agents_weights, population.scores
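
For this evolutionary loop, `print_scores` reports per-generation statistics; a minimal sketch, assuming it prints the mean and best score of the generation:

import numpy as np

def print_scores(generation, scores):
    # Hypothetical helper for the loop above.
    print("Generation %d: mean score %.2f, best score %.2f" %
          (generation, np.mean(scores), np.max(scores)))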
Example #14
def compare(options):
    '''
    Compare a set of specified classifiers on a specified dataset using 
    specified features
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of features
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("label type not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers

    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [
        get_classifier(classifier_str=clf, config=None, verbose=False)
        for clf in classifier_str_list
    ]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print("    - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors

    extractors_str_list = options["features"]

    extractors = [
        get_features_extr(features_str_list=extr, verbose=False)
        for extr in extractors_str_list
    ]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print("    - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare result containers

    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison

    t0 = time()
    total_iteration = len(classifiers) * len(extractors)
    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop for each pair features-extractor/classifier
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)

        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = (idx_extr) * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name + " | Classifier: " +
                      clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(features_extr=extr,
                                    classifier=clf,
                                    verbose=False)

            # Start training + cross validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except Exception:
                print("some error occurred - the extracted features and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " + str(round(time() - t0_step)) +
                      " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(scores=step_scores,
                        output_dir=individual_scores_dir,
                        filename=extr_name + "+" + clf_name,
                        verbose=False)
            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save final micro and macro measures and execution time
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")