Пример #1
0
def predict_tuned(x, classifier, threshold):
    """Predict topics for ``x`` by combining two selection rules.

    The classifier's probability matrix is turned into predictions in two
    ways and the two results are added together element-wise:

    * ``r_cut(probabilities, 1)`` -- according to the original author's
      note this guarantees at least one predicted topic per article;
    * ``probabilities > threshold`` -- a boolean matrix that is True
      wherever the probability is strictly bigger than ``threshold``.

    :param x: input passed unchanged to ``classifier.predict_proba``
    :param classifier: object exposing ``predict_proba``
    :param threshold: value the probabilities are compared against
    :return: element-wise sum of the two prediction matrices
    """
    print('Classifying the data')
    probabilities = classifier.predict_proba(x)

    # Rank-based selection is evaluated first, then the threshold mask;
    # both are derived from the same probability matrix.
    return r_cut(probabilities, 1) + (probabilities > threshold)


if __name__ == '__main__':
    if first_option(
            'Do you want to use paragraphs trained classifier [p] or the article trained version? [a]',
            'p', 'a'):
        data = config.get_par_data('test')

        if first_option(
                'Do you want to use the biggest gap thresholding mechanism [b]'
                ' or half the biggest probability as threshold [h]?', 'b',
                'h'):
            print(
                'Loading the paragraph trained classifier trained on data processed by '
                'biggest gap thresholding mechanism ')
            classifier = load_pickle(config.classifier_par_biggest_gap)
            threshold = 0.91
            y_true = process_y(data, threshold_biggest_gap)
        else:
            print(
    print("Processing the predictions")
    for line_start, line_end, article_index in get_next(line_map):
        # set all the topics which were not in the original article to 0
        y_article = y_true[article_index] * y[line_start:line_end, :]

        y[line_start:line_end, :] = func(y_article)

    return y


if __name__ == '__main__':
    # Data descriptor returned by config.get_par_data for the 'train' split.
    data = config.get_par_data('train')

    # A truthy answer picks the first option (biggest gap); each mechanism
    # comes with its own thresholding function and classifier path.
    threshold_func, classifier_path = (
        (threshold_biggest_gap, config.classifier_par_biggest_gap)
        if first_option(
            'Do you want to use the biggest gap thresholding mechanism [b]'
            ' or half the biggest probability as threshold [h]?', 'b', 'h')
        else (threshold_half_max, config.classifier_par_half_max)
    )

    y = process_y(data, threshold_func)

    # Summing over axis 0 gives one total per topic column; a zero total
    # means that topic was never used.
    if 0 in np.sum(y, axis=0):
        print("WARNING: not all topics used")

    print("Loading x")
    x = load_sparse_csr(data['x'])
Пример #3
0
import numpy as np
from keras.models import load_model
from sklearn.metrics.pairwise import cosine_distances

import config
from segmentation.distance_based_methods import compute_distance
from segmentation.lstm.lstm_utils import split_to_time_steps
from utils import first_option, plot_thresholds

if __name__ == '__main__':
    # A truthy answer selects the model trained on cosine distances,
    # anything else the model trained on raw SVM predictions.
    cosine = bool(first_option(
        'Do you want to use the model trained on cosine distances [c] or on raw SVM predictions [r]?',
        'c', 'r'))
    model = load_model(config.lstm_model_1 if cosine else config.lstm_model_577)

    # Second item of 'batch_input_shape' taken from the first entry of the
    # loaded model's config.
    time_steps = model.get_config()[0]['config']['batch_input_shape'][1]

    held_out = config.get_seg_data('held_out')

    # Inputs come from the 'y' file, targets from the 'y_true_lm' file.
    X_held = np.load(held_out['y'])
    y_held = np.load(held_out['y_true_lm'])

    if not cosine:
        # Raw mode: put a single 0 in front of the (flattened) targets.
        y_held = np.append(0, y_held)
    else:
        # Cosine mode: replace the inputs with their computed distances.
        print("Computing the distances")
        X_held = compute_distance(X_held, cosine_distances)
import numpy as np
from sklearn.svm import LinearSVC, SVC

import config
from segmentation.distance_based_methods import compute_distance
from utils import save_pickle, first_option, load_sparse_csr

if __name__ == '__main__':
    data = config.get_seg_data('train')

    # A truthy answer selects the linear SVM, otherwise the RBF-kernel SVM.
    # The output path is chosen together with the estimator; only the
    # selected pair is evaluated.
    path, classifier = (
        (config.classifier_linear, LinearSVC(random_state=0))
        if first_option('Do you want to use linear [l] or RBF [r] kernel?',
                        'l', 'r')
        else (config.classifier_rbf, SVC(random_state=0, kernel='rbf'))
    )

    # Training targets.
    y_true = np.load(data['y_true_lm'])

    print("Loading x")
    x = load_sparse_csr(data['x'])

    # Features fed to the classifier are the distances computed from x.
    print("Computing cosine distance")
    x_dists = compute_distance(x)

    print("Classifier training")
    classifier.fit(x_dists, y_true)

    # Persist the fitted classifier at the path chosen above.
    print("Saving th classifier to: " + path)
    save_pickle(path, classifier)
from pathlib import Path
import numpy as np
from keras.models import load_model
from sklearn.metrics.pairwise import cosine_distances

import config
from segmentation.distance_based_methods import compute_distance
from segmentation.lstm.lstm_utils import split_to_time_steps, shuffle_the_data, build_model
from utils import first_option, create_dir, save_pickle

if __name__ == '__main__':
    # Number of vectors in one sequence, input data structure [samples, time_steps, features]
    time_steps = 200

    # Resume from the saved model only when the model file exists AND the
    # user confirms; the prompt is not shown at all when the file is missing
    # (short-circuit evaluation of `and`).
    if Path(config.lstm_model_1).is_file() and first_option(
            'Do you want to continue training the saved model?', 'y', 'n'):
        # This branch loads the previously saved model, so the message says
        # so (it used to claim a "new" model was being loaded).
        print("Loading the saved model")
        model = load_model(config.lstm_model_1)
    else:
        print("Building new model")
        model = build_model(time_steps, 1)

    # Create dir for histories
    create_dir(config.hist_dir)

    print("Loading the data")
    train = config.get_seg_data('train')
    held_out = config.get_seg_data('held_out')

    # Original (unprocessed) training inputs and targets as stored on disk.
    X_train_or = np.load(train['y'])
    y_train_or = np.load(train['y_true_lm'])
Пример #6
0
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np

import config
from segmentation.clustering.custom_clusterer import CustomClusterer
from utils import first_option, print_measurements

if __name__ == '__main__':
    # Result of the prompt; presumably truthy when the custom clusterer [c]
    # is chosen and falsy for k-means [k] -- verify against first_option.
    custom_clusterer = first_option('Do you want to use custom implementation of clusterer [c] or k-means [k]?',
                                    'c', 'k')

    data = config.get_seg_data('test')

    print("Loading the data")

    # Predictions to undergo clustering
    X = np.load(data['y'])
    y_true = np.load(data['y_true_lm'])

    # Presumably the candidate numbers of clusters tried per window -- its
    # use is not visible in this excerpt.
    range_n_clusters = [2, 3, 4, 5, 6]
    # Zero-initialised column vector with one row per entry of y_true.
    y_pred = np.zeros((y_true.shape[0], 1))
    # Number of consecutive rows of X taken in each loop step.
    window_size = 20
    # One hundredth of the row count (truncated), or 1 when X has at most
    # 100 rows; presumably used for progress reporting -- TODO confirm.
    one_percent = int(X.shape[0] / 100) if X.shape[0] > 100 else 1

    print("Clustering...")
    # Walk over X in non-overlapping windows of `window_size` rows.
    # NOTE(review): the stop bound X.shape[0] - window_size excludes any
    # window starting at or after that index, so the trailing rows are not
    # visited by this loop -- confirm this is intended.
    for i in range(0, X.shape[0] - window_size, window_size):
        # Debugging variant limited to the first two windows:
        # for i in range(0, 40, window_size):
        X_select = X[i:(i + window_size), :]