def predict_tuned(x, classifier, threshold):
    print('Classifying the data')
    y_prob = classifier.predict_proba(x)

    # Ensures at least 1 predicted topic for each article
    y_pred_min_topics = r_cut(y_prob, 1)

    # Returns a matrix where each element is set to True if its value is greater than the threshold
    y_pred_threshold = y_prob > threshold

    return y_pred_min_topics + y_pred_threshold


if __name__ == '__main__':
    if first_option(
            'Do you want to use paragraphs trained classifier [p] or the article trained version? [a]',
            'p', 'a'):
        data = config.get_par_data('test')

        if first_option(
                'Do you want to use the biggest gap thresholding mechanism [b]'
                ' or half the biggest probability as threshold [h]?', 'b', 'h'):
            print(
                'Loading the paragraph trained classifier trained on data processed by '
                'biggest gap thresholding mechanism')
            classifier = load_pickle(config.classifier_par_biggest_gap)
            threshold = 0.91
            y_true = process_y(data, threshold_biggest_gap)
        else:
            print(
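# r_cut is imported from elsewhere in the project and is not shown in these
# listings. A minimal sketch of what it presumably does, assuming a 2-D
# probability matrix: a rank cut that marks the k highest probabilities of each
# row as predicted, which is why predict_tuned uses it to guarantee at least one
# topic per article. The real implementation may differ.
import numpy as np

def r_cut_sketch(y_prob, k):
    """Return a boolean matrix with True at the k highest probabilities of each row."""
    y_pred = np.zeros(y_prob.shape, dtype=bool)
    top_k = np.argsort(y_prob, axis=1)[:, -k:]        # column indices of the k largest values per row
    rows = np.arange(y_prob.shape[0])[:, None]
    y_pred[rows, top_k] = True
    return y_pred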
print("Processing the predictions") for line_start, line_end, article_index in get_next(line_map): # set all the topics which were not in the original article to 0 y_article = y_true[article_index] * y[line_start:line_end, :] y[line_start:line_end, :] = func(y_article) return y if __name__ == '__main__': data = config.get_par_data('train') if first_option( 'Do you want to use the biggest gap thresholding mechanism [b]' ' or half the biggest probability as threshold [h]?', 'b', 'h'): threshold_func = threshold_biggest_gap classifier_path = config.classifier_par_biggest_gap else: threshold_func = threshold_half_max classifier_path = config.classifier_par_half_max y = process_y(data, threshold_func) # Check if every topic was used at least once if 0 in np.sum(y, axis=0): print("WARNING: not all topics used") print("Loading x") x = load_sparse_csr(data['x'])
import numpy as np
from keras.models import load_model
from sklearn.metrics.pairwise import cosine_distances

import config
from segmentation.distance_based_methods import compute_distance
from segmentation.lstm.lstm_utils import split_to_time_steps
from utils import first_option, plot_thresholds

if __name__ == '__main__':
    if first_option(
            'Do you want to use the model trained on cosine distances [c] or on raw SVM predictions [r]?',
            'c', 'r'):
        cosine = True
        model = load_model(config.lstm_model_1)
    else:
        cosine = False
        model = load_model(config.lstm_model_577)

    time_steps = model.get_config()[0]['config']['batch_input_shape'][1]

    held_out = config.get_seg_data('held_out')

    X_held = np.load(held_out['y'])
    y_held = np.load(held_out['y_true_lm'])

    if cosine:
        print("Computing the distances")
        X_held = compute_distance(X_held, cosine_distances)
    else:
        y_held = np.append(0, y_held)
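# compute_distance comes from segmentation.distance_based_methods and is not
# listed here. A hedged sketch of its assumed behaviour, based on how it is
# called above: the distance between every row vector and the row that follows
# it, one value per potential segment boundary. The real helper may differ,
# e.g. in how it aligns the output with the boundary labels.
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def compute_distance_sketch(x, metric=cosine_distances):
    """Distance between each row of x and the next row; works for dense or CSR input."""
    dists = [metric(x[i:i + 1], x[i + 1:i + 2])[0, 0] for i in range(x.shape[0] - 1)]
    return np.array(dists).reshape(-1, 1)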
import numpy as np
from sklearn.svm import LinearSVC, SVC

import config
from segmentation.distance_based_methods import compute_distance
from utils import save_pickle, first_option, load_sparse_csr

if __name__ == '__main__':
    data = config.get_seg_data('train')

    if first_option('Do you want to use linear [l] or RBF [r] kernel?', 'l', 'r'):
        path = config.classifier_linear
        classifier = LinearSVC(random_state=0)
    else:
        path = config.classifier_rbf
        classifier = SVC(random_state=0, kernel='rbf')

    y_true = np.load(data['y_true_lm'])

    print("Loading x")
    x = load_sparse_csr(data['x'])

    print("Computing cosine distance")
    x_dists = compute_distance(x)

    print("Classifier training")
    classifier.fit(x_dists, y_true)

    print("Saving the classifier to: " + path)
    save_pickle(path, classifier)
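# Once saved, the boundary classifier can be reused for prediction. A hedged
# usage sketch, assuming load_pickle is the counterpart of save_pickle in utils
# and that the held-out split exposes the same 'x' key as the training split;
# both are assumptions, not shown in these listings.
import config
from segmentation.distance_based_methods import compute_distance
from utils import load_pickle, load_sparse_csr

held_out = config.get_seg_data('held_out')
classifier = load_pickle(config.classifier_linear)
x_dists = compute_distance(load_sparse_csr(held_out['x']))
y_pred = classifier.predict(x_dists)        # predicted boundary labels for the held-out data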
from pathlib import Path

import numpy as np
from keras.models import load_model
from sklearn.metrics.pairwise import cosine_distances

import config
from segmentation.distance_based_methods import compute_distance
from segmentation.lstm.lstm_utils import split_to_time_steps, shuffle_the_data, build_model
from utils import first_option, create_dir, save_pickle

if __name__ == '__main__':
    # Number of vectors in one sequence, input data structure [samples, time_steps, features]
    time_steps = 200

    if Path(config.lstm_model_1).is_file() and first_option(
            'Do you want to continue training the saved model?', 'y', 'n'):
        print("Loading the saved model")
        model = load_model(config.lstm_model_1)
    else:
        print("Building new model")
        model = build_model(time_steps, 1)

    # Create dir for histories
    create_dir(config.hist_dir)

    print("Loading the data")
    train = config.get_seg_data('train')
    held_out = config.get_seg_data('held_out')

    X_train_or = np.load(train['y'])
    y_train_or = np.load(train['y_true_lm'])
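# build_model is defined in segmentation.lstm.lstm_utils, which is not part of
# these listings. A hedged sketch of a model that matches the call
# build_model(time_steps, 1): one recurrent layer over sequences shaped
# [samples, time_steps, features] with a sigmoid output per time step. The real
# architecture (layer sizes, dropout, optimizer) may differ.
from keras.models import Sequential
from keras.layers import LSTM, Dense

def build_model_sketch(time_steps, features):
    model = Sequential()
    model.add(LSTM(64, input_shape=(time_steps, features), return_sequences=True))
    model.add(Dense(1, activation='sigmoid'))        # per-step boundary probability
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model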
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np

import config
from segmentation.clustering.custom_clusterer import CustomClusterer
from utils import first_option, print_measurements

if __name__ == '__main__':
    custom_clusterer = first_option(
        'Do you want to use custom implementation of clusterer [c] or k-means [k]?', 'c', 'k')

    data = config.get_seg_data('test')

    print("Loading the data")
    # Predictions to undergo clustering
    X = np.load(data['y'])
    y_true = np.load(data['y_true_lm'])

    range_n_clusters = [2, 3, 4, 5, 6]
    y_pred = np.zeros((y_true.shape[0], 1))
    window_size = 20
    one_percent = int(X.shape[0] / 100) if X.shape[0] > 100 else 1

    print("Clustering...")
    for i in range(0, X.shape[0] - window_size, window_size):
        # for i in range(0, 40, window_size):
        X_select = X[i:(i + window_size), :]
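# The body of the clustering loop is truncated above. A hedged sketch of how
# the number of clusters for one window could be chosen with the k-means and
# silhouette_score that the script imports; the actual selection logic in the
# project may differ.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_n_clusters_sketch(X_select, range_n_clusters):
    """Fit k-means for every candidate k and keep the labels with the best silhouette score."""
    best_score, best_labels = -1.0, None
    for n_clusters in range_n_clusters:
        labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(X_select)
        score = silhouette_score(X_select, labels)
        if score > best_score:
            best_score, best_labels = score, labels
    return best_labels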