def get_paginated_questions():
    try:
        categories = util.get_categories(formatted=True)
        current_category = request.args.get('category')
        questions = util.get_questions(formatted=False)
        page = request.args.get('page', 1, type=int)
        # Slice the full question list down to the requested page
        start = QUESTIONS_PER_PAGE * (page - 1)
        end = start + QUESTIONS_PER_PAGE
        questions_on_selected_page = [
            question.format() for question in questions[start:end]
        ]
        if not questions_on_selected_page:
            raise ResourceNotFound
        return jsonify({
            "success": True,
            "questions": questions_on_selected_page,
            "total_questions": len(questions),
            "categories": categories,
            "current_category": current_category
        })
    except ResourceNotFound:
        abort(404)
    except Exception:
        abort(500)
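# A minimal usage sketch (not part of the original handler): exercising the
# view above through Flask's test client, assuming it is registered at
# GET /questions and that a pytest `client` fixture exists. The route path and
# fixture are illustrative assumptions.
def test_get_paginated_questions(client):
    response = client.get('/questions?page=2&category=1')
    payload = response.get_json()
    assert response.status_code == 200
    assert payload['success'] is True
    assert len(payload['questions']) <= QUESTIONS_PER_PAGE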
def results_by_category(limit=None):
    for category in get_categories()[:limit]:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):
            results.clear()
            jobs = []
            X_train, X_test, y_train, y_test = load_train_test(source, category)
            for param in params:
                p = multiprocessing.Process(
                    target=worker,
                    args=(X_train, y_train, X_test, y_test, param, results))
                jobs.append(p)
                p.start()
                # Throttle to at most 4 worker processes at a time
                while len(jobs) >= 4:
                    jobs[0].join()
                    jobs = [j for j in jobs if j.is_alive()]
            # Wait for the remaining workers to finish
            while len(jobs) != 0:
                jobs[0].join()
                jobs = [j for j in jobs if j.is_alive()]
            print(results)
            best_score = max(results.keys())
            yield [category, best_score] + results[best_score]
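# The function above relies on a module-level `results` mapping shared with the
# worker processes and on a `worker` target that records one score per parameter
# set; neither is shown here. A minimal sketch of one way to wire them up,
# assuming scikit-learn's SGDClassifier as the model and macro-averaged F1 as
# the score (both are illustrative assumptions, not the original code):
import multiprocessing

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

# A Manager dict lets child processes write scores the parent can read back.
manager = multiprocessing.Manager()
results = manager.dict()


def worker(X_train, y_train, X_test, y_test, param, results):
    # Train one model for a single hyper-parameter setting and key the shared
    # dict by its score, matching how results_by_category reads it back.
    clf = SGDClassifier(**param)
    clf.fit(X_train, y_train)
    score = f1_score(y_test, clf.predict(X_test), average='macro')
    results[score] = [param]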
def __init__(self, x, y, lang='pt', batch_size=32,
             process_x=lambda x: x, process_y=lambda y: y,
             separate_val=validation, sampler=None):
    # `validation` is assumed to be a module-level flag defined elsewhere.
    # self.h5 = h5py.File(h5_file, 'r', libver='latest')['test']
    self.lang = lang
    # self.lang_dset = self.h5[lang]
    self.process_x = process_x
    self.process_y = process_y
    self.num_classes = len(util.get_categories())
    self.x, self.y = x, y
    self.batch_size = batch_size
    self.indices = np.arange(self.x.shape[0])
    self.separate_val = separate_val
    if self.separate_val:
        # Hold out the pre-computed validation indices for this language
        self.val_indices = np.load(DATA_PATH + 'val_index_' + self.lang + '.npy')
        self.indices = np.setdiff1d(self.indices, self.val_indices)
    self.sampler = sampler
    if self.sampler is not None:
        # Re-balance the remaining indices; only the resampled indices are
        # kept, the labels returned by fit_resample are discarded.
        self.indices, y = self.sampler.fit_resample(
            self.indices.reshape(-1, 1), self.y[self.indices])
        del y
        gc.collect()
        self.indices = self.indices.reshape(-1)
    np.random.shuffle(self.indices)
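# The rest of this generator is not shown. If it follows keras.utils.Sequence,
# it would also need __len__ and __getitem__; a minimal sketch under that
# assumption (batch assembly details are illustrative, not the original code):
def __len__(self):
    return int(np.ceil(len(self.indices) / self.batch_size))

def __getitem__(self, idx):
    # Slice one batch of indices and apply the user-supplied pre-processing hooks
    batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
    return self.process_x(self.x[batch_idx]), self.process_y(self.y[batch_idx])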
def results_by_category(limit=None):
    for category in get_categories()[:limit]:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):
            X_train, X_test, y_train, y_test = load_train_test(source, category)
            f1s = [category]
            for m in ms:
                # Fit each candidate model and record its F1 score for this category
                m.fit(X_train, y_train)
                y_pred = m.predict(X_test)
                f1s.append(metrics.f1_score(y_test, y_pred))
            yield f1s
def get_categories():
    try:
        categories = util.get_categories(formatted=True)
        if not categories:
            raise ResourceNotFound
        return jsonify({
            "success": True,
            "categories": categories,
            "total_categories": len(categories)
        })
    except ResourceNotFound:
        abort(404)
    except Exception:
        abort(500)
import util
import keras_metrics
from keras.layers import Concatenate, Input, Dropout, SpatialDropout1D
from keras.models import Model
from attention import SeqSelfAttention
from util import CyclicLR
from sklearn.model_selection import StratifiedKFold
import keras.backend as K

NAME = "bi_lstm_gru_selfatt_kfold"

# padding_len and DATA_PATH are assumed to come from the project's shared
# configuration; they are not defined in this snippet.
PARAMS = {
    'sequence_len': padding_len,
    'embedding_dim': 200,
    'epochs': 3,
    'batch_size': 256,
    'loss': 'categorical_crossentropy',
    'num_classes': len(util.get_categories()),
    'class_weights': None,
    'sampler': None,
    'k-folds': 4
}

PATH = DATA_PATH + 'models/' + NAME + '/'

# Custom objects needed to deserialize the saved model
DEPENDENCIES = {
    'categorical_recall': keras_metrics.categorical_recall(),
    'balanced_recall': util.balanced_recall,
    'SeqSelfAttention': SeqSelfAttention,
    'f1': util.f1
}


def load_model(path, extras={}):
import cv2


def load_images_from_files(filenames):
    images = []
    for path in filenames:
        # IMREAD_UNCHANGED preserves the alpha channel and original bit depth
        images.append(cv2.imread(path, cv2.IMREAD_UNCHANGED))
    assert len(filenames) == len(images)
    return filenames, get_categories(filenames), images
def train_classifier(feature_name, train_batch_num, base_npz_dir, test_batches):
    test_acc = []
    base_path = util.get_base_path()
    categories = util.get_categories()
    train_batches = list(range(0, train_batch_num))
    # test_batches = range(train_batch_num, train_batch_num + 1)  # JC edit
    set_name = 'setb50k'
    label_set_name = set_name
    subset = ''  # '_pca1'
    classifier_paramstring = ''
    if do_norm:
        classifier_paramstring += 'N'
    if props['C'] != 0:
        classifier_paramstring += 'C%d' % props['C']
    out_fn = os.path.join(
        base_npz_dir, feature_name,
        '%s%s_%s%s_%d-%d.pickle' % (classifier_type, classifier_paramstring,
                                    set_name, subset, train_batches[0],
                                    train_batches[-1]))
    if do_norm:
        out_fn_norm = os.path.join(
            base_npz_dir, feature_name,
            'norm_%s%s_%d.pickle' % (set_name, subset, train_batches[0]))
    print('Training %s...' % out_fn)
    # Only the SGD-based SVM supports incremental (batch-wise) training
    is_incremental = (classifier_type == 'sgd_svm')
    norm = dict()
    clf = None
    for i_batch, train_batch in enumerate(train_batches + list(test_batches)):
        fn = os.path.join(base_npz_dir, feature_name,
                          '%s_%05d%s.npz' % (set_name, train_batch, subset))
        print('Processing feature file %s.' % fn)
        print(fn)
        with np.load(fn) as file_contents:
            data = file_contents['data']
        true_labels, _ = util.load_labels(label_set_name, train_batch)
        if do_norm:
            if i_batch == 0:
                # Initial batch determines mean and variance for normalization
                norm['mean'] = np.expand_dims(data.mean(axis=0), 0)
                norm['std'] = np.expand_dims(data.std(axis=0), 0)
                norm['std'] = np.maximum(norm['std'], 0.01)
                with open(out_fn_norm, 'wb') as fid:
                    pickle.dump(norm, fid)
            data -= norm['mean']
            data /= norm['std']
            print('Data after normalization: Mean %f, Std %f' %
                  (data.mean(axis=0).mean(axis=0),
                   data.std(axis=0).mean(axis=0)))
        if is_incremental:
            # Incremental: train on every training batch, and test not only on
            # the test batches but also before feeding each new training batch
            do_train = (i_batch < len(train_batches))
            do_test = (i_batch > 0)
            use_data = data
            use_true_labels = true_labels
        else:
            # Non-incremental: train once, after all training batches are collected
            do_train = (i_batch == len(train_batches) - 1)
            do_test = (i_batch >= len(train_batches))
            # Data collection phase
            if not do_test:
                if i_batch == 0:
                    data_all = data
                    all_true_labels = true_labels
                else:
                    data_all = np.concatenate((data_all, data), axis=0)
                    all_true_labels = np.concatenate(
                        (all_true_labels, true_labels), axis=0)
                use_data = data_all
                use_true_labels = all_true_labels
        print('  use data %s.' % str(use_data.shape))
        print('  use labels %s' % str(use_true_labels.shape))
        if do_test:
            # After some batch training has been done, predict performance
            pred_labels = clf.predict(data)
            acc = float(sum(pred_labels == true_labels)) / true_labels.size
            test_acc.append(acc)
            print('  Batch accuracy: %.1f%%' % (acc * 100))
        if do_train:
            if classifier_type == 'sgd_svm':
                clf = train_sgd(clf, 'hinge', use_data, use_true_labels)
            elif classifier_type == 'svm':
                clf = train_svm(clf, use_data, use_true_labels, props)
            pred_labels = clf.predict(use_data)
            acc = float(sum(pred_labels == use_true_labels)) / use_true_labels.size
            print('  Train accuracy: %.1f%%' % (acc * 100))
            # Dump classifier data at every iteration
            with open(out_fn, 'wb') as fid:
                pickle.dump(clf, fid)
    return np.mean(test_acc)
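# train_sgd and train_svm are helpers not shown above. A minimal sketch of what
# they could look like with scikit-learn, assuming partial_fit-based updates for
# the incremental SGD case and a plain SVC refit for the non-incremental case
# (illustrative assumptions, not the original implementations):
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


def train_sgd(clf, loss, data, labels):
    # Create the classifier on the first call, then keep updating it.
    # NOTE: assumes the first batch contains every class; otherwise the full
    # class list would have to be passed in explicitly.
    if clf is None:
        clf = SGDClassifier(loss=loss)
        clf.partial_fit(data, labels, classes=np.unique(labels))
    else:
        clf.partial_fit(data, labels)
    return clf


def train_svm(clf, data, labels, props):
    # Non-incremental: fit a fresh SVM on all collected data each time.
    c = props['C'] if props['C'] != 0 else 1.0
    clf = SVC(C=c)
    clf.fit(data, labels)
    return clf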
def categories():
    if should_return_json():
        return jsonify(dict(ok=True, data=get_categories()))
    else:
        return page_not_found()
import matplotlib.pyplot as plt
import numpy as np

import util as ut

# Get the rating information
ratings_array, rating_info = ut.get_categories()

# Generate histogram of all ratings in the dataset
plt.hist(ratings_array, bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of All Ratings')
plt.xlabel('Ratings')
plt.savefig('histograms/41.png')
plt.show()

# Generate histogram of ratings of 10 best movies
plt.hist(rating_info[0], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of 10 Best Movies')
plt.xlabel('Ratings')
plt.savefig('histograms/43.png')
plt.show()

# Generate histogram of ratings of 10 most popular movies
plt.hist(rating_info[1], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of 10 Most Popular Movies')
plt.xlabel('Ratings')
plt.savefig('histograms/42.png')
plt.show()

# Generate histogram of ratings of Animation movies
plt.hist(rating_info[2], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of Animation Movies')
    'text_cnn_att': 1
}

# Weights normalized: divide the integer weights by 45 (presumably their total)
# so they sum to 1
model_weights = {k: (v / 45) for k, v in model_weights_int.items()}
model_list = [k for k in model_weights_int.keys()]

# Weights for each epoch according to the number of epochs trained
weigths_epoch = {
    1: [1],
    2: [0.35, 0.65],
    3: [0.15, 0.35, 0.5],
    4: [0.1, 0.2, 0.3, 0.4],
    5: [0.1, 0.15, 0.2, 0.25, 0.3]
}

num_classes = len(util.get_categories())

# Load test data for each language
data = {}
for lang in ['es', 'pt']:
    X_test = util.get_X_test(data_type='keras_tokenized_tri', lang=lang,
                             file_type="dump")
    index = np.load(DATA_PATH + 'test_index_' + lang + '.npy')
    data[lang] = {'index': index, 'X_test': process_x(X_test)}
    del X_test, index
    gc.collect()

paths = {}
for model_name in model_list:
    PATH = DATA_PATH + 'models/' + model_name + '/'