def upload_sequences():
    def allowed_file(filename):
        return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    query_file = check_exists(request.files.get('queryFile'), 'queryFile')
    has_name_col = request.form.get('hasNameCol', False, type=bool)
    if query_file.filename == '':
        raise FileError('No file selected')
    if query_file and allowed_file(query_file.filename):
        query_file.save(UPLOAD_PATH)
        # TODO: limit the size of the uploaded set
        load_details = pygenex.loadDataset('upload', UPLOAD_PATH,
                                           hasNameCol=has_name_col)
        pygenex.normalize('upload')
        all_time_series = get_names_lengths_thumbnails('upload',
                                                       load_details['count'])
        # Attach the raw, indexed data points to each thumbnail entry
        for i in range(load_details['count']):
            series = pygenex.getTimeSeries('upload', i)
            all_time_series[i]['raw'] = attach_index(series)
        # The uploaded dataset is only needed for this response; clean up
        pygenex.unloadDataset('upload')
        os.remove(UPLOAD_PATH)
        return jsonify(all_time_series)
    raise FileError('Invalid file type')
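# Hedged usage sketch: assuming this handler is registered on a Flask app
# under a route such as '/upload' (the route path, host, and port are
# assumptions, not confirmed by this file), a client could exercise it like
# this:
def _example_upload_client(path='my_series.csv',
                           url='http://localhost:5000/upload'):
    import requests
    with open(path, 'rb') as f:
        resp = requests.post(url,
                             files={'queryFile': f},
                             data={'hasNameCol': 'true'})
    # The response is a JSON list with one entry per uploaded time series,
    # each carrying a 'raw' field with the indexed data points.
    return resp.json()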
def _prune(self):
    old_cache = self._cache.copy()
    super(GenexCache, self)._prune()
    for k in old_cache:
        if isinstance(k, tuple) and k not in self._cache:
            try:
                pg.unloadDataset(make_name(*k))
                logger.debug('Unloaded %s', k)
            except RuntimeError:
                logger.debug('%s is not a loaded dataset', k)
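# Hedged sketch of the pattern this override relies on: a base cache whose
# _prune() evicts entries, and a subclass that releases external resources
# (here, loaded pygenex datasets) for the evicted keys. The base class below
# is a toy stand-in, not the actual parent of GenexCache.
import collections


class _ToyCache(object):
    def __init__(self, max_size):
        self._cache = collections.OrderedDict()
        self._max_size = max_size

    def _prune(self):
        # Drop the oldest entries until we are back under the size bound
        while len(self._cache) > self._max_size:
            self._cache.popitem(last=False)

    def put(self, key, value):
        self._cache[key] = value
        self._prune()


class _ToyResourceCache(_ToyCache):
    def _prune(self):
        old_cache = self._cache.copy()
        super(_ToyResourceCache, self)._prune()
        for k in old_cache:
            if k not in self._cache:
                # In GenexCache this is where pg.unloadDataset would be called
                print('would release resources for %r' % (k,))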
def group_dataset(name, from_st, to_st, dist, num_threads=15,
                  dry_run=False, exclude_callback=None,
                  progress_callback=None):
    dataset_path = os.path.join(DATASET_ROOT, name + '_DATA')
    info = pg.loadDataset(name, dataset_path, ',', -1, 1)
    logging.info('Loaded dataset %s. Count = %d. Length = %d',
                 name, info['count'], info['length'])
    pg.normalize(name)
    logging.info('Normalized the dataset %s.', name)

    records = []
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    records_name = name + '_records_' + timestamp + '.csv'
    records_path = os.path.join(GROUPS_ROOT, name, records_name)
    for d in dist:
        for st in np.arange(from_st, to_st, 0.1):
            st = round(st * 10) / 10
            if exclude_callback is not None and exclude_callback(name, d, st):
                logging.info('Ignore [%s, %s, %.1f]', name, d, st)
                continue
            logging.info('Grouping [%s, %s, %.1f] with %d threads',
                         name, d, st, num_threads)
            if dry_run:
                records.append({})
            else:
                start = time.time()
                group_count = pg.group(name, st, d, num_threads)
                end = time.time()
                logging.info('Finished [%s, %s, %.1f] after %f seconds',
                             name, d, st, end - start)
                logging.info('[%s, %s, %.1f] generates %d groups',
                             name, d, st, group_count)

                save_dir = os.path.join(GROUPS_ROOT, name, d)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_path = os.path.join(
                    save_dir, name + '_GROUPS_' + '{:.1f}'.format(st))
                logging.info('Saving groups [%s, %s, %.1f] to %s',
                             name, d, st, save_path)
                pg.saveGroups(name, save_path)

                size_save_path = os.path.join(
                    save_dir, name + '_GROUP_SIZES_' + '{:.1f}'.format(st))
                logging.info('Saving groups size [%s, %s, %.1f] to %s',
                             name, d, st, size_save_path)
                pg.saveGroupsSize(name, size_save_path)

                records.append({
                    'dist_name': d,
                    'st': st,
                    'group_count': group_count,
                    'path': save_path,
                    'size_path': size_save_path,
                    'duration': end - start
                })
                # Rewrite the records file after every grouping so a crash
                # does not lose earlier results
                records_df = pd.DataFrame(records)
                records_df.to_csv(records_path, index=False)
                logging.info('Saved grouping record for %s to %s',
                             name, records_path)
            if progress_callback is not None:
                progress_callback(name, d, st)
    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)
    return records
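# Hedged usage sketch: the dataset name and the exclusion callback below are
# illustrative, not taken from this repo.
def _example_group_run():
    def skip_nothing(name, d, st):
        # A real callback could, e.g., skip combinations whose group files
        # already exist on disk
        return False

    return group_dataset('ItalyPower', from_st=0.1, to_st=0.6,
                         dist=['euclidean', 'manhattan'],
                         num_threads=8,
                         exclude_callback=skip_nothing)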
def run_paa(name, dist, k, queries_df, dry_run=False):
    '''Run PAA experiment

    For each distance, this method iterates over the queries. It checks if
    the current query was already performed and recorded in the JSON result
    file. If it was not, it runs the PAA method with that query.

    JSON result structure
    {
        'euclidean': [{
            query: [index, start, end, outside]
            result_paa: [{'data': {'index': ..., 'end': ..., 'start': ...},
                          'dist': ...}, ...]
            time_paa: ....}, ...]
        'manhattan': ...
    }
    '''
    name, name_out = load_and_normalize(name)
    pg.preparePAA(name, 3)
    logging.info('Generate PAA of block size 3 for dataset %s.', name)
    results, experiment_path = get_results_object(name)
    for d in dist:
        if d not in results:
            results[d] = []
        for i in range(queries_df.shape[0]):
            query = {
                'index': queries_df['index'][i],
                'start': queries_df.start[i],
                'end': queries_df.end[i],
                'outside': queries_df.outside[i]
            }
            find_query = list(filter(
                lambda o: 'result_paa' in o and o['query'] == query,
                results[d]))
            if len(find_query) == 0:
                logging.info('Running %s %s...(%d/%d)', name,
                             query_description('PAA', k, query, d),
                             i + 1, queries_df.shape[0])
                if not dry_run:
                    # Run the query and measure response time
                    start = time.time()
                    if query['outside'] == 0:  # query lies inside the dataset
                        result_paa = pg.ksimpaa(k, name, name,
                                                query['index'],
                                                query['start'],
                                                query['end'], d)
                    else:
                        result_paa = pg.ksimpaa(k, name, name_out,
                                                query['index'],
                                                query['start'],
                                                query['end'], d)
                    end = time.time()
                    time_paa = end - start
                    results[d].append({
                        'query': query,
                        'result_paa': result_paa,
                        'time_paa': time_paa
                    })
                    # Dump result to file immediately
                    with open(experiment_path, 'w') as f:
                        json.dump(results, f)
                    logging.info('Finished %s after %.1f seconds',
                                 query_description('PAA', k, query, d),
                                 end - start)
            else:
                logging.info('Query %s is already run',
                             query_description('PAA', k, query, d))
    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)
    pg.unloadDataset(name_out)
    logging.info('Unloaded %s', name_out)
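# Hedged usage sketch: queries_df is assumed to be a DataFrame with columns
# 'index', 'start', 'end', and 'outside' (outside == 1 meaning the query comes
# from the held-out *_QUERY dataset). The dataset name and query values are
# illustrative.
def _example_run_paa():
    import pandas as pd
    queries = pd.DataFrame([
        {'index': 0, 'start': 0, 'end': 50, 'outside': 0},
        {'index': 3, 'start': 10, 'end': 60, 'outside': 1},
    ])
    run_paa('ItalyPower', dist=['euclidean', 'manhattan'],
            k=15, queries_df=queries)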
'''
from __future__ import print_function

import os

import pygenex as pg
import pandas as pd

from common import GROUPING_RECORDS, DATASET_ROOT

all_datasets = [n for n in os.listdir(DATASET_ROOT) if n.endswith('DATA')]
dataset_info = {}
for i, ds in enumerate(all_datasets):
    name = ds[:len(ds) - len('_DATA')]
    print('(%d/%d) Processing...%s' % (i + 1, len(all_datasets), name))
    info = pg.loadDataset(name, os.path.join(DATASET_ROOT, ds), ',', -1, 1)
    dataset_info[name] = info
    pg.unloadDataset(name)
    query_info = pg.loadDataset(name,
                                os.path.join(DATASET_ROOT, name + '_QUERY'),
                                ',', -1, 1)
    dataset_info[name + '_out'] = query_info
    # Unload the query set as well so loaded datasets do not pile up in memory
    pg.unloadDataset(name)

df = pd.DataFrame.from_dict(dataset_info, orient='index')
df['subsequence'] = (df['count'] * df['length']
                     * (df['length'] - 1) / 2).astype('int')
df.to_json(GROUPING_RECORDS, orient='index')
print('Preview the first few datasets')
print(df.head())
print('Saved info to', GROUPING_RECORDS)
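# Hedged sketch: the saved file can be read back into a DataFrame with the
# same orientation it was written with.
def _example_read_info():
    return pd.read_json(GROUPING_RECORDS, orient='index')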
def run_genex(name, dist, k, queries_df, num_subseq, dry_run=False):
    '''Run GENEX experiment

    For each distance, this method iterates over the queries. It checks if
    the current query was already performed and recorded in the JSON result
    file. If it was not, it runs the GENEX method with that query.

    For the GENEX method, we run 1-NN and 15-NN differently. For 1-NN, we run
    it the same way as brute-force and PAA. For 15-NN, we also find the
    smallest value of the parameter k_e such that we reach 100% accuracy for
    each query.

    JSON result structure (1-NN, in the same file as BF and PAA)
    {
        'euclidean': [{
            query: [index, start, end, outside]
            result_genex_0.1: [{'data': {'index': ..., 'end': ..., 'start': ...},
                                'dist': ...}, ...]
            time_genex_0.1: ....
        }, {
            query: [index, start, end, outside]
            result_genex_0.1: [{'data': {'index': ..., 'end': ..., 'start': ...},
                                'dist': ...}, ...]
            time_genex_0.1: ....}, ...]
        'manhattan': ...
    }

    JSON result structure (15-NN, in a different file)
    {
        'euclidean': [{
            query: [index, start, end, outside]
            result_genex_0.1: [<accuracy_0.1>, <accuracy_0.2>, ..., <accuracy_5.0>]
            time_genex_0.1: [<time_0.1>, <time_0.2>, ..., <time_5.0>]
        }, {
            query: [index, start, end, outside]
            result_genex_0.2: [<accuracy_0.1>, <accuracy_0.2>, ..., <accuracy_5.0>]
            time_genex_0.2: [<time_0.1>, <time_0.2>, ..., <time_5.0>]
        }, ...]
    }
    accuracy_x means x percent of the dataset is explored
    '''
    name, name_out = load_and_normalize(name)
    results, experiment_path = get_results_object(name)
    results_15nn, experiment_path_15nn = get_results_object(name + '_15NN')
    for d in dist:
        if d not in results:
            results[d] = []
        if d not in results_15nn:
            results_15nn[d] = []
        for st in [0.1, 0.2, 0.3, 0.4, 0.5]:
            group_file_name = '{}_GROUPS_{}'.format(name, str(st))
            group_file_path = os.path.join(GROUPS_ROOT, name, d,
                                           group_file_name)
            if not os.path.exists(group_file_path):
                logging.info('Group %s not found. Moving on.',
                             group_file_name)
                continue
            logging.info('Loading group file %s', group_file_name)
            number_of_groups = pg.loadGroups(name, group_file_path)
            logging.info('Loaded %s with %d groups',
                         group_file_name, number_of_groups)
            method_key = 'genex_' + str(st)
            for i in range(queries_df.shape[0]):
                query = {
                    'index': queries_df['index'][i],
                    'start': queries_df.start[i],
                    'end': queries_df.end[i],
                    'outside': queries_df.outside[i]
                }

                #####################################
                ##          1-NN experiment        ##
                #####################################
                logging.info('1-NN experiment for %s',
                             query_description('GENEX', 1, query, d))
                find_query = list(filter(
                    lambda o: 'result_' + method_key in o and
                              o['query'] == query,
                    results[d]))
                if len(find_query) == 0:
                    logging.info('Running %s %s...(%d/%d)', name,
                                 query_description('GENEX', 1, query, d),
                                 i + 1, queries_df.shape[0])
                    if not dry_run:
                        # Run the query and measure response time
                        start = time.time()
                        query_name = name if query['outside'] == 0 else name_out
                        result_genex = pg.sim(name, query_name,
                                              query['index'],
                                              query['start'],
                                              query['end'])
                        end = time.time()
                        time_genex = end - start
                        # Append new result to the result array
                        results[d].append({
                            'query': query,
                            'result_' + method_key: result_genex,
                            'time_' + method_key: time_genex
                        })
                        # Dump result to file immediately
                        with open(experiment_path, 'w') as f:
                            json.dump(results, f)
                        logging.info('Finished %s after %.1f seconds',
                                     query_description('GENEX', 1, query, d),
                                     end - start)
                else:
                    logging.info('Query %s is already run',
                                 query_description('GENEX', 1, query, d))

                #####################################
                ##         15-NN experiment        ##
                #####################################
                logging.info('15-NN experiment for %s',
                             query_description('GENEX', k, query, d))
                result_bf = list(filter(
                    lambda o: 'result_bf' in o and o['query'] == query,
                    results[d]))
                find_query = list(filter(
                    lambda o: 'result_' + method_key in o and
                              o['query'] == query,
                    results_15nn[d]))
                if len(find_query) == 0 and len(result_bf) > 0:
                    logging.info('Running %s %s...(%d/%d)', name,
                                 query_description('GENEX', k, query, d),
                                 i + 1, queries_df.shape[0])
                    if not dry_run:
                        dist_bf = [r['dist']
                                   for r in result_bf[0]['result_bf']]
                        all_err = []
                        all_time = []
                        counter = 0
                        for ke_ratio in np.arange(0.1, 100.1, 0.1):
                            # ke is the number of subsequences to explore
                            ke = int(round(ke_ratio / 100 * num_subseq))
                            if counter % 10 == 0:
                                logging.info('ke_ratio = %f. ke = %d',
                                             ke_ratio, ke)
                            counter += 1
                            start = time.time()
                            query_name = (name if query['outside'] == 0
                                          else name_out)
                            result_genex = pg.ksim(k, ke, name, query_name,
                                                   query['index'],
                                                   query['start'],
                                                   query['end'])
                            end = time.time()
                            dist_genex = [r['dist'] for r in result_genex]
                            err = compute_rel_error(dist_genex, dist_bf)
                            all_err.append(err)
                            all_time.append(end - start)
                            # Stop as soon as GENEX matches brute-force
                            if abs(err) < 1e-9:
                                break
                        results_15nn[d].append({
                            'query': query,
                            'result_' + method_key: all_err,
                            'time_' + method_key: all_time
                        })
                        with open(experiment_path_15nn, 'w') as f:
                            json.dump(results_15nn, f)
                        logging.info('Finished %s after %.1f seconds',
                                     query_description('GENEX', k, query, d),
                                     end - start)
                else:
                    logging.info(
                        'Query %s is already run or its bf result does not exist',
                        query_description('GENEX', k, query, d))
    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)
    pg.unloadDataset(name_out)
    logging.info('Unloaded %s', name_out)
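# compute_rel_error is defined elsewhere in this repo; a plausible sketch,
# assuming it measures how far the GENEX k-NN distances are from the
# brute-force ones (zero when the two match exactly), might look like this.
# This is an illustration, not the repo's actual definition.
def _sketch_compute_rel_error(dist_genex, dist_bf):
    # Mean relative deviation between paired distance lists; the brute-force
    # distances act as ground truth
    total = 0.0
    for dg, db in zip(dist_genex, dist_bf):
        if db != 0:
            total += abs(dg - db) / db
    return total / max(len(dist_bf), 1)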