Example #1
def upload_sequences():
    def allowed_file(filename):
        return '.' in filename and \
                filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    query_file = check_exists(request.files.get('queryFile'), 'queryFile')
    has_name_col = request.form.get('hasNameCol', False, type=bool)
    if query_file.filename == '':
        raise FileError('No file selected')

    if query_file and allowed_file(query_file.filename):
        query_file.save(UPLOAD_PATH)
        # TODO: limit the size of the uploaded set
        load_details = pygenex.loadDataset('upload',
                                           UPLOAD_PATH,
                                           hasNameCol=has_name_col)
        pygenex.normalize('upload')
        allTimeSeries = get_names_lengths_thumbnails(
            'upload', load_details['count'])
        for i in range(load_details['count']):
            series = pygenex.getTimeSeries('upload', i)
            allTimeSeries[i]['raw'] = attach_index(series)

        pygenex.unloadDataset('upload')
        os.remove(UPLOAD_PATH)
        return jsonify(allTimeSeries)

    raise FileError('Invalid file type')
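
A minimal sketch of how this view might be wired into a Flask application. The route, endpoint name, and error handler below are assumptions for illustration; UPLOAD_PATH, ALLOWED_EXTENSIONS, and the helpers check_exists, get_names_lengths_thumbnails, and attach_index are taken to be defined elsewhere in the original project, and FileError here is only a stand-in for the project's own exception type.

# Hypothetical wiring; not the original project's routes.
from flask import Flask, jsonify

app = Flask(__name__)

class FileError(Exception):
    """Stand-in for the project's own upload error type."""

@app.errorhandler(FileError)
def handle_file_error(err):
    # Report upload problems as JSON with a 400 status.
    return jsonify({'error': str(err)}), 400

# Expose the view above on an assumed /upload endpoint.
app.add_url_rule('/upload', 'upload_sequences', upload_sequences,
                 methods=['POST'])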
Example #2
def load_and_normalize(name):
    name_out = name + '_out'
    dataset_path = os.path.join(DATASET_ROOT, name + '_DATA')
    query_path = os.path.join(DATASET_ROOT, name + '_QUERY')
    info = pg.loadDataset(name, dataset_path, ',', -1, 1)
    logging.info('Loaded dataset %s. Count = %d. Length = %d', name,
                 info['count'], info['length'])

    info = pg.loadDataset(name_out, query_path, ',', -1, 1)
    logging.info('Loaded dataset %s. Count = %d. Length = %d', name_out,
                 info['count'], info['length'])

    pg.normalize(name)
    logging.info('Normalized the dataset %s.', name)

    pg.normalize(name_out)
    logging.info('Normalized the dataset %s.', name_out)
    return name, name_out
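
A short sketch of how this loader might be driven for several datasets. The dataset names below are placeholders, and unloading both sets afterwards is an assumption about the intended cleanup, not something shown in the original.

# Hypothetical driver for load_and_normalize; dataset names are placeholders.
import logging
import pygenex as pg

logging.basicConfig(level=logging.INFO)

for dataset in ['ECG200', 'ItalyPowerDemand']:  # assumed dataset names
    name, name_out = load_and_normalize(dataset)
    # ... run queries from name_out against name here ...
    pg.unloadDataset(name)
    pg.unloadDataset(name_out)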
Example #3
def load_and_group_dataset(datasetID, st, distance):
    key = (datasetID, st, distance)
    if cache.has(key):
        app.logger.debug('Found cache for %s', key)
        return cache.get(key)
    else:
        # Read dataset list
        with open('datasets.json', 'r') as datasets_json:
            datasets = json.load(datasets_json)
        name = make_name(*key)
        path = str(datasets[datasetID]['path'])
        hasNameCol = bool(datasets[datasetID]['hasNameCol'])
        # Load, normalize, and group the dataset
        load_details = pygenex.loadDataset(name, path, hasNameCol=hasNameCol)
        pygenex.normalize(name)
        allTimeSeries = get_names_lengths_thumbnails(name, load_details['count'])
        group_count = pygenex.group(name, st, distance)

        # Save group size
        if not os.path.exists(GROUPS_SIZE_FOLDER):
            os.makedirs(GROUPS_SIZE_FOLDER)
        group_size_path = os.path.join(GROUPS_SIZE_FOLDER, name)
        pygenex.saveGroupsSize(name, group_size_path)

        # Cache the results and return
        subsequences = load_details['count'] * load_details['length']\
            * (load_details['length'] - 1) / 2
        density = get_group_density_base64(group_size_path)
        info = {
            'count': load_details['count'],
            'length': load_details['length'],
            'subseq': subsequences,
            'groupCount': group_count,
            'groupDensity': density,
            'timeSeries': allTimeSeries
        }
        cache.set(key, info)
        return info
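
One way this cached loader might be exposed over HTTP, assuming the surrounding module is a Flask app with an app object. The route, query parameters, defaults, and the 'euclidean' distance name are illustrative assumptions.

# Hypothetical endpoint built on load_and_group_dataset.
from flask import jsonify, request

@app.route('/dataset/<datasetID>/group')
def group_dataset_endpoint(datasetID):
    # Similarity threshold and distance name come from the query string.
    st = request.args.get('st', 0.1, type=float)
    distance = request.args.get('distance', 'euclidean')
    return jsonify(load_and_group_dataset(datasetID, st, distance))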
Example #4
def group_dataset(name,
                  from_st,
                  to_st,
                  dist,
                  num_threads=15,
                  dry_run=False,
                  exclude_callback=None,
                  progress_callback=None):
    dataset_path = os.path.join(DATASET_ROOT, name + '_DATA')
    info = pg.loadDataset(name, dataset_path, ',', -1, 1)
    logging.info('Loaded dataset %s. Count = %d. Length = %d', name,
                 info['count'], info['length'])
    pg.normalize(name)
    logging.info('Normalized the dataset %s.', name)
    records = []
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    records_name = name + '_records_' + timestamp + '.csv'
    records_path = os.path.join(GROUPS_ROOT, name, records_name)
    for d in dist:
        for st in np.arange(from_st, to_st, 0.1):
            st = round(st * 10) / 10
            if exclude_callback is not None and exclude_callback(name, d, st):
                logging.info('Ignore [%s, %s, %.1f]', name, d, st)
                continue

            logging.info('Grouping [%s, %s, %.1f] with %d threads', name, d,
                         st, num_threads)
            if dry_run:
                records.append({})
            else:
                start = time.time()
                group_count = pg.group(name, st, d, num_threads)
                end = time.time()

                logging.info('Finished [%s, %s, %.1f] after %f seconds', name,
                             d, st, end - start)
                logging.info('[%s, %s, %.1f] generates %d groups', name, d, st,
                             group_count)

                save_dir = os.path.join(GROUPS_ROOT, name, d)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_path = os.path.join(
                    save_dir, name + '_GROUPS_' + '{:.1f}'.format(st))
                logging.info('Saving groups [%s, %s, %.1f] to %s', name, d, st,
                             save_path)
                pg.saveGroups(name, save_path)

                size_save_path = os.path.join(
                    save_dir, name + '_GROUP_SIZES_' + '{:.1f}'.format(st))

                logging.info('Saving groups size [%s, %s, %.1f] to %s', name,
                             d, st, size_save_path)
                pg.saveGroupsSize(name, size_save_path)

                records.append({
                    'dist_name': d,
                    'st': st,
                    'group_count': group_count,
                    'path': save_path,
                    'size_path': size_save_path,
                    'duration': end - start
                })

                records_df = pd.DataFrame(records)
                records_df.to_csv(records_path, index=False)
                logging.info('Saved grouping record for %s to %s', name,
                             records_path)

                if progress_callback is not None:
                    progress_callback(name, d, st)

    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)
    return records
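
A sketch of driving the batch grouping above with both callbacks. The dataset name, distance names, and callback bodies are assumptions for illustration; GROUPS_ROOT and the group file naming follow the code above.

# Hypothetical driver for group_dataset; names below are assumptions.
import logging
import os

def skip_if_already_grouped(name, dist, st):
    # Skip a combination whose group file already exists on disk.
    path = os.path.join(GROUPS_ROOT, name, dist,
                        name + '_GROUPS_' + '{:.1f}'.format(st))
    return os.path.exists(path)

def report_progress(name, dist, st):
    logging.info('Progress: finished [%s, %s, %.1f]', name, dist, st)

records = group_dataset('ECG200',  # assumed dataset name
                        from_st=0.1,
                        to_st=0.6,
                        dist=['euclidean', 'manhattan'],  # assumed distance names
                        exclude_callback=skip_if_already_grouped,
                        progress_callback=report_progress)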
Example #5
from __future__ import print_function
import os
import pygenex as pg
import pandas as pd

from common import GROUPING_RECORDS, DATASET_ROOT

all_datasets = [n for n in os.listdir(DATASET_ROOT) if n.endswith('DATA')]

dataset_info = {}

for i, ds in enumerate(all_datasets, 1):
    name = ds[:len(ds) - len('_DATA')]
    print('(%d/%d) Processing...%s' % (i, len(all_datasets), name))
    info = pg.loadDataset(name, os.path.join(DATASET_ROOT, ds), ',', -1, 1)
    dataset_info[name] = info
    pg.unloadDataset(name)

    query_info = pg.loadDataset(name,
                                os.path.join(DATASET_ROOT, name + '_QUERY'),
                                ',', -1, 1)
    dataset_info[name + '_out'] = query_info

df = pd.DataFrame.from_dict(dataset_info, orient='index')
df['subsequence'] = (df['count'] * df['length'] * (df['length'] - 1) / 2).astype('int')
df.to_json(GROUPING_RECORDS, orient='index')
print('Preview the first few datasets')
print(df.head())
print('Saved info to', GROUPING_RECORDS)
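
To sanity-check the output, the saved summary can be read back with pandas. This assumes the same GROUPING_RECORDS path used above.

# Load the saved per-dataset summary back into a DataFrame.
import pandas as pd
from common import GROUPING_RECORDS

summary = pd.read_json(GROUPING_RECORDS, orient='index')
print(summary[['count', 'length', 'subsequence']].head())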