Exemplo n.º 1
0
def writeYAMLConfig(datasetId, path, newConfig):
    """Validate ``newConfig`` and install it as one of a dataset's settings files.

    ``path`` is a dot-separated address.  Its first component selects the
    kind of settings file ('settings', 'genome', 'tablesById' or
    'twoDTablesById'); for the two table kinds the next component is the
    table id.  ``newConfig`` is written to a temporary file, validated by
    constructing the matching settings class with ``validate=True``, and only
    moved over the live file when that construction does not raise.

    Raises:
        KeyError: if the first component of ``path`` is not a known kind.
        Any exception raised by the settings class when validation fails.
    """
    import shutil  # local import: only needed for the final move

    cache = getCache()
    cacheKey = 'config' + datasetId
    try:
        del cache[cacheKey]
    except KeyError:
        # Nothing is cached for this dataset (never read, or already
        # expired), so there is nothing to invalidate.
        pass

    dataset_folder = join(sourceDir, datasetId)
    temp_settings_filename = ImpUtils.GetTempFileName()
    try:
        with open(temp_settings_filename, 'w') as temp_settings_file:
            temp_settings_file.write(newConfig)
        # Each entry maps the remaining path components to
        # (destination file, validated settings object).  The settings
        # object is built lazily, so only the selected kind is validated.
        validators = {
            'settings': lambda path: (join(dataset_folder, 'settings'),
                                      SettingsDataset(temp_settings_filename, validate=True)),
            'genome': lambda path: (join(dataset_folder, 'refgenome', 'settings'),
                                    SettingsRefGenome(temp_settings_filename, validate=True)),
            'tablesById': lambda path: (join(dataset_folder, 'datatables', path[0], 'settings'),
                                        SettingsDataTable(temp_settings_filename, validate=True)),
            'twoDTablesById': lambda path: (join(dataset_folder, '2D_datatables', path[0], 'settings'),
                                            SettingsDataTable(temp_settings_filename, validate=True)),
        }
        path = path.split('.')
        # Validation happens in the settings-class constructor called by the
        # lambda; reaching the next line means the new settings are valid.
        (settings_file, _validator) = validators[path[0]](path[1:])
        # Move without going through a shell: the destination contains
        # caller-supplied path components, and a failed move must raise
        # instead of being silently ignored.
        shutil.move(temp_settings_filename, settings_file)
    finally:
        # The temp file only still exists when validation or the move failed.
        try:
            os.remove(temp_settings_filename)
        except OSError:
            pass
Exemplo n.º 2
0
 def link(self, words):
     """Preprocess ``words``, run the local search over them and return its result.

     The preprocessed words are used to build a cache via ``cache.getCache``
     and both are handed to ``LocalSearch``; the search is run with
     ``self.alpha`` and ``self.iterations``.  Progress messages and the
     result are printed to stdout.
     """
     # Parenthesised single-argument prints behave identically on Python 2
     # and are valid syntax on Python 3.
     print('Preprocessing Input...')
     processedWords = self.preprocess(words)
     print('Caching Possible Wikipedia Pages For Faster Runtime...')
     c = cache.getCache(processedWords)
     print('Linking Initial Input...')
     searchAgent = LocalSearch(processedWords, c)
     result = searchAgent.runLocalSearch(self.alpha, self.iterations)
     print(result)
     return result
Exemplo n.º 3
0
def getJSONConfig(datasetId, cache=True):
    """Return the JSON config for ``datasetId``, optionally through the cache.

    When ``cache`` is falsy the config is read directly with
    ``readJSONConfig``.  Otherwise the entry ``'config' + datasetId`` is
    looked up in the shared cache; on a miss the config is read and stored
    with a five-minute expiry.
    """
    if not cache:
        return readJSONConfig(datasetId)

    store = getCache()
    key = 'config' + datasetId
    try:
        return store[key]
    except KeyError:
        # Cache miss: read from source and remember it for five minutes.
        config = readJSONConfig(datasetId)
        store.set(key, config, expire=5*60)
        return config
Exemplo n.º 4
0
def writeJSONConfig(datasetId, action, path, newConfig):
    """Apply ``action`` with ``newConfig`` at ``path`` in a dataset's config.

    ``path`` is a dot-separated address into the combined JSON config.  Its
    first component selects the writer responsible for that subtree; the
    remaining components (minus the table id for the two table kinds) are
    passed to that writer's ``updateAndWriteBack``.

    Returns:
        Whatever the selected writer's ``updateAndWriteBack`` returns.

    Raises:
        KeyError: if the first component of ``path`` has no writer.
    """
    cache = getCache()
    cacheKey = 'config' + datasetId
    try:
        del cache[cacheKey]
    except KeyError:
        # Nothing is cached for this dataset (never read, or already
        # expired), so there is nothing to invalidate.
        pass

    dataset_folder = join(sourceDir, datasetId)
    # We have a path in the combined JSON object - follow it until we hit a
    # subset confined to one YAML handler.
    writers = {
        'settings': lambda path: (path, SettingsDataset(join(dataset_folder, 'settings'), validate=True)),
        'chromosomes': lambda path: (path, ReadOnlyErrorWriter('chromosomes')),
        'tablesById': lambda path: (path[1:],
                                    SettingsDataTable(join(dataset_folder, 'datatables', path[0], 'settings'), validate=True)),
        'twoDTablesById': lambda path: (path[1:],
                                        SettingsDataTable(join(dataset_folder, '2D_datatables', path[0], 'settings'), validate=True)),
        'genome': lambda path: (path, ReadOnlyErrorWriter('genome')),  # For now as this will likely get a refactor
        'mapLayers': lambda path: (path, ReadOnlyErrorWriter('mapLayers')),  # For now as this will likely get a refactor
        'docs': lambda path: (path, DocsWriter(datasetId))
    }
    path = path.split('.')
    (path, writer) = writers[path[0]](path[1:])
    return writer.updateAndWriteBack(action, path, newConfig, validate=True)
Exemplo n.º 5
0
def proxyGET(subpath):
    """Serve ``subpath``, using the local cache for .png, .mp3 and .json files.

    For cacheable extensions the cache is consulted first.  Depending on the
    flag returned by ``cache.checkCacheServer`` the file is downloaded
    (possibly as a conditional request) and stored, or served from the local
    cache.  Any other extension is forwarded with ``transmitGET``.

    Returns:
        A ``Response``: 304 with an empty body when the browser's
        ``If-Modified-Since`` matches the cached content, otherwise 200 with
        the file bytes; or whatever ``transmitGET`` returns.
    """
    # Decide from the file extension whether the local cache applies.
    cache_type = os.path.splitext(subpath)[1]
    if cache_type in ('.png', '.mp3', '.json'):
        # Work on a per-request copy: updating the shared cache_headers
        # object in place would leak this response's Content-Type,
        # Content-Length and Last-Modified into later responses.
        headers = cache_headers.copy()

        # First check whether the cached copy is still valid.
        cache_flag = cache.checkCacheServer(
            subpath, request.args.get('version', default=None))

        if cache_flag > 0:
            # Pass cache_flag to download; when revalidation is needed it
            # sends If-Modified-Since.
            resp = download(request, cache_flag)
            cache_byte = resp.data
        # Use the local cache when it is valid, or when revalidation
        # returned 304.
        # NOTE(review): a negative cache_flag would reach resp unbound here -
        # confirm checkCacheServer never returns one.
        if cache_flag == 0 or resp.status_code == 304:
            # If the browser sent If-Modified-Since and it matches the cached
            # content, answer 304 before reading the cache.
            if 'If-Modified-Since' in request.headers:
                last_modified = request.headers.get('If-Modified-Since')
                if cache.checkCacheBrowser(subpath, last_modified):
                    return Response('', 304, headers)

            app.logger.debug('使用本地缓存: %s', subpath)
            cache_byte, last_modified = cache.getCache(subpath)
        else:
            last_modified = resp.headers['Last-Modified']
            cache_json = {
                # 2592000 seconds = 30 days from now.
                'deadline': str(time.time() + 2592000),
                'version': request.args.get('version', default=None),
                'last_modified': last_modified,
            }
            cache.setCache(cache_byte, subpath, cache_json)
        # Complete the headers using the extension.
        headers.update(Content_Types[cache_type])
        headers['Content-Length'] = len(cache_byte)
        headers['Last-Modified'] = last_modified
        return Response(cache_byte, 200, headers)

    else:
        app.logger.debug('转发GET: %s', request.url)
        return transmitGET(request)
Exemplo n.º 6
0
def response(returndata):
    """Fill ``returndata['content']`` with the encoded document at ``returndata['url']``.

    The URL is fetched, parsed with ``xmltodict``, serialised to JSON and
    encoded with ``DQXbase64.b64encode_var2``.  When ``returndata['cache']``
    is truthy and the ``STAGING`` environment variable is empty or unset,
    the encoded content is read from and written to the shared cache, keyed
    on the URL.

    Returns:
        The same ``returndata`` mapping, with ``'content'`` set.
    """
    url = returndata['url']

    cache = getCache()
    cacheKey = json.dumps([url])
    returndata['content'] = None
    use_cache = returndata['cache'] and not os.getenv('STAGING', '')
    if use_cache:
        try:
            returndata['content'] = cache[cacheKey]
        except KeyError:
            pass

    if returndata['content'] is None:
        # NOTE(review): url is taken from returndata unchecked; if it is
        # client-controlled, consider restricting the allowed schemes/hosts.
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(url) as remote:
            raw = remote.read()
        parsed = xmltodict.parse(raw)
        returndata['content'] = DQXbase64.b64encode_var2(json.dumps(parsed))
        if use_cache:
            cache[cacheKey] = returndata['content']

    return returndata
Exemplo n.º 7
0
def response(returndata):
    """Fill ``returndata['content']`` with the encoded document at ``returndata['url']``.

    The URL is fetched, parsed with ``xmltodict``, serialised to JSON and
    encoded with ``DQXbase64.b64encode_var2``.  When ``returndata['cache']``
    is truthy and the ``STAGING`` environment variable is empty or unset,
    the encoded content is read from and written to the shared cache, keyed
    on the URL.

    Returns:
        The same ``returndata`` mapping, with ``'content'`` set.
    """
    url = returndata['url']

    cache = getCache()
    cacheKey = json.dumps([url])
    returndata['content'] = None
    use_cache = returndata['cache'] and not os.getenv('STAGING', '')
    if use_cache:
        try:
            returndata['content'] = cache[cacheKey]
        except KeyError:
            pass

    if returndata['content'] is None:
        # NOTE(review): url is taken from returndata unchecked; if it is
        # client-controlled, consider restricting the allowed schemes/hosts.
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(url) as remote:
            raw = remote.read()
        parsed = xmltodict.parse(raw)
        returndata['content'] = DQXbase64.b64encode_var2(json.dumps(parsed))
        if use_cache:
            cache[cacheKey] = returndata['content']

    return returndata
Exemplo n.º 8
0
"""
Things remaining:
1. Renames with no content change. Tricky.
"""

# Same character as the '|' separators in the -fmt string below.
DELIM = '|'

# Argument list for the history listing command: a '|'-separated format
# string ending in the comment format supplied by cc, applied recursively.
CC_LSH = [
    'lsh',
    '-fmt',
    '%o%m|%Nd|%u|%En|%Vn|' + cc.getCommentFmt() + '\\n',
    '-recurse',
]

# Help text for each command option, keyed by option name.
ARGS = {
    'stash': 'Wraps the rebase in a stash to avoid file changes being lost',
    'dry_run': 'Prints a list of changesets to be imported',
    'lshistory': 'Prints the raw output of lshistory to be cached for load',
    'load': 'Loads the contents of a previously saved lshistory file',
}

# Cache object obtained once, when the module is imported.
cache = getCache()

def main(stash=False, dry_run=False, lshistory=False, load=None):
    """Obtain the history to import, either from a saved file or from a rebase.

    Args:
        stash, dry_run, lshistory: when all three are falsy the working tree
            is first checked with ``checkPristine``.  When ``lshistory`` is
            truthy the raw history is printed.
        load: optional path of a previously saved history file; when given
            it is read instead of rebasing and querying the history.
    """
    validateCC()
    if not (stash or dry_run or lshistory):
        checkPristine()
    since = getSince()
    cache.start()
    if load:
        # Close the file deterministically instead of relying on garbage
        # collection of the anonymous file object.
        # NOTE(review): .decode() on the result of a text-mode read only
        # works on Python 2 - confirm the target interpreter.
        with open(load, 'r') as saved_history:
            history = saved_history.read().decode(ENCODING)
    else:
        cc.rebase()
        history = getHistory(since)
        # Keep a backup of the raw history next to the git metadata.
        write(join(GIT_DIR, '.git', 'lshistory.bak'), history.encode(ENCODING))
    if lshistory:
        print(history)
Exemplo n.º 9
0
"""

# Argument list for the history listing command: a '|'-separated format
# string ending in the comment format supplied by cc, applied recursively.
CC_LSH = [
    'lsh', '-fmt', '%o%m|%Nd|%u|%En|%Vn|' + cc.getCommentFmt() + '\\n',
    '-recurse'
]
# Same character as the '|' separators in the -fmt string above.
DELIM = '|'

# Help text for each command option, keyed by option name.
ARGS = {
    'stash': 'Wraps the rebase in a stash to avoid file changes being lost',
    'dry_run': 'Prints a list of changesets to be imported',
    'lshistory': 'Prints the raw output of lshistory to be cached for load',
    'load': 'Loads the contents of a previously saved lshistory file',
}

# Cache object obtained once, when the module is imported.
cache = getCache()


def main(stash=False, dry_run=False, lshistory=False, load=None):
    validateCC()
    if not (stash or dry_run or lshistory):
        checkPristine()
    since = getSince()
    cache.start()
    if load:
        history = open(load, 'r').read().decode(ENCODING)
    else:
        cc.rebase()
        history = getHistory(since)
        write(join(GIT_DIR, '.git', 'lshistory.bak'), history.encode(ENCODING))
    if lshistory:
Exemplo n.º 10
0
def handler(start_response, requestData):
    """Run a table query described by the JSON request body and yield the gzipped result.

    The body is read from ``requestData['environ']['wsgi.input']`` and must
    be a JSON object with at least ``database``, ``table``, ``query`` and
    ``columns``.  Optional keys: ``orderBy`` (JSON list of
    ``[direction, column]`` pairs), ``distinct``, ``groupBy``
    ('~'-separated), ``limit`` ('start~end'), ``randomSample``, ``cache`` and
    ``joins``.

    This is a generator: nothing runs until it is iterated.  It calls
    ``start_response`` with a 200 status and yields one gzipped body.

    Raises:
        SyntaxError: if no parameters are supplied, a join type is not
            allowed, or an order direction is not ASC/DESC.
    """
    try:
        length = int(requestData['environ'].get('CONTENT_LENGTH', '0'))
    except ValueError:
        length = 0
    content = requestData['environ']['wsgi.input'].read(length).decode("utf-8")
    content = json.loads(content) if len(content) > 0 else None
    if not content:
        raise SyntaxError('No query parameters supplied')
    database = content['database']

    # Due to caching we check for auth here, as otherwise auth is only checked on DB read.
    credentials = DQXDbTools.CredentialInformation(requestData)
    credentials.VerifyCanDo(DQXDbTools.DbOperationRead(database))

    tableId = content['table']
    query = content['query']
    orderBy = json.loads(content.get('orderBy', '[]'))
    distinct = content.get('distinct', 'false') == 'true'
    rawColumns = json.loads(content['columns'])
    columns = list(map(decode, rawColumns))
    groupBy = content.get('groupBy', None)
    startRow, endRow = None, None
    if content.get('limit', False):
        startRow, endRow = content['limit'].split('~')
        startRow = int(startRow)
        endRow = int(endRow)
        if startRow < 0:
            startRow = 0
        if endRow <= startRow:
            endRow = startRow + 1
    randomSample = None
    if content.get('randomSample', False):
        randomSample = int(content['randomSample'])
    cacheData = content.get('cache', True)
    joins = json.loads(content.get('joins', '[]'))

    auth_query = credentials.get_auth_query(
        database, [join['foreignTable'] for join in joins] + [tableId])

    cache = getCache()
    # NOTE: randomSample is deliberately not part of the key, so sampled
    # results must never be read from or written to this entry.
    cacheKey = json.dumps([
        tableId, query, orderBy, distinct, columns, groupBy, database,
        startRow, endRow, joins, auth_query
    ])
    data = None
    if cacheData and randomSample is None:  # Don't serve cache on random sample!!
        try:
            data = cache[cacheKey]
        except KeyError:
            pass

    if data is None:
        with DQXDbTools.DBCursor(requestData,
                                 database,
                                 read_timeout=config.TIMEOUT) as cur:

            whereClause = DQXDbTools.WhereClause()
            whereClause.ParameterPlaceHolder = '%s'
            whereClause.Decode(query, True)
            if auth_query:
                # AND the caller's filter with the authorisation filter.
                whereClause.query = {
                    "whcClass": "compound",
                    "isCompound": True,
                    "isRoot": True,
                    "Components": [whereClause.query, auth_query],
                    "Tpe": "AND"
                }
            whereClause.CreateSelectStatement()

            sqlQuery = "SELECT "
            if distinct:
                sqlQuery += " DISTINCT "
            sqlQuery += "{0} FROM {1}".format(','.join(columns),
                                              DBTBESC(tableId))
            for join in joins:
                if 'type' in join and join['type'] in [
                        '', 'INNER', 'LEFT', 'RIGHT', 'FULL'
                ]:
                    sqlQuery += " {0} JOIN {1} ON {2} = {3}".format(
                        join['type'].upper(), DBTBESC(join['foreignTable']),
                        DBCOLESC(join['foreignColumn']),
                        DBCOLESC(join['column']))
                else:
                    raise SyntaxError('Join type not valid')
            if len(whereClause.querystring_params) > 0:
                sqlQuery += " WHERE {0}".format(whereClause.querystring_params)
            if groupBy and len(groupBy) > 0:
                sqlQuery += " GROUP BY " + ','.join(
                    map(DBCOLESC, groupBy.split('~')))
            if len(orderBy) > 0:
                orderTerms = []
                for direction, col in orderBy:
                    # The direction comes from the client and is spliced into
                    # the SQL text, so only the sort keywords are accepted.
                    if str(direction).upper() not in ('', 'ASC', 'DESC'):
                        raise SyntaxError('Order direction not valid')
                    orderTerms.append(DBCOLESC(col) + ' ' + direction)
                sqlQuery += " ORDER BY {0}".format(','.join(orderTerms))
            if startRow is not None and endRow is not None:
                # startRow..endRow is treated as an inclusive range.
                sqlQuery += " LIMIT {0} OFFSET {1}".format(
                    endRow - startRow + 1, startRow)
            if randomSample is not None:
                sqlQuery += " SAMPLE {0}".format(randomSample)

            if DQXDbTools.LogRequests:
                DQXUtils.LogServer('###QRY:' + sqlQuery)
                DQXUtils.LogServer('###PARAMS:' + str(whereClause.queryparams))
            cur.execute(sqlQuery, whereClause.queryparams)
            rows = cur.fetchall()
            result = {}
            for rawCol, (i, desc) in zip(rawColumns,
                                         enumerate(cur.description)):
                # Figure out the name we should return for the column - by default monet doesn't qualify names
                col_name = name(rawCol, desc[0])
                dtype = desciptionToDType(desc[1])
                if dtype in ['i1', 'i2', 'i4', 'S']:
                    # These dtypes cannot hold None, so substitute the
                    # per-dtype null marker.
                    null_value = NULL_VALUES[dtype]
                    result[col_name] = np.array(
                        [(str(row[i]).encode('utf-8') if dtype == 'S' else
                          row[i]) if row[i] is not None else null_value
                         for row in rows],
                        dtype=dtype)
                else:
                    result[col_name] = np.array([row[i] for row in rows],
                                                dtype=dtype)
            data = gzip(data=b''.join(
                arraybuffer.encode_array_set(list(result.items()))))
            # Storing a sampled result would overwrite the entry shared with
            # the identical non-sampled query, because the key ignores
            # randomSample.
            if cacheData and randomSample is None:
                cache[cacheKey] = data
    status = '200 OK'
    response_headers = [('Content-type', 'text/plain'),
                        ('Content-Length', str(len(data))),
                        ('Content-Encoding', 'gzip'),
                        ('Access-Control-Allow-Origin', '*')]
    start_response(status, response_headers)
    yield data
Exemplo n.º 11
0
def index_table_query(dataset, cur, table, fields, query, auth_query, order,
                      limit, offset, fail_limit, index_field, sample):
    """Select ``fields`` from ``table`` for rows with a non-null ``index_field``.

    ``query`` is decoded into a WHERE clause (ANDed with ``auth_query`` when
    one is given).  ``index_field`` is appended to ``fields`` in place if it
    is missing.  At most one of ``limit`` and ``fail_limit`` may be set.
    Rows and the cursor description are cached under the final SQL text and
    parameters, so a repeated query is answered without touching ``cur``.

    Returns:
        dict mapping each field name to a numpy array of that column.

    Raises:
        Exception: if both ``limit`` and ``fail_limit`` are truthy.
    """
    if limit and fail_limit:
        raise Exception("Only one type of limit can be specified")

    where = DQXDbTools.WhereClause()
    # NOTE!: MySQL PyODDBC seems to require this nonstandard coding
    where.ParameterPlaceHolder = '%s'
    where.Decode(query)
    if auth_query:
        where.query = {
            "whcClass": "compound",
            "isCompound": True,
            "isRoot": True,
            "Components": [where.query, auth_query],
            "Tpe": "AND"
        }
    where.CreateSelectStatement()

    if index_field not in fields:
        fields.append(index_field)

    # Restrict to rows that have an index value, on top of any caller filter.
    if len(where.querystring_params) > 0:
        where_sql = ("WHERE " + where.querystring_params + ' AND ' +
                     DQXDbTools.ToSafeIdentifier(index_field) + ' IS NOT NULL')
    else:
        where_sql = "WHERE " + DQXDbTools.DBCOLESC(index_field) + ' IS NOT NULL'
    quoted_fields = [
        '"' + DQXDbTools.ToSafeIdentifier(f) + '"' for f in fields
    ]
    fields_sql = ','.join(quoted_fields)
    safe_table = DQXDbTools.ToSafeIdentifier(table)
    sqlquery = 'SELECT {fields_string} FROM "{table}" {query}'.format(
        fields_string=fields_sql, table=safe_table, query=where_sql)
    if order:
        sqlquery += ' ORDER BY "{0}"'.format(
            DQXDbTools.ToSafeIdentifier(order))

    # The WHERE clause's own parameter list is extended in place.
    params = where.queryparams
    row_cap = limit or fail_limit
    if row_cap:
        sqlquery += ' LIMIT %s'
        params.append(int(row_cap))
    if offset:
        sqlquery += ' OFFSET %s'
        params.append(int(offset))
    if sample is not None:
        sqlquery += ' SAMPLE {0}'.format(sample)

    cache = getCache()
    cacheKey = json.dumps([sqlquery, params])
    try:
        rows, description = cache[cacheKey]
    except KeyError:
        print('2D', sqlquery, params)
        rows, description = None, None

    if rows is None:
        cur.execute(sqlquery, params)
        rows = cur.fetchall()
        description = cur.description
        # Cached even for random samples, so that requests at different
        # points on the column axis pick the same rows (and vice versa):
        # the same random sample is always wanted.
        cache[cacheKey] = [rows, description]

    columns = {}
    for pos, (field, desc) in enumerate(zip(fields, description)):
        column_dtype = desciptionToDType(desc[1])
        columns[field] = np.array([row[pos] for row in rows],
                                  dtype=column_dtype)
    return columns
Exemplo n.º 12
0
def _request_int(request_data, key):
    """Return ``int(request_data[key])``, or None when ``key`` is absent."""
    try:
        return int(request_data[key])
    except KeyError:
        return None


def handler(start_response, request_data):
    """Fetch a 2D-datatable slice with its row and column properties and yield it gzipped.

    Required request keys: ``table``, ``dataset``, ``2DProperties``,
    ``colProperties``, ``rowProperties`` ('~'-separated lists), ``colQry``,
    ``colOrder`` and ``rowQry``.  Optional keys: ``rowOrder``,
    ``rowSortCols``, ``colLimit``, ``rowLimit``, ``colOffset``,
    ``rowOffset``, ``colFailLimit``, ``rowSortProperty``, ``colKey``,
    ``sortMode`` and ``rowRandomSample``.

    When ``rowOrder`` is 'columns', all matching rows are fetched, sorted by
    the 2D data of the columns named in ``rowSortCols`` and only then cut
    down to the requested offset/limit.

    This is a generator: nothing runs until it is iterated.  It calls
    ``start_response`` with a 200 status and yields one gzipped body.
    """
    datatable = request_data['table']
    dataset = request_data['dataset']

    # Due to caching we check for auth here, as otherwise auth is only checked on DB read.
    credentials = DQXDbTools.CredentialInformation(request_data)
    credentials.VerifyCanDo(DQXDbTools.DbOperationRead(dataset))

    two_d_properties = request_data['2DProperties'].split('~')
    col_properties = request_data['colProperties'].split('~')
    row_properties = request_data['rowProperties'].split('~')
    col_qry = request_data['colQry']
    col_order = request_data['colOrder']
    row_qry = request_data['rowQry']
    row_order = request_data.get('rowOrder', None)
    row_order_columns = []
    if row_order == 'columns':
        try:
            row_order_columns = request_data['rowSortCols'].split('~')
        except KeyError:
            pass
        row_order = None
    col_limit = _request_int(request_data, 'colLimit')
    row_limit = _request_int(request_data, 'rowLimit')
    col_offset = _request_int(request_data, 'colOffset')
    row_offset = _request_int(request_data, 'rowOffset')
    # Set fail limit to one past so we know if we hit it
    col_fail_limit = _request_int(request_data, 'colFailLimit')
    if col_fail_limit is not None:
        col_fail_limit += 1
    row_sort_property = request_data.get('rowSortProperty', None)
    col_key = request_data.get('colKey', None)
    sort_mode = request_data.get('sortMode', None)
    row_random_sample = _request_int(request_data, 'rowRandomSample')

    col_index_field = datatable + '_column_index'
    row_index_field = datatable + '_row_index'
    col_properties.append(col_index_field)
    row_properties.append(row_index_field)

    with DQXDbTools.DBCursor(request_data,
                             dataset,
                             read_timeout=config.TIMEOUT) as cur:
        col_tablename, row_tablename = get_table_ids(cur, dataset, datatable)

    col_auth_query = credentials.get_auth_query(dataset, [col_tablename])
    row_auth_query = credentials.get_auth_query(dataset, [row_tablename])

    cache = getCache()
    cache_key = json.dumps([
        datatable, dataset, two_d_properties, col_properties, row_properties,
        col_qry, col_order, row_qry, row_order, row_order_columns,
        row_random_sample, col_limit, row_limit, col_offset, row_offset,
        col_fail_limit, row_sort_property, col_key, sort_mode, col_auth_query,
        row_auth_query
    ])
    data = None
    try:
        data = cache[cache_key]
    except KeyError:
        print('2D Cache miss')

    if data is None:
        with DQXDbTools.DBCursor(request_data,
                                 dataset,
                                 read_timeout=config.TIMEOUT) as cur:
            col_result = index_table_query(dataset, cur, col_tablename,
                                           col_properties, col_qry,
                                           col_auth_query, col_order,
                                           col_limit, col_offset,
                                           col_fail_limit, col_index_field,
                                           None)

            if len(row_order_columns) > 0:
                # If we are sorting by 2d data then we need to grab all the rows as the limit applies post sort.
                row_result = index_table_query(dataset, cur, row_tablename,
                                               row_properties, row_qry,
                                               row_auth_query, row_order, None,
                                               None, None, row_index_field,
                                               row_random_sample)

            else:
                row_result = index_table_query(dataset, cur, row_tablename,
                                               row_properties, row_qry,
                                               row_auth_query, row_order,
                                               row_limit, row_offset, None,
                                               row_index_field,
                                               row_random_sample)

            col_idx = col_result[col_index_field]
            row_idx = row_result[row_index_field]
            del col_result[col_index_field]
            del row_result[row_index_field]
            if len(col_idx) == col_fail_limit:
                # Too many columns matched: return only a marker and the row data.
                result_set = [('_over_col_limit', np.array([0], dtype='i1'))]
                for name, array in list(row_result.items()):
                    result_set.append((('row_' + name), array))
            else:
                if len(row_order_columns) > 0 and len(row_idx) > 0:
                    # Translate primkeys to idx
                    sqlquery = 'SELECT "{col_field}", "{idx_field}" FROM "{table}" WHERE "{col_field}" IN ({params})'.format(
                        idx_field=DQXDbTools.ToSafeIdentifier(col_index_field),
                        table=DQXDbTools.ToSafeIdentifier(col_tablename),
                        params="'" + "','".join(
                            map(DQXDbTools.ToSafeIdentifier,
                                row_order_columns)) + "'",
                        col_field=DQXDbTools.ToSafeIdentifier(col_key))
                    # The query must actually be run before its rows can be
                    # fetched; without this, fetchall() has no result set
                    # belonging to this statement.
                    cur.execute(sqlquery)
                    idx_for_col = dict((k, v) for k, v in cur.fetchall())
                    # Sort by the order specified - reverse so last clicked is major sort
                    sort_col_idx = list(
                        reversed(
                            [idx_for_col[key] for key in row_order_columns]))
                    # Grab the data needed to sort
                    sort_data = extract2D(dataset, datatable, row_idx,
                                          sort_col_idx, [row_sort_property])
                    rows = list(zip(row_idx, sort_data[row_sort_property]))
                    if sort_mode == 'call':
                        polyploid_key_func = lambda row: ''.join(
                            summarise_call(calls) for calls in row[1])
                        haploid_key_func = lambda row: ''.join(
                            [str(c).zfill(2) for c in row[1]])
                        if len(rows[0][1].shape) == 1:
                            rows.sort(key=haploid_key_func, reverse=True)
                        else:
                            rows.sort(key=polyploid_key_func, reverse=True)
                    elif sort_mode == 'fraction':
                        for i in range(len(sort_col_idx)):
                            # TODO Should be some fancy bayesian shizzle
                            # key_func reads the loop's current i; it is only
                            # used by the sort call in this same iteration.
                            def key_func(row):
                                if sum(row[1][i]) == 0:
                                    return '-1'
                                return str(1 - float(row[1][i][0]) /
                                           sum(row[1][i])) + str(sum(
                                               row[1][i])).zfill(4)

                            rows.sort(key=key_func, reverse=True)
                    else:
                        print("Unimplemented sort_mode")
                    row_pos_for_idx = dict(
                        list(zip(row_idx, list(range(len(row_idx))))))
                    # Now just get the row_idx to pass to 2d extract for the slice we need.
                    # A list is built first because a map object cannot be
                    # sliced on Python 3; a missing offset/limit means
                    # "from the start" / "to the end".
                    sorted_row_idx = [row[0] for row in rows]
                    slice_start = row_offset or 0
                    slice_stop = (slice_start + row_limit
                                  if row_limit is not None else None)
                    row_idx = np.array(sorted_row_idx[slice_start:slice_stop])
                    # Use this row idx to retrieve the row data from the initial query
                    for name, array in list(row_result.items()):
                        row_result[name] = array[[
                            row_pos_for_idx[idx] for idx in row_idx
                        ]]

                two_d_result = extract2D(dataset, datatable, row_idx, col_idx,
                                         two_d_properties)

                result_set = []
                for name, array in list(col_result.items()):
                    result_set.append((('col_' + name), array))
                for name, array in list(row_result.items()):
                    result_set.append((('row_' + name), array))
                for name, array in list(two_d_result.items()):
                    result_set.append((('2D_' + name), array))
        data = gzip(data=b''.join(arraybuffer.encode_array_set(result_set)))
        cache[cache_key] = data
    status = '200 OK'
    response_headers = [('Content-type', 'text/plain'),
                        ('Content-Length', str(len(data))),
                        ('Content-Encoding', 'gzip'),
                        ('Access-Control-Allow-Origin', '*')]
    start_response(status, response_headers)
    yield data
Exemplo n.º 13
0
def ImportDataSet(calculationObject, baseFolder, datasetId, importSettings):
    """Import dataset ``datasetId`` from ``baseFolder`` into a fresh schema and make it live.

    Steps, in order: create (or, for a config-only import, verify) the
    database; run the 'pre' plugins; import data tables, 2D data tables,
    reference genome, docs, maps and custom components; switch the monetdb
    user's default schema to the new one; move the '_import_' config
    directory into place; register the dataset; clear the cache; drop the
    other non-system schemas; run the 'post' plugins.

    Args:
        calculationObject: provides ``LogHeader``, ``Log`` and ``SetInfo``
            and is passed on to every importer.
        baseFolder: folder containing one sub-folder per dataset.
        datasetId: name of the dataset and of its sub-folder.
        importSettings: mapping; ``importSettings['ConfigOnly']`` selects a
            config-only import that reuses the existing database.

    Raises:
        Exception: on a config-only import when the stored major schema
            version is lower than ``schemaversion.major``.
    """
    with calculationObject.LogHeader(
            'Importing dataset {0}'.format(datasetId)):
        calculationObject.Log('Import settings: ' + str(importSettings))
        datasetFolder = join(baseFolder, datasetId)
        # Monetdb doesn't allow renames of non-empty schemas, so we import to a random, then set it as the user's default
        schema = ''.join(
            random.choice(string.ascii_letters) for i in range(10))
        dao = SettingsDAO(calculationObject, datasetId, schema=schema)

        if not importSettings['ConfigOnly']:
            calculationObject.SetInfo('Creating database')
            dao.createDatabase()
            # Creating new database
            scriptPath = os.path.dirname(os.path.realpath(__file__))
            dao.loadFile(scriptPath + "/createdataset.sql")
            dao.setDatabaseVersion(schemaversion.major, schemaversion.minor)
        else:
            # Raises an exception if not present
            dao.isDatabasePresent()
            # Verify that the major schema version is OK - otherwise we can't do a config-only update
            currentVersion = dao.getCurrentSchemaVersion()
            if currentVersion[0] < schemaversion.major:
                raise Exception(
                    "The database schema of this dataset is outdated. Actualise it by running a full data import or or top N preview import."
                )

        # dao.clearDatasetCatalogs()

        # Plugins get a 'pre' pass here and a 'post' pass at the very end.
        modules = PluginLoader(calculationObject,
                               datasetId,
                               importSettings,
                               dao=dao)
        modules.importAll('pre')

        importer = ImportDataTable(calculationObject,
                                   datasetId,
                                   importSettings,
                                   baseFolder=baseFolder,
                                   dao=dao)
        importer.importAllDataTables()

        import2D = Import2DDataTable(calculationObject,
                                     datasetId,
                                     importSettings,
                                     baseFolder,
                                     dataDir='2D_datatables',
                                     dao=dao)
        import2D.importAll2DTables()

        # Global settings are taken from the data-table importer and amended below.
        globalSettings = importer._globalSettings

        if ImportRefGenome.ImportRefGenome(calculationObject, datasetId,
                                           baseFolder, importSettings, dao):
            globalSettings['hasGenomeBrowser'] = True

        ImportDocs(calculationObject, datasetFolder, datasetId)
        ImportMaps(calculationObject, datasetFolder, datasetId)
        ImportCustomComponents(calculationObject, datasetFolder, datasetId)

        # Swap the live default schema
        dao._execSql('ALTER USER monetdb SET SCHEMA "%s"' % (schema))

        # Move the config files to live
        config = PanoptesConfig(calculationObject)
        try:
            os.rename(
                join(config.getBaseDir(), 'config', '_import_' + datasetId),
                join(config.getBaseDir(), 'config', datasetId))
        except OSError:
            # Not atomic but I can't see how to make it atomic easily:
            # remove the existing live directory, then retry the rename.
            shutil.rmtree(join(config.getBaseDir(), 'config', datasetId))
            os.rename(
                join(config.getBaseDir(), 'config', '_import_' + datasetId),
                join(config.getBaseDir(), 'config', datasetId))

        # Finalise: register dataset
        with calculationObject.LogHeader('Registering dataset'):
            dao.registerDataset(globalSettings['name'],
                                importSettings['ConfigOnly'])

        with calculationObject.LogHeader('Clear cache'):
            getCache().clear()

        # Drop every non-system schema other than the one just imported.
        # NOTE(review): '%s' % (old_schema) formats the whole row object; this
        # presumably relies on each row being a one-element tuple - confirm
        # against _execSqlQuery's return type.
        for old_schema in dao._execSqlQuery(
                "SELECT name FROM sys.schemas WHERE system=False AND name<>%s",
                schema):
            dao._execSql('DROP SCHEMA "%s" CASCADE' % (old_schema))

        modules.importAll('post')
Exemplo n.º 14
0
def handler(start_response, requestData):
    """Run a table query described by a JSON request body and yield the result.

    The body is read from ``requestData['environ']['wsgi.input']`` and parsed
    as a JSON object. Keys read here: ``database``, ``table``, ``query`` and
    ``columns`` (a JSON-encoded list) are required; ``orderBy`` (JSON-encoded
    list of ``[direction, column]`` pairs), ``distinct`` (the string
    ``'true'`` enables it), ``groupBy`` (``~``-separated), ``limit``
    (``'start~end'``), ``randomSample``, ``cache`` and ``joins`` (JSON-encoded
    list of dicts with ``type``, ``foreignTable``, ``foreignColumn`` and
    ``column``) are optional.

    Args:
        start_response: callable invoked with the status string and the list
            of response headers before the payload is yielded.
        requestData: mapping that holds at least the ``environ`` entry.

    Yields:
        The gzipped, array-set encoded result as a single chunk.

    Raises:
        SyntaxError: if the body is empty, a join type is not one of the
            accepted values, or an ``orderBy`` direction is not ASC/DESC.
    """
    try:
        length = int(requestData['environ'].get('CONTENT_LENGTH', '0'))
    except ValueError:
        length = 0
    content = requestData['environ']['wsgi.input'].read(length).decode("utf-8")
    content = json.loads(content) if len(content) > 0 else None
    if not content:
        raise SyntaxError('No query parameters supplied')
    database = content['database']

    # Due to caching we check for auth here, as otherwise auth is only checked on DB read.
    credentials = DQXDbTools.CredentialInformation(requestData)
    credentials.VerifyCanDo(DQXDbTools.DbOperationRead(database))

    tableId = content['table']
    query = content['query']
    orderBy = json.loads(content.get('orderBy', '[]'))
    distinct = content.get('distinct', 'false') == 'true'
    rawColumns = json.loads(content['columns'])
    columns = list(map(decode, rawColumns))
    groupBy = content.get('groupBy', None)
    startRow, endRow = None, None
    if content.get('limit', False):
        startRow, endRow = content['limit'].split('~')
        # Clamp to a non-negative start and a window of at least one row.
        startRow = max(int(startRow), 0)
        endRow = int(endRow)
        if endRow <= startRow:
            endRow = startRow + 1
    randomSample = None
    if content.get('randomSample', False):
        randomSample = int(content['randomSample'])
    cacheData = content.get('cache', True)
    joins = json.loads(content.get('joins', '[]'))

    auth_query = credentials.get_auth_query(
        database, [join_spec['foreignTable'] for join_spec in joins] + [tableId])

    cache = getCache()
    cacheKey = json.dumps([tableId, query, orderBy, distinct, columns, groupBy,
                           database, startRow, endRow, joins, auth_query])
    # The key does not contain randomSample, so sampled responses must neither
    # be read from nor written to the cache.
    useCache = cacheData and randomSample is None
    data = None
    if useCache:  # Don't serve cache on random sample!!
        try:
            data = cache[cacheKey]
        except KeyError:
            pass

    if data is None:
        with DQXDbTools.DBCursor(requestData, database, read_timeout=config.TIMEOUT) as cur:

            whereClause = DQXDbTools.WhereClause()
            whereClause.ParameterPlaceHolder = '%s'
            whereClause.Decode(query, True)
            if auth_query:
                # AND the authorisation restriction onto the user's query.
                whereClause.query = {
                    "whcClass": "compound",
                    "isCompound": True,
                    "isRoot": True,
                    "Components": [
                        whereClause.query,
                        auth_query
                    ],
                    "Tpe": "AND"
                }
            whereClause.CreateSelectStatement()

            sqlQuery = "SELECT "
            if distinct:
                sqlQuery += " DISTINCT "
            sqlQuery += "{0} FROM {1}".format(','.join(columns), DBTBESC(tableId))
            for join_spec in joins:
                if 'type' in join_spec and join_spec['type'] in ['', 'INNER', 'LEFT', 'RIGHT', 'FULL']:
                    sqlQuery += " {0} JOIN {1} ON {2} = {3}".format(
                        join_spec['type'].upper(),
                        DBTBESC(join_spec['foreignTable']),
                        DBCOLESC(join_spec['foreignColumn']),
                        DBCOLESC(join_spec['column']))
                else:
                    raise SyntaxError('Join type not valid')
            if len(whereClause.querystring_params) > 0:
                sqlQuery += " WHERE {0}".format(whereClause.querystring_params)
            if groupBy and len(groupBy) > 0:
                sqlQuery += " GROUP BY " + ','.join(map(DBCOLESC, groupBy.split('~')))
            if len(orderBy) > 0:
                orderTerms = []
                for direction, col in orderBy:
                    # The direction is spliced into the SQL verbatim, so only
                    # the plain sort keywords (or nothing) are accepted.
                    if direction.strip().upper() not in ('', 'ASC', 'DESC'):
                        raise SyntaxError('Order direction not valid')
                    orderTerms.append(DBCOLESC(col) + ' ' + direction)
                sqlQuery += " ORDER BY {0}".format(','.join(orderTerms))
            if startRow is not None and endRow is not None:
                sqlQuery += " LIMIT {0} OFFSET {1}".format(endRow - startRow + 1, startRow)
            if randomSample is not None:
                sqlQuery += " SAMPLE {0}".format(randomSample)

            if DQXDbTools.LogRequests:
                DQXUtils.LogServer('###QRY:' + sqlQuery)
                DQXUtils.LogServer('###PARAMS:' + str(whereClause.queryparams))
            cur.execute(sqlQuery, whereClause.queryparams)
            rows = cur.fetchall()
            result = {}
            for rawCol, (i, desc) in zip(rawColumns, enumerate(cur.description)):
                # Figure out the name we should return for the column - by default monet doesn't qualify names
                col_name = name(rawCol, desc[0])
                dtype = desciptionToDType(desc[1])
                if dtype in ['i1', 'i2', 'i4', 'S']:
                    # These dtypes get a sentinel for NULL; 'S' values are
                    # additionally encoded to ASCII bytes.
                    null_value = NULL_VALUES[dtype]
                    values = []
                    for row in rows:
                        value = row[i]
                        if value is None:
                            value = null_value
                        elif dtype == 'S':
                            value = value.encode('ascii', 'replace')
                        values.append(value)
                    result[col_name] = np.array(values, dtype=dtype)
                elif desc[1] == 'timestamp':
                    result[col_name] = np.array(
                        [datetimeToJulianDay(row[i]) if row[i] is not None else None for row in rows],
                        dtype=dtype)
                else:
                    result[col_name] = np.array([row[i] for row in rows], dtype=dtype)
            data = gzip(data=b''.join(arraybuffer.encode_array_set(list(result.items()))))
            if useCache:
                cache[cacheKey] = data
    status = '200 OK'
    response_headers = [('Content-type', 'text/plain'),
                        ('Content-Length', str(len(data))),
                        ('Content-Encoding', 'gzip'),
                        ('Access-Control-Allow-Origin', '*')
                        ]
    start_response(status, response_headers)
    yield data
Exemplo n.º 15
0
def index_table_query(dataset, cur, table, fields, query, auth_query, order, limit, offset, fail_limit, index_field, sample):
    """Select ``fields`` from ``table`` for rows whose ``index_field`` is not NULL.

    Results (rows plus cursor description) are stored in the shared cache, so
    an identical query against the same dataset is answered without touching
    the database.

    Args:
        dataset: dataset identifier; used only to scope the cache key.
        cur: database cursor providing ``execute``, ``fetchall`` and
            ``description``.
        table: table name (passed through ``ToSafeIdentifier``).
        fields: list of column names to select. ``index_field`` is appended
            to this list in place when it is missing.
        query: encoded where-clause handed to ``WhereClause.Decode``.
        auth_query: optional extra restriction ANDed onto ``query``.
        order: optional column name to order by.
        limit: optional row limit.
        offset: optional row offset.
        fail_limit: alternative limit; mutually exclusive with ``limit``.
        index_field: column that must be non-NULL in returned rows.
        sample: optional value appended as a ``SAMPLE`` clause.

    Returns:
        dict mapping each field name to a numpy array of that column's values.

    Raises:
        Exception: if both ``limit`` and ``fail_limit`` are given.
    """
    if limit and fail_limit:
        raise Exception("Only one type of limit can be specified")
    where = DQXDbTools.WhereClause()
    where.ParameterPlaceHolder = '%s'  # NOTE!: MySQL PyODDBC seems to require this nonstandard coding
    where.Decode(query)
    if auth_query:
        where.query = {
            "whcClass": "compound",
            "isCompound": True,
            "isRoot": True,
            "Components": [
                where.query,
                auth_query
            ],
            "Tpe": "AND"
        }
    where.CreateSelectStatement()
    if index_field not in fields:
        # Mutates the caller's list, as before, so both sides agree on columns.
        fields.append(index_field)
    if len(where.querystring_params) > 0:
        # Parenthesise the user filter so a top-level OR inside it cannot
        # bypass the NOT NULL condition through operator precedence.
        where_sql = ("WHERE (" + where.querystring_params + ') AND ' +
                     DQXDbTools.ToSafeIdentifier(index_field) + ' IS NOT NULL')
    else:
        where_sql = "WHERE " + DQXDbTools.DBCOLESC(index_field) + ' IS NOT NULL'
    fields_string = ','.join('"' + DQXDbTools.ToSafeIdentifier(f) + '"' for f in fields)
    safe_table = DQXDbTools.ToSafeIdentifier(table)
    sqlquery = 'SELECT {0} FROM "{1}" {2}'.format(fields_string, safe_table, where_sql)
    if order:
        sqlquery += ' ORDER BY "{0}"'.format(DQXDbTools.ToSafeIdentifier(order))
    params = where.queryparams
    # Whichever of the two limits was supplied becomes the SQL LIMIT.
    limit = limit or fail_limit
    if limit:
        sqlquery += ' LIMIT %s'
        params.append(int(limit))
    if offset:
        sqlquery += ' OFFSET %s'
        params.append(int(offset))
    if sample is not None:
        sqlquery += ' SAMPLE {0}'.format(sample)

    cache = getCache()
    # The dataset is part of the key: the same SQL text against a same-named
    # table in another dataset must not share a cache entry.
    cacheKey = json.dumps([dataset, sqlquery, params])
    rows, description = None, None
    try:
        rows, description = cache[cacheKey]
    except KeyError:
        print('2D', sqlquery, params)

    if rows is None:
        cur.execute(sqlquery, params)
        rows = cur.fetchall()
        description = cur.description
        # We cache even if random sample is requested such that requests at different points on the col axis
        # pick the same rows and vice-versa - ie we always want the same random sample.
        cache[cacheKey] = [rows, description]

    data = {}
    for i, (field, desc) in enumerate(zip(fields, description)):
        dtype = desciptionToDType(desc[1])
        data[field] = np.array([row[i] for row in rows], dtype=dtype)
    return data
Exemplo n.º 16
0
def handler(start_response, request_data):
    """Serve a slice of a 2D data table together with its column and row properties.

    Required ``request_data`` keys: ``table``, ``dataset``, ``2DProperties``,
    ``colProperties``, ``rowProperties`` (each ``~``-separated), ``colQry``,
    ``colOrder`` and ``rowQry``. Optional keys: ``rowOrder``, ``rowSortCols``
    (``~``-separated, read when ``rowOrder`` is ``'columns'``), ``colLimit``,
    ``rowLimit``, ``colOffset``, ``rowOffset``, ``colFailLimit``,
    ``rowSortProperty``, ``colKey``, ``sortMode`` (``'call'`` or
    ``'fraction'``) and ``rowRandomSample``.

    Args:
        start_response: callable invoked with the status string and the list
            of response headers before the payload is yielded.
        request_data: mapping of request parameters.

    Yields:
        The gzipped, array-set encoded result as a single chunk. When the
        column query returns ``colFailLimit + 1`` columns, the payload holds
        only an ``_over_col_limit`` marker plus the row properties.
    """
    datatable = request_data['table']
    dataset = request_data['dataset']

    # Due to caching we check for auth here, as otherwise auth is only checked on DB read.
    credentials = DQXDbTools.CredentialInformation(request_data)
    credentials.VerifyCanDo(DQXDbTools.DbOperationRead(dataset))

    def optional(key, convert=None):
        # Value of an optional request parameter (converted on demand), or
        # None when the key is absent.
        try:
            value = request_data[key]
        except KeyError:
            return None
        return value if convert is None else convert(value)

    two_d_properties = request_data['2DProperties'].split('~')
    col_properties = request_data['colProperties'].split('~')
    row_properties = request_data['rowProperties'].split('~')
    col_qry = request_data['colQry']
    col_order = request_data['colOrder']
    row_qry = request_data['rowQry']
    row_order = request_data.get('rowOrder', None)
    row_order_columns = []
    if row_order == 'columns':
        # Sorting rows by 2D data in the listed columns replaces any SQL row ordering.
        try:
            row_order_columns = request_data['rowSortCols'].split('~')
        except KeyError:
            pass
        row_order = None
    col_limit = optional('colLimit', int)
    row_limit = optional('rowLimit', int)
    col_offset = optional('colOffset', int)
    row_offset = optional('rowOffset', int)
    # Set fail limit to one past so we know if we hit it
    col_fail_limit = optional('colFailLimit', lambda value: int(value) + 1)
    row_sort_property = optional('rowSortProperty')
    col_key = optional('colKey')
    sort_mode = optional('sortMode')
    row_random_sample = optional('rowRandomSample', int)

    col_index_field = datatable + '_column_index'
    row_index_field = datatable + '_row_index'
    col_properties.append(col_index_field)
    row_properties.append(row_index_field)

    with DQXDbTools.DBCursor(request_data, dataset, read_timeout=config.TIMEOUT) as cur:
        col_tablename, row_tablename = get_table_ids(cur, dataset, datatable)

    col_auth_query = credentials.get_auth_query(dataset, [col_tablename])
    row_auth_query = credentials.get_auth_query(dataset, [row_tablename])

    cache = getCache()
    cache_key = json.dumps([datatable, dataset, two_d_properties, col_properties, row_properties, col_qry, col_order,
                            row_qry, row_order, row_order_columns, row_random_sample, col_limit, row_limit, col_offset,
                            row_offset, col_fail_limit, row_sort_property, col_key, sort_mode, col_auth_query,
                            row_auth_query])
    data = None
    try:
        data = cache[cache_key]
    except KeyError:
        print('2D Cache miss')

    if data is None:
        with DQXDbTools.DBCursor(request_data, dataset, read_timeout=config.TIMEOUT) as cur:
            col_result = index_table_query(dataset,
                                           cur,
                                           col_tablename,
                                           col_properties,
                                           col_qry,
                                           col_auth_query,
                                           col_order,
                                           col_limit,
                                           col_offset,
                                           col_fail_limit,
                                           col_index_field,
                                           None)

            # If we are sorting by 2d data then we need to grab all the rows as the limit applies post sort.
            sort_by_2d = len(row_order_columns) > 0
            row_result = index_table_query(dataset,
                                           cur,
                                           row_tablename,
                                           row_properties,
                                           row_qry,
                                           row_auth_query,
                                           row_order,
                                           None if sort_by_2d else row_limit,
                                           None if sort_by_2d else row_offset,
                                           None,
                                           row_index_field,
                                           row_random_sample)

            col_idx = col_result[col_index_field]
            row_idx = row_result[row_index_field]
            del col_result[col_index_field]
            del row_result[row_index_field]
            if len(col_idx) == col_fail_limit:
                # Too many columns matched: return only the marker and row data.
                result_set = [('_over_col_limit', np.array([0], dtype='i1'))]
                for name, array in list(row_result.items()):
                    result_set.append((('row_' + name), array))
            else:
                if sort_by_2d and len(row_idx) > 0:
                    # Translate primkeys to idx. The key values are bound as
                    # parameters rather than spliced into the SQL text.
                    sqlquery = 'SELECT "{col_field}", "{idx_field}" FROM "{table}" WHERE "{col_field}" IN ({params})'.format(
                        idx_field=DQXDbTools.ToSafeIdentifier(col_index_field),
                        table=DQXDbTools.ToSafeIdentifier(col_tablename),
                        params=','.join(['%s'] * len(row_order_columns)),
                        col_field=DQXDbTools.ToSafeIdentifier(col_key))
                    # The lookup has to be executed before its rows can be
                    # fetched; without this the fetch reads a stale result set.
                    cur.execute(sqlquery, list(row_order_columns))
                    idx_for_col = {k: v for k, v in cur.fetchall()}
                    # Sort by the order specified - reverse so last clicked is major sort
                    sort_col_idx = list(reversed([idx_for_col[key] for key in row_order_columns]))
                    # grab the data needed to sort
                    sort_data = extract2D(dataset, datatable, row_idx, sort_col_idx, [row_sort_property])
                    rows = list(zip(row_idx, sort_data[row_sort_property]))
                    if sort_mode == 'call':
                        if len(rows[0][1].shape) == 1:
                            # One value per cell: zero-padded values form the key.
                            rows.sort(key=lambda row: ''.join([str(c).zfill(2) for c in row[1]]),
                                      reverse=True)
                        else:
                            # Several values per cell: summarise each cell first.
                            rows.sort(key=lambda row: ''.join(summarise_call(calls) for calls in row[1]),
                                      reverse=True)
                    elif sort_mode == 'fraction':
                        # Successive stable sorts, so the last column sorted dominates.
                        for i in range(len(sort_col_idx)):
                            # TODO Should be some fancy bayesian shizzle
                            def key_func(row, i=i):
                                total = sum(row[1][i])
                                if total == 0:
                                    return '-1'
                                return str(1 - float(row[1][i][0]) / total) + str(total).zfill(4)
                            rows.sort(key=key_func, reverse=True)
                    else:
                        print("Unimplemented sort_mode")
                    row_pos_for_idx = {idx: pos for pos, idx in enumerate(row_idx)}
                    # Now just get the row_idx to pass to 2d extract for the slice we need.
                    # A missing offset means "from the start"; a missing limit means "to the end".
                    first = row_offset or 0
                    last = None if row_limit is None else first + row_limit
                    row_idx = np.array([row[0] for row in rows][first:last])
                    # Use this row idx to retrieve the row data from the initial query
                    for name, array in list(row_result.items()):
                        row_result[name] = array[[row_pos_for_idx[idx] for idx in row_idx]]

                two_d_result = extract2D(dataset, datatable, row_idx, col_idx, two_d_properties)

                result_set = []
                for name, array in list(col_result.items()):
                    result_set.append((('col_' + name), array))
                for name, array in list(row_result.items()):
                    result_set.append((('row_' + name), array))
                for name, array in list(two_d_result.items()):
                    result_set.append((('2D_' + name), array))
        data = gzip(data=b''.join(arraybuffer.encode_array_set(result_set)))
        cache[cache_key] = data
    status = '200 OK'
    response_headers = [('Content-type', 'text/plain'),
                        ('Content-Length', str(len(data))),
                        ('Content-Encoding', 'gzip'),
                        ('Access-Control-Allow-Origin', '*')
                        ]
    start_response(status, response_headers)
    yield data