Exemplo n.º 1
0
def get_sql_results(self, query_id, return_results=True, store_results=False):
    """Executes the sql query returns the results."""
    if not self.request.called_directly:
        engine = sqlalchemy.create_engine(
            app.config.get('SQLALCHEMY_DATABASE_URI'), poolclass=NullPool)
        session_class = sessionmaker()
        session_class.configure(bind=engine)
        session = session_class()
    else:
        session = db.session()
        session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')
    db_engine_spec = database.db_engine_spec

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql)
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(executed_sql, query.tmp_table_name,
                                       database.force_ctas_schema)
        query.select_as_cta_used = True
    elif (query.limit and is_select
          and db_engine_spec.limit_method == LimitMethod.WRAP_SQL):
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        template_processor = get_template_processor(database=database,
                                                    query=query)
        executed_sql = template_processor.process_template(executed_sql)
    except Exception as e:
        logging.exception(e)
        msg = "Template rendering failed: " + utils.error_msg_from_exception(e)
        handle_error(msg)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))

    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    db_engine_spec.handle_cursor(cursor, query, session)

    cdf = None
    if result_proxy.cursor:
        column_names = [col[0] for col in result_proxy.cursor.description]
        if db_engine_spec.limit_method == LimitMethod.FETCH_MANY:
            data = result_proxy.fetchmany(query.limit)
        else:
            data = result_proxy.fetchall()
        cdf = dataframe.CaravelDataFrame(
            pd.DataFrame(data, columns=column_names))

    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and cdf:
        # Presto doesn't provide result_proxy.row_count
        query.rows = cdf.size
    if query.select_as_cta:
        query.select_sql = '{}'.format(
            database.select_star(query.tmp_table_name, limit=query.limit))
    query.end_time = utils.now_as_float()
    session.flush()

    payload = {
        'query_id': query.id,
        'status': query.status,
        'data': [],
    }
    payload['data'] = cdf.data if cdf else []
    payload['columns'] = cdf.columns_dict if cdf else []
    payload['query'] = query.to_dict()
    payload = json.dumps(payload, default=utils.json_iso_dttm_ser)

    if store_results and results_backend:
        key = '{}'.format(uuid.uuid4())
        logging.info("Storing results in results backend, key: {}".format(key))
        results_backend.set(key, zlib.compress(payload))
        query.results_key = key

    session.flush()
    session.commit()

    if return_results:
        return payload
Exemplo n.º 2
0
def get_sql_results(query_id, return_results=True):
    """Executes the sql query returns the results."""
    session = db.session()
    session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')


    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql);
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id,
                start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(
            executed_sql, query.tmp_table_name, database.force_ctas_schema)
        query.select_as_cta_used = True
    elif query.limit and is_select:
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))

    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    if database.backend == 'presto':
        polled = cursor.poll()
        # poll returns dict -- JSON status information or ``None``
        # if the query is done
        # https://github.com/dropbox/PyHive/blob/
        # b34bdbf51378b3979eaf5eca9e956f06ddc36ca0/pyhive/presto.py#L178
        while polled:
            # Update the object and wait for the kill signal.
            stats = polled.get('stats', {})
            if stats:
                completed_splits = float(stats.get('completedSplits'))
                total_splits = float(stats.get('totalSplits'))
                if total_splits and completed_splits:
                    progress = 100 * (completed_splits / total_splits)
                    if progress > query.progress:
                        query.progress = progress
                    session.commit()
            time.sleep(1)
            polled = cursor.poll()

    cdf = None
    if result_proxy.cursor:
        column_names = [col[0] for col in result_proxy.cursor.description]
        data = result_proxy.fetchall()
        cdf = dataframe.CaravelDataFrame(
            pd.DataFrame(data, columns=column_names))
    # TODO consider generating tuples instead of dicts to send
    # less data through the wire. The command bellow does that,
    # but we'd need to align on the client side.
    # data = df.values.tolist()

    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and cdf:
        # Presto doesn't provide result_proxy.row_count
        query.rows = cdf.size
    if query.select_as_cta:
        query.select_sql = '{}'.format(database.select_star(
            query.tmp_table_name, limit=query.limit))
    query.end_time = utils.now_as_float()
    session.commit()

    if return_results:
        payload = {
            'query_id': query.id,
            'status': query.status,
            'data': [],
        }
        if query.status == models.QueryStatus.SUCCESS:
            payload['data'] = cdf.data if cdf else []
            payload['columns'] = cdf.columns_dict if cdf else []
        else:
            payload['error'] = query.error_message
        return payload
    '''
Exemplo n.º 3
0
def get_sql_results(query_id, return_results=True):
    """Executes the sql query returns the results."""
    db.session.commit()  # HACK
    q = db.session.query(models.Query).all()
    query = db.session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')

    # Limit enforced only for retrieving the data, not for the CTA queries.
    if is_query_select(executed_sql):
        if query.select_as_cta:
            if not query.tmp_table_name:
                start_dttm = datetime.fromtimestamp(query.start_time)
                query.tmp_table_name = 'tmp_{}_table_{}'.format(
                    query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
            executed_sql = create_table_as(executed_sql, query.tmp_table_name)
            query.select_as_cta_used = True
        elif query.limit:
            executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
            query.limit_used = True
        engine = database.get_sqla_engine(schema=query.schema)
        try:
            query.executed_sql = executed_sql
            logging.info("Running query: \n{}".format(executed_sql))
            result_proxy = engine.execute(query.executed_sql,
                                          schema=query.schema)
        except Exception as e:
            logging.exception(e)
            query.error_message = utils.error_msg_from_exception(e)
            query.status = QueryStatus.FAILED
            query.tmp_table_name = None
            db.session.commit()
            raise Exception(query.error_message)

        cursor = result_proxy.cursor
        query.status = QueryStatus.RUNNING
        db.session.flush()
        if hasattr(cursor, "poll"):
            polled = cursor.poll()
            # poll returns dict -- JSON status information or ``None``
            # if the query is done
            # https://github.com/dropbox/PyHive/blob/
            # b34bdbf51378b3979eaf5eca9e956f06ddc36ca0/pyhive/presto.py#L178
            while polled:
                # Update the object and wait for the kill signal.
                stats = polled.get('stats', {})
                if stats:
                    completed_splits = float(stats.get('completedSplits'))
                    total_splits = float(stats.get('totalSplits'))
                    if total_splits and completed_splits:
                        progress = 100 * (completed_splits / total_splits)
                        if progress > query.progress:
                            query.progress = progress
                        db.session.commit()
                time.sleep(1)
                polled = cursor.poll()

        columns = None
        data = None
        if result_proxy.cursor:
            cols = [col[0] for col in result_proxy.cursor.description]
            data = result_proxy.fetchall()
            df = pd.DataFrame(data, columns=cols)
            columns = [c for c in df.columns]
            data = df.to_dict(orient='records')

        query.rows = result_proxy.rowcount
        query.progress = 100
        query.status = QueryStatus.SUCCESS
        if query.rows == -1 and data:
            # Presto doesn't provide result_proxy.row_count
            query.rows = len(data)

        # CTAs queries result in 1 cell having the # of the added rows.
        if query.select_as_cta:
            query.select_sql = '{}'.format(
                database.select_star(query.tmp_table_name, limit=query.limit))

        query.end_time = utils.now_as_float()
        db.session.commit()

    payload = {
        'query_id': query.id,
        'status': query.status,
    }
    if query.status == models.QueryStatus.SUCCESS:
        payload['data'] = data
        payload['columns'] = columns
    else:
        payload['error'] = query.error_message
    if return_results:
        return payload
Exemplo n.º 4
0
def get_sql_results(query_id, return_results=True):
    """Executes the sql query returns the results."""
    session = db.session()
    session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql)
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(executed_sql, query.tmp_table_name,
                                       database.force_ctas_schema)
        query.select_as_cta_used = True
    elif query.limit and is_select:
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))

    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    if database.backend == 'presto':
        polled = cursor.poll()
        # poll returns dict -- JSON status information or ``None``
        # if the query is done
        # https://github.com/dropbox/PyHive/blob/
        # b34bdbf51378b3979eaf5eca9e956f06ddc36ca0/pyhive/presto.py#L178
        while polled:
            # Update the object and wait for the kill signal.
            stats = polled.get('stats', {})
            if stats:
                completed_splits = float(stats.get('completedSplits'))
                total_splits = float(stats.get('totalSplits'))
                if total_splits and completed_splits:
                    progress = 100 * (completed_splits / total_splits)
                    if progress > query.progress:
                        query.progress = progress
                    session.commit()
            time.sleep(1)
            polled = cursor.poll()

    columns = None
    data = None
    if result_proxy.cursor:
        columns = [col[0] for col in result_proxy.cursor.description]
        data = result_proxy.fetchall()
        df = pd.DataFrame(data, columns=columns)
        df = df.where((pd.notnull(df)), None)
        # TODO consider generating tuples instead of dicts to send
        # less data through the wire. The command bellow does that,
        # but we'd need to align on the client side.
        # data = df.values.tolist()
        data = df.to_dict(orient='records')

    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and data:
        # Presto doesn't provide result_proxy.row_count
        query.rows = len(data)

    # CTAs queries result in 1 cell having the # of the added rows.
    if query.select_as_cta:
        query.select_sql = '{}'.format(
            database.select_star(query.tmp_table_name, limit=query.limit))

    query.end_time = utils.now_as_float()
    session.commit()

    payload = {
        'query_id': query.id,
        'status': query.status,
        'data': [],
    }
    if query.status == models.QueryStatus.SUCCESS:
        payload['data'] = data
        payload['columns'] = columns
    else:
        payload['error'] = query.error_message
    if return_results:
        return payload
    '''
Exemplo n.º 5
0
def get_sql_results(self, query_id, return_results=True, store_results=False):
    """Executes the sql query returns the results."""
    if not self.request.called_directly:
        engine = sqlalchemy.create_engine(
            app.config.get('SQLALCHEMY_DATABASE_URI'), poolclass=NullPool)
        session_class = sessionmaker()
        session_class.configure(bind=engine)
        session = session_class()
    else:
        session = db.session()
        session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')
    db_engine_spec = database.db_engine_spec

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql);
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id,
                start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(
            executed_sql, query.tmp_table_name, database.force_ctas_schema)
        query.select_as_cta_used = True
    elif (
            query.limit and is_select and
            db_engine_spec.limit_method == LimitMethod.WRAP_SQL):
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        template_processor = get_template_processor(
            database=database, query=query)
        executed_sql = template_processor.process_template(executed_sql)
    except Exception as e:
        logging.exception(e)
        msg = "Template rendering failed: " + utils.error_msg_from_exception(e)
        handle_error(msg)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))

    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    db_engine_spec.handle_cursor(cursor, query, session)

    cdf = None
    if result_proxy.cursor:
        column_names = [col[0] for col in result_proxy.cursor.description]
        if db_engine_spec.limit_method == LimitMethod.FETCH_MANY:
            data = result_proxy.fetchmany(query.limit)
        else:
            data = result_proxy.fetchall()
        cdf = dataframe.CaravelDataFrame(
            pd.DataFrame(data, columns=column_names))

    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and cdf:
        # Presto doesn't provide result_proxy.row_count
        query.rows = cdf.size
    if query.select_as_cta:
        query.select_sql = '{}'.format(database.select_star(
            query.tmp_table_name, limit=query.limit))
    query.end_time = utils.now_as_float()
    session.flush()

    payload = {
        'query_id': query.id,
        'status': query.status,
        'data': [],
    }
    payload['data'] = cdf.data if cdf else []
    payload['columns'] = cdf.columns_dict if cdf else []
    payload['query'] = query.to_dict()
    payload = json.dumps(payload, default=utils.json_iso_dttm_ser)

    if store_results and results_backend:
        key = '{}'.format(uuid.uuid4())
        logging.info("Storing results in results backend, key: {}".format(key))
        results_backend.set(key, zlib.compress(payload))
        query.results_key = key

    session.flush()
    session.commit()

    if return_results:
        return payload
Exemplo n.º 6
0
def get_sql_results(query_id, return_results=True):
    """Executes the sql query returns the results."""
    db.session.commit()  # HACK
    query = db.session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    executed_sql = query.sql.strip().strip(';')

    # Limit enforced only for retrieving the data, not for the CTA queries.
    if is_query_select(executed_sql):
        if query.select_as_cta:
            if not query.tmp_table_name:
                start_dttm = datetime.fromtimestamp(query.start_time)
                query.tmp_table_name = 'tmp_{}_table_{}'.format(
                    query.user_id,
                    start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
            executed_sql = create_table_as(
                executed_sql, query.tmp_table_name, database.force_ctas_schema)
            query.select_as_cta_used = True
        elif query.limit:
            executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
            query.limit_used = True
        engine = database.get_sqla_engine(schema=query.schema)
        try:
            query.executed_sql = executed_sql
            logging.info("Running query: \n{}".format(executed_sql))
            result_proxy = engine.execute(query.executed_sql, schema=query.schema)
        except Exception as e:
            logging.exception(e)
            query.error_message = utils.error_msg_from_exception(e)
            query.status = QueryStatus.FAILED
            query.tmp_table_name = None
            db.session.commit()
            raise Exception(query.error_message)

        cursor = result_proxy.cursor
        query.status = QueryStatus.RUNNING
        db.session.flush()
        if database.backend == 'presto':
            polled = cursor.poll()
            # poll returns dict -- JSON status information or ``None``
            # if the query is done
            # https://github.com/dropbox/PyHive/blob/
            # b34bdbf51378b3979eaf5eca9e956f06ddc36ca0/pyhive/presto.py#L178
            while polled:
                # Update the object and wait for the kill signal.
                stats = polled.get('stats', {})
                if stats:
                    completed_splits = float(stats.get('completedSplits'))
                    total_splits = float(stats.get('totalSplits'))
                    if total_splits and completed_splits:
                        progress = 100 * (completed_splits / total_splits)
                        if progress > query.progress:
                            query.progress = progress
                        db.session.commit()
                time.sleep(1)
                polled = cursor.poll()

        columns = None
        data = None
        if result_proxy.cursor:
            columns = [col[0] for col in result_proxy.cursor.description]
            data = result_proxy.fetchall()
            df = pd.DataFrame(data, columns=columns)
            df = df.where((pd.notnull(df)), None)
            # TODO consider generating tuples instead of dicts to send
            # less data through the wire. The command bellow does that,
            # but we'd need to align on the client side.
            # data = df.values.tolist()
            data = df.to_dict(orient='records')

        query.rows = result_proxy.rowcount
        query.progress = 100
        query.status = QueryStatus.SUCCESS
        if query.rows == -1 and data:
            # Presto doesn't provide result_proxy.row_count
            query.rows = len(data)

        # CTAs queries result in 1 cell having the # of the added rows.
        if query.select_as_cta:
            query.select_sql = '{}'.format(database.select_star(
                query.tmp_table_name, limit=query.limit))

        query.end_time = utils.now_as_float()
        db.session.commit()

    payload = {
        'query_id': query.id,
        'status': query.status,
    }
    if query.status == models.QueryStatus.SUCCESS:
        payload['data'] = data
        payload['columns'] = columns
    else:
        payload['error'] = query.error_message
    if return_results:
        return payload