Example #1
    def __init__(self, system=None, operators=None, is_async=False, buffer_size=1024, use_shared_mem=False):
        # type: (WorkerSystem, list, bool, int, bool) -> None
        """

        :param operators:
        """

        self.system = system
        self.use_shared_mem = use_shared_mem

        self.__timer = Timer()
        self.total_elapsed_time = 0.0
        self.__debug_timer = OrderedDict() 
        self.completion_counter = 0

        if operators is None:
            self.operators = OrderedDict()
        else:
            self.operators = OrderedDict()
            for o in operators:
                self.operators[o.name] = o

        self.is_async = is_async
        self.queue = Queue()

        self.buffer_size = buffer_size
        # Cost related metrics
        self.sql_scanned_bytes = 0
        self.sql_returned_bytes = 0
        self.returned_bytes = 0
        self.num_http_get_requests = 0
Example #2
def test_datetime_bench_py():
    dt = "1970-01-02"

    timer = Timer()
    timer.start()

    for i in range(0, 100000):
        datetime_util.py_str_to_millis(dt)

    print(timer.elapsed())
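
Every snippet on this page drives the same Timer, but its implementation is never shown. A minimal stand-in, assuming only the interface these examples exercise (start/stop/elapsed plus the start_time and running attributes read by OpMetrics below) and that elapsed time accumulates across start/stop cycles as the loop test at the end expects, could look like this sketch:

from timeit import default_timer


class Timer(object):
    """Minimal sketch of the Timer interface used in these examples (not the project's implementation)."""

    def __init__(self):
        self.start_time = None
        self.running = False
        self.__accumulated = 0.0

    def start(self):
        # Begin (or resume) timing; elapsed time from earlier start/stop cycles is kept
        self.start_time = default_timer()
        self.running = True

    def stop(self):
        # Stop timing and fold the current interval into the accumulated total
        if self.running:
            self.__accumulated += default_timer() - self.start_time
            self.running = False

    def elapsed(self):
        # Total accumulated time, including the currently running interval if any
        if self.running:
            return self.__accumulated + (default_timer() - self.start_time)
        return self.__accumulated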
Example #3
class S3Get(object):

    def __init__(self):
        self.s3key = "lineitem.csv"

        self.timer = Timer()

        # self.query_bytes = 0
        # self.bytes_scanned = 0
        # self.bytes_processed = 0
        self.bytes_returned = 0
        self.records_events = 0
        # self.rows_returned = 0

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

    def run(self):

        s3 = boto3.client('s3')

        self.timer.start()

        response = s3.get_object(
            Bucket='s3filter',
            Key=self.s3key
        )

        streaming_body = response['Body']

        bytes_ = streaming_body.read(1024)
        self.time_to_first_record_response = self.timer.elapsed()
        while bytes_:
            self.records_events += 1
            self.bytes_returned += len(bytes_)
            bytes_ = streaming_body.read(1024)
            self.time_to_last_record_response = self.timer.elapsed()

        if streaming_body:
            streaming_body.close()

        elapsed_time = self.timer.elapsed()

        print("elapsed_time: {}".format(elapsed_time))
        print("time_to_first_record_response: {}".format(self.time_to_first_record_response))
        print("time_to_last_record_response: {}".format(self.time_to_last_record_response))
        print("bytes_returned: {}".format(self.bytes_returned))
        print("records_events: {}".format(self.records_events))

        bytes_sec = self.bytes_returned / elapsed_time

        print("bytes_sec: {} (KB {:f}, MB {:f})".format(bytes_sec, bytes_sec / 1000.0, bytes_sec / 1000000))
Example #4
def test_timer_start():

    sleep_duration = 0.1

    t = Timer()

    t.start()
    time.sleep(sleep_duration)

    # print("Elapsed time {}".format(t.elapsed()))

    expected_elapsed_time = 1 * sleep_duration
    assert expected_elapsed_time - 0.1 < t.elapsed() < expected_elapsed_time + 0.1
Example #5
    def __init__(self):
        self.s3key = "lineitem.csv"

        self.timer = Timer()

        # self.query_bytes = 0
        # self.bytes_scanned = 0
        # self.bytes_processed = 0
        self.bytes_returned = 0
        self.records_events = 0
        # self.rows_returned = 0

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None
Example #6
    def __init__(self, fast_s3):
        self.fast_s3 = fast_s3

        self.s3key = None
        self.s3sql = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0

        self.num_http_get_requests = 0
Example #7
    def __init__(self):
        self.s3key = "lineitem.csv"
        self.s3sql = "select * from s3Object"
        self.event_stream = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0
        self.records_events = 0
        self.rows_returned = 0

        self.prev_record_str = None
Example #8
    def __init__(self, s3):

        self.s3 = s3

        self.s3key = None
        self.s3sql = None
        self.need_s3select = True
        self.event_stream = None
        self.table_local_file_path = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0

        self.num_http_get_requests = 0
        self.table_data = None
Example #9
class OpMetrics(object):
    def __init__(self):
        self.__timer = Timer()

    def timer_start(self):
        self.__timer.start()

    def timer_stop(self):
        self.__timer.stop()

    def elapsed_time(self):
        return self.__timer.elapsed()

    def start_time(self):
        return self.__timer.start_time

    def timer_running(self):
        return self.__timer.running

    @staticmethod
    def print_metrics(op_list):
        for op in op_list:
            OpMetrics.pretty_print(op)

    @staticmethod
    def print_overall_metrics(op_list, name=None):
        total_elapsed = 0
        for op in op_list:
            total_elapsed += op.op_metrics.elapsed_time()
        if name is None:
            name = " + ".join([o.name for o in op_list])
        print('{}: {}'.format(name, total_elapsed))

    @staticmethod
    def pretty_print(op):
        print("{}: {}".format(op, op.op_metrics))

    def __repr__(self):
        return {'elapsed_time': round(self.elapsed_time(), 5)}.__repr__()
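
A hedged usage sketch for OpMetrics: operators are expected to expose a name and an op_metrics attribute, which is what print_metrics and print_overall_metrics read. ToyOp below is a hypothetical shell, not one of the project's operators.

class ToyOp(object):
    """Hypothetical operator shell carrying the name and op_metrics attributes OpMetrics expects."""

    def __init__(self, name):
        self.name = name
        self.op_metrics = OpMetrics()


op = ToyOp('scan')
op.op_metrics.timer_start()
# ... the operator's work would happen here ...
op.op_metrics.timer_stop()

OpMetrics.print_metrics([op])          # prints something like "<ToyOp ...>: {'elapsed_time': 0.01234}"
OpMetrics.print_overall_metrics([op])  # sums elapsed_time across the supplied operators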
Example #10
    def __init__(self, s3, s3key):

        self.s3 = s3

        self.s3key = s3key 
        self.ranges = pd.DataFrame() 
        self.event_stream = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = len(self.s3key.encode('utf-8'))
        
        #self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0
        
        self.records_str = ''
        self.nthreads = 16
        
        self.num_http_get_requests = 0
Example #11
    def __init__(self):
        self.__timer = Timer()
Example #12
class S3Select(object):
    def __init__(self):
        self.s3key = "lineitem.csv"
        self.s3sql = "select * from s3Object"
        self.event_stream = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0
        self.records_events = 0
        self.rows_returned = 0

        self.prev_record_str = None

    def parse_event(self, event):
        if 'Records' in event:

            # print("Records")

            elapsed_time = self.timer.elapsed()
            if self.time_to_first_record_response is None:
                self.time_to_first_record_response = elapsed_time

            self.time_to_last_record_response = elapsed_time

            # records_str = event['Records']['Payload'].decode('utf-8')
            #
            # records_str_rdr = io.StringIO(records_str)
            #
            # for record_str in records_str_rdr:
            #
            #     # Check record ends with newline (excluding an escaped newline)
            #     if record_str.endswith('\n') and not record_str.endswith('\\n'):  # It's a complete record
            #
            #         if prev_record_str is not None:  # There was an incomplete record present in the last payload
            #             # Append the current record to the previous incomplete record
            #             # print("Appending: {} {}".format(prev_record_str, record_str))
            #             record_str = prev_record_str + record_str
            #             prev_record_str = None
            #
            #         # print("Complete: {}".format(record_str))
            #
            #         # Parse CSV
            #         record_rdr = csv.reader([record_str])
            #         if PYTHON_3:
            #             record = next(record_rdr)
            #         else:
            #             record = record_rdr.next()
            #
            #         # Do something with the record
            #         rows_returned += 1
            #
            #     else:
            #         # It's an incomplete record, save for next iteration
            #         # print("Incomplete: {}".format(record_str))
            #         prev_record_str = record_str
            self.records_events += 1

        elif 'Stats' in event:
            self.bytes_scanned += event['Stats']['Details']['BytesScanned']
            self.bytes_processed += event['Stats']['Details']['BytesProcessed']
            self.bytes_returned += event['Stats']['Details']['BytesReturned']
            print(
                "Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}"
                .format(self.bytes_scanned, self.bytes_processed,
                        self.bytes_returned))

        elif 'Progress' in event:
            # pass
            print("Progress Event")

        elif 'End' in event:
            print("End Event")
            # pass

        elif 'Cont' in event:
            # pass
            print("Cont Event")

    def run(self):

        self.query_bytes = len(self.s3key.encode('utf-8')) + len(
            self.s3sql.encode('utf-8'))

        s3 = boto3.client('s3')

        self.timer.start()

        response = s3.select_object_content(Bucket='s3filter',
                                            Key=self.s3key,
                                            ExpressionType='SQL',
                                            Expression=self.s3sql,
                                            InputSerialization={
                                                'CSV': {
                                                    'FileHeaderInfo': 'Use',
                                                    'RecordDelimiter': '|\n',
                                                    'FieldDelimiter': '|'
                                                }
                                            },
                                            OutputSerialization={'CSV': {}})

        event_stream = response['Payload']

        map(self.parse_event, event_stream)

        if event_stream:
            event_stream.close()

        elapsed_time = self.timer.elapsed()

        print("elapsed_time: {}".format(elapsed_time))
        print("time_to_first_record_response: {}".format(
            self.time_to_first_record_response))
        print("time_to_last_record_response: {}".format(
            self.time_to_last_record_response))
        print("query_bytes: {}".format(self.query_bytes))
        print("bytes_scanned: {}".format(self.bytes_scanned))
        print("bytes_processed: {}".format(self.bytes_processed))
        print("bytes_returned: {}".format(self.bytes_returned))
        print("records_events: {}".format(self.records_events))
        print("rows_returned: {}".format(self.rows_returned))

        bytes_sec = self.bytes_returned / elapsed_time

        print("bytes_sec: {} (KB {:f}, MB {:f})".format(
            bytes_sec, bytes_sec / 1000.0, bytes_sec / 1000000))
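
One portability note: this benchmark appears to target Python 2 (as does most of this page). Under Python 3 the map() call in run() is lazy, so the event stream would never be consumed; it would need to be forced, for example with an explicit loop:

        for event in event_stream:
            self.parse_event(event)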
Example #13
def run(use_pandas, secure, use_native):
    print("Cursor | Settings {}".format({
        'use_pandas': use_pandas,
        'secure': secure,
        'use_native': use_native
    }))

    key = 'sf1000-lineitem/lineitem_27.csv'
    sql = 'select * from S3Object limit 10000'

    boto3.set_stream_logger('', logging.DEBUG)

    if secure:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
    else:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10,
                     s3={'payload_signing_enabled': False})
        session = Session()
        s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    if use_pandas and not use_native:

        cur = PandasCursor(s3).select(key, sql)
        dfs = cur.execute()

        rows_returned = 0
        for df in dfs:
            rows_returned += len(df)

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    elif use_pandas and use_native:

        closure = {'df': None, 'rows_returned': 0}

        def on_data(data):

            # print("|||")
            # print(type(data))
            # print(data)
            # print("|||")

            # if closure['df'] is None:
            #     closure['df'] = pd.DataFrame(data)
            # else:
            #     closure['df'] = pd.concat([closure['df'], pd.DataFrame(data)], ignore_index=True, copy=False)

            closure['rows_returned'] += len(pd.DataFrame(data))

        cur = NativeCursor(scan).select(key, sql)
        cur.execute(on_data)

        rows_returned = closure['rows_returned']

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    else:

        timer = Timer()
        timer.start()

        cur = Cursor(s3).select(key, sql)
        rows = cur.execute()
        for _ in rows:
            pass

        print(timer.elapsed())

    print("Cursor | Done")
Example #14
class QueryPlan(object):
    """Container for the operators in a query. Really a convenience class that allows the plan graph to be
    generated and the operator execution metrics to be collected and printed.

    """

    def __init__(self, system=None, operators=None, is_async=False, buffer_size=1024, use_shared_mem=False):
        # type: (WorkerSystem, list, bool, int, bool) -> None
        """

        :param operators:
        """

        self.system = system
        self.use_shared_mem = use_shared_mem

        self.__timer = Timer()
        self.total_elapsed_time = 0.0
        self.__debug_timer = OrderedDict() 
        self.completion_counter = 0

        if operators is None:
            self.operators = OrderedDict()
        else:
            self.operators = OrderedDict()
            for o in operators:
                self.operators[o.name] = o

        self.is_async = is_async
        self.queue = Queue()

        self.buffer_size = buffer_size
        # Cost related metrics
        self.sql_scanned_bytes = 0
        self.sql_returned_bytes = 0
        self.returned_bytes = 0
        self.num_http_get_requests = 0

    def get_operator(self, name):
        return self.operators[name]

    def add_operator(self, operator):
        """Adds the operator to the list of operators in the plan. This method ensures the operators are sorted by
        operator name - which is important only for retrieving the root operators.

        :param operator:
        :return:
        """

        if operator.name in self.operators:
            raise Exception("Cannot add multiple operators with same name. "
                            "Operator '{}' already added".format(operator.name))

        operator.set_query_plan(self)
        operator.set_buffer_size(self.buffer_size)

        self.operators[operator.name] = operator
        sorted_operators = sorted(self.operators.values(), key=lambda o: o.name)
        self.operators = OrderedDict((o.name, o) for o in sorted_operators)

        return operator

    def traverse_depth_first(self, from_operator, visited_operators):
        """Returns a sorted list of the operators according to where they appear in the query plan using depth first
        traversal, starting from the supplied operator.

        :param from_operator: First operator to start from
        :param visited_operators: Operators already traversed
        :return: Sorted list of operators
        """

        if from_operator.name not in visited_operators:
            visited_operators[from_operator.name] = from_operator

        for c in from_operator.consumers:
            visited_operators = self.traverse_depth_first(c, visited_operators)

        return visited_operators

    @staticmethod
    def traverse_breadth_first(from_operators):
        """Returns a sorted list of the operators according to where they appear in the query plan using breadth
        first traversal, starting from the supplied operators.

        :param from_operators: The operators to start from
        :return: Sorted list of operators
        """

        visited_operators = OrderedDict()
        operators_list = deque(from_operators)
        while operators_list:
            o = operators_list.popleft()
            if o.name not in visited_operators:
                visited_operators[o.name] = o

                # Add the consumers except any visited
                for c in o.consumers:
                    if c.name not in visited_operators:
                        operators_list.append(c)

        return visited_operators.values()

    def traverse_topological(self):
        """This method sorts the operators topgraphically and by operator name. This is to ensure
        that the operator metrics are written in the order that most closely aligns with the plan. Note that this
        is not the same order that operators execute in, which may be counterintuitive - e.g. A top operator can
        complete before a scan.

        :return: Sorted list of operators
        """

        g = networkx.DiGraph()

        for o in self.operators.values():
            g.add_node(o.name, value=o)

        for o in self.operators.values():
            for c in o.consumers:
                g.add_edge(o.name, c.name)

        sorted_operators = list(map(lambda name: g.nodes[name]['value'], networkx.topological_sort(g)))

        return sorted_operators

    def traverse_depth_first_from_root(self):
        """Returns a sorted list of the operators according to where they appear in the query plan, starting from
        the root operators.

        :return: Sorted list of operators
        """

        # Find the operators at the root of the graph, they will have no producers
        root_operators = self.find_root_operators()

        visited_operators = OrderedDict()
        for o in root_operators:
            visited_operators = self.traverse_depth_first(o, visited_operators)

        return visited_operators.values()

    def traverse_breadth_first_from_root(self):
        """Returns a sorted list of the operators according to where they appear in the query plan, starting from
        the root operators.

        :return: Sorted list of operators
        """

        # Find the operators at the root of the graph, they will have no producers
        root_operators = self.find_root_operators()

        visited_operators = self.traverse_breadth_first(root_operators)

        return visited_operators

    def traverse_topological_from_root(self):
        visited_operators = self.traverse_topological()
        return visited_operators

    def find_root_operators(self):
        root_operators = []
        for o in self.operators.values():
            if not o.producers:
                root_operators.append(o)
        return root_operators

    def send(self, message, operator_name, sender_op):
        # NOTE: Have to check that the sender op is async otherwise won't be able to send any messages back to it
        # (which is needed to send dataframes)
        if self.use_shared_mem:
            if type(message) is list:
                for e in message[0]:
                    if type(e) is pd.DataFrame:
                        self.system.send(operator_name, DataFrameMessage(e), sender_op.worker)
                    else:
                        self.system.send(operator_name, e, sender_op.worker)
            else:
                self.system.send(operator_name, message, sender_op.worker)
        else:
            if type(message) is list and len(message) <= 0:
                return
            else:
                o = self.operators[operator_name]
                o.queue.put(cPickle.dumps(message, cPickle.HIGHEST_PROTOCOL))

    def print_metrics(self):

        print("")
        print("Metrics")
        print("-------")
        print("")

        print("Plan")
        print("----")

        print("buffer_size: {}".format(self.buffer_size))
        print("is_parallel: {}".format(self.is_async))
        print("total_elapsed_time: {}".format(round(self.total_elapsed_time, 5)))

        print("")
        print("Cost")
        print("----")
        self.print_cost_metrics()

        print("")
        print("Operator Completion Time")
        print("------------------------")
        for k, v in self.__debug_timer.items():
            print("{}: {}".format(k, v))

        print("")
        print("Operators")
        print("---------")

        OpMetrics.print_metrics(self.traverse_topological_from_root())

        # self.assert_operator_time_equals_plan_time()

        print("")

    def print_cost_metrics(self):
        scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")]
        for op in scan_operators:
            if type(op) is TableRangeAccess:
                self.returned_bytes += op.op_metrics.bytes_returned
                self.num_http_get_requests += op.op_metrics.num_http_get_requests
            elif type(op) is TableScan:
                self.returned_bytes += op.op_metrics.bytes_returned
                self.num_http_get_requests += op.op_metrics.num_http_get_requests
            elif type(op) is SQLTableScan:
                self.sql_returned_bytes += op.op_metrics.bytes_returned
                self.sql_scanned_bytes += op.op_metrics.bytes_scanned
                self.num_http_get_requests += op.op_metrics.num_http_get_requests
            elif type(op) is SQLTableScanBloomUse:
                self.sql_returned_bytes += op.op_metrics.bytes_returned
                self.sql_scanned_bytes += op.op_metrics.bytes_scanned
                self.num_http_get_requests += op.op_metrics.num_http_get_requests
            else:
                # raise Exception("Unrecognized scan operator {}".format(type(op)))
                pass

        print("sql_scanned_bytes: {}".format(self.sql_scanned_bytes))
        print("sql_returned_bytes: {}".format(self.sql_returned_bytes))
        print("returned_bytes: {}".format(self.returned_bytes))
        print("num_http_get_requests: {}".format(self.num_http_get_requests))
        print("")

        cost, bytes_scanned, bytes_returned, http_requests, rows = self.cost()
        computation_cost = self.computation_cost()
        data_cost = self.data_cost()[0]
        print("total_scanned_bytes: {} MB".format(bytes_scanned * BYTE_TO_MB))
        print("total_returned_bytes: {} MB".format(bytes_returned * BYTE_TO_MB))
        print("total_http_get_requests: {}".format(http_requests))
        print("total_returned_rows: {}".format(rows))
        print("computation_cost: ${0:.10f}".format(computation_cost))
        print("data_cost: ${0:.10f}".format(data_cost))
        print("total_cost: ${0:.10f}".format(cost))

    def assert_operator_time_equals_plan_time(self):
        """Sanity check to make sure cumulative operator exec time approximately equals total plan exec time. We use a
        10% margin of error to account for any time not captured during a context switch.

        Disabled when -O flag is set

        :return: None
        """

        if __debug__:

            margin = 0.1

            cum_op_time = 0.0
            for o in self.operators.values():
                cum_op_time += o.op_metrics.elapsed_time()

            upper_time_bound = self.total_elapsed_time * (1.0 + margin)
            lower_time_bound = self.total_elapsed_time * (1.0 - margin)

            assert lower_time_bound <= cum_op_time <= upper_time_bound, \
                "Cumulative operator execution time {} is not within {} seconds of total plan execution time {}".format(
                    margin, cum_op_time, self.total_elapsed_time)

    def write_graph(self, dir_, name):

        graph = Graph(name)

        for o in self.operators.values():
            graph.add_operator(o)

        graph.write(dir_)

    def execute(self):

        if self.is_async:
            map(lambda o: o.init_async(self.queue, self.system, self.use_shared_mem), self.operators.values())
            if self.use_shared_mem:
                self.system.start()
            else:
                map(lambda o: o.boot(), self.operators.values())

        # Find the root operators
        root_operators = self.find_root_operators()

        self.__timer.start()

        # Start the root operators
        for ro in root_operators:
            ro.start()

        # Things become event driven from here, because we may be running async, so we essentially go
        # into an event loop with a state machine to keep track of where we are
        # async_operators = {name: o for name, o in filter(lambda (name, o): o.async_, self.operators.items())}

        if self.is_async:
	    print("Operators: ")
	    print(self.operators)
            operator_completions = {k: False for k, v in self.operators.items()}
            while not all(operator_completions.values()):
                completed_message = self.listen(OperatorCompletedMessage)
                operator_completions[completed_message.name] = True
                self.debug_time(completed_message.name)

        self.__timer.stop()

        self.total_elapsed_time = self.__timer.elapsed()

        if self.is_async:
            # retrieve metrics of all operators. This is important in async queries since the operator runs in a
            # different process.
            operators = self.traverse_topological_from_root()
            for o in operators:
                if o.async_:
                    if self.use_shared_mem:
                        self.system.send(o.name, EvalMessage("self.op_metrics"), None)
                    else:
                        p_message = pickle.dumps(EvalMessage("self.op_metrics"))
                        o.queue.put(p_message)

                    evaluated_msg = self.listen(EvaluatedMessage)  # type: EvaluatedMessage
                    o.op_metrics = evaluated_msg.val
                else:
                    # Don't need to request metrics
                    pass

            map(lambda op: op.set_completed(True), self.operators.values())

    def listen(self, message_type):
        # type: (TypeVar[MessageBase]) -> MessageBase
        try:
            if self.use_shared_mem:
                msg = self.system.listen(message_type)
                return msg
            else:
                while True:
                    p_item = self.queue.get()
                    item = cPickle.loads(p_item)
                    # print(item)

                    if type(item) == message_type:
                        return item
                    else:
                        # Not the message being listened for, warn and skip
                        # This isn't exceptional, but the listener should be made aware that there are messages arriving that
                        # are being ignored
                        warning("While listening for message type {} received message type {} with contents {}".format(message_type, type(item), item))


        except BaseException as e:
            tb = traceback.format_exc(e)
            print(tb)

    def stop(self):
        if self.is_async:
            if self.use_shared_mem:
                self.system.send_all(StopMessage())
                self.system.join()
                self.system.close()
            else:
                map(lambda o: o.queue.put(cPickle.dumps(StopMessage())), filter(lambda o: o.async_, self.operators.values()))
                self.join()
                map(lambda o: o.queue.close(), filter(lambda o: o.async_, self.operators.values()))
        else:
            pass

    def join(self):
        return map(lambda o: o.join(), self.operators.values())

    def get_phase_runtime(self, phase_keyword=None):
        phase_operators = [op for op in self.operators.values() if phase_keyword in op.name]

        op_start_times = [op.op_metrics.start_time() for op in phase_operators]
        op_end_times = [op.op_metrics.elapsed_time() + op.op_metrics.start_time() for op in phase_operators]

        return max(op_end_times) - min(op_start_times)

    def cost(self):
        """
        calculates the overall query cost when run on S3 by combining the cost of all scan operators in the query
        with the computation cost based on the EC2 instance type and the query running time
        :return: the estimated cost of the whole query
        """
        total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests, total_rows = self.data_cost()
        total_compute_cost = self.computation_cost()

        return total_compute_cost + total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests,\
               total_rows

    def data_cost(self, ec2_region=None, phase_keyword=None):
        """
        calculates the estimated data cost when the query runs on S3 by combining the cost of all scan operators in the query
        :return: the estimated data cost of the whole query
        """
        if phase_keyword is None:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")]
        else:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \
                              phase_keyword in op.name]

        total_data_cost = 0.0
        total_scanned_bytes = 0
        total_returned_bytes = 0
        total_http_requests = 0
        total_rows = 0

        for op in scan_operators:
            if op.is_completed():
                total_data_cost += op.op_metrics.data_cost(ec2_region)
                total_returned_bytes += op.op_metrics.bytes_returned
                total_scanned_bytes += op.op_metrics.bytes_scanned
                total_http_requests += op.op_metrics.num_http_get_requests
                total_rows += op.op_metrics.rows_returned
            else:
                raise Exception("Can't calculate query cost while one or more scan operators {} are still executing"
                                .format(op.name))

        return total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests, total_rows

    def data_transfer_cost(self, ec2_region=None, phase_keyword=None):
        if phase_keyword is None:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")]
        else:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \
                              phase_keyword in op.name]

        total_data_transfer_cost = 0.0
        total_bytes_returned = 0
        total_returned_rows = 0

        for op in scan_operators:
            if op.is_completed():
                total_data_transfer_cost += op.op_metrics.data_transfer_cost()
                total_bytes_returned += op.op_metrics.bytes_returned
                total_returned_rows += op.op_metrics.rows_returned
            else:
                raise Exception("Can't calculate query cost while one or more scan operators {} are still executing"
                                .format(op.name))

        return total_bytes_returned, total_returned_rows, total_data_transfer_cost

    def data_scanning_cost(self, phase_keyword=None):
        if phase_keyword is None:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")]
        else:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \
                              phase_keyword in op.name]

        total_data_scanning_cost = 0.0
        total_bytes_scanned = 0

        for op in scan_operators:
            if op.is_completed():
                total_data_scanning_cost += op.op_metrics.data_scan_cost()
                total_bytes_scanned += op.op_metrics.bytes_scanned
            else:
                raise Exception("Can't calculate query cost while one or more scan operators {} are still executing"
                                .format(op.name))

        return total_bytes_scanned, total_data_scanning_cost

    def requests_cost(self, phase_keyword=None):
        if phase_keyword is None:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")]
        else:
            scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \
                              phase_keyword in op.name]

        total_http_requests_cost = 0.0
        total_http_requests = 0

        for op in scan_operators:
            if op.is_completed():
                total_http_requests_cost += op.op_metrics.requests_cost()
                total_http_requests += op.op_metrics.num_http_get_requests
            else:
                raise Exception("Can't calculate query cost while one or more scan operators {} are still executing"
                                .format(op.name))

        return total_http_requests, total_http_requests_cost

    def computation_cost(self, ec2_instance_type=None, os_type=None):
        """
        calculates the estimated computation cost of the query
        :return: computation cost
        """
        scan_op = None

        for op in self.operators.values():
            if hasattr(op.op_metrics, "cost"):
                scan_op = op
                break

        return scan_op.op_metrics.computation_cost(self.total_elapsed_time, ec2_instance_type, os_type)

    def estimated_cost_for_config(self, ec2_region, ec2_instance_type=None, os_type=None):
        """
        estimates the hypothetical cost if the query were run on an EC2 instance in a different region
        :param ec2_region: AWSRegion to calculate the cost for
        :param ec2_instance_type: the code of an EC2 instance as defined by AWS (e.g. r4.8xlarge)
        :param os_type: the name of the os running on the host machine (Linux, Windows, etc.)
        :return: the hypothetical estimated cost for the provided region, instance type and os
        """
        return self.computation_cost(ec2_instance_type, os_type) + self.data_cost(ec2_region)[0]

    def debug_time(self, name):
        self.__debug_timer[name] = self.__timer.elapsed()

    def retrieve_sampling_threshold(self):
        sampling_filter_builder = [op for op in self.operators.values() if type(op) == TopKFilterBuild]
        if len(sampling_filter_builder) > 0:
            filter_builder = sampling_filter_builder[0]
            if filter_builder.async_:
                if self.use_shared_mem:
                    self.system.send(filter_builder.name, EvalMessage("self.threshold"), None)
                else:
                    p_message = pickle.dumps(EvalMessage("self.threshold"))
                    filter_builder.queue.put(p_message)

                evaluated_msg = self.listen(EvaluatedMessage)  # type: EvaluatedMessage
                filter_builder.threshold = evaluated_msg.val

                return filter_builder.threshold
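
A hedged sketch of wiring and executing a QueryPlan on the synchronous path. ToyOperator is a hypothetical stand-in (not one of the project's operators) that exposes just the attributes the plan touches when is_async is False, and the import path is an assumption.

from s3filter.plan.query_plan import QueryPlan  # assumed module path


class ToyOperator(object):
    """Hypothetical stand-in for a real operator such as SQLTableScan."""

    def __init__(self, name):
        self.name = name
        self.consumers = []
        self.producers = []
        self.async_ = False

    def set_query_plan(self, query_plan):
        self.query_plan = query_plan

    def set_buffer_size(self, buffer_size):
        self.buffer_size = buffer_size

    def connect(self, consumer):
        # Wire producer -> consumer both ways, as the traversal methods expect
        self.consumers.append(consumer)
        consumer.producers.append(self)

    def start(self):
        # A real scan operator would fetch data and emit tuples to its consumers here
        print("{} started".format(self.name))


plan = QueryPlan(is_async=False)
scan = plan.add_operator(ToyOperator('scan'))
project = plan.add_operator(ToyOperator('project'))
scan.connect(project)

print([o.name for o in plan.find_root_operators()])             # ['scan']
print([o.name for o in plan.traverse_topological_from_root()])  # ['scan', 'project']

plan.execute()
print(plan.total_elapsed_time)
plan.stop()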
Example #15
class PandasCursor(object):
    """Represents a database cursor for managing the context of a fetch operation.

    Intended to be modelled after the Python DB API. All it supports at present is select, taking an s3 key and sql
    string to execute. The rows returned from s3 are returned as an iterator.

    Importantly, it supports streaming of records, which gives more control than simply passing Python data structures
    around.

    """

    def __init__(self, s3):

        self.s3 = s3

        self.s3key = None
        self.s3sql = None
        self.event_stream = None
        self.table_local_file_path = None
        self.need_s3select = True

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0

        self.num_http_get_requests = 0
        self.table_data = None

        self.input = Format.CSV

    def parquet(self):
        self.input = Format.PARQUET
        return self

    def csv(self):
        self.input = Format.CSV
        return self

    def select(self, s3key, s3sql):
        """Creates a select cursor

        :param s3sql:  the sql to execute
        :param s3key: the s3 key to select against
        :return: the cursor
        """

        self.s3key = s3key
        self.s3sql = s3sql

        # TODO:only simple SQL queries are considered. Nested and complex queries will need a lot of work to handle
        self.need_s3select = not (s3sql.lower().replace(';', '').strip() == 'select * from s3object')

        # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes
        self.query_bytes = len(self.s3key.encode('utf-8')) + len(self.s3sql.encode('utf-8'))

        return self

    def execute(self):
        """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could
        model that API more precisely in future.

        :return: An iterable of the records fetched
        """
	
	"""
	All operations return a CSV file
	"""
	print("Executing Pandas cursor!")
	self.timer.start()
	self.input = Format.CSV
	
	config = TransferConfig(
                multipart_chunksize=8 * MB,
                multipart_threshold=8 * MB
	)
			
	
	try:	
		#h1 = http.client.HTTPConnection('127.0.0.1:5000')
		#h1 = http.client.HTTPConnection("3.87.65.94:5000")
		h1 = http.client.HTTPConnection(FILTER_IP)
		#print("Connected")
		
		h1.request('POST','/' + S3_BUCKET_NAME + '/' + self.s3key, body=self.s3sql)
		#print("Made request!")
		r = h1.getresponse()
		#print(r.read())
		r2 = r.read()#.decode("utf-8")
		#print(r2)	
		#LOST TO DO!!
		self.table_data = io.BytesIO()
		self.table_data.write(r2)
		
		#print("HELLO!")
		#print(self.table_data.getvalue().decode('utf-8'))
		self.num_http_get_requests = PandasCursor.calculate_num_http_requests(self.table_data, config)		
	
		return self.parse_file()	
	except Exception as e:
		print(e)
        
	#return self.parse_file()

    def parse_event_stream(self):
        """Generator that hands out records from the event stream lazily

        :return: Generator of the records returned from s3
        """

        prev_record_str = None

        records_str_rdr = cStringIO.StringIO()
        for event in self.event_stream:

            if 'Records' in event:

                elapsed_time = self.timer.elapsed()
                if self.time_to_first_record_response is None:
                    self.time_to_first_record_response = elapsed_time

                self.time_to_last_record_response = elapsed_time

                # records_str = event['Records']['Payload'].decode('utf-8')
                records_str = event['Records']['Payload']

                if prev_record_str is not None:
                    records_str_rdr.write(prev_record_str)
                    prev_record_str = None

                if records_str.endswith('\n') and not records_str.endswith('\\n'):
                    records_str_rdr.write(records_str)
                else:
                    last_newline_pos = records_str.rfind('\n', 0, len(records_str))
                    if last_newline_pos == -1:
                        prev_record_str = records_str
                    else:
                        prev_record_str = records_str[last_newline_pos + 1:]
                        records_str_rdr.write(records_str[:last_newline_pos + 1])

                if records_str_rdr.tell() > 1024 * 1024 * 16:
                    records_str_rdr.seek(0)
                    df = pd.read_csv(records_str_rdr, header=None, prefix='_', dtype=numpy.str, engine='c',
                                     quotechar='"', na_filter=False, compression=None, low_memory=False)
                    records_str_rdr.close()
                    records_str_rdr = cStringIO.StringIO()
                    yield df

                # Strangely the reading with the python csv reader and then loading into a dataframe "seems" faster than
                # reading csvs with pandas, agate is another option. need to test properly

                # record_rdr = agate.csv_py2.reader(records_str_rdr)
                # df = pd.DataFrame(list(record_rdr), dtype=str)
                # df = df.add_prefix('_')

                # record_rdr = csv.reader(records_str_rdr)
                # df = pd.DataFrame(list(record_rdr), dtype=str)
                # df = df.add_prefix('_')

            elif 'Stats' in event:
                self.bytes_scanned += event['Stats']['Details']['BytesScanned']
                self.bytes_processed += event['Stats']['Details']['BytesProcessed']
                self.bytes_returned += event['Stats']['Details']['BytesReturned']
                # print("{} Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}"
                #       .format(timeit.default_timer(), self.bytes_scanned, self.bytes_processed, self.bytes_returned))

            elif 'Progress' in event:
                pass
                # print("{} Progress Event".format(timeit.default_timer()))

            elif 'End' in event:
                # print("{} End Event".format(timeit.default_timer()))

                if records_str_rdr.tell() > 0:
                    records_str_rdr.seek(0)
                    df = pd.read_csv(records_str_rdr, header=None, prefix='_', dtype=numpy.str, engine='c',
                                     quotechar='"', na_filter=False, compression=None, low_memory=False)
                    records_str_rdr.flush()
                    records_str_rdr.close()
                    yield df
                else:
                    yield pd.DataFrame()

                return

            elif 'Cont' in event:
                pass
                # print("{} Cont Event".format(timeit.default_timer()))

            elif 'RequestLevelError' in event:
                raise Exception(event)
                # print("{} Cont Event".format(timeit.default_timer()))

            else:
                raise Exception("Unrecognized event {}".format(event))

    def parse_file(self):
        if self.input is Format.CSV:
            if self.table_data and len(self.table_data.getvalue()) > 0:
                ip_stream = cStringIO.StringIO(self.table_data.getvalue().decode('utf-8'))
            elif os.path.exists(self.table_local_file_path):
                ip_stream = self.table_local_file_path
            else:
                return

            self.time_to_first_record_response = self.time_to_last_record_response = self.timer.elapsed()

            for df in pd.read_csv(ip_stream, delimiter='|',
                                  header=None,
                                  prefix='_', dtype=numpy.str,
                                  engine='c', quotechar='"', na_filter=False, compression=None, low_memory=False,
                                  skiprows=1,
                                  chunksize=10 ** 7):
                # Get read bytes
                self.bytes_returned += ip_stream.tell()

                # Drop the last column since the trailing '|' delimiter creates an empty column at the end of every record
                df_col_names = list(df)
                last_col = df_col_names[-1]
                df.drop(last_col, axis=1, inplace=True)

                yield df
        elif self.input is Format.PARQUET:
            table = pq.read_table(self.table_data)
            self.table_data = None

            yield table.to_pandas()
        else:
            raise Exception("Unrecognized input type '{}'".format(self.input))

    def close(self):
        """Closes the s3 event stream

        :return: None
        """
        if self.event_stream:
            self.event_stream.close()
        if self.table_data:
            self.table_data.close()

    def save_table(self):
        """
        Saves the table data to disk
        :return:
        """
        if self.table_data:
            proj_dir = os.environ['PYTHONPATH'].split(":")[0]
            table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC)
            create_dirs(table_loc)

            self.table_local_file_path = os.path.join(table_loc, self.s3key)

            if not os.path.exists(self.table_local_file_path):
                create_file_dirs(self.table_local_file_path)
                with open(self.table_local_file_path, 'w') as table_file:
                    table_file.write(self.table_data.getvalue())

    @staticmethod
    def calculate_num_http_requests(table_data, config):
        if table_data is not None and len(table_data.getvalue()):
            shard_max_size = config.multipart_threshold
            return math.ceil(len(table_data.getvalue()) / (1.0 * shard_max_size))

        return 1
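
A hedged usage sketch, mirroring how the pandas cursor is exercised in Example #13: construct it with a boto3 s3 client, select a key and sql string, then iterate the dataframes yielded by execute(). The bucket and key are assumed to exist.

cur = PandasCursor(s3).select('sf1000-lineitem/lineitem_27.csv', 'select * from S3Object limit 10000')

rows_returned = 0
for df in cur.execute():
    rows_returned += len(df)
cur.close()

print(rows_returned)
print(cur.bytes_returned)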
Example #16
class Cursor(object):
    """Represents a database cursor for managing the context of a fetch operation.

    Intended to be modelled after the Python DB API. All it supports at present is select, taking an s3 key and sql
    string to execute. The rows returned from s3 are returned as an iterator.

    Importantly, it supports streaming of records, which gives more control than simply passing Python data structures
    around.

    """
    def __init__(self, s3):

        self.s3 = s3

        self.s3key = None
        self.s3sql = None
        self.need_s3select = True
        self.event_stream = None
        self.table_local_file_path = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0

        self.num_http_get_requests = 0
        self.table_data = None

    def select(self, s3key, s3sql):
        """Creates a select cursor

        :param s3sql:  the sql to execute
        :param s3key: the s3 key to select against
        :return: the cursor
        """

        self.s3key = s3key
        self.s3sql = s3sql

        # TODO:only simple SQL queries are considered. Nested and complex queries will need a lot of work to handle
        self.need_s3select = not (s3sql.lower().replace(';', '').strip()
                                  == 'select * from s3object')

        # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes
        self.query_bytes = len(self.s3key.encode('utf-8')) + len(
            self.s3sql.encode('utf-8'))

        return self

    def execute(self):
        """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could
        model that API more precisely in future.

        :return: An iterable of the records fetched
        """

        # print("Executing select_object_content")

        self.timer.start()

        if not self.need_s3select:
            proj_dir = os.environ['PYTHONPATH'].split(":")[0]
            table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC)
            if not os.path.exists(table_loc):
                os.makedirs(table_loc)

            self.table_local_file_path = os.path.join(table_loc, self.s3key)

            if not os.path.exists(
                    self.table_local_file_path) or not USE_CACHED_TABLES:
                config = TransferConfig(multipart_chunksize=8 * MB,
                                        multipart_threshold=8 * MB)

                self.table_data = io.BytesIO()

                self.s3.download_fileobj(Bucket=S3_BUCKET_NAME,
                                         Key=self.s3key,
                                         Fileobj=self.table_data,
                                         Config=config)

                self.num_http_get_requests = Cursor.calculate_num_http_requests(
                    self.table_data, config)

            return self.parse_file()
        else:
            # Note:
            #
            # CSV files use | as a delimiter and have a trailing delimiter so record delimiter is |\n
            #
            # NOTE: As responses are chunked the file headers are only returned in the first chunk.
            # We ignore them for now just because its simpler. It does mean the records are returned as a list
            #  instead of a dict though (can change in future).
            #
            response = self.s3.select_object_content(
                Bucket=S3_BUCKET_NAME,
                Key=self.s3key,
                ExpressionType='SQL',
                Expression=self.s3sql,
                InputSerialization={
                    'CSV': {
                        'FileHeaderInfo': 'Use',
                        'RecordDelimiter': '|\n',
                        'FieldDelimiter': '|'
                    }
                },
                OutputSerialization={'CSV': {}})

            self.event_stream = response['Payload']

            self.num_http_get_requests = 1

            return self.parse_event_stream()

    def parse_event_stream(self):
        """Generator that hands out records from the event stream lazily

        :return: Generator of the records returned from s3
        """

        prev_record_str = None

        # first_record = True

        for event in self.event_stream:

            if 'Records' in event:

                elapsed_time = self.timer.elapsed()
                if self.time_to_first_record_response is None:
                    self.time_to_first_record_response = elapsed_time

                self.time_to_last_record_response = elapsed_time

                records_str = event['Records']['Payload'].decode('utf-8')

                # print("{} Records Event {}".format(timeit.default_timer(), records_str))
                # print("{} Records Event".format(timeit.default_timer()))

                # Note that the select_object_content service may return partial chunks so we cant simply pass the
                # str to the csv reader. We need to examine the string itself to see if it's an incomplete record
                # (it won't end with a newline if its incomplete)

                records_str_rdr = io.StringIO(records_str)

                for record_str in records_str_rdr:

                    # Check record ends with newline (excluding an escaped newline)
                    if record_str.endswith('\n') and not record_str.endswith(
                            '\\n'):  # It's a complete record

                        if prev_record_str is not None:  # There was an incomplete record present in the last payload
                            # Append the current record to the previous incomplete record
                            # print("Appending: {} {}".format(prev_record_str, record_str))
                            record_str = prev_record_str + record_str
                            prev_record_str = None

                        # print("Complete: {}".format(record_str))

                        # Parse CSV
                        record_rdr = csv.reader([record_str])
                        if PYTHON_3:
                            record = next(record_rdr)
                        else:
                            record = record_rdr.next()

                        # print("Parsed: {}".format(record))

                        # if first_record:
                        #     # This will be the headers, keep them around
                        #     header = record
                        #     first_record = False
                        # else:
                        #     record_dict = dict(zip(header, record))
                        #     yield record_dict

                        # TODO: Not sure how to handle timers exactly, while time will be taken yielding it may
                        # genuinely be time that this method is waiting for data from s3. May be a case of measuring
                        # both, something to think about.

                        # self.timer.stop()
                        yield record
                        # self.timer.start()

                    else:
                        # It's an incomplete record, save for next iteration
                        # print("Incomplete: {}".format(record_str))
                        prev_record_str = record_str

            elif 'Stats' in event:
                self.bytes_scanned += event['Stats']['Details']['BytesScanned']
                self.bytes_processed += event['Stats']['Details'][
                    'BytesProcessed']
                self.bytes_returned += event['Stats']['Details'][
                    'BytesReturned']
                # print("{} Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}"
                #       .format(timeit.default_timer(), self.bytes_scanned, self.bytes_processed, self.bytes_returned))

            elif 'Progress' in event:
                pass
                # print("{} Progress Event".format(timeit.default_timer()))

            elif 'End' in event:
                # print("{} End Event".format(timeit.default_timer()))
                return

            elif 'Cont' in event:
                pass
                # print("{} Cont Event".format(timeit.default_timer()))

            elif 'RequestLevelError' in event:
                raise Exception(event)
                # print("{} Cont Event".format(timeit.default_timer()))

            else:
                raise Exception("Unrecognized event {}".format(event))

    def parse_file(self):
        try:
            if self.table_data and len(self.table_data.getvalue()) > 0:
                ip_stream = io.StringIO(
                    self.table_data.getvalue().decode('utf-8'))
            elif os.path.exists(self.table_local_file_path):
                ip_stream = open(self.table_local_file_path, 'r')
            else:
                return

            self.time_to_first_record_response = self.time_to_last_record_response = self.timer.elapsed()

            # with open(self.table_local_file_path, 'r', buffering=(2 << 16) + 8) as table_file:
            header_row = True
            for t in ip_stream:

                if header_row:
                    header_row = False
                    continue

                tup = t.split('|')[:-1]

                yield tup

        except Exception as e:
            print("can not open table file at {} with error {}".format(
                self.table_local_file_path, e.message))
            raise e
        finally:
            ip_stream.close()

    def close(self):
        """Closes the s3 event stream

        :return: None
        """
        if self.event_stream:
            self.event_stream.close()
        if self.table_data:
            self.table_data.close()

    def save_table(self):
        """
        Saves the table data to disk
        :return:
        """
        if self.table_data:
            proj_dir = os.environ['PYTHONPATH'].split(":")[0]
            table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC)
            create_dirs(table_loc)

            self.table_local_file_path = os.path.join(table_loc, self.s3key)

            if not os.path.exists(self.table_local_file_path):
                create_file_dirs(self.table_local_file_path)
                with open(self.table_local_file_path, 'w') as table_file:
                    table_file.write(self.table_data.getvalue())

    @staticmethod
    def calculate_num_http_requests(table_data, config):
        if table_data is not None and len(table_data.getvalue()):
            shard_max_size = config.multipart_threshold
            return math.ceil(
                len(table_data.getvalue()) / (1.0 * shard_max_size))

        return 1
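
A hedged usage note for the plain Cursor: when the sql is exactly 'select * from s3object' the cursor skips S3 Select and downloads the whole object instead (see execute() above), and save_table() can then cache those bytes locally for later runs. PYTHONPATH is assumed to point at the project root.

cur = Cursor(s3).select('sf1000-lineitem/lineitem_27.csv', 'select * from s3object')
for row in cur.execute():
    pass
cur.save_table()
cur.close()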
Example #17
class NativeCursor(object):
    """Represents a database cursor for managing the context of a fetch operation.

    Intended to be modelled after the Python DB API. All it supports at present is select, taking an s3 key and sql
    string to execute. The rows returned from s3 are returned as an iterator.

    Importantly, it supports streaming of records, which gives more control than simply passing Python data structures
    around.

    """
    def __init__(self, fast_s3):
        self.fast_s3 = fast_s3

        self.s3key = None
        self.s3sql = None

        self.timer = Timer()

        self.time_to_first_record_response = None
        self.time_to_last_record_response = None

        self.query_bytes = 0
        self.bytes_scanned = 0
        self.bytes_processed = 0
        self.bytes_returned = 0

        self.num_http_get_requests = 0

    def select(self, s3key, s3sql):
        """Creates a select cursor

        :param s3sql:  the sql to execute
        :param s3key: the s3 key to select against
        :return: the cursor
        """

        self.s3key = s3key
        self.s3sql = s3sql

        # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes
        self.query_bytes = len(self.s3key.encode('utf-8')) + len(
            self.s3sql.encode('utf-8'))

        return self

    def execute(self, on_data):
        """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could
        model that API more precisely in future.

        :return: An iterable of the records fetched
        """

        # print("Executing select_object_content")

        # self.last_df = None

        def on_records(np_array):

            # print("|||")
            # print(type(np_array))
            # print(np_array)
            # print("|||")

            elapsed_time = self.timer.elapsed()
            if self.time_to_first_record_response is None:
                self.time_to_first_record_response = elapsed_time

            self.time_to_last_record_response = elapsed_time

            df = pd.DataFrame(np_array)
            df = df.add_prefix('_')

            # self.last_df = df

            on_data(df)

        self.timer.start()

        self.fast_s3.execute(self.s3key, self.s3sql, on_records)

        self.bytes_scanned = self.fast_s3.get_bytes_scanned()
        self.bytes_processed = self.fast_s3.get_bytes_processed()
        self.bytes_returned = self.fast_s3.get_bytes_returned()
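
A hedged usage sketch, following Example #13: the native cursor pushes record batches to a callback instead of returning an iterator. The fast_s3 object it wraps is the native scan library handle and is assumed to be constructed elsewhere.

def on_data(df):
    print(len(df))

cur = NativeCursor(fast_s3).select('sf1000-lineitem/lineitem_27.csv', 'select * from S3Object limit 10000')
cur.execute(on_data)

print(cur.bytes_returned)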
Example #18
def test_timer_loop():

    num_sleeps = 100
    sleep_duration = 0.1

    t1 = Timer()
    t2 = Timer()

    t1.start()

    for i in range(0, num_sleeps):
        t2.start()
        time.sleep(sleep_duration)
        t2.stop()

    t1.stop()

    # print("Elapsed time {}".format(t1.elapsed()))
    # print("Elapsed time {}".format(t2.elapsed()))

    expected_elapsed_time = num_sleeps * sleep_duration
    assert expected_elapsed_time - 0.1 < t1.elapsed() < expected_elapsed_time + 0.1
    assert expected_elapsed_time - 0.1 < t2.elapsed() < expected_elapsed_time + 0.1