Example #1
class Report:
    """
    Access, filter, publish, and extract data from in-memory reports.

    Create a Report object to load basic information on a report dataset. Specify the subset of the
    report to be fetched with Report.apply_filters() and Report.clear_filters(). Fetch the dataset
    with the Report.to_dataframe() method.

    Attributes:
        connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
        report_id: Identifier of a pre-existing report containing the required data.
    """
    def __init__(self, connection, report_id):
        """
        Initialize an instance of a report.

        Args:
            connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
            report_id (str): Identifier of a pre-existing report containing the required data.
        """
        init_pbar = tqdm(
            desc='Initializing an instance of a report. Please wait...',
            bar_format='{desc}',
            leave=False,
            ncols=285)

        self._connection = connection
        self._report_id = report_id

        # load report information
        self.__definition()

        # load attribute elements
        self.__OFFSET = 0
        self.__attr_elements()

        self._filter = Filter(attributes=self._attributes,
                              metrics=self._metrics,
                              attr_elements=self._attr_elements)

        self._dataframe = None
        init_pbar.close()

    def to_dataframe(self, limit=25000, progress_bar=True):
        """Extract contents of a report instance into a Pandas Data Frame. Formerly `microstrategy.Connection.get_report()`.

        Args:
            limit (int, optional): Used to control data extract behavior on datasets with a large number of rows.
                The default is 25000. As an example, if the dataset has 50,000 rows and limit=25000, to_dataframe()
                will incrementally extract all 50,000 rows in 25,000-row chunks. Depending on system resources, a
                higher limit (e.g. 50,000) may reduce the total time required to extract the entire dataset.
            progress_bar (bool, optional): If True (default), show the download progress bar.

        Returns:
            Pandas Data Frame containing the report contents
        """
        inst_pbar = tqdm(
            desc='Connecting to MicroStrategy I-Server. Please wait...',
            bar_format='{desc}',
            leave=False,
            ncols=310)

        # Request a new instance, set instance id
        res = reports.report_instance(connection=self._connection,
                                      report_id=self._report_id,
                                      body=self._filter.filter_body(),
                                      offset=self.__OFFSET,
                                      limit=limit)
        inst_pbar.close()
        if not res.ok:
            msg = "Error getting report contents."
            self.__response_handler(response=res, msg=msg)
        else:
            _instance = res.json()
            _instance_id = _instance['instanceId']

            # Gets the pagination totals from the response object
            _pagination = _instance['result']['data']['paging']

            # If there are more rows to fetch, fetch them
            if _pagination['current'] != _pagination['total']:

                # initialize a list to capture slices from each query, and append the first request's result to the list
                table_data = [parsejson(response=_instance)]

                # Count the number of iterations
                it_total = int(_pagination['total'] /
                               limit) + (_pagination['total'] % limit != 0)
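                # Ceiling division: e.g. a total of 50,000 rows with limit=25,000 gives
                # 2 iterations; 50,001 rows give 3.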

                # Fetch add'l rows from this object instance from the intelligence server
                with tqdm(total=it_total,
                          disable=(not progress_bar)) as fetch_pbar:
                    if progress_bar:
                        fetch_pbar.update()
                        fetch_pbar.set_description("Downloading")
                        fetch_pbar.set_postfix(rows=limit)
                    for _offset in range(limit, _pagination['total'], limit):
                        if progress_bar:
                            fetch_pbar.update()
                            fetch_pbar.set_description("Downloading")
                            fetch_pbar.set_postfix(
                                rows=min(_offset +
                                         limit, _pagination['total']))
                        response = reports.report_instance_id(
                            connection=self._connection,
                            report_id=self._report_id,
                            instance_id=_instance_id,
                            offset=_offset,
                            limit=limit)
                        table_data.append(parsejson(response=response.json()))

                # concatenate and return the list of result data as a data frame
                self._dataframe = pd.concat(table_data).reset_index(drop=True)
            else:
                # otherwise parse the first result and return it as a dataframe
                self._dataframe = parsejson(response=_instance)

            return self._dataframe

    def apply_filters(self, attributes=None, metrics=None, attr_elements=None):
        """Apply filters on the report's objects, so only selected attributes, metrics and attributes' elements
        will be retrieved from Intelligence Server.

        Args:
            attributes (list, optional): attributes to be included in the filter.
            metrics (list, optional): metrics to be included in the filter.
            attr_elements (list, optional): attributes' elements to be included in the filter.
        """
        if attributes:
            self._filter.select(object_id=attributes)

        if attributes == []:
            self._filter.attr_selected = []

        if metrics:
            self._filter.select(object_id=metrics)

        if metrics == []:
            self._filter.metr_selected = []

        if attr_elements is not None:
            self._filter.select(object_id=attr_elements)

    def clear_filters(self):
        """Clear previously set filters, allowing all attributes, metrics, and attribute elements to be retrieved."""
        self._filter.clear()

    def __definition(self):
        """Get the definition of a report, including attributes and metrics. Implements GET /reports/<report_id>"""

        res = reports.report(connection=self._connection,
                             report_id=self._report_id)

        if not res.ok:
            self.__response_handler(
                response=res,
                msg="Error getting report definition. Check report ID.")
            return None
        else:
            _definition = res.json()
            self._name = _definition["name"]
            full_attributes = _definition["result"]["definition"][
                "availableObjects"]["attributes"]
            full_metrics = _definition["result"]["definition"][
                "availableObjects"]["metrics"]
            self._attributes = [{
                'name': attr['name'],
                'id': attr['id']
            } for attr in full_attributes]
            self._metrics = [{
                'name': metr['name'],
                'id': metr['id']
            } for metr in full_metrics]

    def __attr_elements(self, limit=25000):
        """Get elements of report attributes. Implements GET /reports/<report_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:

            for attr in self._attributes:
                # Fetch first chunk of attribute elements.
                res = reports.report_single_attribute_elements(
                    connection=self._connection,
                    report_id=self._report_id,
                    attribute_id=attr['id'],
                    offset=self.__OFFSET,
                    limit=limit)
                if not res.ok:
                    msg = "Error retrieving attribute '" + attr[
                        'name'] + "' elements."
                    self.__response_handler(response=res, msg=msg)

                else:
                    # Get total number of rows from headers.
                    total = int(res.headers['x-mstr-total-count'])
                    # Get attribute elements from the response.
                    elements = res.json()

                    # If total number of elements is bigger than the chunk size (limit), fetch them incrementally.
                    for _offset in range(limit, total, limit):
                        res = reports.report_single_attribute_elements(
                            connection=self._connection,
                            report_id=self._report_id,
                            attribute_id=attr['id'],
                            offset=_offset,
                            limit=limit)
                        elements.extend(res.json())

                    # Append attribute data to the list of attributes.
                    attr_elements.append({
                        "attribute_name": attr['name'],
                        "attribute_id": attr['id'],
                        "elements": elements
                    })

            self._attr_elements = attr_elements

    @staticmethod
    def __response_handler(response, msg):
        """Generic error message handler for transactions against datasets.

        Args:
            response: Response object returned by HTTP request.
            msg (str): Message to print in addition to any server-generated error message(s).

        """
        res = json.loads(response.content)
        print(msg)
        print("HTTP %i %s" % (response.status_code, response.reason))
        print("I-Server Error %s, %s" % (res['code'], res['message']))

    @property
    def name(self):
        return self._name

    @property
    def attributes(self):
        return self._attributes

    @property
    def metrics(self):
        return self._metrics

    @property
    def attr_elements(self):
        if not self._attr_elements:
            self.__attr_elements()
        return self._attr_elements

    @property
    def selected_attributes(self):
        return self._filter.attr_selected

    @property
    def selected_metrics(self):
        return self._filter.metr_selected

    @property
    def selected_attr_elements(self):
        return self._filter.attr_elem_selected

    @property
    def dataframe(self):
        if self._dataframe is None:
            warnings.warn(
                "Dataframe not loaded. Retrieve with Report.to_dataframe().",
                Warning,
                stacklevel=2)
        return self._dataframe
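
# --- Usage sketch (illustrative only, not part of the class) ---
# A minimal, hedged example of the intended workflow. The URL, credentials, and
# report id below are hypothetical placeholders, and the `microstrategy`
# connection setup is assumed to work as described in the class docstring.
if __name__ == '__main__':
    from mstrio import microstrategy  # assumed package layout

    conn = microstrategy.Connection(base_url='https://example.com/MicroStrategyLibrary/api',
                                    username='user', password='password', project_name='Demo')
    conn.connect()
    report = Report(connection=conn, report_id='1234567890ABCDEF1234567890ABCDEF')
    # Restrict the fetch to the first attribute and first metric by id.
    report.apply_filters(attributes=[a['id'] for a in report.attributes[:1]],
                         metrics=[m['id'] for m in report.metrics[:1]])
    df = report.to_dataframe(limit=25000)
    print(df.head())
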
Example #2
class Cube:
    """
    Access, filter, publish, and extract data from MicroStrategy in-memory cubes.

    Create a Cube object to load basic information on a cube dataset. Specify the subset of the
    cube to be fetched with Cube.apply_filters() and Cube.clear_filters(). Fetch the dataset
    with the Cube.to_dataframe() method.

    Attributes:
        connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
        cube_id: Identifier of a pre-existing cube containing the required data.
    """
    def __init__(self, connection, cube_id):
        """
        Initialize an instance of a cube.

        Args:
            connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
            cube_id (str): Identifier of a pre-existing cube containing the required data.
        """
        init_pbar = tqdm(
            desc='Initializing an instance of a cube. Please wait...',
            bar_format='{desc}',
            leave=False,
            ncols=280)

        self._connection = connection
        self._cube_id = cube_id

        # load dataset information
        self.__info()
        self.__definition()

        # load attribute elements
        self.__OFFSET = 0
        self.__attr_elements()

        self._filter = Filter(attributes=self._attributes,
                              metrics=self._metrics,
                              attr_elements=self._attr_elements)

        self._dataframe = None
        self._dataframes = []
        init_pbar.close()

    def to_dataframe(self, limit=25000, progress_bar=True, multi_df=False):
        """
        Extract contents of a cube into a Pandas Data Frame. Previously `microstrategy.Connection.get_cube()`.

        Args:
            limit (int, optional): Used to control data extract behavior on datasets with a large number of rows.
                The default is 25000. As an example, if the dataset has 50,000 rows and limit=25000, to_dataframe()
                will incrementally extract all 50,000 rows in 25,000-row chunks. Depending on system resources, a
                higher limit (e.g. 50,000) may reduce the total time required to extract the entire dataset.
            progress_bar (bool, optional): If True (default), show the download progress bar.
            multi_df (bool, optional): If True, return a list of data frames resembling the table structure of the
                cube. If False (default), return a single data frame.

        Returns:
            Pandas Data Frame containing the cube contents
        """
        inst_pbar = tqdm(
            desc='Connecting to MicroStrategy I-Server. Please wait...',
            bar_format='{desc}',
            leave=False,
            ncols=310)

        # Request a new instance, set instance id
        res = cubes.cube_instance(connection=self._connection,
                                  cube_id=self._cube_id,
                                  body=self._filter.filter_body(),
                                  offset=self.__OFFSET,
                                  limit=limit)
        inst_pbar.close()
        if not res.ok:
            msg = "Error getting cube contents."
            self.__response_handler(response=res, msg=msg)
        else:
            _instance = res.json()
            _instance_id = _instance['instanceId']

            # Gets the pagination totals from the response object
            _pagination = _instance['result']['data']['paging']

            # If there are more rows to fetch, fetch them
            if _pagination['current'] != _pagination['total']:

                # initialize a list to capture slices from each query, and append the first request's result to the list
                table_data = [parsejson(response=_instance)]

                # Count the number of iterations
                it_total = int(_pagination['total'] /
                               limit) + (_pagination['total'] % limit != 0)

                # Fetch add'l rows from this object instance from the intelligence server
                with tqdm(total=it_total,
                          disable=(not progress_bar)) as fetch_pbar:
                    if progress_bar:
                        fetch_pbar.update()
                        fetch_pbar.set_description("Downloading")
                        fetch_pbar.set_postfix(rows=limit)
                    for _offset in range(limit, _pagination['total'], limit):
                        if progress_bar:
                            fetch_pbar.update()
                            fetch_pbar.set_description("Downloading")
                            fetch_pbar.set_postfix(
                                rows=min(_offset +
                                         limit, _pagination['total']))
                        response = cubes.cube_instance_id(
                            connection=self._connection,
                            cube_id=self._cube_id,
                            instance_id=_instance_id,
                            offset=_offset,
                            limit=limit)
                        table_data.append(parsejson(response=response.json()))

                # concatenate and return the list of result data as a data frame
                self._dataframe = pd.concat(table_data).reset_index(drop=True)
            else:
                # otherwise parse the first result and return it as a dataframe
                self._dataframe = parsejson(response=_instance)

            # split dataframe to dataframes matching tables in Cube
            _tables = self.multitable_definition()
            if multi_df:
                self._dataframes = [
                    self._dataframe[columns].copy()
                    for _, columns in _tables.items()
                ]
                return self._dataframes
            else:
                return self._dataframe

    def multitable_definition(self):
        """
        Return all tables names and collumns as a dictionary
        """
        res_tables = datasets.dataset_definition(connection=self._connection,
                                                 dataset_id=self._cube_id,
                                                 fields=['tables', 'columns'])
        _ds_definition = res_tables.json()

        _columns_tables = {}
        for table in _ds_definition['result']['definition'][
                'availableObjects']['tables']:
            _column_list = []
            for column in _ds_definition['result']['definition'][
                    'availableObjects']['columns']:
                if table['name'] == column['tableName']:
                    _column_list.append(column['columnName'])
            _columns_tables[table['name']] = _column_list
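        # The result maps each table name to its column names, e.g. (hypothetical names):
        # {'SALES': ['region', 'revenue'], 'INVENTORY': ['region', 'units']}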
        return _columns_tables

    def apply_filters(self, attributes=None, metrics=None, attr_elements=None):
        """
        Apply filters on the cube data so only the chosen attributes, metrics, and attribute elements are retrieved
        from the Intelligence Server.

        Args:
            attributes (list, optional): ids of attributes to be included in the filter.
            metrics (list, optional): ids of metrics to be included in the filter.
            attr_elements (list, optional): attribute elements to be included in the filter.
        """

        if attributes:
            self._filter.select(object_id=attributes)

        if attributes == []:
            self._filter.attr_selected = []

        if metrics:
            self._filter.select(object_id=metrics)

        if metrics == []:
            self._filter.metr_selected = []

        if attr_elements is not None:
            self._filter.select(object_id=attr_elements)

    def clear_filters(self):
        """Clear previously set filters, allowing all attributes, metrics, and attribute elements to be retrieved."""

        self._filter.clear()

    def __info(self):
        """Get metadata for specific cubes. Implements GET /cubes to retrieve basic metadata."""

        res = cubes.cube_info(connection=self._connection,
                              cube_id=self._cube_id)

        if not res.ok:
            self.__response_handler(
                response=res,
                msg="Error getting cube metadata information. Check Cube ID.")
            return None
        else:
            _info = res.json()["cubesInfos"][0]
            self._name = _info["cubeName"]
            self._owner_id = _info["ownerId"]
            self._path = _info["path"]
            self._last_modified = _info["modificationTime"]
            self._server_mode = _info["serverMode"]
            self._size = _info["size"]
            self._status = _info["status"]

    def __definition(self):
        """Get the definition of a cube, including attributes and metrics. Implements GET /cubes/<cube_id>."""

        res = cubes.cube(connection=self._connection, cube_id=self._cube_id)

        if not res.ok:
            self.__response_handler(
                response=res,
                msg="Error getting cube definition. Check Cube ID.")
            return None
        else:
            _definition = res.json()
            full_attributes = _definition["result"]["definition"][
                "availableObjects"]["attributes"]
            full_metrics = _definition["result"]["definition"][
                "availableObjects"]["metrics"]
            self._attributes = [{
                'name': attr['name'],
                'id': attr['id']
            } for attr in full_attributes]
            self._metrics = [{
                'name': metr['name'],
                'id': metr['id']
            } for metr in full_metrics]

    def __attr_elements(self, limit=25000):
        """Get attribute elements. Implements GET /cubes/<cube_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:

            for attr in self._attributes:
                # Fetch first chunk of attribute elements.
                res = cubes.cube_single_attribute_elements(
                    connection=self._connection,
                    cube_id=self._cube_id,
                    attribute_id=attr['id'],
                    offset=self.__OFFSET,
                    limit=limit)
                if not res.ok:
                    msg = "Error retrieving attribute '" + attr[
                        'name'] + "' elements."
                    self.__response_handler(response=res, msg=msg)
                else:
                    # Get total number of rows from headers.
                    total = int(res.headers['x-mstr-total-count'])
                    # Get attribute elements from the response.
                    elements = res.json()

                    # If total number of elements is bigger than the chunk size (limit), fetch them incrementally.
                    for _offset in range(limit, total, limit):
                        res = cubes.cube_single_attribute_elements(
                            connection=self._connection,
                            cube_id=self._cube_id,
                            attribute_id=attr['id'],
                            offset=_offset,
                            limit=limit)
                        elements.extend(res.json())

                    # Append attribute data to the list of attributes.
                    attr_elements.append({
                        "attribute_name": attr['name'],
                        "attribute_id": attr['id'],
                        "elements": elements
                    })

            self._attr_elements = attr_elements

    @staticmethod
    def __response_handler(response, msg):
        """Generic error message handler for transactions against datasets.

        Args:
            response: Response object returned by HTTP request.
            msg (str): Message to print in addition to any server-generated error message(s).

        """
        res = json.loads(response.content)
        print(msg)
        print("HTTP %i %s" % (response.status_code, response.reason))
        print("I-Server Error %s, %s" % (res['code'], res['message']))

    @property
    def name(self):
        return self._name

    @property
    def size(self):
        return self._size

    @property
    def status(self):
        return self._status

    @property
    def path(self):
        return self._path

    @property
    def last_modified(self):
        return self._last_modified

    @property
    def owner_id(self):
        return self._owner_id

    @property
    def attributes(self):
        return self._attributes

    @property
    def metrics(self):
        return self._metrics

    @property
    def attr_elements(self):
        return self._attr_elements

    @property
    def selected_attributes(self):
        return self._filter.attr_selected

    @property
    def selected_metrics(self):
        return self._filter.metr_selected

    @property
    def selected_attr_elements(self):
        return self._filter.attr_elem_selected

    @property
    def dataframe(self):
        if self._dataframe is None:
            warnings.warn(
                "Dataframe looks empty. Make sure the data is valid (default join on i-Server is inner) and then retrieve with Cube.to_dataframe().",
                Warning,
                stacklevel=2)
        return self._dataframe

    @property
    def dataframes(self):
        if len(self._dataframes) == 0:
            warnings.warn(
                "Dataframes look empty. Make sure the data is valid (default join on i-Server is inner) and then retrieve with Cube.to_dataframe().",
                Warning,
                stacklevel=2)
        return self._dataframes
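
# --- Usage sketch (illustrative only, not part of the class) ---
# A hedged example of pulling a cube as per-table data frames. The connection
# details and the cube id are hypothetical placeholders.
if __name__ == '__main__':
    from mstrio import microstrategy  # assumed package layout

    conn = microstrategy.Connection(base_url='https://example.com/MicroStrategyLibrary/api',
                                    username='user', password='password', project_name='Demo')
    conn.connect()
    cube = Cube(connection=conn, cube_id='ABCDEF1234567890ABCDEF1234567890')
    # One data frame per table of the cube, in the order of multitable_definition().
    dfs = cube.to_dataframe(limit=25000, multi_df=True)
    for table_name, df in zip(cube.multitable_definition(), dfs):
        print(table_name, df.shape)
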
Example #3
class Cube:
    """
    Access, filter, publish, and extract data from MicroStrategy in-memory cubes.

    Create a Cube object to load basic information on a cube dataset. Specify the subset of the
    cube to be fetched with Cube.apply_filters() and Cube.clear_filters(). Fetch the dataset
    with the Cube.to_dataframe() method.

    Attributes:
        connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
        cube_id: Identifier of a pre-existing cube containing the required data.
        parallel (bool, optional): If True (default), utilize optimal number of threads to
            increase the download speed. If False, this feature will be disabled.
        progress_bar (bool, optional): If True (default), show the download progress bar.

    """

    def __init__(self, connection, cube_id, parallel=True, progress_bar=True):
        """Initialize an instance of a cube.

        Args:
            connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
            cube_id (str): Identifier of a pre-existing cube containing the required data.
            parallel (bool, optional): If True (default), utilize optimal number of threads to
                increase the download speed. If False, this feature will be disabled.
            progress_bar (bool, optional): If True (default), show the download progress bar.

        """

        self._connection = connection
        self._cube_id = cube_id
        self.parallel = parallel
        self.progress_bar = progress_bar
        self._size_limit = 10000000      # this sets desired chunk size in bytes
        self._initial_limit = 1000     # initial limit for the cube_instance request


        # load dataset information
        self.__info()
        self.__definition()

        # load attribute elements
        self.__OFFSET = 0
        self._attr_elements = []

        self._filter = Filter(attributes=self._attributes, metrics=self._metrics, attr_elements=self._attr_elements)

        self._dataframe = None
        self._dataframes = []
        self._table_definition = {}

    def to_dataframe(self, limit=None, multi_df=False):
        """Extract contents of a cube into a Pandas Data Frame. Previously `microstrategy.Connection.get_cube()`.

        Args:
            limit (None or int, optional): Used to control data extract behavior. By default (None)
                the limit is calculated automatically, based on an optimized physical size of one
                chunk. Setting limit manually will force the number of rows per chunk. Depending on
                system resources, a higher limit (e.g. 50,000) may reduce the total time required
                to extract the entire dataset.
            multi_df (bool, optional): If True, return a list of data frames resembling the table
                structure of the cube. If False (default), returns one data frame.

        Returns:
            Pandas Data Frame containing the cube contents
        """
        inst_pbar = tqdm(desc='Initializing an instance of a cube. Please wait...',
                         bar_format='{desc}', leave=False, ncols=280, disable=(not self.progress_bar))
        if limit:
            self._initial_limit = limit

        # Request a new instance, set instance id
        res = cubes.cube_instance(connection=self._connection,
                                  cube_id=self._cube_id,
                                  body=self._filter.filter_body(),
                                  offset=self.__OFFSET,
                                  limit=self._initial_limit)
        inst_pbar.close()

        # Gets the pagination totals and instance_id from the response object
        _instance = res.json()
        _instance_id = _instance['instanceId']
        _pagination = _instance['data']['paging']

        # initialize parser and process first response
        p = Parser(response=_instance, parse_cube=True)
        p.parse(response=_instance)

        # If there are more rows to fetch, fetch them
        if _pagination['current'] != _pagination['total']:
            if not limit:
                limit = max(1000, int((self._initial_limit * self._size_limit) / len(res.content)))
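                # Hypothetical sizes: if the first 1,000 rows weighed 2,000,000 bytes,
                # limit = max(1000, int(1000 * 10000000 / 2000000)) = 5000 rows per ~10 MB chunk.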
            # Count the number of additional iterations
            it_total = int((_pagination['total']-self._initial_limit)/limit) + ((_pagination['total']-self._initial_limit) % limit != 0)

            if self.parallel and it_total > 1:
                threads = helper.get_parallel_number(it_total)
                with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                    fetch_pbar = tqdm(desc="Downloading", total=it_total+1, disable=(not self.progress_bar))
                    future = self.__fetch_chunks_future(session, _pagination, _instance_id, limit)
                    fetch_pbar.update()
                    for i, f in enumerate(future, start=1):
                        response = f.result()
                        if not response.ok:
                            current_offset = self._initial_limit+(i-1)*limit
                            response = cubes.cube_instance_id(connection=self._connection, cube_id=self._cube_id,
                                                              instance_id=_instance_id, offset=current_offset,
                                                              limit=limit)
                        fetch_pbar.update()
                        fetch_pbar.set_postfix(rows=str(min(self._initial_limit+i*limit, _pagination['total'])))
                        p.parse(response.json())
                    fetch_pbar.close()
            else:
                self.__fetch_chunks(p, _pagination, it_total, _instance_id, limit)

        # return parsed data as a data frame
        self._dataframe = p.dataframe
        if multi_df:
            # save the multitable_definition response to the instance
            self.__multitable_definition()
            # split dataframe to dataframes matching tables in Cube
            self._dataframes = [self._dataframe[columns].copy() for _, columns in self._table_definition.items()]
            return self._dataframes
        else:
            return self._dataframe

    def __fetch_chunks_future(self, session, pagination, instance_id, limit):
        # Fetch add'l rows from this object instance from the intelligence server
        return [cubes.cube_instance_id_coroutine(session, connection=self._connection,
                                                 cube_id=self._cube_id,
                                                 instance_id=instance_id,
                                                 offset=_offset,
                                                 limit=limit)
                for _offset in range(self._initial_limit, pagination['total'], limit)]

    def __fetch_chunks(self, parser, pagination, it_total, instance_id, limit):

        # Fetch add'l rows from this object instance from the intelligence server
        with tqdm(desc="Downloading", total=it_total+1, disable=(not self.progress_bar)) as fetch_pbar:
            fetch_pbar.update()
            for _offset in range(self._initial_limit, pagination['total'], limit):
                response = cubes.cube_instance_id(connection=self._connection,
                                                  cube_id=self._cube_id,
                                                  instance_id=instance_id,
                                                  offset=_offset,
                                                  limit=limit)
                fetch_pbar.update()
                fetch_pbar.set_postfix(rows=str(min(_offset+limit, pagination['total'])))
                parser.parse(response=response.json())

    def apply_filters(self, attributes=None, metrics=None, attr_elements=None):
        """Apply filters on the cube data so only the chosen attributes, metrics, and attribute elements are retrieved
        from the Intelligence Server.

        Args:
            attributes (list or None, optional): ids of attributes to be included in the filter.
                If list is empty, no attributes will be selected and metric data will be aggregated.
            metrics (list or None, optional): ids of metrics to be included in the filter.
                If list is empty, no metrics will be selected.
            attr_elements (list or None, optional): attribute elements to be included in the filter.
        """
        if not all(element is None for element in [attributes, metrics, attr_elements]):
            if attr_elements:           # If attr_elements filter is applied, fetch all attribute elements first
                self.attr_elements      # lazy property: loads elements on first access

            if self._attr_elements and not self._filter.attr_elems:
                self._filter = Filter(attributes=self._attributes, metrics=self._metrics,
                                      attr_elements=self._attr_elements)

            if attributes:
                self._filter.select(object_id=attributes)

            if attributes == []:
                self._filter.attr_selected = []

            if metrics:
                self._filter.select(object_id=metrics)

            if metrics == []:
                self._filter.metr_selected = []

            if attr_elements is not None:
                self._filter.select(object_id=attr_elements)

    def clear_filters(self):
        """Clear previously set filters, allowing all attributes, metrics, and attribute elements to be retrieved."""

        self._filter.clear()

    def __multitable_definition(self):
        """Return all tables names and columns as a dictionary"""

        res_tables = datasets.dataset_definition(connection=self._connection,
                                                 dataset_id=self._cube_id,
                                                 fields=['tables', 'columns'])
        _ds_definition = res_tables.json()

        for table in _ds_definition['result']['definition']['availableObjects']['tables']:
            column_list = [column['columnName']
                           for column in _ds_definition['result']['definition']['availableObjects']['columns']
                           if table['name'] == column['tableName']]
            self._table_definition[table['name']] = column_list

    def __info(self):
        """Get metadata for specific cubes. Implements GET /cubes to retrieve basic metadata."""

        res = cubes.cube_info(connection=self._connection, cube_id=self._cube_id)

        _info = res.json()["cubesInfos"][0]
        self._name = _info["cubeName"]
        self._owner_id = _info["ownerId"]
        self._path = _info["path"]
        self._last_modified = _info["modificationTime"]
        self._server_mode = _info["serverMode"]
        self._size = _info["size"]
        self._status = _info["status"]

    def __definition(self):
        """Get the definition of a cube, including attributes and metrics. Implements GET /v2/cubes/<cube_id>."""

        res = cubes.cube_definition(connection=self._connection, cube_id=self._cube_id)

        _definition = res.json()
        full_attributes = _definition["definition"]["availableObjects"]["attributes"]
        full_metrics = _definition["definition"]["availableObjects"]["metrics"]
        self._attributes = [{'name': attr['name'], 'id': attr['id']} for attr in full_attributes]
        self._metrics = [{'name': metr['name'], 'id': metr['id']} for metr in full_metrics]

    def __get_attr_elements(self, limit=200000):
        """Get elements of report attributes synchronously. Implements GET /reports/<report_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:
            pbar = tqdm(self._attributes, desc="Loading attribute elements", leave=False, disable=(not self.progress_bar))
            # Iterate over attributes, fetching each one's elements in chunks.
            for attr in pbar:
                # Fetch first chunk of attribute elements.
                response = cubes.cube_single_attribute_elements(connection=self._connection,
                                                                cube_id=self._cube_id,
                                                                attribute_id=attr['id'],
                                                                offset=self.__OFFSET,
                                                                limit=limit)
                # Get total number of rows from headers.
                total = int(response.headers['x-mstr-total-count'])
                # Get attribute elements from the response.
                elements = response.json()

                # If total number of elements is bigger than the chunk size (limit), fetch them incrementally.
                for _offset in range(limit, total, limit):
                    response = cubes.cube_single_attribute_elements(connection=self._connection,
                                                                    cube_id=self._cube_id,
                                                                    attribute_id=attr['id'],
                                                                    offset=_offset,
                                                                    limit=limit)
                    elements.extend(response.json())

                # Append attribute data to the list of attributes.
                attr_elements.append({"attribute_name": attr['name'],
                                      "attribute_id": attr['id'],
                                      "elements": elements})
            pbar.close()

        return attr_elements

    def __get_attr_elements_async(self, limit=200000):
        """Get attribute elements. Implements GET /cubes/<cube_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:
            threads = helper.get_parallel_number(len(self._attributes))
            with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                # Fetch first chunk of attribute elements.
                futures = self.__fetch_attribute_elements_chunks(session, limit)
                pbar = tqdm(futures, desc="Loading attribute elements", leave=False, disable=(not self.progress_bar))
                for i, future in enumerate(pbar):
                    attr = self.attributes[i]
                    response = future.result()
                    if not response.ok:
                        response = cubes.cube_single_attribute_elements(connection=self._connection,
                                                                        cube_id=self._cube_id,
                                                                        attribute_id=attr["id"],
                                                                        offset=0,
                                                                        limit=limit)
                    elements = response.json()
                    # Get total number of rows from headers.
                    total = int(response.headers['x-mstr-total-count'])
                    for _offset in range(limit, total, limit):
                        response = cubes.cube_single_attribute_elements(connection=self._connection,
                                                                        cube_id=self._cube_id,
                                                                        attribute_id=attr["id"],
                                                                        offset=_offset,
                                                                        limit=limit)
                        elements.extend(response.json())
                    # Append attribute data to the list of attributes.
                    attr_elements.append({"attribute_name": attr['name'],
                                          "attribute_id": attr['id'],
                                          "elements": elements})
                pbar.close()

        return attr_elements

    def __fetch_attribute_elements_chunks(self, session, limit):
        # Fetch add'l rows from this object instance from the intelligence server
        return [cubes.cube_single_attribute_elements_coroutine(session,
                                                               connection=self._connection,
                                                               cube_id=self._cube_id,
                                                               attribute_id=attribute['id'],
                                                               offset=0,
                                                               limit=limit)
                for attribute in self._attributes]

    @property
    def name(self):
        return self._name

    @property
    def size(self):
        return self._size

    @property
    def status(self):
        return self._status

    @property
    def path(self):
        return self._path

    @property
    def last_modified(self):
        return self._last_modified

    @property
    def owner_id(self):
        return self._owner_id

    @property
    def attributes(self):
        return self._attributes

    @property
    def metrics(self):
        return self._metrics

    @property
    def attr_elements(self):
        if not self._attr_elements:
            if self.parallel is True:
                self._attr_elements = self.__get_attr_elements_async()
            else:
                self._attr_elements = self.__get_attr_elements()
        return self._attr_elements

    @property
    def selected_attributes(self):
        return self._filter.attr_selected

    @property
    def selected_metrics(self):
        return self._filter.metr_selected

    @property
    def selected_attr_elements(self):
        return self._filter.attr_elem_selected

    @property
    def dataframe(self):
        if self._dataframe is None:
            helper.exception_handler(msg="Dataframe not loaded. Retrieve with Report.to_dataframe().",
                                     exception_type=Warning, throw_error=False)
        return self._dataframe

    @property
    def dataframes(self):
        if len(self._dataframes) == 0:
            helper.exception_handler(msg="Dataframe not loaded. Retrieve with Report.to_dataframe().",
                                     exception_type=Warning, throw_error=False)
        return self._dataframes

    @property
    def table_definition(self):
        return self._table_definition
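
# --- Usage sketch (illustrative only, not part of the class) ---
# A hedged example of the parallel download path. Connection details and the
# cube id are hypothetical placeholders. Attribute elements load lazily on
# first access to the attr_elements property, via threads when parallel=True.
if __name__ == '__main__':
    from mstrio import microstrategy  # assumed package layout

    conn = microstrategy.Connection(base_url='https://example.com/MicroStrategyLibrary/api',
                                    username='user', password='password', project_name='Demo')
    conn.connect()
    cube = Cube(connection=conn, cube_id='ABCDEF1234567890ABCDEF1234567890',
                parallel=True, progress_bar=False)
    elements = cube.attr_elements   # triggers __get_attr_elements_async()
    df = cube.to_dataframe()        # limit=None: chunk size auto-tuned toward ~10 MB
    print(df.shape)
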
Example #4
class Report:
    """
    Access, filter, publish, and extract data from in-memory reports.

    Create a Report object to load basic information on a report dataset. Specify the subset of the
    report to be fetched with Report.apply_filters() and Report.clear_filters(). Fetch the dataset
    with the Report.to_dataframe() method.

    Attributes:
        connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
        report_id: Identifier of a pre-existing report containing the required data.
        parallel (bool, optional): If True (default), utilize optimal number of threads to
            increase the download speed. If False, this feature will be disabled.
        progress_bar (bool, optional): If True (default), show the download progress bar.

    """

    def __init__(self, connection, report_id, parallel=True, progress_bar=True):
        """Initialize an instance of a report

        Args:
            connection: MicroStrategy connection object returned by `microstrategy.Connection()`.
            report_id (str): Identifier of a pre-existing report containing the required data.
            parallel (bool, optional): If True (default), utilize optimal number of threads to
                increase the download speed. If False, this feature will be disabled.
            progress_bar (bool, optional): If True (default), show the download progress bar.

        """

        self._connection = connection
        self._report_id = report_id
        self._subtotals = None
        self.cross_tab = False
        self.cross_tab_filter = {}

        self.parallel = parallel
        self.progress_bar = progress_bar
        self._size_limit = 10000000      # this sets desired chunk size in bytes
        self._initial_limit = 1000     # initial limit for the report_instance request

        # load report information
        self.__definition()

        # load attribute elements
        self.__OFFSET = 0
        self._attr_elements = []

        self._filter = Filter(attributes=self._attributes, metrics=self._metrics, attr_elements=self._attr_elements)

        if self.cross_tab:
            self._filter.select(object_id=[el['id'] for el in self._attributes])
            self._filter.select(object_id=[el['id'] for el in self._metrics])
        self._dataframe = None

    def to_dataframe(self, limit=None):
        """Extract contents of a report instance into a Pandas Data Frame. Formerly `microstrategy.Connection.get_report()`.

        Args:
            limit (None or int, optional): Used to control data extract behavior. By default (None)
                the limit is calculated automatically, based on an optimized physical size of one
                chunk. Setting limit manually will force the number of rows per chunk. Depending on
                system resources, a higher limit (e.g. 50,000) may reduce the total time required
                to extract the entire dataset.

        Returns:
            Pandas Data Frame containing the report contents
        """
        inst_pbar = tqdm(desc='Initializing an instance of a report. Please wait...',
                         bar_format='{desc}', leave=False, ncols=285, disable=(not self.progress_bar))

        # Switch off subtotals if the I-Server version is 11.2.0100 or newer
        body = self._filter.filter_body()
        if version.parse(self._connection.iserver_version) >= version.parse("11.2.0100"):
            self._subtotals["visible"] = False
            body["subtotals"] = {"visible": self._subtotals["visible"]}

        if limit:
            self._initial_limit = limit
        # Request a new instance, set instance id
        res = reports.report_instance(connection=self._connection,
                                      report_id=self._report_id,
                                      body=body,
                                      offset=self.__OFFSET,
                                      limit=self._initial_limit)
        inst_pbar.close()

        # Gets the pagination totals from the response object
        _instance = res.json()
        _instance_id = _instance['instanceId']
        _pagination = _instance['data']['paging']

        # initialize parser and process first response
        p = Parser(response=_instance, parse_cube=False)
        p.parse(response=_instance)

        # If there are more rows to fetch, fetch them
        if _pagination['current'] != _pagination['total']:
            if not limit:
                limit = max(1000, int((self._initial_limit * self._size_limit) / len(res.content)))
            it_total = int((_pagination['total']-self._initial_limit)/limit) + ((_pagination['total']-self._initial_limit) % limit != 0)

            if self.parallel and it_total > 1:
                threads = helper.get_parallel_number(it_total)
                with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                    fetch_pbar = tqdm(desc="Downloading", total=it_total+1, disable=(not self.progress_bar))
                    future = self.__fetch_chunks_future(session, _pagination, _instance_id, limit)
                    fetch_pbar.update()
                    for i, f in enumerate(future, start=1):
                        response = f.result()
                        if not response.ok:
                            current_offset = self._initial_limit+(i-1)*limit
                            response = reports.report_instance_id(connection=self._connection,
                                                                  report_id=self._report_id, instance_id=_instance_id,
                                                                  offset=current_offset, limit=limit)
                        fetch_pbar.update()
                        fetch_pbar.set_postfix(rows=str(min(self._initial_limit+i*limit, _pagination['total'])))
                        p.parse(response.json())
                    fetch_pbar.close()
            else:
                self.__fetch_chunks(p, _pagination, it_total, _instance_id, limit)

        # return parsed data as a data frame
        self._dataframe = p.dataframe

        # filter received dataframe if report had crosstabs and filters were applied
        if self.cross_tab_filter != {}:
            if self.cross_tab_filter['metrics'] is not None:
                # drop columns of metrics that were not selected in the filter
                metr_names = [el['name'] for el in list(filter(lambda x: x['id'] not in self.cross_tab_filter['metrics'],
                              self._metrics))]
                self._dataframe = self._dataframe.drop(metr_names, axis=1)

            if self.cross_tab_filter['attr_elements'] is not None:
                # create dict of attributes and elements to iterate through
                attr_dict = {}
                for attribute in self.cross_tab_filter['attr_elements']:
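                    # Element ids are assumed to have the form '<attribute_id>:<element_id>',
                    # with a 32-character attribute id before the separator.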
                    key = attribute[:32]
                    attr_dict.setdefault(key, []).append(attribute[33:])
                # initialize indexes series for filter
                indexes = pd.Series([False] * len(self._dataframe))

                # logical OR for filtered attribute elements
                for attribute in attr_dict:
                    attr_name = list(filter(lambda x: x['id'] in attribute, self._attributes))[0]['name']
                    elements = attr_dict[attribute]
                    indexes = indexes | self._dataframe[attr_name].isin(elements)
                # select dataframe rows matching any of the filtered attribute elements
                self._dataframe = self._dataframe[indexes]

            if self.cross_tab_filter['attributes'] is not None:
                attr_names = [el['name'] for el in list(filter(lambda x: x['id'] not in self.cross_tab_filter['attributes'],
                              self._attributes))]
                self._dataframe = self._dataframe.drop(attr_names, axis=1)

        return self._dataframe

    def __fetch_chunks_future(self, session, pagination, instance_id, limit):
        # Fetch add'l rows from this object instance from the intelligence server
        return [reports.report_instance_id_coroutine(session, connection=self._connection,
                                                     report_id=self._report_id,
                                                     instance_id=instance_id,
                                                     offset=_offset,
                                                     limit=limit)
                for _offset in range(self._initial_limit, pagination['total'], limit)]

    def __fetch_chunks(self, parser, pagination, it_total, instance_id, limit):

        # Fetch add'l rows from this object instance from the intelligence server
        with tqdm(desc="Downloading", total=it_total+1, disable=(not self.progress_bar)) as fetch_pbar:
            fetch_pbar.update()
            for _offset in range(self._initial_limit, pagination['total'], limit):
                response = reports.report_instance_id(connection=self._connection,
                                                      report_id=self._report_id,
                                                      instance_id=instance_id,
                                                      offset=_offset,
                                                      limit=limit)
                fetch_pbar.update()
                fetch_pbar.set_postfix(rows=str(min(_offset+limit, pagination['total'])))
                parser.parse(response=response.json())

    def apply_filters(self, attributes=None, metrics=None, attr_elements=None):
        """Apply filters on the report's objects, so only selected attributes, metrics and attributes' elements
        will be retrieved from Intelligence Server.

        Args:
            attributes (list or None, optional): ids of attributes to be included in the filter.
                If list is empty, no attributes will be selected and metric data will be aggregated.
            metrics (list or None, optional): ids of metrics to be included in the filter.
                If list is empty, no metrics will be selected.
            attr_elements (list or None, optional): attributes' elements to be included in the filter.
        """
        if self.cross_tab:
            self.cross_tab_filter = {'attributes': attributes, 'metrics': metrics, 'attr_elements': attr_elements}
        elif not all(element is None for element in [attributes, metrics, attr_elements]):
            if attr_elements:           # If parameter attr_elements is applied then fetch all attribute elements
                self.attr_elements

            if self._attr_elements and not self._filter.attr_elems:
                self._filter = Filter(attributes=self._attributes, metrics=self._metrics,
                                      attr_elements=self._attr_elements)

            if attributes:
                self._filter.select(object_id=attributes)

            if attributes == []:
                self._filter.attr_selected = []

            if metrics:
                self._filter.select(object_id=metrics)

            if metrics == []:
                self._filter.metr_selected = []

            if attr_elements is not None:
                self._filter.select(object_id=attr_elements)

    def clear_filters(self):
        """Clear previously set filters, allowing all attributes, metrics, and attribute elements to be retrieved."""

        self._filter.clear()
        if self.cross_tab:
            self._filter.select(object_id=[el['id'] for el in self._attributes])
            self._filter.select(object_id=[el['id'] for el in self._metrics])

    def __definition(self):
        """Get the definition of a report, including attributes and metrics. Implements GET /v2/reports/<report_id>"""

        res = reports.report_definition(connection=self._connection, report_id=self._report_id)

        definition = res.json()
        grid = definition["definition"]["grid"]
        if version.parse(self._connection.iserver_version) >= version.parse("11.2.0100"):
            self._subtotals = grid["subtotals"]

        self._name = definition["name"]
        self.cross_tab = grid["crossTab"]
        available_objects = definition['definition']['availableObjects']

        # Check if the report has custom groups or consolidations
        if available_objects['customGroups']:
            helper.exception_handler(msg="Reports with custom groups are not supported.", exception_type=ImportError)
        if available_objects['consolidations']:
            helper.exception_handler(msg="Reports with consolidations are not supported.", exception_type=ImportError)

        metrics_position = grid["metricsPosition"]
        full_metrics = grid[metrics_position["axis"]][metrics_position["index"]]["elements"]

        full_attributes = []
        for elem in grid["rows"]:
            if elem["type"] == "attribute":
                full_attributes.append(elem)
        for elem in grid["columns"]:
            if elem["type"] == "attribute":
                full_attributes.append(elem)

        self._attributes = [{'name': attr['name'], 'id': attr['id']} for attr in full_attributes]
        self._metrics = [{'name': metr['name'], 'id': metr['id']} for metr in full_metrics]

    def __get_attr_elements(self, limit=200000):
        """Get elements of report attributes synchronously. Implements GET /reports/<report_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:
            pbar = tqdm(self._attributes, desc="Loading attribute elements", leave=False, disable=(not self.progress_bar))
            # Iterate over attributes, fetching each one's elements in chunks.
            for attr in pbar:
                # Fetch first chunk of attribute elements.
                response = reports.report_single_attribute_elements(connection=self._connection,
                                                                    report_id=self._report_id,
                                                                    attribute_id=attr['id'],
                                                                    offset=self.__OFFSET,
                                                                    limit=limit)
                # Get total number of rows from headers.
                total = int(response.headers['x-mstr-total-count'])
                # Get attribute elements from the response.
                elements = response.json()

                # If total number of elements is bigger than the chunk size (limit), fetch them incrementally.
                for _offset in range(limit, total, limit):
                    response = reports.report_single_attribute_elements(connection=self._connection,
                                                                        report_id=self._report_id,
                                                                        attribute_id=attr['id'],
                                                                        offset=_offset,
                                                                        limit=limit)
                    elements.extend(response.json())

                # Append attribute data to the list of attributes.
                attr_elements.append({"attribute_name": attr['name'],
                                      "attribute_id": attr['id'],
                                      "elements": elements})
            pbar.close()

        return attr_elements

    def __get_attr_elements_async(self, limit=200000):
        """Get elements of report attributes asynchronously. Implements GET /reports/<report_id>/attributes/<attribute_id>/elements"""

        attr_elements = []
        if self._attributes is not None:
            threads = helper.get_parallel_number(len(self._attributes))
            with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                # Fetch first chunk of attribute elements.
                futures = self.__fetch_attribute_elements_chunks(session, limit)
                pbar = tqdm(futures, desc="Loading attribute elements", leave=False, disable=(not self.progress_bar))
                for i, future in enumerate(pbar):
                    attr = self.attributes[i]
                    response = future.result()
                    if not response.ok:
                        response = reports.report_single_attribute_elements(connection=self._connection,
                                                                            report_id=self._report_id,
                                                                            attribute_id=attr["id"],
                                                                            offset=0,
                                                                            limit=limit)
                    elements = response.json()
                    # Get total number of rows from headers.
                    total = int(response.headers['x-mstr-total-count'])
                    for _offset in range(limit, total, limit):
                        response = reports.report_single_attribute_elements(connection=self._connection,
                                                                            report_id=self._report_id,
                                                                            attribute_id=attr["id"],
                                                                            offset=_offset,
                                                                            limit=limit)
                        elements.extend(response.json())
                    # Append attribute data to the list of attributes.
                    attr_elements.append({"attribute_name": attr['name'],
                                          "attribute_id": attr['id'],
                                          "elements": elements})
                pbar.close()

        return attr_elements

    def __fetch_attribute_elements_chunks(self, session, limit):
        # Fetch add'l rows from this object instance from the intelligence server
        return [reports.report_single_attribute_elements_coroutine(session,
                                                                   connection=self._connection,
                                                                   report_id=self._report_id,
                                                                   attribute_id=attribute['id'],
                                                                   offset=0,
                                                                   limit=limit)
                for attribute in self._attributes]

    @property
    def name(self):
        return self._name

    @property
    def attributes(self):
        return self._attributes

    @property
    def metrics(self):
        return self._metrics

    @property
    def attr_elements(self):
        if not self._attr_elements:
            if self.parallel is True:
                self._attr_elements = self.__get_attr_elements_async()
            else:
                self._attr_elements = self.__get_attr_elements()
        return self._attr_elements

    @property
    def selected_attributes(self):
        return self._filter.attr_selected

    @property
    def selected_metrics(self):
        return self._filter.metr_selected

    @property
    def selected_attr_elements(self):
        return self._filter.attr_elem_selected

    @property
    def dataframe(self):
        if self._dataframe is None:
            helper.exception_handler(msg="Dataframe not loaded. Retrieve with Report.to_dataframe().",
                                     exception_type=Warning, throw_error=False)
        return self._dataframe
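
# --- Usage sketch (illustrative only, not part of the class) ---
# A hedged example; connection details and the report id are hypothetical
# placeholders. For cross-tabbed reports, apply_filters() only records the
# selection and to_dataframe() filters the downloaded data frame client-side;
# for ordinary reports the selection is sent to the Intelligence Server.
if __name__ == '__main__':
    from mstrio import microstrategy  # assumed package layout

    conn = microstrategy.Connection(base_url='https://example.com/MicroStrategyLibrary/api',
                                    username='user', password='password', project_name='Demo')
    conn.connect()
    report = Report(connection=conn, report_id='1234567890ABCDEF1234567890ABCDEF',
                    parallel=True, progress_bar=True)
    # Keep only the first metric; attributes and attribute elements stay unfiltered.
    report.apply_filters(metrics=[m['id'] for m in report.metrics[:1]])
    df = report.to_dataframe()
    print(df.head())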