def to_dataframe(self, limit=None, multi_df=False):
    """Extract contents of a cube into a Pandas `DataFrame`.

    Args:
        limit (None or int, optional): Used to control data extract behavior.
            By default (None) the limit is calculated automatically, based on
            an optimized physical size of one chunk. Setting limit manually
            will force the number of rows per chunk. Depending on system
            resources, a higher limit (e.g. 50,000) may reduce the total time
            required to extract the entire dataset.
        multi_df (bool, optional): If True, return a list of data frames
            resembling the table structure of the cube. If False (default),
            returns one data frame.

    Returns:
        Pandas Data Frame containing the cube contents.
    """
    if limit:
        self._initial_limit = limit

    if self.instance_id is None:
        res = self.__initialize_cube(self._initial_limit)
    else:
        # try to get first chunk from already initialized instance of cube;
        # if not possible, initialize new instance
        try:
            res = self.__get_chunk(instance_id=self.instance_id, offset=0,
                                   limit=self._initial_limit)
        except requests.HTTPError:
            res = self.__initialize_cube(self._initial_limit)

    # Get the pagination totals and instance_id from the response object
    _instance = res.json()
    _instance_id = _instance['instanceId']
    _pagination = _instance['data']['paging']

    # initialize parser and process first response
    p = Parser(response=_instance, parse_cube=True)
    p.parse(response=_instance)

    # If there are more rows to fetch, fetch them
    if _pagination['current'] != _pagination['total']:
        if not limit:
            limit = max(
                1000,
                int((self._initial_limit * self._size_limit) / len(res.content)))
        # Count the number of additional iterations
        it_total = int((_pagination['total'] - self._initial_limit) / limit) + \
            ((_pagination['total'] - self._initial_limit) % limit != 0)

        if self.parallel and it_total > 1:
            threads = helper.get_parallel_number(it_total)
            with FuturesSession(
                    executor=ThreadPoolExecutor(max_workers=threads),
                    session=self._connection.session) as session:
                fetch_pbar = tqdm(desc="Downloading", total=it_total + 1,
                                  disable=(not self.progress_bar))
                future = self.__fetch_chunks_future(
                    session, _pagination, _instance_id, limit)
                fetch_pbar.update()
                for i, f in enumerate(future, start=1):
                    response = f.result()
                    if not response.ok:
                        helper.response_handler(
                            response, "Error getting cube contents.")
                    fetch_pbar.update()
                    fetch_pbar.set_postfix(rows=str(
                        min(self._initial_limit + i * limit, _pagination['total'])))
                    p.parse(response.json())
                fetch_pbar.close()
        else:
            self.__fetch_chunks(p, _pagination, it_total, _instance_id, limit)

    # return parsed data as a data frame
    self._dataframe = p.dataframe

    # split dataframe into dataframes matching the tables in the cube
    if multi_df:
        self._dataframes = [
            self._dataframe[columns].copy()
            for _, columns in self.__multitable_definition().items()
        ]
        return self._dataframes
    else:
        return self._dataframe
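# --- Usage sketch (illustrative, not part of the library source) -------------
# A minimal example of pulling a cube into pandas with the method above. The
# import paths and the `conn`/`cube` names are assumptions about the
# surrounding mstrio-py package layout, not guaranteed by this file.
from mstrio.connection import Connection  # assumed import path
from mstrio.cube import Cube              # assumed import path

conn = Connection(base_url="https://env.example.com/MicroStrategyLibrary",
                  username="user", password="***", project_name="Project")
cube = Cube(connection=conn, cube_id="1234567890ABCDEF1234567890ABCDEF")

df = cube.to_dataframe(limit=50000)     # one DataFrame, larger chunks per request
dfs = cube.to_dataframe(multi_df=True)  # list of DataFrames, one per cube table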
def to_dataframe(self, limit: int = None) -> pd.DataFrame:
    """Extract contents of a report instance into a Pandas `DataFrame`.

    Args:
        limit (None or int, optional): Used to control data extract behavior.
            By default (None) the limit is calculated automatically, based on
            an optimized physical size of one chunk. Setting limit manually
            will force the number of rows per chunk. Depending on system
            resources, a higher limit (e.g. 50,000) may reduce the total time
            required to extract the entire dataset.

    Returns:
        Pandas Data Frame containing the report contents.
    """
    if limit:
        self._initial_limit = limit
        self.instance_id = None

    if self.instance_id is None:
        res = self.__initialize_report(self._initial_limit)
    else:
        # try to get first chunk from already initialized instance of report;
        # if not possible, initialize new instance
        try:
            res = self.__get_chunk(instance_id=self.instance_id, offset=0,
                                   limit=self._initial_limit)
        except requests.HTTPError:
            res = self.__initialize_report(self._initial_limit)

    # Get the pagination totals from the response object
    _instance = res.json()
    self.instance_id = _instance['instanceId']
    paging = _instance['data']['paging']

    # initialize parser and process first response
    p = Parser(response=_instance, parse_cube=False)
    p.parse(response=_instance)

    # If there are more rows to fetch, fetch them
    if paging['current'] != paging['total']:
        if not limit:
            limit = max(
                1000,
                int((self._initial_limit * self._size_limit) / len(res.content)))
        # Count the number of additional iterations
        it_total = int((paging['total'] - self._initial_limit) / limit) + \
            ((paging['total'] - self._initial_limit) % limit != 0)

        if self.parallel and it_total > 1:
            threads = helper.get_parallel_number(it_total)
            with FuturesSession(
                    executor=ThreadPoolExecutor(max_workers=threads),
                    session=self._connection.session) as session:
                fetch_pbar = tqdm(desc="Downloading", total=it_total + 1,
                                  disable=(not self.progress_bar))
                future = self.__fetch_chunks_future(
                    session, paging, self.instance_id, limit)
                fetch_pbar.update()
                for i, f in enumerate(future, start=1):
                    response = f.result()
                    if not response.ok:
                        helper.response_handler(
                            response, "Error getting report contents.")
                    fetch_pbar.update()
                    fetch_pbar.set_postfix(rows=str(
                        min(self._initial_limit + i * limit, paging['total'])))
                    p.parse(response.json())
                fetch_pbar.close()
        else:
            self.__fetch_chunks(p, paging, it_total, self.instance_id, limit)

    # return parsed data as a data frame
    self._dataframe = p.dataframe

    # filter dataframe if the report had crosstabs and filters were applied
    if self.cross_tab_filter != {}:
        if self.cross_tab_filter['metrics'] is not None:
            # drop metrics columns from dataframe
            metr_names = [
                el['name'] for el in list(
                    filter(
                        lambda x: x['id'] not in self.cross_tab_filter['metrics'],
                        self.metrics))
            ]
            self._dataframe = self._dataframe.drop(metr_names, axis=1)

        if self.cross_tab_filter['attr_elements'] is not None:
            # create dict of attributes and elements to iterate through
            attr_dict = {}
            for attribute in self.cross_tab_filter['attr_elements']:
                key = attribute[:32]
                attr_dict.setdefault(key, []).append(attribute[33:])
            # initialize indexes series for filter
            indexes = pd.Series([False] * len(self._dataframe))
            # logical OR for filtered attribute elements
            for attribute in attr_dict:
                attr_name = list(
                    filter(lambda x: x['id'] in attribute,
                           self.attributes))[0]['name']
                elements = attr_dict[attribute]
                indexes = indexes | self._dataframe[attr_name].isin(elements)
            # select dataframe rows matching the filtered attribute elements
            self._dataframe = self._dataframe[indexes]

        if self.cross_tab_filter['attributes'] is not None:
            attr_names = [
                el['name'] for el in list(
                    filter(
                        lambda x: x['id'] not in self.cross_tab_filter['attributes'],
                        self.attributes))
            ]
            # replace filtered attributes with their attribute form columns
            to_be_removed = []
            to_be_added = []
            for attr in attr_names:
                forms = [
                    column for column in self._dataframe.columns
                    if column.startswith(attr + '@')
                ]
                if forms:
                    to_be_removed.append(attr)
                    to_be_added.extend(forms)
            for elem in to_be_removed:
                attr_names.remove(elem)
            attr_names.extend(to_be_added)
            # drop filtered out columns
            self._dataframe = self._dataframe.drop(attr_names, axis=1)

    return self._dataframe
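# --- Usage sketch (illustrative, not part of the library source) -------------
# Shows the report variant above together with the crosstab filtering it
# performs after parsing. The import paths, the `apply_filters` call and the
# object names are assumptions about the surrounding mstrio-py package layout.
from mstrio.connection import Connection  # assumed import path
from mstrio.report import Report          # assumed import path

conn = Connection(base_url="https://env.example.com/MicroStrategyLibrary",
                  username="user", password="***", project_name="Project")
report = Report(connection=conn, report_id="1234567890ABCDEF1234567890ABCDEF")

# Optionally restrict to selected attributes/metrics before extraction; for a
# crosstab report this populates `cross_tab_filter`, which the method above
# then applies to the parsed DataFrame.
report.apply_filters(attributes=["ATTR_ID"], metrics=["METRIC_ID"])

df = report.to_dataframe(limit=50000)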
def to_dataframe(self, limit=None, multi_df=False):
    """Extract contents of a cube into a Pandas Data Frame. Previously
    `microstrategy.Connection.get_cube()`.

    Args:
        limit (None or int, optional): Used to control data extract behavior.
            By default (None) the limit is calculated automatically, based on
            an optimized physical size of one chunk. Setting limit manually
            will force the number of rows per chunk. Depending on system
            resources, a higher limit (e.g. 50,000) may reduce the total time
            required to extract the entire dataset.
        multi_df (bool, optional): If True, return a list of data frames
            resembling the table structure of the cube. If False (default),
            returns one data frame.

    Returns:
        Pandas Data Frame containing the cube contents.
    """
    inst_pbar = tqdm(desc='Initializing an instance of a cube. Please wait...',
                     bar_format='{desc}', leave=False, ncols=280,
                     disable=(not self.progress_bar))

    if limit:
        self._initial_limit = limit

    # Request a new instance, set instance id
    res = cubes.cube_instance(connection=self._connection,
                              cube_id=self._cube_id,
                              body=self._filter.filter_body(),
                              offset=self.__OFFSET,
                              limit=self._initial_limit)
    inst_pbar.close()

    # Get the pagination totals and instance_id from the response object
    _instance = res.json()
    _instance_id = _instance['instanceId']
    _pagination = _instance['data']['paging']

    # initialize parser and process first response
    p = Parser(response=_instance, parse_cube=True)
    p.parse(response=_instance)

    # If there are more rows to fetch, fetch them
    if _pagination['current'] != _pagination['total']:
        if not limit:
            limit = max(1000, int((self._initial_limit * self._size_limit) / len(res.content)))
        # Count the number of additional iterations
        it_total = int((_pagination['total'] - self._initial_limit) / limit) + \
            ((_pagination['total'] - self._initial_limit) % limit != 0)

        if self.parallel and it_total > 1:
            threads = helper.get_parallel_number(it_total)
            with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                fetch_pbar = tqdm(desc="Downloading", total=it_total + 1,
                                  disable=(not self.progress_bar))
                future = self.__fetch_chunks_future(session, _pagination, _instance_id, limit)
                fetch_pbar.update()
                for i, f in enumerate(future, start=1):
                    response = f.result()
                    if not response.ok:
                        # if the asynchronous chunk failed, retry it synchronously
                        current_offset = self._initial_limit + (i - 1) * limit
                        response = cubes.cube_instance_id(connection=self._connection,
                                                          cube_id=self._cube_id,
                                                          instance_id=_instance_id,
                                                          offset=current_offset,
                                                          limit=limit)
                    fetch_pbar.update()
                    fetch_pbar.set_postfix(rows=str(min(self._initial_limit + i * limit,
                                                        _pagination['total'])))
                    p.parse(response.json())
                fetch_pbar.close()
        else:
            self.__fetch_chunks(p, _pagination, it_total, _instance_id, limit)

    # return parsed data as a data frame
    self._dataframe = p.dataframe

    # split dataframe into dataframes matching the tables in the cube
    if multi_df:
        # save the multitable_definition response to the instance
        self.__multitable_definition()
        self._dataframes = [self._dataframe[columns].copy()
                            for _, columns in self._table_definition.items()]
        return self._dataframes
    else:
        return self._dataframe
def to_dataframe(self, limit=None):
    """Extract contents of a report instance into a Pandas Data Frame.
    Formerly `microstrategy.Connection.get_report()`.

    Args:
        limit (None or int, optional): Used to control data extract behavior.
            By default (None) the limit is calculated automatically, based on
            an optimized physical size of one chunk. Setting limit manually
            will force the number of rows per chunk. Depending on system
            resources, a higher limit (e.g. 50,000) may reduce the total time
            required to extract the entire dataset.

    Returns:
        Pandas Data Frame containing the report contents.
    """
    inst_pbar = tqdm(desc='Initializing an instance of a report. Please wait...',
                     bar_format='{desc}', leave=False, ncols=285,
                     disable=(not self.progress_bar))

    # Switch off subtotals if the I-Server version is 11.2.1 or higher
    body = self._filter.filter_body()
    if version.parse(self._connection.iserver_version) >= version.parse("11.2.0100"):
        self._subtotals["visible"] = False
        body["subtotals"] = {"visible": self._subtotals["visible"]}

    if limit:
        self._initial_limit = limit

    # Request a new instance, set instance id
    res = reports.report_instance(connection=self._connection,
                                  report_id=self._report_id,
                                  body=body,
                                  offset=self.__OFFSET,
                                  limit=self._initial_limit)
    inst_pbar.close()

    # Get the pagination totals from the response object
    _instance = res.json()
    _instance_id = _instance['instanceId']
    _pagination = _instance['data']['paging']

    # initialize parser and process first response
    p = Parser(response=_instance, parse_cube=False)
    p.parse(response=_instance)

    # If there are more rows to fetch, fetch them
    if _pagination['current'] != _pagination['total']:
        if not limit:
            limit = max(1000, int((self._initial_limit * self._size_limit) / len(res.content)))
        # Count the number of additional iterations
        it_total = int((_pagination['total'] - self._initial_limit) / limit) + \
            ((_pagination['total'] - self._initial_limit) % limit != 0)

        if self.parallel and it_total > 1:
            threads = helper.get_parallel_number(it_total)
            with FuturesSession(executor=ThreadPoolExecutor(max_workers=threads)) as session:
                fetch_pbar = tqdm(desc="Downloading", total=it_total + 1,
                                  disable=(not self.progress_bar))
                future = self.__fetch_chunks_future(session, _pagination, _instance_id, limit)
                fetch_pbar.update()
                for i, f in enumerate(future, start=1):
                    response = f.result()
                    if not response.ok:
                        # if the asynchronous chunk failed, retry it synchronously
                        current_offset = self._initial_limit + (i - 1) * limit
                        response = reports.report_instance_id(connection=self._connection,
                                                              report_id=self._report_id,
                                                              instance_id=_instance_id,
                                                              offset=current_offset,
                                                              limit=limit)
                    fetch_pbar.update()
                    fetch_pbar.set_postfix(rows=str(min(self._initial_limit + i * limit,
                                                        _pagination['total'])))
                    p.parse(response.json())
                fetch_pbar.close()
        else:
            self.__fetch_chunks(p, _pagination, it_total, _instance_id, limit)

    # return parsed data as a data frame
    self._dataframe = p.dataframe

    # filter received dataframe if the report had crosstabs and filters were applied
    if self.cross_tab_filter != {}:
        if self.cross_tab_filter['metrics'] is not None:
            # drop metrics columns from dataframe
            metr_names = [el['name'] for el in
                          list(filter(lambda x: x['id'] not in self.cross_tab_filter['metrics'],
                                      self._metrics))]
            self._dataframe = self._dataframe.drop(metr_names, axis=1)

        if self.cross_tab_filter['attr_elements'] is not None:
            # create dict of attributes and elements to iterate through
            attr_dict = {}
            for attribute in self.cross_tab_filter['attr_elements']:
                key = attribute[:32]
                attr_dict.setdefault(key, []).append(attribute[33:])
            # initialize indexes series for filter
            indexes = pd.Series([False] * len(self._dataframe))
            # logical OR for filtered attribute elements
            for attribute in attr_dict:
                attr_name = list(filter(lambda x: x['id'] in attribute,
                                        self._attributes))[0]['name']
                elements = attr_dict[attribute]
                indexes = indexes | self._dataframe[attr_name].isin(elements)
            # select dataframe rows matching the filtered attribute elements
            self._dataframe = self._dataframe[indexes]

        if self.cross_tab_filter['attributes'] is not None:
            # drop columns of attributes filtered out of the report
            attr_names = [el['name'] for el in
                          list(filter(lambda x: x['id'] not in self.cross_tab_filter['attributes'],
                                      self._attributes))]
            self._dataframe = self._dataframe.drop(attr_names, axis=1)

    return self._dataframe
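# --- Version-check sketch (illustrative) --------------------------------------
# The subtotals switch in the method above relies on packaging's version
# comparison; this standalone snippet only demonstrates that comparison. The
# sample version string is an assumption, not a value taken from this file.
from packaging import version

iserver_version = "11.2.0200"  # assumed value as reported by the I-Server
if version.parse(iserver_version) >= version.parse("11.2.0100"):
    # on 11.2.1 or newer the request body carries {"subtotals": {"visible": False}}
    print("Subtotals will be switched off for this report instance.")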