def get_slice(
    self,
    data_source: GithubDataSource,
    permissions: Optional[dict] = None,
    offset: int = 0,
    limit: Optional[int] = None,
    get_row_count: Optional[bool] = False,
) -> DataSlice:
    """Return part of the data as a pandas DataFrame together with its total size.

    - offset is the index of the starting row
    - limit is the number of pages to retrieve

    Example: with offset=5 and limit=10, 10 results are expected starting
    from the 6th row.
    """
    # Build a lightweight preview datasource (single page, at most 3
    # entities) under a dedicated 'preview_' domain.
    preview_datasource = GithubDataSource(
        page_limit=1,
        dataset=data_source.dataset,
        domain=f'preview_{data_source.domain}',
        name=data_source.name,
        organization=data_source.organization,
        entities_limit=3,
    )
    df = self.get_df(preview_datasource, permissions)
    # A None end bound slices to the end of the frame, i.e. no limit.
    end = None if limit is None else offset + limit
    return DataSlice(df[offset:end], len(df))
def get_slice(
    self,
    data_source: GoogleSheets2DataSource,
    permissions: Optional[dict] = None,
    offset: int = 0,
    limit=50,
    get_row_count: Optional[bool] = False,
) -> DataSlice:
    """Return part of the data as a pandas DataFrame together with its total size.

    - offset is the index of the starting row
    - limit is the number of pages to retrieve

    Example: with offset=5 and limit=10, 10 results are expected starting
    from the 6th row.
    """
    # Preview datasource mirrors the original one but caps the number of
    # fetched rows at `limit`.
    preview_datasource = GoogleSheets2DataSource(
        domain=data_source.domain,
        name=data_source.name,
        spreadsheet_id=data_source.spreadsheet_id,
        sheet=data_source.sheet,
        header_row=data_source.header_row,
        rows_limit=limit,
        parse_dates=data_source.parse_dates,
    )
    df = self.get_df(preview_datasource, permissions)
    if limit is None:
        window = df[offset:]
    else:
        window = df[offset:offset + limit]
    return DataSlice(window, len(df))
def get_slice(self, data_source: MongoDataSource, permissions: Optional[str] = None,
              offset: int = 0, limit: Optional[int] = None) -> DataSlice:
    """Return a paginated slice of the query result and the total row count.

    When neither offset nor limit is requested, the plain dataframe is
    returned and its length is used as the total count. Otherwise a single
    $facet aggregation computes both the count and the page in one query.
    """
    # Work on a copy so the caller's datasource is left untouched (deepcopy-like).
    data_source = MongoDataSource.parse_obj(data_source)
    if not offset and limit is None:
        # No pagination requested: plain retrieval.
        df = self.get_df(data_source, permissions)
        return DataSlice(df, len(df))
    data_source.query = apply_permissions(data_source.query, permissions)
    data_source.query = normalize_query(data_source.query, data_source.parameters)
    # Two pipelines sharing the same prefix: one counting, one paginating.
    count_pipeline = data_source.query.copy() + [{'$count': 'value'}]
    df_pipeline = data_source.query.copy()
    if offset:
        df_pipeline.append({'$skip': offset})
    if limit is not None:
        df_pipeline.append({'$limit': limit})
    data_source.query = [{"$facet": {'count': count_pipeline, 'df': df_pipeline}}]
    res = self._execute_query(data_source).next()
    total_count = res['count'][0]['value'] if res['count'] else 0
    return DataSlice(pd.DataFrame(res['df']), total_count)
def get_slice(
    self,
    connection: SnowflakeConnection,
    data_source: SfDataSource,
    offset: Optional[int] = None,
    limit: Optional[int] = None,
    get_row_count: bool = False,
) -> DataSlice:
    """Fetch a slice of the query result and attach execution statistics.

    Timings and row counts are populated on self as side effects of
    fetch_data, then gathered here into a DataStats payload.
    NOTE(review): extracting user-defined limit/offset into
    input_parameters was intentionally left out for now.
    """
    df = self.fetch_data(connection, data_source, offset, limit, get_row_count)
    stats = DataStats(
        query_generation_time=self.query_generation_time,
        data_extraction_time=self.data_extraction_time,
        data_conversion_time=self.data_conversion_time,
        total_returned_rows=len(df),
        df_memory_size=df.memory_usage().sum(),
        total_rows=self.total_rows_count,
    )
    return DataSlice(
        df=df,
        query_metadata=QueryMetadata(columns=self.column_names_and_types),
        stats=stats,
    )
def get_slice(
    self,
    data_source: RedshiftDataSource,
    permissions: Optional[dict] = None,
    offset: int = 0,
    limit: Optional[int] = None,
    get_row_count: Optional[bool] = False,
) -> DataSlice:
    """Return part of the data as a pandas DataFrame together with its total size.

    - offset is the index of the starting row
    - limit is the number of pages to retrieve

    Example: with offset=5 and limit=10, 10 results are expected starting
    from the 6th row.
    """
    df: pd.DataFrame = self._retrieve_data(data_source, False, offset, limit)
    returned_rows = 0 if df is None else len(df)
    # By default, report the number of returned rows as the total.
    total_rows = returned_rows
    if get_row_count and SqlQueryHelper.count_query_needed(data_source.query):
        # Run a dedicated count query to get the real total.
        count_df: pd.DataFrame = self._retrieve_data(data_source, True)
        if count_df is not None and len(count_df.total_rows) > 0:
            total_rows = count_df.total_rows[0]
        else:
            total_rows = 0
    return DataSlice(
        df,
        stats=DataStats(total_returned_rows=returned_rows, total_rows=total_rows),
    )
def get_slice(
    self,
    data_source: MongoDataSource,
    permissions: Optional[str] = None,
    offset: int = 0,
    limit: Optional[int] = None,
) -> DataSlice:
    """Return a paginated slice of the query result and the total row count.

    Without offset/limit, the plain dataframe is returned and its length
    is the total. Otherwise a single $facet aggregation produces both the
    page and a (capped) count.
    """
    # Work on a copy so the caller's datasource is left untouched (deepcopy-like).
    data_source = MongoDataSource.parse_obj(data_source)
    if not offset and limit is None:
        # No pagination requested: plain retrieval.
        df = self.get_df(data_source, permissions)
        return DataSlice(df, len(df))
    data_source.query = apply_condition_filter(data_source.query, permissions)
    data_source.query = normalize_query(data_source.query, data_source.parameters)
    page_stages = []
    if offset:
        page_stages.append({'$skip': offset})
    if limit is not None:
        page_stages.append({'$limit': limit})
    data_source.query.append(
        {
            '$facet': {
                # Counting more than 1M values can be really slow, and the
                # exact number is not that much relevant — cap the count.
                'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
                'df': page_stages,  # never empty: offset or limit was provided
            }
        }
    )
    res = self._execute_query(data_source).next()
    total_count = res['count'][0]['value'] if res['count'] else 0
    return DataSlice(pd.DataFrame(res['df']), total_count)
def test_redshiftconnector_get_slice_df_is_none(mock_retreive_data, redshift_datasource, redshift_connector):
    """A None dataframe from _retrieve_data yields an empty slice with zeroed stats."""
    mock_retreive_data.return_value = None
    slice_ = redshift_connector.get_slice(data_source=redshift_datasource)
    expected = DataSlice(
        df=None,
        total_count=None,
        stats=DataStats(total_rows=0, total_returned_rows=0),
    )
    assert slice_ == expected
def test_redshiftconnector_get_slice_without_count(mock_retreive_data, redshift_datasource, redshift_connector):
    """Without get_row_count, total_rows falls back to the returned row count."""
    fake_df = Mock()
    # Magic methods assigned on a Mock are installed on its per-instance
    # subclass, so len(fake_df) returns 10.
    fake_df.__len__ = lambda _self: 10
    mock_retreive_data.return_value = fake_df
    slice_ = redshift_connector.get_slice(data_source=redshift_datasource)
    expected = DataSlice(
        df=fake_df,
        total_count=None,
        stats=DataStats(total_rows=10, total_returned_rows=10),
    )
    assert slice_ == expected
def test_redshiftconnector_get_slice(mock_retreive_data, redshift_datasource, redshift_connector):
    """With get_row_count=True, total_rows is taken from the count query result."""
    fake_df = Mock()
    fake_df.__len__ = lambda _self: 1
    # Set on the mock's (per-instance) class so plain attribute lookup
    # finds the list; the connector reads total_rows[0].
    type(fake_df).total_rows = [10]
    mock_retreive_data.return_value = fake_df
    slice_ = redshift_connector.get_slice(
        data_source=redshift_datasource,
        permissions=None,
        offset=0,
        limit=1,
        get_row_count=True,
    )
    expected = DataSlice(
        df=fake_df,
        total_count=None,
        stats=DataStats(total_rows=10, total_returned_rows=1),
    )
    assert slice_ == expected
def _execute_parallelized_queries(
    self,
    connection: SnowflakeConnection,
    query: str,
    query_parameters: Optional[Dict] = None,
    offset: Optional[int] = None,
    limit: Optional[int] = None,
    get_row_count=False,
) -> DataSlice:
    """Run the data query — and optionally a row-count query — in parallel.

    The data query always runs; the count query is added only when
    requested and actually needed. Results are delivered through the
    set_data / set_total_rows_count callbacks, which populate self.
    """
    with_count = get_row_count and SqlQueryHelper.count_query_needed(query)
    self.logger.info(f'Execute count request: {with_count}')
    workers = 2 if with_count else 1
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        limit_query, limit_params = SqlQueryHelper.prepare_limit_query(
            query, query_parameters, offset, limit
        )
        data_future = executor.submit(
            self._execute_query, connection, limit_query, limit_params
        )
        data_future.add_done_callback(self.set_data)
        pending = [data_future]
        if with_count:
            count_query, count_params = SqlQueryHelper.prepare_count_query(
                query, query_parameters
            )
            count_future = executor.submit(
                self._execute_query, connection, count_query, count_params
            )
            count_future.add_done_callback(self.set_total_rows_count)
            pending.append(count_future)
        for done in concurrent.futures.as_completed(pending):
            error = done.exception()
            if error is not None:
                # Propagate the first failure from either query.
                raise error
            self.logger.info('query finish')
    return DataSlice(self.data)