Example #1
    def get_slice(
        self,
        data_source: GithubDataSource,
        permissions: Optional[dict] = None,
        offset: int = 0,
        limit: Optional[int] = None,
        get_row_count: Optional[bool] = False,
    ) -> DataSlice:
        """
        Method to retrieve a part of the data as a pandas dataframe
        and the total size filtered with permissions

        - offset is the index of the starting row
        - limit is the number of pages to retrieve
        Exemple: if offset = 5 and limit = 10 then 10 results are expected from 6th row
        """
        preview_datasource = GithubDataSource(
            page_limit=1,
            dataset=data_source.dataset,
            domain=f'preview_{data_source.domain}',
            name=data_source.name,
            organization=data_source.organization,
            entities_limit=3,
        )
        df = self.get_df(preview_datasource, permissions)
        if limit is not None:
            return DataSlice(df[offset:offset + limit], len(df))
        else:
            return DataSlice(df[offset:], len(df))
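
As a minimal standalone illustration of the slicing used above (plain pandas, no connector involved), the docstring's example of offset = 5 and limit = 10 plays out like this:

    import pandas as pd

    df = pd.DataFrame({'value': range(100)})  # stands in for the result of get_df()

    offset, limit = 5, 10
    sliced = df[offset:offset + limit]         # same slicing as in get_slice above
    assert len(sliced) == 10                   # 10 rows returned...
    assert sliced.iloc[0]['value'] == 5        # ...starting from the 6th row
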
Example #2
    def get_slice(
        self,
        data_source: GoogleSheets2DataSource,
        permissions: Optional[dict] = None,
        offset: int = 0,
        limit: Optional[int] = 50,
        get_row_count: Optional[bool] = False,
    ) -> DataSlice:
        """
        Method to retrieve a part of the data as a pandas dataframe
        and the total size filtered with permissions

        - offset is the index of the starting row
        - limit is the number of pages to retrieve
        Exemple: if offset = 5 and limit = 10 then 10 results are expected from 6th row
        """
        preview_datasource = GoogleSheets2DataSource(
            domain=data_source.domain,
            name=data_source.name,
            spreadsheet_id=data_source.spreadsheet_id,
            sheet=data_source.sheet,
            header_row=data_source.header_row,
            rows_limit=limit,
            parse_dates=data_source.parse_dates,
        )
        df = self.get_df(preview_datasource, permissions)
        if limit is not None:
            return DataSlice(df[offset:offset + limit], len(df))
        else:
            return DataSlice(df[offset:], len(df))
Example #3
    def get_slice(
        self,
        data_source: MongoDataSource,
        permissions: Optional[str] = None,
        offset: int = 0,
        limit: Optional[int] = None,
    ) -> DataSlice:
        # Create a copy in order to keep the original intact (deepcopy-like)
        data_source = MongoDataSource.parse_obj(data_source)
        if offset or limit is not None:
            data_source.query = apply_permissions(data_source.query, permissions)
            data_source.query = normalize_query(data_source.query, data_source.parameters)
            # Run the row count and the data extraction in a single $facet stage
            facet = {
                '$facet': {
                    'count': data_source.query.copy(),
                    'df': data_source.query.copy(),
                }
            }
            facet['$facet']['count'].append({'$count': 'value'})
            if offset:
                facet['$facet']['df'].append({'$skip': offset})
            if limit is not None:
                facet['$facet']['df'].append({'$limit': limit})
            data_source.query = [facet]
            res = self._execute_query(data_source).next()
            total_count = res['count'][0]['value'] if len(res['count']) > 0 else 0
            df = pd.DataFrame(res['df'])
        else:
            df = self.get_df(data_source, permissions)
            total_count = len(df)
        return DataSlice(df, total_count)
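
To make the $facet construction above concrete, here is a self-contained sketch (plain dicts; the query content is made up) showing the pipeline this code produces for offset = 5 and limit = 10:

    query = [{'$match': {'status': 'active'}}]  # hypothetical normalized query

    offset, limit = 5, 10
    facet = {
        '$facet': {
            'count': query.copy(),
            'df': query.copy(),
        }
    }
    facet['$facet']['count'].append({'$count': 'value'})
    if offset:
        facet['$facet']['df'].append({'$skip': offset})
    if limit is not None:
        facet['$facet']['df'].append({'$limit': limit})

    pipeline = [facet]
    # pipeline == [{'$facet': {'count': [{'$match': {'status': 'active'}}, {'$count': 'value'}],
    #                          'df':    [{'$match': {'status': 'active'}}, {'$skip': 5}, {'$limit': 10}]}}]
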
Example #4
    def get_slice(
        self,
        connection: SnowflakeConnection,
        data_source: SfDataSource,
        offset: Optional[int] = None,
        limit: Optional[int] = None,
        get_row_count: bool = False,
    ) -> DataSlice:
        result = self.fetch_data(connection, data_source, offset, limit,
                                 get_row_count)

        stats = DataStats(
            query_generation_time=self.query_generation_time,
            data_extraction_time=self.data_extraction_time,
            data_conversion_time=self.data_conversion_time,
            total_returned_rows=len(result),
            df_memory_size=result.memory_usage().sum(),
            total_rows=self.total_rows_count,
        )
        return DataSlice(
            df=result,
            query_metadata=QueryMetadata(columns=self.column_names_and_types),
            # In the case of user defined limit/offset, get the info
            # Not used for now
            # input_parameters={
            #     'limit': SqlQueryHelper.extract_limit(data_source.query),
            #     'offset': SqlQueryHelper.extract_offset(data_source.query),
            # },
            stats=stats,
        )
Example #5
    def get_slice(
        self,
        data_source: RedshiftDataSource,
        permissions: Optional[dict] = None,
        offset: int = 0,
        limit: Optional[int] = None,
        get_row_count: Optional[bool] = False,
    ) -> DataSlice:
        """
        Method to retrieve a part of the data as a pandas dataframe
        and the total size filtered with permissions
        - offset is the index of the starting row
        - limit is the number of pages to retrieve
        Exemple: if offset = 5 and limit = 10 then 10 results are expected from 6th row
        """
        df: pd.DataFrame = self._retrieve_data(data_source, False, offset,
                                               limit)
        total_returned_rows = len(df) if df is not None else 0

        run_count_request = get_row_count and SqlQueryHelper.count_query_needed(
            data_source.query)

        if run_count_request:
            df_count: pd.DataFrame = self._retrieve_data(data_source, True)
            total_rows = (df_count.total_rows[0] if df_count is not None
                          and len(df_count.total_rows) > 0 else 0)
        else:
            total_rows = total_returned_rows

        return DataSlice(
            df,
            stats=DataStats(total_returned_rows=total_returned_rows,
                            total_rows=total_rows),
        )
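
The count branch above expects the count query's result to expose a total_rows column (hence df_count.total_rows[0]). SqlQueryHelper itself is not shown in these examples; purely as an illustration (not the library's actual implementation), a count query of that shape could be built like this:

    def build_count_query(query: str) -> str:
        # Hypothetical helper: wrap the original SELECT and count its rows
        return f'SELECT COUNT(*) AS total_rows FROM ({query.rstrip(";")}) AS _sub'

    print(build_count_query("SELECT * FROM sales WHERE region = 'EU'"))
    # SELECT COUNT(*) AS total_rows FROM (SELECT * FROM sales WHERE region = 'EU') AS _sub
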
Example #6
    def get_slice(
        self,
        data_source: MongoDataSource,
        permissions: Optional[str] = None,
        offset: int = 0,
        limit: Optional[int] = None,
    ) -> DataSlice:
        # Create a copy in order to keep the original (deepcopy-like)
        data_source = MongoDataSource.parse_obj(data_source)
        if offset or limit is not None:
            data_source.query = apply_condition_filter(data_source.query, permissions)
            data_source.query = normalize_query(data_source.query, data_source.parameters)

            df_facet = []
            if offset:
                df_facet.append({'$skip': offset})
            if limit is not None:
                df_facet.append({'$limit': limit})
            facet = {
                '$facet': {
                    # counting more than 1M values can be really slow, and the exact number is not that much relevant
                    'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
                    'df': df_facet,  # df_facet is never empty
                }
            }
            data_source.query.append(facet)

            res = self._execute_query(data_source).next()
            total_count = res['count'][0]['value'] if len(res['count']) > 0 else 0
            df = pd.DataFrame(res['df'])
        else:
            df = self.get_df(data_source, permissions)
            total_count = len(df)
        return DataSlice(df, total_count)
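
To see what such a capped-count pipeline returns end to end, here is a minimal pymongo sketch; it assumes a MongoDB reachable on localhost and a hypothetical db.events collection, and the row cap value is chosen for illustration rather than taken from the connector:

    import pandas as pd
    from pymongo import MongoClient

    MAX_COUNTED_ROWS = 1_000_001  # illustrative cap, stands in for the connector's constant

    client = MongoClient('mongodb://localhost:27017')
    collection = client['db']['events']  # hypothetical database/collection

    pipeline = [
        {'$match': {'status': 'active'}},
        {
            '$facet': {
                'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
                'df': [{'$skip': 5}, {'$limit': 10}],
            }
        },
    ]

    res = collection.aggregate(pipeline).next()
    total_count = res['count'][0]['value'] if len(res['count']) > 0 else 0
    df = pd.DataFrame(res['df'])
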
Example #7
def test_redshiftconnector_get_slice_df_is_none(mock_retreive_data,
                                                redshift_datasource,
                                                redshift_connector):
    mock_retreive_data.return_value = None
    result = redshift_connector.get_slice(data_source=redshift_datasource)
    assert result == DataSlice(df=None,
                               total_count=None,
                               stats=DataStats(total_rows=0,
                                               total_returned_rows=0))
Example #8
def test_redshiftconnector_get_slice_without_count(mock_retreive_data,
                                                   redshift_datasource,
                                                   redshift_connector):
    mock_df = Mock()
    mock_df.__len__ = lambda x: 10

    mock_retreive_data.return_value = mock_df
    result = redshift_connector.get_slice(data_source=redshift_datasource)
    assert result == DataSlice(df=mock_df,
                               total_count=None,
                               stats=DataStats(total_rows=10,
                                               total_returned_rows=10))
Example #9
def test_redshiftconnector_get_slice(mock_retreive_data, redshift_datasource,
                                     redshift_connector):
    mock_df = Mock()
    mock_df.__len__ = lambda x: 1
    type(mock_df).total_rows = [10]

    mock_retreive_data.return_value = mock_df
    result = redshift_connector.get_slice(data_source=redshift_datasource,
                                          permissions=None,
                                          offset=0,
                                          limit=1,
                                          get_row_count=True)
    assert result == DataSlice(df=mock_df,
                               total_count=None,
                               stats=DataStats(total_rows=10,
                                               total_returned_rows=1))
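
A side note on the tests above: a plain Mock has no magic-method support out of the box, so __len__ is attached by hand to make len(mock_df) work inside get_slice. A standalone sketch of the same trick, contrasted with MagicMock:

    from unittest.mock import MagicMock, Mock

    mock_df = Mock()
    mock_df.__len__ = lambda self: 10   # manually attach the magic method
    assert len(mock_df) == 10

    magic_df = MagicMock()
    magic_df.__len__.return_value = 10  # MagicMock preconfigures magic methods
    assert len(magic_df) == 10
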
Example #10
    def _execute_parallelized_queries(
        self,
        connection: SnowflakeConnection,
        query: str,
        query_parameters: Optional[Dict] = None,
        offset: Optional[int] = None,
        limit: Optional[int] = None,
        get_row_count: bool = False,
    ) -> DataSlice:
        """Call parallelized execute query to extract data & row count from query"""

        run_count_request = get_row_count and SqlQueryHelper.count_query_needed(
            query)
        self.logger.info(f'Execute count request: {run_count_request}')
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=2 if run_count_request else 1) as executor:
            prepared_query, prepared_query_parameters = SqlQueryHelper.prepare_limit_query(
                query, query_parameters, offset, limit)
            future_1 = executor.submit(
                self._execute_query,
                connection,
                prepared_query,
                prepared_query_parameters,
            )
            future_1.add_done_callback(self.set_data)
            futures = [future_1]

            if run_count_request:
                (
                    prepared_query_count,
                    prepared_query_parameters_count,
                ) = SqlQueryHelper.prepare_count_query(query, query_parameters)
                future_2 = executor.submit(
                    self._execute_query,
                    connection,
                    prepared_query_count,
                    prepared_query_parameters_count,
                )
                future_2.add_done_callback(self.set_total_rows_count)
                futures.append(future_2)
            for future in concurrent.futures.as_completed(futures):
                if future.exception() is not None:
                    raise future.exception()
                else:
                    self.logger.info('query finish')
        return DataSlice(self.data)
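
Stripped of the Snowflake-specific pieces, the fan-out pattern above (one future for the data, an optional second one for the row count, each wired to a callback) can be sketched with stand-in functions like this:

    import concurrent.futures

    def fetch_rows():
        # Stand-in for the limited data query
        return list(range(10))

    def fetch_count():
        # Stand-in for the COUNT(*) query
        return 1000

    results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future_rows = executor.submit(fetch_rows)
        future_rows.add_done_callback(lambda f: results.update(rows=f.result()))

        future_count = executor.submit(fetch_count)
        future_count.add_done_callback(lambda f: results.update(total=f.result()))

        for future in concurrent.futures.as_completed([future_rows, future_count]):
            if future.exception() is not None:
                raise future.exception()

    assert results == {'rows': list(range(10)), 'total': 1000}
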