def _extract_numerical_feature_sample(self) -> pd.DataFrame:
    """Extracts a random sample of values from selected numerical features.

    Fills the SQL template at _EXTRACT_NUM_FEATURE_SAMPLE_SQL_PATH with the
    features table path, the label column, the positive/negative class
    labels, the per-class instance counts and a column-list segment built
    from self._numerical_feature_list, then runs the query through
    viz_utils.execute_sql using self._bq_client.

    Returns:
      The rows returned by the query, as a DataFrame.
    """
    logging.info('Extracting a random sample of numerical features.')

    # Step 1: render the query text.
    logging.info('Creating the sql code.')
    column_list_sql = self._create_column_list_sql(
        self._numerical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        label_column=self._label_column,
        positive_class_label=self._positive_class_label,
        negative_class_label=self._negative_class_label,
        num_pos_instances=self._num_pos_instances,
        num_neg_instances=self._num_neg_instances,
        sql_code_segment=column_list_sql,
    )
    rendered_query = utils.configure_sql(
        _EXTRACT_NUM_FEATURE_SAMPLE_SQL_PATH, template_values)
    logging.info('Finished creating the sql code.')

    # Step 2: run it.
    logging.info('Executing the sql code.')
    sample = viz_utils.execute_sql(self._bq_client, rendered_query)
    logging.info('Finished executing the sql code.')

    return sample
def test_congigure_sql_creates_tuple_given_list_of_strings(self):
    """Expects a comma-separated string parameter to be rendered as a tuple.

    The template file read is replaced by a mocked `open`, so no real file
    is touched.
    """
    # NOTE(review): 'congigure' in the method name is a typo for 'configure';
    # the name is left unchanged here to keep the test id stable.
    sql_template = 'SELECT * FROM test_table WHERE test_column IN {test_list};'
    params = {'test_list': 'value1,value2'}
    fake_open = absltest.mock.mock_open(read_data=sql_template)
    # The expected text is the template formatted with a tuple of the
    # individual values.
    want = sql_template.format(test_list=('value1', 'value2'))

    with absltest.mock.patch('builtins.open', fake_open):
        got = utils.configure_sql(self.test_sql, params)

    self.assertEqual(want, got)
def test_congigure_sql_replaces_params(self):
    """Expects every placeholder in the template to be substituted.

    The template file read is replaced by a mocked `open`, so no real file
    is touched.
    """
    # NOTE(review): 'congigure' in the method name is a typo for 'configure';
    # the name is left unchanged here to keep the test id stable.
    sql_template = 'SELECT * FROM {project}.{dataset}.{table};'
    params = dict(
        project='test_project',
        dataset='test_dataset',
        table='test_table',
    )
    fake_open = absltest.mock.mock_open(read_data=sql_template)
    want = sql_template.format(**params)

    with absltest.mock.patch('builtins.open', fake_open):
        got = utils.configure_sql(self.test_sql, params)

    self.assertEqual(want, got)
def _calc_numerical_fact_stats(self) -> pd.DataFrame:
    """Calculates the statistics for selected numerical fact variables.

    Fills the SQL template at _CALC_NUM_FACT_STATS_SQL_PATH with the
    numerical facts table path, runs the query through
    viz_utils.execute_sql using self._bq_client, and converts the 'date'
    column of the result with pd.to_datetime.

    Returns:
      The calculated statistics, with the 'date' column converted by
      pd.to_datetime.
    """
    logging.info('Calculating statistics from numerical facts.')
    logging.info('Reading the sql query from the file.')

    template_values = {'bq_facts_table': self._numerical_facts_table_path}
    rendered_query = utils.configure_sql(
        _CALC_NUM_FACT_STATS_SQL_PATH, template_values)
    stats = viz_utils.execute_sql(self._bq_client, rendered_query)
    logging.info('Finished calculating statistics from numerical facts.')

    # Overwrite the 'date' column in place with its datetime conversion.
    stats['date'] = pd.to_datetime(stats['date'])
    return stats
def _calc_categorical_fact_stats(self) -> pd.DataFrame:
    """Calculates the statistics for selected categorical fact variables.

    Fills the SQL template at _CALC_CAT_FACT_STATS_SQL_PATH with the facts
    table path, self._categorical_facts and self._number_top_levels, then
    runs the query through viz_utils.execute_sql using self._bq_client.

    Returns:
      The calculated statistics.
    """
    logging.info('Calculating statistics from categorical facts.')
    logging.info('Reading the sql query from the file.')

    template_values = dict(
        bq_facts_table=self._facts_table_path,
        categorical_fact_list=self._categorical_facts,
        number_top_levels=self._number_top_levels,
    )
    rendered_query = utils.configure_sql(
        _CALC_CAT_FACT_STATS_SQL_PATH, template_values)
    stats = viz_utils.execute_sql(self._bq_client, rendered_query)

    logging.info('Finished calculating statistics from categorical facts.')
    return stats
def _calc_categorical_feature_stats(self) -> pd.DataFrame:
    """Calculates the statistics from selected categorical features.

    Fills the SQL template at _CALC_CAT_FEATURE_STATS_SQL_PATH with the
    features table path and a struct-column-list segment built from
    self._categorical_feature_list, then runs the query through
    viz_utils.execute_sql using self._bq_client.

    Returns:
      The calculated statistics.
    """
    logging.info('Calculating statistics from categorical features.')

    # Step 1: render the query text.
    logging.info('Creating the sql code.')
    struct_columns_sql = self._create_struct_column_list_sql(
        self._categorical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        sql_code_segment=struct_columns_sql,
    )
    rendered_query = utils.configure_sql(
        _CALC_CAT_FEATURE_STATS_SQL_PATH, template_values)
    logging.info('Finished creating the sql code.')

    # Step 2: run it.
    logging.info('Executing the sql code.')
    stats = viz_utils.execute_sql(self._bq_client, rendered_query)
    logging.info('Finished executing the sql code.')

    return stats