Example #1
  def accuracy(self):
    """Get accuracy numbers for each target and overall.

    Returns:
      A DataFrame with three columns: 'target', 'accuracy', and 'count'. It also contains
      the overall accuracy in a row whose 'target' is '_all'.

    Raises:
      Exception if the CSV headers do not include 'target' or 'predicted', or BigQuery
      does not return 'target' or 'predicted' column.
    """

    if self._input_csv_files:
      df = self._get_data_from_csv_files()
      if 'target' not in df or 'predicted' not in df:
        raise ValueError('Cannot find "target" or "predicted" column')

      labels = sorted(set(df['target']) | set(df['predicted']))
      accuracy_results = []

      for label in labels:
        correct_count = len(df[(df['target'] == df['predicted']) & (df['target'] == label)])
        total_count = len(df[(df['target'] == label)])
        accuracy_results.append({
            'target': label,
            'accuracy': float(correct_count) / total_count if total_count > 0 else 0,
            'count': total_count
        })

      total_correct_count = len(df[(df['target'] == df['predicted'])])
      if len(df) > 0:
        total_accuracy = float(total_correct_count) / len(df)
        accuracy_results.append({'target': '_all', 'accuracy': total_accuracy, 'count': len(df)})
      return pd.DataFrame(accuracy_results)
    elif self._bigquery:
      query = bq.Query("""
SELECT
  target,
  SUM(CASE WHEN target=predicted THEN 1 ELSE 0 END)/COUNT(*) as accuracy,
  COUNT(*) as count
FROM
  %s
GROUP BY
  target""" % self._bigquery)
      query_all = bq.Query("""
SELECT
  "_all" as target,
  SUM(CASE WHEN target=predicted THEN 1 ELSE 0 END)/COUNT(*) as accuracy,
  COUNT(*) as count
FROM
  %s""" % self._bigquery)

      df = self._get_data_from_bigquery([query, query_all])
      return df
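A minimal standalone sketch of the CSV branch above, assuming only an in-memory DataFrame with 'target' and 'predicted' columns; it reproduces the core per-class and overall accuracy computation with a pandas groupby (it groups by observed target values only, and the data is made up).

import pandas as pd

# Hypothetical evaluation results; only the 'target'/'predicted' columns are assumed.
df = pd.DataFrame({
    'target':    ['cat', 'cat', 'dog', 'dog', 'dog'],
    'predicted': ['cat', 'dog', 'dog', 'dog', 'cat'],
})

# Per-class accuracy: fraction of rows with target == predicted within each target value.
per_class = (df.assign(correct=df['target'] == df['predicted'])
               .groupby('target')['correct']
               .agg(accuracy='mean', count='size')
               .reset_index())

# Overall accuracy, reported with target '_all' as in the example above.
overall = pd.DataFrame([{'target': '_all',
                         'accuracy': (df['target'] == df['predicted']).mean(),
                         'count': len(df)}])
print(pd.concat([per_class, overall], ignore_index=True))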
Example #2
def create_dataframes(frac=None):
    # Optionally sample a small subset of the data for testing.
    if frac is not None and 0 < frac < 1:
        sample = " AND RAND() < {}".format(frac)
    else:
        sample = ""

    train_query, eval_query = create_queries()
    train_query = "{} {}".format(train_query, sample)
    eval_query = "{} {}".format(eval_query, sample)

    import google.datalab.bigquery as bq
    train_df = bq.Query(train_query).execute().result().to_dataframe()
    eval_df = bq.Query(eval_query).execute().result().to_dataframe()
    return train_df, eval_df
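A hedged usage sketch; it assumes create_queries() is defined elsewhere and returns two SQL strings whose WHERE clauses the appended sample predicate can extend.

# Hypothetical call: sample roughly 1% of each table for a quick local test.
train_df, eval_df = create_dataframes(frac=0.01)
print(train_df.shape, eval_df.shape)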
Example #3
    def execute(self, context):
        if self.data_source:
            kwargs = {}
            if self.csv_options:
                csv_kwargs = {}
                if 'delimiter' in self.csv_options:
                    csv_kwargs['delimiter'] = self.csv_options['delimiter']
                if 'skip' in self.csv_options:
                    csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
                if 'strict' in self.csv_options:
                    csv_kwargs['allow_jagged_rows'] = self.csv_options[
                        'strict']
                if 'quote' in self.csv_options:
                    csv_kwargs['quote'] = self.csv_options['quote']
                kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)

            if self.format:
                kwargs['source_format'] = self.format

            if self.max_bad_records:
                kwargs['max_bad_records'] = self.max_bad_records

            external_data_source = bq.ExternalDataSource(source=self.path,
                                                         schema=bq.Schema(self.schema),
                                                         **kwargs)
            query = bq.Query(
                sql=self.sql,
                data_sources={self.data_source: external_data_source})
        else:
            query = bq.Query(sql=self.sql)

        # use_cache is False since that is usually the desired behavior in pipeline scenarios
        # allow_large_results can be True only if table is specified (i.e. when it's not None)
        kwargs = {}
        if self.mode is not None:
            kwargs['mode'] = self.mode
        output_options = bq.QueryOutput.table(name=self.table,
                                              use_cache=False,
                                              allow_large_results=self.table is not None,
                                              **kwargs)
        query_params = bq.Query.get_query_parameters(self.parameters)
        job = query.execute(output_options=output_options,
                            query_params=query_params)

        # Returning the table-name here makes it available for downstream task instances.
        return {'table': job.result().full_name}
Example #4
def run_numerical_analysis(table, schema_list, args):
    """Find min/max values for the numerical columns and writes a json file.

  Args:
    table: Reference to FederatedTable (if bigquery_table is false) or a
        regular Table (otherwise)
    schema_list: Bigquery schema json object
    args: the command line args
  """
    import google.datalab.bigquery as bq

    # Get list of numerical columns.
    numerical_columns = []
    for col_schema in schema_list:
        col_type = col_schema['type'].lower()
        if col_type == 'integer' or col_type == 'float':
            numerical_columns.append(col_schema['name'])

    # Run the numerical analysis
    if numerical_columns:
        sys.stdout.write('Running numerical analysis...')
        max_min = [('max({name}) as max_{name}, '
                    'min({name}) as min_{name}, '
                    'avg({name}) as avg_{name} ').format(name=name)
                   for name in numerical_columns]
        if args.bigquery_table:
            sql = 'SELECT  %s from `%s`' % (
                ', '.join(max_min), parse_table_name(args.bigquery_table))
            numerical_results = bq.Query(sql).execute().result().to_dataframe()
        else:
            sql = 'SELECT  %s from csv_table' % ', '.join(max_min)
            query = bq.Query(sql, data_sources={'csv_table': table})
            numerical_results = query.execute().result().to_dataframe()

        # Convert the numerical results to a json file.
        results_dict = {}
        for name in numerical_columns:
            results_dict[name] = {
                'max': numerical_results.iloc[0]['max_%s' % name],
                'min': numerical_results.iloc[0]['min_%s' % name],
                'mean': numerical_results.iloc[0]['avg_%s' % name]
            }

        file_io.write_string_to_file(
            os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE),
            json.dumps(results_dict, indent=2, separators=(',', ': ')))

        sys.stdout.write('done.\n')
Example #5
  def percentile_nearest(self, percentile):
    """Get nearest percentile from regression model evaluation results.

    Args:
      percentile: a 0~100 float number.

    Returns:
      the percentile float number.

    Raises:
      Exception if the CSV headers do not include 'target' or 'predicted', or BigQuery
      does not return 'target' or 'predicted' column, or if target or predicted is not
      number.
    """

    if self._input_csv_files:
      df = self._get_data_from_csv_files()
      if 'target' not in df or 'predicted' not in df:
        raise ValueError('Cannot find "target" or "predicted" column')

      df = df[['target', 'predicted']].apply(pd.to_numeric)
      abs_errors = np.array((df['target'] - df['predicted']).apply(abs))
      return np.percentile(abs_errors, percentile, interpolation='nearest')
    elif self._bigquery:
      query = bq.Query("""
        SELECT
          PERCENTILE_DISC(ABS(predicted-target), %f) OVER() AS percentile
        FROM
          %s
        LIMIT 1""" % (float(percentile) / 100, self._bigquery))
      df = self._get_data_from_bigquery([query])
      if df.empty:
        return None
      return df['percentile'][0]
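The CSV branch relies on numpy's nearest-rank percentile; a small self-contained sketch of that call (the values here are made up):

import numpy as np

# Absolute errors between hypothetical target and predicted values.
abs_errors = np.abs(np.array([3.0, 5.5, 2.0, 4.0]) - np.array([2.5, 5.0, 3.0, 7.0]))
# 'nearest' returns an actual observed error rather than an interpolated value.
print(np.percentile(abs_errors, 75, interpolation='nearest'))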
Example #6
  def mae(self):
    """Get MAE (Mean Absolute Error) for regression model evaluation results.

    Returns:
      the MAE float number.

    Raises:
      Exception if the CSV headers do not include 'target' or 'predicted', or BigQuery
      does not return 'target' or 'predicted' column, or if target or predicted is not
      number.
    """

    if self._input_csv_files:
      df = self._get_data_from_csv_files()
      if 'target' not in df or 'predicted' not in df:
        raise ValueError('Cannot find "target" or "predicted" column')

      df = df[['target', 'predicted']].apply(pd.to_numeric)
      mae = mean_absolute_error(df['target'], df['predicted'])
      return mae
    elif self._bigquery:
      query = bq.Query("""
        SELECT
          SUM(ABS(predicted-target)) / COUNT(*) as mae
        FROM
          %s""" % self._bigquery)
      df = self._get_data_from_bigquery([query])
      if df.empty:
        return None
      return df['mae'][0]
Example #7
  def rmse(self):
    """Get RMSE for regression model evaluation results.

    Returns:
      the RMSE float number.

    Raises:
      Exception if the CSV headers do not include 'target' or 'predicted', or BigQuery
      does not return 'target' or 'predicted' column, or if target or predicted is not
      number.
    """

    if self._input_csv_files:
      df = self._get_data_from_csv_files()
      if 'target' not in df or 'predicted' not in df:
        raise ValueError('Cannot find "target" or "predicted" column')

      df = df[['target', 'predicted']].apply(pd.to_numeric)
      # if df is empty or contains non-numeric, scikit learn will raise error.
      mse = mean_squared_error(df['target'], df['predicted'])
      return math.sqrt(mse)
    elif self._bigquery:
      query = bq.Query("""
        SELECT
          SQRT(SUM(ABS(predicted-target) * ABS(predicted-target)) / COUNT(*)) as rmse
        FROM
          %s""" % self._bigquery)
      df = self._get_data_from_bigquery([query])
      if df.empty:
        return None
      return df['rmse'][0]
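For the CSV branches in the two regression metrics above, a minimal standalone sketch of the scikit-learn calls (the numbers are illustrative):

import math
from sklearn.metrics import mean_absolute_error, mean_squared_error

target = [3.0, 5.5, 2.0, 4.0]
predicted = [2.5, 5.0, 3.0, 7.0]

mae = mean_absolute_error(target, predicted)             # mean of |predicted - target|
rmse = math.sqrt(mean_squared_error(target, predicted))  # square root of the mean squared error
print(mae, rmse)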
Example #8
 def _get_gcs_csv_row_count(self, external_data_source):
     import google.datalab.bigquery as bq
     results = bq.Query('SELECT count(*) from data',
                        data_sources={
                            'data': external_data_source
                        }).execute().result()
     return results[0].values()[0]
Example #9
  def test_pipeline_cell(self, mock_env, mock_get_notebook_item, mock_bucket_class,
                         mock_default_context):
    context = TestCases._create_context()
    mock_default_context.return_value = context
    mock_bucket_class.return_value = mock.Mock()
    mock_get_notebook_item.return_value = bq.Query(
        'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')
    small_cell_body = """
            emails: [email protected]
            schedule:
                start: 2009-05-05T22:28:15Z
                end: 2009-05-06T22:28:15Z
                interval: '@hourly'
            input:
                table: project.test.table
            transformation:
                query: foo_query
            output:
                table: project.test.table
       """
    args = {'name': 'bq_pipeline_test', 'gcs_dag_bucket': 'foo_bucket', 'gcs_dag_folder': 'dags'}
    actual = bq.commands._bigquery._pipeline_cell(args, small_cell_body)
    self.assertIn("successfully deployed", actual)
    self.assertNotIn("'email': ['*****@*****.**']", actual)

    args['debug'] = True
    actual = bq.commands._bigquery._pipeline_cell(args, small_cell_body)
    self.assertIn("successfully deployed", actual)
    self.assertIn("'email': ['*****@*****.**']", actual)
Example #10
    def test_execute_cell(self, mock_get_notebook_item, mock_query_execute,
                          mock_default_context):
        args = {
            'query': 'test_query',
            'verbose': None,
            'to_dataframe': None,
            'table': None,
            'dataframe_start_row': None,
            'dataframe_max_rows': None,
            'nocache': None,
            'mode': None,
            'large': None
        }
        cell_body = ''
        mock_get_notebook_item.return_value = bq.Query('test_sql')
        bq.commands._bigquery._execute_cell(args, cell_body)

        args['to_dataframe'] = True
        bq.commands._bigquery._execute_cell(args, cell_body)

        # test --verbose
        args['verbose'] = True
        with mock.patch('sys.stdout', new=six.StringIO()) as mocked_stdout:
            bq.commands._bigquery._execute_cell(args, cell_body)
        self.assertEqual(mocked_stdout.getvalue(), 'test_sql\n')
        args['verbose'] = False
Example #11
 def size(self):
   """The size of the schema. If the underlying data source changes, it may be outdated.
   """
   if self._size is None:
     self._size = bq.Query('SELECT COUNT(*) FROM %s' %
                           self._get_source()).execute().result()[0].values()[0]
   return self._size
Example #12
  def _execute_sql(sql, table):
    """Runs a BigQuery job and dowloads the results into local memeory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()
Example #13
def _get_query_argument(args, cell, env):
  """ Get a query argument to a cell magic.

  The query is specified with args['query']. If it is set, we look it up in the notebook
  environment and return it if it is a Query object. Otherwise the cell contents are
  treated as inline SQL and a Query object is built from them.

  Args:
    args: the dictionary of magic arguments.
    cell: the cell contents which can be variable value overrides (if args has a 'query'
        value) or inline SQL otherwise.
    env: a dictionary that is used for looking up variable values.

  Returns:
    A Query object.
  """
  sql_arg = args.get('query', None)
  if sql_arg is None:
    # Assume we have inline SQL in the cell
    if not isinstance(cell, basestring):
      raise Exception('Expected a --query argument or inline SQL')
    return bigquery.Query(cell, env=env)

  item = google.datalab.utils.commands.get_notebook_item(sql_arg)
  if isinstance(item, bigquery.Query):
    return item
  else:
    raise Exception('Expected a query object, got %s.' % type(item))
Example #14
def _query_cell(args, cell_body):
  """Implements the BigQuery cell magic for used to build SQL objects.

  The supported syntax is:

      %%bq query <args>
      [<inline SQL>]

  Args:
    args: the optional arguments following '%%bq query'.
    cell_body: the contents of the cell
  """
  name = args['name']
  udfs = args['udfs']
  datasources = args['datasources']
  subqueries = args['subqueries']

  # Finally build the query object
  query = bigquery.Query(cell_body, env=IPython.get_ipython().user_ns, udfs=udfs,
                         data_sources=datasources, subqueries=subqueries)

  # if no name is specified, execute this query instead of defining it
  if name is None:
    return query.execute().result()
  else:
    google.datalab.utils.commands.notebook_environment()[name] = query
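For reference, the cell-magic syntax this function implements looks roughly like the following in a notebook (the query name and SQL are illustrative); with --name the query is stored in the notebook environment instead of being executed immediately:

%%bq query --name my_query
SELECT word, SUM(word_count) AS total
FROM `bigquery-public-data.samples.shakespeare`
GROUP BY word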
Example #15
    def test_extract_cell_query(self, mock_get_notebook_item,
                                mock_query_execute, mock_default_context):
        args = {
            'table': None,
            'view': None,
            'query': None,
            'path': None,
            'format': None,
            'delimiter': None,
            'header': None,
            'compress': None,
            'nocache': None
        }
        with self.assertRaisesRegexp(Exception,
                                     'A query, table, or view is needed'):
            bq.commands._bigquery._extract_cell(args, None)

        args['query'] = 'test-query'
        mock_get_notebook_item.return_value = None
        with self.assertRaisesRegexp(Exception,
                                     'Could not find query test-query'):
            bq.commands._bigquery._extract_cell(args, None)

        mock_get_notebook_item.return_value = bq.Query('sql')
        mock_query_execute.return_value.failed = True
        mock_query_execute.return_value.fatal_error = 'test-error'
        with self.assertRaisesRegexp(Exception, 'Extract failed: test-error'):
            bq.commands._bigquery._extract_cell(args, None)

        mock_query_execute.return_value.failed = False
        mock_query_execute.return_value.errors = 'test-errors'
        with self.assertRaisesRegexp(
                Exception, 'Extract completed with errors: test-errors'):
            bq.commands._bigquery._extract_cell(args, None)

        mock_query_execute.return_value.errors = None
        mock_query_execute.return_value.result = lambda: 'results'
        self.assertEqual(bq.commands._bigquery._extract_cell(args, None),
                         'results')

        cell_body = {
            'parameters': [{
                'name': 'arg1',
                'type': 'INT64',
                'value': 5
            }]
        }
        bq.commands._bigquery._extract_cell(args, json.dumps(cell_body))
        mock_get_notebook_item.assert_called_with('test-query')
        call_args = mock_query_execute.call_args[1]
        self.assertDictEqual(
            call_args['query_params'][0], {
                'parameterValue': {
                    'value': 5
                },
                'name': 'arg1',
                'parameterType': {
                    'type': 'INT64'
                }
            })
Example #16
  def test_get_query_argument(self, mock_get_notebook_item):
    args = {}
    cell = None
    env = {}
    # an Exception should be raised if no query is specified by name or body
    with self.assertRaises(Exception):
      bq.commands._bigquery._get_query_argument(args, cell, env)

    # specify query name, no cell body
    args = {'query': 'test_query'}
    mock_get_notebook_item.return_value = bq.Query('test_sql')
    q = bq.commands._bigquery._get_query_argument(args, cell, env)
    self.assertEqual(q.sql, 'test_sql')

    # specify query in cell body, no name
    args = {}
    cell = 'test_sql2'
    q = bq.commands._bigquery._get_query_argument(args, cell, env)
    self.assertEqual(q.sql, 'test_sql2')

    # specify query by bad name
    args = {'query': 'test_query'}
    mock_get_notebook_item.return_value = None
    with self.assertRaises(Exception):
      bq.commands._bigquery._get_query_argument(args, cell, env)
Example #17
def get_hpo_table_columns(hpo_id):
    """
    Get column names and row counts for all tables associated with an HPO
    :param hpo_id: hpo site id
    :return: dataframe with table name, column name and table row count
    """
    query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id 
               FROM {dataset}.INFORMATION_SCHEMA.COLUMNS c
               JOIN {dataset}.__TABLES__ t on c.table_name=t.table_id
               WHERE STARTS_WITH(table_id, lower('{hpo_id}'))=true AND
               NOT(table_id like '_mapping%') AND 
                (
                  table_id like '%person' OR
                  table_id like '%visit_occurrence' OR
                  table_id like '%condition_occurrence' OR
                  table_id like '%procedure_occurrence' OR
                  table_id like '%drug_exposure' OR
                  table_id like '%measurement' OR
                  table_id like '%observation' OR
                  table_id like '%device_exposure' OR
                  table_id like '%death' OR
                  table_id like '%provider' OR
                  table_id like '%specimen' OR
                  table_id like '%location' OR
                  table_id like '%care_site' OR
                  table_id like '%note'
                  )""".format(hpo_id=hpo_id, dataset=dataset)
    df = bq.Query(query).execute(output_options=bq.QueryOutput.dataframe(
        use_cache=False)).result()
    return df
Example #18
    def from_bigquery(sql):
        """Create a ConfusionMatrix from a BigQuery table or query.
    Args:
      sql: Can be one of:
          A SQL query string.
          A Bigquery table string.
          A Query object defined with '%%bq query --name [query_name]'.
      The query results or table must include "target", "predicted" columns.
    Returns:
      A ConfusionMatrix that can be plotted.
    Raises:
      ValueError if query results or table does not include 'target' or 'predicted' columns.
    """
        if isinstance(sql, bq.Query):
            sql = sql._expanded_sql()

        parts = sql.split('.')
        if len(parts) == 1 or len(parts) > 3 or any(' ' in x for x in parts):
            sql = '(' + sql + ')'  # query, not a table name
        else:
            sql = '`' + sql + '`'  # table name

        query = bq.Query(
            'SELECT target, predicted, count(*) as count FROM %s group by target, predicted'
            % sql)
        df = query.execute().result().to_dataframe()
        labels = sorted(set(df['target']) | set(df['predicted']))
        labels_count = len(labels)
        df['target'] = [labels.index(x) for x in df['target']]
        df['predicted'] = [labels.index(x) for x in df['predicted']]
        cm = [[0] * labels_count for i in range(labels_count)]
        for index, row in df.iterrows():
            cm[row['target']][row['predicted']] = row['count']
        return ConfusionMatrix(cm, labels)
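A hedged usage sketch, assuming an evaluation table (the name here is hypothetical) whose rows contain 'target' and 'predicted' columns; the docstring above notes the returned matrix can be plotted.

# Hypothetical table name; results must include "target" and "predicted" columns.
cm = ConfusionMatrix.from_bigquery('my_project.my_dataset.eval_results')
cm.plot()  # plotting, per the docstring above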
Example #19
 def sample_to(self, count, skip_header_rows, strategy, target):
     """Sample rows from GCS or local file and save results to target file.
 Args:
   count: number of rows to sample. If strategy is "BIGQUERY", it is used as approximate number.
   skip_header_rows: whether to skip first row when reading from source.
   strategy: can be "LOCAL" or "BIGQUERY". If local, the sampling happens in local memory,
     and number of resulting rows matches count. If BigQuery, sampling is done
     with BigQuery in cloud, and the number of resulting rows will be approximated to
     count.
   target: The target file path, can be GCS or local path.
 Raises:
   Exception if strategy is "BIGQUERY" but source is not a GCS path.
 """
     # TODO(qimingj) Add unit test
     # Read data from source into DataFrame.
     if strategy == 'BIGQUERY':
         import google.datalab.bigquery as bq
         if not self.path.startswith('gs://'):
             raise Exception('Cannot use BIGQUERY if data is not in GCS')
         external_data_source = self._create_external_data_source(
             skip_header_rows)
         row_count = self._get_gcs_csv_row_count(external_data_source)
         query = bq.Query('SELECT * from data',
                          data_sources={'data': external_data_source})
         sampling = bq.Sampling.random(count * 100 / float(row_count))
         sample = query.sample(sampling=sampling)
         df = sample.to_dataframe()
     elif strategy == 'LOCAL':
         local_file = self.path
         if self.path.startswith('gs://'):
             local_file = tempfile.mktemp()
             google.datalab.utils.gcs_copy_file(self.path, local_file)
         with open(local_file) as f:
             row_count = sum(1 for line in f)
          start_row = 1 if skip_header_rows else 0
          skip_count = row_count - count - 1 if skip_header_rows else row_count - count
          skip = sorted(
              random.sample(xrange(start_row, row_count), skip_count))
          header_row = 0 if skip_header_rows else None
         df = pd.read_csv(local_file,
                          skiprows=skip,
                          header=header_row,
                          delimiter=self._delimiter)
         if self.path.startswith('gs://'):
             os.remove(local_file)
     else:
         raise Exception('strategy must be BIGQUERY or LOCAL')
     # Write to target.
     if target.startswith('gs://'):
         with tempfile.NamedTemporaryFile() as f:
             df.to_csv(f, header=False, index=False)
             f.flush()
             google.datalab.utils.gcs_copy_file(f.name, target)
     else:
         with open(target, 'w') as f:
             df.to_csv(f,
                       header=False,
                       index=False,
                       sep=str(self._delimiter))
Example #20
def _analyze(args, cell):
  # For now, always run python2. If needed we can run python3 when the current kernel
  # is py3; since our transform cannot work on py3 yet, run everything with python2.
  cmd_args = ['python', 'analyze.py', '--output', _abs_path(args['output'])]
  if args['cloud']:
    cmd_args.append('--cloud')

  training_data = args['training_data']
  if args['cloud']:
    tmpdir = os.path.join(args['output'], 'tmp')
  else:
    tmpdir = tempfile.mkdtemp()

  try:
    if isinstance(training_data, dict):
      if 'csv' in training_data and 'schema' in training_data:
        schema = training_data['schema']
        schema_file = _create_json_file(tmpdir, schema, 'schema.json')
        cmd_args.append('--csv=' + _abs_path(training_data['csv']))
        cmd_args.extend(['--schema', schema_file])
      elif 'bigquery_table' in training_data:
        cmd_args.extend(['--bigquery', training_data['bigquery_table']])
      elif 'bigquery_sql' in training_data:
        # see https://cloud.google.com/bigquery/querying-data#temporary_and_permanent_tables
        print('Creating temporary table that will be deleted in 24 hours')
        r = bq.Query(training_data['bigquery_sql']).execute().result()
        cmd_args.extend(['--bigquery', r.full_name])
      else:
        raise ValueError('Invalid training_data dict. '
                         'Requires either "csv" and "schema", or "bigquery_table", '
                         'or "bigquery_sql".')
    elif isinstance(training_data, google.datalab.ml.CsvDataSet):
      schema_file = _create_json_file(tmpdir, training_data.schema, 'schema.json')
      for file_name in training_data.input_files:
        cmd_args.append('--csv=' + _abs_path(file_name))

      cmd_args.extend(['--schema', schema_file])
    elif isinstance(training_data, google.datalab.ml.BigQueryDataSet):
      # TODO: Support query too once command line supports query.
      cmd_args.extend(['--bigquery', training_data.table])
    else:
      raise ValueError('Invalid training data. Requires either a dict, '
                       'a google.datalab.ml.CsvDataSet, or a google.datalab.ml.BigQueryDataSet.')

    features = args['features']
    features_file = _create_json_file(tmpdir, features, 'features.json')
    cmd_args.extend(['--features', features_file])

    if args['package']:
      code_path = os.path.join(tmpdir, 'package')
      _archive.extract_archive(args['package'], code_path)
    else:
      code_path = MLTOOLBOX_CODE_PATH

    _shell_process.run_and_monitor(cmd_args, os.getpid(), cwd=code_path)
  finally:
    file_io.delete_recursively(tmpdir)
Example #21
  def test_dry_run_cell(self, mock_get_query_argument, mock_context_default, mock_dry_run):
    args = {'query': 'test_query'}
    cell_body = ''
    mock_get_query_argument.return_value = bq.Query('test_sql')

    # test --verbose
    args['verbose'] = True
    with mock.patch('sys.stdout', new=six.StringIO()) as mocked_stdout:
      bq.commands._bigquery._dryrun_cell(args, cell_body)
    self.assertEqual(mocked_stdout.getvalue(), 'test_sql\n')
    args['verbose'] = False
Example #22
 def sample(self, n):
     """Samples data into a Pandas DataFrame. Note that it calls BigQuery so it will
    incur cost.
 Args:
   n: number of rows to sample. Note that the number of rows returned is approximate.
 Returns:
   A dataframe containing sampled data.
 Raises:
   Exception if n is larger than number of rows.
 """
     total = bq.Query('select count(*) from %s' %
                      self._get_source()).execute().result()[0].values()[0]
     if n > total:
         raise ValueError('sample larger than population')
     sampling = bq.Sampling.random(percent=n * 100.0 / float(total))
     if self._query is not None:
         source = self._query
     else:
         source = 'SELECT * FROM `%s`' % self._table
     sample = bq.Query(source).execute(sampling=sampling).result()
     df = sample.to_dataframe()
     return df
Example #23
def test_datalab_query():
    # [START bigquery_migration_datalab_query]
    import google.datalab.bigquery as bq

    sql = """
        SELECT name FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = "TX"
        LIMIT 100
    """
    df = bq.Query(sql).execute().result().to_dataframe()
    # [END bigquery_migration_datalab_query]

    assert len(df) == 100
Example #24
 def execute(self, context):
     query = bq.Query(sql=self._sql)
     # allow_large_results is set to False because setting it True requires the destination table to
     # be specified.
      # use_cache is set to False since that is usually the desired behavior in pipeline scenarios
     output_options = bq.QueryOutput.table(name=self._table,
                                           mode=self._mode,
                                           use_cache=False,
                                           allow_large_results=False)
     pydatalab_context = google.datalab.Context.default()
     query_params = Pipeline._get_query_parameters(self._parameters)
     # TODO(rajivpb): Is this a sync or an async operation? Unlike load(), this does not wrap its
     # async counterpart with a job.wait(). Test and wait if necessary.
     query.execute(output_options,
                   context=pydatalab_context,
                   query_params=query_params)
Example #25
def get_distinct_values(column_name):
    # Use string formatting to build the SQL query.
    query = """
    SELECT
        {0},
        COUNT(1) as num_babies,
        AVG(weight_pounds) as avg_wt
    FROM
      publicdata.samples.natality
    WHERE year > 2000
    GROUP BY
    {0}
    """.format(
        column_name
    )
    return bq.Query(query).execute().result().to_dataframe()
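A hedged usage sketch against the public natality sample referenced in the query above (the column choice is illustrative):

# Group the natality statistics by a column of interest, e.g. is_male.
df = get_distinct_values('is_male')
print(df.head())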
Example #26
    def plot(self, data):
        """ Plots a featire slice view on given data.

    Args:
      data: Can be one of:
          A string of sql query.
          A sql query module defined by "%%sql --module module_name".
          A pandas DataFrame.
        Regardless of data type, it must include the following columns:
          "feature": identifies a slice of features. For example: "petal_length:4.0-4.2".
          "count": number of instances in that slice of features.
        All other columns are viewed as metrics for its feature slice. At least one is required.
    """
        import IPython

        if ((sys.version_info.major > 2 and isinstance(data, str)) or
            (sys.version_info.major <= 2 and isinstance(data, basestring))):
            data = bq.Query(data)

        if isinstance(data, bq.Query):
            df = data.execute().result().to_dataframe()
            data = self._get_lantern_format(df)
        elif isinstance(data, pd.core.frame.DataFrame):
            data = self._get_lantern_format(data)
        else:
            raise Exception(
                'data needs to be a sql query, or a pandas DataFrame.')

        HTML_TEMPLATE = """<link rel="import" href="/nbextensions/gcpdatalab/extern/lantern-browser.html" >
        <lantern-browser id="{html_id}"></lantern-browser>
        <script>
        var browser = document.querySelector('#{html_id}');
        browser.metrics = {metrics};
        browser.data = {data};
        browser.sourceType = 'colab';
        browser.weightedExamplesColumn = 'count';
        browser.calibrationPlotUriFn = function(s) {{ return '/' + s; }}
        </script>"""
        # Serialize the data and list of metrics names to JSON string.
        metrics_str = str(map(str, data[0]['metricValues'].keys()))
        data_str = str([{str(k): json.dumps(v)
                         for k, v in elem.iteritems()} for elem in data])
        html_id = 'l' + datalab.utils.commands.Html.next_id()
        html = HTML_TEMPLATE.format(html_id=html_id,
                                    metrics=metrics_str,
                                    data=data_str)
        IPython.display.display(IPython.display.HTML(html))
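A minimal sketch of a DataFrame in the shape plot() expects: the required 'feature' and 'count' columns plus at least one metric column (here a hypothetical 'accuracy'):

import pandas as pd

slice_df = pd.DataFrame([
    {'feature': 'petal_length:4.0-4.2', 'count': 12, 'accuracy': 0.92},
    {'feature': 'petal_length:4.2-4.4', 'count': 9,  'accuracy': 0.88},
])
# viewer.plot(slice_df)  # 'viewer' would be an instance of the class defining plot() above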
Example #27
def getDATA():
    import pandas as pd
    import google.datalab.bigquery as bq
    query_K2019tweets = """
    SELECT created_at, keywords
    FROM `hdc-politie-team-3.politie.tweets`
    WHERE iso_language_code='nl'
      AND created_at BETWEEN "2019-04-26" AND "2019-04-29"
    ORDER BY created_at ASC
    """
    # Transform your query to BigQuery object
    bq_object = bq.Query(query_K2019tweets)

    # Execute your query
    result = bq_object.execute().result()

    # Transform your output to a Pandas dataframe
    kings2019tweets_df = result.to_dataframe()

    return kings2019tweets_df
Example #28
def row_counts(dataset_ids):
    sq = "SELECT '{dataset_id}' dataset_id, table_id, row_count FROM {dataset_id}.__TABLES__"
    sqs = [sq.format(dataset_id=d) for d in dataset_ids]
    iq = "\nUNION ALL\n".join(sqs)
    q = """ 
    SELECT dataset_id, table_id, row_count 
    FROM ({iq})
    WHERE table_id NOT LIKE '%union%' 
      AND table_id NOT LIKE '%ipmc%'
    ORDER BY table_id, dataset_id""".format(iq=iq)
    df = bq.Query(q).execute(output_options=bq.QueryOutput.dataframe(
        use_cache=False)).result()
    df['load_date'] = df.dataset_id.str[-8:]
    df['load_date'] = df['load_date'].astype('category')
    df['dataset_id'] = df['dataset_id'].astype('category')
    df['table_id'] = df['table_id'].astype('category')
    g = sns.FacetGrid(df, col='table_id', sharey=False, col_wrap=5)
    g.map(sns.barplot, 'load_date', 'row_count', ci=None)
    g.set_xticklabels(rotation=45, ha='right')
    return df
Example #29
def gender_by_race(dataset_id):
    q = bq.Query('''
    SELECT 
     c1.concept_name AS gender,
     c2.concept_name AS race,
     COUNT(1) AS `count`
    FROM `{dataset_id}.person` p
    JOIN `{latest.vocabulary}.concept` c1 
      ON p.gender_concept_id = c1.concept_id
    JOIN `{latest.vocabulary}.concept` c2
      ON p.race_concept_id = c2.concept_id
    GROUP BY c2.concept_name, c1.concept_name
    '''.format(dataset_id=dataset_id, latest=DEFAULT_DATASETS.latest))
    df = q.execute(output_options=bq.QueryOutput.dataframe()).result()
    df['race'] = df['race'].astype('category')
    df['gender'] = df['gender'].astype('category')
    g = sns.FacetGrid(df, col='race', hue='gender', col_wrap=5)
    g.map(sns.barplot, 'gender', 'count', ci=None)
    g.set_xticklabels([])
    g.set_axis_labels('', '')
    g.add_legend()
Example #30
def getkeywords():
    import pandas as pd
    import google.datalab.bigquery as bq
    query_K2019keys = """
    SELECT DISTINCT key
    FROM `hdc-politie-team-3.politie.tweets`, UNNEST(keywords) AS key
    """
    # Transform your query to BigQuery object
    bq_object = bq.Query(query_K2019keys)

    # Execute your query
    result = bq_object.execute().result()

    # Transform your output to a Pandas dataframe
    kings2019keys_df = result.to_dataframe()
    # Flatten the single-column result into a sorted list of keywords.
    keys = [row[0] for row in kings2019keys_df.values.tolist()]

    return sorted(keys)