Example #1
def _dataset_line(args):
  """Implements the BigQuery dataset magic subcommand used to operate on datasets

   The supported syntax is:
   %bq datasets <command> <args>

  Commands:
    {list, create, delete}

  Args:
    args: the optional arguments following '%bq datasets command'.
  """
  if args['command'] == 'list':
    filter_ = args['filter'] if args['filter'] else '*'
    context = google.datalab.Context.default()
    if args['project']:
      context = google.datalab.Context(args['project'], context.credentials)
    return _render_list([str(dataset) for dataset in bigquery.Datasets(context)
                         if fnmatch.fnmatch(str(dataset), filter_)])

  elif args['command'] == 'create':
    try:
      bigquery.Dataset(args['name']).create(friendly_name=args['friendly'])
    except Exception as e:
      print('Failed to create dataset %s: %s' % (args['name'], e))

  elif args['command'] == 'delete':
    try:
      bigquery.Dataset(args['name']).delete()
    except Exception as e:
      print('Failed to delete dataset %s: %s' % (args['name'], e))
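
For reference, here is a minimal sketch of the direct google.datalab.bigquery calls that these three magic commands wrap; the dataset name 'my_dataset' is a placeholder and not part of the sample above.

import fnmatch

import google.datalab
import google.datalab.bigquery as bigquery

context = google.datalab.Context.default()

# list: enumerate datasets in the default project, optionally filtered
print([str(dataset) for dataset in bigquery.Datasets(context)
       if fnmatch.fnmatch(str(dataset), '*')])

# create: equivalent of the 'create' branch above ('my_dataset' is a placeholder)
bigquery.Dataset('my_dataset').create(friendly_name='My dataset')

# delete: equivalent of the 'delete' branch above
bigquery.Dataset('my_dataset').delete()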
Example #2
def test_datalab_load_table_from_dataframe(to_delete):
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    import google.datalab.bigquery as bq
    import pandas

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_dataframe]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    bq.Dataset(dataset_id).create()

    # Create the table and load the data
    dataframe = pandas.DataFrame([
        {'title': 'The Meaning of Life', 'release_year': 1983},
        {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
        {'title': 'Life of Brian', 'release_year': 1979},
        {
            'title': 'And Now for Something Completely Different',
            'release_year': 1971
        },
    ])
    schema = bq.Schema.from_data(dataframe)
    table = bq.Table(
        '{}.monty_python'.format(dataset_id)).create(schema=schema)
    table.insert(dataframe)  # Starts streaming insert of data
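
To verify the streamed rows, they can be read back as a DataFrame. This is a minimal sketch that assumes the table object from the snippet above; note that streaming inserts are not always immediately visible, so a fresh read may briefly return fewer rows.

    # Read the streamed rows back; assumes the 'table' object created above.
    # Streaming inserts can take a short while to become queryable.
    df = table.to_dataframe()
    print(df.sort_values('release_year'))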
Example #3
def test_datalab_load_table_from_gcs_csv(to_delete):
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    import google.datalab.bigquery as bq

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    bq.Dataset(dataset_id).create()

    # Create the table
    schema = [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'post_abbr', 'type': 'STRING'},
    ]
    table = bq.Table(
        '{}.us_states'.format(dataset_id)).create(schema=schema)
    table.load(
        'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
        mode='append',
        source_format='csv',
        csv_options=bq.CSVOptions(skip_leading_rows=1)
    )  # Waits for the job to complete
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]

    assert table.length == 50
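
As a quick follow-up, the loaded rows can be spot-checked; a minimal sketch, assuming the table object from the snippet above and the max_rows parameter of google.datalab's Table.to_dataframe().

    # Spot-check the first few loaded rows; assumes the 'table' object above.
    preview = table.to_dataframe(max_rows=5)
    print(preview)
    print('row count:', table.length)  # same property the assertion above checks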
Example #4
    def test_numerics(self):
        """Build a BQ table, and then call analyze on it."""
        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        project_id = dl.Context.default().project_id
        dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        table_name = 'temp_table'
        full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

        output_folder = tempfile.mkdtemp()

        try:
            # Make a dataset, a table, and insert data.
            db = bq.Dataset((project_id, dataset_name))
            db.create()

            table = bq.Table(full_table_name)
            table.create(schema=bq.Schema(schema), overwrite=True)

            data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
            table.insert(data)

            analyze_data.run_cloud_analysis(output_dir=output_folder,
                                            csv_file_pattern=None,
                                            bigquery_table=full_table_name,
                                            schema=schema,
                                            features={
                                                'col1': {
                                                    'transform': 'scale'
                                                },
                                                'col2': {
                                                    'transform': 'identity'
                                                }
                                            })

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze_data.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
            db.delete(delete_contents=True)
Example #5
  def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from BigQuery."""
    try:
      self._create_test_data()

      # Make a BQ table, and insert 1 row.
      project_id = dl.Context.default().project_id
      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])
      table.insert(data=[{'num_col': 23.0, 'img_col': self.img_filepath}])

      tfex_dir = os.path.join(self.output_folder, 'test_results')
      cmd = ['python ' + os.path.join(CODE_PATH, 'transform_raw_data.py'),
             '--bigquery-table=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analyze-output-dir=' + self.output_folder,
             '--output-filename-prefix=features',
             '--project-id=' + project_id,
             '--output-dir=' + tfex_dir]
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(tfex_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_example = next(
          tf.python_io.tf_record_iterator(
              record_filepath,
              options=options))
      example = tf.train.Example()
      example.ParseFromString(serialized_example)

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 24.0)

      image_bytes = example.features.feature['img_col'].bytes_list.value[0]
      raw_img = Image.open(self.img_filepath).convert('RGB')
      img_file = six.BytesIO()
      raw_img.save(img_file, 'jpeg')
      expected_image_bytes = img_file.getvalue()

      self.assertEqual(image_bytes, expected_image_bytes)
    finally:
      dataset.delete(delete_contents=True)
      shutil.rmtree(self.output_folder)
Example #6
# Note: the imports below are assumed, based on how the snippet uses the
# google.datalab BigQuery API and time(); they are not shown in the original file.
import google.datalab.bigquery as bq
from time import time


def BigQuery_exportation(df, bigquery_dataset_name, bigquery_table_name):

    print('\nBigQuery export started ...')
    start_time = time()

    # Define the target BigQuery dataset and table.
    # The dataset is only referenced here; it must already exist,
    # since create() is never called on it.
    dataset = bq.Dataset(bigquery_dataset_name)
    table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

    # Create or overwrite the existing table if it exists
    table_schema = bq.Schema.from_data(df)
    table.create(schema=table_schema, overwrite=True)

    # Write the DataFrame to a BigQuery table
    table.insert(df)

    print(
        'BigQuery export finished. \nTotal export time = {:0.2f} min'
        .format((time() - start_time) / 60))
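
A hypothetical call of the helper above; the DataFrame and the dataset/table names are placeholders, and the dataset must already exist because the function never creates it.

import pandas as pd

sample_df = pd.DataFrame({'id': [1, 2, 3], 'label': ['a', 'b', 'c']})
BigQuery_exportation(sample_df, 'my_dataset', 'my_table')  # placeholder names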
Example #7
  def test_local_bigquery_transform(self):
    """Test transfrom locally, but the data comes from bigquery."""

    # Make a BQ table, and insert 1 row.
    try:
      bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
      bucket_root = 'gs://%s' % bucket_name
      bucket = storage.Bucket(bucket_name)
      bucket.create()

      project_id = dl.Context.default().project_id

      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'key_col', 'type': 'INTEGER'},
                    {'name': 'target_col', 'type': 'FLOAT'},
                    {'name': 'cat_col', 'type': 'STRING'},
                    {'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])

      img1_file = os.path.join(self.source_dir, 'img1.jpg')
      dest_file = os.path.join(bucket_root, 'img1.jpg')
      file_io.copy(img1_file, dest_file)

      data = [
          {
           'key_col': 1,
           'target_col': 1.0,
           'cat_col': 'Monday',
           'num_col': 23.0,
           'img_col': dest_file,
          },
      ]
      table.insert(data=data)

      cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
             '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analysis=' + self.analysis_dir,
             '--prefix=features',
             '--project-id=' + project_id,
             '--output=' + self.output_dir]
      print('cmd ', ' '.join(cmd))
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(self.output_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath, options=options))
      self.assertEqual(len(serialized_examples), 1)

      example = tf.train.Example()
      example.ParseFromString(serialized_examples[0])

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 23.0)
      transformed_category = example.features.feature['cat_col'].int64_list.value[0]
      self.assertEqual(transformed_category, 2)
      image_bytes = example.features.feature['img_col'].float_list.value
      self.assertEqual(len(image_bytes), 2048)
      self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
      dataset.delete(delete_contents=True)

      for obj in bucket.objects():
        obj.delete()
      bucket.delete()
Example #8
File: main.py  Project: BOTTINYA/AutoDMS-V2
# ---------------------- Rework the classification with hard-coded rules -------------------------

# Apply these rules to the dataframe
Final_predicted_df['FLAG_RUPTURE'] = Final_predicted_df.apply(
    hard_coded_rules.flag_rupture, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.rejet_cause_prev, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.flag_livraison, axis=1)

# --------------------- BigQuery export ----------------------------
print('Export to BigQuery table...')
start_time = time()

# Export to BigQuery
bigquery_dataset_name = 'electric-armor-213817.Donnees_journalieres'
bigquery_table_name = 'Classification_journaliere'

# Define BigQuery dataset and table
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create or overwrite the existing table if it exists
table_schema = bq.Schema.from_data(Final_predicted_df)
table.create(schema=table_schema, overwrite=True)

# Write the DataFrame to a BigQuery table
table.insert(Final_predicted_df)
print('BigQuery export finished. \nExporting process took {:0.2f}min'.format(
    (time() - start_time) / 60))
Example #9
def _table_cell(args, cell_body):
  """Implements the BigQuery table magic subcommand used to operate on tables

   The supported syntax is:
   %%bq tables <command> <args>

  Commands:
    {list, create, delete, describe, view}

  Args:
    args: the optional arguments following '%%bq tables command'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The HTML rendering for the table of datasets.
  """
  if args['command'] == 'list':
    filter_ = args['filter'] if args['filter'] else '*'
    if args['dataset']:
      if args['project'] is None:
        datasets = [bigquery.Dataset(args['dataset'])]
      else:
        context = google.datalab.Context(args['project'],
                                         google.datalab.Context.default().credentials)
        datasets = [bigquery.Dataset(args['dataset'], context)]
    else:
      default_context = google.datalab.Context.default()
      context = google.datalab.Context(default_context.project_id, default_context.credentials)
      if args['project']:
        context.set_project_id(args['project'])
      datasets = bigquery.Datasets(context)

    tables = []
    for dataset in datasets:
      tables.extend([table.full_name
                     for table in dataset if fnmatch.fnmatch(table.full_name, filter_)])

    return _render_list(tables)

  elif args['command'] == 'create':
    if cell_body is None:
      print('Failed to create %s: no schema specified' % args['name'])
    else:
      try:
        record = google.datalab.utils.commands.parse_config(
            cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False)
        jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
        schema = bigquery.Schema(record['schema'])
        bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite'])
      except Exception as e:
        print('Failed to create table %s: %s' % (args['name'], e))

  elif args['command'] == 'describe':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)

    html = _repr_html_table_schema(table.schema)
    return IPython.core.display.HTML(html)

  elif args['command'] == 'delete':
    try:
      bigquery.Table(args['name']).delete()
    except Exception as e:
      print('Failed to delete table %s: %s' % (args['name'], e))

  elif args['command'] == 'view':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)
    return table
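
For comparison, a minimal sketch of the direct API calls behind the 'create' and 'delete' branches above, using the placeholder table name 'my_dataset.my_table' and a schema given as a list of field dicts, as in the other examples here.

import google.datalab.bigquery as bigquery

# Build a Schema from a list of field definitions and create the table.
schema = bigquery.Schema([
    {'name': 'name', 'type': 'STRING'},
    {'name': 'value', 'type': 'FLOAT'},
])
bigquery.Table('my_dataset.my_table').create(schema=schema, overwrite=True)

# Delete the table again.
bigquery.Table('my_dataset.my_table').delete()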