Example #1
 def _get_gcs_csv_row_count(self, federated_table):
     import datalab.bigquery as bq
     results = bq.Query('SELECT count(*) from data',
                        data_sources={
                            'data': federated_table
                        }).results()
     return results[0].values()[0]
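
# Illustrative usage, not from the original source: a federated
# (external) table wraps a GCS CSV so BigQuery can scan it in place, and
# this method counts its rows without loading the file first. Inside the
# owning class the call sequence is roughly (see Example #2 below):
#
#   federated_table = self._create_federated_table(skip_header_rows)
#   row_count = self._get_gcs_csv_row_count(federated_table)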
Example #2
 def sample_to(self, count, skip_header_rows, strategy, target):
     """Sample rows from GCS or local file and save results to target file.
 Args:
   count: number of rows to sample. If strategy is "BIGQUERY", it is used as approximate number.
   skip_header_rows: whether to skip first row when reading from source.
   strategy: can be "LOCAL" or "BIGQUERY". If local, the sampling happens in local memory,
       and number of resulting rows matches count. If BigQuery, sampling is done
       with BigQuery in cloud, and the number of resulting rows will be approximated to
       count.
   target: The target file path, can be GCS or local path.
 Raises:
   Exception if strategy is "BIGQUERY" but source is not a GCS path.
 """
     # TODO(qimingj) Add unit test
     # Read data from source into DataFrame.
     if strategy == 'BIGQUERY':
         import datalab.bigquery as bq
         if not self.path.startswith('gs://'):
             raise Exception('Cannot use BIGQUERY if data is not in GCS')
         federated_table = self._create_federated_table(skip_header_rows)
         row_count = self._get_gcs_csv_row_count(federated_table)
         query = bq.Query('SELECT * from data',
                          data_sources={'data': federated_table})
         sampling = bq.Sampling.random(count * 100 / float(row_count))
         sample = query.sample(sampling=sampling)
         df = sample.to_dataframe()
     elif strategy == 'LOCAL':
         local_file = self.path
         if self.path.startswith('gs://'):
             local_file = tempfile.mktemp()
             datalab.utils.gcs_copy_file(self.path, local_file)
         with open(local_file) as f:
             row_count = sum(1 for line in f)
         start_row = 1 if skip_header_rows == True else 0
         skip_count = row_count - count - 1 if skip_header_rows == True else row_count - count
         skip = sorted(
             random.sample(xrange(start_row, row_count), skip_count))
         header_row = 0 if skip_header_rows == True else None
         df = pd.read_csv(local_file,
                          skiprows=skip,
                          header=header_row,
                          delimiter=self._delimiter)
         if self.path.startswith('gs://'):
             os.remove(local_file)
     else:
         raise Exception('strategy must be BIGQUERY or LOCAL')
     # Write to target.
     if target.startswith('gs://'):
         with tempfile.NamedTemporaryFile() as f:
             df.to_csv(f, header=False, index=False)
             f.flush()
             datalab.utils.gcs_copy_file(f.name, target)
     else:
         with open(target, 'w') as f:
             df.to_csv(f,
                       header=False,
                       index=False,
                       sep=str(self._delimiter))
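
# Worked example, not from the original source, of the sampling-rate
# arithmetic above: bq.Sampling.random takes a percentage, so the desired
# absolute row count is converted with count * 100 / row_count.
count = 1000
row_count = 2500000
percent = count * 100 / float(row_count)
print(percent)  # 0.04 -> BigQuery samples ~0.04% of rows, i.e. ~1000 rows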
Example #3
    def bq(self, line, cell=None) -> pd.DataFrame:

        # Parse args
        args = parse_argstring(self.bq, line)
        args_dict = dict(args._get_kwargs())
        execute_kwargs = {
            k: args_dict.get(k, getattr(self, k))
            for k in inspect.signature(bq.Query.execute).parameters.keys()
            if k in args_dict or hasattr(self, k)
        }
        to_dataframe_kwargs = {
            k: args_dict.get(k, getattr(self, k))
            for k in inspect.signature(
                bq.QueryResultsTable.to_dataframe).parameters.keys()
            if k in args_dict or hasattr(self, k)
        }

        # Parse code
        code = ' '.join(args.rest + [cell or ''])
        code = code.replace(
            '$', '$$'
        )  # Make '$' safe by assuming no variable references (see bq.Query? for details)

        # Run query
        self._print(args.quiet, 'Running query...')
        start_s = time.time()
        query = bq.Query(code).execute(**execute_kwargs)
        self._print(
            args.quiet,
            '[%.0fs, %s]' % (time.time() - start_s, bq_url_for_query(query)))

        # Fetch results
        self._print(args.quiet, 'Fetching results...')
        start_s = time.time()
        df = query.results.to_dataframe(**to_dataframe_kwargs)
        self._print(args.quiet, '[%.0fs]' % (time.time() - start_s))

        # Store output
        if args.out:
            self.shell.user_ns[args.out] = df

        # Return (maybe)
        if not args.no_return:
            return df.T if args.transpose else df
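
# A minimal standalone sketch, not from the original source, of the
# signature-filtering trick above: keep only the parsed arguments that a
# target function actually accepts.
import inspect

def filter_kwargs(func, candidates):
    # Intersect candidate kwargs with func's signature.
    params = inspect.signature(func).parameters
    return {k: v for k, v in candidates.items() if k in params}

def execute(billing_tier=None, dialect='legacy'):  # stand-in, not the real API
    return billing_tier, dialect

print(filter_kwargs(execute, {'billing_tier': 3, 'max_rows': 10}))
# -> {'billing_tier': 3}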
Example #4
def bqq(sql: str, max_rows=1000, **kwargs) -> pd.DataFrame:
    """
    e.g. bqq('select 42')
    """

    kwargs.setdefault('billing_tier', 3)
    sql = sql.replace(
        '$', '$$'
    )  # Make '$' safe by assuming no variable references (see bq.Query? for details)

    print('Running query...')
    start_s = time.time()
    query = bq.Query(sql).execute(dialect='standard', **kwargs)
    print('[%.0fs, %s]' % (time.time() - start_s, bq_url_for_query(query)))

    print('Fetching results...')
    start_s = time.time()
    df = query.results.to_dataframe(max_rows=max_rows)
    print('[%.0fs]' % (time.time() - start_s))

    return df
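
# Hypothetical usage, assuming bq and bq_url_for_query are provided by the
# surrounding module as in the snippets above:
#   df = bqq('SELECT 42 AS answer', max_rows=10)
# The '$' -> '$$' replacement escapes datalab's variable-reference syntax,
# so a literal '$' in the SQL passes through safely.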
Example #5
def create_query(phase, EVERY_N):
    # Reconstructed header -- this snippet is truncated upstream; base_query
    # is the full SELECT statement shown in Example #6 below.
    base_query = "..."  # elided here; see Example #6 for the full query

    if EVERY_N == None:
        if phase < 2:
            # Training
            query = "{0} AND ABS(HASH(pickup_datetime)) % 4 < 2".format(
                base_query)
        else:
            # Validation
            query = "{0} AND ABS(HASH(pickup_datetime)) % 4 == {1}".format(
                base_query, phase)
    else:
        query = "{0} AND ABS(HASH(pickup_datetime)) % {1} == {2}".format(
            base_query, EVERY_N, phase)

    return query


query = create_query(2, 100000)
df = bq.Query(query).to_dataframe()
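
# The ABS(HASH(pickup_datetime)) % N predicate gives a deterministic,
# repeatable split: a given row always lands in the same bucket, so train
# and validation never overlap, and create_query(2, 100000) keeps only the
# ~1/100000 of rows whose bucket is 2. A minimal local sketch of the idea,
# using hashlib as a stand-in (BigQuery's legacy HASH is a different
# function, so bucket assignments will not match, but the logic is the same):
import hashlib

def hash_bucket(value, num_buckets):
    # Deterministic bucket in [0, num_buckets): same input -> same bucket.
    digest = hashlib.md5(str(value).encode('utf-8')).hexdigest()
    return int(digest, 16) % num_buckets

print(hash_bucket('2015-01-01 12:00:00', 4))  # training if < 2, validation if == 2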


print_rmse(model, 'benchmark', df)

# RMSE on the benchmark dataset is **9.41** (your results will vary because of random seeds).
#
# Not only is this way more than our original benchmark of 6.00, it doesn't even beat our distance-based rule's RMSE of 8.02.
#
# Fear not -- you have learned how to write a TensorFlow model, but not yet how to do all the things needed to make your ML model performant. We will do this in the next chapters. In this chapter, though, we will get our TensorFlow model ready for these improvements.
#
# In a software sense, the rest of the labs in this chapter will be about refactoring the code so that we can improve it.

# ## Challenge Exercise
#
Example #6
def make_prediction_input_fn(df, num_epochs):
  return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = None,
    batch_size = 128,
    num_epochs = num_epochs,
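    # NOTE: shuffle = True here means the yielded predictions will not line
    # up with df's row order; shuffle = False is the usual choice for inference.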
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )
  
  
  
def make_feature_cols():
  input_columns = [tf.feature_column.numeric_column(k) for k in FEATURES]
  return input_columns


tf.logging.set_verbosity(tf.logging.INFO)

OUTDIR = 'taxi_trained'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

model = tf.estimator.LinearRegressor(
      feature_columns = make_feature_cols(), model_dir = OUTDIR)

model.train(input_fn = make_input_fn(df_train, num_epochs = 10))




def print_rmse(model, name, df):
  metrics = model.evaluate(input_fn = make_input_fn(df, 1))
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))
print_rmse(model, 'validation', df_valid)
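
# For these canned estimators, metrics['average_loss'] is the mean squared
# error per example, so np.sqrt of it (as in print_rmse above) is the RMSE.
# A tiny check with made-up numbers, not from the original source:
import numpy as np

preds = np.array([10.0, 12.0, 9.0])
labels = np.array([11.0, 10.0, 9.5])
average_loss = np.mean((preds - labels) ** 2)
print('RMSE =', np.sqrt(average_loss))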




predictions = model.predict(input_fn = make_prediction_input_fn(df_valid, 1))
for i in xrange(5):
  print(predictions.next())
  
  
  
  
# DNN
tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
model = tf.estimator.DNNRegressor(hidden_units = [32, 8, 2],
      feature_columns = make_feature_cols(), model_dir = OUTDIR)
model.train(input_fn = make_input_fn(df_train, num_epochs = 100));
print_rmse(model, 'validation', df_valid)


# benchmark

import datalab.bigquery as bq
import numpy as np
import pandas as pd


def create_query(phase, EVERY_N):
  """
  phase: 1 = train 2 = valid
  """
  base_query = """
SELECT
  (tolls_amount + fare_amount) AS fare_amount,
  CONCAT(STRING(pickup_datetime), STRING(pickup_longitude), STRING(pickup_latitude), STRING(dropoff_latitude), STRING(dropoff_longitude)) AS key,
  DAYOFWEEK(pickup_datetime)*1.0 AS dayofweek,
  HOUR(pickup_datetime)*1.0 AS hourofday,
  pickup_longitude AS pickuplon,
  pickup_latitude AS pickuplat,
  dropoff_longitude AS dropofflon,
  dropoff_latitude AS dropofflat,
  passenger_count*1.0 AS passengers,
FROM
  [nyc-tlc:yellow.trips]
WHERE
  trip_distance > 0
  AND fare_amount >= 2.5
  AND pickup_longitude > -78
  AND pickup_longitude < -70
  AND dropoff_longitude > -78
  AND dropoff_longitude < -70
  AND pickup_latitude > 37
  AND pickup_latitude < 45
  AND dropoff_latitude > 37
  AND dropoff_latitude < 45
  AND passenger_count > 0
  """

  if EVERY_N == None:
    if phase < 2:
      # Training
      query = "{0} AND ABS(HASH(pickup_datetime)) % 4 < 2".format(base_query)
    else:
      # Validation
      query = "{0} AND ABS(HASH(pickup_datetime)) % 4 == {1}".format(base_query, phase)
  else:
    query = "{0} AND ABS(HASH(pickup_datetime)) % {1} == {2}".format(base_query, EVERY_N, phase)
    
  return query

query = create_query(2, 100000)
df = bq.Query(query).to_dataframe()

print_rmse(model, 'benchmark', df)