def _get_gcs_csv_row_count(self, federated_table):
    """Count the rows of a federated table by running a BigQuery query.

    Args:
      federated_table: the data source bound to the name "data" in the
          "SELECT count(*)" query.

    Returns:
      The single value held by the first row of the query results.
    """
    import datalab.bigquery as bq

    results = bq.Query('SELECT count(*) from data',
                       data_sources={'data': federated_table}).results()
    # The first row carries exactly one column (the count). Read it through
    # an iterator instead of indexing: on Python 3 ``values()`` returns a
    # view that cannot be subscripted, while on Python 2 this is equivalent.
    return next(iter(results[0].values()))
def sample_to(self, count, skip_header_rows, strategy, target):
    """Sample rows from GCS or local file and save results to target file.

    Args:
      count: number of rows to sample. If strategy is "BIGQUERY", it is used
          as approximate number. If strategy is "LOCAL" and the source holds
          fewer data rows than count, every data row is kept.
      skip_header_rows: whether to skip first row when reading from source.
      strategy: can be "LOCAL" or "BIGQUERY". If local, the sampling happens
          in local memory, and number of resulting rows matches count. If
          BigQuery, sampling is done with BigQuery in cloud, and the number
          of resulting rows will be approximated to count.
      target: The target file path, can be GCS or local path. The output is
          written without a header row, using this object's delimiter.

    Raises:
      Exception if strategy is "BIGQUERY" but source is not a GCS path, or
      if strategy is neither "BIGQUERY" nor "LOCAL".
    """
    # TODO(qimingj) Add unit test

    # Read data from source into DataFrame.
    if strategy == 'BIGQUERY':
        import datalab.bigquery as bq
        if not self.path.startswith('gs://'):
            raise Exception('Cannot use BIGQUERY if data is not in GCS')
        federated_table = self._create_federated_table(skip_header_rows)
        row_count = self._get_gcs_csv_row_count(federated_table)
        query = bq.Query('SELECT * from data',
                         data_sources={'data': federated_table})
        # Sampling takes a percentage. Cap it at 100 when count exceeds the
        # table size, and avoid dividing by zero on an empty table.
        if row_count:
            percent = min(100.0, count * 100 / float(row_count))
        else:
            percent = 100.0
        sample = query.sample(sampling=bq.Sampling.random(percent))
        df = sample.to_dataframe()
    elif strategy == 'LOCAL':
        source_in_gcs = self.path.startswith('gs://')
        local_file = self.path
        if source_in_gcs:
            # mkstemp creates the file atomically, unlike the race-prone
            # mktemp. Only the path is needed, so close the descriptor.
            # NOTE(review): assumes gcs_copy_file overwrites an existing
            # destination file -- confirm.
            fd, local_file = tempfile.mkstemp()
            os.close(fd)
        try:
            if source_in_gcs:
                datalab.utils.gcs_copy_file(self.path, local_file)
            with open(local_file) as f:
                row_count = sum(1 for _ in f)
            # The header line (if any) is never a candidate for skipping and
            # does not count as a data row.
            start_row = 1 if skip_header_rows else 0
            # Clamp at zero so that asking for more rows than exist keeps
            # everything instead of raising from random.sample.
            skip_count = max(0, row_count - start_row - count)
            skip = sorted(
                random.sample(range(start_row, row_count), skip_count))
            header_row = 0 if skip_header_rows else None
            df = pd.read_csv(local_file, skiprows=skip, header=header_row,
                             delimiter=self._delimiter)
        finally:
            # Remove the downloaded copy even when reading fails.
            if source_in_gcs:
                os.remove(local_file)
    else:
        raise Exception('strategy must be BIGQUERY or LOCAL')

    # Write to target. Both destinations use the same CSV options so the
    # output format does not depend on where the file ends up.
    csv_options = dict(header=False, index=False, sep=str(self._delimiter))
    if target.startswith('gs://'):
        # Text mode: to_csv writes str, which a default (binary) temporary
        # file rejects on Python 3.
        with tempfile.NamedTemporaryFile(mode='w') as f:
            df.to_csv(f, **csv_options)
            f.flush()
            datalab.utils.gcs_copy_file(f.name, target)
    else:
        with open(target, 'w') as f:
            df.to_csv(f, **csv_options)
def bq(self, line, cell=None) -> pd.DataFrame:
    """Run the query given on the magic line/cell and return its results.

    Args:
      line: the argument string of the magic; parsed with parse_argstring.
      cell: optional cell body, appended to the query text from ``line``.

    Returns:
      The results as a DataFrame (transposed when ``args.transpose`` is set),
      or None when ``args.no_return`` is set. When ``args.out`` is given, the
      DataFrame is also stored in the shell's user namespace under that name.
    """
    # Parse args
    args = parse_argstring(self.bq, line)
    args_dict = dict(args._get_kwargs())

    def kwargs_for(func):
        # Collect a value for every parameter of ``func`` that was either
        # passed as a magic argument or exists as an attribute on self.
        # The attribute is looked up only when the argument is missing, so
        # an argument with no same-named attribute cannot raise
        # AttributeError.
        return {
            name: args_dict[name] if name in args_dict else getattr(self, name)
            for name in inspect.signature(func).parameters
            if name in args_dict or hasattr(self, name)
        }

    execute_kwargs = kwargs_for(bq.Query.execute)
    to_dataframe_kwargs = kwargs_for(bq.QueryResultsTable.to_dataframe)

    # Parse code
    code = ' '.join(args.rest + [cell or ''])
    # Make '$' safe by assuming no variable references
    # (see bq.Query? for details)
    code = code.replace('$', '$$')

    # Run query
    self._print(args.quiet, 'Running query...')
    start_s = time.time()
    query = bq.Query(code).execute(**execute_kwargs)
    self._print(
        args.quiet,
        '[%.0fs, %s]' % (time.time() - start_s, bq_url_for_query(query)))

    # Fetch results
    self._print(args.quiet, 'Fetching results...')
    start_s = time.time()
    df = query.results.to_dataframe(**to_dataframe_kwargs)
    self._print(args.quiet, '[%.0fs]' % (time.time() - start_s))

    # Store output
    if args.out:
        self.shell.user_ns[args.out] = df

    # Return (maybe)
    if not args.no_return:
        return df.T if args.transpose else df
def bqq(sql: str, max_rows=1000, **kwargs) -> pd.DataFrame:
    """Execute ``sql`` with the standard dialect and return a DataFrame.

    e.g. bqq('select 42')

    Extra keyword arguments are forwarded to ``bq.Query.execute``;
    ``billing_tier`` defaults to 3 when the caller does not supply it.
    At most ``max_rows`` rows are fetched into the DataFrame.
    """
    if 'billing_tier' not in kwargs:
        kwargs['billing_tier'] = 3

    # Make '$' safe by assuming no variable references
    # (see bq.Query? for details)
    escaped_sql = sql.replace('$', '$$')

    print('Running query...')
    run_started = time.time()
    job = bq.Query(escaped_sql).execute(dialect='standard', **kwargs)
    run_elapsed = time.time() - run_started
    print('[%.0fs, %s]' % (run_elapsed, bq_url_for_query(job)))

    print('Fetching results...')
    fetch_started = time.time()
    frame = job.results.to_dataframe(max_rows=max_rows)
    fetch_elapsed = time.time() - fetch_started
    print('[%.0fs]' % fetch_elapsed)

    return frame
# Training query = "{0} AND ABS(HASH(pickup_datetime)) % 4 < 2".format( base_query) else: # Validation query = "{0} AND ABS(HASH(pickup_datetime)) % 4 == {1}".format( base_query, phase) else: query = "{0} AND ABS(HASH(pickup_datetime)) % {1} == {2}".format( base_query, EVERY_N, phase) return query query = create_query(2, 100000) df = bq.Query(query).to_dataframe() # In[ ]: print_rmse(model, 'benchmark', df) # RMSE on benchmark dataset is <b>9.41</b> (your results will vary because of random seeds). # # This is not only way more than our original benchmark of 6.00, but it doesn't even beat our distance-based rule's RMSE of 8.02. # # Fear not -- you have learned how to write a TensorFlow model, but not how to do all the things that you will have to do to make your ML model performant. We will do this in the next chapters. In this chapter though, we will get our TensorFlow model ready for these improvements. # # In a software sense, the rest of the labs in this chapter will be about refactoring the code so that we can improve it. # ## Challenge Exercise #
def make_prediction_input_fn(df, num_epochs):
    """Build a label-free pandas input function for ``model.predict``.

    Args:
      df: DataFrame holding the feature columns.
      num_epochs: number of passes over ``df``.
    """
    # NOTE(review): shuffle=True presumably means predictions do not come
    # back in the row order of df -- confirm this is intended.
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=None,
        batch_size=128,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1000,
        num_threads=1,
    )


def make_feature_cols():
    """Return one numeric feature column per name in FEATURES."""
    return [tf.feature_column.numeric_column(k) for k in FEATURES]


# Linear regression
tf.logging.set_verbosity(tf.logging.INFO)

OUTDIR = 'taxi_trained'
shutil.rmtree(OUTDIR, ignore_errors=True)  # start fresh each time

model = tf.estimator.LinearRegressor(
    feature_columns=make_feature_cols(), model_dir=OUTDIR)

model.train(input_fn=make_input_fn(df_train, num_epochs=10))


def print_rmse(model, name, df):
    """Evaluate ``model`` on ``df`` and print the RMSE labelled with ``name``.

    The RMSE is the square root of the 'average_loss' metric reported by
    ``model.evaluate``.
    """
    metrics = model.evaluate(input_fn=make_input_fn(df, 1))
    print('RMSE on {} dataset = {}'.format(
        name, np.sqrt(metrics['average_loss'])))


print_rmse(model, 'validation', df_valid)

# Show the first five predictions; predict() is consumed as an iterator.
predictions = model.predict(input_fn=make_prediction_input_fn(df_valid, 1))
for _ in range(5):
    print(next(predictions))

# DNN
tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(OUTDIR, ignore_errors=True)  # start fresh each time
model = tf.estimator.DNNRegressor(
    hidden_units=[32, 8, 2],
    feature_columns=make_feature_cols(),
    model_dir=OUTDIR)
model.train(input_fn=make_input_fn(df_train, num_epochs=100))
print_rmse(model, 'validation', df_valid)

# benchmark
import datalab.bigquery as bq
import numpy as np
import pandas as pd


def create_query(phase, EVERY_N):
    """Build the query that selects one split of the taxi trips table.

    Args:
      phase: 1 = train, 2 = valid
      EVERY_N: when None, split on ``ABS(HASH(pickup_datetime)) % 4``
          (values below 2 for training, the value equal to ``phase``
          otherwise); when set, keep only rows whose hash modulo EVERY_N
          equals ``phase``.

    Returns:
      The query string.
    """
    base_query = """
SELECT
  (tolls_amount + fare_amount) AS fare_amount,
  CONCAT(STRING(pickup_datetime), STRING(pickup_longitude), STRING(pickup_latitude), STRING(dropoff_latitude), STRING(dropoff_longitude)) AS key,
  DAYOFWEEK(pickup_datetime)*1.0 AS dayofweek,
  HOUR(pickup_datetime)*1.0 AS hourofday,
  pickup_longitude AS pickuplon,
  pickup_latitude AS pickuplat,
  dropoff_longitude AS dropofflon,
  dropoff_latitude AS dropofflat,
  passenger_count*1.0 AS passengers,
FROM
  [nyc-tlc:yellow.trips]
WHERE
  trip_distance > 0
  AND fare_amount >= 2.5
  AND pickup_longitude > -78
  AND pickup_longitude < -70
  AND dropoff_longitude > -78
  AND dropoff_longitude < -70
  AND pickup_latitude > 37
  AND pickup_latitude < 45
  AND dropoff_latitude > 37
  AND dropoff_latitude < 45
  AND passenger_count > 0
  """

    if EVERY_N is None:
        if phase < 2:
            # Training
            query = "{0} AND ABS(HASH(pickup_datetime)) % 4 < 2".format(
                base_query)
        else:
            # Validation
            query = "{0} AND ABS(HASH(pickup_datetime)) % 4 == {1}".format(
                base_query, phase)
    else:
        query = "{0} AND ABS(HASH(pickup_datetime)) % {1} == {2}".format(
            base_query, EVERY_N, phase)

    return query


query = create_query(2, 100000)
df = bq.Query(query).to_dataframe()
print_rmse(model, 'benchmark', df)