Example #1
  def read_and_validate_rows(self, options):
    json_data = self.generate_data()

    class CompareJson(beam.DoFn, unittest.TestCase):
      def process(self, row):
        country_code = row["country_code"]
        expected = json_data[country_code]

        # Test country (JSON String)
        country_actual = json.loads(row["country"])
        country_expected = json.loads(expected["country"])
        self.assertTrue(country_expected == country_actual)

        # Test stats (JSON String in BigQuery struct)
        for stat, value in row["stats"].items():
          stats_actual = json.loads(value)
          stats_expected = json.loads(expected["stats"][stat])
          self.assertTrue(stats_expected == stats_actual)

        # Test cities (JSON String in BigQuery array of structs)
        for city_row in row["cities"]:
          city = city_row["city"]
          city_name = city_row["city_name"]

          city_actual = json.loads(city)
          city_expected = json.loads(expected["cities"][city_name])
          self.assertTrue(city_expected == city_actual)

        # Test landmarks (JSON String in BigQuery array)
        landmarks_actual = row["landmarks"]
        landmarks_expected = expected["landmarks"]
        for i in range(len(landmarks_actual)):
          l_actual = json.loads(landmarks_actual[i])
          l_expected = json.loads(landmarks_expected[i])
          self.assertTrue(l_expected == l_actual)

    parser = argparse.ArgumentParser()
    parser.add_argument('--read_method')
    parser.add_argument('--query')
    parser.add_argument('--input')

    known_args, pipeline_args = parser.parse_known_args(options)

    method = ReadFromBigQuery.Method.DIRECT_READ if \
      known_args.read_method == "DIRECT_READ" else \
      ReadFromBigQuery.Method.EXPORT

    if known_args.query:
      json_query_data = self.generate_query_data()
      with beam.Pipeline(argv=pipeline_args) as p:
        data = p | 'Read rows' >> ReadFromBigQuery(
            query=known_args.query, method=method, use_standard_sql=True)
        assert_that(data, equal_to(json_query_data))
    else:
      with beam.Pipeline(argv=pipeline_args) as p:
        _ = p | 'Read rows' >> ReadFromBigQuery(
            table=known_args.input,
            method=method,
        ) | 'Validate rows' >> beam.ParDo(CompareJson())
Example #2
def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        query: Text,
        use_bigquery_source: bool = False) -> beam.pvalue.PCollection:
    """Read from BigQuery.

  Args:
    pipeline: beam pipeline.
    query: a BigQuery sql string.
    use_bigquery_source: Whether to use BigQuerySource instead of the
      experimental `ReadFromBigQuery` PTransform.

  Returns:
    PCollection of dict.
  """
    # TODO(b/155441037): Consolidate to ReadFromBigQuery once its performance
    # on dataflow runner is on par with BigQuerySource.
    if use_bigquery_source:
        return (
            pipeline
            | 'ReadFromBigQuerySource' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    return (pipeline
            | 'ReadFromBigQuery' >> ReadFromBigQuery(
                query=query,
                use_standard_sql=True,
                bigquery_job_labels=telemetry_utils.get_labels_dict()))
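A minimal call-site sketch for the helper above, assuming GCP credentials and any required pipeline options are supplied elsewhere; the query string is illustrative only:

import apache_beam as beam

# Hypothetical usage of _ReadFromBigQueryImpl; the query is a placeholder.
with beam.Pipeline() as pipeline:
    rows = _ReadFromBigQueryImpl(
        pipeline,
        query='SELECT 1 AS one',
        use_bigquery_source=False)
    # rows is a PCollection of dicts, e.g. {'one': 1}.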
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Beam'

    schema_id = 'occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
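NoDuplicates is not defined in this snippet. Since the query already keeps only rows with count = 1, a minimal sketch of such a DoFn might simply drop the helper count column before the rows are written back to BigQuery (this is an assumption, not the project's actual implementation):

import apache_beam as beam

# Hypothetical NoDuplicates; the real DoFn is defined elsewhere in the project.
class NoDuplicates(beam.DoFn):
    def process(self, element):
        # Rows arriving here are already unique (HAVING count = 1 in the query);
        # strip the auxiliary 'count' field so the row matches the output schema.
        yield {key: value for key, value in element.items() if key != 'count'}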
Example #4
def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        query: Text,
        project: Optional[Text],
        use_bigquery_source: bool = False) -> beam.pvalue.PCollection:
    """Read from BigQuery.

  Args:
    pipeline: beam pipeline.
    query: a BigQuery sql string.
    project: The ID of the project running this job.
    use_bigquery_source: Whether to use BigQuerySource instead of the
      experimental `ReadFromBigQuery` PTransform.

  Returns:
    PCollection of dict.
  """
    # TODO(b/155441037): Consolidate to ReadFromBigQuery once its performance
    # on dataflow runner is on par with BigQuerySource.
    if use_bigquery_source:
        return (
            pipeline
            | 'ReadFromBigQuerySource' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
    # TODO(b/155441037): Change this to top level import after Beam version is
    # upgraded to 2.21.
    try:
        from apache_beam.io.gcp.bigquery import ReadFromBigQuery  # pylint: disable=import-outside-toplevel,g-import-not-at-top
    except ImportError:
        from apache_beam.io.gcp.bigquery import _ReadFromBigQuery as ReadFromBigQuery  # pylint: disable=import-outside-toplevel,g-import-not-at-top
    return (pipeline
            | 'ReadFromBigQuery' >> ReadFromBigQuery(
                query=query, use_standard_sql=True, project=project))
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Application' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Beam'

    schema_id = 'CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECESION_DATE:DATE, VISA_CLASS:STRING, employer_name:STRING, employer_city:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #6
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='teacher',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Dataflow'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
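ReadFromBigQuery is itself a PTransform, so it can also be applied to the pipeline directly rather than being wrapped in beam.io.Read, as several later examples in this collection do. A minimal sketch using the same query and a placeholder bucket:

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery

# Direct application of ReadFromBigQuery, without the beam.io.Read wrapper.
with beam.Pipeline() as p:
    rows = p | 'Read from BQ' >> ReadFromBigQuery(
        query='SELECT tid, instructor, dept FROM college_staging.Teacher',
        use_standard_sql=True,
        gcs_location='gs://my-bucket/temp')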
def run():
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT * FROM imdb_refined.Writers limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Split Writers' >> beam.ParDo(SplitWriters())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Writers_Beam'
    schema_id = 'tconst:STRING,writers:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #8
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Beam'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** FIXING soc_code ****************************************************
    sql = "SELECT * FROM H_1B_refined.Occupation_fix_date WHERE length(soc_code) != 7 AND soc_code NOT LIKE '%-%' LIMIT 200"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
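FormatSocCode is not shown here. Given that the query selects rows whose soc_code has no dash, a plausible sketch (the exact normalization rule is an assumption) is a DoFn that inserts the dash of the usual NN-NNNN form:

import apache_beam as beam

# Hypothetical FormatSocCode; the project's real transformation is defined elsewhere.
class FormatSocCode(beam.DoFn):
    def process(self, element):
        soc_code = element.get('soc_code') or ''
        if '-' not in soc_code and len(soc_code) >= 6:
            # e.g. '151132' -> '15-1132'
            element['soc_code'] = soc_code[:2] + '-' + soc_code[2:]
        yield element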
Example #10
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Beam'

    schema_id = 'employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def compute_stats(
    input_handle,
    stats_path,
    max_rows=None,
    for_eval=False,
    pipeline_args=None,
    publish_to_bq=None,
    metrics_dataset=None,
    metrics_table=None,
    project=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery.
    for_eval: Whether to query for the eval set rows from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: Whether to publish metrics about the run to BigQuery.
    metrics_dataset: BigQuery dataset that receives the published metrics.
    metrics_table: BigQuery table that receives the published metrics.
    project: The GCP project used when publishing metrics.
  """
  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace),
    )

  query = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(namespace))
      | 'ConvertToTFDVInput' >> beam.Map(
          lambda x:
          {key: np.asarray([x[key]])
           for key in x if x[key] is not None}))

  _ = (
      raw_data
      | 'GenerateStatistics' >> tfdv.GenerateStatistics()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(namespace))
      | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
          stats_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList)))
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
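MeasureTime is imported from the project's utilities (it mirrors the DoFn of the same name in Beam's load-test metrics helpers). A rough stand-in, assuming it records bundle start and end times as a distribution metric under the given namespace:

import time

import apache_beam as beam
from apache_beam.metrics import Metrics

# Hypothetical stand-in for MeasureTime; the original lives outside this snippet.
class MeasureTime(beam.DoFn):
  def __init__(self, namespace):
    self.namespace = namespace
    self.runtime = Metrics.distribution(namespace, 'runtime')

  def start_bundle(self):
    self.runtime.update(time.time())

  def finish_bundle(self):
    self.runtime.update(time.time())

  def process(self, element):
    yield element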
Example #12
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='class',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())

    takes_pcoll | 'Log takes output' >> WriteToText(DIR_PATH +
                                                    'takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Dataflow'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'

    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())

    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()

    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        DIR_PATH + 'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())

    unique_class_pcoll | 'Log class unique' >> WriteToText(
        DIR_PATH + 'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Dataflow'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'

    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
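MakeTakes, MakeClass and MakeUniqueClass are not included in this snippet. Because GroupByKey expects (key, value) pairs, a hedged sketch of what MakeClass might emit is shown below, keying each class row by course number (the real DoFn may differ):

import apache_beam as beam

# Hypothetical MakeClass; the project's actual DoFn is defined elsewhere.
class MakeClass(beam.DoFn):
    def process(self, element):
        # Key by course number so the following GroupByKey can collapse
        # duplicate class rows before MakeUniqueClass picks one per course.
        yield element['cno'], (element['cname'], element['credits'])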
Example #13
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())

    takes_pcoll | 'Log takes output' >> WriteToText('takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Beam'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'

    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())

    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()

    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())

    unique_class_pcoll | 'Log class unique' >> WriteToText(
        'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Beam'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'

    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #14
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # use DataflowRunner instead of DirectRunner
    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='imdbcharacter',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # retrieve the data from imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Characters'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery
    # query results is now input P collection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split characters' >> beam.ParDo(
        SplitCharacters())

    # write the results into text file
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Characters_Dataflow'
    schema_id = 'tconst:STRING,nconst:STRING,characters:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='occupation',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)
    '''
     sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, * FROM (SELECT job_title, emp.employer_id AS employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) AS occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city) AS t WHERE length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8" '''

    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, job_title, employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM(SELECT job_title, emp.employer_id AS employer_id, occ.employer_name, occ.employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM H_1B_refined.Occupation as occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city WHERE prevailing_wage_YR > 5000 AND length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8) GROUP BY job_title, employer_id, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) as t"

    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)
    '''
     out_pcoll_fix_date = query_results | 'Format Date' >> beam.ParDo(FormatDate())
        
     out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText(DIR_PATH + 'output_occ_fix_date.txt')
     '''
    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Dataflow'

    schema_id = 'job_id:INTEGER, job_title:STRING, employer_id:INTEGER, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) LIMIT 50"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll_no_dup = query_results | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo(
        NoDuplicates())

    out_pcoll_no_dup | 'Log no_dup_output' >> WriteToText(
        'output_occ_no_dup.txt')

    out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo(
        FormatDate())

    out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText(
        'output_occ_fix_date.txt')

    out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo(
        FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='application',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS case_id, * FROM (SELECT emp.employer_id, CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE AS DECISION_DATE, VISA_CLASS FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) AS app JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = app.employer_name AND emp.employer_city = app.employer_city) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Application Transformation' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH +
                                            'output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Dataflow'

    schema_id = 'case_id:INTEGER, employer_id:INTEGER, CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECISION_DATE:DATE, VISA_CLASS:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #18
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='employer',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM (SELECT employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1)) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'

    schema_id = 'employer_id:INTEGER, employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #19
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='ownership',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) as ownership_id, * FROM (SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1)) as  t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Dataflow'

    schema_id = 'ownership_id:INTEGER, occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #20
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # executed with DirectRunner
    p = beam.Pipeline('DirectRunner', options=opts)

    # retrieve the data from imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Primary_Professions limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery
    # query results is now input P collection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_table',
                        required=True,
                        help='Input table to process.')
    parser.add_argument('--num_records',
                        required=True,
                        help='The expected number of records',
                        type=int)
    parser.add_argument('--num_slow',
                        default=0,
                        help=('Percentage of rows that will be slow. '
                              'Must be in the range [0, 100)'))
    parser.add_argument('--beam_bq_source',
                        default=False,
                        type=bool,
                        help=('Whether to use the new ReadFromBigQuery'
                              ' transform, or the BigQuerySource.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    with TestPipeline(options=options) as p:
        if known_args.beam_bq_source:
            reader = ReadFromBigQuery(
                table='%s:%s' % (options.view_as(GoogleCloudOptions).project,
                                 known_args.input_table))
        else:
            reader = beam.io.Read(
                beam.io.BigQuerySource(known_args.input_table))

        # pylint: disable=expression-not-assigned
        count = (p | 'read' >> reader
                 | 'row to string' >> beam.ParDo(RowToStringWithSlowDown(),
                                                 num_slow=known_args.num_slow)
                 | 'count' >> beam.combiners.Count.Globally())

        assert_that(count, equal_to([known_args.num_records]))
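A hedged sketch of invoking this entry point programmatically; the table name, record count, and GCP settings are placeholders (the flags mirror the argparse definitions and standard Beam pipeline options above):

# Hypothetical invocation; all values are placeholders.
run(argv=[
    '--input_table=my_dataset.my_table',
    '--num_records=1000',
    '--project=my-project',
    '--temp_location=gs://my-bucket/temp',
])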
Example #22
def run():
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # use DataflowRunner instead of DirectRunner
    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='imdbwriters',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT * FROM imdb_refined.Writers'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Split Writers' >> beam.ParDo(SplitWriters())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Writers_Dataflow'
    schema_id = 'tconst:STRING,writers:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #23
    def process(self, lines):
        with self.gcsio().open(f'{self.gcs_path}/index.csv',
                               'w',
                               mime_type='text/csv') as fp:
            fp.write(lines.encode())


job_name = f"reviewr-automl--{datetime.utcnow().strftime('%Y%m%d-%H%I%S')}"
gcs_path = f'{GCS_DESTINATION}/{job_name}'
pipeline_options = PipelineOptions(project=PROJECT_ID,
                                   region=DATAFLOW_REGION,
                                   job_name=job_name,
                                   temp_location=f'{gcs_path}/temp')

p = beam.Pipeline(runner=RUNNER, options=pipeline_options)
bq_row = p | 'ReadFromBigQuery' >> ReadFromBigQuery(
    query=
    f"SELECT * FROM `{BQ_SOURCE}`{' LIMIT 10' if RUNNER == 'DirectRunner' else ''}",
    project=PROJECT_ID,
    use_standard_sql=True,
    gcs_location=f'{gcs_path}/temp')

bq_row | 'WriteExampleFile' >> beam.ParDo(WriteExampleFile(gcs_path))

bq_row | 'CreateLine' >> beam.ParDo(CreateLine(gcs_path))\
    | 'CombineLines' >> beam.CombineGlobally(lambda lines: '\n'.join(lines))\
    | 'WriteIndexFile' >> beam.ParDo(WriteIndexFile(gcs_path))

p.run()
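Only the process method of the index-writing DoFn appears above. A sketch of how the enclosing class might be structured, assuming gcs_path is passed in at construction time and gcsio() simply returns a GcsIO client:

import apache_beam as beam
from apache_beam.io.gcp import gcsio


# Hypothetical skeleton for WriteIndexFile; only process() is shown in the snippet above.
class WriteIndexFile(beam.DoFn):
    def __init__(self, gcs_path):
        self.gcs_path = gcs_path

    def gcsio(self):
        return gcsio.GcsIO()

    def process(self, lines):
        with self.gcsio().open(f'{self.gcs_path}/index.csv',
                               'w',
                               mime_type='text/csv') as fp:
            fp.write(lines.encode())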
Example #24
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
    publish_to_bq: Whether to publish metrics about the run to BigQuery.
    project: The GCP project used when publishing metrics.
    metrics_dataset: BigQuery dataset that receives the published metrics.
    metrics_table: BigQuery table that receives the published metrics.

  Raises:
    ValueError: if big_query_table is not specified.
  """

  if big_query_table is None:
    raise ValueError(
        '--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace)
    )

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(query=query, project=project,
                                           use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(lambda x: (
          taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)

  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)
  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors,
          evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(metrics_namespace))
  )
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
Example #25
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: Whether to publish metrics about the run to BigQuery.
    project: The GCP project used when publishing metrics.
    metrics_table: BigQuery table that receives the published metrics.
    metrics_dataset: BigQuery dataset that receives the published metrics.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    namespace = metrics_table
    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=MetricsFilter().with_namespace(namespace))
    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    pipeline = beam.Pipeline(argv=pipeline_args)
    with tft_beam.Context(temp_dir=working_dir):
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> ReadFromBigQuery(
                query=query, project=project, use_standard_sql=True)
            | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
        decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                    raw_feature_spec=raw_feature_spec)

        if transform_dir is None:
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata) |
                ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn |
                ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
        else:
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

        # Shuffling the data before materialization will improve Training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed to
        # decoded data) since it has a compact representation.
        shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
        )

        decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
        (transformed_data, transformed_metadata) = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(working_dir, outfile_prefix),
                 file_name_suffix='.gz'))
    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
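_fill_in_missing is not included in this snippet; in the TFX taxi example it densifies a rank-2 SparseTensor and fills missing values with a type-appropriate default. A sketch along those lines (treat the exact implementation as an assumption):

import tensorflow as tf


# Hypothetical _fill_in_missing, modeled on the TFX taxi utilities.
def _fill_in_missing(x):
    """Converts a SparseTensor of shape [batch, 1] to a dense 1-D tensor,
    filling missing values with '' for strings and 0 otherwise."""
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)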