Example #1
 def test(self):
     output = (
         self.pipeline
         | 'Read from BigQuery' >> Read(
             BigQuerySource(dataset=self.input_dataset,
                            table=self.input_table))
         | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
         | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
         | 'Count' >> Count.Globally())
     assert_that(output, equal_to([self.input_options['num_records']]))
Example #2
def run(argv=None):
    known_args, options = pipeline_options(argv)
    with beam.Pipeline(options=options) as p:
        lines = p | "Read BigQuery" >> beam.io.Read(
            BigQuerySource(
                query=
                "SELECT * FROM `bigquery-public-data.hacker_news.comments` LIMIT 100",
                use_standard_sql=True,
            ))
        with_author = (
            lines | "Add Author" >> beam.Map(lambda row: (row["author"], row))
            | "Group By Autho" >> beam.GroupByKey())
        author_jsons = with_author | "Convert Row To JSON" >> beam.Map(
            row_to_json)
        author_jsons | beam.ParDo(OutputToFile(known_args.output))
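The snippet above assumes a row_to_json helper and an OutputToFile DoFn that are not shown. A minimal sketch of what they might look like (hypothetical, not the original implementation):

import json

import apache_beam as beam


def row_to_json(element):
    # element is an (author, rows) pair produced by the GroupByKey above.
    author, rows = element
    return json.dumps({"author": author, "comments": list(rows)}, default=str)


class OutputToFile(beam.DoFn):
    """Hypothetical sink DoFn: appends each serialized record to a local file."""

    def __init__(self, output_path):
        self._output_path = output_path

    def process(self, element):
        with open(self._output_path, "a") as handle:
            handle.write(element + "\n")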
Example #3
def run(args=None):
  pipeline_options = PipelineOptions(args)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  normalize_options = pipeline_options.view_as(NormalizeOptions)
  gcp_options = pipeline_options.view_as(GoogleCloudOptions)

  d1, d2 = parse_date_range(normalize_options.date_range)
  helper = QueryHelper(table=normalize_options.source_table, first_date_ts=d1, last_date_ts=d2)
  select_fields = ['mmsi', 'timestamp', 'seg_id', 'shipname', 'callsign', 'imo']
  where_sql = 'shipname is not null or callsign is not null or imo is not null'
  if normalize_options.mmsi_quotient > 1:
      where_sql = "hash(mmsi) % {} = 0 and ({})".format(normalize_options.mmsi_quotient, where_sql)

  source_schema = helper.filter_table_schema(select_fields)
  source = BigQuerySource(query=helper.build_query(include_fields=select_fields, where_sql=where_sql))

  dest_schema = TableSchema(fields=source_schema.fields)
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_SHIPNAME, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_CALLSIGN, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=VALID_IMO, type='INTEGER'))

  pipeline = beam.Pipeline(options=pipeline_options)
  (
      pipeline
      | "ReadSource" >> ReadAsJSONDict(source)
      | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
      | "AddTimestamp" >> beam.ParDo(TimestampedValueDoFn())
      | "NormalizeNames" >> beam.ParDo(NormalizeNamesDoFn())
      | "WriteDest" >> WriteToBigQueryDatePartitioned(
          temp_gcs_location=gcp_options.temp_location,
          table=normalize_options.dest_table,
          schema=dest_schema,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
  )

  result = pipeline.run()
  success_states = set([PipelineState.DONE])

  if normalize_options.wait:
    result.wait_until_finish()
  else:
    success_states.add(PipelineState.RUNNING)

  return 0 if result.state in success_states else 1
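TimestampedValueDoFn and the other DoFns above come from the surrounding project and are not shown here. A DoFn like that typically just re-emits each record wrapped in a TimestampedValue so downstream windowing uses the record's own timestamp as event time; a sketch under that assumption (not the original code):

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue


class TimestampedValueDoFn(beam.DoFn):
    def process(self, element):
        # Assumes 'timestamp' was already parsed to epoch seconds upstream.
        yield TimestampedValue(element, element['timestamp'])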
Example #4
def run():

    PROJECT_ID = 'data-lake-290221'
    BUCKET = 'gs://dataflow-log-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DirectRunner',
                              project=PROJECT_ID,
                              job_name='transpose',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    options.view_as(SetupOptions).save_main_session = True

    p = beam.pipeline.Pipeline(options=options)

    sql = '''select farm_fingerprint(concat(cast(latitude as string), cast(longitude as string))) as location_id, * from covid19_confirmed.raw_cases'''

    #bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)
    bq_source = BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Transpose' >> beam.ParDo(Transpose())

    #out_pcoll | 'Write to log' >> WriteToText('records.txt')

    dataset_id = 'covid19_confirmed'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'daily_cases'
    schema_id = 'location_id:INTEGER,date:DATE,cases:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
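The Transpose DoFn is defined elsewhere; given the sink schema 'location_id:INTEGER,date:DATE,cases:INTEGER', it presumably unpivots the per-date columns of each raw row into one record per date. A rough sketch under that assumption (the column names are hypothetical, not the original implementation):

import apache_beam as beam


class Transpose(beam.DoFn):
    # Columns of the raw table that are not daily case counts (assumed names).
    NON_DATE_COLUMNS = {'location_id', 'province_state', 'country_region',
                        'latitude', 'longitude'}

    def process(self, element):
        location_id = element['location_id']
        for column, value in element.items():
            if column in self.NON_DATE_COLUMNS or value is None:
                continue
            # In practice the raw column name would need reformatting to a
            # proper YYYY-MM-DD string before being written to the DATE field.
            yield {'location_id': location_id, 'date': column, 'cases': int(value)}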
Example #5
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--date", required=True, type=str, help="")
    parser.add_argument("--output",
                        required=True,
                        type=str,
                        help="PROJECT:DATASET.TABLE")
    known_args, pipeline_args = parser.parse_known_args(argv)

    # file_path = os.path.join(known_args.output, known_args.date, "user", known_args.date+"-user")
    table_name = known_args.output + "$" + re.sub("-", "", known_args.date)

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        query_of_user = get_query('user', date=known_args.date)

        # CSV FORMAT
        # result = (p
        # | "ReadFromBQ" >> beam.io.Read(BigQuerySource(query=query_of_user, use_standard_sql=True))
        # | "Projected" >> beam.ParDo(ProjectionBQ(), PROJECT_FIELDS_USER)
        # | "Format" >> beam.ParDo(FormatAsCSV(), COLUMNS, False)
        # | "Write" >> WriteToText(file_path, ".csv", shard_name_template="-SS", header=HEADERS))

        # BIGQUERY FORMAT
        result = (
            p
            | "ReadFromBQ" >> beam.io.Read(
                BigQuerySource(query=query_of_user, use_standard_sql=True))
            | "Projected" >> beam.ParDo(ProjectionBQ(), PROJECT_FIELDS_USER)
            | "Write" >> beam.io.Write(
                BigQuerySink(table_name,
                             schema=SCHEMA,
                             create_disposition=beam.io.BigQueryDisposition.
                             CREATE_IF_NEEDED,
                             write_disposition=beam.io.BigQueryDisposition.
                             WRITE_APPEND)))
Example #6
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to
            process. Used in unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= (
        "Normalise comments" >> beam.Map(
            partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(
            lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(create_examples,
                    parent_depth=args.parent_depth,
                    min_length=args.min_length,
                    format=args.dataset_format,
                    )))
    examples = _shuffle(examples)

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < (100 - args.train_split*100) < 100, 'eval_percent must be in the range (0-100)'
    eval_percent = 100 - args.train_split*100
    train_dataset, eval_dataset = (
        examples
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    serialized_train_examples = train_dataset | (
        "serialize {} examples".format('train') >> beam.Map(serialize_fn))
    (
        serialized_train_examples | ("write " + 'train')
        >> write_sink(
            os.path.join(args.output_dir, 'train'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    serialized_test_examples = eval_dataset | (
        "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
    (
        serialized_test_examples | ("write " + 'valid')
        >> write_sink(
            os.path.join(args.output_dir, 'valid'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    result = p.run()
    result.wait_until_finish()
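The _shuffle helper is not shown above; a common way to implement it (an assumption, not the original code) is to key every element by a random value, group, and flatten back out:

import random

import apache_beam as beam


def _shuffle(pcollection):
    return (
        pcollection
        | "add random key" >> beam.Map(lambda value: (random.random(), value))
        | "group by random key" >> beam.GroupByKey()
        | "drop the key" >> beam.FlatMap(lambda key_and_values: key_and_values[1]))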
Example #7
import logging

import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)
from google.cloud import firestore

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query=
    """SELECT business_id, count(checkin) as checkins FROM `yelphelp.YearlyData.Checkin` group by business_id""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        element['checkins'] = str(element['checkins'])
        return [(document_id, element)]


class FirestoreWriteDoFn(beam.DoFn):
    MAX_DOCUMENTS = 200

    def start_bundle(self):
        self._records = []
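
    # NOTE: the original class is cut off after start_bundle. The methods below
    # are a hypothetical completion of the usual batching pattern and are not
    # part of the original source; the 'business' collection name is an assumption.
    def process(self, element):
        # Assumes the pipeline feeds (document_id, data) pairs, e.g. from CreateEntities.
        self._records.append(element)
        if len(self._records) >= self.MAX_DOCUMENTS:
            self._flush_batch()

    def finish_bundle(self):
        if self._records:
            self._flush_batch()

    def _flush_batch(self):
        db = firestore.Client(project=PROJECT)
        batch = db.batch()
        for document_id, data in self._records:
            batch.set(db.collection('business').document(document_id), data)
        batch.commit()
        self._records = []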
Example #8
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_WorkProgramRisk"
)


source = BigQuerySource(query="SELECT distinct CORE_ID_WorkProgramId as CORE_ID_WorkProgramId, null as CORE_ID_RiskId, CORE_LB_WorkProgramCode as CORE_LB_WorkProgramCode, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_WorkProgram` wrk on ref.Code = wrk.CORE_LB_WorkProgramCode", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
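ElementCleanup appears in every one of these snippets but is never shown. A minimal hypothetical sketch of such a DoFn, assuming its job is to tidy each BigQuery row dict before it is written back out (not the original implementation):

import apache_beam as beam


class ElementCleanup(beam.DoFn):
    def process(self, element):
        cleaned = {}
        for key, value in element.items():
            if isinstance(value, str):
                value = value.strip() or None  # turn empty strings into NULLs
            cleaned[key] = value
        yield cleaned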
Example #9
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="Missions")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_AuditProcess")

source = BigQuerySource(
    query=
    "SELECT distinct 1 as CORE_ID_AuditId, CORE_ID_ProcessId as CORE_ID_ProcessId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Missions` miss left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` process on cast(miss.AuditedProcesses as INT64) = process.CORE_ID_ProcessSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #10
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_FindingsExportImport")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_FindingDivision")

source = BigQuerySource(
    query=
    "SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, div.CORE_ID_DivisionId as CORE_ID_DivisionId, id as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp LEFT OUTER JOIN   `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin ON   CAST(imp.Id AS int64) = fin.CORE_ID_FindingSourceId LEFT OUTER JOIN  `studied-client-307710.SMT_DWH.SMT_REF_Division` div ON CAST(imp.CS_ConcernedDivisions AS int64) = div.CORE_ID_DivisionSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #11
import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)
from google.cloud import firestore

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query="""SELECT business_id, ARRAY_AGG(t.text) as tips FROM 
(SELECT `yelphelp.YearlyData.Tip`.*, ROW_NUMBER() OVER (PARTITION BY business_id ORDER BY compliment_count, date DESC)
 AS seqnum FROM `yelphelp.YearlyData.Tip` ) t
where seqnum <= 3 group by business_id""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        return [(document_id, element)]


class FirestoreWriteDoFn(beam.DoFn):
    MAX_DOCUMENTS = 200

    def start_bundle(self):
        self._records = []
Example #12
import datetime

import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query=
    """SELECT R.business_id, b.name, b.categories, b.city, b.state, sum(R.useful) as useful,\
 sum(R.funny) as funny, sum(R.cool) as cool, avg(R.stars) as avg_stars, count(R.stars) as num_stars \
 FROM `yelphelp.YearlyData.Review` as R LEFT JOIN `yelphelp.YearlyData.Business` as B ON \
  R.business_id = B.business_id group by business_id, name, categories, state, city""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        element['name'] = str(element['name'])
        element['categories'] = str(element['categories'])
        element['city'] = str(element['city'])
        element['state'] = str(element['state'])
        element['cool'] = int(element['cool'])
        element['funny'] = int(element['funny'])
        element['useful'] = int(element['useful'])
Example #13
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="CS_FindingsExportImport"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_FindingConcernedFunction"
)


source = BigQuerySource(query="SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, con.CORE_ID_ConcernedFunctionId as CORE_ID_ConcernedFunctionId, id as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp left outer join `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin on cast(imp.Id as int64) = fin.CORE_ID_FindingSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_ConcernedFunction` con on cast(imp.CS_ConcernedFunc as int64) = con.CORE_ID_ConcernedFunctionSourceId ", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #14
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_DWH",
                                            tableId="Processes")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Process")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by NameFR) as CORE_ID_ProcessId, Parent as  CORE_LB_ProcessParentReference, Level as  CORE_LB_ProcessLevel, Ref as CORE_LB_ProcessReference, `Order` as CORE_LB_Order, Active as CORE_LB_Active, NameEN as CORE_LB_ProcessNameEN, NameFR as CORE_LB_ProcessNameFR, risk.CORE_ID_RiskId as CORE_ID_RiskId, DefEN as CORE_LB_ProcessDefEN, DefFR as CORE_LB_ProcessDefFR, Graph as CORE_LB_Graph, Id as CORE_ID_ProcessSourceId, LastMod as CORE_DT_LastModif, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Processes` proc left outer join `studied-client-307710.SMT_DWH.SMT_REF_Risk` risk on cast(proc.CS_Risk as INT64) = risk.CORE_ID_RiskSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #15
 def test(self):
     self.result = (self.pipeline
                    | 'Read from BigQuery' >> Read(
                        BigQuerySource(dataset=self.input_dataset,
                                       table=self.input_table))
                    | 'Count' >> Count.Globally())
Example #16
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="MissionTypes"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_AuditType"
)


source = BigQuerySource(query="SELECT ROW_NUMBER() over(order by Code) as CORE_ID_AuditTypeId, Code as CORE_LB_AuditTypeCode, NameEN as CORE_LB_AuditTypeNameEN, NameFR as CORE_LB_AuditTypeNameFR, Active as CORE_LB_Active, `Order` as CORE_LB_Order, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, TrackingDisplay as CORE_LB_TrackingDisplay, Id as CORE_ID_AuditTypeSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.MissionTypes`", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #17
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="CS_FindingsExportImport"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_FindingRisk"
)


source = BigQuerySource(query="SELECT distinct id as CORE_ID_FindingId, -1 as CORE_ID_RiskId, -1 as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` ", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #18
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="Referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_WorkProgram"
)


source = BigQuerySource(query="SELECT ROW_NUMBER() over(order by CS_WFLevel)as CORE_ID_WorkProgramId, CS_WFLevel as CORE_LB_WFLevel, Parent as CORE_LB_WorkProgramParentCode, MasterAuditReferential as CORE_LB_MasterAuditReferential, CS_AuditReferenceDataLink as CORE_LB_CS_AuditReferenceDataLink, Level as CORE_LB_WorkProgramLevel, OriginLevel as CORE_LB_ReferentieOriginLevel, Code as CORE_LB_WorkProgramCode, OrderCode as CORE_LB_Order, ProcessOrigin as CORE_LB_ProcessOrigin, CORE_ID_ProcessId as CORE_ID_ProcessId, AuditPointOrigin as CORE_LB_AuditPoINT64Origin, CS_ControlPoint as CORE_LB_ControlPoINT64, null as CORE_LB_WorkProgramNameEN, null as CORE_LB_Context, CS_CSNewItem as CORE_LB_CSNewITem, CS_Progress as CORE_LB_CS_Progress, KPCount as CORE_FL_KPCount, RecoCount as CORE_FL_RecoCount, null as CORE_LB_WFICone, null as CORE_LB_SynthesisMessage, null as CORE_LB_DirectoryWorkingPManagers, DirectoryWorkingPAuditors as CORE_LB_DirectoryWorkingPAuditors, CChoiceList as CORE_LB_CChoiceList, CAnswer as CORE_LB_CAnswer, AnswerAttachedFile as CORE_LB_AnswerAttachedFile, CS_AssessControl as CORE_LB_AssessControl, CS_SubProcessAssessment as CORE_LB_SubProcessAssessment, LevelNo as CORE_LB_LevelNo, Hyperlink as CORE_LB_HyperLink, CS_Hyperlink2 as CORE_LB_HyperLink2, CS_Hyperlink3 as CORE_LB_HyperLink3, CS_Hyperlink4 as CORE_LB_HyperLink4, CS_CopyOfAuditGuide as CORE_LB_CopyOfAuditGuide, MissionManagers as CORE_LB_MissionManagers, MissionAuditTeam as CORE_LB_MissionAuditTeam, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, null as CORE_LB_TrackingDisplay, null as CORE_LB_Source, null as CORE_LB_OfflineRisks, null as CORE_LB_CopyOfTitle, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` process on ref.Domain = cast(process.CORE_ID_ProcessSourceId as STRING)", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #19
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="Plans")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_AuditPlan")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by CS_ReferencePlan) as CORE_ID_AuditPlanId, CS_ReferencePlan as CORE_LB_AuditPlanReference, Title as CORE_LB_AuditPlanTitle, Status as CORE_LB_AuditPlanStatus, Archived as CORE_LB_Archived, ArchivingDate as CORE_DT_ArchivedDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORE_LB_Reason, AuditedSites as CORE_LB_AuditedSites, Services as CORE_LB_Services, CS_Objective as CORE_LB_Objective, AttachedFile as CORE_LB_AttachedFile, NbMissions as CORE_FL_NbMissions, StartDate as CORE_DT_StartDate, EndDate as CORE_DT_EndDate, Id as CORE_ID_AuditPlanSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Plans`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #20
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_ConcernedFunctions")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_ConcernedFunction")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by CS_Order) as CORE_ID_ConcernedFunctionId, CS_Order as CORE_LB_ConcernedFunctionOrder, CS_ConcernedFunctionsEN as CORE_LB_ConcernedFunctionNameEN, CS_ConcernedFunctionsFR as CORE_LB_ConcernedFunctionNameFR, CS_Id as CORE_ID_ConcernedFunctionSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_ConcernedFunctions`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #21
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="Missions"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_Audit"
)


source = BigQuerySource(query="SELECT ROW_NUMBER() over(order by CS_Ref) as CORE_ID_AuditId, CS_Ref as CORE_LB_AuditReference, Title as CORE_LB_AuditTitle, CS_CopyOfStatus as CORE_LB_AuditStatus, plan.CORE_ID_AuditPlanId as CORE_ID_AuditPlanId, CS_PreviousAudit as CORE_LB_PreviousAudit, CS_ReferenceAudits as CORE_LB_ReferenceAudits, LinkTypology as CORE_LB_LinkTypolygy, MissionTypology as CORE_LB_MissionTypolygy, CS_Cycle as CORE_LB_Cycle, Language as CORE_LB_Language, Archived as CORE_LB_Archived, ArchivingDate as CORE_DT_ArchivingDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORN_LB_Reason, Services as CORE_LB_Services, zone.CORE_Id_ZoneId as CORE_ID_ZoneId, CS_CAnet as CORE_FL_CANet, CS_UnitsSold as CORE_FL_UnitsSold, CS_UnitsProduced as CORE_FL_UnitsProduced, CS_REXResultatDexploitation as CORE_FL_REXResultatDexploitation, CS_BAI as CORE_FL_BAI, CS_StatutoryEmployees as CORE_FL_StatutoryEmployees, CS_TotalEmployeesFTE as CORE_FL_TotalEmployeesFTE, CS_OverdueValue as CORE_FL_OverdueValue, CS_BadDebtProvisionValue as CORE_FL_BadDebtProvisionValue, CS_ReturnsValue as CORE_FL_ReturnsValue, CS_ServiceRateDivPerc as CORE_FL_ServiceRateDivPerc, CS_StockValue as CORE_FL_StockValue, CS_DestructionValue as CORE_FL_DestructionValue, CS_InfluencersValue as CORE_FL_InfluencersValue, CS_NbNonStatutoryEmployees as CORE_FL_NbNonStatutoryEmployees, Planned as CORE_LB_Planned, Criticity as CORE_LB_Criticity, typ.CORE_ID_AuditTypeId as CORE_ID_AuditTypeId, CS_CurrMissPha as CORE_LB_CurrMissPha, Initiator as CORE_LB_Initiator, Objective as CORT_LB_Objective, AuditContext as CORE_LB_AuditContext, ExNihilo as CORE_LB_ExNihilo, ToDuplicate as CORE_LB_ToDuplicate, RefCreate as CORE_LB_RefCreate, ActualStartDate as CORE_DT_ActualStartDate, ActualEndDate as CORE_DT_ActualEndDate, Agenda as CORE_LB_Agenda, null as CORE_FL_EntityDAF, CS_EntityICM as CORE_LB_EntityICMCode, InternalRN as CORE_FL_INT64ernalRN, ExternalRN as CORE_FL_ExternalRN, null as CORE_LB_CheckAvailabilityTA, CS_RecoTransDate as CORE_DT_RecoTransDate, CS_RecoAccepDate as CORE_DT_RecoAccepDate, CS_approvReco as CORE_FL_ApprovReco, CS_APValidated as CORE_LB_APValidated, CS_APValDate as CORE_DT_APValDate, CS_RepIssDate as CORE_DT_RepissDate, CS_ActionPlanClosed as CORE_LB_ActionPlanClosed, MissionManager as CORE_LB_MissionManager, GlobalMark as CORE_LB_GlobalMark, LabelMark as CORE_LB_LabelMark, null as CORE_LB_MarkDescription, null as CORE_LB_Conclusion, null as CORE_LB_Improve, null as CORE_LB_CopyOfSynthesis, null as CORE_LB_RATitle, null as CORE_LB_RASubTitle, null as CORE_DT_ARDate, null as CORE_LB_FinalAuditReportCode, null as CORE_DT_FinalAuditEmi, null as CORE_ID_MissionSourceId, null as CORE_FL_DurationReal, null as CORE_FL_DurationPlan, null as CORE_DT_LastUpdateReceived, null as CORE_DT_LastModif, null as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Missions` miss left outer join `studied-client-307710.SMT_DWH.SMT_REF_AuditPlan` plan on cast(miss.PlanCode as INT64) = plan.CORE_ID_AuditPlanSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_AuditType` typ on cast(miss.Type as INT64) = typ.CORE_ID_AuditTypeSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_Zone` zone on cast(miss.CS_ZoneScope as INT64) = zone.CORE_ID_ZoneSourceId", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #22
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="Recommendations"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_FindingRecommendation"
)


source = BigQuerySource(query="SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, recom.CORE_ID_RecommendationId as CORE_ID_RecommendationId, id as CORE_ID_RecommendationSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Recommendations` rec left outer join `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin on rec.AssociatedKeyPoint = cast(fin.CORE_ID_FindingSourceId as string) left outer join `studied-client-307710.SMT_DWH.SMT_REF_Recommendation` recom on rec.id = cast(recom.CORE_ID_RecommendationSourceId as string)", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #23
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_WorkProgramAudit"
)


source = BigQuerySource(query="SELECT distinct ROW_NUMBER() over(order by Parent)as CORE_ID_WorkProgramAuditId, Parent as CORE_LB_WorkProgramParentCode, Level as CORE_LB_WorkProgramLevel, Code as CORE_LB_WorkProgramCode, CORE_ID_WorkProgramId as CORE_ID_WorkProgramId, null as CORE_ID_AuditId, CS_Progress as CORE_LB_CS_Progress, CS_AssessControl as CORE_LB_AssessControl, CS_SubProcessAssessment as CORE_LB_SubProcessAssessment, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, null as CORE_ID_WorkProgramSourceId,  current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted  FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_WorkProgram` wrk on ref.Code = wrk.CORE_LB_WorkProgramCode", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #24
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_DWH",
                                            tableId="Risks")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Risk")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by Parent) as CORE_ID_RiskId, Parent as CORE_LB_RiskParentReference, Level  as CORE_LB_RiskLevel, Ref as CORE_LB_RiskReference, `Order` as CORE_LB_Order, Active as CORE_LB_Active, NameEN as CORE_LB_RiskNameEN, NameFR as CORE_LB_RiskNameFR, DefEN as CORE_LB_RiskDefEN, DefFR as CORE_LB_RiskDefFR, KeyRisk as CORE_LB_KeyRisk, NonConformityRisk as CORE_LB_NonConformityRisk, IdentificationDate as CORE_DT_IdentificationDate, Id as CORE_ID_RiskSourceId, LastMod as CORE_DT_LastModif, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Risks`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #25
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to
            process. Used in unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= ("Normalise comments" >> beam.Map(
        partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(lambda comment:
                                       (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(
                create_examples,
                parent_depth=args.parent_depth,
                min_length=args.min_length,
                format=args.dataset_format,
            )))
    examples = _shuffle(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(train_split=args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (serialized_examples | ("write " + name) >> write_sink(
            os.path.join(args.output_dir, name),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        ))

    result = p.run()
    result.wait_until_finish()
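_TrainTestSplitFn is referenced above but not shown; a plausible sketch (an assumption, not the original implementation) that routes each example to the train or test output with probability train_split:

import random

import apache_beam as beam
from apache_beam import pvalue


class _TrainTestSplitFn(beam.DoFn):
    TRAIN_TAG = "train"
    TEST_TAG = "test"

    def __init__(self, train_split=0.9):
        self._train_split = train_split

    def process(self, element):
        tag = (self.TRAIN_TAG if random.random() < self._train_split
               else self.TEST_TAG)
        yield pvalue.TaggedOutput(tag, element)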
Example #26
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_FindingsExportImport")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Finding")

source = BigQuerySource(
    query=
    "SELECT  distinct ROW_NUMBER() over(order by Reference) as CORE_ID_FindingId, Reference as CORE_LB_FindingReference, null as CORE_LB_FindingTitle, null as CORE_ID_AuditId, null as CORE_ID_WorkProgramAuditId, null as CORE_LB_FindingDescription, CS_ShowTop5Subject as CORE_LB_ShowTop5Subject, top.CORE_ID_Top5SubjectId as CORE_ID_Top5SubjectId, CS_PreviousAuditFinding as CORE_LB_PreviousAuditFinding, CS_Levelofrisk as CORE_LB_RiskLevel, null as CORE_LB_OtherSuggestedRisks, AttachedFileLink as CORE_LB_AttachedFileLink, pro.CORE_ID_ProcessId as CORE_ID_ProcessId, CS_WorkProgramProcess as CORE_LB_WorkProgramProcess, AuditReportDisplay as CORE_LB_AuditReportDisplay, AuditReportOrder as CORE_LB_AuditReportOrder, Id as CORE_ID_FindingSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted	 FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp left outer join `studied-client-307710.SMT_DWH.SMT_REF_Top5Subject` top on cast(top.CORE_ID_Top5SubjectSourceId as string) = imp.CS_CorresTop5Subject left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` pro on cast(pro.CORE_ID_ProcessSourceId as string ) = imp.CS_ReferenceProcess ",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #27
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_Top5Subjects")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Top5Subject")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by CS_Id) as CORE_ID_Top5SubjectId, CS_Id as CORE_ID_Top5SubjectSourceId, CS_Reference as CORE_LB_Top5SubjectReference, CS_Ordre as CORE_LB_Top5SubjectOrder, CS_TitreFREN as CORE_LB_Top5SubjectNameEN, CS_TitreFRFR as CORE_LB_Top5SubjectNameFR, CS_Active as CORE_LB_Top5SubjectActive, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_Top5Subjects`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #28
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py" 

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_STG", tableId="Recommendations"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710", datasetId="SMT_DWH", tableId="SMT_REF_Recommendation"
)


source = BigQuerySource(query="SELECT  distinct ROW_NUMBER() over(order by CS_MainRecommandation) as CORE_ID_RecommendationId, CS_LinkToAnExistingRecommandation as CORE_LB_LinkToAnExistingRecommandation, CS_MainRecommandation as CORE_LB_MainRecommandation, -1 as CORE_ID_AuditId, Template as CORE_LB_Template, Reference as CORE_LB_RecommendationReference, null as CORE_LB_RecommendationTitle, AllowFollowUp as CORE_LB_AllowFollowUp, -1 as CORE_ID_WorkProgramAuditId, pro.CORE_ID_ProcessId as CORE_ID_ProcessId, sub.CORE_ID_ProcessId as CORE_ID_SubProcessId, AuditedSite as CORE_LB_AuditedSiteCode, Archived as CORE_LB_Archived, ArchivingDate as CORE_LB_ArchivingDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORE_LB_Reason, Services as CORE_LB_Services, CS_Levelofrisk as CORE_LB_LevelOfRisk, CS_OtherSuggestedRisks as CORE_LB_OtherSuggestedRisks, null as CORE_LB_OtherSuggestedRisksAll, null as CORE_LB_CopyOfDescription, CS_ZoneCommentsIA as CORE_LB_ZoneCommentsIA, CS_RecommendationAnswer as CORE_LB_RecommendationAnswer, CS_AcceptanceDate as CORE_DT_AcceptanceDate, null as CORE_LB_EntityComment, null as CORE_LB_AuditTeamAuditeesReason, CS_Status as CORE_LB_RecommendationStatus, CS_ImplemRatPercent as CORE_FL_ImplemRatPercent, CS_ImplemRatAudit as CORE_FL_ImplemRatAudit, Calendar as CORE_DT_Calendar, null as CORE_LB_Fonction, null as CORE_LB_InChargeOfRecoText, FUCreation as CORE_DT_FUCreation, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, Id as CORE_ID_RecommendationSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, AuditTypology as CORE_LB_AuditTypology, CS_FollowupCreation as CORE_DT_FollowUpCreation, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Recommendations` rec left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` pro on cast(pro.CORE_ID_ProcessSourceId as STRING) = rec.CS_ProcessLevel left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` sub on cast(sub.CORE_ID_ProcessSourceId as STRING) = rec.CS_SubprocessLevel ", use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p 
            | "ReadTable" >> beam.io.Read(source) 
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(sink_table_spec,
                                    #   schema=table_schema,
                                       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, #WRITE_TRUNCATE
Example #29
        "CORE_ID_ZoneSourceId": "STRING",
        "CORE_DT_LastMod": "DATETIME",
        "CORE_DT_RecordCreationDate": "DATETIME",
        "CORE_DT_RecordModificationDate": "DATETIME",
        "CORE_FL_IsDeleted": "INTEGER",
        "CORE_FL_Latitude": "FLOAT",
        "CORE_FL_Longitude": "FLOAT"
    }
    mapping_list = [{"name": k, "type": mapping[k]} for k in mapping.keys()]
    return json.JSONEncoder(sort_keys=True).encode({"fields": mapping_list})


table_schema = parse_table_schema_from_json(make_sink_schema())

source = BigQuerySource(
    query=
    "SELECT  ROW_NUMBER() over(order by Code) as CORE_Id_ZoneId, Parent as CORE_LB_ZoneParentCode, Level as CORE_LB_Level, Code as CORE_LB_ZoneCode, EntityEN as CORE_LB_ZoneEntityEN, EntityFR as CORE_LB_ZoneEntityFR, EntityStatus as CORE_LB_ZoneEntityStatus, `Order` as CORE_LB_Order, Id as CORE_ID_ZoneSourceId, LastMod as CORE_DT_LastMod, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted, geo.Latitude as CORE_FL_Latitude, geo.Longitude as CORE_FL_Longitude FROM `studied-client-307710.SMT_STG.SecondAxis` ax left outer join `studied-client-307710.SMT_STG.Geographic_Coordinates` geo on geo.SubZone = ax.EntityEN",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
Example #30
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner"  # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="RecommendationCriticity")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_RiskLevel")

source = BigQuerySource(
    query=
    "SELECT ROW_NUMBER() over(order by Code) as CORE_ID_RiskLevelId, Code as CORE_LB_RiskLevelCode, NameEN as CORE_LB_RiskLevelNameEN, NameFR as CORE_LB_RiskLevelNameFR, DescriptionEN as CORE_LB_RiskLevelDescriptionEN, DescriptionFR as CORE_LB_RiskLevelDescriptionFR, Active as CORE_LB_Active, `Order` as CORE_LB_Order, TextColor as CORE_LB_TextColor, BackgroundColor as CORE_LB_BackgroundColor, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, TrackingDisplay as CORE_LB_TrackingDisplay, Id as CORE_ID_RisklLevelSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.RecommendationCriticity`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)
#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(