Example #1
    def _load_to_bq(self, client, dataset, table_name, table_schema,
                    table_config, key_props, metadata_columns, truncate, rows):
        logger = self.logger
        partition_field = table_config.get("partition_field", None)
        cluster_fields = table_config.get("cluster_fields", None)
        force_fields = table_config.get("force_fields", {})

        schema = build_schema(table_schema,
                              key_properties=key_props,
                              add_metadata=metadata_columns,
                              force_fields=force_fields)
        load_config = LoadJobConfig()
        load_config.ignore_unknown_values = True
        load_config.schema = schema
        if partition_field:
            load_config.time_partitioning = bigquery.table.TimePartitioning(
                type_=bigquery.table.TimePartitioningType.DAY,
                field=partition_field)

        if cluster_fields:
            load_config.clustering_fields = cluster_fields

        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            logger.info(f"Appending to {table_name}")
            load_config.write_disposition = WriteDisposition.WRITE_APPEND

        logger.info("loading {} to BigQuery".format(table_name))

        load_job = None
        try:
            load_job = client.load_table_from_file(rows,
                                                   dataset.table(table_name),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            job = load_job.result()
            logger.info(job._properties)

            return job

        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table_name, str(err)))
            if load_job and load_job.errors:
                reason = err.errors[0]["reason"]
                messages = [f"{e['message']}" for e in load_job.errors]
                logger.error("reason: {reason}, errors:\n{e}".format(
                    reason=reason, e="\n".join(messages)))
                err.message = f"reason: {reason}, errors: {';'.join(messages)}"

            raise err
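
As a rough, self-contained sketch of the same pattern (all project, dataset, table, file and column names below are hypothetical placeholders, not taken from the example above), a partitioned and clustered newline-delimited JSON load looks like this:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")               # hypothetical project
table_ref = client.dataset("my_dataset").table("events")     # hypothetical dataset/table

job_config = bigquery.LoadJobConfig()
job_config.schema = [
    bigquery.SchemaField("id", "STRING"),
    bigquery.SchemaField("created_at", "TIMESTAMP"),
]
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
job_config.ignore_unknown_values = True
# Day-partition on the timestamp column and cluster on the key, mirroring
# partition_field / cluster_fields in the example above.
job_config.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY, field="created_at")
job_config.clustering_fields = ["id"]
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

with open("events.ndjson", "rb") as source_file:              # hypothetical local file
    load_job = client.load_table_from_file(source_file, table_ref,
                                            job_config=job_config)
load_job.result()  # blocks until the job finishes and raises on failure
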
Example #2
        def load_task():
            client = Client()
            job_config = LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = SourceFormat.CSV if file_format == 'csv' else SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
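
A standalone equivalent without the surrounding Airflow helpers might look like the sketch below, assuming the schema file uses the JSON schema format that Client.schema_from_json() reads; the bucket, dataset, table and path names are placeholders.

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig()
# schema_from_json() parses the same schema JSON format used by the `bq` CLI.
job_config.schema = client.schema_from_json("schemas/transactions.json")  # placeholder path
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1             # skip the CSV header row
job_config.write_disposition = "WRITE_TRUNCATE"
job_config.ignore_unknown_values = True
job_config.allow_quoted_newlines = True

uri = "gs://my-bucket/export/transactions/*.csv"              # placeholder bucket
table_ref = client.dataset("raw").table("transactions")       # placeholder dataset/table
load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
load_job.result()
assert load_job.state == "DONE"
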
Example #3
    def process_response_rows_for_bigquery(self, rows: list,
                                           table_reference: TableReference):
        rows_dataframe = DataFrame.from_records(rows)

        rows_dataframe = concat(
            [rows_dataframe, rows_dataframe['dimensions'].apply(Series)],
            axis=1,
            join='inner')
        rows_dataframe = rows_dataframe.drop(['dimensions'], axis=1)
        rows_dataframe['date'] = rows_dataframe['date'].apply(
            lambda x: x.date())

        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column)
            for column in list(rows_dataframe.columns.values)
        ]

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                rows_dataframe, table_reference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #4
    def __create_load_job_config(
            self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
        config = LoadJobConfig()
        config.labels = ems_load_job_config.labels
        config.create_disposition = ems_load_job_config.create_disposition.value
        config.write_disposition = ems_load_job_config.write_disposition.value
        config.schema = _parse_schema_resource(ems_load_job_config.schema)
        config.skip_leading_rows = ems_load_job_config.skip_leading_rows
        return config
Example #5
    def _process_data_for_bigquery(self, data: DataFrame,
                                   output_tablereference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                data, output_tablereference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #6
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition

    TO

    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    :param dts_tabledef:
    :return:
    """
    from bq_dts import rest_client
    job_config = LoadJobConfig()

    dts_schema = RPCRecordSchema_to_GCloudSchema(dts_tabledef['schema'])
    job_config.schema = dts_schema

    # BQ DTS does not provide controls for the following dispositions
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        dts_format = dts_tabledef['format']
        source_format = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP[
            dts_format]
        assert source_format is not None
        job_config.source_format = source_format

    if 'max_bad_records' in dts_tabledef:
        job_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        dts_encoding = dts_tabledef['encoding']
        job_config.encoding = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP[
            dts_encoding]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        if 'field_delimiter' in csv_opts:
            job_config.field_delimiter = csv_opts['field_delimiter']
        if 'allow_quoted_newlines' in csv_opts:
            job_config.allow_quoted_newlines = csv_opts[
                'allow_quoted_newlines']
        if 'quote_char' in csv_opts:
            job_config.quote_character = csv_opts['quote_char']
        if 'skip_leading_rows' in csv_opts:
            job_config.skip_leading_rows = csv_opts['skip_leading_rows']

    return job_config
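
For reference, the CSV-related fields handled by this mapping correspond one-to-one to LoadJobConfig properties. A minimal sketch with a hypothetical options dict in the same shape as dts_tabledef['csv_options']:

from google.cloud import bigquery

csv_opts = {                        # hypothetical input, same shape as above
    "field_delimiter": "|",
    "allow_quoted_newlines": True,
    "quote_char": '"',
    "skip_leading_rows": 1,
}

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
job_config.field_delimiter = csv_opts["field_delimiter"]
job_config.allow_quoted_newlines = csv_opts["allow_quoted_newlines"]
job_config.quote_character = csv_opts["quote_char"]
job_config.skip_leading_rows = csv_opts["skip_leading_rows"]
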
Example #7
    def push_bq():
        for table in rows.keys():

            table_ref = bigquery_client.dataset(dataset_id).table(table)
            SCHEMA = build_schema(schemas[table])
            load_config = LoadJobConfig()
            load_config.schema = SCHEMA
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate:
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            rows[table].seek(0)
            logger.info('loading {} to Bigquery.\n'.format(table))
            load_job = bigquery_client.load_table_from_file(
                rows[table], table_ref, job_config=load_config)
            logger.info('loading job {}'.format(load_job.job_id))
            logger.info(load_job.result())
            rows[table] = TemporaryFile(mode='w+b')
Example #8
File: gbq.py  Project: aloosley/pandas-gbq
    def load_data(self, dataframe, dataset_id, table_id, chunksize):
        from google.cloud.bigquery import LoadJobConfig
        from six import BytesIO

        destination_table = self.client.dataset(dataset_id).table(table_id)
        job_config = LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
        rows = []
        remaining_rows = len(dataframe)

        total_rows = remaining_rows
        self._print("\n\n")

        for index, row in dataframe.reset_index(drop=True).iterrows():
            row_json = row.to_json(force_ascii=False,
                                   date_unit='s',
                                   date_format='iso')
            rows.append(row_json)
            remaining_rows -= 1

            if (len(rows) % chunksize == 0) or (remaining_rows == 0):
                self._print("\rLoad is {0}% Complete".format(
                    ((total_rows - remaining_rows) * 100) / total_rows))

                body = '{}\n'.format('\n'.join(rows))
                if isinstance(body, bytes):
                    body = body.decode('utf-8')
                body = body.encode('utf-8')
                body = BytesIO(body)

                try:
                    self.client.load_table_from_file(
                        body, destination_table,
                        job_config=job_config).result()
                except self.http_error as ex:
                    self.process_http_error(ex)

                rows = []

        self._print("\n")
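
As an alternative to the chunked NDJSON approach above, when progress reporting is not needed the client can serialize a DataFrame directly; a minimal sketch (this path requires pyarrow, and the dataset/table names and data are placeholders):

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset("my_dataset").table("my_table")    # placeholder names

job_config = bigquery.LoadJobConfig()
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

df = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})      # toy data
# Serializes the frame (via pyarrow) and runs a single load job.
load_job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
load_job.result()
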
Example #9
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            # tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = bigquery_client.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))
        logger.info(load_job.result())

    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
Example #10
def persist_lines_job(project_id,
                      dataset_id,
                      lines=None,
                      truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                log_message = ('A record for stream {} was encountered '
                               'before a corresponding schema')
                raise Exception(log_message.format(msg.stream))

            schema = schemas[msg.stream]

            msg.record = convert_dict_keys_to_bigquery_format(
                record=msg.record)

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            dat = bytes(simplejson.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream

            schema = convert_schema_column_names_to_bigquery_format(
                schema=msg.schema)

            schemas[table] = schema
            key_properties[table] = msg.key_properties
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)

        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()

        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))

        load_job = bigquery_client.load_table_from_file(rows[table],
                                                        table_ref,
                                                        job_config=load_config)

        logger.info("loading job {}".format(load_job.job_id))

    return state
Example #11
def persist_lines_job(project_id,
                      dataset_id,
                      lines=None,
                      truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    rows = {}

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects JSON string data, with a newline splitting each row.
            rows[msg.stream].write(
                bytes(json.dumps(msg.record) + "\n", "UTF-8"))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            rows[table] = TemporaryFile(mode="w+b")

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])

        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            load_config.schema_update_options = [
                SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ]

        load_job = bigquery_client.load_table_from_file(rows[table],
                                                        table_ref,
                                                        job_config=load_config,
                                                        rewind=True)
        logger.info(
            f"Loading '{table}' to BigQuery as job '{load_job.job_id}'",
            extra={"stream": table})

        try:
            load_job.result()
        except Exception as e:
            logger.error(f"Error on inserting to table '{table}': {str(e)}",
                         extra={"stream": table})
            return

        logger.info(f"Loaded {load_job.output_rows} row(s) to '{table}'",
                    extra={"stream": table})

    return state
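
The schema_update_options branch above is what lets append loads add new columns. Isolated, the setting looks like the sketch below; BigQuery accepts schema update options on WRITE_APPEND loads, and on WRITE_TRUNCATE loads that target a single partition.

from google.cloud import bigquery
from google.cloud.bigquery.job import SchemaUpdateOption

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
# Let the load job add nullable columns that appear in the data but are
# missing from the destination table's current schema.
job_config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
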
Example #12
def persist_lines_job(
    client,
    dataset,
    lines=None,
    truncate=False,
    forced_fulltables=[],
    validate_records=True,
    table_suffix=None,
):
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    errors = {}
    table_suffix = table_suffix or ""

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            table_name = msg.stream + table_suffix

            if table_name not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(table_name))

            schema = schemas[table_name]

            if validate_records:
                validate(msg.record, schema)

            new_rec = filter(schema, msg.record)  # 'filter' is a helper defined elsewhere in this project, not the builtin

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            data = bytes(
                json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")

            rows[table_name].write(data)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table_name = msg.stream + table_suffix

            if table_name in rows:
                continue

            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties
            rows[table_name] = TemporaryFile(mode="w+b")
            errors[table_name] = None

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        key_props = key_properties[table]
        SCHEMA = build_schema(schemas[table], key_properties=key_props)
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate or (table in forced_fulltables):
            logger.info(f"Load {table} by FULL_TABLE")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        logger.info("loading {} to Bigquery.\n".format(table))

        try:
            load_job = client.load_table_from_file(rows[table],
                                                   dataset.table(table),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table, str(err)))
            if load_job.errors:
                messages = [
                    f"reason: {err['reason']}, message: {err['message']}"
                    for err in load_job.errors
                ]
                logger.error("errors:\n{}".format("\n".join(messages)))
            raise

    yield state
Example #13
def persist_lines_job(project_id,
                      dataset_id,
                      lines=None,
                      truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            simplified = dict()
            for k in msg.record:
                v = msg.record[k]
                if isinstance(v, decimal.Decimal):
                    v = float(v)
                if isinstance(v, bool):
                    v = str(v)
                simplified[k] = v

            json_row = json.dumps(simplified)

            rows[msg.stream].write(bytes(json_row, "UTF-8"))
            rows[msg.stream].write(bytes("\n", "UTF-8"))
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = json.loads(msg.schema)
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(
                bigquery_client.dataset(dataset_id).table(table),
                schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode="w+b")
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))

        data = rows[table]
        logger.info(
            f"table_ref: {table_ref}, config: {load_config}, data: {data}")

        try:
            load_job = bigquery_client.load_table_from_file(
                data, table_ref, job_config=load_config)

            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
        except exceptions.GoogleAPIError as e:
            logger.error(f"Exception in load job: {e}")
            for er in e.errors:
                logger.error(f"\t Error: {er}")

            raise

    for table in errors.keys():
        if not errors[table]:
            print(
                "Loaded {} row(s) into {}:{}".format(rows[table], dataset_id,
                                                     table),
                tables[table].path)
        else:
            print("Errors: {}", errors[table])

    return state