Example #1
    def test_flat_schema(self):
        state = State()
        state.merge({
            "bookmarks": {
                "stream_one": {
                    "timestamp": "2020-01-10T00:00:00.000000Z"
                }
            }
        })
        print(state)

        state.merge({
            "bookmarks": {
                "stream_one": {
                    "timestamp": "2020-01-11T00:00:00.000000Z"
                }
            }
        })
        print(state)

        state.merge({
            "bookmarks": {
                "stream_two": {
                    "timestamp": "2020-01-11T00:00:00.000000Z"
                }
            }
        })
        print(state)
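
This test only exercises State's public surface. For context, here is a minimal sketch of a merge() consistent with what the test expects, a recursive deep-merge in which newer values overwrite older ones; the dict subclass and the _deep_merge helper are assumptions, not the project's actual implementation:

class State(dict):
    """Minimal stand-in: a dict with an in-place deep merge (assumed)."""

    def merge(self, other):
        _deep_merge(self, other)


def _deep_merge(dst, src):
    # Nested dicts (e.g. per-stream bookmarks) are merged key by key;
    # scalars such as bookmark timestamps are overwritten by the newer value.
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            _deep_merge(dst[key], value)
        else:
            dst[key] = value

Under this sketch the three merge() calls above first record stream_one's bookmark, then advance its timestamp to 2020-01-11, and finally add stream_two alongside it.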
Example #2
    def __init__(self, logger, **kwargs):
        super(LoadJobProcessHandler, self).__init__(logger, **kwargs)

        self.truncate = kwargs.get("truncate", False)
        self.partially_loaded_streams = set()
        self.add_metadata_columns = kwargs.get("add_metadata_columns", True)
        self.validate_records = kwargs.get("validate_records", True)
        self.table_configs = kwargs.get("table_configs", {}) or {}

        self.INIT_STATE = kwargs.get("initial_state") or {}
        self.STATE = State(**self.INIT_STATE)

        self.rows = {}

        self.client = bigquery.Client(project=self.project_id,
                                      location=kwargs.get("location", "US"))
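
For reference, a hypothetical construction of the handler using the keyword arguments read above; project_id is assumed to be consumed by the base class, and all values are illustrative:

handler = LoadJobProcessHandler(
    logger,
    project_id="my-gcp-project",       # assumed base-class parameter
    truncate=False,                    # append rather than replace tables
    add_metadata_columns=True,         # emit _sdc_* audit columns
    validate_records=True,             # jsonschema-validate each record
    table_configs={},                  # per-stream partition/cluster config
    initial_state={"bookmarks": {}},   # seeds self.STATE
    location="US",                     # BigQuery dataset location
)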
Example #3
import json
import time
import uuid
from datetime import datetime
from tempfile import TemporaryFile

import singer
from google.api_core import exceptions as google_exceptions
from google.cloud import bigquery
from google.cloud.bigquery import (CopyJobConfig, LoadJobConfig, SourceFormat,
                                   WriteDisposition)
from jsonschema import validate

# Also needed, from the target's own modules (import paths omitted):
# State, BaseProcessHandler, DecimalEncoder, filter_by_schema, build_schema


class LoadJobProcessHandler(BaseProcessHandler):
    def __init__(self, logger, **kwargs):
        super(LoadJobProcessHandler, self).__init__(logger, **kwargs)

        self.truncate = kwargs.get("truncate", False)
        self.partially_loaded_streams = set()
        self.add_metadata_columns = kwargs.get("add_metadata_columns", True)
        self.validate_records = kwargs.get("validate_records", True)
        self.table_configs = kwargs.get("table_configs", {}) or {}

        self.INIT_STATE = kwargs.get("initial_state") or {}
        self.STATE = State(**self.INIT_STATE)

        self.rows = {}

        self.client = bigquery.Client(project=self.project_id,
                                      location=kwargs.get("location", "US"))

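    # Each stream is spooled to its own TemporaryFile as newline-delimited
    # JSON; nothing is shipped to BigQuery until the stream ends (see
    # on_stream_end and _do_temp_table_based_load below).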
    def handle_schema_message(self, msg):
        yield from super(LoadJobProcessHandler, self).handle_schema_message(msg)

        if msg.stream not in self.rows:
            self.rows[msg.stream] = TemporaryFile(mode="w+b")

        yield from ()

    def handle_record_message(self, msg):
        assert isinstance(msg, singer.RecordMessage)

        stream = msg.stream

        if stream not in self.schemas:
            raise Exception(
                f"A record for stream {msg.stream} was encountered before a corresponding schema"
            )

        schema = self.schemas[stream]

        if self.validate_records:
            validate(msg.record, schema)

        if self.add_metadata_columns:
            msg.record["_sdc_extracted_at"] = msg.time_extracted.isoformat() \
                if msg.time_extracted else datetime.utcnow().isoformat()
            msg.record["_sdc_received_at"] = datetime.utcnow().isoformat()
            msg.record["_sdc_sequence"] = int(time.time_ns())

        new_rec = filter_by_schema(schema, msg.record)

        data = bytes(json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")
        self.rows[stream].write(data)

        yield from ()

    def handle_state_message(self, msg):
        assert isinstance(msg, singer.StateMessage)

        self.STATE.merge(msg.value)

        yield from ()

    def on_stream_end(self):
        self._do_temp_table_based_load(self.rows)
        yield self.STATE

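    # Load pattern: every stream's spooled NDJSON is first loaded into a
    # uniquely named temp table (t_<table>_<uuid>), then copied into the
    # production table with WRITE_APPEND, and the temp tables are dropped
    # in the `finally` block, so a failed batch never leaves partial rows
    # in the production table.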
    def _do_temp_table_based_load(self, rows):
        assert isinstance(rows, dict)

        loaded_tmp_tables = []
        try:
            for stream in rows.keys():
                tmp_table_name = "t_{}_{}".format(
                    self.tables[stream],
                    str(uuid.uuid4()).replace("-", ""))

                job = self._load_to_bq(
                    client=self.client,
                    dataset=self.dataset,
                    table_name=tmp_table_name,
                    table_schema=self.schemas[stream],
                    table_config=self.table_configs.get(stream, {}),
                    key_props=self.key_properties[stream],
                    metadata_columns=self.add_metadata_columns,
                    # only truncate on a stream's first batch; later batches
                    # for the same stream must append
                    truncate=(self.truncate
                              if stream not in self.partially_loaded_streams
                              else False),
                    rows=rows[stream])

                loaded_tmp_tables.append((stream, tmp_table_name))

            # copy tables to production tables
            copy_config = CopyJobConfig()
            copy_config.write_disposition = WriteDisposition.WRITE_APPEND

            for stream, tmp_table_name in loaded_tmp_tables:
                self.logger.info(
                    f"Copy {tmp_table_name} to {self.tables[stream]}")

                self.client.copy_table(
                    sources=self.dataset.table(tmp_table_name),
                    destination=self.dataset.table(self.tables[stream]),
                    job_config=copy_config).result()

                self.partially_loaded_streams.add(stream)
                self.rows[stream].close()  # erase the file
                self.rows[stream] = TemporaryFile(mode="w+b")

        finally:  # always delete the temp tables, even when the load or copy failed
            for stream, tmp_table_name in loaded_tmp_tables:
                self.client.delete_table(
                    table=self.dataset.table(tmp_table_name),
                    not_found_ok=True)

    def _load_to_bq(self, client, dataset, table_name, table_schema,
                    table_config, key_props, metadata_columns, truncate, rows):
        logger = self.logger
        partition_field = table_config.get("partition_field", None)
        cluster_fields = table_config.get("cluster_fields", None)
        force_fields = table_config.get("force_fields", {})

        schema = build_schema(table_schema,
                              key_properties=key_props,
                              add_metadata=metadata_columns,
                              force_fields=force_fields)
        load_config = LoadJobConfig()
        load_config.schema = schema
        if partition_field:
            load_config.time_partitioning = bigquery.table.TimePartitioning(
                type_=bigquery.table.TimePartitioningType.DAY,
                field=partition_field)

        if cluster_fields:
            load_config.clustering_fields = cluster_fields

        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            logger.info(f"Appending to {table_name}")
            load_config.write_disposition = WriteDisposition.WRITE_APPEND

        logger.info("loading {} to BigQuery".format(table_name))

        load_job = None
        try:
            load_job = client.load_table_from_file(rows,
                                                   dataset.table(table_name),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            job = load_job.result()
            logger.info(job._properties)

            return job

        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table_name, str(err)))
            if load_job and load_job.errors:
                reason = err.errors[0]["reason"] if err.errors else "unknown"
                messages = [error["message"] for error in load_job.errors]
                logger.error("reason: {reason}, errors:\n{e}".format(
                    reason=reason, e="\n".join(messages)))
                err.message = f"reason: {reason}, errors: {';'.join(messages)}"

            raise err
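
The table_config dict read in _load_to_bq drives partitioning and clustering. A plausible shape for table_configs, inferred from the .get() calls above (the key names are real; the values are illustrative):

table_configs = {
    "stream_one": {
        "partition_field": "timestamp",  # DAY time-partitioning column
        "cluster_fields": ["user_id"],   # clustering columns, in order
        "force_fields": {},              # per-column schema overrides
    },
}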
Example #4
    def __init__(self, logger, **kwargs):
        super(BookmarksStatePartialLoadJobProcessHandler,
              self).__init__(logger, **kwargs)

        self.EMITTED_STATE = State(**self.INIT_STATE)
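
Presumably EMITTED_STATE tracks the state as last emitted downstream, kept separate from STATE (which merges every incoming state message), so that on partial loads the handler only emits bookmarks whose rows have actually been committed.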
Example #5
    def test_init(self):
        s = State(**{"a": 1})
        print(s)
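
Since State is built via keyword expansion, State(**{"a": 1}) is equivalent to State(a=1) and prints {'a': 1} under the dict-based sketch from Example #1; the handlers above seed their state the same way with State(**self.INIT_STATE).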