def test_flat_schema(self):
    state = State()

    state.merge({
        "bookmarks": {
            "stream_one": {"timestamp": "2020-01-10T00:00:00.000000Z"}
        }
    })
    print(state)

    state.merge({
        "bookmarks": {
            "stream_one": {"timestamp": "2020-01-11T00:00:00.000000Z"}
        }
    })
    print(state)

    state.merge({
        "bookmarks": {
            "stream_two": {"timestamp": "2020-01-11T00:00:00.000000Z"}
        }
    })
    print(state)
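# The State class exercised above is not included in this excerpt. As a rough,
# illustrative mental model only (an assumption, not the project's actual
# implementation), merging state payloads can be pictured as a recursive dict
# merge in which the incoming value wins per key:
def _deep_merge_sketch(base, incoming):
    """Hypothetical helper: recursively merge `incoming` into `base` in place."""
    for key, value in incoming.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _deep_merge_sketch(base[key], value)
        else:
            base[key] = value
    return base

# Under that model, the three merges in test_flat_schema leave bookmarks for
# both stream_one (with the later timestamp) and stream_two in the state.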
class LoadJobProcessHandler(BaseProcessHandler):
    """Buffers records per stream into temporary files and loads them into
    BigQuery via per-stream temp tables that are then copied into the
    destination tables."""

    def __init__(self, logger, **kwargs):
        super(LoadJobProcessHandler, self).__init__(logger, **kwargs)

        self.truncate = kwargs.get("truncate", False)
        self.partially_loaded_streams = set()
        self.add_metadata_columns = kwargs.get("add_metadata_columns", True)
        self.validate_records = kwargs.get("validate_records", True)
        self.table_configs = kwargs.get("table_configs", {}) or {}

        self.INIT_STATE = kwargs.get("initial_state") or {}
        self.STATE = State(**self.INIT_STATE)

        # per-stream temporary files holding newline-delimited JSON rows
        self.rows = {}

        self.client = bigquery.Client(
            project=self.project_id,
            location=kwargs.get("location", "US"))

    def handle_schema_message(self, msg):
        for s in super(LoadJobProcessHandler, self).handle_schema_message(msg):
            yield s

        # open a temp buffer file the first time a stream's schema is seen
        if msg.stream not in self.rows:
            self.rows[msg.stream] = TemporaryFile(mode="w+b")

        yield from ()

    def handle_record_message(self, msg):
        assert isinstance(msg, singer.RecordMessage)

        stream = msg.stream
        if stream not in self.schemas:
            raise Exception(
                f"A record for stream {msg.stream} was encountered "
                f"before a corresponding schema")

        schema = self.schemas[stream]

        if self.validate_records:
            validate(msg.record, schema)

        # optionally stamp Singer metadata columns onto the record
        if self.add_metadata_columns:
            msg.record["_sdc_extracted_at"] = msg.time_extracted.isoformat() \
                if msg.time_extracted else datetime.utcnow().isoformat()
            msg.record["_sdc_received_at"] = datetime.utcnow().isoformat()
            msg.record["_sdc_sequence"] = int(time.time_ns())

        # drop fields not present in the schema, then buffer the row as NDJSON
        new_rec = filter_by_schema(schema, msg.record)
        data = bytes(json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")
        self.rows[stream].write(data)

        yield from ()

    def handle_state_message(self, msg):
        assert isinstance(msg, singer.StateMessage)
        self.STATE.merge(msg.value)
        yield from ()

    def on_stream_end(self):
        # flush everything buffered so far, then emit the accumulated state
        self._do_temp_table_based_load(self.rows)
        yield self.STATE

    def _do_temp_table_based_load(self, rows):
        assert isinstance(rows, dict)

        loaded_tmp_tables = []
        try:
            # load each stream's buffered rows into a uniquely named temp table
            for stream in rows.keys():
                tmp_table_name = "t_{}_{}".format(
                    self.tables[stream], str(uuid.uuid4()).replace("-", ""))

                job = self._load_to_bq(
                    client=self.client,
                    dataset=self.dataset,
                    table_name=tmp_table_name,
                    table_schema=self.schemas[stream],
                    table_config=self.table_configs.get(stream, {}),
                    key_props=self.key_properties[stream],
                    metadata_columns=self.add_metadata_columns,
                    truncate=self.truncate if stream not in self.partially_loaded_streams else False,
                    rows=self.rows[stream])
                loaded_tmp_tables.append((stream, tmp_table_name))

            # copy temp tables into the production tables
            copy_config = CopyJobConfig()
            copy_config.write_disposition = WriteDisposition.WRITE_APPEND

            for stream, tmp_table_name in loaded_tmp_tables:
                self.logger.info(
                    f"Copy {tmp_table_name} to {self.tables[stream]}")
                self.client.copy_table(
                    sources=self.dataset.table(tmp_table_name),
                    destination=self.dataset.table(self.tables[stream]),
                    job_config=copy_config).result()

                self.partially_loaded_streams.add(stream)
                # reset the stream buffer now that its rows are loaded
                self.rows[stream].close()
                self.rows[stream] = TemporaryFile(mode="w+b")

        except Exception as e:
            raise e

        finally:
            # always delete temp tables, even if the load or copy failed
            for stream, tmp_table_name in loaded_tmp_tables:
                self.client.delete_table(
                    table=self.dataset.table(tmp_table_name),
                    not_found_ok=True)

    def _load_to_bq(self,
                    client,
                    dataset,
                    table_name,
                    table_schema,
                    table_config,
                    key_props,
                    metadata_columns,
                    truncate,
                    rows):
        logger = self.logger

        partition_field = table_config.get("partition_field", None)
        cluster_fields = table_config.get("cluster_fields", None)
        force_fields = table_config.get("force_fields", {})

        # translate the JSON schema into a BigQuery schema
        schema = build_schema(table_schema, key_properties=key_props,
                              add_metadata=metadata_columns,
                              force_fields=force_fields)

        load_config = LoadJobConfig()
        load_config.schema = schema

        if partition_field:
            load_config.time_partitioning = bigquery.table.TimePartitioning(
                type_=bigquery.table.TimePartitioningType.DAY,
                field=partition_field)

        if cluster_fields:
            load_config.clustering_fields = cluster_fields

        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            logger.info(f"Appending to {table_name}")
            load_config.write_disposition = WriteDisposition.WRITE_APPEND

        logger.info("loading {} to BigQuery".format(table_name))

        load_job = None
        try:
            load_job = client.load_table_from_file(
                rows, dataset.table(table_name),
                job_config=load_config, rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            job = load_job.result()
            logger.info(job._properties)

            return job

        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table_name, str(err)))

            if load_job and load_job.errors:
                reason = err.errors[0]["reason"]
                messages = [f"{error['message']}" for error in load_job.errors]
                logger.error("reason: {reason}, errors:\n{e}".format(
                    reason=reason, e="\n".join(messages)))
                err.message = f"reason: {reason}, errors: {';'.join(messages)}"

            raise err
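# Illustrative only: a minimal driver loop sketching how a handler such as
# LoadJobProcessHandler might be fed parsed Singer messages. The wiring below
# (the function name, reading from stdin, writing yielded state with
# singer.write_state) is an assumption for demonstration, not this project's
# actual entry point.
import sys

import singer


def run_handler_sketch(handler, lines=None):
    """Feed Singer messages from stdin (or any iterable of lines) to `handler`
    and emit whatever state it yields back."""
    for line in (lines if lines is not None else sys.stdin):
        msg = singer.parse_message(line)
        if isinstance(msg, singer.SchemaMessage):
            emitted = handler.handle_schema_message(msg)
        elif isinstance(msg, singer.RecordMessage):
            emitted = handler.handle_record_message(msg)
        elif isinstance(msg, singer.StateMessage):
            emitted = handler.handle_state_message(msg)
        else:
            continue
        for state in emitted:
            singer.write_state(state)

    # flush buffered rows to BigQuery and emit the final accumulated state
    for state in handler.on_stream_end():
        singer.write_state(state)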
def __init__(self, logger, **kwargs):
    super(BookmarksStatePartialLoadJobProcessHandler, self).__init__(logger, **kwargs)

    self.EMITTED_STATE = State(**self.INIT_STATE)
def test_init(self):
    s = State(**{"a": 1})
    print(s)