def persist_lines_stream(config, lines=None, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    # bigquery_client = bigquery.Client(project=project_id)
    service = service_account.Credentials.from_service_account_file(
        config['key_file_location'])
    bigquery_client = bigquery.Client(project=config['project_id'],
                                      credentials=service)

    dataset_ref = bigquery_client.dataset(config['dataset_id'])
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            js = json.loads(line)
            # msg = singer.parse_message(line)
            if js['type'] == 'RECORD':
                # Build the RecordMessage directly rather than going through
                # singer.parse_message, so records are accepted as-is.
                msg = singer.messages.RecordMessage(stream=js.get('stream'),
                                                    record=js.get('record'),
                                                    version=js.get('version'),
                                                    time_extracted=None)
            else:
                msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]
            if validate_records:
                validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(
                tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], config['dataset_id'], table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: %s', errors[table])

    return state
def test_parse_message_record_missing_record(self):
    with self.assertRaises(Exception):
        singer.parse_message('{"type": "RECORD", "stream": "users"}')
def test_parse_message_schema_missing_stream(self):
    with self.assertRaises(Exception):
        message = singer.parse_message('{"type": "SCHEMA", "schema": {"type": "object", "properties": {"name": {"type": "string"}}}, "key_properties": ["name"]}')  # nopep8
def persist_lines_stream(project_id, dataset_id, lines=None, validate_records=True): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset( Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema" .format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) err = None try: err = bigquery_client.insert_rows_json(tables[msg.stream], [msg.record]) except Exception as exc: logger.error( f"failed to insert rows for {tables[msg.stream]}: {str(exc)}\n{msg.record}" ) raise errors[msg.stream] = err rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug("Setting state to {}".format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: logging.info("Loaded {} row(s) from {} into {}:{}".format( rows[table], dataset_id, table, tables[table].path)) emit_state(state) else: logging.error("Errors: %s", errors[table]) return state
def persist_messages(delimiter, quotechar, messages, destination_path, without_headers): state = None schemas = {} key_properties = {} headers = {} validators = {} now = datetime.now().strftime('%Y%m%dT%H%M%S') for message in messages: try: o = singer.parse_message(message).asdict() except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(message)) raise message_type = o['type'] if message_type == 'RECORD': if o['stream'] not in schemas: raise Exception( "A record for stream {}" "was encountered before a corresponding schema".format( o['stream'])) validators[o['stream']].validate(o['record']) filename = o['stream'] + '-' + now + '.csv' filename = os.path.expanduser( os.path.join(destination_path, filename)) file_is_empty = ( not os.path.isfile(filename)) or os.stat(filename).st_size == 0 flattened_record = flatten(o['record']) if o['stream'] not in headers and not file_is_empty: with open(filename, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar) first_line = next(reader) headers[o[ 'stream']] = first_line if first_line else flattened_record.keys( ) else: headers[o['stream']] = flattened_record.keys() with open(filename, 'a') as csvfile: writer = csv.DictWriter(csvfile, headers[o['stream']], extrasaction='ignore', delimiter=delimiter, quotechar=quotechar) if file_is_empty and not without_headers: writer.writeheader() writer.writerow(flattened_record) state = None elif message_type == 'STATE': logger.debug('Setting state to {}'.format(o['value'])) state = o['value'] elif message_type == 'SCHEMA': stream = o['stream'] schemas[stream] = o['schema'] validators[stream] = Draft4Validator(o['schema']) key_properties[stream] = o['key_properties'] else: logger.warning("Unknown message type {} in message {}".format( o['type'], o)) return state
def persist_lines_stream( project_id, dataset_id, lines=None, validate_records=True, key_path=None, ): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} if key_path: credentials = service_account.Credentials.from_service_account_file( key_path, scopes=['https://www.googleapis.com/auth/cloud-platform'], ) bigquery_client = bigquery.Client(credentials=credentials, project=project_id) else: bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset( Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error('Unable to parse:\n{}'.format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( 'A record for stream {} was encountered before a corresponding schema' .format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) errors[msg.stream] = bigquery_client.insert_rows_json( tables[msg.stream], [msg.record]) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception('Unrecognized message {}'.format(msg)) for table in errors.keys(): if not errors[table]: logging.info('Loaded {} row(s) into {}:{}'.format( rows[table], dataset_id, table, tables[table].path)) emit_state(state) else: logging.error(errors[table]) return state
def test_parse_message_state_good(self):
    message = singer.parse_message(
        '{"type": "STATE", "value": {"seq": 1}}')
    self.assertEqual(message, singer.StateMessage(value={'seq': 1}))
def persist_lines_job(project_id, dataset_id, lines=None): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} bigquery_client = bigquery.Client(project=project_id) # try: # dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) # except exceptions.Conflict: # pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] validate(msg.record, schema) dat = bytes(str(json.loads(json.dumps(msg.record), object_pairs_hook=clear_dict_hook)) + '\n', 'UTF-8') rows[msg.stream].write(dat) #rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8')) state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties #tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = TemporaryFile(mode='w+b') errors[table] = None # try: # tables[table] = bigquery_client.create_table(tables[table]) # except exceptions.Conflict: # pass else: raise Exception("Unrecognized message {}".format(msg)) for table in rows.keys(): table_ref = bigquery_client.dataset(dataset_id).table(table) SCHEMA = build_schema(schemas[table]) load_config = LoadJobConfig() load_config.schema = SCHEMA load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON rows[table].seek(0) logger.info("loading {} to Bigquery.\n".format(table)) load_job = bigquery_client.load_table_from_file( rows[table], table_ref, job_config=load_config) logger.info("loading job {}".format(load_job.job_id)) logger.info(load_job.result()) # for table in errors.keys(): # if not errors[table]: # print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path) # else: # print('Errors:', errors[table], sep=" ") return state
def persist_lines_stream(project_id, dataset_id, lines=None): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] validate(msg.record, schema) errors[msg.stream] = bigquery_client.create_rows(tables[msg.stream], [msg.record]) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path) else: print('Errors:', errors[table], sep=" ") return state
def write_records(
    project_id,
    dataset_name,
    lines=None,
    stream=False,
    on_invalid_record="abort",
    partition_by=None,
    partition_type="day",
    partition_exp_ms=None,
    table_prefix="",
    table_ext="",
    load_config_properties=None,
    numeric_type="NUMERIC",
    max_warnings=20,
):
    if on_invalid_record not in ("abort", "skip", "force"):
        raise ValueError("on_invalid_record must be one of" +
                         " (abort, skip, force)")

    state = None
    schemas = {}
    bq_schemas = {}
    tables = {}
    key_properties = {}
    table_files = {}
    row_count = {}
    invalids = {}
    errors = {}

    client = bigquery.Client(project=project_id)
    dataset = get_or_create_dataset(client, project_id, dataset_name)

    count = 0
    for line in lines:
        try:
            message = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(message, singer.RecordMessage):
            if stream:
                json_dumps = False
            else:
                json_dumps = True

            record, validation = clean_and_validate(
                message,
                schemas,
                json_dumps,
            )

            if not validation["is_valid"]:
                invalids[message.stream] += 1
                instance = validation["instance"]
                type_ = validation["type"]
                invalid_record_str = json.dumps(validation["record"])
                invalid_message = validation["message"]
                if invalids[message.stream] <= max_warnings:
                    logger.warn(
                        f"Invalid record found and the process will {on_invalid_record}. "
                        f"[{instance}] :: {type_} :: {invalid_record_str} :: {message}"
                    )
                if invalids[message.stream] == max_warnings:
                    logger.warn("Max validation warning reached. "
                                "Further validation warnings are suppressed.")
                if on_invalid_record == "abort":
                    raise Exception(
                        "Validation required and failed. Aborting.")

            if validation["is_valid"] or on_invalid_record == "force":
                # https://cloud.google.com/bigquery/streaming-data-into-bigquery
                if stream:
                    errors[message.stream] = client.insert_rows(
                        tables[message.stream], [record])
                else:
                    table_files[message.stream].write(record)
                row_count[message.stream] += 1

            state = None

        elif isinstance(message, singer.StateMessage):
            state = message.value
            # State may contain sensitive info. Not logging in production
            logger.debug("State: %s" % state)
            currently_syncing = state.get("currently_syncing")
            bookmarks = state.get("bookmarks")
            if currently_syncing and bookmarks:
                logger.info(
                    f"State: currently_syncing {currently_syncing} - bookmark: {bookmarks.get(currently_syncing)}"
                )

        elif isinstance(message, singer.SchemaMessage):
            table_name = message.stream

            if schemas.get(table_name):
                # Redundant schema rows
                continue

            schemas[table_name] = message.schema
            bq_schema = parse_schema(schemas[table_name], numeric_type)
            bq_schemas[table_name] = bq_schema

            tables[table_name] = get_or_create_table(
                client,
                project_id,
                dataset_name,
                f"{table_prefix}{table_name}{table_ext}",
                bq_schema,
                partition_by,
                partition_type,
                partition_exp_ms,
            )

            if stream:
                # Ensure the table is created before streaming...
                time.sleep(3)

            if not stream:
                table_files[table_name] = TemporaryFile(mode='w+b')

            key_properties[table_name] = message.key_properties
            row_count[table_name] = 0
            invalids[table_name] = 0
            errors[table_name] = None

        elif isinstance(message, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(message))

        count = count + 1

    # We already wrote the data in the streaming mode
    if stream:
        for table_name in errors.keys():
            if not errors[table_name]:
                logger.info("Streamed {} row(s) into {}.{}.{}".format(
                    row_count[table_name], project_id, dataset_name,
                    table_name))
            else:
                logger.warn("Errors: %s", errors[table_name])
        return state

    # For batch job mode only
    for table_name in table_files.keys():
        if invalids[table_name] > 0:
            if on_invalid_record == "skip":
                logger.warn(
                    f"Persisting {table_name} stream by skipping the invalid records."
                )
            elif on_invalid_record == "force":
                logger.warn(
                    f"Persisting {table_name} stream by replacing invalids with null."
                )

        bq_schema = bq_schemas[table_name]

        # We should already have get-or-created:
        table = tables[table_name]

        load_config_props = {
            "schema": bq_schema,
            "source_format": SourceFormat.NEWLINE_DELIMITED_JSON
        }
        if load_config_properties:
            load_config_props.update(load_config_properties)
        load_config = LoadJobConfig(**load_config_props)

        if row_count[table_name] == 0:
            logger.info(f"Zero records for {table}. Skip loading.")
            continue

        logger.info(f"Batch loading {table} to Bigquery")
        table_files[table_name].seek(0)
        table_id = f"{project_id}.{dataset_name}.{table_prefix}{table_name}{table_ext}"
        try:
            load_job = client.load_table_from_file(table_files[table_name],
                                                   table_id,
                                                   job_config=load_config)
        except exceptions.BadRequest:
            logger.error("Error loading records for table " + table_name)
            logger.error(bq_schema)
            table_files[table_name].seek(0)
            logger.debug(table_files[table_name].read())
            raise

        logger.info("Batch loading job {}".format(load_job.job_id))
        try:
            logger.debug(load_job.result())
        except Exception as e:
            logger.critical(load_job.errors)
            raise

    for key, value in row_count.items():
        row_uploads = {
            "type": "counter",
            "metric": "row_uploads",
            "value": value,
            "tags": {
                "endpoint": key
            },
        }
        logger.info(f"{json.dumps(row_uploads)}")

    for key, value in invalids.items():
        invalid_rows = {
            "type": "counter",
            "metric": "invalid_records",
            "value": value,
            "tags": {
                "endpoint": key
            },
        }
        logger.info(f"{json.dumps(invalid_rows)}")

    return state
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get("delimiter", ",")
    quotechar = config.get("quotechar", '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get("temp_dir", tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime("%Y%m%dT%H%M%S")

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise

        message_type = o["type"]
        if message_type == "RECORD":
            if o["stream"] not in schemas:
                raise Exception(
                    "A record for stream {} "
                    "was encountered before a corresponding schema".format(
                        o["stream"]))

            # Validate record
            try:
                validators[o["stream"]].validate(
                    utils.float_to_decimal(o["record"]))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o["record"]))
                raise ex

            record_to_load = o["record"]
            if config.get("add_metadata_columns"):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o["stream"] + "-" + now + ".csv"
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get("s3_key_prefix", ""),
                timestamp=now,
                naming_convention=config.get("naming_convention"),
            )
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o["stream"] not in headers and not file_is_empty:
                with open(filename, "r") as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o["stream"]] = (first_line if first_line else
                                            flattened_record.keys())
            else:
                headers[o["stream"]] = flattened_record.keys()

            with open(filename, "a") as csvfile:
                if file_is_empty:
                    header = (",".join([
                        json.dumps(
                            v, ensure_ascii=False, default=decimal_default)
                        for v in headers[o["stream"]]
                    ]) + "\n")
                    # header = header.encode('UTF-8')
                    csvfile.write(header)

                row = (",".join([
                    json.dumps(
                        flattened_record[k],
                        ensure_ascii=False,
                        default=decimal_default,
                    ) for k in headers[o["stream"]]
                ]) + "\n")
                # row = row.encode('UTF-8')
                csvfile.write(row)

            state = None
        elif message_type == "STATE":
            logger.debug("Setting state to {}".format(o["value"]))
            state = o["value"]
        elif message_type == "SCHEMA":
            stream = o["stream"]
            schemas[stream] = o["schema"]
            if config.get("add_metadata_columns"):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)
            schema = utils.float_to_decimal(o["schema"])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o["key_properties"]
        elif message_type == "ACTIVATE_VERSION":
            logger.debug("ACTIVATE_VERSION message")
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o["type"], o))

    # Upload created CSV files to S3
    targets = []
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower(
        ) == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                try:
                    with open(filename, "rb") as f_in:
                        with gzip.open(compressed_file, "wb") as f_out:
                            logger.info(
                                f"Compressing file as '{compressed_file}'")
                            shutil.copyfileobj(f_in, f_out)
                except FileNotFoundError:
                    logger.error(
                        "Could not find file (most likely already in AWS): {}".
                        format(filename))
            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))

        s3.upload_file(
            compressed_file or filename,
            s3_client,
            config.get("s3_bucket"),
            target_key,
            encryption_type=config.get("encryption_type"),
            encryption_key=config.get("encryption_key"),
        )

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

        targets.append(target_key)

    return state, targets
def persist_messages(messages, config, s3_client, do_timestamp_file=True): logger.info('persist_messages') state = None schemas = {} key_properties = {} validators = {} filenames = [] filename = None timestamp_file_part = '-' + datetime.now().strftime( '%Y%m%dT%H%M%S') if do_timestamp_file else '' max_file_size_mb = config.get('max_temp_file_size_mb', 50) stream = None if config.get('record_unique_field'): a = set() write_temp_pickle() for message in messages: try: o = singer.parse_message(message).asdict() except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(message)) raise message_type = o['type'] if message_type == 'RECORD': if o['stream'] not in schemas: raise Exception( "A record for stream {}" "was encountered before a corresponding schema".format( o['stream'])) # Validate record try: validators[o['stream']].validate( utils.float_to_decimal(o['record'])) except Exception as ex: if type(ex).__name__ == "InvalidOperation": logger.error( """Data validation failed and cannot load to destination. RECORD: {}\n 'multipleOf' validations that allows long precisions are not supported (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema. """.format(o['record'])) raise ex record_to_load = o['record'] if config.get('add_metadata_columns'): record_to_load = utils.add_metadata_values_to_record(o, {}) else: record_to_load = utils.remove_metadata_values_from_record(o) flattened_record = utils.flatten(record_to_load) filename = o['stream'] + timestamp_file_part + '.jsonl' filename = os.path.join(tempfile.gettempdir(), filename) filename = os.path.expanduser(filename) if not (filename, o['stream']) in filenames: filenames.append((filename, o['stream'])) with open(filename, 'a') as f: f.write(json.dumps(flattened_record, cls=DecimalEncoder)) f.write('\n') file_size = os.path.getsize(filename) if os.path.isfile( filename) else 0 if file_size >> 20 > max_file_size_mb: logger.info('file_size: {} MB, filename: {}'.format( round(file_size >> 20, 2), filename)) upload_to_s3(s3_client, config.get("s3_bucket"), os.environ["TARGET_S3_SOURCE_NAME"], filename, o['stream'], config.get('field_to_partition_by_time'), config.get('record_unique_field'), config.get("compression"), config.get('encryption_type'), config.get('encryption_key')) filenames.remove((filename, o['stream'])) state = None elif message_type == 'STATE': logger.info('Setting state to {}'.format(o['value'])) state = o['value'] elif message_type == 'SCHEMA': stream = o['stream'] schemas[stream] = o['schema'] if config.get('add_metadata_columns'): schemas[stream] = utils.add_metadata_columns_to_schema(o) schema = utils.float_to_decimal(o['schema']) validators[stream] = Draft4Validator( schema, format_checker=FormatChecker()) key_properties[stream] = o['key_properties'] elif message_type == 'ACTIVATE_VERSION': logger.debug('ACTIVATE_VERSION message') else: logger.warning("Unknown message type {} in message {}".format( o['type'], o)) # Upload created CSV files to S3 for filename, stream in filenames: upload_to_s3(s3_client, config.get("s3_bucket"), os.environ["TARGET_S3_SOURCE_NAME"], filename, stream, config.get('field_to_partition_by_time'), config.get('record_unique_field'), config.get("compression"), config.get('encryption_type'), config.get('encryption_key')) return state
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir', tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise

        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} "
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o['stream']] = first_line if first_line else flattened_record.keys()
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)
            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower(
        ) == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                with open(filename, 'rb') as f_in:
                    with gzip.open(compressed_file, 'wb') as f_out:
                        logger.info(
                            f"Compressing file as '{compressed_file}'")
                        shutil.copyfileobj(f_in, f_out)
            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))

        s3.upload_file(compressed_file or filename,
                       s3_client,
                       config.get('s3_bucket'),
                       target_key,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

    return state
def persist_lines_stream(project_id, dataset_id, ensure_ascii, lines=None,
                         validate_records=True, array_nodes=[],
                         force_to_string_fields=[]):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = collections.defaultdict(list)
    data_holder = []
    lines_read = False
    stream = None

    if flags.no_records:
        no_records = int(flags.no_records)
    else:
        logger.info('Number of records not specified. Setting to maximum: {}'.format(MAX_NO_RECORDS))
        no_records = MAX_NO_RECORDS

    if flags.data_location:
        bigquery_client = bigquery.Client(project=project_id, location=flags.data_location)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    payload_size = 0
    for line in lines:
        lines_read = True

        # skip SCHEMA messages (except for the initial one)
        if '{"anyOf": [{' in line:
            continue

        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if is_record_deleted(msg.record, SDC_DELETED_AT):
                continue

            if validate_records:
                validate(msg.record, schema)

            modified_record = handle_decimal_values(msg.record)
            modified_record = handle_empty_arrays(array_nodes, modified_record)
            modified_record = force_fields_to_string(force_to_string_fields, modified_record, ensure_ascii)

            item_size = getsize(modified_record)
            if payload_size + item_size >= MAX_PAYLOAD_SIZE:
                logger.info('Near max request size. Sending: {} records, payload size: {}.'.format(len(data_holder), payload_size))
                upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                if upload_res:
                    logger.error('Upload error: {}'.format(upload_res))
                else:
                    rows[msg.stream] += len(data_holder)
                data_holder = []
                payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size
            else:
                if len(data_holder) >= no_records:
                    logger.info(
                        "Max request size not reached, max #records reached. Sending: {} records, payload size: {} bytes.".format(
                            len(data_holder), item_size + payload_size))
                    upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                    if upload_res:
                        logger.error('Upload error: {}'.format(upload_res))
                    else:
                        rows[msg.stream] += len(data_holder)
                    data_holder = []
                    payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size

            stream = msg.stream
            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    if len(data_holder) > 0 and lines_read and stream:
        logger.info(
            "Remaining records. Sending: {} records, payload size: {} bytes.".format(len(data_holder), payload_size))
        upload_res = bigquery_client.insert_rows_json(tables[stream], data_holder)
        if upload_res:
            logger.error('Upload error: {}'.format(upload_res))
        else:
            rows[stream] += len(data_holder)

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: %s', errors[table])

    return state
def process(ProcessHandler, tap_stream, **kwargs): handler = ProcessHandler(logger, **kwargs) assert isinstance(handler, BaseProcessHandler) if handler.emit_initial_state(): s = kwargs.get("initial_state", {}) assert isinstance(s, dict) logger.info(f"Pushing state: {s}") yield s # yield init state, so even if there is an exception right after we get proper state emitted update_fields = kwargs.get("update_fields", False) for line in tap_stream: if update_fields: obj = json.loads(line.strip()) msg_type = obj['type'] new_obj = dict() # only deal with the first depth of fields, for Google Analytics schemas and records if msg_type == 'RECORD': for key, value in obj.items(): if key == 'record': new_obj[key] = dict() for k, v in obj[key].items(): new_obj[key][k.replace(':', '_')] = v else: new_obj[key] = value line = json.dumps(new_obj) elif msg_type == 'SCHEMA': for key, value in obj.items(): if key == 'schema': new_obj[key] = dict() for schema_key, schema_value in obj[key].items(): if schema_key == 'properties': new_obj[key][schema_key] = dict() for k, v in obj[key][schema_key].items(): new_obj[key][schema_key][k.replace( ':', '_')] = v else: new_obj[key][schema_key] = schema_value else: new_obj[key] = value line = json.dumps(new_obj) try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): for s in handler.handle_record_message(msg): logger.info(f"Pushing state: {s}") yield s elif isinstance(msg, singer.StateMessage): logger.info("Updating state with {}".format(msg.value)) for s in handler.handle_state_message(msg): logger.info(f"Pushing state: {s}") yield s elif isinstance(msg, singer.SchemaMessage): logger.info("{} schema: {}".format(msg.stream, msg.schema)) for s in handler.handle_schema_message(msg): logger.info(f"Pushing state: {s}") yield s elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for s in handler.on_stream_end(): logger.info(f"Pushing state: {s}") yield s
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    filenames = []
    file_size_counters = dict()
    file_count_counters = dict()
    file_data = dict()
    filename = None
    s3_path, s3_filename = None, None
    now = datetime.now().strftime('%Y%m%dT%H%M%S')
    max_file_size_mb = config.get('max_temp_file_size_mb', 1000)
    stream = None

    if config.get('record_unique_field'):
        a = set()
        write_temp_pickle()

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise

        message_type = o['type']
        # if message_type != 'RECORD':
        #     logger.info("{} - message: {}".format(message_type, o))
        # if message_type not in message_types:
        #     logger.info("{} - message: {}".format(message_type, o))
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} "
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        """Data validation failed and cannot load to destination. RECORD: {}\n
                        'multipleOf' validations that allows long precisions are not supported
                        (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.
                        """.format(o['record']))
                raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            flattened_record = utils.flatten_record(record_to_load)

            if filename is None:
                filename = '{}.jsonl'.format(now)
                filename = os.path.join(tempfile.gettempdir(), filename)
                filename = os.path.expanduser(filename)
                file_size_counters[filename] = 0
                file_count_counters[filename] = file_count_counters.get(
                    filename, 1)

            full_s3_target = str(
                file_count_counters[filename]) + '_' + filename
            if not (filename, full_s3_target) in filenames:
                filenames.append((filename, full_s3_target))

            file_size = os.path.getsize(filename) if os.path.isfile(
                filename) else 0
            if file_size >> 20 > file_size_counters[
                    filename] and file_size >> 20 % 100 == 0:
                logger.info('file_size: {} MB, filename: {}'.format(
                    round(file_size >> 20, 2), filename))
                file_size_counters[filename] = file_size_counters.get(
                    filename, 0) + 10

            if file_size >> 20 > max_file_size_mb:
                logger.info(
                    'Max file size reached: {}, dumping to s3...'.format(
                        max_file_size_mb))
                upload_to_s3(s3_client, config.get("s3_bucket"), filename,
                             stream, config.get('field_to_partition_by_time'),
                             config.get('record_unique_field'),
                             config.get("compression"),
                             config.get('encryption_type'),
                             config.get('encryption_key'))
                file_size = 0
                file_count_counters[filename] = file_count_counters.get(
                    filename, 1) + 1
                if filename in headers:
                    del headers[filename]

            file_is_empty = file_size == 0
            if file_is_empty:
                logger.info('creating file: {}'.format(filename))

            with open(filename, 'a') as f:
                f.write(json.dumps(flattened_record))
                f.write('\n')

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)
            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
            filename = None
            if config.get('field_to_partition_by_time'
                          ) not in key_properties[stream]:
                raise Exception(
                    """field_to_partition_by_time '{}' is not in key_properties: {}"""
                    .format(config.get('field_to_partition_by_time'),
                            key_properties[stream]))
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created files to S3
    for filename, s3_target in filenames:
        upload_to_s3(s3_client, config.get("s3_bucket"), filename, stream,
                     config.get('field_to_partition_by_time'),
                     config.get('record_unique_field'),
                     config.get("compression"),
                     config.get('encryption_type'),
                     config.get('encryption_key'))

    return state
def persist_lines_job( project_id, dataset_id, lines=None, truncate=False, validate_records=True, key_path=None, ): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} if key_path: credentials = service_account.Credentials.from_service_account_file( key_path, scopes=['https://www.googleapis.com/auth/cloud-platform'], ) bigquery_client = bigquery.Client(credentials=credentials, project=project_id) else: bigquery_client = bigquery.Client(project=project_id) # try: # dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) # except exceptions.Conflict: # pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error('Unable to parse:\n{}'.format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( 'A record for stream {} was encountered before a corresponding schema' .format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row. dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8') rows[msg.stream].write(dat) # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8')) state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties # tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = TemporaryFile(mode='w+b') errors[table] = None # try: # tables[table] = bigquery_client.create_table(tables[table]) # except exceptions.Conflict: # pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception('Unrecognized message {}'.format(msg)) for table in rows.keys(): table_ref = bigquery_client.dataset(dataset_id).table(table) SCHEMA = build_schema(schemas[table]) load_config = LoadJobConfig() load_config.schema = SCHEMA load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON if truncate: load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE rows[table].seek(0) logger.info('loading {} to Bigquery.\n'.format(table)) load_job = bigquery_client.load_table_from_file(rows[table], table_ref, job_config=load_config) logger.info('loading job {}'.format(load_job.job_id)) logger.info(load_job.result()) # for table in errors.keys(): # if not errors[table]: # print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path) # else: # print('Errors:', errors[table], sep=" ") return state
def summarize_output(output):
    summary = OutputSummary()
    for line in output:
        summary.add(singer.parse_message(line))

    return summary
def test_parse_message_record_naive_extraction_time(self):
    with self.assertRaisesRegex(ValueError, "must be either None or an aware datetime"):
        message = singer.parse_message(
            '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2, "time_extracted": "1970-01-02T00:00:00"}')
def persist_lines_job(lines=None, truncate=False, validate_records=True): state = None schemas = {} key_properties = {} bq_schemas = {} rows = {} errors = {} # try: # dataset = BIGQUERY_CLIENT.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) # except exceptions.Conflict: # pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) msg.record = apply_string_conversions(msg.record, bq_schemas[msg.stream]) # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row. new_record = apply_decimal_conversions(msg.record) dat = bytes(json.dumps(new_record) + '\n', 'UTF-8') rows[msg.stream].write(dat) # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8')) state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties bq_schemas[table] = build_schema(schemas[table]) rows[table] = TemporaryFile(mode='w+b') errors[table] = None # try: # tables[table] = BIGQUERY_CLIENT.create_table(tables[table]) # except exceptions.Conflict: # pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in rows.keys(): table_ref = BIGQUERY_CLIENT.dataset(DATASET_ID).table(fix_name(table)) load_config = LoadJobConfig() load_config.schema = bq_schemas[table] load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON if truncate: load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE rows[table].seek(0, os.SEEK_END) if rows[table].tell() == 0: continue rows[table].seek(0) logger.info("loading {} to Bigquery.\n".format(table)) load_job = BIGQUERY_CLIENT.load_table_from_file( rows[table], table_ref, job_config=load_config) logger.info("loading job {}".format(load_job.job_id)) logger.info(load_job.result()) sync_state_for_table(table, state) # for table in errors.keys(): # if not errors[table]: # print('Loaded {} row(s) into {}:{}'.format(rows[table], DATASET_ID, table), tables[table].path) # else: # print('Errors:', errors[table], sep=" ") return state
def test_parse_message_state_missing_value(self):
    with self.assertRaises(Exception):
        singer.parse_message('{"type": "STATE"}')
def persist_lines_stream(lines=None, validate_records=True): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} dataset_ref = BIGQUERY_CLIENT.dataset(DATASET_ID) dataset = Dataset(dataset_ref) try: dataset = BIGQUERY_CLIENT.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) msg.record = apply_string_conversions(msg.record, tables[msg.stream].schema) msg.record = apply_decimal_conversions(msg.record) errors[msg.stream] = BIGQUERY_CLIENT.insert_rows_json(tables[msg.stream], [msg.record]) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value sync_state(state) emit_state(state) elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = BIGQUERY_CLIENT.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: logging.info('Loaded {} row(s) into {}:{}'.format(rows[table], DATASET_ID, table, tables[table].path)) emit_state(state) else: logging.error('Errors:', str(errors[table])) return state
def persist_lines_job( project_id, dataset_id, lines=None, truncate=False, validate_records=True, table_suffix=None, ): state = None schemas = {} key_properties = {} rows = {} errors = {} table_suffix = table_suffix or "" class DecimalEncoder(json.JSONEncoder): # pylint: disable=method-hidden def default(self, o): if isinstance(o, decimal.Decimal): return str(o) return super(DecimalEncoder, self).default(o) bigquery_client = bigquery.Client(project=project_id) for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): table_name = msg.stream + table_suffix if table_name not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema" .format(table_name)) schema = schemas[table_name] if validate_records: validate(msg.record, schema) new_rec = filter(schema, msg.record) # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row. dat = bytes( json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8") rows[table_name].write(dat) state = None elif isinstance(msg, singer.StateMessage): logger.debug("Setting state to {}".format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table_name = msg.stream + table_suffix if table_name in rows: continue schemas[table_name] = msg.schema key_properties[table_name] = msg.key_properties rows[table_name] = TemporaryFile(mode="w+b") errors[table_name] = None elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in rows.keys(): table_ref = bigquery_client.dataset(dataset_id).table(table) SCHEMA = build_schema(schemas[table]) load_config = LoadJobConfig() load_config.schema = SCHEMA load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON if truncate: load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE logger.info("loading {} to Bigquery.\n".format(table)) try: load_job = bigquery_client.load_table_from_file( rows[table], table_ref, job_config=load_config, rewind=True) logger.info("loading job {}".format(load_job.job_id)) logger.info(load_job.result()) except exceptions.BadRequest as err: logger.error("failed to load table {} from file: {}".format( table, str(err))) if load_job.errors: messages = [ f"reason: {err['reason']}, message: {err['message']}" for err in load_job.errors ] logger.error("errors:\n{}".format("\n".join(messages))) raise return state
def persist_messages(delimiter, quotechar, file, messages): state = None schemas = {} key_properties = {} headers = {} validators = {} logger.info("do persists") now = datetime.now().strftime('%Y%m%dT%H%M%S') if file is not None: try: os.remove(file) except FileNotFoundError: pass for message in messages: try: o = singer.parse_message(message).asdict() except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(message)) raise message_type = o['type'] if message_type == 'RECORD': if o['stream'] not in schemas: raise Exception( "A record for stream {}" "was encountered before a corresponding schema".format( o['stream'])) validators[o['stream']].validate(o['record']) filename = o['stream'] + '-' + now + '.csv' if file is not None: filename = file file_is_empty = ( not os.path.isfile(filename)) or os.stat(filename).st_size == 0 flattened_record = flatten(o['record']) with open(filename, 'a') as csvfile: if o['stream'] not in headers: if not file_is_empty: with open(filename, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar) first_line = next(reader) headers[o[ 'stream']] = first_line if first_line else flattened_record.keys( ) else: headers[o['stream']] = extract_header_names( property=schemas[o['stream']]['properties']) logger.info(f"generated headers: {headers[o['stream']]}") writer = csv.DictWriter(csvfile, headers[o['stream']], extrasaction='ignore', delimiter=delimiter, quotechar=quotechar) if file_is_empty: header_names = headers[o['stream']] writer.writeheader() for header in headers[o['stream']]: if header not in flattened_record: flattened_record[header] = None writer.writerow(flattened_record) state = None elif message_type == 'STATE': logger.debug('Setting state to {}'.format(o['value'])) state = o['value'] elif message_type == 'SCHEMA': stream = o['stream'] schemas[stream] = o['schema'] validators[stream] = Draft4Validator(o['schema']) key_properties[stream] = o['key_properties'] else: raise Exception("Unknown message type {} in message {}".format( o['type'], o)) return state
def test_parse_message_record_with_version_good(self):
    message = singer.parse_message(
        '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2}')
    self.assertEqual(
        message,
        singer.RecordMessage(record={'name': 'foo'},
                             stream='users',
                             version=2))
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}
    file_counts = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')
    max_file_size = config.get('max_file_size_mb', 1000) * 1000000
    compression = config.get('compression')
    flatten = config.get('flatten', True)
    s3_bucket = config.get('s3_bucket')
    skip_upload = s3_bucket == 'localhost'

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir', tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise

        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} "
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                # validators[o['stream']].validate(utils.float_to_decimal(o['record']))
                pass  # Skipping validation as it slows things way down
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(
                record_to_load) if flatten else record_to_load

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o['stream']] = first_line if first_line else flattened_record.keys()
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            if os.stat(filename).st_size > max_file_size:
                cnt = file_counts[filename] = file_counts.get(filename, 0) + 1

                # Add counter sequence to filename
                rename_file = utils.add_file_count(filename, cnt)
                os.rename(filename, rename_file)
                filename = rename_file

                compressed_file = utils.compress_file(filename, compression)
                comp_ext = '.gz' if compressed_file else ''

                if skip_upload:
                    continue  # Skip S3 upload and keep local files

                # upload to s3 with amended target_key
                s3.upload_file(compressed_file or filename,
                               s3_client,
                               config.get('s3_bucket'),
                               utils.add_file_count(target_key, cnt) + comp_ext,
                               encryption_type=config.get('encryption_type'),
                               encryption_key=config.get('encryption_key'))

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)
            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        if not os.path.isfile(filename):
            continue

        cnt = file_counts.get(filename, 0) + 1
        if cnt > 1:
            target_key = utils.add_file_count(target_key, cnt)

        compressed_file = utils.compress_file(filename, compression)
        comp_ext = '.gz' if compressed_file else ''

        if skip_upload:
            continue  # Skip S3 upload and keep local files

        s3.upload_file(compressed_file or filename,
                       s3_client,
                       s3_bucket,
                       target_key + comp_ext,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

    return state
def test_parse_message_record_missing_stream(self):
    with self.assertRaises(Exception):
        singer.parse_message(
            '{"type": "RECORD", "record": {"name": "foo"}}')
def write_records(cluster, username, password, bucket, lines=None, collection_map=None, index_keys=None, on_invalid_record="abort"): if on_invalid_record not in ("abort", "skip", "force"): raise ValueError("on_invalid_record must be one of" + " (abort, skip, force)") state = None schemas = {} tables = {} key_properties = {} table_files = {} row_count = {} errors = {} cluster = Cluster( "couchbase://" + cluster, ClusterOptions(PasswordAuthenticator(username, password))) count = 0 invalids = 0 current_batch = defaultdict(dict) for line in lines: try: message = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(message, singer.RecordMessage): json_dumps = False record, invalids = clean_and_validate(message, schemas, invalids, on_invalid_record, json_dumps) if invalids == 0 or on_invalid_record == "force": record["_stream"] = message.stream collection_name = None if collection_map: collection_name = collection_map.get(message.stream) if index_keys and index_keys.get(message.stream): key = record[index_keys[message.stream]] else: key = uuid.uuid4().hex current_batch[collection_name or "_"][key] = record if is_batch_ready(current_batch[collection_name or "_"]): flush_batch(cluster, bucket, current_batch.pop(collection_name or "_")) row_count[message.stream] += 1 state = None elif isinstance(message, singer.StateMessage): state = message.value # State may contain sensitive info. Not logging in production logger.debug("State: %s" % state) currently_syncing = state.get("currently_syncing") bookmarks = state.get("bookmarks") if currently_syncing and bookmarks: logger.info( "State: currently_syncing %s - last_update: %s" % (currently_syncing, bookmarks.get( currently_syncing, dict()).get("last_update"))) elif isinstance(message, singer.SchemaMessage): table_name = message.stream if schemas.get(table_name): # Redundant schema rows continue schemas[table_name] = message.schema key_properties[table_name] = message.key_properties row_count[table_name] = 0 errors[table_name] = None elif isinstance(message, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(message)) count = count + 1 for collection_name, batch in current_batch.items(): if batch: if collection_name == "_": collection_name = None flush_batch(cluster, bucket, batch, collection_name) return state
def test_parse_message_schema_missing_schema(self):
    with self.assertRaises(Exception):
        message = singer.parse_message(
            '{"type": "SCHEMA", "stream": "users", "key_properties": ["name"]}')  # nopep8
def persist_lines(project_id, dataset_id, table_id, lines):
    state = None
    schemas = {}
    key_properties = {}
    rows = []

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]
            validate(msg.record, schema)

            bigquery_client = bigquery.Client(project=project_id)
            dataset_ref = bigquery_client.dataset(dataset_id)
            dataset = Dataset(dataset_ref)
            try:
                dataset = bigquery_client.create_dataset(
                    Dataset(dataset_ref)) or Dataset(dataset_ref)
            except exceptions.Conflict:
                pass

            table_ref = dataset.table(table_id)
            table_schema = build_schema(schema)
            table = bigquery.Table(table_ref, schema=table_schema)
            try:
                table = bigquery_client.create_table(table)
            except exceptions.Conflict:
                pass

            rows.append(msg.record)
            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            schemas[msg.stream] = msg.schema
            key_properties[msg.stream] = msg.key_properties

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    # The client and table are only created once the first record has been
    # seen, so guard against streams that emitted no records at all.
    if rows:
        errors = bigquery_client.create_rows(table, rows)

        if not errors:
            print('Loaded {} row(s) into {}:{}'.format(len(rows), dataset_id,
                                                       table_id))
        else:
            print('Errors:')
            pprint(errors)

    return state