def parse_args():
    '''
    This is to replace singer's default singer_utils.parse_args()
    https://github.com/singer-io/singer-python/blob/master/singer/utils.py

    Parse standard command-line args.
    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:

    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    --catalog       Catalog file

    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we will automatically
    load and parse the JSON file.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c', '--config',
        help='Config file',
        required=True)

    parser.add_argument(
        '-s', '--state',
        help='State file')

    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, please use --catalog instead')

    parser.add_argument(
        '--catalog',
        help='Catalog file')

    parser.add_argument(
        '-d', '--discover',
        action='store_true',
        help='Do schema discovery')

    # Capture additional args
    parser.add_argument(
        "--start_datetime", type=str,
        help="Inclusive start datetime in ISO 8601 format: 2019-04-11T00:00:00Z")
    parser.add_argument(
        "--end_datetime", type=str,
        help="Exclusive end datetime in ISO 8601 format: 2019-04-12T00:00:00Z")

    args = parser.parse_args()
    if args.config:
        args.config = singer_utils.load_json(args.config)
    if args.state:
        args.state = singer_utils.load_json(args.state)
    else:
        args.state = {}
    if args.properties:
        args.properties = singer_utils.load_json(args.properties)
    if args.catalog:
        args.catalog = Catalog.load(args.catalog)

    return args
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))

    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)

        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

        if entity_name in ["deals"]:
            v3_schema = get_v3_schema(entity_name)
            for key, value in v3_schema.items():
                if any(prefix in key for prefix in V3_PREFIXES):
                    custom_schema[key] = value

        # Move properties to top level
        custom_schema_top_level = {
            'property_{}'.format(k): v for k, v in custom_schema.items()
        }
        schema['properties'].update(custom_schema_top_level)

        # Make properties_versions selectable and share the same schema.
        versions_schema = utils.load_json(
            get_abs_path('schemas/versions.json'))
        schema['properties']['properties_versions'] = versions_schema

    if entity_name == "contacts":
        schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
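# Illustrative sketch (toy property names, not real HubSpot metadata) of the
# "move properties to top level" step in load_schema above: each custom property is
# re-keyed as 'property_<name>' so it becomes a directly selectable top-level field.
custom_schema_example = {
    "dealname": {"type": ["null", "string"]},
    "hs_analytics_source": {"type": ["null", "string"]},
}
top_level_example = {'property_{}'.format(k): v for k, v in custom_schema_example.items()}
assert sorted(top_level_example) == ["property_dealname", "property_hs_analytics_source"]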
def __init__(
    self,
    config: Union[Dict[str, Any], Path],
    state: Union[None, Dict[str, Any], Path] = None,
    catalog: Union[None, Dict[str, Any], Catalog, Path] = None,
    discover: bool = False,
    **kwargs,
):
    self.catalog_path = self.state_path = self.config_path = None

    if isinstance(catalog, Path):
        self.catalog_path = str(catalog)
        catalog = Catalog.load(catalog)
    elif isinstance(catalog, dict):
        catalog = Catalog.from_dict(catalog)

    if isinstance(config, Path):
        self.config_path = str(config)
        config = load_json(config)

    if isinstance(state, Path):
        self.state_path = str(state)
        state = load_json(state)

    self.config = config
    self.state = state
    self.catalog = catalog
    self.discover = discover

    for name, val in kwargs.items():
        setattr(self, name, val)
def main():
    global STATE
    global AUTH
    try:
        AUTH = utils.load_json(PATH)
    except FileNotFoundError:
        LOGGER.error('Config file not found')
        sys.exit(1)

    if STATE_PATH is not None:
        try:
            state = utils.load_json(STATE_PATH)
        except FileNotFoundError:
            LOGGER.error('State file not found')
            sys.exit(1)

        if AUTH['type'] == 'day':
            LOGGER.info('Started data load for daily level metrics')
            STATE = {"filter": state, "increment": AUTH['increment'], "type": AUTH['type']}
            start_load_day(AUTH)
        elif AUTH['type'] == 'minute':
            LOGGER.info('Started data load for minute level metrics')
            STATE = {"filter": state, "increment": AUTH['increment'], "type": AUTH['type']}
            start_load_day(AUTH)
        else:
            LOGGER.error('Load type should be minute or day')
            sys.exit(1)
    else:
        LOGGER.info('--state option not passed; running tap with default options')
        if AUTH['type'] == 'minute':
            STATE = DEFAULT_FILTER_MIN
            try:
                date = str(parse(AUTH['start_date']).date())
                time_portion = str(parse(AUTH['start_date']).time())[0:5]
            except ValueError:
                LOGGER.error('Start date not in RFC3339 format')
                sys.exit(1)
            STATE['filter']['date_ranges'][0]['last_day'] = date
            STATE['filter']['time_ranges'][0]['until'] = time_portion
            STATE['increment'] = AUTH['increment']
            STATE['type'] = AUTH['type']
            start_load_min(AUTH)
            LOGGER.info('Minute level load done')
        elif AUTH['type'] == 'day':
            STATE = DEFAULT_FILTER_DAY
            try:
                date = str(parse(AUTH['start_date']).date())
            except ValueError:
                LOGGER.error('Start date not in RFC3339 format')
                sys.exit(1)
            STATE['filter']['date_ranges'][0]['last_day'] = date
            STATE['increment'] = AUTH['increment']
            STATE['type'] = AUTH['type']
            start_load_day(AUTH)
            LOGGER.info('Day level load done')
        else:
            LOGGER.error('Load type should be minute or day')
            sys.exit(1)
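# A hedged sketch of the shape DEFAULT_FILTER_MIN appears to have, inferred only from the
# keys that main() above writes into it (filter.date_ranges[0].last_day,
# filter.time_ranges[0].until, increment, type); the tap's real default filter may carry
# additional fields.
DEFAULT_FILTER_MIN_SKETCH = {
    "filter": {
        "date_ranges": [{"last_day": ""}],
        "time_ranges": [{"until": ""}],
    },
    "increment": None,
    "type": "minute",
}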
def parse_args(required_config_keys):
    # Forked from singer.utils.parse_args to be able to grab the path of the state file
    '''Parse standard command-line args.

    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:

    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    -p,--properties Properties file: DEPRECATED, please use --catalog instead
    --catalog       Catalog file

    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we will automatically
    load and parse the JSON file.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-c', '--config', help='Config file', required=True)
    parser.add_argument('-s', '--state', help='State file')
    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, please use --catalog instead')
    parser.add_argument('--catalog', help='Catalog file')
    parser.add_argument('-d', '--discover', action='store_true', help='Do schema discovery')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = utils.load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state_file = args.state
        args.state = utils.load_json(args.state)
    else:
        args.state_file = None
        args.state = {}
    if args.properties:
        setattr(args, 'properties_path', args.properties)
        args.properties = utils.load_json(args.properties)
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
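# One plausible use of the state_file path captured above (a sketch, not the original
# tap's code): writing the final state back to the file the tap was started with so the
# next run resumes from it.
import json

def save_state_sketch(args, state):
    if getattr(args, 'state_file', None):
        with open(args.state_file, 'w') as handle:
            json.dump(state, handle, indent=2)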
def load_schema_for_v3_entity(entity_name):
    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))

    custom_schema = get_custom_schema(entity_name)

    # Move properties to top level
    custom_schema_top_level = {k: v["properties"]["value"] for k, v in custom_schema.items()}
    schema['properties'].update(custom_schema_top_level)

    # Make properties_versions selectable and share the same schema.
    versions_schema = utils.load_json(get_abs_path('schemas/versions.json'))
    schema['properties']['properties_versions'] = versions_schema

    return schema
def load_schema(entity):
    '''Returns the schema for the specified source'''
    schema = utils.load_json(
        os.path.join(CONFIG["schema_dir"], "{}.json".format(entity)))
    # schema = utils.load_json(get_abs_path(CONFIG["schema_dir"] + "/{}.json".format(entity)))
    return schema
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    schema = utils.load_json(get_abs_path(path))
    refs = schema.pop("definitions", {})
    if refs:
        singer.resolve_schema_references(schema, refs)
    return schema
def parse_args(required_config_keys):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c', '--config',
        help='Config file',
        required=True)

    parser.add_argument(
        '-s', '--state',
        help='State file')

    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, Please use --catalog instead')

    parser.add_argument(
        '--catalog',
        help='Catalog file')

    parser.add_argument(
        '-d', '--discover',
        action='store_true',
        help='Do schema discovery')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = utils.load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state_file = args.state
        args.state = utils.load_json(args.state)
    else:
        args.state_file = None
        args.state = {}
    if args.properties:
        setattr(args, 'properties_path', args.properties)
        args.properties = utils.load_json(args.properties)
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
def load_schema_references():
    shared_schema_path = get_abs_path('schemas/definitions.json')

    refs = {}
    # load json from the path
    refs["definitions.json"] = utils.load_json(shared_schema_path)

    return refs
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    schema = utils.load_json(get_abs_path(path))
    dependencies = schema.pop("tap_schema_dependencies", [])
    refs = {}
    for sub_stream_id in dependencies:
        refs[sub_stream_id] = load_schema(sub_stream_id)
    if refs:
        singer.resolve_schema_references(schema, refs)
    return schema
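# Self-contained sketch (toy schema, hypothetical ref path) of what
# singer.resolve_schema_references does in the ref-resolving loaders above: "$ref"
# pointers into the refs store are replaced in place by the referenced sub-schema.
import singer

def resolve_refs_example():
    schema = {
        "type": "object",
        "properties": {"amount": {"$ref": "definitions.json#/currency"}},
    }
    refs = {"definitions.json": {"currency": {"type": ["null", "number"]}}}
    singer.resolve_schema_references(schema, refs)
    return schema  # schema["properties"]["amount"] is now {"type": ["null", "number"]}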
def test_path(self):
    # from valid path
    with tempfile.NamedTemporaryFile() as fil:
        fil.write(self.expected_json.encode())
        fil.seek(0)
        from_path = u.load_json(fil.name)
        self.assertEqual(from_path, json.loads(self.expected_json))

    # from invalid path
    self.assertRaises(FileNotFoundError, u.load_json, 'does_not_exist.json')
def load_schema(entity_name):
    if entity_name in v3_entities_with_dynamic_fields:
        return load_schema_for_v3_entity(entity_name)

    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))

    if entity_name in ["contacts", "companies", "deal_histories"]:
        custom_schema = get_custom_schema(entity_name)

        # Move properties to top level
        custom_schema_top_level = {'property_{}'.format(k): v for k, v in custom_schema.items()}
        schema['properties'].update(custom_schema_top_level)

        # Make properties_versions selectable and share the same schema.
        versions_schema = utils.load_json(get_abs_path('schemas/versions.json'))
        schema['properties']['properties_versions'] = versions_schema

    if entity_name == "contacts":
        schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)

    for k in schema['properties']:
        if k in set(stream.key_properties):
            schema['properties'][k]['inclusion'] = 'automatic'
        elif k in field_class.__dict__:
            schema['properties'][k]['inclusion'] = 'available'

    return schema
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))

    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)
        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

    return schema
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)

    for k in schema['properties']:
        if k not in field_class.__dict__:
            LOGGER.warning(
                'Property %s.%s is not defined in the facebook_business library',
                stream.name, k)

    return schema
def _prep_config():
    cwd, _ = os.path.split(__file__)
    usgs_dir = os.path.join(cwd, "../examples/usgs")
    config = utils.load_json(os.path.join(usgs_dir, "config/tap_config.json"))
    config["schema_dir"] = os.path.join(usgs_dir, "schema")
    config["catalog_dir"] = os.path.join(usgs_dir, "catalog")
    catalog = Catalog.load(os.path.join(usgs_dir, config["catalog_dir"], "earthquakes.json"))
    config["start_datetime"] = (datetime.datetime.now() -
                                datetime.timedelta(hours=1)).isoformat()
    streams = {}
    streams["earthquakes"] = Stream("earthquakes", config)
    return config, catalog, streams
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))

    if entity_name in ["contacts", "companies", "deals", "tickets"]:
        custom_schema = get_custom_schema(entity_name)
        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

    if entity_name == "contacts":
        schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)

    for k in schema['properties']:
        if k in set(stream.key_properties) or k == UPDATED_TIME_KEY:
            schema['properties'][k]['inclusion'] = 'automatic'
        else:
            if k not in field_class.__dict__:
                LOGGER.warning(
                    'Property %s.%s is not defined in the facebook_business library',
                    stream.name, k)
            schema['properties'][k]['inclusion'] = 'available'

    return schema
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path("schemas/{}.json".format(entity_name)))

    # don't load custom properties for companies as they are not used
    # if entity_name in ["contacts", "companies", "deals"]:
    if entity_name in ["contacts", "deals"]:
        custom_schema = get_custom_schema(entity_name)
        schema["properties"]["properties"] = {
            "type": "object",
            "properties": custom_schema,
        }

    # associated companies include 700 properties
    # removed for efficiency
    # if entity_name == "contacts":
    #     schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
def load_schema(entity_name: str) -> Schema:
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))

    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)

        if entity_name in ["deals"]:
            v3_schema = get_v3_schema()
            for key, value in v3_schema.items():
                if any(prefix in key for prefix in V3_PREFIXES):
                    custom_schema[key] = value

        # Move properties to top level
        custom_schema_top_level = {
            'property_{}'.format(k): v for k, v in custom_schema.items()
        }
        schema['properties'].update(custom_schema_top_level)

    return schema
def load_schema(entity_name):
    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))
    return schema
def load_metadata(entity):
    return utils.load_json(get_abs_path("metadata/{}.json".format(entity)))
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    return utils.load_json(get_abs_path(path))
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    schema = utils.load_json(path)
    return schema
def load_schema(entity):
    return utils.load_json(get_abs_path("schemas/{}.json".format(entity)))
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_comitted = min(
        [get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(
            s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception(
            "Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(
        datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds.
        # This needs to be long enough to wait for the largest single wal payload
        # to avoid unplanned timeouts.
        poll_duration = (datetime.datetime.utcnow() - lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(
                datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(msg.data_start),
                    int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted,
                                    conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the
            # same lsn. This is to ensure we only flush to a lsn that has completed entirely.
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn comitted in previous run,
                # or first lsn received in this run
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                    int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)
            elif int(msg.data_start) > lsn_currently_processing:
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # Whether or not data is received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp +
                                          datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                        datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed),
                        lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                        datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed),
                        lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_comitted currently captured in state file on disk
                lsn_comitted = min(
                    [get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn')
                     for s in logical_streams])
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info(
                    "{} : Confirming write up to {}, flush to {} (last message received was {} at {})".format(
                        datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                        int_to_lsn(lsn_to_flush), int_to_lsn(lsn_last_processed),
                        lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_comitted > lsn_last_processed:
            lsn_last_processed = lsn_comitted
            LOGGER.info("Current lsn_last_processed {} is older than lsn_comitted {}".format(
                int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted)))

        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(
                s['tap_stream_id'], lsn_last_processed, int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
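# Self-contained illustration (toy state, not from the tap) of the bookmark arithmetic used
# by sync_tables above: the restart LSN is the minimum 'lsn' bookmark across the selected
# logical streams, so no stream is advanced past data it has not yet committed.
from singer.bookmarks import get_bookmark, write_bookmark

toy_state = {
    "bookmarks": {
        "public-orders": {"lsn": 108000},
        "public-customers": {"lsn": 107500},
    }
}
toy_streams = [{"tap_stream_id": "public-orders"}, {"tap_stream_id": "public-customers"}]
restart_lsn = min(get_bookmark(toy_state, s["tap_stream_id"], "lsn") for s in toy_streams)
assert restart_lsn == 107500
toy_state = write_bookmark(toy_state, "public-orders", "lsn", 108200)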
def load_schema(entity):
    '''Returns the schema for the specified source'''
    schema = utils.load_json(get_abs_path("schemas/{}.json".format(entity)))
    return schema
PARSER.add_argument('--state', action='store', dest='state', help='Path for state file')
ARGUMENTS = PARSER.parse_args()
LOGGER = singer.logger.get_logger()
URL = 'https://advertising.criteo.com/API/v201305/AdvertiserService.asmx'

if ARGUMENTS.path is None:
    LOGGER.error('Specify configuration file folder.')
    sys.exit(1)

PATH = ARGUMENTS.path
AUTH = utils.load_json(PATH)
CLIENT = soapclient(
    'https://advertising.criteo.com/API/v201305/AdvertiserService.asmx?WSDL',
    headers={'User-Agent': AUTH['user_agent']})
HEADERS = CLIENT.factory.create('apiHeader')

if ARGUMENTS.state:
    STATE = utils.load_json(ARGUMENTS.state)
    STATE_DEFINED = True
else:
    STATE = {"aggregationType": "Hourly",
             "startDate": "",
             "reportType": "",
             "endDate": "",
             "reportSelector": {},
def parse_args(spec_file, required_config_keys):
    '''
    This is to replace singer's default utils.parse_args()
    https://github.com/singer-io/singer-python/blob/master/singer/utils.py

    Parse standard command-line args.
    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:

    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    --catalog       Catalog file

    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we will automatically
    load and parse the JSON file.
    '''
    # Read default spec file
    default_spec = {}
    default_spec_file = get_abs_path("default_spec.json")
    with open(default_spec_file, "r") as f:
        default_spec.update(json.load(f))

    # Read spec file
    with open(spec_file, "r") as f:
        SPEC.update(json.load(f))

    # TODO: What about the fields other than arg
    for a in default_spec["args"]:
        if SPEC["args"].get(a) is None:
            SPEC["args"][a] = default_spec["args"][a]

    parser = argparse.ArgumentParser(SPEC["application"])

    parser.add_argument("spec_file", type=str, help="Specification file")

    # Capture additional args
    for arg in SPEC["args"].keys():
        parser.add_argument(
            "--" + arg,
            type=TYPES[SPEC["args"][arg]["type"]],
            default=SPEC["args"][arg].get("default"),
            help=SPEC["args"][arg].get("help"),
            required=SPEC["args"][arg].get("required", False))

    # Default arguments
    parser.add_argument('-c', '--config', help='Config file', required=True)

    """
    parser.add_argument(
        "--schema_dir", type=str, help="Path to the schema directory.",
        required=True)
    """

    parser.add_argument('-s', '--state', help='State file')
    parser.add_argument('--catalog', help='Catalog file')
    parser.add_argument('-d', '--discover', action='store_true', help='Do schema discovery')
    parser.add_argument('-i', '--infer_schema', action='store_true', help='Do infer schema')

    parser.add_argument(
        "--url", type=str,
        help="REST API endpoint with {params}. Required in config.")

    args = parser.parse_args()

    if args.config:
        args.config = utils.load_json(args.config)
    if args.state:
        args.state = utils.load_json(args.state)
    else:
        args.state = {}
    if args.catalog and os.path.isfile(args.catalog):
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
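# A hedged example of what the spec_file consumed by parse_args above might contain,
# inferred only from the keys the function reads (SPEC["application"] and, per argument,
# "type", "default", "help", "required"); the argument names and type labels here are
# hypothetical, and the real spec format may define additional fields.
EXAMPLE_SPEC = {
    "application": "tap_example_rest",
    "args": {
        "items_per_page": {
            "type": "integer",
            "default": 100,
            "help": "Page size for each API request",
            "required": False
        },
        "timezone": {
            "type": "string",
            "default": "UTC",
            "help": "Timezone used to interpret start/end datetimes",
            "required": False
        }
    }
}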