Example #1
def parse_args():
    ''' This replaces singer's default singer_utils.parse_args()
    https://github.com/singer-io/singer-python/blob/master/singer/utils.py

    Parse standard command-line args.
    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:
    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    --catalog       Catalog file
    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we automatically
    load and parse the JSON file.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c', '--config',
        help='Config file',
        required=True)

    parser.add_argument(
        '-s', '--state',
        help='State file')

    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, Please use --catalog instead')

    parser.add_argument(
        '--catalog',
        help='Catalog file')

    parser.add_argument(
        '-d', '--discover',
        action='store_true',
        help='Do schema discovery')

    # Capture additional args
    parser.add_argument(
        "--start_datetime", type=str,
        help="Inclusive start date time in ISO8601-Date-String format: 2019-04-11T00:00:00Z")
    parser.add_argument(
        "--end_datetime", type=str,
        help="Exclusive end date time in ISO8601-Date-String format: 2019-04-12T00:00:00Z")

    args = parser.parse_args()
    if args.config:
        args.config = singer_utils.load_json(args.config)
    if args.state:
        args.state = singer_utils.load_json(args.state)
    else:
        args.state = {}
    if args.properties:
        args.properties = singer_utils.load_json(args.properties)
    if args.catalog:
        args.catalog = Catalog.load(args.catalog)

    return args
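
A minimal sketch of how this variant is typically wired into a tap's entry point; do_discover and do_sync are hypothetical helpers, not part of the snippet above:

def main():
    # After parse_args(), args.config and args.state are already parsed dicts
    # and args.catalog is a singer.catalog.Catalog; the extra --start_datetime
    # and --end_datetime values remain plain ISO8601 strings.
    args = parse_args()
    if args.discover:
        do_discover(args.config)  # hypothetical discovery helper
    else:
        do_sync(args.config, args.state, args.catalog,  # hypothetical sync helper
                start=args.start_datetime, end=args.end_datetime)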
Example #2
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))
    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)

        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

        if entity_name in ["deals"]:
            v3_schema = get_v3_schema(entity_name)
            for key, value in v3_schema.items():
                if any(prefix in key for prefix in V3_PREFIXES):
                    custom_schema[key] = value

        # Move properties to top level
        custom_schema_top_level = {
            'property_{}'.format(k): v
            for k, v in custom_schema.items()
        }
        schema['properties'].update(custom_schema_top_level)

        # Make properties_versions selectable and share the same schema.
        versions_schema = utils.load_json(
            get_abs_path('schemas/versions.json'))
        schema['properties']['properties_versions'] = versions_schema

    if entity_name == "contacts":
        schema['properties'][
            'associated-company'] = load_associated_company_schema()

    return schema
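
The "move properties to top level" step above boils down to a key-prefixing rename; a toy illustration with a made-up custom property:

# Illustrative only: a custom property "dealstage" also becomes a selectable
# top-level "property_dealstage" column next to the nested "properties" object.
custom_schema = {"dealstage": {"type": "string"}}
top_level = {'property_{}'.format(k): v for k, v in custom_schema.items()}
assert top_level == {"property_dealstage": {"type": "string"}}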
Example #3
    def __init__(
        self,
        config: Union[Dict[str, Any], Path],
        state: Union[None, Dict[str, Any], Path] = None,
        catalog: Union[None, Dict[str, Any], Catalog, Path] = None,
        discover: bool = False,
        **kwargs,
    ):
        self.catalog_path = self.state_path = self.config_path = None

        if isinstance(catalog, Path):
            self.catalog_path = str(catalog)
            catalog = Catalog.load(catalog)
        elif isinstance(catalog, dict):
            catalog = Catalog.from_dict(catalog)

        if isinstance(config, Path):
            self.config_path = str(config)
            config = load_json(config)
        if isinstance(state, Path):
            self.state_path = str(state)
            state = load_json(state)

        self.config = config
        self.state = state
        self.catalog = catalog
        self.discover = discover

        for name, val in kwargs.items():
            setattr(self, name, val)
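
Because every argument accepts either a parsed object or a Path, callers can skip the CLI entirely; a sketch, where the class name TapRunner is hypothetical:

from pathlib import Path

runner = TapRunner(                # hypothetical name for the class above
    config=Path("config.json"),    # a Path is loaded via load_json()
    state={"bookmarks": {}},       # a dict is stored as-is
    catalog=Path("catalog.json"),  # a Path is loaded via Catalog.load()
    discover=False,
    validate_records=True,         # extra kwargs become attributes
)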
Example #4
def main():
    global STATE
    global AUTH
    try:
        AUTH = utils.load_json(PATH)
    except FileNotFoundError:
        LOGGER.error('Config file not found')
        sys.exit(1)
    if STATE_PATH is not None:
        try:
            state = utils.load_json(STATE_PATH)
        except FileNotFoundError:
            LOGGER.error('State file not found')
            sys.exit(1)
        if AUTH['type'] == 'day':
            LOGGER.info('Started data load for daily level metrics')
            STATE = {"filter":state, "increment":AUTH['increment'], "type":AUTH['type']}
            start_load_day(AUTH)
        elif AUTH['type'] == 'minute':
            LOGGER.info('Started data load for minutes level metrics')
            STATE = {"filter":state, "increment":AUTH['increment'], "type":AUTH['type']}
            start_load_min(AUTH)
        else:
            LOGGER.error('Load type should be minute or day')
            sys.exit(1)
    else:
        LOGGER.info('--state option not passed; running tap with default options')
        if AUTH['type'] == 'minute':
            STATE = DEFAULT_FILTER_MIN
            try:
                date = str(parse(AUTH['start_date']).date())
                time_portion = str(parse(AUTH['start_date']).time())[0:5]
            except ValueError:
                LOGGER.error('Start date not in RFC3339 format')
                sys.exit(1)
            STATE['filter']['date_ranges'][0]['last_day'] = date
            STATE['filter']['time_ranges'][0]['until'] = time_portion
            STATE['increment'] = AUTH['increment']
            STATE['type'] = AUTH['type']
            start_load_min(AUTH)
            LOGGER.info('Minute Level info done')
        elif AUTH['type'] == 'day':
            STATE = DEFAULT_FILTER_DAY
            try:
                date = str(parse(AUTH['start_date']).date())
            except ValueError:
                LOGGER.error('Start date not in RFC3339 format')
                sys.exit(1)
            STATE['filter']['date_ranges'][0]['last_day'] = date
            STATE['increment'] = AUTH['increment']
            STATE['type'] = AUTH['type']
            start_load_day(AUTH)
            LOGGER.info('Day Level Filter Done')
        else:
            LOGGER.error('Load type should be minute or day')
            sys.exit(1)
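
From the branches above, the config file loaded into AUTH needs at least the following keys; the values here are illustrative, not taken from the source:

AUTH_EXAMPLE = {
    "type": "day",                         # or "minute"
    "increment": 1,                        # copied into STATE['increment']
    "start_date": "2019-04-11T00:00:00Z",  # must parse as RFC3339
}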
Example #5
def parse_args(required_config_keys):
    # Forked from singer's parse_args so we can also record the state file path
    '''Parse standard command-line args.

    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:

    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    -p,--properties Properties file: DEPRECATED, please use --catalog instead
    --catalog       Catalog file

    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we automatically
    load and parse the JSON file.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-c', '--config', help='Config file', required=True)

    parser.add_argument('-s', '--state', help='State file')

    parser.add_argument(
        '-p',
        '--properties',
        help='Property selections: DEPRECATED, Please use --catalog instead')

    parser.add_argument('--catalog', help='Catalog file')

    parser.add_argument('-d',
                        '--discover',
                        action='store_true',
                        help='Do schema discovery')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = utils.load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state_file = args.state
        args.state = utils.load_json(args.state)
    else:
        args.state_file = None
        args.state = {}
    if args.properties:
        setattr(args, 'properties_path', args.properties)
        args.properties = utils.load_json(args.properties)
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
Example #6
def load_schema_for_v3_entity(entity_name):
    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))
    custom_schema = get_custom_schema(entity_name)
    # Move properties to top level
    custom_schema_top_level = {k: v["properties"]["value"] for k, v in custom_schema.items()}

    schema['properties'].update(custom_schema_top_level)

    # Make properties_versions selectable and share the same schema.
    versions_schema = utils.load_json(get_abs_path('schemas/versions.json'))
    schema['properties']['properties_versions'] = versions_schema

    return schema
Example #7
def load_schema(entity):
    '''Returns the schema for the specified source'''
    schema = utils.load_json(
        os.path.join(CONFIG["schema_dir"], "{}.json".format(entity)))
    # schema = utils.load_json(get_abs_path(CONFIG["schema_dir"] + "/{}.json".format(entity)))

    return schema
Example #8
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    schema = utils.load_json(get_abs_path(path))
    refs = schema.pop("definitions", {})
    if refs:
        singer.resolve_schema_references(schema, refs)
    return schema
Example #9
def parse_args(required_config_keys):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c', '--config',
        help='Config file',
        required=True)

    parser.add_argument(
        '-s', '--state',
        help='State file')

    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, Please use --catalog instead')

    parser.add_argument(
        '--catalog',
        help='Catalog file')

    parser.add_argument(
        '-d', '--discover',
        action='store_true',
        help='Do schema discovery')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = utils.load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state_file = args.state
        args.state = utils.load_json(args.state)
    else:
        args.state_file = None
        args.state = {}
    if args.properties:
        setattr(args, 'properties_path', args.properties)
        args.properties = utils.load_json(args.properties)
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
Example #10
def load_schema_references():
    shared_schema_path = get_abs_path('schemas/definitions.json')

    refs = {}
    # Load the shared definitions so stream schemas can resolve $refs against them
    refs["definitions.json"] = utils.load_json(shared_schema_path)

    return refs
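
A sketch of how such a refs dict plugs into singer.resolve_schema_references: entries are keyed by file name and stream schemas point at them via $ref (the field names here are illustrative):

import singer

refs = {"definitions.json": {"definitions": {"money": {"type": "number"}}}}
schema = {
    "type": "object",
    "properties": {"amount": {"$ref": "definitions.json#/definitions/money"}},
}
# resolve_schema_references() replaces the $ref with the referenced sub-schema.
singer.resolve_schema_references(schema, refs)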
Example #11
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    schema = utils.load_json(get_abs_path(path))
    dependencies = schema.pop("tap_schema_dependencies", [])
    refs = {}
    for sub_stream_id in dependencies:
        refs[sub_stream_id] = load_schema(sub_stream_id)
    if refs:
        singer.resolve_schema_references(schema, refs)
    return schema
Example #12
    def test_path(self):
        # Load from a valid path
        with tempfile.NamedTemporaryFile() as fil:
            fil.write(self.expected_json.encode())
            fil.seek(0)
            from_path = u.load_json(fil.name)
            self.assertEqual(from_path, json.loads(self.expected_json))
        # A nonexistent path raises FileNotFoundError
        self.assertRaises(FileNotFoundError, u.load_json,
                          'does_not_exist.json')
Example #13
def load_schema(entity_name):
    if entity_name in v3_entities_with_dynamic_fields:
        return load_schema_for_v3_entity(entity_name)

    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))
    if entity_name in ["contacts", "companies", "deal_histories"]:
        custom_schema = get_custom_schema(entity_name)
        # Move properties to top level
        custom_schema_top_level = {'property_{}'.format(k): v for k, v in custom_schema.items()}
        schema['properties'].update(custom_schema_top_level)

        # Make properties_versions selectable and share the same schema.
        versions_schema = utils.load_json(get_abs_path('schemas/versions.json'))
        schema['properties']['properties_versions'] = versions_schema

    if entity_name == "contacts":
        schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
Example #14
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)
    for k in schema['properties']:
        if k in set(stream.key_properties):
            schema['properties'][k]['inclusion'] = 'automatic'
        elif k in field_class.__dict__:
            schema['properties'][k]['inclusion'] = 'available'
    return schema
Example #15
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))
    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)
        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

    return schema
Example #16
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)

    for k in schema['properties']:
        if k not in field_class.__dict__:
            LOGGER.warning(
                'Property %s.%s is not defined in the facebook_business library',
                stream.name, k)

    return schema
Example #17
def _prep_config():
    cwd, _ = os.path.split(__file__)
    usgs_dir = os.path.join(cwd, "../examples/usgs")
    config = utils.load_json(os.path.join(usgs_dir, "config/tap_config.json"))
    config["schema_dir"] = os.path.join(usgs_dir, "schema")
    config["catalog_dir"] = os.path.join(usgs_dir, "catalog")
    catalog = Catalog.load(
        os.path.join(config["catalog_dir"], "earthquakes.json"))
    config["start_datetime"] = (datetime.datetime.now() -
                                datetime.timedelta(hours=1)).isoformat()
    streams = {}
    streams["earthquakes"] = Stream("earthquakes", config)
    return config, catalog, streams
Example #18
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))
    if entity_name in ["contacts", "companies", "deals", "tickets"]:
        custom_schema = get_custom_schema(entity_name)
        schema['properties']['properties'] = {
            "type": "object",
            "properties": custom_schema,
        }

    if entity_name == "contacts":
        schema['properties'][
            'associated-company'] = load_associated_company_schema()
    return schema
Example #19
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    field_class = stream.field_class
    schema = utils.load_json(path)
    for k in schema['properties']:
        if k in set(stream.key_properties) or k == UPDATED_TIME_KEY:
            schema['properties'][k]['inclusion'] = 'automatic'
        else:
            if k not in field_class.__dict__:
                LOGGER.warning(
                    'Property %s.%s is not defined in the facebook_business library',
                    stream.name, k)
            schema['properties'][k]['inclusion'] = 'available'

    return schema
Example #20
def load_schema(entity_name):
    schema = utils.load_json(
        get_abs_path("schemas/{}.json".format(entity_name)))
    # don't load custom properties for companies as they are not used
    # if entity_name in ["contacts", "companies", "deals"]:
    if entity_name in ["contacts", "deals"]:
        custom_schema = get_custom_schema(entity_name)
        schema["properties"]["properties"] = {
            "type": "object",
            "properties": custom_schema,
        }

    # associated companies include 700 properties
    # removed for efficiencies
    # if entity_name == "contacts":
    #     schema['properties']['associated-company'] = load_associated_company_schema()

    return schema
Example #21
def load_schema(entity_name: str) -> Schema:
    schema = utils.load_json(
        get_abs_path('schemas/{}.json'.format(entity_name)))
    if entity_name in ["contacts", "companies", "deals"]:
        custom_schema = get_custom_schema(entity_name)

        if entity_name in ["deals"]:
            v3_schema = get_v3_schema()
            for key, value in v3_schema.items():
                if any(prefix in key for prefix in V3_PREFIXES):
                    custom_schema[key] = value

        # Move properties to top level
        custom_schema_top_level = {
            'property_{}'.format(k): v
            for k, v in custom_schema.items()
        }
        schema['properties'].update(custom_schema_top_level)
    return schema
Example #22
def load_schema(entity_name):
    schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name)))
    return schema
Example #23
def load_metadata(entity):
    return utils.load_json(get_abs_path("metadata/{}.json".format(entity)))
Example #24
def load_schema(tap_stream_id):
    path = "schemas/{}.json".format(tap_stream_id)
    return utils.load_json(get_abs_path(path))
Example #25
def load_schema(stream):
    path = get_abs_path('schemas/{}.json'.format(stream.name))
    schema = utils.load_json(path)

    return schema
Example #26
def load_schema(entity):
    return utils.load_json(get_abs_path("schemas/{}.json".format(entity)))
Example #27
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_committed = min([
        get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams
    ])
    start_lsn = lsn_committed
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(
            s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn),
            int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception(
            "Unable to start replication with logical replication (slot {})".
            format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(
        datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() -
                         lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info(
                "{} : Breaking - {} seconds of polling with no data".format(
                    datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info(
                    "{} : Breaking - current {} is past end_lsn {}".format(
                        datetime.datetime.utcnow(), int_to_lsn(msg.data_start),
                        int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg,
                                    time_extracted, conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the same lsn
            # This is to ensure we only flush to lsn that has completed entirely
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(),
                    int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush the Postgres WAL up to the LSN committed in the
                # previous run, or to the first LSN received in this run
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info(
                    "{} : Confirming write up to {}, flush to {}".format(
                        datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                        int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush,
                                  flush_lsn=lsn_to_flush,
                                  reply=True)

            elif (int(msg.data_start) > lsn_currently_processing):
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (
                poll_timestamp + datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read the committed LSN currently captured in the state file on disk
                lsn_committed = min([
                    get_bookmark(utils.load_json(state_file),
                                 s['tap_stream_id'], 'lsn')
                    for s in logical_streams
                ])
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info(
                    "{} : Confirming write up to {}, flush to {} (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush,
                                  flush_lsn=lsn_to_flush,
                                  reply=True)
            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_committed > lsn_last_processed:
            lsn_last_processed = lsn_committed
            LOGGER.info(
                "Current lsn_last_processed {} is older than lsn_committed {}".
                format(int_to_lsn(lsn_last_processed),
                       int_to_lsn(lsn_committed)))
        for s in logical_streams:
            LOGGER.info(
                "updating bookmark for stream {} to lsn = {} ({})".format(
                    s['tap_stream_id'], lsn_last_processed,
                    int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
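
For reference, the int_to_lsn helper used throughout renders a 64-bit integer LSN in Postgres's X/Y hex notation; a sketch consistent with the calls above (the real helper lives elsewhere in the tap):

def int_to_lsn(lsn):
    # Postgres displays an LSN as two 32-bit hex halves, e.g. "16/B374D848".
    if lsn is None:
        return None
    return "{:X}/{:X}".format(lsn >> 32, lsn & 0xFFFFFFFF)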
Example #28
def load_schema(entity):
    '''Returns the schema for the specified source'''
    schema = utils.load_json(get_abs_path("schemas/{}.json".format(entity)))

    return schema
Example #29
def load_schema(entity):
    return utils.load_json(get_abs_path("schemas/{}.json".format(entity)))
Example #30
PARSER.add_argument('--state',
                    action='store',
                    dest='state',
                    help='Path for state file')

ARGUMENTS = PARSER.parse_args()

LOGGER = singer.logger.get_logger()
URL = 'https://advertising.criteo.com/API/v201305/AdvertiserService.asmx'

if ARGUMENTS.path is None:
    LOGGER.error('Specify configuration file folder.')
    sys.exit(1)

PATH = ARGUMENTS.path
AUTH = utils.load_json(PATH)

CLIENT = soapclient(
    'https://advertising.criteo.com/API/v201305/AdvertiserService.asmx?WSDL',
    headers={'User-Agent': AUTH['user_agent']})
HEADERS = CLIENT.factory.create('apiHeader')

if ARGUMENTS.state:
    STATE = utils.load_json(ARGUMENTS.state)
    STATE_DEFINED = True
else:
    STATE = {"aggregationType": "Hourly",
             "startDate": "",
             "reportType": "",
             "endDate": "",
             "reportSelector": {},
Example #31
def parse_args(spec_file, required_config_keys):
    ''' This replaces singer's default utils.parse_args()
    https://github.com/singer-io/singer-python/blob/master/singer/utils.py

    Parse standard command-line args.
    Parses the command-line arguments mentioned in the SPEC and the
    BEST_PRACTICES documents:
    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    --catalog       Catalog file
    Returns the parsed args object from argparse. For each argument that
    points to a JSON file (config, state, properties), we automatically
    load and parse the JSON file.
    '''
    # Read default spec file
    default_spec = {}
    default_spec_file = get_abs_path("default_spec.json")
    with open(default_spec_file, "r") as f:
        default_spec.update(json.load(f))

    # Read spec file
    with open(spec_file, "r") as f:
        SPEC.update(json.load(f))

    # TODO: What about the fields other than arg
    for a in default_spec["args"]:
        if SPEC["args"].get(a) is None:
            SPEC["args"][a] = default_spec["args"][a]

    parser = argparse.ArgumentParser(SPEC["application"])
    parser.add_argument("spec_file", type=str, help="Specification file")

    # Capture additional args
    for arg in SPEC["args"].keys():
        parser.add_argument("--" + arg,
                            type=TYPES[SPEC["args"][arg]["type"]],
                            default=SPEC["args"][arg].get("default"),
                            help=SPEC["args"][arg].get("help"),
                            required=SPEC["args"][arg].get("required", False))

    # Default arguments
    parser.add_argument('-c', '--config', help='Config file', required=True)
    """
    parser.add_argument(
        "--schema_dir",
        type=str,
        help="Path to the schema directory.",
        required=True)
    """

    parser.add_argument('-s', '--state', help='State file')

    parser.add_argument('--catalog', help='Catalog file')

    parser.add_argument('-d',
                        '--discover',
                        action='store_true',
                        help='Do schema discovery')

    parser.add_argument('-i',
                        '--infer_schema',
                        action='store_true',
                        help='Do schema inference')

    parser.add_argument(
        "--url",
        type=str,
        help="REST API endpoint with {params}. Required in config.")

    args = parser.parse_args()
    if args.config:
        args.config = utils.load_json(args.config)
    if args.state:
        args.state = utils.load_json(args.state)
    else:
        args.state = {}
    if args.catalog and os.path.isfile(args.catalog):
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
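
Judging from the lookups above, a spec file for this pattern carries an "application" name and an "args" mapping; an illustrative minimal example (key names inferred from the code, values made up):

SPEC_EXAMPLE = {
    "application": "sample_tap",       # used as the argparse program name
    "args": {
        "start_datetime": {
            "type": "string",          # resolved through the TYPES mapping
            "default": None,
            "help": "Inclusive start of the extraction window",
            "required": False,
        },
    },
}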