Example #1
    def test_transform_json_handles_list_custom_fields(self):
        expected = [{'custom_fields': [{'field_set_id': '_custom_1', 'id': 'id', 'value': '1'},
                                       {'field_set_id': '_custom_2', 'id': 'id', 'value': '2'},
                                       {'field_set_id': '_custom_2', 'id': 'index', 'value': '0'},
                                       {'field_set_id': '_custom_2', 'id': 'id', 'value': '3'},
                                       {'field_set_id': '_custom_2', 'id': 'index', 'value': '1'}],
                     'id': '1'}]

        actual = transform_json([{"_custom_1" : {"id": '1'},
                                  "_custom_2" : [{"id": '2', "index": '0'},
                                                 {"id": '3', "index": '1'}],
                                  "id": '1'}],
                                "my_path")

        self.assertEqual(expected, actual)
Example #2
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        path,
        endpoint_config,
        api_version,
        api_method,
        static_params,
        sub_type,
        bookmark_query_field=None,
        bookmark_field=None,
        bookmark_type=None,
        data_key=None,
        body=None,
        id_fields=None,
        parent=None,
        parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, sub_type, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, sub_type, start_date)
        max_bookmark_value = last_datetime

    write_schema(catalog, stream_name)

    # Pagination: loop through all pages of data
    # Pagination reference: https://api.mambu.com/?http#pagination
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "record_count" returned is less than the "limit", or no data is returned
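    # For example, with client.page_size = 50 (illustrative value) the loop below proceeds:
    #   call 1: offset=0,   limit=50 -> 50 records returned, keep going
    #   call 2: offset=50,  limit=50 -> 50 records returned, keep going
    #   call 3: offset=100, limit=50 -> 12 records returned (< limit), stop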
    offset = 0  # Starting offset value for each batch API call
    limit = client.page_size  # Batch size; Number of records per API call
    total_records = 0  # Initialize total
    record_count = limit  # Initialize, reset for each API call

    while record_count == limit:  # break out of loop when record_count < limit (or no data is returned)
        params = {
            'offset': offset,
            'limit': limit,
            **static_params  # adds in endpoint specific, sort, filter params
        }

        if bookmark_query_field:
            if bookmark_type == 'datetime':
                params[bookmark_query_field] = last_datetime
            elif bookmark_type == 'integer':
                params[bookmark_query_field] = last_integer

        LOGGER.info('Stream: {}, Type: {} - Sync start {}'.format(
            stream_name, sub_type, 'since: {}, '.format(last_datetime)
            if bookmark_query_field else ''))

        # Squash params to query-string params
        querystring = '&'.join(
            ['%s=%s' % (key, value) for (key, value) in params.items()])
        LOGGER.info('URL for {} ({}, {}): {}/{}?{}'\
            .format(stream_name, api_method, api_version, client.base_url, path, querystring))
        if body is not None:
            LOGGER.info('body = {}'.format(body))

        # API request data
        data = client.request(method=api_method,
                              path=path,
                              version=api_version,
                              params=querystring,
                              endpoint=stream_name,
                              json=body)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data:  # covers None and empty results
            record_count = 0
            LOGGER.warning('Stream: {} - NO DATA RESULTS'.format(stream_name))
            break  # NO DATA

        # Transform data with transform_json from transform.py
        # This function converts camelCase to snake_case for field name keys.
        # The data_key may identify the array/list of records below the <root> element
        # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
        transformed_data = []  # initialize the record list
        # If the API returns a single record (dict), wrap it in a list
        if isinstance(data, dict):
            data = [data]
        if data_key is None:
            transformed_data = transform_json(data, stream_name)
        elif data_key in data:
            transformed_data = transform_json(data, data_key)[data_key]
        # LOGGER.info('transformed_data = {}'.format(transformed_data))  # TESTING, comment out
        if not transformed_data:
            record_count = 0
            LOGGER.warning('Stream: {} - NO TRANSFORMED DATA RESULTS'.format(stream_name))
            break  # No data results

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)

        total_records = total_records + record_count

        # Loop through the parent batch records for each child object (if the child stream is selected)
        children = endpoint_config.get('children')
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                should_stream, last_stream_child = should_sync_stream(
                    get_selected_streams(catalog), None, child_stream_name)
                if should_stream:
                    # For each parent record
                    for record in transformed_data:
                        # Set parent_id: prefer the 'id' field, otherwise use the first id_field
                        parent_id_field = 'id' if 'id' in id_fields else id_fields[0]
                        parent_id = record.get(parent_id_field)

                        # sync_endpoint for child
                        LOGGER.info(
                            'Syncing: {}, parent_stream: {}, parent_id: {}'.
                            format(child_stream_name, stream_name, parent_id))
                        child_path = child_endpoint_config.get('path').format(
                            str(parent_id))
                        child_total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=child_stream_name,
                            path=child_path,
                            endpoint_config=child_endpoint_config,
                            api_version=child_endpoint_config.get(
                                'api_version', 'v2'),
                            api_method=child_endpoint_config.get(
                                'api_method', 'GET'),
                            static_params=child_endpoint_config.get(
                                'params', {}),
                            sub_type=sub_type,
                            bookmark_query_field=child_endpoint_config.get(
                                'bookmark_query_field'),
                            bookmark_field=child_endpoint_config.get(
                                'bookmark_field'),
                            bookmark_type=child_endpoint_config.get(
                                'bookmark_type'),
                            data_key=child_endpoint_config.get(
                                'data_key', None),
                            body=child_endpoint_config.get('body', None),
                            id_fields=child_endpoint_config.get('id_fields'),
                            parent=child_endpoint_config.get('parent'),
                            parent_id=parent_id)
                        LOGGER.info(
                            'Synced: {}, parent_id: {}, total_records: {}'.
                            format(child_stream_name, parent_id,
                                   child_total_records))

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, sub_type, max_bookmark_value)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if record_count < limit:
            to_rec = total_records

        LOGGER.info('{} - Synced records: {} to {}'.format(
            stream_name, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit

        # End: while record_count == limit

    # Return total_records across all batches
    return total_records
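
sync_endpoint is driven by an endpoint configuration dictionary: the child-stream loop above reads keys such as path, api_version, api_method, params, bookmark_query_field, bookmark_field, bookmark_type, data_key, body, id_fields and children from it. The sketch below shows how a top-level call might be wired up; the 'loans' stream, the config values and the client/catalog/state objects are illustrative assumptions, not the tap's actual configuration.

# Hypothetical top-level invocation of sync_endpoint (illustrative only).
# The config keys mirror those read by the child-stream loop above; the
# concrete values, the 'loans' stream and the client/catalog/state objects
# are assumptions.
LOANS_CONFIG = {
    'path': 'loans',
    'api_version': 'v2',
    'api_method': 'GET',
    'params': {},                          # endpoint-specific sort/filter params
    'bookmark_query_field': None,
    'bookmark_field': 'last_modified_date',
    'bookmark_type': 'datetime',
    'data_key': None,
    'id_fields': ['id'],
    'children': {
        'loan_repayments': {               # hypothetical child stream
            'path': 'loans/{}/repayments', # parent_id is substituted via .format()
            'api_method': 'GET',
            'id_fields': ['encoded_key'],
        }
    }
}

total = sync_endpoint(
    client=client,                         # assumed Mambu API client exposing .request(), .page_size, .base_url
    catalog=catalog,
    state=state,
    start_date='2020-01-01T00:00:00Z',
    stream_name='loans',
    path=LOANS_CONFIG['path'],
    endpoint_config=LOANS_CONFIG,
    api_version=LOANS_CONFIG['api_version'],
    api_method=LOANS_CONFIG['api_method'],
    static_params=LOANS_CONFIG['params'],
    sub_type='self',                       # hypothetical sub_type label
    bookmark_query_field=LOANS_CONFIG['bookmark_query_field'],
    bookmark_field=LOANS_CONFIG['bookmark_field'],
    bookmark_type=LOANS_CONFIG['bookmark_type'],
    data_key=LOANS_CONFIG['data_key'],
    id_fields=LOANS_CONFIG['id_fields'])
LOGGER.info('loans: synced {} records total'.format(total))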
Example #3
    def test_transform_json_handles_dictionary_custom_fields(self):
        expected = [{'custom_fields': [{'field_set_id': '_custom_1', 'id': 'id', 'value': '1'}],
                     'id': '1'}]
        actual = transform_json([{"_custom_1" : {"id": '1'}, "id": '1'}],
                                "my_path")
        self.assertEqual(expected, actual)
Example #4
    def test_transform_no_custom_fields(self):
        expected = [{'custom_fields': [],
                     'id': '1'}]
        actual = transform_json([{"id": '1'}],
                                "my_path")
        self.assertEqual(expected, actual)
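
Taken together, the transform_json tests above (Examples 1, 3 and 4) pin down how Mambu custom field sets are flattened: any top-level key starting with an underscore, holding either a single dict or a list of dicts, becomes a list of {field_set_id, id, value} entries under custom_fields. Below is a minimal sketch of just that flattening step; the helper name is hypothetical, and the real transform.py also converts camelCase keys to snake_case and handles the data_key wrapping, which this sketch omits.

# Minimal sketch (hypothetical helper, not the tap's actual transform_json):
# flatten keys that start with '_' into a single 'custom_fields' list, as the
# tests above expect.
def flatten_custom_fields(record):
    custom_fields = []
    for key in list(record.keys()):
        if not key.startswith('_'):
            continue
        field_sets = record.pop(key)
        # A custom field set may be a single dict or a list of dicts
        if isinstance(field_sets, dict):
            field_sets = [field_sets]
        for field_set in field_sets:
            for field_id, field_value in field_set.items():
                custom_fields.append({
                    'field_set_id': key,
                    'id': field_id,
                    'value': field_value})
    record['custom_fields'] = custom_fields
    return record

# e.g. flatten_custom_fields({"_custom_1": {"id": '1'}, "id": '1'})
# -> {'id': '1', 'custom_fields': [{'field_set_id': '_custom_1', 'id': 'id', 'value': '1'}]}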