예제 #1
0
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch the metadata of one worksheet and derive its schema and columns.

    Args:
        sheet: Worksheet entry (dict) whose ``properties`` carry ``sheetId``
            and ``title``.
        spreadsheet_id: Value substituted for ``{spreadsheet_id}`` in the
            configured request path.
        client: Object exposing ``get(path=..., api=..., endpoint=...)``.

    Returns:
        ``(sheet_json_schema, columns)`` as produced by
        ``get_sheet_schema_columns``, or ``(None, None)`` when that helper
        raises (the sheet is then logged as malformed and skipped).
    """
    properties = sheet.get('properties', {})
    sheet_id = properties.get('sheetId')
    sheet_title = properties.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    endpoint_config = STREAMS.get('sheet_metadata')
    api = endpoint_config.get('api', 'sheets')
    params = endpoint_config.get('params', {})

    # URL-encoded title goes into the query string; the regex-escaped title
    # is what gets passed to the client as the endpoint label.
    encoded_title = urllib.parse.quote_plus(sheet_title)
    escaped_title = re.escape(sheet_title)

    # Build "key=value&key=value" and fill in the sheet title placeholder.
    raw_query = '&'.join('%s=%s' % pair for pair in params.items())
    querystring = raw_query.replace('{sheet_title}', encoded_title)

    # Build the request path with the spreadsheet id filled in.
    base_path = endpoint_config.get('path').replace(
        '{spreadsheet_id}', spreadsheet_id)
    path = '{}?{}'.format(base_path, querystring)

    response = client.get(path=path, api=api, endpoint=escaped_title)
    # Only the first `sheets` node of the response is used.
    first_sheet = response.get('sheets')[0]

    # Derive the JSON schema (for discovery/catalog) and the column list.
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(first_sheet)
    except Exception as err:
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        return None, None

    return sheet_json_schema, columns
예제 #2
0
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Request one worksheet's metadata and derive its schema and columns.

    Args:
        sheet: Worksheet entry (dict) whose ``properties`` carry ``sheetId``
            and ``title``.
        spreadsheet_id: Spreadsheet identifier forwarded to the client.
        client: Object exposing ``request(endpoint=..., spreadsheet_id=...,
            sheet_title=..., params=...)``.

    Returns:
        ``(sheet_json_schema, columns)`` as produced by
        ``get_sheet_schema_columns``, or ``(None, None)`` when that helper
        raises (the sheet is then logged as malformed and skipped).
    """
    properties = sheet.get('properties', {})
    sheet_id = properties.get('sheetId')
    sheet_title = properties.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    endpoint = 'sheet_metadata'
    request_params = STREAMS.get(endpoint).get('params', {})

    # GET sheet_metadata
    response = client.request(
        endpoint=endpoint,
        spreadsheet_id=spreadsheet_id,
        sheet_title=sheet_title,
        params=request_params,
    )
    # Only the first `sheets` node of the response is used.
    first_sheet = response.get('sheets')[0]

    # Derive the JSON schema (for discovery/catalog) and the column list.
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(first_sheet)
    except Exception as err:
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        return None, None

    return sheet_json_schema, columns
예제 #3
0
def get_schemas(client, spreadsheet_id):
    """Build the schemas and field metadata for every stream.

    For each entry in ``STREAMS`` the JSON schema is loaded from
    ``schemas/<stream_name>.json`` and standard metadata is generated for it.
    While handling the ``spreadsheet_metadata`` stream, the spreadsheet is
    also fetched and one extra schema/metadata entry is added per worksheet,
    keyed by the worksheet title.

    Args:
        client: Object exposing ``get(path=..., params=..., api=...,
            endpoint=...)``; also forwarded to ``get_sheet_metadata``.
        spreadsheet_id: Value substituted for ``{spreadsheet_id}`` in the
            configured request path.

    Returns:
        ``(schemas, field_metadata)``: two dicts keyed by stream name or
        worksheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(schema_path, encoding='utf-8') as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        field_metadata[stream_name] = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        if stream_name != 'spreadsheet_metadata':
            continue

        api = stream_metadata.get('api', 'sheets')
        params = stream_metadata.get('params', {})
        querystring = '&'.join(
            '%s=%s' % (key, value) for key, value in params.items())
        path = '{}?{}'.format(
            stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
            querystring)

        # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
        # NOTE(review): the query string is sent both inside `path` and as
        # `params`; kept as-is because the client's handling is not visible
        # here -- confirm it does not duplicate the parameters.
        spreadsheet_md_results = client.get(
            path=path, params=querystring, api=api, endpoint=stream_name)

        # Loop thru each worksheet in spreadsheet (none when `sheets` is missing/empty)
        for sheet in spreadsheet_md_results.get('sheets') or []:
            sheet_json_schema, columns = get_sheet_metadata(
                sheet, spreadsheet_id, client)

            # SKIP empty sheets (where sheet_json_schema and columns are None)
            if not (sheet_json_schema and columns):
                continue

            sheet_title = sheet.get('properties', {}).get('title')
            schemas[sheet_title] = sheet_json_schema
            field_metadata[sheet_title] = metadata.get_standard_metadata(
                schema=sheet_json_schema,
                key_properties=['__sdc_row'],
                valid_replication_keys=None,
                replication_method='FULL_TABLE')

    return schemas, field_metadata
예제 #4
0
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch the metadata of one worksheet and derive its schema and columns.

    Args:
        sheet: Worksheet entry (dict) whose ``properties`` carry ``sheetId``
            and ``title``.
        spreadsheet_id: Value substituted for ``{spreadsheet_id}`` in the
            configured request path.
        client: Object exposing ``get(path=..., api=..., endpoint=...)``.

    Returns:
        ``(sheet_json_schema, columns)`` as produced by
        ``get_sheet_schema_columns``.
    """
    # Local import keeps this block self-contained regardless of the
    # module-level imports.
    from urllib.parse import quote_plus

    properties = sheet.get('properties', {})
    sheet_id = properties.get('sheetId')
    sheet_title = properties.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    # The title is user-controlled text going into a query string: encode it
    # so characters such as '&', '#', '+', '=' or spaces cannot break or
    # alter the request.
    sheet_title_encoded = quote_plus(sheet_title)
    querystring = '&'.join(
        '%s=%s' % (key, value) for key, value in params.items()
    ).replace('{sheet_title}', sheet_title_encoded)
    path = '{}?{}'.format(
        stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
        querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)

    return sheet_json_schema, columns
예제 #5
0
def get_schemas(client, spreadsheet_id):
    """Build the schemas and field metadata for every stream.

    For each entry in ``STREAMS`` the JSON schema is loaded from
    ``schemas/<stream_name>.json`` and standard metadata is generated for it.
    While handling the ``spreadsheet_metadata`` stream, the spreadsheet is
    also fetched and one extra schema/metadata entry is added per worksheet,
    keyed by the worksheet title. Worksheet columns flagged as skipped get
    their ``inclusion`` metadata set to ``unsupported``.

    Args:
        client: Object exposing ``get(path=..., params=..., api=...,
            endpoint=...)``; also forwarded to ``get_sheet_metadata``.
        spreadsheet_id: Value substituted for ``{spreadsheet_id}`` in the
            configured request path.

    Returns:
        ``(schemas, field_metadata)``: two dicts keyed by stream name or
        worksheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(schema_path, encoding='utf-8') as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        field_metadata[stream_name] = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        if stream_name != 'spreadsheet_metadata':
            continue

        api = stream_metadata.get('api', 'sheets')
        params = stream_metadata.get('params', {})
        # prepare the query string for the request
        querystring = '&'.join(
            '%s=%s' % (key, value) for key, value in params.items())
        # prepare the path for request
        path = '{}?{}'.format(
            stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
            querystring)

        # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
        # NOTE(review): the query string is sent both inside `path` and as
        # `params`; kept as-is because the client's handling is not visible
        # here -- confirm it does not duplicate the parameters.
        spreadsheet_md_results = client.get(
            path=path, params=querystring, api=api, endpoint=stream_name)

        # Loop thru each worksheet in spreadsheet (none when `sheets` is missing/empty)
        for sheet in spreadsheet_md_results.get('sheets') or []:
            sheet_json_schema, columns = get_sheet_metadata(
                sheet, spreadsheet_id, client)

            # SKIP empty sheets (where sheet_json_schema and columns are None)
            if not (sheet_json_schema and columns):
                continue

            sheet_title = sheet.get('properties', {}).get('title')
            schemas[sheet_title] = sheet_json_schema
            sheet_mdata = metadata.get_standard_metadata(
                schema=sheet_json_schema,
                key_properties=['__sdc_row'],
                valid_replication_keys=None,
                replication_method='FULL_TABLE')

            # Columns flagged `columnSkipped` whose `prior_column_skipped`
            # flag is false or absent are marked `inclusion: unsupported`.
            # NOTE(review): presumably these are empty-header columns that
            # do not follow another skipped column -- confirm against the
            # code that builds `columns`.
            unsupported_columns = [
                column for column in columns
                if column.get('columnSkipped')
                and not column.get('prior_column_skipped')
            ]
            if unsupported_columns:
                # Convert list -> map once, apply every write, convert back once.
                sheet_mdata_map = metadata.to_map(sheet_mdata)
                for column in unsupported_columns:
                    sheet_mdata_map = metadata.write(
                        sheet_mdata_map,
                        ('properties', column.get('columnName')),
                        'inclusion', 'unsupported')
                sheet_mdata = metadata.to_list(sheet_mdata_map)

            field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
예제 #6
0
def sync(client, config, catalog, state):
    """Sync the spreadsheet's metadata streams and every selected worksheet.

    Steps, in order:
      1. Fetch ``file_metadata``; if its ``modifiedTime`` is not later than
         the stored bookmark, update the bookmark and return early.
      2. Sync ``file_metadata`` and ``spreadsheet_metadata``.
      3. For each worksheet with a usable schema, collect its transformed
         metadata and, when the worksheet is a selected stream, page through
         its rows in batches and emit them, bracketed by Singer
         ACTIVATE_VERSION messages.
      4. Sync ``sheet_metadata`` and ``sheets_loaded``, then update the
         ``file_metadata`` bookmark.

    Args:
        client: API client forwarded to ``get_data`` / ``get_sheet_metadata``.
        config: Dict read for ``start_date``, ``spreadsheet_id`` and the
            optional ``batch_rows`` (rows per request, default 200).
        catalog: Catalog providing the selected streams and their schemas.
        state: Singer state holding bookmarks and the currently-syncing stream.

    Returns:
        None.
    """
    start_date = config.get('start_date')
    spreadsheet_id = config.get('spreadsheet_id')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = [
        stream.stream for stream in catalog.get_selected_streams(state)]
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # FILE_METADATA
    stream_name = 'file_metadata'
    file_metadata_config = STREAMS.get(stream_name)

    # GET file_metadata
    LOGGER.info('GET file_meatadata')
    file_metadata, time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=file_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)
    # Transform file_metadata
    LOGGER.info('Transform file_meatadata')
    file_metadata_tf = transform_file_metadata(file_metadata)

    # Check if file has changed, if not break (return to __init__)
    last_datetime = strptime_to_utc(
        get_bookmark(state, stream_name, start_date))
    this_datetime = strptime_to_utc(file_metadata.get('modifiedTime'))
    LOGGER.info('last_datetime = {}, this_datetime = {}'.format(
        last_datetime, this_datetime))
    if this_datetime <= last_datetime:
        LOGGER.info(
            'this_datetime <= last_datetime, FILE NOT CHANGED. EXITING.')
        # Update file_metadata bookmark
        write_bookmark(state, 'file_metadata', strftime(this_datetime))
        return
    # Sync file_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state,
                file_metadata_tf, time_extracted)
    # file_metadata bookmark is updated at the end of sync

    # SPREADSHEET_METADATA
    stream_name = 'spreadsheet_metadata'
    spreadsheet_metadata_config = STREAMS.get(stream_name)

    # GET spreadsheet_metadata
    LOGGER.info('GET spreadsheet_meatadata')
    spreadsheet_metadata, ss_time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=spreadsheet_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)

    # Transform spreadsheet_metadata
    LOGGER.info('Transform spreadsheet_meatadata')
    spreadsheet_metadata_tf = transform_spreadsheet_metadata(
        spreadsheet_metadata)

    # Sync spreadsheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state,
                spreadsheet_metadata_tf, ss_time_extracted)

    # SHEET_METADATA and SHEET_DATA
    sheets = spreadsheet_metadata.get('sheets')
    sheet_metadata = []
    sheets_loaded = []
    sheets_loaded_config = STREAMS['sheets_loaded']

    # Loop thru sheets (worksheet tabs) in spreadsheet; nothing to do when
    # `sheets` is missing or empty.
    for sheet in sheets or []:
        sheet_properties = sheet.get('properties', {})
        sheet_title = sheet_properties.get('title')
        sheet_id = sheet_properties.get('sheetId')

        # GET sheet_metadata and columns
        sheet_schema, columns = get_sheet_metadata(
            sheet, spreadsheet_id, client)

        # SKIP empty sheets (where sheet_schema and columns are None)
        if not sheet_schema or not columns:
            LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
            continue

        # Transform sheet_metadata
        sheet_metadata_tf = transform_sheet_metadata(
            spreadsheet_id, sheet, columns)
        sheet_metadata.append(sheet_metadata_tf)

        # SHEET_DATA
        # Should this worksheet tab be synced?
        if sheet_title not in selected_streams:
            continue

        LOGGER.info('STARTED Syncing Sheet {}'.format(sheet_title))
        update_currently_syncing(state, sheet_title)
        selected_fields = get_selected_fields(catalog, sheet_title)
        LOGGER.info('Stream: {}, selected_fields: {}'.format(
            sheet_title, selected_fields))
        write_schema(catalog, sheet_title)

        # Emit a Singer ACTIVATE_VERSION message before the initial sync (but
        # not before subsequent syncs) and after every completed sheet sync.
        # This forces hard deletes on the data downstream if fewer records are sent.
        # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
        last_integer = int(get_bookmark(state, sheet_title, 0))
        activate_version = int(time.time() * 1000)
        activate_version_message = singer.ActivateVersionMessage(
            stream=sheet_title, version=activate_version)
        if last_integer == 0:
            # initial load, send activate_version before AND after data sync
            singer.write_message(activate_version_message)
            LOGGER.info(
                'INITIAL SYNC, Stream: {}, Activate Version: {}'.
                format(sheet_title, activate_version))

        # Determine max range of columns and rows for "paging" through the data
        sheet_last_col_index = 1
        sheet_last_col_letter = 'A'
        for col in columns:
            col_index = col.get('columnIndex')
            col_letter = col.get('columnLetter')
            if col_index > sheet_last_col_index:
                sheet_last_col_index = col_index
                sheet_last_col_letter = col_letter
        sheet_max_row = sheet_properties.get(
            'gridProperties', {}).get('rowCount')

        # Initialize paging for 1st batch
        is_last_row = False
        # Config values may arrive as strings; coerce so the row comparisons
        # below operate on integers. Missing/falsy falls back to 200.
        batch_rows = int(config.get('batch_rows') or 200)
        from_row = 2
        to_row = min(sheet_max_row, batch_rows)
        # Defined up front so the summary below is valid (and never reuses a
        # previous sheet's value) when the paging loop does not run at all.
        # NOTE(review): assumes transform_sheet_data returns `from_row` when
        # it is given no rows -- confirm.
        row_num = from_row

        # Loop thru batches (each covering up to `batch_rows` rows).
        # `from_row <= sheet_max_row` (not `<`) so a final batch consisting
        # of exactly one row is still fetched.
        while not is_last_row and from_row <= sheet_max_row and to_row <= sheet_max_row:
            range_rows = 'A{}:{}{}'.format(
                from_row, sheet_last_col_letter, to_row)

            # GET sheet_data for a worksheet tab
            sheet_data, time_extracted = get_data(
                stream_name=sheet_title,
                endpoint_config=sheets_loaded_config,
                client=client,
                spreadsheet_id=spreadsheet_id,
                range_rows=range_rows)
            # Data is returned as a list of arrays, an array of values for each row
            sheet_data_rows = sheet_data.get('values', [])

            # Transform batch of rows to JSON with keys for each column
            sheet_data_tf, row_num = transform_sheet_data(
                spreadsheet_id=spreadsheet_id,
                sheet_id=sheet_id,
                sheet_title=sheet_title,
                from_row=from_row,
                columns=columns,
                sheet_data_rows=sheet_data_rows)
            # Fewer rows came back than requested: this was the last batch.
            if row_num < to_row:
                is_last_row = True

            # Process records, send batch of records to target
            # NOTE(review): records are stamped with the spreadsheet-level
            # extraction time, not this batch's `time_extracted` -- confirm
            # that is intended.
            record_count = process_records(
                catalog=catalog,
                stream_name=sheet_title,
                records=sheet_data_tf,
                time_extracted=ss_time_extracted,
                version=activate_version)
            LOGGER.info('Sheet: {}, records processed: {}'.format(
                sheet_title, record_count))

            # Update paging from/to_row for next batch
            from_row = to_row + 1
            to_row = min(to_row + batch_rows, sheet_max_row)

        # End of Stream: Send Activate Version and update State
        singer.write_message(activate_version_message)
        write_bookmark(state, sheet_title, activate_version)
        LOGGER.info(
            'COMPLETE SYNC, Stream: {}, Activate Version: {}'.
            format(sheet_title, activate_version))
        # `row_num - 2`: data starts at row 2 (see `from_row`), so a value of
        # 2 means zero rows were read.
        LOGGER.info(
            'FINISHED Syncing Sheet {}, Total Rows: {}'.format(
                sheet_title, row_num - 2))
        update_currently_syncing(state, None)

        # SHEETS_LOADED
        # Add sheet to sheets_loaded
        sheets_loaded.append({
            'spreadsheetId': spreadsheet_id,
            'sheetId': sheet_id,
            'title': sheet_title,
            'loadDate': strftime(utils.now()),
            'lastRowNumber': row_num,
        })

    stream_name = 'sheet_metadata'
    # Sync sheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheet_metadata)

    stream_name = 'sheets_loaded'
    # Sync sheets_loaded if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded)

    # Update file_metadata bookmark
    write_bookmark(state, 'file_metadata', strftime(this_datetime))