Example No. 1
def sync(config,
         streams,
         state,
         catalog,
         assume_sorted=True,
         max_page=None,
         auth_method="basic",
         raw=False,
         filter_by_schema=True):
    """
    Sync the streams that were selected

    - assume_sorted: Assume the data is sorted and stop the sync as soon as a
      record with an index/datetime/timestamp greater than the end value is
      detected.
    - max_page: Stop after this number of API calls has been made.
    - auth_method: HTTP auth method (basic, no_auth, digest)
    - raw: Output raw JSON records to stdout
    - filter_by_schema: When True, check extracted records against the schema;
      undefined or non-matching fields are not written out.
    """
    start_process_at = datetime.datetime.now()
    remaining_streams = get_streams_to_sync(streams, state)
    selected_streams = get_selected_streams(remaining_streams, catalog)
    if len(selected_streams) < 1:
        raise Exception("No Streams selected, please check that you have a " +
                        "schema selected in your catalog")

    LOGGER.info("Starting sync. Will sync these streams: %s" %
                [stream.tap_stream_id for stream in selected_streams])

    for stream in selected_streams:
        LOGGER.info("%s Start sync" % stream.tap_stream_id)

        singer.set_currently_syncing(state, stream.tap_stream_id)
        if raw is False:
            singer.write_state(state)

        try:
            state = sync_rows(config,
                              state,
                              stream.tap_stream_id,
                              max_page=max_page,
                              auth_method=auth_method,
                              assume_sorted=assume_sorted,
                              raw_output=raw,
                              filter_by_schema=filter_by_schema)
        except Exception as e:
            LOGGER.critical(e)
            raise e

        bookmark_type = get_bookmark_type(config)
        last_update = state["bookmarks"][stream.tap_stream_id]["last_update"]
        if bookmark_type == "timestamp":
            last_update = str(last_update) + " (" + str(
                datetime.datetime.fromtimestamp(last_update)) + ")"
        LOGGER.info("%s End sync" % stream.tap_stream_id)
        LOGGER.info("%s Last record's %s: %s" %
                    (stream.tap_stream_id, bookmark_type, last_update))

    end_process_at = datetime.datetime.now()
    LOGGER.info("Completed sync at %s" % str(end_process_at))
    LOGGER.info("Process duration: " + str(end_process_at - start_process_at))
Example No. 2
    def do_sync(self):
        logger.debug('Starting sync')

        # resuming when currently_syncing within state
        resume_from_stream = False
        if self.state and 'currently_syncing' in self.state:
            resume_from_stream = self.state['currently_syncing']

        for stream in self.streams:
            stream.tap = self

            if resume_from_stream:
                if stream.schema == resume_from_stream:
                    logger.info('Resuming from {}'.format(resume_from_stream))
                    resume_from_stream = False
                else:
                    logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                    continue

            # stream state, from state/bookmark or start_date
            stream.set_initial_state(self.state, self.config['start_date'])

            # currently syncing
            if stream.state_field:
                set_currently_syncing(self.state, stream.schema)
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field, str(stream.initial_state))
                singer.write_state(self.state)

            # schema
            stream.write_schema()

            # paginate
            while stream.has_data():

                with singer.metrics.http_request_timer(stream.schema) as timer:
                    try:
                        response = self.execute_stream_request(stream)
                    except (ConnectionError, RequestException):
                        raise
                    timer.tags[singer.metrics.Tag.http_status_code] = response.status_code

                self.validate_response(response)
                self.rate_throttling(response)
                stream.paginate(response)

                # records with metrics
                with singer.metrics.record_counter(stream.schema) as counter:
                    with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                        for row in self.iterate_response(response):
                            row = stream.process_row(row)

                            if not row: # in case of a non-empty response with an empty element
                                continue
                            row = optimus_prime.transform(row, stream.get_schema())
                            if stream.write_record(row):
                                counter.increment()
                            stream.update_state(row)

            # update state / bookmarking only when supported by stream
            if stream.state_field:
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                                   str(stream.earliest_state))
            singer.write_state(self.state)

        # clear currently_syncing
        self.state.pop('currently_syncing', None)
        singer.write_state(self.state)
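Note: rate_throttling is called above but not shown. A hedged sketch, assuming the API exposes conventional X-RateLimit-Remaining / X-RateLimit-Reset headers (assumed header names, not confirmed by this example):

import time

def rate_throttling(response, logger=None):
    # Hypothetical sketch: when the reported remaining quota is exhausted,
    # sleep until the rate-limit window resets.
    remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
    if remaining <= 0:
        reset_at = float(response.headers.get('X-RateLimit-Reset', 0))
        wait = max(reset_at - time.time(), 0)
        if logger:
            logger.info('Rate limit reached, sleeping %.0f seconds', wait)
        time.sleep(wait)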
Example No. 3
def update_currently_syncing(state, stream_name):
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)
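A typical way to use this helper around a stream loop (the stream list and per-stream sync function below are hypothetical):

# Usage sketch: flag each stream while it is being synced, then clear the flag.
for stream_name in selected_stream_names:   # hypothetical list of selected streams
    update_currently_syncing(state, stream_name)
    sync_records(stream_name, state)        # hypothetical per-stream sync
update_currently_syncing(state, None)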
Example No. 4
def sync(  # noqa: WPS210, WPS213
    twinfield: Twinfield,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    Arguments:
        twinfield {Twinfield} -- Twinfield client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date
    """
    # For every stream in the catalog
    LOGGER.info('Sync')
    LOGGER.debug(f'Current state:\n{state}')

    # Only selected streams are synced, whether a stream is selected is
    # determined by whether the key-value: "selected": true is in the schema
    # file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )

        LOGGER.debug(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a corresponding method on the Twinfield client:
        # the stream's tap_stream_id is looked up as an attribute, so a stream
        # named foo calls twinfield.foo
        tap_data: Callable = getattr(twinfield, stream.tap_stream_id)

        # The tap_data method yields rows of data from the API
        # The state of the stream is used as kwargs for the method
        # E.g. if the state of the stream has a key 'start_date', it will be
        # used in the method as start_date='2021-01-01T00:00:00+0000'
        # Initialize so the bookmark update below is safe when no rows are yielded
        bookmark: Optional[str] = None

        for row in tap_data(**stream_state):

            # Write a row to the stream
            singer.write_record(
                stream.tap_stream_id,
                row,
                time_extracted=datetime.now(timezone.utc),
            )

            bookmark: Optional[str] = tools.get_bookmark_value(
                stream.tap_stream_id,
                row,
            )

        # Update bookmark
        tools.update_bookmark(stream, bookmark, state)
        sys.stdout.flush()
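Note: tools.get_stream_state is expected to return a dict that can be splatted into the stream method as keyword arguments. A minimal sketch of such a helper (hypothetical, inferred only from the usage above):

def get_stream_state(state: dict, tap_stream_id: str) -> dict:
    # Hypothetical sketch: return the stream's bookmark dict, e.g.
    # {'start_date': '2021-01-01T00:00:00+0000'}, or an empty dict so the
    # stream method falls back to its own defaults.
    return state.get('bookmarks', {}).get(tap_stream_id, {})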
Example No. 5
def do_sync(client, config, catalog, state):
    """
    Translate metadata into a set of metrics and dimensions and call out
    to sync to generate the required reports.
    """
    selected_streams = catalog.get_selected_streams(state)
    for stream in selected_streams:
        # Transform state for this report to new format before proceeding
        state = clean_state_for_report(config, state, stream.tap_stream_id)

        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        singer.write_state(state)

        metrics = []
        dimensions = []
        mdata = metadata.to_map(stream.metadata)
        for field_path, field_mdata in mdata.items():
            if field_path == tuple():
                continue
            if field_mdata.get('inclusion') == 'unsupported':
                continue
            _, field_name = field_path
            if field_mdata.get('inclusion') == 'automatic' or \
               field_mdata.get('selected') or \
               (field_mdata.get('selected-by-default') and field_mdata.get('selected') is None):
                if field_mdata.get('behavior') == 'METRIC':
                    metrics.append(field_name)
                elif field_mdata.get('behavior') == 'DIMENSION':
                    dimensions.append(field_name)

        view_ids = get_view_ids(config)

        # NB: Resume from previous view for this report, dropping all
        # views before it to keep streams moving forward
        current_view = state.get('currently_syncing_view')
        if current_view:
            if current_view in view_ids:
                view_not_current = functools.partial(lambda cv, v: v != cv,
                                                     current_view)
                view_ids = list(itertools.dropwhile(view_not_current,
                                                    view_ids))
            else:
                state.pop('currently_syncing_view', None)

        reports_per_view = [{
            "profile_id": view_id,
            "name": stream.stream,
            "id": stream.tap_stream_id,
            "metrics": metrics,
            "dimensions": dimensions
        } for view_id in view_ids]

        end_date = get_end_date(config)

        schema = stream.schema.to_dict()

        singer.write_schema(stream.stream, schema, stream.key_properties)

        for report in reports_per_view:
            state['currently_syncing_view'] = report['profile_id']
            singer.write_state(state)

            is_historical_sync, start_date = get_start_date(
                config, report['profile_id'], state, report['id'])

            sync_report(client, schema, report, start_date, end_date, state,
                        is_historical_sync)
        state.pop('currently_syncing_view', None)
        singer.write_state(state)
    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
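The itertools.dropwhile call in this example resumes from the interrupted view by discarding every view id that precedes it. A small standalone illustration:

import itertools

view_ids = ['101', '102', '103', '104']
current_view = '103'  # value previously saved as state['currently_syncing_view']

# Drop ids until the interrupted one is reached, keeping it and everything after it.
resumed = list(itertools.dropwhile(lambda v: v != current_view, view_ids))
print(resumed)  # ['103', '104']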
Example No. 6
def sync(config, state, catalog):
    # Any client required PARAMETERS to hit the endpoint
    client = IreckonuClient(config)

    run_id = int(time.time())
    pipeline_start = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
    pipeline_start_time = time.perf_counter()
    stream_comments = []
    total_records = 0

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            batch_start = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
            start_time = time.perf_counter()
            record_count = 0

            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info("Staring sync for stream: %s", tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(
                tap_stream_id,
                stream_schema,
                stream_obj.key_properties,
                stream.replication_key,
            )

            try:
                for record in stream_obj.sync(
                    config["start_date"], config["hotel_codes"]
                ):
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata
                    )

                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )
                    record_count += 1
                    total_records += 1

                # If there is a Bookmark or state based key to store
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    "Last Run Date",
                    datetime.strftime(datetime.today(), "%Y-%m-%d"),
                )
                singer.write_state(state)

                batch_stop = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
                AuditLogs.write_audit_log(
                    run_id=run_id,
                    stream_name=tap_stream_id,
                    batch_start=batch_start,
                    batch_end=batch_stop,
                    records_synced=record_count,
                    run_time=(time.perf_counter() - start_time),
                )

            except Exception as e:
                stream_comments.append(f"{tap_stream_id.upper()}: {e}")
                batch_stop = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
                AuditLogs.write_audit_log(
                    run_id=run_id,
                    stream_name=tap_stream_id,
                    batch_start=batch_start,
                    batch_end=batch_stop,
                    records_synced=record_count,
                    run_time=(time.perf_counter() - start_time),
                    comments=e,
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)

    # Can be disabled for local runs via config
    if config.get("slack_notifications") is True:
        SlackMessenger.send_message(
            run_id=run_id,
            start_time=pipeline_start,
            run_time=(time.perf_counter() - pipeline_start_time),
            record_count=total_records,
            comments="\n".join(stream_comments),
        )
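The "Last Run Date" bookmark written in this example can be read back on the next run with singer.get_bookmark (a usage sketch; falling back to the configured start_date is an assumption):

# Usage sketch: resume from the stored "Last Run Date" bookmark, or from
# config["start_date"] when no bookmark exists yet.
last_run_date = singer.get_bookmark(
    state, tap_stream_id, "Last Run Date", config.get("start_date")
)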
Example No. 7
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id,
                                       'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id,
                                       'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id,
                                           collection_oplog_ts)

                full_table.sync_collection(client, stream, state,
                                           stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state,
                                       stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state,
                                        stream_projection)
        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication "
                "methods are supported (you passed {})".format(
                    replication_method))

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
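Note: clear_state_on_replication_change is not shown here. A hedged sketch of the usual idea (drop the stream's bookmark when the configured replication method no longer matches the one recorded in state; the bookmark key name is an assumption):

def clear_state_on_replication_change(stream, state):
    # Hypothetical sketch: discard the bookmark when the replication method
    # stored in state differs from the one now configured in metadata, so the
    # stream starts over under the new method.
    tap_stream_id = stream['tap_stream_id']
    md_map = metadata.to_map(stream['metadata'])
    current_method = metadata.get(md_map, (), 'replication-method')
    stored_method = singer.get_bookmark(state, tap_stream_id, 'replication_method')
    if stored_method and stored_method != current_method:
        state.get('bookmarks', {}).pop(tap_stream_id, None)
    return singer.write_bookmark(state, tap_stream_id,
                                 'replication_method', current_method)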
Example No. 8
def _sync_stream(client, stream, transformer, config, state, catalog,
                 **kwargs):
    record = kwargs.get('record', None)
    substreams = kwargs.get('substreams')
    tap_stream_id = stream.tap_stream_id

    stream_obj = stream()
    stream_catalog = catalog.get_stream(stream.tap_stream_id)
    replication_key = stream_obj.replication_key
    stream_schema = stream_catalog.schema.to_dict()
    stream_metadata = metadata.to_map(stream_catalog.metadata)
    replication_method = metadata.get(stream_metadata, (),
                                      'replication-method')
    stream_obj.update_replication_method(replication_method)

    LOGGER.debug('Starting sync for stream: %s', tap_stream_id)
    state = singer.set_currently_syncing(state, tap_stream_id)
    singer.write_state(state)

    # Only write schema once
    if tap_stream_id not in schemas_written:
        singer.write_schema(tap_stream_id, stream_schema,
                            stream_obj.key_properties, stream.replication_key)
        schemas_written.append(tap_stream_id)

    start_date = singer.get_bookmark(state, tap_stream_id, replication_key,
                                     config['start_date'])
    offset = singer.get_bookmark(state, tap_stream_id, 'offset', 0)

    max_record_value = start_date
    for page, cursor in stream_obj.sync(client,
                                        config,
                                        state,
                                        record=record,
                                        start_date=start_date,
                                        offset=offset):
        for record in page:
            transformed_record = transformer.transform(record, stream_schema,
                                                       stream_metadata)

            time_extracted = singer.utils.now()
            singer.write_record(tap_stream_id,
                                transformed_record,
                                time_extracted=time_extracted)

            if stream_obj.replication_method == 'INCREMENTAL':
                current_replication_value = deep_get(record, replication_key)
                if current_replication_value \
                        and current_replication_value > max_record_value:
                    max_record_value = current_replication_value

            if substreams:
                _sync_streams(client,
                              substreams.values(),
                              transformer,
                              config,
                              state,
                              catalog,
                              record=record,
                              start_date=start_date)

        state = singer.write_bookmark(state, tap_stream_id, 'offset', cursor)

        if stream_obj.replication_method == 'INCREMENTAL':
            state = singer.write_bookmark(state, tap_stream_id,
                                          replication_key, max_record_value)

        singer.write_state(state)

    state = singer.clear_bookmark(state, tap_stream_id, 'offset')
    singer.write_state(state)
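Note: deep_get is used above to read a possibly nested replication value but is not defined in this example. A minimal sketch, assuming dot-separated paths into nested dicts:

def deep_get(record, path, default=None):
    # Hypothetical sketch: walk a dot-separated path ("a.b.c") through nested
    # dicts, returning default when any level is missing.
    value = record
    for key in path.split('.'):
        if not isinstance(value, dict) or key not in value:
            return default
        value = value[key]
    return value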
Example No. 9
    def run_sync(cls, config, properties, arguments):
        streams = properties['streams']

        for stream in streams:
            table = re.sub('[^0-9a-zA-Z_]+', '_', stream['table_name']).lower()
            schema = stream['metadata']
            clean_properties = {}

            # Clean field names (a separate dict avoids shadowing the
            # `properties` argument above)
            for f_name in schema['properties'].keys():
                clean_f_name = re.sub('[^0-9a-zA-Z_]+', '_', f_name).lower()
                clean_properties[clean_f_name] = schema['properties'][f_name]

            schema['properties'] = clean_properties

            if table != 'relations' and schema['selected']:
                STATE.update(singer.set_currently_syncing(STATE, table))
                singer.write_state(STATE)

                response = Airtable.get_response(config['base_id'],
                                                 schema["name"])
                if response.json().get('records'):
                    response_records = response.json().get('records')
                    clean_response_records = []

                    for response_record in response_records:
                        fields = {}

                        # Clean field names
                        for f_name in response_record['fields'].keys():
                            clean_f_name = re.sub('[^0-9a-zA-Z_]+', '_',
                                                  f_name).lower()
                            fields[clean_f_name] = response_record['fields'][
                                f_name]

                        response_record['fields'] = fields
                        clean_response_records.append(response_record)

                    records = JsonUtils.match_record_with_keys(
                        schema, clean_response_records,
                        config['remove_emojis'])

                    singer.write_schema(table, schema, 'id')
                    singer.write_records(table, records)

                    offset = response.json().get("offset")

                    while offset:
                        response = Airtable.get_response(
                            config['base_id'], schema["name"], offset)
                        if response.json().get('records'):
                            records = JsonUtils.match_record_with_keys(
                                schema,
                                response.json().get('records'),
                                config['remove_emojis'])

                            # Only write when this page returned records;
                            # otherwise the previous page would be re-emitted.
                            singer.write_records(table, records)

                        offset = response.json().get("offset")

        relations_table = {
            "name": "relations",
            "properties": {
                "id": {
                    "type": ["null", "string"]
                },
                "relation1": {
                    "type": ["null", "string"]
                },
                "relation2": {
                    "type": ["null", "string"]
                }
            }
        }

        singer.write_schema('relations', relations_table, 'id')
        singer.write_records('relations', Relations.get_records())

        STATE.update(singer.set_currently_syncing(STATE, None))
        singer.write_state(STATE)
        LOGGER.info("Sync completed")
Example No. 10
def do_sync(config, state, stream):
    singer.set_currently_syncing(state, stream.tap_stream_id)
    singer.write_state(state)

    client = bigquery.Client()
    metadata = stream.metadata[0]["metadata"]
    tap_stream_id = stream.tap_stream_id

    inclusive_start = True
    start_datetime = singer.get_bookmark(state, tap_stream_id,
                                         BOOKMARK_KEY_NAME)
    if start_datetime:
        if not config.get("start_always_inclusive"):
            inclusive_start = False
    else:
        start_datetime = config.get("start_datetime")
    start_datetime = dateutil.parser.parse(start_datetime).strftime(
        "%Y-%m-%d %H:%M:%S.%f")

    if config.get("end_datetime"):
        end_datetime = dateutil.parser.parse(
            config.get("end_datetime")).strftime("%Y-%m-%d %H:%M:%S.%f")

    singer.write_schema(tap_stream_id, stream.schema.to_dict(),
                        stream.key_properties)

    keys = {
        "table": metadata["table"],
        "columns": metadata["columns"],
        "datetime_key": metadata.get("datetime_key"),
        "start_datetime": start_datetime,
        "end_datetime": end_datetime
    }

    limit = config.get("limit", None)
    query = _build_query(keys,
                         metadata.get("filters", []),
                         inclusive_start,
                         limit=limit)
    query_job = client.query(query)

    properties = stream.schema.properties
    last_update = start_datetime

    LOGGER.info("Running query:\n    %s" % query)

    extract_tstamp = datetime.datetime.utcnow()
    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)

    with metrics.record_counter(tap_stream_id) as counter:
        for row in query_job:
            record = {}
            for key in properties.keys():
                prop = properties[key]

                if key in [
                        LEGACY_TIMESTAMP, EXTRACT_TIMESTAMP, BATCH_TIMESTAMP
                ]:
                    continue

                if row[key] is None:
                    if prop.type[0] != "null":
                        raise ValueError(
                            "NULL value not allowed by the schema")
                    else:
                        record[key] = None
                elif prop.format == "date-time":
                    if type(row[key]) == str:
                        r = dateutil.parser.parse(row[key])
                    elif type(row[key]) == datetime.date:
                        r = datetime.datetime(year=row[key].year,
                                              month=row[key].month,
                                              day=row[key].day)
                    elif type(row[key]) == datetime.datetime:
                        r = row[key]
                    record[key] = r.isoformat()
                elif prop.type[1] == "string":
                    record[key] = str(row[key])
                elif prop.type[1] == "number":
                    record[key] = Decimal(row[key])
                elif prop.type[1] == "integer":
                    record[key] = int(row[key])
                else:
                    record[key] = row[key]

            if LEGACY_TIMESTAMP in properties.keys():
                record[LEGACY_TIMESTAMP] = int(round(time.time() * 1000))
            if EXTRACT_TIMESTAMP in properties.keys():
                record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

            singer.write_record(stream.stream, record)

            last_update = record[keys["datetime_key"]]
            counter.increment()

    state = singer.write_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME,
                                  last_update)

    singer.write_state(state)
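Note: _build_query is not part of this listing. A hedged sketch of the kind of query it might build from the keys dict (the SQL shape is an assumption; identifier quoting and BigQuery-specific syntax are omitted):

def _build_query(keys, filters=None, inclusive_start=True, limit=None):
    # Hypothetical sketch: select the configured columns and restrict rows to
    # the [start_datetime, end_datetime] window on datetime_key.
    op = ">=" if inclusive_start else ">"
    query = "SELECT {} FROM {} WHERE {} {} '{}'".format(
        ", ".join(keys["columns"]), keys["table"],
        keys["datetime_key"], op, keys["start_datetime"])
    if keys.get("end_datetime"):
        query += " AND {} <= '{}'".format(keys["datetime_key"],
                                          keys["end_datetime"])
    for condition in filters or []:
        query += " AND {}".format(condition)
    query += " ORDER BY {}".format(keys["datetime_key"])
    if limit is not None:
        query += " LIMIT {}".format(int(limit))
    return query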
Example No. 11
def sync(config, state, catalog):
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            start_time = singer.get_bookmark(state, tap_stream_id,
                                             replication_key,
                                             config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id,
                                                    'cursor')

            if tap_stream_id == 'shifts':
                replication_key = stream_obj.replication_key

                sync_start_bookmark = singer.get_bookmark(
                    state, tap_stream_id, 'sync_start',
                    singer.utils.strftime(
                        singer.utils.now(),
                        format_str=singer.utils.DATETIME_PARSE))
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    sync_start_bookmark,
                )
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        if record[replication_key] >= start_time:
                            transformed_record = transformer.transform(
                                record,
                                stream_schema,
                                stream_metadata,
                            )
                            singer.write_record(
                                tap_stream_id,
                                transformed_record,
                            )
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    singer.write_state(state)

                state = singer.clear_bookmark(state, tap_stream_id,
                                              'sync_start')
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    replication_key,
                    sync_start_bookmark,
                )
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                replication_key = stream_obj.replication_key
                max_record_value = start_time
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        transformed_record = transformer.transform(
                            record, stream_schema, stream_metadata)
                        singer.write_record(
                            tap_stream_id,
                            transformed_record,
                        )
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[
                                replication_key]

                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  replication_key,
                                                  max_record_value)
                    singer.write_state(state)

            else:
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata)
                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )
            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Example No. 12
    def do_sync(self):
        logger.debug('Starting sync')

        # resuming when currently_syncing within state
        resume_from_stream = False
        if self.state and 'currently_syncing' in self.state:
            resume_from_stream = self.state['currently_syncing']

        for stream in self.streams:
            stream.tap = self

            if resume_from_stream:
                if stream.schema == resume_from_stream:
                    logger.info('Resuming from {}'.format(resume_from_stream))
                    resume_from_stream = False
                else:
                    logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                    continue

            # stream state, from state/bookmark or start_date
            stream.set_initial_state(self.state, self.config['start_date'])

            # currently syncing
            if stream.state_field:
                set_currently_syncing(self.state, stream.schema)
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field, str(stream.initial_state))
                singer.write_state(self.state)

            # schema
            stream.write_schema()

            if stream.id_list: # see if we want to iterate over a list of deal_ids

                for deal_id in stream.get_deal_ids(self):
                    is_last_id = False

                    if deal_id == stream.these_deals[-1]: #find out if this is last deal_id in the current set
                        is_last_id = True

                    # if last page of deals, more_items in collection will be False
                    # Need to set it to True to get deal_id pagination for the first deal on the last page
                    if deal_id == stream.these_deals[0]:
                        stream.more_items_in_collection = True

                    stream.update_endpoint(deal_id)
                    stream.start = 0   # set back to zero for each new deal_id
                    self.do_paginate(stream)

                    if not is_last_id:
                        stream.more_items_in_collection = True   #set back to True for pagination of next deal_id request
                    elif is_last_id and stream.more_ids_to_get:  # need to get the next batch of deal_ids
                        stream.more_items_in_collection = True
                        stream.start = stream.next_start
                    else:
                        stream.more_items_in_collection = False

                # set the attribution window so that the bookmark will reflect the new initial_state for the next sync
                stream.earliest_state = stream.stream_start.subtract(hours=3)
            else:
                # paginate
                self.do_paginate(stream)

            # update state / bookmarking only when supported by stream
            if stream.state_field:
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                                   str(stream.earliest_state))
            singer.write_state(self.state)

        # clear currently_syncing
        self.state.pop('currently_syncing', None)
        singer.write_state(self.state)
Example No. 13
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = Client(config)

    # Check if there are existing bookmarks, if not create a new one
    state['bookmarks'] = state.get('bookmarks', {})

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']
        report_definition = Report.get_report_definition(stream)

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (),
                                      "table-key-properties")

        if stream_id in selected_stream_ids:
            start_date = utils.strptime_to_utc(
                get_bookmark(
                    state,
                    stream_id,
                    'last_report_date',
                    default=config['start_date'].strftime('%Y-%m-%d')))
            start_date = start_date - timedelta(
                days=config.get('lookback_days', 15))
            end_date = config['end_date']
            date_interval = config['date_batching']
            segment_id = config.get('segment_id', None)

            LOGGER.info(f'Syncing stream: {stream_id}')
            LOGGER.info(
                f'Will sync data from {start_date.isoformat()} until {end_date.isoformat()}'
            )

            # Sets the currently syncing stream in state
            singer.set_currently_syncing(state, stream_id)
            # Writes the schema for the current stream
            singer.write_schema(stream_id, stream_schema, key_properties)

            for start_date, end_date in batch_report_dates(
                    start_date, end_date, date_interval):
                LOGGER.info(
                    f'Request for {start_date.isoformat()} to {end_date.isoformat()} started.'
                )
                start = timer()
                try:
                    results = client.process_stream(start_date, end_date,
                                                    report_definition,
                                                    segment_id)

                    # Writes individual items from results array as records
                    singer.write_records(stream_id, results)
                    # Updates the stream bookmark with the latest report timestamp
                    singer.write_bookmark(state, stream_id, 'last_report_date',
                                          end_date.strftime("%Y-%m-%d"))
                    singer.write_state(state)
                except GaInvalidArgumentError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to invalid report definition."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaRateLimitError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Rate Limit Errors.".
                        format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaQuotaExceededError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Quota Exceeded Errors.".
                        format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except GaAuthenticationError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Authentication Errors."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                except GaUnknownError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Unknown Errors."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                end = timer()
                LOGGER.info(
                    f'Request for {start_date.isoformat()} to {end_date.isoformat()} finished in {(end - start):.2f} seconds.'
                )

            singer.set_currently_syncing(state, '')
            singer.write_state(state)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
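Note: batch_report_dates is not included above. A minimal sketch, assuming date_interval is a number of days and that the generator yields consecutive (batch_start, batch_end) datetime windows (the original may use a different convention for date_batching):

from datetime import timedelta

def batch_report_dates(start_date, end_date, date_interval):
    # Hypothetical sketch: yield consecutive windows of `date_interval` days
    # until end_date is covered.
    batch_start = start_date
    while batch_start <= end_date:
        batch_end = min(batch_start + timedelta(days=date_interval - 1), end_date)
        yield batch_start, batch_end
        batch_start = batch_end + timedelta(days=1)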
Example No. 14
def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [replication_key])

                for message in incremental.sync_table(con, catalog_entry,
                                                      state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication",
                            catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(
                        catalog_entry, columns)

                    for message in binlog.sync_table(con, config,
                                                     catalog_entry, state,
                                                     columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(
                        con)

                    stream_version = common.get_stream_version(
                        catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version', stream_version)

                    for message in full_table.sync_table(
                            con, catalog_entry, state, columns,
                            stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file', log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                stream_version = common.get_stream_version(
                    catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
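Because generate_messages is a generator, a thin wrapper typically emits whatever it yields; a usage sketch (the wrapper itself is not part of this example):

def do_sync(con, config, catalog, state):
    # Usage sketch: write every schema/record/state message the generator yields.
    for message in generate_messages(con, config, catalog, state):
        singer.write_message(message)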