Example No. 1
def sync_generic_basic_endpoint(sdk_client, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream,
                sdk_client.client_customer_id)

    start_index = 0
    selector = {
        'fields': field_list,
        'paging': {
            'startIndex': str(start_index),
            'numberResults': str(PAGE_SIZE)
        }
    }

    while True:
        page = get_page(sdk_client, selector, stream, start_index)
        if page['totalNumEntries'] > GOOGLE_MAX_START_INDEX:
            raise Exception("Too many %s (%s > %s) for customer %s", stream,
                            GOOGLE_MAX_START_INDEX, page['totalNumEntries'],
                            sdk_client.client_customer_id)

        if 'entries' in page:
            with metrics.record_counter(stream) as counter:
                time_extracted = utils.now()

                for entry in page['entries']:
                    obj = suds_to_dict(entry)
                    obj['_sdc_customer_id'] = sdk_client.client_customer_id
                    with Transformer(
                            singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                    ) as bumble_bee:
                        bumble_bee.pre_hook = transform_pre_hook
                        record = bumble_bee.transform(obj, discovered_schema)

                        singer.write_record(stream,
                                            record,
                                            time_extracted=time_extracted)
                        counter.increment()

        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break
    LOGGER.info("Done syncing %s for customer_id %s", stream,
                sdk_client.client_customer_id)
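
Note: the loop above pages through results by advancing start_index in steps of PAGE_SIZE until it passes totalNumEntries. A minimal, self-contained sketch of that paging pattern (fetch_page and the counts below are invented stand-ins, not part of the tap):

PAGE_SIZE = 100

def fetch_page(start_index, total=250):
    # Simulated paged API: return up to PAGE_SIZE entries starting at start_index.
    entries = list(range(start_index, min(start_index + PAGE_SIZE, total)))
    return {'totalNumEntries': total, 'entries': entries}

def paged_sync():
    start_index = 0
    seen = 0
    while True:
        page = fetch_page(start_index)
        seen += len(page.get('entries', []))
        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break
    return seen

assert paged_sync() == 250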
Example No. 2
def sync_branches(project):
    url = get_url("branches", project['id'])
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            row['project_id'] = project['id']
            flatten_id(row, "commit")
            transformed_row = transformer.transform(
                row, RESOURCES["branches"]["schema"])
            singer.write_record("branches",
                                transformed_row,
                                time_extracted=utils.now())
Example No. 3
def sync_issues(project):
    url = get_url("issues", project['id'])
    with Transformer(pre_hook=format_timestamp) as transformer:
        for row in gen_request(url):
            flatten_id(row, "author")
            flatten_id(row, "assignee")
            flatten_id(row, "milestone")
            transformed_row = transformer.transform(row, RESOURCES["issues"]["schema"])

            if row["updated_at"] >= get_start("project_{}".format(project["id"])):
                singer.write_record("issues", transformed_row, time_extracted=utils.now())
Example No. 4
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if (assoc_mdata.get('selected') and assoc_mdata.get('selected') == True):
                params['includeAssociations'] = True

    if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allpropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
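
Note: both branches above convert HubSpot's millisecond timestamps to timezone-aware UTC datetimes before comparing them against the bookmark. A stand-alone sketch of that conversion (the sample timestamp is illustrative):

import datetime

def millis_to_utc(timestamp_millis):
    # Epoch milliseconds -> aware UTC datetime, as done in sync_deals above.
    return datetime.datetime.fromtimestamp(timestamp_millis / 1000.0,
                                           datetime.timezone.utc)

print(millis_to_utc(1609459200000))  # 2021-01-01 00:00:00+00:00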
Example No. 5
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        max_bookmark_value=None,
        last_datetime=None,
        parent=None,
        parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        record, schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: {}'.format(err))
                    LOGGER.error('Stream: {}, record: {}'.format(
                        stream_name, record))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                        transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(
                        transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm:
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
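
Note: the core of process_records is the bookmark bookkeeping: track the highest bookmark value seen and emit only records at or after the previous bookmark. A stand-alone sketch of that logic (the record shape and field name are illustrative, and ISO-8601 strings are compared directly instead of using transform_datetime):

def filter_by_bookmark(records, bookmark_field, last_datetime):
    # Illustrative only: ISO-8601 strings in the same format compare
    # correctly as plain strings, which keeps the sketch dependency-free.
    max_bookmark_value = last_datetime
    emitted = []
    for record in records:
        value = record.get(bookmark_field)
        if value and value > max_bookmark_value:
            max_bookmark_value = value
        if value is None or value >= last_datetime:
            emitted.append(record)
    return emitted, max_bookmark_value

rows = [{'updated_at': '2021-01-01T00:00:00Z'},
        {'updated_at': '2021-03-01T00:00:00Z'}]
print(filter_by_bookmark(rows, 'updated_at', '2021-02-01T00:00:00Z'))
# ([{'updated_at': '2021-03-01T00:00:00Z'}], '2021-03-01T00:00:00Z')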
Example No. 6
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))
    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') != None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc ))) # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
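
Note: the outer while loop above walks from start_ts to now_ts in fixed-size chunks of epoch milliseconds. A minimal sketch of that windowing (the chunk size is illustrative, not an actual CHUNK_SIZES value):

ONE_DAY_MS = 24 * 60 * 60 * 1000

def window_chunks(start_ts, now_ts, chunk_size=ONE_DAY_MS):
    # Yield consecutive [start, end) windows until now_ts is reached.
    while start_ts < now_ts:
        end_ts = start_ts + chunk_size
        yield start_ts, end_ts
        start_ts = end_ts

print(list(window_chunks(0, 3 * ONE_DAY_MS)))
# [(0, 86400000), (86400000, 172800000), (172800000, 259200000)]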
Example No. 7
def sync_stream(service, catalog, state, start_date, stream, mdata):
    stream_name = stream.tap_stream_id
    last_datetime = get_bookmark(state, stream_name, start_date)

    write_schema(stream)

    max_modified = last_datetime

    ## TODO: add metrics?
    entitycls = service.entities[stream_name]
    query = service.query(entitycls)

    if hasattr(entitycls, MODIFIED_DATE_FIELD):
        LOGGER.info('{} - Syncing data since {}'.format(
            stream.tap_stream_id, last_datetime))
        query = (query.filter(
            getattr(entitycls, MODIFIED_DATE_FIELD) >=
            singer.utils.strptime_with_tz(last_datetime)).order_by(
                getattr(entitycls, MODIFIED_DATE_FIELD).asc()))
    else:
        LOGGER.info('{} - Syncing using full replication'.format(
            stream.tap_stream_id))

    schema = stream.schema.to_dict()
    optionset_map = get_optionset_metadata(service, stream.tap_stream_id)
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in query:
            dict_record = {}
            for odata_prop in entitycls.__odata_schema__['properties']:
                prop_name = odata_prop['name']
                value = getattr(record, prop_name)
                if isinstance(value, datetime):
                    value = singer.utils.strftime(value)
                dict_record[prop_name] = value

                if prop_name in optionset_map:
                    label_prop_name = get_optionset_fieldname(prop_name)
                    if value is None:
                        dict_record[label_prop_name] = None
                    else:
                        dict_record[label_prop_name] = optionset_map[
                            prop_name][value]

            if MODIFIED_DATE_FIELD in dict_record and dict_record[
                    MODIFIED_DATE_FIELD] > max_modified:
                max_modified = dict_record[MODIFIED_DATE_FIELD]

            with Transformer() as transformer:
                dict_record_t = transformer.transform(dict_record, schema,
                                                      mdata)
            singer.write_record(stream.tap_stream_id, dict_record_t)
            counter.increment()

    write_bookmark(state, stream_name, max_modified)
Example No. 8
def sync_users(project):
    url = get_url("users", project['id'])
    project["users"] = []
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(
                row, RESOURCES["users"]["schema"])
            project["users"].append(row["id"])
            singer.write_record("users",
                                transformed_row,
                                time_extracted=utils.now())
Example No. 9
def sync_stream(service, catalog, state, start_date, stream, mdata):
    stream_name = stream.tap_stream_id
    last_datetime = get_bookmark(state, stream_name, start_date)

    write_schema(stream)

    max_modified = last_datetime

    ## TODO: add metrics?
    entitycls = service.entities[stream_name]
    query = service.query(entitycls)

    if hasattr(entitycls, MODIFIED_DATE_FIELD):
        LOGGER.info("{} - Syncing data since {}".format(
            stream.tap_stream_id, last_datetime))
        query = query.filter(
            getattr(entitycls, MODIFIED_DATE_FIELD) >=
            singer.utils.strptime_with_tz(last_datetime)).order_by(
                getattr(entitycls, MODIFIED_DATE_FIELD).asc())
    else:
        LOGGER.info("{} - Syncing using full replication".format(
            stream.tap_stream_id))

    schema = stream.schema.to_dict()

    count = 0
    with metrics.http_request_timer(stream.tap_stream_id):
        with metrics.record_counter(stream.tap_stream_id) as counter:
            for record in query:
                dict_record = {}
                for odata_prop in entitycls.__odata_schema__["properties"]:
                    prop_name = odata_prop["name"]
                    value = getattr(record, prop_name)
                    if isinstance(value, datetime):
                        value = singer.utils.strftime(value)
                    dict_record[prop_name] = value

                if MODIFIED_DATE_FIELD in dict_record:
                    if dict_record[MODIFIED_DATE_FIELD] > max_modified:
                        max_modified = dict_record[MODIFIED_DATE_FIELD]
                    else:
                        continue

                with Transformer() as transformer:
                    dict_record = transformer.transform(
                        dict_record, schema, mdata)
                singer.write_record(stream.tap_stream_id, dict_record)
                counter.increment()

                count += 1
                if count % 5000 == 0:
                    write_bookmark(state, stream_name, max_modified)

    write_bookmark(state, stream_name, max_modified)
Example No. 10
def sync_milestones(project):
    url = get_url("milestones", project['id'])
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(
                row, RESOURCES["milestones"]["schema"])

            if row["updated_at"] >= get_start("project_{}".format(
                    project["id"])):
                singer.write_record("milestones",
                                    transformed_row,
                                    time_extracted=utils.now())
Example No. 11
def sync(client, config, catalog, state):
    LOGGER.info('Starting Sync..')
    selected_streams = catalog.get_selected_streams(state)

    streams = []
    stream_keys = []

    with Transformer() as transformer:
        for catalog_entry in selected_streams:
            streams.append(catalog_entry)
            stream_keys.append(catalog_entry.stream)

        for catalog_entry in streams:
            stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client,
                                                             config=config,
                                                             catalog=catalog,
                                                             state=state)
            LOGGER.info('Syncing stream: %s', catalog_entry.stream)
            stream.update_currently_syncing()
            stream.write_state()
            bookmark_date = stream.get_bookmark(stream.name,
                                                config['start_date'])
            bookmark_dttm = strptime_to_utc(bookmark_date)
            stream_schema = catalog_entry.schema.to_dict()
            stream.write_schema()
            stream_metadata = metadata.to_map(catalog_entry.metadata)
            new_bookmark = bookmark_dttm

            with singer.metrics.job_timer(job_type=stream.name):
                with singer.metrics.record_counter(
                        endpoint=stream.name) as counter:
                    for page in stream.sync(bookmark_dttm):
                        transformed_records = transform(page)
                        for transformed in transformed_records:
                            record_bookmark = datetime.datetime.fromtimestamp(
                                transformed.get('last_modified') / 1000.0,
                                tz=timezone.utc)
                            new_bookmark = max(new_bookmark, record_bookmark)

                            if record_bookmark > bookmark_dttm:
                                singer.write_record(
                                    catalog_entry.stream,
                                    transformer.transform(
                                        transformed,
                                        stream_schema,
                                        stream_metadata,
                                    ))
                                counter.increment()

                    stream.update_bookmark(stream.name, strftime(new_bookmark))
                    stream.write_state()

        LOGGER.info('Finished Sync..')
Example No. 12
def persist_records(catalog, stream_id, records):
    stream = catalog.get_stream(stream_id)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_id) as counter:
        for record in records:
            with Transformer(
                    integer_datetime_fmt=UNIX_SECONDS_INTEGER_DATETIME_PARSING
            ) as transformer:
                record = transformer.transform(record, schema, stream_metadata)
            singer.write_record(stream_id, record)
            counter.increment()
Example No. 13
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets sync'd first
    currently_sync_stream_name = Context.state.get(
        'bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            for rec in stream.sync():
                extraction_time = singer.utils.now()
                record_schema = catalog_entry['schema']
                record_metadata = metadata.to_map(catalog_entry['metadata'])
                rec = transformer.transform(rec, record_schema,
                                            record_metadata)
                singer.write_record(stream_id,
                                    rec,
                                    time_extracted=extraction_time)
                Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
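
Note: shuffle_streams is not shown above. Purely as an illustration of the idea (not the tap's actual implementation), moving the interrupted stream to the front of the sync order could look like this:

def move_stream_first(stream_ids, currently_syncing):
    # Illustrative helper (not from the tap): put the interrupted stream first
    # so a resumed sync picks up where it left off.
    if currently_syncing in stream_ids:
        return [currently_syncing] + [s for s in stream_ids if s != currently_syncing]
    return stream_ids

print(move_stream_first(['orders', 'products', 'customers'], 'products'))
# ['products', 'orders', 'customers']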
Example No. 14
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        bookmark_type=None,
        max_bookmark_value=None,
        last_datetime=None,
        last_integer=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)
                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                        (transformed_record[bookmark_field] > transform_datetime(max_bookmark_value)):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transform_datetime(last_datetime)
                        bookmark_dttm = transform_datetime(
                            transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            # LOGGER.info('record1: {}'.format(record)) # TESTING, comment out
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                else:
                    # LOGGER.info('record2: {}'.format(record)) # TESTING, comment out
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(
            stream_name, counter.value))
        return max_bookmark_value
Example No. 15
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(
        state, catalog_entry['tap_stream_id'],
        'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                    'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_alias or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary
                replication_key_value = replication_key and singer_utils.strptime_with_tz(
                    rec[replication_key])
                if replication_key_value and replication_key_value <= start_time and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])

        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
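
Note: the loop above iterates over a copy of the list (batch_ids[:]) so each batch can be removed from the original list, and the shrinking list persisted to state, as soon as it is synced. A tiny stand-alone illustration of that pattern:

batch_ids = ['batch-1', 'batch-2', 'batch-3']
for batch_id in batch_ids[:]:
    # ... sync the batch here, then drop it from the remaining work ...
    batch_ids.remove(batch_id)
    print('finished', batch_id, '- remaining:', batch_ids)
print(batch_ids)  # []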
Example No. 16
def persist_records(catalog, stream_id, records):
    if records:  # check for empty array
        stream = catalog.get_stream(stream_id)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_id) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema,
                                                   stream_metadata)
                singer.write_record(stream_id, record)
                counter.increment()
Example No. 17
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'), time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
Example No. 18
def process_records(catalog, stream_name, records):
    if records:
        stream = catalog.get_stream(stream_name)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_name) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema,
                                                   stream_metadata)
                singer.write_record(stream_name, record)
                counter.increment()
Example No. 19
def sync_sub_stream(sub_stream_name,
                    parent_obj,
                    updates=False):
    """
    Given a parent object, retrieve its values for the specified substream.
    """
    extraction_time = singer.utils.now()

    if sub_stream_name == "invoice_line_items":
        object_list = parent_obj.lines
    elif sub_stream_name == "subscription_items":
        # parent_obj.items is a function that returns a dict iterator, so use the attribute
        object_list = parent_obj.get("items")
    elif sub_stream_name == "payout_transactions":
        payout_id = parent_obj['id']
        acct_id = Context.config.get('account_id')
        # Balance transaction history with a payout id param
        # provides the link of transactions to payouts
        object_list = stripe.BalanceTransaction.list(limit=100,
                                                     stripe_account=acct_id,
                                                     payout=payout_id)
    else:
        raise Exception("Attempted to sync substream that is not implemented: {}"
                        .format(sub_stream_name))

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        iterator = get_object_list_iterator(object_list)
        for sub_stream_obj in iterator:
            obj_ad_dict = sub_stream_obj.to_dict_recursive()

            if sub_stream_name == "invoice_line_items":
                # Synthetic addition of a key to the record we sync
                obj_ad_dict["invoice"] = parent_obj.id
            elif sub_stream_name == "payout_transactions":
                # payout_transactions is a join table
                obj_ad_dict = {"id": obj_ad_dict['id'], "payout_id": parent_obj['id']}

            rec = transformer.transform(unwrap_data_objects(obj_ad_dict),
                                        Context.get_catalog_entry(sub_stream_name)['schema'],
                                        metadata.to_map(
                                            Context.get_catalog_entry(sub_stream_name)['metadata']
                                        ))
            # NB: Older structures (such as invoice_line_items) may not have had their ID present.
            #     Skip these if they don't match the structure we expect.
            if "id" in rec:
                singer.write_record(sub_stream_name,
                                    rec,
                                    time_extracted=extraction_time)
            if updates:
                Context.updated_counts[sub_stream_name] += 1
            else:
                Context.new_counts[sub_stream_name] += 1
Example No. 20
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get(
        "properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example No. 21
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table spec
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    sync_one_one = config.get('sync_one_one', "True")
    if sync_one_one is True or sync_one_one in ("True", "true"):
        sync_one_one = True
    elif sync_one_one is False or sync_one_one in ("False", "false"):
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")
    table_name = table_spec['table_name']
    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0
    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,

                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}
            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))
            write_record(table_name, to_write)
        if sync_one_one:
            write_message(
                OneOneMessage(table_name, row, TagSet=tags, sync_one_one=sync_one_one, _sdc_source_file=s3_path))

        records_synced += 1

    return records_synced
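
Note: the sync_one_one handling above amounts to accepting either a real boolean or the strings "true"/"false" in any case. A minimal sketch of that parsing with an invented helper name:

def parse_bool_config(value, param_name='sync_one_one'):
    # Hypothetical helper, not part of the tap: normalize booleans and
    # "true"/"false" strings, rejecting anything else.
    if isinstance(value, bool):
        return value
    if str(value).strip().lower() == 'true':
        return True
    if str(value).strip().lower() == 'false':
        return False
    raise Exception("Don't understand {} param in config, must be boolean".format(param_name))

print(parse_bool_config('True'))   # True
print(parse_bool_config(False))    # False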
Example No. 22
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"], [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. This creates a race condition:
    # records may be updated between the start and the end of this
    # table's sync, causing some updates to be missed. To combat this,
    # we save a lookback window covering how long the previous sync of
    # this stream took, and shift the bookmark back by that amount on
    # the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
       last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.").format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)
    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key, "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record, catalog.get('stream_alias'), time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE
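
Note: the lookback described in the comment above shifts the bookmark back by the previous sync's duration. A self-contained sketch using only the standard library in place of singer-python's utils helpers:

import datetime

def adjust_start_for_lookback(start, last_sync_duration_seconds):
    # Shift the bookmark back by the previous sync's duration so records
    # updated while that sync was running are picked up again.
    start_dt = datetime.datetime.fromisoformat(start.replace('Z', '+00:00'))
    adjusted = start_dt - datetime.timedelta(seconds=last_sync_duration_seconds)
    return adjusted.isoformat()

print(adjust_start_for_lookback('2021-06-01T12:00:00Z', 600))
# 2021-06-01T11:50:00+00:00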
Example No. 23
def sync_sub_stream(sub_stream_name,
                    parent_obj,
                    parent_replication_key,
                    save_bookmarks=True,
                    updates=False):
    """
    Given a parent object, retrieve its values for the specified substream.
    """
    extraction_time = singer.utils.now()

    if sub_stream_name == "invoice_line_items":
        object_list = parent_obj.lines
    elif sub_stream_name == "subscription_items":
        # parent_obj.items is a function that returns a dict iterator, so use the attribute
        object_list = parent_obj.get("items")
    else:
        raise Exception(
            "Attempted to sync substream that is not implemented: {}".format(
                sub_stream_name))

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        iterator = get_object_list_iterator(object_list)
        for sub_stream_obj in iterator:
            obj_ad_dict = sub_stream_obj.to_dict_recursive()

            if sub_stream_name == "invoice_line_items":
                # Synthetic addition of a key to the record we sync
                obj_ad_dict["invoice"] = parent_obj.id

            rec = transformer.transform(
                unwrap_data_objects(obj_ad_dict),
                Context.get_catalog_entry(sub_stream_name)['schema'],
                metadata.to_map(
                    Context.get_catalog_entry(sub_stream_name)['metadata']))
            # NB: Older structures (such as invoice_line_items) may not have had their ID present.
            #     Skip these if they don't match the structure we expect.
            if "id" in rec:
                singer.write_record(sub_stream_name,
                                    rec,
                                    time_extracted=extraction_time)
            if updates:
                Context.updated_counts[sub_stream_name] += 1
            else:
                Context.new_counts[sub_stream_name] += 1

            sub_stream_bookmark = parent_obj.get(parent_replication_key)

            if save_bookmarks:
                singer.write_bookmark(Context.state, sub_stream_name,
                                      parent_replication_key,
                                      sub_stream_bookmark)
Example No. 24
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record,
                                                           schema,
                                                           stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                        (transformed_record[bookmark_field] > max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field:
                    if bookmark_field in transformed_record:
                        if bookmark_type == 'integer':
                            # Keep only records whose bookmark is after the last_integer
                            if transformed_record[bookmark_field] >= last_integer:
                                write_record(stream_name, transformed_record, time_extracted=time_extracted)
                                counter.increment()
                        elif bookmark_type == 'datetime':
                            last_dttm = transformer._transform_datetime(last_datetime)
                            bookmark_dttm = transformer._transform_datetime(transformed_record[bookmark_field])
                            # Keep only records whose bookmark is after the last_datetime
                            if bookmark_dttm >= last_dttm:
                                write_record(stream_name, transformed_record, time_extracted=time_extracted)
                                counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
Example No. 25
def sync_campaign_ids_endpoint(sdk_client,
                               campaign_ids,
                               stream_schema,
                               stream,
                               stream_metadata):
    discovered_schema = load_schema(stream)

    field_list = get_field_list(discovered_schema, stream, stream_metadata)
    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    for safe_selector in get_campaign_ids_safe_selectors(
            sdk_client,
            campaign_ids,
            stream):
        start_index = 0
        while True:
            page = get_campaign_ids_filtered_page(sdk_client,
                                                  field_list,
                                                  safe_selector,
                                                  stream,
                                                  start_index)
            if page['totalNumEntries'] > GOOGLE_MAX_RESULTSET_SIZE:
                raise Exception("Too many {} ({} > {}) for customer {}, campaigns {}".format(
                    stream,
                    page['totalNumEntries'],
                    GOOGLE_MAX_RESULTSET_SIZE,
                    sdk_client.client_customer_id,
                    campaign_ids))
            if 'entries' in page:
                with metrics.record_counter(stream) as counter:
                    for entry in page['entries']:
                        obj = suds_to_dict(entry)
                        obj['_sdc_customer_id'] = sdk_client.client_customer_id
                        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: #pylint: disable=line-too-long
                            bumble_bee.pre_hook = transform_pre_hook
                            record = bumble_bee.transform(obj, discovered_schema)

                            singer.write_record(stream, record)
                            counter.increment()

            start_index += PAGE_SIZE
            if start_index > int(page['totalNumEntries']):
                break
    LOGGER.info("Done syncing %s for customer_id %s", stream, sdk_client.client_customer_id)
Example No. 26
def sync_deal_pipelines(STATE, catalog):
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'],
                        catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record,
                                catalog.get('stream_alias'))
    singer.write_state(STATE)
    return STATE
Example No. 27
def tidy_response(stream, raw_row, needs_coalescing, replication_key):

    if needs_coalescing:
        bookmark_value = raw_row[replication_key[0]] or raw_row[
            replication_key[1]]
    else:
        bookmark_value = raw_row[replication_key]

    with Transformer() as transformer:
        tidy_row = transformer.transform(raw_row, stream.schema.to_dict(),
                                         metadata.to_map(stream.metadata))

    return tidy_row, bookmark_value
Example No. 28
def write_page(self, page):
    stream = Context.get_catalog_entry(self.tap_stream_id)
    stream_metadata = metadata.to_map(stream.metadata)
    extraction_time = singer.utils.now()
    for rec in page:
        with Transformer() as transformer:
            rec = transformer.transform(rec, stream.schema.to_dict(),
                                        stream_metadata)
        singer.write_record(self.tap_stream_id,
                            rec,
                            time_extracted=extraction_time)
    with metrics.record_counter(self.tap_stream_id) as counter:
        counter.increment(len(page))
Example No. 29
def write_members(members):
    """ Process members array and output SCHEMA and RECORD messages
    """
    members_stream = CATALOG.get_stream("members")
    write_catalog_schema(members_stream)

    for member in members:
        user = member.pop("user")
        member.update(user)

        with Transformer(pre_hook=transform_datetimes_hook) as xform:
            member = xform.transform(member, members_stream.schema.to_dict())
            singer.write_record("members", member)
Example No. 30
def write_page(self, ctx, page):
    """Formats a list of records in place and outputs the data to
    stdout."""
    stream = ctx.catalog.get_stream(self.tap_stream_id)
    with Transformer() as transformer:
        for rec in page:
            singer.write_record(
                self.tap_stream_id,
                transformer.transform(
                    rec,
                    stream.schema.to_dict(),
                    metadata.to_map(stream.metadata),
                ))
    self.metrics(page)