Example No. 1
def _sync_contacts_by_company(STATE, ctx, company_id):
    schema = load_schema(CONTACTS_BY_COMPANY)
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    url = get_url("contacts_by_company", company_id=company_id)
    path = 'vids'
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        with metrics.record_counter(CONTACTS_BY_COMPANY) as counter:
            data = request(url, default_contacts_by_company_params).json()
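            # Each entry under 'vids' is a contact id associated with this company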
            for row in data[path]:
                counter.increment()
                record = {'company-id': company_id, 'contact-id': row}
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("contacts_by_company",
                                    record,
                                    time_extracted=utils.now())

    return STATE
Example No. 2
def find_weekly_emails_received(selected_date=previous_week, page_token=None):
    week = create_week(selected_date)
    start_date = week[0]
    end_date = week[-1]

    for date in week:
        process_received(date, page_token)

    total = total_emails_count(total_weekly_emails_received)
    json_response = create_json_response(start_date, end_date, total, 'weekly emails received')

    with Transformer() as transformer:
        transformed_record = transformer.transform(json_response, gmail_schema)

    singer.write_schema('gmail', gmail_schema, 'timestamp')
    singer.write_records('gmail', [transformed_record])

    return json_response
Example No. 3
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'],
                        catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row),
                                          schema, mdata)
            singer.write_record("deal_pipelines",
                                record,
                                catalog.get('stream_alias'),
                                time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
Example No. 4
def sync_campaigns(STATE, catalog):
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"],
                        catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
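        # Paginate through the full campaign list, then fetch the detail record for each campaign id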
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns",
                               "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail",
                                     campaign_id=row['id'])).json()
            record = bumble_bee.transform(record, schema)
            singer.write_record("campaigns", record,
                                catalog.get('stream_alias'))

    return STATE
Example No. 5
    def sync(self, client, **kwargs):
        startdate = kwargs['startdate']
        start, end = self.get_absolute_start_end_time(
            startdate, lookback=int(self.config.get('lookback')))

        max_bookmark_dttm = start

        with singer.metrics.record_counter(endpoint=self.name) as counter:
            while start != end:
                start_str = start.strftime(INVOICE_DATETIME_FMT)
                next_window_str = start_str
                results = client.get_paginated_data(self.api_method,
                                                    self.version,
                                                    self.endpoint,
                                                    data_key=self.data_key,
                                                    params=self.build_params(),
                                                    body=self.build_body(
                                                        start_str,
                                                        next_window_str))

                max_bookmark_value = strftime(max_bookmark_dttm)
                with Transformer(
                        integer_datetime_fmt="no-integer-datetime-parsing"
                ) as transformer:
                    for page in results:
                        for record in page.get(self.data_key):
                            transformed_record = self.transform(record)

                            record_timestamp = strptime_to_utc(
                                transformed_record[self.replication_key])
                            if record_timestamp > max_bookmark_dttm:
                                max_bookmark_value = strftime(record_timestamp)

                            singer.write_record(
                                stream_name=self.name,
                                record=transformer.transform(
                                    data=transformed_record,
                                    schema=self.stream_schema,
                                    metadata=self.stream_metadata),
                                time_extracted=singer.utils.now())
                            counter.increment()
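                # Advance to the next date window and persist the highest bookmark seen so far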
                start = start + timedelta(days=DATE_WINDOW_SIZE)
                self.update_bookmark(self.name, max_bookmark_value)
            return counter.value
Example No. 6
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example No. 7
def process_records(catalog,
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer(integer_datetime_fmt=UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) \
                as transformer:
                transformed_record = transformer.transform(
                    record,
                    schema,
                    stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if max_bookmark_value is None or strptime_to_utc(transformed_record[bookmark_field]) > strptime_to_utc(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = strptime_to_utc(last_datetime)
                    bookmark_dttm = strptime_to_utc(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record, time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
Example No. 8
def sync_report_for_day(stream_name, stream_schema, sdk_client, start, field_list): # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {'min': start.strftime('%Y%m%d'),
                          'max': start.strftime('%Y%m%d')}}}

    # Fetch the report as a CSV string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, values = parse_csv_string(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        for _, val in enumerate(values):
            obj = dict(zip(get_xml_attribute_headers(stream_schema, headers), val))
            obj['_sdc_customer_id'] = customer_id
            obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME
            with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

            singer.write_record(stream_name, obj, time_extracted=time_extracted)
            counter.increment()

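        # Only advance the bookmark when this day is newer than the stored start date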
        if start > get_start_for_stream(sdk_client.client_customer_id, stream_name):
            LOGGER.info('updating bookmark: %s > %s', start, get_start_for_stream(sdk_client.client_customer_id, stream_name))
            bookmarks.write_bookmark(STATE,
                                     state_key_name(sdk_client.client_customer_id, stream_name),
                                     'date',
                                     start.strftime(utils.DATETIME_FMT))
            singer.write_state(STATE)
        else:
            LOGGER.info('not updating bookmark: %s <= %s', start, get_start_for_stream(sdk_client.client_customer_id, stream_name))

        LOGGER.info("Done syncing %s records for the %s report for customer_id %s on %s",
                    counter.value, stream_name, customer_id, start)
Example No. 9
def process_records(stream, mdata, max_modified, records, filter_field):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            record_flat = {}

            for prop, value in record.items():
                record_flat[prop] = value

            if (filter_field in record_flat
                    and record_flat[filter_field] > max_modified):
                max_modified = record_flat[filter_field]

            with Transformer() as transformer:
                record_typed = transformer.transform(record_flat, schema,
                                                     mdata)
            singer.write_record(stream.tap_stream_id, record_typed)
            counter.increment()
        return max_modified
Example No. 10
    def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict, transformer: Transformer) -> dict:
        """
        The sync logic for a full table stream.

        :param state: A dictionary representing singer state
        :param stream_schema: A dictionary containing the stream schema
        :param stream_metadata: A dictionary containing stream metadata
        :param config: A dictionary containing tap config data
        :param transformer: A singer Transformer object
        :return: State data in the form of a dictionary
        """
        with metrics.record_counter(self.tap_stream_id) as counter:
            for record in self.get_records(config):
                transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                singer.write_record(self.tap_stream_id, transformed_record)
                counter.increment()

        singer.write_state(state)
        return state
Example No. 11
def sync_file(conn, f, stream, table_spec):
    LOGGER.info('Syncing file "%s".', f["filepath"])

    try:
        file_handle = conn.get_file_handle(f)
    except OSError:
        return 0

    # Add file_name to opts and flag infer_compression to support gzipped files
    opts = {
        'key_properties': table_spec['key_properties'],
        'delimiter': table_spec['delimiter'],
        'file_name': f['filepath']
    }

    readers = csv.get_row_iterators(file_handle,
                                    options=opts,
                                    infer_compression=True)

    records_synced = 0

    for reader in readers:
        with Transformer() as transformer:
            for row in reader:
                custom_columns = {
                    '_sdc_source_file': f["filepath"],

                    # index zero, +1 for header row
                    '_sdc_source_lineno': records_synced + 2
                }
                rec = {**row, **custom_columns}

                to_write = transformer.transform(
                    rec, stream.schema.to_dict(),
                    metadata.to_map(stream.metadata))

                singer.write_record(stream.tap_stream_id, to_write)
                records_synced += 1

    stats.add_file_data(table_spec, f['filepath'], f['last_modified'],
                        records_synced)

    return records_synced
Example No. 12
    def process_record(self, record, time_extracted, bookmark_field):
        with Transformer() as transformer:
            transformed_record = transformer.transform(record,
                                                       self.schema,
                                                       self.stream_metadata)

        self._update_bookmark(transformed_record, bookmark_field)
        if self._is_record_past_bookmark(transformed_record, bookmark_field):
            try:
                write_record(self.stream_name,
                             transformed_record,
                             time_extracted=time_extracted)
            except OSError as err:
                LOGGER.info(f'OS Error writing record for: {self.stream_name}')
                LOGGER.info(f'record: {transformed_record}')
                raise err

            return True
        return False
Example No. 13
def sync_stream(state, instance):
    stream = instance.stream

    with metrics.record_counter(stream.tap_stream_id) as counter:
        for (stream, record) in instance.sync(state):
            counter.increment()

            with Transformer(integer_datetime_fmt=
                             "unix-milliseconds-integer-datetime-parsing"
                             ) as transformer:
                record = transformer.transform(
                    record, stream.schema.to_dict(),
                    metadata.to_map(stream.metadata))
            singer.write_record(stream.tap_stream_id, record)

        if instance.replication_method == "INCREMENTAL":
            singer.write_state(state)

        return counter.value
Example No. 14
def sync_channels(client, catalog, channel_ids, endpoint_config):

    stream_name = 'channels'
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    id_fields = endpoint_config.get('key_properties', 'id')
    params = endpoint_config.get('params', {})
    params['id'] = channel_ids

    records = get_paginated_data(client=client,
                                 url=DATA_URL,
                                 path=stream_name,
                                 endpoint=stream_name,
                                 params=params,
                                 data_key='items')
    time_extracted = utils.now()

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            for key in id_fields:
                if not record.get(key):
                    raise ValueError('Stream: {}, Missing key: {}'.format(
                        stream_name, key))

            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        transform_data_record(record), schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: %s', err)
                    LOGGER.error('Stream: %s, record: %s', stream_name, record)
                    raise err

                write_record(stream_name,
                             transformed_record,
                             time_extracted=time_extracted)
                counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(
            stream_name, counter.value))
        return counter.value
Example No. 15
def process_records(stream, mdata, max_modified, records, filter_field, fks):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            record_flat = {
                'id': record['id']
            }
            for prop, value in record['attributes'].items():
                if prop == 'id':
                    raise Exception('Error flattening Outreach record - conflict with `id` key')
                record_flat[prop] = value

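            # Flatten each relationship into a foreign-key column named '<relationship>Id'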
            if 'relationships' in record:
                for prop, value in record['relationships'].items():
                    if 'data' not in value and 'links' not in value:
                        raise Exception('Only `data` or `links` expected in relationships')

                    fk_field_name = '{}Id'.format(prop)

                    if 'data' in value and fk_field_name in fks:
                        data_value = value['data']
                        if data_value is not None and 'id' not in data_value:
                            raise Exception('null or `id` field expected for `data` relationship')

                        if fk_field_name in record_flat:
                            raise Exception(
                                '`{}` exists as both an attribute and generated relationship name'.format(fk_field_name))

                        if data_value is None:
                            record_flat[fk_field_name] = None
                        else:
                            record_flat[fk_field_name] = data_value['id']

            if filter_field in record_flat and record_flat[filter_field] > max_modified:
                max_modified = record_flat[filter_field]

            with Transformer() as transformer:
                record_typed = transformer.transform(record_flat,
                                                     schema,
                                                     mdata)
            singer.write_record(stream.tap_stream_id, record_typed)
            counter.increment()
        return max_modified
Example No. 16
def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get("metadata"))
    schema = load_schema("contact_lists")
    bookmark_key = "updatedAt"
    singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key],
                        catalog.get("stream_alias"))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {"count": 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(
                STATE,
                "contact_lists",
                url,
                params,
                "lists",
                "has-more",
                ["offset"],
                ["offset"],
        ):
            record = bumble_bee.transform(row, schema, mdata)

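            # Emit only lists updated since the bookmark, while tracking the maximum bookmark value seen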
            if record[bookmark_key] >= start:
                singer.write_record(
                    "contact_lists",
                    record,
                    catalog.get("stream_alias"),
                    time_extracted=utils.now(),
                )
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, "contact_lists", bookmark_key,
                                  max_bk_value)
    singer.write_state(STATE)

    return STATE
Example No. 17
def process_records(
        catalog,  # pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        max_bookmark_value=None,
        last_datetime=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or transformed_record[
                            bookmark_field] > transform_datetime(
                                max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(
                        transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name,
                                     transformed_record,
                                     time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
Example No. 18
def ORIGINAL_TAP(config):
    shop_url = "https://{k}:{p}@{s}.myshopify.com/admin".format(k=config['api_key'],p=config['api_password'],s=config['shop_name'])
    shopify.ShopifyResource.set_site(shop_url)
    
    start_time = time.time()
    if WRITE_TO_TARGET:
        singer.write_schema(config['stream_id'], config['stream_schema'], config['key_properties'], bookmark_properties=config['replication_key'])
    rec_count = 0
    with Transformer() as transformer:
        for rec in sync(config):
            extraction_time = singer.utils.now()
            record_metadata = metadata.to_map(config['stream_metadata'])
            rec = transformer.transform(rec, config['stream_schema'], record_metadata)
            if WRITE_TO_TARGET:
                singer.write_record(config['stream_id'], rec, time_extracted=extraction_time)
            rec_count += 1
    duration = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))

    return (rec_count, duration)
Example No. 19
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get("metadata"))
    schema = load_schema("deal_pipelines")
    singer.write_schema("deal_pipelines", schema, ["pipelineId"],
                        catalog.get("stream_alias"))
    LOGGER.info("sync_deal_pipelines")
    data = request(get_url("deal_pipelines")).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema, mdata)
            singer.write_record(
                "deal_pipelines",
                record,
                catalog.get("stream_alias"),
                time_extracted=utils.now(),
            )
    singer.write_state(STATE)
    return STATE
Example No. 20
def sync_contacts(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = 'versionTimestamp'
    start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key))
    LOGGER.info("sync_contacts from %s", start)

    max_bk_value = start
    schema = load_schema("contacts")

    singer.write_schema("contacts", schema, ["vid"], [bookmark_key],
                        catalog.get('stream_alias'))

    url = get_url("contacts_all")

    vids = []
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contacts', url, default_contact_params,
                               'contacts', 'has-more', ['vid-offset'],
                               ['vidOffset']):
            modified_time = None
            if bookmark_key in row:
                modified_time = utils.strptime_with_tz(
                    _transform_datetime(  # pylint: disable=protected-access
                        row[bookmark_key],
                        UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

            if not modified_time or modified_time >= start:
                vids.append(row['vid'])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

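            # Sync accumulated contacts in batches of 100 vids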
            if len(vids) == 100:
                _sync_contact_vids(catalog, vids, schema, bumble_bee)
                vids = []

        _sync_contact_vids(catalog, vids, schema, bumble_bee)

    STATE = singer.write_bookmark(STATE, 'contacts', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example No. 21
def sync_stream(state, start_date, instance):
    stream = instance.stream

    bookmark_date = instance.get_bookmark(state, instance.name, start_date,
                                          instance.replication_key)
    bookmark_dttm = strptime_to_utc(bookmark_date)
    new_bookmark = bookmark_dttm

    with metrics.record_counter(stream.tap_stream_id) as counter, Transformer(
            integer_datetime_fmt="unix-milliseconds-integer-datetime-parsing"
    ) as transformer:
        (stream, records) = instance.sync(state)
        for record in records:
            schema_dict = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            transformed_record = instance.transform(record)

            try:
                transformed_record = transformer.transform(
                    transformed_record, schema_dict, stream_metadata)
            except Exception as err:
                LOGGER.error('Error: %s', err)
                LOGGER.error(' for schema: %s',
                             json.dumps(schema_dict, sort_keys=True, indent=2))
                LOGGER.error('Transform failed for %s', record)
                raise err

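            # Track the newest replication-key value and emit only records newer than the previous bookmark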
            record_timestamp = strptime_to_utc(
                transformed_record.get(
                    humps.decamelize(instance.replication_key)))
            new_bookmark = max(new_bookmark, record_timestamp)

            if record_timestamp > bookmark_dttm:
                singer.write_record(stream.tap_stream_id, transformed_record)
                counter.increment()

        instance.update_bookmark(state, instance.name, strftime(new_bookmark),
                                 instance.replication_key)
        singer.write_state(state)

        return counter.value
Example No. 22
def sync(client, config, catalog, state):
    LOGGER.info('Starting Sync..')
    selected_streams = catalog.get_selected_streams(state)

    streams = []
    stream_keys = []
    with Transformer() as transformer:
        for catalog_entry in selected_streams:
            streams.append(catalog_entry)
            stream_keys.append(catalog_entry.stream)

        for catalog_entry in streams:
            stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client,
                                                             config=config,
                                                             catalog=catalog,
                                                             state=state)
            LOGGER.info('Syncing stream: %s', catalog_entry.stream)
            stream.write_state()
            stream_schema = catalog_entry.schema.to_dict()
            stream.write_schema()
            stream_metadata = metadata.to_map(catalog_entry.metadata)
            max_bookmark_value = None

            with singer.metrics.record_counter(
                    endpoint=stream.name) as counter:
                for page in stream.sync(catalog_entry.metadata):
                    for records in page:
                        transformed_records = transform(records)
                        for transformed in transformed_records:
                            singer.write_record(
                                catalog_entry.stream,
                                transformer.transform(
                                    transformed,
                                    stream_schema,
                                    stream_metadata,
                                ))
                            counter.increment()
                    stream.update_bookmark(stream.name, max_bookmark_value)
                    stream.write_state()

        stream.write_state()
        LOGGER.info('Finished Sync..')
Example No. 23
def sync(config, state, catalog):
    client = PeekClient(config['token'])
    partner_id = config['partner_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            start_date = singer.get_bookmark(state, tap_stream_id,
                                             replication_key,
                                             config['start_date'])
            end_date = singer.utils.strftime(
                singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)

            for record in stream_obj.sync(partner_id=partner_id,
                                          start_date=start_date,
                                          end_date=end_date):
                LOGGER.info(f"Writing record: {record}")
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )
            state = singer.clear_bookmark(state, tap_stream_id, 'start_date')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Example No. 24
def sync():
    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"])
            Context.counts[stream["tap_stream_id"]] = 0

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            for rec in stream.sync():
                extraction_time = singer.utils.now()
                record_schema = catalog_entry['schema']
                record_metadata = metadata.to_map(catalog_entry['metadata'])
                rec = transformer.transform(rec, record_schema, record_metadata)
                singer.write_record(stream_id,
                                    rec,
                                    time_extracted=extraction_time)
                Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
Example No. 25
def do_sync(account, catalog, state):
    streams_to_sync = get_streams_to_sync(account, catalog, state)
    refs = load_shared_schema_refs()
    for stream in streams_to_sync:
        LOGGER.info('Syncing %s, fields %s', stream.name, stream.fields())
        schema = singer.resolve_schema_references(load_schema(stream), refs)
        bookmark_key = BOOKMARK_KEYS.get(stream.name)
        singer.write_schema(stream.name, schema, stream.key_properties, bookmark_key, stream.stream_alias)
        with Transformer(pre_hook=transform_date_hook) as transformer:
            with metrics.record_counter(stream.name) as counter:
                for message in stream:
                    if 'record' in message:
                        counter.increment()
                        time_extracted = utils.now()
                        record = transformer.transform(message['record'], schema)
                        singer.write_record(stream.name, record, stream.stream_alias, time_extracted)
                    elif 'state' in message:
                        singer.write_state(message['state'])
                    else:
                        raise TapFacebookException('Unrecognized message {}'.format(message))
Example No. 26
def process_records(catalog,
                    stream_name,
                    records,
                    time_extracted,
                    version=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)
                write_record(stream_name=stream_name,
                             record=transformed_record,
                             time_extracted=time_extracted,
                             version=version)
                counter.increment()
        return counter.value
Example No. 27
    def for_each_estimate(estimate, time_extracted):
        # Sync estimate messages
        sync_endpoint("estimate_messages",
                      endpoint=("estimates/{}/messages".format(
                          estimate['id'])),
                      path="estimate_messages",
                      with_updated_since=False,
                      date_fields=["send_reminder_on"],
                      map_handler=map_estimate_message)

        # Extract all estimate_line_items
        line_items_schema = load_and_write_schema("estimate_line_items")
        with Transformer() as transformer:
            for line_item in estimate['line_items']:
                line_item['estimate_id'] = estimate['id']
                line_item = transformer.transform(line_item, line_items_schema)

                singer.write_record("estimate_line_items",
                                    line_item,
                                    time_extracted=time_extracted)
Example No. 28
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict,
                    stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example No. 29
    def for_each_invoice(invoice, time_extracted):
        def map_invoice_message(message):
            message['invoice_id'] = invoice['id']
            return message

        def map_invoice_payment(payment):
            payment['invoice_id'] = invoice['id']
            payment['payment_gateway_id'] = payment['payment_gateway']['id']
            payment['payment_gateway_name'] = payment['payment_gateway'][
                'name']
            return payment

        # Sync invoice messages
        sync_endpoint("invoice_messages",
                      endpoint=("invoices/{}/messages".format(invoice['id'])),
                      path="invoice_messages",
                      with_updated_since=False,
                      map_handler=map_invoice_message)

        # Sync invoice payments
        sync_endpoint("invoice_payments",
                      endpoint=("invoices/{}/payments".format(invoice['id'])),
                      path="invoice_payments",
                      with_updated_since=False,
                      map_handler=map_invoice_payment,
                      date_fields=["send_reminder_on"])

        # Extract all invoice_line_items
        line_items_schema = load_and_write_schema("invoice_line_items")
        with Transformer() as transformer:
            for line_item in invoice['line_items']:
                line_item['invoice_id'] = invoice['id']
                if line_item['project'] is not None:
                    line_item['project_id'] = line_item['project']['id']
                else:
                    line_item['project_id'] = None
                line_item = transformer.transform(line_item, line_items_schema)

                singer.write_record("invoice_line_items",
                                    line_item,
                                    time_extracted=time_extracted)
Example No. 30
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'properties' : []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get("properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

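            # Emit only deals modified on or after the bookmark start (or with no modified time)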
            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema)
                singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE