Example No. 1
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
            stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
            utils.update_state(STATE, "events", stop_timestamp)
            singer.write_state(STATE)
Example No. 2
def load_and_write_schema(ctx, stream):
    singer.write_schema(
        stream.tap_stream_id,
        load_schema(ctx, stream.tap_stream_id),
        stream.pk_fields,
    )
Example No. 3
def sync(config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)
        schema = stream.schema.to_dict()
        singer.write_schema(
            stream_name=stream.stream,
            schema=schema,
            key_properties=stream.key_properties,
        )

        id_list_ordered = []
        name_list_ordered = []

        if "sale" == stream.tap_stream_id:
            tap_data_types_formulaires = get_sales_data_from_API(config, state, stream.tap_stream_id, "f")
            tap_data_types_ventes = get_sales_data_from_API(config, state, stream.tap_stream_id, "v")
            for row in tap_data_types_formulaires:
                record_dict = {}
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")

                record_dict['types'] = "f"
                for i in range(0, len(value)):
                    record_dict[keys[i+1]] = value[i]

                singer.write_records(stream.tap_stream_id, [record_dict])

            for row in tap_data_types_ventes:
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")
                record_dict = {}

                record_dict['types'] = "v"

                for i in range(0, len(value)):
                    record_dict[keys[i+1]] = value[i]

                singer.write_records(stream.tap_stream_id, [record_dict])

        elif stream.tap_stream_id in ["stats_by_campain", "stats_by_site"]:
            if stream.tap_stream_id == "stats_by_campain":
                select_data_by_campain_or_site = "campain"
            else:
                select_data_by_campain_or_site = "site"

            tap_data = get_stats_data_from_API(config, state, stream.tap_stream_id)

            for row in tap_data:
                value = row.split(";")
                id_value = value[0]
                name_value = value[1]
                if id_value not in id_list_ordered and name_value not in name_list_ordered:
                    id_list_ordered.append(id_value)
                    name_list_ordered.append(name_value)

            for id, name in zip(id_list_ordered, name_list_ordered):
                tap_data = get_stats_data_from_API_by_id(config, state, stream.tap_stream_id, id,
                                                         select_data_by_campain_or_site)
                for row in tap_data:
                    row = html.unescape(row)
                    keys = list(stream.schema.properties.keys())
                    value = row.split(";")

                    record_dict = {}
                    if stream.tap_stream_id == "stats_by_site":
                        record_dict['idsite'] = id
                        record_dict['nomsite'] = name
                    else:
                        record_dict['idcamp'] = id
                        record_dict['nomcamp'] = name

                    for j in range(0, len(value)):
                        if j == 0:  # init date and skip id and nom
                            last_date = value[0][0:4] + "-" + value[0][4:6] + "-" + value[0][6:8] + " 13:37:42 UTC"
                            record_dict[keys[0]] = last_date
                        else:
                            record_dict[keys[j + 2]] = value[j]

                    singer.write_records(stream.tap_stream_id, [record_dict])

        else:
            tap_data = get_stats_data_from_API(config, state, stream.tap_stream_id)
            for row in tap_data:
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")
                record_dict = {}
                for i in range(0, len(value)):
                    record_dict[keys[i]] = value[i]

                    if "stats_by_day" in stream.tap_stream_id:
                        last_date = value[0][0:4] + "-" + value[0][4:6] + "-" + value[0][6:8] + " 13:37:42 UTC"
                        record_dict[keys[0]] = last_date

                    elif "stats_by_month" in stream.tap_stream_id:
                        last_date = value[0][0:4] + "-" + value[0][4:6] + "-01 13:37:42 UTC"
                        record_dict[keys[0]] = last_date

                singer.write_records(stream.tap_stream_id, [record_dict])

        bookmark_state(stream.tap_stream_id, state)
    return
Example No. 4
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = GAClient(config)

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (),
                                      "table-key-properties")

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            try:
                report_definition = ReportsHelper.get_report_definition(stream)
                results = client.process_stream(report_definition)

                # we write the schema message after we are sure that we could
                #  fetch records without errors
                singer.write_schema(stream_id, stream_schema, key_properties)
                singer.write_records(stream_id, results)
            except TapGaInvalidArgumentError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to invalid report definition.".
                    format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaRateLimitError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to Rate Limit Errors.".format(
                        stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaQuotaExceededError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to Quota Exceeded Errors.".
                    format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaAuthenticationError as e:
                LOGGER.error(
                    "Stopping execution while processing '{}' due to Authentication Errors."
                    .format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
            except TapGaUnknownError as e:
                LOGGER.error(
                    "Stopping execution while processing '{}' due to Unknown Errors."
                    .format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
Example No. 5
            'type': 'string',
            "format": "date"
        },
        'countrycode': {
            'type': 'string',
            'pattern': "^[A-Z]{2}$"
        },
        'store_name': {
            'type': 'string'
        }
    },
    '$schema': 'http://json-schema.org/draft-07/schema#'
}

# Write the schema to stdout.
singer.write_schema(stream_name='products', schema=schema, key_properties=[])


# Return the set of items scraped from a specific store as a list
def retrieve_store_items(store_name, items_endpoint=items_template):
    return requests.get(f"{items_endpoint}{store_name}").json()["items"]


def main():
    for shop in requests.get(shops_template).json()["shops"]:
        singer.write_records(
            stream_name='products',
            # Add the name of the store to every record.
            records=({
                'store_name': shop,
                **item
            } for item in retrieve_store_items(shop))
        )
Example No. 6
def sync_modified_rows(STATE,
                       catalog,
                       schema_name="orders",
                       key_properties=["order_id"]):
    schema = load_schema(schema_name, CONFIG["schema_dir"])
    singer.write_schema(schema_name, schema, key_properties)

    start = get_start(STATE, schema_name, "last_update")
    last_update = start
    offset = 0

    start_at_time = parser.parse(start)
    if start_at_time.tzinfo is None:
        start_at_time = start_at_time.replace(tzinfo=pytz.utc)

    tz_offset = parser.parse("1970-01-02T00:00:00+00:00").replace(
        tzinfo=pytz.utc) - parser.parse(
            "1970-01-02T00:00:00" +
            CONFIG["timezone_offset"]) + datetime.timedelta(
                seconds=CONFIG["relative_time_safety_margin"])

    id_set = set()

    start_process_at = datetime.datetime.now()
    LOGGER.info("Starting %s Sync at %s" %
                (schema_name, str(start_process_at)))
    LOGGER.info("Only syncing %s updated since %s" % (schema_name, start))

    while True:
        # We need to update the relative time inside the loop as the time is moving
        utc_now = datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        datediff = utc_now - start_at_time + tz_offset

        LOGGER.info("Offset: %d" % offset)
        # First get the list of IDs
        params = {
            "resource": schema_name,
            "days": datediff.days,
            "hours": datediff.seconds / 3600,
            "offset": offset,
            "items_per_page": CONFIG["items_per_page"]
        }
        endpoint = get_endpoint("modified_items", params)
        LOGGER.info("GET %s", endpoint)
        rows = gen_modified_items_request(schema_name, endpoint)
        for row in rows[schema_name]:
            # last_updated is a unix timestamp
            current_timestamp = None
            if row["last_updated"]:
                current_timestamp = datetime.datetime.utcfromtimestamp(
                    int(row["last_updated"])).replace(tzinfo=pytz.utc)
            end_at = None
            if CONFIG.get("end_at") is not None:
                end_at = parser.parse(CONFIG["end_at"])
                if end_at.tzinfo is None:
                    end_at = end_at.replace(tzinfo=pytz.utc)
            if end_at is None or row["last_updated"] is None or (
                    current_timestamp and current_timestamp < end_at):
                id_set.add(row["id"])

        if len(rows[schema_name]) < CONFIG["items_per_page"]:
            LOGGER.info("End of records %d" % len(rows[schema_name]))
            break
        else:
            offset = offset + CONFIG["items_per_page"]

    LOGGER.info("Found %d records" % len(id_set))
    ids = list(id_set)
    with metrics.record_counter(schema_name) as counter:
        current_idx = 0
        while current_idx < len(ids):
            params = {
                "resource":
                schema_name,
                "ids":
                ",".join(ids[
                    current_idx:min(len(ids), current_idx +
                                    INCREMENTAL_ITEMS_PER_PAGE)])
            }
            endpoint = get_endpoint(schema_name + "_by_id", params)
            LOGGER.info("GET %s", endpoint)
            rows = gen_request(schema_name, endpoint)
            if len(rows) < len(ids[current_idx:min(
                    len(ids), current_idx + INCREMENTAL_ITEMS_PER_PAGE)]):
                LOGGER.warning(
                    "Number of items returned from WC API is lower than the ID list size"
                )
            for row in rows:
                counter.increment()
                row = filter_result(row, schema)
                if "_etl_tstamp" in schema["properties"].keys():
                    row["_etl_tstamp"] = time.time()
                singer.write_record(schema_name, row)
            current_idx = current_idx + INCREMENTAL_ITEMS_PER_PAGE

    STATE = singer.write_bookmark(STATE, schema_name, 'last_update',
                                  last_update)
    singer.write_state(STATE)
    end_process_at = datetime.datetime.now()
    LOGGER.info("Completed %s Sync at %s" % (schema_name, str(end_process_at)))
    LOGGER.info("Process duration: " + str(end_process_at - start_process_at))

    return STATE
Example No. 7
def sync_project(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
    is_selected=False,
    selected_sub_stream=None,
):
    if selected_sub_stream is None:  # avoid a shared mutable default argument
        selected_sub_stream = []
    schema = load_schema(schema_name)
    if is_selected:
        bookmark_property = 'updated_at'
        LOGGER.info('Loading ' + schema_name)

        singer.write_schema(schema_name,
                            schema, ['id'],
                            bookmark_properties=[bookmark_property])

        start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url

        response = request(url, None)

        for row in response:

            item = transformer.transform(row, schema)

            time_extracted = utils.now()

            # find related
            if 'expense_items' in selected_sub_stream:
                sync_endpoint(
                    'expense_items', BASE_API_URL + 'projects/' +
                    str(row['id']) + '/expense_items', None, 'project_id',
                    str(row['id']))
            if 'invoices' in selected_sub_stream:
                sync_endpoint(
                    'invoices',
                    BASE_API_URL + 'projects/' + str(row['id']) + '/invoices',
                    None, 'project_id', str(row['id']))

            if 'milestones' in selected_sub_stream:
                sync_endpoint(
                    'milestones', BASE_API_URL + 'projects/' + str(row['id']) +
                    '/milestones', None, 'project_id', str(row['id']))
            if 'project_team' in selected_sub_stream:
                sync_endpoint(
                    'project_team',
                    BASE_API_URL + 'projects/' + str(row['id']) + '/team',
                    None,
                    'project_id',
                    str(row['id']),
                    ['person_id', 'project_id'],
                )
            if 'sprints' in selected_sub_stream:
                sync_endpoint(
                    'sprints',
                    BASE_API_URL + 'projects/' + str(row['id']) + '/sprints',
                    None, 'project_id', str(row['id']))
            if 'workflow_columns' in selected_sub_stream:
                sync_endpoint(
                    'workflow_columns', BASE_API_URL + 'projects/' +
                    str(row['id']) + '/workflow_columns', None, 'project_id',
                    str(row['id']))
            if 'project_financials' in selected_sub_stream:
                sync_endpoint(
                    'project_financials',
                    BASE_API_URL + 'projects/' + str(row['id']) +
                    '/financials',
                    None,
                    None,
                    None,
                    ['project_id'],
                )

            if is_selected and (bookmark_property in item and item[bookmark_property] \
                >= start):
                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example No. 8
def sync_endpoint(schema_name,
                  endpoint=None,
                  path=None,
                  special_field_name=None,
                  special_field_value=None,
                  keys=None,
                  object_to_id=None,
                  parameter_for_updated=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S.%fZ')
    updated_since = start_dt.strftime("%Y%m%dT%H%M%S")
    LOGGER.info('updated_since ' + updated_since)
    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        if parameter_for_updated is not None:
            url = url + '?' + parameter_for_updated + '=' + updated_since
        response = request(url, None)
        LOGGER.info('URL: ' + url)
        if schema_name == 'project_financials':
            response = [response]

        time_extracted = utils.now()

        for row in response:

            if special_field_name is not None:
                row[special_field_name] = special_field_value

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if not bookmark_property in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00.00Z'

            if datetime.datetime.strptime(item[bookmark_property],
                                          '%Y-%m-%dT%H:%M:%S.%fZ') >= start_dt:
                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example No. 9
def write_schema(stream):
    schema = stream.schema.to_dict()
    singer.write_schema(stream.tap_stream_id, schema, stream.key_properties)
Example No. 10
    def sync(self):
        key_properties = self.catalog.get('key_properties')
        table = self.TABLE

        singer.write_schema(self.catalog.get('stream'),
                            self.catalog.get('schema'),
                            key_properties=key_properties)

        field_selector = get_field_selector(self.catalog,
                                            self.catalog.get('schema'))

        includeGeoIpData = self.any_selected([
            'geoIPCity', 'geoIPStateRegion', 'geoIPZip', 'geoIPCountry',
            'geoIPCountryCode'
        ])

        includeTechnologyData = self.any_selected([
            'primaryBrowser', 'mobileBrowser', 'primaryEmailClient',
            'mobileEmailClient', 'operatingSystem'
        ])

        includeRFMData = self.any_selected([
            'firstOrderDate', 'lastOrderDate', 'lastOrderTotal',
            'totalOrders', 'totalRevenue', 'averageOrderValue'
        ])

        includeEngagementData = self.any_selected(
            ['lastDeliveryDate', 'lastOpenDate', 'lastClickDate'])

        if includeGeoIpData:
            LOGGER.info('Including GEOIP data.')

        if includeTechnologyData:
            LOGGER.info('Including technology data.')

        if includeRFMData:
            LOGGER.info('Including RFM data.')

        if includeEngagementData:
            LOGGER.info('Including engagement data.')

        LOGGER.info('Syncing contacts.')

        start = self.get_start_date(table)
        end = start
        interval = timedelta(hours=6)

        def flatten(item):
            read_only_data = item.get('readOnlyContactData', {}) or {}
            item.pop('readOnlyContactData', None)
            return dict(item, **read_only_data)

        while end < datetime.now(pytz.utc):
            start = end
            end = start + interval
            LOGGER.info("Fetching contacts modified from {} to {}".format(
                start, end))

            _filter = self.make_filter(start, end)

            pageNumber = 1
            hasMore = True
            retry_count = 0
            while hasMore:
                try:
                    results = self.client.service.readContacts(
                        filter=_filter,
                        includeLists=True,
                        fields=[],
                        pageNumber=pageNumber,
                        includeSMSKeywords=True,
                        includeGeoIPData=includeGeoIpData,
                        includeTechnologyData=includeTechnologyData,
                        includeRFMData=includeRFMData,
                        includeEngagementData=includeEngagementData)

                except socket.timeout:
                    retry_count += 1
                    if retry_count >= 5:
                        LOGGER.error(
                            "Retried more than five times, moving on!")
                        raise
                    LOGGER.warn("Timeout caught, retrying request")
                    continue
                except Fault as e:
                    if '103' in e.message:
                        LOGGER.warn(
                            "Got signed out - logging in again and retrying")
                        self.login()
                        continue
                    else:
                        raise

                LOGGER.info("... {} results".format(len(results)))
                extraction_time = singer.utils.now()
                for result in results:
                    result_dict = zeep.helpers.serialize_object(
                        result, target_cls=dict)
                    flattened = flatten(result_dict)
                    singer.write_record(table,
                                        field_selector(flattened),
                                        time_extracted=extraction_time)

                if len(results) == 0:
                    hasMore = False

                pageNumber = pageNumber + 1

            self.state = incorporate(self.state, table, self.REPLICATION_KEY,
                                     start.replace(microsecond=0).isoformat())

            save_state(self.state)

        LOGGER.info("Done syncing contacts.")
Example No. 11
def sync_invoices():
    messages_schema = load_schema("invoice_messages")
    bookmark_property = 'updated_at'
    singer.write_schema("invoice_messages",
                        messages_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    payments_schema = load_schema("invoice_payments")
    singer.write_schema("invoice_payments",
                        payments_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    schema = load_schema("invoices")
    singer.write_schema("invoices",
                        schema, ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start("invoices")

    start_dt = pendulum.parse(start)
    updated_since = start_dt.strftime("%Y-%m-%d %H:%M")

    url = get_url("invoices")
    with Transformer() as transformer:
        while True:
            data = request(url, {"updated_since": updated_since})
            invoices_time_extracted = utils.now()

            for row in data:
                item = row["invoices"]
                item = transformer.transform(item, schema)
                append_times_to_dates(item, ["issued_at", "due_at"])

                singer.write_record("invoices",
                                    item,
                                    time_extracted=invoices_time_extracted)

                utils.update_state(STATE, "invoices", item['updated_at'])

                suburl = url + "/{}/messages".format(item['id'])
                messages_data = request(suburl)
                messages_time_extracted = utils.now()
                for subrow in messages_data:
                    subitem = subrow["message"]
                    if subitem['updated_at'] >= start:
                        append_times_to_dates(subitem, ["send_reminder_on"])
                        singer.write_record(
                            "invoice_messages",
                            subitem,
                            time_extracted=messages_time_extracted)

                suburl = url + "/{}/payments".format(item['id'])
                payments_data = request(suburl)
                payments_time_extracted = utils.now()

                for subrow in payments_data:
                    subitem = subrow["payment"]
                    subitem = transformer.transform(subitem, payments_schema)
                    if subitem['updated_at'] >= start:
                        singer.write_record(
                            "invoice_payments",
                            subitem,
                            time_extracted=payments_time_extracted)

                singer.write_state(STATE)

            if len(data) < 50:
                break

        singer.write_state(STATE)
Example No. 12
def sync_projects():
    bookmark_property = 'updated_at'
    tasks_schema = load_schema("project_tasks")
    singer.write_schema("project_tasks",
                        tasks_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    users_schema = load_schema("project_users")
    singer.write_schema("project_users",
                        users_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    entries_schema = load_schema("time_entries")
    singer.write_schema("time_entries",
                        entries_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    schema = load_schema("projects")
    singer.write_schema("projects",
                        schema, ["id"],
                        bookmark_properties=[bookmark_property])
    start = get_start("projects")

    start_dt = pendulum.parse(start)
    updated_since = start_dt.strftime("%Y-%m-%d %H:%M")

    url = get_url("projects")
    projects_data = request(url)
    projects_time_extracted = utils.now()

    with Transformer() as transformer:
        for row in projects_data:
            item = row["project"]
            item = transformer.transform(item, schema)
            date_fields = [
                "starts_on", "ends_on", "hint_earliest_record_at",
                "hint_latest_record_at"
            ]

            append_times_to_dates(item, date_fields)

            if item[bookmark_property] >= start:
                singer.write_record("projects",
                                    item,
                                    time_extracted=projects_time_extracted)

                utils.update_state(STATE, "projects", item[bookmark_property])

            suburl = url + "/{}/user_assignments".format(item["id"])
            project_users_data = request(
                suburl, params={"updated_since": updated_since})
            project_users_time_extracted = utils.now()

            for subrow in project_users_data:
                subitem = subrow["user_assignment"]
                subitem = transformer.transform(subitem, users_schema)
                singer.write_record(
                    "project_users",
                    subitem,
                    time_extracted=project_users_time_extracted)

            suburl = url + "/{}/task_assignments".format(item["id"])
            task_assignments_data = request(
                suburl, params={"updated_since": updated_since})
            task_assignments_time_extracted = utils.now()

            for subrow in task_assignments_data:
                subitem = subrow["task_assignment"]
                subitem = transformer.transform(subitem, tasks_schema)
                singer.write_record(
                    "project_tasks",
                    subitem,
                    time_extracted=task_assignments_time_extracted)

            suburl = url + "/{}/entries".format(item["id"])
            subparams = {
                "from": start_dt.strftime("%Y%m%d"),
                "to": datetime.datetime.utcnow().strftime("%Y%m%d"),
                "updated_since": updated_since,
            }

            time_entries_data = request(suburl, params=subparams)
            time_entries_time_extracted = utils.now()

            for subrow in time_entries_data:
                subitem = subrow["day_entry"]
                subitem = transformer.transform(subitem, entries_schema)
                singer.write_record("time_entries",
                                    subitem,
                                    time_extracted=time_entries_time_extracted)

    singer.write_state(STATE)
Example No. 13
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions",
                        schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(
        STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))

    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(
        STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP))

    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

    period_start = latest_start_date - TRAILING_DAYS

    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))

    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}".
        format(latest_updated_at, latest_disbursement_date))

    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):

        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found

            if row.disbursement_details is None:
                disbursement_date = datetime.min

            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(
                        row.disbursement_details.disbursement_date,
                        datetime.min.time()))

            # Is this more recent than our past stored value of update_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions disbursed
            # at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (disbursement_date >=
                                                     latest_disbursement_date):

                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions",
                                    transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1

            else:

                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))

        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))

    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = min(run_maximum_updated_at, period_end)

    latest_disbursement_date = min(run_maximum_disbursement_date, period_end)

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)

    STATE['latest_disbursement_date'] = utils.strftime(
        latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))

    singer.write_state(STATE)
Example No. 14
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))

    if "schema" in catalog:
        schema = catalog["schema"]
    else:
        schema = load_schema('engagements')

    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(
                lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
Example No. 15
    def write_schema(self):
        singer.write_schema(self.schema,
                            self.get_schema(),
                            key_properties=self.key_properties)
Example No. 16
    def sync_substream(self, state, parent, sub_stream, parent_response):
        bookmark_date = self.get_bookmark(state, sub_stream.name,
                                          self.config.get('start_date'),
                                          sub_stream.replication_key)
        # If last sync was interrupted, get last processed parent record
        last_processed = self.get_bookmark(state,
                                           sub_stream.name,
                                           None,
                                           key="last_processed")
        bookmark_dttm = strptime_to_utc(bookmark_date)
        new_bookmark = bookmark_dttm

        singer.write_schema(sub_stream.name,
                            sub_stream.stream.schema.to_dict(),
                            sub_stream.key_properties)

        # Slice response for >= last processed
        if last_processed:
            for i, e in enumerate(parent_response):
                if e.get(parent.key_properties[0]) == last_processed:
                    LOGGER.info("Resuming %s sync with %s", sub_stream.name,
                                e.get(parent.key_properties[0]))
                    parent_response = parent_response[i:len(parent_response)]
                    continue

        for record in parent_response:
            try:
                with metrics.record_counter(
                        sub_stream.name) as counter, Transformer(
                            integer_datetime_fmt=
                            "unix-milliseconds-integer-datetime-parsing"
                        ) as transformer:
                    stream_events = sub_stream.sync(
                        state, new_bookmark,
                        record.get(parent.key_properties[0]))
                    for event in stream_events:
                        counter.increment()

                        schema_dict = sub_stream.stream.schema.to_dict()
                        stream_metadata = metadata.to_map(
                            sub_stream.stream.metadata)

                        transformed_event = humps.decamelize(event)

                        try:
                            transformed_record = transformer.transform(
                                transformed_event, schema_dict,
                                stream_metadata)
                        except Exception as err:
                            LOGGER.error('Error: %s', err)
                            LOGGER.error(
                                ' for schema: %s',
                                json.dumps(schema_dict,
                                           sort_keys=True,
                                           indent=2))
                            raise err

                        event_time = strptime_to_utc(
                            transformed_record.get(sub_stream.replication_key))

                        new_bookmark = max(new_bookmark, event_time)
                        singer.write_record(sub_stream.stream.tap_stream_id,
                                            transformed_record)

            except HTTPError:
                LOGGER.warning(
                    "Unable to retrieve %s Event for Stream (ID: %s)",
                    sub_stream.name, record[parent.key_properties[0]])

            # All events for all parents processed; we can remove last_processed
            self.update_bookmark(state=state,
                                 stream=sub_stream.name,
                                 bookmark_value=record.get(
                                     parent.key_properties[0]),
                                 bookmark_key="last_processed")
            self.update_bookmark(state=state,
                                 stream=sub_stream.name,
                                 bookmark_value=strftime(new_bookmark),
                                 bookmark_key=sub_stream.replication_key)
        # After processing for all parent ids we can remove our resumption state
        state.get('bookmarks').get(sub_stream.name).pop('last_processed')
        update_currently_syncing(state, None)
Example No. 17
    def write_schema(self):
        singer.write_schema(self.catalog.stream,
                            self.catalog.schema.to_dict(),
                            key_properties=self.stream_metadata.get(
                                'table-key-properties', []))
Example No. 18
def load_and_write_schema(tap_stream_id, catalog):
    stream = get_stream_from_catalog(tap_stream_id, catalog)
    singer.write_schema(tap_stream_id, stream['schema'],
                        PK_FIELDS[tap_stream_id])
Example No. 19
def sync_allocations(
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    keys=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    weekDays = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]
    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url

        response = request(url, None)

        time_extracted = utils.now()

        for row in response:

            # expand each allocation row into one record per day
            date = datetime.datetime.strptime(row['start_date'], '%Y-%m-%d')
            LOGGER.info("Project " + str(row['project']) + " - " +
                        str(row['person']))
            end_date = datetime.datetime.strptime(row['end_date'], '%Y-%m-%d')

            newRow = {}
            #LOGGER.info("ID:"  + str(row['id']))
            #LOGGER.info("Date :  "  + date.strftime('%Y%m%d'))

            while date <= end_date:
                #LOGGER.info('Date :  ' + str(date.weekday()) + 'weekday'
                #             + weekDays[date.weekday()])
                #LOGGER.info(row['project'])
                #LOGGER.info(row[weekDays[date.weekday()]])
                #LOGGER.info(str(date.strftime('%Y-%m-%d')))
                #if row['id'] = 72051:
                #    LOGGER.info(row['project'])
                #    LOGGER.info(row['person'])
                #    LOGGER.info(str(date.strftime('%Y-%m-%d')))
                #    LOGGER.info(str(end_date.strftime('%Y-%m-%d')))

                newRow['allocation'] = row[weekDays[date.weekday()]]
                if not newRow['allocation'] > 0:
                    date = date + timedelta(days=1)
                    continue
                newRow['project'] = row['project']
                newRow['non_project_time'] = row['non_project_time']
                newRow['connected_project'] = row['connected_project']
                newRow['person'] = row['person']
                newRow['project'] = row['project']
                newRow['date'] = date.strftime('%Y-%m-%d')
                newRow['notes'] = row['notes']
                newRow['created_by'] = row['created_by']
                newRow['updated_by'] = row['updated_by']
                newRow['created_at'] = row['created_at']
                newRow['updated_at'] = row['updated_at']
                newRow['id'] = str(row['id']) \
                    + str(date.strftime('%Y%m%d'))

                date = date + timedelta(days=1)

                item = transformer.transform(newRow, schema)

                if not bookmark_property in item:
                    item[bookmark_property] = \
                        datetime.datetime.now().strftime('%Y-%m-%d') \
                        + 'T00:00:00Z'

                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                # take any additional actions required for the currently loaded endpoint

                utils.update_state(STATE, schema_name,
                                   item[bookmark_property])
        singer.write_state(STATE)
Example No. 20
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx,
                                                      record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
Example No. 21
def sync_rate_cards(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name,
                        schema, ['id'],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)

        time_extracted = utils.now()

        for row in response:
            if map_handler is not None:
                row = map_handler(row)

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)
            if not bookmark_property in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00Z'

            # find expenses

            sync_endpoint(
                'rate_cards_rates',
                BASE_API_URL + 'rate_cards/' + str(row['id']) + '/rates',
                None,
                'rate_card_id',
                str(row['id']),
                ['rate_card_id', 'role'],
            )

            singer.write_record(schema_name,
                                item,
                                time_extracted=time_extracted)

            # take any additional actions required for the currently loaded endpoint

            utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example No. 22
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if (assoc_mdata.get('selected')
                    and assoc_mdata.get('selected') == True):
                params['includeAssociations'] = True

    if mdata.get(('properties', 'properties'),
                 {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example No. 23
    def write_schema(self):
        # for /recents/ streams, override the default (schema name equals the endpoint) with items
        singer.write_schema(self.schema,
                            self.get_schema(),
                            key_properties=self.key_properties)
Example No. 24
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(
                    UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(
                STATE, entity_name, 'startTimestamp',
                utils.strftime(
                    datetime.datetime.fromtimestamp(start_ts / 1000,
                                                    datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
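The core of the example above is a two-level pagination pattern: an outer loop that walks fixed-size time windows from the bookmark up to now, and an inner loop that follows offset-based pages within each window. A stripped-down sketch of that pattern, with a hypothetical fetch_page(start_ts, end_ts, offset) helper standing in for request(url, params).json():

WINDOW_MS = 1000 * 60 * 60 * 24  # one-day windows, purely illustrative


def fetch_page(start_ts, end_ts, offset):
    # hypothetical stand-in for the HTTP request in the example above
    return {'results': [], 'hasMore': False, 'offset': None}


def sync_windows(start_ts, now_ts):
    while start_ts < now_ts:                      # outer loop: time windows
        end_ts = start_ts + WINDOW_MS
        offset = None
        while True:                               # inner loop: offset paging
            data = fetch_page(start_ts, end_ts, offset)
            for row in data['results']:
                pass                              # transform and write each record here
            if data.get('hasMore'):
                offset = data['offset']
            else:
                break
        start_ts = end_ts                         # advance window; bookmark its start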
Exemplo n.º 25
0
def output_schema(stream):
    schema = schemas.load_schema(stream.tap_stream_id)
    pk_fields = schemas.PK_FIELDS[stream.tap_stream_id]
    singer.write_schema(stream.tap_stream_id, schema, pk_fields)
Exemplo n.º 26
0
def write_schema(self):
    singer.write_schema(self.catalog.stream,
                        self.catalog.schema.to_dict(),
                        key_properties=self.KEY_PROPERTIES,
                        bookmark_properties=self.BOOKMARK_PROPERTIES)
Exemplo n.º 27
0
def do_sync(config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog
    # pickup_year is the most recent year value in the STATE file
    now = datetime.datetime.now()

    for stream in catalog.get_selected_streams(state):

        LOGGER.info("Syncing stream:" + stream.tap_stream_id)
        bookmark_column = stream.replication_key
        is_sorted = False  # TODO: indicate whether data is sorted ascending on bookmark value

        if "startyear" in config.keys():
            stream_start_year = config['startyear']
        else:
            stream_start_year = "2000"

        if ("endyear" in config.keys()) and (len(config['endyear']) > 3):
            try:
                stream_end_year = config['endyear'] if int(
                    config['endyear']) <= now.year else now.year
            except (TypeError, ValueError):
                stream_end_year = now.year
        else:
            stream_end_year = now.year

        if "calculations" in config.keys():
            stream_calculations = config['calculations']
        else:
            stream_calculations = "False"

        if "annualaverage" in config.keys():
            stream_annualaverage = config['annualaverage']
        else:
            stream_annualaverage = "False"

        if "aspects" in config.keys():
            stream_aspects = config['aspects']
        else:
            stream_aspects = "False"

        # check if the STATE.json requests a more recent start date
        if "bookmarks" in state.keys(
        ):  # check the state even has bookmarks...
            if stream.stream in state["bookmarks"].keys(
            ):  # if this stream as an entry in the state.json file...
                try:
                    pickup_year = int(
                        state["bookmarks"][stream.stream]['year'])
                except:
                    start_year = False
                    year_reset = "There was an error with the year format \"" + state[
                        stream.
                        stream] + "\" in the State.json file for stream " + str(
                            stream.stream) + " - pickin up at year " + str(
                                stream_start_year) + "."
                    LOGGER.info(year_reset)
                else:
                    start_year = int(stream_start_year)
                    if (start_year < pickup_year and pickup_year <= now.year):
                        stream_start_year = str(pickup_year)
                        year_reset = "As per state, overriding start year for stream " + str(
                            stream.stream) + " to " + stream_start_year
                        LOGGER.info(year_reset)

        # make the call
        the_call = {
            "seriesid": [stream.tap_stream_id],
            "startyear": stream_start_year,
            "endyear": stream_end_year,
            "calculations": stream_calculations,
            "annualaverage": stream_annualaverage,
            "aspects": stream_aspects
        }

        if 'api-key' in config.keys():
            the_call["registrationkey"] = config['api-key']

        json_data = call_api(the_call)

        if not json_data:
            return

        raw_schema = stream.schema.to_dict()

        # the first period value gives the series frequency: 'A' annual, 'S' semi-annual, 'Q' quarterly, 'M' monthly
        series_frequency = json_data['Results']['series'][0]['data'][0]['period'][0]

        if series_frequency == "A":  # series is annual
            raw_schema['properties']['year'] = {"type": ["null", "integer"]}
        if series_frequency == "S":  # series is semi-annual
            raw_schema['properties']['period'] = {"type": ["null", "integer"]}
        if series_frequency == "Q":  # series is quarterly
            raw_schema['properties']['quarter'] = {"type": ["null", "integer"]}
            raw_schema['properties']['year'] = {"type": ["null", "integer"]}
        if series_frequency == "M":  # series is monthly
            raw_schema['properties']['month'] = {"type": ["null", "integer"]}

        if ("calculations"
                in config.keys()) and (config['calculations'].lower()
                                       == "true"):
            raw_schema['properties']['net_change_1'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['net_change_3'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['net_change_6'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['net_change_12'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['pct_change_1'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['pct_change_3'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['pct_change_6'] = {
                "type": ["null", "number"]
            }
            raw_schema['properties']['pct_change_12'] = {
                "type": ["null", "number"]
            }

        if ("aspects" in config.keys()) and (config['aspects'].lower()
                                             == "true"):
            raw_schema['properties']['aspects'] = {"type": ["null", "string"]}

        if ("annualaverage"
                in config.keys()) and (config['annualaverage'].lower()
                                       == "true"):
            raw_schema['properties']['annualaverage'] = {
                "type": ["null", "number"]
            }

        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=raw_schema,  # built from stream.schema.to_dict() and extended above
            key_properties=stream.key_properties,
        )

        max_bookmark = 0
        max_year = 0
        utc = pytz.timezone('UTC')
        thetime = utc.localize(datetime.datetime.now())
        thetimeformatted = thetime.astimezone().isoformat()

        for series in json_data['Results']['series']:
            seriesId = series['seriesID']
            time_extracted = utc.localize(
                datetime.datetime.now()).astimezone().isoformat()

            for item in series['data']:
                year = item['year']
                if max_year < int(year):
                    max_year = int(year)
                period = item['period']
                if period[0] == 'M':
                    month = int(period[1] + period[2])
                    quarter = round((int(period[1] + period[2]) / 3) + 0.3)
                elif period[0] == 'Q':
                    month = 0
                    quarter = period[2]
                elif period[0] == 'S':
                    month = 0
                    quarter = 0
                    period = period[2]
                elif period[0] == 'A':
                    month = 0
                    quarter = 0
                else:
                    month = ""
                    quarter = ""
                value = item['value']

                # if series_frequency == "A":
                #    next_row['year'] = item['something']
                # if series_frequency == "Q":
                #    next_row['quarter'] = item['something']
                #    next_row['year'] = item['something']
                # if series_frequency == "M":
                #    next_row['month'] = item['something']

                full_period = str(year) + "-" + str(
                    "{0:0=2d}".format(month)) + "-01T00:00:00-04:00"
                footnotes = ""
                for footnote in item['footnotes']:
                    if footnote:
                        footnotes = footnotes + footnote['text'] + ','

                next_row = {
                    "type": "RECORD",
                    "stream": seriesId,
                    "time_extracted": time_extracted,
                    "schema": seriesId,
                    "frequency": series_frequency,
                    "record": {
                        "SeriesID": seriesId,
                        "year": year,
                        "period": period,
                        "value": value,
                        "footnotes": footnotes[0:-1],
                        "month": str(month),
                        "quarter": str(quarter),
                        "time_extracted": time_extracted,
                        "full_period": full_period
                    }
                }

                if ("calculations"
                        in config.keys()) and (config['calculations'].lower()
                                               == "true"):
                    if ("calculations" in item.keys()):
                        if ("net_changes" in item["calculations"].keys()):
                            next_row['net_change_1'] = float(
                                item['calculations']['net_changes']
                                ['1']) if '1' in item['calculations'][
                                    'net_changes'].keys() else None
                            next_row['net_change_3'] = float(
                                item['calculations']['net_changes']
                                ['3']) if '3' in item['calculations'][
                                    'net_changes'].keys() else None
                            next_row['net_change_6'] = float(
                                item['calculations']['net_changes']
                                ['6']) if '6' in item['calculations'][
                                    'net_changes'].keys() else None
                            next_row['net_change_12'] = float(
                                item['calculations']['net_changes']
                                ['12']) if '12' in item['calculations'][
                                    'net_changes'].keys() else None
                        else:
                            next_row['net_change_1'] = next_row[
                                'net_change_3'] = next_row[
                                    'net_change_6'] = next_row[
                                        'net_change_12'] = None

                        if ("net_changes" in item["calculations"].keys()):
                            next_row['pct_change_1'] = float(
                                item['calculations']['pct_changes']
                                ['1']) if '1' in item['calculations'][
                                    'pct_changes'].keys() else None
                            next_row['pct_change_3'] = float(
                                item['calculations']['pct_changes']
                                ['3']) if '3' in item['calculations'][
                                    'pct_changes'].keys() else None
                            next_row['pct_change_6'] = float(
                                item['calculations']['pct_changes']
                                ['6']) if '6' in item['calculations'][
                                    'pct_changes'].keys() else None
                            next_row['pct_change_12'] = float(
                                item['calculations']['pct_changes']
                                ['12']) if '12' in item['calculations'][
                                    'pct_changes'].keys() else None
                        else:
                            next_row['pct_change_1'] = next_row[
                                'pct_change_3'] = next_row[
                                    'pct_change_6'] = next_row[
                                        'pct_change_12'] = None
                    else:
                        next_row['net_change_1'] = next_row[
                            'net_change_3'] = next_row[
                                'net_change_6'] = next_row[
                                    'net_change_12'] = next_row[
                                        'pct_change_1'] = next_row[
                                            'pct_change_3'] = next_row[
                                                'pct_change_6'] = next_row[
                                                    'pct_change_12'] = None

                if ("aspects" in config.keys()) and (config['aspects'].lower()
                                                     == "true"):
                    next_row['aspects'] = str(item['aspects'])

                if ("annualaverage"
                        in config.keys()) and (config['annualaverage'].lower()
                                               == "true"):
                    if period == 'M13' or period == 'Q5':
                        next_row['annualaverage'] = float(item['value'])
                    else:
                        next_row['annualaverage'] = None

                # write one or more rows to the stream:
                singer.write_records(stream.tap_stream_id, [next_row])
                # capture stream state
                if bookmark_column:

                    if is_sorted:
                        # update bookmark to latest value - this is redundant for tap-bls
                        singer.write_state({
                            stream.tap_stream_id:
                            next_row["record"][bookmark_column[0]]
                        })
                    else:
                        # if data unsorted, save max value until end of writes.  tap-bls goes by the year and will use this approach
                        max_bookmark = max(
                            max_bookmark,
                            int(next_row["record"][bookmark_column[0]]))

        if bookmark_column and not is_sorted:
            singer.write_state({stream.tap_stream_id: max_bookmark})
            # if you set 'update_state' in config.json the *tap* will update the STATE file.
            # Note this is NOT standard behaviour in Singer data flows: the *target* should handle STATE updates.
            if (config.get('update_state', 'false').lower() == 'true') and (
                    stream_start_year == config.get('startyear', "2000")):
                LOGGER.info(update_state({stream.tap_stream_id: max_bookmark}))
    return
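This example drives everything from optional keys in config. A sketch of the config dictionary it expects, inferred only from the lookups in the code above; the values shown are illustrative:

# inferred from the config lookups in the example above; values are illustrative
config = {
    "startyear": "2015",          # optional, defaults to "2000"
    "endyear": "2023",            # optional, capped at the current year
    "calculations": "true",       # adds the net_change_* / pct_change_* fields
    "annualaverage": "true",      # adds annualaverage for M13 / Q5 periods
    "aspects": "false",           # adds the aspects field
    "api-key": "YOUR-BLS-REGISTRATION-KEY",  # sent to the API as "registrationkey"
    "update_state": "false",      # non-standard: lets the tap itself rewrite STATE
}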
Exemplo n.º 28
0
def load_and_write_schema(tap_stream_id):
    schema = load_schema(tap_stream_id)
    singer.write_schema(tap_stream_id, schema, pk_fields[tap_stream_id])
Exemplo n.º 29
0
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        mdata = metadata.to_map(catalog_entry['metadata'])
        replication_key = mdata.get((), {}).get('replication-key')

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = mdata.get((), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties,
                            replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                     'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info(
                    "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                    job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id,
                                                    state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                            counter.value)
                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                     .pop('JobHighestBookmarkSeen', None)
                existing_bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                              .pop(replication_key, None)
                state = singer.write_bookmark(
                    state, catalog_entry['tap_stream_id'], replication_key,
                    bookmark or existing_bookmark
                )  # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(
                catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              'version', stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                        counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
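The state handling in the resumed-job branch above reduces to: drop the job bookkeeping keys and fall back from JobHighestBookmarkSeen to any pre-existing replication-key bookmark. A minimal sketch of that resolution on a plain state dict; the stream name, replication key and values are illustrative:

# state as it might look when a bulk-query job was interrupted (illustrative values)
state = {
    'bookmarks': {
        'Account': {
            'JobID': '7501x000002nB0a',
            'BatchIDs': ['7511x000001abcd'],
            'JobHighestBookmarkSeen': '2021-06-01T00:00:00Z',
            'SystemModstamp': '2021-05-01T00:00:00Z',
        }
    }
}

stream_state = state['bookmarks']['Account']
stream_state.pop('JobID', None)
stream_state.pop('BatchIDs', None)
bookmark = stream_state.pop('JobHighestBookmarkSeen', None)
existing_bookmark = stream_state.pop('SystemModstamp', None)

# keep whichever bookmark is available, preferring the job's highest seen value
stream_state['SystemModstamp'] = bookmark or existing_bookmark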
Exemplo n.º 30
0
def test_write_schema(self):
    schema = {
        'type': 'object',
        'properties': {
            'name': {'type': 'string'}
        }
    }
    singer.write_schema("users", schema, ["name"])
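Because write_schema only serialises a SCHEMA message to stdout, a test like the one above can also assert on that output. A sketch, assuming it is acceptable to capture stdout with contextlib.redirect_stdout in your test setup:

import io
import json
from contextlib import redirect_stdout

import singer

schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}}

buf = io.StringIO()
with redirect_stdout(buf):
    singer.write_schema("users", schema, ["name"])

message = json.loads(buf.getvalue())
# expected shape of a Singer SCHEMA message
assert message["type"] == "SCHEMA"
assert message["stream"] == "users"
assert message["key_properties"] == ["name"]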
Exemplo n.º 31
0
async def sync_report_interval(client, account_id, report_stream, start_date,
                               end_date):
    state_key = '{}_{}'.format(account_id, report_stream.stream)
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    report_time = arrow.get().isoformat()

    request_id = get_report_request_id(client, account_id, report_stream,
                                       report_name, start_date, end_date,
                                       state_key)

    singer.write_bookmark(STATE, state_key, 'request_id', request_id)
    singer.write_state(STATE)

    try:
        success, download_url = await poll_report(client, account_id,
                                                  report_name, start_date,
                                                  end_date, request_id)

    except Exception as some_error:
        LOGGER.info(
            'The request_id %s for %s is invalid (%s), generating a new one',
            request_id, state_key, some_error)
        request_id = get_report_request_id(client,
                                           account_id,
                                           report_stream,
                                           report_name,
                                           start_date,
                                           end_date,
                                           state_key,
                                           force_refresh=True)

        singer.write_bookmark(STATE, state_key, 'request_id', request_id)
        singer.write_state(STATE)

        success, download_url = await poll_report(client, account_id,
                                                  report_name, start_date,
                                                  end_date, request_id)

    if success and download_url:
        LOGGER.info(
            'Streaming report: {} for account {} - from {} to {}'.format(
                report_name, account_id, start_date, end_date))

        stream_report(report_stream.stream, report_name, download_url,
                      report_time)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    elif success and not download_url:
        LOGGER.info(
            'No data for report: {} for account {} - from {} to {}'.format(
                report_name, account_id, start_date, end_date))
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    else:
        LOGGER.info(
            'Unsuccessful request for report: {} for account {} - from {} to {}'
            .format(report_name, account_id, start_date, end_date))
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_state(STATE)
        return False
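The bookmarks written above ('request_id' and 'date') are what allow a later run to resume. A minimal sketch of reading them back, assuming the same STATE shape and singer.get_bookmark; the account id, stream name and fallback date are illustrative:

import arrow
import singer

STATE = {}  # would normally be the persisted state from the previous run

state_key = '1234567_ad_performance_report'  # '{account_id}_{stream}', as built above

# an unfinished run leaves a request_id behind so polling can be resumed
pending_request_id = singer.get_bookmark(STATE, state_key, 'request_id')

# a finished run leaves the interval end date behind; the next run starts from it
last_synced = singer.get_bookmark(STATE, state_key, 'date')
next_start_date = arrow.get(last_synced) if last_synced else arrow.get('2020-01-01')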
Exemplo n.º 32
0
def write_schema(self):
    singer.write_schema(
        self.catalog.get('stream'),
        self.catalog.get('schema'),
        key_properties=self.catalog.get('key_properties'))