def __init__(self, ns_account=None, ns_consumer_key=None, ns_consumer_secret=None,
             ns_token_key=None, ns_token_secret=None, is_sandbox=True,
             select_fields_by_default=None, default_start_date=None):
    self.ns_account = ns_account
    self.ns_consumer_key = ns_consumer_key
    self.ns_consumer_secret = ns_consumer_secret
    self.ns_token_key = ns_token_key
    self.ns_token_secret = ns_token_secret
    self.is_sandbox = is_sandbox
    self.select_fields_by_default = select_fields_by_default is True or (
        isinstance(select_fields_by_default, str) and select_fields_by_default.lower() == 'true')
    self.default_start_date = default_start_date

    if ns_account is not None:
        if is_sandbox is True:
            self.ns_account = self.ns_account + '_SB1'

    self.ns_client = None

    # validate start_date
    if default_start_date is not None:
        singer_utils.strptime(default_start_date)
def sync_deals():
    last_sync = utils.strptime(get_start("deals"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "deals_all"
    else:
        endpoint = "deals_recent"

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["portalId", "dealId"])

    url = get_url(endpoint)
    params = {'count': 250}
    for i, row in enumerate(gen_request(url, params, "deals", "hasMore", "offset", "offset")):
        record = request(get_url("deals_detail", deal_id=row['dealId'])).json()
        record = transform(record, schema)

        modified_time = None
        if 'hs_lastmodifieddate' in record:
            modified_time = utils.strptime(record['hs_lastmodifieddate']['value'])
        elif 'createdate' in record:
            modified_time = utils.strptime(record['createdate']['value'])

        if not modified_time or modified_time >= last_sync:
            singer.write_record("deals", record)
            utils.update_state(STATE, "deals", modified_time)

        if i % 250 == 0:
            singer.write_state(STATE)
def __init__(self,
             refresh_token=None,
             token=None,
             sf_client_id=None,
             sf_client_secret=None,
             quota_percent_per_run=None,
             quota_percent_total=None,
             is_sandbox=None,
             select_fields_by_default=None,
             default_start_date=None,
             api_type=None):
    self.api_type = api_type.upper() if api_type else None
    self.refresh_token = refresh_token
    self.token = token
    self.sf_client_id = sf_client_id
    self.sf_client_secret = sf_client_secret
    self.session = requests.Session()
    self.access_token = None
    self.instance_url = None
    self.quota_percent_per_run = float(quota_percent_per_run) if quota_percent_per_run is not None else 25
    self.quota_percent_total = float(quota_percent_total) if quota_percent_total is not None else 80
    self.is_sandbox = is_sandbox is True or (isinstance(is_sandbox, str) and is_sandbox.lower() == 'true')
    self.select_fields_by_default = select_fields_by_default is True or (
        isinstance(select_fields_by_default, str) and select_fields_by_default.lower() == 'true')
    self.default_start_date = default_start_date
    self.rest_requests_attempted = 0
    self.jobs_completed = 0
    self.login_timer = None
    self.data_url = "{}/services/data/v41.0/{}"
    self.pk_chunking = False

    # validate start_date
    singer_utils.strptime(default_start_date)
def get_start(key):
    if key in STATE:
        return utils.strptime(STATE[key])

    if "start_date" in CONFIG:
        return utils.strptime(CONFIG["start_date"])

    return datetime.datetime.now() - datetime.timedelta(days=30)
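# Note: get_start above returns a datetime, yet several sync functions in this collection
# (e.g. sync_deals, sync_contacts, get_start_ts) wrap its result in utils.strptime, which
# expects a formatted string. A minimal sketch of the string-returning variant those callers
# appear to assume; this is an illustrative, hypothetical helper, not taken from any one tap:
def get_start_str(key):
    if key in STATE:
        return STATE[key]

    if "start_date" in CONFIG:
        return CONFIG["start_date"]

    # fall back to 30 days ago, serialized the way utils.strptime expects
    return utils.strftime(datetime.datetime.utcnow() - datetime.timedelta(days=30))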
def sync_contacts():
    last_sync = utils.strptime(get_start("contacts"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "contacts_all"
        offset_keys = ['vid-offset']
        offset_targets = ['vidOffset']
    else:
        endpoint = "contacts_recent"
        offset_keys = ['vid-offset', 'time-offset']
        offset_targets = ['vidOffset', 'timeOffset']

    schema = load_schema("contacts")
    singer.write_schema("contacts", schema, ["canonical-vid"])

    url = get_url(endpoint)
    params = {
        'showListMemberships': True,
        'count': 100,
    }
    vids = []
    for row in gen_request(url, params, 'contacts', 'has-more', offset_keys, offset_targets):
        modified_time = None
        if 'lastmodifieddate' in row['properties']:
            modified_time = utils.strptime(
                _transform_datetime(row['properties']['lastmodifieddate']['value']))

        if not modified_time or modified_time >= last_sync:
            vids.append(row['vid'])

        if len(vids) == 100:
            data = request(get_url("contacts_detail"), params={'vid': vids}).json()
            for vid, record in data.items():
                record = transform(record, schema)
                singer.write_record("contacts", record)

                modified_time = None
                if 'lastmodifieddate' in record['properties']:
                    modified_time = record['properties']['lastmodifieddate']['value']
                utils.update_state(STATE, "contacts", modified_time)

            vids = []

    singer.write_state(STATE)
def sync_entity_chunked(entity_name, key_properties, path):
    schema = load_schema(entity_name)
    singer.write_schema(entity_name, schema, key_properties)

    start = get_start(entity_name)
    now_ts = int(datetime.datetime.utcnow().timestamp() * 1000)
    start_ts = int(utils.strptime(start).timestamp() * 1000)
    url = get_url(entity_name)

    while start_ts < now_ts:
        end_ts = start_ts + CHUNK_SIZES[entity_name]
        params = {
            'startTimestamp': start_ts,
            'endTimestamp': end_ts,
            'limit': 1000,
        }
        for row in gen_request(url, params, path, "hasMore", "offset", "offset"):
            record = transform(row, schema)
            singer.write_record(entity_name, record)

        utils.update_state(STATE, entity_name,
                           datetime.datetime.utcfromtimestamp(end_ts / 1000))
        singer.write_state(STATE)
        start_ts = end_ts
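# sync_entity_chunked above looks up a per-entity window size in CHUNK_SIZES. Because
# start_ts/end_ts are epoch-millisecond timestamps, the values must also be in milliseconds.
# A sketch of what that mapping could look like; the entity names and window lengths here
# are illustrative assumptions, not taken from the source:
CHUNK_SIZES = {
    "email_events": 1000 * 60 * 60 * 24,          # assumed: one-day windows, in ms
    "subscription_changes": 1000 * 60 * 60 * 24,  # assumed: one-day windows, in ms
}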
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions", schema, ["id"])

    now = datetime.datetime.utcnow()
    start = utils.strptime(get_start("transactions"))

    logger.info("transactions: Syncing from {}".format(start))

    while start < now:
        end = start + datetime.timedelta(days=1)
        if end > now:
            end = now

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        for row in data:
            transformed = transform_row(row, schema)
            singer.write_record("transactions", transformed)

        utils.update_state(STATE, "transactions", utils.strftime(end))
        singer.write_state(STATE)
        start += datetime.timedelta(days=1)
def get_sync_domain(state, stream, model_name):
    """Return a domain (a filter expression) that can be used to filter records.

    If the model's replication method is FULL_TABLE, an empty domain is returned so
    every record is fetched. Otherwise the domain limits the search to records created
    or written after the bookmarked timestamp and, when a last_record_id bookmark
    exists, to ids greater than the last synced record.
    """
    domain = []

    for entry in stream.metadata:
        # stream metadata will have empty breadcrumb
        if not entry['breadcrumb'] and \
                entry['metadata'].get('replication-method', None) == 'FULL_TABLE':
            return domain

    last_updated_at = get_bookmark(state, stream.tap_stream_id, 'last_updated_at')
    if last_updated_at:
        last_updated_at = utils.strptime(last_updated_at)
        domain.extend([
            'OR',
            [('write_date', '>', last_updated_at)],
            [('create_date', '>', last_updated_at)],
        ])

    last_record_id = get_bookmark(state, stream.tap_stream_id, 'last_record_id')
    if last_record_id:
        domain.append(('id', '>', last_record_id))

    return domain
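# Usage sketch for get_sync_domain: the returned domain is an Odoo-style filter list, so it
# can be passed to a search/search_read call. The `odoo` client wrapper, its execute_kw
# signature, and fetch_changed_records itself are assumptions for illustration only:
def fetch_changed_records(odoo, state, stream, model_name, page_size=500):
    domain = get_sync_domain(state, stream, model_name)
    # 'search_read' is Odoo's standard ORM method; how it is invoked depends entirely on
    # how the (assumed) `odoo` client object wraps the RPC transport.
    return odoo.execute_kw(model_name, 'search_read', [domain],
                           {'limit': page_size, 'order': 'id asc'})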
def __init__(self,
             refresh_token=None,
             token=None,
             qb_client_id=None,
             qb_client_secret=None,
             quota_percent_per_run=None,
             quota_percent_total=None,
             is_sandbox=None,
             select_fields_by_default=None,
             default_start_date=None,
             api_type=None,
             realm_id=None):
    self.api_type = api_type.upper() if api_type else None
    self.realm_id = realm_id
    self.refresh_token = refresh_token
    self.token = token
    self.qb_client_id = qb_client_id
    self.qb_client_secret = qb_client_secret
    self.session = requests.Session()
    self.access_token = None
    self.base_url = ("https://sandbox-quickbooks.api.intuit.com/v3/company/"
                     if is_sandbox is True
                     else 'https://quickbooks.api.intuit.com/v3/company/')
    self.instance_url = f"{self.base_url}{realm_id}"
    LOGGER.info(f"Instance URL :- {self.instance_url}")

    if isinstance(quota_percent_per_run, str) and quota_percent_per_run.strip() == '':
        quota_percent_per_run = None
    if isinstance(quota_percent_total, str) and quota_percent_total.strip() == '':
        quota_percent_total = None

    self.quota_percent_per_run = float(quota_percent_per_run) if quota_percent_per_run is not None else 25
    self.quota_percent_total = float(quota_percent_total) if quota_percent_total is not None else 80
    self.is_sandbox = is_sandbox is True or (isinstance(is_sandbox, str) and is_sandbox.lower() == 'true')
    self.select_fields_by_default = select_fields_by_default is True or (
        isinstance(select_fields_by_default, str) and select_fields_by_default.lower() == 'true')
    self.default_start_date = default_start_date
    self.rest_requests_attempted = 0
    self.jobs_completed = 0
    self.login_timer = None
    self.data_url = "{}/services/data/v41.0/{}"
    self.pk_chunking = False

    # validate start_date
    singer_utils.strptime(default_start_date)
def sync_companies():
    last_sync = utils.strptime(get_start("companies"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "companies_all"
        path = "companies"
        more_key = "has-more"
        offset_keys = ["offset"]
        offset_targets = ["offset"]
    else:
        endpoint = "companies_recent"
        path = "results"
        more_key = "hasMore"
        offset_keys = ["offset"]
        offset_targets = ["offset"]

    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"])

    url = get_url(endpoint)
    params = {'count': 250}
    for i, row in enumerate(gen_request(url, params, path, more_key, offset_keys, offset_targets)):
        record = request(get_url("companies_detail", company_id=row['companyId'])).json()
        record = transform(record, schema)

        modified_time = None
        if 'hs_lastmodifieddate' in record:
            modified_time = utils.strptime(record['hs_lastmodifieddate']['value'])
        elif 'createdate' in record:
            modified_time = utils.strptime(record['createdate']['value'])

        if not modified_time or modified_time >= last_sync:
            singer.write_record("companies", record)
            utils.update_state(STATE, "companies", modified_time)

        if i % 250 == 0:
            singer.write_state(STATE)
def __init__(self,
             credentials=None,
             token=None,
             quota_percent_per_run=None,
             quota_percent_total=None,
             is_sandbox=None,
             select_fields_by_default=None,
             default_start_date=None,
             api_type=None):
    self.api_type = api_type.upper() if api_type else None
    self.session = requests.Session()

    if isinstance(quota_percent_per_run, str) and quota_percent_per_run.strip() == '':
        quota_percent_per_run = None
    if isinstance(quota_percent_total, str) and quota_percent_total.strip() == '':
        quota_percent_total = None

    self.quota_percent_per_run = float(quota_percent_per_run) if quota_percent_per_run is not None else 25
    self.quota_percent_total = float(quota_percent_total) if quota_percent_total is not None else 80
    self.is_sandbox = is_sandbox is True or (isinstance(is_sandbox, str) and is_sandbox.lower() == 'true')
    self.select_fields_by_default = select_fields_by_default is True or (
        isinstance(select_fields_by_default, str) and select_fields_by_default.lower() == 'true')
    self.default_start_date = default_start_date
    self.rest_requests_attempted = 0
    self.jobs_completed = 0
    self.data_url = "{}/services/data/v41.0/{}"
    self.pk_chunking = False
    self.auth = SalesforceAuth.from_credentials(credentials, is_sandbox=self.is_sandbox)

    # validate start_date
    singer_utils.strptime(default_start_date)
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    replication_key = catalog_entry.get('replication_key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
        for rec in sf.query(catalog_entry, state):
            counter.increment()
            rec = transformer.transform(rec, schema)
            rec = fix_record_anytype(rec, schema)
            singer.write_message(
                singer.RecordMessage(stream=(stream_alias or stream),
                                     record=rec,
                                     version=stream_version,
                                     time_extracted=start_time))

            replication_key_value = replication_key and singer_utils.strptime_with_tz(rec[replication_key])

            if sf.pk_chunking:
                if replication_key_value and replication_key_value <= start_time \
                        and replication_key_value > chunked_bookmark:
                    # Replace the highest seen bookmark and save the state in case we need to resume later
                    chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                    state = singer.write_bookmark(state,
                                                  catalog_entry['tap_stream_id'],
                                                  'JobHighestBookmarkSeen',
                                                  singer_utils.strftime(chunked_bookmark))
                    singer.write_state(state)
            # Before writing a bookmark, make sure Salesforce has not given us a
            # record with one outside our range
            elif replication_key_value and replication_key_value <= start_time:
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              rec[replication_key])
                singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
        if not replication_key:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version',
                                          None)

        # If pk_chunking is set, only write a bookmark at the end
        if sf.pk_chunking:
            # Write a bookmark with the highest value we've seen,
            # serialized back to a string for the state message
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          replication_key,
                                          singer_utils.strftime(chunked_bookmark))
def __init__(self,
             refresh_token=None,
             token=None,
             sf_client_id=None,
             sf_client_secret=None,
             quota_percent_per_run=None,
             quota_percent_total=None,
             is_sandbox=None,
             select_fields_by_default=None,
             default_start_date=None,
             api_type=None,
             source_type=None,
             object_name=None,
             report_id=None):
    self.api_type = api_type.upper() if api_type else None
    self.refresh_token = refresh_token
    self.token = token
    self.sf_client_id = sf_client_id
    self.sf_client_secret = sf_client_secret
    self.session = requests.Session()
    self.access_token = None
    self.instance_url = None

    if isinstance(quota_percent_per_run, str) and quota_percent_per_run.strip() == '':
        quota_percent_per_run = None
    if isinstance(quota_percent_total, str) and quota_percent_total.strip() == '':
        quota_percent_total = None

    self.quota_percent_per_run = float(quota_percent_per_run) if quota_percent_per_run is not None else 25
    self.quota_percent_total = float(quota_percent_total) if quota_percent_total is not None else 80
    self.is_sandbox = is_sandbox is True or (isinstance(is_sandbox, str) and is_sandbox.lower() == 'true')
    self.select_fields_by_default = select_fields_by_default is True or (
        isinstance(select_fields_by_default, str) and select_fields_by_default.lower() == 'true')
    self.default_start_date = default_start_date
    self.rest_requests_attempted = 0
    self.jobs_completed = 0
    self.login_timer = None
    self.data_url = "{}/services/data/v41.0/{}"
    self.pk_chunking = False
    self.source_type = source_type if source_type else None
    self.object_name = object_name if object_name else None
    self.report_id = report_id if report_id else None

    # validate start_date
    singer_utils.strptime(default_start_date)

    # Validate params
    if source_type not in ('object', 'report'):
        LOGGER.error('Invalid source_type, supported types are report & object')
        raise Exception('Invalid source_type, supported types are report & object')
    if source_type == 'object' and object_name is None:
        LOGGER.error('Object name is required when source type is object')
        raise Exception('Object name is required when source type is object')
    if source_type == 'report' and report_id is None:
        LOGGER.error('Report id is required when source type is report')
        raise Exception('Report id is required when source type is report')
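# Hypothetical construction example for the report/object variant above. The class name
# `Salesforce` and every value below are placeholders assumed for illustration only:
#
#     sf = Salesforce(refresh_token="<refresh token>",
#                     sf_client_id="<client id>",
#                     sf_client_secret="<client secret>",
#                     default_start_date="2020-01-01T00:00:00Z",
#                     api_type="BULK",
#                     source_type="report",
#                     report_id="<report id>")
#
# Passing source_type="object" without object_name (or "report" without report_id) raises
# immediately, per the validation block at the end of __init__.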
def sync_in_app_events():
    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema,
                        ["event_time", "event_name", "appsflyer_id"])

    # This order matters
    fieldnames = (
        "attributed_touch_type", "attributed_touch_time", "install_time", "event_time",
        "event_name", "event_value", "event_revenue", "event_revenue_currency",
        "event_revenue_usd", "event_source", "is_receipt_validated", "af_prt",
        "media_source", "af_channel", "af_keywords", "campaign", "af_c_id", "af_adset",
        "af_adset_id", "af_ad", "af_ad_id", "af_ad_type", "af_siteid", "af_sub_siteid",
        "af_sub1", "af_sub2", "af_sub3", "af_sub4", "af_sub5", "af_cost_model",
        "af_cost_value", "af_cost_currency", "contributor1_af_prt",
        "contributor1_media_source", "contributor1_campaign", "contributor1_touch_type",
        "contributor1_touch_time", "contributor2_af_prt", "contributor2_media_source",
        "contributor2_campaign", "contributor2_touch_type", "contributor2_touch_time",
        "contributor3_af_prt", "contributor3_media_source", "contributor3_campaign",
        "contributor3_touch_type", "contributor3_touch_time", "region", "country_code",
        "state", "city", "postal_code", "dma", "ip", "wifi", "operator", "carrier",
        "language", "appsflyer_id", "advertising_id", "idfa", "android_id",
        "customer_user_id", "imei", "idfv", "platform", "device_type", "os_version",
        "app_version", "sdk_version", "app_id", "app_name", "bundle_id",
        "is_retargeting", "retargeting_conversion_type", "af_attribution_lookback",
        "af_reengagement_window", "is_primary_attribution", "user_agent",
        "http_referrer", "original_url",
    )

    stop_time = datetime.datetime.now()
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while from_datetime < stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)

        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader)  # Skip the heading row

        bookmark = from_datetime
        for i, row in enumerate(reader):
            record = xform(row, schema)
            singer.write_record("in_app_events", record)

            # AppsFlyer returns records in order of most recent first.
            if utils.strptime(record["event_time"]) > bookmark:
                bookmark = utils.strptime(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions", schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = to_utc(utils.strptime(STATE.get('latest_updated_at', DEFAULT_TIMESTAMP)))
    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = to_utc(
        utils.strptime(STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP)))
    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = to_utc(utils.strptime(get_start("transactions")))

    period_start = latest_start_date - TRAILING_DAYS
    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))
    logger.info("transactions: latest_updated_at from {}, disbursement_date from {}".format(
        latest_updated_at, latest_disbursement_date))
    logger.info("transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):
        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found
            if row.disbursement_details is None:
                disbursement_date = datetime.min
            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(datetime.combine(
                    row.disbursement_details.disbursement_date,
                    datetime.min.time()))

            # Is this more recent than our past stored value of update_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions disbursed
            # at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (disbursement_date >= latest_disbursement_date):
                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions", transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1
            else:
                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))
        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))
    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at
    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)
    STATE['latest_disbursement_date'] = utils.strftime(latest_disbursement_date)
    utils.update_state(STATE, "transactions", utils.strftime(end))
    singer.write_state(STATE)
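# sync_transactions above iterates day-sized windows via a daterange helper that is not shown
# here. A minimal sketch under the assumption that it yields consecutive (start, end) pairs one
# day apart, with the caller clamping the final end to period_end; it also assumes
# `from datetime import timedelta`, consistent with the bare datetime.min usage above:
def daterange(start_date, end_date):
    one_day = timedelta(days=1)
    current = start_date
    while current < end_date:
        # yield a one-day window; the last window may extend past end_date and is
        # clamped by the caller via min(end, period_end)
        yield current, current + one_day
        current += one_day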
def get_update_start_ts(key):
    if key not in STATE:
        STATE[key] = CONFIG['start_date']

    return int(utils.strptime(STATE[key]).timestamp())
def get_start_ts(key):
    return int(utils.strptime(get_start(key)).timestamp())
def sync_parthners():
    schema = load_schema("raw_data/parthners")
    singer.write_schema("partners_report", schema, [])

    fieldnames = (
        "agency_pmd_af_prt", "media_source_pid", "campaign", "impressions", "clicks",
        "ctr", "installs", "conversion_rate", "sessions", "loyal_users",
        "loyal_users_Installs", "total_revenue", "total_cost", "roi", "arpu",
        "average_ecpi",
        "af_content_view_unique_users", "af_content_view_event_counter",
        "af_content_view_sales_in_usd",
        "app_confirmed_sms_unique_users", "app_confirmed_sms_event_counter",
        "app_confirmed_sms_sales_in_usd",
        "app_facial_image_unique_users", "app_facial_image_event_counter",
        "app_facial_image_sales_in_usd",
        "app_loginpage_unique_users", "app_loginpage_event_counter",
        "app_loginpage_sales_in_usd",
        "app_onboard_success_unique_users", "app_onboard_success_event_counter",
        "app_onboard_success_sales_in_usd",
        "app_open_unique_users", "app_open_event_counter", "app_open_sales_in_usd",
        "app_passcode_1_unique_users", "app_passcode_1_event_counter",
        "app_passcode_1_sales_in_usd",
        "app_passcode_2_unique_users", "app_passcode_2_event_counter",
        "app_passcode_2_sales_in_usd",
        "app_phone_number_add_unique_users", "app_phone_number_add_event_counter",
        "app_phone_number_add_sales_in_usd",
        "app_registered_success_unique_users", "app_registered_success_event_counter",
        "app_registered_success_sales_in_usd",
        "app_waiting_sms_code_unique_users", "app_waiting_sms_code_event_counter",
        "app_waiting_sms_code_sales_in_usd",
        "emotion_validation_unique_users", "emotion_validation_event_counter",
        "emotion_validation_sales_in_usd",
    )

    from_datetime = get_start("partners")
    to_datetime = get_stop(from_datetime, datetime.datetime.now())

    if to_datetime < from_datetime:
        LOGGER.error("to_datetime (%s) is less than from_datetime (%s).",
                     to_datetime, from_datetime)
        return

    params = dict()
    params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
    params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
    params["api_token"] = CONFIG["api_token"]

    url = get_url("partners", app_id=CONFIG["app_id"])
    request_data = request(url, params)

    csv_data = RequestToCsvAdapter(request_data)
    reader = csv.DictReader(csv_data, fieldnames)

    next(reader)  # Skip the heading row

    bookmark = from_datetime
    for i, row in enumerate(reader):
        record = xform(row, schema)
        # Write records against the same stream whose schema was emitted above.
        singer.write_record("partners_report", record)

        # AppsFlyer returns records in order of most recent first.
        if utils.strptime(record["attributed_touch_time"]) > bookmark:
            bookmark = utils.strptime(record["attributed_touch_time"])

    # Write out state, keyed consistently with get_start("partners") above.
    utils.update_state(STATE, "partners", bookmark)
    singer.write_state(STATE)
def get_stop(start_datetime, stop_time, days=30):
    if "end_date" in CONFIG:
        return utils.strptime(CONFIG["end_date"])

    return min(start_datetime + datetime.timedelta(days=days), stop_time)
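# The AppsFlyer-style syncs above (sync_in_app_events, sync_parthners) share the same
# windowing pattern built from get_start and get_stop. A compact, hypothetical sketch of
# that loop as a reusable generator; iter_report_windows is an illustrative name, not a
# helper from the source:
def iter_report_windows(report_name, days=10):
    stop_time = datetime.datetime.now()
    from_datetime = get_start(report_name)
    to_datetime = get_stop(from_datetime, stop_time, days)
    while from_datetime < stop_time:
        # yield one (from, to) window, then advance by the same step
        yield from_datetime, to_datetime
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, days)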