def save_state(state):
    if not state:
        return
    LOGGER.info('Updating state.')
    singer.write_state(state)

def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)

def update_currently_syncing(state, stream_name):
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)

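# A minimal usage sketch (assumed, not from the tap above): a sync loop can call
# update_currently_syncing around each stream so that an interrupted run records
# which stream it was processing and a later run can resume there.
# `selected_streams` and `sync_stream` are hypothetical stand-ins.
def do_sync_sketch(state, selected_streams):
    for stream_name in selected_streams:
        update_currently_syncing(state, stream_name)  # mark this stream as in progress
        sync_stream(stream_name)                      # hypothetical per-stream sync
    update_currently_syncing(state, None)             # clear the marker once all streams finish
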
def sync_partners():
    schema = load_schema("raw_data/parthners")
    singer.write_schema("partners_report", schema, [])

    fieldnames = (
        "agency_pmd_af_prt", "media_source_pid", "campaign", "impressions", "clicks",
        "ctr", "installs", "conversion_rate", "sessions", "loyal_users",
        "loyal_users_Installs", "total_revenue", "total_cost", "roi", "arpu", "average_ecpi",
        "af_content_view_unique_users", "af_content_view_event_counter", "af_content_view_sales_in_usd",
        "app_confirmed_sms_unique_users", "app_confirmed_sms_event_counter", "app_confirmed_sms_sales_in_usd",
        "app_facial_image_unique_users", "app_facial_image_event_counter", "app_facial_image_sales_in_usd",
        "app_loginpage_unique_users", "app_loginpage_event_counter", "app_loginpage_sales_in_usd",
        "app_onboard_success_unique_users", "app_onboard_success_event_counter", "app_onboard_success_sales_in_usd",
        "app_open_unique_users", "app_open_event_counter", "app_open_sales_in_usd",
        "app_passcode_1_unique_users", "app_passcode_1_event_counter", "app_passcode_1_sales_in_usd",
        "app_passcode_2_unique_users", "app_passcode_2_event_counter", "app_passcode_2_sales_in_usd",
        "app_phone_number_add_unique_users", "app_phone_number_add_event_counter", "app_phone_number_add_sales_in_usd",
        "app_registered_success_unique_users", "app_registered_success_event_counter", "app_registered_success_sales_in_usd",
        "app_waiting_sms_code_unique_users", "app_waiting_sms_code_event_counter", "app_waiting_sms_code_sales_in_usd",
        "emotion_validation_unique_users", "emotion_validation_event_counter", "emotion_validation_sales_in_usd",
    )

    from_datetime = get_start("partners")
    to_datetime = get_stop(from_datetime, datetime.datetime.now())

    if to_datetime < from_datetime:
        LOGGER.error("to_datetime (%s) is less than from_datetime (%s).", to_datetime, from_datetime)
        return

    params = dict()
    params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
    params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
    params["api_token"] = CONFIG["api_token"]

    url = get_url("partners", app_id=CONFIG["app_id"])
    request_data = request(url, params)

    csv_data = RequestToCsvAdapter(request_data)
    reader = csv.DictReader(csv_data, fieldnames)

    next(reader)  # Skip the heading row

    bookmark = from_datetime
    for i, row in enumerate(reader):
        record = xform(row, schema)
        singer.write_record("partners_report", record)

        # AppsFlyer returns records in order of most recent first.
        if utils.strptime(record["attributed_touch_time"]) > bookmark:
            bookmark = utils.strptime(record["attributed_touch_time"])

    # Write out state
    utils.update_state(STATE, "partners", bookmark)
    singer.write_state(STATE)

def update_current_stream(state, stream_name=None):
    set_currently_syncing(state, stream_name)
    singer.write_state(state)

def write_bookmark(state, stream, value):
    if 'bookmarks' not in state:
        state['bookmarks'] = {}
    state['bookmarks'][stream] = value
    LOGGER.info('Write state for stream: {}, value: {}'.format(stream, value))
    singer.write_state(state)

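# A minimal usage sketch (assumed, not part of the helper above): bookmark after
# each page of ordered results so an interrupted extraction can resume from the
# last completed page. `fetch_pages` is a hypothetical generator yielding lists
# of records sorted by 'updated_at'; `state` is the same dict passed to
# write_bookmark, which emits a STATE message via singer.write_state.
def sync_widgets_sketch(state):
    for page in fetch_pages():                      # hypothetical paginated fetch
        for record in page:
            singer.write_record('widgets', record)  # emit each record to the target
        # Persist the newest replication-key value seen in this completed page.
        write_bookmark(state, 'widgets', page[-1]['updated_at'])
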
def write_state(self):
    return singer.write_state(self.state)

def do_sync(self):
    logger.debug('Starting sync')

    # resuming when currently_syncing within state
    resume_from_stream = False
    if self.state and 'currently_syncing' in self.state:
        resume_from_stream = self.state['currently_syncing']

    for stream in self.streams:
        stream.tap = self

        if resume_from_stream:
            if stream.schema == resume_from_stream:
                logger.info('Resuming from {}'.format(resume_from_stream))
                resume_from_stream = False
            else:
                logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                continue

        # stream state, from state/bookmark or start_date
        stream.set_initial_state(self.state, self.config['start_date'])

        # currently syncing
        if stream.state_field:
            set_currently_syncing(self.state, stream.schema)
            self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                               str(stream.initial_state))
            singer.write_state(self.state)

        # schema
        stream.write_schema()

        # paginate
        while stream.has_data():
            with singer.metrics.http_request_timer(stream.schema) as timer:
                try:
                    response = self.execute_stream_request(stream)
                except (ConnectionError, RequestException):
                    raise
                timer.tags[singer.metrics.Tag.http_status_code] = response.status_code

            self.validate_response(response)
            self.rate_throttling(response)
            stream.paginate(response)

            # records with metrics
            with singer.metrics.record_counter(stream.schema) as counter:
                with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                    for row in self.iterate_response(response):
                        row = stream.process_row(row)
                        if not row:  # in case of a non-empty response with an empty element
                            continue
                        row = optimus_prime.transform(row, stream.get_schema())
                        if stream.write_record(row):
                            counter.increment()
                        stream.update_state(row)

            # update state / bookmarking only when supported by stream
            if stream.state_field:
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                                   str(stream.earliest_state))
                singer.write_state(self.state)

    # clear currently_syncing
    try:
        del self.state['currently_syncing']
    except KeyError:
        pass
    singer.write_state(self.state)

def test_write_state(self):
    singer.write_state({"foo": 1})

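# Context for the test above (an assumed illustration, not part of the original
# suite): singer.write_state serializes the dict into a Singer STATE message on
# stdout, one JSON object per line, e.g.
#
#   {"type": "STATE", "value": {"foo": 1}}
#
# which is how downstream targets learn where a resumed run should pick up.
# A quick way to see the emitted message without a test runner:
import contextlib
import io

import singer

buf = io.StringIO()
with contextlib.redirect_stdout(buf):
    singer.write_state({"foo": 1})
print(buf.getvalue())  # -> {"type": "STATE", "value": {"foo": 1}}
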
def sync_event_updates(stream_name):
    '''
    Get updates via the events endpoint: look at the 'events update' bookmark
    and pull events created after that point.
    '''
    LOGGER.info("Started syncing event based updates")

    bookmark_value = singer.get_bookmark(Context.state, stream_name + '_events', 'updates_created') or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + 604800  # Number of seconds in a week

    stop_paging = False

    # Create a map to relate event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(**{
            "limit": 100,
            "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
            "stripe_account": Context.config.get('account_id'),
            # None passed to starting_after appears to retrieve
            # all of them so this should always be safe.
            "created[gte]": date_window_start,
            "created[lt]": date_window_end,
        })

        # If there are no results and the window end has passed the extraction time, stop paging
        if not len(response) and date_window_end > extraction_time.timestamp():  # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)

            # Check whether we should sync the event based on its created time
            if not should_sync_event(events_obj,
                                     STREAM_TO_TYPE_FILTER[stream_name]['object'],
                                     updated_object_timestamps):
                continue

            # Sync the event if it's the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
                event_resource_metadata = metadata.to_map(
                    Context.get_catalog_entry(stream_name)['metadata'])

                # Filter out line items with null ids
                if isinstance(events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [line_item for line_item in line_items
                                               if line_item.get('id')]
                        invoice_obj['lines']['data'] = filtered_line_items

                rec = unwrap_data_objects(event_resource_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(rec,
                                            Context.get_catalog_entry(stream_name)['schema'],
                                            event_resource_metadata)

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name, rec, time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name, event_resource_obj, updates=True)

            if events_obj.created > max_created:
                max_created = events_obj.created

        date_window_start = date_window_end
        date_window_end = date_window_end + 604800
        singer.write_bookmark(Context.state, stream_name + '_events', 'updates_created', max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)

def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by the events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (), 'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to the sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())

        window_size = int(Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d', window_size)
        start_window = bookmark

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(STREAM_SDK_OBJECTS[stream_name]['sdk_object'],
                                       filter_key, start_window, stop_window):
                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than the bookmark
                if stream_obj_created > stream_bookmark:
                    rec = transformer.transform(rec,
                                                Context.get_catalog_entry(stream_name)['schema'],
                                                stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name, rec, time_extracted=extraction_time)
                    Context.new_counts[stream_name] += 1

                # sync sub streams if selected and the parent object
                # is greater than their bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-stream bookmarks to the stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name, replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name, replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)

def sync_endpoint(schema_name, endpoint=None, path=None, special_field_name=None,
                  special_field_value=None, keys=None, object_to_id=None,
                  parameter_for_updated=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name, schema, keys, bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S.%fZ')
    updated_since = start_dt.strftime("%Y%m%dT%H%M%S")
    LOGGER.info('updated_since ' + updated_since)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        if parameter_for_updated is not None:
            url = url + '?' + parameter_for_updated + '=' + updated_since
        response = request(url, None)
        LOGGER.info('URL: ' + url)

        if schema_name == 'project_financials':
            response = [response]

        time_extracted = utils.now()

        for row in response:
            if special_field_name is not None:
                row[special_field_name] = special_field_value

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') + 'T00:00:00.00Z'

            if datetime.datetime.strptime(item[bookmark_property], '%Y-%m-%dT%H:%M:%S.%fZ') >= start_dt:
                singer.write_record(schema_name, item, time_extracted=time_extracted)
                utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)

def sync_rate_cards(  # pylint: disable=too-many-arguments
        schema_name,
        endpoint=None,
        path=None,
        special_field_name=None,
        special_field_value=None,
        date_fields=None,
        with_updated_since=True,
        for_each_handler=None,
        map_handler=None,
        object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name, schema, ['id'], bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            if map_handler is not None:
                row = map_handler(row)

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') + 'T00:00:00Z'

            # find the rates belonging to this rate card
            sync_endpoint(
                'rate_cards_rates',
                BASE_API_URL + 'rate_cards/' + str(row['id']) + '/rates',
                None,
                'rate_card_id',
                str(row['id']),
                ['rate_card_id', 'role'],
            )

            singer.write_record(schema_name, item, time_extracted=time_extracted)

            # take any additional actions required for the currently loaded endpoint
            utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)

def sync_project(  # pylint: disable=too-many-arguments
        schema_name,
        endpoint=None,
        path=None,
        special_field_name=None,
        special_field_value=None,
        date_fields=None,
        with_updated_since=True,
        for_each_handler=None,
        map_handler=None,
        object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name, schema, ['id'], bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)

        for row in response:
            item = transformer.transform(row, schema)
            time_extracted = utils.now()

            # find related sub-entities for this project
            sync_endpoint('expense_items',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/expense_items',
                          None, 'project_id', str(row['id']))
            sync_endpoint('invoices',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/invoices',
                          None, 'project_id', str(row['id']))
            sync_endpoint('milestones',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/milestones',
                          None, 'project_id', str(row['id']))
            sync_endpoint('project_team',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/team',
                          None, 'project_id', str(row['id']),
                          ['person_id', 'project_id'])
            sync_endpoint('sprints',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/sprints',
                          None, 'project_id', str(row['id']))
            sync_endpoint('workflow_columns',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/workflow_columns',
                          None, 'project_id', str(row['id']))
            sync_endpoint('project_financials',
                          BASE_API_URL + 'projects/' + str(row['id']) + '/financials',
                          None, None, None, ['project_id'])

            if bookmark_property in item and item[bookmark_property] >= start:
                singer.write_record(schema_name, item, time_extracted=time_extracted)
                utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)

def sync_allocations(schema_name, endpoint=None, path=None, special_field_name=None,
                     special_field_value=None, keys=None, object_to_id=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name, schema, keys, bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    weekDays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            # Expand each allocation into one record per day between start_date and end_date
            date = datetime.datetime.strptime(row['start_date'], '%Y-%m-%d')
            LOGGER.info("Project" + str(row['project']) + "-" + str(row['person']))
            end_date = datetime.datetime.strptime(row['end_date'], '%Y-%m-%d')
            newRow = {}

            while date <= end_date:
                # the per-weekday columns hold the allocated hours for that day
                newRow['allocation'] = row[weekDays[date.weekday()]]

                if not newRow['allocation'] > 0:
                    date = date + timedelta(days=1)
                    continue

                newRow['project'] = row['project']
                newRow['non_project_time'] = row['non_project_time']
                newRow['connected_project'] = row['connected_project']
                newRow['person'] = row['person']
                newRow['date'] = date.strftime('%Y-%m-%d')
                newRow['notes'] = row['notes']
                newRow['created_by'] = row['created_by']
                newRow['updated_by'] = row['updated_by']
                newRow['created_at'] = row['created_at']
                newRow['updated_at'] = row['updated_at']
                newRow['id'] = str(row['id']) + str(date.strftime('%Y%m%d'))

                date = date + timedelta(days=1)

                item = transformer.transform(newRow, schema)

                if bookmark_property not in item:
                    item[bookmark_property] = \
                        datetime.datetime.now().strftime('%Y-%m-%d') + 'T00:00:00Z'

                if bookmark_property in item and item[bookmark_property] >= start:
                    singer.write_record(schema_name, item, time_extracted=time_extracted)
                    utils.update_state(STATE, schema_name, item[bookmark_property])
                else:
                    singer.write_record(schema_name, item, time_extracted=time_extracted)
                    # take any additional actions required for the currently loaded endpoint
                    utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)

def sync_query(config, state, stream):
    table_name = stream['tap_stream_id']

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, table_name, 'version') is None

    # last run was interrupted if there is a last_evaluated_key bookmark
    was_interrupted = singer.get_bookmark(state, table_name, 'last_evaluated_key') is not None

    # pick a new table version if the last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, table_name, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    mdata = metadata.to_map(stream['metadata'])
    index_name = metadata.get(mdata, (), "IndexName")
    key_condition_expression = metadata.get(mdata, (), "KeyConditionExpression")
    expression_attribute_values = metadata.get(mdata, (), "ExpressionAttributeValues")

    rows_saved = 0

    deserializer = Deserializer()
    for result in query_table(table_name, index_name, key_condition_expression,
                              expression_attribute_values, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record,
                                                  version=stream_version)
            singer.write_message(record_message)

        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name, 'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')

    state = singer.write_bookmark(state, table_name, 'initial_full_table_complete', True)
    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved

def write_state_message(self):
    '''Writes a Singer state message.'''
    return singer.write_state(self.state)

def sync_tickets():
    bookmark_property = 'updated_at'

    singer.write_schema("tickets",
                        utils.load_schema("tickets"),
                        ["id"],
                        bookmark_properties=[bookmark_property])
    singer.write_schema("conversations",
                        utils.load_schema("conversations"),
                        ["id"],
                        bookmark_properties=[bookmark_property])
    singer.write_schema("satisfaction_ratings",
                        utils.load_schema("satisfaction_ratings"),
                        ["id"],
                        bookmark_properties=[bookmark_property])
    singer.write_schema("time_entries",
                        utils.load_schema("time_entries"),
                        ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start("tickets")
    params = {
        'updated_since': start,
        'order_by': bookmark_property,
        'order_type': "asc",
        'include': "requester,company,stats"
    }
    for i, row in enumerate(gen_request(get_url("tickets"), params)):
        logger.info("Ticket {}: Syncing".format(row['id']))
        row.pop('attachments', None)
        row['custom_fields'] = transform_dict(row['custom_fields'], force_str=True)

        # get all sub-entities and save them
        logger.info("Ticket {}: Syncing conversations".format(row['id']))

        try:
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="conversations")):
                subrow.pop("attachments", None)
                subrow.pop("body", None)
                if subrow[bookmark_property] >= start:
                    singer.write_record("conversations", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info('Invalid ticket ID requested from Freshdesk {0}'.format(row['id']))
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing satisfaction ratings".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="satisfaction_ratings")):
                subrow['ratings'] = transform_dict(subrow['ratings'], key_key="question")
                if subrow[bookmark_property] >= start:
                    singer.write_record("satisfaction_ratings", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Surveys feature is unavailable. Skipping the satisfaction_ratings stream.")
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing time entries".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="time_entries")):
                if subrow[bookmark_property] >= start:
                    singer.write_record("time_entries", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Timesheets feature is unavailable. Skipping the time_entries stream.")
            else:
                raise

        utils.update_state(STATE, "tickets", row[bookmark_property])
        singer.write_record("tickets", row, time_extracted=singer.utils.now())
        singer.write_state(STATE)

def sync_tickets_by_filter(bookmark_property, predefined_filter=None):
    endpoint = "tickets"

    state_entity = endpoint
    if predefined_filter:
        state_entity = state_entity + "_" + predefined_filter

    start = get_start(state_entity)

    params = {
        'updated_since': start,
        'order_by': bookmark_property,
        'order_type': "asc",
        'include': "requester,company,stats"
    }

    if predefined_filter:
        logger.info("Syncing tickets with filter {}".format(predefined_filter))
        params['filter'] = predefined_filter

    for i, row in enumerate(gen_request(get_url(endpoint), params)):
        logger.info("Ticket {}: Syncing".format(row['id']))
        row.pop('attachments', None)
        row['custom_fields'] = transform_dict(row['custom_fields'], force_str=True)

        # get all sub-entities and save them
        logger.info("Ticket {}: Syncing conversations".format(row['id']))

        try:
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="conversations")):
                subrow.pop("attachments", None)
                subrow.pop("body", None)
                if subrow[bookmark_property] >= start:
                    singer.write_record("conversations", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info('Invalid ticket ID requested from Freshdesk {0}'.format(row['id']))
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing satisfaction ratings".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="satisfaction_ratings")):
                subrow['ratings'] = transform_dict(subrow['ratings'], key_key="question")
                if subrow[bookmark_property] >= start:
                    singer.write_record("satisfaction_ratings", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Surveys feature is unavailable. Skipping the satisfaction_ratings stream.")
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing time entries".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="time_entries")):
                if subrow[bookmark_property] >= start:
                    singer.write_record("time_entries", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Timesheets feature is unavailable. Skipping the time_entries stream.")
            elif e.response.status_code == 404:
                # 404 is returned for deleted tickets and spam
                logger.info("Could not retrieve time entries for ticket id {}. This may be caused by "
                            "tickets marked as spam or deleted.".format(row['id']))
            else:
                raise

        utils.update_state(STATE, state_entity, row[bookmark_property])
        singer.write_record(endpoint, row, time_extracted=singer.utils.now())
        singer.write_state(STATE)

def sync_report_for_day(stream_name, stream_schema, sdk_client, start, field_list):  # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {
                'min': start.strftime('%Y%m%d'),
                'max': start.strftime('%Y%m%d')
            }
        }
    }

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, csv_reader = parse_csv_stream(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
            for row in csv_reader:
                obj = dict(zip(get_xml_attribute_headers(stream_schema, headers), row))
                obj['_sdc_customer_id'] = customer_id
                obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME

                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

                singer.write_record(stream_name, obj, time_extracted=time_extracted)
                counter.increment()

        if start > get_start_for_stream(sdk_client.client_customer_id, stream_name):
            LOGGER.info('updating bookmark: %s > %s', start,
                        get_start_for_stream(sdk_client.client_customer_id, stream_name))
            bookmarks.write_bookmark(STATE,
                                     state_key_name(sdk_client.client_customer_id, stream_name),
                                     'date',
                                     start.strftime(utils.DATETIME_FMT))
            singer.write_state(STATE)
        else:
            LOGGER.info('not updating bookmark: %s <= %s', start,
                        get_start_for_stream(sdk_client.client_customer_id, stream_name))

        LOGGER.info("Done syncing %s records for the %s report for customer_id %s on %s",
                    counter.value, stream_name, customer_id, start)

def write_state(self):
    singer.write_state(self.state)

def sync(self, start_date):
    for page in self.client.get(self.endpoint, params={}):
        for rec in page:
            yield rec

    singer.write_state(self.state)

def update_bookmark(self, last_updated):
    singer.bookmarks.write_bookmark(self.state,
                                    self.stream,
                                    self.stream_metadata.get('replication-key'),
                                    safe_to_iso8601(last_updated))
    singer.write_state(self.state)

def sync(self, state):
    bookmark = self.get_bookmark(state)
    tickets = self.client.tickets.incremental(start_time=bookmark)

    audits_stream = TicketAudits(self.client)
    metrics_stream = TicketMetrics(self.client)
    comments_stream = TicketComments(self.client)

    def emit_sub_stream_metrics(sub_stream):
        if sub_stream.is_selected():
            singer.metrics.log(LOGGER, Point(metric_type='counter',
                                             metric=singer.metrics.Metric.record_count,
                                             value=sub_stream.count,
                                             tags={'endpoint': sub_stream.stream.tap_stream_id}))
            sub_stream.count = 0

    if audits_stream.is_selected():
        LOGGER.info("Syncing ticket_audits per ticket...")

    for ticket in tickets:
        zendesk_metrics.capture('ticket')
        generated_timestamp_dt = datetime.datetime.utcfromtimestamp(
            ticket.generated_timestamp).replace(tzinfo=pytz.UTC)
        self.update_bookmark(state, utils.strftime(generated_timestamp_dt))

        ticket_dict = ticket.to_dict()
        ticket_dict.pop('fields')  # NB: Fields is a duplicate of custom_fields, remove before emitting
        should_yield = self._buffer_record((self.stream, ticket_dict))

        if audits_stream.is_selected():
            try:
                for audit in audits_stream.sync(ticket_dict["id"]):
                    zendesk_metrics.capture('ticket_audit')
                    self._buffer_record(audit)
            except RecordNotFoundException:
                LOGGER.warning("Unable to retrieve audits for ticket (ID: %s), "
                               "the Zendesk API returned a RecordNotFound error",
                               ticket_dict["id"])

        if metrics_stream.is_selected():
            try:
                for metric in metrics_stream.sync(ticket_dict["id"]):
                    zendesk_metrics.capture('ticket_metric')
                    self._buffer_record(metric)
            except RecordNotFoundException:
                LOGGER.warning("Unable to retrieve metrics for ticket (ID: %s), "
                               "the Zendesk API returned a RecordNotFound error",
                               ticket_dict["id"])

        if comments_stream.is_selected():
            try:
                # add ticket_id to ticket_comment so the comment can
                # be linked back to its corresponding ticket
                for comment in comments_stream.sync(ticket_dict["id"]):
                    zendesk_metrics.capture('ticket_comment')
                    comment[1].ticket_id = ticket_dict["id"]
                    self._buffer_record(comment)
            except RecordNotFoundException:
                LOGGER.warning("Unable to retrieve comments for ticket (ID: %s), "
                               "the Zendesk API returned a RecordNotFound error",
                               ticket_dict["id"])

        if should_yield:
            for rec in self._empty_buffer():
                yield rec
            emit_sub_stream_metrics(audits_stream)
            emit_sub_stream_metrics(metrics_stream)
            emit_sub_stream_metrics(comments_stream)
            singer.write_state(state)

    for rec in self._empty_buffer():
        yield rec
    emit_sub_stream_metrics(audits_stream)
    emit_sub_stream_metrics(metrics_stream)
    emit_sub_stream_metrics(comments_stream)
    singer.write_state(state)

def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions", schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))
    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP))
    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

    period_start = latest_start_date - TRAILING_DAYS
    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))
    logger.info("transactions: latest_updated_at from {}, disbursement_date from {}".format(
        latest_updated_at, latest_disbursement_date))
    logger.info("transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):
        end = min(end, period_end)

        data = braintree.Transaction.search(braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date;
            # set disbursement datetime to min if not found
            if row.disbursement_details is None:
                disbursement_date = datetime.min
            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(datetime.combine(
                    row.disbursement_details.disbursement_date, datetime.min.time()))

            # Is this more recent than our past stored value of updated_at
            # or of disbursement_date? Use >= due to non-monotonic
            # updated_at values and transactions disbursed at the same time.
            # Update our high water marks for updated_at and
            # disbursement_date in this run.
            if (updated_at >= latest_updated_at) or (disbursement_date >= latest_disbursement_date):
                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions", transformed, time_extracted=time_extracted)
                row_written_count += 1
            else:
                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(row_written_count, start, end))
        logger.info("transactions: Skipped {} records from {} - {}".format(row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(run_maximum_updated_at))
    logger.info("transactions: Complete. Last disbursement date: {}".format(run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at
    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)
    STATE['latest_disbursement_date'] = utils.strftime(latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))
    singer.write_state(STATE)

def do_sync(client, config, catalog, state):
    """
    Translate metadata into a set of metrics and dimensions and call out
    to sync to generate the required reports.
    """
    selected_streams = catalog.get_selected_streams(state)

    for stream in selected_streams:
        # Transform state for this report to the new format before proceeding
        state = clean_state_for_report(config, state, stream.tap_stream_id)

        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        singer.write_state(state)

        metrics = []
        dimensions = []
        mdata = metadata.to_map(stream.metadata)
        for field_path, field_mdata in mdata.items():
            if field_path == tuple():
                continue
            if field_mdata.get('inclusion') == 'unsupported':
                continue
            _, field_name = field_path
            if field_mdata.get('inclusion') == 'automatic' or \
               field_mdata.get('selected') or \
               (field_mdata.get('selected-by-default') and field_mdata.get('selected') is None):
                if field_mdata.get('behavior') == 'METRIC':
                    metrics.append(field_name)
                elif field_mdata.get('behavior') == 'DIMENSION':
                    dimensions.append(field_name)

        view_ids = get_view_ids(config)
        # NB: Resume from the previous view for this report, dropping all
        # views before it to keep streams moving forward
        current_view = state.get('currently_syncing_view')
        if current_view:
            if current_view in view_ids:
                view_not_current = functools.partial(lambda cv, v: v != cv, current_view)
                view_ids = list(itertools.dropwhile(view_not_current, view_ids))
            else:
                state.pop('currently_syncing_view', None)

        reports_per_view = [{"profile_id": view_id,
                             "name": stream.stream,
                             "id": stream.tap_stream_id,
                             "metrics": metrics,
                             "dimensions": dimensions}
                            for view_id in view_ids]

        end_date = get_end_date(config)

        schema = stream.schema.to_dict()
        singer.write_schema(stream.stream, schema, stream.key_properties)

        for report in reports_per_view:
            state['currently_syncing_view'] = report['profile_id']
            singer.write_state(state)

            is_historical_sync, start_date = get_start_date(config, report['profile_id'], state, report['id'])

            sync_report(client, schema, report, start_date, end_date, state, is_historical_sync)

        state.pop('currently_syncing_view', None)
        singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)

def write_bookmark(state, stream_name, value):
    if 'bookmarks' not in state:
        state['bookmarks'] = {}
    state['bookmarks'][stream_name] = value
    singer.write_state(state)

def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry["stream"]
    schema = catalog_entry["schema"]
    stream_alias = catalog_entry.get("stream_alias")
    catalog_metadata = metadata.to_map(catalog_entry["metadata"])
    replication_key = catalog_metadata.get((), {}).get("valid-replication-keys")[0]
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info("Syncing Salesforce data for stream %s", stream)

    for rec in sf.query(catalog_entry, state):
        replication_key_value = replication_key and singer_utils.strptime_with_tz(rec[replication_key])

        # Skip records at or before the bookmark we started from
        if replication_key_value and replication_key_value <= chunked_bookmark:
            continue

        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(
                stream=(stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time,
            ))

        if sf.pk_chunking:
            if (replication_key_value and replication_key_value <= start_time
                    and replication_key_value > chunked_bookmark):
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                state = singer.write_bookmark(
                    state,
                    catalog_entry["tap_stream_id"],
                    "JobHighestBookmarkSeen",
                    singer_utils.strftime(chunked_bookmark),
                )
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(
                state,
                catalog_entry["tap_stream_id"],
                replication_key,
                rec[replication_key],
            )
            singer.write_state(state)

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry["tap_stream_id"], "version", None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(
            state,
            catalog_entry["tap_stream_id"],
            replication_key,
            singer_utils.strftime(chunked_bookmark),
        )

def sync_in_app_events():
    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema, ["event_time", "event_name", "appsflyer_id"])

    # This order matters
    fieldnames = (
        "attributed_touch_type", "attributed_touch_time", "install_time", "event_time",
        "event_name", "event_value", "event_revenue", "event_revenue_currency",
        "event_revenue_usd", "event_source", "is_receipt_validated", "af_prt",
        "media_source", "af_channel", "af_keywords", "campaign", "af_c_id", "af_adset",
        "af_adset_id", "af_ad", "af_ad_id", "af_ad_type", "af_siteid", "af_sub_siteid",
        "af_sub1", "af_sub2", "af_sub3", "af_sub4", "af_sub5", "af_cost_model",
        "af_cost_value", "af_cost_currency", "contributor1_af_prt", "contributor1_media_source",
        "contributor1_campaign", "contributor1_touch_type", "contributor1_touch_time",
        "contributor2_af_prt", "contributor2_media_source", "contributor2_campaign",
        "contributor2_touch_type", "contributor2_touch_time", "contributor3_af_prt",
        "contributor3_media_source", "contributor3_campaign", "contributor3_touch_type",
        "contributor3_touch_time", "region", "country_code", "state", "city", "postal_code",
        "dma", "ip", "wifi", "operator", "carrier", "language", "appsflyer_id",
        "advertising_id", "idfa", "android_id", "customer_user_id", "imei", "idfv",
        "platform", "device_type", "os_version", "app_version", "sdk_version", "app_id",
        "app_name", "bundle_id", "is_retargeting", "retargeting_conversion_type",
        "af_attribution_lookback", "af_reengagement_window", "is_primary_attribution",
        "user_agent", "http_referrer", "original_url",
    )

    stop_time = datetime.datetime.now()
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while to_datetime <= stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)

        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader)  # Skip the heading row

        bookmark = from_datetime
        for i, row in enumerate(reader):
            record = xform(row, schema)
            singer.write_record("in_app_events", record)

            # AppsFlyer returns records in order of most recent first.
            if utils.strptime(record["event_time"]) > bookmark:
                bookmark = utils.strptime(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)

def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(state, catalog_entry["tap_stream_id"], "JobHighestBookmarkSeen") or \
        sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry["tap_stream_id"], "BatchIDs")

    start_time = singer_utils.now()
    stream = catalog_entry["stream"]
    stream_alias = catalog_entry.get("stream_alias")
    catalog_metadata = metadata.to_map(catalog_entry.get("metadata"))
    replication_key = catalog_metadata.get((), {}).get("valid-replication-keys")[0]
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry["schema"]

    if not bulk.job_exists(job_id):
        LOGGER.info("Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state.")
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(
                        stream=(stream_alias or stream),
                        record=rec,
                        version=stream_version,
                        time_extracted=start_time,
                    ))

                # Update bookmark if necessary
                replication_key_value = (replication_key and
                                         singer_utils.strptime_with_tz(rec[replication_key]))
                if (replication_key_value and replication_key_value <= start_time
                        and replication_key_value > current_bookmark):
                    current_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                    state = singer.write_bookmark(
                        state,
                        catalog_entry["tap_stream_id"],
                        "JobHighestBookmarkSeen",
                        singer_utils.strftime(current_bookmark),
                    )

        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.", batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter

def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)

                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break

            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp',
                                          utils.strftime(datetime.datetime.fromtimestamp(
                                              (start_ts / 1000), datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = GAClient(config)

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']

        if state and stream_id in state:
            client.start_date = state[stream_id]

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = update_key_properties(stream_schema, stream_metadata)

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            try:
                singer.write_schema(stream_id, stream_schema, key_properties)

                report_definition = ReportsHelper.get_report_definition(stream)
                for page, date in client.process_stream(report_definition):
                    singer.write_records(stream_id, page)
                    if date is not None:
                        # we need to update all dates that are not "golden", even if it's the start date
                        singer.write_state({stream_id: date})
            except TapGaInvalidArgumentError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to invalid report definition.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaRateLimitError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to Rate Limit Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaQuotaExceededError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to Quota Exceeded Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaAuthenticationError as e:
                LOGGER.error("Stopping execution while processing '{}' due to Authentication Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
            except TapGaUnknownError as e:
                LOGGER.error("Stopping execution while processing '{}' due to Unknown Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return