def sync_ad_groups(client, account_id, campaign_ids, selected_streams):
    ad_group_ids = []
    for campaign_id in campaign_ids:
        response = client.GetAdGroupsByCampaignId(CampaignId=campaign_id)
        response_dict = sobject_to_dict(response)

        if 'AdGroup' in response_dict:
            ad_groups = response_dict['AdGroup']

            if 'ad_groups' in selected_streams:
                LOGGER.info('Syncing AdGroups for Account: %s, Campaign: %s',
                            account_id, campaign_id)
                selected_fields = get_selected_fields(selected_streams['ad_groups'])
                singer.write_schema(
                    'ad_groups', get_core_schema(client, 'AdGroup'), ['Id'])
                with metrics.record_counter('ad_groups') as counter:
                    singer.write_records(
                        'ad_groups',
                        filter_selected_fields_many(selected_fields, ad_groups))
                    counter.increment(len(ad_groups))

            # Collect IDs regardless of stream selection; the caller needs them
            ad_group_ids += [ad_group['Id'] for ad_group in ad_groups]
    return ad_group_ids
def sync_data(self):
    table = self.__class__.TABLE
    selector = FuelSDK.ET_Email

    search_filter = None
    retrieve_all_since = get_last_record_value_for_table(self.state, table)

    # Only pull records modified since the last saved bookmark, if any
    if retrieve_all_since is not None:
        search_filter = {
            'Property': 'ModifiedDate',
            'SimpleOperator': 'greaterThan',
            'Value': retrieve_all_since
        }

    stream = request('Email', selector, self.auth_stub, search_filter)

    for email in stream:
        email = self.filter_keys_and_parse(email)

        self.state = incorporate(self.state, table,
                                 'ModifiedDate', email.get('ModifiedDate'))

        singer.write_records(table, [email])

    save_state(self.state)
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_name, to_write)
        except BrokenPipeError as bpe:
            logger.error(
                f'Pipe to loader broke after {records_synced} records were written from {s3_file}: '
                f'troubled line was {row}')
            raise bpe

        records_synced += 1

    return records_synced
def sync_data(self):
    table = self.TABLE
    LOGGER.info('Syncing data for entity {}'.format(table))

    url = self.get_url(self.api_path)
    body = self.get_body()

    index = 0
    count = 5000

    while True:
        LOGGER.info('Syncing {} rows from index {}'.format(count, index))

        params = self.get_params(index, count)
        result = self.client.make_request(
            url, self.API_METHOD, params=params, body=body)
        data = self.get_stream_data(result)

        if len(data) == 0:
            break
        else:
            index += count

        with singer.metrics.record_counter(endpoint=table) as counter:
            for obj in data:
                singer.write_records(table, [obj])
                counter.increment()

    return self.state
def do_sync(base, start_date, api_key):
    logger.info('Replicating exchange rate data from fixer.io starting from {}'.format(start_date))

    singer.write_schema('exchange_rate', schema, 'date')

    state = {'start_date': start_date}
    next_date = start_date

    try:
        while True:
            response = request(base_url + next_date,
                               {'base': base, 'access_key': api_key})
            payload = response.json()

            if datetime.strptime(next_date, DATE_FORMAT) > datetime.utcnow():
                break
            else:
                singer.write_records('exchange_rate', [parse_response(payload)])
                state = {'start_date': next_date}
                next_date = (datetime.strptime(next_date, DATE_FORMAT) +
                             timedelta(days=1)).strftime(DATE_FORMAT)

    except requests.exceptions.RequestException as e:
        logger.fatal('Error on ' + e.request.url +
                     '; received status ' + str(e.response.status_code) +
                     ': ' + e.response.text)
        singer.write_state(state)
        sys.exit(-1)

    singer.write_state(state)
    logger.info('Tap exiting normally')
def sync(config, state, catalog): """ Sync data from tap source """ # Loop over selected streams in catalog for stream in catalog.get_selected_streams(state): LOGGER.info("Syncing stream:" + stream.tap_stream_id) bookmark_column = stream.replication_key is_sorted = False singer.write_schema( stream_name=stream.tap_stream_id, schema=stream.schema.to_dict(), key_properties=stream.key_properties, ) max_bookmark = None for row in tap_data(config=config, stream=stream): transformed = transform(row) singer.write_records(stream.tap_stream_id, [transformed]) if bookmark_column: if is_sorted: # update bookmark to latest value singer.write_state( {stream.tap_stream_id: row[bookmark_column]}) else: # if data unsorted, save max value until end of writes max_bookmark = max(max_bookmark, row[bookmark_column]) if bookmark_column and not is_sorted: singer.write_state({stream.tap_stream_id: max_bookmark}) return
def write_metadados(self):
    schema = {
        'properties': {
            "id": {"type": "integer"},
            "nome": {"type": "string"},
            "URL": {"type": "string"},
            "pesquisa": {"type": "string"},
            "assunto": {"type": "string"},
            "periodicidade_frequencia": {"type": "string"},
            "periodicidade_inicio": {"type": "integer"},
            "periodicidade_fim": {"type": "integer"},
            "timestamp": {"type": "string", "format": "date-time"},
        }
    }

    singer.write_schema('metadados_agregados', schema, ['id'])
    singer.write_records('metadados_agregados', self.metadados)
def sync_data(self, return_ids=False):
    table = self.TABLE

    response = self.client.make_request(self.get_url(), 'GET')

    # In development, the fastest way to decrease iteration time is to
    # slice the following data down to something very small like 10.
    #
    # all_technicians = self.get_stream_data(response)[:10]
    all_technicians = self.get_stream_data(response)

    if not return_ids:
        with singer.metrics.record_counter(endpoint=table) as counter:
            for obj in all_technicians:
                singer.write_records(table, [obj])
                counter.increment()

    technician_ids = sorted(
        [technician.get('nodeid') for technician in all_technicians])

    if return_ids:
        return technician_ids

    for substream in self.substreams:
        substream.state = self.state
        LOGGER.info("Syncing {}".format(substream.TABLE))
        substream.sync_data(parent_ids=technician_ids)
        self.state = substream.state
def sync_data(self):
    table = self.TABLE
    LOGGER.info('Syncing data for {}'.format(table))

    url = self.get_url()
    params = self.get_params()
    body = self.get_body()

    while True:
        response = self.client.make_request(
            url, self.API_METHOD, params=params, body=body)
        transformed = self.get_stream_data(response)

        with singer.metrics.record_counter(endpoint=table) as counter:
            singer.write_records(table, transformed)
            counter.increment(len(transformed))

        page_number = body['page_number']
        LOGGER.info('Synced page {} for {}'.format(page_number, table))

        if len(transformed) == 0:
            break

        body['page_number'] += 1
        self.save_state(transformed[-1])

    return self.state
def sync_data(self):
    table = self.__class__.TABLE
    selector = FuelSDK.ET_ContentArea

    search_filter = None
    retrieve_all_since = get_last_record_value_for_table(self.state, table)

    if retrieve_all_since is not None:
        search_filter = {
            'Property': 'ModifiedDate',
            'SimpleOperator': 'greaterThan',
            'Value': retrieve_all_since
        }

    stream = request('ContentAreaDataAccessObject', selector,
                     self.auth_stub, search_filter)

    for content_area in stream:
        content_area = self.filter_keys_and_parse(content_area)

        self.state = incorporate(self.state, table,
                                 'ModifiedDate', content_area.get('ModifiedDate'))

        singer.write_records(table, [content_area])

    save_state(self.state)
def sync(config, state, catalog): """ Sync data from tap source """ # Loop over selected streams in catalog for stream in catalog.get_selected_streams(state): LOGGER.info('Syncing stream: %s', stream.tap_stream_id) bookmark_column = stream.replication_key is_sorted = True # TODO: indicate whether data is sorted ascending on bookmark value singer.write_schema(stream_name=stream.tap_stream_id, schema=stream.schema.to_dict(), key_properties=stream.key_properties) max_bookmark = None for row in tap_data(): # TODO: place type conversions or transformations here # write one or more rows to the stream: singer.write_records(stream.tap_stream_id, [row]) if bookmark_column: if is_sorted: # update bookmark to latest value singer.write_state( {stream.tap_stream_id: row[bookmark_column]}) else: # if data unsorted, save max value until end of writes max_bookmark = max(max_bookmark, row[bookmark_column]) if bookmark_column and not is_sorted: singer.write_state({stream.tap_stream_id: max_bookmark}) return
def sync(config, state, catalog):
    """ Sync data from tap source """
    LOGGER = singer.get_logger()
    ref = get_ref()

    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream: " + stream.tap_stream_id)
        bookmark_column = stream.replication_key

        singer.write_schema(stream_name=stream.tap_stream_id,
                            schema=stream.schema.to_dict(),
                            key_properties=stream.key_properties)

        s_date = singer.get_bookmark(state, stream.tap_stream_id, "startdate")
        e_date = singer.get_bookmark(state, stream.tap_stream_id, "lastrun")

        # Number of days between the bookmarks; always request at least one day
        offset = (pd.to_datetime(e_date) - pd.to_datetime(s_date)).days
        if offset <= 0:
            offset = 1

        tokens = {"accessTokenKey": config["token_key"],
                  "accessTokenSecret": config["token_secret"]}
        query = {"datefiltermode": config["datefiltermode"],
                 "filterorder": config["filterorder"],
                 "dateinterval": config["interval"],
                 "dateintervaloffset": offset}

        response = make_request(config, stream, tokens, query)
        tap_data, state = append_data(response, state, config, stream,
                                      tokens, query)

        for row in tap_data:
            # write one or more rows to the stream:
            singer.write_records(stream.tap_stream_id, [row])

        # Advance the bookmarks and emit state for this stream
        new_offset = query["dateintervaloffset"] - config["interval"]
        new_date = (datetime.now() - timedelta(new_offset)).strftime("%Y-%m-%d")
        state = write_bookmark(state, stream.tap_stream_id, "startdate", new_date)
        state = write_bookmark(state, stream.tap_stream_id, "lastrun",
                               datetime.now().strftime("%Y-%m-%d"))
        singer.write_state(state)  # write_state returns None, so don't rebind state
    return
def get_incremental_pull_additional_properties(stream, endpoint, state, api_key, start_date):
    latest_event_time = get_starting_point_additional_properties(
        stream, state, start_date)

    with metrics.record_counter(stream['stream']) as counter:
        url = '{}{}/export'.format(endpoint, stream['tap_stream_id'])

        for response in get_all_additional_properties_using_next(
                stream['stream'], url, api_key, latest_event_time):
            events = response.json()

            if events:
                for result in events.get('results'):
                    result['id'] = events.get('id')
                    result['data'][0]['segment'] = result['segment']
                    result['data'][0]['metric'] = events['metric']
                    counter.increment(1)

                    # Build a stable surrogate key from segment + date
                    hash_object = hashlib.sha224(
                        (result['segment'] + result['data'][0]['date']).encode('utf-8'))
                    result['data'][0]['id'] = hash_object.hexdigest()

                    singer.write_records(stream['stream'], result.get('data'))

    update_state(state, stream['stream'],
                 datetime.datetime.today().strftime(DATETIME_FMT))
    singer.write_state(state)

    return state
def write_file(target_filename, table_spec, schema):
    LOGGER.info('Syncing file "{}".'.format(target_filename))

    target_uri = table_spec['path'] + '/' + target_filename

    iterator = tap_spreadsheets_anywhere.format_handler.get_row_iterator(
        table_spec, target_uri)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_smart_source_bucket': table_spec['path'],
            '_smart_source_file': target_filename,

            # index zero, +1 for header row
            '_smart_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_spec['name'], to_write)
        except BrokenPipeError as bpe:
            LOGGER.error(
                f'Pipe to loader broke after {records_synced} records were written from {target_filename}: '
                f'troubled line was {to_write[0]}')
            raise bpe

        records_synced += 1

    return records_synced
def sync_records(self, request_config, end_date=None):
    table = self.TABLE

    raw_orders = self.client.fetch_orders(request_config)
    orders = self.get_stream_data(raw_orders)

    with singer.metrics.record_counter(endpoint=table) as counter:
        singer.write_records(table, orders)
        counter.increment(len(orders))

    if len(orders) > 0:
        state_key = 'LastUpdateDate'
        last_order = orders[-1]
        order_time = last_order[state_key]

        self.state = incorporate(self.state, self.TABLE, state_key, order_time)
        save_state(self.state)

        parsed = parse(order_time).date()
        if end_date is not None and parsed > end_date:
            LOGGER.info(
                "Synced past the specified end_date ({}) - quitting".format(parsed))
            return None, orders

    next_token = raw_orders.parsed.get('NextToken', {}).get('value')

    return next_token, orders
def get_characters():
    marvel_limit = 100
    offset = 0

    # Format the time and build the API-key hash the Marvel API requires
    t = time.strftime("%Y%d%m%H%M%S")
    m = hashlib.md5()
    m.update("{}{}{}".format(t, CONFIG['private_key'],
                             CONFIG['public_key']).encode("utf-8"))
    digest = m.hexdigest()

    # Now let's get Singer going
    schema = {
        'type': 'object',
        'properties': {
            'id': {'type': 'integer'},
            'name': {'type': 'string'},
            'modified': {'type': 'string', 'format': 'date-time'}
        }
    }
    singer.write_schema('characters', schema, 'id')

    # Call the API and get records until there aren't any more
    while True:
        response = requests.get(
            'https://gateway.marvel.com:443/v1/public/characters'
            '?orderBy=modified&apikey={}&ts={}&hash={}&limit={}&offset={}'.format(
                CONFIG['public_key'], t, digest, marvel_limit, offset))
        body = response.json()['data']
        singer.write_records('characters', body['results'])
        offset = offset + marvel_limit
        if body['count'] < marvel_limit:
            break
def sync_users(config, state):
    stream_id = 'users'
    api_key = config['api_key']

    sc = SlingClient(api_key)
    raw_users = sc.make_request('users')

    user_records = []
    for user in raw_users:
        record = {
            'id': str(user.get('id')) if user.get('id') else None,
            'type': user.get('type'),
            'name': user.get('name'),
            'last_name': user.get('lastname'),
            'avatar': user.get('avatar'),
            'email': user.get('email'),
            'timezone': user.get('timezone'),
            'hours_cap': user.get('hoursCap'),
            'active': user.get('active'),
            'deactivated_at': user.get('deactivatedAt'),
        }
        user_records.append(record)

    singer.write_records(stream_id, user_records)

    return state
def sync_data(self):
    table = self.TABLE
    page = 1

    LOGGER.info('Syncing data for entity {} (page={})'.format(table, page))

    url = "{}{}".format(self.client.base_url, self.api_path)

    while True:
        params = self.get_params(page=page)
        body = self.get_body()

        result = self.client.make_request(
            url, self.API_METHOD, params=params, body=body)
        data = self.get_stream_data(result)

        with singer.metrics.record_counter(endpoint=table) as counter:
            for obj in data:
                singer.write_records(table, [obj])
                counter.increment()

        paging = result['paging']
        if page >= paging['totalPages']:
            break
        page += 1

    return self.state
def sync_data(self):
    table = self.__class__.TABLE
    selector = FuelSDK.ET_List

    search_filter = None
    retrieve_all_since = get_last_record_value_for_table(
        self.state, table, self.config.get('start_date'))

    if retrieve_all_since is not None:
        search_filter = {
            'Property': 'ModifiedDate',
            'SimpleOperator': 'greaterThan',
            'Value': retrieve_all_since
        }

    stream = request('List', selector, self.auth_stub, search_filter)

    for _list in stream:
        _list = self.filter_keys_and_parse(_list)

        self.state = incorporate(self.state, table,
                                 'ModifiedDate', _list.get('ModifiedDate'))

        singer.write_records(table, [_list])

    save_state(self.state)
def sync_data(self):
    cursor = request('Campaign', FuelSDK.ET_Campaign, self.auth_stub)

    for campaign in cursor:
        campaign = self.filter_keys_and_parse(campaign)
        singer.write_records(self.__class__.TABLE, [campaign])
def pull_subscribers_batch(self, subscriber_keys):
    if not subscriber_keys:
        return

    table = self.__class__.TABLE
    _filter = {}

    if len(subscriber_keys) == 1:
        _filter = {
            'Property': 'SubscriberKey',
            'SimpleOperator': 'equals',
            'Value': subscriber_keys[0]
        }
    elif len(subscriber_keys) > 1:
        _filter = {
            'Property': 'SubscriberKey',
            'SimpleOperator': 'IN',
            'Value': subscriber_keys
        }
    else:
        LOGGER.info('Got empty set of subscriber keys, moving on')
        return

    stream = request('Subscriber', FuelSDK.ET_Subscriber,
                     self.auth_stub, _filter)

    for subscriber in stream:
        subscriber = self.filter_keys_and_parse(subscriber)
        subscriber = self.remove_sensitive_data(subscriber)
        singer.write_records(table, [subscriber])
def sync(client, config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        stream_id = stream.tap_stream_id
        logger.info("Syncing stream: " + stream_id)

        singer.write_schema(
            stream_name=stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        yesterday = datetime.now() - timedelta(1)
        day = state.get(stream_id) or config.get('start_date')
        day = datetime.strptime(day, DATE_FORMAT) if day else yesterday

        while day <= yesterday:
            tap_data = client.request_report(stream, day)
            singer.write_records(stream_id, tap_data)

            state[stream_id] = day.strftime(DATE_FORMAT)
            singer.write_state(state)

            day += timedelta(1)
    return
def sync(config, state, catalog): """ Sync data from tap source """ # Loop over selected streams in catalog for stream in catalog.get_selected_streams(state): LOGGER.info("Syncing stream:" + stream.tap_stream_id) bookmark_column = stream.replication_key is_sorted = True # TODO: indicate whether data is sorted ascending on bookmark value singer.write_schema( stream_name=stream.tap_stream_id, schema=stream.schema.to_dict(), key_properties=stream.key_properties, ) # TODO: delete and replace this inline function with your own data retrieval process: tap_data = lambda: [{"id": x, "name": f'row${x}'} for x in range(1000)] max_bookmark = None for row in tap_data(): # TODO: place type conversions or transformations here # write one or more rows to the stream: singer.write_records(stream.tap_stream_id, [row]) if bookmark_column: if is_sorted: # update bookmark to latest value singer.write_state( {stream.tap_stream_id: row[bookmark_column]}) else: # if data unsorted, save max value until end of writes max_bookmark = max(max_bookmark, row[bookmark_column]) if bookmark_column and not is_sorted: singer.write_state({stream.tap_stream_id: max_bookmark}) return
def sync_paginated(self, url, params=None, async_session=None):
    table = self.TABLE
    if params is None:
        params = {}  # avoid a TypeError when pagination sets an offset below

    _next = True
    page = 1

    all_resources = []
    transformer = singer.Transformer(
        singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)

    while _next is not None:
        result = self.client.make_request(url, self.API_METHOD, params=params)
        _next = result.get('next')

        data = self.get_stream_data(result['data'], transformer)

        with singer.metrics.record_counter(endpoint=table) as counter:
            singer.write_records(table, data)
            counter.increment(len(data))
            all_resources.extend(data)

        if _next:
            params['offset'] = _next

        LOGGER.info('Synced page {} for {}'.format(page, self.TABLE))
        page += 1

    transformer.log_warning()
    return all_resources
def get_all_commits(repo_path, state):
    if 'commits' in state and state['commits'] is not None:
        query_string = '?since={}'.format(state['commits'])
    else:
        query_string = ''

    latest_commit_time = None

    with metrics.record_counter('commits') as counter:
        for response in authed_get_all_pages(
                'commits',
                'https://api.github.com/repos/{}/commits{}'.format(
                    repo_path, query_string)):
            commits = response.json()
            for commit in commits:
                counter.increment()
                commit.pop('author', None)
                commit.pop('committer', None)
            singer.write_records('commits', commits)

            # The API returns newest commits first, so the first commit of
            # the first non-empty page carries the new bookmark; guard
            # against an empty page before indexing into it.
            if commits and not latest_commit_time:
                latest_commit_time = commits[0]['commit']['committer']['date']

    state['commits'] = latest_commit_time
    return state
def sync_accounts_stream(account_ids, catalog_item):
    selected_fields = get_selected_fields(catalog_item)
    accounts = []

    LOGGER.info('Initializing CustomerManagementService client - Loading WSDL')
    client = CustomServiceClient('CustomerManagementService')

    account_schema = get_core_schema(client, 'AdvertiserAccount')
    singer.write_schema('accounts', account_schema, ['Id'])

    # Loop over the multiple account_ids
    for account_id in account_ids:
        client = create_sdk_client('CustomerManagementService', account_id)

        # Get account data
        response = client.GetAccount(AccountId=account_id)
        accounts.append(sobject_to_dict(response))

    accounts_bookmark = singer.get_bookmark(STATE, 'accounts', 'last_record')
    if accounts_bookmark:
        accounts = [
            account for account in accounts
            if account is not None
            and account['LastModifiedTime'] >= accounts_bookmark
        ]

    # Guard against an empty list so max() below cannot raise
    if not accounts:
        return

    max_accounts_last_modified = max(
        account['LastModifiedTime'] for account in accounts)

    with metrics.record_counter('accounts') as counter:
        # Write only selected fields
        singer.write_records(
            'accounts', filter_selected_fields_many(selected_fields, accounts))
        counter.increment(len(accounts))

    singer.write_bookmark(STATE, 'accounts', 'last_record',
                          max_accounts_last_modified)
    singer.write_state(STATE)
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # index zero, +1 for header row
            '_s3_source_lineno': records_synced + 2
        }

        to_write = [{**conversion.convert_row(row, schema), **metadata}]

        singer.write_records(table_name, to_write)
        records_synced += 1

    return records_synced
def sync(config, state, catalog): """ Sync data from tap source """ geo_source = GeoSource(path=config['path'], config=config) for stream in catalog.get_selected_streams(state): # Fetch the layer from the geo source layer = geo_source.layers[stream.tap_stream_id] # Log some info about the stream LOGGER.info( f'Syncing stream: {stream.tap_stream_id} | Transformation: {str(layer.should_transform)}') singer.write_schema( stream_name=stream.tap_stream_id, schema=stream.schema.to_dict(), key_properties=stream.key_properties, ) with record_counter(log_interval=10) as counter: for row in layer.features(): # Write a row to the stream singer.write_records(stream.tap_stream_id, [row]) # Log the records counter.increment()
def sync_data(self):
    table = self.TABLE
    LOGGER.info('Syncing data for entity {}'.format(table))

    domain = self.get_domain()

    url = ('https://{domain}{api_path}'.format(
        domain=domain, api_path=self.api_path))

    params = self.get_params()
    body = self.get_body()

    result = self.client.make_request(
        url, self.API_METHOD, params=params, body=body)

    data = self.get_stream_data(result)

    with singer.metrics.record_counter(endpoint=table) as counter:
        for obj in data:
            singer.write_records(table, [obj])
            counter.increment()

    return self.state
def sync_data(self):
    table = self.__class__.TABLE

    endpoints = {
        'sent': FuelSDK.ET_SentEvent,
        'click': FuelSDK.ET_ClickEvent,
        'open': FuelSDK.ET_OpenEvent,
        'bounce': FuelSDK.ET_BounceEvent,
        'unsub': FuelSDK.ET_UnsubEvent
    }

    for event_name, selector in endpoints.items():
        search_filter = None
        start = get_last_record_value_for_table(self.state, event_name)

        if start is None:
            start = self.config.get('start_date')

        if start is None:
            raise RuntimeError('start_date not defined!')

        pagination_unit = self.config.get(
            'pagination__{}_interval_unit'.format(event_name), 'minutes')
        pagination_quantity = self.config.get(
            'pagination__{}_interval_quantity'.format(event_name), 10)

        unit = {pagination_unit: int(pagination_quantity)}

        end = increment_date(start, unit)

        while before_now(start):
            LOGGER.info("Fetching {} from {} to {}"
                        .format(event_name, start, end))

            search_filter = get_date_page('EventDate', start, unit)

            stream = request(event_name, selector, self.auth_stub, search_filter)

            for event in stream:
                event = self.filter_keys_and_parse(event)

                self.state = incorporate(self.state, event_name,
                                         'EventDate', event.get('EventDate'))

                singer.write_records(table, [event])

            self.state = incorporate(self.state, event_name,
                                     'EventDate', start)
            save_state(self.state)

            start = end
            end = increment_date(start, unit)
def sync_data_for_date(self, date, interval):
    LOGGER.info('Syncing data for {}'.format(date.isoformat()))
    table = self.TABLE

    updated_after = date
    updated_before = updated_after + interval

    cursor = None
    has_data = True
    page = 1

    while has_data:
        url = 'https://{}.uservoice.com{}'.format(
            self.config.get('subdomain'), self.API_PATH)

        result = self.client.fetch_data(
            url, updated_after, updated_before, cursor, endpoint=table)

        cursor = result.get('pagination', {}).get('cursor')
        total_pages = result.get('pagination', {}).get('total_pages')

        data = self.get_stream_data(result)
        has_data = (data is not None) and (len(data) > 0)

        if has_data:
            with singer.metrics.record_counter(endpoint=table) as counter:
                for obj in data:
                    singer.write_records(table, [self.filter_keys(obj)])
                    counter.increment()

                    self.state = incorporate(self.state, table,
                                             'updated_at', obj.get('updated_at'))

            if page == total_pages:
                LOGGER.info('Reached end of stream, moving on.')
                has_data = False
            elif cursor is None:
                raise RuntimeError(('Found data, but there is no '
                                    'continuation cursor! (Expected '
                                    '{} pages, found {})').format(
                                        total_pages, page))
        else:
            LOGGER.info('No data returned, moving on.')

        page = page + 1

    self.state = incorporate(self.state, table, 'updated_at', date.isoformat())
    save_state(self.state)