def call_incremental_stream(self, stream):
    start_date, end_date = self.initialize_dates()
    request_config = {
        'url': self.url,
        'headers': self.build_headers(),
        'run': True,
        'data': self.build_body(stream, start_date)
    }
    while request_config['run']:
        res = self.client.make_request(request_config, method='POST')
        if res.status_code != 200:
            raise AttributeError(f'Received status_code {res.status_code}')
        records = self.convert_to_json(res)
        if len(records) == self.MAX_RECORDS:
            raise ValueError('Number of records returned is equal to '
                             "hudson's limit. This means that we will be "
                             'missing data.')
        transform_write_and_count(stream, records)
        start_date = start_date.add(days=1)
        if start_date == end_date:
            request_config['run'] = False
        else:
            request_config['data'] = self.build_body(stream, start_date)
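# A minimal standalone sketch of the day-window setup that
# call_incremental_stream above assumes: initialize_dates is expected to
# return a (start_date, end_date) pair supporting .add(days=1). The
# 'start_date' config key and the "today" end bound are assumptions for
# illustration, not the source's actual implementation.
import pendulum

def initialize_dates(config):
    """Return a (start, end) pendulum date pair (assumed shape)."""
    start_date = pendulum.parse(config['start_date'])
    end_date = pendulum.today()  # assumption: sync up to the current day
    return start_date, end_date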
def call_incremental_stream(self, stream): """Method to call all incremental streams""" last_updated = format_last_updated_for_request( stream.update_and_return_bookmark(), self.replication_key_format) request_config = { "url": self.url, "headers": self.build_headers(), "params": self.build_params(stream, last_updated=last_updated), "run": True } while request_config['run']: res = self.client.make_request(request_config) if res.status_code != 200: raise AttributeError(f'Received status_code {res.status_code}') records = res.json() transform_write_and_count(stream, records) last_updated = self.get_latest_for_next_call( records, stream.stream_metadata['replication-key'], last_updated) stream.update_bookmark(last_updated) request_config = self.update_for_next_call(res, request_config) return last_updated
def call_incremental_stream(self, stream): """ Method to call all incremental synced streams """ last_updated = format_last_updated_for_request( stream.update_and_return_bookmark(), self.replication_key_format) request_config = { 'url': self.generate_api_url(stream), 'headers': self.build_headers(), 'params': self.build_initial_params(stream, last_updated), 'run': True, 'api_key': self.api_key } LOGGER.info("Extracting %s since %s" % (stream, last_updated)) self.total_contacts = 0 while request_config['run']: LOGGER.info("Params: %s" % (request_config['params'])) res = self.client.make_request(request_config) if res.status_code != 200: raise AttributeError('Received status code {}'.format( res.status_code)) root = ElementTree.fromstring(res.text) records = [] for child in root: xml_dict = {} for grandchild in child: xml_dict[grandchild.tag] = grandchild.text records.append(xml_dict) self.total_contacts += len(records) LOGGER.info('Total Records is {}'.format(self.total_contacts)) if self.should_write(records, stream, last_updated): transform_write_and_count(stream, records) # last_updated = self.get_lastest_update( # records, # last_updated # ) stream.update_bookmark(last_updated) request_config = self.update_for_next_call(res, request_config, stream, records) formated_update = self.format_updated( request_config['params']['updated_before']) LOGGER.info('setting last updated to {}'.format(formated_update)) return formated_update
def call_stream(self, stream, request_config):
    res = self.client.make_request(request_config)
    records = res.json()
    if not records:
        records = []
    elif not isinstance(records, list):
        # subsequent methods are expecting a list
        records = [records]
    transform_write_and_count(stream, records)
def call_stream(self, stream, request_config):
    for location in self.client.config['locations']:
        request_config['params'] = self.build_params(
            location[0], location[1], location[2])
        res = self.client.make_request(request_config)
        records = res.json()
        if not records:
            records = []
        elif not isinstance(records, list):
            # subsequent methods are expecting a list
            records = [records]
        transform_write_and_count(stream, records)
def call_stream(self, stream, club_id, request_config, curr_upper_bound=None):
    """
    Utility method shared by incremental and full streams;
    handles API calls and record writes
    """
    while request_config['run']:
        res = self.client.make_request(request_config)
        payload = res.json()  # parse once; the body is reused below
        if stream.is_incremental:
            LOGGER.info(
                'Received {n} records on page {i} for club {c}'.format(
                    n=payload['status']['count'],
                    i=payload['request']['page'],
                    c=club_id))
        else:
            LOGGER.info('Received {n} records for club {c}'.format(
                n=payload['status']['count'], c=club_id))
        records = payload.get(stream.stream_metadata['response-key'])
        if not records:
            records = []
        elif not isinstance(records, list):
            # subsequent methods are expecting a list
            records = [records]
        # for endpoints that do not provide club_id
        if stream.stream in STREAMS_TO_HYDRATE:
            records = self.hydrate_record_with_club_id(records, club_id)
        transform_write_and_count(stream, records)
        if stream.is_incremental:
            LOGGER.info(
                '{s} bookmark for club {c} is currently {b}'.format(
                    s=stream.stream, c=club_id, b=curr_upper_bound))
        request_config, curr_upper_bound = self.update_for_next_call(
            int(payload['status']['count']), request_config, stream,
            curr_upper_bound)
    return curr_upper_bound
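# A simplified standalone sketch of count-based pagination along the lines
# of the update_for_next_call calls above: keep advancing the page while
# full pages come back, stop on the first short page. The 'page' param and
# PAGE_SIZE are assumptions; the source's helper also threads stream and
# bookmark state, which this sketch omits.
PAGE_SIZE = 100

def update_for_next_call(records_received, request_config):
    """Advance to the next page, or stop when a short page comes back."""
    if records_received < PAGE_SIZE:
        request_config['run'] = False
    else:
        request_config['params']['page'] += 1
    return request_config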
def call_full_stream(self, stream): """Method to call all fully synched streams""" request_config = { "url": self.url, "headers": self.build_headers(), "params": self.build_params(stream), "run": True } while request_config['run']: res = self.client.make_request(request_config) if res.status_code != 200: raise AttributeError(f'Received status_code {res.status_code}') records = res.json() records = self._add_pdf_s3_link(records, request_config) transform_write_and_count(stream, records) request_config = self.update_for_next_call(res, request_config)
def call_incremental_stream(self, stream): """ Method to call all incremental synced streams """ last_updated = format_last_updated_for_request( stream.update_and_return_bookmark(), self.replication_key_format ) request_config = { 'url': self.generate_api_url(stream), "headers": self.build_headers(), 'params': self.build_initial_params(last_updated), 'run': True } LOGGER.info("Extracting stream {s} since {d}".format(s=stream, d=last_updated)) while request_config['run']: res = self.client.make_request(request_config) records = res.json().get('items') LOGGER.info('Received {n} records'.format(n=len(records))) transform_write_and_count(stream, records) last_updated = self.get_latest_record_date(records) LOGGER.info('Setting last updated for stream {s} to {d}'.format( s=stream, d=last_updated )) stream.update_bookmark(last_updated) request_config = self.update_for_next_call(len(records), request_config) return last_updated
def call_full_stream(self, stream): """ Method to call all fully synced streams """ request_config = { 'url': self.generate_api_url(stream), 'headers': self.build_headers(), 'params': self.build_initial_params(), 'run': True } LOGGER.info("Extracting {}".format(stream)) while request_config['run']: res = self.client.make_request(request_config) records = res.json().get('items') LOGGER.info('Received {n} records'.format(n=len(records))) transform_write_and_count(stream, records) request_config = self.update_for_next_call(len(records), request_config)
def call_full_stream(self, stream): """ Method to call all fully synced streams """ stream_name = stream.stream event_name = EVENT_TYPES.get(stream_name) start_date = pendulum.parse(self.config['full_table_start_date']) end_date = start_date.add(months=1) LOGGER.info("Extracting %s since %s." % (stream_name, start_date)) requests = [(start_date, end_date)] while requests: request_filters = requests.pop(0) request_start = request_filters[0] request_end = request_filters[1] request_start_str = request_start.to_datetime_string() request_end_str = request_end.to_datetime_string() LOGGER.info('Requesting export from %s to %s.' % (request_start_str, request_end_str)) sync_uri = self.client.request_bulk_export(stream, request_start_str, request_end_str, event_name) offset = 0 run = True while run: records, has_more, total_records = self.client.fetch_bulk_export_records( sync_uri, offset, MAX_RECORDS_RETURNED, run) if total_records >= EXPORT_LIMIT: LOGGER.info( 'Export exceeds 5M record limit. Splitting into multiple requests.' ) new_end_date = request_start + ( (request_end - request_start) / 2) requests.append((request_start, new_end_date)) requests.append((new_end_date, request_end)) run = False elif total_records == 0: LOGGER.info('No records found between %s and %s.' % (request_start, request_end)) run = False else: transform_write_and_count(stream, records) if not has_more: run = False LOGGER.info( 'Completed fetching records. Fetched %s records.' % total_records) else: offset = offset + MAX_RECORDS_RETURNED LOGGER.info( 'Fetched %s of %s records. Fetching next set of records.' % (offset, total_records)) if request_end < pendulum.now(): new_end_date = request_end.add(months=1) requests.append((request_end, new_end_date))
def call_incremental_stream(self, stream): """ Method to call incrementally synced streams TODO: only for bulk api, update for rest api too Args: stream (cls) Returns: last_record_date (dttime) """ stream_name = stream.stream event_name = EVENT_TYPES.get(stream_name) last_updated = format_last_updated_for_request( stream.update_and_return_bookmark(), self.replication_key_format) LOGGER.info("Extracting %s since %s." % (stream_name, last_updated)) latest_record_date = last_updated start_date = pendulum.parse(last_updated) end_date = pendulum.now() requests = [(start_date, end_date)] while requests: request_filters = requests.pop(0) request_start = request_filters[0] request_end = request_filters[1] request_start_str = request_start.to_datetime_string() request_end_str = request_end.to_datetime_string() LOGGER.info('Requesting export from %s to %s.' % (request_start_str, request_end_str)) sync_uri = self.client.request_bulk_export(stream, request_start_str, request_end_str, event_name) offset = 0 run = True while run: records, has_more, total_records = self.client.fetch_bulk_export_records( sync_uri, offset, MAX_RECORDS_RETURNED, run) if total_records >= EXPORT_LIMIT: LOGGER.info( 'Export exceeds 5M record limit. Splitting into multiple requests.' ) new_end_date = request_start + ( (request_end - request_start) / 2) requests.append((request_start, new_end_date)) requests.append((new_end_date, request_end)) run = False elif total_records == 0: LOGGER.info('No records found between %s and %s.' % (request_start, request_end)) run = False else: transform_write_and_count(stream, records) latest_record_batch = self.get_latest_for_next_call( records=records, replication_key=stream.meta_fields.get( 'replication_key'), last_updated=last_updated) if latest_record_batch > latest_record_date: latest_record_date = latest_record_batch if not has_more: run = False LOGGER.info( 'Completed fetching records. Fetched %s records.' % total_records) else: offset = offset + MAX_RECORDS_RETURNED LOGGER.info( 'Fetched %s of %s records. Fetching next set of records.' % (offset, total_records)) return latest_record_date