import datetime
import time

import funcy
import pytz
import singer
import singer.metrics

# State/config helpers (incorporate, save_state,
# get_last_record_value_for_table, get_config_start_date) are defined in the
# tap's own modules; see the illustrative sketches at the end of this section.

LOGGER = singer.get_logger()


def sync_data(self, parent_ids=None):
    # This stream can only be synced for a specific batch of parent tweets.
    if parent_ids is None:
        raise RuntimeError(
            'Cannot pull tweet engagement without a batch of parent tweet IDs')

    self.write_schema()

    start = self.get_start_for_tweet_ids(parent_ids)

    LOGGER.info(
        "Pulling data from {} for a batch of 25 tweets".format(start))

    table = self.TABLE
    url = self.get_url()

    while True:
        # Re-read the bookmark each pass and query at most a four-week window.
        start = self.get_start_for_tweet_ids(parent_ids)
        end = min(datetime.datetime.utcnow(),
                  start + datetime.timedelta(weeks=4))

        # Stop once the remaining window has shrunk to under three hours.
        if start > (end - datetime.timedelta(hours=3)):
            break

        body = self.get_body(start, end, parent_ids)
        self.request_start = datetime.datetime.utcnow()
        result = self.client.make_request(url, self.API_METHOD, body=body)

        data = self.get_stream_data(result)

        with singer.metrics.record_counter(endpoint=table) as counter:
            for obj in data:
                singer.write_records(table, [self.filter_keys(obj)])

                self.state = incorporate(
                    self.state,
                    'tweet_engagements.{}'.format(obj.get('tweet_id')),
                    'date',
                    obj.get('date'))

                counter.increment()

        save_state(self.state)

        # Rate-limit: space successive requests at least max_sleep seconds
        # apart, measured from when the last request started.
        max_sleep = 35
        sleep_seconds = min(
            max_sleep,
            ((self.request_start + datetime.timedelta(seconds=max_sleep)) -
             datetime.datetime.utcnow()).total_seconds())

        if sleep_seconds > 0:
            LOGGER.info("Sleeping for {} seconds before making "
                        "next request".format(sleep_seconds))
            time.sleep(sleep_seconds)
def sync_data(self):
    table = self.TABLE
    url = self.get_url()

    for handle in self.config.get('handles'):
        params = {
            'user_id': handle,
            'count': 200,
            'include_rts': 1,
        }

        has_more = True
        max_id = None

        # Page backwards through the user's timeline via max_id. The bound is
        # inclusive, so pagination stops when a page fails to lower max_id.
        while has_more:
            last_max_id = max_id

            result = self.client.make_request(
                url, self.API_METHOD, params=params)

            data = self.get_stream_data(result)

            with singer.metrics.record_counter(endpoint=table) as counter:
                for index, obj in enumerate(data):
                    LOGGER.debug("On {} of {}".format(index, len(data)))

                    processed = self.filter_keys(obj)
                    singer.write_records(table, [processed])
                    counter.increment()

                    # Track the smallest tweet ID seen so the next request
                    # continues from the oldest tweet on this page.
                    if max_id is None:
                        max_id = obj.get('id')
                    else:
                        max_id = min(max_id, obj.get('id'))

                    params['max_id'] = max_id

                    self.state = incorporate(
                        self.state,
                        "tweet_engagements.{}".format(obj.get('id')),
                        'date',
                        processed.get('created_at'))

            # Sync engagement data for this page's tweets in batches of 25.
            for substream in self.substreams:
                substream.state = self.state

                for tweets in funcy.chunks(25, data):
                    substream.sync_data(parent_ids=[
                        tweet.get('id_str') for tweet in tweets
                    ])

            if last_max_id == max_id:
                has_more = False
def sync_data(self):
    table = self.TABLE
    done = False
    filters = self.get_filters()

    # Resume from the bookmarked date, or fall back to the configured start.
    start_date = get_last_record_value_for_table(self.state, table)

    if start_date is None:
        start_date = get_config_start_date(self.config)
    else:
        start_date = start_date.replace(tzinfo=pytz.UTC)

    td = self.get_interval()
    end_date = start_date + td

    while not done:
        max_date = start_date

        LOGGER.info("Querying {} starting at {}".format(table, start_date))

        body = {
            "startMSeconds": int(start_date.timestamp() * 1000),
            "endMSeconds": int(end_date.timestamp() * 1000),
        }

        if filters is not None:
            body["filters"] = filters

        LOGGER.info(body)

        try:
            response = self.client.make_request(
                self.get_url(), "POST", body=body)
        except RuntimeError as e:
            # Retry transient gateway errors once before giving up.
            if "502" in str(e) or "504" in str(e):
                response = self.client.make_request(
                    self.get_url(), "POST", body=body)
            else:
                raise

        to_write = self.get_stream_data(response)

        with singer.metrics.record_counter(endpoint=table) as ctr:
            singer.write_records(table, to_write)
            ctr.increment(amount=len(to_write))

        for item in to_write:
            max_date = max(max_date, self.get_time_for_state(item))

        self.state = incorporate(self.state, table, "start_date", start_date)

        if max_date > datetime.datetime.now(pytz.UTC):
            done = True

        if len(to_write) == 0:
            # An empty window: skip ahead a full interval, unless we have
            # already caught up to the present.
            LOGGER.info("Advancing one full interval.")

            if end_date > datetime.datetime.now(pytz.UTC):
                done = True
            else:
                start_date = end_date
        elif start_date == max_date:
            # Data came back but the bookmark did not move; nudge forward to
            # avoid requesting the same window forever.
            LOGGER.info("Advancing one millisecond.")
            start_date = start_date + datetime.timedelta(milliseconds=1)
        else:
            LOGGER.info("Advancing by one page.")
            start_date = max_date

        end_date = start_date + td

    save_state(self.state)
def sync_data(self):
    table = self.TABLE
    done = False

    # Resume from the bookmarked date, or fall back to the configured start.
    start_date = get_last_record_value_for_table(self.state, table)

    if start_date is None:
        start_date = get_config_start_date(self.config)
    else:
        start_date = start_date.replace(tzinfo=pytz.UTC)

    td = datetime.timedelta(hours=1)
    end_date = start_date + td

    while not done:
        max_date = start_date

        LOGGER.info("Querying {} starting at {}".format(table, start_date))

        body = {
            "filters": {
                "environment": self.config.get("environment"),
                "lastUpdated": {
                    "gte": int(start_date.timestamp() * 1000),
                    "lte": int(end_date.timestamp() * 1000),
                },
            }
        }

        LOGGER.info(body)

        response = self.client.make_request(self.get_url(), "POST", body=body)

        to_write = self.get_stream_data(response)

        with singer.metrics.record_counter(endpoint=table) as ctr:
            singer.write_records(table, to_write)
            ctr.increment(amount=len(to_write))

        for item in to_write:
            max_date = max(max_date, self.get_time_for_state(item))

        self.state = incorporate(self.state, table, "start_date", start_date)

        if max_date > datetime.datetime.now(pytz.UTC):
            done = True

        if len(to_write) == 0:
            # An empty window: skip ahead a full interval, unless we have
            # already caught up to the present.
            LOGGER.info("Advancing one full interval.")

            if end_date > datetime.datetime.now(pytz.UTC):
                done = True
            else:
                start_date = end_date
        elif start_date == max_date:
            # Data came back but the bookmark did not move; nudge forward to
            # avoid refetching the same window forever.
            LOGGER.info("Advancing one second.")
            start_date = start_date + datetime.timedelta(seconds=1)
        else:
            LOGGER.info("Advancing by one page.")
            start_date = max_date

        end_date = start_date + td

    save_state(self.state)
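# ---------------------------------------------------------------------------
# Illustrative only: minimal sketches of the state/config helpers the sync
# loops above assume. The tap's real implementations live in its own modules
# and may differ in detail (bookmark layout, date parsing, comparison rules).
# ---------------------------------------------------------------------------

import dateutil.parser


def incorporate(state, key, field, value):
    """Return state with value bookmarked under bookmarks[key][field],
    keeping the existing bookmark when it is already the more recent one."""
    if value is None:
        return state
    state = dict(state or {})
    bookmarks = dict(state.get('bookmarks') or {})
    current = (bookmarks.get(key) or {}).get(field)
    if current is None or str(value) > str(current):
        bookmarks[key] = {field: str(value)}
    state['bookmarks'] = bookmarks
    return state


def save_state(state):
    """Emit a STATE message so the Singer runner can persist the bookmarks."""
    singer.write_state(state)


def get_last_record_value_for_table(state, table):
    """Return the bookmarked datetime for table, or None on a first sync."""
    value = (state or {}).get('bookmarks', {}).get(table, {}).get('start_date')
    return dateutil.parser.parse(value) if value else None


def get_config_start_date(config):
    """Parse the configured start_date into a UTC-aware datetime."""
    return dateutil.parser.parse(
        config.get('start_date')).replace(tzinfo=pytz.UTC)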