def sync_data(self, parent_ids=None):
        """Pull engagement data for a batch of (up to 25) parent tweet ids.

        Walks forward in 4-week windows from the saved bookmark for this
        batch of tweets until the window reaches "now minus 3 hours",
        writing records and advancing per-tweet state as it goes.

        :param parent_ids: list of tweet id strings to fetch engagement
            for; required (a default of None exists only so the missing
            argument produces a clear error).
        :raises RuntimeError: when parent_ids is not supplied.
        """
        if parent_ids is None:
            raise RuntimeError(
                'Cannot pull tweet engagement for {}'.format(parent_ids))

        self.write_schema()

        start = self.get_start_for_tweet_ids(parent_ids)

        LOGGER.info(
            "Pulling data from {} for a batch of 25 tweets".format(start))

        table = self.TABLE

        url = self.get_url()

        while True:
            # Re-read the bookmark each pass: it moves as records below
            # are incorporated into state.
            start = self.get_start_for_tweet_ids(parent_ids)
            end = min(datetime.datetime.utcnow(),
                      start + datetime.timedelta(weeks=4))

            # Stop once the window is within 3 hours of "now" — recent
            # engagement data is not considered settled yet.
            if start > (end - datetime.timedelta(hours=3)):
                break

            body = self.get_body(start, end, parent_ids)

            self.request_start = datetime.datetime.utcnow()

            result = self.client.make_request(url, self.API_METHOD, body=body)

            data = self.get_stream_data(result)

            with singer.metrics.record_counter(endpoint=table) as counter:
                for obj in data:
                    singer.write_records(table, [self.filter_keys(obj)])

                    # Advance per-tweet bookmark so a restart resumes here.
                    self.state = incorporate(
                        self.state,
                        'tweet_engagements.{}'.format(obj.get('tweet_id')),
                        'date', obj.get('date'))

                    counter.increment()

            save_state(self.state)

            # Rate limiting: ensure at least max_sleep seconds elapse
            # between request starts. Use total_seconds() here — the old
            # `.seconds` attribute is always non-negative (a negative
            # timedelta normalizes to negative days + positive seconds),
            # which caused a spurious full 35s sleep whenever the request
            # itself took longer than max_sleep.
            max_sleep = 35
            remaining = (
                (self.request_start + datetime.timedelta(seconds=max_sleep)) -
                datetime.datetime.utcnow()).total_seconds()
            sleep_seconds = min(max_sleep, remaining)

            if sleep_seconds > 0:
                LOGGER.info("Sleeping for {} seconds before making "
                            "next request".format(sleep_seconds))
                time.sleep(sleep_seconds)
# 예제 #2 (Example #2) — scraper separator, commented out so the file stays parseable
# 0
    def sync_data(self):
        """Fetch each configured handle's timeline, paging backwards with
        Twitter's max_id cursor, then fan completed pages out to substreams
        in chunks of 25 parent tweet ids."""
        table = self.TABLE
        url = self.get_url()

        for handle in self.config.get('handles'):
            params = {
                'user_id': handle,
                'count': 200,
                'include_rts': 1,
            }
            max_id = None
            has_more = True

            while has_more:
                # Remember the cursor before this page; if it doesn't move,
                # the timeline is exhausted.
                previous_max_id = max_id

                result = self.client.make_request(
                    url, self.API_METHOD, params=params)
                data = self.get_stream_data(result)

                with singer.metrics.record_counter(endpoint=table) as counter:
                    total = len(data)
                    for position, tweet in enumerate(data):
                        LOGGER.debug("On {} of {}".format(position, total))

                        record = self.filter_keys(tweet)
                        singer.write_records(table, [record])
                        counter.increment()

                        # Track the smallest id seen — the next page request
                        # asks for tweets at or below it.
                        tweet_id = tweet.get('id')
                        if max_id is None:
                            max_id = tweet_id
                        else:
                            max_id = min(max_id, tweet_id)
                        params['max_id'] = max_id

                        self.state = incorporate(
                            self.state,
                            "tweet_engagements.{}".format(tweet.get('id')),
                            'date', record.get('created_at'))

                # Hand the page's tweet ids to dependent streams, 25 at a time.
                for substream in self.substreams:
                    substream.state = self.state
                    for chunk in funcy.chunks(25, data):
                        substream.sync_data(
                            parent_ids=[t.get('id_str') for t in chunk])

                if previous_max_id == max_id:
                    has_more = False
# 예제 #3 (Example #3) — scraper separator, commented out so the file stays parseable
# 0
    def sync_data(self):
        """Sync records in time windows of self.get_interval(), advancing the
        window forward until it passes the present.

        Resumes from the last saved "start_date" bookmark for this table, or
        from the configured start date on first run. Each loop issues one
        POST covering [start_date, end_date] in epoch milliseconds, writes
        the returned records, and advances the window based on what came back.
        """
        table = self.TABLE
        done = False

        filters = self.get_filters()
        start_date = get_last_record_value_for_table(self.state, table)

        if start_date is None:
            start_date = get_config_start_date(self.config)
        else:
            # Bookmark comes back naive; all comparisons below are against
            # aware datetimes, so pin it to UTC.
            start_date = start_date.replace(tzinfo=pytz.UTC)

        td = self.get_interval()

        end_date = start_date + td

        while not done:
            # max_date tracks the newest record timestamp in this window;
            # it starts at start_date so an empty page leaves it unchanged.
            max_date = start_date

            LOGGER.info("Querying {} starting at {}".format(table, start_date))

            body = {
                "startMSeconds": int(start_date.timestamp() * 1000),
                "endMSeconds": int(end_date.timestamp() * 1000),
            }

            if filters is not None:
                body["filters"] = filters

            LOGGER.info(body)

            try:
                response = self.client.make_request(self.get_url(),
                                                    "POST",
                                                    body=body)
            except RuntimeError as e:
                # Gateway errors (502/504) are treated as transient:
                # retry exactly once, otherwise re-raise.
                if "502" in str(e) or "504" in str(e):
                    # try one more time
                    response = self.client.make_request(self.get_url(),
                                                        "POST",
                                                        body=body)

                else:
                    raise e

            to_write = self.get_stream_data(response)

            with singer.metrics.record_counter(endpoint=table) as ctr:
                singer.write_records(table, to_write)

                ctr.increment(amount=len(to_write))

                for item in to_write:
                    max_date = max(max_date, self.get_time_for_state(item))

            # NOTE: the bookmark records the window START, not max_date —
            # a restart re-fetches the current window rather than risking
            # a gap.
            self.state = incorporate(self.state, table, "start_date",
                                     start_date)

            if max_date > datetime.datetime.now(pytz.UTC):
                done = True

            if len(to_write) == 0:
                # Empty window: skip ahead a whole interval, or stop if the
                # window already extends past "now".
                LOGGER.info("Advancing one full interval.")

                if end_date > datetime.datetime.now(pytz.UTC):
                    done = True
                else:
                    start_date = end_date

            elif start_date == max_date:
                # Every record in the window shares the start timestamp; bump
                # by 1ms so the next query makes progress instead of looping.
                LOGGER.info("Advancing one millisecond.")
                start_date = start_date + datetime.timedelta(milliseconds=1)
            else:
                # Normal paging: continue from the newest record seen.
                LOGGER.info("Advancing by one page.")
                start_date = max_date

            end_date = start_date + td

            save_state(self.state)
# 예제 #4 (Example #4) — scraper separator, commented out so the file stays parseable
# 0
    def sync_data(self):
        """Sync records in one-hour ``lastUpdated`` windows, sliding the
        window forward from the saved bookmark (or the configured start
        date) until it reaches the present."""
        table = self.TABLE

        start_date = get_last_record_value_for_table(self.state, table)
        if start_date is None:
            start_date = get_config_start_date(self.config)
        else:
            # Bookmark is naive; pin to UTC for the aware comparisons below.
            start_date = start_date.replace(tzinfo=pytz.UTC)

        window = datetime.timedelta(hours=1)
        end_date = start_date + window
        done = False

        while not done:
            # Newest record timestamp seen in this window; starts at the
            # window start so an empty page leaves it untouched.
            max_date = start_date

            LOGGER.info("Querying {} starting at {}".format(table, start_date))

            body = {
                "filters": {
                    "environment": self.config.get("environment"),
                    "lastUpdated": {
                        "gte": int(start_date.timestamp() * 1000),
                        "lte": int(end_date.timestamp() * 1000),
                    },
                }
            }
            LOGGER.info(body)

            response = self.client.make_request(
                self.get_url(), "POST", body=body)
            to_write = self.get_stream_data(response)

            with singer.metrics.record_counter(endpoint=table) as ctr:
                singer.write_records(table, to_write)
                ctr.increment(amount=len(to_write))
                for record in to_write:
                    max_date = max(max_date, self.get_time_for_state(record))

            # Bookmark the window START so a restart replays this window.
            self.state = incorporate(
                self.state, table, "start_date", start_date)

            if max_date > datetime.datetime.now(pytz.UTC):
                done = True

            if len(to_write) == 0:
                # Nothing in the window: jump a whole interval, or stop if
                # the window already reaches past "now".
                LOGGER.info("Advancing one full interval.")
                if end_date > datetime.datetime.now(pytz.UTC):
                    done = True
                else:
                    start_date = end_date
            elif start_date == max_date:
                # All records share the window-start timestamp; nudge by a
                # second so the next query makes progress.
                LOGGER.info("Advancing one second.")
                start_date = start_date + datetime.timedelta(seconds=1)
            else:
                # Normal paging: resume from the newest record seen.
                LOGGER.info("Advancing by one page.")
                start_date = max_date

            end_date = start_date + window

            save_state(self.state)