Example #1
    def call_incremental_stream(self, stream):
        start_date, end_date = self.initialize_dates()

        request_config = {
            'url': self.url,
            'headers': self.build_headers(),
            'run': True,
            'data': self.build_body(stream, start_date)
        }

        while request_config['run']:
            res = self.client.make_request(request_config, method='POST')

            if res.status_code != 200:
                raise AttributeError(f'Received status_code {res.status_code}')

            records = self.convert_to_json(res)
            if len(records) == self.MAX_RECORDS:
                raise ValueError('Number of records returned is equal to '
                                 'hudson\'s limit. This means that we will be '
                                 'missing data.')
            transform_write_and_count(stream, records)

            start_date = start_date.add(days=1)
            if start_date == end_date:
                request_config['run'] = False
            else:
                request_config['data'] = self.build_body(stream, start_date)
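
Example #1 walks a date window one day at a time until start_date catches up with end_date. The initialize_dates helper is not shown; a minimal sketch of what it might return, assuming a pendulum-based tap where the window runs from a configured start date up to today (the 'start_date' config key is an assumption):

    import pendulum

    def initialize_dates(self):
        """Hypothetical helper: derive the date window the day-by-day
        loop above iterates over."""
        # Resume from the configured start date (assumed config key).
        start_date = pendulum.parse(self.config['start_date'])
        # Stop at today's midnight; the caller's equality check against
        # end_date only terminates if both fall on day boundaries.
        end_date = pendulum.today()
        return start_date, end_date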
Example #2
    def call_incremental_stream(self, stream):
        """Method to call all incremental streams"""
        last_updated = format_last_updated_for_request(
            stream.update_and_return_bookmark(), self.replication_key_format)
        request_config = {
            "url": self.url,
            "headers": self.build_headers(),
            "params": self.build_params(stream, last_updated=last_updated),
            "run": True
        }

        while request_config['run']:
            res = self.client.make_request(request_config)

            if res.status_code != 200:
                raise AttributeError(f'Received status_code {res.status_code}')

            records = res.json()
            transform_write_and_count(stream, records)

            last_updated = self.get_latest_for_next_call(
                records, stream.stream_metadata['replication-key'],
                last_updated)
            stream.update_bookmark(last_updated)

            request_config = self.update_for_next_call(res, request_config)

        return last_updated
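
Several of these examples delegate pagination to update_for_next_call, whose body is not shown. A minimal sketch assuming a cursor-style API that advertises the next page in its JSON body (the 'next' response key and 'cursor' param are assumptions, not the source's confirmed contract):

    def update_for_next_call(self, res, request_config):
        """Hypothetical pagination helper: follow the cursor the API
        returned, or stop the caller's while-loop when there is none."""
        next_token = res.json().get('next')  # assumed response shape
        if next_token:
            request_config['params']['cursor'] = next_token
        else:
            request_config['run'] = False  # ends the while-loop above
        return request_config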
Example #3
    def call_incremental_stream(self, stream):
        """
        Method to call all incrementally synced streams
        """
        last_updated = format_last_updated_for_request(
            stream.update_and_return_bookmark(), self.replication_key_format)

        request_config = {
            'url': self.generate_api_url(stream),
            'headers': self.build_headers(),
            'params': self.build_initial_params(stream, last_updated),
            'run': True,
            'api_key': self.api_key
        }

        LOGGER.info("Extracting %s since %s" % (stream, last_updated))
        self.total_contacts = 0

        while request_config['run']:

            LOGGER.info("Params: %s" % (request_config['params']))
            res = self.client.make_request(request_config)

            if res.status_code != 200:
                raise AttributeError('Received status code {}'.format(
                    res.status_code))

            root = ElementTree.fromstring(res.text)
            records = []
            for child in root:
                xml_dict = {}
                for grandchild in child:
                    xml_dict[grandchild.tag] = grandchild.text
                records.append(xml_dict)

            self.total_contacts += len(records)
            LOGGER.info('Total records: {}'.format(self.total_contacts))

            if self.should_write(records, stream, last_updated):
                transform_write_and_count(stream, records)

            # last_updated = self.get_latest_update(
            #     records,
            #     last_updated
            # )

            stream.update_bookmark(last_updated)

            request_config = self.update_for_next_call(res, request_config,
                                                       stream, records)

        formatted_update = self.format_updated(
            request_config['params']['updated_before'])
        LOGGER.info('Setting last updated to {}'.format(formatted_update))
        return formatted_update
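
Note that the bookmark-advance call in Example #3 is commented out, so last_updated never moves inside the loop; only the final updated_before param is returned. A minimal sketch of what the disabled helper could look like, assuming each record carries an ISO-8601 'updated_at' value (the field name is an assumption):

    def get_latest_update(self, records, last_updated):
        """Hypothetical: newest replication value seen so far."""
        seen = [r['updated_at'] for r in records if r.get('updated_at')]
        if not seen:
            return last_updated
        # ISO-8601 strings order correctly under plain string comparison.
        return max(max(seen), last_updated)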
Example #4
    def call_stream(self, stream, request_config):
        res = self.client.make_request(request_config)

        records = res.json()

        if not records:
            records = []
        elif not isinstance(records, list):
            # subsequent methods are expecting a list
            records = [records]

        transform_write_and_count(stream, records)
Example #5
    def call_stream(self, stream, request_config):
        for location in self.client.config['locations']:
            request_config['params'] = self.build_params(
                location[0], location[1], location[2])
            res = self.client.make_request(request_config)

            records = res.json()

            if not records:
                records = []
            elif not isinstance(records, list):
                # subsequent methods are expecting a list
                records = [records]

            transform_write_and_count(stream, records)
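
Example #5 fans the same stream out across configured locations, indexing each entry positionally. A usage sketch of the assumed config shape (latitude/longitude/radius triples are illustrative; the source only shows that three positional values are consumed):

    # Hypothetical tap config consumed by Example #5.
    config = {
        'locations': [
            (40.7128, -74.0060, 25),    # e.g. latitude, longitude, radius
            (34.0522, -118.2437, 25),
        ]
    }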
Example #6
    def call_stream(self,
                    stream,
                    club_id,
                    request_config,
                    curr_upper_bound=None):
        """
        Utility method shared by incremental and full streams; handles API calls and
        record writes
        """
        while request_config['run']:
            res = self.client.make_request(request_config)
            payload = res.json()

            if stream.is_incremental:
                LOGGER.info(
                    'Received {n} records on page {i} for club {c}'.format(
                        n=payload['status']['count'],
                        i=payload['request']['page'],
                        c=club_id))
            else:
                LOGGER.info('Received {n} records for club {c}'.format(
                    n=payload['status']['count'], c=club_id))

            records = payload.get(stream.stream_metadata['response-key'])

            if not records:
                records = []
            elif not isinstance(records, list):
                # subsequent methods are expecting a list
                records = [records]

            # for endpoints that do not provide club_id
            if stream.stream in STREAMS_TO_HYDRATE:
                records = self.hydrate_record_with_club_id(records, club_id)

            transform_write_and_count(stream, records)

            if stream.is_incremental:
                LOGGER.info(
                    '{s} bookmark for club {c} is currently {b}'.format(
                        s=stream.stream, c=club_id, b=curr_upper_bound))

            request_config, curr_upper_bound = self.update_for_next_call(
                int(payload['status']['count']), request_config, stream,
                curr_upper_bound)

        return curr_upper_bound
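
The hydrate step in Example #6 backfills the club identifier onto records from endpoints that omit it. A minimal sketch of that helper, assuming records are plain dicts and 'club_id' is the target key (the key name is an assumption):

    def hydrate_record_with_club_id(self, records, club_id):
        """Hypothetical: stamp the club id onto each record so
        downstream schemas stay uniform across endpoints."""
        for record in records:
            record['club_id'] = club_id  # assumed field name
        return records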
Example #7
    def call_full_stream(self, stream):
        """Method to call all fully synched streams"""
        request_config = {
            "url": self.url,
            "headers": self.build_headers(),
            "params": self.build_params(stream),
            "run": True
        }

        while request_config['run']:
            res = self.client.make_request(request_config)

            if res.status_code != 200:
                raise AttributeError(f'Received status_code {res.status_code}')

            records = res.json()
            records = self._add_pdf_s3_link(records, request_config)
            transform_write_and_count(stream, records)
            request_config = self.update_for_next_call(res, request_config)
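
Every example builds its request headers through build_headers, which is not shown. A minimal sketch assuming simple bearer-token auth (the header values and the 'api_token' config key are assumptions):

    def build_headers(self):
        """Hypothetical: static headers attached to every request."""
        return {
            'Authorization': 'Bearer {}'.format(self.config['api_token']),
            'Accept': 'application/json',
        }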
Example #8
    def call_incremental_stream(self, stream):
        """
        Method to call all incrementally synced streams
        """
        last_updated = format_last_updated_for_request(
            stream.update_and_return_bookmark(),
            self.replication_key_format
        )

        request_config = {
            'url': self.generate_api_url(stream),
            "headers": self.build_headers(),
            'params': self.build_initial_params(last_updated),
            'run': True
        }

        LOGGER.info("Extracting stream {s} since {d}".format(s=stream,
                                                             d=last_updated))

        while request_config['run']:
            res = self.client.make_request(request_config)

            records = res.json().get('items', [])

            LOGGER.info('Received {n} records'.format(n=len(records)))

            transform_write_and_count(stream, records)

            last_updated = self.get_latest_record_date(records)

            LOGGER.info('Setting last updated for stream {s} to {d}'.format(
                s=stream,
                d=last_updated
            ))
            stream.update_bookmark(last_updated)

            request_config = self.update_for_next_call(len(records), request_config)

        return last_updated
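
Example #8 derives the next bookmark from the page it just wrote. A minimal sketch of get_latest_record_date, assuming each item exposes an ISO-8601 'updated' timestamp that pendulum can parse (the field name is an assumption) and that the page is non-empty:

    import pendulum

    def get_latest_record_date(self, records):
        """Hypothetical: newest replication timestamp in the page,
        used as the next bookmark. Assumes a non-empty page."""
        return max(
            pendulum.parse(record['updated'])  # assumed field name
            for record in records
        )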
Example #9
    def call_full_stream(self, stream):
        """
        Method to call all fully synced streams
        """

        request_config = {
            'url': self.generate_api_url(stream),
            'headers': self.build_headers(),
            'params': self.build_initial_params(),
            'run': True
        }

        LOGGER.info("Extracting {}".format(stream))

        while request_config['run']:
            res = self.client.make_request(request_config)

            records = res.json().get('items', [])

            LOGGER.info('Received {n} records'.format(n=len(records)))

            transform_write_and_count(stream, records)

            request_config = self.update_for_next_call(len(records), request_config)
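
Examples #8 and #9 pass only the record count to update_for_next_call, which suggests offset pagination that stops on a short page. A sketch under that assumption (PAGE_SIZE and the 'offset' param name are illustrative):

    PAGE_SIZE = 100  # illustrative page size

    def update_for_next_call(self, record_count, request_config):
        """Hypothetical offset pagination: a short page means the sync
        is done; otherwise advance the offset by one page."""
        if record_count < PAGE_SIZE:
            request_config['run'] = False
        else:
            request_config['params']['offset'] = (
                request_config['params'].get('offset', 0) + PAGE_SIZE)
        return request_config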
Example #10
    def call_full_stream(self, stream):
        """
        Method to call all fully synced streams
        """

        stream_name = stream.stream
        event_name = EVENT_TYPES.get(stream_name)
        start_date = pendulum.parse(self.config['full_table_start_date'])
        end_date = start_date.add(months=1)
        LOGGER.info("Extracting %s since %s." % (stream_name, start_date))

        requests = [(start_date, end_date)]

        while requests:
            request_filters = requests.pop(0)
            request_start = request_filters[0]
            request_end = request_filters[1]
            request_start_str = request_start.to_datetime_string()
            request_end_str = request_end.to_datetime_string()
            LOGGER.info('Requesting export from %s to %s.' %
                        (request_start_str, request_end_str))

            sync_uri = self.client.request_bulk_export(stream,
                                                       request_start_str,
                                                       request_end_str,
                                                       event_name)
            offset = 0
            run = True

            while run:
                records, has_more, total_records = self.client.fetch_bulk_export_records(
                    sync_uri, offset, MAX_RECORDS_RETURNED, run)
                if total_records >= EXPORT_LIMIT:
                    LOGGER.info(
                        'Export exceeds 5M record limit. Splitting into multiple requests.'
                    )
                    new_end_date = request_start + (
                        (request_end - request_start) / 2)
                    requests.append((request_start, new_end_date))
                    requests.append((new_end_date, request_end))
                    run = False

                elif total_records == 0:
                    LOGGER.info('No records found between %s and %s.' %
                                (request_start, request_end))
                    run = False

                else:
                    transform_write_and_count(stream, records)

                    if not has_more:
                        run = False
                        LOGGER.info(
                            'Completed fetching records. Fetched %s records.' %
                            total_records)

                    else:
                        offset = offset + MAX_RECORDS_RETURNED
                        LOGGER.info(
                            'Fetched %s of %s records. Fetching next set of records.'
                            % (offset, total_records))
            if request_end < pendulum.now():
                new_end_date = request_end.add(months=1)
                requests.append((request_end, new_end_date))
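
The export-limit handling in Examples #10 and #11 halves a date window whenever the server reports 5M+ records, then re-queues both halves. The same splitting logic isolated as a standalone sketch, with a stubbed count call standing in for the real bulk-export client:

    import pendulum

    EXPORT_LIMIT = 5_000_000  # mirrors the 5M limit above

    def plan_windows(start, end, count_records):
        """Split (start, end) windows until each holds fewer than
        EXPORT_LIMIT records. count_records is a stand-in for the
        bulk-export count; it must eventually drop below the limit
        for this to terminate."""
        pending = [(start, end)]
        done = []
        while pending:
            window_start, window_end = pending.pop(0)
            if count_records(window_start, window_end) >= EXPORT_LIMIT:
                midpoint = window_start + (window_end - window_start) / 2
                pending.append((window_start, midpoint))
                pending.append((midpoint, window_end))
            else:
                done.append((window_start, window_end))
        return done

    # e.g. plan_windows(pendulum.parse('2021-01-01'),
    #                   pendulum.parse('2021-02-01'),
    #                   lambda s, e: 0)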
Example #11
    def call_incremental_stream(self, stream):
        """
        Method to call incrementally synced streams
        TODO: only for bulk api, update for rest api too
        Args:
            stream (cls)
        Returns:
            last_record_date (dttime)
        """
        stream_name = stream.stream
        event_name = EVENT_TYPES.get(stream_name)
        last_updated = format_last_updated_for_request(
            stream.update_and_return_bookmark(), self.replication_key_format)
        LOGGER.info("Extracting %s since %s." % (stream_name, last_updated))
        latest_record_date = last_updated
        start_date = pendulum.parse(last_updated)
        end_date = pendulum.now()

        requests = [(start_date, end_date)]

        while requests:
            request_filters = requests.pop(0)
            request_start = request_filters[0]
            request_end = request_filters[1]
            request_start_str = request_start.to_datetime_string()
            request_end_str = request_end.to_datetime_string()
            LOGGER.info('Requesting export from %s to %s.' %
                        (request_start_str, request_end_str))

            sync_uri = self.client.request_bulk_export(stream,
                                                       request_start_str,
                                                       request_end_str,
                                                       event_name)
            offset = 0
            run = True

            while run:
                records, has_more, total_records = self.client.fetch_bulk_export_records(
                    sync_uri, offset, MAX_RECORDS_RETURNED, run)
                if total_records >= EXPORT_LIMIT:
                    LOGGER.info(
                        'Export exceeds 5M record limit. Splitting into multiple requests.'
                    )
                    new_end_date = request_start + (
                        (request_end - request_start) / 2)
                    requests.append((request_start, new_end_date))
                    requests.append((new_end_date, request_end))
                    run = False

                elif total_records == 0:
                    LOGGER.info('No records found between %s and %s.' %
                                (request_start, request_end))
                    run = False

                else:
                    transform_write_and_count(stream, records)

                    latest_record_batch = self.get_latest_for_next_call(
                        records=records,
                        replication_key=stream.meta_fields.get(
                            'replication_key'),
                        last_updated=last_updated)
                    if latest_record_batch > latest_record_date:
                        latest_record_date = latest_record_batch

                    if not has_more:
                        run = False
                        LOGGER.info(
                            'Completed fetching records. Fetched %s records.' %
                            total_records)

                    else:
                        offset = offset + MAX_RECORDS_RETURNED
                        LOGGER.info(
                            'Fetched %s of %s records. Fetching next set of records.'
                            % (offset, total_records))

        return latest_record_date
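
A hedged sketch of how a sync loop might drive these methods, persisting whatever bookmark the incremental call returns (the driver, the state dict, and its use here are hypothetical stand-ins, not the source tap's actual entry point):

    def sync(tap, streams, state):
        """Hypothetical driver: call each stream and record bookmarks."""
        for stream in streams:
            if stream.is_incremental:
                # returns the newest replication value seen this run
                state[stream.stream] = tap.call_incremental_stream(stream)
            else:
                tap.call_full_stream(stream)
        return state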