    def read(self):
        if not self.advertiser_ids:
            self.advertiser_ids = self.sa360_client.get_all_advertisers_of_agency(
                self.agency_id)

        yield NormalizedJSONStream("results_" + "_".join(self.advertiser_ids),
                                   self.result_generator())
Example #2
def run(processors, state_service_name, state_service_host, state_service_port, normalize_keys):
    state.configure(state_service_name, state_service_host, state_service_port)

    processor_instances = [p() for p in processors]

    _readers = list(filter(lambda o: isinstance(o, Reader), processor_instances))
    _writers = list(filter(lambda o: isinstance(o, Writer), processor_instances))

    if len(_readers) < 1:
        raise click.BadParameter("You must specify a reader")

    if len(_readers) > 1:
        raise click.BadParameter("You cannot specify multiple readers")

    if len(_writers) < 1:
        raise click.BadParameter("You must specify at least one writer")

    reader = _readers[0]

    # A stream should represent a full file!
    for stream in reader.read():
        for writer in _writers:
            if normalize_keys and issubclass(stream.__class__, JSONStream):
                writer.write(NormalizedJSONStream.create_from_stream(stream))
            else:
                writer.write(stream)
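For reference, `run` only depends on the `Reader` and `Writer` interfaces: a reader's `read()` yields streams, and each writer consumes one in `write(stream)`. A minimal hypothetical pair wired the same way can serve as a smoke test. The class names, the record list, and the assumption that a stream can be iterated record by record are all illustrative, not taken from the original:

class EchoReader(Reader):
    """Hypothetical reader wrapping an in-memory list of records."""

    def read(self):
        def result_generator():
            for record in [{"id": 1}, {"id": 2}]:
                yield record

        # One stream per run, mirroring the connectors above
        yield NormalizedJSONStream("echo_results", result_generator())


class LogWriter(Writer):
    """Hypothetical writer that logs every record of a stream."""

    def write(self, stream):
        # Assumption: the stream can be iterated record by record;
        # swap in the real Stream consumption API if it differs
        for record in stream:
            logging.info("Received record: %s", record)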
Example #3
    def read(self):
        def result_generator():

            watermark_value = None

            # Resume from the last checkpointed value, falling back to
            # the configured initial watermark on a first run
            if self._watermark_column:
                watermark_value = self.state.get(self._name) or self._watermark_init

            if self._object_type:
                self._query = self.build_object_type_query(
                    self._object_type, self._watermark_column)

            if self._watermark_column:
                self._query = self._query.format(
                    **{self._watermark_column: watermark_value})

            records = self._client.query(self._query)

            for rec in records:
                row = self._clean_record(rec)
                yield row

                # Checkpoint after every yielded row so an interrupted run
                # can resume where it left off
                if self._watermark_column:
                    self.state.set(self._name, row[self._watermark_column])

        yield NormalizedJSONStream(self._name, result_generator())
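The watermark bookkeeping above needs nothing more than a key-value store with `get` and `set`. A minimal in-memory stand-in with that shape can be handy for local testing; this is purely illustrative, since in production `state` is backed by the service configured in `run`:

class InMemoryState:
    """Hypothetical local substitute for the state service."""

    def __init__(self):
        self._store = {}

    def get(self, key):
        # Missing keys return None, so the reader falls back
        # to its configured initial watermark
        return self._store.get(key)

    def set(self, key, value):
        self._store[key] = value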
Example #4
    def read(self):
        if self.manager_id:
            self.client_customer_ids = self.get_customer_ids(self.manager_id)

        yield NormalizedJSONStream(
            "results_" + self.report_name + "_" +
            "_".join(self.client_customer_ids),
            self.format_and_yield(),
        )
Example #5
    def read(self):
        """
        :return: stream that returns Radarly posts one by one
        """
        date_ranges_and_posts_volumes: Dict = self.split_date_range()
        logging.info(
            f"API Compliant Date Ranges and Posts Volumes: {date_ranges_and_posts_volumes}"
        )
        api_compliant_date_ranges = list(date_ranges_and_posts_volumes.keys())

        t0 = time.time()
        ingestion_tracker = []

        for i, date_range in enumerate(api_compliant_date_ranges):

            if self.throttle:
                current_time = time.time() - t0
                ingestion_tracker.append(current_time)
                # Each tracked call ingests at most api_date_period_limit posts,
                # so this estimates the volume pulled over the sliding window
                posts_ingested_over_window = (
                    sum(np.array(ingestion_tracker) > current_time - self.api_window)
                    * self.api_date_period_limit
                )
                if (posts_ingested_over_window
                        > self.throttling_threshold_coefficient * self.api_quarterly_posts_limit):
                    sleep_duration = self.api_window * (
                        self.api_date_period_limit / self.api_quarterly_posts_limit
                    )
                    logging.info(
                        f"Throttling activated: waiting for {sleep_duration} seconds..."
                    )
                    time.sleep(sleep_duration)

            all_publications = self.get_publications_iterator(date_range)
            name = f"""radarly_{date_range[0].strftime("%Y-%m-%d-%H-%M-%S")}_{date_range[1].strftime(
                "%Y-%m-%d-%H-%M-%S")}"""

            # Bind the iterator as a default argument: a plain closure would be
            # late-bound and every generator would read from the last date range
            def result_generator(all_publications=all_publications):
                while True:
                    try:
                        pub = next(all_publications)
                        yield dict(pub)
                    except StopIteration:
                        break
                    except Exception:
                        ex_type, ex, tb = sys.exc_info()
                        # format_tb returns the traceback as strings, whereas
                        # print_tb would log None here
                        logging.warning(
                            f"Failed to ingest post with error: {ex}. "
                            f"Traceback: {''.join(traceback.format_tb(tb))}"
                        )

            yield NormalizedJSONStream(name, result_generator())
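Note on the throttling above: the pause is proportional to the share of the quarterly quota that each call can consume. As a purely illustrative example, with a 900-second window, a per-period limit of 1,000 posts, and a quarterly limit of 50,000 posts, a triggered pause lasts 900 * (1,000 / 50,000) = 18 seconds.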
Example #6
    def read(self):

        client = gspread.authorize(self._credentials)
        spreadsheet = client.open_by_url(self._url)

        for _sheet_name in self._sheet_name:

            worksheet = spreadsheet.worksheet(_sheet_name)

            # Default argument pins the current worksheet; a bare closure
            # would be late-bound to the loop variable
            def result_generator(worksheet=worksheet):
                for record in worksheet.get_all_records():
                    yield record

            yield NormalizedJSONStream(worksheet.title, result_generator())
Example #7
    def _run_query(self):
        logging.info("Running %s query %s", self.connector_name(), self._query)

        rows = self._engine.execute(self._query)

        logging.info("%s result set contains %d rows", self.connector_name(),
                     rows.rowcount)

        def result_generator():
            row = rows.fetchone()
            while row:
                yield dict(row.items())

                if self._watermark_column:
                    self.state.set(self._name, row[self._watermark_column])

                row = rows.fetchone()
            rows.close()

        return NormalizedJSONStream(self._name, result_generator())
Example #8
    def read(self):

        for prefix in self._prefix_list:

            objects_sorted_by_time = sorted(
                self.list_objects(bucket=self._bucket, prefix=prefix),
                key=lambda o: self.get_timestamp(o),
            )

            for _object in objects_sorted_by_time:

                _object = self.to_object(_object)

                logging.info(
                    f"Found {self._platform} file {self.get_key(_object)}")

                if not self.is_compatible_object(_object):
                    logging.info(
                        f"Wrong extension: Skipping file {self.get_key(_object)}"
                    )
                    continue

                if self.has_already_processed_object(_object):
                    logging.info(
                        f"Skipping already processed file {self.get_key(_object)}"
                    )
                    continue

                # Bind _object as a default argument so each generator keeps
                # its own file, even though it is defined inside the loop
                def result_generator(_object=_object):
                    temp = tempfile.TemporaryFile()
                    self.download_object_to_file(_object, temp)

                    for record in self._reader(temp):
                        yield record

                    self.checkpoint_object(_object)

                name = self.get_key(_object).split("/", self._dest_key_split)[-1]

                yield NormalizedJSONStream(name, result_generator())
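Parsing is delegated to `self._reader`, a callable that takes the downloaded file object and yields records. A hypothetical CSV implementation of that callable follows; the UTF-8 encoding and the rewind are assumptions about how the temporary file is handed over:

import codecs
import csv

def csv_records(temp_file):
    # Rewind the freshly downloaded file, decode it as UTF-8,
    # and yield one dict per CSV row
    temp_file.seek(0)
    text = codecs.getreader("utf-8")(temp_file)
    for row in csv.DictReader(text):
        yield row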
Example #9
    def write(self, stream):

        normalized_stream = NormalizedJSONStream.create_from_stream(stream)

        gcs_writer = GCSWriter(self._bucket, self._project_id)
        gcs_uri, blob = gcs_writer.write(normalized_stream)

        table_ref = self._get_table_ref()

        load_job = self._client.load_table_from_uri(
            gcs_uri, table_ref, job_config=self.job_config())

        logging.info("Loading data into BigQuery %s:%s", self._dataset,
                     self._table)
        result = load_job.result()

        assert result.state == "DONE"

        if not self._keep_files:
            logging.info("Deleting GCS file: %s", gcs_uri)
            blob.delete()
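The load itself is driven by `self.job_config()`. A minimal sketch of what that method could return for newline-delimited JSON files, using the google-cloud-bigquery client; the append disposition and schema autodetection are assumptions, not taken from the original:

from google.cloud import bigquery

def job_config(self):
    # Hypothetical configuration: newline-delimited JSON input,
    # appended to the destination table with an inferred schema
    config = bigquery.LoadJobConfig()
    config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    config.autodetect = True
    return config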
Example #10
    def read(self):

        yield NormalizedJSONStream(
            "results_" + self.object_type + "_" + "_".join(self.object_ids),
            self.get_data(),
        )
Example #11
    def read(self):
        yield NormalizedJSONStream("search_console_results",
                                   self.result_generator())
Example #12
    def read(self):
        FacebookAdsApi.init(self.app_id, self.app_secret, self.access_token)
        yield NormalizedJSONStream(
            "results_" + self.ad_object_type + "_" +
            "_".join(self.ad_object_ids), self.get_data())