def execute(self, context):
        """Fetch ratings for the configured period and write them to a JSON file.

        Reads ``self._conn_id``, ``self._start_date``, ``self._end_date``,
        ``self._batch_size`` and ``self._output_path``. All ratings yielded by
        the hook are collected into a list in memory before being written.

        Args:
            context: Task context passed in by the caller; not used here.
        """
        hook = MovielensHook(self._conn_id)

        try:
            self.log.info(
                f"Fetching ratings for {self._start_date} to {self._end_date}")
            ratings = list(
                hook.get_ratings(
                    start_date=self._start_date,
                    end_date=self._end_date,
                    batch_size=self._batch_size,
                ))
            self.log.info(f"Fetched {len(ratings)} ratings")
        finally:
            # Make sure we always close our hook's session, even if the
            # fetch fails part-way through.
            hook.close()

        self.log.info(f"Writing ratings to {self._output_path}")

        # Make sure output directory exists. A bare filename has an empty
        # dirname, and os.makedirs("") raises FileNotFoundError, so only
        # create the directory when the path actually contains one.
        output_dir = os.path.dirname(self._output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Write output as JSON.
        with open(self._output_path, "w") as file_:
            json.dump(ratings, fp=file_)
예제 #2
0
    def _fetch_ratings(api_conn_id, s3_conn_id, s3_bucket, **context):
        """Fetch one month of ratings from the API and upload them to S3 as CSV.

        The year and month are taken from ``context["execution_date"]``. The
        ratings are written to ``ratings.csv`` in a temporary directory and
        then uploaded to ``s3_bucket`` under ``ratings/<year>/<month>.csv``
        with ``replace=True``.

        Args:
            api_conn_id: Connection id handed to ``MovielensHook``.
            s3_conn_id: Connection id handed to ``S3Hook``.
            s3_bucket: Name of the destination bucket.
            **context: Task context; only ``execution_date`` is read.
        """
        run_date = context["execution_date"]
        year, month = run_date.year, run_date.month

        # Pull the month's records from the API.
        logging.info(f"Fetching ratings for {year}/{month:02d}")

        records = MovielensHook(conn_id=api_conn_id).get_ratings_for_month(
            year=year, month=month
        )
        ratings = pd.DataFrame.from_records(
            records,
            columns=["userId", "movieId", "rating", "timestamp"],
        )

        logging.info(f"Fetched {ratings.shape[0]} rows")

        # Stage the CSV in a scratch directory; it is removed again when the
        # ``with`` block exits, so the upload has to happen inside it.
        with tempfile.TemporaryDirectory() as scratch_dir:
            csv_path = path.join(scratch_dir, "ratings.csv")
            ratings.to_csv(csv_path, index=False)

            # Push the staged file to S3.
            logging.info(f"Writing results to ratings/{year}/{month:02d}.csv")
            uploader = S3Hook(s3_conn_id)
            uploader.load_file(
                csv_path,
                key=f"ratings/{year}/{month:02d}.csv",
                bucket_name=s3_bucket,
                replace=True,
            )
def _fetch_ratings(api_conn_id, wasb_conn_id, container, **context):
    """Fetch one month of ratings from the API and upload them to Azure Blob.

    The year and month are taken from ``context["execution_date"]``. The
    ratings are written to ``ratings.csv`` in a temporary directory and then
    uploaded to ``container`` as the blob ``<year>/<month>.csv``.

    Args:
        api_conn_id: Connection id handed to ``MovielensHook``.
        wasb_conn_id: Connection id handed to ``WasbHook``.
        container: Name of the destination blob container.
        **context: Task context; only ``execution_date`` is read.
    """
    run_date = context["execution_date"]
    year, month = run_date.year, run_date.month

    logging.info(f"Fetching ratings for {year}/{month:02d}")

    records = MovielensHook(conn_id=api_conn_id).get_ratings_for_month(
        year=year, month=month
    )
    ratings = pd.DataFrame.from_records(
        records,
        columns=["userId", "movieId", "rating", "timestamp"],
    )

    logging.info(f"Fetched {ratings.shape[0]} rows")

    # Stage the CSV in a scratch directory; it is removed again when the
    # ``with`` block exits, so the upload has to happen inside it.
    with tempfile.TemporaryDirectory() as scratch_dir:
        csv_path = path.join(scratch_dir, "ratings.csv")
        ratings.to_csv(csv_path, index=False)

        # Push the staged file to Azure Blob storage.
        # NOTE(review): no overwrite option is passed; whether a re-run for
        # the same month replaces an existing blob depends on the installed
        # WasbHook / Azure SDK version -- confirm.
        logging.info(f"Writing results to {container}/{year}/{month:02d}.csv")
        uploader = WasbHook(wasb_conn_id)
        uploader.load_file(
            csv_path,
            container_name=container,
            blob_name=f"{year}/{month:02d}.csv",
        )
예제 #4
0
    def poke(self, context):
        """Report whether any ratings exist for the configured period.

        Requests a single record (``batch_size=1``) for
        ``self._start_date`` .. ``self._end_date`` and checks whether the
        hook yields anything.

        Args:
            context: Task context passed in by the caller; not used here.

        Returns:
            True if at least one rating was yielded, False if the hook
            yielded nothing.
        """
        hook = MovielensHook(self._conn_id)

        try:
            # Only the first record matters: pulling one item is enough to
            # know whether the period has any data at all.
            next(
                hook.get_ratings(
                    start_date=self._start_date,
                    end_date=self._end_date,
                    batch_size=1,
                )
            )
        except StopIteration:
            # The iterator was exhausted immediately, so there are no ratings
            # for this period yet; returning False signals "not ready".
            self.log.info(
                f"Didn't find any ratings for {self._start_date} "
                f"to {self._end_date}, waiting..."
            )
            return False
        else:
            # No StopIteration means at least one record came back, so the
            # condition is met and we return True.
            self.log.info(
                f"Found ratings for {self._start_date} to {self._end_date}, continuing!"
            )
            return True
        finally:
            # Runs before either return above: always release the hook's
            # session.
            hook.close()
예제 #5
0
    def _fetch_ratings(conn_id, templates_dict, batch_size=1000, **_):
        """Fetch ratings for a date range and write them to a JSON file.

        Args:
            conn_id: Connection id handed to ``MovielensHook``.
            templates_dict: Mapping that must contain ``start_date``,
                ``end_date`` and ``output_path``.
            batch_size: Batch size forwarded to ``hook.get_ratings``.
            **_: Remaining keyword arguments are accepted and ignored.

        Raises:
            KeyError: If ``templates_dict`` lacks one of the required keys.
        """
        logger = logging.getLogger(__name__)

        start_date = templates_dict["start_date"]
        end_date = templates_dict["end_date"]
        output_path = templates_dict["output_path"]

        logger.info(f"Fetching ratings for {start_date} to {end_date}")
        # NOTE(review): the hook is never closed here; if this MovielensHook
        # exposes close(), consider wrapping the fetch in try/finally --
        # confirm against the hook implementation.
        hook = MovielensHook(conn_id=conn_id)
        ratings = list(
            hook.get_ratings(start_date=start_date,
                             end_date=end_date,
                             batch_size=batch_size))
        logger.info(f"Fetched {len(ratings)} ratings")
        logger.info(f"Writing ratings to {output_path}")

        # Make sure the output directory exists. A bare filename has an empty
        # dirname, and os.makedirs("") raises FileNotFoundError, so only
        # create the directory when the path actually contains one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # json.dump (not json.dumps) is the variant that writes to a file
        # object; json.dumps rejects the ``fp`` keyword with a TypeError.
        with open(output_path, "w") as file_:
            json.dump(ratings, fp=file_)