def execute(self, context):
    """Fetch ratings for the configured period and write them to a JSON file.

    Ratings are requested through a ``MovielensHook`` created from
    ``self._conn_id`` for ``self._start_date`` to ``self._end_date``, using
    ``self._batch_size`` as the batch size, and are then dumped as JSON to
    ``self._output_path``.

    Args:
        context: Execution context supplied by the caller; not used here.
    """
    hook = MovielensHook(self._conn_id)

    try:
        self.log.info(
            f"Fetching ratings for {self._start_date} to {self._end_date}"
        )
        ratings = list(
            hook.get_ratings(
                start_date=self._start_date,
                end_date=self._end_date,
                batch_size=self._batch_size,
            )
        )
        self.log.info(f"Fetched {len(ratings)} ratings")
    finally:
        # Make sure we always close our hook's session.
        hook.close()

    self.log.info(f"Writing ratings to {self._output_path}")

    # Make sure the output directory exists. A bare file name has an empty
    # dirname and os.makedirs("") raises FileNotFoundError, so only create
    # the directory when the path actually contains one.
    output_dir = os.path.dirname(self._output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write output as JSON.
    with open(self._output_path, "w") as file_:
        json.dump(ratings, fp=file_)
def _fetch_ratings(api_conn_id, s3_conn_id, s3_bucket, **context):
    """Fetch one month of ratings from the API and upload them to S3 as CSV.

    The year and month are read from ``context["execution_date"]``. The
    ratings are staged in a temporary CSV file and uploaded to
    ``ratings/<year>/<month>.csv`` in ``s3_bucket``, replacing any existing
    object under that key.

    Args:
        api_conn_id: Connection id handed to ``MovielensHook``.
        s3_conn_id: Connection id handed to ``S3Hook``.
        s3_bucket: Name of the destination bucket.
        **context: Keyword arguments that must include ``execution_date``.
    """
    run_date = context["execution_date"]
    year, month = run_date.year, run_date.month

    # Pull the month's ratings from the API into a DataFrame.
    logging.info(f"Fetching ratings for {year}/{month:02d}")
    movielens = MovielensHook(conn_id=api_conn_id)
    records = movielens.get_ratings_for_month(year=year, month=month)
    ratings = pd.DataFrame.from_records(
        records,
        columns=["userId", "movieId", "rating", "timestamp"],
    )
    logging.info(f"Fetched {ratings.shape[0]} rows")

    # Stage the CSV in a temporary directory. The upload happens inside the
    # ``with`` block because the directory is removed when the block exits.
    with tempfile.TemporaryDirectory() as staging_dir:
        csv_path = path.join(staging_dir, "ratings.csv")
        ratings.to_csv(csv_path, index=False)

        # Push the staged file to S3.
        logging.info(f"Writing results to ratings/{year}/{month:02d}.csv")
        S3Hook(s3_conn_id).load_file(
            csv_path,
            key=f"ratings/{year}/{month:02d}.csv",
            bucket_name=s3_bucket,
            replace=True,
        )
def _fetch_ratings(api_conn_id, wasb_conn_id, container, **context):
    """Fetch one month of ratings from the API and upload them as a CSV blob.

    The year and month are read from ``context["execution_date"]``. The
    ratings are staged in a temporary CSV file and uploaded to the blob
    ``<year>/<month>.csv`` in ``container``.

    Args:
        api_conn_id: Connection id handed to ``MovielensHook``.
        wasb_conn_id: Connection id handed to ``WasbHook``.
        container: Name of the destination container.
        **context: Keyword arguments that must include ``execution_date``.
    """
    run_date = context["execution_date"]
    year, month = run_date.year, run_date.month

    # Pull the month's ratings from the API into a DataFrame.
    logging.info(f"Fetching ratings for {year}/{month:02d}")
    movielens = MovielensHook(conn_id=api_conn_id)
    records = movielens.get_ratings_for_month(year=year, month=month)
    ratings = pd.DataFrame.from_records(
        records,
        columns=["userId", "movieId", "rating", "timestamp"],
    )
    logging.info(f"Fetched {ratings.shape[0]} rows")

    # Stage the CSV in a temporary directory. The upload happens inside the
    # ``with`` block because the directory is removed when the block exits.
    with tempfile.TemporaryDirectory() as staging_dir:
        csv_path = path.join(staging_dir, "ratings.csv")
        ratings.to_csv(csv_path, index=False)

        # Push the staged file to Azure Blob storage.
        logging.info(f"Writing results to {container}/{year}/{month:02d}.csv")
        blob_hook = WasbHook(wasb_conn_id)
        blob_hook.load_file(
            csv_path,
            container_name=container,
            blob_name=f"{year}/{month:02d}.csv",
        )
def poke(self, context):
    """Report whether any ratings exist for the configured period.

    Requests ratings for ``self._start_date`` to ``self._end_date`` with a
    batch size of 1 and tries to pull a single record from the result.

    Args:
        context: Execution context supplied by the caller; not used here.

    Returns:
        True if at least one record was returned, False otherwise.
    """
    hook = MovielensHook(self._conn_id)

    try:
        # One record is enough to know that the period has data.
        records = hook.get_ratings(
            start_date=self._start_date,
            end_date=self._end_date,
            batch_size=1,
        )
        next(records)
    except StopIteration:
        # The request yielded nothing: there are no ratings for this period
        # yet, so report False.
        self.log.info(
            f"Didn't find any ratings for {self._start_date} "
            f"to {self._end_date}, waiting..."
        )
        return False
    else:
        # No StopIteration means at least one record came back, which we
        # signal to Airflow by returning True.
        self.log.info(
            f"Found ratings for {self._start_date} to {self._end_date}, continuing!"
        )
        return True
    finally:
        # Always release the hook's session, whichever branch returned.
        hook.close()
def _fetch_ratings(conn_id, templates_dict, batch_size=1000, **_):
    """Fetch ratings for a date range and write them to a JSON file.

    Args:
        conn_id: Connection id handed to ``MovielensHook``.
        templates_dict: Mapping that must provide ``start_date``,
            ``end_date`` and ``output_path``.
        batch_size: Batch size passed to ``MovielensHook.get_ratings``.
        **_: Extra keyword arguments, ignored.

    Raises:
        KeyError: If ``templates_dict`` lacks one of the required keys.
    """
    logger = logging.getLogger(__name__)

    start_date = templates_dict["start_date"]
    end_date = templates_dict["end_date"]
    output_path = templates_dict["output_path"]

    logger.info(f"Fetching ratings for {start_date} to {end_date}")
    hook = MovielensHook(conn_id=conn_id)
    try:
        ratings = list(
            hook.get_ratings(
                start_date=start_date,
                end_date=end_date,
                batch_size=batch_size,
            )
        )
    finally:
        # Make sure we always close our hook's session, even if fetching fails.
        hook.close()
    logger.info(f"Fetched {len(ratings)} ratings")

    logger.info(f"Writing ratings to {output_path}")

    # Make sure the output directory exists. A bare file name has an empty
    # dirname and os.makedirs("") raises FileNotFoundError, so only create
    # the directory when the path actually contains one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # json.dump writes to the file object; json.dumps only returns a string
    # and rejects the ``fp`` keyword with a TypeError.
    with open(output_path, "w") as file_:
        json.dump(ratings, fp=file_)