from datetime import datetime

from pyspark.sql import SparkSession


def import_data(context: Context, spark: SparkSession, aggregator: AggregationContext, data_source: str, limit: int = 50000):
    # Store the newest CSV batch in HBase for further batch processing.
    print(f"Start: {datetime.now()}")
    csv = load_newest(context, spark)
    context.save_hbase(csv)
    print(f"End: {datetime.now()}")

    # Update ingestion times for Flume; "opened" holds epoch seconds.
    latest = datetime.fromtimestamp(csv.first()["opened"])
    update_ingestion_times(data_source, latest)

    # Batch-process 15-minute intervals and persist the aggregates.
    aggregated = get_batch_processed(csv)
    aggregator.save_hbase(aggregated)
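
# A minimal sketch of what get_batch_processed could look like, assuming it
# buckets rows into 15-minute windows keyed on the epoch-seconds "opened"
# column. The real implementation is not shown above, so treat this purely
# as an illustration of the windowed aggregation step.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def get_batch_processed(df: DataFrame) -> DataFrame:
    # Cast epoch seconds to a proper timestamp, then count events per
    # 15-minute tumbling window.
    return (
        df.withColumn("ts", F.col("opened").cast("timestamp"))
          .groupBy(F.window("ts", "15 minutes"))
          .count()
    )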