Example #1
from datetime import datetime

from pyspark.sql import SparkSession


# Context and AggregationContext are assumed project-specific types that
# expose an HBase sink via save_hbase(); their imports are not shown here.
def import_data(context: Context,
                spark: SparkSession,
                aggregator: AggregationContext,
                data_source: str,
                limit: int = 50000):  # NOTE: limit is accepted but unused below
    # Store in HBase for further batch processing
    print(f"Start: {datetime.now()}")
    csv = load_newest(context, spark)
    context.save_hbase(csv)
    print(f"End: {datetime.now()}")

    # Update ingestion times for Flume
    latest = datetime.fromtimestamp(csv.first()["opened"])
    update_ingestion_times(data_source, latest)

    # Batch-process 15-minute intervals
    aggregated = get_batch_processed(csv)
    aggregator.save_hbase(aggregated)
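
For reference, a call might look like the sketch below. The snippet never shows how Context or AggregationContext are constructed, so the no-argument constructors and the "transactions" source name are purely illustrative assumptions.

# Hypothetical invocation; constructors and the data-source name below are
# assumptions, since the example does not show how these objects are created.
spark = SparkSession.builder.appName("import_data").getOrCreate()
context = Context()                # assumed project-specific constructor
aggregator = AggregationContext()  # assumed project-specific constructor
import_data(context, spark, aggregator, data_source="transactions", limit=50000)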