def execute_sampling(ldf: LuxDataFrame):
    """
    Compute and cache a sample of the overall dataframe on ``ldf._sampled``.

    - When sampling is enabled and the number of rows exceeds
      ``lux.config.sampling_cap``, the sample is capped at that many rows.
    - Otherwise, when sampling is enabled and the number of rows exceeds
      ``lux.config.sampling_start``, 75% of the rows are sampled.
    - Otherwise (small dataframe, or sampling disabled), ``ldf._sampled``
      is set to ``ldf`` itself.

    The original notes for this function give the defaults as
    ``sampling_start`` = 100k rows and ``sampling_cap`` = 1M rows; those
    values live in ``lux.config`` and are not defined here.

    Parameters
    ----------
    ldf : LuxDataFrame
        Dataframe to sample. Its ``_sampled`` attribute is written and a
        notice is registered on ``ldf._message`` whenever a sample is used.

    Returns
    -------
    None
    """
    SAMPLE_FLAG = lux.config.sampling
    SAMPLE_START = lux.config.sampling_start
    SAMPLE_CAP = lux.config.sampling_cap
    SAMPLE_FRAC = 0.75

    n_rows = len(ldf)
    if SAMPLE_FLAG and n_rows > SAMPLE_CAP:
        if ldf._sampled is None:
            # Memoize the unfiltered sample so repeated calls reuse it.
            ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1)
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is only visualizing a sample capped at {SAMPLE_CAP} rows.",
            priority=99,
        )
    elif SAMPLE_FLAG and n_rows > SAMPLE_START:
        if ldf._sampled is None:
            # Memoize the unfiltered sample so repeated calls reuse it.
            ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1)
        # ``:.0%`` renders the fraction 0.75 as "75%"; interpolating the raw
        # float followed by a literal "%" would misreport it as "0.75%".
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is visualizing a sample of {SAMPLE_FRAC:.0%} of the dataframe ({len(ldf._sampled)} rows).",
            priority=99,
        )
    else:
        # No sampling needed (or sampling disabled): use the full dataframe.
        ldf._sampled = ldf
def execute_sampling(ldf: LuxDataFrame):
    """
    Pick the rows of ``ldf`` that will be visualized and store them on
    ``ldf._sampled``.

    - More than 30000 rows: a random sample of exactly 30000 rows.
    - More than 10000 rows (up to 30000): a random 75% sample.
    - Otherwise: the dataframe itself, unsampled.

    A sample is only drawn when ``ldf._sampled`` is still ``None``; the
    corresponding notice is registered on ``ldf._message`` on every call
    that takes a sampling branch.

    Parameters
    ----------
    ldf : LuxDataFrame
        Dataframe whose ``_sampled`` attribute is populated.

    Returns
    -------
    None
    """
    start_threshold = 10000
    row_cap = 30000
    fraction = 0.75

    total_rows = len(ldf)

    # Small dataframe: nothing to sample.
    if total_rows <= start_threshold:
        ldf._sampled = ldf
        return

    # Very large dataframe: cap the sample at a fixed number of rows.
    if total_rows > row_cap:
        if ldf._sampled is None:
            ldf._sampled = ldf.sample(n=row_cap, random_state=1)
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is only visualizing a random sample capped at {row_cap} rows.",
            priority=99,
        )
        return

    # Moderately large dataframe: keep a fixed fraction of the rows.
    if ldf._sampled is None:
        ldf._sampled = ldf.sample(frac=fraction, random_state=1)
    ldf._message.add_unique(
        f"Large dataframe detected: Lux is only visualizing a random sample of {len(ldf._sampled)} rows.",
        priority=99,
    )
def execute(vislist: VisList, ldf: LuxDataFrame):
    """
    Populate each vis in ``vislist`` with the data it needs to render.

    For every vis, in order:

    1) Start from ``ldf`` and apply the vis's filters.
    2) Collect the attributes named in the vis's inferred intent
       (ignoring the ``"Record"`` placeholder).
    3) If the data has more than 10000 rows, replace it with a 75% random
       sample; the unfiltered sample is memoized on ``ldf._sampled``.
    4) Keep only the collected attribute columns.
    5) Run the mark-specific processing (aggregation, binning, or 2D
       binning for large scatterplots, which are switched to heatmaps).

    Results are stored on each vis (``vis._vis_data`` / ``vis._mark``).

    Parameters
    ----------
    vislist: list[lux.Vis]
        vis list that contains lux.Vis objects for visualization.
    ldf : lux.core.frame
        LuxDataFrame with specified intent.

    Returns
    -------
    None
    """
    row_threshold = 10000
    sample_fraction = 0.75

    for vis in vislist:
        # Every vis begins with the full content of the original dataframe.
        vis._vis_data = ldf
        was_filtered = PandasExecutor.execute_filter(vis)

        # Columns referenced by the intent; "Record" is not a real column.
        wanted = {
            clause.attribute
            for clause in vis._inferred_intent
            if clause.attribute and clause.attribute != "Record"
        }

        # General sampling for large inputs.
        if len(vis.data) > row_threshold:
            if was_filtered:
                # Filtered data is vis-specific, so it is sampled afresh.
                vis._vis_data = vis.data.sample(frac=sample_fraction, random_state=1)
            else:
                if ldf._sampled is None:
                    # Memoize the unfiltered sample on the dataframe.
                    ldf._sampled = vis.data.sample(frac=sample_fraction, random_state=1)
                vis._vis_data = ldf._sampled
            # TODO: Add some type of cap size on Nrows ?

        vis._vis_data = vis.data[list(wanted)]

        mark = vis.mark
        if mark in ("bar", "line"):
            PandasExecutor.execute_aggregate(vis, isFiltered=was_filtered)
        elif mark == "histogram":
            PandasExecutor.execute_binning(vis)
        elif mark == "scatter" and len(vis.data) > row_threshold:
            # Too many points for a scatterplot: render as a binned heatmap.
            vis._mark = "heatmap"
            PandasExecutor.execute_2D_binning(vis)