def range(cls, columns, dimension):
    column = columns.data[columns.get_dimension(dimension).name]
    if column.dtype.kind == 'O':
        column = np.sort(column[column.notnull()].compute())
        return column[0], column[-1]
    else:
        return dd.compute(column.min(), column.max())
def range(cls, dataset, dimension):
    import dask.dataframe as dd
    column = dataset.data[dataset.get_dimension(dimension).name]
    if column.dtype.kind == 'O':
        column = np.sort(column[column.notnull()].compute())
        return (column[0], column[-1]) if len(column) else (None, None)
    else:
        return dd.compute(column.min(), column.max())
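Both range() variants above follow the same pattern: build the lazy min()/max() aggregations first, then evaluate them together with a single dask.dataframe compute(...) call so shared work (such as reading the partitions) is done once. A minimal, self-contained sketch of that pattern; the column name and values are made up for illustration:

import dask.dataframe as dd
import pandas as pd

# Made-up example data; any numeric column behaves the same way.
pdf = pd.DataFrame({"value": [3.0, 1.5, 7.2, -2.1]})
dask_df = dd.from_pandas(pdf, npartitions=2)

column = dask_df["value"]
# Both aggregations are evaluated together in one compute call.
cmin, cmax = dd.compute(column.min(), column.max())
print(cmin, cmax)  # -2.1 7.2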
def complicated_arithmetic_operation(df):
    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2 - theta_1) / 2 * np.pi / 180)**2
            + np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180)
            * np.sin((phi_2 - phi_1) / 2 * np.pi / 180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
    return dd.compute(ret)
def test_consistency_interactions_episode_numbers(dataset: ContentWiseImpressions):
    na_episode_number_mask: ddf.Series = dataset.interactions.episode_number.isna()
    invalid_episode_number_mask: ddf.Series = (dataset.interactions.episode_number < 0)

    (na_episode_number_mask, invalid_episode_number_mask,) = ddf.compute(
        na_episode_number_mask, invalid_episode_number_mask,
    )

    assert not na_episode_number_mask.any()
    assert not invalid_episode_number_mask.any()
def test_consistency_impressions_non_direct_link_recommended_lists_with_at_least_one_item(dataset: ContentWiseImpressions):
    empty_recommendation_list_mask = (dataset
                                      .impressions_non_direct_link
                                      .recommended_series_list
                                      .map(lambda recommended_series_list: recommended_series_list.shape[0] == 0,
                                           meta=("empty_recommendation_list_mask", "bool")))

    (empty_recommendation_list_mask,) = ddf.compute(empty_recommendation_list_mask,)

    assert not empty_recommendation_list_mask.any(skipna=False)
def test_consistency_impressions_non_direct_link_recommended_series(dataset: ContentWiseImpressions):
    na_recommended_series_map_mask: ddf.Series = (dataset
                                                  .impressions_non_direct_link
                                                  .recommended_series_list
                                                  .map(lambda recommended_series_list: np.any(np.isnan(recommended_series_list)),
                                                       meta=("na_recommended_series_mask", "bool")))

    (na_recommended_series_map_mask,) = ddf.compute(na_recommended_series_map_mask)

    assert not na_recommended_series_map_mask.any(skipna=False)
def test_consistency_impressions_non_direct_link_row_position(dataset: ContentWiseImpressions):
    na_row_position_mask: ddf.Series = dataset.impressions_non_direct_link.row_position.isna()
    # Both masks refer to the non-direct-link impressions, matching the test name.
    row_position_less_than_zero_mask: ddf.Series = (dataset.impressions_non_direct_link.row_position < 0)

    (na_row_position_mask, row_position_less_than_zero_mask,) = ddf.compute(
        na_row_position_mask, row_position_less_than_zero_mask,
    )

    assert not na_row_position_mask.any(skipna=False)
    assert not row_position_less_than_zero_mask.any(skipna=False)
def test_consistency_interactions_series_length(dataset: ContentWiseImpressions):
    na_series_length_mask: ddf.Series = dataset.interactions.series_length.isna()
    invalid_series_length_mask: ddf.Series = (dataset.interactions.series_length < 0)

    (na_series_length_mask, invalid_series_length_mask,) = ddf.compute(
        na_series_length_mask, invalid_series_length_mask,
    )

    assert not na_series_length_mask.any()
    assert not invalid_series_length_mask.any()
def format_source_data(self, data):
    """
    Description: format source
    -------------------------------------------
    Input:
    source_dict = {
        'X': [],
        'Y': []
    }
    -------------------------------------------
    Output:
    """
    self.source = data

    self.x_range = (self.source[self.x].min(), self.source[self.x].max())
    self.y_range = (self.source[self.y].min(), self.source[self.y].max())
    if isinstance(data, dask_cudf.core.DataFrame):
        self.x_range = dd.compute(*self.x_range)
        self.y_range = dd.compute(*self.y_range)
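When the data is dask-backed, the two dd.compute calls above each trigger a separate pass over the data. A hedged sketch of a possible alternative (not the library's actual implementation; compute_ranges is a hypothetical helper) that batches all four aggregations into a single call:

import dask.dataframe as dd

def compute_ranges(frame, x, y):
    # Hypothetical helper: evaluate the min/max of two columns in one pass.
    x_min, x_max, y_min, y_max = dd.compute(
        frame[x].min(), frame[x].max(), frame[y].min(), frame[y].max()
    )
    return (x_min, x_max), (y_min, y_max)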
def test_consistency_impressions_direct_link_recommendation_list_length(dataset: ContentWiseImpressions):
    na_recommendation_list_length_mask: ddf.Series = dataset.impressions_direct_link.recommendation_list_length.isna()
    recommendation_list_length_less_than_zero_mask: ddf.Series = (
        dataset.impressions_direct_link.recommendation_list_length < 0)

    (na_recommendation_list_length_mask, recommendation_list_length_less_than_zero_mask,) = ddf.compute(
        na_recommendation_list_length_mask, recommendation_list_length_less_than_zero_mask,
    )

    assert not na_recommendation_list_length_mask.any(skipna=False)
    assert not recommendation_list_length_less_than_zero_mask.any(skipna=False)
def range(cls, dataset, dimension):
    import dask.dataframe as dd
    dimension = dataset.get_dimension(dimension, strict=True)
    column = dataset.data[dimension.name]
    if column.dtype.kind == 'O':
        column = np.sort(column[column.notnull()].compute())
        return (column[0], column[-1]) if len(column) else (None, None)
    else:
        if dimension.nodata is not None:
            column = cls.replace_value(column, dimension.nodata)
        return dd.compute(column.min(), column.max())
def test_consistency_interactions_index(dataset: ContentWiseImpressions):
    na_index_mask: ddf.Series = dataset.interactions.index.isna()
    min_index: int = dataset.interactions.index.min()
    max_index: int = dataset.interactions.index.max()

    (na_index_mask, min_index, max_index,) = ddf.compute(na_index_mask, min_index, max_index,)

    assert not na_index_mask.any()
def test_consistency_interactions_items_have_same_series_length(dataset: ContentWiseImpressions):
    pairs_item_id_with_series_length = (dataset
                                        .interactions[["item_id", "series_length"]]
                                        .groupby("item_id")
                                        .series_length
                                        .agg(["min", "max"]))

    invalid_pairs_mask = (pairs_item_id_with_series_length["min"]
                          != pairs_item_id_with_series_length["max"])

    (invalid_pairs_mask,) = ddf.compute(invalid_pairs_mask, scheduler="threads")

    assert not invalid_pairs_mask.any()
def test_consistency_interactions_items_have_only_one_episode_number(dataset: ContentWiseImpressions):
    pairs_item_id_with_episode_number = (dataset
                                         .interactions[["item_id", "episode_number"]]
                                         .groupby("item_id")
                                         .episode_number
                                         .agg(["min", "max"]))

    invalid_pairs_mask = (pairs_item_id_with_episode_number["min"]
                          != pairs_item_id_with_episode_number["max"])

    (invalid_pairs_mask,) = ddf.compute(invalid_pairs_mask, scheduler="threads")

    assert not invalid_pairs_mask.any()
def test_consistency_interactions_item_types(dataset: ContentWiseImpressions):
    na_item_type_mask: ddf.Series = dataset.interactions.item_type.isna()
    invalid_item_types_mask: ddf.Series = (dataset.interactions
                                           .item_type
                                           .map(lambda item_type: item_type not in {0, 1, 2, 3}))

    (na_item_type_mask, invalid_item_types_mask,) = ddf.compute(
        na_item_type_mask, invalid_item_types_mask,
    )

    assert not na_item_type_mask.any()
    assert not invalid_item_types_mask.any()
def test_consistency_interactions_series_ids(dataset: ContentWiseImpressions):
    expected_number_series: int = dataset.metadata["num_series"]

    na_series_id_mask: ddf.Series = dataset.interactions.series_id.isna()
    invalid_series_id_mask: ddf.Series = ((dataset.interactions.series_id < 0)
                                          | (dataset.interactions.series_id > expected_number_series))

    (na_series_id_mask, invalid_series_id_mask,) = ddf.compute(
        na_series_id_mask, invalid_series_id_mask,
    )

    assert not na_series_id_mask.any()
    assert not invalid_series_id_mask.any()
def test_consistency_impressions_non_direct_link_reported_length_equal_to_actual_length(dataset: ContentWiseImpressions):
    recommendation_list_length = dataset.impressions_non_direct_link.recommendation_list_length
    actual_length_of_recommended_series = (dataset
                                           .impressions_non_direct_link
                                           .recommended_series_list
                                           .map(lambda series: series.shape[0],
                                                meta=("actual_length_of_recommended_series", "int")))

    impressions_with_mismatching_length_mask = (recommendation_list_length
                                                != actual_length_of_recommended_series)

    (impressions_with_mismatching_length_mask,) = ddf.compute(impressions_with_mismatching_length_mask)

    assert not impressions_with_mismatching_length_mask.any(skipna=False)
def create_posjac(self):
    '''
    Replace our sparse jacobian with a positive variation
    (negative links are reversed).
    Self reactions are removed and nonexistent species are removed.

    Args:
        ignore - list of species to be ignored in posjac array
                 (most commonly inorganics)
    '''
    print('computing the posjac array')
    try:
        self.posjac
        print('Posjac already exists, use "del <name>.posjac" to remove it')
    except AttributeError:
        pass

    # remove nonexistent species
    #rm = re.compile(r'\b%s\b' % '|'.join(set('->'.join(self.jacsp.columns).split('->')) - set(self.spec.columns)))
    #self.posjac = self.jacsp[filter(lambda x: not rm.search(x), self.jacsp.columns)]

    # self reactions and negatives
    contains = set(self.jacsp.columns)
    selfself = set('%s->%s' % (i, i) for i in self.spec.columns)
    rxns = list(set(self.jacsp.columns) - selfself)
    self.posjac = dd.compute(self.jacsp[rxns])[0]

    rev = re.compile(r'(.+)->(.+)')
    # for each negative reaction
    for h in rxns:
        # our column
        dummy = self.posjac[h]
        # save static positive values - unchanged
        self.posjac[h] = dummy * (dummy > 0).astype(float)
        # negative (reverse) values only
        lt = dummy < 0
        mx = np.array(dummy * (-lt.astype(float)))
        # reverse link
        hp = rev.sub(r'\2->\1', h)
        try:
            self.posjac[hp] = self.posjac[hp] + mx
        except KeyError:
            self.posjac[hp] = mx

    # remove empty columns
    self.posjac = self.posjac[self.posjac.columns[(self.posjac != 0).sum().astype(bool)]]
def normalize(self, df):
    try:
        min_date, max_date = dd.compute(df.block_timestamp.min(), df.block_timestamp.max())
        self.day_diff = abs((max_date - min_date).days)
        logger.error("NORMALIZATION started for day-diff:%s day(s)", self.day_diff)
        if self.day_diff > 0:
            for col in df.columns:
                if isinstance(col, int) or isinstance(col, float):
                    # note: this checks the type of the column label, not the column dtype
                    logger.warning("NORMALIZATION ONGOING FOR %s", col)
                    df[col] = df[col].map(self.divide_by_day_diff)
        logger.warning("NORMALIZATION ended for day-diff:%s days", self.day_diff)
        return df
    except Exception:
        logger.error('normalize:', exc_info=True)
def test_consistency_interactions_impressions_direct_link_only_common_recommendation_ids(dataset: ContentWiseImpressions):
    unique_shared_recommendation_ids = (dataset
                                        .interactions
                                        .merge(right=dataset.impressions_direct_link,
                                               how="inner",
                                               left_on="recommendation_id",
                                               right_index=True)
                                        .recommendation_id
                                        .unique())

    # The missing recommendation id (-1) counts as its own recommendation id. The inner merge above
    # drops that value, so we add one to the count here.
    num_unique_shared_recommendation_ids = unique_shared_recommendation_ids.shape[0] + 1

    (num_unique_shared_recommendation_ids,) = ddf.compute(num_unique_shared_recommendation_ids)

    assert num_unique_shared_recommendation_ids == dataset.metadata["num_recommendations"]
def get_albedo(self, datapaths, albedo_type):
    # load albedos and extract data
    DS = xr.open_mfdataset(datapaths[:], parallel=True, chunks='auto')  # lazy-loading the netCDF files

    ## Get QFLAG
    ## Give dtype here because lazy loading can't infer it (float by default but right_shift requires int)
    da_qflag = DS['QFLAG'].astype(np.uint8)
    #da_snowmask = np.logical_and(np.right_shift(da_qflag, 5), 1)==1  # True if snow, False otherwise
    da_snowmask = (da_qflag & 32) == 32  # True if snow, False otherwise

    ## Get albedo data
    da_al = DS[albedo_type]  # getting data for specific band
    if self.mode == 'nosnow':
        da_al = da_al.where(~da_snowmask)  # filter out snow: set to nan where da_snowmask is True
    elif self.mode == 'snowmask':
        da_al = da_snowmask

    #da_mean_lowres = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).mean('time')  # downsampling for faster plotting
    #da_mean_lowres = da_al.isel(lon=slice(5400, 6100), lat=slice(2100, 2700)).mean('time')  # center of Africa
    #da_mean_lowres = da_al.isel(lon=slice(6000, 9000), lat=slice(None, 2000)).mean('time')  # Asia
    da_mean_lowres = da_al.isel(lat=slice(None, 1600)).mean('time')  # High latitudes
    #da_mean_lowres = da_al.isel(lon=slice(22400, 33600, 10), lat=slice(None, 7467, 10)).mean('time')  # Asia for 1KM
    #da_mean_lowres = da_al.isel(lon=slice(7000, 9000), lat=slice(700, 1700)).mean('time')  # himalaya
    #da_mean_lowres = da_al.isel(lon=slice(7500, 8300), lat=slice(1200, 1600)).mean('time')  # himalaya zoom
    #da_mean_lowres = da_al.isel(lon=slice(7640, 7760), lat=slice(1300, 1360)).mean('time')  # himalaya big zoom
    #da_mean_lowres = da_al.mean('time')  # Full res

    if self.mode == 'snowmask':
        da_mean_lowres = da_mean_lowres.where(da_mean_lowres > 1.e-6)

    # getting average, min and max albedos for each time step (used to plot timeline)
    if 0:
        da_timeline_mean = da_al.mean(['lon', 'lat'])
        da_timeline_max = da_al.max(['lon', 'lat'])
        da_timeline_min = da_al.min(['lon', 'lat'])
    da_timeline_mean = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).mean(['lon', 'lat'])
    da_timeline_max = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).max(['lon', 'lat'])
    da_timeline_min = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).min(['lon', 'lat'])

    res_comp = dd.compute(da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min)
    return res_comp
def main():
    input_path = "/Users/Jason/Downloads/tempdata/Run06136_r0.tio"
    pedestal_path = "/Users/Jason/Downloads/tempdata/Run06136_ped.tcal"
    max_events = None

    reader = TIOReader(input_path, max_events=max_events)
    wf_calib = WaveformCalibrator(
        pedestal_path, reader.n_pixels, reader.n_samples
    )

    dtio = DaskTIO(reader, wf_calib)
    ddf = dtio.get_file_df()
    # print("here")
    df_0, df_2 = dd.compute(
        ddf.groupby(['ipix', 'fblock', 'fbpisam'])['r0'].std(),
        ddf.groupby(['ipix', 'fci', 'fbpisam'])['r0'].std(),
    )
    embed()
def test_consistency_interactions_vision_factor(dataset: ContentWiseImpressions):
    # Vision factor values should only be set when the interaction type is "Viewed" (0).
    # We verify that all "viewed" interactions have valid values (from 0.0 to 1.0).
    # For any other interaction, we verify that the value is -1.
    na_vision_factor_mask: ddf.Series = dataset.interactions.vision_factor.isna()

    viewed_interactions_vision_factors = dataset.interactions[dataset.interactions.interaction_type == 0].vision_factor
    viewed_invalid_vision_factor_mask: ddf.Series = ((viewed_interactions_vision_factors < 0)
                                                     | (viewed_interactions_vision_factors > 5.0))

    other_interactions_vision_factors = dataset.interactions[dataset.interactions.interaction_type != 0].vision_factor
    other_invalid_vision_factor_mask: ddf.Series = (other_interactions_vision_factors != -1.0)

    (na_vision_factor_mask, viewed_invalid_vision_factor_mask, other_invalid_vision_factor_mask,) = ddf.compute(
        na_vision_factor_mask, viewed_invalid_vision_factor_mask, other_invalid_vision_factor_mask,
    )

    assert not na_vision_factor_mask.any()
    assert not viewed_invalid_vision_factor_mask.any()
    assert not other_invalid_vision_factor_mask.any()
def min_max_count(x, column=0):
    """
    min_max_count

    Handles min, max and count. This works on lists, numpy arrays,
    and pandas or dask dataframes.

    :param column: (unused)
    :param x: list, numpy array, series, pandas or dask dataframe
    :return: min, max and count
    """
    if dd and type(x) in (dd.core.DataFrame, dd.core.Series):
        omin, omax, count = dd.compute(x.min(), x.max(), x.count())
    elif type(x) in (pd.DataFrame, pd.Series):
        omin = x.min()
        omax = x.max()
        count = len(x)
    else:
        omin = min(x)
        omax = max(x)
        count = len(x)
    return omin, omax, int(count)
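A short usage sketch for min_max_count above; the sample values are made up, and pd/dd refer to pandas and dask.dataframe as in the snippet:

import pandas as pd
import dask.dataframe as dd

values = pd.Series([4, 8, 15, 16, 23, 42])
dask_series = dd.from_pandas(values, npartitions=2)

print(min_max_count([3, 1, 2]))    # (1, 3, 3) via built-in min/max/len
print(min_max_count(values))       # pandas path
print(min_max_count(dask_series))  # dask path: one dd.compute() for min, max and count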
def missing_spectrum(df: dd.DataFrame, num_bins: int, num_cols: int) -> Intermediate:
    """
    Calculate a missing spectrum for each column
    """
    # pylint: disable=too-many-locals
    num_bins = min(num_bins, len(df) - 1)

    df = df.iloc[:, :num_cols]
    cols = df.columns[:num_cols]
    ncols = len(cols)
    nrows = len(df)
    chunk_size = len(df) // num_bins

    data = df.isnull().to_dask_array()
    data.compute_chunk_sizes()
    data = data.rechunk((chunk_size, None))

    # data is the isnull() mask, so this is the fraction of missing values per column
    (notnull_counts,) = dd.compute(data.sum(axis=0) / data.shape[0])
    missing_percent = {col: notnull_counts[idx] for idx, col in enumerate(cols)}

    missing_percs = data.map_blocks(missing_perc_blockwise, dtype=float).compute()
    locs0 = np.arange(len(missing_percs)) * chunk_size
    locs1 = np.minimum(locs0 + chunk_size, nrows)
    locs_middle = locs0 + chunk_size / 2

    df = pd.DataFrame({
        "column": np.repeat(cols.values, len(missing_percs)),
        "location": np.tile(locs_middle, ncols),
        "missing_rate": missing_percs.T.ravel(),
        "loc_start": np.tile(locs0, ncols),
        "loc_end": np.tile(locs1, ncols),
    })

    return Intermediate(data=df, missing_percent=missing_percent, visual_type="missing_spectrum")
def measureValue(sessionStoreData, relayoutData, selectedData, measuresValue, fixFilterValue, clearFiltersButton):
    min = inventory.begin.min()
    max = inventory.end.max()
    startValue = 1970
    ctx = dash.callback_context
    if ctx.triggered[0]['prop_id'].split('.')[0] == 'measures' and len(ctx.triggered) > 1:
        value = [startValue, max]
    elif ctx.triggered[0]['prop_id'].split('.')[0] == 'clearFiltersButton':
        value = [startValue, max]
    elif fixFilterValue == 'Time':
        raise PreventUpdate
    elif relayoutData.get('dragmode') == 'lasso' and selectedData is None:
        raise PreventUpdate
    else:
        df = inventory
        dd = filter_by_mapbox_data(df, relayoutData, selectedData)  # note: local "dd" is a dask dataframe, shadowing the module alias
        df = dd.compute()
        value = [df.begin.min(), df.end.max()]

    markColor = '#EBEBEB'
    marks = {}
    for year in range(min, max, 20):
        marks.update({year: {'label': str(year), 'style': {'color': markColor}}})
    marks.update({max: {'label': str(max), 'style': {'color': markColor}}})

    sliderDict = {'min': min, 'max': max, 'value': value, 'marks': marks}
    setRedis('sliderValue', sliderDict, sessionStoreData)
    return 'Computed'
def create_posjac(self):
    '''
    Replace our sparse jacobian with a positive variation
    (negative links are reversed).
    Self reactions are removed and nonexistent species are removed.
    '''
    print('computing the posjac array')
    try:
        return self.posjac
    except AttributeError:
        pass

    # remove nonexistent species
    #rm = re.compile(r'\b%s\b' % '|'.join(set('->'.join(self.jacsp.columns).split('->')) - set(self.spec.columns)))
    #self.posjac = self.jacsp[filter(lambda x: not rm.search(x), self.jacsp.columns)]

    # self reactions and negatives
    contains = set(self.jacsp.columns[(self.jacsp < 0).sum().astype(bool)])
    selfself = set('%s->%s' % (i, i) for i in self.spec.columns)
    self.posjac = dd.compute(
        self.jacsp[list(set(self.jacsp.columns) - selfself)])[0]

    rev = re.compile(r'(.+)->(.+)')
    # for each negative reaction
    for h in contains - selfself:
        dummy = self.posjac[h]
        lt = dummy < 0
        mx = np.array(dummy * (-lt.astype(float)))
        self.posjac[h] = dummy * (dummy > 0)

        hp = rev.sub(r'\2->\1', h)
        try:
            self.posjac[hp] = self.posjac[hp] + mx
        except KeyError:
            self.posjac[hp] = mx

    # remove empty columns
    self.posjac = self.posjac[self.posjac.columns[(self.posjac > 0).any()]]
def test_consistency_interactions_impressions_direct_link_interacted_items_are_inside_recommendation_list(dataset: ContentWiseImpressions):
    def get_series_index_on_recommendation_list(row) -> int:
        results = np.where(row.recommended_series_list == row.series_id)
        indices: np.ndarray = results[0]
        if len(indices) == 0:
            return -1
        return indices[0]

    dataset: ddf.DataFrame = dataset.interactions.merge(right=dataset.impressions_direct_link,
                                                        how="inner",
                                                        left_on="recommendation_id",
                                                        right_index=True)

    dataset["recommendation_index"] = dataset.apply(get_series_index_on_recommendation_list,
                                                    axis="columns",
                                                    meta=("recommendation_index", "int32"))

    series_not_found_on_recommendation_mask: ddf.Series = (dataset.recommendation_index == -1)

    (series_not_found_on_recommendation_mask,) = ddf.compute(series_not_found_on_recommendation_mask)

    assert not series_not_found_on_recommendation_mask.any(skipna=False)
def transform(self, input_scores, calibrated_scores):
    """
    Calibrates a score

    Parameters
    ----------

      input_scores: list
         Input score files to be calibrated

      calibrated_scores: list
         Output score files

    """
    assert isinstance(input_scores, list) or isinstance(input_scores, tuple)
    assert isinstance(calibrated_scores, list) or isinstance(calibrated_scores, tuple)
    assert len(calibrated_scores) == len(input_scores)

    for file_name, output_file_name in zip(input_scores, calibrated_scores):
        # Fetching scores
        dataframe = dask.dataframe.read_csv(file_name)
        dataframe = dataframe.compute()

        X = dataframe["score"].to_numpy()
        calibrated_scores = np.vstack(
            [fitter.predict_proba(X) for fitter in self._categorical_fitters]
        ).T
        calibrated_scores = self.reduction_function(calibrated_scores, axis=1)

        dataframe["score"] = calibrated_scores
        dataframe.to_csv(output_file_name, index=False)

    return calibrated_scores
def test_consistency_interactions_explicit_ratings(dataset: ContentWiseImpressions):
    # Explicit rating values should only be set when the interaction type is "Rated" (2).
    # We verify that all "rated" interactions have valid values (from 0.0 to 5.0 in steps of 0.5).
    # For any other interaction, we verify that the value is -1.
    na_explicit_ratings_mask: ddf.Series = dataset.interactions.explicit_rating.isna()

    rated_interactions_explicit_ratings: ddf.Series = dataset.interactions[dataset.interactions.interaction_type == 2].explicit_rating
    rated_invalid_explicit_ratings_mask: ddf.Series = (rated_interactions_explicit_ratings
                                                       .map(lambda rating: rating not in np.linspace(0.0, 5.0, num=11)))

    other_interactions_explicit_rating = dataset.interactions[dataset.interactions.interaction_type != 2].explicit_rating
    other_invalid_explicit_ratings_mask: ddf.Series = (other_interactions_explicit_rating != -1.0)

    (na_explicit_ratings_mask, rated_invalid_explicit_ratings_mask, other_invalid_explicit_ratings_mask,) = ddf.compute(
        na_explicit_ratings_mask, rated_invalid_explicit_ratings_mask, other_invalid_explicit_ratings_mask,
    )

    assert not na_explicit_ratings_mask.any()
    assert not rated_invalid_explicit_ratings_mask.any()
    assert not other_invalid_explicit_ratings_mask.any()
def test_consistency_interactions_impressions_non_direct_link_only_common_user_ids(dataset: ContentWiseImpressions):
    # NOTE: We deduplicate the user_ids on impressions_non_direct_link before merging because the
    # merge uses far more memory if it is not done in this way.
    unique_user_ids_on_impressions_non_direct_link = (dataset
                                                      .impressions_non_direct_link
                                                      .reset_index(drop=False)
                                                      .user_id
                                                      .unique()
                                                      .to_frame(name='user_id'))

    unique_shared_user_ids = (dataset
                              .interactions
                              .merge(right=unique_user_ids_on_impressions_non_direct_link,
                                     how="inner",
                                     left_on="user_id",
                                     right_on="user_id")
                              .user_id
                              .unique())

    num_unique_shared_user_ids = unique_shared_user_ids.shape[0]

    (num_unique_shared_user_ids,) = ddf.compute(num_unique_shared_user_ids)

    assert num_unique_shared_user_ids == dataset.metadata["num_users"]
def missing_impact_1v1(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ndist_sample: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    # pylint: disable=too-many-arguments
    """
    Calculate the distribution change on another column y
    when the missing values in x are dropped.
    """
    df0 = df[[x, y]]
    df1 = df.dropna(subset=[x])

    srs0, srs1 = df0[y], df1[y]
    minimum, maximum = srs0.min(), srs0.max()

    hists = [
        histogram(srs, dtype=dtype, bins=bins, return_edges=True)
        for srs in [srs0, srs1]
    ]
    hists = da.compute(*hists)

    meta = ColumnsMetadata()
    meta["y", "dtype"] = detect_dtype(df[y], dtype)

    if is_dtype(detect_dtype(df[y], dtype), Continuous()):
        dists = [rv_histogram((hist[0], hist[2])) for hist in hists]  # type: ignore
        xs = np.linspace(minimum, maximum, ndist_sample)

        pdfs = [dist.pdf(xs) for dist in dists]
        cdfs = [dist.cdf(xs) for dist in dists]

        distdf = pd.DataFrame({
            "x": np.tile(xs, 2),
            "pdf": np.concatenate(pdfs),
            "cdf": np.concatenate(cdfs),
            "label": np.repeat(LABELS, ndist_sample),
        })

        counts, xs, edges = zip(*hists)

        lower_bounds: List[float] = []
        upper_bounds: List[float] = []
        for edge in edges:
            lower_bounds.extend(edge[:-1])
            upper_bounds.extend(edge[1:])

        histdf = pd.DataFrame({
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": np.repeat(LABELS, [len(count) for count in counts]),
            "lower_bound": lower_bounds,
            "upper_bound": upper_bounds,
        })

        quantiles = [
            [srs.quantile(q) for q in [0, 0.25, 0.5, 0.75, 1]] for srs in [srs0, srs1]
        ]
        quantiles = dd.compute(*quantiles)

        boxdf = pd.DataFrame(quantiles)
        boxdf.columns = ["min", "q1", "q2", "q3", "max"]

        # standard box-plot whiskers: q3 + 1.5*IQR above, q1 - 1.5*IQR below
        iqr = boxdf["q3"] - boxdf["q1"]
        boxdf["upper"] = np.minimum(boxdf["q3"] + 1.5 * iqr, boxdf["max"])
        boxdf["lower"] = np.maximum(boxdf["q1"] - 1.5 * iqr, boxdf["min"])
        boxdf["label"] = LABELS

        itmdt = Intermediate(
            dist=distdf,
            hist=histdf,
            box=boxdf,
            meta=meta["y"],
            x=x,
            y=y,
            visual_type="missing_impact_1v1",
        )
        return itmdt
    else:
        counts, xs = zip(*hists)

        df = pd.DataFrame({
            "x": np.concatenate(xs, axis=0),
            "count": np.concatenate(counts, axis=0),
            "label": np.repeat(LABELS, [len(count) for count in counts]),
        })

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins:
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            partial = (bins, len(counts[0]))
        else:
            partial = (len(counts[0]), len(counts[0]))

        meta["y", "partial"] = partial

        itmdt = Intermediate(
            hist=df,
            x=x,
            y=y,
            meta=meta["y"],
            visual_type="missing_impact_1v1",
        )
        return itmdt
def missing_impact_1vn(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Calculate the distribution change on other columns
    when the missing values in x are dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}
    hists_restore_dtype = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_dtype(detect_dtype(df0[col], dtype), Continuous()):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col], dtype=dtype, bins=bins, return_edges=True, range=range)
            for df in [df0, df1]
        ]

        # In some cases (Issue#98), dd.compute() can change the features' dtypes and cause errors,
        # so we need to restore the features' dtypes after dd.compute().
        centers_dtypes = (hists[col][0][1].dtype, hists[col][1][1].dtype)
        (hists,) = dd.compute(hists)
        dict_value = []
        # Here we do not reassign to the "hists" variable, as dd.compute() can change variables'
        # types and break the mypy check in CircleCI. Instead, we assign to a new variable
        # hists_restore_dtype.
        for i in [0, 1]:
            intermediate = list(hists[col][i])
            intermediate[1] = intermediate[1].astype(centers_dtypes[i])
            dict_value.append(tuple(intermediate))
        hists_restore_dtype[col] = dict_value

    dfs = {}

    meta = ColumnsMetadata()

    for col, hists_ in hists_restore_dtype.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []
            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_dtype(detect_dtype(df0[col], dtype), Nominal()):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))
        meta[col, "dtype"] = detect_dtype(df0[col], dtype)

        dfs[col] = df

    return Intermediate(data=dfs, x=x, meta=meta, visual_type="missing_impact_1vn")