def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ['--field', 'score=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert 'score' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['score'])

        extra_args = ['--field', 'count=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0)

        extra_args = ['--field', 'count=7:dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.2)

        extra_args = ['--field', 'count=7:agg=min,dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.1)

def check_if_series_has_internal_type(series, internal_type):
    """Check if the data type of a series fits the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Result of the check.

    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out

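# Hedged usage sketch (not part of the original source): exercises the check
# above on plain pandas data. It assumes the gettsim series aliases
# (IntSeries, FloatSeries, BoolSeries) are importable from gettsim's typing
# module; that import path is an assumption.
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype,
                              is_float_dtype, is_integer_dtype)
from gettsim.typing import BoolSeries, FloatSeries, IntSeries

assert check_if_series_has_internal_type(pd.Series([1, 2, 3]), IntSeries)
assert check_if_series_has_internal_type(pd.Series([0.5, 1.5]), FloatSeries)
assert not check_if_series_has_internal_type(pd.Series(["a", "b"]), BoolSeries)
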
def coerce_dtypes(df, dtypes):
    """Coerce dataframe to dtypes safely.

    Operates in place.

    Parameters
    ----------
    df : Pandas DataFrame
    dtypes : dict
        Mapping like ``{'x': float}``.
    """
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]):
                # There is a mismatch between floating and integer columns.
                # Determine all mismatched and error.
                mismatched = sorted(c for c in df.columns
                                    if is_float_dtype(df.dtypes[c]) and
                                    is_integer_dtype(dtypes[c]))
                msg = ("Mismatched dtypes found.\n"
                       "Expected integers, but found floats for columns:\n"
                       "%s\n\n"
                       "To fix, specify dtypes manually by adding:\n\n"
                       "%s\n\n"
                       "to the call to `read_csv`/`read_table`.\n\n"
                       "Alternatively, provide `assume_missing=True` to "
                       "interpret all unspecified integer columns as floats.")
                missing_list = '\n'.join('- %r' % c for c in mismatched)
                dtype_list = ('%r: float' % c for c in mismatched)
                missing_dict = 'dtype={%s}' % ',\n       '.join(dtype_list)
                raise ValueError(msg % (missing_list, missing_dict))
            df[c] = df[c].astype(dtypes[c])

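# Hedged usage sketch (not from the original source): coerce_dtypes above
# looks like dask's CSV dtype-coercion helper; here it is called directly on a
# small pandas frame to show the in-place cast on the non-error path.
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype

frame = pd.DataFrame({'x': [1, 2, 3], 'y': [0.5, 1.5, 2.5]})
coerce_dtypes(frame, {'x': 'float64'})
assert frame.dtypes['x'] == 'float64'  # integer column safely widened to float
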
def _validate_fit_data(self):
    """Verifies that T, X, and y are formatted the right way."""
    # Checks for T column
    if not is_float_dtype(self.T):
        raise TypeError("Treatment data must be of type float")

    # Make sure all X columns are float or int
    if isinstance(self.X, pd.Series):
        if not is_numeric_dtype(self.X):
            raise TypeError(
                "All covariate (X) columns must be int or float type (i.e. must be numeric)"
            )
    elif isinstance(self.X, pd.DataFrame):
        for column in self.X:
            if not is_numeric_dtype(self.X[column]):
                raise TypeError(
                    "All covariate (X) columns must be int or float type "
                    "(i.e. must be numeric)")

    # Checks for Y column
    if not (is_float_dtype(self.y) or is_integer_dtype(self.y)):
        raise TypeError("Outcome data must be of type float or integer")

    if is_integer_dtype(self.y) and (not np.array_equal(
            np.sort(self.y.unique()), np.array([0, 1]))):
        raise TypeError(
            "If your outcome data is of type integer (binary outcome), "
            "it should only contain 1's and 0's.")

def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ["--field", "score=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert "score" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["score"])

        extra_args = ["--field", "count=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0)

        extra_args = ["--field", "count=8:dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.2)

        extra_args = ["--field", "count=8:agg=min,dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.1)

def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)

def sanity_checks(df_sumstats):
    if np.any(df_sumstats['CHR'].isnull()):
        raise ValueError('Some SNPs have a null chromosome value')
    if np.any(df_sumstats['BP'].isnull()):
        raise ValueError('Some SNPs have a null base-pair value')
    if not is_integer_dtype(df_sumstats['CHR']):
        raise ValueError(
            'Some chromosome values are not integers. Please double-check your input'
        )
    if not is_integer_dtype(df_sumstats['BP']):
        raise ValueError(
            'Some base-pair values are not integers. Please double-check your input'
        )

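# Hedged usage sketch (not from the original source): a minimal summary-stats
# frame that passes sanity_checks above, plus one that fails because the BP
# column contains a missing value.
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype

good = pd.DataFrame({'CHR': [1, 1, 2], 'BP': [12345, 67890, 13579]})
sanity_checks(good)  # passes silently

bad = pd.DataFrame({'CHR': [1, 2], 'BP': [12345.0, np.nan]})
try:
    sanity_checks(bad)
except ValueError as err:
    print(err)  # the null base-pair value is reported first
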
def estimate_parameters(
    self,
    data: pd.DataFrame,
    method: str = "mle",
    infer_levels: bool = False,
    method_args: Optional[Dict[str, Union[int, float]]] = None,
) -> None:
    """Estimate conditional probabilities based on supplied data."""
    data = data.copy()
    if infer_levels:
        if all(is_categorical_dtype(data[col]) for col in data.columns):
            self.vs['levels'] = [
                list(dtype.categories) for dtype in data.dtypes
            ]
        else:
            for vertex in self.vs:
                if not (is_integer_dtype(data[vertex['name']])
                        or is_string_dtype(data[vertex['name']])):
                    raise ValueError(
                        f"Unrecognised DataFrame dtype: {data[vertex['name']].dtype}"
                    )
                vertex_categories = sorted(
                    data[vertex['name']].unique().astype(str))
                column = pd.Categorical(data[vertex['name']].astype(str),
                                        categories=vertex_categories)
                vertex['levels'] = vertex_categories
                data[vertex['name']] = column
    else:
        try:
            if not all(
                    isinstance(dtype, pd.CategoricalDtype)
                    for dtype in data.dtypes):
                for vertex in self.vs:
                    if is_integer_dtype(data[vertex['name']]):
                        cat_dtype = pd.CategoricalDtype(vertex['levels'],
                                                        ordered=True)
                        data[vertex['name']] = pd.Categorical.from_codes(
                            codes=data[vertex['name']], dtype=cat_dtype)
                    elif is_string_dtype(data[vertex['name']]):
                        data[vertex['name']] = pd.Categorical(
                            data[vertex['name']],
                            categories=vertex['levels'])
        except KeyError:
            raise ValueError(
                "`estimate_parameters()` requires levels be defined or `infer_levels=True`"
            )

    for vertex in self.vs:
        vertex['CPD'] = ConditionalProbabilityTable.estimate(
            vertex, data=data, method=method, method_args=method_args)

def series_is_boolean(col: pd.Series or pd.Index):
    """
    returns: None if column is all None;
             True if a pd.Series contains True, False, and None;
             False otherwise
    caveat: does not interpret all-zero or all-one columns as boolean
    """
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        return False in col.unique() and True in col.unique()
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
        if 0 not in col.unique() or 1 not in col.unique():
            return False
        return True
    return False

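# Hedged usage sketch (not from the original source): quick checks of the
# three outcomes documented in the docstring above.
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_float_dtype,
                              is_integer_dtype, is_object_dtype)

assert series_is_boolean(pd.Series([True, False, None])) is True
assert series_is_boolean(pd.Series([1, 0, 1])) is True   # 0/1 integers count
assert series_is_boolean(pd.Series([1, 2, 3])) is False
assert series_is_boolean(pd.Series([None, None], dtype=object)) is None
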
def test_cload_custom_tempdir(bins_path, pairs_path):
    for temp_dir in [op.join(testdir, "data"), "-"]:
        cload_pairs.callback(
            bins_path,
            pairs_path,
            testcool_path,
            metadata=None,
            assembly="toy",
            chunksize=10,
            zero_based=False,
            comment_char="#",
            input_copy_status="unique",
            no_symmetric_upper=False,
            field=(),
            temp_dir=temp_dir,
            no_delete_temp=False,
            storage_options=None,
            no_count=True,
            max_merge=200,
            chrom1=2,
            pos1=3,
            chrom2=4,
            pos2=5,
        )
        pixels = cooler.Cooler(testcool_path).pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])

def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly="toy",
        chunksize=10,
        zero_based=False,
        comment_char="#",
        input_copy_status="unique",
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=2,
        pos1=3,
        chrom2=4,
        pos2=5,
    )
    cload_pairs.callback(
        bins_path, pairs_path, testcool_path,
        field=("score=8:dtype=float",), **kwargs
    )
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert "count" in pixels.columns and types.is_integer_dtype(
        pixels.dtypes["count"])
    assert "score" in pixels.columns and types.is_float_dtype(
        pixels.dtypes["score"])

def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use the mean or the median for missing integers.
    :param mean_float: whether to use the mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        dataset_df.dropna(inplace=True)
        return

    for column_name, column_series in dataset_df.iteritems():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(
                column_series, mean_float, rescale_float, standardize_float)
        # Raise an error if the column's type is not boolean, integer or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")

def binary_operator(self, op, other, reflect=False):
    if reflect:
        self, other = other, self

    # Binary arithmetic between decimal columns. `Scale` and `precision`
    # are computed outside of libcudf.
    if op in ("add", "sub", "mul"):
        scale = _binop_scale(self.dtype, other.dtype, op)
        output_type = Decimal64Dtype(
            scale=scale, precision=Decimal64Dtype.MAX_PRECISION
        )  # precision will be ignored, libcudf has no notion of precision
        result = libcudf.binaryop.binaryop(self, other, op, output_type)
        result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
    elif op in ("eq", "ne", "lt", "gt", "le", "ge"):
        if not isinstance(
            other,
            (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
        ):
            raise TypeError(f"Operator {op} not supported between "
                            f"{str(type(self))} and {str(type(other))}")
        if isinstance(other, cudf.core.column.NumericalColumn
                      ) and not is_integer_dtype(other.dtype):
            raise TypeError(
                f"Only decimal and integer column is supported for {op}.")
        if isinstance(other, cudf.core.column.NumericalColumn):
            other = other.as_decimal_column(
                Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0))
        result = libcudf.binaryop.binaryop(self, other, op, bool)
    return result

def fillna(self, fill_value):
    """
    Fill null values with *fill_value*
    """
    if np.isscalar(fill_value):
        # cast safely to the same dtype as self
        fill_value_casted = self.dtype.type(fill_value)
        if not np.isnan(fill_value) and (fill_value_casted != fill_value):
            raise TypeError(
                "Cannot safely cast non-equivalent {} to {}".format(
                    type(fill_value).__name__, self.dtype.name
                )
            )
        fill_value = fill_value_casted
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)
        # cast safely to the same dtype as self
        if is_integer_dtype(self.dtype):
            fill_value = _safe_cast_to_int(fill_value, self.dtype)
        else:
            fill_value = fill_value.astype(self.dtype)

    result = libcudfxx.replace.replace_nulls(self, fill_value)
    result = column.build_column(
        result.base_data,
        result.dtype,
        mask=None,
        offset=result.offset,
        size=result.size,
    )
    return result

def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
    """
    Guesses a good natsorted flag for the dtype.

    Here are some specifics:
        - integers       ⇒ INT and SIGNED
        - floating-point ⇒ FLOAT and SIGNED
        - strings        ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
        - datetime       ⇒ GROUPLETTERS (only affects 'Z' vs. 'z'; shouldn't matter)

    Args:
        dtype: Probably from ``pd.Series.dtype``

    Returns:
        A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
    """
    st, x = set(), 0
    if is_string_dtype(dtype):
        st.update(["COMPATIBILITYNORMALIZE", "GROUPLETTERS"])
        x |= ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
    elif is_categorical_dtype(dtype):
        pass
    elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
        st.update(["INT", "SIGNED"])
        x |= ns_enum.INT | ns_enum.SIGNED
    elif is_float_dtype(dtype):
        st.update(["FLOAT", "SIGNED"])
        x |= ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
    return NatsortFlagsAndValue(st, x)

def _default_transformer(col, train_df):
    if is_integer_dtype(train_df[col]):
        return int
    if is_float_dtype(train_df[col]):
        return float
    if is_string_dtype(train_df[col]):
        return LabelEncoder

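# Hedged usage sketch (not from the original source): picks a per-column
# transformer for a toy frame. It assumes LabelEncoder refers to
# sklearn.preprocessing.LabelEncoder, which is what the snippet above implies.
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype, is_string_dtype
from sklearn.preprocessing import LabelEncoder

train_df = pd.DataFrame({'age': [21, 35], 'height': [1.7, 1.8], 'city': ['NY', 'LA']})
assert _default_transformer('age', train_df) is int
assert _default_transformer('height', train_df) is float
assert _default_transformer('city', train_df) is LabelEncoder
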
def fillna(self, fill_value=None, method=None):
    """
    Fill null values with *fill_value*
    """
    if method is not None:
        return super().fillna(fill_value, method)

    if (isinstance(fill_value, cudf.Scalar)
            and fill_value.dtype == self.dtype):
        return super().fillna(fill_value, method)

    if np.isscalar(fill_value):
        # cast safely to the same dtype as self
        fill_value_casted = self.dtype.type(fill_value)
        if not np.isnan(fill_value) and (fill_value_casted != fill_value):
            raise TypeError(
                f"Cannot safely cast non-equivalent "
                f"{type(fill_value).__name__} to {self.dtype.name}")
        fill_value = cudf.Scalar(fill_value_casted)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)
        # cast safely to the same dtype as self
        if is_integer_dtype(self.dtype):
            fill_value = _safe_cast_to_int(fill_value, self.dtype)
        else:
            fill_value = fill_value.astype(self.dtype)

    return super().fillna(fill_value, method)

def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )
    elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if index_ops.hasnans:
            raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(
                index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                SF.lit(True),
            ).otherwise(index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)

def load_metadata(self):
    self._ensure_loaded()
    dtypes = self.dataset._dtypes
    to_feature_type = lambda dt: ('int' if pat.is_integer_dtype(dt)
                                  else 'float' if pat.is_float_dtype(dt)
                                  else 'number' if pat.is_numeric_dtype(dt)
                                  else 'category' if pat.is_categorical_dtype(dt)
                                  else 'string' if pat.is_string_dtype(dt)
                                  # else 'datetime' if pat.is_datetime64_dtype(dt)
                                  else 'object')
    features = [
        Feature(i, col, to_feature_type(dtypes[i]))
        for i, col in enumerate(self._ds.columns)
    ]
    for f in features:
        col = self._ds.iloc[:, f.index]
        f.has_missing_values = col.hasnans
        if f.is_categorical():
            f.values = sorted(self._ds.dtypes[f.name].categories.values)

    target = self._find_target_feature(features)
    self._set_feature_as_target(target)

    meta = dict(features=features, target=target)
    log.debug("Metadata for dataset %s: %s", self.path, meta)
    return meta

def categorical_func(series):
    natural_language_threshold = ww.config.get_option('natural_language_threshold')
    numeric_categorical_threshold = ww.config.get_option('numeric_categorical_threshold')

    if pdtypes.is_string_dtype(series.dtype) and not col_is_datetime(series):
        # heuristics to predict whether this is something other than categorical
        sample = series.sample(min(10000, len(series)))
        # catch cases where object dtype cannot be interpreted as a string
        try:
            avg_length = sample.str.len().mean()
            if avg_length > natural_language_threshold:
                return False
        except AttributeError:
            pass
        return True

    if pdtypes.is_categorical_dtype(series.dtype):
        return True

    if ((pdtypes.is_float_dtype(series.dtype) or pdtypes.is_integer_dtype(series.dtype))
            and _is_numeric_categorical(series, numeric_categorical_threshold)):
        return True

    return False

def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
    if exc is None:
        sdtype = tm.get_dtype(s)

        if (hasattr(other, "dtype")
                and not is_extension_array_dtype(other.dtype)
                and is_integer_dtype(other.dtype)
                and sdtype.is_unsigned_integer):
            # TODO: comment below is inaccurate; other can be int8, int16, ...
            #  and the trouble is that e.g. if s is UInt8 and other is int8,
            #  then result is UInt16
            # other is np.int64 and would therefore always result in
            # upcasting, so keeping other as same numpy_dtype
            other = other.astype(sdtype.numpy_dtype)

        result = op(s, other)
        expected = self._combine(s, other, op)

        if op_name in ("__rtruediv__", "__truediv__", "__div__"):
            expected = expected.fillna(np.nan).astype("Float64")
        else:
            # combine method result in 'biggest' (int64) dtype
            expected = expected.astype(sdtype)

        self.assert_equal(result, expected)
    else:
        with pytest.raises(exc):
            op(s, other)

def test_cload_custom_tempdir(bins_path, pairs_path):
    for temp_dir in [op.join(testdir, 'data'), '-']:
        cload_pairs.callback(
            bins_path,
            pairs_path,
            testcool_path,
            metadata=None,
            assembly='toy',
            chunksize=10,
            zero_based=False,
            comment_char='#',
            input_copy_status='unique',
            no_symmetric_upper=False,
            field=(),
            temp_dir=temp_dir,
            no_delete_temp=False,
            storage_options=None,
            no_count=True,
            max_merge=200,
            chrom1=1,
            pos1=2,
            chrom2=3,
            pos2=4,
        )
        pixels = cooler.Cooler(testcool_path).pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])

def integer_nullable_func(series):
    numeric_categorical_threshold = ww.config.get_option('numeric_categorical_threshold')
    if (pdtypes.is_integer_dtype(series.dtype)
            and not _is_numeric_categorical(series, numeric_categorical_threshold)):
        return True
    return False

def dispatch(col):
    dfcol = df[col.factor]
    if type(col) == IndicatorCol:
        assert is_categorical_dtype(dfcol)
        return (dfcol == col.level).to_numpy()
    elif type(col) == NumericCol:
        assert is_float_dtype(dfcol) or is_integer_dtype(dfcol)
        return dfcol.to_numpy()
    elif type(col) == CustomCol:
        assert col.factor in contrasts
        mat = contrasts[col.factor]
        levels = metadata.column(col.factor).levels
        # TODO: This can be triggered in normal use, so turn into a
        # friendly error. It probably makes sense to check for this
        # earlier. This could possibly happen in `defm` after creating
        # the metadata, though this would also require a separate check
        # for SequentialOED. Is there anywhere sensible to put this that
        # catches both?
        assert len(levels) == mat.shape[0]
        assert col.index < mat.shape[1]
        # TODO: Better asymptotics than using `.index()`
        out = mat[[levels.index(val) for val in dfcol], col.index]
        return out
    else:
        raise Exception('unknown column type')

def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly='toy',
        chunksize=10,
        zero_based=False,
        comment_char='#',
        input_copy_status='unique',
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=1,
        pos1=2,
        chrom2=3,
        pos2=4,
    )
    cload_pairs.callback(
        bins_path, pairs_path, testcool_path,
        field=('score=7:dtype=float',), **kwargs
    )
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert 'count' in pixels.columns and types.is_integer_dtype(
        pixels.dtypes['count'])
    assert 'score' in pixels.columns and types.is_float_dtype(
        pixels.dtypes['score'])

def categorical_bar_graph(self, params=None):
    df_parameters = self._dataframe_parameters()
    if not params:
        params = list(df_parameters)

    for item in params:
        if is_float_dtype(df_parameters[item]) or is_integer_dtype(df_parameters[item]):
            continue
        else:
            # TODO: fix; this is broken
            for i in range(len(df_parameters[item].index)):
                if type(df_parameters[item][i]) == str or type(df_parameters[item][i]) == bool:
                    continue
                else:
                    # TODO: enhancement; this is where I could potentially make
                    # it so that an additional bar said "numerical"
                    df_parameters[item].drop(i)
        try:
            plt.figure(figsize=(20, 8))
            plt.rcParams['font.size'] = 18
            sns.countplot(x=item, data=df_parameters,
                          palette="husl").set_title('{} bar graph'.format(item))
            path = self._save_to_folder('/bar_parameters',
                                        '{}_bar_graph.pdf'.format(item))
            plt.savefig(path)
            plt.close('all')
        except:
            continue

def categorical_evolution(self, params=None):
    df_parameters = self._dataframe_parameters()
    if not params:
        params = list(df_parameters)
    df_parameters['iteration'] = self.results['iteration']

    for item in params:
        if is_float_dtype(df_parameters[item]) or is_integer_dtype(df_parameters[item]):
            continue
        else:
            for i in range(len(df_parameters[item].index)):
                if type(df_parameters[item][i]) == str or type(df_parameters[item][i]) == bool:
                    continue
                else:
                    df_parameters[item] = df_parameters[item].drop(i)
        try:
            plt.figure(figsize=(20, 8))
            plt.rcParams['font.size'] = 18
            sns.catplot(data=df_parameters, x='iteration',
                        y=item).fig.suptitle('{} over iterations'.format(item))
            path = self._save_to_folder('/category_evolution',
                                        '{}_category_iter_graph.pdf'.format(item))
            plt.savefig(path)
            plt.close('all')
        except:
            continue

def _encode_labels(self, y):
    y_encoded = _convert_to_woodwork_structure(y)
    y_encoded = _convert_woodwork_types_wrapper(y_encoded.to_series())
    # change only if dtype isn't int
    if not is_integer_dtype(y_encoded):
        self._label_encoder = LabelEncoder()
        y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded),
                              dtype='int64')
    return y_encoded

def _is_all_int(df_list: List[Union[dd.DataFrame, pd.DataFrame]], col: str) -> bool:
    """
    Check whether the given column is integer-typed in every dataframe.
    """
    for df in df_list:
        srs = df[col]
        if isinstance(srs, (dd.DataFrame, pd.DataFrame)):
            for dtype in srs.dtypes:
                if not is_integer_dtype(dtype):
                    return False
        elif isinstance(srs, (dd.Series, pd.Series)):
            if not is_integer_dtype(srs.dtype):
                return False
        else:
            raise ValueError(f"unprocessed type of data: {type(srs)}")
    return True

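# Hedged usage sketch (not from the original source): checks the 'id' column
# across two plain pandas frames; dask frames would work the same way since
# the helper above accepts both.
import dask.dataframe as dd
import pandas as pd
from pandas.api.types import is_integer_dtype

frames = [
    pd.DataFrame({'id': [1, 2, 3]}),
    pd.DataFrame({'id': [4, 5, 6]}),
]
assert _is_all_int(frames, 'id') is True
assert _is_all_int([pd.DataFrame({'id': [1.0, 2.0]})], 'id') is False
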
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if get_option("compute.eager_check") and index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )
    return _non_fractional_astype(index_ops, dtype, spark_type)

def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to an Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=np.float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)

def coerce_dtypes(df, dtypes):
    """Coerce dataframe to dtypes safely.

    Operates in place.

    Parameters
    ----------
    df : Pandas DataFrame
    dtypes : dict
        Mapping like ``{'x': float}``.
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e)
                           for c, e in sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")
        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))
        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)

def read_pandas(reader, urlpath, blocksize=AUTO_BLOCKSIZE, collection=True,
                lineterminator=None, compression=None, sample=256000,
                enforce=False, assume_missing=False, storage_options=None,
                include_path_column=False, **kwargs):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs['lineterminator'] = lineterminator
    else:
        lineterminator = '\n'
    if include_path_column and isinstance(include_path_column, bool):
        include_path_column = 'path'
    if 'index' in kwargs or 'index_col' in kwargs:
        raise ValueError("Keyword 'index' not supported "
                         "dd.{0}(...).set_index('my-index') "
                         "instead".format(reader_name))
    for kw in ['iterator', 'chunksize']:
        if kw in kwargs:
            raise ValueError("{0} not supported for "
                             "dd.{1}".format(kw, reader_name))
    if kwargs.get('nrows', None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get('skiprows'), int):
        skiprows = lastskiprow = firstrow = kwargs.get('skiprows')
    elif kwargs.get('skiprows') is None:
        skiprows = lastskiprow = firstrow = 0
    else:
        # When skiprows is a list, we expect more than max(skiprows) to
        # be included in the sample. This means that [0,2] will work well,
        # but [0, 440] might not work.
        skiprows = set(kwargs.get('skiprows'))
        lastskiprow = max(skiprows)
        # find the firstrow that is not skipped, for use as header
        firstrow = min(set(range(len(skiprows) + 1)) - set(skiprows))
    if isinstance(kwargs.get('header'), list):
        raise TypeError("List of header rows not supported for "
                        "dd.{0}".format(reader_name))
    if isinstance(kwargs.get('converters'), dict) and include_path_column:
        path_converter = kwargs.get('converters').get(include_path_column, None)
    else:
        path_converter = None

    if isinstance(blocksize, (str, unicode)):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression not in seekable_files:
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in seekable_files and compression not in cfiles:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)
    if blocksize and blocksize < sample and lastskiprow != 0:
        warn("Unexpected behavior can result from passing skiprows when\n"
             "blocksize is smaller than sample size.\n"
             "Setting ``sample=blocksize``")
        sample = blocksize

    b_lineterminator = lineterminator.encode()
    b_out = read_bytes(urlpath, delimiter=b_lineterminator,
                       blocksize=blocksize, sample=sample,
                       compression=compression,
                       include_path=include_path_column,
                       **(storage_options or {}))

    if include_path_column:
        b_sample, values, paths = b_out
        if path_converter:
            paths = [path_converter(path) for path in paths]
        path = (include_path_column, paths)
    else:
        b_sample, values = b_out
        path = None

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    names = kwargs.get('names', None)
    header = kwargs.get('header', 'infer' if names is None else None)
    need = 1 if header is None else 2
    parts = b_sample.split(b_lineterminator, lastskiprow + need)
    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if nparts < lastskiprow + need and len(b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    header = b'' if header is None else parts[firstrow] + b_lineterminator

    # Use sample to infer dtypes and check for presence of include_path_column
    head = reader(BytesIO(b_sample), **kwargs)
    if include_path_column and (include_path_column in head.columns):
        raise ValueError("Files already contain the column name: %s, so the "
                         "path column cannot use this name. Please set "
                         "`include_path_column` to a unique name."
                         % include_path_column)

    specified_dtypes = kwargs.get('dtype', {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    return text_blocks_to_pandas(reader, values, header, head, kwargs,
                                 collection=collection, enforce=enforce,
                                 specified_dtypes=specified_dtypes,
                                 path=path)

def read_pandas(reader, urlpath, blocksize=AUTO_BLOCKSIZE, collection=True,
                lineterminator=None, compression=None, sample=256000,
                enforce=False, assume_missing=False, storage_options=None,
                **kwargs):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs['lineterminator'] = lineterminator
    else:
        lineterminator = '\n'
    if 'index' in kwargs or 'index_col' in kwargs:
        raise ValueError("Keyword 'index' not supported "
                         "dd.{0}(...).set_index('my-index') "
                         "instead".format(reader_name))
    for kw in ['iterator', 'chunksize']:
        if kw in kwargs:
            raise ValueError("{0} not supported for "
                             "dd.{1}".format(kw, reader_name))
    if kwargs.get('nrows', None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get('skiprows'), list):
        raise TypeError("List of skiprows not supported for "
                        "dd.{0}".format(reader_name))
    if isinstance(kwargs.get('header'), list):
        raise TypeError("List of header rows not supported for "
                        "dd.{0}".format(reader_name))

    if blocksize and compression not in seekable_files:
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in seekable_files and compression not in cfiles:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)

    b_lineterminator = lineterminator.encode()
    b_sample, values = read_bytes(urlpath, delimiter=b_lineterminator,
                                  blocksize=blocksize, sample=sample,
                                  compression=compression,
                                  **(storage_options or {}))

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    skiprows = kwargs.get('skiprows', 0)
    header = kwargs.get('header', 'infer')
    need = 1 if header is None else 2
    parts = b_sample.split(b_lineterminator, skiprows + need)
    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if nparts < skiprows + need and len(b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    header = b'' if header is None else parts[skiprows] + b_lineterminator

    # Use sample to infer dtypes
    head = reader(BytesIO(b_sample), **kwargs)

    specified_dtypes = kwargs.get('dtype', {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    return text_blocks_to_pandas(reader, values, header, head, kwargs,
                                 collection=collection, enforce=enforce)