Example #1
def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ['--field', 'score=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert 'score' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['score'])

        extra_args = ['--field', 'count=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0)

        extra_args = ['--field', 'count=7:dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.2)

        extra_args = ['--field', 'count=7:agg=min,dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.1)
Example #2
def check_if_series_has_internal_type(series, internal_type):
    """Check if data type of series fits to the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Whether the series' dtype matches the internal type.
    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out
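A minimal usage sketch of the pandas predicates this helper builds on (the gettsim type markers such as FloatSeries and IntSeries are assumed to be importable from gettsim and are not reproduced here). Note that an integer series also satisfies the FloatSeries branch, because that check ORs is_float_dtype with is_integer_dtype.

import pandas as pd
from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

ints = pd.Series([1, 2, 3])        # int64
floats = pd.Series([0.5, 1.5])     # float64
bools = pd.Series([True, False])   # bool

assert is_integer_dtype(ints) and not is_float_dtype(ints)
assert is_float_dtype(floats) and is_bool_dtype(bools)
# check_if_series_has_internal_type(ints, FloatSeries) therefore returns True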
Example #3
File: csv.py Project: sethah/dask
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]):
                # There is a mismatch between floating and integer columns.
                # Determine all mismatched and error.
                mismatched = sorted(c for c in df.columns
                                    if is_float_dtype(df.dtypes[c])
                                    and is_integer_dtype(dtypes[c]))

                msg = ("Mismatched dtypes found.\n"
                       "Expected integers, but found floats for columns:\n"
                       "%s\n\n"
                       "To fix, specify dtypes manually by adding:\n\n"
                       "%s\n\n"
                       "to the call to `read_csv`/`read_table`.\n\n"
                       "Alternatively, provide `assume_missing=True` to "
                       "interpret all unspecified integer columns as floats.")

                missing_list = '\n'.join('- %r' % c for c in mismatched)
                dtype_list = ('%r: float' % c for c in mismatched)
                missing_dict = 'dtype={%s}' % ',\n       '.join(dtype_list)
                raise ValueError(msg % (missing_list, missing_dict))

            df[c] = df[c].astype(dtypes[c])
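A hypothetical call showing the error path above, assuming is_float_dtype and is_integer_dtype are imported from pandas.api.types: a column that pandas inferred as float (here because of a missing value) combined with a requested integer dtype triggers the ValueError, whose message suggests the dtype={...} override or assume_missing=True.

import pandas as pd

df = pd.DataFrame({'x': [1.0, None, 3.0], 'y': [1, 2, 3]})
try:
    coerce_dtypes(df, {'x': int})   # 'x' holds a NaN, so it was read as float
except ValueError as err:
    print(err)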
Example #4
    def _validate_fit_data(self):
        """Verifies that T, X, and y are formatted the right way"""
        # Checks for T column
        if not is_float_dtype(self.T):
            raise TypeError(f"Treatment data must be of type float")

        # Make sure all X columns are float or int
        if isinstance(self.X, pd.Series):
            if not is_numeric_dtype(self.X):
                raise TypeError(
                    f"All covariate (X) columns must be int or float type (i.e. must be numeric)"
                )

        elif isinstance(self.X, pd.DataFrame):
            for column in self.X:
                if not is_numeric_dtype(self.X[column]):
                    raise TypeError(
                        f"All covariate (X) columns must be int or float type "
                        f"(i.e. must be numeric)")

        # Checks for Y column
        if not (is_float_dtype(self.y) or is_integer_dtype(self.y)):
            raise TypeError(f"Outcome data must be of type float or integer")

        if is_integer_dtype(self.y) and (not np.array_equal(
                np.sort(self.y.unique()), np.array([0, 1]))):
            raise TypeError(
                "If your outcome data is of type integer (binary outcome), "
                "it should only contain 1's and 0's.")
Example #5
def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ["--field", "score=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert "score" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["score"])

        extra_args = ["--field", "count=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0)

        extra_args = ["--field", "count=8:dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.2)

        extra_args = ["--field", "count=8:agg=min,dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.1)
Example #6
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example #7
def sanity_checks(df_sumstats):
    if np.any(df_sumstats['CHR'].isnull()):
        raise ValueError('Some SNPs have a null chromosome value')
    if np.any(df_sumstats['BP'].isnull()):
        raise ValueError('Some SNPs have a null base-pair value')
    if not is_integer_dtype(df_sumstats['CHR']):
        raise ValueError(
            'Some chromosome values are not integers. Please double-check your input'
        )
    if not is_integer_dtype(df_sumstats['BP']):
        raise ValueError(
            'Some base-pair values are not integers. Please double-check your input'
        )
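A small usage sketch, assuming numpy/pandas are imported and is_integer_dtype comes from pandas.api.types; the CHR/BP column names follow the summary-statistics layout the checks expect.

import pandas as pd

ok = pd.DataFrame({'CHR': [1, 1, 2], 'BP': [100, 250, 90]})
sanity_checks(ok)       # passes silently

bad = pd.DataFrame({'CHR': [1, 2], 'BP': [100.0, 250.5]})
# sanity_checks(bad) raises ValueError: base-pair values are not integers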
Example #8
    def estimate_parameters(
        self,
        data: pd.DataFrame,
        method: str = "mle",
        infer_levels: bool = False,
        method_args: Optional[Dict[str, Union[int, float]]] = None,
    ) -> None:
        """Estimate conditional probabilities based on supplied data."""
        data = data.copy()
        if infer_levels:
            if all(is_categorical_dtype(data[col]) for col in data.columns):
                self.vs['levels'] = [
                    list(dtype.categories) for dtype in data.dtypes
                ]
            else:
                for vertex in self.vs:
                    if not (is_integer_dtype(data[vertex['name']])
                            or is_string_dtype(data[vertex['name']])):
                        raise ValueError(
                            f"Unrecognised DataFrame dtype: {data[vertex['name']].dtype}"
                        )
                    vertex_categories = sorted(
                        data[vertex['name']].unique().astype(str))
                    column = pd.Categorical(data[vertex['name']].astype(str),
                                            categories=vertex_categories)
                    vertex['levels'] = vertex_categories
                    data[vertex['name']] = column
        else:
            try:
                if not all(
                        isinstance(dtype, pd.CategoricalDtype)
                        for dtype in data.dtypes):
                    for vertex in self.vs:
                        if is_integer_dtype(data[vertex['name']]):
                            cat_dtype = pd.CategoricalDtype(vertex['levels'],
                                                            ordered=True)
                            data[vertex['name']] = pd.Categorical.from_codes(
                                codes=data[vertex['name']], dtype=cat_dtype)
                        elif is_string_dtype(data[vertex['name']]):
                            data[vertex['name']] = pd.Categorical(
                                data[vertex['name']],
                                categories=vertex['levels'])
            except KeyError:
                raise ValueError(
                    "`estimate_parameters()` requires levels be defined or `infer_levels=True`"
                )

        for vertex in self.vs:
            vertex['CPD'] = ConditionalProbabilityTable.estimate(
                vertex, data=data, method=method, method_args=method_args)
Example #9
def series_is_boolean(col: pd.Series or pd.Index):
    """
    returns:
        None if column is all None;
        True if a pd.Series contains True, False, and None;
        False otherwise

    caveat: does not interpret all-zero or all-one columns as boolean"""
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        return False in col.unique() and True in col.unique()
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
            if 0 not in col.unique() or 1 not in col.unique():
                return False
        return True
    return False
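A few illustrative calls, assuming pandas is imported and the dtype predicates (is_bool_dtype, is_object_dtype, is_integer_dtype, is_float_dtype) come from pandas.api.types:

import pandas as pd

print(series_is_boolean(pd.Series([True, False, None])))         # True  (object column of True/False/None)
print(series_is_boolean(pd.Series([1, 0, 1])))                   # True  (0/1 integer column)
print(series_is_boolean(pd.Series([1, 2, 3])))                    # False (values outside {0, 1})
print(series_is_boolean(pd.Series([None, None], dtype=object)))  # None  (all-None column)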
Example #10
def test_cload_custom_tempdir(bins_path, pairs_path):
    for temp_dir in [op.join(testdir, "data"), "-"]:
        cload_pairs.callback(
            bins_path,
            pairs_path,
            testcool_path,
            metadata=None,
            assembly="toy",
            chunksize=10,
            zero_based=False,
            comment_char="#",
            input_copy_status="unique",
            no_symmetric_upper=False,
            field=(),
            temp_dir=temp_dir,
            no_delete_temp=False,
            storage_options=None,
            no_count=True,
            max_merge=200,
            chrom1=2,
            pos1=3,
            chrom2=4,
            pos2=5,
        )
        pixels = cooler.Cooler(testcool_path).pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
Example #11
def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly="toy",
        chunksize=10,
        zero_based=False,
        comment_char="#",
        input_copy_status="unique",
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=2,
        pos1=3,
        chrom2=4,
        pos2=5,
    )
    cload_pairs.callback(bins_path,
                         pairs_path,
                         testcool_path,
                         field=("score=8:dtype=float", ),
                         **kwargs)
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert "count" in pixels.columns and types.is_integer_dtype(
        pixels.dtypes["count"])
    assert "score" in pixels.columns and types.is_float_dtype(
        pixels.dtypes["score"])
Example #12
def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use mean or the median for missing integers.
    :param mean_float: whether to use mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        dataset_df.dropna(inplace=True)
        return

    for column_name, column_series in dataset_df.items():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(column_series, mean_float,
                                                     rescale_float,
                                                     standardize_float)
        # Raise an error if the column's type is not boolean, integer, or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")
Example #13
    def binary_operator(self, op, other, reflect=False):
        if reflect:
            self, other = other, self

        # Binary arithmetic between decimal columns. `Scale` and `precision`
        # are computed outside of libcudf.
        if op in ("add", "sub", "mul"):
            scale = _binop_scale(self.dtype, other.dtype, op)
            output_type = Decimal64Dtype(
                scale=scale, precision=Decimal64Dtype.MAX_PRECISION
            )  # precision will be ignored, libcudf has no notion of precision
            result = libcudf.binaryop.binaryop(self, other, op, output_type)
            result.dtype.precision = _binop_precision(self.dtype, other.dtype,
                                                      op)
        elif op in ("eq", "ne", "lt", "gt", "le", "ge"):
            if not isinstance(
                    other,
                (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
            ):
                raise TypeError(f"Operator {op} not supported between"
                                f"{str(type(self))} and {str(type(other))}")
            if isinstance(other, cudf.core.column.NumericalColumn
                          ) and not is_integer_dtype(other.dtype):
                raise TypeError(
                    f"Only decimal and integer column is supported for {op}.")
            if isinstance(other, cudf.core.column.NumericalColumn):
                other = other.as_decimal_column(
                    Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0))
            result = libcudf.binaryop.binaryop(self, other, op, bool)
        return result
Example #14
    def fillna(self, fill_value):
        """
        Fill null values with *fill_value*
        """
        if np.isscalar(fill_value):
            # cast safely to the same dtype as self
            fill_value_casted = self.dtype.type(fill_value)
            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                raise TypeError(
                    "Cannot safely cast non-equivalent {} to {}".format(
                        type(fill_value).__name__, self.dtype.name
                    )
                )
            fill_value = fill_value_casted
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            # cast safely to the same dtype as self
            if is_integer_dtype(self.dtype):
                fill_value = _safe_cast_to_int(fill_value, self.dtype)
            else:
                fill_value = fill_value.astype(self.dtype)
        result = libcudfxx.replace.replace_nulls(self, fill_value)
        result = column.build_column(
            result.base_data,
            result.dtype,
            mask=None,
            offset=result.offset,
            size=result.size,
        )

        return result
Example #15
    def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
        """
        Guesses a good natsorted flag for the dtype.

        Here are some specifics:
            - integers       ⇒ INT and SIGNED
            - floating-point ⇒ FLOAT and SIGNED
            - strings        ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
            - datetime       ⇒ GROUPLETTERS (only affects 'Z' vs. 'z'; shouldn't matter)

        Args:
            dtype: Probably from ``pd.Series.dtype``

        Returns:
            A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
        """
        st, x = set(), 0
        if is_string_dtype(dtype):
            st.update(["COMPATIBILITYNORMALIZE", "GROUPLETTERS"])
            x |= ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
        elif is_categorical_dtype(dtype):
            pass
        elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
            st.update(["INT", "SIGNED"])
            x |= ns_enum.INT | ns_enum.SIGNED
        elif is_float_dtype(dtype):
            st.update(["FLOAT", "SIGNED"])
            x |= ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
        return NatsortFlagsAndValue(st, x)
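For context, the INT/SIGNED flags chosen for integer dtypes map onto natsort's ns enum; a quick standalone illustration (not part of the class above):

from natsort import natsorted, ns

print(natsorted(['item10', 'item2'], alg=ns.INT | ns.SIGNED))   # ['item2', 'item10']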
Example #16
def _default_transformer(col, train_df):
    if is_integer_dtype(train_df[col]):
        return int
    if is_float_dtype(train_df[col]):
        return float
    if is_string_dtype(train_df[col]):
        return LabelEncoder
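A hypothetical call, assuming the dtype predicates are imported from pandas.api.types and LabelEncoder from sklearn.preprocessing:

import pandas as pd

train_df = pd.DataFrame({'age': [25, 32], 'score': [0.5, 0.7], 'city': ['NY', 'LA']})
assert _default_transformer('age', train_df) is int
assert _default_transformer('score', train_df) is float
assert _default_transformer('city', train_df) is LabelEncoder   # object/string column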
Example #17
    def fillna(self, fill_value=None, method=None):
        """
        Fill null values with *fill_value*
        """
        if method is not None:
            return super().fillna(fill_value, method)

        if (isinstance(fill_value, cudf.Scalar)
                and fill_value.dtype == self.dtype):
            return super().fillna(fill_value, method)
        if np.isscalar(fill_value):
            # cast safely to the same dtype as self
            fill_value_casted = self.dtype.type(fill_value)
            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                raise TypeError(
                    f"Cannot safely cast non-equivalent "
                    f"{type(fill_value).__name__} to {self.dtype.name}")
            fill_value = cudf.Scalar(fill_value_casted)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            # cast safely to the same dtype as self
            if is_integer_dtype(self.dtype):
                fill_value = _safe_cast_to_int(fill_value, self.dtype)
            else:
                fill_value = fill_value.astype(self.dtype)

        return super().fillna(fill_value, method)
Example #18
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" % self.pretty_name
                )
        elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #19
    def load_metadata(self):
        self._ensure_loaded()
        dtypes = self.dataset._dtypes
        to_feature_type = lambda dt: (
            'int' if pat.is_integer_dtype(dt) else 'float'
            if pat.is_float_dtype(dt) else 'number'
            if pat.is_numeric_dtype(dt) else 'category'
            if pat.is_categorical_dtype(dt) else 'string'
            if pat.is_string_dtype(dt)
            # else 'datetime' if pat.is_datetime64_dtype(dt)
            else 'object')
        features = [
            Feature(i, col, to_feature_type(dtypes[i]))
            for i, col in enumerate(self._ds.columns)
        ]

        for f in features:
            col = self._ds.iloc[:, f.index]
            f.has_missing_values = col.hasnans
            if f.is_categorical():
                f.values = sorted(self._ds.dtypes[f.name].categories.values)

        target = self._find_target_feature(features)
        self._set_feature_as_target(target)

        meta = dict(features=features, target=target)
        log.debug("Metadata for dataset %s: %s", self.path, meta)
        return meta
Example #20
def categorical_func(series):
    natural_language_threshold = ww.config.get_option(
        'natural_language_threshold')
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')

    if pdtypes.is_string_dtype(series.dtype) and not col_is_datetime(series):
        # heuristics to predict whether this is something other than categorical
        sample = series.sample(min(10000, len(series)))
        # catch cases where object dtype cannot be interpreted as a string
        try:
            avg_length = sample.str.len().mean()
            if avg_length > natural_language_threshold:
                return False
        except AttributeError:
            pass
        return True

    if pdtypes.is_categorical_dtype(series.dtype):
        return True
    if ((pdtypes.is_float_dtype(series.dtype)
         or pdtypes.is_integer_dtype(series.dtype)) and
            _is_numeric_categorical(series, numeric_categorical_threshold)):
        return True
    return False
Example #21
    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        if exc is None:
            sdtype = tm.get_dtype(s)

            if (hasattr(other, "dtype")
                    and not is_extension_array_dtype(other.dtype)
                    and is_integer_dtype(other.dtype)
                    and sdtype.is_unsigned_integer):
                # TODO: comment below is inaccurate; other can be int8, int16, ...
                #  and the trouble is that e.g. if s is UInt8 and other is int8,
                #  then result is UInt16
                # other is np.int64 and would therefore always result in
                # upcasting, so keeping other as same numpy_dtype
                other = other.astype(sdtype.numpy_dtype)

            result = op(s, other)
            expected = self._combine(s, other, op)

            if op_name in ("__rtruediv__", "__truediv__", "__div__"):
                expected = expected.fillna(np.nan).astype("Float64")
            else:
                # combine method result in 'biggest' (int64) dtype
                expected = expected.astype(sdtype)

            self.assert_equal(result, expected)
        else:
            with pytest.raises(exc):
                op(s, other)
Example #22
def test_cload_custom_tempdir(bins_path, pairs_path):
    for temp_dir in [op.join(testdir, 'data'), '-']:
        cload_pairs.callback(
            bins_path,
            pairs_path,
            testcool_path,
            metadata=None,
            assembly='toy',
            chunksize=10,
            zero_based=False,
            comment_char='#',
            input_copy_status='unique',
            no_symmetric_upper=False,
            field=(),
            temp_dir=temp_dir,
            no_delete_temp=False,
            storage_options=None,
            no_count=True,
            max_merge=200,
            chrom1=1,
            pos1=2,
            chrom2=3,
            pos2=4,
        )
        pixels = cooler.Cooler(testcool_path).pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
Example #23
def integer_nullable_func(series):
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')
    if (pdtypes.is_integer_dtype(series.dtype) and not _is_numeric_categorical(
            series, numeric_categorical_threshold)):
        return True
    return False
Example #24
 def dispatch(col):
     dfcol = df[col.factor]
     if type(col) == IndicatorCol:
         assert is_categorical_dtype(dfcol)
         return (dfcol == col.level).to_numpy()
     elif type(col) == NumericCol:
         assert is_float_dtype(dfcol) or is_integer_dtype(dfcol)
         return dfcol.to_numpy()
     elif type(col) == CustomCol:
         assert col.factor in contrasts
         mat = contrasts[col.factor]
         levels = metadata.column(col.factor).levels
         # TODO: This can be triggered in normal use, so turn into
         # friendly error. It probably makes sense to check for
         # this earlier. This could possibly happen in `defm` after
         # creating the metadata, though this would also require a
         # separate check for SequentialOED. Is there anywhere
         # sensible to put this that catches both?
         assert len(levels) == mat.shape[0]
         assert col.index < mat.shape[1]
         # TODO: Better asymptotics than using `.index()`
         out = mat[[levels.index(val) for val in dfcol], col.index]
         return out
     else:
         raise Exception('unknown column type')
Example #25
def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly='toy',
        chunksize=10,
        zero_based=False,
        comment_char='#',
        input_copy_status='unique',
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=1,
        pos1=2,
        chrom2=3,
        pos2=4,
    )
    cload_pairs.callback(bins_path,
                         pairs_path,
                         testcool_path,
                         field=('score=7:dtype=float', ),
                         **kwargs)
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert 'count' in pixels.columns and types.is_integer_dtype(
        pixels.dtypes['count'])
    assert 'score' in pixels.columns and types.is_float_dtype(
        pixels.dtypes['score'])
Example #26
    def categorical_bar_graph(self, params=None):
        df_parameters = self._dataframe_parameters()
        if not params:
            params = list(df_parameters)

        for item in params:
            if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                    df_parameters[item]):
                continue
            else:
                # TODO: fix
                # this is broken
                for i in range(len(df_parameters[item].index)):
                    if type(df_parameters[item][i]) == str or type(
                            df_parameters[item][i]) == bool:
                        continue
                    else:
                        # TODO: enhancement
                        # this is where I could potentially make it so that an additional bar said numerical
                        df_parameters[item].drop(i)
            try:
                plt.figure(figsize=(20, 8))
                plt.rcParams['font.size'] = 18
                sns.countplot(x=item, data=df_parameters,
                              palette="husl").set_title(
                                  '{} bar graph'.format(item))
                path = self._save_to_folder('/bar_parameters',
                                            '{}_bar_graph.pdf'.format(item))
                plt.savefig(path)
                plt.close('all')
            except:
                continue
Example #27
    def categorical_evolution(self, params=None):
        df_parameters = self._dataframe_parameters()
        if not params:
            params = list(df_parameters)

        df_parameters['iteration'] = self.results['iteration']
        for item in params:
            if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                    df_parameters[item]):
                continue
            else:
                for i in range(len(df_parameters[item].index)):
                    if type(df_parameters[item][i]) == str or type(
                            df_parameters[item][i]) == bool:
                        continue
                    else:
                        df_parameters[item] = df_parameters[item].drop(i)
            try:
                plt.figure(figsize=(20, 8))
                plt.rcParams['font.size'] = 18
                sns.catplot(data=df_parameters, x='iteration',
                            y=item).fig.suptitle(
                                '{} over iterations'.format(item))
                path = self._save_to_folder(
                    '/category_evolution',
                    '{}_category_iter_graph.pdf'.format(item))
                plt.savefig(path)
                plt.close('all')
            except:
                continue
Example #28
 def _encode_labels(self, y):
     y_encoded = _convert_to_woodwork_structure(y)
     y_encoded = _convert_woodwork_types_wrapper(y_encoded.to_series())
     # change only if dtype isn't int
     if not is_integer_dtype(y_encoded):
         self._label_encoder = LabelEncoder()
         y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64')
     return y_encoded
Example #29
def _is_all_int(df_list: List[Union[dd.DataFrame, pd.DataFrame]],
                col: str) -> bool:
    """
    Check whether the col in all dataframes are all integer type.
    """
    for df in df_list:
        srs = df[col]
        if isinstance(srs, (dd.DataFrame, pd.DataFrame)):
            for dtype in srs.dtypes:
                if not is_integer_dtype(dtype):
                    return False
        elif isinstance(srs, (dd.Series, pd.Series)):
            if not is_integer_dtype(srs.dtype):
                return False
        else:
            raise ValueError(f"unprocessed type of data:{type(srs)}")
    return True
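A minimal sketch using plain pandas frames (the function also accepts dask objects; dask.dataframe is assumed to be importable as dd for the isinstance checks):

import pandas as pd

df1 = pd.DataFrame({'id': [1, 2]})
df2 = pd.DataFrame({'id': [3.0, 4.0]})
print(_is_all_int([df1], 'id'))        # True
print(_is_all_int([df1, df2], 'id'))   # False: 'id' is float64 in the second frame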
Example #30
 def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
     dtype, spark_type = pandas_on_spark_type(dtype)
     if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
         if get_option("compute.eager_check") and index_ops.hasnans:
             raise ValueError(
                 "Cannot convert %s with missing values to integer" % self.pretty_name
             )
     return _non_fractional_astype(index_ops, dtype, spark_type)
Example #31
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical_dtype(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
Example #32
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)
Example #33
def read_pandas(reader, urlpath, blocksize=AUTO_BLOCKSIZE, collection=True,
                lineterminator=None, compression=None, sample=256000,
                enforce=False, assume_missing=False, storage_options=None,
                include_path_column=False,
                **kwargs):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs['lineterminator'] = lineterminator
    else:
        lineterminator = '\n'
    if include_path_column and isinstance(include_path_column, bool):
        include_path_column = 'path'
    if 'index' in kwargs or 'index_col' in kwargs:
        raise ValueError("Keyword 'index' not supported "
                         "dd.{0}(...).set_index('my-index') "
                         "instead".format(reader_name))
    for kw in ['iterator', 'chunksize']:
        if kw in kwargs:
            raise ValueError("{0} not supported for "
                             "dd.{1}".format(kw, reader_name))
    if kwargs.get('nrows', None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get('skiprows'), int):
        skiprows = lastskiprow = firstrow = kwargs.get('skiprows')
    elif kwargs.get('skiprows') is None:
        skiprows = lastskiprow = firstrow = 0
    else:
        # When skiprows is a list, we expect more than max(skiprows) to
        # be included in the sample. This means that [0,2] will work well,
        # but [0, 440] might not work.
        skiprows = set(kwargs.get('skiprows'))
        lastskiprow = max(skiprows)
        # find the firstrow that is not skipped, for use as header
        firstrow = min(set(range(len(skiprows) + 1)) - set(skiprows))
    if isinstance(kwargs.get('header'), list):
        raise TypeError("List of header rows not supported for "
                        "dd.{0}".format(reader_name))
    if isinstance(kwargs.get('converters'), dict) and include_path_column:
        path_converter = kwargs.get('converters').get(include_path_column, None)
    else:
        path_converter = None

    if isinstance(blocksize, (str, unicode)):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression not in seekable_files:
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in seekable_files and compression not in cfiles:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)
    if blocksize and blocksize < sample and lastskiprow != 0:
        warn("Unexpected behavior can result from passing skiprows when\n"
             "blocksize is smaller than sample size.\n"
             "Setting ``sample=blocksize``")
        sample = blocksize
    b_lineterminator = lineterminator.encode()
    b_out = read_bytes(urlpath, delimiter=b_lineterminator,
                       blocksize=blocksize,
                       sample=sample,
                       compression=compression,
                       include_path=include_path_column,
                       **(storage_options or {}))

    if include_path_column:
        b_sample, values, paths = b_out
        if path_converter:
            paths = [path_converter(path) for path in paths]
        path = (include_path_column, paths)
    else:
        b_sample, values = b_out
        path = None

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    names = kwargs.get('names', None)
    header = kwargs.get('header', 'infer' if names is None else None)
    need = 1 if header is None else 2
    parts = b_sample.split(b_lineterminator, lastskiprow + need)
    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if nparts < lastskiprow + need and len(b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    header = b'' if header is None else parts[firstrow] + b_lineterminator

    # Use sample to infer dtypes and check for presence of include_path_column
    head = reader(BytesIO(b_sample), **kwargs)
    if include_path_column and (include_path_column in head.columns):
        raise ValueError("Files already contain the column name: %s, so the "
                         "path column cannot use this name. Please set "
                         "`include_path_column` to a unique name."
                         % include_path_column)

    specified_dtypes = kwargs.get('dtype', {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    return text_blocks_to_pandas(reader, values, header, head, kwargs,
                                 collection=collection, enforce=enforce,
                                 specified_dtypes=specified_dtypes,
                                 path=path)
Example #34
def read_pandas(reader, urlpath, blocksize=AUTO_BLOCKSIZE, collection=True,
                lineterminator=None, compression=None, sample=256000,
                enforce=False, assume_missing=False, storage_options=None,
                **kwargs):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs['lineterminator'] = lineterminator
    else:
        lineterminator = '\n'
    if 'index' in kwargs or 'index_col' in kwargs:
        raise ValueError("Keyword 'index' not supported "
                         "dd.{0}(...).set_index('my-index') "
                         "instead".format(reader_name))
    for kw in ['iterator', 'chunksize']:
        if kw in kwargs:
            raise ValueError("{0} not supported for "
                             "dd.{1}".format(kw, reader_name))
    if kwargs.get('nrows', None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get('skiprows'), list):
        raise TypeError("List of skiprows not supported for "
                        "dd.{0}".format(reader_name))
    if isinstance(kwargs.get('header'), list):
        raise TypeError("List of header rows not supported for "
                        "dd.{0}".format(reader_name))

    if blocksize and compression not in seekable_files:
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in seekable_files and compression not in cfiles:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)

    b_lineterminator = lineterminator.encode()
    b_sample, values = read_bytes(urlpath, delimiter=b_lineterminator,
                                  blocksize=blocksize,
                                  sample=sample,
                                  compression=compression,
                                  **(storage_options or {}))

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    skiprows = kwargs.get('skiprows', 0)
    header = kwargs.get('header', 'infer')
    need = 1 if header is None else 2
    parts = b_sample.split(b_lineterminator, skiprows + need)
    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if nparts < skiprows + need and len(b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    header = b'' if header is None else parts[skiprows] + b_lineterminator

    # Use sample to infer dtypes
    head = reader(BytesIO(b_sample), **kwargs)

    specified_dtypes = kwargs.get('dtype', {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    return text_blocks_to_pandas(reader, values, header, head, kwargs,
                                 collection=collection, enforce=enforce)