Пример #1
0
    def test_bin_width(self):
        """Test getting the bin width of Bin and SparselyBin histograms.

        Both filled (hist2, hist4) and never-filled (hist3, hist5) histograms
        are checked against the bin width they were configured with.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})

                # building test histograms: sparse bins of width 1 and
                # 20 regular bins between 0 and 10
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))

                # fill only one histogram of each kind; hist3 and hist5 stay empty
                hist2.fill.numpy(df1)
                hist4.fill.numpy(df1)

                assert hist2.bin_width() == 1.0
                assert hist3.bin_width() == 1.0
                assert hist4.bin_width() == 0.5
                assert hist5.bin_width() == 0.5
Пример #2
0
    def test_most_probable_value(self):
        """Test getting most probable value or label from histogram.

        ``mpv`` is checked on two categorical histograms (labels from column
        'C') and two sparsely binned histograms (values from column 'A').
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame(
                    {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1], 'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4']})
                df2 = pd.DataFrame(
                    {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8], 'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7']})

                # building the 1d test histograms (two categorical, two sparse)
                hist0 = hg.Categorize(unit('C'))
                hist1 = hg.Categorize(unit('C'))
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))

                # fill them
                hist0.fill.numpy(df1)
                hist1.fill.numpy(df2)
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)

                # categorical histograms report a label, binned ones a bin center
                assert hist0.mpv == 'f3'
                assert hist1.mpv == 'f7'
                assert hist2.mpv == 1.5
                assert hist3.mpv == 4.5
Пример #3
0
    def test_bin_edges(self):
        """Test getting the bin edges for requested ranges.

        Edges are checked both without arguments and for explicit
        ``low``/``high`` ranges, on SparselyBin and Bin histograms.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
                df2 = pd.DataFrame({'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

                # building test histograms
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # np.testing is needed below; the explicit import is kept so the
                # assertions do not depend on what exactly Numpy() yields
                import numpy as np

                # default edges: the sparse histograms span only their filled
                # bins, the regular ones always span the configured range
                np.testing.assert_array_equal(hist2.bin_edges(), [0., 1., 2., 3., 4., 5.])
                np.testing.assert_array_equal(hist3.bin_edges(), [2., 3., 4., 5., 6., 7., 8., 9.])
                np.testing.assert_array_equal(hist4.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
                np.testing.assert_array_equal(hist5.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

                # edges for explicitly requested ranges
                np.testing.assert_array_equal(hist2.bin_edges(low=2.1, high=11.9), [2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
                np.testing.assert_array_equal(hist3.bin_edges(low=1.1, high=6), [1., 2., 3., 4., 5., 6.])
                np.testing.assert_array_equal(hist4.bin_edges(low=2.1, high=11.9), [2., 3., 4., 5., 6., 7., 8., 9., 10.])
                np.testing.assert_array_equal(hist5.bin_edges(low=1.1, high=5.4), [1., 2., 3., 4., 5., 6.])
Пример #4
0
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms.

    Check similarity of: type, n-dim, sub-hists, specific type attributes.
    Every histogram must be similar to itself, while each of the mismatched
    pairs must make ``assert_similar_hists`` raise a ``ValueError``.
    """
    # dummy dataset with mixed types; built explicitly because
    # pandas.util.testing (makeMixedDataFrame) was deprecated in pandas 1.0
    # and removed in pandas 2.0. The content matches what that helper produced.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)

    hist4 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist3,
    )
    all_hists = [hist0, hist1, hist2, hist3, hist4, hist5]

    # fill them
    for hist in all_hists:
        hist.fill.numpy(df)

    # a histogram is always similar to itself
    for hist in all_hists:
        assert check_similar_hists([hist, hist])

    # each pair differs in (sub-)histogram type, so the assert must raise
    expected_msg = "Input histograms are not all similar."
    for pair in ([hist0, hist1], [hist2, hist3], [hist4, hist5]):
        # stays empty when nothing is raised, which fails the check below
        message = ""
        try:
            assert_similar_hists(pair)
        except ValueError as err:
            message = err.args[0]
        assert message == expected_msg
Пример #5
0
    def test_bin_entries(self):
        """Test getting the number of entries for all assigned bins.

        Entries are requested without arguments, for explicit labels or
        x-values taken from another histogram, and for ``low``/``high`` ranges.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame(
                    {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1], 'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4']})
                df2 = pd.DataFrame(
                    {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8], 'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7']})

                # building the 1d test histograms: categorical, sparse, regular
                hist0 = hg.Categorize(unit('C'))
                hist1 = hg.Categorize(unit('C'))
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist0.fill.numpy(df1)
                hist1.fill.numpy(df2)
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # labels / centers of one histogram are used to query another
                labels0 = hist0.bin_labels()
                labels1 = hist1.bin_labels()
                centers2 = hist2.bin_centers()
                centers3 = hist3.bin_centers()
                centers = hist4.bin_centers()

                # np.testing is needed below; the explicit import is kept so the
                # assertions do not depend on what exactly Numpy() yields
                import numpy as np

                # categorical histograms: own labels and the other one's labels
                np.testing.assert_array_equal(hist0.bin_entries(), [2., 2., 3., 3.])
                np.testing.assert_array_equal(hist1.bin_entries(), [1., 2., 1., 1., 3., 1., 1.])
                np.testing.assert_array_equal(hist0.bin_entries(labels=labels1), [2., 3., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist1.bin_entries(labels=labels0), [0., 1., 2., 0.])

                # binned histograms without arguments
                np.testing.assert_array_equal(hist2.bin_entries(), [1., 4., 2., 2., 1.])
                np.testing.assert_array_equal(hist3.bin_entries(), [1., 1., 2., 2., 1., 2., 1.])
                np.testing.assert_array_equal(hist4.bin_entries(), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist5.bin_entries(), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

                # sparse histograms queried at explicit x-values
                np.testing.assert_array_equal(hist2.bin_entries(xvalues=centers3), [2., 2., 1., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(xvalues=centers2), [0., 0., 1., 1., 2.])
                np.testing.assert_array_equal(hist2.bin_entries(xvalues=centers), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(xvalues=centers), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

                # entries for explicitly requested ranges
                np.testing.assert_array_equal(hist2.bin_entries(low=2.1, high=11.9), [2., 2., 1., 0., 0., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
                np.testing.assert_array_equal(hist4.bin_entries(low=2.1, high=11.9), [2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist5.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
Пример #6
0
    def get_hist_bin(self, hist, features, quant, col, dt):
        """Wrap ``hist`` in one more histogram axis for column ``col``.

        :param hist: histogram built so far; becomes the ``value`` of the new axis
        :param list features: histogram features; used to look up the bin specs
        :param quant: quantity function handed to the new histogram
        :param col: name of the column the new axis describes
        :param dt: data type of the column, decides the histogram type
        :return: new histogram with ``hist`` nested inside
        :raises RuntimeError: if the bin specifications of a numeric or
            timestamp column hold neither ``bin_width`` nor ``num``/``low``/``high``
        """
        numeric_like = np.issubdtype(dt, np.number) or np.issubdtype(dt, np.datetime64)
        if not numeric_like:
            # strings and booleans are treated as categories
            return hg.Categorize(quantity=quant, value=hist)

        # numbers and timestamps are binned according to their specifications
        bin_specs = self.var_bin_specs(features, features.index(col))

        if "bin_width" in bin_specs:
            # a bin width (with optional offset) selects sparse binning
            return hg.SparselyBin(
                binWidth=bin_specs["bin_width"],
                origin=bin_specs.get("bin_offset", 0),
                quantity=quant,
                value=hist,
            )

        if all(key in bin_specs for key in ("num", "low", "high")):
            # a complete (num, low, high) triple selects regular binning
            return hg.Bin(
                num=bin_specs["num"],
                low=bin_specs["low"],
                high=bin_specs["high"],
                quantity=quant,
                value=hist,
            )

        raise RuntimeError("Do not know how to interpret bin specifications.")
Пример #7
0
def test_prepare_2dgrid():
    """Test preparation of grid for extraction of number of entries for 2d hists.

    For a 1d histogram both key lists are expected to be empty; for deeper
    histograms the keys of the two outermost axes are expected.
    """
    # only the dataframe of the shared fixture is used; histograms are rebuilt
    df, _, _, _ = get_test_histograms1()

    # nested histograms: categories -> regular bins -> daily sparse bins
    hist_1d = hg.Categorize(unit("C"))
    hist_2d = hg.Bin(5, 0, 5, unit("A"), value=hist_1d)
    hist_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist_2d,
    )
    histograms = (hist_1d, hist_2d, hist_3d)

    # fill them
    for histogram in histograms:
        histogram.fill.numpy(df)

    # one (xkeys, ykeys) pair per histogram
    grid_1d, grid_2d, grid_3d = [prepare_2dgrid(histogram) for histogram in histograms]

    checks = [
        (grid_1d[0], []),
        (grid_1d[1], []),
        (grid_2d[0], [0, 1, 2, 3, 4]),
        (grid_2d[1], ["foo1", "foo2", "foo3", "foo4", "foo5"]),
        (grid_3d[0], [0, 1, 4, 5, 6]),
        (grid_3d[1], [0, 1, 2, 3, 4]),
    ]
    for actual, expected in checks:
        np.testing.assert_array_equal(actual, expected)
Пример #8
0
    def construct_empty_hist(self, features):
        """Build the empty, nested histogram for a set of features.

        Starting from a plain counter, the features are visited last-to-first
        and each one wraps the histogram built so far in a new axis, so the
        first feature ends up as the outermost axis.

        :param list features: histogram features
        :return: created histogram (a bare ``hg.Count`` if ``features`` is empty)
        :raises RuntimeError: if the bin specifications of a numeric or
            timestamp feature hold neither ``bin_width`` nor ``num``/``low``/``high``
        """
        hist = hg.Count()
        single_feature = len(features) == 1

        for col in reversed(features):
            # the data type selects both the quantity function and the axis type
            dt = self.var_dtype[col]
            type_fnc = utils.QUANTITY[dt]

            if single_feature:
                # the filler receives a single series
                quant = lambda x, fnc=type_fnc: fnc(x)  # noqa
            else:
                # the filler receives a dataframe; pin the column via a default
                quant = lambda x, fnc=type_fnc, clm=col: fnc(x[clm])  # noqa

            binned = np.issubdtype(dt, np.number) or np.issubdtype(dt, np.datetime64)
            if not binned:
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=quant, value=hist)
                continue

            # numbers and timestamps are binned according to their specifications
            specs = self.var_bin_specs(features, features.index(col))
            if "bin_width" in specs:
                hist = hg.SparselyBin(
                    binWidth=specs["bin_width"],
                    origin=specs.get("bin_offset", 0),
                    quantity=quant,
                    value=hist,
                )
            elif all(key in specs for key in ("num", "low", "high")):
                hist = hg.Bin(
                    num=specs["num"],
                    low=specs["low"],
                    high=specs["high"],
                    quantity=quant,
                    value=hist,
                )
            else:
                raise RuntimeError(
                    "Do not know how to interpret bin specifications.")

        return hist
Пример #9
0
def test_get_consistent_numpy_entries():
    """Test extraction of number of entries.

    The bin labels / bin centers of the input histograms are first made
    consistent with each other.
    """
    frame_a = pd.DataFrame({
        "A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
        "C": ["f1", "f3", "f4", "f3", "f4", "f2", "f2", "f1", "f3", "f4"],
    })
    frame_b = pd.DataFrame({
        "A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
        "C": ["f7", "f3", "f5", "f8", "f9", "f2", "f3", "f6", "f7", "f7"],
    })

    # two categorical and two sparsely binned 1d histograms
    cat_a = hg.Categorize(unit("C"))
    cat_b = hg.Categorize(unit("C"))
    num_a = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))
    num_b = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))

    # fill them
    cat_a.fill.numpy(frame_a)
    cat_b.fill.numpy(frame_b)
    num_a.fill.numpy(frame_a)
    num_b.fill.numpy(frame_b)

    # each pair is queried twice: entries only, then entries plus labels/centers
    cat_pair = [cat_a, cat_b]
    num_pair = [num_a, num_b]
    cat_entries_a, cat_entries_b = get_consistent_numpy_entries(cat_pair, get_bin_labels=False)
    _, found_labels = get_consistent_numpy_entries(cat_pair, get_bin_labels=True)
    num_entries_a, num_entries_b = get_consistent_numpy_entries(num_pair, get_bin_labels=False)
    _, found_centers = get_consistent_numpy_entries(num_pair, get_bin_labels=True)

    expectations = [
        (cat_entries_a, [2.0, 2.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
        (cat_entries_b, [0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 3.0, 1.0, 1.0]),
        (found_labels, ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"]),
        (num_entries_a, [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
        (num_entries_b, [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0]),
        (found_centers, [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]),
    ]
    for actual, expected in expectations:
        np.testing.assert_array_equal(actual, expected)
Пример #10
0
    def construct_empty_hist(self, columns):
        """Build the empty, nested histogram for a set of columns.

        Starting from a plain counter, the columns are visited last-to-first
        and each one wraps the histogram built so far in a new axis, so the
        first column ends up as the outermost axis. The data type(s) and the
        number of dimensions are attached to the returned histogram.

        :param list columns: histogram columns
        :returns: created histogram (a bare ``hg.Count`` if ``columns`` is empty)
        """
        hist = hg.Count()
        single_column = len(columns) == 1

        for col in reversed(columns):
            # the data type selects both the quantity function and the axis type
            dt = np.dtype(self.var_dtype[col])

            # per-column quantity function, falling back to the one of the type
            type_default = hf.QUANTITY[dt.type]
            f = self.quantity.get(col, type_default)
            if single_column:
                # the filler receives a single series
                quant = lambda x, fnc=f: fnc(x)  # noqa
            else:
                # the filler receives a dataframe; pin the column via a default
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

            sample = dt.type()
            is_number = isinstance(sample, np.number)
            is_timestamp = isinstance(sample, np.datetime64)

            if not (is_number or is_timestamp):
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=quant, value=hist)
                continue

            # numbers and timestamps go into a sparsely binned histogram, using
            # the unit specs of their kind when the column has none of its own
            fallback = self._unit_bin_specs if is_number else self._unit_timestamp_specs
            bs = self.bin_specs.get(col, fallback)
            hist = hg.SparselyBin(
                binWidth=bs['bin_width'],
                origin=bs['bin_offset'],
                quantity=quant,
                value=hist,
            )

        # FIXME stick data types and number of dimension to histogram
        # (only the outermost histogram is decorated, after the loop)
        col_types = [self.var_dtype[col] for col in columns]
        hist.datatype = col_types[0] if single_column else col_types
        hist.n_dim = len(columns)

        return hist
Пример #11
0
    def construct_empty_hist(self, df, columns):
        """Create an (empty) histogram of right type

        Create a multi-dim histogram by iterating through the columns in
        reverse order and passing a single-dim hist as input to the next
        column. The data type(s), the number of dimensions and an ``n_bins``
        property are attached to the result.

        :param df: input dataframe; ``df[col]`` is used as histogram quantity
        :param list columns: histogram columns
        :returns: created histogram (a bare ``histogrammar.Count`` if
            ``columns`` is empty)
        """

        hist = histogrammar.Count()

        # create a multi-dim histogram by iterating through the columns in reverse order
        # and passing a single-dim hist as input to the next column
        for col in reversed(columns):
            # histogram type depends on the data type
            dt = np.dtype(self.var_dtype[col])

            is_number = isinstance(dt.type(), np.number)
            is_timestamp = isinstance(dt.type(), np.datetime64)

            if is_number or is_timestamp:
                # numbers and timestamps are put in a sparse binned histogram;
                # columns without own specs use the unit specs of their kind
                bs = self.bin_specs.get(
                    col, self._unit_bin_specs
                    if is_number else self._unit_timestamp_specs)
                hist = histogrammar.SparselyBin(binWidth=bs['bin_width'],
                                                origin=bs['bin_offset'],
                                                quantity=df[col],
                                                value=hist)
            else:
                # string and booleans are treated as categories
                hist = histogrammar.Categorize(quantity=df[col], value=hist)

        # FIXME stick data types and number of dimension to histogram
        dta = [self.var_dtype[col] for col in columns]
        hist.datatype = dta[0] if len(columns) == 1 else dta
        hist.n_dim = len(columns)

        def n_bins(self):
            """Number of bins, taken from ``num`` or, failing that, ``size``."""
            if hasattr(self, 'num'):
                return self.num
            elif hasattr(self, 'size'):
                return self.size
            else:
                raise RuntimeError(
                    'Cannot retrieve number of bins from hgr hist')

        # A property is only honoured when it lives on the class: assigned to
        # the instance, ``hist.n_bins`` would just hand back the property
        # object itself. Attach it to the histogram's class instead, unless
        # that class already provides an ``n_bins`` property of its own.
        hist_cls = type(hist)
        if not isinstance(getattr(hist_cls, 'n_bins', None), property):
            hist_cls.n_bins = property(n_bins)

        return hist
Пример #12
0
    def test_bin_centers(self):
        """Test getting assigned bin-centers for Bin and SparselyBin histograms.

        Centers are checked both without arguments and for explicit
        ``low``/``high`` ranges.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame(
                    {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
                df2 = pd.DataFrame(
                    {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

                # histograms
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # np.testing is needed below; the explicit import is kept so the
                # assertions do not depend on what exactly Numpy() yields
                import numpy as np

                # sparse histograms: filled range by default, any range on request
                np.testing.assert_array_equal(hist2.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5])
                np.testing.assert_array_equal(hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
                np.testing.assert_array_equal(hist2.bin_centers(low=5, high=15),
                                              [5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5])
                np.testing.assert_array_equal(hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
                np.testing.assert_array_equal(hist3.bin_centers(low=2.1, high=5.6), [2.5, 3.5, 4.5, 5.5])

                # regular histograms: requested ranges stay within the configured range
                np.testing.assert_array_equal(hist4.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist5.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist4.bin_centers(low=5, high=15), [5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist5.bin_centers(low=2.1, high=9.1), [
                                              2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
Пример #13
0
    def test_num_bins(self):
        """Test getting the number of bins from lowest to highest bin.

        The count is checked without arguments and for explicit
        ``low``/``high`` ranges, on SparselyBin and Bin histograms.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # guard on the object yielded by Numpy(), i.e. the name bound
                # by this with-statement (not on an unrelated ``numpy`` name)
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 2, 4, 5, 7, 9, 11, 13, 13, 15]})
                df2 = pd.DataFrame({'A': [2, 4, 4, 6, 8, 7, 10, 14, 17, 19]})

                # building the 1d test histograms (two sparse, two regular)
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))
                hist5 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # no range given
                assert hist2.num_bins() == 16
                assert hist3.num_bins() == 18
                assert hist4.num_bins() == 20
                assert hist5.num_bins() == 20

                # range that extends beyond the upper end of the regular binning
                assert hist2.num_bins(low=10, high=25) == 15
                assert hist3.num_bins(low=10, high=25) == 15
                assert hist4.num_bins(low=10, high=25) == 10
                assert hist5.num_bins(low=10, high=25) == 10

                # range that extends beyond both ends of the regular binning
                assert hist2.num_bins(low=-10, high=28) == 38
                assert hist3.num_bins(low=-10, high=28) == 38
                assert hist4.num_bins(low=-10, high=28) == 20
                assert hist5.num_bins(low=-10, high=28) == 20
Пример #14
0
def test_get_consistent_numpy_1dhists():
    """Test extraction of number of entries and bin-edges/labels.

    The bin edges / bin labels of the input histograms are first made
    consistent with each other.
    """
    frame_a = pd.DataFrame({"A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
    frame_b = pd.DataFrame({"A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

    # two sparsely binned 1d histograms sharing origin and bin width
    sparse_a = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))
    sparse_b = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))

    # fill them
    sparse_a.fill.numpy(frame_a)
    sparse_b.fill.numpy(frame_b)

    # queried twice: histograms only, then histograms plus bin centers
    pair = [sparse_a, sparse_b]
    np_a, np_b = get_consistent_numpy_1dhists(pair, get_bin_labels=False)
    np_hists, found_centers = get_consistent_numpy_1dhists(pair, get_bin_labels=True)

    want_entries_a = [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0]
    want_entries_b = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0]
    want_edges = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    want_centers = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]

    # each numpy histogram is an (entries, bin_edges) pair
    checks = [
        (np_a[0], want_entries_a),
        (np_a[1], want_edges),
        (np_b[0], want_entries_b),
        (np_b[1], want_edges),
        (np_hists[0][0], want_entries_a),
        (np_hists[0][1], want_edges),
        (np_hists[1][0], want_entries_b),
        (np_hists[1][1], want_edges),
        (found_centers, want_centers),
    ]
    for actual, expected in checks:
        np.testing.assert_array_equal(actual, expected)
Пример #15
0
def project_on_x(hist):
    """Project n-dim histogram onto x-axis

    The input is returned unchanged when it is flagged as (at most)
    one-dimensional, when it has neither ``bins`` nor ``values``, or when
    that container is empty.

    :param hist: input histogrammar histogram
    :return: on x-axis projected histogram (1d)
    :raises TypeError: if ``hist`` is not a Bin, SparselyBin or Categorize
    """
    # projecting a 1d histogram on x gives the histogram itself
    if hasattr(hist, "n_dim") and hist.n_dim <= 1:
        return hist

    # find the container of sub-histograms; "bins" takes precedence over "values"
    if hasattr(hist, "bins"):
        keyed_by_bins = True
        n_sub = len(hist.bins)
    elif hasattr(hist, "values"):
        keyed_by_bins = False
        n_sub = len(hist.values)
    else:
        return hist
    if n_sub == 0:
        return hist

    # make empty clone of the outer axis only
    # note: hist.zero() is not used, b/c it copies the n-dim structure, which
    # screws up hist.toJsonString()
    if isinstance(hist, histogrammar.Bin):
        factory = histogrammar.Bin
        axis_kwargs = {"num": hist.num, "low": hist.low, "high": hist.high}
    elif isinstance(hist, histogrammar.SparselyBin):
        factory = histogrammar.SparselyBin
        axis_kwargs = {"binWidth": hist.binWidth, "origin": hist.origin}
    elif isinstance(hist, histogrammar.Categorize):
        factory = histogrammar.Categorize
        axis_kwargs = {}
    else:
        raise TypeError("Unknown histogram type. cannot get zero copy.")
    h_x = factory(quantity=hist.quantity, **axis_kwargs)

    # collapse every sub-histogram into a plain counter of its summed entries
    if keyed_by_bins:
        for key, sub_hist in hist.bins.items():
            h_x.bins[key] = histogrammar.Count.ed(sum_entries(sub_hist))
    else:
        for idx, sub_hist in enumerate(hist.values):
            h_x.values[idx] = histogrammar.Count.ed(sum_entries(sub_hist))

    return h_x
Пример #16
0
    def construct_empty_hist(self, df, columns):
        """Build the empty, nested histogram for a set of columns.

        Starting from a plain counter, the columns are visited last-to-first
        and each one wraps the histogram built so far in a new axis, so the
        first column ends up as the outermost axis. The data type(s) and the
        number of dimensions are attached to the returned histogram.

        :param df: input dataframe; ``df[col]`` is used as histogram quantity
        :param list columns: histogram columns
        :returns: created histogram (a bare ``hg.Count`` if ``columns`` is empty)
        """
        hist = hg.Count()

        for col in reversed(columns):
            # an instance of the column's scalar type decides the axis type
            sample = np.dtype(self.var_dtype[col]).type()

            if isinstance(sample, (np.number, np.datetime64)):
                # numbers and timestamps are put in a sparse binned histogram
                specs = self.var_bin_specs(columns, columns.index(col))
                hist = hg.SparselyBin(
                    binWidth=specs['bin_width'],
                    origin=specs['bin_offset'],
                    quantity=df[col],
                    value=hist,
                )
            else:
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=df[col], value=hist)

        # FIXME stick data types and number of dimension to histogram
        # (only the outermost histogram is decorated, after the loop)
        col_types = [self.var_dtype[col] for col in columns]
        n_columns = len(columns)
        hist.datatype = col_types[0] if n_columns == 1 else col_types
        hist.n_dim = n_columns

        return hist
Пример #17
0
    def _create_hist_with_time_axis(self, hist, time_bin_idx):
        """Create histogram with time-axis and place hist into it at time-value

        :param hist: input histogram to insert into histogram with time-axis
        :param str time_bin_idx: time-value at which to insert histogram
        :return: histogram with time-axis
        """
        # basic checks
        if time_bin_idx is None or not isinstance(time_bin_idx, (str, int)):
            raise TypeError(
                "time_bin_idx not set. should be an (ordered) string or integer."
            )

        ht = (hg.SparselyBin(binWidth=1.0, origin=0.0, quantity=lambda x: x)
              if isinstance(time_bin_idx, int) else hg.Categorize(
                  quantity=lambda x: x))  # noqa
        ht.bins[time_bin_idx] = hist
        ht.entries = hist.entries
        return ht
Пример #18
0
    def construct_empty_hist(self, df, features):
        """Build the empty, nested histogram for a set of features.

        Starting from a plain counter, the features are visited last-to-first
        and each one wraps the histogram built so far in a new axis, so the
        first feature ends up as the outermost axis. The data type(s) are
        attached to the returned histogram.

        :param df: input dataframe; ``df[col]`` is used as histogram quantity
        :param list features: histogram features
        :return: created histogram (a bare ``hg.Count`` if ``features`` is empty)
        """
        hist = hg.Count()

        for col in reversed(features):
            # an instance of the feature's scalar type decides the axis type
            sample = np.dtype(self.var_dtype[col]).type()

            if isinstance(sample, (np.number, np.datetime64)):
                # numbers and timestamps are put in a sparse binned histogram
                specs = self.var_bin_specs(features, features.index(col))
                hist = hg.SparselyBin(
                    binWidth=specs['bin_width'],
                    origin=specs['bin_offset'],
                    quantity=df[col],
                    value=hist,
                )
            else:
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=df[col], value=hist)

        # set data types in histogram: a single type for 1d, else the full list
        feature_types = [self.var_dtype[col] for col in features]
        if len(features) == 1:
            hist.datatype = feature_types[0]
        else:
            hist.datatype = feature_types
        return hist
Пример #19
0
def get_test_histograms1():
    """Get set 1 of test histograms.

    Builds a small mixed-type dataframe and three histograms of increasing
    dimensionality on top of it, each one using the previous histogram as its
    ``value`` template, and fills all three from the dataframe.

    :return: tuple ``(df, hist1, hist2, hist3)``
    """
    import pandas as pd
    import histogrammar as hg

    # dummy dataset with mixed types.
    # Built explicitly: the former ``pd.util.testing.makeMixedDataFrame()``
    # helper was a private testing utility that has been removed from pandas;
    # these are the columns it used to produce.
    df = pd.DataFrame({
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pd.bdate_range('1/1/2009', periods=5),
    })
    # convert timestamp (col D) to nanosec since 1970-1-1
    df['date'] = df['D'].apply(to_ns)
    df['boolT'] = True
    df['boolF'] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist1 = hg.Categorize(unit('C'))
    hist2 = hg.Bin(5, 0, 5, unit('A'), value=hist1)
    hist3 = hg.SparselyBin(origin=pd.Timestamp('2009-01-01').value, binWidth=pd.Timedelta(days=1).value,
                           quantity=unit('date'), value=hist2)
    # fill them
    hist1.fill.numpy(df)
    hist2.fill.numpy(df)
    hist3.fill.numpy(df)

    return df, hist1, hist2, hist3
Пример #20
0
def get_test_histograms1():
    """Get set 1 of test histograms.

    Builds a small mixed-type dataframe and three histograms of increasing
    dimensionality on top of it, each one using the previous histogram as its
    ``value`` template, and fills all three from the dataframe.

    :return: tuple ``(df, hist1, hist2, hist3)``
    """
    # dummy dataset with mixed types.
    # Built explicitly: the former ``pd.util.testing.makeMixedDataFrame()``
    # helper was a private testing utility that has been removed from pandas;
    # these are the columns it used to produce.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)
    df["boolT"] = True
    df["boolF"] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    # fill them
    hist1.fill.numpy(df)
    hist2.fill.numpy(df)
    hist3.fill.numpy(df)

    return df, hist1, hist2, hist3
Пример #21
0
    def _construct_empty_hist(self, columns):
        """Create an (empty) histogram of the right type.

        Build a multi-dimensional histogram by iterating through the columns
        in reverse order: the histogram built so far is passed as the
        ``value`` (sub-histogram) of the histogram created for the next
        column, so the first column ends up as the outermost dimension.

        The returned histogram gets three extra attributes: ``datatype`` (the
        single data type for one column, else the list of data types),
        ``n_dim`` (number of columns) and ``n_bins`` (see note in the body).

        :param columns: histogram columns (keys of ``self.datatype``)
        :returns: created histogram
        :rtype: histogrammar.Count when ``columns`` is empty, otherwise
            histogrammar.SparselyBin or histogrammar.Categorize
        """

        hist = hg.Count()

        # create a multi-dim histogram by iterating through the columns in reverse order
        # and passing a single-dim hist as input to the next column
        for col in reversed(columns):
            # histogram type depends on the data type
            dt = np.dtype(self.datatype[col])

            # processing function, e.g. only accept booleans during filling;
            # a per-column override in self.quantity wins over the
            # type-based default from QUANTITY
            f = self.quantity[col] if col in self.quantity else QUANTITY[
                dt.type]
            # f and col are bound as lambda defaults so every dimension keeps
            # its own values instead of the ones from the last loop iteration
            if len(columns) == 1:
                # df[col] is a pd.Series
                q = lambda x, fnc=f: fnc(x)
            else:
                # df[columns] is a pd.DataFrame
                # fix column to col
                q = lambda x, fnc=f, clm=col: fnc(x[clm])

            is_number = isinstance(dt.type(), np.number)
            is_timestamp = isinstance(dt.type(), np.datetime64)

            if is_number or is_timestamp:
                # numbers and timestamps are put in a sparse binned histogram;
                # fall back to the unit specs when none are set for the column
                bs = self.bin_specs.get(
                    col, self._unit_bin_specs
                    if is_number else self._unit_timestamp_specs)
                hist = hg.SparselyBin(binWidth=bs['bin_width'],
                                      origin=bs['bin_offset'],
                                      quantity=q,
                                      value=hist)
            else:
                # strings and booleans are treated as categories
                hist = hg.Categorize(quantity=q, value=hist)

        # FIXME stick data types and number of dimension to histogram
        dta = [self.datatype[col] for col in columns]
        hist.datatype = dta[0] if len(columns) == 1 else dta
        hist.n_dim = len(columns)

        @property
        def n_bins(self):
            # attribute names must be passed to hasattr() as strings; the
            # previous bare names ``num`` / ``size`` raised NameError
            if hasattr(self, 'num'):
                return self.num
            elif hasattr(self, 'size'):
                return self.size
            else:
                raise RuntimeError(
                    'Cannot retrieve number of bins from hgr hist.')

        # NOTE(review): a property stored on an *instance* is not invoked by
        # attribute access (descriptors only take effect when found on the
        # class), so ``hist.n_bins`` evaluates to the property object itself;
        # the getter is reachable as ``hist.n_bins.fget(hist)``. Kept as an
        # instance attribute to leave the existing interface untouched --
        # consider attaching it to the histogram class instead.
        hist.n_bins = n_bins

        return hist
Пример #22
0
def test_get_consistent_numpy_2dgrids():
    """Check the 2d entry grids extracted from two 2d histograms.

    The bin edges of the input histograms are first made consistent with
    each other, so both resulting grids share one shape. A 1d histogram is
    expected to be rejected with a ValueError.
    """
    frame_a = pd.DataFrame({
        "A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
        "C": ["f1", "f3", "f4", "f3", "f4", "f2", "f2", "f1", "f3", "f4"],
    })
    frame_b = pd.DataFrame({
        "A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
        "C": ["f7", "f3", "f5", "f8", "f9", "f2", "f3", "f6", "f7", "f7"],
    })

    # one 1d histogram, reused as the value template of two 2d histograms
    hist_1d = hg.Categorize(unit("C"))
    hist_2d_a = hg.SparselyBin(
        origin=0.0, binWidth=1.0, quantity=unit("A"), value=hist_1d)
    hist_2d_b = hg.SparselyBin(
        origin=0.0, binWidth=1.0, quantity=unit("A"), value=hist_1d)

    # fill them
    hist_1d.fill.numpy(frame_a)
    hist_2d_a.fill.numpy(frame_a)
    hist_2d_b.fill.numpy(frame_b)

    # the placeholder makes the message assertion below fail when no
    # ValueError is raised for the 1d input
    error_args = [""]
    try:
        get_consistent_numpy_2dgrids([hist_1d, hist_1d])
    except ValueError as err:
        error_args = err.args

    actual_grids = get_consistent_numpy_2dgrids([hist_2d_a, hist_2d_b])

    expected_a = np.asarray([
        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    ])
    expected_b = np.asarray([
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    ])
    expected_grids = [expected_a, expected_b]

    # the 1d input must have been refused with exactly this message
    assert (
        error_args[0] ==
        "Input histogram only has 1 dimensions (<2). Cannot compute 2d-grid.")

    # element-wise comparison of each extracted grid with its reference
    for position, expected in enumerate(expected_grids):
        assert (actual_grids[position] == expected).all()