Exemplo n.º 1
0
    def test_combine_first(self):
        """Check ``Series.combine_first`` for float, mixed-dtype and empty inputs."""
        values = tm.makeIntIndex(20).values.astype(float)
        series = Series(values, index=tm.makeIntIndex(20))

        series_copy = series * 2
        # Use the canonical ``np.nan`` spelling; the ``np.NaN`` alias no longer
        # exists in NumPy 2.0 and later.
        series_copy[::2] = np.nan

        # nothing used from the input
        combined = series.combine_first(series_copy)

        tm.assert_series_equal(combined, series)

        # Holes filled from input
        combined = series_copy.combine_first(series)
        assert np.isfinite(combined).all()

        tm.assert_series_equal(combined[::2], series[::2])
        tm.assert_series_equal(combined[1::2], series_copy[1::2])

        # mixed types: strings cover every other label, floats cover all labels
        index = tm.makeStringIndex(20)
        floats = Series(np.random.randn(20), index=index)
        strings = Series(tm.makeStringIndex(10), index=index[::2])

        combined = strings.combine_first(floats)

        tm.assert_series_equal(strings, combined.loc[index[::2]])
        tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]])

        # corner case: combining with an empty object-dtype Series
        ser = Series([1.0, 2, 3], index=[0, 1, 2])
        empty = Series([], index=[], dtype=object)
        result = ser.combine_first(empty)
        # The result is compared against ``ser`` carrying an object-dtype index.
        ser.index = ser.index.astype("O")
        tm.assert_series_equal(ser, result)
Exemplo n.º 2
0
 def setup(self):
     """Create ``self.df`` with two repeated string key columns and a random value column."""
     n_keys, n_repeats = 10000, 10
     first_key = tm.makeStringIndex(n_keys).values.repeat(n_repeats)
     second_key = tm.makeStringIndex(n_keys).values.repeat(n_repeats)
     noise = np.random.randn(n_keys * n_repeats)
     self.df = DataFrame({"key1": first_key, "key2": second_key, "value": noise})
Exemplo n.º 3
0
 def setup(self):
     """Store the rows of a stacked key/key/value array as ``self.col_array_list``.

     Two string key arrays (each label repeated ``K`` times) and one random
     float array are stacked row-wise; the resulting rows are kept as a list.
     """
     N = 10000
     K = 10
     key1 = tm.makeStringIndex(N).values.repeat(K)
     key2 = tm.makeStringIndex(N).values.repeat(K)
     col_array = np.vstack([key1, key2, np.random.randn(N * K)])
     # Only ``col_array`` is consumed; no NaN-filled copy is built because
     # nothing in this method reads or stores one.
     self.col_array_list = list(col_array)
Exemplo n.º 4
0
 def setup(self):
     """Build a 1000x30 random frame plus scalar labels and boolean indexers for it."""
     n_rows, n_cols = 1000, 30
     row_labels = tm.makeStringIndex(n_rows)
     col_labels = tm.makeStringIndex(n_cols)
     # Any warnings raised during construction are recorded, not shown.
     with warnings.catch_warnings(record=True):
         self.df = DataFrame(
             np.random.randn(n_rows, n_cols), index=row_labels, columns=col_labels
         )
     self.idx_scalar = row_labels[100]
     self.col_scalar = col_labels[10]
     # The same "> 0" mask is kept in three flavours: plain bool, object
     # dtype, and the nullable "boolean" dtype.
     positive = self.df[self.col_scalar] > 0
     self.bool_indexer = positive
     self.bool_obj_indexer = positive.astype(object)
     self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
Exemplo n.º 5
0
 def setup(self, other_cols, sep, na_rep, na_frac):
     """Prepare ``self.s`` and ``self.others`` with randomly masked string data.

     ``other_cols`` is the number of extra string columns to build and
     ``na_frac`` the probability that a value is masked out. ``sep`` and
     ``na_rep`` are not used by this method.
     """
     size = 10**5
     probabilities = [1 - na_frac, na_frac]

     def random_mask():
         # Draw True with probability 1 - na_frac and False with na_frac.
         return np.random.choice([True, False], size, p=probabilities)

     self.s = Series(tm.makeStringIndex(size)).where(random_mask())
     if other_cols == 0:
         # str.cat self-concatenates only for others=None
         self.others = None
         return

     columns = {}
     for position in range(other_cols):
         columns[position] = tm.makeStringIndex(size).where(random_mask())
     self.others = DataFrame(columns)
Exemplo n.º 6
0
    def setup(self, dtype):
        """Create int, float, bool and string frames sharing one index and column set.

        ``dtype`` is not used by this method.
        """
        n_rows, n_cols = 5000, 50
        shape = (n_rows, n_cols)
        self.index = tm.makeStringIndex(n_rows)
        self.columns = tm.makeStringIndex(n_cols)

        # Dict literals evaluate in order, so the random draws happen in the
        # sequence int, float, bool, string.
        payloads = {
            "df_int": np.random.randint(low=100, size=shape),
            "df_float": np.random.randn(n_rows, n_cols),
            "df_bool": np.random.choice([True, False], size=shape),
            "df_string": np.random.choice(list(string.ascii_letters), size=shape),
        }
        for attribute, data in payloads.items():
            frame = DataFrame(data, index=self.index, columns=self.columns)
            setattr(self, attribute, frame)
Exemplo n.º 7
0
 def setup(self):
     """Prepare dict-based inputs derived from a 5000x50 random frame.

     Sets ``self.data`` (column-oriented dict), ``self.dict_list`` (list of
     row records) and ``self.data2`` (2000 outer keys, each mapping the
     integers 0..99 to their float value).
     """
     n_rows, n_cols = 5000, 50
     self.index = tm.makeStringIndex(n_rows)
     self.columns = tm.makeStringIndex(n_cols)
     values = np.random.randn(n_rows, n_cols)
     frame = DataFrame(values, index=self.index, columns=self.columns)
     self.data = frame.to_dict()
     self.dict_list = frame.to_dict(orient="records")

     nested = {}
     for outer in range(2000):
         # A fresh inner dict per outer key; the dicts are not shared.
         nested[outer] = {inner: float(inner) for inner in range(100)}
     self.data2 = nested
Exemplo n.º 8
0
 def setup(self):
     """Build time-indexed and range-indexed frames plus a MultiIndex series.

     Sets ``self.df`` (minute-frequency index, ten float columns and a
     constant "foo" column), ``self.rng_subset`` (every other timestamp),
     ``self.df2`` (10000x30 frame), ``self.s`` (series on a two-level string
     MultiIndex) and ``self.s_subset`` (every other element of ``self.s``).
     """
     n_times = 10000
     rng = date_range(start="1/1/1970", periods=n_times, freq="1min")
     timed = DataFrame(np.random.rand(n_times, 10), index=rng, columns=range(10))
     timed["foo"] = "bar"
     self.df = timed
     self.rng_subset = Index(rng[::2])
     self.df2 = DataFrame(
         data=np.random.rand(10000, 30), index=range(10000), columns=range(30)
     )

     n_outer, n_inner = 5000, 200
     # Outer labels are each repeated n_inner times; inner labels are tiled
     # n_outer times, giving n_outer * n_inner index entries.
     outer = tm.makeStringIndex(n_outer).values.repeat(n_inner)
     inner = np.tile(tm.makeStringIndex(n_inner).values, n_outer)
     multi = MultiIndex.from_arrays([outer, inner])
     self.s = Series(np.random.randn(n_outer * n_inner), index=multi)
     self.s_subset = self.s[::2]
Exemplo n.º 9
0
 def setup(self, dtype, method):
     """Select the ``left``/``right`` index pair for the requested ``dtype``.

     ``dtype`` must be one of "datetime", "date_string", "int" or "strings";
     any other value raises ``KeyError``. ``right`` is always ``left``
     without its last element. ``method`` is not used by this method.
     """
     N = 10**5
     # "min" is the supported minute alias; the single-letter "T" alias is
     # deprecated in recent pandas releases.
     dates_left = date_range("1/1/2000", periods=N, freq="min")
     fmt = "%Y-%m-%d %H:%M:%S"
     date_str_left = Index(dates_left.strftime(fmt))
     int_left = Index(np.arange(N))
     str_left = tm.makeStringIndex(N)
     data = {
         "datetime": {
             "left": dates_left,
             "right": dates_left[:-1]
         },
         "date_string": {
             "left": date_str_left,
             "right": date_str_left[:-1]
         },
         "int": {
             "left": int_left,
             "right": int_left[:-1]
         },
         "strings": {
             "left": str_left,
             "right": str_left[:-1]
         },
     }
     self.left = data[dtype]["left"]
     self.right = data[dtype]["right"]
Exemplo n.º 10
0
    def setup(self):
        """Build the frames stored on ``self.df`` and ``self.df1`` .. ``self.df4``.

        ``self.df`` is a three-column random float frame on a two-level
        ``MultiIndex`` (400 x 250 entries) with NaNs written every 10000 rows,
        at a different row offset for each column. ``self.df1`` and
        ``self.df3`` are random integer frames; ``self.df2`` and ``self.df4``
        are copies of them whose "jim" column is overwritten with "joe".
        """
        n1 = 400
        n2 = 250
        index = MultiIndex(
            levels=[np.arange(n1), tm.makeStringIndex(n2)],
            codes=[np.repeat(range(n1), n2).tolist(),
                   list(range(n2)) * n1],
            names=["lev1", "lev2"],
        )
        arr = np.random.randn(n1 * n2, 3)
        arr[::10000, 0] = np.nan
        arr[1::10000, 1] = np.nan
        arr[2::10000, 2] = np.nan
        data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"])
        self.df = data

        n = 20000
        self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
                             columns=["jim", "joe", "jolie"])
        self.df2 = self.df1.copy()
        self.df2["jim"] = self.df2["joe"]

        # Floor division keeps the upper bound an int; ``randint`` bounds are
        # integers, and ``n / 10`` would pass a float.
        self.df3 = DataFrame(np.random.randint(1, n // 10, (n, 3)),
                             columns=["jim", "joe", "jolie"])
        self.df4 = self.df3.copy()
        self.df4["jim"] = self.df4["joe"]
Exemplo n.º 11
0
 def setup(self):
     """Create ``self.mi``, a three-level MultiIndex with randomly drawn codes."""
     n_labels, repeats = 200, 5000
     level_values = [
         np.arange(n_labels),
         tm.makeStringIndex(n_labels).values,
         1000 + np.arange(n_labels),
     ]
     n_entries = repeats * n_labels
     # One independent random code array per level.
     level_codes = []
     for _ in level_values:
         level_codes.append(np.random.choice(n_labels, n_entries))
     self.mi = MultiIndex(levels=level_values, codes=level_codes)
Exemplo n.º 12
0
def test_invalid_index_types_unicode():
    """``infer_freq`` raises ``ValueError`` for a string index (see gh-10822)."""
    # Odd error message on conversions to datetime for unicode.
    string_index = tm.makeStringIndex(10)
    expected_message = "Unknown string format"

    with pytest.raises(ValueError, match=expected_message):
        frequencies.infer_freq(string_index)
Exemplo n.º 13
0
 def setup_cache(self):
     """Return a dict of four one-million-element Series.

     Keys are "int64_small", "int64_large", "object_small" and
     "object_large"; the small/large variants draw from 100 and 10000
     distinct values respectively.
     """
     size = 10**6
     small, large = 100, 10000
     data = {
         "int64_small": Series(np.random.randint(0, small, size=size)),
         "int64_large": Series(np.random.randint(0, large, size=size)),
     }
     for key, n_unique in (("object_small", small), ("object_large", large)):
         pool = tm.makeStringIndex(n_unique)
         picks = np.random.randint(0, n_unique, size=size)
         data[key] = Series(pool.take(picks))
     return data
Exemplo n.º 14
0
 def setup(self):
     """Create ``self.x`` on 50000 string labels and ``self.y`` on a 40000-label sample of them."""
     total, sampled = 50000, 40000
     labels = tm.makeStringIndex(total)
     self.x = Series(np.random.randn(total), labels)
     y_values = np.random.randn(sampled)
     # Sample without replacement so the labels of ``self.y`` are not repeated.
     y_labels = np.random.choice(labels, sampled, replace=False)
     self.y = Series(y_values, index=y_labels)
Exemplo n.º 15
0
    def test_repr_mixed_big(self):
        """Smoke-test ``repr`` on a 200-row float/string frame with leading NaNs."""
        # big mixed
        n_rows = 200
        columns = {"A": np.random.randn(n_rows), "B": tm.makeStringIndex(n_rows)}
        biggie = DataFrame(columns, index=range(n_rows))
        # Label-based slice on the integer index: rows 0 through 20 inclusive.
        for column in ("A", "B"):
            biggie.loc[:20, column] = np.nan

        repr(biggie)
Exemplo n.º 16
0
def test_duplicated_large(keep):
    """``MultiIndex.duplicated`` agrees with ``hashtable.duplicated`` on its values (GH 9125)."""
    n_labels, repeats = 200, 5000
    level_values = [
        np.arange(n_labels),
        tm.makeStringIndex(n_labels),
        1000 + np.arange(n_labels),
    ]
    level_codes = [
        np.random.choice(n_labels, repeats * n_labels) for _ in level_values
    ]
    mi = MultiIndex(levels=level_values, codes=level_codes)

    tm.assert_numpy_array_equal(
        mi.duplicated(keep=keep),
        hashtable.duplicated(mi.values, keep=keep),
    )
Exemplo n.º 17
0
def _generate_dataframe():
    """Return a 2000x5 random float frame on an hourly index with an added "object" string column."""
    n_rows, n_float_cols = 2000, 5
    float_names = [f"float{position}" for position in range(n_float_cols)]
    hourly = date_range("20000101", periods=n_rows, freq="H")
    frame = DataFrame(
        np.random.randn(n_rows, n_float_cols), columns=float_names, index=hourly
    )
    frame["object"] = tm.makeStringIndex(n_rows)
    return frame
Exemplo n.º 18
0
    def setup(self, inplace):
        """Build the frames and series stored on ``self``.

        Sets ``self.df`` (two string key columns and a float column),
        ``self.df_nan`` (a copy with its first 10000 rows set to NaN),
        ``self.s`` / ``self.s_str`` (integer and string series) and
        ``self.df_int`` / ``self.df_bool``. ``inplace`` is not used by this
        method.
        """
        n_keys, n_repeats = 10000, 10
        columns = {
            "key1": tm.makeStringIndex(n_keys).values.repeat(n_repeats),
            "key2": tm.makeStringIndex(n_keys).values.repeat(n_repeats),
            "value": np.random.randn(n_keys * n_repeats),
        }
        self.df = DataFrame(columns)
        with_nans = self.df.copy()
        with_nans.iloc[:10000, :] = np.nan
        self.df_nan = with_nans

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        n_rows, n_groups = 1000000, 10000
        int_keys = np.random.randint(0, n_groups, size=n_rows)
        self.df_int = DataFrame({"key1": int_keys})
        bool_values = np.random.randint(0, 2, size=(n_groups, 10), dtype=bool)
        self.df_bool = DataFrame(bool_values)
Exemplo n.º 19
0
 def test_sub_fail(self):
     """Subtraction involving a string index raises ``TypeError``."""
     index = tm.makeStringIndex(100)
     # Each callable performs one subtraction that is expected to fail.
     failing_operations = (
         lambda: index - "a",
         lambda: index - index,
         lambda: index - index.tolist(),
         lambda: index.tolist() - index,
     )
     for operation in failing_operations:
         with pytest.raises(TypeError):
             operation()
Exemplo n.º 20
0
Arquivo: hdf.py Projeto: ygene2/pandas
 def setup(self, format):
     """Create ``self.df`` and write it to ``self.fname`` under key "df" in the given HDF ``format``."""
     self.fname = "__test__.h5"
     n_rows, n_float_cols = 100000, 5
     float_names = [f"float{position}" for position in range(n_float_cols)]
     hourly = date_range("20000101", periods=n_rows, freq="H")
     frame = DataFrame(
         np.random.randn(n_rows, n_float_cols), columns=float_names, index=hourly
     )
     frame["object"] = tm.makeStringIndex(n_rows)
     self.df = frame
     self.df.to_hdf(self.fname, "df", format=format)
Exemplo n.º 21
0
    def test_sub_fail(self):
        """Subtraction involving a string index raises ``TypeError`` with a known message."""
        index = tm.makeStringIndex(100)
        msg = "unsupported operand type|Cannot broadcast"

        # Each callable performs one subtraction that is expected to fail.
        failing_operations = (
            lambda: index - "a",
            lambda: index - index,
            lambda: index - index.tolist(),
            lambda: index.tolist() - index,
        )
        for operation in failing_operations:
            with pytest.raises(TypeError, match=msg):
                operation()
Exemplo n.º 22
0
    def test_add(self):
        """Adding string indexes, lists and scalar strings yields the expected Index."""
        labels = tm.makeStringIndex(100)
        doubled = pd.Index(labels.values * 2)
        tm.assert_index_equal(labels + labels, doubled)
        tm.assert_index_equal(labels + labels.tolist(), doubled)
        tm.assert_index_equal(labels.tolist() + labels, doubled)

        # test add and radd
        letters = pd.Index(list("abc"))
        tm.assert_index_equal(letters + "1", pd.Index(["a1", "b1", "c1"]))
        tm.assert_index_equal("1" + letters, pd.Index(["1a", "1b", "1c"]))
Exemplo n.º 23
0
def biggie_df_fixture(request):
    """Return a big mixed DataFrame or an empty one, selected by ``request.param``.

    "mixed" gives a 200-row frame with float column "A" and string column
    "B", rows 0 through 20 set to NaN; "empty" gives a frame with only a
    200-entry index. Any other value returns ``None``.
    """
    kind = request.param
    if kind == "mixed":
        columns = {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}
        frame = DataFrame(columns, index=np.arange(200))
        for column in ("A", "B"):
            frame.loc[:20, column] = np.nan
        return frame
    if kind == "empty":
        return DataFrame(index=np.arange(200))
    return None
Exemplo n.º 24
0
Arquivo: csv.py Projeto: ygene2/pandas
 def setup(self, skiprows):
     """Write a 20000-row mixed-dtype frame to ``self.fname`` as CSV.

     ``self.fname`` is read but not set here, so it must already exist on
     the instance. ``skiprows`` is not used by this method.
     """
     n_rows = 20000
     row_labels = tm.makeStringIndex(n_rows)
     columns = {
         "float1": np.random.randn(n_rows),
         "float2": np.random.randn(n_rows),
         "string1": ["foo"] * n_rows,
         "bool1": [True] * n_rows,
         "int1": np.random.randint(0, n_rows, size=n_rows),
     }
     frame = DataFrame(columns, index=row_labels)
     frame.to_csv(self.fname)
Exemplo n.º 25
0
def get_objs():
    """Return seven named indexes followed by one Series built on each of them."""

    def eastern_dates(k, name):
        # Date index localized to the US/Eastern time zone.
        return tm.makeDateIndex(k, name=name).tz_localize(tz="US/Eastern")

    makers = (
        tm.makeBoolIndex,
        tm.makeIntIndex,
        tm.makeFloatIndex,
        tm.makeDateIndex,
        eastern_dates,
        tm.makePeriodIndex,
        tm.makeStringIndex,
    )
    indexes = [make(10, name="a") for make in makers]

    # Every Series shares the same ten random values.
    values = np.random.randn(10)
    series = []
    for idx in indexes:
        series.append(Series(values, index=idx, name="a"))

    return indexes + series
Exemplo n.º 26
0
    def setup(self, threads):
        """Attach ``self.loop`` and ``self.parallel``, two callables that factorize 100000 strings.

        Raises ``NotImplementedError`` when ``have_real_test_parallel`` is
        falsy.
        """
        if not have_real_test_parallel:
            raise NotImplementedError

        strings = tm.makeStringIndex(100000)

        def loop():
            factorize(strings)

        # NOTE(review): presumably ``test_parallel`` runs the function across
        # ``threads`` threads -- confirm against its definition.
        @test_parallel(num_threads=threads)
        def parallel():
            factorize(strings)

        self.loop = loop
        self.parallel = parallel
Exemplo n.º 27
0
Arquivo: sql.py Projeto: ygene2/pandas
 def setup(self):
     """Create ``self.df`` and write it to table "test" of an in-memory SQLite engine.

     The frame has float, string, bool, int and datetime columns on a string
     index; rows at positions 1000..2999 of "float_with_nan" are NaN, and a
     "datetime_string" column holds the datetimes cast to ``str``.
     """
     N = 10000
     self.table_name = "test"
     self.con = create_engine("sqlite:///:memory:")
     self.df = DataFrame(
         {
             "float": np.random.randn(N),
             "float_with_nan": np.random.randn(N),
             "string": ["foo"] * N,
             "bool": [True] * N,
             "int": np.random.randint(0, N, size=N),
             "datetime": date_range("2000-01-01", periods=N, freq="s"),
         },
         index=tm.makeStringIndex(N),
     )
     # The index holds strings, so the integer bounds are positions, not
     # labels: select the rows with ``iloc``. An integer slice through
     # ``.loc`` on a non-integer index is rejected by pandas >= 2.0.
     nan_col = self.df.columns.get_loc("float_with_nan")
     self.df.iloc[1000:3000, nan_col] = np.nan
     self.df["datetime_string"] = self.df["datetime"].astype(str)
     self.df.to_sql(self.table_name, self.con, if_exists="replace")
Exemplo n.º 28
0
 def setup(self, orient, frame):
     """Build five frames of 10**5 rows with different index and column dtypes.

     Sets ``self.df``, ``self.df_date_idx``, ``self.df_td_int_ts``,
     ``self.df_int_floats`` and ``self.df_int_float_str``. ``orient`` and
     ``frame`` are not used by this method.
     """
     n_rows, n_cols = 10 ** 5, 5
     hourly = date_range("20000101", periods=n_rows, freq="H")
     timedeltas = timedelta_range(start=1, periods=n_rows, freq="s")
     datetimes = date_range(start=1, periods=n_rows, freq="s")
     ints = np.random.randint(100000000, size=n_rows)
     floats = np.random.randn(n_rows)
     strings = tm.makeStringIndex(n_rows)

     def numbered(groups, copies):
         # Map "<prefix>_1" .. "<prefix>_<copies>" to the same column object,
         # keeping the groups in the order given.
         return {
             f"{prefix}_{position}": column
             for prefix, column in groups
             for position in range(1, copies + 1)
         }

     self.df = DataFrame(np.random.randn(n_rows, n_cols), index=np.arange(n_rows))
     self.df_date_idx = DataFrame(np.random.randn(n_rows, n_cols), index=hourly)
     self.df_td_int_ts = DataFrame(
         numbered([("td", timedeltas), ("int", ints), ("ts", datetimes)], 2),
         index=hourly,
     )
     self.df_int_floats = DataFrame(
         numbered([("int", ints), ("float", floats)], 3),
         index=hourly,
     )
     self.df_int_float_str = DataFrame(
         numbered([("int", ints), ("float", floats), ("str", strings)], 2),
         index=hourly,
     )
Exemplo n.º 29
0
 def setup(self, index, index_structure):
     """Create ``self.s`` on an index of the requested kind and structure.

     ``index`` selects the label type ("string", "datetime" or "period").
     ``index_structure`` may be "nonunique_monotonic_inc" or
     "non_monotonic"; any other value leaves the sorted, unique index as is.
     ``self.lbl`` is the label at position 80000 of the final index.
     """
     size = 10**6
     labels = index
     if index == "string":
         labels = tm.makeStringIndex(size)
     elif index == "datetime":
         labels = date_range("1900", periods=size, freq="s")
     elif index == "period":
         labels = period_range("1900", periods=size, freq="s")
     labels = labels.sort_values()
     assert labels.is_unique and labels.is_monotonic_increasing

     if index_structure == "nonunique_monotonic_inc":
         # Duplicate the third label in place, then drop the last label so
         # the length is unchanged.
         labels = labels.insert(loc=2, item=labels[2])[:-1]
     elif index_structure == "non_monotonic":
         # Even positions first, then odd positions.
         evens, odds = labels[::2], labels[1::2]
         labels = evens.append(odds)
         assert len(labels) == size

     self.s = Series(np.random.rand(size), index=labels)
     self.lbl = labels[80000]
     # warm up index mapping
     self.s[self.lbl]
Exemplo n.º 30
0
    def setup_method(self, method):
        """Populate ``self`` with named indexes, Series built on them, and narrow-dtype Series.

        For every kind in bool/int/float/dt/dt_tz/period/string/unicode this
        sets ``self.<kind>_index`` and ``self.<kind>_series``. It also sets
        ``self.<dtype>_series`` for float32 and the 8/16/32-bit signed and
        unsigned integer dtypes, plus the aggregate lists ``self.indexes``,
        ``self.series``, ``self.narrow_series`` and ``self.objs``. ``method``
        is not used by this method.
        """
        label = "a"

        def eastern_dates(k, name):
            # Date index localized to the US/Eastern time zone.
            return tm.makeDateIndex(k, name=name).tz_localize(tz="US/Eastern")

        index_makers = {
            "bool": tm.makeBoolIndex,
            "int": tm.makeIntIndex,
            "float": tm.makeFloatIndex,
            "dt": tm.makeDateIndex,
            "dt_tz": eastern_dates,
            "period": tm.makePeriodIndex,
            "string": tm.makeStringIndex,
            "unicode": tm.makeUnicodeIndex,
        }
        self.indexes = []
        for kind, make in index_makers.items():
            idx = make(10, name=label)
            setattr(self, f"{kind}_index", idx)
            self.indexes.append(idx)

        arr = np.random.randn(10)
        self.series = []
        for kind, idx in zip(index_makers, self.indexes):
            if kind == "dt_tz":
                # The tz-aware Series is derived from its own index.
                ser = idx.to_series()
            else:
                ser = Series(arr, index=idx, name=label)
            setattr(self, f"{kind}_series", ser)
            self.series.append(ser)

        # To test narrow dtypes, we use narrower *data* elements, not *index* elements
        index = self.int_index
        self.float32_series = Series(arr.astype(np.float32), index=index, name=label)
        self.narrow_series = [self.float32_series]

        arr_int = np.random.choice(10, size=10, replace=False)
        for dtype_name in ("int8", "int16", "int32", "uint8", "uint16", "uint32"):
            narrow_values = arr_int.astype(getattr(np, dtype_name))
            ser = Series(narrow_values, index=index, name=label)
            setattr(self, f"{dtype_name}_series", ser)
            self.narrow_series.append(ser)

        self.objs = self.indexes + self.series + self.narrow_series