Example #1
    def test_combine_first_period(self):
        data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"],
                               freq="M")
        df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
        data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
        df2 = DataFrame({"P": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.PeriodIndex(
            ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"],
            freq="M")
        exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == data1.dtype

        # different freq
        dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"],
                              freq="D")
        df2 = DataFrame({"P": dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = [
            pd.Period("2011-01", freq="M"),
            pd.Period("2012-01-01", freq="D"),
            pd.NaT,
            pd.Period("2012-01-02", freq="D"),
            pd.Period("2011-03", freq="M"),
            pd.Period("2011-04", freq="M"),
        ]
        exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == "object"
Example #2
def slide_7():
    a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
               index=['f', 'e', 'd', 'c', 'b', 'a'])
    b = Series(np.arange(len(a), dtype=np.float64),
               index=['f', 'e', 'd', 'c', 'b', 'a'])
    print('***a***')
    print(a)
    print('***b***')
    print(b)
    b[-1] = np.nan
    print('***a***')
    print(a)
    print('***b***')
    print(b)
    print(np.where(pd.isnull(a), b, a))

    print('#####combine_first#####')
    print('***b[:-2]***')
    print(b[:-2])
    print('***a[2:]***')
    print(a[2:])
    print('b[:-2].combine_first(a[2:])')
    print(b[:-2].combine_first(a[2:]))

    df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                     'b': [np.nan, 2., np.nan, 6.],
                     'c': range(2, 18, 4)})
    df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                     'b': [np.nan, 3., 4., 6., 8.]})
    print('***df1***')
    print(df1)
    print('***df2***')
    print(df2)
    print(df1.combine_first(df2))
Example #3
    def test_combine_first_int(self):
        # GH14687 - integer series that do not align exactly

        df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
        df2 = DataFrame({"a": [1, 4]}, dtype="int64")

        result_12 = df1.combine_first(df2)
        expected_12 = DataFrame({"a": [0, 1, 3, 5]})
        tm.assert_frame_equal(result_12, expected_12)

        result_21 = df2.combine_first(df1)
        expected_21 = DataFrame({"a": [1, 4, 3, 5]})
        tm.assert_frame_equal(result_21, expected_21)
Example #4
    def test_combine_first_return_obj_type_with_bools(self):
        # GH3552

        df1 = DataFrame([[np.nan, 3.0, True], [-4.6, np.nan, True],
                         [np.nan, 7.0, False]])
        df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]],
                        index=[1, 2])

        expected = Series([True, True, False], name=2, dtype=bool)

        result_12 = df1.combine_first(df2)[2]
        tm.assert_series_equal(result_12, expected)

        result_21 = df2.combine_first(df1)[2]
        tm.assert_series_equal(result_21, expected)
Example #5
    def test_combine_first_same_as_in_update(self):
        # gh 3016 (same as in update)
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )

        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
        result = df.combine_first(other)
        tm.assert_frame_equal(result, df)

        df.loc[0, "A"] = np.nan
        result = df.combine_first(other)
        df.loc[0, "A"] = 45
        tm.assert_frame_equal(result, df)
Example #6
    def update(self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None,
               barsize: str=None, tz: str=None, standardize_index=True):
        """
        Input data is combined with self.df. Overlapped data will be
        overwritten by non-null values of input data. Indexes and Columns
        will be unioned.
        """
        # Check input data type
        if not (isinstance(df_in, pd.DataFrame)):
            raise TypeError('Input data must be a pandas.DataFrame.')

        # Check empty data
        if df_in.empty:
            return self

        # Standardize index
        if standardize_index:
            df_in = self._standardize_index(
                df_in.copy(), symbol=symbol, datatype=datatype,
                barsize=barsize, tz=tz)

        # Combine input DataFrame with internal self.df
        if self.df.empty:  # Initialize self.df
            self.df = df_in.sort_index()
        else:
            df_in = df_in.tz_convert(self.tzinfo, level=self.__class__.dtlevel)
            self.df = df_in.combine_first(self.df).sort_index()

        # Post-combination processing
        # Fill NaN, and enforce barcount and volume columns dtype to int64
        self.df.fillna(-1, inplace=True)
        for col in self.df.columns:
            if col.lower() in ('barcount', 'volume'):
                self.df[col] = self.df[col].astype(np.int64)
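
The docstring's behavior comes directly from calling combine_first with the new data on the left; a minimal standalone sketch of those semantics (illustrative names, not part of the class above):

import numpy as np
import pandas as pd

stored = pd.DataFrame({"open": [1.0, 2.0]},
                      index=pd.to_datetime(["2021-01-01", "2021-01-02"]))
incoming = pd.DataFrame({"open": [2.5, np.nan], "volume": [100.0, 200.0]},
                        index=pd.to_datetime(["2021-01-02", "2021-01-03"]))

# Non-null incoming values overwrite the stored ones on the overlap
# (open goes 2.0 -> 2.5 on 01-02); cells the input leaves NaN keep
# whatever was stored; indexes and columns are unioned.
merged = incoming.combine_first(stored).sort_index()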
Example #7
    def test_combine_first(self):
        # disjoint
        head, tail = self.frame[:5], self.frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = self.frame.reindex(combined.index)
        assert_frame_equal(combined, reordered_frame)
        assert tm.equalContents(combined.columns, self.frame.columns)
        assert_series_equal(combined['A'], reordered_frame['A'])

        # same index
        fcopy = self.frame.copy()
        fcopy['A'] = 1
        del fcopy['C']

        fcopy2 = self.frame.copy()
        fcopy2['B'] = 0
        del fcopy2['D']

        combined = fcopy.combine_first(fcopy2)

        assert (combined['A'] == 1).all()
        assert_series_equal(combined['B'], fcopy['B'])
        assert_series_equal(combined['C'], fcopy2['C'])
        assert_series_equal(combined['D'], fcopy['D'])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head['A'] = 1

        combined = head.combine_first(tail)
        assert (combined['A'][:10] == 1).all()

        # reverse overlap
        tail['A'][:10] = 0
        combined = tail.combine_first(head)
        assert (combined['A'][:10] == 0).all()

        # no overlap
        f = self.frame[:10]
        g = self.frame[10:]
        combined = f.combine_first(g)
        assert_series_equal(combined['A'].reindex(f.index), f['A'])
        assert_series_equal(combined['A'].reindex(g.index), g['A'])

        # corner cases
        comb = self.frame.combine_first(self.empty)
        assert_frame_equal(comb, self.frame)

        comb = self.empty.combine_first(self.frame)
        assert_frame_equal(comb, self.frame)

        comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
        df2 = DataFrame({}, columns=['b'])
        result = df.combine_first(df2)
        assert 'b' in result
Example #8
def update_df(df: pd.DataFrame,
              new_df: pd.DataFrame,
              on: (str, list) = None,
              mode='update'):
    """
    根据某一列更新dataframe里的数据

    :param df: 待升级的
    :param new_df: 新表
    :param on: 根据哪一列升级,默认为None,使用index
    :param mode:处理方式,update:直接更新对应位置的数值,insert:只有对应位置为空时才更新
    :return:
    """
    v1 = len(df)
    if on is not None:
        on = ensure_list(on)
        new_df = new_df.drop_duplicates()
        if any(new_df[on].duplicated()):
            raise ValueError('new_df contains duplicate key-column values '
                             'mapping to different rows; please check')
        new_df = df[on].drop_duplicates().merge(new_df, how='inner', on=on)
        df = df.set_index(on, drop=False)
        new_df = new_df.set_index(on, drop=False)
    if mode == 'update':
        df.update(new_df)
    elif mode == 'insert':
        df = df.combine_first(new_df)
    else:
        raise ValueError(f'Invalid mode {mode}; valid options are update or insert')
    df = df.reset_index(drop=True)
    if on is not None:
        if v1 != len(df):
            raise ValueError('The DataFrame structure changed after the update; please check')
    return df
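
The two modes reduce to two pandas primitives: `update` overwrites every position for which `new_df` has a value, while `combine_first` only fills positions that are currently null. A minimal sketch of the difference, independent of the helper above:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, np.nan]})
new = pd.DataFrame({"x": [10.0, 20.0]})

updated = df.copy()
updated.update(new)               # mode='update': x becomes [10.0, 20.0]
inserted = df.combine_first(new)  # mode='insert': x becomes [1.0, 20.0]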
Example #10
    def test_combine_first_align_nan(self):
        # GH 7509 (not fixed)
        dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
        dfb = DataFrame([[4], [5]], columns=["b"])
        assert dfa["a"].dtype == "datetime64[ns]"
        assert dfa["b"].dtype == "int64"

        res = dfa.combine_first(dfb)
        exp = DataFrame(
            {
                "a": [pd.Timestamp("2011-01-01"), pd.NaT],
                "b": [2, 5]
            },
            columns=["a", "b"],
        )
        tm.assert_frame_equal(res, exp)
        assert res["a"].dtype == "datetime64[ns]"
        # TODO: this must be int64
        assert res["b"].dtype == "int64"

        res = dfa.iloc[:0].combine_first(dfb)
        exp = DataFrame(
            {"a": [np.nan, np.nan], "b": [4, 5]},
            columns=["a", "b"],
        )
        tm.assert_frame_equal(res, exp)
        # TODO: this must be datetime64
        assert res["a"].dtype == "float64"
        # TODO: this must be int64
        assert res["b"].dtype == "int64"
Example #11
    def test_combine_first(self, float_frame):
        # disjoint
        head, tail = float_frame[:5], float_frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = float_frame.reindex(combined.index)
        tm.assert_frame_equal(combined, reordered_frame)
        assert tm.equalContents(combined.columns, float_frame.columns)
        tm.assert_series_equal(combined["A"], reordered_frame["A"])

        # same index
        fcopy = float_frame.copy()
        fcopy["A"] = 1
        del fcopy["C"]

        fcopy2 = float_frame.copy()
        fcopy2["B"] = 0
        del fcopy2["D"]

        combined = fcopy.combine_first(fcopy2)

        assert (combined["A"] == 1).all()
        tm.assert_series_equal(combined["B"], fcopy["B"])
        tm.assert_series_equal(combined["C"], fcopy2["C"])
        tm.assert_series_equal(combined["D"], fcopy["D"])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head["A"] = 1

        combined = head.combine_first(tail)
        assert (combined["A"][:10] == 1).all()

        # reverse overlap
        tail.iloc[:10, tail.columns.get_loc("A")] = 0
        combined = tail.combine_first(head)
        assert (combined["A"][:10] == 0).all()

        # no overlap
        f = float_frame[:10]
        g = float_frame[10:]
        combined = f.combine_first(g)
        tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
        tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])

        # corner cases
        comb = float_frame.combine_first(DataFrame())
        tm.assert_frame_equal(comb, float_frame)

        comb = DataFrame().combine_first(float_frame)
        tm.assert_frame_equal(comb, float_frame)

        comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
        df2 = DataFrame(columns=["b"])
        result = df.combine_first(df2)
        assert "b" in result
Example #12
def test_combine_first_with_nan_multiindex():
    # gh-36562

    mi1 = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]],
        names=["a", "b"])
    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
    mi2 = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"])
    s = Series([1, 2, 3, 4, 5, 6], index=mi2)
    res = df.combine_first(DataFrame({"d": s}))
    mi_expected = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=["a", "b"],
    )
    expected = DataFrame(
        {
            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
            "d": [
                1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0,
                np.nan
            ],
        },
        index=mi_expected,
    )
    tm.assert_frame_equal(res, expected)
Example #13
def test_combine_first_int64_not_cast_to_float64():
    # GH 28613
    df_1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    df_2 = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})
    result = df_1.combine_first(df_2)
    expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(result, expected)
Example #14
    def test_combine_first_convert_datatime_correctly(self, data1, data2,
                                                      data_expected):
        # GH 3593

        df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
        result = df1.combine_first(df2)
        expected = DataFrame({"a": data_expected})
        tm.assert_frame_equal(result, expected)
Example #15
    def test_combine_first_int(self):
        # GH14687 - integer series that do not align exactly

        df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
        df2 = DataFrame({"a": [1, 4]}, dtype="int64")

        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["a"].dtype == "int64"
Example #16
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
    # GH28481
    na_value = nulls_fixture
    frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
    other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])

    result = frame.combine_first(other)
    expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
Example #17
def add_df(self, df: pd.DataFrame):
    """This function adds new data to the existing data.
     It places it into the positions referred to by the "Index" date.
     If there are days with the same index, they get overwritten.
     Alternatives (not used):
        self.dailyData = pd.concat([self.dailyData, new_dailyData], verify_integrity=True)
        self.dailyData = pd.merge(self.dailyData, new_dailyData)
    """
    df = df.combine_first(self._df)
    self.df = df
Example #18
def table_OD(list_coordsO, list_idsO, list_coordsD, list_idsD,
             OSRM_max_table=100, host='http://localhost:5000'):
    """
    Function wrapping OSRM 'table' function in order to get a matrix of
    time distance between different origins and destinations (N:M)
    Params :
        list_coordsO: list
            A list of coord as [x, y] for the origins, like :
                 list_coords = [[21.3224, 45.2358],
                                [21.3856, 42.0094],
                                [20.9574, 41.5286]] (coords have to be float)
        list_idsO: list
            A list of the corresponding unique id for the origins, like :
                     list_ids = ['name1',
                                 'name2',
                                 'name3'] (id can be str, int or float)
        list_coordsD: list
            A list of coord as [x, y] for the destinations (same kind as the
            origins)
        list_idsD: list
            A list of the corresponding unique id for the destinations (same
            kind as the origins)
        OSRM_max_table: int, default=100
            The --max-table-size defined when lauching osrm-routed (default is
            100). It will be used to clip the request in many 'table' requests
            and reconstruct the matrix.
        host: str, default 'http://localhost:5000'
            Url and port of the OSRM instance (no final bakslash)

    Output:
        A labeled DataFrame containing the time matrix in minutes
            (or NaN when OSRM encounter an error to compute a route)

        -1 or an empty DataFrame is return in case of any other error
            (wrong list of coords/ids, unknow host,
            wrong response from the host, etc.)
    """
    if list_coordsO == list_coordsD and list_idsO == list_idsD:
        list_coords, list_ids = list_coordsO, list_idsO
    else:
        list_coords = list_coordsO + list_coordsD
        list_ids = list_idsO + list_idsD

    if len(list_coords) > OSRM_max_table:
        gpd_coords = list(chunk(list_coords, OSRM_max_table//2))
        gpd_ids = list(chunk(list_ids, OSRM_max_table//2))
        df = DataFrame(index=list_ids, columns=list_ids, dtype=float)
        for lcoord, lid in zip(mat_range2d(gpd_coords), mat_range2d(gpd_ids)):
            df = df.combine_first(table(list(lcoord), list(lid), host=host))
    else:
        df = table(list_coords, list_ids, host=host)

    try:
        return df[list_idsO].filter(list_idsD, axis=0)
    except Exception as err:
        print(err)
        return -1
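
When the coordinate list exceeds --max-table-size, the loop above rebuilds the full matrix from per-chunk responses, each `combine_first` call filling in the block of cells the latest request returned. A toy sketch of that reconstruction pattern (dummy constant data, no OSRM server needed):

import pandas as pd

ids = ['name1', 'name2', 'name3', 'name4']
full = pd.DataFrame(index=ids, columns=ids, dtype=float)  # all NaN

# Each (rows, cols) pair stands in for one 'table' request on a chunk of ids
for rows in (ids[:2], ids[2:]):
    for cols in (ids[:2], ids[2:]):
        block = pd.DataFrame(1.0, index=rows, columns=cols)
        full = full.combine_first(block)  # NaN cells filled, known cells kept

assert not full.isna().any().any()  # every block has been merged in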
Example #19
    def test_combine_first_with_asymmetric_other(self, val):
        # see gh-20699
        df1 = DataFrame({"isNum": [val]})
        df2 = DataFrame({"isBool": [True]})

        res = df1.combine_first(df2)
        exp = DataFrame({"isBool": [True], "isNum": [val]})

        tm.assert_frame_equal(res, exp)
Example #20
def combine(self, devices = None, readings = None):
    """
    Combines devices from a test into a new dataframe, following the 
    naming as follows: DEVICE-NAME_READING-NAME
    Parameters
    ----------
        devices: list or None
            None
            If None, includes all the devices in self.devices
        readings: list or None
            None
            If None, includes all the readings in self.readings
    Returns
    -------
        Dataframe if successful or False otherwise
    """ 

    dfc = DataFrame()

    if devices is None:
        dl = list(self.devices.keys())
    else: 
        # Only pick the ones that are actually present
        dl = list(set(devices).intersection(list(self.devices.keys())))
        if len(dl) != len(devices):
            std_out('Requested devices are not all present in devices', 'WARNING')
            std_out(f'Discarding {set(devices).difference(list(self.devices.keys()))}')

    for device in dl:
        new_names = list()

        if readings is None:
            rl = list(self.devices[device].readings.columns)
        else: 
            # Only pick the ones that are actually present
            rl = list(set(readings).intersection(list(self.devices[device].readings.columns)))

            if any([reading not in rl for reading in readings]):
                std_out(f'Requested readings are not all present in readings for device {device}', 'WARNING')
                std_out(f'Discarding {list(set(readings).difference(list(self.devices[device].readings.columns)))}', 'WARNING')
        
        rename = dict()

        for reading in rl:
            rename[reading] = reading + '_' + self.devices[device].id
        
        df = self.devices[device].readings[rl].copy()
        df.rename(columns = rename, inplace = True)
        dfc = dfc.combine_first(df)
    
    if dfc.empty:
        std_out('Error occurred while combining data. Review data', 'ERROR')
        return False
    else:
        std_out('Data combined successfully', 'SUCCESS')
        return dfc
Example #21
def cal_SMB_HML(ret,
                size,
                BM,
                percentile1=None,
                percentile2=None,
                independent=True,
                exclude_30_small_size=False):
    if exclude_30_small_size:
        size = ClipQuantile(size, [0.0, 0.3, 1.0], [-1.0, 1.0])
    ret, size, BM = IndexAlign(ret, size, BM)
    valid_ = ~pd.isnull(
        BM + ret + size
    )  # TypeError: bad operand type for unary ~: 'float' ---> index or columns mismatch
    size = size[valid_]
    BM = BM[valid_]
    ret = ret[valid_]
    if percentile1 is None:
        percentile1 = [0.0, 0.5, 1.0]  # size
        percentile2 = [0.0, 0.3, 0.7, 1.0]  # value
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]
    if independent:
        #mark_1 = pd.DataFrame([pd.qcut(size.iloc[i], q=percentile1, labels=label_1) for i in size.index[:-1]],
        #                      index=size.index[:-1]) # raises an error
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])
        mark_2 = DataFrame([
            qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index
        ])  # indi has already been shift(1)-ed, so its time index matches the portfolio holding period
    else:
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])  # indi has already been shift(1)-ed, so its time index matches the portfolio holding period
        mark_2 = DataFrame(index=mark_1.index, columns=mark_1.columns)
        for l_ in label_1:
            tmp = DataFrame([
                qcut(BM.loc[i][mark_1.iloc[i] == l_],
                     q=percentile2,
                     labels=label_2) for i in BM.index
            ])
            mark_2 = mark_2.combine_first(tmp)
    #valid_ = ~(pd.isnull(mark_1 + mark_2) | pd.isnull(ret.iloc[1:]))  # valid stocks must have the previous month's indicator and be tradable in the current period
    df = DataFrame()
    df['rtn'] = ret.stack()
    df['ref1'] = mark_1.stack()
    df['ref2'] = mark_2.stack()
    tmp = df.groupby(level=0).apply(
        lambda g: g.groupby(['ref1', 'ref2']).mean()).unstack()['rtn']
    #tmp.columns = tmp.columns.get_level_values(1)
    tmp.index.names = ('trddt', 'ref1')
    HML = tmp.mean(axis=0, level=0)
    SMB = tmp.mean(axis=1).unstack()
    return SMB.iloc[:, -1] - SMB.iloc[:, 0], HML.iloc[:, -1] - HML.iloc[:, 0]
Example #22
def save_csv(path_csv, df: pd.DataFrame) -> None:
    """Saves dataframe as csv and merges with existing csv if necessary."""
    if os.path.exists(path_csv):
        df_old = pd.read_csv(path_csv)
        col_index = df_old.columns[0]  # Assumes first col is index col
        df_old = df_old.set_index(col_index)
        df = df.combine_first(df_old)
    df = df.sort_index(axis=1)
    retry_if_oserror(df.to_csv)(path_csv)
    print('Saved:', path_csv)
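
The helper assumes the first CSV column is the index, since combine_first aligns on the index; a small sketch of that round-trip (hypothetical path, illustrative data):

import pandas as pd

path_csv = '/tmp/metrics.csv'  # hypothetical path
pd.DataFrame({'v': [1.0, 2.0]}, index=pd.Index(['a', 'b'], name='k')).to_csv(path_csv)

df = pd.DataFrame({'v': [20.0], 'w': [5.0]}, index=pd.Index(['b'], name='k'))
df_old = pd.read_csv(path_csv).set_index('k')  # first col back as the index
merged = df.combine_first(df_old)  # new values win; old-only rows survive
merged.sort_index(axis=1).to_csv(path_csv)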
Example #23
def prepare(self, measurand, inputs, options=dict()):
    """
    Prepares a test for a regression model
    Parameters
    ----------
        measurand: dict
            measurand = {'8019043': ['NO2']}
        inputs: dict
            inputs per device and reading
                inputs = {'devicename': ['reading-1', 'reading-2']}
        options: dict
            Options including data processing. Defaults in config._model_def_opt
    Returns
    -------
        df = pandas Dataframe
        measurand_name = string
    """

    options = dict_fmerge(options, config._model_def_opt)

    # Measurand
    measurand_device = list(measurand.keys())[0]
    measurand_metric = measurand[measurand_device][0]
    measurand_name = measurand[measurand_device][0] + '_' + measurand_device

    df = DataFrame()
    df[measurand_name] = self.devices[measurand_device].readings[
        measurand_metric]

    for input_device in inputs.keys():
        combined_df = self.combine(devices=[input_device],
                                   readings=inputs[input_device])
        df = df.combine_first(combined_df)

    if options['common_avg']:

        common_channels = inputs[list(inputs.keys())[0]]
        for input_device in inputs.keys():
            common_channels = list(
                set(common_channels).intersection(set(inputs[input_device])))
        std_out(f'Performing avg in common columns {common_channels}')
        for channel in common_channels:
            columns_list = [
                channel + '_' + device for device in list(inputs.keys())
            ]
            df[channel + '_AVG'] = df[columns_list].mean(axis=1)

        df = df.loc[:,
                    df.columns.str.contains("_AVG")
                    | df.columns.str.contains(measurand_name)]

    if options['clean_na'] is not None:
        df = clean(df, options['clean_na'], how='any')

    return df, measurand_name
Example #24
def merge_update(df_left: pd.DataFrame,
                 df_right: pd.DataFrame,
                 on=None,
                 left_on=None,
                 right_on=None,
                 left_index=False,
                 right_index=False,
                 prefer='right',
                 adjust_dtypes=True):
    """
    Merge `df_right` with `df_left` in an update method:
        - distinct left/right columns are combined into the new dataframe
        - for common columns, a `combine_first` is performed (left to right if `prefer='left'`, right to left otherwise)
            this update replace NaN values with non-NaNs values where possible
        
    If `prefer` is 'left', right values are ignored if left ones are not NaNs.
    """
    if all((_ is None for _ in (on, left_on, right_on))):
        # based on index values
        if prefer == 'left':
            m = df_left.combine_first(df_right)
        else:
            m = df_right.combine_first(df_left)
    else:
        # use provided id columns
        if left_index:
            ml = df_left
        else:
            ml = df_left.set_index(left_on or on)
        if right_index:
            mr = df_right
        else:
            mr = df_right.set_index(right_on or on)
        if prefer == 'left':
            m = ml.combine_first(mr).reset_index()
        else:
            m = mr.combine_first(ml).reset_index()

    if adjust_dtypes:
        m = m.infer_objects()

    return m
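
Since `merge_update` depends only on pandas, a quick usage sketch with a key column (illustrative data):

import pandas as pd

left = pd.DataFrame({'id': [1, 2], 'x': [1.0, 2.0]})
right = pd.DataFrame({'id': [2, 3], 'x': [5.0, 6.0]})

# prefer='right' (default): right wins on the common id -> x is [1.0, 5.0, 6.0]
merge_update(left, right, on='id')
# prefer='left': left values survive on the common id -> x is [1.0, 2.0, 6.0]
merge_update(left, right, on='id', prefer='left')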
Example #25
    def test_combine_first_mixed(self):
        a = Series(['a', 'b'], index=lrange(2))
        b = Series(lrange(2), index=lrange(2))
        f = DataFrame({'A': a, 'B': b})

        a = Series(['a', 'b'], index=lrange(5, 7))
        b = Series(lrange(2), index=lrange(5, 7))
        g = DataFrame({'A': a, 'B': b})

        # TODO(wesm): no verification?
        combined = f.combine_first(g)  # noqa
Example #27
    def test_combine_first_string_dtype_only_na(self):
        # GH: 37519
        df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string")
        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string")
        df.set_index(["a", "b"], inplace=True)
        df2.set_index(["a", "b"], inplace=True)
        result = df.combine_first(df2)
        expected = DataFrame(
            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
        ).set_index(["a", "b"])
        tm.assert_frame_equal(result, expected)
Example #28
    def test_combine_first_timedelta(self):
        data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
        df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
        data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
        df2 = DataFrame({"TD": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.TimedeltaIndex(
            ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"])
        exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["TD"].dtype == "timedelta64[ns]"
Example #29
def get_feature_df(df: pd.DataFrame) -> pd.DataFrame:
    LOG.info("Cleaning up the text...")
    df['feature_list'] = df['Text'].apply(clean_text)
    LOG.info("Extracted new features...")
    df[['lemmas', 'n_stopwords', 'n_punct', 'n_pos', 'n_urls',
        'n_tokens']] = pd.DataFrame(df['feature_list'].tolist(),
                                    index=df.index)
    POS_df = pd.DataFrame(df['n_pos'].to_list(), index=df.index).fillna(0)
    LOG.debug("Got POS_df")
    final_df = df.combine_first(POS_df)
    LOG.debug("Got final_df")
    return final_df
Example #30
    def test_combine_first_mixed(self):
        a = Series(["a", "b"], index=range(2))
        b = Series(range(2), index=range(2))
        f = DataFrame({"A": a, "B": b})

        a = Series(["a", "b"], index=range(5, 7))
        b = Series(range(2), index=range(5, 7))
        g = DataFrame({"A": a, "B": b})

        exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
        combined = f.combine_first(g)
        tm.assert_frame_equal(combined, exp)
Example #31
    def test_combine_first_mixed(self):
        a = Series(['a', 'b'], index=lrange(2))
        b = Series(lrange(2), index=lrange(2))
        f = DataFrame({'A': a, 'B': b})

        a = Series(['a', 'b'], index=lrange(5, 7))
        b = Series(lrange(2), index=lrange(5, 7))
        g = DataFrame({'A': a, 'B': b})

        exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
                           index=[0, 1, 5, 6])
        combined = f.combine_first(g)
        tm.assert_frame_equal(combined, exp)
Example #32
def test_combine_first_timestamp_bug_NaT():
    # GH28481
    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame(
        [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"])

    result = frame.combine_first(other)
    expected = DataFrame(
        [[pd.NaT, datetime(2020, 1, 1),
          datetime(2020, 1, 2)]],
        columns=["a", "b", "c"])

    tm.assert_frame_equal(result, expected)
Example #34
def get_tfidf(df: pd.DataFrame,
              tfidf_vect: TfidfVectorizer = tfidf_v,
              fit: bool = True):
    df.dropna(subset=['lemmas'], inplace=True)
    LOG.debug(f"feature columns: {df.columns}")
    if fit:
        tfidf = tfidf_vect.fit_transform(df['lemmas'])
        joblib.dump(tfidf_vect, path.join(OUT_PATH, 'tfidf_vect.joblib'))
    else:
        tfidf = tfidf_vect.transform(df['lemmas'])
    tfidf_df = pd.DataFrame(tfidf.toarray(),
                            columns=tfidf_vect.get_feature_names(),
                            index=df.index)
    final_df = df.combine_first(tfidf_df)
    return final_df
Example #35
    def test_combine_first_doc_example(self):
        # doc example
        df1 = DataFrame(
            {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
        )

        df2 = DataFrame(
            {
                "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
                "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
            }
        )

        result = df1.combine_first(df2)
        expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
        tm.assert_frame_equal(result, expected)
Example #36
def cal_idxrel_sym(f, s, cal_self=False, self_val=0, *args, **kargs):
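    """Apply f to every pair of elements of s and return the results as a
    symmetric DataFrame: one triangle is computed explicitly and mirrored
    onto the other via combine_first with the transpose; the diagonal is
    f(s[i], s[i]) when cal_self is True, else self_val."""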
    slen = len(s)
    df = {}
    for i in range(slen):
        if np.isscalar(s[i]) and np.isnan(s[i]):
            df[s.index[i]] = Series(np.nan, index=s.index)
            continue
        res = []
        if cal_self:
            res.append(f(s[i], s[i]))
        else:
            res.append(self_val)
        for j in range(i+1, slen):
            if np.isscalar(s[j]) and np.isnan(s[j]):
                res.append(np.nan)
            else:
                res.append(f(s[i], s[j], *args, **kargs))
        df[s.index[i]] = Series(res, index=s.index[i:])
    df = DataFrame(df)
    df = df.combine_first(df.T)
    return df
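
The closing `df.combine_first(df.T)` is what makes the result symmetric: only one triangle is computed, and the transpose supplies the other half. The trick in isolation:

import numpy as np
import pandas as pd

# only the upper triangle is filled in; the lower triangle is NaN
df = pd.DataFrame([[0.0, 1.0, 2.0],
                   [np.nan, 0.0, 3.0],
                   [np.nan, np.nan, 0.0]],
                  index=list('abc'), columns=list('abc'))
sym = df.combine_first(df.T)  # NaNs below the diagonal come from the transpose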
Example #37
    def interpolate(self, X, force_interpolation=True, **kwargs):
        # force_interpolation: if False and ALL interpolant variables are
        # already present in X, do not actually create a new interpolation
        do_interpolation = force_interpolation
        for key in self.y_keys:
            # if a y_key is not present, force the interpolation
            if key not in X:
                do_interpolation = True

        if do_interpolation:
            interpolated = {}
            # for key in X.keys():#self.x_keys:
            #     interpolated[key] = X[key]
            for key in self.y_keys:
                interpolated[key] = self.knn[key].predict(X[self.x_keys])
            interpolated = DataFrame(interpolated, index=X.index)
            # want to overwrite any preexisting x
            X_return = interpolated.combine_first(X)
        else:
            # do nothing
            X_return = X
        return X_return
Example #38
    def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
        if (self._args is not None and len(self._args) > 2) or \
           (len(self._args) != 0 and not isinstance(self._args[0], QueryFunction)):
            raise ValueError('Invalid argument to absolute value function')

        # get the data
        data = input if len(self._args) == 0 else self._args[0].process(start_time, end_time, input)

        ret = None

        # go through each column, take the absolute value, and apply it to the rows
        for col in data.columns:
            abs_col = data[col].abs()  # get the absolute value for each value in the column
            abs_col.name = 'abs ' + col  # update the name

            if ret is None:
                ret = DataFrame(abs_col)
            else:
                ret = ret.combine_first(DataFrame(abs_col))  # add it to our return value

        print(ret.head())

        return ret
Example #39
    def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
Example #40
File: io.py Project: ALGe9/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    from pandas import concat
    fileList = find_datafiles(
        path, search=search, files=files, exclude=exclude)
    if not fileList:
        print("no files found")
        return
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine_first for all dfs at an existing Loc or
                    # if no Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                        #df_tmp = concat([df_tmp, x], axis=1)
                        pass
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    print(x)
                    print(e)
            field_names = ([field_names] if not type(field_names) == list else field_names)
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time','Loc','Id'])
    p_bar.done()
    return origins, df
Example #41
    def test_combine_first_mixed_bug(self):
        idx = Index(['a', 'b', 'c', 'e'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame1 = DataFrame({"col0": ser1,
                            "col2": ser2,
                            "col3": ser3})

        idx = Index(['a', 'b', 'c', 'f'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame2 = DataFrame({"col1": ser1,
                            "col2": ser2,
                            "col5": ser3})

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

        # gh 3016 (same as in update)
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        result = df.combine_first(other)
        assert_frame_equal(result, df)

        df.loc[0, 'A'] = np.nan
        result = df.combine_first(other)
        df.loc[0, 'A'] = 45
        assert_frame_equal(result, df)

        # doc example
        df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                         'B': [np.nan, 2., 3., np.nan, 6.]})

        df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                         'B': [np.nan, np.nan, 3., 4., 6., 8.]})

        result = df1.combine_first(df2)
        expected = DataFrame(
            {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
        assert_frame_equal(result, expected)

        # GH3552, return object dtype with bools
        df1 = DataFrame(
            [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
        df2 = DataFrame(
            [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])

        result = df1.combine_first(df2)[2]
        expected = Series([True, True, False], name=2)
        assert_series_equal(result, expected)

        # GH 3593, converting datetime64[ns] incorrectly
        df0 = DataFrame({"a": [datetime(2000, 1, 1),
                               datetime(2000, 1, 2),
                               datetime(2000, 1, 3)]})
        df1 = DataFrame({"a": [None, None, None]})
        df2 = df1.combine_first(df0)
        assert_frame_equal(df2, df0)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)

        df0 = DataFrame({"a": [datetime(2000, 1, 1),
                               datetime(2000, 1, 2),
                               datetime(2000, 1, 3)]})
        df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
        df2 = df1.combine_first(df0)
        result = df0.copy()
        result.iloc[0, :] = df1.iloc[0, :]
        assert_frame_equal(df2, result)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)
Example #42
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Merging overlapping data

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
print(a)
print(b)
print(pd.isnull(a))
# where True, keep b's element; where False, substitute the corresponding element from a
print(np.where(pd.isnull(a), b, a))

# Series has a corresponding combine_first
print(b[:-2].combine_first(a[2:]))

df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2, np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
print(df1)
print(df2)
print(df1.combine_first(df2))
Example #43
File: io.py Project: greole/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=slice(0,None),
        exclude=None,
        times_slice=None,
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()

        # for fn in files:
        #     #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
        #     ret = read_data_file(fn, skiplines, maxlines)
        #     p_bar.next()

        args = [(fn, skiplines, maxlines, p_bar) for fn in files]
        if MULTIPROCESS:
            with multiprocessing.Pool(processes=MULTIPROCESS) as pool:
                rets = pool.map(read_data_file_args, args)
        else:
            rets = map(read_data_file_args, args)


        for fn, ret in zip(files, rets):
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    print("failed to concat: ",
                            df_tmp, "and", x, "new_loc ",
                            x.index.levels[0][0], " existing_locs ",
                            df_tmp.index.levels[0] )
                    print(e)
            field_names = ([field_names] if not type(field_names) == list else field_names)
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    if not "Loc" in  df.index.names:
        print(df)
        # df = df.reorder_levels(['Time', ])
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
Example #44
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
	index=['f','e','d','c','b','a'])
b = Series(np.arange(len(a), dtype=np.float64),
	index=['f','e','d','c','b','a'])
b[-1] = np.nan
# np's version of an if-else: positions where a is null are filled from b
np.where(pd.isnull(a), b, a)
# the analogous pandas method: b's nulls are filled from a
b[:-2].combine_first(a[2:])
# usage with a DataFrame
df1 = DataFrame({'a':[1.,np.nan,5.,np.nan],
	'b':[np.nan,2.,np.nan, 6.],
	'c':range(2,18,4)})
df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.],
	'b':[np.nan,3.,4,6.,8.]})
df1.combine_first(df2)
## Removing duplicate data
data = DataFrame({'k1':['one']*3+['two']*4,
	'k2':[1,1,2,3,3,4,4]})
data.duplicated()
# drop duplicate rows, keeping the first by default
data.drop_duplicates()
# drop duplicates based on a single column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# keep the last occurrence
data.drop_duplicates(['k1','k2'], keep='last')

## Transforming data with a function or mapping
data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami',
	'corned beef','Bacon','pastrami','honey ham','nova lox'],
Example #45
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
print(pd.concat([df1, df2], ignore_index=True))

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a)),
           dtype=np.float64,
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print(a)
print(b)
print(np.where(pd.isnull(a), b, a))
print(b[:-2].combine_first(a[2:]))

df1 = DataFrame({'a': [1, np.nan, 5, np.nan],
                 'b': [np.nan, 2, np.nan, 6],
                 'c': range(2, 18, 4)
                 })

df2 = DataFrame({'a': [5, 4, np.nan, 3, 7],
                 'b': [np.nan, 3, 4, 6, 8]})

print(df1.combine_first(df2))