def test_combine_first_period(self): data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7]) data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") df2 = DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.PeriodIndex( ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M") exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == data1.dtype # different freq dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") df2 = DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = [ pd.Period("2011-01", freq="M"), pd.Period("2012-01-01", freq="D"), pd.NaT, pd.Period("2012-01-02", freq="D"), pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M"), ] exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == "object"
def slide_7():
    """Print a walkthrough of patching missing values with ``np.where`` and ``combine_first``.

    Builds two label-aligned Series and two DataFrames, prints them, and
    prints the result of filling the holes of one with values of the other.
    Returns nothing; all output goes to stdout.
    """
    labels = ['f', 'e', 'd', 'c', 'b', 'a']
    a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
    b = Series(np.arange(len(a), dtype=np.float64), index=labels)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('***a***')
    print(a)
    print('***b***')
    print(b)

    # Positional write: plain ``b[-1]`` on a string-labelled Series depends on
    # a deprecated positional fallback, so be explicit with .iloc.
    b.iloc[-1] = np.nan
    print('***a***')
    print(a)
    print('***b***')
    print(b)
    # Element-wise "take b where a is null, otherwise a".
    print(np.where(pd.isnull(a), b, a))

    print('#####combine_first#####')
    print('***b[:-2]***')
    print(b[:-2])
    print('***a[2:]***')
    print(a[2:])
    print('b[:-2].combine_first(a[2:])')
    print(b[:-2].combine_first(a[2:]))

    df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                     'b': [np.nan, 2., np.nan, 6.],
                     'c': range(2, 18, 4)})
    df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                     'b': [np.nan, 3., 4., 6., 8.]})
    print('***df1***')
    print(df1)
    print('***df2***')
    print(df2)
    print(df1.combine_first(df2))
def test_combine_first_int(self): # GH14687 - integer series that do no align exactly df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = DataFrame({"a": [1, 4]}, dtype="int64") result_12 = df1.combine_first(df2) expected_12 = DataFrame({"a": [0, 1, 3, 5]}) tm.assert_frame_equal(result_12, expected_12) result_21 = df2.combine_first(df1) expected_21 = DataFrame({"a": [1, 4, 3, 5]}) tm.assert_frame_equal(result_21, expected_21)
def test_combine_first_return_obj_type_with_bools(self): # GH3552 df1 = DataFrame([[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) expected = Series([True, True, False], name=2, dtype=bool) result_12 = df1.combine_first(df2)[2] tm.assert_series_equal(result_12, expected) result_21 = df2.combine_first(df1)[2] tm.assert_series_equal(result_21, expected)
def test_combine_first_same_as_in_update(self): # gh 3016 (same as in update) df = DataFrame( [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], columns=["A", "B", "bool1", "bool2"], ) other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) result = df.combine_first(other) tm.assert_frame_equal(result, df) df.loc[0, "A"] = np.nan result = df.combine_first(other) df.loc[0, "A"] = 45 tm.assert_frame_equal(result, df)
def update(self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None,
           barsize: str=None, tz: str=None, standardize_index=True):
    """
    Input data is combined with self.df.
    Overlapped data will be overwritten by non-null values of input data.
    Indexes and Columns will be unioned.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Data to merge in. An empty frame is a no-op.
    symbol, datatype, barsize, tz : str, optional
        Forwarded to ``self._standardize_index`` when ``standardize_index``
        is true; otherwise unused.
    standardize_index : bool
        Whether to pass a copy of ``df_in`` through
        ``self._standardize_index`` before combining.

    Returns
    -------
    self, on every path, so calls can be chained.

    Raises
    ------
    TypeError
        If ``df_in`` is not a pandas.DataFrame.
    """
    # Check input data type
    if not isinstance(df_in, pd.DataFrame):
        raise TypeError('Input data must be a pandas.DataFrame.')

    # Check empty data
    if df_in.empty:
        return self

    # Standardize index
    if standardize_index:
        df_in = self._standardize_index(
            df_in.copy(), symbol=symbol, datatype=datatype,
            barsize=barsize, tz=tz)

    # Combine input DataFrame with internal self.df
    if self.df.empty:
        # Initialize self.df
        self.df = df_in.sort_index()
    else:
        df_in = df_in.tz_convert(self.tzinfo, level=self.__class__.dtlevel)
        self.df = df_in.combine_first(self.df).sort_index()

    # Post-combination processing
    # Fill NaN, and enforce barcount and volume columns dtype to int64
    self.df = self.df.fillna(-1)
    for col in self.df.columns:
        # str(col): column labels are not guaranteed to be strings.
        if str(col).lower() in ('barcount', 'volume'):
            self.df[col] = self.df[col].astype(np.int64)

    # The empty-input path already returned self; do the same here.
    return self
def test_combine_first(self):
    """Exercise combine_first on disjoint, identical, overlapping and empty operands."""
    # disjoint
    head, tail = self.frame[:5], self.frame[5:]

    combined = head.combine_first(tail)
    reordered_frame = self.frame.reindex(combined.index)
    assert_frame_equal(combined, reordered_frame)
    assert tm.equalContents(combined.columns, self.frame.columns)
    assert_series_equal(combined['A'], reordered_frame['A'])

    # same index
    fcopy = self.frame.copy()
    fcopy['A'] = 1
    del fcopy['C']

    fcopy2 = self.frame.copy()
    fcopy2['B'] = 0
    del fcopy2['D']

    combined = fcopy.combine_first(fcopy2)

    assert (combined['A'] == 1).all()
    assert_series_equal(combined['B'], fcopy['B'])
    assert_series_equal(combined['C'], fcopy2['C'])
    assert_series_equal(combined['D'], fcopy['D'])

    # overlap
    head, tail = reordered_frame[:10].copy(), reordered_frame
    head['A'] = 1

    combined = head.combine_first(tail)
    assert (combined['A'][:10] == 1).all()

    # reverse overlap
    # Write through one .iloc call. The chained form ``tail['A'][:10] = 0``
    # assigns into an intermediate object and is not guaranteed to reach
    # ``tail``.
    tail.iloc[:10, tail.columns.get_loc('A')] = 0
    combined = tail.combine_first(head)
    assert (combined['A'][:10] == 0).all()

    # no overlap
    f = self.frame[:10]
    g = self.frame[10:]
    combined = f.combine_first(g)
    assert_series_equal(combined['A'].reindex(f.index), f['A'])
    assert_series_equal(combined['A'].reindex(g.index), g['A'])

    # corner cases
    comb = self.frame.combine_first(self.empty)
    assert_frame_equal(comb, self.frame)

    comb = self.empty.combine_first(self.frame)
    assert_frame_equal(comb, self.frame)

    comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
    assert "faz" in comb.index

    # #2525
    df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
    df2 = DataFrame({}, columns=['b'])
    result = df.combine_first(df2)
    assert 'b' in result
def update_df(df: pd.DataFrame, new_df: pd.DataFrame, on: (str, list) = None, mode='update'):
    """
    Update the data of a dataframe, matching rows on key column(s) or on the index.

    :param df: the dataframe to be updated
    :param new_df: the dataframe holding the new values
    :param on: key column(s) to match on; defaults to None, which uses the index
    :param mode: 'update' overwrites the matching positions directly;
                 'insert' only fills positions that are empty
    :return: the updated dataframe, with its index reset
    """
    rows_before = len(df)
    keyed = on is not None

    if keyed:
        on = ensure_list(on)
        new_df = new_df.drop_duplicates()
        # After dropping exact duplicates, a repeated key means the same key
        # maps to different values.
        if new_df[on].duplicated().any():
            raise ValueError('new_df中有重复的索引列对应不同的值,请检查')
        # Keep only the new rows whose key already exists in df.
        known_keys = df[on].drop_duplicates()
        new_df = known_keys.merge(new_df, how='inner', on=on)
        df = df.set_index(on, drop=False)
        new_df = new_df.set_index(on, drop=False)

    if mode == 'insert':
        df = df.combine_first(new_df)
    elif mode == 'update':
        df.update(new_df)
    else:
        raise ValueError(f'参数{mode}错误,可选参数为 update or insert')

    df = df.reset_index(drop=True)
    # With key columns the row count must not change.
    if keyed and rows_before != len(df):
        raise ValueError('update后Dataframe结构发生变化,请检查')
    return df
def test_combine_first_align_nan(self): # GH 7509 (not fixed) dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) dfb = DataFrame([[4], [5]], columns=["b"]) assert dfa["a"].dtype == "datetime64[ns]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) exp = DataFrame( { "a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5] }, columns=["a", "b"], ) tm.assert_frame_equal(res, exp) assert res["a"].dtype == "datetime64[ns]" # TODO: this must be int64 assert res["b"].dtype == "int64" res = dfa.iloc[:0].combine_first(dfb) exp = DataFrame({ "a": [np.nan, np.nan], "b": [4, 5] }, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # TODO: this must be datetime64 assert res["a"].dtype == "float64" # TODO: this must be int64 assert res["b"].dtype == "int64"
def test_combine_first(self, float_frame):
    """Exercise combine_first on disjoint, identical, overlapping and empty operands."""
    # disjoint
    first_rows = float_frame[:5]
    remaining_rows = float_frame[5:]

    combined = first_rows.combine_first(remaining_rows)
    reordered_frame = float_frame.reindex(combined.index)
    tm.assert_frame_equal(combined, reordered_frame)
    assert tm.equalContents(combined.columns, float_frame.columns)
    tm.assert_series_equal(combined["A"], reordered_frame["A"])

    # same index
    without_c = float_frame.copy()
    without_c["A"] = 1
    del without_c["C"]

    without_d = float_frame.copy()
    without_d["B"] = 0
    del without_d["D"]

    combined = without_c.combine_first(without_d)

    assert (combined["A"] == 1).all()
    for column, source in (("B", without_c), ("C", without_d), ("D", without_c)):
        tm.assert_series_equal(combined[column], source[column])

    # overlap
    head = reordered_frame[:10].copy()
    tail = reordered_frame
    head["A"] = 1

    combined = head.combine_first(tail)
    assert (combined["A"][:10] == 1).all()

    # reverse overlap
    a_position = tail.columns.get_loc("A")
    tail.iloc[:10, a_position] = 0
    combined = tail.combine_first(head)
    assert (combined["A"][:10] == 0).all()

    # no overlap
    f = float_frame[:10]
    g = float_frame[10:]
    combined = f.combine_first(g)
    for part in (f, g):
        tm.assert_series_equal(combined["A"].reindex(part.index), part["A"])

    # corner cases
    comb = float_frame.combine_first(DataFrame())
    tm.assert_frame_equal(comb, float_frame)

    comb = DataFrame().combine_first(float_frame)
    tm.assert_frame_equal(comb, float_frame)

    comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
    assert "faz" in comb.index

    # #2525
    dated = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
    only_columns = DataFrame(columns=["b"])
    result = dated.combine_first(only_columns)
    assert "b" in result
def test_combine_first_with_nan_multiindex():
    # gh-36562
    level_names = ["a", "b"]

    caller_index = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]],
        names=level_names,
    )
    caller = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=caller_index)

    other_index = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]],
        names=level_names,
    )
    other = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=other_index)})

    result = caller.combine_first(other)

    expected_index = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    expected_c = [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1]
    expected_d = [
        1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan
    ]
    expected = DataFrame({"c": expected_c, "d": expected_d}, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_combine_first_int64_not_cast_to_float64():
    # GH 28613
    own_columns = {"A": [1, 2, 3], "B": [4, 5, 6]}
    extra_column = {"C": [12, 34, 65]}

    caller = DataFrame(own_columns)
    other = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], **extra_column})

    # The caller has no holes, so only column "C" is taken from ``other``.
    result = caller.combine_first(other)
    expected = DataFrame({**own_columns, **extra_column})
    tm.assert_frame_equal(result, expected)
def test_combine_first_convert_datatime_correctly(self, data1, data2, data_expected): # GH 3593 df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2}) result = df1.combine_first(df2) expected = DataFrame({"a": data_expected}) tm.assert_frame_equal(result, expected)
def test_combine_first_int(self): # GH14687 - integer series that do no align exactly df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = DataFrame({"a": [1, 4]}, dtype="int64") res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) assert res["a"].dtype == "int64"
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): # GH28481 na_value = nulls_fixture frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) result = frame.combine_first(other) expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected)
def add_df(self, df:pd.DataFrame):
    """Merge new data into the stored data, aligned on the index.

    Where an index entry exists on both sides, the non-null values of
    ``df`` win over the stored ones.
    """
    # NOTE(review): the stored frame is read from ``self._df`` but the result
    # is assigned to ``self.df`` -- presumably ``df`` is a property backed by
    # ``_df``; confirm against the class definition.
    self.df = df.combine_first(self._df)
def table_OD(list_coordsO, list_idsO, list_coordsD, list_idsD,
             OSRM_max_table=100, host='http://localhost:5000'):
    """
    Wrap the OSRM 'table' function to get a matrix of time distances
    between different origins and destinations (N:M).

    Params :
        list_coordsO: list
            Coordinates of the origins as [x, y] pairs, e.g.
            [[21.3224, 45.2358], [21.3856, 42.0094], [20.9574, 41.5286]]
            (coords have to be float)
        list_idsO: list
            The unique id matching each origin, e.g.
            ['name1', 'name2', 'name3'] (id can be str, int or float)
        list_coordsD: list
            Coordinates of the destinations (same kind as the origins)
        list_idsD: list
            The unique id matching each destination (same kind as the origins)
        OSRM_max_table: int, default=100
            The --max-table-size defined when launching osrm-routed
            (default is 100). Larger requests are split into several
            'table' requests and the matrix is reassembled.
        host: str, default 'http://localhost:5000'
            Url and port of the OSRM instance (no trailing slash)

    Output:
        A labeled DataFrame containing the time matrix in minutes
        (or NaN when OSRM encounters an error computing a route).
        -1 or an empty DataFrame is returned in case of any other error
        (wrong list of coords/ids, unknown host, wrong response from the
        host, etc.)
    """
    same_points = list_coordsO == list_coordsD and list_idsO == list_idsD
    if same_points:
        list_coords, list_ids = list_coordsO, list_idsO
    else:
        list_coords = list_coordsO + list_coordsD
        list_ids = list_idsO + list_idsD

    if len(list_coords) <= OSRM_max_table:
        # Small enough for a single request.
        df = table(list_coords, list_ids, host=host)
    else:
        # Split into groups of half the table size and fill an empty
        # ids-by-ids matrix with one partial table at a time.
        group_size = OSRM_max_table // 2
        gpd_coords = list(chunk(list_coords, group_size))
        gpd_ids = list(chunk(list_ids, group_size))
        df = DataFrame(index=list_ids, columns=list_ids, dtype=float)
        pairs = zip(mat_range2d(gpd_coords), mat_range2d(gpd_ids))
        for lcoord, lid in pairs:
            partial = table(list(lcoord), list(lid), host=host)
            df = df.combine_first(partial)

    try:
        # Origins as columns, destinations as rows.
        return df[list_idsO].filter(list_idsD, axis=0)
    except Exception as err:
        print(err)
        return -1
def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 df1 = DataFrame({"isNum": [val]}) df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) exp = DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp)
def combine(self, devices = None, readings = None):
    """
    Combines devices from a test into a new dataframe. Every column is
    renamed as follows:
        READING-NAME_DEVICE-ID
    Parameters
    ----------
        devices: list or None
            None
            If None, includes all the devices in self.devices
        readings: list or None
            None
            If None, includes all the readings of each device
    Returns
    -------
        Dataframe if successful or False otherwise
    """

    dfc = DataFrame()

    if devices is None:
        dl = list(self.devices.keys())
    else:
        # Only pick the ones that are actually present. dict.fromkeys
        # de-duplicates like a set would, but keeps the caller's order.
        dl = [device for device in dict.fromkeys(devices)
              if device in self.devices]
        if len(dl) != len(devices):
            std_out('Requested devices are not all present in devices', 'WARNING')
            std_out(f'Discarding {set(devices).difference(list(self.devices.keys()))}')

    for device in dl:
        device_readings = self.devices[device].readings

        if readings is None:
            rl = list(device_readings.columns)
        else:
            # Only pick the ones that are actually present, in request order
            rl = [reading for reading in dict.fromkeys(readings)
                  if reading in device_readings.columns]
            if any([reading not in rl for reading in readings]):
                std_out(f'Requested readings are not all present in readings for device {device}', 'WARNING')
                std_out(f'Discarding {list(set(readings).difference(list(self.devices[device].readings.columns)))}', 'WARNING')

        # Suffix each reading with the device id.
        rename = {reading: reading + '_' + self.devices[device].id
                  for reading in rl}

        df = device_readings[rl].copy()
        df.rename(columns = rename, inplace = True)
        dfc = dfc.combine_first(df)

    if dfc.empty:
        std_out('Error ocurred while combining data. Review data', 'ERROR')
        return False
    else:
        std_out('Data combined successfully', 'SUCCESS')
        return dfc
def cal_SMB_HML(ret, size, BM, percentile1=None, percentile2=None, independent=True, exclude_30_small_size=False):
    """Sort on ``size`` and ``BM`` row by row and return two spread series.

    NOTE(review): the names suggest Fama-French style SMB/HML factors built
    from returns, market size and book-to-market -- confirm with the caller.

    Parameters
    ----------
    ret, size, BM : DataFrame-like inputs, aligned through ``IndexAlign``.
        Presumably rows are dates and columns are stocks -- TODO confirm.
    percentile1 : quantile edges used to bucket ``size``;
        defaults to ``[0.0, 0.5, 1.0]``.
    percentile2 : quantile edges used to bucket ``BM``; only defaulted
        (to ``[0.0, 0.3, 0.7, 1.0]``) when ``percentile1`` is None.
    independent : if true, ``BM`` is bucketed over the whole row; otherwise
        ``BM`` is bucketed separately inside each ``size`` bucket.
    exclude_30_small_size : if true, ``size`` is first passed through
        ``ClipQuantile`` with edges ``[0.0, 0.3, 1.0]``.

    Returns
    -------
    tuple of (last column minus first column of ``SMB``,
              last column minus first column of ``HML``).
    """
    if exclude_30_small_size:
        size = ClipQuantile(size, [0.0, 0.3, 1.0], [-1.0, 1.0])
    ret, size, BM = IndexAlign(ret, size, BM)
    valid_ = ~pd.isnull(
        BM + ret + size
    )  # "TypeError: bad operand type for unary ~: 'float'" here means the index or columns do not match
    # Keep only the cells where all three inputs are present.
    size = size[valid_]
    BM = BM[valid_]
    ret = ret[valid_]
    if percentile1 is None:
        percentile1 = [0.0, 0.5, 1.0]  # size
        percentile2 = [0.0, 0.3, 0.7, 1.0]  # value
    # Bucket labels 1..k for k quantile intervals.
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]
    if independent:
        #mark_1 = pd.DataFrame([pd.qcut(size.iloc[i], q=percentile1, labels=label_1) for i in size.index[:-1]],
        #                      index=size.index[:-1]) # raises an error
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])
        mark_2 = DataFrame([
            qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index
        ])  # indi has already been shift(1)-ed, i.e. its time index matches the holding period of the portfolio
    else:
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])  # indi has already been shift(1)-ed, i.e. its time index matches the holding period of the portfolio
        mark_2 = DataFrame(index=mark_1.index, columns=mark_1.columns)
        # Bucket BM inside each size bucket and merge the partial labels.
        for l_ in label_1:
            tmp = DataFrame([
                qcut(BM.loc[i][mark_1.iloc[i] == l_],
                     q=percentile2,
                     labels=label_2) for i in BM.index
            ])
            mark_2 = mark_2.combine_first(tmp)
    #valid_ = ~(pd.isnull(mark_1 + mark_2) | pd.isnull(ret.iloc[1:]))  # a valid stock must have the previous month's indicator for the current period and be tradable in the current period
    # Long format: one row per (row label, column label) cell.
    df = DataFrame()
    df['rtn'] = ret.stack()
    df['ref1'] = mark_1.stack()
    df['ref2'] = mark_2.stack()
    # Mean of 'rtn' per (ref1, ref2) group within each level-0 group.
    tmp = df.groupby(level=0).apply(
        lambda g: g.groupby(['ref1', 'ref2']).mean()).unstack()['rtn']
    #tmp.columns = tmp.columns.get_level_values(1)
    tmp.index.names = ('trddt', 'ref1')
    HML = tmp.mean(axis=0, level=0)
    SMB = tmp.mean(axis=1).unstack()
    return SMB.iloc[:, -1] - SMB.iloc[:, 0], HML.iloc[:, -1] - HML.iloc[:, 0]
def save_csv(path_csv, df: pd.DataFrame) -> None:
    """Write ``df`` to ``path_csv``, merging it with the CSV already there, if any.

    Values of ``df`` take precedence; cells it does not have are taken
    from the existing file.
    """
    if os.path.exists(path_csv):
        previous = pd.read_csv(path_csv)
        # Assumes first col is index col
        previous = previous.set_index(previous.columns[0])
        merged = df.combine_first(previous)
        df = merged.sort_index(axis=1)
    write_csv = retry_if_oserror(df.to_csv)
    write_csv(path_csv)
    print('Saved:', path_csv)
def prepare(self, measurand, inputs, options=None):
    """
    Prepares a test for a regression model
    Parameters
    ----------
        measurand: dict
            measurand = {'8019043': ['NO2']}
        inputs: dict
            inputs per device and reading
            inputs = {'devicename': ['reading-1', 'reading-2']}
        options: dict or None
            Options including data processing. Merged with the defaults in
            config._model_def_opt. None (the default) means no overrides.
    Returns
    -------
        df = pandas Dataframe
        measurand_name = string
    """

    # A fresh dict per call: a ``dict()`` default in the signature would be
    # one shared object across all calls.
    options = dict_fmerge({} if options is None else options,
                          config._model_def_opt)

    # Measurand
    measurand_device = list(measurand.keys())[0]
    measurand_metric = measurand[measurand_device][0]
    measurand_name = measurand[measurand_device][0] + '_' + measurand_device

    df = DataFrame()
    df[measurand_name] = self.devices[measurand_device].readings[
        measurand_metric]

    # Add the requested readings of every input device.
    for input_device in inputs.keys():
        combined_df = self.combine(devices=[input_device],
                                   readings=inputs[input_device])
        df = df.combine_first(combined_df)

    if options['common_avg']:
        # Channels requested for every input device.
        common_channels = inputs[list(inputs.keys())[0]]
        for input_device in inputs.keys():
            common_channels = list(
                set(common_channels).intersection(set(inputs[input_device])))
        std_out(f'Performing avg in common columns {common_channels}')

        for channel in common_channels:
            columns_list = [
                channel + '_' + device for device in list(inputs.keys())
            ]
            df[channel + '_AVG'] = df[columns_list].mean(axis=1)

        # Keep only the averaged channels and the measurand.
        df = df.loc[:,
                    df.columns.str.contains("_AVG")
                    | df.columns.str.contains(measurand_name)]

    if options['clean_na'] is not None:
        df = clean(df, options['clean_na'], how='any')

    return df, measurand_name
def merge_update(df_left: pd.DataFrame, df_right: pd.DataFrame, on=None, left_on=None, right_on=None, left_index=False, right_index=False, prefer='right', adjust_dtypes=True):
    """
    Merge `df_right` with `df_left` as an update:

    - columns present on only one side are carried over to the result
    - for common columns a `combine_first` is performed, so NaN values are
      replaced by non-NaN values from the other side where possible

    `prefer` selects whose values win when both sides are non-NaN: with
    'left' the right values are ignored wherever the left ones are not NaN;
    with any other value the right side wins.

    Rows are matched on the index when none of `on`, `left_on`, `right_on`
    is given. Otherwise each side is keyed by its id column(s), unless the
    matching `*_index` flag says its index already is the key, and the key
    is turned back into columns in the result.

    With `adjust_dtypes` the result is passed through `infer_objects()`.
    """
    keyed = not (on is None and left_on is None and right_on is None)

    if keyed:
        # use provided id columns
        left = df_left if left_index else df_left.set_index(left_on or on)
        right = df_right if right_index else df_right.set_index(right_on or on)
    else:
        # based on index values
        left, right = df_left, df_right

    if prefer == 'left':
        primary, fallback = left, right
    else:
        primary, fallback = right, left

    m = primary.combine_first(fallback)
    if keyed:
        m = m.reset_index()

    if adjust_dtypes:
        m = m.infer_objects()
    return m
def test_combine_first_mixed(self): a = Series(['a', 'b'], index=lrange(2)) b = Series(lrange(2), index=lrange(2)) f = DataFrame({'A': a, 'B': b}) a = Series(['a', 'b'], index=lrange(5, 7)) b = Series(lrange(2), index=lrange(5, 7)) g = DataFrame({'A': a, 'B': b}) # TODO(wesm): no verification? combined = f.combine_first(g) # noqa
def test_combine_first_string_dtype_only_na(self): # GH: 37519 df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") df.set_index(["a", "b"], inplace=True) df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected)
def test_combine_first_timedelta(self): data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7]) data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) df2 = DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.TimedeltaIndex( ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]) exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["TD"].dtype == "timedelta64[ns]"
def get_feature_df(df: pd.DataFrame) -> pd.DataFrame:
    """Derive text features from ``df['Text']`` and return them merged into the frame.

    ``df`` is modified in place: it gains a 'feature_list' column and one
    column per feature. The entries of 'n_pos' are additionally expanded
    into columns of their own (missing entries become 0) and merged into
    the returned frame.
    """
    feature_columns = ['lemmas', 'n_stopwords', 'n_punct', 'n_pos', 'n_urls', 'n_tokens']

    LOG.info("Cleaning up the text...")
    df['feature_list'] = df['Text'].apply(clean_text)

    LOG.info("Extracted new features...")
    features = pd.DataFrame(df['feature_list'].tolist(), index=df.index)
    df[feature_columns] = features

    pos_counts = pd.DataFrame(df['n_pos'].to_list(), index=df.index)
    POS_df = pos_counts.fillna(0)
    LOG.debug("Got POS_df")

    final_df = df.combine_first(POS_df)
    LOG.debug("Got final_df")
    return final_df
def test_combine_first_mixed(self): a = Series(["a", "b"], index=range(2)) b = Series(range(2), index=range(2)) f = DataFrame({"A": a, "B": b}) a = Series(["a", "b"], index=range(5, 7)) b = Series(range(2), index=range(5, 7)) g = DataFrame({"A": a, "B": b}) exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6]) combined = f.combine_first(g) tm.assert_frame_equal(combined, exp)
def test_combine_first_mixed(self):
    """Object/integer frames with disjoint indexes are stacked; "B" is expected as float."""

    def make_frame(index):
        letters = Series(['a', 'b'], index=index)
        numbers = Series(lrange(2), index=index)
        return DataFrame({'A': letters, 'B': numbers})

    f = make_frame(lrange(2))
    g = make_frame(lrange(5, 7))

    exp = pd.DataFrame(
        {'A': list('abab'), 'B': [0., 1., 0., 1.]},
        index=[0, 1, 5, 6],
    )
    combined = f.combine_first(g)
    tm.assert_frame_equal(combined, exp)
def test_combine_first_timestamp_bug_NaT():
    # GH28481
    first_day = datetime(2020, 1, 1)
    second_day = datetime(2020, 1, 2)

    all_nat = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    filler = DataFrame([[first_day, second_day]], columns=["b", "c"])

    result = all_nat.combine_first(filler)

    # "a" stays NaT, "b" is filled, "c" is added from ``filler``.
    expected = DataFrame(
        [[pd.NaT, first_day, second_day]],
        columns=["a", "b", "c"],
    )
    tm.assert_frame_equal(result, expected)
def get_tfidf(df: pd.DataFrame, tfidf_vect: TfidfVectorizer = tfidf_v, fit: bool = True):
    """Append TF-IDF features computed from ``df['lemmas']`` to the frame.

    Rows whose 'lemmas' value is missing are dropped from ``df`` IN PLACE
    before vectorizing.

    :param df: frame with a 'lemmas' column holding the text to vectorize
    :param tfidf_vect: the vectorizer to use
    :param fit: if true, fit the vectorizer on the data and dump it to
        'tfidf_vect.joblib' under OUT_PATH; otherwise only transform with
        the already fitted vectorizer
    :return: ``df`` combined with one column per vectorizer feature
    """
    df.dropna(subset=['lemmas'], inplace=True)
    LOG.debug(f"feature columns: {df.columns}")
    if fit:
        tfidf = tfidf_vect.fit_transform(df['lemmas'])
        joblib.dump(tfidf_vect, path.join(OUT_PATH, 'tfidf_vect.joblib'))
    else:
        tfidf = tfidf_vect.transform(df['lemmas'])

    # get_feature_names() was removed from scikit-learn in favour of
    # get_feature_names_out(); use the new accessor when it exists and fall
    # back to the old one on older releases.
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        feature_names = tfidf_vect.get_feature_names_out()
    else:
        feature_names = tfidf_vect.get_feature_names()

    tfidf_df = pd.DataFrame(tfidf.toarray(), columns=feature_names, index=df.index)
    final_df = df.combine_first(tfidf_df)
    return final_df
def test_combine_first_doc_example(self): # doc example df1 = DataFrame( {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} ) df2 = DataFrame( { "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], } ) result = df1.combine_first(df2) expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) tm.assert_frame_equal(result, expected)
def cal_idxrel_sym(f, s, cal_self=False, self_val=0, *args, **kargs):
    """Build the symmetric matrix of pairwise relations between the elements of ``s``.

    ``f(s_i, s_j, *args, **kargs)`` is evaluated once per unordered pair and
    mirrored, so ``f`` is assumed to be symmetric.

    Parameters
    ----------
    f : callable taking two elements of ``s`` plus ``*args``/``**kargs``.
    s : Series; its index labels both axes of the result.
    cal_self : if true the diagonal is ``f(s_i, s_i, *args, **kargs)``,
        otherwise it is ``self_val``.
    self_val : diagonal value used when ``cal_self`` is false.

    Returns
    -------
    DataFrame indexed and labelled by ``s.index``. Rows and columns of
    elements that are scalar NaN are NaN throughout.
    """
    def _is_missing(value):
        return np.isscalar(value) and np.isnan(value)

    labels = s.index
    # Positional access: plain ``s[i]`` is label-based on integer indexes
    # and only a deprecated fallback on other index types.
    values = [s.iloc[pos] for pos in range(len(s))]

    columns = {}
    for i, left in enumerate(values):
        if _is_missing(left):
            columns[labels[i]] = Series(np.nan, index=labels)
            continue
        # The diagonal goes through the same call as the other pairs, so it
        # receives the extra arguments too.
        res = [f(left, left, *args, **kargs) if cal_self else self_val]
        for right in values[i + 1:]:
            if _is_missing(right):
                res.append(np.nan)
            else:
                res.append(f(left, right, *args, **kargs))
        # Only the part from the diagonal downwards is filled here.
        columns[labels[i]] = Series(res, index=labels[i:])

    df = DataFrame(columns)
    # Fill the other triangle from the transpose.
    df = df.combine_first(df.T)
    return df
def interpolate(self, X, force_interpolation=True, **kwargs):
    """Predict every ``y_key`` from ``X[self.x_keys]`` and merge the result into ``X``.

    force_interpolation: if false and ALL interpolant variables are already
    present in X, no new interpolation is created and ``X`` itself is
    returned. A missing y_key always forces the interpolation.

    Predicted values overwrite any pre-existing values of the same name in
    ``X``; all other columns of ``X`` are carried over.
    """
    missing_target = any(key not in X for key in self.y_keys)
    if not (force_interpolation or missing_target):
        # do nothing
        return X

    features = self.x_keys
    predictions = {
        key: self.knn[key].predict(X[features]) for key in self.y_keys
    }
    interpolated = DataFrame(predictions, index=X.index)
    # The interpolated frame is the caller of combine_first, so its values win.
    return interpolated.combine_first(X)
def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
    """Return the absolute value of every column of the data.

    The data is ``input`` when the function has no arguments, otherwise the
    result of processing its first argument. Each output column is named
    ``'abs ' + <original column name>``.

    Raises ValueError when more than two arguments are configured or the
    first argument is not a QueryFunction.
    """
    # A function without arguments may carry ``_args = None``.
    args = self._args if self._args is not None else []

    # NOTE(review): only the first argument is ever used, so ``> 1`` looks
    # like the intended limit; kept at ``> 2`` so existing queries still run.
    if len(args) > 2 or \
            (len(args) != 0 and not isinstance(args[0], QueryFunction)):
        raise ValueError('Invalid argument to absolute value function')

    # get the data
    data = input if len(args) == 0 else args[0].process(start_time, end_time, input)

    ret = None

    # go through each column, take its absolute values, and collect them
    for col in data.columns:
        col_abs = data[col].abs()   # local name chosen so builtin abs() is not shadowed
        col_abs.name = 'abs ' + col     # update the name
        if ret is None:
            ret = DataFrame(col_abs)
        else:
            ret = ret.combine_first(DataFrame(col_abs))  # add it to our return value

    if ret is None:
        # No columns at all: return an empty frame on the same rows.
        ret = DataFrame(index=data.index)

    print(ret.head())
    return ret
def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
    """Apply the arithmetic operator named by ``self.name``.

    With exactly two arguments (A op B) every column of the left operand is
    combined with every column of the right operand; each result column is
    named ``<left column><operator><right column>``. Arguments that are
    QueryFunction instances are processed first, others are used as given.

    With any other number of arguments the columns of ``input`` are summed
    and returned as one column named after the joined column names.

    Raises ValueError when ``self.name`` is not one of '+', '-', '*', '/'.
    """
    import operator

    operations = {
        '+': operator.add,
        '-': operator.sub,
        '*': operator.mul,
        '/': operator.truediv,
    }

    # Exact membership: a substring test against '+-*/' would also accept
    # '' and multi-character names such as '+-'.
    name = str(self.name)
    if name not in operations:
        raise ValueError("Unknown math function: " + name)
    operation = operations[name]

    ret = DataFrame()

    # two args means we're doing A + B
    if len(self._args) == 2:
        left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
        right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

        for l_col in left.columns:
            for r_col in right.columns:
                t = DataFrame(operation(left[l_col], right[r_col]))
                t.columns = [l_col + self.name + r_col]

                print(left.head())
                print(right.head())
                print(t.head())
                ret = ret.combine_first(t)
    else:
        # everything is in the input DataFrame
        ret = DataFrame(input.sum(axis=0))
        ret.columns = [' + '.join(input.columns)]

    return ret
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None
    ):
    """ returns a Dataframe for every file in fileList

    Reads the data files found under ``path`` and merges them into one
    DataFrame, with 'Time' appended as an index level.

    skiptimes is used as the step when slicing the list of found
    (folder, files) entries, i.e. every ``skiptimes``-th entry is read.

    Returns ``(origins, df)``, or None when no files were found.
    """
    #import StringIO
    from pandas import concat
    fileList = find_datafiles(
        path, search=search, files=files, exclude=exclude)
    if not fileList:
        print("no files found")
        return
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine first for all df at existing Loc or
                    # if not Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                        #df_tmp = concat([df_tmp, x], axis=1)
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    # best effort: report the frame that failed and go on
                    print(x)
                    print(e)
            field_names = (field_names if isinstance(field_names, list)
                           else [field_names])
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            # DataFrame.append no longer exists in current pandas; concat of
            # the two frames stacks them in the same way.
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time','Loc','Id'])
    p_bar.done()
    return origins, df
def test_combine_first_mixed_bug(self):
    """Regression checks for combine_first on frames with mixed dtypes."""
    floats = [5.0, -9.0, 4.0, 100.]
    ints = [12, 4, 5, 97]

    left_idx = Index(['a', 'b', 'c', 'e'])
    frame1 = DataFrame({
        "col0": Series(floats, index=left_idx),
        "col2": Series(['a', 'b', 'c', 'e'], index=left_idx),
        "col3": Series(ints, index=left_idx),
    })

    right_idx = Index(['a', 'b', 'c', 'f'])
    frame2 = DataFrame({
        "col1": Series(floats, index=right_idx),
        "col2": Series(['a', 'b', 'c', 'f'], index=right_idx),
        "col5": Series(ints, index=right_idx),
    })

    combined = frame1.combine_first(frame2)
    assert len(combined.columns) == 5

    # gh 3016 (same as in update)
    frame = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                      columns=['A', 'B', 'bool1', 'bool2'])
    filler = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])

    assert_frame_equal(frame.combine_first(filler), frame)

    frame.loc[0, 'A'] = np.nan
    patched = frame.combine_first(filler)
    frame.loc[0, 'A'] = 45
    assert_frame_equal(patched, frame)

    # doc example
    doc_caller = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                            'B': [np.nan, 2., 3., np.nan, 6.]})
    doc_other = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                           'B': [np.nan, np.nan, 3., 4., 6., 8.]})

    result = doc_caller.combine_first(doc_other)
    expected = DataFrame(
        {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
    assert_frame_equal(result, expected)

    # GH3552, return object dtype with bools
    three_rows = DataFrame(
        [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
    two_rows = DataFrame(
        [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])

    result = three_rows.combine_first(two_rows)[2]
    expected = Series([True, True, False], name=2)
    assert_series_equal(result, expected)

    # GH 3593, converting datetime64[ns] incorrectly
    dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]

    df0 = DataFrame({"a": dates})
    df1 = DataFrame({"a": [None, None, None]})
    # An all-missing frame contributes nothing, in either direction.
    assert_frame_equal(df1.combine_first(df0), df0)
    assert_frame_equal(df0.combine_first(df1), df0)

    df0 = DataFrame({"a": dates})
    df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
    df2 = df1.combine_first(df0)
    # Only the first row of df1 holds a value, so only that row differs.
    result = df0.copy()
    result.iloc[0, :] = df1.iloc[0, :]
    assert_frame_equal(df2, result)

    df2 = df0.combine_first(df1)
    assert_frame_equal(df2, df0)
# encoding=utf-8
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Combining overlapping data: print how the holes of one Series/DataFrame
# are filled with the values of another one.
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
# Positional write: plain ``b[-1]`` on a string-labelled Series relies on a
# deprecated positional fallback.
b.iloc[-1] = np.nan
# print(...) with a single argument behaves the same on Python 2 and 3.
print(a)
print(b)
print(pd.isnull(a))
# Where the mask is True keep the element of b, where it is False use the
# matching element of a
print(np.where(pd.isnull(a), b, a))
# Series has the equivalent combine_first
print(b[:-2].combine_first(a[2:]))
df1 = DataFrame({'a': [1., np.nan, 5., np.nan], 'b': [np.nan, 2, np.nan, 6.], 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4. ,np.nan, 3., 7.], 'b': [np.nan, 3., 4., 6., 8.]})
print(df1)
print(df2)
print(df1.combine_first(df2))
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=slice(0,None),
        exclude=None,
        times_slice=None,
    ):
    """ returns a Dataframe for every file in fileList

    Reads the data files found under ``path`` and merges them into one
    DataFrame, with 'Time' appended as an index level.

    skiptimes is used to index the list of found (folder, files) entries,
    so it is expected to be a slice.

    Returns ``(origins, df)``; ``(None, DataFrame())`` when no files were
    found.
    """
    #import StringIO
    # DataFrame.append no longer exists in current pandas; concat is used
    # further down to stack the per-time frames.
    from pandas import concat

    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    origins = Origins()
    els = list(fileList.items())[skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()

        # for fn in files:
        #     #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
        #     ret = read_data_file(fn, skiplines, maxlines)
        #     p_bar.next()

        # Read all files of this time, in a process pool when
        # MULTIPROCESS is set, serially otherwise.
        args = [(fn, skiplines, maxlines, p_bar) for fn in files]
        if MULTIPROCESS:
            with multiprocessing.Pool(processes=MULTIPROCESS) as pool:
                rets = pool.map(read_data_file_args, args)
        else:
            rets = map(read_data_file_args, args)

        for fn, ret in zip(files, rets):
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    # best effort: report both frames and go on
                    print("failed to concat: ",
                          df_tmp, "and", x, "new_loc ",
                          x.index.levels[0][0], " existing_locs ",
                          df_tmp.index.levels[0] )
                    print(e)
            field_names = (field_names if isinstance(field_names, list)
                           else [field_names])
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    if not "Loc" in df.index.names:
        print(df)
        # df = df.reorder_levels(['Time', ])
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f','e','d','c','b','a']) b = Series(np.arange(len(a), dtype=np.float64), index=['f','e','d','c','b','a']) b[-1] = np.nan # np中实现ifelse语句,a中空值位置用b替代 np.where(pd.isnull(a),b,a) # pd中类似函数,b中控制用a替代 b[:-2].combine_first(a[2:]) # DataFrame中使用 df1 = DataFrame({'a':[1.,np.nan,5.,np.nan], 'b':[np.nan,2.,np.nan, 6.], 'c':range(2,18,4)}) df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.], 'b':[np.nan,3.,4,6.,8.]}) df1.combine_first(df2) ## 移除重复数据 data = DataFrame({'k1':['one']*3+['two']*4, 'k2':[1,1,2,3,3,4,4]}) data.duplicated() # 去除重复值,默认留第一个 data.drop_duplicates() # 根据某一列去除重复值 data['v1'] = range(7) data.drop_duplicates(['k1']) # 保留最后一个 data.drop_duplicates(['k1','k2'], take_last=True) ## 利用函数或映射进行数据转换 data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami', 'corned beef','Bacon','pastrami','honey ham','nova lox'],
# Stacking frames with concat: the columns are aligned by name.
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
print(pd.concat([df1, df2], ignore_index=True))

# Filling the holes of one Series with the values of another one.
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a)), dtype=np.float64, index=['f', 'e', 'd', 'c', 'b', 'a'])
# Positional write: plain ``b[-1]`` on a string-labelled Series relies on a
# deprecated positional fallback.
b.iloc[-1] = np.nan
print(a)
print(b)
# Element-wise "take b where a is null, otherwise a".
print(np.where(pd.isnull(a), b, a))
print(b[:-2].combine_first(a[2:]))

# The same for DataFrames: holes of df1 are filled from df2.
df1 = DataFrame({'a': [1, np.nan, 5, np.nan],
                 'b': [np.nan, 2, np.nan, 6],
                 'c': range(2, 18, 4)
                 })
df2 = DataFrame({'a': [5, 4, np.nan, 3, 7],
                 'b': [np.nan, 3, 4, 6, 8]
                 })
print(df1.combine_first(df2))