def test_interp_alt_scipy(self):
    """Check the scipy-backed 'barycentric', 'krogh' and 'pchip' methods."""
    tm._skip_if_no_scipy()
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})

    result = df.interpolate(method='barycentric')
    expected = df.copy()
    # Assign through the frame itself: a chained
    # ``expected['A'].iloc[...] = v`` may write to a temporary object and
    # leave ``expected`` untouched.
    expected.loc[2, 'A'] = 3
    expected.loc[5, 'A'] = 6
    assert_frame_equal(result, expected)

    result = df.interpolate(method='barycentric', downcast='infer')
    assert_frame_equal(result, expected.astype(np.int64))

    result = df.interpolate(method='krogh')
    expectedk = df.copy()
    expectedk['A'] = expected['A']
    assert_frame_equal(result, expectedk)

    _skip_if_no_pchip()
    result = df.interpolate(method='pchip')
    expected.loc[2, 'A'] = 3
    expected.loc[5, 'A'] = 6.125
    assert_frame_equal(result, expected)
def test_interp_bad_method(self):
    """An unrecognised ``method`` name must raise ValueError."""
    columns = {
        'A': [1, 2, np.nan, 4],
        'B': [1, 4, 9, np.nan],
        'C': [1, 2, 3, 5],
        'D': list('abcd'),
    }
    frame = DataFrame(columns)
    with pytest.raises(ValueError):
        frame.interpolate(method='not_a_method')
def test_interp_alt_scipy(self):
    """Check 'barycentric', 'krogh' and 'pchip' interpolation results.

    The expected pchip value depends on the installed scipy version.
    """
    tm._skip_if_no_scipy()
    frame = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                       'C': [1, 2, 3, 5, 8, 13, 21]})

    # Barycentric: both gaps in 'A' are filled.
    got = frame.interpolate(method='barycentric')
    want = frame.copy()
    want.loc[2, 'A'] = 3
    want.loc[5, 'A'] = 6
    assert_frame_equal(got, want)

    # Same method, with downcasting inferred.
    got = frame.interpolate(method='barycentric', downcast='infer')
    assert_frame_equal(got, want.astype(np.int64))

    # Krogh is expected to produce the same 'A' column.
    got = frame.interpolate(method='krogh')
    want_krogh = frame.copy()
    want_krogh['A'] = want['A']
    assert_frame_equal(got, want_krogh)

    _skip_if_no_pchip()
    import scipy
    got = frame.interpolate(method='pchip')
    want.loc[2, 'A'] = 3
    is_new_scipy = LooseVersion(scipy.__version__) >= '0.17.0'
    want.loc[5, 'A'] = 6.0 if is_new_scipy else 6.125
    assert_frame_equal(got, want)
def test_interp_raise_on_only_mixed(self):
    """Row-wise interpolation of a frame with a string column raises."""
    columns = {
        'A': [1, 2, np.nan, 4],
        'B': ['a', 'b', 'c', 'd'],
        'C': [np.nan, 2, 5, 7],
        'D': [np.nan, np.nan, 9, 9],
        'E': [1, 2, 3, 4],
    }
    mixed = DataFrame(columns)
    with pytest.raises(TypeError):
        mixed.interpolate(axis=1)
def test_interp_inplace_row(self):
    """In-place row-wise interpolation matches the copying variant."""
    # GH 10395
    frame = DataFrame({'a': [1., 2., 3., 4.],
                       'b': [np.nan, 2., 3., 4.],
                       'c': [3, 2, 2, 2]})
    options = {'method': 'linear', 'axis': 1}
    copied = frame.interpolate(inplace=False, **options)
    frame.interpolate(inplace=True, **options)
    assert_frame_equal(frame, copied)
def test_interp_raise_on_all_object_dtype(self):
    """A frame holding only object-dtype columns cannot be interpolated."""
    # GH 22985
    msg = (
        "Cannot interpolate with all object-dtype columns "
        "in the DataFrame. Try setting at least one "
        "column to a numeric dtype."
    )
    all_object = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
    with pytest.raises(TypeError, match=msg):
        all_object.interpolate()
def test_interp_inplace_row(self):
    """``inplace=True`` along axis 1 gives the same frame as a copy."""
    # GH 10395
    data = {
        "a": [1.0, 2.0, 3.0, 4.0],
        "b": [np.nan, 2.0, 3.0, 4.0],
        "c": [3, 2, 2, 2],
    }
    frame = DataFrame(data)
    reference = frame.interpolate(method="linear", axis=1, inplace=False)
    frame.interpolate(method="linear", axis=1, inplace=True)
    tm.assert_frame_equal(frame, reference)
def test_interp_raise_on_only_mixed(self):
    """Interpolating across rows of a mixed-dtype frame raises TypeError."""
    numeric = {
        'A': [1, 2, np.nan, 4],
        'C': [np.nan, 2, 5, 7],
        'D': [np.nan, np.nan, 9, 9],
        'E': [1, 2, 3, 4],
    }
    # Insert the string column in its original position (after 'A').
    data = {'A': numeric['A'], 'B': ['a', 'b', 'c', 'd']}
    data.update({key: numeric[key] for key in ('C', 'D', 'E')})
    mixed = DataFrame(data)
    with tm.assertRaises(TypeError):
        mixed.interpolate(axis=1)
def test_interp_inplace_row(self):
    """Linear interpolation over rows: in-place equals not-in-place."""
    # GH 10395
    frame = DataFrame({
        'a': [1., 2., 3., 4.],
        'b': [np.nan, 2., 3., 4.],
        'c': [3, 2, 2, 2],
    })
    out_of_place = frame.interpolate(method='linear', axis=1, inplace=False)
    frame.interpolate(method='linear', axis=1, inplace=True)
    assert_frame_equal(frame, out_of_place)
def test_interp_string_axis(self, axis_name, axis_number):
    """An axis given by name interpolates like the same axis by number."""
    # https://github.com/pandas-dev/pandas/issues/25190
    grid = np.linspace(0, 100, 1000)
    samples = np.sin(grid)
    base = DataFrame(
        data=np.tile(samples, (10, 1)),
        index=np.arange(10),
        columns=grid,
    )
    # Reindexing onto a slightly stretched grid gives the frame under test.
    df = base.reindex(columns=grid * 1.005)
    by_name = df.interpolate(method="linear", axis=axis_name)
    by_number = df.interpolate(method="linear", axis=axis_number)
    tm.assert_frame_equal(by_name, by_number)
def test_interp_raise_on_only_mixed(self):
    """``interpolate(axis=1)`` on a frame with a string column raises."""
    data = {
        "A": [1, 2, np.nan, 4],
        "B": ["a", "b", "c", "d"],
        "C": [np.nan, 2, 5, 7],
        "D": [np.nan, np.nan, 9, 9],
        "E": [1, 2, 3, 4],
    }
    frame = DataFrame(data)
    with pytest.raises(TypeError):
        frame.interpolate(axis=1)
def fit(self, X: pd.DataFrame):
    """Train the DAGMM module on sliding windows of ``X``.

    Missing values in ``X`` are filled in place (interpolation, then
    back-fill), so the caller's frame is modified. After training, the data
    is passed through the model once more and gamma-weighted sums of the
    per-batch GMM means and covariances are accumulated.

    NOTE(review): in the visible code the accumulators (``n``, ``mu_sum``,
    ``cov_sum``, ``gamma_sum``) and ``phi`` are never stored or returned;
    the block may be truncated here. Confirm against the full file.

    Parameters
    ----------
    X : dataframe of shape (n_samples, n_features)
        The input samples.
    """
    # Fill gaps in place; bfill covers leading NaNs that interpolation
    # leaves behind.
    X.interpolate(inplace=True)
    X.bfill(inplace=True)
    data = X.values
    # Overlapping windows of ``sequence_length`` consecutive rows.
    sequences = [
        data[i:i + self.sequence_length]
        for i in range(X.shape[0] - self.sequence_length + 1)
    ]
    data_loader = DataLoader(dataset=sequences,
                             batch_size=self.batch_size,
                             shuffle=True,
                             drop_last=True)
    # Hidden size grows by one for every 20 input features.
    self.hidden_size = 5 + int(X.shape[1] / 20)
    autoencoder = self.autoencoder_type(X.shape[1],
                                        hidden_size=self.hidden_size,
                                        **self.autoencoder_args)
    self.dagmm = DAGMMModule(autoencoder,
                             n_gmm=self.gmm_k,
                             latent_dim=self.hidden_size + 2,
                             seed=self.seed,
                             gpu=self.gpu)
    self.to_device(self.dagmm)
    self.optimizer = torch.optim.Adam(self.dagmm.parameters(), lr=self.lr)

    # Training: one ``dagmm_step`` per batch, for ``num_epochs`` epochs.
    for _ in trange(self.num_epochs):
        for input_data in data_loader:
            input_data = self.to_var(input_data)
            self.dagmm_step(input_data.float())

    # Second pass in eval mode to accumulate the GMM statistics.
    self.dagmm.eval()
    n = 0
    mu_sum = 0
    cov_sum = 0
    gamma_sum = 0
    for input_data in data_loader:
        input_data = self.to_var(input_data)
        _, _, z, gamma = self.dagmm(input_data.float())
        phi, mu, cov = self.dagmm.compute_gmm_params(z, gamma)

        batch_gamma_sum = torch.sum(gamma, dim=0)
        gamma_sum += batch_gamma_sum
        mu_sum += mu * batch_gamma_sum.unsqueeze(
            -1)  # keep sums of the numerator only
        cov_sum += cov * batch_gamma_sum.unsqueeze(-1).unsqueeze(
            -1)  # keep sums of the numerator only
        n += input_data.size(0)
def concatenate_lolo_tables(_dir, out_file):
    """Merge the per-parameter CSV tables in ``_dir`` into daily and monthly CSVs.

    Files whose name contains ``'gridmet'`` become ``GRIDMET_*`` columns; all
    other files become ``MOD16A2_*`` columns. Only row 0 of each table is
    used, and the dates are parsed from the column names. The daily table is
    written to ``out_file``; a monthly aggregate is written to ``out_file``
    with ``'daily'`` replaced by ``'monthly'``.

    Parameters
    ----------
    _dir : str
        Directory holding the input CSV files.
    out_file : str
        Destination path of the daily CSV.
    """
    drop_cols = ['.geo', 'system:index', 'Id']
    file_names = os.listdir(_dir)
    _list = sorted(os.path.join(_dir, x) for x in file_names if 'gridmet' in x)
    modis = [os.path.join(_dir, x) for x in file_names if 'gridmet' not in x]

    mod_data = {}
    for m in modis:
        mod = read_csv(m).drop(columns=drop_cols)
        dates = [datetime.strptime(x.split('/')[-1][0:10], '%Y_%m_%d')
                 for x in mod.columns]
        param = mod.columns[0].split('/')[-1].split('_')[-1]
        vals = [x * 0.1 for x in mod.loc[0, :]]
        # ``ffill`` replaces the deprecated ``fillna(method='ffill')``.
        s = DataFrame(data=vals, index=dates).ffill()
        s = s.resample('D').asfreq()
        # presumably spreads an 8-day composite over its days -- TODO confirm
        s = s / 8.
        s = s.interpolate(method='polynomial', order=3)
        s = s.reindex(date_range(dates[0], '{}-12-31'.format(TEST_YEARS[-1])))
        mod_data[param] = s.ffill()

    df = concat(mod_data, sort=False, axis=1)
    df.columns = ['MOD16A2_{}'.format(x[0]) for x in list(df.columns)]

    for path in _list:
        c = read_csv(path).drop(columns=drop_cols)
        param = 'gridmet_{}'.format(
            c.columns[0].split('/')[-1].split('_')[-1]).upper()
        dates = [datetime.strptime(x.split('/')[-1].split('_')[0], '%Y%m%d')
                 for x in c.columns]
        vals = list(c.loc[0, :])
        c = DataFrame(data=vals, index=dates, columns=[param]).ffill()
        if param not in df.columns:
            df[param] = c.reindex(index=df.index)
        else:
            # Write through ``df.loc``: the chained ``df[param].loc[...] = ...``
            # may assign into a temporary and leave ``df`` unchanged.
            df.loc[c.index, param] = c[param].values

    df.to_csv(out_file)

    cols = [
        df['MOD16A2_ET'].resample('M').sum(),
        df['MOD16A2_PET'].resample('M').sum(),
        df['GRIDMET_ETR'].resample('M').sum(),
        df['GRIDMET_PR'].resample('M').sum(),
        df['GRIDMET_TMMN'].resample('M').mean(),
        df['GRIDMET_TMMX'].resample('M').mean(),
    ]
    monthly = DataFrame(data={x.name: x.values for x in cols},
                        index=cols[0].index)
    monthly.to_csv(out_file.replace('daily', 'monthly'))
def fit(self, X: pd.DataFrame):
    """Train the autoencoder and fit a Gaussian to its reconstruction errors.

    Missing values in ``X`` are filled in place (interpolation, then
    back-fill), so the caller's frame is modified. The sliding windows are
    split at random: the last ``train_gaussian_percentage`` share is held
    out to estimate ``self.mean`` and ``self.cov`` of the absolute
    reconstruction error; the rest is used for training.

    Parameters
    ----------
    X : dataframe of shape (n_samples, n_features)
        The input samples.
    """
    X.interpolate(inplace=True)
    X.bfill(inplace=True)
    data = X.values
    sequences = [
        data[i:i + self.sequence_length]
        for i in range(data.shape[0] - self.sequence_length + 1)
    ]
    indices = np.random.permutation(len(sequences))
    split_point = int(self.train_gaussian_percentage * len(sequences))
    # Split from the front: ``indices[:-split_point]`` would be empty (and
    # ``indices[-split_point:]`` everything) when ``split_point`` is 0.
    n_train = len(sequences) - split_point
    train_loader = DataLoader(dataset=sequences,
                              batch_size=self.batch_size,
                              drop_last=True,
                              sampler=SubsetRandomSampler(indices[:n_train]),
                              pin_memory=True)
    train_gaussian_loader = DataLoader(
        dataset=sequences,
        batch_size=self.batch_size,
        drop_last=True,
        sampler=SubsetRandomSampler(indices[n_train:]),
        pin_memory=True)

    self.aed = AutoEncoderModule(X.shape[1],
                                 self.sequence_length,
                                 self.hidden_size,
                                 seed=self.seed,
                                 gpu=self.gpu)
    self.to_device(self.aed)  # .double()
    optimizer = torch.optim.Adam(self.aed.parameters(), lr=self.lr)

    # ``reduction=`` replaces the deprecated ``size_average=False`` (summed
    # loss) and ``reduce=False`` (element-wise loss) arguments.
    summed_mse = nn.MSELoss(reduction='sum')
    elementwise_l1 = nn.L1Loss(reduction='none')

    self.aed.train()
    for epoch in trange(self.num_epochs):
        logging.debug(f'Epoch {epoch+1}/{self.num_epochs}.')
        for ts_batch in train_loader:
            output = self.aed(self.to_var(ts_batch))
            loss = summed_mse(output, self.to_var(ts_batch.float()))
            self.aed.zero_grad()
            loss.backward()
            optimizer.step()

    self.aed.eval()
    error_vectors = []
    for ts_batch in train_gaussian_loader:
        output = self.aed(self.to_var(ts_batch))
        error = elementwise_l1(output, self.to_var(ts_batch.float()))
        # One error vector per time step, each with n_features entries.
        error_vectors += list(
            error.view(-1, X.shape[1]).data.cpu().numpy())

    self.mean = np.mean(error_vectors, axis=0)
    self.cov = np.cov(error_vectors, rowvar=False)
def test_interp_raise_on_all_object_dtype(self):
    """Interpolating a frame of only object columns raises TypeError."""
    # GH 22985
    expected_message = ("Cannot interpolate with all object-dtype columns "
                        "in the DataFrame. Try setting at least one "
                        "column to a numeric dtype.")
    data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
    frame = DataFrame(data, dtype='object')
    with pytest.raises(TypeError, match=expected_message):
        frame.interpolate()
def test_interp_leading_nans(self):
    """Leading NaNs stay NaN; interior gaps are filled."""
    df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
                    "B": [np.nan, -3, -3.5, np.nan, -4]})
    result = df.interpolate()
    expected = df.copy()
    # Assign through the frame: the chained ``expected['B'].loc[3] = v``
    # may write to a temporary and leave ``expected`` unchanged.
    expected.loc[3, 'B'] = -3.75
    assert_frame_equal(result, expected)

    # A first-order polynomial must give the same result as the default.
    tm._skip_if_no_scipy()
    result = df.interpolate(method='polynomial', order=1)
    assert_frame_equal(result, expected)
def test_interp_time_inplace_axis(self, axis):
    """Time-based interpolation in place matches the copying call."""
    # GH 9687
    # NOTE(review): ``axis`` is accepted but both calls hard-code
    # ``axis=0`` -- confirm whether the argument was meant to be used.
    periods = 5
    stamps = date_range(start="2014-01-01", periods=periods)
    values = np.random.rand(periods, periods)
    values[values < 0.5] = np.nan  # knock out roughly half of the cells
    frame = DataFrame(index=stamps, columns=stamps, data=values)

    interpolated = frame.interpolate(axis=0, method="time")
    frame.interpolate(axis=0, method="time", inplace=True)
    tm.assert_frame_equal(interpolated, frame)
def test_interp_leading_nans(self, check_scipy):
    """Leading NaNs stay NaN; interior gaps are filled.

    When ``check_scipy`` is truthy, the first-order polynomial method is
    checked against the same expectation.
    """
    df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
                    "B": [np.nan, -3, -3.5, np.nan, -4]})
    result = df.interpolate()
    expected = df.copy()
    # Assign through the frame: the chained ``expected['B'].loc[3] = v``
    # may write to a temporary and leave ``expected`` unchanged.
    expected.loc[3, 'B'] = -3.75
    assert_frame_equal(result, expected)

    if check_scipy:
        result = df.interpolate(method='polynomial', order=1)
        assert_frame_equal(result, expected)
def test_interp_bad_method(self):
    """Passing an unknown method name raises ValueError."""
    frame = DataFrame({
        "A": [1, 2, np.nan, 4],
        "B": [1, 4, 9, np.nan],
        "C": [1, 2, 3, 5],
        "D": list("abcd"),
    })
    bad_call = {"method": "not_a_method"}
    with pytest.raises(ValueError):
        frame.interpolate(**bad_call)
def test_interp_raise_on_all_object_dtype(self):
    """An all-object frame raises TypeError with the documented message."""
    # GH 22985
    pattern = ("Cannot interpolate with all object-dtype columns "
               "in the DataFrame. Try setting at least one "
               "column to a numeric dtype.")
    frame = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, dtype='object')
    with tm.assert_raises_regex(TypeError, pattern):
        frame.interpolate()
def decision_function(self, X: pd.DataFrame):
    """Compute the anomaly score (sample energy) of every row of ``X``.

    ``X`` is gap-filled in place (interpolation, then back-fill), cut into
    overlapping windows of ``sequence_length`` rows and passed through the
    fitted DAGMM module one window at a time. Each row is covered by several
    windows; its score is the NaN-ignoring mean of the energies from all of
    them. When ``self.details`` is set, the averaged encodings,
    reconstructions and the two error components of ``z`` are stored in
    ``self.prediction_details``.

    Parameters
    ----------
    X : dataframe of shape (n_samples, n_features)
        The samples to score. Modified in place by the gap filling.

    Returns
    -------
    anomaly_scores : numpy array of shape (n_samples,)
        The anomaly score of the input samples.
    """
    self.dagmm.eval()
    X.interpolate(inplace=True)
    X.bfill(inplace=True)
    data = X.values
    sequences = [data[i:i + self.sequence_length]
                 for i in range(len(data) - self.sequence_length + 1)]
    data_loader = DataLoader(dataset=sequences, batch_size=1, shuffle=False)

    # Row ``k`` of each buffer holds the windows whose position is
    # ``k`` modulo ``sequence_length``; untouched cells stay NaN.
    test_energy = np.full((self.sequence_length, X.shape[0]), np.nan)
    encodings = np.full((self.sequence_length, X.shape[0], self.hidden_size), np.nan)
    decodings = np.full((self.sequence_length, X.shape[0], X.shape[1]), np.nan)
    euc_errors = np.full((self.sequence_length, X.shape[0]), np.nan)
    csn_errors = np.full((self.sequence_length, X.shape[0]), np.nan)

    for i, sequence in enumerate(data_loader):
        enc, dec, z, _ = self.dagmm(self.to_var(sequence).float())
        sample_energy, _ = self.dagmm.compute_energy(z, size_average=False)
        idx = (i % self.sequence_length, np.arange(i, i + self.sequence_length))
        # ``.cpu()`` is needed before ``.numpy()`` when the model runs on a
        # GPU, and is a no-op for CPU tensors.
        test_energy[idx] = sample_energy.data.cpu().numpy()

        if self.details:
            encodings[idx] = enc.data.cpu().numpy()
            decodings[idx] = dec.data.cpu().numpy()
            euc_errors[idx] = z[:, 1].data.cpu().numpy()
            csn_errors[idx] = z[:, 2].data.cpu().numpy()

    test_energy = np.nanmean(test_energy, axis=0)

    if self.details:
        self.prediction_details.update({'latent_representations': np.nanmean(encodings, axis=0).T})
        self.prediction_details.update({'reconstructions_mean': np.nanmean(decodings, axis=0).T})
        self.prediction_details.update({'euclidean_errors_mean': np.nanmean(euc_errors, axis=0)})
        self.prediction_details.update({'cosine_errors_mean': np.nanmean(csn_errors, axis=0)})

    return test_energy
def predict(self, X: pd.DataFrame):
    """Return the mean sample energy of every row of ``X``.

    ``X`` is gap-filled in place, split into overlapping windows of
    ``sequence_length`` rows and scored window by window with the fitted
    DAGMM module. Per-row values are the NaN-ignoring mean over all windows
    covering that row. With ``self.details`` set, averaged encodings,
    reconstructions and error terms are stored in
    ``self.prediction_details``.
    """
    self.dagmm.eval()
    X.interpolate(inplace=True)
    X.bfill(inplace=True)

    values = X.values
    seq_len = self.sequence_length
    n_rows, n_features = X.shape
    windows = [
        values[start:start + seq_len]
        for start in range(len(values) - seq_len + 1)
    ]
    loader = DataLoader(dataset=windows, batch_size=1, shuffle=False)

    def nan_buffer(*trailing):
        # (sequence_length, n_rows, *trailing) array filled with NaN.
        return np.full((seq_len, n_rows) + trailing, np.nan)

    def as_array(tensor):
        return tensor.data.cpu().numpy()

    test_energy = nan_buffer()
    encodings = nan_buffer(self.hidden_size)
    decodings = nan_buffer(n_features)
    euc_errors = nan_buffer()
    csn_errors = nan_buffer()

    for step, window in enumerate(loader):
        enc, dec, z, _ = self.dagmm(self.to_var(window).float())
        sample_energy, _ = self.dagmm.compute_energy(z, size_average=False)
        slot = (step % seq_len, np.arange(step, step + seq_len))
        test_energy[slot] = as_array(sample_energy)
        if self.details:
            encodings[slot] = as_array(enc)
            decodings[slot] = as_array(dec)
            euc_errors[slot] = as_array(z[:, 1])
            csn_errors[slot] = as_array(z[:, 2])

    test_energy = np.nanmean(test_energy, axis=0)

    if self.details:
        detail_buffers = (
            ('latent_representations', encodings, True),
            ('reconstructions_mean', decodings, True),
            ('euclidean_errors_mean', euc_errors, False),
            ('cosine_errors_mean', csn_errors, False),
        )
        for key, buffer, transpose in detail_buffers:
            averaged = np.nanmean(buffer, axis=0)
            self.prediction_details.update(
                {key: averaged.T if transpose else averaged})

    return test_energy
def test_interp_bad_method(self):
    """An unknown method raises ValueError listing the valid methods."""
    msg = (
        r"method must be one of \['linear', 'time', 'index', 'values', "
        r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', "
        r"'barycentric', 'krogh', 'spline', 'polynomial', "
        r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', "
        r"'cubicspline'\]. Got 'not_a_method' instead."
    )
    data = {
        "A": [1, 2, np.nan, 4],
        "B": [1, 4, 9, np.nan],
        "C": [1, 2, 3, 5],
        "D": list("abcd"),
    }
    frame = DataFrame(data)
    with pytest.raises(ValueError, match=msg):
        frame.interpolate(method="not_a_method")
def test_interp_empty(self):
    """Interpolating an empty frame returns a new, equal frame."""
    # https://github.com/pandas-dev/pandas/issues/35598
    empty = DataFrame()
    interpolated = empty.interpolate()
    # A distinct object must come back, not the input itself.
    assert interpolated is not empty
    tm.assert_frame_equal(interpolated, empty)
def fix_failed(X: pd.DataFrame, y: pd.DataFrame,
               req_len: int) -> Tuple[pd.DataFrame]:
    """Fix a training sample with missing data.

    The sample is repaired by nearest-neighbour interpolation along the
    rows, followed by a backward and then a forward fill for gaps at the
    edges. ``X`` itself is not modified.

    Parameters
    ----------
    X : pd.DataFrame
        Input features; must have exactly ``req_len`` rows.
    y : pd.DataFrame
        Target values; must not contain missing entries.
    req_len : int
        Required number of rows in ``X``.

    Returns
    -------
    tuple
        ``(fixed_X, y)`` on success, ``(None, None)`` when the sample has
        the wrong length, the target has missing values, or interpolation
        raises ``ValueError``.
    """
    # FIXME: better fixing methods?
    if X.shape[0] != req_len:
        return (None, None)
    if y.isnull().values.any():
        # When the target is missing, this tuple cannot be fixed.
        return (None, None)
    try:
        # The keyword was previously misspelled ``aixs``.
        fixed_X = X.interpolate(method="nearest", axis=0)
        # ``bfill``/``ffill`` replace the deprecated ``fillna(method=...)``.
        fixed_X = fixed_X.bfill().ffill()
        return (fixed_X, y)
    except ValueError:
        return (None, None)
def __init__(self, prices: pd.DataFrame, mv: pd.DataFrame, rf: bool):
    """Store interpolated price and market-value tables.

    The ``"NAME"`` column of both frames is compared (for ``mv`` only the
    part before ``" - "`` is used) and then deleted from the frames passed
    in. When ``rf`` is truthy, the result of ``import_rf()`` is
    interpolated and stored as well; otherwise it is ``None``.
    """
    price_names = np.array(list(prices["NAME"]))
    mv_names = np.array(
        [full_name.split(" - ")[0] for full_name in mv["NAME"]])
    # Both tables must list the same companies in the same order.
    assert all(
        left == right for left, right in zip(price_names, mv_names))
    self.__company_names: np.array = price_names

    # The name column is no longer needed once the order is verified.
    del mv["NAME"]
    del prices["NAME"]

    self.__prices: pd.DataFrame = prices.interpolate()
    self.__mv: pd.DataFrame = mv.interpolate()
    self.__moving_portfolios: pd.DataFrame = self.__compute_portfolios()
    self.__dates: np.ndarray = self.__moving_portfolios.columns.values
    if rf:
        self.__rf = self.import_rf().interpolate()
    else:
        self.__rf = None
def test_interp_basic(self):
    """Default interpolation fills gaps, with and without a custom index."""
    frame = DataFrame({
        "A": [1, 2, np.nan, 4],
        "B": [1, 4, 9, np.nan],
        "C": [1, 2, 3, 5],
        "D": list("abcd"),
    })

    # Default index: numeric gaps are filled, other columns untouched.
    filled = DataFrame({
        "A": [1.0, 2.0, 3.0, 4.0],
        "B": [1.0, 4.0, 9.0, 9.0],
        "C": [1, 2, 3, 5],
        "D": list("abcd"),
    })
    tm.assert_frame_equal(frame.interpolate(), filled)

    # Indexed by "C": the same cells are filled, addressed by label.
    indexed = frame.set_index("C")
    result = indexed.interpolate()
    expected = frame.set_index("C")
    expected.loc[3, "A"] = 3
    expected.loc[5, "B"] = 9
    tm.assert_frame_equal(result, expected)
def __call__(self, strategy):
    """Build ``self.trend`` by interpolating between swing points.

    A row is a swing point high for a column when none of the following
    ``self.period - 1`` prices is higher, and a swing point low when none
    is lower. Highs are recorded only while the column is not already
    marked ``'DOWN'``, lows only while it is not already ``'UP'``. The
    recorded prices are then joined with ``DataFrame.interpolate``.

    Parameters
    ----------
    strategy
        Object whose ``get_indicator_prices()`` returns the price frame.
    """
    prices = strategy.get_indicator_prices()
    trend = DataFrame(None, index=prices.index, columns=prices.columns,
                      dtype=float)
    current_trend = Series('-', index=prices.columns)
    for i in range(prices.shape[0] - self.period):
        # If there are not any new highs in the recent period then must
        # have been a swing point high.
        SPH = ~(prices.iloc[(i + 1):(i + self.period)] > prices.iloc[i]).any()
        # NaN in series will produce false signals and need to be removed
        SPH = SPH[prices.iloc[i].notnull()]
        SPH = SPH[SPH]
        # Only mark as swing point high if currently in uptrend or
        # unidentified trend, otherwise ignore.
        SPH = SPH[current_trend[SPH.index] != 'DOWN']
        if not SPH.empty:
            current_trend[SPH.index] = 'DOWN'
            trend.loc[trend.index[i], SPH.index] = prices.iloc[i][SPH.index]
        # Repeat for swing point lows.
        SPL = ~(prices.iloc[(i + 1):(i + self.period)] < prices.iloc[i]).any()
        SPL = SPL[prices.iloc[i].notnull()]
        SPL = SPL[SPL]
        SPL = SPL[current_trend[SPL.index] != 'UP']
        if not SPL.empty:
            current_trend[SPL.index] = 'UP'
            trend.loc[trend.index[i], SPL.index] = prices.iloc[i][SPL.index]
    self.trend = trend.interpolate()
def __call__(self, strategy):
    """Build ``self.trend`` by interpolating between swing points.

    For every row but the last ``self.period``, a column is a swing point
    high when no later price in the look-ahead window is higher, and a
    swing point low when none is lower. A swing is recorded only if it
    changes the column's current state (``'DOWN'`` after a high, ``'UP'``
    after a low). Recorded prices are joined with
    ``DataFrame.interpolate``.

    Parameters
    ----------
    strategy
        Object whose ``get_indicator_prices()`` returns the price frame.
    """
    prices = strategy.get_indicator_prices()
    trend = DataFrame(None, index=prices.index, columns=prices.columns,
                      dtype=float)
    current_trend = Series('-', index=prices.columns)

    def record_swings(position, exceeded, new_state):
        # ``exceeded`` flags the window cells that beat today's price;
        # a column with no such cell is a swing point.
        today = prices.iloc[position]
        swing = ~exceeded.any()
        # NaN in series will produce false signals and need to be removed
        swing = swing[today.notnull()]
        swing = swing[swing]
        # Ignore columns that are already in the state this swing implies.
        swing = swing[current_trend[swing.index] != new_state]
        if not swing.empty:
            current_trend[swing.index] = new_state
            trend.loc[trend.index[position], swing.index] = today[swing.index]

    for i in range(prices.shape[0] - self.period):
        window = prices.iloc[(i + 1):(i + self.period)]
        # Highs first, then lows, so a column can flip twice in one row
        # exactly as the two sequential checks allow.
        record_swings(i, window > prices.iloc[i], 'DOWN')
        record_swings(i, window < prices.iloc[i], 'UP')

    self.trend = trend.interpolate()
def interpolate(
    data_frame: pd.DataFrame,
    limit: int = None,
    method: str = "linear",
    headers: [str] = None,
) -> pd.DataFrame:
    """Interpolate the NaNs of a frame, optionally restricted to some columns.

    Thin wrapper around ``pandas.DataFrame.interpolate``.

    Parameters
    ----------
    data_frame : pd.DataFrame
        input dataframe
    limit : int, optional
        See pandas.DataFrame.interpolate, by default None
    method : str, optional
        See pandas.DataFrame.interpolate, by default "linear"
    headers : [str], optional
        columns to keep before interpolating; all columns when empty or
        None

    Returns
    -------
    pd.DataFrame
        The selected columns with NaNs interpolated
    """
    selected = data_frame.loc[:, headers] if headers else data_frame
    return selected.interpolate(method, limit=limit)
def test_interp_ignore_all_good(self):
    """Columns without NaNs keep their dtype; gappy columns become float."""
    # GH
    frame = DataFrame({
        "A": [1, 2, np.nan, 4],
        "B": [1, 2, 3, 4],
        "C": [1.0, 2.0, np.nan, 4.0],
        "D": [1.0, 2.0, 3.0, 4.0],
    })
    want = DataFrame({
        "A": np.array([1, 2, 3, 4], dtype="float64"),
        "B": np.array([1, 2, 3, 4], dtype="int64"),
        "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
        "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
    })
    tm.assert_frame_equal(frame.interpolate(downcast=None), want)

    # all good
    complete = ["B", "D"]
    got = frame[complete].interpolate(downcast=None)
    tm.assert_frame_equal(got, frame[complete])
def _fill_na(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values according to ``self._fill_method``.

    ``'mean'`` and ``'median'`` replace each column of ``X`` in place;
    ``'ffill'``, ``'bfill'`` and ``'interpolate'`` return a new frame. Any
    other setting returns ``X`` unchanged.

    Parameters
    ----------
    X: pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """
    method = self._fill_method
    if method in ('mean', 'median'):
        for name in X.columns:
            column = X[name]
            statistic = column.mean() if method == 'mean' else column.median()
            X[name] = column.fillna(statistic)
        return X
    if method == 'ffill':
        return X.ffill()
    if method == 'bfill':
        return X.bfill()
    if method == 'interpolate':
        return X.interpolate()
    return X
def fill_missing_data(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Interpolate the missing values of ``data_frame`` with the pandas defaults.

    :param data_frame: (pd.DataFrame) frame to interpolate.
    :return: (pd.DataFrame) new frame with the gaps filled by
        ``DataFrame.interpolate()``.
    """
    filled = data_frame.interpolate()
    return filled
def resample(df: pd.DataFrame, time_index: pd.Series, period="1S"):
    """Resample ``df`` with the given ``period``, using ``time_index`` as index.

    ``df`` gets ``time_index`` as its index (in place). If the covered time
    span is longer than ``period``, the frame is resampled, keeping the
    first value of every bin. Remaining gaps are interpolated.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to resample; its index is overwritten.
    time_index : pd.Series
        Time stamps, one per row of ``df``.
    period : str, optional
        Resampling period accepted by ``pd.Timedelta`` and
        ``DataFrame.resample``, by default "1S".

    Returns
    -------
    pd.DataFrame
        The (possibly resampled) frame with gaps interpolated.
    """
    time_range = get_time_range(time_index)
    # Duration is end minus start; the reverse order gives a negative span,
    # so the comparison below could never succeed.
    time_length = time_range.end - time_range.start
    df.index = time_index
    if pd.Timedelta(period) < time_length:
        df = df.resample(period).first()
    return df.interpolate()
def preproc_pipeline(df: pd.DataFrame):
    """Preprocess minute-level data.

    Columns whose name contains ``'market'`` are dropped from ``df`` in
    place and the remaining column names are mapped through
    ``snake_case``. The returned frame has its gaps interpolated and any
    remaining NaNs replaced by 0.
    """
    # Drop the columns that contain the word 'market'.
    market_columns = [name for name in df.columns if 'market' in name]
    df.drop(market_columns, axis=1, inplace=True)
    # Rename the columns.
    df.columns = df.columns.map(snake_case)
    interpolated = df.interpolate()
    return interpolated.fillna(0)
def interpolate(df: pd.DataFrame):
    """
    Return ``df`` with missing values filled by interpolating across
    columns (axis = 1).

    For example, if a row has no value in the March column, it is
    interpolated from the neighbouring columns of the same row. Edge
    cases are considered negligible and are ignored.
    """
    filled = df.interpolate(axis=1)
    print ('Interpolation successful.')
    return filled
def test_interpolate_pos_args_deprecation(self):
    """Positional arguments after ``method`` emit a FutureWarning."""
    # https://github.com/pandas-dev/pandas/issues/41485
    warning_text = (
        r"In a future version of pandas all arguments of DataFrame.interpolate "
        r"except for the argument 'method' will be keyword-only"
    )
    frame = DataFrame({"a": [1, 2, 3]})
    with tm.assert_produces_warning(FutureWarning, match=warning_text):
        padded = frame.interpolate("pad", 0)
    # Nothing is missing, so the data must come back unchanged.
    unchanged = DataFrame({"a": [1, 2, 3]})
    tm.assert_frame_equal(padded, unchanged)
def test_interp_fillna_methods(self, axis, method):
    """``interpolate`` with a fill method agrees with ``fillna``."""
    # GH 12918
    data = {
        "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0],
        "B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0],
        "C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0],
    }
    frame = DataFrame(data)
    via_fillna = frame.fillna(axis=axis, method=method)
    via_interpolate = frame.interpolate(method=method, axis=axis)
    tm.assert_frame_equal(via_interpolate, via_fillna)
def test_interp_rowwise(self):
    """Row-wise interpolation fills along axis 1; axis 0 is the default."""
    frame = DataFrame({0: [1, 2, np.nan, 4],
                       1: [2, 3, 4, np.nan],
                       2: [np.nan, 4, 5, 6],
                       3: [4, np.nan, 6, 7],
                       4: [1, 2, 3, 4]})

    want = frame.copy()
    # (row label, column label) -> value expected after interpolation.
    for row, column, value in ((3, 1, 5), (0, 2, 3), (1, 3, 3)):
        want.loc[row, column] = value
    want[4] = want[4].astype(np.float64)

    got = frame.interpolate(axis=1)
    assert_frame_equal(got, want)

    got = frame.interpolate(axis=1, method='values')
    assert_frame_equal(got, want)

    # An explicit axis=0 must behave like the default call.
    got = frame.interpolate(axis=0)
    want = frame.interpolate()
    assert_frame_equal(got, want)
def test_interp_basic(self):
    """Default interpolation fills gaps, with and without a custom index."""
    df = DataFrame({'A': [1, 2, np.nan, 4],
                    'B': [1, 4, 9, np.nan],
                    'C': [1, 2, 3, 5],
                    'D': list('abcd')})
    expected = DataFrame({'A': [1., 2., 3., 4.],
                          'B': [1., 4., 9., 9.],
                          'C': [1, 2, 3, 5],
                          'D': list('abcd')})
    result = df.interpolate()
    assert_frame_equal(result, expected)

    result = df.set_index('C').interpolate()
    expected = df.set_index('C')
    # Assign through the frame: chained ``expected.A.loc[3] = v`` may write
    # to a temporary and leave ``expected`` unchanged.
    expected.loc[3, 'A'] = 3
    expected.loc[5, 'B'] = 9
    assert_frame_equal(result, expected)
class Interpolate(object):
    """Benchmarks for ``DataFrame.interpolate``, parametrised by ``downcast``."""

    params = [None, 'infer']
    param_names = ['downcast']

    def setup(self, downcast):
        """Create the two frames used by the timing methods."""
        N = 10000
        # this is the worst case, where every column has NaNs.
        # Blank the rows on the raw array before building the frame:
        # writing through ``DataFrame.values`` may hit a copy or a
        # read-only array and leave the frame unchanged.
        arr = np.random.randn(N, 100)
        arr[::2] = np.nan
        self.df = DataFrame(arr)

        # 'A' starts as float so that inserting NaN does not rely on an
        # implicit int -> float upcast.
        self.df2 = DataFrame({'A': np.arange(0, N, dtype=np.float64),
                              'B': np.random.randint(0, 100, N),
                              'C': np.random.randn(N),
                              'D': np.random.randn(N)})
        self.df2.loc[1::5, 'A'] = np.nan
        self.df2.loc[1::5, 'C'] = np.nan

    def time_interpolate(self, downcast):
        """Interpolate the frame in which every column has NaNs."""
        self.df.interpolate(downcast=downcast)

    def time_interpolate_some_good(self, downcast):
        """Interpolate the frame in which only some columns have NaNs."""
        self.df2.interpolate(downcast=downcast)
def test_interp_basic(self):
    """Default interpolation fills gaps, with and without a custom index.

    This variant expects the filled columns to come back as integers.
    """
    df = DataFrame({'A': [1, 2, np.nan, 4],
                    'B': [1, 4, 9, np.nan],
                    'C': [1, 2, 3, 5],
                    'D': list('abcd')})
    expected = DataFrame({'A': [1, 2, 3, 4],
                          'B': [1, 4, 9, 9],
                          'C': [1, 2, 3, 5],
                          'D': list('abcd')})
    result = df.interpolate()
    assert_frame_equal(result, expected)

    result = df.set_index('C').interpolate()
    expected = df.set_index('C')
    # Assign through the frame: chained ``expected.A.loc[3] = v`` may write
    # to a temporary and leave ``expected`` unchanged.
    expected.loc[3, 'A'] = 3
    expected.loc[5, 'B'] = 9
    expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64)
    assert_frame_equal(result, expected)
def test_interpolate():
    """Ensure that DataFrame.interpolate(method='nearest') has the desired
    properties.

    It is used by blocks-plot and should:

    * interpolate missing/NaN datapoints between valid ones
    * not replace any NaN before/after the first/last finite datapoint
    """
    skip_if_not_available(modules=['pandas'])
    y = [nan, nan, 2., 3., nan, 5, nan, nan]
    df = DataFrame(y)
    df_ = df.interpolate(method='nearest')[0]
    # Positions 2..5 lie between the first and last finite value.
    assert all(isfinite(df_.iloc[2:6]))
    # Leading and trailing NaNs must be left alone.
    assert all(~isfinite(df_.iloc[0:2]))
    assert all(~isfinite(df_.iloc[6:8]))
def test_interp_ignore_all_good(self):
    """Complete columns keep their dtype; columns with NaNs become float."""
    # GH
    frame = DataFrame({'A': [1, 2, np.nan, 4],
                       'B': [1, 2, 3, 4],
                       'C': [1., 2., np.nan, 4.],
                       'D': [1., 2., 3., 4.]})
    # Expected dtype per column after interpolation.
    dtypes = {'A': 'float64', 'B': 'int64', 'C': 'float64', 'D': 'float64'}
    values = {'A': [1, 2, 3, 4],
              'B': [1, 2, 3, 4],
              'C': [1., 2., 3, 4.],
              'D': [1., 2., 3., 4.]}
    want = DataFrame({name: np.array(values[name], dtype=dtypes[name])
                      for name in ('A', 'B', 'C', 'D')})
    got = frame.interpolate(downcast=None)
    assert_frame_equal(got, want)

    # all good
    complete = frame[['B', 'D']]
    got = complete.interpolate(downcast=None)
    assert_frame_equal(got, frame[['B', 'D']])
def test_interp_various(self):
    """Check several scipy-backed methods on a frame indexed by 'C'.

    The expected cubic and quadratic values depend on whether scipy is at
    least 0.19.0.
    """
    tm._skip_if_no_scipy()
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})
    df = df.set_index('C')
    expected = df.copy()

    # All expected values are assigned through ``expected.loc[label, 'A']``:
    # chained ``expected.A.loc[label] = v`` may write to a temporary and
    # leave ``expected`` unchanged.
    result = df.interpolate(method='polynomial', order=1)
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923076
    assert_frame_equal(result, expected)

    result = df.interpolate(method='cubic')
    # GH #15662.
    # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
    # previously `splmake` was used. See scipy/scipy#6710
    if _is_scipy_ge_0190:
        expected.loc[3, 'A'] = 2.81547781
        expected.loc[13, 'A'] = 5.52964175
    else:
        expected.loc[3, 'A'] = 2.81621174
        expected.loc[13, 'A'] = 5.64146581
    assert_frame_equal(result, expected)

    result = df.interpolate(method='nearest')
    expected.loc[3, 'A'] = 2
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)

    result = df.interpolate(method='quadratic')
    if _is_scipy_ge_0190:
        expected.loc[3, 'A'] = 2.82150771
        expected.loc[13, 'A'] = 6.12648668
    else:
        expected.loc[3, 'A'] = 2.82533638
        expected.loc[13, 'A'] = 6.02817974
    assert_frame_equal(result, expected)

    result = df.interpolate(method='slinear')
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923077
    assert_frame_equal(result, expected)

    result = df.interpolate(method='zero')
    expected.loc[3, 'A'] = 2.
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)
def test_interp_various(self):
    """Check several scipy-backed methods on a frame indexed by 'C'."""
    _skip_if_no_scipy()
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})
    df = df.set_index('C')
    expected = df.copy()

    # All expected values are assigned through ``expected.loc[label, 'A']``:
    # chained ``expected.A.loc[label] = v`` may write to a temporary and
    # leave ``expected`` unchanged.
    result = df.interpolate(method='polynomial', order=1)
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923076
    assert_frame_equal(result, expected)

    result = df.interpolate(method='cubic')
    expected.loc[3, 'A'] = 2.81621174
    expected.loc[13, 'A'] = 5.64146581
    assert_frame_equal(result, expected)

    result = df.interpolate(method='nearest')
    expected.loc[3, 'A'] = 2
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)

    result = df.interpolate(method='quadratic')
    expected.loc[3, 'A'] = 2.82533638
    expected.loc[13, 'A'] = 6.02817974
    assert_frame_equal(result, expected)

    result = df.interpolate(method='slinear')
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923077
    assert_frame_equal(result, expected)

    result = df.interpolate(method='zero')
    expected.loc[3, 'A'] = 2.
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)
def test_interp_various(self):
    """Check several scipy-backed methods on a frame indexed by 'C'."""
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})
    df = df.set_index('C')
    expected = df.copy()

    # All expected values are assigned through ``expected.loc[label, 'A']``:
    # chained ``expected.A.loc[label] = v`` may write to a temporary and
    # leave ``expected`` unchanged.
    result = df.interpolate(method='polynomial', order=1)
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923076
    assert_frame_equal(result, expected)

    result = df.interpolate(method='cubic')
    # GH #15662.
    expected.loc[3, 'A'] = 2.81547781
    expected.loc[13, 'A'] = 5.52964175
    assert_frame_equal(result, expected)

    result = df.interpolate(method='nearest')
    expected.loc[3, 'A'] = 2
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)

    result = df.interpolate(method='quadratic')
    expected.loc[3, 'A'] = 2.82150771
    expected.loc[13, 'A'] = 6.12648668
    assert_frame_equal(result, expected)

    result = df.interpolate(method='slinear')
    expected.loc[3, 'A'] = 2.66666667
    expected.loc[13, 'A'] = 5.76923077
    assert_frame_equal(result, expected)

    result = df.interpolate(method='zero')
    expected.loc[3, 'A'] = 2.
    expected.loc[13, 'A'] = 5
    assert_frame_equal(result, expected, check_dtype=False)
def test_rowwise_alt(self):
    """Smoke test: interpolating this frame along axis 0 must not raise."""
    columns = {
        0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
        1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
    }
    frame = DataFrame(columns)
    frame.interpolate(axis=0)
def test_interp_nan_idx(self):
    """``method='values'`` is not implemented for an index containing NaN."""
    frame = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
    # Column 'A' carries a NaN, which ends up in the index.
    nan_indexed = frame.set_index('A')
    with pytest.raises(NotImplementedError):
        nan_indexed.interpolate(method='values')