Example No. 1
    def test_interp_alt_scipy(self):
        tm._skip_if_no_scipy()
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        result = df.interpolate(method='barycentric')
        expected = df.copy()
        expected['A'].iloc[2] = 3
        expected['A'].iloc[5] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method='barycentric', downcast='infer')
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method='krogh')
        expectedk = df.copy()
        # expectedk['A'].iloc[2] = 3
        # expectedk['A'].iloc[5] = 6
        expectedk['A'] = expected['A']
        assert_frame_equal(result, expectedk)

        _skip_if_no_pchip()
        result = df.interpolate(method='pchip')
        expected['A'].iloc[2] = 3
        expected['A'].iloc[5] = 6.125
        assert_frame_equal(result, expected)
Example No. 2
 def test_interp_bad_method(self):
     df = DataFrame({'A': [1, 2, np.nan, 4],
                     'B': [1, 4, 9, np.nan],
                     'C': [1, 2, 3, 5],
                     'D': list('abcd')})
     with pytest.raises(ValueError):
         df.interpolate(method='not_a_method')
Example No. 3
    def test_interp_alt_scipy(self):
        tm._skip_if_no_scipy()
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        result = df.interpolate(method='barycentric')
        expected = df.copy()
        expected.loc[2, 'A'] = 3
        expected.loc[5, 'A'] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method='barycentric', downcast='infer')
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method='krogh')
        expectedk = df.copy()
        expectedk['A'] = expected['A']
        assert_frame_equal(result, expectedk)

        _skip_if_no_pchip()
        import scipy
        result = df.interpolate(method='pchip')
        expected.loc[2, 'A'] = 3

        if LooseVersion(scipy.__version__) >= '0.17.0':
            expected.loc[5, 'A'] = 6.0
        else:
            expected.loc[5, 'A'] = 6.125

        assert_frame_equal(result, expected)
Example No. 4
 def test_interp_raise_on_only_mixed(self):
     df = DataFrame({'A': [1, 2, np.nan, 4],
                     'B': ['a', 'b', 'c', 'd'],
                     'C': [np.nan, 2, 5, 7],
                     'D': [np.nan, np.nan, 9, 9],
                     'E': [1, 2, 3, 4]})
     with pytest.raises(TypeError):
         df.interpolate(axis=1)
Example No. 5
 def test_interp_inplace_row(self):
     # GH 10395
     result = DataFrame({'a': [1., 2., 3., 4.],
                         'b': [np.nan, 2., 3., 4.],
                         'c': [3, 2, 2, 2]})
     expected = result.interpolate(method='linear', axis=1, inplace=False)
     result.interpolate(method='linear', axis=1, inplace=True)
     assert_frame_equal(result, expected)
Example No. 6
 def test_interp_raise_on_all_object_dtype(self):
     # GH 22985
     df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
     msg = (
         "Cannot interpolate with all object-dtype columns "
         "in the DataFrame. Try setting at least one "
         "column to a numeric dtype."
     )
     with pytest.raises(TypeError, match=msg):
         df.interpolate()
Example No. 7
 def test_interp_inplace_row(self):
     # GH 10395
     result = DataFrame({
         "a": [1.0, 2.0, 3.0, 4.0],
         "b": [np.nan, 2.0, 3.0, 4.0],
         "c": [3, 2, 2, 2]
     })
     expected = result.interpolate(method="linear", axis=1, inplace=False)
     result.interpolate(method="linear", axis=1, inplace=True)
     tm.assert_frame_equal(result, expected)
Example No. 8
 def test_interp_raise_on_only_mixed(self):
     df = DataFrame({
         'A': [1, 2, np.nan, 4],
         'B': ['a', 'b', 'c', 'd'],
         'C': [np.nan, 2, 5, 7],
         'D': [np.nan, np.nan, 9, 9],
         'E': [1, 2, 3, 4]
     })
     with tm.assertRaises(TypeError):
         df.interpolate(axis=1)
Example No. 9
 def test_interp_inplace_row(self):
     # GH 10395
     result = DataFrame({
         'a': [1., 2., 3., 4.],
         'b': [np.nan, 2., 3., 4.],
         'c': [3, 2, 2, 2]
     })
     expected = result.interpolate(method='linear', axis=1, inplace=False)
     result.interpolate(method='linear', axis=1, inplace=True)
     assert_frame_equal(result, expected)
Example No. 10
 def test_interp_string_axis(self, axis_name, axis_number):
     # https://github.com/pandas-dev/pandas/issues/25190
     x = np.linspace(0, 100, 1000)
     y = np.sin(x)
     df = DataFrame(data=np.tile(y, (10, 1)),
                    index=np.arange(10),
                    columns=x).reindex(columns=x * 1.005)
     result = df.interpolate(method="linear", axis=axis_name)
     expected = df.interpolate(method="linear", axis=axis_number)
     tm.assert_frame_equal(result, expected)
Example No. 11
 def test_interp_raise_on_only_mixed(self):
     df = DataFrame({
         "A": [1, 2, np.nan, 4],
         "B": ["a", "b", "c", "d"],
         "C": [np.nan, 2, 5, 7],
         "D": [np.nan, np.nan, 9, 9],
         "E": [1, 2, 3, 4],
     })
     with pytest.raises(TypeError):
         df.interpolate(axis=1)
Example No. 12
    def fit(self, X: pd.DataFrame):
        """Learn the mixture probability, mean and covariance for each component k.
        Store the computed energy based on the training data and the aforementioned parameters.
        Parameters
        ----------
        X : dataframe of shape (n_samples, n_features)
            The input samples.
        """
        X.interpolate(inplace=True)
        X.bfill(inplace=True)
        data = X.values
        sequences = [
            data[i:i + self.sequence_length]
            for i in range(X.shape[0] - self.sequence_length + 1)
        ]
        data_loader = DataLoader(dataset=sequences,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 drop_last=True)
        self.hidden_size = 5 + int(X.shape[1] / 20)
        autoencoder = self.autoencoder_type(X.shape[1],
                                            hidden_size=self.hidden_size,
                                            **self.autoencoder_args)
        self.dagmm = DAGMMModule(autoencoder,
                                 n_gmm=self.gmm_k,
                                 latent_dim=self.hidden_size + 2,
                                 seed=self.seed,
                                 gpu=self.gpu)
        self.to_device(self.dagmm)
        self.optimizer = torch.optim.Adam(self.dagmm.parameters(), lr=self.lr)

        for _ in trange(self.num_epochs):
            for input_data in data_loader:
                input_data = self.to_var(input_data)
                self.dagmm_step(input_data.float())

        self.dagmm.eval()
        n = 0
        mu_sum = 0
        cov_sum = 0
        gamma_sum = 0
        for input_data in data_loader:
            input_data = self.to_var(input_data)
            _, _, z, gamma = self.dagmm(input_data.float())
            phi, mu, cov = self.dagmm.compute_gmm_params(z, gamma)

            batch_gamma_sum = torch.sum(gamma, dim=0)

            gamma_sum += batch_gamma_sum
            # keep sums of the numerator only
            mu_sum += mu * batch_gamma_sum.unsqueeze(-1)
            cov_sum += cov * batch_gamma_sum.unsqueeze(-1).unsqueeze(-1)

            n += input_data.size(0)
Example No. 13
def concatenate_lolo_tables(_dir, out_file):
    _list = [os.path.join(_dir, x) for x in os.listdir(_dir) if 'gridmet' in x]
    _list.sort()
    modis = [
        os.path.join(_dir, x) for x in os.listdir(_dir) if 'gridmet' not in x
    ]
    mod_data = {}
    for m in modis:
        mod = read_csv(m).drop(columns=['.geo', 'system:index', 'Id'])
        dates = [
            datetime.strptime(x.split('/')[-1][0:10], '%Y_%m_%d')
            for x in mod.columns
        ]
        param = mod.columns[0].split('/')[-1].split('_')[-1]
        vals = [x * 0.1 for x in list(mod.loc[0, :])]
        s = DataFrame(data=vals, index=dates)
        s.fillna(method='ffill', inplace=True)
        s = s.resample('D').asfreq()
        s = s / 8.
        s.interpolate(method='polynomial', order=3, inplace=True)
        s = s.reindex(date_range(dates[0], '{}-12-31'.format(TEST_YEARS[-1])))
        s.fillna(method='ffill', inplace=True)
        mod_data[param] = s
    df = concat(mod_data, sort=False, axis=1)
    df.columns = ['MOD16A2_{}'.format(x[0]) for x in list(df.columns)]

    for csv in _list:
        c = read_csv(csv).drop(columns=['.geo', 'system:index', 'Id'])
        param = 'gridmet_{}'.format(
            c.columns[0].split('/')[-1].split('_')[-1]).upper()
        dates = [
            datetime.strptime(x.split('/')[-1].split('_')[0], '%Y%m%d')
            for x in c.columns
        ]
        vals = [x for x in list(c.loc[0, :])]
        c = DataFrame(data=vals, index=dates, columns=[param])
        c.fillna(method='ffill', inplace=True)
        if param not in df.columns:
            df[param] = c.reindex(index=df.index)
        else:
            df[param].loc[c.index] = c.values.reshape(c.values.shape[0], )

    df.to_csv(out_file)
    cols = [
        df['MOD16A2_ET'].resample('M').sum(),
        df['MOD16A2_PET'].resample('M').sum(),
        df['GRIDMET_ETR'].resample('M').sum(),
        df['GRIDMET_PR'].resample('M').sum(),
        df['GRIDMET_TMMN'].resample('M').mean(),
        df['GRIDMET_TMMX'].resample('M').mean()
    ]
    s = [(x.name, x.values) for x in cols]
    _dct = {k: v for (k, v) in s}
    df = DataFrame(data=_dct, index=cols[0].index)
    df.to_csv(out_file.replace('daily', 'monthly'))
Example No. 14
    def fit(self, X: pd.DataFrame):
        X.interpolate(inplace=True)
        X.bfill(inplace=True)
        data = X.values
        sequences = [
            data[i:i + self.sequence_length]
            for i in range(data.shape[0] - self.sequence_length + 1)
        ]
        indices = np.random.permutation(len(sequences))
        split_point = int(self.train_gaussian_percentage * len(sequences))
        train_loader = DataLoader(dataset=sequences,
                                  batch_size=self.batch_size,
                                  drop_last=True,
                                  sampler=SubsetRandomSampler(
                                      indices[:-split_point]),
                                  pin_memory=True)
        train_gaussian_loader = DataLoader(dataset=sequences,
                                           batch_size=self.batch_size,
                                           drop_last=True,
                                           sampler=SubsetRandomSampler(
                                               indices[-split_point:]),
                                           pin_memory=True)

        self.aed = AutoEncoderModule(X.shape[1],
                                     self.sequence_length,
                                     self.hidden_size,
                                     seed=self.seed,
                                     gpu=self.gpu)
        self.to_device(self.aed)  # .double()
        optimizer = torch.optim.Adam(self.aed.parameters(), lr=self.lr)

        self.aed.train()
        for epoch in trange(self.num_epochs):
            logging.debug(f'Epoch {epoch+1}/{self.num_epochs}.')
            for ts_batch in train_loader:
                output = self.aed(self.to_var(ts_batch))
                loss = nn.MSELoss(size_average=False)(output,
                                                      self.to_var(
                                                          ts_batch.float()))
                self.aed.zero_grad()
                loss.backward()
                optimizer.step()

        self.aed.eval()
        error_vectors = []
        for ts_batch in train_gaussian_loader:
            output = self.aed(self.to_var(ts_batch))
            error = nn.L1Loss(reduce=False)(output,
                                            self.to_var(ts_batch.float()))
            error_vectors += list(
                error.view(-1, X.shape[1]).data.cpu().numpy())

        self.mean = np.mean(error_vectors, axis=0)
        self.cov = np.cov(error_vectors, rowvar=False)
Example No. 15
 def test_interp_raise_on_all_object_dtype(self):
     # GH 22985
     df = DataFrame({
         'A': [1, 2, 3],
         'B': [4, 5, 6]},
         dtype='object')
     msg = ("Cannot interpolate with all object-dtype columns "
            "in the DataFrame. Try setting at least one "
            "column to a numeric dtype.")
     with pytest.raises(TypeError, match=msg):
         df.interpolate()
Example No. 16
    def test_interp_leading_nans(self):
        df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
                        "B": [np.nan, -3, -3.5, np.nan, -4]})
        result = df.interpolate()
        expected = df.copy()
        expected['B'].loc[3] = -3.75
        assert_frame_equal(result, expected)

        tm._skip_if_no_scipy()
        result = df.interpolate(method='polynomial', order=1)
        assert_frame_equal(result, expected)
Example No. 17
    def test_interp_time_inplace_axis(self, axis):
        # GH 9687
        periods = 5
        idx = date_range(start="2014-01-01", periods=periods)
        data = np.random.rand(periods, periods)
        data[data < 0.5] = np.nan
        expected = DataFrame(index=idx, columns=idx, data=data)

        result = expected.interpolate(axis=0, method="time")
        expected.interpolate(axis=0, method="time", inplace=True)
        tm.assert_frame_equal(result, expected)
Example No. 18
    def test_interp_leading_nans(self, check_scipy):
        df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
                        "B": [np.nan, -3, -3.5, np.nan, -4]})
        result = df.interpolate()
        expected = df.copy()
        expected['B'].loc[3] = -3.75
        assert_frame_equal(result, expected)

        if check_scipy:
            result = df.interpolate(method='polynomial', order=1)
            assert_frame_equal(result, expected)
Example No. 19
 def test_interp_bad_method(self):
     df = DataFrame(
         {
             "A": [1, 2, np.nan, 4],
             "B": [1, 4, 9, np.nan],
             "C": [1, 2, 3, 5],
             "D": list("abcd"),
         }
     )
     with pytest.raises(ValueError):
         df.interpolate(method="not_a_method")
Example No. 20
 def test_interp_raise_on_all_object_dtype(self):
     # GH 22985
     df = DataFrame({
         'A': [1, 2, 3],
         'B': [4, 5, 6]},
         dtype='object')
     with tm.assert_raises_regex(
             TypeError,
             "Cannot interpolate with all object-dtype columns "
             "in the DataFrame. Try setting at least one "
             "column to a numeric dtype."):
         df.interpolate()
Example No. 21
    def decision_function(self, X: pd.DataFrame):
        """Predict raw anomaly score of X using the fitted detector.
        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.
        Using the learned mixture probability, mean and covariance for each component k, compute the energy on the
        given data.

        Parameters
        ----------
        X : dataframe of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.
        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        self.dagmm.eval()
        X.interpolate(inplace=True)
        X.bfill(inplace=True)
        data = X.values
        sequences = [data[i:i + self.sequence_length] for i in range(len(data) - self.sequence_length + 1)]
        data_loader = DataLoader(dataset=sequences, batch_size=1, shuffle=False)
        test_energy = np.full((self.sequence_length, X.shape[0]), np.nan)

        encodings = np.full((self.sequence_length, X.shape[0], self.hidden_size), np.nan)
        decodings = np.full((self.sequence_length, X.shape[0], X.shape[1]), np.nan)
        euc_errors = np.full((self.sequence_length, X.shape[0]), np.nan)
        csn_errors = np.full((self.sequence_length, X.shape[0]), np.nan)

        for i, sequence in enumerate(data_loader):
            enc, dec, z, _ = self.dagmm(self.to_var(sequence).float())
            sample_energy, _ = self.dagmm.compute_energy(z, size_average=False)
            idx = (i % self.sequence_length, np.arange(i, i + self.sequence_length))
            test_energy[idx] = sample_energy.data.numpy()

            if self.details:
                encodings[idx] = enc.data.numpy()
                decodings[idx] = dec.data.numpy()
                euc_errors[idx] = z[:, 1].data.numpy()
                csn_errors[idx] = z[:, 2].data.numpy()

        test_energy = np.nanmean(test_energy, axis=0)

        if self.details:
            self.prediction_details.update({'latent_representations': np.nanmean(encodings, axis=0).T})
            self.prediction_details.update({'reconstructions_mean': np.nanmean(decodings, axis=0).T})
            self.prediction_details.update({'euclidean_errors_mean': np.nanmean(euc_errors, axis=0)})
            self.prediction_details.update({'cosine_errors_mean': np.nanmean(csn_errors, axis=0)})

        return test_energy
Example No. 22
    def predict(self, X: pd.DataFrame):
        """Using the learned mixture probability, mean and covariance for each component k, compute the energy on the
        given data."""
        self.dagmm.eval()
        X.interpolate(inplace=True)
        X.bfill(inplace=True)
        data = X.values
        sequences = [
            data[i:i + self.sequence_length]
            for i in range(len(data) - self.sequence_length + 1)
        ]
        data_loader = DataLoader(dataset=sequences,
                                 batch_size=1,
                                 shuffle=False)
        test_energy = np.full((self.sequence_length, X.shape[0]), np.nan)

        encodings = np.full(
            (self.sequence_length, X.shape[0], self.hidden_size), np.nan)
        decodings = np.full((self.sequence_length, X.shape[0], X.shape[1]),
                            np.nan)
        euc_errors = np.full((self.sequence_length, X.shape[0]), np.nan)
        csn_errors = np.full((self.sequence_length, X.shape[0]), np.nan)

        for i, sequence in enumerate(data_loader):
            enc, dec, z, gamma = self.dagmm(self.to_var(sequence).float())
            sample_energy, _ = self.dagmm.compute_energy(z, size_average=False)
            idx = (i % self.sequence_length,
                   np.arange(i, i + self.sequence_length))
            test_energy[idx] = sample_energy.data.cpu().numpy()

            if self.details:
                encodings[idx] = enc.data.cpu().numpy()
                decodings[idx] = dec.data.cpu().numpy()
                euc_errors[idx] = z[:, 1].data.cpu().numpy()
                csn_errors[idx] = z[:, 2].data.cpu().numpy()

        test_energy = np.nanmean(test_energy, axis=0)

        if self.details:
            self.prediction_details.update(
                {'latent_representations': np.nanmean(encodings, axis=0).T})
            self.prediction_details.update(
                {'reconstructions_mean': np.nanmean(decodings, axis=0).T})
            self.prediction_details.update(
                {'euclidean_errors_mean': np.nanmean(euc_errors, axis=0)})
            self.prediction_details.update(
                {'cosine_errors_mean': np.nanmean(csn_errors, axis=0)})

        return test_energy
Example No. 23
 def test_interp_bad_method(self):
     df = DataFrame({
         "A": [1, 2, np.nan, 4],
         "B": [1, 4, 9, np.nan],
         "C": [1, 2, 3, 5],
         "D": list("abcd"),
     })
     msg = (
         r"method must be one of \['linear', 'time', 'index', 'values', "
         r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', "
         r"'barycentric', 'krogh', 'spline', 'polynomial', "
         r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', "
         r"'cubicspline'\]. Got 'not_a_method' instead.")
     with pytest.raises(ValueError, match=msg):
         df.interpolate(method="not_a_method")
Example No. 24
 def test_interp_empty(self):
     # https://github.com/pandas-dev/pandas/issues/35598
     df = DataFrame()
     result = df.interpolate()
     assert result is not df
     expected = df
     tm.assert_frame_equal(result, expected)
Example No. 25
def fix_failed(X: pd.DataFrame, y: pd.DataFrame,
               req_len: int) -> Tuple[pd.DataFrame]:
    """
    Fix training samples with missing data.
    """
    # FIXME: better fixing methods?
    if X.shape[0] < req_len:
        return (None, None)

    if np.any(y.isnull()):
        # When the target is missing, this tuple cannot be fixed.
        return (None, None)

    if X.shape[0] != req_len:
        return (None, None)

    try:
        fixed_X = X.interpolate(
            method="nearest",
            axis=0,
        )

        fixed_X.fillna(method="bfill", inplace=True)

        fixed_X.fillna(method="ffill", inplace=True)
        return (fixed_X, y)
    except ValueError:
        return (None, None)
Example No. 26
 def __init__(self, prices: pd.DataFrame, mv: pd.DataFrame, rf: bool):
     price_names = np.array([price_name for price_name in prices["NAME"]])
     mv_names = np.array(
         [mv_name.split(" - ")[0] for mv_name in mv["NAME"]])
     assert all([
         price_name == mv_name
         for price_name, mv_name in zip(price_names, mv_names)
     ])
     self.__company_names: np.array = price_names
     del mv["NAME"]
     del prices["NAME"]
     self.__prices: pd.DataFrame = prices.interpolate()
     self.__mv: pd.DataFrame = mv.interpolate()
     self.__moving_portfolios: pd.DataFrame = self.__compute_portfolios()
     self.__dates: np.ndarray = self.__moving_portfolios.columns.values
     self.__rf = self.import_rf().interpolate() if rf else None
Example No. 27
    def test_interp_basic(self):
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 4],
                "B": [1, 4, 9, np.nan],
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )
        expected = DataFrame(
            {
                "A": [1.0, 2.0, 3.0, 4.0],
                "B": [1.0, 4.0, 9.0, 9.0],
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )
        result = df.interpolate()
        tm.assert_frame_equal(result, expected)

        result = df.set_index("C").interpolate()
        expected = df.set_index("C")
        expected.loc[3, "A"] = 3
        expected.loc[5, "B"] = 9
        tm.assert_frame_equal(result, expected)
Example No. 28
 def __call__(self, strategy):
     prices = strategy.get_indicator_prices()
     trend = DataFrame(None, index = prices.index, columns = prices.columns, dtype = float)
     last_SP = Series(None, index = prices.columns)
     current_trend = Series('-', index = prices.columns)
     for i in range(prices.shape[0] - self.period):
          # If there are no new highs in the recent period then there must have been
          # a swing point high.
         SPH = ~(prices.iloc[(i + 1):(i + self.period)] > prices.iloc[i]).any()
         # NaN in series will produce false signals and need to be removed
         SPH = SPH[prices.iloc[i].notnull()]
         SPH = SPH[SPH]
         # Only mark as swing point high if currently in uptrend or unidentified trend, otherwise ignore.
         SPH = SPH[current_trend[SPH.index] != 'DOWN']
         if not SPH.empty:
             current_trend[SPH.index] = 'DOWN'
             trend.loc[trend.index[i], SPH.index] = prices.iloc[i][SPH.index]
         # Repeat for swing point lows.
         SPL = ~(prices.iloc[(i + 1):(i + self.period)] < prices.iloc[i]).any()
         SPL = SPL[prices.iloc[i].notnull()]
         SPL = SPL[SPL]
         SPL = SPL[current_trend[SPL.index] != 'UP']
         if not SPL.empty:
             current_trend[SPL.index] = 'UP'
             trend.loc[trend.index[i], SPL.index] = prices.iloc[i][SPL.index]
     self.trend = trend.interpolate()
Example No. 29
 def __call__(self, strategy):
     prices = strategy.get_indicator_prices()
     trend = DataFrame(None,
                       index=prices.index,
                       columns=prices.columns,
                       dtype=float)
     last_SP = Series(None, index=prices.columns)
     current_trend = Series('-', index=prices.columns)
     for i in range(prices.shape[0] - self.period):
          # If there are no new highs in the recent period then there must have been
          # a swing point high.
         SPH = ~(prices.iloc[(i + 1):
                             (i + self.period)] > prices.iloc[i]).any()
         # NaN in series will produce false signals and need to be removed
         SPH = SPH[prices.iloc[i].notnull()]
         SPH = SPH[SPH]
         # Only mark as swing point high if currently in uptrend or unidentified trend, otherwise ignore.
         SPH = SPH[current_trend[SPH.index] != 'DOWN']
         if not SPH.empty:
             current_trend[SPH.index] = 'DOWN'
             trend.loc[trend.index[i],
                       SPH.index] = prices.iloc[i][SPH.index]
         # Repeat for swing point lows.
         SPL = ~(prices.iloc[(i + 1):
                             (i + self.period)] < prices.iloc[i]).any()
         SPL = SPL[prices.iloc[i].notnull()]
         SPL = SPL[SPL]
         SPL = SPL[current_trend[SPL.index] != 'UP']
         if not SPL.empty:
             current_trend[SPL.index] = 'UP'
             trend.loc[trend.index[i],
                       SPL.index] = prices.iloc[i][SPL.index]
     self.trend = trend.interpolate()
Example No. 30
def interpolate(
    data_frame: pd.DataFrame,
    limit: int = None,
    method: str = "linear",
    headers: [str] = None,
) -> pd.DataFrame:
    """This function returns the Series or DataFrame of same shape interpolated
    at the NaNs. This is a adapted interpolate function of pandas package.

    Parameters
    ----------
    data_frame : pd.DataFrame
        input dataframe
    limit : int, optional
        See pandas.DataFrame.interpolate, by default None
    method : str, optional
        See pandas.DataFrame.interpolate, by default "linear"
    headers : [str], optional
        chosen dataframe headers, by default None

    Returns
    -------
    pd.DataFrame
        Series or DataFrame of the same shape, interpolated at the NaNs
    """
    if headers:
        data_frame = data_frame.loc[:, headers]
    return data_frame.interpolate(method, limit=limit)
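
A minimal usage sketch of the wrapper above; the frame, column names, and values are hypothetical:

import numpy as np
import pandas as pd

raw = pd.DataFrame({"temp": [20.0, np.nan, 22.0], "hum": [0.4, 0.5, np.nan]})
# Keep only the "temp" column and fill its single gap linearly.
clean = interpolate(raw, limit=1, method="linear", headers=["temp"])
# clean["temp"] -> [20.0, 21.0, 22.0]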
Example No. 31
    def test_interp_ignore_all_good(self):
        # GH
        df = DataFrame({
            "A": [1, 2, np.nan, 4],
            "B": [1, 2, 3, 4],
            "C": [1.0, 2.0, np.nan, 4.0],
            "D": [1.0, 2.0, 3.0, 4.0],
        })
        expected = DataFrame({
            "A": np.array([1, 2, 3, 4], dtype="float64"),
            "B": np.array([1, 2, 3, 4], dtype="int64"),
            "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
            "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
        })

        result = df.interpolate(downcast=None)
        tm.assert_frame_equal(result, expected)

        # all good
        result = df[["B", "D"]].interpolate(downcast=None)
        tm.assert_frame_equal(result, df[["B", "D"]])
Example No. 32
    def _fill_na(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        return dataframe with filled missing values by defined method
        Parameters
        ----------
        X: pd.DataFrame

        Returns
        -------
        pd.DataFrame
        """
        if self._fill_method == 'mean':
            for col in X.columns:
                mean = X[col].mean()
                X[col] = X[col].fillna(mean)
        elif self._fill_method == 'median':
            for col in X.columns:
                median = X[col].median()
                X[col] = X[col].fillna(median)
        elif self._fill_method == 'ffill':
            X = X.ffill()
        elif self._fill_method == 'bfill':
            X = X.bfill()
        elif self._fill_method == 'interpolate':
            X = X.interpolate()
        return X
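
For reference, a minimal sketch (on a hypothetical frame) of what the 'ffill' and 'interpolate' branches above produce:

import numpy as np
import pandas as pd

X = pd.DataFrame({"a": [1.0, np.nan, 3.0]})
X.ffill()        # copies the previous value forward -> [1.0, 1.0, 3.0]
X.interpolate()  # fills the gap linearly            -> [1.0, 2.0, 3.0]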
Example No. 33
    def fill_missing_data(data_frame: pd.DataFrame) -> pd.DataFrame:
        """
        Fills missing data in the data_frame using interpolation method.

        :param data_frame: (pd.DataFrame) pandas DataFrame object to perform interpolation on.
        :return: (pd.DataFrame) interpolated DataFrame object with missing data filled.
        """
        return data_frame.interpolate()
Example No. 34
def resample(df: pd.DataFrame, time_index: pd.Series, period="1S"):
    """Resamples the dataframe with the given `period` while using `time` as an index."""
    time_range = get_time_range(time_index)
    time_length = time_range.start - time_range.end
    df.index = time_index
    if pd.Timedelta(period) < time_length:
        df = df.resample(period).first()
    return df.interpolate()
Example No. 35
def preproc_pipeline(df: pd.DataFrame):
    """Функция предобработки для поминутных данных."""
    # Выбросим столбцы, в которых есть слово 'market'
    df.drop([col for col in df.columns if 'market' in col], axis=1, inplace=True)

    # Переименуем столбцы
    df.columns = df.columns.map(snake_case)

    return df.interpolate().fillna(0)
Example No. 36
def interpolate(df: pd.DataFrame): 
    """
    Returns the DataFrame with missing values filled in by interpolating across columns (axis=1).
    For example, if data for the month of March is missing, the neighbouring months of February
    and April are averaged and imputed for March.
    Edge cases are considered negligible and are ignored.
    """
    df = df.interpolate(axis=1)
    print('Interpolation successful.')
    return df
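
A small worked example of the row-wise fill described in the docstring above; the monthly columns and values are hypothetical:

import numpy as np
import pandas as pd

df = pd.DataFrame({"Jan": [10.0], "Feb": [12.0], "Mar": [np.nan], "Apr": [16.0]})
filled = interpolate(df)
# Linear interpolation along axis=1 fills March from its neighbours:
# (Feb + Apr) / 2 = (12.0 + 16.0) / 2 = 14.0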
Example No. 37
 def test_interpolate_pos_args_deprecation(self):
     # https://github.com/pandas-dev/pandas/issues/41485
     df = DataFrame({"a": [1, 2, 3]})
     msg = (
         r"In a future version of pandas all arguments of DataFrame.interpolate "
         r"except for the argument 'method' will be keyword-only")
     with tm.assert_produces_warning(FutureWarning, match=msg):
         result = df.interpolate("pad", 0)
     expected = DataFrame({"a": [1, 2, 3]})
     tm.assert_frame_equal(result, expected)
Example No. 38
 def test_interp_fillna_methods(self, axis, method):
     # GH 12918
     df = DataFrame({
         "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0],
         "B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0],
         "C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0],
     })
     expected = df.fillna(axis=axis, method=method)
     result = df.interpolate(method=method, axis=axis)
     tm.assert_frame_equal(result, expected)
Example No. 39
    def test_interp_rowwise(self):
        df = DataFrame({0: [1, 2, np.nan, 4],
                        1: [2, 3, 4, np.nan],
                        2: [np.nan, 4, 5, 6],
                        3: [4, np.nan, 6, 7],
                        4: [1, 2, 3, 4]})
        result = df.interpolate(axis=1)
        expected = df.copy()
        expected.loc[3, 1] = 5
        expected.loc[0, 2] = 3
        expected.loc[1, 3] = 3
        expected[4] = expected[4].astype(np.float64)
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=1, method='values')
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=0)
        expected = df.interpolate()
        assert_frame_equal(result, expected)
Example No. 40
    def test_interp_basic(self):
        df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan],
                        'C': [1, 2, 3, 5], 'D': list('abcd')})
        expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.],
                              'C': [1, 2, 3, 5], 'D': list('abcd')})
        result = df.interpolate()
        assert_frame_equal(result, expected)

        result = df.set_index('C').interpolate()
        expected = df.set_index('C')
        expected.A.loc[3] = 3
        expected.B.loc[5] = 9
        assert_frame_equal(result, expected)
Example No. 41
class Interpolate(object):

    params = [None, 'infer']
    param_names = ['downcast']

    def setup(self, downcast):
        N = 10000
        # this is the worst case, where every column has NaNs.
        self.df = DataFrame(np.random.randn(N, 100))
        self.df.values[::2] = np.nan

        self.df2 = DataFrame({'A': np.arange(0, N),
                              'B': np.random.randint(0, 100, N),
                              'C': np.random.randn(N),
                              'D': np.random.randn(N)})
        self.df2.loc[1::5, 'A'] = np.nan
        self.df2.loc[1::5, 'C'] = np.nan

    def time_interpolate(self, downcast):
        self.df.interpolate(downcast=downcast)

    def time_interpolate_some_good(self, downcast):
        self.df2.interpolate(downcast=downcast)
Example No. 42
    def test_interp_basic(self):
        df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan],
                        'C': [1, 2, 3, 5], 'D': list('abcd')})
        expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9],
                              'C': [1, 2, 3, 5], 'D': list('abcd')})
        result = df.interpolate()
        assert_frame_equal(result, expected)

        result = df.set_index('C').interpolate()
        expected = df.set_index('C')
        expected.A.loc[3] = 3
        expected.B.loc[5] = 9
        expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64)

        assert_frame_equal(result, expected)
Example No. 43
def test_interpolate():
    """Ensure that DataFrame.interpolate(method='nearest') has the
    desired properties.

    It is used by blocks-plot and should:

    * interpolate missing/NaN datapoints between valid ones
    * not replace any NaN before/after the first/last finite datapoint
    """
    skip_if_not_available(modules=['pandas'])
    y = [nan, nan, 2., 3., nan, 5, nan, nan]
    df = DataFrame(y)
    df_ = df.interpolate(method='nearest')[0]

    assert all(isfinite(df_[2:6]))
    assert all(~isfinite(df_[0:2]))
    assert all(~isfinite(df_[6:8]))
Example No. 44
    def test_interp_ignore_all_good(self):
        # GH
        df = DataFrame({'A': [1, 2, np.nan, 4],
                        'B': [1, 2, 3, 4],
                        'C': [1., 2., np.nan, 4.],
                        'D': [1., 2., 3., 4.]})
        expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float64'),
                              'B': np.array([1, 2, 3, 4], dtype='int64'),
                              'C': np.array([1., 2., 3, 4.], dtype='float64'),
                              'D': np.array([1., 2., 3., 4.], dtype='float64')})

        result = df.interpolate(downcast=None)
        assert_frame_equal(result, expected)

        # all good
        result = df[['B', 'D']].interpolate(downcast=None)
        assert_frame_equal(result, df[['B', 'D']])
Example No. 45
    def test_interp_various(self):
        tm._skip_if_no_scipy()

        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        df = df.set_index('C')
        expected = df.copy()
        result = df.interpolate(method='polynomial', order=1)

        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923076
        assert_frame_equal(result, expected)

        result = df.interpolate(method='cubic')
        # GH #15662.
        # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
        # previously `splmake` was used. See scipy/scipy#6710
        if _is_scipy_ge_0190:
            expected.A.loc[3] = 2.81547781
            expected.A.loc[13] = 5.52964175
        else:
            expected.A.loc[3] = 2.81621174
            expected.A.loc[13] = 5.64146581
        assert_frame_equal(result, expected)

        result = df.interpolate(method='nearest')
        expected.A.loc[3] = 2
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method='quadratic')
        if _is_scipy_ge_0190:
            expected.A.loc[3] = 2.82150771
            expected.A.loc[13] = 6.12648668
        else:
            expected.A.loc[3] = 2.82533638
            expected.A.loc[13] = 6.02817974
        assert_frame_equal(result, expected)

        result = df.interpolate(method='slinear')
        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923077
        assert_frame_equal(result, expected)

        result = df.interpolate(method='zero')
        expected.A.loc[3] = 2.
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)
Example No. 46
    def test_interp_various(self):
        _skip_if_no_scipy()
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        df = df.set_index('C')
        expected = df.copy()
        result = df.interpolate(method='polynomial', order=1)

        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923076
        assert_frame_equal(result, expected)

        result = df.interpolate(method='cubic')
        expected.A.loc[3] = 2.81621174
        expected.A.loc[13] = 5.64146581
        assert_frame_equal(result, expected)

        result = df.interpolate(method='nearest')
        expected.A.loc[3] = 2
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method='quadratic')
        expected.A.loc[3] = 2.82533638
        expected.A.loc[13] = 6.02817974
        assert_frame_equal(result, expected)

        result = df.interpolate(method='slinear')
        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923077
        assert_frame_equal(result, expected)

        result = df.interpolate(method='zero')
        expected.A.loc[3] = 2.
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method='quadratic')
        expected.A.loc[3] = 2.82533638
        expected.A.loc[13] = 6.02817974
        assert_frame_equal(result, expected)
Example No. 47
    def test_interp_various(self):
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        df = df.set_index('C')
        expected = df.copy()
        result = df.interpolate(method='polynomial', order=1)

        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923076
        assert_frame_equal(result, expected)

        result = df.interpolate(method='cubic')
        # GH #15662.
        expected.A.loc[3] = 2.81547781
        expected.A.loc[13] = 5.52964175
        assert_frame_equal(result, expected)

        result = df.interpolate(method='nearest')
        expected.A.loc[3] = 2
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method='quadratic')
        expected.A.loc[3] = 2.82150771
        expected.A.loc[13] = 6.12648668
        assert_frame_equal(result, expected)

        result = df.interpolate(method='slinear')
        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923077
        assert_frame_equal(result, expected)

        result = df.interpolate(method='zero')
        expected.A.loc[3] = 2.
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)
Example No. 48
 def test_rowwise_alt(self):
     df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
                     1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
     df.interpolate(axis=0)
Example No. 49
 def test_interp_nan_idx(self):
     df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
     df = df.set_index('A')
     with pytest.raises(NotImplementedError):
         df.interpolate(method='values')