Пример #1
0
    def test_align_multiindex(self):
        # GH#10665
        # same test cases as test_align_multiindex in test_series.py

        midx = pd.MultiIndex.from_product(
            [range(2), range(3), range(2)], names=("a", "b", "c"))
        idx = Index(range(2), name="b")
        df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
        df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)

        # these must be the same results (but flipped)
        res1l, res1r = df1.align(df2, join="left")
        res2l, res2r = df2.align(df1, join="right")

        expl = df1
        tm.assert_frame_equal(expl, res1l)
        tm.assert_frame_equal(expl, res2r)
        expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
        tm.assert_frame_equal(expr, res1r)
        tm.assert_frame_equal(expr, res2l)

        res1l, res1r = df1.align(df2, join="right")
        res2l, res2r = df2.align(df1, join="left")

        exp_idx = pd.MultiIndex.from_product(
            [range(2), range(2), range(2)], names=("a", "b", "c"))
        expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
        tm.assert_frame_equal(expl, res1l)
        tm.assert_frame_equal(expl, res2r)
        expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
        tm.assert_frame_equal(expr, res1r)
        tm.assert_frame_equal(expr, res2l)
Пример #2
0
    def test_frame_align_aware(self):
        """Check the timezone of the index produced by ``align``.

        Frames sharing a timezone must keep it; aligning objects whose
        indexes carry different timezones must yield UTC indexes, for
        frame/frame, frame/Series and Series/frame combinations.
        """
        idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
        idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
        df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
        df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
        new1, new2 = df1.align(df2)
        assert df1.index.tz == new1.index.tz
        assert df2.index.tz == new2.index.tz

        # different timezones convert to UTC

        # frame with frame
        df1_central = df1.tz_convert("US/Central")
        new1, new2 = df1.align(df1_central)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC

        # frame with Series
        new1, new2 = df1.align(df1_central[0], axis=0)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC

        # Series with frame: bind the result, otherwise the assertions below
        # only re-check the objects returned by the previous call.
        new1, new2 = df1[0].align(df1_central, axis=0)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC
Пример #3
0
    def test_frame_align_aware(self):
        """Check the timezone of the index produced by ``align``.

        Frames sharing a timezone must keep it; aligning objects whose
        indexes carry different timezones must yield UTC indexes, for
        frame/frame, frame/Series and Series/frame combinations.
        """
        idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
        idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
        df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
        df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
        new1, new2 = df1.align(df2)
        assert df1.index.tz == new1.index.tz
        assert df2.index.tz == new2.index.tz

        # different timezones convert to UTC

        # frame with frame
        df1_central = df1.tz_convert('US/Central')
        new1, new2 = df1.align(df1_central)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC

        # frame with Series
        new1, new2 = df1.align(df1_central[0], axis=0)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC

        # Series with frame: bind the result, otherwise the assertions below
        # only re-check the objects returned by the previous call.
        new1, new2 = df1[0].align(df1_central, axis=0)
        assert new1.index.tz == pytz.UTC
        assert new2.index.tz == pytz.UTC
Пример #4
0
    def to_categories(train: pd.DataFrame,
                      test: pd.DataFrame,
                      vizualize=False) -> [pd.DataFrame, pd.DataFrame]:
        """
        Encode the categorical columns of both frames and align their columns.

        Object-dtype columns of ``train`` with at most two unique values are
        label encoded in place (the encoder is fitted on ``train`` and then
        applied to the same column of ``test``). Both frames are then passed
        through ``pd.get_dummies`` and reduced to the columns they share, and
        the ``TARGET`` column is put back on the training frame.

        :param train: training frame; ``TARGET`` is read from it after encoding
        :param test: test frame
        :param vizualize: Print the head of both results for debugging purposes
        :return: the encoded training frame and the encoded test frame
        """
        label_encoder = LabelEncoder()
        for name in train:
            values = train[name]
            is_binary_text = (values.dtype == 'object'
                              and len(values.unique()) <= 2)
            if not is_binary_text:
                continue
            train[name] = label_encoder.fit_transform(values)
            test[name] = label_encoder.transform(test[name])

        encoded_train = pd.get_dummies(train)
        encoded_test = pd.get_dummies(test)

        # The inner join drops TARGET (absent from test), so stash it first.
        target = encoded_train['TARGET']
        encoded_train, encoded_test = encoded_train.align(
            encoded_test, join='inner', axis=1)
        encoded_train["TARGET"] = target

        if vizualize:
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                for title, frame in (("Train", encoded_train),
                                     ("Test", encoded_test)):
                    print(title)
                    print(frame.head(), "\n")

        return encoded_train, encoded_test
Пример #5
0
def prepare_val_features_for_predict(train_features: pd.DataFrame,
                                     val_features: pd.DataFrame):
    """Return ``val_features`` reshaped to the columns of ``train_features``.

    The validation frame is left-aligned on the training columns: columns
    unknown to the training frame are dropped, training columns missing from
    the validation frame are added, and every missing value is set to 0.
    """
    _, aligned_val = train_features.align(val_features, join="left", axis=1)
    return aligned_val.fillna(0)
Пример #6
0
 def test_align_aware(self):
     idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
     idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
     df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
     df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
     new1, new2 = df1.align(df2)
     self.assertEqual(df1.index.tz, new1.index.tz)
     self.assertEqual(df2.index.tz, new2.index.tz)
Пример #7
0
 def test_align_aware(self):
     idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
     idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
     df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
     df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
     new1, new2 = df1.align(df2)
     self.assertEqual(df1.index.tz, new1.index.tz)
     self.assertEqual(df2.index.tz, new2.index.tz)
Пример #8
0
 def test_align_aware(self):
     idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
     idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
     df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
     df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
     new1, new2 = df1.align(df2)
     self.assertEqual(df1.index.tz, new1.index.tz)
     self.assertEqual(df2.index.tz, new2.index.tz)
Пример #9
0
class LevelAlign:
    """Timings for level-based ``align`` / ``reindex`` on a 3-level MultiIndex."""

    def setup(self):
        # Levels of size 10, 100 and 100; the codes spell out their full
        # cartesian product (1e5 rows) in lexicographic order.
        level_values = [np.arange(10), np.arange(100), np.arange(100)]
        outer_codes = np.arange(10).repeat(10000)
        middle_codes = np.tile(np.arange(100).repeat(100), 10)
        inner_codes = np.tile(np.tile(np.arange(100), 100), 10)
        self.index = MultiIndex(
            levels=level_values,
            codes=[outer_codes, middle_codes, inner_codes],
        )

        row_count = len(self.index)
        self.df = DataFrame(np.random.randn(row_count, 4), index=self.index)

        # Small frame indexed by the middle level only.
        middle_level = self.index.levels[1]
        self.df_level = DataFrame(np.random.randn(100, 4), index=middle_level)

    def time_align_level(self):
        big, small = self.df, self.df_level
        big.align(small, level=1, copy=False)

    def time_reindex_level(self):
        target = self.index
        self.df_level.reindex(target, level=1)
Пример #10
0
class LevelAlign(object):
    """Timings for level-based ``align`` / ``reindex`` on a 3-level MultiIndex."""

    def setup(self):
        # Three levels (10 x 100 x 100) whose codes enumerate every
        # combination once, giving 1e5 rows.
        hundred = np.arange(100)
        codes = [
            np.arange(10).repeat(10000),
            np.tile(hundred.repeat(100), 10),
            np.tile(np.tile(hundred, 100), 10),
        ]
        levels = [np.arange(10), np.arange(100), np.arange(100)]
        self.index = MultiIndex(levels=levels, codes=codes)

        big_values = np.random.randn(len(self.index), 4)
        self.df = DataFrame(big_values, index=self.index)

        # Frame keyed by the second level only.
        small_values = np.random.randn(100, 4)
        self.df_level = DataFrame(small_values, index=self.index.levels[1])

    def time_align_level(self):
        other = self.df_level
        self.df.align(other, level=1, copy=False)

    def time_reindex_level(self):
        full_index = self.index
        self.df_level.reindex(full_index, level=1)
Пример #11
0
    def __init__(self, X: DataFrame, y):
        """Split ``X`` into training and test rows using the index of ``y``.

        Parameters
        ----------
        X : DataFrame
            Two-dimensional predictor table.
        y : Series
            One-dimensional target; a ``SparseSeries`` is densified first.

        The training pair (``_X_tr``, ``_y_tr``) holds the rows whose index
        labels occur in both ``X`` and ``y``; ``_X_te`` holds the rows of
        ``X`` whose labels do not occur in ``y``.

        Raises
        ------
        AssertionError
            If ``y`` is not a Series or the dimensions are not 2 and 1.
        """
        if isinstance(y, SparseSeries):
            y = y.to_dense()
        assert isinstance(y, Series)
        assert X.ndim == 2
        assert y.ndim == 1
        self._X_tr, self._y_tr = X.align(y, axis=0, join="inner")
        # ``.ix`` is a removed indexer; for a boolean row mask ``.loc``
        # selects exactly the same rows.
        self._X_te = X.loc[~X.index.isin(y.index), :]

        self.name = y.name
        self.n_predictors = X.shape[1]
        self.n_samples = X.shape[0]
Пример #12
0
    def test_multiindex_align_to_series_with_common_index_level(self):
        #  GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2], name="bar")

        series = Series([1, 2], index=bar_index, name="foo_series")
        df = DataFrame(
            {"col": np.arange(6)},
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )

        expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
        result_l, result_r = df.align(series, axis=0)

        tm.assert_frame_equal(result_l, df)
        tm.assert_series_equal(result_r, expected_r)
Пример #13
0
    def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(
            self):
        #  GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2], name="bar")

        series = Series([1, 2], index=bar_index, name="foo_series")
        df = DataFrame(
            np.arange(18).reshape(6, 3),
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )
        df.columns = ["cfoo", "cbar", "cfoo"]

        expected = Series([1, 2] * 3, index=df.index, name="foo_series")
        result_left, result_right = df.align(series, axis=0)

        tm.assert_series_equal(result_right, expected)
        tm.assert_index_equal(result_left.columns, df.columns)
Пример #14
0
def align_data(training_data: pd.DataFrame,
               test_data: pd.DataFrame,
               preserve=('TARGET',)) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Align the data between a training_data set and a test_data set while
    preserving all columns specified by preserve in the training_data.

    Parameters
    ------------------
        training_data (pd.DataFrame)
            data frame of the training features.  Must include the columns
            specified by the value of preserve

        test_data (pd.DataFrame)
            corresponding data frame of the test data

        preserve (str or iterable of str)
            column(s) from training_data to be preserved and restored
            after the data frames are aligned.  A single column name, a
            list or a tuple are all accepted.

    Return
    -------------------
        aligned_training_data (pd.DataFrame)
            training_data aligned with test_data and columns specified by
            preserve

        aligned_test_data (pd.DataFrame)
            test_data aligned with training_data

    Raises
    -------------------
        KeyError
            if a column named in preserve is missing from training_data
    """
    # Normalise to a list: indexing a frame with a tuple looks up a single
    # label, and a bare string must not be split into characters.
    if isinstance(preserve, str):
        preserve = [preserve]
    else:
        preserve = list(preserve)

    print(
        "Aligning training and test data before combining for feature engineering:"
    )

    preserved_features = training_data[preserve]
    # The inner join keeps only the columns present in both frames, which
    # drops the preserved columns when the test data lacks them.
    aligned_training_data, aligned_test_data = training_data.align(
        test_data, join='inner', axis=1)

    aligned_training_data[preserve] = preserved_features

    print(f"  Aligned data has {aligned_training_data.shape[1]} columns")

    return aligned_training_data, aligned_test_data
    def test_align_categorical(self, l_ordered, r_ordered, expected):
        # GH-28397
        df_1 = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(
                pd.CategoricalDtype(list("cab"), ordered=l_ordered)),
        }).set_index("B")
        df_2 = DataFrame({
            "A":
            np.arange(5, dtype="int64"),
            "B":
            Series(list("babca")).astype(
                pd.CategoricalDtype(list("cab"), ordered=r_ordered)),
        }).set_index("B")

        aligned_1, aligned_2 = df_1.align(df_2)
        assert isinstance(aligned_1.index, expected)
        assert isinstance(aligned_2.index, expected)
        tm.assert_index_equal(aligned_1.index, aligned_2.index)
Пример #16
0
    def test_align_series_combinations(self):
        df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
        s = Series([1, 2, 4], index=list("ABD"), name="x")

        # frame + series
        res1, res2 = df.align(s, axis=0)
        exp1 = DataFrame(
            {
                "a": [1, np.nan, 3, np.nan, 5],
                "b": [1, np.nan, 3, np.nan, 5]
            },
            index=list("ABCDE"),
        )
        exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")

        tm.assert_frame_equal(res1, exp1)
        tm.assert_series_equal(res2, exp2)

        # series + frame
        res1, res2 = s.align(df)
        tm.assert_series_equal(res1, exp2)
        tm.assert_frame_equal(res2, exp1)
Пример #17
0
def classify(
    features: pd.DataFrame,
    y: qiime2.CategoricalMetadataColumn,
    c: np.ndarray = None,
    weights: np.ndarray = None,
    # taxa: skbio.TreeNode = None,
    # PATH parameters :
    path: bool = True,
    path_numerical_method: str = "not specified",
    path_n_active: int = 0,
    path_nlam_log: int = 40,
    path_lamin_log: float = 1e-2,
    # CV parameters :
    cv: bool = True,
    cv_numerical_method: str = "not specified",
    cv_seed: int = 1,
    cv_one_se: bool = True,
    cv_subsets: int = 5,
    cv_nlam: int = 100,
    cv_lamin: float = 1e-3,
    cv_logscale: bool = True,
    # StabSel parameters :
    stabsel: bool = True,
    stabsel_numerical_method: str = "not specified",
    stabsel_seed: int = None,  # do something here ! for now it can be a bool !
    stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
    stabsel_true_lam: bool = True,
    stabsel_method: str = "first",
    stabsel_b: int = 50,
    stabsel_q: int = 10,
    stabsel_percent_ns: float = 0.5,
    stabsel_lamin: float = 1e-2,
    stabsel_threshold: float = 0.7,
    stabsel_threshold_label: float = 0.4,
    # might unneeded here, but needed for visualisation
    # LAMfixed parameters :
    lamfixed: bool = True,
    lamfixed_numerical_method: str = "not specified",
    lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
    lamfixed_true_lam: bool = True,
    # Formulation parameters
    huber: bool = False,
    rho: float = 0.0,
    intercept: bool = True,
) -> classo_problem:
    """Build, configure and solve a ``classo_problem`` in classification mode.

    ``features`` and the series form of ``y`` are inner-joined on their row
    labels; rows whose label is missing (NaN) are reported and left out of
    the training data.  Labels equal to the first non-missing value of ``y``
    are encoded as ``+1`` and all others as ``-1``.

    Parameters
    ----------
    features : table of predictors, one row per sample.
    y : categorical metadata column with the class labels.
    c : passed to ``classo_problem`` as ``C``.
    weights : optional per-feature weights; padded with ones when shorter
        than the number of feature columns, truncated when longer.
    path*, cv*, stabsel*, lamfixed* : each boolean switches the matching
        model-selection mode on the problem; the prefixed values are copied
        onto that mode's parameter object only when the mode is enabled.
        A non-positive ``stabsel_lam`` / ``lamfixed_lam`` selects the
        ``"theoretical"`` value.
    huber, rho, intercept : copied onto ``problem.formulation``.

    Returns
    -------
    The solved problem, with ``complete_y``, ``complete_labels`` and
    ``training_labels`` attached to ``problem.data``.
    """
    # Convert once and reuse for both the reference labels and the alignment.
    y_series = y.to_series()
    complete_y = y_series[~y_series.isna()]
    # Explicit positional access: plain ``[0]`` on a label-indexed Series
    # depends on a deprecated positional fallback.
    first_cell = complete_y.iloc[0]

    features, pdY = features.align(y_series, join="inner", axis=0)
    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y ".format(label_missing))
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]

    verfify_binary(Y)
    # Encode the class of the first label as +1 and the other class as -1.
    Y = Y == first_cell
    Y = 2 * Y - 1

    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    problem.formulation.classification = True
    problem.formulation.concomitant = False
    problem.formulation.huber = huber
    problem.formulation.rho_classification = rho
    problem.formulation.intercept = intercept
    d = X.shape[1]
    if weights is not None:
        if len(weights) < d:
            # Too few weights: the remaining features get weight 1.
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        param.lam = stabsel_lam if stabsel_lam > 0.0 else "theoretical"

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        param.lam = lamfixed_lam if lamfixed_lam > 0.0 else "theoretical"

    problem.solve()

    # Keep the encoding of every non-missing label (not only the rows used
    # for training) on the problem, using the same reference class.
    cy = complete_y.values
    problem.data.complete_y = 2 * (cy == cy[0]) - 1
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels

    return problem
Пример #18
0

# In[10]:

# Everything in `s` from position 1 onward (presumably a pandas Series
# defined in an earlier cell -- TODO confirm), then display it.
s2 = s[1:]
s2


# In[12]:

# Display the result of aligning `s1` with `s2`.
# NOTE(review): `s1` is not defined in the cells visible here -- confirm it
# exists in an earlier cell (or whether `s` was meant).
s1.align(s2)


# In[13]:

# Display `df` and `df2` aligned with an inner join (both are defined
# outside the cells visible here).
df.align(df2, join ='inner')


# In[ ]:




# In[ ]:

#filter and column selection in single statement
#frame.loc[frame['PRIMARY_DESIGN_VALUE']>20,['ARRANGEMENT_ID', 'PRIMARY_DESIGN_VALUE']]


# In[ ]:
Пример #19
0
    def test_align_broadcast_axis(self):
        """Check ``align`` between a DataFrame and a Series with
        ``broadcast_axis=1`` for the different join types and axes.

        ``df`` has two rows and columns A/B; ``ts`` has three values, so
        the joins differ in whether the third row label is kept.
        """
        # GH 13194
        # First four tests for DataFrame.align(Series)
        # For 'right' join
        df = DataFrame(np.array([[1., 2.], [3., 4.]]), columns=list('AB'))
        ts = Series([5., 6., 7.])

        # ``np.nan`` is used directly: the ``pd.np`` alias is no longer
        # available in pandas.
        result = df.align(ts, join='right', axis=0, broadcast_axis=1)
        expected1 = DataFrame(np.array([[1., 2.], [3., 4.],
                                        [np.nan, np.nan]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)

        # For 'right' join on different axis
        result = df.align(ts, join='right', axis=1, broadcast_axis=1)
        expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[5., 5.], [6., 6.],
                                        [7., 7.]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)

        # For 'left' join
        result = df.align(ts, join='left', axis=0, broadcast_axis=1)
        expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[5., 5.], [6., 6.]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)

        # For 'left' join on different axis
        result = df.align(ts, join='left', axis=1, broadcast_axis=1)
        expected1 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)

        # Series.align(DataFrame) tests, 'outer' join
        result = ts.align(df, join='outer', axis=0, broadcast_axis=1)
        expected1 = DataFrame(np.array([[5., 5.], [6., 6.], [7., 7.]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[1., 2.], [3., 4.],
                                        [np.nan, np.nan]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)

        # Series.align(DataFrame) tests, 'inner' join
        result = ts.align(df, join='inner', axis=0, broadcast_axis=1)
        expected1 = DataFrame(np.array([[5., 5.], [6., 6.]]),
                              columns=list('AB'))
        expected2 = DataFrame(np.array([[1., 2.], [3., 4.]]),
                              columns=list('AB'))
        assert_frame_equal(result[0], expected1)
        assert_frame_equal(result[1], expected2)
Пример #20
0
    def test_missing_axis_specification_exception(self):
        # Aligning a frame with a series without ``axis`` is expected to be
        # rejected with a ValueError mentioning the valid axis values.
        frame = DataFrame(np.arange(50).reshape(10, 5))
        ser = Series(np.arange(5))
        expected_message = r"axis=0 or 1"

        with pytest.raises(ValueError, match=expected_message):
            frame.align(ser)
Пример #21
0
def analyze_all(records, **conf):
    '''Analyze all codes and print the "hike" ranking.

    records: a list of (code, dataframe) pairs.
    conf: accepted for call compatibility; not used by this function.

    For every code a rate series is computed with ``change_rate`` on the
    'Close' column.  A code is printed when it is both among the 30 codes
    with the most in-band rates (strictly between 0.5 and 1.5) over the last
    30 entries and within the ``nsmallest`` selection over the last 60.

    Any exception raised while computing a rate is reported and re-raised.
    '''
    rates = DataFrame()
    for code, values in records:
        try:
            rates[code] = change_rate(values, 'Close', 1, 500)
        except Exception as e:
            print('Error in %s: %s' % (code, str(e)))
            # Bare raise keeps the original traceback.
            raise

    # test
    # for code, values in records:
    #     mean = values['Close'].tail(300).aggregate('mean')
    #     last = values['Close'].tail(1).aggregate('min')
    #     if code in kospi_code.kospi200map and last < mean * (1 - 0.1):
    #         print '<div><a href="http://finance.naver.com/item/main.nhn?code=' \
    #             + code + '">' + kospi200map[code] + '</a></div>'

    # Up-Down
    # print '=== Up-downs ==='
    # counts = DataFrame()
    # for code, rate in rates.iteritems():
    #     counts[code] = rate.gt(0).value_counts()
    # print 'Most ups:'
    # print counts.transpose()[True].nlargest(10)
    # print 'Least ups:'
    # print counts.transpose()[True].nsmallest(10)
    # print counts.transpose()[False].nlargest(10)

    def in_band(x):
        # True for rates strictly between 0.5 and 1.5.
        return 0.5 < x < 1.5

    # Hike
    print('=== Hike ranking ===')
    h1 = DataFrame()
    h2 = DataFrame()
    # h3 = DataFrame()
    # h4 = DataFrame()
    # h5 = DataFrame()
    for code, rate in rates.iteritems():
        # Per code: how many of the latest rates fall inside / outside the band.
        h1[code] = rate.tail(30).apply(in_band).value_counts()
        h2[code] = rate.tail(60).apply(in_band).value_counts()
        # h3[code] = rate.tail(300).apply(lambda x: x > 0.5).value_counts()
        # h4[code] = rate.tail(400).apply(lambda x: x > 1.0).value_counts()
        # h5[code] = rate.tail(500).apply(lambda x: x > 1.0).value_counts()
    h1 = h1.transpose()[True].nlargest(30)
    # NOTE(review): h2.size counts every cell of the counts frame, not the
    # number of codes -- confirm this is the intended cut-off.
    h2 = h2.transpose()[True].nsmallest(h2.size - 30)

    # h3 = h3.transpose()[True]
    # h3 = h3.nsmallest(h3.size - 50)
    # h4 = h4.transpose()[True].nlargest(50)
    # h5 = h5.transpose()[True].nlargest(50)

    # Keep only the codes present in both selections.
    h1, _ = h1.align(h2, axis=0, join='inner')
    # h1, _ = h1.align(h4, axis = 0, join = 'inner')
    # h1, _ = h1.align(h5, axis = 0, join = 'inner')
    for code, value in h1.iteritems():
        print('%s %s' % (code, value))