def test_shift():
    """shift() as a free function and as an expression method must agree."""
    # The two spellings must build identical expressions...
    assert str(dt.shift(f.A)) == str(f.A.shift())
    assert str(dt.shift(f[:])) == str(f[:].shift())
    # ...and evaluate identically on a real frame.
    frame = dt.Frame({"C": [2, 5, 30, 20, 10],
                      "D": [10, 8, 20, 20, 1]})
    assert_equals(frame[:, f[:].shift()], frame[:, dt.shift(f[:])])
def add7dSumColumn(table, srcColumn, newColumn):
    """Return a copy of *table* with a rolling 7-row sum of *srcColumn*
    appended as *newColumn* (current row plus the six preceding rows).

    NOTE: mutates *table* in place — NA values in *srcColumn* are replaced
    with 0 so the windowed sum never propagates missing values.  The first
    six rows of *newColumn* are NA because their window is incomplete.
    """
    src = dt.f[srcColumn]
    # Zero out missing values before summing (in-place on the caller's frame).
    table[dt.math.isna(src), src] = 0
    # sum(start=src) builds src + shift(1) + ... + shift(6), same
    # left-to-right association as the original hand-written chain.
    window_sum = sum((dt.shift(src, n=k) for k in range(1, 7)), src)
    return table[:, dt.f[:].extend({newColumn: window_sum})]
def add7dWTrendColumn(table, srcColumn, newColumn):
    """Return a copy of *table* with a smoothed 7-row trend ratio appended.

    The ratio compares |value| now against |value| seven rows earlier,
    with +5 added to numerator and denominator as a smoothing offset.
    """
    column = dt.f[srcColumn]
    magnitude = dt.math.abs(column)
    ratio = (magnitude + 5) / (dt.shift(magnitude, n=7) + 5)
    return table[:, dt.f[:].extend({newColumn: ratio})]
def test_shift_all_columns():
    """shift(f[:]) must lag every column by one row, NA-filling row 0."""
    frame = dt.Frame([[5, 9, 1], list("ABC"), [2.3, 4.4, -2.5]],
                     names=["A", "B", "D"])
    result = frame[:, shift(f[:], n=1)]
    assert_equals(result,
                  dt.Frame(A=[None, 5, 9],
                           B=[None, "A", "B"],
                           D=[None, 2.3, 4.4]))
def test_shift_group_column():
    """Shifting the grouping column itself adds a new column 'A.0'."""
    frame = dt.Frame(A=[1, 2, 1, 1, 2])
    result = frame[:, shift(f.A), by(f.A)]
    # Each group's first row has no predecessor, hence the NAs.
    assert_equals(result,
                  dt.Frame({"A": [1, 1, 1, 2, 2],
                            "A.0": [None, 1, 1, None, 2]}))
def add7dDifferenceColumn(table, srcColumn, newColumn):
    """Return a copy of *table* with the 7-row difference of *srcColumn*
    appended as *newColumn* (current value minus the value 7 rows back).

    NOTE: mutates *table* in place — NA values in *srcColumn* are replaced
    with 0 first.  The first seven rows of *newColumn* are NA because the
    shifted operand has no data there.
    """
    src = dt.f[srcColumn]
    # Zero out missing values before differencing (in-place on the caller's frame).
    table[dt.math.isna(src), src] = 0
    return table[:, dt.f[:].extend({newColumn: src - dt.shift(src, n=7)})]
def test_shift_stypes():
    """shift() on a whole Frame must handle int, float, str and bool columns."""
    frame = dt.Frame([[0, 1, 2, 3, 4],
                      [2.7, 9.4, -1.1, None, 3.4],
                      ["one", "two", "three", "FOUR", "five"],
                      [True, False, True, True, False]])
    result = shift(frame, n=1)
    assert_equals(result,
                  dt.Frame([[None, 0, 1, 2, 3],
                            [None, 2.7, 9.4, -1.1, None],
                            [None, "one", "two", "three", "FOUR"],
                            [None, True, False, True, True]]))
def test_shift_amount():
    """shift() must accept any n from -5 (lead) through +5 (lag)."""
    frame = dt.Frame(range(5))
    result = frame[:, [shift(f.C0, amount) for amount in range(-5, 6)]]
    # One expected column per shift amount; |n| >= 5 is all-NA.
    expected = dt.Frame([[None, None, None, None, None],
                         [4, None, None, None, None],
                         [3, 4, None, None, None],
                         [2, 3, 4, None, None],
                         [1, 2, 3, 4, None],
                         [0, 1, 2, 3, 4],
                         [None, 0, 1, 2, 3],
                         [None, None, 0, 1, 2],
                         [None, None, None, 0, 1],
                         [None, None, None, None, 0],
                         [None, None, None, None, None]],
                        stype=dt.int32)
    assert_equals(result, expected)
def test_shift_reduced_column():
    """Shifting a reduction broadcasts the per-group total, then lags it."""
    frame = dt.Frame(A=[1, 2, 1, 1, 2, 1], B=range(6))
    result = frame[:, shift(dt.sum(f.B)), by(f.A)]
    assert_equals(result,
                  dt.Frame(A=[1, 1, 1, 1, 2, 2],
                           B=[None, 10, 10, 10, None, 5],
                           stypes={"A": dt.int32, "B": dt.int64}))
def test_shift_wrong_signature1():
    """shift() requires exactly one positional (column/frame) argument."""
    msg = r"Function shift\(\) requires 1 positional argument"
    with pytest.raises(TypeError, match=msg):
        shift()
    with pytest.raises(TypeError, match=msg):
        shift(None)
    # Supplying only the keyword n does not satisfy the positional arg.
    with pytest.raises(TypeError, match=msg):
        shift(n=3)
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],  # {data set names : paths}
    Dict[str, dt.Frame],  # {data set names : dt frames}
    Dict[str, np.ndarray],  # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    """Build train/test frames of daily US COVID-19 counts from NYTimes data.

    Downloads cumulative US-wide case/death counts, converts them to daily
    deltas via a 1-row shift, clamps negative deltas to 0, and splits off
    the last *forecast_len* days as the test set.
    NOTE(review): presumably a Driverless AI data recipe — *X* is unused.
    """
    # define date column and forecast horizon
    date_col = 'date'
    forecast_len = 7

    # get COVID19 data from NYTimes github (network I/O — requires access)
    us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

    # produce lag of 1 unit and add as new feature for each column in the list
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # update NA lags to 0 (the very first row has no "yesterday")
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differentiating cumulative counts
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # delete columns with yesterday (shift) values
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_total[:, series_cols_to_delete]

    # set negative daily values to 0 (source data occasionally revises downward)
    us_total[f.cases_daily < 0, [f.cases_daily]] = 0
    us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

    # determine threshold to split train and test based on forecast horizon:
    # split_date is the (forecast_len+1)-th most recent date, test_date the latest
    dates = dt.unique(us_total[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_total[date_col].to_pandas()
    train = us_total[df[date_col] <= split_date, :]
    test = us_total[df[date_col] > split_date, :]

    # return [train, test] and rename dataset names as needed
    return {f"covid19_daily_{split_date}_us_train": train,
            f"covid19_daily_{test_date}_us_test": test}
def add7dAvrgColumn(table, srcColumn, newColumn):
    """Return a copy of *table* with a 7-row moving average of *srcColumn*
    appended as *newColumn* (mean of current row and the six before it).
    """
    column = dt.f[srcColumn]
    # Accumulate src + shift(1) + ... + shift(6), then divide by 7.
    window_total = column
    for lag in range(1, 7):
        window_total = window_total + dt.shift(column, n=lag)
    return table[:, dt.f[:].extend({newColumn: window_total / 7})]
def test_shift_with_by():
    """Positive n lags within each group, negative n leads, 0 is identity."""
    frame = dt.Frame(A=[1, 2, 1, 1, 2, 1, 2],
                     B=[3, 7, 9, 0, -1, 2, 1])
    result = frame[:, {"lag1": shift(f.B, 1),
                       "lag2": shift(f.B, 2),
                       "lag3": shift(f.B, 3),
                       "nolag": shift(f.B, 0),
                       "lead1": shift(f.B, -1),
                       "lead2": shift(f.B, -2),
                       "lead3": shift(f.B, -3)},
                   by(f.A)]
    assert_equals(result,
                  dt.Frame(A=[1, 1, 1, 1, 2, 2, 2],
                           lag1=[None, 3, 9, 0, None, 7, -1],
                           lag2=[None, None, 3, 9, None, None, 7],
                           lag3=[None, None, None, 3, None, None, None],
                           nolag=[3, 9, 0, 2, 7, -1, 1],
                           lead1=[9, 0, 2, None, -1, 1, None],
                           lead2=[0, 2, None, None, 1, None, None],
                           lead3=[2, None, None, None, None, None, None]))
def addShiftedColumn(table, srcColumn, newColumn, shift_rows):
    """Return a copy of *table* with *srcColumn* shifted down by
    *shift_rows* rows appended as *newColumn* (NA where no data exists).
    """
    column = dt.f[srcColumn]
    shifted = dt.shift(column, n=shift_rows)
    return table[:, dt.f[:].extend({newColumn: shifted})]
def add7dTrendColumn(table, srcColumn, newColumn):
    """Return a copy of *table* with the 7-row ratio of *srcColumn*
    (current value divided by the value 7 rows earlier) as *newColumn*.
    """
    column = dt.f[srcColumn]
    trend = column / dt.shift(column, n=7)
    return table[:, dt.f[:].extend({newColumn: trend})]
# X: datatable - primary data set # Parameters: # time_col: date/time/int - time column to order rows before the shift op # group_by_cols: list of column names - group columns # shift_cols: list of column names - columns to shift # Output: # dataset augmented with shifted columns from datatable import f, by, sort, update, shift, isna time_col = "date" group_by_cols = ["state"] shift_cols = ["cases", "deaths"] new_dataset_name = "new_dataset_name_with_shift" # produce lag of 1 unit and add as new feature for each shift column aggs = {f"{col}_yesterday": shift(f[col]) for col in shift_cols} X[:, update(**aggs), sort(time_col), by(*group_by_cols)] # update NA lags aggs = {f"{col}_yesterday": 0 for col in shift_cols} X[isna(f[f"{shift_cols[0]}_yesterday"]), update(**aggs)] aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in shift_cols} X[:, update(**aggs), sort(time_col), by(group_by_cols)] for col in shift_cols: del X[:, f[f"{col}_yesterday"]] return {new_dataset_name: X}
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],  # {data set names : paths}
    Dict[str, dt.Frame],  # {data set names : dt frames}
    Dict[str, np.ndarray],  # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    """Build train/test frames of daily per-state COVID-19 counts.

    Downloads NYTimes per-state cumulative counts, joins census population
    to derive per-100k figures, differentiates cumulative counts into daily
    deltas via a per-state 1-row shift, and splits off the last
    *forecast_len* days as the test set.
    NOTE(review): presumably a Driverless AI data recipe — *X* is unused.
    """
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # get COVID19 data from NYTimes github (network I/O — requires access)
    us_states = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

    # get states population (census estimates keyed by state name)
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop.key = "state"

    # augment data with state population figures and create adjusted case and death counts
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
    # NOTE(review): pop100k divides by 10000 despite the "100k" name —
    # looks inconsistent with the aggs above; confirm intended divisor.
    us_states[:, dt.update(pop = g.pop, pop100k = g.pop / 10000, **aggs),
              join(us_states_pop)]

    # remove rows without state defined (resulted in unmatched rows after left outer join)
    del us_states[isna(f.pop), :]

    # produce lag of 1 unit and add as new feature for each column in the list
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags to 0 (each state's first row has no "yesterday")
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differentiating cumulative counts
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # delete columns with yesterday (shift) values
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_states[:, series_cols_to_delete]

    # set negative daily values to 0 (source data occasionally revises downward)
    us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
    us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

    # determine threshold to split train and test based on forecast horizon:
    # split_date is the (forecast_len+1)-th most recent date, test_date the latest
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    # return [train, test] and rename dataset names as needed
    return {f"covid19_daily_{split_date}_by_states_train": train,
            f"covid19_daily_{test_date}_by_states_test": test}
def test_shift_expr():
    """shift() must accept an arbitrary expression, not just a column."""
    frame = dt.Frame(A=[3, 4, 5, 6], B=[-1, 2, -2, 3])
    result = frame[:, shift(f.A + f.B, n=1)]
    assert_equals(result, dt.Frame([None, 2, 6, 3]))
def test_shift_n():
    """The method form must accept n both by keyword and positionally."""
    frame = dt.Frame(a=range(10))
    via_method = frame[:, [f.a.shift(n=3), f.a.shift(-1)]]
    via_function = frame[:, [dt.shift(f.a, 3), dt.shift(f.a, -1)]]
    assert_equals(via_method, via_function)
def test_shift_frame():
    """shift() applied directly to a Frame lags all its rows."""
    frame = dt.Frame(A=range(5))
    result = shift(frame, 2)
    assert_equals(result, dt.Frame(A=[None, None, 0, 1, 2]))
def test_shift_default():
    """With no n given, shift() defaults to a lag of one row."""
    frame = dt.Frame(A=range(5))
    assert_equals(frame[:, shift(f.A)],
                  dt.Frame(A=[None, 0, 1, 2, 3]))
def test_shift_wrong_signature3():
    """Non-integer values for n must be rejected with a TypeError."""
    msg = r"Argument n in shift\(\) should be an integer"
    bad_values = ["one", 0.0, f.B, range(3), [1, 2, 3]]
    for bad_n in bad_values:
        with pytest.raises(TypeError, match=msg):
            shift(f.A, n=bad_n)
def test_shift_wrong_signature2():
    """The first argument must be a column expression or a Frame."""
    msg = r"The first argument to shift\(\) must be a column expression " \
          r"or a Frame"
    for bad_arg in [3, 12.5, "hi", dt]:
        with pytest.raises(TypeError, match=msg):
            shift(bad_arg)
def test_shift_by_with_i():
    """The i-filter is applied before grouping, so shifts see filtered rows."""
    frame = dt.Frame(A=[1, 2, 1, 2, 1, 2, 1, 2], B=range(8))
    result = frame[1:, shift(f.B), by(f.A)]
    assert_equals(result,
                  dt.Frame(A=[1, 1, 1, 2, 2, 2],
                           B=[None, 2, 4, None, 3, 5]))
def create_data(
    X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
           List[np.ndarray], pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    """Build train/test frames of daily per-state COVID-19 tracking metrics.

    Downloads the covidtracking.com daily per-state feed, drops deprecated
    fields, joins census population (via a state-code lookup) to derive
    per-100k figures, converts cumulative series to daily deltas via a
    per-state 1-row shift, validates uniqueness of (state, date), and
    splits off the last *forecast_len* days as the test set.
    NOTE(review): presumably a Driverless AI data recipe — *X* is unused.
    """
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # state codes lookup table (full state name -> two-letter code)
    us_state_codes = dt.Frame(
        code=[
            'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
            'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
            'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
            'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
            'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
            'VI', 'VA', 'WA', 'WV', 'WI', 'WY'
        ],
        state=[
            'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
            'California', 'Colorado', 'Connecticut', 'Delaware',
            'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
            'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
            'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
            'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
            'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
            'New York', 'North Carolina', 'North Dakota',
            'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
            'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
            'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
            'Virgin Islands', 'Virginia', 'Washington', 'West Virginia',
            'Wisconsin', 'Wyoming'
        ])
    us_state_codes.key = "state"

    # get states population lookup table (network I/O — requires access)
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
    )
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    # STATE > 0 drops national/region aggregate rows, keeping real states
    us_states_pop = us_states_pop[dt.f.STATE > 0, :]
    us_states_pop.key = "state"

    # join state codes and population into single lookup table, re-keyed by code
    us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
    us_states_pop.key = "code"

    # US Covid Tracking API: https://covidtracking.com/data/api
    us_states = dt.fread(
        "https://covidtracking.com/api/v1/states/daily.csv")

    # remove deprecated fields
    deprecated = [
        'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
        'grade', 'hash', 'hospitalized', 'negativeIncrease',
        'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
        'score', 'total'
    ]
    us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
    us_states.names = {'state': 'code'}

    series_cols = [
        "positive", "negative", "hospitalizedCumulative", "inIcuCumulative",
        "onVentilatorCumulative", "recovered", "death"
    ]
    aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
    # NOTE(review): pop100k divides by 10000 despite the "100k" name —
    # looks inconsistent with the aggs above; confirm intended divisor.
    us_states[:, dt.update(
        state=g.state, pop=g.pop, pop100k=g.pop / 10000, **aggs),
              join(us_states_pop)]
    # drop rows whose code had no match in the lookup (state is NA)
    us_states = us_states[~dt.isna(dt.f.state), :]

    # produce lag of 1 unit and add as new feature for each shift column
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags (each state's first row has no "yesterday")
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # daily deltas = current cumulative value minus lagged value
    aggs = {
        f"{col}_daily": f[col] - f[f"{col}_yesterday"]
        for col in series_cols
    }
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]
    # drop the intermediate lag columns
    for col in series_cols:
        del us_states[:, f[f"{col}_yesterday"]]

    # validate dataset: each (state, date) pair should appear only once
    # NOTE(review): `> 1` lets a single duplicated pair through; presumably
    # `> 0` was intended — confirm before changing.
    if us_states[:, count(), by(dt.f.state,
                                f.date)][f.count > 1, :].shape[0] > 1:
        raise ValueError(
            "Found duplicate elements for the same date and state.")

    # determine threshold to split train and test based on forecast horizon:
    # split_date is the (forecast_len+1)-th most recent date, test_date the latest
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]
    return {
        f"covidtracking_daily_{split_date}_by_us_states_train": train,
        f"covidtracking_daily_{test_date}_by_us_states_test": test
    }