def test_pipe_forecast():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [np.random.rand(1000)]
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('ridge', Ridge())])
    forecast_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000)]
    forecast_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    forecast_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    forecast_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    forecast_test(pipe, X, y)

    # cross val
    Xt = np.array([np.random.rand(1000, 10)] * 5)
    Xc = np.random.rand(5, 3)
    X = TS_Data(Xt, Xc)
    y = np.array([np.random.rand(1000)] * 5)
    cross_validate(pipe, X, y, cv=3)

    X = pd.DataFrame(Xc)
    Xt = [np.random.rand(1000, 10)] * 5
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    cross_validate(pipe, X, y, cv=3)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    classifier_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)
import numpy as np
from sklearn.linear_model import LinearRegression

from seglearn.pipe import Pype
from seglearn.split import temporal_split, TemporalKFold
from seglearn.transform import FeatureRep, Segment, last

# for a single time series, we need to make it a list
X = [np.arange(10000) / 100.]
y = [np.sin(X[0]) * X[0] * 3 + X[0] * X[0]]

# split the data along the time axis (our only option since we have only 1 time series)
X_train, X_test, y_train, y_test = temporal_split(X, y)

# setting y_func = last selects the last value from each y segment as the target
# other options include transform.middle, or you can make your own function
# see the API documentation for further details
pipe = Pype([('seg', Segment(width=200, overlap=0.5, y_func=last)),
             ('features', FeatureRep()),
             ('lin', LinearRegression())])

# fit and score
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
print("Score: ", score)

# generate some predictions
ytr, ytr_p = pipe.transform_predict(X_train, y_train)  # training predictions
yte, yte_p = pipe.transform_predict(X_test, y_test)  # test predictions
xtr = np.arange(len(ytr))  # segment number
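# A minimal sketch (not from the original) of a custom y_func: like
# seglearn's `last`, it receives the segmented targets as an array of shape
# (n_segments, width) and must return one target value per segment.
# The name `seg_max` and the choice of the segment-wise maximum are
# hypothetical.
def seg_max(y):
    return np.max(y, axis=1)

pipe_max = Pype([('seg', Segment(width=200, overlap=0.5, y_func=seg_max)),
                 ('features', FeatureRep()),
                 ('lin', LinearRegression())])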
                  metrics=['accuracy'])
    return model


# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=8,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
gamma_chosen = 0.05994  # grid1.best_params_["gamma"]
n_estimators = 20

# clf = Pype([('segment', SegmentXY(width=chosenwidth, step=1)),  # in this context what is the difference with SegmentX?
#             ('features', FeatureRep(fts)),
#             ('scaler', StandardScaler()),
#             ('rf', RandomForestClassifier(n_estimators=20))], scorer=scorer1)
clf = Pype([
    ('segment', SegmentXY(width=chosenwidth, step=1)),  # in this context what is the difference with SegmentX?
    ('features', FunctionTransformer(reshape_all)),
    ('scaler', StandardScaler()),
    ('bagg', OneVsRestClassifier(
        BaggingClassifier(SVC(kernel='rbf',
                              gamma=gamma_chosen,
                              C=C_chosen,
                              probability=True,
                              class_weight='balanced'),
                          max_samples=1.0 / n_estimators,
                          warm_start=True,
                          n_estimators=n_estimators,
                          n_jobs=6,
                          verbose=10)))
])  # scorer=scorer1

X_train, X_test, y_train, y_test, matlab_train, matlab_test = train_test_split(
    new_features_seg_included, new_labels_seg_included,
    new_matlab_seg_included, test_size=0.10,
# Single univariate time series with 10 samples
X = [np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5],
               [5, 6], [6, 7], [7, 8], [8, 9], [9, 10]])]

# Time series target (imbalanced towards False)
y = [np.array([True, False, False, False, False,
               False, True, False, False, False])]

print("Implementation details: transform and fit_transform methods:")
pipe = Pype([
    ('segment', Segment(width=1, overlap=0)),
    ('resample', patch_sampler(RandomUnderSampler)()),
])
print("Pipeline:", pipe)

print("Calling a transform on the data does not change it ...")
Xf, yf = pipe.transform(X, y)
print("X (flattened):", Xf.flatten())
print("y", yf)

print("... but calling fit_transform resamples the data.")
Xf, yf = pipe.fit_transform(X, y)
print("X (flattened):", Xf.flatten())
print("y", yf)
print()

print("VerboseDummyClassifier example:")
    X.append(group[['lat', 'lon', '方向', '速度']].values)
    y.append(group['type'].values[0])
    id_list.append(int(ship_id))

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y[:len(train_df_list)])
X_train = X[:len(train_df_list)]
X_test = X[len(train_df_list):]

kf = KFold(n_splits=5, random_state=42, shuffle=True)
model_v1_list = []
score_v1_list = []
for train_index, test_index in kf.split(X_train):
    model_v1 = Pype([('segment', SegmentX(width=10)),
                     ('features', FeatureRep()),
                     ('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(n_estimators=100,
                                                   random_state=42))])
    model_v1.fit(np.array(X_train)[train_index], y_train[train_index])
    model_v1_list.append(model_v1)
    y_pred = []
    for test_sample in np.array(X_train)[test_index]:
        result = model_v1.predict_proba([test_sample])
        pred = np.argmax(np.sum(result, axis=0) / result.shape[0])
        y_pred.append(pred)
    score_v1_list.append(f1_score(y_train[test_index], y_pred,
                                  average='macro'))

print(score_v1_list)
print(np.mean(score_v1_list), np.std(score_v1_list))
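# A minimal sketch (hypothetical helper, not in the original) of the
# aggregation used above: average the per-segment class probabilities
# returned by the pipeline and take the argmax as the series-level label.
def predict_series(model, series):
    proba = model.predict_proba([series])  # one probability row per segment
    return np.argmax(proba.mean(axis=0))

# usage, e.g.: pred = predict_series(model_v1, test_sample)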
# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX(order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also keep one parameter always equal to another by setting its
# value to the name of the parameter to track (this is an extension to
# sklearn)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
            'seg__overlap': [0.],
            'crnn__width': ['seg__width']}
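# A minimal sketch of searching this grid, assuming the standard sklearn
# GridSearchCV API together with the temporal folds computed above.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, par_grid, cv=cv)
grid.fit(Xs, ys)
print("best params: ", grid.best_params_)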
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, PadTrunc

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline with PadTrunc segmentation
# the time series are between 20-40 seconds long
# this truncates them all to the first 5 seconds (sampling rate is 50 Hz)
pipe = Pype([('trunc', PadTrunc(width=250)),
             ('features', FeatureRep()),
             ('scaler', StandardScaler()),
             ('svc', LinearSVC())])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
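# A sanity check (assumed, not in the original): series durations in seconds
# at the stated 50 Hz sampling rate, versus the 250-sample (5 s) truncation.
import numpy as np

durations = np.array([len(xi) for xi in X]) / 50.
print("duration range (s): ", durations.min(), "-", durations.max())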
# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# I am adding in a column to represent time (50 Hz sampling), since my data
# doesn't include it
# the Interp class assumes time is the first column in the series
X = np.array([np.column_stack([np.arange(len(X[i])) / 50., X[i]])
              for i in np.arange(len(X))])

clf = Pype([('interp', Interp(1. / 25., categorical_target=True)),
            ('segment', Segment(width=100)),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)
def test_pipe_transformation():
    # SegmentX transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # SegmentXY transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # Forecast transform pipe
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # PadTrunc transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)
def test_pipe_regression():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [np.random.rand(1000)]
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('ridge', Ridge())])
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000)]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # cross val
    Xt = np.array([np.random.rand(1000, 10)] * 5)
    Xc = np.random.rand(5, 3)
    X = TS_Data(Xt, Xc)
    y = np.array([np.random.rand(1000)] * 5)
    cross_validate(pipe, X, y, cv=3)

    # transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
def test_pipe_PadTrunc():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
pipe = Pype([('seg', Segment(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
data = load_watch()
X = data['X']
y = data['y']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=10,
                                      batch_size=256, verbose=0,
                                      validation_split=0.2))])

##############################################
# Accessing training history
##############################################

# this is a bit of a hack, because the history object is returned by the
# keras wrapper when fit is called
#
# this approach won't work with a more complex estimator pipeline, in which
# case a callable class with the desired properties should be made and
# passed to build_fn

pipe.fit(X_train, y_train)

print(DataFrame(pipe.history.history))
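# A minimal sketch (hypothetical, not from this snippet) of the callable
# class mentioned above: the instance keeps a reference to each model it
# builds, so the Keras training history stays reachable after fitting even
# inside a more complex pipeline.
class ModelBuilder(object):
    def __call__(self, **build_params):
        self.model = crnn_model(**build_params)  # crnn_model defined earlier
        return self.model

builder = ModelBuilder()
# KerasClassifier(build_fn=builder, ...); after fitting, the history would
# be reachable via builder.model.history.history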
train_df = pd.concat(train_df_list)
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
all_df = pd.concat([train_df])

X = []
y = []
id_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '速度', '方向', 'time']])
    y.append(group['type'].values[0])
    id_list.append(ship_id)
print(len(id_list))

pype = Pype([('segment', SegmentX(width=72, overlap=0.1))])
pype = pype.fit(X, y)

shape_list = []
df_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    sample = group[['lat', 'lon', '速度', '方向', 'time']].values
    transform_result = pype.transform([sample])[0]
    # series shorter than the segment width produce no segments;
    # fall back to keeping the whole series
    if transform_result.shape[0] == 0:
        seg_df = pd.DataFrame(sample,
                              columns=['lat', 'lon', '速度', '方向', 'time'])
        seg_df['渔船ID'] = len(df_list)
        seg_df['type'] = group['type'].values[0]
        df_list.append(seg_df)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, SegmentX

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', SegmentX()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)

# now let's add some contextual data
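# A minimal sketch (assumed, not part of the original snippet) of attaching
# contextual data: TS_Data pairs each series with per-series context, as in
# the tests elsewhere in this repo. The two random context variables here
# are placeholders.
Xc = np.random.rand(len(X), 2)  # hypothetical per-series context
Xs = TS_Data(X, Xc)
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.25)
clf.fit(X_train, y_train)
print("Accuracy score with context: ", clf.score(X_test, y_test))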
##############################################
# SETUP
##############################################

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
steps = [('seg', Segment()),
         ('features', FeatureRep()),
         ('scaler', StandardScaler()),
         ('rf', RandomForestClassifier(n_estimators=20))]
pipe = Pype(steps)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

##############################################
# OPTION 1: Use the Pype score method
##############################################

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
print("Accuracy score: ", score)
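##############################################
# OPTION 2 (assumed continuation, not in the original snippet):
# sklearn's cross_validate works directly on the Pype, as in the tests above
##############################################

from sklearn.model_selection import cross_validate

results = cross_validate(pipe, X, y, cv=3)
print("CV test scores: ", results['test_score'])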
# remember for a single time series, we need to make a list
X = [X]
y = [y]

# split the data along the time axis (our only option since we have only 1 time series)
X_train, X_test, y_train, y_test = temporal_split(X, y, test_size=0.25)

# create a feature representation pipeline
# setting y_func = last, and forecast = 200 makes us predict the value of y
# 200 samples ahead of the segment
# other reasonable options for y_func are ``mean``, ``all`` (or create your own function)
# see the API documentation for further details
clf = Pype([('segment', SegmentXYForecast(width=200, overlap=0.5,
                                          y_func=last, forecast=200)),
            ('features', FeatureRep()),
            ('lin', LinearRegression())])

# fit and score
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Score: ", score)

# generate some predictions
y, y_p = clf.transform_predict(X, y)  # all predictions
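# A minimal sketch (assuming matplotlib is available) of visualizing the
# forecast: per-segment actual vs predicted targets over segment index.
import numpy as np
import matplotlib.pyplot as plt

xs = np.arange(len(y))  # segment number
plt.plot(xs, y, label='actual')
plt.plot(xs, y_p, label='predicted')
plt.legend()
plt.show()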
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', Segment()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)

# let's make a pretend series with different activities
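# A minimal sketch of the "pretend series" idea (assumed continuation, not
# from the original snippet): stack a few test series into one long
# recording and predict a label per segment.
X_series = [np.concatenate(X_test[:3], axis=0)]
yp = clf.predict(X_series)  # one prediction per segment
print("N segment predictions: ", len(yp))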
def generate_result():
    train_path = '/tcdata/hy_round2_train_20200225'
    test_path = '/tcdata/hy_round2_testB_20200312'

    train_df_list = []
    for file_name in os.listdir(train_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(train_path, file_name))
            train_df_list.append(df)
    test_df_list = []
    for file_name in os.listdir(test_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(test_path, file_name))
            test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)
    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')
    all_df = pd.concat([train_df, test_df])

    X = []
    y = []
    id_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        X.append(group[['lat', 'lon', '速度', '方向', 'time']])
        y.append(group['type'].values[0])
        id_list.append(ship_id)
    print(len(id_list))

    pype = Pype([('segment', SegmentX(width=72, overlap=0.0))])
    pype = pype.fit(X, y)

    # segment each series; series shorter than the segment width produce no
    # segments, so fall back to keeping the whole series
    shape_list = []
    df_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        sample = group[['lat', 'lon', '速度', '方向', 'time']].values
        transform_result = pype.transform([sample])[0]
        if transform_result.shape[0] == 0:
            seg_df = pd.DataFrame(sample,
                                  columns=['lat', 'lon', '速度', '方向', 'time'])
            seg_df['渔船ID'] = len(df_list)
            seg_df['type'] = group['type'].values[0]
            df_list.append(seg_df)
            shape_list.append(1)
        else:
            for seg in transform_result:
                seg_df = pd.DataFrame(seg,
                                      columns=['lat', 'lon', '速度', '方向', 'time'])
                seg_df['渔船ID'] = len(df_list)
                seg_df['type'] = group['type'].values[0]
                df_list.append(seg_df)
            shape_list.append(transform_result.shape[0])

    new_all_df = pd.concat(df_list, sort=False)
    new_all_df.to_csv('help.csv', index=False)
    new_all_df = pd.read_csv('help.csv')

    df = new_all_df.drop(columns=['type'])
    extracted_df = extract_features(df, column_id='渔船ID',
                                    column_sort='time', n_jobs=8,
                                    kind_to_fc_parameters=fc_parameters_v2)

    new_df = new_all_df.groupby('渔船ID').agg(x_min=('lat', 'min'),
                                             x_max=('lat', 'max'),
                                             y_min=('lon', 'min'),
                                             y_max=('lon', 'max'))
    extracted_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    extracted_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    extracted_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    extracted_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']
    extracted_df['slope'] = extracted_df['y_max-y_min'] / np.where(
        extracted_df['x_max-x_min'] == 0, 0.001, extracted_df['x_max-x_min'])
    extracted_df['area'] = (extracted_df['x_max-x_min'] *
                            extracted_df['y_max-y_min'])

    def get_feature(arr):
        feature = [np.max(arr),
                   np.quantile(arr, 0.9),
                   np.quantile(arr, 0.1),
                   np.quantile(arr, 0.75),
                   np.quantile(arr, 0.25),
                   np.mean(arr),
                   np.std(arr),
                   np.median(arr),
                   np.std(arr) / np.mean(arr)]
        feature.append(np.corrcoef(np.array([arr[:-1], arr[1:]]))[0, 1])
        feature.append(skew(arr))
        feature.append(kurtosis(arr))
        return feature

    features = []
    for _, group in new_all_df.groupby('渔船ID'):
        group = group.sort_values(by=['time'])
        lat = group['lat'].values
        lon = group['lon'].values
        time_ = pd.to_datetime(group['time'],
                               format='%Y-%m-%d %H:%M:%S').values
        dire = group['方向'].values

        speed_list = []
        for i in range(lat.shape[0]):
            if i == 0:
                continue
            hour = (time_[i] - time_[i - 1]) / np.timedelta64(1, 'h')
            dist = geodesic((lat[i - 1], lon[i - 1]), (lat[i], lon[i]))
            speed_list.append(dist.km / hour)

        c = np.sum(np.cos(dire / 180 * np.pi)) / group.shape[0]
        s = np.sum(np.sin(dire / 180 * np.pi)) / group.shape[0]
        r = np.sqrt(c**2 + s**2)
        theta = np.arctan(s / c)
        angle_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        turn_list = []
        for i in range(dire.shape[0]):
            if i == 0:
                continue
            turn = 1 - np.cos(dire[i - 1] / 180 * np.pi -
                              dire[i] / 180 * np.pi)
            turn_list.append(turn * np.pi)
        turn_list = np.array(turn_list)
        c = np.sum(np.cos(turn_list)) / (group.shape[0] - 1)
        s = np.sum(np.sin(turn_list)) / (group.shape[0] - 1)
        r = np.sqrt(c**2 + s**2)
        theta = np.arctan(s / c)
        turn_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        features.append(np.concatenate([get_feature(speed_list),
                                        angle_feature[:1],
                                        turn_feature[:1]]))

    extracted_df_ = pd.concat([pd.DataFrame(np.array(features)),
                               extracted_df], axis=1)

    y = []
    for _, group in new_all_df.groupby('渔船ID'):
        y.append(group.iloc[0]['type'])

    train_df = extracted_df_.iloc[:np.sum(shape_list[:len(train_df_list)])]
    test_df = extracted_df_.iloc[np.sum(shape_list[:len(train_df_list)]):]
    y_train = y[:train_df.shape[0]]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    train_df['type'] = le.inverse_transform(y_train)
    train_df.to_csv('./train.csv')
    test_df.to_csv('./test.csv')

    train_df = pd.read_csv('./train.csv', index_col=0)
    X_train = train_df.drop(columns=['type']).values
    y_train = train_df['type'].values
    test_df = pd.read_csv('./test.csv', index_col=0)
    X_test = test_df.values

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(
        pd.DataFrame(X_train).replace([np.inf, -np.inf], np.nan).values)
    X_test = imputer.fit_transform(
        pd.DataFrame(X_test).replace([np.inf, -np.inf], np.nan).values)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    def get_model():
        exported_pipeline = make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=48),
            StackingEstimator(
                estimator=SGDClassifier(alpha=0.01, eta0=0.01,
                                        fit_intercept=False, l1_ratio=0.25,
                                        learning_rate="invscaling",
                                        loss="modified_huber",
                                        penalty="elasticnet", power_t=10.0)),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.6000000000000001,
                                 min_samples_leaf=1, min_samples_split=3,
                                 n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_model_v2():
        exported_pipeline = make_pipeline(
            make_union(
                make_pipeline(
                    make_union(FunctionTransformer(copy),
                               FunctionTransformer(copy)),
                    SelectPercentile(score_func=f_classif, percentile=18)),
                FunctionTransformer(copy)),
            StackingEstimator(
                estimator=SGDClassifier(alpha=0.01, eta0=0.1,
                                        fit_intercept=False, l1_ratio=1.0,
                                        learning_rate="constant",
                                        loss="hinge", penalty="elasticnet",
                                        power_t=0.1)),
            VarianceThreshold(threshold=0.05),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.55, min_samples_leaf=1,
                                 min_samples_split=4, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_data(shape_idx):
        start_idx = int(np.sum(shape_list[:shape_idx]))
        end_idx = start_idx + shape_list[shape_idx]
        if shape_idx < len(train_df_list):
            return X_train[start_idx:end_idx], y_train[start_idx:end_idx]
        else:
            # X_test restarts at row 0, so offset by the total train rows
            offset = int(np.sum(shape_list[:len(train_df_list)]))
            return X_test[start_idx - offset:end_idx - offset], None

    kf = KFold(n_splits=5, random_state=2019, shuffle=True)
    model_v1_list = []
    score_v1_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v1 = get_model()
        model_v1.fit(train_data, y_data)
        model_v1_list.append(model_v1)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v1.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v1_list.append(score)
    print(score_v1_list)
    print(np.mean(score_v1_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v2_list = []
    score_v2_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v2 = get_model_v2()
        model_v2.fit(train_data, y_data)
        model_v2_list.append(model_v2)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v2.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v2_list.append(score)
    print(score_v2_list)
    print(np.mean(score_v2_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v3_list = []
    score_v3_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v3 = RandomForestClassifier(bootstrap=False,
                                          criterion="entropy",
                                          max_features=0.1,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          n_estimators=100)
        model_v3.fit(train_data, y_data)
        model_v3_list.append(model_v3)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v3.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v3_list.append(score)
    print(score_v3_list)
    print(np.mean(score_v3_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v4_list = []
    score_v4_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v4 = ExtraTreesClassifier(bootstrap=False,
                                        criterion="entropy",
                                        max_features=0.6000000000000001,
                                        min_samples_leaf=1,
                                        min_samples_split=3,
                                        n_estimators=100)
        model_v4.fit(train_data, y_data)
        model_v4_list.append(model_v4)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v4.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v4_list.append(score)
    print(score_v4_list)
    print(np.mean(score_v4_list))

    pred = []
    for i in range(len(train_df_list), len(shape_list)):
        start_idx = int(np.sum(shape_list[len(train_df_list):i]))
        sample = X_test[start_idx:start_idx + shape_list[i]]
        result = []
        for model in model_v1_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v2_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v3_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v4_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        # average over the 4 models x 5 folds = 20 probability vectors
        pred.append(np.sum(result, axis=0) / 20)

    pd.DataFrame(pred, index=id_list[len(train_df_list):]).to_csv(
        './probaresult.csv', header=None)
# load the data
data = load_watch()
X = data['X']
y = data['y']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

# create a segment learning pipeline
pipe = Pype([('seg', Segment(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=4,
                                      batch_size=256, verbose=0,
                                      validation_split=0.2))])

##############################################
# Accessing training history
##############################################

# this is a bit of a hack, because the history object is returned by the
# keras wrapper when fit is called
#
# this approach won't work with a more complex estimator pipeline, in which
# case a callable class with the desired properties should be made and
# passed to build_fn

pipe.fit(X_train, y_train)

history = pipe.history.history
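# A minimal sketch (assuming matplotlib is available) of plotting the
# captured history; 'val_loss' is present because validation_split=0.2.
import matplotlib.pyplot as plt

plt.plot(history['loss'], label='train loss')
plt.plot(history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.legend()
plt.show()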