def test_pipe_forecast():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [np.random.rand(1000)]
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('ridge', Ridge())])
    forecast_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000)]
    forecast_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    forecast_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    forecast_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    forecast_test(pipe, X, y)

    # cross val
    Xt = np.array([np.random.rand(1000, 10)] * 5)
    Xc = np.random.rand(5, 3)
    X = TS_Data(Xt, Xc)
    y = np.array([np.random.rand(1000)] * 5)
    cross_validate(pipe, X, y, cv=3)

    X = pd.DataFrame(Xc)
    Xt = [np.random.rand(1000, 10)] * 5
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    cross_validate(pipe, X, y, cv=3)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    classifier_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    classifier_test(pipe, X, y)
import numpy as np
from sklearn.linear_model import LinearRegression

from seglearn.pipe import Pype
from seglearn.split import temporal_split, TemporalKFold
from seglearn.transform import FeatureRep, Segment, last

# for a single time series, we need to make it a list
X = [np.arange(10000) / 100.]
y = [np.sin(X[0]) * X[0] * 3 + X[0] * X[0]]

# split the data along the time axis (our only option since we have only 1 time series)
X_train, X_test, y_train, y_test = temporal_split(X, y)

# setting y_func = last selects the last value from each y segment as the target
# other options include transform.middle, or you can make your own function
# see the API documentation for further details
pipe = Pype([('seg', Segment(width=200, overlap=0.5, y_func=last)),
             ('features', FeatureRep()),
             ('lin', LinearRegression())])

# fit and score
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
print("Score: ", score)

# generate some predictions
ytr, ytr_p = pipe.transform_predict(X_train, y_train)  # training predictions
yte, yte_p = pipe.transform_predict(X_test, y_test)  # test predictions
xtr = np.arange(len(ytr))  # segment number
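# A minimal sketch (not from the original) of a custom y_func: like
# seglearn's `last`, it receives the segmented targets as an array of shape
# (n_segments, width) and must return one target value per segment.
# The name `seg_max` and the choice of the segment-wise maximum are
# hypothetical.
def seg_max(y):
    return np.max(y, axis=1)

pipe_max = Pype([('seg', Segment(width=200, overlap=0.5, y_func=seg_max)),
                 ('features', FeatureRep()),
                 ('lin', LinearRegression())])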
                  metrics=['accuracy'])
    return model


# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=8,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
gamma_chosen = 0.05994  # grid1.best_params_["gamma"]
n_estimators = 20

# clf = Pype([('segment', SegmentXY(width=chosenwidth, step=1)),  # in this context what is the difference with SegmentX?
#             ('features', FeatureRep(fts)),
#             ('scaler', StandardScaler()),
#             ('rf', RandomForestClassifier(n_estimators=20))], scorer=scorer1)
clf = Pype([
    ('segment', SegmentXY(width=chosenwidth, step=1)),  # in this context what is the difference with SegmentX?
    ('features', FunctionTransformer(reshape_all)),
    ('scaler', StandardScaler()),
    ('bagg', OneVsRestClassifier(
        BaggingClassifier(SVC(kernel='rbf',
                              gamma=gamma_chosen,
                              C=C_chosen,
                              probability=True,
                              class_weight='balanced'),
                          max_samples=1.0 / n_estimators,
                          warm_start=True,
                          n_estimators=n_estimators,
                          n_jobs=6,
                          verbose=10)))
])  # scorer=scorer1

X_train, X_test, y_train, y_test, matlab_train, matlab_test = train_test_split(
    new_features_seg_included, new_labels_seg_included,
    new_matlab_seg_included, test_size=0.10,
# Single univariate time series with 10 samples
X = [np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5],
               [5, 6], [6, 7], [7, 8], [8, 9], [9, 10]])]

# Time series target (imbalanced towards False)
y = [np.array([True, False, False, False, False,
               False, True, False, False, False])]

print("Implementation details: transform and fit_transform methods:")
pipe = Pype([
    ('segment', Segment(width=1, overlap=0)),
    ('resample', patch_sampler(RandomUnderSampler)()),
])
print("Pipeline:", pipe)

print("Calling a transform on the data does not change it ...")
Xf, yf = pipe.transform(X, y)
print("X (flattened):", Xf.flatten())
print("y", yf)

print("... but calling fit_transform resamples the data.")
Xf, yf = pipe.fit_transform(X, y)
print("X (flattened):", Xf.flatten())
print("y", yf)
print()

print("VerboseDummyClassifier example:")
    X.append(group[['lat', 'lon', '方向', '速度']].values)
    y.append(group['type'].values[0])
    id_list.append(int(ship_id))

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y[:len(train_df_list)])
X_train = X[:len(train_df_list)]
X_test = X[len(train_df_list):]

kf = KFold(n_splits=5, random_state=42, shuffle=True)
model_v1_list = []
score_v1_list = []
for train_index, test_index in kf.split(X_train):
    model_v1 = Pype([('segment', SegmentX(width=10)),
                     ('features', FeatureRep()),
                     ('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(n_estimators=100,
                                                   random_state=42))])
    model_v1.fit(np.array(X_train)[train_index], y_train[train_index])
    model_v1_list.append(model_v1)
    y_pred = []
    for test_sample in np.array(X_train)[test_index]:
        result = model_v1.predict_proba([test_sample])
        pred = np.argmax(np.sum(result, axis=0) / result.shape[0])
        y_pred.append(pred)
    score_v1_list.append(f1_score(y_train[test_index], y_pred,
                                  average='macro'))

print(score_v1_list)
print(np.mean(score_v1_list), np.std(score_v1_list))
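# A minimal sketch (hypothetical helper, not in the original) of the
# aggregation used above: average the per-segment class probabilities
# returned by the pipeline and take the argmax as the series-level label.
def predict_series(model, series):
    proba = model.predict_proba([series])  # one probability row per segment
    return np.argmax(proba.mean(axis=0))

# usage, e.g.: pred = predict_series(model_v1, test_sample)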
# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX(order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also keep one parameter always equal to another by setting its
# value to the name of the parameter to track (this is an extension to
# sklearn)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
            'seg__overlap': [0.],
            'crnn__width': ['seg__width']}
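# A minimal sketch of searching this grid, assuming the standard sklearn
# GridSearchCV API together with the temporal folds computed above.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, par_grid, cv=cv)
grid.fit(Xs, ys)
print("best params: ", grid.best_params_)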
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, PadTrunc

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline with PadTrunc segmentation
# the time series are between 20-40 seconds long
# this truncates them all to the first 5 seconds (sampling rate is 50 Hz)
pipe = Pype([('trunc', PadTrunc(width=250)),
             ('features', FeatureRep()),
             ('scaler', StandardScaler()),
             ('svc', LinearSVC())])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
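# A sanity check (assumed, not in the original): series durations in seconds
# at the stated 50 Hz sampling rate, versus the 250-sample (5 s) truncation.
import numpy as np

durations = np.array([len(xi) for xi in X]) / 50.
print("duration range (s): ", durations.min(), "-", durations.max())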
# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# I am adding in a column to represent time (50 Hz sampling), since my data
# doesn't include it
# the Interp class assumes time is the first column in the series
X = np.array([np.column_stack([np.arange(len(X[i])) / 50., X[i]])
              for i in np.arange(len(X))])

clf = Pype([('interp', Interp(1. / 25., categorical_target=True)),
            ('segment', Segment(width=100)),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)
def test_pipe_transformation():
    # SegmentX transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # SegmentXY transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # Forecast transform pipe
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # PadTrunc transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)
def test_pipe_regression():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [np.random.rand(1000)]
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('ridge', Ridge())])
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000)]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # cross val
    Xt = np.array([np.random.rand(1000, 10)] * 5)
    Xc = np.random.rand(5, 3)
    X = TS_Data(Xt, Xc)
    y = np.array([np.random.rand(1000)] * 5)
    cross_validate(pipe, X, y, cv=3)

    # transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
def test_pipe_PadTrunc():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10),
          np.random.rand(100, 10),
          np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
pipe = Pype([('seg', Segment(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
data = load_watch()
X = data['X']
y = data['y']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=10,
                                      batch_size=256, verbose=0,
                                      validation_split=0.2))])

##############################################
# Accessing training history
##############################################

# this is a bit of a hack, because the history object is returned by the
# keras wrapper when fit is called
#
# this approach won't work with a more complex estimator pipeline, in which
# case a callable class with the desired properties should be made and
# passed to build_fn

pipe.fit(X_train, y_train)

print(DataFrame(pipe.history.history))
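# A minimal sketch (hypothetical, not from this snippet) of the callable
# class mentioned above: the instance keeps a reference to each model it
# builds, so the Keras training history stays reachable after fitting even
# inside a more complex pipeline.
class ModelBuilder(object):
    def __call__(self, **build_params):
        self.model = crnn_model(**build_params)  # crnn_model defined earlier
        return self.model

builder = ModelBuilder()
# KerasClassifier(build_fn=builder, ...); after fitting, the history would
# be reachable via builder.model.history.history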
train_df = pd.concat(train_df_list)
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
all_df = pd.concat([train_df])

X = []
y = []
id_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '速度', '方向', 'time']])
    y.append(group['type'].values[0])
    id_list.append(ship_id)
print(len(id_list))

pype = Pype([('segment', SegmentX(width=72, overlap=0.1))])
pype = pype.fit(X, y)

shape_list = []
df_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    sample = group[['lat', 'lon', '速度', '方向', 'time']].values
    transform_result = pype.transform([sample])[0]
    # series shorter than the segment width produce no segments;
    # fall back to keeping the whole series
    if transform_result.shape[0] == 0:
        seg_df = pd.DataFrame(sample,
                              columns=['lat', 'lon', '速度', '方向', 'time'])
        seg_df['渔船ID'] = len(df_list)
        seg_df['type'] = group['type'].values[0]
        df_list.append(seg_df)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, SegmentX

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', SegmentX()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)

# now let's add some contextual data
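# A minimal sketch (assumed, not part of the original snippet) of attaching
# contextual data: TS_Data pairs each series with per-series context, as in
# the tests elsewhere in this repo. The two random context variables here
# are placeholders.
Xc = np.random.rand(len(X), 2)  # hypothetical per-series context
Xs = TS_Data(X, Xc)
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.25)
clf.fit(X_train, y_train)
print("Accuracy score with context: ", clf.score(X_test, y_test))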
##############################################
# SETUP
##############################################

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
steps = [('seg', Segment()),
         ('features', FeatureRep()),
         ('scaler', StandardScaler()),
         ('rf', RandomForestClassifier(n_estimators=20))]
pipe = Pype(steps)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

##############################################
# OPTION 1: Use the Pype score method
##############################################

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
print("Accuracy score: ", score)
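##############################################
# OPTION 2 (assumed continuation, not in the original snippet):
# sklearn's cross_validate works directly on the Pype, as in the tests above
##############################################

from sklearn.model_selection import cross_validate

results = cross_validate(pipe, X, y, cv=3)
print("CV test scores: ", results['test_score'])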
# remember for a single time series, we need to make a list
X = [X]
y = [y]

# split the data along the time axis (our only option since we have only 1 time series)
X_train, X_test, y_train, y_test = temporal_split(X, y, test_size=0.25)

# create a feature representation pipeline
# setting y_func = last, and forecast = 200 makes us predict the value of y
# 200 samples ahead of the segment
# other reasonable options for y_func are ``mean``, ``all`` (or create your own function)
# see the API documentation for further details
clf = Pype([('segment', SegmentXYForecast(width=200, overlap=0.5,
                                          y_func=last, forecast=200)),
            ('features', FeatureRep()),
            ('lin', LinearRegression())])

# fit and score
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Score: ", score)

# generate some predictions
y, y_p = clf.transform_predict(X, y)  # all predictions
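# A minimal sketch (assuming matplotlib is available) of visualizing the
# forecast: per-segment actual vs predicted targets over segment index.
import numpy as np
import matplotlib.pyplot as plt

xs = np.arange(len(y))  # segment number
plt.plot(xs, y, label='actual')
plt.plot(xs, y_p, label='predicted')
plt.legend()
plt.show()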
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', Segment()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)

# let's make a pretend series with different activities
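# A minimal sketch of the "pretend series" idea (assumed continuation, not
# from the original snippet): stack a few test series into one long
# recording and predict a label per segment.
X_series = [np.concatenate(X_test[:3], axis=0)]
yp = clf.predict(X_series)  # one prediction per segment
print("N segment predictions: ", len(yp))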
def generate_result():
    train_path = '/tcdata/hy_round2_train_20200225'
    test_path = '/tcdata/hy_round2_testB_20200312'

    train_df_list = []
    for file_name in os.listdir(train_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(train_path, file_name))
            train_df_list.append(df)
    test_df_list = []
    for file_name in os.listdir(test_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(test_path, file_name))
            test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)
    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')
    all_df = pd.concat([train_df, test_df])

    X = []
    y = []
    id_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        X.append(group[['lat', 'lon', '速度', '方向', 'time']])
        y.append(group['type'].values[0])
        id_list.append(ship_id)
    print(len(id_list))

    pype = Pype([('segment', SegmentX(width=72, overlap=0.0))])
    pype = pype.fit(X, y)

    # segment each series; series shorter than the segment width produce no
    # segments, so fall back to keeping the whole series
    shape_list = []
    df_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        sample = group[['lat', 'lon', '速度', '方向', 'time']].values
        transform_result = pype.transform([sample])[0]
        if transform_result.shape[0] == 0:
            seg_df = pd.DataFrame(sample,
                                  columns=['lat', 'lon', '速度', '方向', 'time'])
            seg_df['渔船ID'] = len(df_list)
            seg_df['type'] = group['type'].values[0]
            df_list.append(seg_df)
            shape_list.append(1)
        else:
            for seg in transform_result:
                seg_df = pd.DataFrame(seg,
                                      columns=['lat', 'lon', '速度', '方向', 'time'])
                seg_df['渔船ID'] = len(df_list)
                seg_df['type'] = group['type'].values[0]
                df_list.append(seg_df)
            shape_list.append(transform_result.shape[0])

    new_all_df = pd.concat(df_list, sort=False)
    new_all_df.to_csv('help.csv', index=False)
    new_all_df = pd.read_csv('help.csv')

    df = new_all_df.drop(columns=['type'])
    extracted_df = extract_features(df, column_id='渔船ID',
                                    column_sort='time', n_jobs=8,
                                    kind_to_fc_parameters=fc_parameters_v2)

    new_df = new_all_df.groupby('渔船ID').agg(x_min=('lat', 'min'),
                                             x_max=('lat', 'max'),
                                             y_min=('lon', 'min'),
                                             y_max=('lon', 'max'))
    extracted_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    extracted_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    extracted_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    extracted_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']
    extracted_df['slope'] = extracted_df['y_max-y_min'] / np.where(
        extracted_df['x_max-x_min'] == 0, 0.001, extracted_df['x_max-x_min'])
    extracted_df['area'] = (extracted_df['x_max-x_min'] *
                            extracted_df['y_max-y_min'])

    def get_feature(arr):
        feature = [np.max(arr),
                   np.quantile(arr, 0.9),
                   np.quantile(arr, 0.1),
                   np.quantile(arr, 0.75),
                   np.quantile(arr, 0.25),
                   np.mean(arr),
                   np.std(arr),
                   np.median(arr),
                   np.std(arr) / np.mean(arr)]
        feature.append(np.corrcoef(np.array([arr[:-1], arr[1:]]))[0, 1])
        feature.append(skew(arr))
        feature.append(kurtosis(arr))
        return feature

    features = []
    for _, group in new_all_df.groupby('渔船ID'):
        group = group.sort_values(by=['time'])
        lat = group['lat'].values
        lon = group['lon'].values
        time_ = pd.to_datetime(group['time'],
                               format='%Y-%m-%d %H:%M:%S').values
        dire = group['方向'].values

        speed_list = []
        for i in range(lat.shape[0]):
            if i == 0:
                continue
            hour = (time_[i] - time_[i - 1]) / np.timedelta64(1, 'h')
            dist = geodesic((lat[i - 1], lon[i - 1]), (lat[i], lon[i]))
            speed_list.append(dist.km / hour)

        c = np.sum(np.cos(dire / 180 * np.pi)) / group.shape[0]
        s = np.sum(np.sin(dire / 180 * np.pi)) / group.shape[0]
        r = np.sqrt(c**2 + s**2)
        theta = np.arctan(s / c)
        angle_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        turn_list = []
        for i in range(dire.shape[0]):
            if i == 0:
                continue
            turn = 1 - np.cos(dire[i - 1] / 180 * np.pi -
                              dire[i] / 180 * np.pi)
            turn_list.append(turn * np.pi)
        turn_list = np.array(turn_list)
        c = np.sum(np.cos(turn_list)) / (group.shape[0] - 1)
        s = np.sum(np.sin(turn_list)) / (group.shape[0] - 1)
        r = np.sqrt(c**2 + s**2)
        theta = np.arctan(s / c)
        turn_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        features.append(np.concatenate([get_feature(speed_list),
                                        angle_feature[:1],
                                        turn_feature[:1]]))

    extracted_df_ = pd.concat([pd.DataFrame(np.array(features)),
                               extracted_df], axis=1)

    y = []
    for _, group in new_all_df.groupby('渔船ID'):
        y.append(group.iloc[0]['type'])

    train_df = extracted_df_.iloc[:np.sum(shape_list[:len(train_df_list)])]
    test_df = extracted_df_.iloc[np.sum(shape_list[:len(train_df_list)]):]
    y_train = y[:train_df.shape[0]]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    train_df['type'] = le.inverse_transform(y_train)
    train_df.to_csv('./train.csv')
    test_df.to_csv('./test.csv')

    train_df = pd.read_csv('./train.csv', index_col=0)
    X_train = train_df.drop(columns=['type']).values
    y_train = train_df['type'].values
    test_df = pd.read_csv('./test.csv', index_col=0)
    X_test = test_df.values

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(
        pd.DataFrame(X_train).replace([np.inf, -np.inf], np.nan).values)
    X_test = imputer.fit_transform(
        pd.DataFrame(X_test).replace([np.inf, -np.inf], np.nan).values)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    def get_model():
        exported_pipeline = make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=48),
            StackingEstimator(
                estimator=SGDClassifier(alpha=0.01, eta0=0.01,
                                        fit_intercept=False, l1_ratio=0.25,
                                        learning_rate="invscaling",
                                        loss="modified_huber",
                                        penalty="elasticnet", power_t=10.0)),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.6000000000000001,
                                 min_samples_leaf=1, min_samples_split=3,
                                 n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_model_v2():
        exported_pipeline = make_pipeline(
            make_union(
                make_pipeline(
                    make_union(FunctionTransformer(copy),
                               FunctionTransformer(copy)),
                    SelectPercentile(score_func=f_classif, percentile=18)),
                FunctionTransformer(copy)),
            StackingEstimator(
                estimator=SGDClassifier(alpha=0.01, eta0=0.1,
                                        fit_intercept=False, l1_ratio=1.0,
                                        learning_rate="constant",
                                        loss="hinge", penalty="elasticnet",
                                        power_t=0.1)),
            VarianceThreshold(threshold=0.05),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.55, min_samples_leaf=1,
                                 min_samples_split=4, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_data(shape_idx):
        start_idx = int(np.sum(shape_list[:shape_idx]))
        end_idx = start_idx + shape_list[shape_idx]
        if shape_idx < len(train_df_list):
            return X_train[start_idx:end_idx], y_train[start_idx:end_idx]
        else:
            # X_test restarts at row 0, so offset by the total train rows
            offset = int(np.sum(shape_list[:len(train_df_list)]))
            return X_test[start_idx - offset:end_idx - offset], None

    kf = KFold(n_splits=5, random_state=2019, shuffle=True)
    model_v1_list = []
    score_v1_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v1 = get_model()
        model_v1.fit(train_data, y_data)
        model_v1_list.append(model_v1)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v1.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v1_list.append(score)
    print(score_v1_list)
    print(np.mean(score_v1_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v2_list = []
    score_v2_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v2 = get_model_v2()
        model_v2.fit(train_data, y_data)
        model_v2_list.append(model_v2)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v2.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v2_list.append(score)
    print(score_v2_list)
    print(np.mean(score_v2_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v3_list = []
    score_v3_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v3 = RandomForestClassifier(bootstrap=False,
                                          criterion="entropy",
                                          max_features=0.1,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          n_estimators=100)
        model_v3.fit(train_data, y_data)
        model_v3_list.append(model_v3)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v3.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v3_list.append(score)
    print(score_v3_list)
    print(np.mean(score_v3_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v4_list = []
    score_v4_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)
        model_v4 = ExtraTreesClassifier(bootstrap=False,
                                        criterion="entropy",
                                        max_features=0.6000000000000001,
                                        min_samples_leaf=1,
                                        min_samples_split=3,
                                        n_estimators=100)
        model_v4.fit(train_data, y_data)
        model_v4_list.append(model_v4)
        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v4.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v4_list.append(score)
    print(score_v4_list)
    print(np.mean(score_v4_list))

    pred = []
    for i in range(len(train_df_list), len(shape_list)):
        start_idx = int(np.sum(shape_list[len(train_df_list):i]))
        sample = X_test[start_idx:start_idx + shape_list[i]]
        result = []
        for model in model_v1_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v2_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v3_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        for model in model_v4_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) /
                          shape_list[i])
        # average over the 4 models x 5 folds = 20 probability vectors
        pred.append(np.sum(result, axis=0) / 20)

    pd.DataFrame(pred, index=id_list[len(train_df_list):]).to_csv(
        './probaresult.csv', header=None)
# load the data
data = load_watch()
X = data['X']
y = data['y']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

# create a segment learning pipeline
pipe = Pype([('seg', Segment(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=4,
                                      batch_size=256, verbose=0,
                                      validation_split=0.2))])

##############################################
# Accessing training history
##############################################

# this is a bit of a hack, because the history object is returned by the
# keras wrapper when fit is called
#
# this approach won't work with a more complex estimator pipeline, in which
# case a callable class with the desired properties should be made and
# passed to build_fn

pipe.fit(X_train, y_train)

history = pipe.history.history
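# A minimal sketch (assuming matplotlib is available) of plotting the
# captured history; 'val_loss' is present because validation_split=0.2.
import matplotlib.pyplot as plt

plt.plot(history['loss'], label='train loss')
plt.plot(history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.legend()
plt.show()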