def grid_search_analysis(filename):
    ##
    logger = logging.getLogger(__name__)
    ##
    logger.debug('Read and prepare cross validation score data')
    grid_search_data = pandas.read_csv(os.path.join(dt.output_dir(), filename),
                                       header=None, index_col=None)
    grid_search_data = grid_search_data.iloc[:, :-1]
    grid_search_data.columns = ['learning_rate', 'n_estimators', 'score']
    # pull the numeric part out of each cell's text representation
    grid_search_data.loc[:, 'score'] = grid_search_data.loc[:, 'score'].apply(
        lambda x: float(*re.findall(r'[0.]\d+', str(x))))
    grid_search_data.loc[:, 'learning_rate'] = grid_search_data.loc[:, 'learning_rate'].apply(
        lambda x: float(*re.findall(r'[0.]\d+', str(x))))
    grid_search_data.loc[:, 'n_estimators'] = grid_search_data.loc[:, 'n_estimators'].apply(
        lambda x: int(*re.findall(r'\d+', str(x))))
    ##
    logger.debug('Return table sorted by score')
    result = grid_search_data.sort_values(by='score')
    print(result)
    ##
    logger.debug('Analytics')
    # groupby over two columns needs a list; a tuple is treated as a single key
    print(grid_search_data.groupby(['n_estimators', 'learning_rate']).mean()
          .sort_values('score'))
    return result
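# A minimal sketch of the cell parsing above, on a made-up cell value (the
# sample string is hypothetical, not taken from an actual grid-search file):
# the character class [0.] plus float(*findall(...)) recovers the decimal
# fraction embedded in the cell text.
import re

cell = "learning_rate=0.123"              # hypothetical raw cell text
matches = re.findall(r'[0.]\d+', cell)    # -> ['.123']
assert float(*matches) == 0.123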
def train_ada_boost_classifier(x_train, y_train, x_test, y_test, max_depth,
                               class_weight, n_estimators, learning_rate_lower,
                               learning_rate_upper, learning_rate_num, criterion,
                               machines, comment='AdaBoostClassifier'):
    logger = logging.getLogger(__name__)
    rs = numpy.random.RandomState(12357)
    ##
    logger.info('<--Spec model parameters-->')
    learning_rate = numpy.logspace(learning_rate_lower, learning_rate_upper,
                                   learning_rate_num)
    model = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=max_depth, criterion=criterion,
                               class_weight=class_weight, random_state=rs))
    param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
    # shuffle=True is required when a random_state is passed to StratifiedKFold
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=rs)
    score = make_scorer(f1_score, average='micro')
    ##
    logger.info('<--Start grid search over n_estimators and learning_rate-->')
    grid_search = GridSearchCV(model, param_grid, scoring=score, n_jobs=machines,
                               cv=kfold, verbose=3)
    opt_model = grid_search.fit(x_train, y_train.values.flatten())
    logger.info("Best score: [{:f}] using [{}]".format(opt_model.best_score_,
                                                       opt_model.best_params_))
    ##
    logger.info('<--Make prediction and write out-->')
    prediction = opt_model.predict(x_test)
    # f1_score expects (y_true, y_pred); micro averaging is symmetric, but keep
    # the conventional argument order
    prediction_score = f1_score(y_test.values.flatten(), prediction, average='micro')
    logger.info('Check prediction score on validation set := [{:f}]'.format(prediction_score))
    output = pandas.Series(prediction, name='y')
    output.to_csv(os.path.join(dt.output_dir(), 'ABC_{:s}.csv'.format(comment)),
                  index=True, header=['y'], index_label=['id'])
    ##
    from sklearn.metrics import confusion_matrix
    # confusion_matrix also expects (y_true, y_pred)
    print(confusion_matrix(y_test.values.flatten(), prediction, labels=[0, 1, 2]))
    return prediction, opt_model
def train(eeg1, eeg2, emg, labels, validate_size, epochs, label, type):
    # row-normalise each signal and stack the three channels along the last axis
    eeg1_ = df_row_norm(eeg1.fillna(0)).values
    eeg2_ = df_row_norm(eeg2.fillna(0)).values
    emg_ = df_row_norm(emg.fillna(0)).values
    labels_ = (labels - 1).values
    X = np.dstack((eeg1_, eeg2_, emg_))
    Y = labels_
    # hold the last validate_size epochs out as a validation set
    x_validate = X[-validate_size:, :].copy()
    y_validate = Y[-validate_size:, :].copy()
    x_train = X[:-validate_size, :]
    y_train = Y[:-validate_size, :]
    model_path = os.path.join(dt.output_dir(), "%s.h5" % label)
    model = train_model(x_train, y_train, x_validate, y_validate, epochs=epochs,
                        type=type, model_path=model_path)
    return model
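# df_row_norm is a project-local helper that is not shown in this file. A
# minimal sketch of the per-row standardisation it plausibly performs; this is
# an assumption for illustration, not the project's actual implementation:
def df_row_norm_sketch(df):
    """Standardise each row of a DataFrame to zero mean and unit variance."""
    return df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0)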
def run(self):
    with open(os.path.join(dt.output_dir(), 'grid_diag'), 'a') as f:
        while True:
            (cexp, gexp) = self.job_queue.get()
            if cexp is WorkerStopToken:
                # push the stop token back so sibling workers also terminate
                self.job_queue.put((cexp, gexp))
                # print('worker {0} stop.'.format(self.name))
                break
            try:
                c, g = None, None
                if cexp is not None:
                    c = 2.0**cexp
                if gexp is not None:
                    g = 2.0**gexp
                rate = self.run_one(c, g, f)
                if rate is None:
                    raise RuntimeError('get no rate')
            except:
                # we failed; requeue the job for another worker and quit
                traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1],
                                          sys.exc_info()[2])
                self.job_queue.put((cexp, gexp))
                sys.stderr.write('worker {0} quit.\n'.format(self.name))
                break
            else:
                self.result_queue.put((self.name, cexp, gexp, rate))
def predict(X_test_A, X_test_B, model, type, weights, label):
    num_samples = X_test_A.shape[0]
    # both halves of the test set must have the same number of samples
    assert X_test_B.shape[0] == num_samples
    if type == ModelType.CNN_LSTM:
        assert len(model.input_shape) == 4
        time_steps = TIME_STEPS
        assert model.input_shape[1] == time_steps
        assert num_samples % time_steps == 0, \
            "total number of samples must divide by number of time steps"
        # group consecutive epochs into sequences of length time_steps
        X_test_A = X_test_A.reshape(
            (int(num_samples / time_steps), time_steps, *X_test_A.shape[1:]))
        X_test_B = X_test_B.reshape(
            (int(num_samples / time_steps), time_steps, *X_test_B.shape[1:]))
    y_score_A = model.predict(X_test_A).reshape(
        num_samples, model.output_shape[-1]) * weights
    y_score_B = model.predict(X_test_B).reshape(
        num_samples, model.output_shape[-1]) * weights
    y_score = np.concatenate((y_score_A, y_score_B))
    y_test = np.argmax(y_score, axis=1)
    result = pd.Series(y_test)
    expected = [0.526, 0.418, 0.0548]
    for i in range(3):
        print("class expected/realized class ratio [%s]: [%s/%s]" %
              (i, expected[i], sum(result == i) / len(result)))
    print("")
    # classes are stored 0-based internally but submitted 1-based
    result += 1
    result.index.name = 'Id'
    result.name = 'y'
    pd.DataFrame(result).to_csv(os.path.join(dt.output_dir(), "%s.csv" % label))
    return y_score
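# A small shape check of the CNN_LSTM reshape above, with hypothetical sizes:
# samples are grouped into windows of TIME_STEPS consecutive epochs. The exact
# TIME_STEPS value and the per-epoch signal length are defined elsewhere in the
# project; 8 and 512 here are assumptions for illustration only.
import numpy as np

TIME_STEPS = 8                              # assumed for this sketch
X = np.zeros((21600, 512, 3))               # (samples, signal length, channels)
X_seq = X.reshape((21600 // TIME_STEPS, TIME_STEPS, *X.shape[1:]))
assert X_seq.shape == (2700, 8, 512, 3)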
def main():
    test, train = read_data()
    ##
    yCols = ['y']
    xCols = list(set(train.columns).difference(yCols))
    assert set(yCols).intersection(xCols) == set(), \
        "there is a non-trivial intersection between yCols {} and xCols {}".format(
            " ".join(yCols), " ".join(xCols))
    ##
    betas = pandas.Series(dr.ridge_regression(X=train[xCols], y=train[yCols],
                                              lambdaParam=0).flatten(),
                          index=xCols)
    ##
    predictY = pandas.Series(test[xCols].dot(betas),
                             index=numpy.arange(10000, 12000))
    predictY.to_csv(os.path.join(dt.output_dir(), 'task0_solution.csv'),
                    index=True, header=yCols, index_label=['Id'])
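# dr.ridge_regression is a project-local routine not shown here. A minimal
# sketch of the standard closed form it presumably implements (with
# lambdaParam=0 this reduces to ordinary least squares); the function name and
# signature below are assumptions mirroring the call above:
import numpy

def ridge_regression_sketch(X, y, lambdaParam):
    """Solve (X'X + lambda * I) beta = X'y for beta."""
    X, y = numpy.asarray(X), numpy.asarray(y)
    p = X.shape[1]
    return numpy.linalg.solve(X.T @ X + lambdaParam * numpy.eye(p), X.T @ y)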
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
    begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
    pathname -- set gnuplot executable path and name
    "null"   -- do not plot
-out {pathname | "null"} : (default dataset.out)
    pathname -- set output file path and name
    "null"   -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file
    (default pathname is dataset.out)
    This is experimental. Try this option only if some parameters have been
    checked for the SAME data.
svm_options : additional options for svm-train""")
    sys.exit(1)


if len(sys.argv) < 2:
    exit_with_help()
dataset_pathname = sys.argv[-1]
options = sys.argv[1:-1]
try:
    with open(os.path.join(dt.output_dir(), 'grid_diag'), 'a') as f:
        f.write("START\n")
    find_parameters(dataset_pathname, options)
except (IOError, ValueError) as e:
    sys.stderr.write(str(e) + '\n')
    sys.stderr.write('Try "grid.py" for more information.\n')
    sys.exit(1)
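# A hypothetical invocation of this grid.py variant, built only from the flags
# documented in the help text above (file names are placeholders):
#
#   python grid.py -log2g 3,-15,-2 -v 5 -svmtrain ./svm-train \
#       -gnuplot null -out svm_train.scale.out svm_train.scale
#
# i.e. search gamma over 2^3 ... 2^-15 in steps of 2^-2 with 5-fold cross
# validation, skip plotting, and append progress to the 'grid_diag' file.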
def train_svm_classifier(x_train, y_train, x_test, y_test, machines,
                         c_penalty_lower, c_penalty_upper, c_penalty_num,
                         g_lower, g_upper, g_num, class_weight, kernel,
                         comment='SVMC'):
    ##
    logger = logging.getLogger(__name__)
    rs = numpy.random.RandomState(12357)
    ##
    support_vector_machine_classifier = SVC(gamma='scale', kernel=kernel,
                                            class_weight=class_weight)
    ##
    logger.info('<--Spec model parameters-->')
    if (g_lower is None) and (g_upper is None) and (g_num is None):
        # no gamma range given: keep gamma='scale' and search over C only
        c_penalty = numpy.logspace(c_penalty_lower, c_penalty_upper, c_penalty_num)
        param_grid = dict(C=c_penalty)
    else:
        c_penalty = numpy.logspace(c_penalty_lower, c_penalty_upper, c_penalty_num)
        gamma = numpy.logspace(g_lower, g_upper, g_num)
        param_grid = dict(C=c_penalty, gamma=gamma)
    # shuffle=True is required when a random_state is passed to StratifiedKFold
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=rs)
    score = make_scorer(f1_score, average='micro')
    ##
    logger.info('<--Start grid search over C and gamma-->')
    grid_search = GridSearchCV(support_vector_machine_classifier, param_grid,
                               scoring=score, n_jobs=machines, cv=kfold, verbose=3)
    opt_model = grid_search.fit(x_train, y_train.values.flatten())
    logger.info("Best score: [{:f}] using [{}]".format(opt_model.best_score_,
                                                       opt_model.best_params_))
    ##
    logger.info('<--Make prediction and write out-->')
    prediction = opt_model.predict(x_test)
    # f1_score expects (y_true, y_pred)
    prediction_score = f1_score(y_test.values.flatten(), prediction, average='micro')
    logger.info('Check prediction score on validation set := [{:f}]'.format(prediction_score))
    output = pandas.Series(prediction, name='y')
    output.to_csv(os.path.join(dt.output_dir(), 'SVM_{:s}.csv'.format(comment)),
                  index=True, header=['y'], index_label=['id'])
    print(confusion_matrix(y_test.values.flatten(), prediction,
                           labels=list(range(len(set(y_test.y.values))))))
    return prediction, opt_model
import os

import matplotlib.pyplot as plt
import pandas as pd

import data as dt

grid_search = pd.read_csv(os.path.join(dt.output_dir(),
                                       'svm_train.scale_grid_search'),
                          header=None, index_col=None)
# columns are presumably log2(C), log2(gamma) and the cross-validation rate
x = grid_search.iloc[:, 0]
y = grid_search.iloc[:, 1]
z = grid_search.iloc[:, 2]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z)
plt.show()
print("DONE")
def regression(seed, start, end, step, cv=3, comment=''):
    logger = logging.getLogger(__name__)
    ##
    logger.info('read provided data')
    X_test, X_train, y_train = read_data()
    std_train, std_test = transform_data(X_train=X_train, X_test=X_test)
    ##
    # mark values further than 3.5 standard deviations from the mean as missing
    removed = 0
    for col in std_train.columns:
        data = std_train[col].copy()
        mask = numpy.abs(data) > data.mean() + 3.5 * data.std()
        std_train.loc[mask, col] = numpy.NaN
        removed += sum(mask)
        del data, mask
    logger.info('removed a total of [{}] elements'.format(removed))
    ##
    if True:
        logger.info('fill NaN with 1e-3, close to the mean of the standardized variables')
        std_train.fillna(1e-3, inplace=True)
        std_test.fillna(1e-3, inplace=True)
    elif False:
        # disabled alternative: impute via a linear regression model X_i = f(y)
        logger.info('fill NaN with linear regression model of X_i = f(y)')
        std_train = clean_data(predictors=std_train_temp, response=y_train,
                               clean_mode=CLEAN_MODE.RESPONSE)
        std_test.fillna(0.0, inplace=True)
        std_test = std_test.reindex(columns=choose)
        del choose
    ##
    logger.info('feature engineering')
    base_columns = std_train.copy().columns
    base_train = std_train.copy()
    base_test = std_test.copy()
    # squared features
    names = base_columns + '_sq'
    train_sq = base_train.pow(2)
    train_sq.columns = names
    std_train = pandas.concat([std_train, train_sq], axis=1)
    test_sq = base_test.pow(2)
    test_sq.columns = names
    std_test = pandas.concat([std_test, test_sq], axis=1)
    # sine features
    names = base_columns + '_sin'
    train_sin = numpy.sin(base_train)
    train_sin.columns = names
    std_train = pandas.concat([std_train, train_sin], axis=1)
    test_sin = numpy.sin(base_test)
    test_sin.columns = names
    std_test = pandas.concat([std_test, test_sin], axis=1)
    ##
    logger.info('use lasso regression with custom set of lambda parameters')
    alphas = seed**numpy.arange(start, end, step)
    logger.info('alpha parameters := {}'.format(
        str(["{0:0.2f}".format(i) for i in alphas]).replace("'", "")))
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train.values, y_train.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    pred = model_cv.predict(std_test)
    resid = y_train.values.flatten() - model_cv.predict(std_train)
    ##
    logger.info('plotting of first stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('first stage')
    ax1.plot(resid, 'bo')
    # 1.64 standard deviations is roughly the one-sided 95% normal quantile
    tau = numpy.mean(resid) + 1.64 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    ax2.set_title('Fitted vs. Actual')
    ##
    logger.info('use second lasso regression, removing large error inducing observations')
    std_train_ = std_train[~mask]
    y_train_ = y_train[~mask]
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train_.values, y_train_.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    pred = model_cv.predict(std_test)
    resid = y_train_.values.flatten() - model_cv.predict(std_train_)
    ##
    logger.info('plotting of second stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('second stage')
    ax1.plot(resid, 'bo')
    tau = numpy.mean(resid) + 1.6 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    # take the square root so the reported number actually is the RMSE
    ax2.set_title('Fitted vs. Actual, RMSE := {:.6f}'.format(
        numpy.sqrt(mean_squared_error(y_train, model_cv.predict(std_train)))))
    ##
    logger.info('write to pandas Series object')
    write_to_file = pandas.Series(pred, index=X_test.index.astype(int), name='y')
    write_to_file.to_csv(os.path.join(dt.output_dir(),
                                      'task1_solution_{}.csv'.format(comment)),
                         index=True, header=['y'], index_label=['id'])
    grid_search = GridSearchCV(model, param_grid, scoring="balanced_accuracy",
                               n_jobs=4, cv=kfold)
    opt_ada_boost_params = grid_search.fit(x_train, y_train.flatten())
    logger.info("Best: [{:f}] using [{}]".format(opt_ada_boost_params.best_score_,
                                                 opt_ada_boost_params.best_params_))
    return opt_ada_boost_params


#######################################################################
if __name__ == '__main__':
    root = logging.getLogger(__name__)
    root.setLevel(logging.INFO)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    root.addHandler(ch)
    ##
    user = '******'
    logger = logging.getLogger(__name__)
    opt_params = cluster_tester(n=100, k=10)
    pandas.Series([1, 2, 3, 4], index=[1, 2, 3, 4]).to_csv(
        os.path.join(dt.output_dir(), 'euler_cluster_test_out.csv'))
    logger.info('{} <Job Done>'.format(user))
def main():
    N = 21600            # number of epochs per subject
    validate_size = 2000
    epochs = 50
    type = ModelType.CNN_LSTM

    ###################################
    ### Read train data and fit models
    ###################################
    eeg1 = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_eeg1.csv'),
                       header=0, index_col=0)
    eeg2 = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_eeg2.csv'),
                       header=0, index_col=0)
    emg = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_emg.csv'),
                      header=0, index_col=0)
    labels = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_labels.csv'),
                         header=0, index_col=0)

    ##########################
    ### subject one model
    start = 0
    end = N
    label = 'subject1_%s_%s_epochs' % (type, epochs)
    subject1_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##########################
    ### subject two model
    start = N
    end = N * 2
    label = 'subject2_%s_%s_epochs' % (type, epochs)
    subject2_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##########################
    ### subject three model (the last 500 epochs are dropped)
    start = N * 2
    end = N * 3 - 500
    label = 'subject3_%s_%s_epochs' % (type, epochs)
    subject3_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##############################################
    ### Models fitted, read test data and predict
    ##############################################
    eeg1_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'test_eeg1.csv'),
                            header=0, index_col=0)
    eeg2_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'test_eeg2.csv'),
                            header=0, index_col=0)
    emg_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'test_emg.csv'),
                           header=0, index_col=0)
    # the test set holds two subjects: A is the first N epochs, B the rest
    eeg1_test_A = df_row_norm(eeg1_test.iloc[:N, :].fillna(0)).values
    eeg2_test_A = df_row_norm(eeg2_test.iloc[:N, :].fillna(0)).values
    emg_test_A = df_row_norm(emg_test.iloc[:N, :].fillna(0)).values
    eeg1_test_B = df_row_norm(eeg1_test.iloc[N:, :].fillna(0)).values
    eeg2_test_B = df_row_norm(eeg2_test.iloc[N:, :].fillna(0)).values
    emg_test_B = df_row_norm(emg_test.iloc[N:, :].fillna(0)).values
    X_test_A = np.dstack((eeg1_test_A, eeg2_test_A, emg_test_A))
    X_test_B = np.dstack((eeg1_test_B, eeg2_test_B, emg_test_B))

    #################################
    ### subject one model prediction
    label = 'subject_1_%s_weighted_%s_epochs' % (type, epochs)
    y_subject1_score = predict(X_test_A, X_test_B, model=subject1_model,
                               type=type, weights=[1, 0.5, 2.5], label=label)

    #################################
    ### subject two model prediction
    label = 'subject_2_%s_weighted_%s_epochs' % (type, epochs)
    y_subject2_score = predict(X_test_A, X_test_B, model=subject2_model,
                               type=type, weights=[1, 0.5, 2.0], label=label)

    ###################################
    ### subject three model prediction
    label = 'subject_3_%s_weighted_%s_epochs' % (type, epochs)
    y_subject3_score = predict(X_test_A, X_test_B, model=subject3_model,
                               type=type, weights=[1, 0.5, 4.5], label=label)

    ##################################
    ### all subjects model prediction (equal-weight ensemble)
    label = 'all_subjects_%s_%s_epochs' % (type, epochs)
    y_score = (y_subject1_score * 0.33 + y_subject2_score * 0.33 +
               y_subject3_score * 0.33)
    y_test = np.argmax(y_score, axis=1)
    result = pd.Series(y_test)
    expected = [0.526, 0.418, 0.0548]
    for i in range(3):
        print("class expected/realized class ratio [%s]: [%s/%s]" %
              (i, expected[i], sum(result == i) / len(result)))
    print("")
    result += 1
    result.index.name = 'Id'
    result.name = 'y'
    pd.DataFrame(result).to_csv(os.path.join(dt.output_dir(), "%s.csv" % label))

    ##################################
    ### all subjects model prediction (class re-weighted ensemble of
    ### subjects one and three; subject two is excluded)
    label = 'all_subjects_%s_weighted_%s_epochs' % (type, epochs)
    y_score = (y_subject1_score * 0.5 + y_subject2_score * 0 +
               y_subject3_score * 0.5) * [1.5, 0.8, 1.6]
    y_test = np.argmax(y_score, axis=1)
    result = pd.Series(y_test)
    expected = [0.526, 0.418, 0.0548]
    for i in range(3):
        print("class expected/realized class ratio [%s]: [%s/%s]" %
              (i, expected[i], sum(result == i) / len(result)))
    print("")
    result += 1
    result.index.name = 'Id'
    result.name = 'y'
    pd.DataFrame(result).to_csv(os.path.join(dt.output_dir(), "%s.csv" % label))
    print("DONE")
def analyze_all_tram_lines():
    import math
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_squared_error
    logger = logging.getLogger(__name__)
    ##
    logger.info('<--Read tram delay data-->')
    zvv = pandas.read_hdf(os.path.join('data', 'zvv_all_tram_lines.h5'))
    ##
    logger.info('<--Read weather data & adjust for outliers-->')
    weather = dt.get_iac_weather_data()
    q = 3  # weather.rain.quantile(0.99975)
    mask = weather.rain < q
    weather = weather[mask]
    del mask, q
    ##
    logger.info('<--Pre-process tram delay data-->')
    zvv.loc[:, 'diff'] = zvv.ist_an_von - zvv.soll_an_von
    zvv.loc[:, 'time'] = pandas.to_datetime(zvv.soll_an_von.astype(float),
                                            errors='coerce', unit='s')
    zvv.time = zvv.time.dt.strftime('%H:%M')
    zvv.loc[:, 'datetime'] = pandas.to_datetime(zvv.datum_von.astype(str) + ' ' + zvv.time)
    zvv.datetime = zvv.datetime.dt.round('60min')
    ##
    logger.info('<--Extract weather measures-->')
    weather.loc[:, 'datetime'] = weather.index.round('60min')
    resampleSumWeatherByHour = weather.resample('H').sum()
    resampleMeanWeatherByHour = weather.resample('H').mean()
    # treat precipitation at sub-zero air temperature as snow
    maskSnow = resampleMeanWeatherByHour.T_air < 0
    feature = resampleMeanWeatherByHour.rain * maskSnow.astype(int)
    ##
    logger.info('<--Compute de-seasoning for all tram lines-->')
    container = []
    for line in numpy.sort(numpy.setdiff1d(zvv.linie.unique(), [753, 29])):
        ##
        logger.info('<--Compute groupby sum on datetime for tram line %i-->' % line)
        transport = zvv[zvv.linie == line].copy()
        transport.set_index('datetime', drop=True, inplace=True)
        transport.index = pandas.to_datetime(transport.index)
        transport = transport.groupby(transport.index).sum()
        # remove weekly seasonality by differencing against the same hour one
        # week earlier
        timeDelta = datetime.timedelta(days=7)
        temp = transport['diff'].copy() - transport['diff'].shift(freq=timeDelta)
        weeklyDetrendedtram = temp.dropna(how='all', axis=0)
        weeklyDetrendedtram = weeklyDetrendedtram.interpolate()
        del timeDelta, temp
        ##
        logger.info('<--Combine line %i features into new data-frame-->' % line)
        window = 6
        combine = [
            weeklyDetrendedtram.rolling(window=window).mean(),
            resampleSumWeatherByHour.rain.rolling(window=window).mean(),
            pandas.Series(feature.rolling(window=window).mean(), name='snow'),
            pandas.Series(resampleMeanWeatherByHour.T_air.rolling(window=window).mean(),
                          name='temp')
        ]
        df = pandas.concat(combine, axis=1).dropna(how='any')
        df.loc[:, 'weekday'] = df.index.dayofweek
        df.loc[:, 'hour'] = df.index.hour
        mask = (df['diff'] > 0)
        df = df[mask]
        corr = df.corr()
        logger.info('<--1. Categorical features -> one-hot encoder-->')
        data = df.sort_values(['weekday', 'hour'])
        encoder = OneHotEncoder()
        categoricalFeatures = ['weekday', 'hour']
        # documents the column order the encoder produces (not used further)
        encoderFeatureOrder = [
            *data.weekday.unique(),
            *data.hour.unique(),
        ]
        enc = encoder.fit(data.loc[:, categoricalFeatures])
        categoricalData = enc.transform(data.loc[:, categoricalFeatures])
        logger.info('<--2. Ordinal features -> no transform-->')
        target = ['diff']
        # drop the one-hot encoded columns before collecting the ordinal ones
        data = data.drop(columns=categoricalFeatures)
        ordinalFeatures = data.columns.difference(target)
        logger.info('<--3. Regression-->')
        trainX = numpy.hstack([data.loc[:, ordinalFeatures].values,
                               categoricalData.todense()])
        trainY = data.loc[:, target].values.flatten()
        reg = LinearRegression(fit_intercept=True)
        reg.fit(X=trainX, y=trainY)
        predict = reg.predict(X=trainX)
        logger.info('<--4. Results & Plot-->')
        a, b = numpy.polyfit(trainY, predict, deg=1)
        f = lambda x: a * x + b
        fig, ax = plt.subplots(1)
        ax.scatter(y=trainY, x=predict, color='red', marker='x')
        ax.plot(predict, f(predict))
        ax.set_aspect('equal')
        ax.grid(True)
        ax.set_ylabel('Actual - delay')
        ax.set_xlabel('Predicted - delay')
        ax.set_title('Linear Regression Model - Line %i' % line)
        r2 = r2_score(trainY, predict)
        mse = mean_squared_error(trainY, predict)
        r2Patch = mpatches.Patch(color='blue', label='R^2 %.4f' % r2)
        rmsePatch = mpatches.Patch(color='blue', label='RMSE %.4f' % math.sqrt(mse))
        plt.legend(handles=[r2Patch, rmsePatch])
        plt.savefig(os.path.join(dt.output_dir(), 'line_%i_prediction.png' % line))
        logger.info('<--5. Correlation structure-->')
        print(df.corr())
        logger.info('<--6. Save summary statistics to file-->')
        stats = pandas.Series(data=corr.loc['diff', :], name=line)
        stats['r2'] = r2
        stats['mse'] = mse
        container.append(stats)
    pandas.concat(container, axis=1).to_csv(
        os.path.join(dt.output_dir(), 'correlation_all_tram_lines.csv'))
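# A small illustration of the one-hot step above, on a toy frame (values made
# up): OneHotEncoder turns the two categorical columns into one binary
# indicator column per distinct weekday/hour value.
import pandas
from sklearn.preprocessing import OneHotEncoder

toy = pandas.DataFrame({'weekday': [0, 1, 1], 'hour': [8, 8, 17]})
enc = OneHotEncoder().fit(toy)
print(enc.transform(toy).todense())   # 3 rows x (2 weekday + 2 hour) columns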
    axis = 0
    ax[axis].plot(yData.index, yData)
    ax[axis].set_ylabel('Delay [s]')
    axis += 1
    ax[axis].bar(xData.index, height=xData, width=0.05, color='green')
    ax[axis].set_xlabel('YYYY-MM-DD:HH')
    ax[axis].set_ylabel('Precipitation [mm]')
    plt.tight_layout()
    fig.savefig(os.path.join(dt.output_dir(), 'delay_vs_rainfall.png'))
    del mask, xData, yData

    '''
    Description: Scatter plot between AVERAGE TEMPERATURE and DELAYS
    '''
    xData = averageWeatherDelays.reindex(index=weeklySeasoned.index)['temp_degrees_c_mittel']
    yData = weeklySeasoned['diff']
    plt.figure()
    plt.scatter(x=xData, y=yData, marker='x')
    plt.xlabel('AVG TEMPERATURE [C]')
    plt.ylabel('DE-SEASONED DELAY [s]')
    plt.tight_layout()
    cmd = '{0} "{1}" "{2}" "{3}"'.format(svmpredict_exe, scaled_test_file,
                                         model_file, predict_test_file)
    print('Testing...')
    print("run [%s]" % cmd)
    Popen(cmd, shell=True).communicate()
    print('Output prediction: {0}'.format(predict_test_file))
    return predict_test_file


#######################################################################
if __name__ == "__main__":
    train_file = os.path.join(dt.output_dir(), 'svm_train')
    validate_file = os.path.join(dt.output_dir(), 'svm_validate')
    all_train_file = os.path.join(dt.output_dir(), 'svm_all_train')
    test_file = os.path.join(dt.output_dir(), 'svm_test')
    result_file = os.path.join(dt.output_dir(), 'svm_result')
    # nrows = 100
    nrows = 3030 + 443 + 1474 + 170
    y = pd.read_csv(os.path.join(dt.data_dir(), 'task3', 'y_train.csv'),
                    header=0, index_col=0, nrows=nrows)
    X_fft = pd.read_csv(os.path.join(dt.data_dir(), 'task3', 'X_train_fft.csv'),