from ay_hw_3.util_data import load_data_and_label, get_all_datasets_path
from ay_hw_3.util_statistic import gen_statistic_result
from ay_hw_3._global import ROOT_PATH

import pandas as pd
import numpy as np
import pprint

if __name__ == "__main__":
    # Build one statistics row per dataset file.
    # DataFrame.append was deprecated and removed in pandas 2.0; collect the
    # per-file frames in a list and concatenate once at the end instead.
    allFilePaths = get_all_datasets_path(rootPath=ROOT_PATH)
    statisticItems = []
    for index, path in enumerate(allFilePaths):
        fileItem, fileLabel = load_data_and_label(path)
        statisticItems.append(gen_statistic_result(fileItem, index + 1))
    statisticResult = pd.concat(statisticItems)

    ##----Same to the main_c_ii.py-------------------
    # Bootstrap a 90% confidence interval for the standard deviation of each
    # statistic column: resample 10 rows with replacement, record the std,
    # repeat, then take the 5th/95th percentiles of the resampled stds.
    confidence_interval = {}
    for column in statisticResult.columns:
        itemCIRange = []
        # BUG FIX: range(0, 999) only produced 999 bootstrap resamples;
        # use the intended round 1000.
        for _ in range(1000):
            # Return a random sample of items from an axis of object.
            ran_sample = statisticResult[column].sample(n=10, replace=True)
            itemCIRange.append(ran_sample.std())
        # BUG FIX: np.percentile expects q as a percentage in [0, 100].
        # Passing 0.05 / 0.95 selected the 0.05th / 0.95th percentiles, i.e.
        # essentially the extreme tails, not the intended 5th / 95th bounds
        # of a 90% CI. (Pre-sorting is unnecessary; percentile handles it.)
        lowerValue = np.percentile(itemCIRange, 5)
        upperValue = np.percentile(itemCIRange, 95)
        confidence_interval[column] = [lowerValue, upperValue]
trainStaticResult = pd.DataFrame() testStaticResult = pd.DataFrame() gaussianTestErrorRateList = list() for parts in range(1, 10): for index, path in enumerate(allTrainFilePaths): trainFileItem, trainFileLabel = load_data_and_label(path, hasTime=False) gluedTrainFile = split_DF_in_parts(trainFileItem, parts=parts, needConcat=True) gluedTrainFile.columns = gen_multiple_column_name(parts=parts, hasTime=False) trainStaticResultItem = gen_statistic_result(gluedTrainFile, index + 1, hasTime=False) trainStaticResultItem["label"] = convert_label_2_num( trainFileLabel) trainStaticResult = trainStaticResult.append(trainStaticResultItem, sort=False) for index, path in enumerate(allTestFilePaths): testFileItem, testFileLabel = load_data_and_label(path, hasTime=False) gluedTestFile = split_DF_in_parts(testFileItem, parts=parts, needConcat=True) gluedTestFile.columns = gen_multiple_column_name(parts=parts, hasTime=False) testStaticResultItem = gen_statistic_result(gluedTestFile,
if __name__ == "__main__":
    # Suppress warnings unless the user explicitly passed -W on the command line.
    if not sys.warnoptions:
        warnings.simplefilter("ignore")
    allTrainFilePaths = gen_train_data_file_paths()
    # based on what the pdf said, we need to use all training data
    trainStaticResult = pd.DataFrame()
    # For each split count l = 1..20, split every training file into `parts`
    # segments, compute per-segment statistics, label rows as bending / not
    # bending, and fit a logistic regression to find significant predictors.
    for parts in range(1, 21):
        for index, path in enumerate(allTrainFilePaths):
            fileItem, fileLabel = load_data_and_label(path, hasTime=False)
            splitedDFs = split_DF_in_parts(fileItem, parts=parts, needConcat=False)
            statisticResultTemp = pd.DataFrame()
            # One statistics row per segment of this file.
            for DFItem in splitedDFs:
                staticResultTempItem = gen_statistic_result(DFItem, index + 1, hasTime=False)
                statisticResultTemp = statisticResultTemp.append(staticResultTempItem, sort=False)
            # Binary target: 1 if this file's activity label is a bending class.
            statisticResultTemp["label"] = is_bending(fileLabel)
            trainStaticResult = trainStaticResult.append(statisticResultTemp, sort=False)
        # NOTE(review): trainStaticResult is never reset between values of
        # `parts`, so each fit also sees the rows accumulated from all smaller
        # split counts — confirm this accumulation is intentional.
        # NOTE(review): gen_multiple_label(parts=1) is hard-coded while the
        # outer loop varies `parts`; looks like it should be parts=parts —
        # confirm against the helper's semantics.
        logitModel = sm.Logit(trainStaticResult['label'],
                              trainStaticResult[gen_multiple_label(parts=1)])
        # BFGS optimizer, disp=0 silences the per-fit convergence printout.
        logitModelResults = logitModel.fit(method="bfgs",disp=0)
        # ['median(1)'] ['max(5)']
        # Keep only predictors whose p-value is significant at the 5% level.
        significantVars = \
            [key for key, p_value in logitModelResults.pvalues.items() if p_value <= 0.05]
        if len(significantVars) > 0:
            print("When split all training data sets in {} times, "
                  "I got significant variables : ".format(parts), end=" ")
            print(' '.join(significantVars))
if __name__ == "__main__":
    simplefilter(action='ignore', category=FutureWarning)
    # the best l I got is 3
    bestL = 3
    selectedFeatures = ['min(5)', '3rd quart(5)', '3rd quart(7)', 'max(18)']
    allTrainFilePaths = gen_train_data_file_paths()
    # based on what the pdf said, we need to use all training data
    # DataFrame.append was deprecated and removed in pandas 2.0; collect the
    # per-file statistic rows and concatenate once.
    statisticItems = []
    for index, path in enumerate(allTrainFilePaths):
        fileItem, fileLabel = load_data_and_label(path, hasTime=False)
        gluedFile = split_DF_in_parts(fileItem, parts=bestL, needConcat=True)
        gluedFile.columns = gen_multiple_column_name(parts=bestL, hasTime=False)
        staticResultItem = gen_statistic_result(gluedFile, index + 1, hasTime=False)
        # Binary target: 1 if this file's activity label is a bending class.
        staticResultItem["label"] = is_bending(fileLabel)
        statisticItems.append(staticResultItem)
    statisticResult = pd.concat(statisticItems, sort=False)

    X_trainData = statisticResult[selectedFeatures]
    y_trainData = statisticResult['label']

    skLogitModel = LogisticRegression()
    skLogitModel.fit(X_trainData, y_trainData)
    skYPredict = skLogitModel.predict(X_trainData)
    # BUG FIX: roc_curve must be given continuous class-membership scores,
    # not the hard 0/1 output of predict(). With hard labels the "curve"
    # collapses to a single operating point and the trapezoidal AUC below is
    # meaningless. Use the positive-class probability instead.
    skYScores = skLogitModel.predict_proba(X_trainData)[:, 1]
    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_trainData, skYScores)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)
# __author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '9/27/2019 11:13 PM'

from ay_hw_3.util_data import convert_label_2_num, load_data_and_label
from ay_hw_3.util_generate import gen_test_data_file_paths
from ay_hw_3.util_statistic import gen_statistic_result
from ay_hw_3._global import FULL_COLUMNS, ROOT_PATH

if __name__ == "__main__":
    # Smoke-test the label -> number conversion for each activity name.
    for activity in ("bending1", "bending2", "cycling", "sitting", "walking"):
        print(convert_label_2_num(activity))
    # Also show what type the converter returns.
    print(type(convert_label_2_num("lying")))

    # print(gen_test_data_file_paths('.\\assets'))
    # Load one sample dataset and print its statistics row.
    dataframe, label = load_data_and_label('.\\assets\\cycling\\dataset2.csv')
    staticResultItem = gen_statistic_result(dataframe, 1)
    print(staticResultItem.to_string())