from data_preparation import read_data, remove_outliers, get_dummies, split_data
from train_model import train
from logger import get_logger
from generate_report import generate_report

# Logger
logger = get_logger(__name__)

if __name__ == "__main__":
    # Read data
    df = read_data()

    # Remove outliers
    df = remove_outliers(df)

    # One-hot encode categorical features
    df = get_dummies(df)

    # Split data
    X_train, X_test, y_train, y_test = split_data(df)

    # Train model
    regLinear, score = train(X_train, y_train)

    # Evaluate model
    generate_report(regLinear, X_test, y_test, score)
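
# train() lives in train_model.py, which is not shown here. As a rough,
# assumed sketch only (not the actual source), it might wrap a scikit-learn
# LinearRegression like this:
from sklearn.linear_model import LinearRegression

def train(X_train, y_train):
    """Fit a linear regression and return the model and its training R^2."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    score = model.score(X_train, y_train)  # R^2 on the training split
    return model, score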
if __name__ == "__main__":
    #####
    # The purpose of our classifier is to predict the hostkidoutcome category
    # and the percentage of released persons.
    # Y: hostkidoutcome, nreleased
    # X: extended, iyear, gname_id, nhostkid, ndays, ransom, ransompaid, ishostkid
    #####

    ### Data filtering
    # Read data and exclude cols
    # @Snippet: To exclude:
    # lambda x: x not in ["eventid", "imonth", "iday", "attacktype2", "claims2",
    #                     "claimmode2", "claimmode3", "gname2"]
    df = prep.read_data('globalterrorismdb_0617dist.csv', usecols=[
        'nreleased', 'attacktype1', 'attacktype2', 'attacktype3', 'extended',
        'iyear', 'gname', 'nhostkid', 'nhours', 'ndays', 'ransom',
        'ransompaid', 'ransompaidus', 'ishostkid', 'hostkidoutcome'
    ])
    df = filter_data(df)
    df = augmentate_data(df)

    # The raw data also sometimes encodes unknown values as -9 or -99;
    # replace those with NaNs as well.
    df = set_unknown_to_NaN(df, [-9, -99])

    # A whole number of columns contain NaNs for missing data. Here we fill
    # the remaining NaNs with the sentinel value -1.
    df = set_NaN_to_value(df, -1)
    print(df.head())

    # Plot data
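
# An alternative to the sentinel fill above is mean imputation with
# scikit-learn. This is a minimal sketch, not part of the original pipeline;
# it assumes df holds only numeric columns at this point:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # replace each NaN with the column mean
df[df.columns] = imputer.fit_transform(df)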
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import autocorrelation_plot  # pandas.tools.plotting was removed in pandas 0.24
from pandas.plotting import lag_plot
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.ar_model import AR  # deprecated in newer statsmodels in favor of AutoReg

import submission_generator
import data_preparation

__author__ = '[email protected] (Sajad Azami)'

sns.set_style("dark")

# Load dataset
data_set = data_preparation.read_data('./data_set/HourlyDemands_2002-2016.csv')
data, label = data_preparation.split_label(data_set, 'Ontario Demand')
print('Data set Loaded!')
print(data.shape)
print(label.shape)

# Plot 2 weeks (336 hours) of data points
line = np.linspace(0, 336, 336)
plt.plot(line, label[0:336])
plt.xlabel('Hour')
plt.ylabel('Power Demand')
plt.title('Power Demand of first 14 days')
plt.show()

# Plot the lag plot of the target feature
lag_plot(label)
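
# The AR import above suggests an autoregressive baseline later in the script.
# A minimal sketch of that idea (assumed, not from the original fragment),
# using the modern statsmodels AutoReg API:
from statsmodels.tsa.ar_model import AutoReg

train_y, test_y = label[:-336], label[-336:]            # hold out the last 2 weeks
ar_model = AutoReg(np.asarray(train_y), lags=24).fit()  # 24-hour lag structure
preds = ar_model.predict(start=len(train_y), end=len(train_y) + len(test_y) - 1)
print('AR MAE:', mean_absolute_error(test_y, preds))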
# Imports required by the model definitions below
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from xgboost import XGBClassifier

models.append(('DT', DecisionTreeClassifier(min_samples_leaf=4, min_samples_split=13, splitter='best')))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(C=10, gamma='scale', kernel='rbf', probability=True)))
models.append(('ADB', AdaBoostClassifier(n_estimators=20)))
models.append(('RF', RandomForestClassifier(criterion='gini', max_depth=6, min_samples_leaf=1,
                                            min_samples_split=2, n_estimators=100)))
models.append(('GPC', GaussianProcessClassifier(kernel=RBF(length_scale=1))))
models.append(('XGB', XGBClassifier(booster='gbtree', colsample_bylevel=1, learning_rate=0.001,
                                    max_depth=6, min_child_weight=5, n_estimators=700,
                                    objective='binary:logistic')))

path = "E:/project/models/"
data_path = "E:/project/training_data.csv"

data, labels = read_data(data_path)
train_models(models, data, labels)
save_models(models, path)
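
# train_models() and save_models() are project helpers not shown in this
# fragment. A minimal sketch of what they might do (an assumption, not the
# actual source), persisting each fitted estimator with joblib:
import os
import joblib

def train_models(models, data, labels):
    # Fit every (name, estimator) pair in place on the full training set.
    for name, model in models:
        model.fit(data, labels)

def save_models(models, path):
    # Serialize each fitted estimator to <path>/<name>.joblib.
    for name, model in models:
        joblib.dump(model, os.path.join(path, name + '.joblib'))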
def encode_field(dataframe_train, dataframe_test, col_name):
    encoder = preprocessing.LabelEncoder()
    for i in col_name:
        # Fill missing values in this column with its mode (most frequent value).
        dataframe_train[i] = dataframe_train[i].fillna(
            dataframe_train[i].value_counts().index[0])
        dataframe_test[i] = dataframe_test[i].fillna(
            dataframe_test[i].value_counts().index[0])
        # Fit on train and test values together so labels unseen in train
        # do not make transform() fail on the test set.
        encoder.fit(pd.concat([dataframe_train[i], dataframe_test[i]]).values)
        dataframe_train[i] = encoder.transform(dataframe_train[i].values)
        dataframe_test[i] = encoder.transform(dataframe_test[i].values)
    return dataframe_train, dataframe_test


train_full_X, train_full_Y = data_preparation.read_data(
    './data_set/train.csv', 'SalePrice')
test_full_X = pd.read_csv('./data_set/test.csv')
submission_ids = test_full_X['Id']
print('Data set Loaded!\nTrain Shape: ' + str(train_full_X.shape))
print('Final Test Shape: ' + str(test_full_X.shape))
# print('\nMissing Status:')
# print(data_preparation.show_missing(train_full_X))

# IMPUTATION
# Dropping features with a huge number of NAs: [PoolQC, Fence, MiscFeature]
# train_full_X = train_full_X.drop(['PoolQC', 'Fence', 'MiscFeature'], axis=1)
# test_full_X = test_full_X.drop(['PoolQC', 'Fence', 'MiscFeature'], axis=1)


def fill_na(dataframe):
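    # The body of fill_na is truncated in the original fragment. What follows
    # is a hedged sketch (an assumption, not the author's code): fill numeric
    # columns with their mean and everything else with their mode.
    for col in dataframe.columns:
        if dataframe[col].dtype.kind in 'if':  # integer or float columns
            dataframe[col] = dataframe[col].fillna(dataframe[col].mean())
        else:
            dataframe[col] = dataframe[col].fillna(
                dataframe[col].value_counts().index[0])
    return dataframe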
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

import data_preparation


def naive_bayes_with_some_features(all_city_data, all_city_label, feature_list):
    all_city_label = all_city_label.reshape(len(all_city_label), )
    features_to_use = all_city_data.loc[:, feature_list]
    mnnb = MultinomialNB()
    mnnb.fit(features_to_use, all_city_label)
    pred = mnnb.predict(features_to_use)
    print('Number of mislabeled points out of a total ' + str(features_to_use.shape[0])
          + ' points: ' + str((all_city_label != pred).sum()))
    # LOOCV risk
    print('Feature set: ' + str(feature_list) + '\nLOOCV: '
          + str(get_LOOCV(features_to_use, all_city_label)))
    print('')
    return mnnb


# Loading datasets
cleveland = data_preparation.read_data('./data_set/processed.cleveland.data.txt')
hungarian = data_preparation.read_data('./data_set/processed.hungarian.data.txt')
switzerland = data_preparation.read_data('./data_set/processed.switzerland.data.txt')
va = data_preparation.read_data('./data_set/processed.va.data.txt')
print('Data set Loaded!')

# Merge datasets
frames = [cleveland, hungarian, switzerland, va]
all_city_data = pd.concat(frames)

# Splitting label and features
all_city_data, all_city_label = data_preparation.split_label(all_city_data, 13)
all_city_label = all_city_label.reshape(len(all_city_label), 1)
all_city_data = all_city_data.reset_index(drop=True)

# Fill missing values with each column's mean for columns [0, 3, 4, 7, 9]
# and the mode for the rest
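# The fill itself is truncated in this fragment. A hedged sketch of the step
# the comment above describes (an assumption, not the author's code):
mean_cols = [0, 3, 4, 7, 9]
for col in all_city_data.columns:
    if col in mean_cols:
        all_city_data[col] = all_city_data[col].fillna(all_city_data[col].mean())
    else:
        all_city_data[col] = all_city_data[col].fillna(
            all_city_data[col].value_counts().index[0])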
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot  # pandas.tools.plotting was removed in pandas 0.24
from pandas.plotting import lag_plot
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.ar_model import AR  # deprecated in newer statsmodels in favor of AutoReg

import data_preparation

# Load dataset
data_set = data_preparation.read_data('./data_set/HourlyDemands_2002-2016.csv')
data, label = data_preparation.split_label(data_set, 'Ontario Demand')
print('Data set Loaded!')
print(data.shape)
print(label.shape)

# Splitting train and test data
train_data, test_data = data[0:119832], data[119832:]
train_label, test_label = label[0:119832], label[119832:]

# Implementing Models
# Build a supervised (t-48, t) pair: each target hour is matched with the
# demand 48 hours earlier.
df = pd.concat([label.shift(48), label], axis=1)
df.columns = ['t-48', 't']
X = df.values
train, test = X[0:119832], X[119832:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]

# Mean
years = []
for i in range(0, 365 * 24):
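
# The loop above is truncated in this fragment. For context, a simple
# persistence baseline on the (t-48, t) pairs built above would look like
# this (a sketch, not the original code): predict each hour with the value
# observed 48 hours earlier and score with MAE.
persistence_pred = test_X  # y_hat(t) = y(t - 48); the shift-induced NaNs fall in the train split
print('Persistence MAE:', mean_absolute_error(test_y, persistence_pred))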