Example #1
from data_preparation import read_data, remove_outliers, get_dummies, split_data
from train_model import train
from logger import get_logger
from generate_report import generate_report

# Logger
logger = get_logger(__name__)

if __name__ == "__main__":
    
    # Reading data
    df = read_data()

    # Removing outliers
    df = remove_outliers(df)

    # Creating dummy variables
    df = get_dummies(df)

    # Splitting data
    X_train, X_test, y_train, y_test = split_data(df)

    # Training model
    regLinear, score = train(X_train, y_train)

    # Evaluating model
    generate_report(regLinear, X_test, y_test, score)
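
The train helper imported from train_model is not shown in this snippet. A minimal sketch of what it might look like, assuming an ordinary scikit-learn linear regression that returns the fitted model together with its training R² score (the actual implementation may differ):

# Hypothetical train_model.py, assuming scikit-learn
from sklearn.linear_model import LinearRegression


def train(X_train, y_train):
    # Fit a linear regression and return the model with its R² training score
    regLinear = LinearRegression()
    regLinear.fit(X_train, y_train)
    score = regLinear.score(X_train, y_train)
    return regLinear, score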
Example #2
if __name__ == "__main__":
    #####
    # The purpose of our classifier is to predict the hostkidoutcome category and the percentage of released persons.
    # Y: hostkidoutcome, npreleased
    # X: extended, iyear, gname_id, nhostkid, ndays, ransom, ransompaid, ishostkid
    #####

    ### Data filtering

    # Read data and exclude cols
    # @Snippet: To exclude: lambda x: x not in ["eventid","imonth","iday", "attacktype2","claims2","claimmode2","claimmode3","gname2"]
    df = prep.read_data('globalterrorismdb_0617dist.csv',
                        usecols=[
                            'nreleased', 'attacktype1', 'attacktype2',
                            'attacktype3', 'extended', 'iyear', 'gname',
                            'nhostkid', 'nhours', 'ndays', 'ransom',
                            'ransompaid', 'ransompaidus', 'ishostkid',
                            'hostkidoutcome'
                        ])
    df = filter_data(df)
    df = augmentate_data(df)

    # The data also uses -9 or -99 where values were unknown; replace these with NaNs as well
    df = set_unknown_to_NaN(df, [-9, -99])

    # A number of columns contain NaNs for missing data; here they are simply filled with the sentinel value -1
    df = set_NaN_to_value(df, -1)

    print(df.head())

    # Plot data
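
The cleaning helpers filter_data, augmentate_data, set_unknown_to_NaN and set_NaN_to_value are project functions not shown in this excerpt. A minimal sketch of the two NaN helpers, assuming plain pandas (the actual implementations may differ):

import numpy as np


def set_unknown_to_NaN(df, unknown_values):
    # Replace sentinel codes such as -9 and -99 with NaN
    return df.replace(unknown_values, np.nan)


def set_NaN_to_value(df, value):
    # Fill every remaining NaN with a fixed sentinel value
    return df.fillna(value)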
Example #3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.ar_model import AR
import submission_generator
import data_preparation

__author__ = '[email protected] (Sajad Azami)'
sns.set_style("dark")

# Load dataset
data_set = data_preparation.read_data('./data_set/HourlyDemands_2002-2016.csv')
data, label = data_preparation.split_label(data_set, 'Ontario Demand')
print('Data set Loaded!')
print(data.shape)
print(label.shape)

# Plot 2 weeks (336 hours) of data points
line = np.arange(336)
plt.plot(line, label[0:336])
plt.xlabel('Hour')
plt.ylabel('Power Demand')
plt.title('Power Demand of first 14 days')
plt.show()
# Plotting the lag plot of the target feature
lag_plot(label)
plt.show()
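
# autocorrelation_plot is imported above but unused in this excerpt; a
# hypothetical usage example on the same series (not in the source):
autocorrelation_plot(label[0:336])
plt.show()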
Example #4
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate models as (name, estimator) pairs
models = []
models.append(('DT',
               DecisionTreeClassifier(min_samples_leaf=4,
                                      min_samples_split=13,
                                      splitter='best')))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(C=10, gamma='scale', kernel='rbf',
                          probability=True)))
models.append(('ADB', AdaBoostClassifier(n_estimators=20)))
models.append(('RF',
               RandomForestClassifier(criterion='gini',
                                      max_depth=6,
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      n_estimators=100)))
models.append(('GPC', GaussianProcessClassifier(kernel=RBF(length_scale=1))))
models.append(('XGB',
               XGBClassifier(booster='gbtree',
                             colsample_bylevel=1,
                             learning_rate=0.001,
                             max_depth=6,
                             min_child_weight=5,
                             n_estimators=700,
                             objective='binary:logistic')))

path = "E:/project/models/"
data_path = "E:/project/training_data.csv"

data, labels = read_data(data_path)
train_models(models, data, labels)
save_models(models, path)
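
read_data, train_models and save_models are project helpers whose definitions are not included in this example. A minimal sketch of the latter two, assuming scikit-learn-style estimators and joblib for persistence (names and behavior are assumptions, not the source's actual code):

import os

import joblib


def train_models(models, data, labels):
    # Fit each (name, estimator) pair in place
    for name, model in models:
        model.fit(data, labels)


def save_models(models, path):
    # Persist each fitted estimator to <path>/<name>.joblib
    os.makedirs(path, exist_ok=True)
    for name, model in models:
        joblib.dump(model, os.path.join(path, name + '.joblib'))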
Example #5
import pandas as pd
from sklearn import preprocessing

import data_preparation


def encode_field(dataframe_train, dataframe_test, col_name):
    encoder = preprocessing.LabelEncoder()

    for i in col_name:
        # Fill missing values in this column with its most frequent value
        dataframe_train[i] = dataframe_train[i].fillna(
            dataframe_train[i].value_counts().index[0])
        dataframe_test[i] = dataframe_test[i].fillna(
            dataframe_test[i].value_counts().index[0])

        # Fit on the train column, then transform both splits consistently
        encoder.fit(dataframe_train[i].values)
        dataframe_train[i] = encoder.transform(dataframe_train[i].values)
        dataframe_test[i] = encoder.transform(dataframe_test[i].values)
    return dataframe_train, dataframe_test
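
# A hypothetical usage sketch with made-up data (not from the source). Note
# that the encoder is fit on the train column only, so a category appearing
# only in the test split would raise a ValueError in transform().
_demo_train = pd.DataFrame({'Street': ['Pave', 'Grvl', None, 'Pave']})
_demo_test = pd.DataFrame({'Street': ['Grvl', 'Pave']})
_demo_train, _demo_test = encode_field(_demo_train, _demo_test, ['Street'])
print(_demo_train['Street'].tolist())  # [1, 0, 1, 1]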


train_full_X, train_full_Y = data_preparation.read_data(
    './data_set/train.csv', 'SalePrice')
test_full_X = pd.read_csv('./data_set/test.csv')
submission_ids = test_full_X['Id']

print('Data set Loaded!\nTrain Shape: ' + str(train_full_X.shape))
print('Final Test Shape: ' + str(test_full_X.shape))

# print('\nMissing Status:')
# print(data_preparation.show_missing(train_full_X))


# IMPUTATION
# Dropping features with huge number of NAs: [PoolQC, Fence, MiscFeature]
# train_full_X = train_full_X.drop(['PoolQC', 'Fence', 'MiscFeature'], axis=1)
# test_full_X = test_full_X.drop(['PoolQC', 'Fence', 'MiscFeature'], axis=1)
def fill_na(dataframe):
Example #6
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

import data_preparation


def naive_bayes_with_some_features(all_city_data, all_city_label, feature_list):
    # Flatten the label array and restrict the data to the chosen features
    all_city_label = all_city_label.reshape(len(all_city_label), )
    features_to_use = all_city_data.loc[:, feature_list]
    mnnb = MultinomialNB()
    mnnb.fit(features_to_use, all_city_label)
    pred = mnnb.predict(features_to_use)
    print('Number of mislabeled points out of a total %d points: %d'
          % (features_to_use.shape[0], (all_city_label != pred).sum()))
    # LOOCV risk
    print('Feature set: ' + str(feature_list) + '\nLOOCV: ' +
          str(get_LOOCV(features_to_use, all_city_label)))
    print('')
    return mnnb
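
# get_LOOCV is a project helper not shown in this excerpt; a minimal sketch,
# assuming it returns the leave-one-out cross-validated error rate (the
# actual implementation may differ):
from sklearn.model_selection import LeaveOneOut, cross_val_score


def get_LOOCV(features, labels):
    # Mean misclassification rate of MultinomialNB under leave-one-out CV
    scores = cross_val_score(MultinomialNB(), features, labels,
                             cv=LeaveOneOut())
    return 1 - scores.mean()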


# Loading dataset
cleveland = data_preparation.read_data('./data_set/processed.cleveland.data.txt')
hungarian = data_preparation.read_data('./data_set/processed.hungarian.data.txt')
switzerland = data_preparation.read_data('./data_set/processed.switzerland.data.txt')
va = data_preparation.read_data('./data_set/processed.va.data.txt')
print('Data set Loaded!')

# Merge datasets
frames = [cleveland, hungarian, switzerland, va]
all_city_data = pd.concat(frames)

# Splitting label and features
all_city_data, all_city_label = data_preparation.split_label(all_city_data, 13)
all_city_label = all_city_label.reshape(len(all_city_label), 1)
all_city_data = all_city_data.reset_index(drop=True)

# Filling missing values with each column's mean for columns [0, 3, 4, 7, 9] and the mode for the rest
Example #7

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.ar_model import AR
import data_preparation

# Load dataset
data_set = data_preparation.read_data('./data_set/HourlyDemands_2002-2016.csv')
data, label = data_preparation.split_label(data_set, 'Ontario Demand')
print('Data set Loaded!')
print(data.shape)
print(label.shape)

# Splitting train and test data
train_data, test_data = data[0:119832], data[119832:]
train_label, test_label = label[0:119832], label[119832:]

# Implementing models: pair each observation with the value from 48 hours earlier
df = pd.concat([label.shift(48), label], axis=1)
df.columns = ['t-48', 't']
X = df.values
train, test = X[0:119832], X[119832:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]
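
# A sketch of how the mean_absolute_error import above could score this
# 48-hour persistence baseline (not part of the original excerpt):
print('Persistence MAE: ' + str(mean_absolute_error(test_y, test_X)))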

# Mean
years = []
for i in range(0, 365 * 24):