Example #1
# import modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import load_model
import matplotlib.pyplot as plt
from one_hot_encoding import one_hot_encoder
import feature_index
import csv

testdata = pd.read_csv('./data/criminal_test.csv')
perid = testdata.iloc[:, 0:1].values
testdata = testdata.drop(['PERID'], axis=1)
data = testdata[feature_index.categorical]
data_one_hot_encoded = one_hot_encoder(data)
# NOTE: if one_hot_encoder does not drop the original categorical columns,
# they are duplicated here; the other examples concatenate numeric columns only
X = pd.concat([testdata, data_one_hot_encoded], axis=1)

# Feature Scaling
# NOTE: ideally the StandardScaler fitted on the training data would be
# reused here (sc.transform); refitting on the test set, as below, can shift
# the features relative to what the model saw during training
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)


# Load model
model = load_model('./model/checkpoints/criminal-ann-009-loss0.098-acc0.958.hdf5')

scores = model.predict(np.array(X))

myData = [["PERID", "Criminal"]]
i = 0
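# -- continuation sketch: the original snippet is cut off here; the 0.5
# decision threshold and the output path are assumptions, not the original
# code --
for score in scores:
    # label as Criminal when the predicted probability exceeds 0.5
    myData.append([perid[i][0], 1 if score[0] > 0.5 else 0])
    i += 1

with open('./data/submission.csv', 'w', newline='') as f:  # hypothetical path
    csv.writer(f).writerows(myData)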
Example #2
# (the snippet is cut off above; the loop header below is reconstructed from
# the matching test-side code in Example #5)
categorical_cols = []
for col in traindata:
    if col not in numeric_cols:
        categorical_cols.append(col)
        print(traindata[col].value_counts())

# Taking care of missing values in columns with categorical data where
# imputation won't work: drop the affected rows outright (a forward fill
# afterwards would be a no-op, since no NaNs remain)
if traindata.isnull().sum().sum():
    traindata.dropna(inplace=True)

y = traindata.iloc[:, -1].values
train_set = traindata.drop(['id', 'P'], axis=1)
numeric_cols.remove('id')
numeric_cols.remove('P')

# one-hot encoding
data = train_set[categorical_cols]
one_hot_encoded_data_train = one_hot_encoder(data, categorical_cols)
X = pd.concat([train_set[numeric_cols], one_hot_encoded_data_train], axis=1)

# Encoding the independent variables (older scikit-learn LabelEncoder/OneHotEncoder API, kept commented out)
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#labelencoder = LabelEncoder()
#catg_index = [0,3,4,5,6,8,9,11,12]
#for item in catg_index:
#    train_set[:, item] = labelencoder.fit_transform(train_set[:, item])
#onehotencoder = OneHotEncoder(categorical_features = catg_index)
#X = onehotencoder.fit_transform(train_set).toarray()

# ====================== Splitting the dataset into the Training set and Test set =============================

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
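                                                    # continuation sketch: the
                                                    # original snippet is cut
                                                    # off here; test_size and
                                                    # stratify are assumptions
                                                    # mirroring the other
                                                    # examples in this listing
                                                    test_size=0.2,
                                                    stratify=y)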
Example #3
# imports for the excerpt below (the original snippet omits them); the helper
# functions drop_null_columns, split_loan_in_progress, categorize_target,
# trim_features, oversample_smote and encode_neural_net_y are assumed to come
# from local project modules
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from one_hot_encoding import one_hot_encoder
import feature_index

print(sys.version)

# load data
loan = pd.read_csv('./data/loan.csv')

# pre-process data
drop_null_columns(loan)
loan_in_progress = split_loan_in_progress(loan)
loan = categorize_target(loan)

# Feature Engineering by EDA
trim_features(loan)

# one-hot encoding
loan = loan[feature_index.features]
loan_one_hot_encoded = one_hot_encoder(loan)

# Train-Test split
y = loan_one_hot_encoded.loan_status_coded
X = loan_one_hot_encoded.drop("loan_status_coded", axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# oversample_SMOTE
#x_train, y_train = oversample_smote(x_train, y_train)

# Neural Network model
y_train = encode_neural_net_y(y_train)
y_test = encode_neural_net_y(y_test)

model = Sequential()
model.add(Dense(34, input_dim=66, activation='relu'))
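# -- continuation sketch: the original snippet ends here; the extra layers,
# loss, and training settings below are assumptions, not the original code.
# If encode_neural_net_y one-hot encodes a multiclass target, the output layer
# would instead be Dense(n_classes, activation='softmax') with
# categorical_crossentropy --
model.add(Dense(17, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=32,
          validation_data=(x_test, y_test))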
Example #4
# remove outliers: keep only rows within 3.8 standard deviations of each column's mean
labels = list(traindata)
for label in labels:
    traindata = traindata[np.abs(traindata[label] - traindata[label].mean()) <= (3.8 * traindata[label].std())]

# pre-process data: inspect missing-value counts per column
print(traindata.isnull().sum())
num_cols = traindata._get_numeric_data().columns
for col in traindata:
    print(traindata[col].value_counts())

y = traindata.iloc[:, -1].values
traindata = traindata.drop(['PERID', 'Criminal'], axis=1)
# Except for VESTR and ANALWT_C, all other columns are categorical data, so feature scaling is needed.
# one-hot encoding
data = traindata[feature_index.categorical]
one_hot_encoded_data = one_hot_encoder(data)

# Splitting the dataset into the Training set and Test set
X = pd.concat([traindata, one_hot_encoded_data], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# ========================================================Neural Network model==================================================================
# Initialising the ANN
model = Sequential()
# Adding the input layer and the first hidden layer
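# -- continuation sketch: the snippet is truncated here; layer sizes, the
# optimizer, and the epoch count are assumptions in the spirit of the other
# examples in this listing --
model.add(Dense(units=34, activation='relu', input_dim=X_train.shape[1]))
# second hidden layer
model.add(Dense(units=17, activation='relu'))
# sigmoid output for the binary Criminal target
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=32, epochs=10,
          validation_data=(X_test, y_test))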
Example #5
categorical_cols = []
for col in testdata:
    if (col not in numeric_cols):
        categorical_cols.append(col)
        print(testdata[col].value_counts())

# Taking care of missing values in columns with categorical data where imputation won't work
testdata.fillna(method='ffill', inplace=True)

test_set = testdata.drop(['id'], axis=1)
numeric_cols.remove('id')

# one-hot encoding
data = test_set[categorical_cols]
one_hot_encoded_data_test = one_hot_encoder(data, categorical_cols)
# align the test columns with the training one-hot encoding: add any dummy
# columns seen in training but absent from the test set, filled with zeros
feature_difference = set(one_hot_encoded_data_train) - set(
    one_hot_encoded_data_test)
feature_difference_df = pd.DataFrame(data=np.zeros(
    (one_hot_encoded_data_test.shape[0], len(feature_difference))),
                                     columns=list(feature_difference))
one_hot_encoded_data_test = one_hot_encoded_data_test.join(
    feature_difference_df)
X = pd.concat([test_set[numeric_cols], one_hot_encoded_data_test], axis=1)

# Feature Scaling
# NOTE: ideally the StandardScaler fitted on the training data would be
# reused here (sc.transform); refitting on the test set, as below, can shift
# the features relative to what the model saw during training
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# ========================================= Predicting the Test set results ======================================
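# -- sketch: the snippet ends at the banner above; a minimal, assumed
# continuation that loads a previously trained model and writes an id/P
# submission file. The checkpoint path, the 0.5 threshold, and the output
# path are hypothetical --
from keras.models import load_model

model = load_model('./model/checkpoints/best_model.hdf5')  # hypothetical path
y_pred = (model.predict(X) > 0.5).astype(int).ravel()
submission = pd.DataFrame({'id': testdata['id'], 'P': y_pred})
submission.to_csv('./data/submission.csv', index=False)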