Exemplo n.º 1
0
# Forest-fires exploration script: load the CSV, print a quick summary,
# split predictors from the 'area' target and hand both to transformData.
import numpy as np
import sklearn
from matplotlib import pyplot as plt

try:
    # Inside IPython/Jupyter this renders DataFrames as rich tables.
    from IPython.display import display
except ImportError:
    # Plain interpreter: degrade to text output instead of a NameError.
    display = print

# Project-local helpers (defined outside this file).
from projectFunctions import exploreData, loadData, transformData

print("-----------------------------------------------------------------------")
print('The scikit-learn version is {}.'.format(sklearn.__version__))

# Working directory and file name of the input data.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = r'C:\Users\pmspr\Documents\HS\MS\Sem 2\EECS 738\Lab\2\Work\Code\Data'
filename = "forestfires.csv"

data = loadData(path, filename)
if data is None:
    # Everything below dereferences `data`, so stop here with a clear message
    # rather than failing later with an AttributeError on None.
    raise SystemExit("Could not load {} from {}".format(filename, path))

# Explore the data (project helper; what it reports is defined elsewhere).
exploreData(data)

# Success - display the first record and summary statistics for all columns.
display(data.head(n=1))
print(data.describe(include='all'))

# Separate the predictors from the target: 'area' is the target, and
# 'X', 'Y' and 'rain' are excluded from the feature set as well.
drop_col = ['X', 'Y', 'rain', 'area']
features_raw = data.drop(drop_col, axis=1)
target_raw = data['area']
display(features_raw.head(n=1))

# Transform data; the helper returns three objects
# (features, target, and a second target variant named target_reg).
features, target, target_reg = transformData(features_raw, target_raw)
Exemplo n.º 2
0
# Shakespeare play-lines script: load the CSV, use the speaker ('Player') as
# the target, report missing values, drop rows without a target and count
# rows per (Play, target) pair.
import sklearn  # needed for the version banner below

# Helpers from the project-local projectFunctions module.
from projectFunctions import (
    exploreData,
    loadData,
    missingValues,
    tokenString,
    transformData,
)

print("-----------------------------------------------------------------------")
print('The scikit-learn version is {}.'.format(sklearn.__version__))

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = r'C:\Users\pmspr\Documents\HS\MS\Sem 3\EECS 731\Week 4\HW\Git\EECS-731-Project-2\Data'
filename = "Shakespeare_data.csv"
data = loadData(path, filename)

# Remove the line-numbering columns and rename 'Player' to 'target'.
drop_col = ['Dataline', 'PlayerLinenumber', 'ActSceneLine']
data = data.drop(drop_col, axis=1)
data = data.rename(columns={'Player': 'target'})
print(data.columns)

print("----------------------Shakespear Play data-----------------------------")
features, target = exploreData(data)
misVal, mis_val_table_ren_columns = missingValues(data)

# Print some summary information
print("Columns that have missing values:" + str(misVal.shape[0]))
print("-----------------------------------------------------------------------")
print(mis_val_table_ren_columns.head(20))

# Remove rows with missing target values (selected by their index labels).
ind = data[data['target'].isnull()].index.tolist()
data = data.drop(index=ind)

# Compute features to add value: count() gives the number of non-null
# 'PlayerLine' entries per (Play, target) group, exposed as 'LineCount'.
line_count = data.groupby(['Play', 'target'], as_index=False).count()
line_count = line_count.rename(columns={'PlayerLine': 'LineCount'})