Python ReplaceImputeEncode示例，Class_replace_impute_encode.ReplaceImputeEncode Python示例

示例#1

0

显示文件

target = 'crash'
# Drop rows whose target value is missing.
# The row labels of the missing entries are still collected in `drops`, but
# the filtering itself is done by dropna(), which works on a boolean mask and
# therefore does not depend on the index being a default 0..n-1 range (the
# previous loop used a positional counter as an index label).
drops = df1.index[df1[target].isna()].tolist()
df1 = df1.dropna(subset=[target])

# In[23]:

# Preprocessing options passed to ReplaceImputeEncode below.
encoding = 'one-hot'
scale = None  # interval scaling option: 'std', 'robust' or None

# drop=False: do not drop the last category (used for Decision Trees).
rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding=encoding,
    interval_scale=scale,
    drop=False,
    display=True,
)

# In[24]:

# Remove the 'crash' column from df1 in place before encoding.
# NOTE(review): the target values are not saved in this snippet before the
# column is removed - confirm they are captured elsewhere.
df1.drop(columns='crash', inplace=True)

# In[25]:

encoded_df = rie.fit_transform(df1)

# In[26]:

#varlist = [target, 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9','points']
# Predictor matrix: every encoded column except T1..T7.
X = encoded_df.drop(columns=['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7'])

示例#2

0

显示文件

    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    #     'purpose':[1,('0','1','2','3','4','5','6','7','8','9','X'),[0,0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}

# Encode with one-hot nominals, interval_scale='std', keeping all levels.
rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding='one-hot',
    interval_scale='std',
    drop=False,
    display=True,
)
encoded_df = rie.fit_transform(df)

# In[4]:

# Separate the 'good_bad' target from the predictors.
Y = encoded_df['good_bad']
X = encoded_df.drop(columns='good_bad')
np_y = np.ravel(Y)  # target as a flat 1-D array

features = X.columns
classes = ['Good', 'bad']

# In[9]:

示例#3

0

显示文件

文件： DecisionTreeClassifier_Creditdata.py 项目： himgupta08/Stat-656-projects-applied-analytics-sas-eminer-python

    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'existcr': [2, (1, 2, 3, 4), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [2, (1, 2), [0, 0]]
}

rie = ReplaceImputeEncode(data_map=attribute_map, display=True)
encoded_df = rie.fit_transform(df)

# In[5]:

from collections import Counter

# These diagnostics were bare expressions, which only display inside a
# notebook cell; print them so they remain visible when run as a script.
print(Counter(encoded_df['employed0']))
print(len(encoded_df.columns))  # original author's note: 46 columns
print(Counter(encoded_df['good_bad']))

# In[6]:

# Separate the 'good_bad' target from the predictors.
X = encoded_df.drop('good_bad', axis=1)
y = encoded_df['good_bad']

示例#4

0

显示文件

文件： churn.py 项目： ChrisBerardi/Customer-Churn

    'MonthlyCharges': [0, (18.25, 188.75), [0, 0]],
    'TotalCharges': [0, (0, 8700), [0, 0]],  #int as str hides max value!
    'Churn': [1, ('Yes', 'No'), [0, 0]],
}

# Target column, kept as a list so it can be handed directly to drop()
target = ['Churn']

# Logistic regression bookkeeping
max_f1 = 0
score_list = ['accuracy', 'recall', 'precision', 'f1']

# Encode for logistic regression: one-hot nominals, interval_scale='std',
# drop=True
rie_l = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding='one-hot',
    interval_scale='std',
    drop=True,
    display=True,
)
encoded_df_l = rie_l.fit_transform(df)
y_l = encoded_df_l[target]
X_l = encoded_df_l.drop(columns=target)
np_y_l = np.ravel(y_l)  # flatten the single-column frame into a 1-D array

# Feature selection: fit an extra-trees ensemble and print its feature
# importances to decide which predictors go into the logistic regression.
features = ExtraTreesClassifier(n_estimators=500)
features.fit(X_l, np_y_l)
print(features.feature_importances_)
# Original author's observation: only the interval predictors are important.
# Plan: try two logistic models - one with all predictors, one with only the
# top 3 predictors.

示例#5

0

显示文件

文件： sample_project.py 项目： shanebabe/Python_2020

        sentiment_score[i] = sentiment_score[i] / n_sw
df_senscore = pd.DataFrame(sentiment_score, columns=['sentiment score'])
df = df.join(df_senscore)

# Label each row with the name of the topic column (T1..T7) holding its
# largest value.
# The comparison is restricted to the topic columns, so an unrelated column
# that happens to contain the same number can no longer be picked.  The
# row-wise DataFrame.idxmax(axis=1) also replaces the per-row loop, which
# called Series.idxmax(axis=1) (a Series has no axis 1) and wrote strings
# into a column that had been initialised with the integer 0.
topic_cols = ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']
df['topic'] = df[topic_cols].idxmax(axis=1)

# save the data output of NLP
df.to_csv('after_NLP_data.csv', index=False)

# Encode the data: one-hot nominals, no interval scaling (interval_scale=None)
# and drop=False.
rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding='one-hot',
    interval_scale=None,
    drop=False,
    display=True,
)

df_tree = rie.fit_transform(df)
y = df_tree['crash']
X = df_tree.drop(columns='crash')

# Search over candidate tree depths, scoring each with cv=10 cross-validation
depth_list = [3, 5, 6, 7, 8, 10, 12, 15, 20, 25]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    dtc = DecisionTreeClassifier(
        max_depth=d,
        min_samples_leaf=5,
        min_samples_split=5,
        random_state=12345,
    )
    dtc = dtc.fit(X, y)
    scores = cross_validate(
        dtc,
        X,
        y,
        scoring=score_list,
        return_train_score=False,
        cv=10,
    )

示例#6

0

显示文件

        'Feb_PayPercent':[0,(0, 1),[0,0]],
        'Jan_PayPercent':[0,(0, 1),[0,0]]
    }
    
    
    # In[14]:df.drop(['Customer'], axis=1)
    
    # In[6]:
    
    
# Diagnostics: number of rows with Marital_Status == 0 and the column dtypes.
# These were bare expressions, which only display inside a notebook cell;
# print them so they remain visible when run as a script.
print(np.sum(df['Marital_Status'] == 0))
print(df.dtypes)


# In[15]:
rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot', display=True)
encoded_df = rie.fit_transform(df)


# NOTE(review): varlist holds the raw 'Default' column, not a list of names;
# it is not used in the visible code - confirm it is needed.
varlist = df['Default']
# Separate the 'Default' target from the encoded predictors.
X = encoded_df.drop('Default', axis=1)
y = encoded_df['Default']

lgr = LogisticRegression()

# Select the 25 best attributes using recursive feature elimination (RFE).
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally, so RFE(lgr, 25) raises a TypeError
# there.  The keyword form works on old and new versions alike.
rfe = RFE(lgr, n_features_to_select=25)
rfe = rfe.fit(X, y)


# Boolean mask over the columns of X marking the selected attributes
print(rfe.support_)

示例#7

0

显示文件

文件： Logistic_Regression_and_Cross_Validation.py 项目： krit-gpt/Applied_Analytics

    'foreign':[1,(1,2),[0,0]],
    'good_bad':[1,('bad', 'good'),[0,0]],
    'history':[2,(0,1,2,3,4),[0,0]],
    'installp':[2,(1,2,3,4),[0,0]],
    'job':[2,(1,2,3,4),[0,0]],
    'marital':[2,(1,2,3,4),[0,0]],
    'other':[2,(1,2,3),[0,0]],
    'property':[2,(1,2,3,4),[0,0]],
 #   'purpose':[1,(0,1,2,3,4,5,6,7,8,9,'X'),[0,0]],
    'resident':[2,(1,2,3,4),[0,0]],
    'savings':[2,(1,2,3,4,5),[0,0]],
    'telephon':[1,(1,2),[0,0]] }
Step 3: Replace-Impute-Encode
Next, use the class ReplaceImputeEncode() to replace outliers with missing values, impute missing values, and then scale interval data and encode categorical data.

The ReplaceImputeEncode() class allows you to specify None for scaling and/or encoding. It also lets you select 'one-hot' or 'SAS' encoding for categorical variables. In most other software this is automatic, but in Python we need to set up our own scaling and encoding.

The complete API for this class is described in the class. First you instantiate the class then you use fit_transform() to actually process your dataframe.

In [3]:
# Preprocessing options
encoding = 'SAS'  # categorical encoding: 'SAS', 'one-hot' or None
scale = None  # interval scaling: 'std', 'robust' or None
scaling = 'No'  # text description for interval scaling

rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding=encoding,
    interval_scale=scale,
    display=True,
)

#features_map = rie.draft_features_map(df)
encoded_df = rie.fit_transform(df)
********** Data Preprocessing ***********
Features Dictionary Contains:

示例#8

0

显示文件

文件： week 10.py 项目： ChrisBerardi/STAT656-Applied-Analytics

# Data map handed to ReplaceImputeEncode: one entry per column, each a
# three-item list [code, values, [0, 0]].
# NOTE(review): the code (0/1/2) presumably marks interval/binary/nominal and
# the tuple the valid range or categories - confirm against the class docs.
attribute_map_clus = {
    'Score': [0, (80, 100), [0, 0]],
    'Year': [0, (1985, 2016), [0, 0]],
    'Region': [
        2,
        (
            'California Other',
            'Central Coast',
            'Central Valley',
            'Clear Lake',
            'High Valley',
            'Lake County',
            'Mendocino County',
            'Mendocino Ridge',
            'Mendocino/Lake Counties',
            'Napa',
            'Napa-Sonoma',
            'North Coast',
            'Red Hills Lake County',
            'Redwood Valley',
            'Sierra Foothills',
            'Sonoma',
            'South Coast',
        ),
        [0, 0],
    ],
    'Cluster': [2, (0, 1, 2, 3, 4, 5, 6, 7, 8), [0, 0]],
    'Price': [0, (0, 625), [0, 0]],
}
# Target column(s) for the regression
varlist = ['Price']

# Encode the clustering frame: one-hot nominals, no interval scaling,
# drop=True, no display output.
rie_clus = ReplaceImputeEncode(
    data_map=attribute_map_clus,
    nominal_encoding='one-hot',
    interval_scale=None,
    drop=True,
    display=False,
)
encoded_df_clus = rie_clus.fit_transform(clus)

# Separate the target column(s) in varlist from the predictors.
y_clus = encoded_df_clus[varlist]
X_clus = encoded_df_clus.drop(columns=varlist)

# 70/30 train/validation split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_clus,
    y_clus,
    test_size=0.3,
    random_state=7,
)

# Flatten the target frames into 1-D arrays
np_y_train = np.ravel(y_train)
np_y_valid = np.ravel(y_valid)

# Fit a linear regression on the training portion
reg = LinearRegression()
reg.fit(X_train, np_y_train)

示例#9

0

显示文件

    'model': [2, ('COBALT', 'G5', 'HHR', 'ION', 'SKY', 'SOLSTICE'), [0, 0]],
    'crashed': [1, ('N', 'Y'), [0, 0]],
    'abs': [1, ('N', 'Y'), [0, 0]],
    'mileage': [0, (0, 200000), [0, 0]],
    '0': [0, (0, 1), [0, 0]],
    '1': [0, (0, 1), [0, 0]],
    '2': [0, (0, 1), [0, 0]],
    '3': [0, (0, 1), [0, 0]],
    '4': [0, (0, 1), [0, 0]],
    '5': [0, (0, 1), [0, 0]],
    '6': [0, (0, 1), [0, 0]],
    '7': [0, (0, 1), [0, 0]],
}
# Target column(s)
varlist = ['crashed']

# Encode: one-hot nominals, no interval scaling, drop=True, no display output
rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding='one-hot',
    interval_scale=None,
    drop=True,
    display=False,
)
encoded_df = rie.fit_transform(reg_df)
y = encoded_df[varlist]
X = encoded_df.drop(columns=varlist)
np_y = np.ravel(y)  # target as a flat 1-D array

# Settings for the 10-fold cross-validation search over the regularization
# value
max_f1 = 0
C_list = [.1, 1, 10, 100]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for c in C_list:
    print("\nRegularization Parameter: ", c)
    lgr = LogisticRegression(C=c, tol=1e-8, max_iter=1000)
    lgr.fit(X, np_y)
    scores = cross_validate(lgr, X, np_y,\

示例#10

0

显示文件

文件： homework4.py 项目： ChrisBerardi/STAT656-Applied-Analytics

    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}

# Reduced data map covering a subset of the credit columns.
# NOTE(review): sas_map is not referenced anywhere in the visible code; the
# encoder below is built from attribute_map instead - confirm which map is
# intended.
sas_map = {
    'duration': [0, (0, 100), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'coapp': [2, (1, 2, 3), [0, 0]],
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'good_bad': [1, ('good', 'bad'), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]]
}
# Replace, impute, and encode the data frame.
# NOTE(review): the original comment said "using SAS encoding", but no
# nominal_encoding argument is passed, so this relies on the class default -
# confirm that the default is 'SAS'.
rep_imp_enc = ReplaceImputeEncode(data_map=attribute_map, display=True)
encoded_df = rep_imp_enc.fit_transform(df)

# Regression requires numpy arrays containing all numeric values
y = np.asarray(encoded_df['good_bad'])
# Drop the target column 'good_bad'.  axis=1 indicates the drop is for a column.
X = np.asarray(encoded_df.drop('good_bad', axis=1))

# Hold out 30% of the rows for validation (fixed seed for repeatability).
# NOTE(review): the original comment mentioned k=4 fold cross validation, but
# the visible code only performs this 70/30 train/validation split.
X_train, X_validate, y_train, y_validate = \
            train_test_split(X,y,test_size = 0.3, random_state=7)
# Model fitted on the full data set
logistic = LogisticRegression()
logistic.fit(X, y)

# Model fitted on the training portion only
log_tts = LogisticRegression()
log_tts.fit(X_train, y_train)