Пример #1
0
def test_not_fitted():
    """
    Calling ``transform`` on an imputer that was never fitted must raise
    NotFittedError.
    """
    unfitted = CategoricalImputer()
    sample = np.array(['a', 'b', 'b', None])
    with pytest.raises(NotFittedError):
        unfitted.transform(sample)
Пример #2
0
def test_missing_values_param(input_type):
    """
    A custom ``missing_values`` marker ('a_missing') is imputed; with this
    data the expected replacement is 'y', the most frequent remaining value.
    """
    raw = ['x', 'y', 'a_missing', 'y']
    # Exercise both supported containers: pandas Series and object ndarray.
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputed = CategoricalImputer(missing_values='a_missing').fit_transform(X)

    expected = np.array(['x', 'y', 'y', 'y'])
    assert (imputed == expected).all()
Пример #3
0
def test_default_fill_value_for_constant_strategy(input_type):
    """
    With ``strategy='constant'`` and no explicit fill value, the imputer is
    expected to fall back to '?' and use it for the missing entry.
    """
    raw = ['a', np.nan, 'b', 'b']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputer = CategoricalImputer(strategy='constant')
    Xt = imputer.fit_transform(X)

    fill = imputer.fill_
    assert fill == '?'
    assert (Xt == ['a', fill, 'b', 'b']).all()
Пример #4
0
def test_copy_param(input_type):
    """
    With ``copy=False`` the imputed values must show up both in the returned
    data and in the input object itself.
    """
    raw = ['a', np.nan, 'b', 'a']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    inplace_imputer = CategoricalImputer(copy=False)
    Xt = inplace_imputer.fit_transform(X)

    expected = np.array(['a', 'a', 'b', 'a'])
    # The transformed output first, then the (mutated) original input.
    for observed in (Xt, X):
        assert (observed == expected).all()
Пример #5
0
import pandas as pd

# Load the training and test datasets.
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

# Split into feature matrices and the target vector. The first column is
# skipped in both frames.
# NOTE(review): the features stop before the last column (-1) while the target
# is read from column 12 - confirm both refer to the same column.
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

# Missing values
# -------------- training set ---------
from sklearn_pandas import CategoricalImputer

# Impute the categorical columns 0, 1 and 4.
imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

# Replace the open-ended '3+' label in column 2 with the number 3 so the
# column can be imputed numerically below. Iterate over the actual number of
# rows rather than a hard-coded 614, which only matched one specific file.
for i in range(X_train.shape[0]):
    if X_train[i, 2] == '3+':
        X_train[i, 2] = 3

from sklearn.preprocessing import Imputer

# Mean-impute the numeric columns 2, 7, 8 and 9.
imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:,
                                                               [2, 7, 8, 9]])
Пример #6
0

# In[14]:

# Numeric attributes: select -> median-impute -> add combined attributes
# -> standardize.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# In[15]:

# Categorical attributes: select -> impute with the most frequent value
# -> dense one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', CategoricalImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse=False)),
])

# In[16]:

# The "Name" column: select it, then run the project's StringFinder on it.
str_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector("Name")),
    ('str_finder', StringFinder()),
])

# In[17]:

pre_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
Пример #7
0
def predict():
    """Train a random-forest model on the ticket data and return one prediction.

    Steps, all performed on every call:

    1. Read the CSV at the hard-coded path below.
    2. Impute missing ``Doc_type`` values and fill missing ``Error_detail``
       values with ``"No detail"``.
    3. Binary-encode ``Ticket_Category``, ``Error_type``, ``Error_detail`` and
       ``Doc_type``.
    4. Fit a RandomForestClassifier, build a feature-importance bar chart on
       matplotlib figure 1, and run a grid search over the forest's
       hyper-parameters.
    5. Pickle the fitted grid search to ``model.pkl``, reload it and predict
       the first row of the held-out test split.

    Returns:
        The result of ``jsonify({'prediction': [...]})`` for that single row.
        ``jsonify`` is expected to be available at module level.
    """
    print("__________________________")

    import pickle
    import warnings

    import category_encoders as ce
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn_pandas import CategoricalImputer

    warnings.filterwarnings("ignore")

    esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv', encoding='iso-8859-1', sep=',', engine='python')

    # fit_transform returns the imputed data; with the imputer's default
    # settings the input array is left untouched, so the result must be kept.
    # Previously it was discarded and the un-imputed values were written back.
    doc_type = CategoricalImputer().fit_transform(esd_df['Doc_type'].values)

    # Assign the filled column back instead of using inplace=True on a column
    # selection, which is a chained assignment.
    esd_df["Error_detail"] = esd_df["Error_detail"].fillna("No detail")

    # Drop and re-append so Doc_type ends up as the last column, as before.
    esd_df = esd_df.drop(["Doc_type"], axis=1)
    esd_df['Doc_type'] = doc_type
    esd = esd_df.copy()

    # Binary-encode the categorical columns one after another, each encoder
    # working on the previous encoder's output.
    df_dt = esd
    for column in ('Ticket_Category', 'Error_type', 'Error_detail', 'Doc_type'):
        df_dt = ce.BinaryEncoder(cols=[column]).fit_transform(df_dt)

    # Next step is creating training and testing datasets.
    x = df_dt.drop(['Resolution'], axis='columns')
    y = df_dt['Resolution']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

    rf1 = RandomForestClassifier(criterion='entropy', n_estimators=100, max_features=3, oob_score=True, bootstrap=True, n_jobs=-1, random_state=1)
    rf1.fit(x_train, y_train)

    # Feature importances. The labels come from the training columns, which
    # is what feature_importances_ is aligned with; this replaces a hard-coded
    # list of column positions that silently depended on the CSV layout.
    features = x_train.columns
    importances = rf1.feature_importances_
    indices = np.argsort(importances)

    plt.figure(1)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')

    params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': [50],
        'max_features': [2, 3, 4, 5, 6, 7, 8],
    }

    rf_gridcv = GridSearchCV(estimator=rf1, cv=5, param_grid=params, scoring='accuracy')
    rf_gridcv.fit(x_train, y_train)

    # The single row that is scored and returned.
    sample = x_test.head(1)

    # Round-trip the fitted search through model.pkl. Context managers make
    # sure both file handles are closed. The file read back is the one written
    # just above; never unpickle data from an untrusted source.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(rf_gridcv, model_file)
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    prediction = model.predict(sample)
    print(prediction)
    print("***********************************************")

    return jsonify({'prediction': list(prediction)})
Пример #8
0
esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv',
                     encoding='iso-8859-1',
                     sep=',',
                     engine='python')

# Quick look at the column names and the frame's shape.
list(esd_df.columns)

esd_df.shape

# To inspect the number of null / NaN values per column:
# for _ in esd_df.columns:
#     print("The number of null values in:{} == {}".format(_, esd_df[_].isnull().sum()))

# Impute missing Doc_type values. fit_transform returns the imputed data; with
# the imputer's default settings the input array is left untouched, so the
# result has to be kept (it was previously discarded).
esd_array = esd_df['Doc_type'].values
imputer = CategoricalImputer()
esd_array = imputer.fit_transform(esd_array)

# Assign the filled column back instead of using inplace=True on a column
# selection, which is a chained assignment.
esd_df["Error_detail"] = esd_df["Error_detail"].fillna("No detail")
#print(esd_df)

# Drop and re-append Doc_type so the imputed values land in the last column.
esd_df = esd_df.drop(["Doc_type"], axis=1)
esd_df['Doc_type'] = esd_array
esd = esd_df.copy()

# Binary-encode the categorical columns, each encoder working on the output
# of the previous one.
encoder_tc = ce.BinaryEncoder(cols=['Ticket_Category'])
df_tc = encoder_tc.fit_transform(esd)

encoder_et = ce.BinaryEncoder(cols=['Error_type'])
df_et = encoder_et.fit_transform(df_tc)

encoder_ed = ce.BinaryEncoder(cols=['Error_detail'])
Пример #9
0
def build_audit_na(classifier,
                   name,
                   with_proba=True,
                   predict_proba_transformer=None,
                   apply_transformer=None,
                   **pmml_options):
    """Fit a PMML pipeline on the audit-NA data and store model and predictions.

    The pipeline (a DataFrameMapper followed by ``classifier``) is fitted on
    the module-level ``audit_na_X`` / ``audit_na_y``, configured with
    ``pmml_options`` and stored as ``name + ".pkl"``. Predictions on
    ``audit_na_X`` are stored as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name for the stored ``.pkl`` and ``.csv`` artefacts.
        with_proba: When equal to ``True``, class probabilities are appended
            as columns ``probability(0)`` and ``probability(1)``.
        predict_proba_transformer: Forwarded to ``PMMLPipeline``.
        apply_transformer: Forwarded to ``PMMLPipeline``.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}

    # Per-column transformation chains, in the order the mapper applies them.
    features = []

    # Age / Hours: mark missing values with the sentinel -999, then impute it.
    features.append((["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ]))
    features.append((["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ]))

    # Income: values outside [5000, 200000] are treated as missing, then imputed.
    features.append((["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ]))

    # Employment: impute, normalize case, collapse to PRIVATE/PUBLIC/OTHER.
    features.append((["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]))

    # Plain categorical columns: impute, lowercase, binarize.
    features.extend(
        ([column], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(missing_values=None),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"])

    # Gender: impute, uppercase, map to 0/1.
    features.append((["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ]))

    mapper = DataFrameMapper(features)

    steps = [("mapper", mapper), ("classifier", classifier)]
    pipeline = PMMLPipeline(
        steps,
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")

    # Assemble the predictions frame column group by column group.
    results = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        results = pandas.concat((results, probabilities), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # For decision trees also record the node each sample ends up in.
        transformed = pipeline_transform(pipeline, audit_na_X)
        node_ids = DataFrame(classifier.apply(transformed), columns=["nodeId"])
        results = pandas.concat((results, node_ids), axis=1)
    store_csv(results, name + ".csv")
Пример #10
0
# Summary statistics and structure of the frame.
df.describe()
df.describe().T
df.info()
df.describe(include='O')

# Count missing values per column.
df.isna().sum()

# Drop the unwanted identifier column.
df = df.drop(columns=['Loan_ID'])

# Impute the categorical variables. For each column, first look at the value
# counts (including NaN), then impute the column.
from sklearn_pandas import CategoricalImputer

imputer = CategoricalImputer()
for column in ('Gender', 'Married', 'Dependents', 'Self_Employed',
               'Credit_History'):
    df[column].value_counts(dropna=False)
    df[column] = imputer.fit_transform(df[column])

df.isna().sum()

# Numeric columns are imputed separately, starting with LoanAmount.
df['LoanAmount'].isna().sum()
Пример #11
0
def impute_categorical(df, col_name):
    """Impute missing values of ``df[col_name]`` with a fresh CategoricalImputer.

    The column is replaced on ``df`` itself; the same frame is returned.
    """
    original_column = df[col_name]
    df[col_name] = CategoricalImputer().fit_transform(original_column)
    return df
Пример #12
0
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week'
]
# Every column that is neither numeric nor the target counts as categorical.
non_categorical = num + ['class']
cat = [name for name in adult.columns if name not in non_categorical]

# Optional whitespace cleanup, currently disabled:
#adult.workclass = adult.workclass.str.strip()
#adult.education = adult.education.str.strip()

# Numeric columns are imputed and standardized; categorical columns are
# imputed and label-encoded. The order below fixes the output column order.
scaled_columns = ('age', 'fnlwgt', 'education_num', 'capital_gain',
                  'capital_loss', 'hours_per_week')
encoded_columns = ('workclass', 'education', 'marital_status', 'occupation',
                   'relationship', 'race', 'sex', 'native_country')

preprocess = DataFrameMapper(
    [([column], [Imputer(), StandardScaler()]) for column in scaled_columns]
    + [([column], [CategoricalImputer(), LabelEncoder()])
       for column in encoded_columns],
    df_out=True)
df = preprocess.fit_transform(adult)
Пример #13
0
# Build the target: within each player_id group, shift(-1) pulls the goals of
# the following row onto the current row; rows without a following row are
# dropped.
y = (
    df[["player_id", "goals"]]
    .groupby("player_id")
    .shift(-1)
    .dropna(subset=["goals"])
)

# Join features and target on the index; the shifted column is suffixed
# "_next" because "goals" already exists in X.
train = pd.merge(X, y, left_index=True, right_index=True,
                 suffixes=("", "_next"))

target = "goals_next"
X_train = train.drop(columns=[target])
y_train = train[target]

# "position" is imputed and binarized; the numeric stats are imputed and
# standardized.
numeric_stats = ["goals", "assists", "shots", "ice_time"]
mapper = DataFrameMapper(
    [("position", [CategoricalImputer(), LabelBinarizer()])]
    + [([stat], [SimpleImputer(), StandardScaler()]) for stat in numeric_stats],
    df_out=True,
)

model = LinearRegression()

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

with open("pickles/pipe.pkl", "wb") as f:
test_data = pd.read_csv('big_mart_test.csv')

# Deal With Missing Data
# The missingno library provides a neat way to showcase which variables have
# missing data. This is done below using a bar chart. The two columns that
# have missing data (Item_Weight, Outlet_Size) are then filled in.
msno.bar(train_data)
msno.bar(test_data)

# Item_Weight: fill with the training mean. The filled column is assigned
# back instead of calling fillna(inplace=True) on a column selection, which
# is a chained assignment. The test set reuses the training statistic so both
# sets are preprocessed identically.
item_weight_mean = train_data['Item_Weight'].mean()
train_data['Item_Weight'] = train_data['Item_Weight'].fillna(item_weight_mean)
test_data['Item_Weight'] = test_data['Item_Weight'].fillna(item_weight_mean)

# Outlet_Size: fit the categorical imputer on the training data only and
# apply that fit to the test data (previously it was re-fitted on the test
# set).
outlet_size_tr = train_data['Outlet_Size']
outlet_size_ts = test_data['Outlet_Size']
imputer1 = CategoricalImputer()
outlet_size_tr = imputer1.fit_transform(outlet_size_tr)
outlet_size_ts = imputer1.transform(outlet_size_ts)

# Put the imputed column back at position 8 in both frames.
train_data = train_data.drop(['Outlet_Size'], axis=1)
train_data.insert(8, 'Outlet_Size', outlet_size_tr)

test_data = test_data.drop(['Outlet_Size'], axis=1)
test_data.insert(8, 'Outlet_Size', outlet_size_ts)

# Let's see if there are any columns we can drop

cor = train_data.corr()
cor["Item_Outlet_Sales"].sort_values(ascending=False)

# The year that an outlet was established has a very low correlation figure
Пример #15
0
    def encodeCategoricalValuesPrediction(self,data):
        """
        Method Name: encodeCategoricalValuesPrediction
        Description: Imputes missing values in the prediction set and expands
                     'laundry_options' and 'parking_options' into fixed 0/1
                     indicator columns ('laundry_options_1'..'_4' and
                     'parking_options_1'..'_6'). 'laundry in bldg' and
                     'attached garage' are the all-zero baselines; any other
                     unrecognised value leaves its indicator columns as NaN.
        Input: data - DataFrame containing 'laundry_options' and
               'parking_options'. Imputed columns are written back to it.
        Output: A new DataFrame with the indicator columns appended and the
                two source columns removed.
        On Failure: Raise Exception

        Written By: Ajinkya Abhang
        Version: 1.0
        Revisions: None
        """

        # Impute object-typed (categorical) columns that contain missing values.
        features_nan = [feature for feature in data.columns
                        if data[feature].dtypes == 'O' and data[feature].isnull().any()]
        if features_nan:
            imputer = CategoricalImputer()
            for cat_feature in features_nan:
                data[cat_feature] = imputer.fit_transform(data[cat_feature])

        # Impute non-object columns that contain at least one missing value.
        # (The previous "> 1" threshold skipped columns with exactly one NaN.)
        numerical_with_nan = [feature for feature in data.columns
                              if data[feature].dtypes != 'O' and data[feature].isnull().any()]
        if numerical_with_nan:
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            data[numerical_with_nan] = imputer.fit_transform(data[numerical_with_nan])

        # For each source column: number of indicator columns and, per known
        # category, the 1-based indicator that is set to 1 (0 = all zeros).
        option_layouts = (
            ('laundry_options', 4, {
                'laundry on site': 1,
                'no laundry on site': 2,
                'w/d hookups': 3,
                'w/d in unit': 4,
                'laundry in bldg': 0,
            }),
            ('parking_options', 6, {
                'carport': 1,
                'detached garage': 2,
                'no parking': 3,
                'off-street parking': 4,
                'street parking': 5,
                'valet parking': 6,
                'attached garage': 0,
            }),
        )

        # Whole-column assignments replace the former row-by-row chained
        # indexing (dat[col][i] = v), which is not guaranteed to write to the
        # frame and assumed a 0..n-1 index.
        dat = data.copy()
        for source, width, hot_position in option_layouts:
            position = dat[source].map(hot_position)
            known = position.notna()
            for k in range(1, width + 1):
                indicator = (position == k).astype(float)
                # Unrecognised categories stay NaN, as before.
                dat[source + '_' + str(k)] = indicator.where(known)

        dat.drop(['laundry_options', 'parking_options'], axis=1, inplace = True)

        return dat
 def fit(self, X, y=None):
     """Attach a fresh CategoricalImputer as ``self.imputer`` and return self.

     ``X`` and ``y`` are accepted but not read here, and the imputer itself is
     not fitted in this method.
     """
     categorical_imputer = CategoricalImputer()
     self.imputer = categorical_imputer
     return self
Пример #17
0
# Encode the 'name' column into the new 'model' column.
le.fit(df['name'])
df['model'] = le.transform(df['name'])

# Train / test split
target = 'price'
X = df.drop(columns=[target])
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


def _imputed_and_binarized(column):
    """Return the mapper entry that imputes and then binarizes ``column``."""
    return (column, [CategoricalImputer(), LabelBinarizer()])


# DataFrame mapper. Columns currently left out on purpose: region,
# manufacturer, title_status, vin and type.
mapper = DataFrameMapper(
    [
        (['year'], StandardScaler()),
        ('model', [CategoricalImputer()]),
        _imputed_and_binarized('cylinders'),
        _imputed_and_binarized('fuel'),
        (['odometer'], [SimpleImputer(), StandardScaler()]),
        _imputed_and_binarized('transmission'),
        _imputed_and_binarized('paint_color'),
        _imputed_and_binarized('condition'),
    ],
    df_out=True)
def doprediction():
    """Preprocess the donor/recipient tables from the request and score them.

    The request body (``request.data``) is parsed as JSON with the keys
    ``"meldrange"``, ``"donor"`` and ``"allrecip"``. The donor and recipient
    values are row lists whose first row is the header. Both tables are
    imputed and min-max scaled, written to CSV files under ``datafile/``, and
    then ``prediction.matching()`` and ``prediction.predictscore()`` are
    called.

    Returns:
        A JSON string with the keys ``"match"`` and ``"predict"``.
    """
    info = request.data
    json_data = json.loads(info)
    meldrange = json_data["meldrange"]
    meldrange = float(meldrange)
    donor_data = json_data["donor"]
    dolen = len(donor_data)
    allrecip_data = json_data["allrecip"]
    allrecip_len = len(allrecip_data)
    # Row 0 of each table holds the column names; the remaining rows are data.
    donor_df = pd.DataFrame(data=donor_data[1:dolen], columns=donor_data[0])
    allrecip_df = pd.DataFrame(data=allrecip_data[1:allrecip_len],
                               columns=allrecip_data[0])

    # Replace any previous files with the freshly received tables.
    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    silentremove(filename)
    silentremove(filename2)
    donor_df.to_csv(filename, encoding='utf-8')
    allrecip_df.to_csv(filename2, encoding='utf-8')
    # start to impute --------------------------------------

    # Read the tables back from disk.
    # NOTE(review): presumably this round trip lets read_csv infer column
    # dtypes from the JSON values - confirm.
    donor_df = pd.read_csv('datafile/donorfile.csv', index_col=0)
    recipient_df = pd.read_csv('datafile/recipfile.csv', index_col=0)
    # Keep recipient ids and MELD/PELD scores aside, before any imputation.
    id_df = pd.DataFrame(
        recipient_df[['recipient_id', 'FINAL_MELD_PELD_LAB_SCORE']])
    # Split each table into object-typed ("cf") and all other ("ncf") columns.
    X_cf_r = recipient_df.select_dtypes(include=['object'])
    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])

    X_cf_d = donor_df.select_dtypes(include=['object'])
    X_ncf_d = donor_df.select_dtypes(exclude=['object'])

    # Impute the object-typed columns. The rebuilt frames get a fresh default
    # index because only the column names are passed on.
    imp_cat = CategoricalImputer()
    X_cf_r = pd.DataFrame(imp_cat.fit_transform(np.array(X_cf_r)),
                          columns=X_cf_r.columns)

    imp_cat = CategoricalImputer()
    X_cf_d = pd.DataFrame(imp_cat.fit_transform(np.array(X_cf_d)),
                          columns=X_cf_d.columns)

    # Mean-impute the remaining columns, recipients first, then donors.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_ncf_r)
    X_ncf_r = pd.DataFrame(imp.transform(X_ncf_r), columns=X_ncf_r.columns)

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_ncf_d)
    X_ncf_d = pd.DataFrame(imp.transform(X_ncf_d), columns=X_ncf_d.columns)

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    # donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    # A meldrange of 200 means "no filtering"; otherwise keep recipients whose
    # score lies in [meldrange - 20, meldrange).
    if meldrange != 200:
        id_df = id_df.loc[(id_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) & (
            id_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20)]
        recipient_df = recipient_df.loc[
            (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange)
            & (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20.0)]

    # Re-split the (possibly filtered) recipients for scaling.
    X_cf_r = recipient_df.select_dtypes(include=['object'])

    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])

    # Min-max scale the non-object columns of the donor table.
    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_d.columns
    X_ncf_d = min_max_scaler.fit_transform(X_ncf_d)
    X_ncf_d = pd.DataFrame(X_ncf_d, columns=header)

    # Same for the recipients, with a separately fitted scaler.
    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_r.columns
    X_ncf_r = min_max_scaler.fit_transform(X_ncf_r)

    X_ncf_r = pd.DataFrame(X_ncf_r, columns=header)
    # Restore the filtered index so the index-based merge below lines up.
    X_ncf_r.index = X_cf_r.index
    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    print("recipdf", recipient_df)
    donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    # Replace the files on disk with the preprocessed tables.
    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    filename3 = 'datafile/recipidfile.csv'
    silentremove(filename)
    silentremove(filename2)
    silentremove(filename3)
    donor_df.to_csv(filename, encoding='utf-8')

    print("meldrange", meldrange)
    # if meldrange!=200:
    #     id_df = id_df.loc[(id_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) & (id_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20)]
    #     recipient_df = recipient_df.loc[(recipient_df['FINAL_MELD_PELD_LAB_SCORE']<meldrange) & (recipient_df['FINAL_MELD_PELD_LAB_SCORE']>= meldrange-20.0)]

    # Only the recipient ids are written to the id file.
    id_df = pd.DataFrame(id_df['recipient_id'], columns=['recipient_id'])

    recipient_df.to_csv(filename2, encoding='utf-8')
    id_df.to_csv(filename3, encoding='utf-8')

    # Imported here, after the CSV files have been written.
    # NOTE(review): presumably the prediction module reads those files - confirm.
    import prediction
    match_score = prediction.matching()
    predict_score = prediction.predictscore()
    return json.dumps({'match': match_score, 'predict': predict_score})
Пример #19
0
# Column names split by the boolean categorical mask: masked-in columns are
# categorical, all remaining columns are treated as numeric.
categorical_columns = X.columns[categorical_feature_mask].tolist()
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Numeric imputer: one median Imputer per numeric column.
numeric_features = [
    ([numeric_feature], Imputer(strategy="median"))
    for numeric_feature in non_categorical_columns
]
numeric_imputation_mapper = DataFrameMapper(numeric_features,
                                            input_df=True,
                                            df_out=True)

# Categorical imputer: one CategoricalImputer per categorical column.
categorical_features = [
    (category_feature, CategoricalImputer())
    for category_feature in categorical_columns
]
categorical_imputation_mapper = DataFrameMapper(categorical_features,
                                                input_df=True,
                                                df_out=True)

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Run both mappers and place their outputs side by side.
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper),
])

# Create full pipeline
pipeline = Pipeline([("featureunion", numeric_categorical_union),
Пример #20
0
# # Missing values
# ============================================================================='''

# Number of missing values per column.
data_missing = dataset.isnull().sum()
print(data_missing)

# Numpy array of all columns but the last, used for imputing missing values.
X = dataset.iloc[:, :-1].values

# =============================================================================
## Missing categorical values: column 8 is written back as 'Outlet_Size'
# =============================================================================
from sklearn_pandas import CategoricalImputer

data = np.array(X[:, 8], dtype=object)
imputer = CategoricalImputer()
X[:, 8] = imputer.fit_transform(data)
dataset['Outlet_Size'] = X[:, 8]

# =============================================================================
# # Numeric values: column 1 is mean-imputed and written back as 'Item_Weight'
# =============================================================================

from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
# The 1:2 slice keeps the column two-dimensional.
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])
dataset['Item_Weight'] = X[:, 1:2]

# Check the values in Item_Visibility.
dataset['Item_Visibility'].value_counts()
# Replace 0 with NaN
def test_invalid_strategy():
    """
    Constructing the imputer with an unsupported strategy must raise
    ValueError.
    """
    unsupported = "not_a_supported_strategy"
    with pytest.raises(ValueError):
        CategoricalImputer(strategy=unsupported)
Пример #22
0
]])

# Target: within each player_id group, shift(-1) pulls the goals of the
# following row onto the current row; rows with no following row are dropped.
shifted = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = shifted.dropna(subset=['goals'])

# Index-aligned join of features and target; the shifted goals column gets
# the '_next' suffix because X already has a 'goals' column.
train = pd.merge(X, y, left_index=True, right_index=True,
                 suffixes=('', '_next'))

target = 'goals_next'
y_train = train[target]
X_train = train.drop(columns=[target])


def _scaled(column):
    """Return the mapper entry that imputes and standardizes ``column``."""
    return ([column], [SimpleImputer(), StandardScaler()])


mapper = DataFrameMapper(
    [
        ('position', [CategoricalImputer(), LabelBinarizer()]),
        _scaled('goals'),
        _scaled('assists'),
        _scaled('shots'),
        _scaled('ice_time'),
    ],
    df_out=True)

model = LinearRegression()

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)
# Score on the training data itself.
score = pipe.score(X_train, y_train)

# Persist the fitted pipeline.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)
def impute_categorical_features(df, features):
    """Impute the given categorical columns of ``df`` in place.

    One CategoricalImputer is created per column in ``features`` and applied
    through a DataFrameMapper; the result is written back to ``df[features]``.
    Nothing is returned.
    """
    feature_defs = [(col_name, CategoricalImputer()) for col_name in features]
    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    imputed = mapper.fit_transform(df[features])
    df[features] = imputed
Пример #24
0
class Preprocessor:
    """
    Clean and transform the data before training.

    Every public method logs its entry, its outcome and any failure by
    calling ``logger_object.log(file_object, message)``. On failure the
    original error text is logged and a bare ``Exception`` is raised,
    chained to the original error.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        """Store the log destination and the logger used by every method."""
        self.file_object = file_object
        self.logger_object = logger_object

    def _log(self, message):
        """Write one message through the configured logger."""
        self.logger_object.log(self.file_object, message)

    def _log_failure(self, method_name, outcome, error):
        """Log the error text, then the failure summary, for ``method_name``."""
        self._log('Exception occured in ' + method_name
                  + ' method of the Preprocessor class. Exception message:  ' + str(error))
        self._log(outcome + ' Exited the ' + method_name + ' method of the Preprocessor class')

    def remove_unwanted_spaces(self, data):
        """
        Method Name: remove_unwanted_spaces
        Description: Strip leading/trailing whitespace from every object-dtype column.
        Output: A pandas DataFrame with the stripped values (the input is not modified).
        On Failure: Raise Exception
        """
        self._log('Entered the remove_unwanted_spaces method of the Preprocessor class')
        self.data = data

        try:
            # Only object-dtype columns are stripped; other columns pass through unchanged.
            self.df_without_spaces = self.data.apply(
                lambda x: x.str.strip() if x.dtype == "object" else x)
            self._log('Unwanted spaces removal Successful.Exited the remove_unwanted_spaces method of the Preprocessor class')
            return self.df_without_spaces
        except Exception as e:
            self._log_failure('remove_unwanted_spaces', 'unwanted space removal Unsuccessful.', e)
            raise Exception() from e

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: Remove the given columns from a pandas DataFrame.
        Output: A pandas DataFrame without the specified columns.
        On Failure: Raise Exception
        """
        self._log('Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self._log('Column removal Successful.Exited the remove_columns method of the Preprocessor class')
            return self.useful_data
        except Exception as e:
            self._log_failure('remove_columns', 'Column removal Unsuccessful.', e)
            raise Exception() from e

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: Separate the feature columns from the label column(s).
        Output: Tuple ``(X, Y)``: ``data`` without ``label_column_name`` and
                ``data[label_column_name]``.
        On Failure: Raise Exception
        """
        self._log('Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self._log('Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X, self.Y
        except Exception as e:
            self._log_failure('separate_label_feature', 'Label Separation Unsuccessful.', e)
            raise Exception() from e

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: Check whether the DataFrame contains null values. When it
                     does, the per-column null counts are also written to
                     'preprocessing_data/null_values.csv'.
        Output: Tuple ``(null_present, cols_with_missing_values)``: a bool and
                the list of column names that contain at least one null.
        On Failure: Raise Exception
        """
        self._log('Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # count of null values per column
            # Pair names and counts by position. Indexing the label-indexed
            # Series with an integer key (null_counts[i]) relies on a
            # deprecated positional fallback and is ambiguous for integer
            # column labels.
            for col, count in zip(self.cols, self.null_counts):
                if count > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(col)
            if self.null_present:
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(self.null_counts)
                # The 'preprocessing_data' directory must already exist.
                self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv')
            self._log('Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self._log_failure('is_null_present', 'Finding missing values failed.', e)
            raise Exception() from e

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: Fill the missing values of the listed columns using
                     CategoricalImputer; if the imputer fails on a column, that
                     column is filled with its most frequent value instead.
        Output: The DataFrame with the listed columns imputed (``data`` itself
                is modified).
        On Failure: Raise Exception
        """
        self._log('Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                try:
                    self.data[col] = self.imputer.fit_transform(self.data[col])
                except Exception:
                    # Best-effort fallback: use the column's mode. `Exception`
                    # (not a bare except) lets KeyboardInterrupt/SystemExit through.
                    self.data[col] = data[col].fillna(data[col].mode()[0])
            self._log('Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self._log_failure('impute_missing_values', 'Imputing missing values failed.', e)
            raise Exception() from e

    def scale_numerical_columns(self, data):
        """
        Method Name: scale_numerical_columns
        Description: Scale a fixed list of numerical columns with StandardScaler.
        Output: A DataFrame with the scaled columns first, followed by the
                remaining columns. The scaled columns are dropped from ``data``
                in place.
        On Failure: Raise Exception (a missing numerical column raises the
                    original KeyError, because the selection happens before
                    the guarded section).
        """
        self._log('Entered the scale_numerical_columns method of the Preprocessor class')

        self.data = data
        self.num_df = self.data[['months_as_customer', 'policy_deductable', 'umbrella_limit',
                                 'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
                                 'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim',
                                 'property_claim',
                                 'vehicle_claim']]

        try:
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            # Re-attach column names and the original index so concat aligns rows.
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data, columns=self.num_df.columns, index=self.data.index)
            self.data.drop(columns=self.scaled_num_df.columns, inplace=True)
            self.data = pd.concat([self.scaled_num_df, self.data], axis=1)

            self._log('scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class')
            return self.data

        except Exception as e:
            self._log_failure('scale_numerical_columns', 'scaling for numerical columns Failed.', e)
            raise Exception() from e

    def encode_categorical_columns(self, data):
        """
        Method Name: encode_categorical_columns
        Description: Encode the object-dtype columns as numbers: fixed value
                     maps for the known columns, dummy encoding
                     (``drop_first=True``) for every other object column.
        Output: A DataFrame with the encoded columns first, followed by the
                non-object columns. The object columns are dropped from
                ``data`` in place.
        On Failure: Raise Exception
        """
        self._log('Entered the encode_categorical_columns method of the Preprocessor class')

        self.data = data
        try:
            self.cat_df = self.data.select_dtypes(include=['object']).copy()
            self.cat_df['policy_csl'] = self.cat_df['policy_csl'].map({'100/300': 1, '250/500': 2.5, '500/1000': 5})
            self.cat_df['insured_education_level'] = self.cat_df['insured_education_level'].map(
                {'JD': 1, 'High School': 2, 'College': 3, 'Masters': 4, 'Associate': 5, 'MD': 6, 'PhD': 7})
            self.cat_df['incident_severity'] = self.cat_df['incident_severity'].map(
                {'Trivial Damage': 1, 'Minor Damage': 2, 'Major Damage': 3, 'Total Loss': 4})
            self.cat_df['insured_sex'] = self.cat_df['insured_sex'].map({'FEMALE': 0, 'MALE': 1})
            self.cat_df['property_damage'] = self.cat_df['property_damage'].map({'NO': 0, 'YES': 1})
            self.cat_df['police_report_available'] = self.cat_df['police_report_available'].map({'NO': 0, 'YES': 1})

            # Columns already mapped above must not be dummy-encoded below.
            self.cols_to_drop = ['policy_csl', 'insured_education_level', 'incident_severity', 'insured_sex',
                                 'property_damage', 'police_report_available']
            # The label column is only present in training data; prediction
            # data does not carry it.
            if 'fraud_reported' in self.cat_df.columns:
                self.cat_df['fraud_reported'] = self.cat_df['fraud_reported'].map({'N': 0, 'Y': 1})
                self.cols_to_drop.append('fraud_reported')

            # Dummy-encode the remaining categorical columns one at a time.
            for col in self.cat_df.drop(columns=self.cols_to_drop).columns:
                self.cat_df = pd.get_dummies(self.cat_df, columns=[col], prefix=[col], drop_first=True)

            self.data.drop(columns=self.data.select_dtypes(include=['object']).columns, inplace=True)
            self.data = pd.concat([self.cat_df, self.data], axis=1)
            self._log('encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class')
            return self.data

        except Exception as e:
            self._log_failure('encode_categorical_columns', 'encoding for categorical columns Failed.', e)
            raise Exception() from e

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: Balance the dataset with RandomOverSampler.
        Output: Tuple ``(x_sampled, y_sampled)`` with the resampled features
                and target.
        On Failure: Raise Exception
        """
        self._log('Entered the handle_imbalanced_dataset method of the Preprocessor class')

        try:
            self.rdsmple = RandomOverSampler()
            # imbalanced-learn renamed fit_sample to fit_resample and later
            # removed the old name; prefer the new one, fall back to the old.
            resample = getattr(self.rdsmple, 'fit_resample', None) or self.rdsmple.fit_sample
            self.x_sampled, self.y_sampled = resample(x, y)
            self._log('dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class')
            return self.x_sampled, self.y_sampled

        except Exception as e:
            self._log_failure('handle_imbalanced_dataset', 'dataset balancing Failed.', e)
            raise Exception() from e
Пример #25
0
)
# Load the Titanic training data and print its column/dtype summary.
titanic_train = pd.read_csv("Titanic_train.csv")
print(titanic_train.info())

# Preprocessing stage
# Impute missing values for the continuous features.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22
# (SimpleImputer in sklearn.impute replaces it) - confirm the pinned version.
imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
# statistics_ holds the fitted fill value for each feature.
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

# Impute missing values for the categorical feature 'Embarked'.
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
# fill_ is the value the fitted imputer substitutes for missing entries.
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

# Replace the 'Embarked' categories with integer codes.
le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

# Replace the 'Sex' categories with integer codes.
le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

# Encoder created here; its use is not visible in this snippet.
le_pclass = preprocessing.LabelEncoder()
Пример #26
0
def processData():
    """Load train.csv / test.csv and return model-ready arrays.

    The categorical and numerical feature columns are preprocessed by two
    pipelines joined in a FeatureUnion (numerical output first). The union is
    fitted on the training file and then applied to the test file.

    Returns:
        X_train, Y_train, X_test, Y_test, where each row of a Y array is
        ``[GSTATUS, GTIME]``.
    """
    cat_features = [
        'GENDER', 'ABO', 'LIFE_SUP_TCR', 'MALIG_TCR', 'EXC_HCC', 'EXC_CASE',
        'PERM_STATE', 'PREV_AB_SURG_TCR', 'BACT_PERIT_TCR', 'PORTAL_VEIN_TCR',
        'TIPSS_TCR', 'WORK_INCOME_TCR', 'INIT_DIALYSIS_PRIOR_WEEK',
        'INIT_MELD_OR_PELD', 'FINAL_DIALYSIS_PRIOR_WEEK', 'FINAL_MELD_OR_PELD',
        'PERM_STATE_TRR', 'WORK_INCOME_TRR', 'MALIG_TRR', 'LIFE_SUP_TRR',
        'PORTAL_VEIN_TRR', 'PREV_AB_SURG_TRR', 'TIPSS_TRR', 'HBV_CORE',
        'HBV_SUR_ANTIGEN', 'HCV_SEROSTATUS', 'EBV_SEROSTATUS',
        'HIV_SEROSTATUS', 'CMV_STATUS', 'CMV_IGG', 'CMV_IGM', 'TXLIV',
        'PREV_TX', 'DDAVP_DON', 'CMV_DON', 'HEP_C_ANTI_DON', 'HBV_CORE_DON',
        'HBV_SUR_ANTIGEN_DON', 'DON_TY', 'GENDER_DON', 'HOME_STATE_DON',
        'NON_HRT_DON', 'ANTIHYPE_DON', 'PT_DIURETICS_DON', 'PT_STEROIDS_DON',
        'PT_T3_DON', 'PT_T4_DON', 'VASODIL_DON', 'VDRL_DON', 'CLIN_INFECT_DON',
        'EXTRACRANIAL_CANCER_DON', 'HIST_CIG_DON', 'HIST_COCAINE_DON',
        'DIABETES_DON', 'HIST_HYPERTENS_DON', 'HIST_OTH_DRUG_DON', 'ABO_DON',
        'INTRACRANIAL_CANCER_DON', 'SKIN_CANCER_DON', 'HIST_CANCER_DON',
        'PT_OTH_DON', 'HEPARIN_DON', 'ARGININE_DON', 'INSULIN_DON', 'DIAL_TX',
        'ABO_MAT', 'AGE_GROUP', 'MALIG', 'RECOV_OUT_US', 'TATTOOS',
        'LI_BIOPSY', 'PROTEIN_URINE', 'CARDARREST_NEURO',
        'INOTROP_SUPPORT_DON', 'CDC_RISK_HIV_DON', 'HISTORY_MI_DON',
        'CORONARY_ANGIO_DON', 'LT_ONE_WEEK_DON'
    ]
    num_features = [
        'WGT_KG_DON_CALC', 'INIT_INR', 'ETHCAT_DON', 'ETHNICITY', 'DGN_TCR',
        'REM_CD', 'INIT_AGE', 'ALBUMIN_TX', 'BMI_DON_CALC', 'EXC_EVER',
        'OTH_LIFE_SUP_TCR', 'FINAL_ASCITES', 'WGT_KG_CALC', 'END_BMI_CALC',
        'LISTYR', 'DDR1', 'FINAL_ALBUMIN', 'DB2', 'INIT_BMI_CALC',
        'CITIZENSHIP', 'DB1', 'EDUCATION', 'DAYSWAIT_CHRON',
        'OTH_LIFE_SUP_TRR', 'MED_COND_TRR', 'INIT_WGT_KG',
        'MELD_PELD_LAB_SCORE', 'NUM_PREV_TX', 'INIT_SERUM_SODIUM',
        'VENTILATOR_TCR', 'TX_PROCEDUR_TY', 'LITYP', 'INIT_SERUM_CREAT',
        'WGT_KG_TCR', 'TBILI_DON', 'HGT_CM_CALC', 'SGOT_DON', 'ASCITES_TX',
        'INIT_MELD_PELD_LAB_SCORE', 'ECD_DONOR', 'CREAT_TX', 'INIT_ENCEPH',
        'INIT_HGT_CM', 'PRI_PAYMENT_TRR', 'INIT_STAT', 'ARTIFICIAL_LI_TCR',
        'PT_CODE', 'WL_ID_CODE', 'INIT_ALBUMIN', 'ARTIFICIAL_LI_TRR',
        'AGE_DON', 'ON_VENT_TRR', 'PRI_PAYMENT_TCR', 'BLOOD_INF_DON',
        'CREAT_DON', 'REGION', 'INIT_ASCITES', 'HEMATOCRIT_DON', 'DIAB',
        'TBILI_TX', 'FINAL_INR', 'AGE', 'FUNC_STAT_TRR', 'ETHCAT',
        'CITIZENSHIP_DON', 'DEATH_MECH_DON', 'FUNC_STAT_TCR',
        'FINAL_SERUM_SODIUM', 'COD_CAD_DON', 'FINAL_BILIRUBIN', 'BUN_DON',
        'END_STAT', 'BMI_CALC', 'DDR2', 'FINAL_SERUM_CREAT',
        'HIST_DIABETES_DON', 'ENCEPH_TX', 'SHARE_TY', 'DA1', 'PH_DON',
        'FINAL_MELD_PELD_LAB_SCORE', 'BMI_TCR', 'INIT_BILIRUBIN', 'DISTANCE',
        'SGPT_DON', 'PULM_INF_DON', 'HGT_CM_TCR', 'TRANSFUS_TERM_DON',
        'FINAL_ENCEPH', 'DIAG', 'DA2', 'HGT_CM_DON_CALC', 'URINE_INF_DON',
        'COLD_ISCH', 'INR_TX', 'DEATH_CIRCUM_DON', 'CANCER_SITE_DON'
    ]
    feature_columns = cat_features + num_features

    def build_targets(frame):
        # The three horizon-specific status columns are read but do not feed
        # the returned array; a missing column still raises KeyError here.
        _six_months = frame["GSTATUS_SIX_MONTHS"].values
        _one_year = frame["GSTATUS_ONE_YEAR"].values
        _three_years = frame["GSTATUS_THREE_YEARS"].values
        status = frame["GSTATUS"].values
        duration = frame["GTIME"].values
        # Each row: [is_not_censored, survival time]
        return np.array([[flag, span] for flag, span in zip(status, duration)])

    # Categorical branch: select -> impute -> dense one-hot encode.
    cat_steps = [
        ('selector', DataFrameSelector(cat_features)),
        ('imputer', CategoricalImputer()),
        ('cat_encoder',
         CategoricalEncoder("onehot-dense", handle_unknown='ignore')),
    ]
    cat_pipeline = Pipeline(cat_steps)

    # Numerical branch: select -> median impute -> standardize.
    num_steps = [
        ('selector', DataFrameSelector(num_features)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
    num_pipeline = Pipeline(num_steps)

    # Both branches side by side, numerical output first.
    branches = [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
    full_pipeline = FeatureUnion(transformer_list=branches)

    train = pd.read_csv("train.csv")
    X_train = full_pipeline.fit_transform(train.loc[:, feature_columns])
    Y_train = build_targets(train)

    test = pd.read_csv("test.csv")
    X_test = full_pipeline.transform(test.loc[:, feature_columns])
    Y_test = build_targets(test)

    return X_train, Y_train, X_test, Y_test
Пример #27
0
class Preprocessor:
    """
    Prepare a DataFrame for training: turn '?' placeholders into nulls,
    report and impute missing values, and split off the label column.

    Methods that log do so through ``logger_object.log(file_object, message)``.
    On failure they log the original error text and raise a bare
    ``Exception`` chained to the original error.
    """

    def __init__(self, file_object, logger_object):
        """Store the log destination and the logger used by the methods."""
        self.file_object = file_object
        self.logger_object = logger_object

    def _log(self, message):
        """Write one message through the configured logger."""
        self.logger_object.log(self.file_object, message)

    def _log_failure(self, method_name, outcome, error):
        """Log the error text, then the failure summary, for ``method_name``."""
        self._log('Exception occured in ' + method_name
                  + ' method of the Preprocessor class. Exception message:  ' + str(error))
        self._log(outcome + ' Exited the ' + method_name + ' method of the Preprocessor class')

    def replaceInvalidValuesWithNull(self, data):
        """
        Replace every '?' entry with NaN, column by column.

        ``data`` is modified in place and also returned. Columns without any
        '?' entry are left untouched.
        """
        for column in data.columns:
            if (data[column] == '?').any():
                # np.nan instead of np.NaN: the capitalised alias was removed
                # in NumPy 2.0.
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: Check whether the DataFrame contains null values. When it
                     does, the per-column null counts are also written to
                     'preprocessing_data/null_values.csv'.
        Output: Tuple ``(null_present, cols_with_missing_values)``: a bool and
                the list of column names that contain at least one null.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self._log('Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # count of null values per column
            # Pair names and counts by position. Indexing the label-indexed
            # Series with an integer key (null_counts[i]) relies on a
            # deprecated positional fallback and is ambiguous for integer
            # column labels.
            for col, count in zip(self.cols, self.null_counts):
                if count > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(col)
            if self.null_present:
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(self.null_counts)
                # The 'preprocessing_data' directory must already exist.
                self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv')
            self._log('Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self._log_failure('is_null_present', 'Finding missing values failed.', e)
            raise Exception() from e

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Fill the missing values of the listed columns with CategoricalImputer.

        The imputer is re-fitted on each column in turn. ``data`` is modified
        in place and also returned. Raises Exception on failure.
        """
        self._log('Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self._log('Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data

        except Exception as e:
            self._log_failure('impute_missing_values', 'Imputing missing values failed.', e)
            raise Exception() from e

    def separate_label_feature(self, data, label_column_name):
        """
        Split ``data`` into features and label.

        Returns the tuple ``(X, Y)``: ``data`` without ``label_column_name``
        and ``data[label_column_name]``. Raises Exception on failure.
        """
        self._log('Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self._log('Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X, self.Y
        except Exception as e:
            self._log_failure('separate_label_feature', 'Label Separation Unsuccessful.', e)
            raise Exception() from e
 def transform(self, X):
     """Impute every column named in ``config.CAT_FEATURES`` and return ``X``.

     A new CategoricalImputer is fitted on each column at call time; ``X`` is
     modified in place.
     """
     for var in config.CAT_FEATURES:
         X[var] = CategoricalImputer().fit_transform(X[var])
     return X
# Get list of categorical column names (columns where the boolean mask is True)
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names (the inverse of the mask)
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer: one median Imputer per non-categorical column.
# Each column name is wrapped in a list - presumably so the transformer
# receives 2-D input; confirm against the sklearn-pandas documentation.
# NOTE(review): Imputer was removed in scikit-learn 0.22 (SimpleImputer
# replaces it) - confirm the pinned version.
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature], Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer: one CategoricalImputer per categorical column,
# selected by bare column name.
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )


## Kidney disease case study II: Feature Union

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations; the numeric mapper is
# listed first, followed by the categorical mapper.
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])
def test_missing_replacement():
    """
    strategy='fixed_value' without a replacement value must raise ValueError.
    """
    strategy_needing_replacement = "fixed_value"
    with pytest.raises(ValueError):
        CategoricalImputer(strategy=strategy_needing_replacement)
Пример #31
0
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    All log output goes through ``logger_object.log(file_object, message)``.
    Methods that wrap their work in ``try``/``except`` log the failure and then
    raise a plain ``Exception`` chained to the original error.

    Written By: Arpit Kumar
    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def _log(self, message):
        """Write one message to the log file through the configured logger."""
        self.logger_object.log(self.file_object, message)

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: Removes the given columns from a pandas DataFrame. The input,
                     the column list and the result are also stored on the instance
                     as ``data``, ``columns`` and ``useful_data``.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception
        """
        self._log('Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            # drop the labels specified in the columns
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self._log(
                'Column removal Successful.Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self._log(
                'Exception occured in remove_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self._log(
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            # keep the original error attached as the cause
            raise Exception() from e

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: Separates the feature columns from the label column(s). The two
                     parts are also stored on the instance as ``X`` and ``Y``.
        Output: Returns (features, labels): the data without the label column(s),
                and the label column(s) selected from the data.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            # everything except the label column(s) is a feature
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self._log(
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self._log(
                'Exception occured in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self._log(
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception() from e

    def dropUnnecessaryColumns(self, data, columnNameList):
        """
        Method Name: dropUnnecessaryColumns
        Description: Drops the unwanted columns as discussed in EDA section.
        Output: A new DataFrame without the columns in columnNameList.
        """
        return data.drop(columnNameList, axis=1)

    def replaceInvalidValuesWithNull(self, data):
        """
        Method Name: replaceInvalidValuesWithNull
        Description: Replaces invalid values i.e. '?' with null (np.nan), as
                     discussed in EDA. Columns are reassigned on the DataFrame that
                     was passed in, so the caller's object is modified.
        Output: The same DataFrame with '?' replaced by np.nan.
        """
        for column in data.columns:
            # only touch columns that actually contain the '?' marker
            if (data[column] == '?').any():
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: Checks whether there are null values present in the pandas
                     DataFrame. When there are, the per-column null counts are
                     written to 'preprocessing_data/null_values.csv'.
        Output: Returns (null_present, cols_with_missing_values): True/False and
                the list of columns for which null values are present.
        On Failure: Raise Exception
        """
        self._log('Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            # count of null values per column, in column order
            self.null_counts = data.isna().sum()
            # Pair columns with counts by position. Indexing the Series with an
            # integer would be treated as a label lookup and fail for integer
            # column names.
            for column, null_count in zip(self.cols, self.null_counts):
                if null_count > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(column)
            if self.null_present:
                # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(
                    self.null_counts)
                # storing the null column information to file
                self.dataframe_with_null.to_csv(
                    'preprocessing_data/null_values.csv')
            self._log(
                'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self._log(
                'Exception occured in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self._log(
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception() from e

    def encodeCategoricalValues(self, data):
        """
        Method Name: encodeCategoricalValues
        Description: Encodes all the categorical values in the training set. The
                     'class' column is mapped ('p' -> 1, 'e' -> 2) on the DataFrame
                     that was passed in; every other column is one-hot encoded
                     with pd.get_dummies.
        Output: A DataFrame which has all the categorical values encoded.
        """
        data["class"] = data["class"].map({'p': 1, 'e': 2})

        feature_columns = data.drop(['class'], axis=1).columns.tolist()
        if feature_columns:
            # one call encodes every feature column, in the same output order
            # as encoding them one at a time
            data = pd.get_dummies(data, columns=feature_columns)

        return data

    def encodeCategoricalValuesPrediction(self, data):
        """
        Method Name: encodeCategoricalValuesPrediction
        Description: Encodes all the categorical values in the prediction set by
                     one-hot encoding every column with pd.get_dummies.
        Output: A DataFrame which has all the categorical values encoded.
        """
        all_columns = data.columns.tolist()
        if all_columns:
            data = pd.get_dummies(data, columns=all_columns)

        return data

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: Replaces the missing values in the given columns using a
                     CategoricalImputer, fitted separately on each column. The
                     columns are overwritten on the DataFrame that was passed in.
        Output: The DataFrame with the listed columns imputed.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                # fit_transform refits the imputer for every column
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self._log(
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.data
        except Exception as e:
            self._log(
                'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self._log(
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception() from e

    def get_columns_with_zero_std_deviation(self, data):
        """
        Method Name: get_columns_with_zero_std_deviation
        Description: Finds the columns which have a standard deviation of zero,
                     based on ``data.describe()``. Columns for which describe()
                     reports no standard deviation are skipped.
        Output: List of the columns with standard deviation of zero.
        On Failure: Raise Exception
        """
        self._log(
            'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class'
        )
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            # describe() has no 'std' row when it summarises no numeric column
            has_std_row = 'std' in self.data_n.index
            for x in self.columns:
                # columns missing from the summary have no std to compare
                if not has_std_row or x not in self.data_n.columns:
                    continue
                # check if standard deviation is zero
                if self.data_n.loc['std', x] == 0:
                    self.col_to_drop.append(x)
            self._log(
                'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            return self.col_to_drop

        except Exception as e:
            self._log(
                'Exception occured in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message:  '
                + str(e))
            self._log(
                'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            raise Exception() from e
Пример #32
0
# Target: within each 'id' group, the Y_COLUMNS values of the following row
Y = df.groupby('id')[Y_COLUMNS].shift(-1)
X = df[X_COLUMNS]

# Keep only the rows whose shifted target is fully present, then renumber
# both frames from zero
X = X[Y.notna().all(axis=1)].reset_index(drop=True)
Y = Y.dropna().reset_index(drop=True)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Per-column preprocessing; a list of transformers is applied in sequence
mapper = DataFrameMapper(
    features=[
        (['age'], [SimpleImputer(), PolynomialFeatures(include_bias=False)]),
        (['position'], [CategoricalImputer(), LabelBinarizer()]),
        (['goals'], SimpleImputer()),
        (['assists'], SimpleImputer()),
        (['plus_minus'], SimpleImputer()),
        (['shots_on_goal'], SimpleImputer()),
        (['blocks'], SimpleImputer()),
        (['hits'], SimpleImputer()),
    ],
    df_out=False,
)

# Fit the mapper on the training split only, then reuse it on the test split
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

from sklearn.linear_model import LinearRegression

# One linear regression per target column
multi_model = MultiOutputRegressor(LinearRegression())
multi_model.fit(Z_train, Y_train)

# Inspection expressions (their values are not stored)
pd.DataFrame(Z_test).iloc[:, 8:].head()
pd.DataFrame(multi_model.predict(Z_test)).head()
multi_model.score(Z_test, Y_test)
Пример #33
0
def impute_categorical_features(df, features):
    """Impute missing values of ``df[features]`` and write them back to ``df``.

    A CategoricalImputer is fitted on ``df[features]``, its ``fill_``
    attribute is printed, and ``df[features]`` is overwritten with the
    transformed values. Nothing is returned.
    """
    imputer = CategoricalImputer()
    selected = df[features]
    imputer.fit(selected)
    print(imputer.fill_)
    df[features] = imputer.transform(selected)
# Fit the encoder on 'name' and transform it (the result is not stored)
le.fit(X_train['name'])
le.transform(X_train['name'])

# Same for 'location'; this refit replaces the state fitted on 'name'
le.fit(X_train['location'])
le.transform(X_train['location'])

# In[ ]:

# Per-column preprocessing; a list of transformers is applied in sequence
mapper = DataFrameMapper(
    features=[
        (['name'], [LabelBinarizer()]),
        (['location'], [LabelBinarizer()]),
        (['year'], [StandardScaler()]),
        (['kilometers_driven'], [SimpleImputer(), StandardScaler()]),
        (['fuel_type'], [CategoricalImputer(), LabelBinarizer()]),
        (['transmission'], [CategoricalImputer(), LabelBinarizer()]),
        (['owner_type'], [SimpleImputer(), StandardScaler()]),
        (['seats'], [SimpleImputer(), StandardScaler()]),
    ],
    df_out=True,
)

# In[ ]:

# Fit the mapper on the training data
Z_train = mapper.fit_transform(X_train)

# In[ ]:

# Apply the already-fitted mapper to the test data
Z_test = mapper.transform(X_test)