示例#1
0
def dataPreprocessing(df):
    """Impute ``df`` and label-encode each of its columns.

    The frame is first run through ``DataFrameImputer().fit_transform``.
    Every column of the imputed result is then passed to the
    ``fit_transform`` method of one ``preprocessing.LabelEncoder`` instance
    via ``DataFrame.apply``, so the encoder is re-fitted for each column.

    Args:
        df: the frame to preprocess (handed unchanged to the imputer).

    Returns:
        The imputed frame with every column replaced by its encoded values.
    """
    imputed = DataFrameImputer().fit_transform(df)

    # A single encoder instance is reused; apply() re-fits it per column.
    encoder = preprocessing.LabelEncoder()
    return imputed.apply(encoder.fit_transform)
 def dataLoading(self,file):
     """Parse a comma-separated text file into imputed features and labels.

     Every line is normalised with a fixed, ordered list of textual
     substitutions, stripped of its last two characters and split on commas.
     The final field of each row is the label; the remaining fields are the
     features. The first eight parsed rows are discarded.

     NOTE(review): the code hard-codes the data size -- it overwrites the
     label at index 399 and reshapes the labels to ``(400,)`` -- so it only
     works when exactly 400 rows remain after the first eight are dropped.
     ``func`` is not defined in this block; presumably it is a module-level
     helper -- confirm against the rest of the file.

     Args:
         file: path passed to ``open``.

     Returns:
         A ``(features_after, labels)`` tuple: the feature frame returned by
         ``DataFrameImputer().fit_transform`` and a NumPy array of labels
         with shape ``(400,)``.
     """
     # Substitutions are applied top to bottom on every line; keep the order.
     substitutions = (
         ("@data", ""),
         ("yes", "0"),
         ("no,", "1,"),
         ("poor", "0"),
         ("good", "1"),
         (",normal", ",1"),
         (",abnormal", ",0"),
         (",notpresent", ",1"),
         (",present", ",0"),
         ("?", "NaN"),
         (",notckd", ",1"),
         (",ckd", ",0"),
         ("\t", ""),
     )

     features = []
     labels = []
     with open(file) as f:
         for raw_line in f:
             text = raw_line
             for old, new in substitutions:
                 text = text.replace(old, new)
             # Drop the last two characters of the line
             # (presumably a trailing separator and newline -- TODO confirm).
             row = text[:-2].split(",")
             labels.append(row[-1])
             features.append(row[:-1])

     # Skip the first eight parsed rows.
     features = features[8:]
     labels = labels[8:]
     # Force the label of row 399 to "1".
     labels[399] = "1"

     features = [list(func(i)) for i in features]
     labels = [list(func(i)) for i in labels]
     labels = np.asarray(labels).reshape((400,))

     # Values that cannot be parsed as numbers become NaN.
     features = [pd.to_numeric(i,errors="coerce") for i in features]
     features_before = pd.DataFrame(features)
     features_after = DataFrameImputer().fit_transform(features_before)
     return features_after,labels
示例#3
0
def data_prepartion():
    """Build the (unfitted) feature-preparation transformer.

    Two pipelines are combined in a ``FeatureUnion``:

    * ``num_pipeline`` -- ``DataFrameImputer`` constructed with the numeric
      column names, followed by ``StandardScaler``.
    * ``cat_pipeline`` -- ``DataFrameImputer`` constructed with the
      categorical column names, followed by ``OneHotEncoder``.

    Returns:
        The ``FeatureUnion`` holding both pipelines.
    """
    numeric_columns = ["yearsExperience", "milesFromMetropolis"]
    categorical_columns = ["companyId", "jobType", "degree", "major", "industry"]

    numeric_branch = Pipeline([
        ("imputer", DataFrameImputer(numeric_columns)),
        ("std_scaler", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("imputer", DataFrameImputer(categorical_columns)),
        ("label_binarizer", OneHotEncoder()),
    ])

    branches = [
        ("num_pipeline", numeric_branch),
        ("cat_pipeline", categorical_branch),
    ]
    return FeatureUnion(transformer_list=branches)
示例#4
0
# Separate the one-hot encoded frame back into training and test rows:
# the first len(dataframe) rows are training data, the remainder is test data.
dataframe = one_hot.iloc[:len(dataframe)]
test = one_hot.iloc[len(dataframe):]

print("Imputing Data...")
# Missing values are filled by DataFrameImputer (defined outside this script).
X = pd.DataFrame(dataframe)
dataframe = DataFrameImputer().fit_transform(X)

# Names of all columns whose dtype is not "object".
numeric_variables = list(
    dataframe.dtypes.loc[dataframe.dtypes != "object"].index
)

print("fitting baseline model on just numerical values...")
# Baseline: a random forest fitted on the non-object columns only.
model = RandomForestRegressor(
    n_estimators=2,
    oob_score=True,
    random_state=42,
)
model.fit(dataframe[numeric_variables], labels)

# Compare the out-of-bag predictions with the labels via ROC AUC.
labels_oob = model.oob_prediction_
print("c-stat ", roc_auc_score(labels, labels_oob))
print("Out of bag score...")
示例#5
0
文件: main.py 项目: QBonSale/KDD2014
        print('train data shape', projects.ix[train_idx].shape)
        print('test data shape', projects.ix[test_idx].shape)

        print('raw data loaded')

        print('dropping unnecessary columns...')
        drop_labels = ['school_ncesid', 'schoolid', 'school_city', 'school_latitude', 'school_longitude', 'school_zip', 'school_district',
                       'school_county', 'secondary_focus_subject', 'secondary_focus_area']
        drop_labels.append('date_posted')
        # drop_labels.append('school_city')
        for label in drop_labels:
            projects.drop(label, axis=1, inplace=True)

        print('imputing missing elements...')  # mean for number, most frequent for nan
        dfi = DataFrameImputer()
        projects = dfi.fit_transform(projects)
        outcomes = dfi.fit_transform(outcomes)

        print('factorizing catagorical values...')
        proj_cat_labels = ['teacher_acctid',  'school_state', 'school_metro',
                           'school_charter',
                           'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
                           'school_charter_ready_promise',
                           'teacher_prefix',
                           'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
                           'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level',
                           'grade_level', 'eligible_double_your_impact_match', 'eligible_almost_home_match'
                           ]
        # proj_cat_labels.remove('school_city')
        for label in proj_cat_labels:
示例#6
0
# In[282]:

import sklearn.preprocessing as pre
from DataFrameImputer import DataFrameImputer


# In[283]:

# Import and replace NaNs with most frequent value for labels and mean for numericals
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True is the documented drop-in replacement for its defaults.
train_original = pd.read_csv("train.csv", index_col=0, parse_dates=True)
test_original = pd.read_csv("test.csv", index_col=0, parse_dates=True)
lenght_train = len(train_original)
# DataFrame.append was removed in pandas 2.0; concat stacks the test rows
# below the training rows in the same way.
total = pd.concat([train_original, test_original])
total['SalePrice'] = total['SalePrice'].fillna(value=0)  # don't impute price of the test data

total_dframe = DataFrameImputer().fit_transform(total)


# In[284]:

# Neighborhood: hand-assigned score for each neighborhood code.
nbh_score = {
    None: 0,
    "CollgCr": 2,
    "Veenker": 2,
    "Crawfor": 2,
    "NoRidge": 3,
    "Mitchel": 1,
    "Somerst": 2,
    "NWAmes": 2,
    "OldTown": 1,
    "BrkSide": 1,
    "Sawyer": 1,
    "NridgHt": 3,
    "NAmes": 1,
    "SawyerW": 2,
    "IDOTRR": 1,
    "MeadowV": 1,
    "Edwards": 1,
    "Timber": 2,
    "Gilbert": 2,
    "StoneBr": 3,
    "ClearCr": 2,
    "NPkVill": 1,
    "Blmngtn": 2,
    "BrDale": 1,
    "SWISU": 1,
    "Blueste": 1,
}

# Look up each row's neighborhood code in the table above.
total_dframe['nbh_score'] = total_dframe['Neighborhood'].map(nbh_score)


# In[285]:

next_to_airport = {
    'Blmngtn': 0,  # Bloomington Heights
示例#7
0
        print('raw data loaded')

        print('dropping unnecessary columns...')
        drop_labels = [
            'school_ncesid', 'schoolid', 'school_city', 'school_latitude',
            'school_longitude', 'school_zip', 'school_district',
            'school_county', 'secondary_focus_subject', 'secondary_focus_area'
        ]
        drop_labels.append('date_posted')
        # drop_labels.append('school_city')
        for label in drop_labels:
            projects.drop(label, axis=1, inplace=True)

        print('imputing missing elements...'
              )  # mean for number, most frequent for nan
        dfi = DataFrameImputer()
        projects = dfi.fit_transform(projects)
        outcomes = dfi.fit_transform(outcomes)

        print('factorizing catagorical values...')
        proj_cat_labels = [
            'teacher_acctid', 'school_state', 'school_metro', 'school_charter',
            'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
            'school_charter_ready_promise', 'teacher_prefix',
            'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
            'primary_focus_subject', 'primary_focus_area', 'resource_type',
            'poverty_level', 'grade_level',
            'eligible_double_your_impact_match', 'eligible_almost_home_match'
        ]
        # proj_cat_labels.remove('school_city')
        for label in proj_cat_labels:
示例#8
0
 def fill_missing_values(self, df):
     """Return ``df`` after fitting and applying a fresh ``DataFrameImputer``.

     Args:
         df: the frame handed to ``DataFrameImputer.fit_transform``.

     Returns:
         Whatever ``fit_transform`` returns for ``df``.
     """
     imputer = DataFrameImputer()
     return imputer.fit_transform(df)