# Extract the last column of the dataframe as a flat 1-D array
lastcolumn_np = df.iloc[:, -1:].to_numpy()
lastcolumn_op = np.ravel(lastcolumn_np)


# In[105]:


df_copy.drop(['Target Variable (Discrete)','Feature 16','Feature 17'], axis=1, inplace=True)


# In[106]:


imp_mean = IterativeImputer(random_state=0)
imp_mean.fit(df_copy)
data = imp_mean.transform(df_copy)
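Note: IterativeImputer.transform returns a plain NumPy array, so the column labels of df_copy are lost at this point. A minimal sketch of restoring them (the name data_df is introduced here purely for illustration):

# Illustrative only: wrap the imputed array back into a labelled DataFrame
data_df = pd.DataFrame(data, columns=df_copy.columns)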


# In[108]:


corr_features = set()
correlation_matrix = pd.DataFrame(data).corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            corr_features.add(colname)
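The loop above only records which columns are highly correlated (|r| > 0.8); dropping them is not shown. A minimal sketch of the usual follow-up, assuming the same data array and corr_features set:

# Hypothetical follow-up: drop one column from each highly correlated pair
data_reduced = pd.DataFrame(data).drop(columns=list(corr_features))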
Example #2
                                #Split data in training data and validation data
                                x_trainCV = x_train.values[train_index]
                                x_testCV = x_train.values[test_index]
                                y_trainCV = y_train.values[train_index]
                                y_testCV = y_train.values[test_index]

                                #Replacement of NaN's

                                #Default Simple
                                imp = SimpleImputer(
                                    missing_values=np.nan,
                                    strategy=replace_nan_strategy)

                                if (replace_nan_method == 'Iterative'):
                                    imp = IterativeImputer(
                                        n_nearest_features=replace_nan_strategy
                                    )

                                x_trainCV = imp.fit_transform(
                                    x_trainCV, y_trainCV)
                                x_testCV = imp.transform(x_testCV)

                                #Outlier detection

                                print('before: ', x_trainCV.shape)

                                iso = IsolationForest(
                                    contamination=contamination).fit(
                                        x_trainCV, y_trainCV)

                                clfTrain = iso.predict(x_trainCV)
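Example #2 is cut off right after the IsolationForest predictions. A hedged sketch of a typical continuation (an assumption, not part of the original) keeps only the rows flagged as inliers (+1):

# Hypothetical continuation: keep inliers in the cross-validation training fold
inlier_mask = clfTrain == 1
x_trainCV, y_trainCV = x_trainCV[inlier_mask], y_trainCV[inlier_mask]
print('after: ', x_trainCV.shape)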
Example #3
# =============================================================================
# #Normalisation
# =============================================================================
from sklearn import preprocessing
temp_features = temp_features.iloc[:, :].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
temp_features = min_max_scaler.fit_transform(temp_features)

temp_features = pd.DataFrame(temp_features, columns=feature_list)

# =============================================================================
# #Imputation 
# =============================================================================
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
imp = IterativeImputer(random_state=0, max_iter = 50, imputation_order='random')
imp.fit(temp_features)
features_imp = imp.transform(temp_features)
imp = None
import gc
gc.collect()
features_imp = pd.DataFrame(features_imp,columns=feature_list)
features = features_imp.copy()

features = features.join(pd.DataFrame(temp_features_label, columns=(['Longterm_TransplantOutcome'])))
features = features.join(pd.DataFrame(temp_features_tenure, columns=(['tenure'])))
features = features.join(pd.DataFrame(temp_features_transplantationIDs, columns=(['TransplantationID'])))
features = features.join(pd.DataFrame(temp_features_patientIDs, columns=(['PatientID'])))
features.to_csv(r'T:\\tbase\\tbase_data_imputed.csv')

###################################
Example #4
def load_data_release_level(project, metric):
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()
    understand_df = understand_df.drop_duplicates(cols[4:len(cols)])
    understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1]

    commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv'
    commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path)
    commit_guru_file_level_df[
        'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"')
    commit_guru_file_level_df = commit_guru_file_level_df[
        commit_guru_file_level_df['file_name'].str.contains('.java')]
    commit_guru_file_level_df[
        'Name'] = commit_guru_file_level_df.file_name.str.rsplit(
            '/', 1).str[1].str.split('.').str[0].str.replace('/', '.')
    commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name',
                                                               axis=1)

    release_df = pd.read_pickle('data/release/' + project + '_release.pkl')
    release_df = release_df.sort_values('created_at', ascending=False)
    release_df = release_df.reset_index(drop=True)
    release_df['created_at'] = pd.to_datetime(release_df.created_at)
    release_df['created_at'] = release_df.created_at.dt.date

    commit_guru_path = 'data/commit_guru/' + project + '.csv'
    commit_guru_df = pd.read_csv(commit_guru_path)
    cols = understand_df.columns.tolist()
    commit_guru_df['created_at'] = pd.to_datetime(
        commit_guru_df.author_date_unix_timestamp, unit='s')
    commit_guru_df['created_at'] = commit_guru_df.created_at.dt.date

    commit_guru_df = commit_guru_df[['commit_hash', 'created_at']]

    df = understand_df.merge(commit_guru_file_level_df,
                             how='left',
                             on=['commit_hash', 'Name'])
    df = df.merge(commit_guru_df, how='left', on=['commit_hash'])

    cols = df.columns.tolist()
    cols.remove('Bugs')
    cols.append('Bugs')
    df = df[cols]
    file_names = df.Name
    commit_hash = df.commit_hash
    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)

    created_at = df.created_at
    df = df.drop('created_at', axis=1)
    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    cols = X.columns
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    imp_mean = IterativeImputer(random_state=0)
    X = imp_mean.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    X['created_at'] = created_at

    if metric == 'process':
        X = X[[
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr', 'created_at'
        ]]
    elif metric == 'product':
        X = X.drop([
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ],
                   axis=1)
    else:
        X = X

    df = X
    df['Name'] = file_names
    df['Bugs'] = y

    accepted_commit_dates = []
    all_data = pd.DataFrame()
    for i in range(release_df.shape[0] - 1):
        sub_df = df[df['created_at'] <= release_df.loc[i, 'created_at']]
        sub_df = sub_df[sub_df['created_at'] > release_df.loc[i + 1,
                                                              'created_at']]
        sub_df.sort_values(by=['created_at'], inplace=True, ascending=False)
        sub_df.drop_duplicates(['Name'], inplace=True)

        all_data = pd.concat([all_data, sub_df], axis=0)

    all_data = all_data.drop('created_at', axis=1)

    return all_data
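A hypothetical call to the loader above; the project name below is a placeholder, and the function expects the corresponding files under data/ to exist:

# Placeholder usage of load_data_release_level
release_data = load_data_release_level('some_project', 'process')
print(release_data.shape)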
Example #5
for n_estimators, max_iter in [(e, i) for e in [10, 100] for i in [10, 100]]:

    x_train = x_train0
    y_train = y_train0

    # 1. Missing Values
    est = ExtraTreesRegressor(n_estimators=n_estimators,
                              random_state=42,
                              max_features='sqrt',
                              n_jobs=10,
                              verbose=0)
    imputer = IterativeImputer(estimator=est,
                               max_iter=max_iter,
                               tol=0.001,
                               n_nearest_features=100,
                               initial_strategy='median',
                               imputation_order='ascending',
                               verbose=2,
                               random_state=0)
    x_train_filled = imputer.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_filled)

    # 2. Outliers detection

    clf = IsolationForest(n_estimators=150,
                          max_samples=1000,
                          contamination=0.02,
                          max_features=1.0,
                          bootstrap=False,
                          n_jobs=10,
                          behaviour='old',
Example #6
for i in range(num_iter):
    print('Iteration', i + 1)

    # ### Split Data

    X_train, X_test, y_train, y_test = train_test_split(
        df.values,
        labels.values.ravel(),
        train_size=train_size,
        shuffle=True,
        stratify=labels.values.ravel())

    # ### Impute Data
    if data_impute:
        imp = IterativeImputer(max_iter=25, random_state=1337)

        X_train = imp.fit_transform(X_train)
        X_test = imp.transform(X_test)

    # ### Augment Data
    if smote_ratio > 0:
        smote = SMOTE(sampling_strategy='all',
                      random_state=1337,
                      k_neighbors=5,
                      n_jobs=1)

        X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
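Example #6 stops after scaling the training split. A typical continuation (an assumption, not shown in the original) would reuse the fitted scaler on the test split:

# Hypothetical continuation: apply the statistics learned on X_train to X_test
X_test = scaler.transform(X_test)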
headers = list(train_features)
rare = []
common = []
tot = train_features.shape[0] + test_features.shape[
    0]  # total entries per column
for head in headers:
    # get number of missing values in this column
    missing = train_features[head].isna().sum() + test_features[head].isna(
    ).sum()
    if (missing / tot >= 0.9):
        rare.append(head)
    else:
        common.append(head)

# impute and compute features for each patient
imp = IterativeImputer()
features = ["pid", "Age"] + rare

common.remove("pid")
common.remove("Age")

for c in common:
    features.append(c + "_mean")
    features.append(c + "_min")
    features.append(c + "_max")
    features.append(c + "_median")

X_feat = pd.DataFrame(index=all_pids, data={"pid": all_pids}, columns=features)

skip = False
for pid in all_pids:
Example #8
def run(argv=None):
    """Emulate a HP search and monitor fit time."""
    args = parser.parse_args(argv)

    imputers = {
        'Mean': SimpleImputer(strategy='mean'),
        'Mean+mask': SimpleImputer(strategy='mean', add_indicator=True),
        'Med': SimpleImputer(strategy='median'),
        'Med+mask': SimpleImputer(strategy='median', add_indicator=True),
        'Iterative': IterativeImputer(max_iter=args.max_iter),
        'Iterative+mask': IterativeImputer(add_indicator=True,
                                           max_iter=args.max_iter),
        'IterativeR': IterativeImputer(estimator=RidgeCV(),
                                       max_iter=args.max_iter),
        'IterativeR+mask': IterativeImputer(estimator=RidgeCV(),
                                            add_indicator=True,
                                            max_iter=args.max_iter),
        'KNN': KNNImputer(),
        'KNN+mask': KNNImputer(add_indicator=True),

    }

    task_name = args.task_name
    est = args.est
    imp = imputers.get(args.imp, None)

    if task_name is None or est is None:
        logger.info('No argv given.')
        task_name = 'TB/shock_hemo'
        est = 'HGBC'

    task = tasks[task_name]
    logger.info(f'Argv given. Task {task.meta.tag}. est {est}.')

    t0 = time()
    logger.info('Getting X.')
    X = task.X
    logger.info('Getting y.')
    y = task.y

    logger.info(f'X shape before splits: {X.shape}')

    # Simulate the outer CV (the one of KFold)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    # Simulate the inner CV (the one of RandomSearchCV)
    X_train2, X_test2, y_train2, _ = train_test_split(X_train, y_train, test_size=0.2)

    # Now X has the same shape as in real experiment
    logger.info(f'X shape: {X_train2.shape}')

    t_X_ready = time()

    if imp is not None:
        logger.info(f'Fitting imputer {args.imp}')
        imp.fit(X_train2, y_train2)
        t_fit_imp = time()
        logger.info('Imputer fitted.')

        logger.info('Transforming X_train')
        imp.transform(X_train2)
        t_tra1_imp = time()
        logger.info('X_train transformed')

        logger.info('Transforming X_test')
        imp.transform(X_test2)
        t_tra2_imp = time()
        logger.info('X_test transformed')

    t_fits = [time()]

    for learning_rate in param_space['learning_rate']:
        for max_depth in param_space['max_depth']:
            if est == 'HGBC':
                estimator = HistGradientBoostingClassifier(
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            elif est == 'HGBR':
                estimator = HistGradientBoostingRegressor(
                    loss='least_absolute_deviation',
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            else:
                raise ValueError(f'Unknown estimator {est}')

            logger.info(f'Params: LR {learning_rate} MD {max_depth}')
            logger.info('Fitting estimator.')
            estimator.fit(X_train2, y_train2)
            t_fits.append(time())
            logger.info('Estimator fitted.')

    t_fits = np.diff(t_fits)

    data = {
        'task_tag': [task.meta.tag],
        'imp': [args.imp],
        'imp_params': [repr({'max_iter': args.max_iter})],
        'X_shape': [repr(X.shape)],
        'X_train_shape': [repr(X_train2.shape)],
        'X_test_shape': [repr(X_test2.shape)],
        'time_X_ready': [t_X_ready-t0],
        'time_fit_imp': np.around([0 if imp is None else t_fit_imp-t_X_ready], 2),
        'time_tra1_imp': np.around([0 if imp is None else t_tra1_imp-t_fit_imp], 2),
        'time_tra2_imp': np.around([0 if imp is None else t_tra2_imp-t_tra1_imp], 2),
        'time_fits': [repr(np.around(t_fits.tolist(), 2))],
        'time_fits_mean': [np.around(t_fits.mean(), 2)]
    }

    new_df = pd.DataFrame(data)

    df = None
    filepath = 'results/fit_time.csv'
    if os.path.exists(filepath):
        df = pd.read_csv(filepath, index_col=0)

    if df is not None:
        new_df = pd.concat([df, new_df])

    new_df.to_csv(filepath)
Example #9
def basic_preprocess(train_complete, 
                     test_complete, 
                     out_column, 
                     drop_columns=None,
                     forced_categorical = None, 
                     forced_numeric = None, 
                     columns_to_normalize = None,
                     use_labeler = None,
                     manual_processing = None,
                     seed=42,
                     perc=10):
  complete_features = pd.concat([train_complete, test_complete], sort=False).reset_index(drop=True)
  train = train_complete.copy()
  test = test_complete.copy()

  normalize_output = columns_to_normalize and out_column in columns_to_normalize
  if normalize_output:
    columns_to_normalize.remove(out_column)

  if use_labeler:
    if not columns_to_normalize:
      columns_to_normalize = []
    for column in use_labeler:
      if column in columns_to_normalize:
        columns_to_normalize.remove(column)

  convert_dict = {}
  if forced_categorical:
    for column in forced_categorical:
      convert_dict[column] = 'str'
  
  if forced_numeric:
    for column in forced_numeric:
      convert_dict[column] = 'float64'

  train = train.astype(convert_dict)
  test = test.astype(convert_dict) 

  if drop_columns:
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)

  train_data = np.array(train[out_column])

  if normalize_output:
    normalize, denormalize = transform_distribution(train_data)
  else:
    normalize = lambda x: x
    denormalize = lambda x: x
    
  y = np.array(normalize(train_data))

  train_features = train.drop([out_column], axis=1)
  features = pd.concat([train_features, test], sort=False).reset_index(drop=True)
  
  impute_with_mode(features)

  numerics = list(features.select_dtypes(include=[np.number]).columns.values)
  if len(numerics) >= 2:
    imp = IterativeImputer(max_iter=10, sample_posterior=False, random_state=seed)
    imp.fit(features[numerics])
    features[numerics] = imp.transform(features[numerics])
  elif numerics:
    impute_with_median(features)

  if use_labeler:
    labeler = LabelEncoder()
    for column in use_labeler:
      features[column] = labeler.fit_transform(features[column])
  
  final_features = pd.get_dummies(features).reset_index(drop=True)
  if columns_to_normalize:
    normalize_columns(final_features, columns_to_normalize)

  if manual_processing:
    final_features = manual_processing(final_features, complete_features)
  
  X = final_features.iloc[:len(y), :]
  X_sub = final_features.iloc[len(X):, :]

  #print('selecting relevant features')
  #X, X_sub = select_features(X, y, X_sub, final_features.columns, perc=perc)

  return X, y, X_sub, denormalize
 def impute(self):
     self.data = IterativeImputer().fit_transform(self.data)
     return self.data
Example #11
def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features, target = get_symbol_features(index, _sym)

        features_p = features[data['features']['ohlcv']].pct_change().replace(
            [np.inf, -np.inf], np.nan)
        features_p.columns = [c + '_p1' for c in features_p.columns]
        features_1 = features_p.shift(1)
        features_1.columns = [c + '_lag1' for c in features_1.columns]
        features_2 = features_p.shift(2)
        features_2.columns = [c + '_lag2' for c in features_2.columns]

        features_mean = features_p.rolling(3).mean()
        features_mean.columns = [c + '_mean_3' for c in features_mean.columns]

        ta = features[data['features']['ta'] + data['features']['ta_7d'] +
                      data['features']['ta_30d']]

        features = pd.concat([
            features['close'], ta, features_p, features_1, features_2,
            features_mean
        ],
                             axis=1)[30:]
        target = target[30:]
        # Split data in train and blind test set with 70:30 ratio,
        #  most ML models don't take sequentiality into account, but our pipeline
        #  uses a SimpleImputer with mean strategy, so it's best not to shuffle the data.
        X_train, X_test, y_train, y_test = train_test_split(features.values,
                                                            target.values,
                                                            shuffle=False,
                                                            test_size=0.3)
        logger.info("Start Feature Selection")
        imp = SimpleImputer()
        values = imp.fit_transform(X_train)
        #sel = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1]))
        feature_count = int(0.3 * X_train.shape[1])
        sel = RFECV(estimator=RandomForestClassifier(),
                    cv=5,
                    verbose=0,
                    n_jobs=4,
                    min_features_to_select=feature_count,
                    scoring='neg_mean_squared_error')
        sel.fit(values, y_train)
        logger.info("End Feature Selection")
        bestfeatures = [
            c for c, f in zip(features.columns, sel.get_support()) if f
        ]
        if 'close' not in bestfeatures:
            bestfeatures += ['close']
        print("Using {} features:\n{}".format(len(bestfeatures), bestfeatures))

        train_features = pd.DataFrame(X_train, columns=features.columns)
        test_features = pd.DataFrame(X_test, columns=features.columns)
        X_train = train_features[bestfeatures].values
        X_test = test_features[bestfeatures].values

        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)
        if not np.isfinite(X_train).all():
            logger.warning("Training x is not finite!")
        if not np.isfinite(y_train).all():
            logger.warning("Training y is not finite!")
        if not np.isfinite(X_test).all():
            logger.warning("Test x is not finite!")
        if not np.isfinite(y_test).all():
            logger.warning("Test y is not finite!")

        # Build pipeline to be used as estimator in grid search
        #  so that each subset of the data is transformed independently
        #  to avoid contamination between folds.
        pipeline = Pipeline([
            # Impute NaNs by modelling each feature from the remaining features
            ('i', IterativeImputer()),
            ('s', MinMaxScaler(feature_range=(-1, 1))),
            ('c', MLPClassifier()),
        ])

        # Perform hyperparameter tuning of the ensemble with 5-fold cross validation
        logger.info("Start Grid search")
        CV_rfc = GridSearchCV(estimator=pipeline,
                              param_grid=PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        CV_rfc.fit(X_train, y_train)
        logger.info("End Grid search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = CV_rfc.best_estimator_
        # Test ensemble's performance on training and test sets
        logger.info("Classification report on train set")
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train,
                                             predictions1,
                                             output_dict=True)
        print(classification_report(y_train, predictions1))
        logger.info("Classification report on test set")
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test,
                                            predictions2,
                                            output_dict=True)
        print(classification_report(y_test, predictions2))
        stats = {
            'score': accuracy_score(y_train, predictions1),
            'mse': mean_squared_error(y_train, predictions1),
            'test_score': accuracy_score(y_test, predictions2),
            'test_mse': mean_squared_error(y_test, predictions2),
            'train_report': train_report,
            'test_report': test_report,
        }
        print(CV_rfc.best_params_)
        num_samples = min(y_train.shape[0], y_test.shape[0], 30)
        print("Gains calculated on {} samples only!".format(num_samples))
        print(
            "Train Accuracy: {}\nTrain MSE: {}\nGains on train preds: 100 -> {}"
            .format(
                accuracy_score(y_train, predictions1),
                mean_squared_error(y_train, predictions1),
                test_gains(train_features['close'][0:num_samples],
                           predictions1[0:num_samples],
                           initial_balance=100,
                           position_size=0.1)))
        print(
            "Test Accuracy: {}\nTest MSE: {}\nGains on test preds: 100 -> {}".
            format(
                accuracy_score(y_test, predictions2),
                mean_squared_error(y_test, predictions2),
                test_gains(test_features['close'][0:num_samples],
                           predictions2[0:num_samples],
                           initial_balance=100,
                           position_size=0.1)))
        print("--- end ---")
 def fill_nan(self):
     self.data_expand = IterativeImputer().fit_transform(self.data_expand)
     return self
Example #13
def impute_tui():
    path = 'tui_data_d1/'
    filenames = os.listdir(path)
    filenames = [path + name for name in filenames if name.startswith('X')]

    main_df = pd.DataFrame()
    for n, filename in enumerate(filenames):
        df = pd.read_csv(filename,
                         sep='\t',
                         low_memory=False,
                         encoding='utf-16')
        main_df = pd.concat([main_df, df], ignore_index=True)
        if n % 100 == 0:
            print(n)

    main_df = main_df.drop(['VLST_KODAS2', 'kodas'], axis=1)
    index = main_df.columns.values
    print(index)
    print(main_df)

    estimators = [
        ExtraTreesRegressor(),
        BayesianRidge(),
        KNeighborsRegressor(),
        DecisionTreeRegressor(),
        RandomForestRegressor()
    ]
    f = open('performance.txt', 'w')

    for estimator in estimators:

        imp = IterativeImputer(estimator=estimator, missing_values=np.nan)
        imp.fit(main_df)

        df = pd.read_csv(os.path.join(path, 'test.csv'),
                         sep='\t',
                         low_memory=False,
                         encoding='utf-16')
        df = df.drop(['VLST_KODAS2', 'kodas'], axis=1)
        df = imp.transform(df)

        df = pd.DataFrame(df)
        df.columns = index
        df.to_csv(os.path.join(path, 'test_imp.csv'),
                  sep='\t',
                  encoding='utf-16')

        df1 = pd.read_csv(os.path.join(path, 'X00411.csv'),
                          sep='\t',
                          low_memory=False,
                          encoding='utf-16')
        df1 = df1.drop(['VLST_KODAS2', 'kodas'], axis=1)
        df2 = pd.read_csv(os.path.join(path, 'test_imp.csv'),
                          sep='\t',
                          low_memory=False,
                          encoding='utf-16')

        score, total = 0, 0
        for i in range(1, 400):
            val1 = df1.iloc[i]
            val2 = df2.iloc[i]
            val1 = int(val1['D1'])
            val2 = int(val2['D1'])
            if relatively_equal(val1, val2):
                print(val1, val2)
                score += 1
            total += 1

        print(str(estimator).split('(')[0])
        print('score: ', score / total)

        f.write(str(estimator).split('(')[0])
        f.write('\ntotal: %i / %i\n' % (score, total))
        f.write('score: %.1f%%\n\n' % (100 * score / total))
def run(argv=None):
    if argv is None or len(argv) < 2:
        logger.info('No argv given.')
        task = tasks['TB/shock_hemo']
        imp = 'iterative'
    else:
        task = tasks[argv[1]]
        imp = argv[2]
        logger.info(f'Argv given. Task {task.meta.tag}. Imp {imp}.')

    logger.info('Getting X.')
    X = task.X
    logger.info('Getting y.')
    y = task.y

    logger.info(f'X shape before splits: {X.shape}')

    # Simulate the outer CV (the one of KFold)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    # Simulate the inner CV (the one of RandomSearchCV)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.2)

    # Now X has the same shape as in real experiment
    logger.info(f'X shape: {X_train2.shape}')

    if imp == 'iterative':
        imp = IterativeImputer()
    elif imp == 'knn':
        imp = KNNImputer()

    t0 = time()

    logger.info('Fitting imputer.')
    imp.fit(X_train2)
    t1 = time()
    logger.info('Imputer fitted.')

    logger.info('Transforming X_train.')
    imp.transform(X_train2)
    t2 = time()
    logger.info('X_train transformed.')

    logger.info('Transforming X_test.')
    imp.transform(X_test2)
    t3 = time()
    logger.info('X_test transformed.')

    data = {
        'task_tag': [task.meta.tag],
        'imp': [imp.__class__.__name__],
        'X_shape': [repr(X.shape)],
        'X_train_shape': [repr(X_train2.shape)],
        'X_test_shape': [repr(X_test2.shape)],
        'fit_time': [t1 - t0],
        'transform_time_train': [t2 - t1],
        'transform_time_test': [t3 - t2]
    }

    new_df = pd.DataFrame(data)

    df = None
    filepath = 'results/impute_time.csv'
    if os.path.exists(filepath):
        df = pd.read_csv(filepath, index_col=0)

    if df is not None:
        new_df = pd.concat([df, new_df])

    new_df.to_csv(filepath)

    print(new_df)
def evaluate_exp4(main_folder, mv_config, r_seed=0, num_file=500):

    train_with_nan, train_full, w0_with_nan_list, w0_list, w1_with_nan_list, w1_list = dh.load_realdata(
        r_seed, mv_config, num_file, main_folder)

    imp = IterativeImputer(max_iter=10, random_state=0)
    train_impu = imp.fit_transform(train_with_nan)

    alpha = 0.05

    result_MWW = np.zeros([2, 2])
    result_QTree = np.zeros([2, 2])
    result_kchi2 = np.zeros([2, 2])
    result_Gau = np.zeros([2, 2])
    result_Tri = np.zeros([2, 2])
    result_ME = np.zeros([2, 2])
    result_MMD = np.zeros([2, 2])

    Qtree_Htest_impu = None
    Qtree_Htest_full = None

    kchi2_impu = None
    kchi2_full = None

    mfkchi2_miss_gau = None
    mfkchi2_full_gau = None
    mfkchi2_miss_tri = None
    mfkchi2_full_tri = None

    me_ml = None
    me_mg = None
    me_fl = None
    me_fg = None

    mmd_impu = None
    mmd_full = None

    for i in range(num_file):

        imp = IterativeImputer(max_iter=10, random_state=0)
        w0_miss = w0_with_nan_list[i][0]
        w0_impu = imp.fit_transform(w0_miss)
        w0_full = w0_list[i]

        imp = IterativeImputer(max_iter=10, random_state=0)
        w1_miss = w1_with_nan_list[i][0]
        w1_impu = imp.fit_transform(w1_miss)
        w1_full = w1_list[i]

        alg_r_seed = 1
        print('MWW')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result = perform_mww_test(train_impu, w0_impu, train_full, w0_full,
                                     alpha)
        result_MWW[0] = result_MWW[0] + w0_result
        w1_result = perform_mww_test(train_impu, w1_impu, train_full, w1_full,
                                     alpha)
        result_MWW[1] = result_MWW[1] + w1_result

        print('Qtree')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, Qtree_Htest_impu, Qtree_Htest_full = perform_QTree_test(
            train_impu, w0_impu, train_full, w0_full, alpha, Qtree_Htest_impu,
            Qtree_Htest_full)
        result_QTree[0] = result_QTree[0] + w0_result
        w1_result, Qtree_Htest_impu, Qtree_Htest_full = perform_QTree_test(
            train_impu, w1_impu, train_full, w1_full, alpha, Qtree_Htest_impu,
            Qtree_Htest_full)
        result_QTree[1] = result_QTree[1] + w1_result
        #
        print('kchi2')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, kchi2_impu, kchi2_full = perform_kmean_chi2_test(
            train_impu, w0_impu, train_full, w0_full, alpha, kchi2_impu,
            kchi2_full)
        result_kchi2[0] = result_kchi2[0] + w0_result
        w1_result, kchi2_impu, kchi2_full = perform_kmean_chi2_test(
            train_impu, w1_impu, train_full, w1_full, alpha, kchi2_impu,
            kchi2_full)
        result_kchi2[1] = result_kchi2[1] + w1_result

        print('Gau')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, mfkchi2_miss_gau, mfkchi2_full_gau = perform_mfkmean_chi2_test(
            train_with_nan,
            w0_miss,
            train_full,
            w0_full,
            alpha,
            mfkchi2_miss_gau,
            mfkchi2_full_gau,
            apply_fuzzy='Gaussion',
            top_k=2)
        result_Gau[0] = result_Gau[0] + w0_result
        w1_result, mfkchi2_miss_gau, mfkchi2_full_gau = perform_mfkmean_chi2_test(
            train_with_nan,
            w1_miss,
            train_full,
            w1_full,
            alpha,
            mfkchi2_miss_gau,
            mfkchi2_full_gau,
            apply_fuzzy='Gaussion',
            top_k=2)
        result_Gau[1] = result_Gau[1] + w1_result

        print('Tri')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, mfkchi2_miss_tri, mfkchi2_full_tri = perform_mfkmean_chi2_test(
            train_with_nan,
            w0_miss,
            train_full,
            w0_full,
            alpha,
            mfkchi2_miss_tri,
            mfkchi2_full_tri,
            apply_fuzzy='Triangle',
            top_k=2)
        result_Tri[0] = result_Tri[0] + w0_result
        w1_result, mfkchi2_miss_tri, mfkchi2_full_tri = perform_mfkmean_chi2_test(
            train_with_nan,
            w1_miss,
            train_full,
            w1_full,
            alpha,
            mfkchi2_miss_tri,
            mfkchi2_full_tri,
            apply_fuzzy='Triangle',
            top_k=2)
        result_Tri[1] = result_Tri[1] + w1_result

        print('ME')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, me_ml, me_mg, me_fl, me_fg = perform_me_test(
            train_impu, w0_impu, train_full, w0_full, alpha, me_ml, me_mg,
            me_fl, me_fg)
        result_ME[0] = result_ME[0] + w0_result
        w1_result, me_ml, me_mg, me_fl, me_fg = perform_me_test(
            train_impu, w1_impu, train_full, w1_full, alpha, me_ml, me_mg,
            me_fl, me_fg)
        result_ME[1] = result_ME[1] + w1_result
        #
        print('MMD')
        np.random.seed(alg_r_seed)
        # ============================================================================================================== #
        w0_result, mmd_impu, mmd_full = perform_mmd_test(
            train_impu, w0_impu, train_full, w0_full, alpha, mmd_impu,
            mmd_full)
        result_MMD[0] = result_MMD[0] + w0_result
        w1_result, mmd_impu, mmd_full = perform_mmd_test(
            train_impu, w1_impu, train_full, w1_full, alpha, mmd_impu,
            mmd_full)
        result_MMD[1] = result_MMD[1] + w1_result


#
    return result_MWW, result_QTree, result_kchi2, result_Gau, result_Tri, result_ME, result_MMD
Example #16
def extract_feats_transform(X, Y=None):

    if Y is None:

        #==================================#
        # use impute distances as features #
        #==================================#
        imp = SimpleImputer(missing_values=np.nan,
                            strategy='constant',
                            fill_value=0)
        zero_imput = imp.fit_transform(X)
        zero_imput = euclidean_distances(zero_imput, zero_imput)
        zero_imput = zero_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        mean_imput = imp.fit_transform(X)
        mean_imput = euclidean_distances(mean_imput, mean_imput)
        mean_imput = mean_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        medi_imput = imp.fit_transform(X)
        medi_imput = euclidean_distances(medi_imput, medi_imput)
        medi_imput = medi_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        mfre_imput = imp.fit_transform(X)
        mfre_imput = euclidean_distances(mfre_imput, mfre_imput)
        mfre_imput = mfre_imput.flatten().reshape(-1, 1)

        imp = IterativeImputer(max_iter=10, random_state=0)
        iter_imput = imp.fit_transform(X)
        iter_imput = euclidean_distances(iter_imput, iter_imput)
        iter_imput = iter_imput.flatten().reshape(-1, 1)

        #=============================#
        # missing value masked vector #
        #=============================#
        X_masked_hasNan_masked_vector = np.isnan(X) * 1
        pd_X_Nan = pd.DataFrame(X_masked_hasNan_masked_vector)
        pd_X_Nan['key'] = 0
        all_merge = pd.merge(pd_X_Nan, pd_X_Nan, on='key', how='outer')
        all_merge = all_merge.drop(columns=['key'])
        all_merge = all_merge.values
        train_X = np.hstack([
            all_merge, zero_imput, mean_imput, medi_imput, mfre_imput,
            iter_imput
        ])

    else:

        #==================================#
        # use impute distances as features #
        #==================================#
        imp = SimpleImputer(missing_values=np.nan,
                            strategy='constant',
                            fill_value=0)
        zero_imput_X = imp.fit_transform(X)
        zero_imput_Y = imp.fit_transform(Y)
        zero_imput = euclidean_distances(zero_imput_X, zero_imput_Y)
        zero_imput = zero_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        mean_imput_X = imp.fit_transform(X)
        mean_imput_Y = imp.fit_transform(Y)
        mean_imput = euclidean_distances(mean_imput_X, mean_imput_Y)
        mean_imput = mean_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        medi_imput_X = imp.fit_transform(X)
        medi_imput_Y = imp.fit_transform(Y)
        medi_imput = euclidean_distances(medi_imput_X, medi_imput_Y)
        medi_imput = medi_imput.flatten().reshape(-1, 1)

        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        mfre_imput_X = imp.fit_transform(X)
        mfre_imput_Y = imp.fit_transform(Y)
        mfre_imput = euclidean_distances(mfre_imput_X, mfre_imput_Y)
        mfre_imput = mfre_imput.flatten().reshape(-1, 1)

        imp = IterativeImputer(max_iter=10, random_state=0)
        iter_imput_X = imp.fit_transform(X)
        iter_imput_Y = imp.fit_transform(Y)
        iter_imput = euclidean_distances(iter_imput_X, iter_imput_Y)
        iter_imput = iter_imput.flatten().reshape(-1, 1)

        #=============================#
        # missing value masked vector #
        #=============================#
        X_masked_hasNan_masked_vector = np.isnan(X) * 1
        Y_masked_hasNan_masked_vector = np.isnan(Y) * 1
        pd_X_Nan = pd.DataFrame(X_masked_hasNan_masked_vector)
        pd_Y_Nan = pd.DataFrame(Y_masked_hasNan_masked_vector)
        pd_X_Nan['key'] = 0
        pd_Y_Nan['key'] = 0
        all_merge = pd.merge(pd_X_Nan, pd_Y_Nan, on='key', how='outer')
        all_merge = all_merge.drop(columns=['key'])
        all_merge = all_merge.values
        train_X = np.hstack([
            all_merge, zero_imput, mean_imput, medi_imput, mfre_imput,
            iter_imput
        ])

    return train_X
Example #17
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
    date = pd.Timestamp.now().strftime(format='%Y-%m-%d_%H-%M_')
    predictions.to_csv(
        f'C:/Users/fredh/Documents/Data Driven platform competition - Pump it up/predictions/{date}submission.csv',
        index=True,
        header=True)


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
categorical_feat = X_train.select_dtypes(include='object').columns.to_list()
num_feat = X_train.select_dtypes(include='number').columns.to_list()
num_pipe_7 = Pipeline([('imputer', IterativeImputer(max_iter=10,
                                                    random_state=0)),
                       ('scaler', StandardScaler())])
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
ct_7 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe_7, num_feat),
                                       ('categorical', cat_pipe,
                                        categorical_feat)])
from xgboost import XGBClassifier
"""
space = [
Real(0.6, 0.7, name="colsample_bylevel"),
Real(0.6, 0.7, name="colsample_bytree"),
Real(0.01, 1, name="gamma"),
Real(0.0001, 1, name="learning_rate"),
Real(0.1, 10, name="max_delta_step"),
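The snippet above is cut off inside the commented-out hyperparameter search space. Before that, it defines ct_7, a ColumnTransformer that routes numeric columns through an IterativeImputer-plus-StandardScaler pipeline and categorical columns through most-frequent imputation with one-hot encoding. A minimal sketch of wiring ct_7 into a full pipeline (the Pipeline composition and default XGBClassifier below are assumptions, as are X_train, y_train and the Pipeline import coming from the surrounding script):

# Hypothetical end-to-end pipeline built around ct_7 defined above
clf_7 = Pipeline([('preprocess', ct_7), ('model', XGBClassifier())])
clf_7.fit(X_train, y_train)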
Example #19
File: B3.py Project: amittal-tcd/my_rep
def imputer(df, dfv, dfk, target_col, imputer_dict):

    result = {}

    for i in imputer_dict:

        if imputer_dict[i]['Indicator'] == 'deleterows':
            if df[i].isna().sum() > 0:
                df = df[df[i].notna()]
                dfv = dfv[dfv[i].notna()]
                dfk = dfk[dfk[i].notna()]

        if imputer_dict[i]['Indicator'] == True:
            if df[i].isna().sum() > 0:
                df[i + '_null_ind'] = np.where(df[i].isna(), 1, 0)
                dfv[i + '_null_ind'] = np.where(dfv[i].isna(), 1, 0)
                dfk[i + '_null_ind'] = np.where(dfk[i].isna(), 1, 0)

        if imputer_dict[i]['mvi'] in ['mean', 'median', 'most_frequent']:
            imp = SimpleImputer(missing_values=np.nan,
                                strategy=imputer_dict[i]['mvi'],
                                verbose=True,
                                add_indicator=False,
                                fill_value=None)
            imp.fit(df[[i]])
            result[i] = imp
            df.loc[:, i] = result[i].transform(df[[i]])
            dfv.loc[:, i] = result[i].transform(dfv[[i]])
            dfk.loc[:, i] = result[i].transform(dfk[[i]])

        if imputer_dict[i]['mvi'] == 'far_val':
            result[i] = df[i].max() * 100
            df[i] = np.where(df[i].isna(), result[i], df[i])
            dfv[i] = np.where(dfv[i].isna(), result[i], dfv[i])
            dfk[i] = np.where(dfk[i].isna(), result[i], dfk[i])

    ##### IterativeImputer (used when none of the above strategies applies) #####

    imp = IterativeImputer(
        max_iter=3,
        # the estimator is a hyperparameter; alternatives include BayesianRidge, KNN, etc.
        estimator=ExtraTreesRegressor(),
        # maximum number of other columns used to predict each missing value
        n_nearest_features=5
    )

    dfvc = dfv.copy()
    dfv[target_col] = np.nan

    dfkc = dfk.copy()
    dfk[target_col] = np.nan

    dfcolumns = df.columns
    imp.fit(df)
    df = pd.DataFrame(imp.transform(df))
    df.columns = dfcolumns
    dfv = pd.DataFrame(imp.transform(dfv))
    dfv.columns = dfcolumns
    dfk = pd.DataFrame(imp.transform(dfk))
    dfk.columns = dfcolumns

    dfv[target_col] = np.array(dfvc[target_col])
    dfk[target_col] = np.nan

    for i in imputer_dict:
        if imputer_dict[i]['mvi'] == 'iterativeimputer':
            result[i] = imp

    print("Completed imputer - ", datetime.datetime.now())

    return df, dfv, dfk, result
Example #20
    data.Name = data.Name.map(lambda n: n.split(',')[1].split('.')[0].strip())
    data.Cabin = data.Cabin.map(lambda a: a[0])
    # one_hot
    for i in ['Cabin', 'Embarked', 'Name']:
        data = make_one_hot(data, i)
    data.Sex.replace({'female': 0, 'male': 1}, inplace=True)
    # numeric feature eng
    data['Family_size'] = data.Parch + data.SibSp + 1
    # drop
    data.drop(['Cabin_*', 'SibSp', 'Parch', 'Ticket'], axis=1, inplace=True)

    return data


data = feature_eng(data)
data = IterativeImputer().fit_transform(data)
x_train, x_test = data[:len(train)], data[len(train):]
x_train = scale(x_train)
m1 = MLPClassifier(max_iter=1000, hidden_layer_sizes=len(x_train[0]) * 2)

cv = cross_val_score(m1, x_train, y_train, cv=5)
print(cv.mean(), ' +/-', cv.std() * 2)

x_tr, x_ts, y_tr, y_ts = split(x_train, y_train, shuffle=True, test_size=0.2)
m1.fit(x_tr, y_tr)
print(m1.score(x_ts, y_ts))

# ===================================
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam
Example #21
from DataPrepocessing import converting_discrete_attributes_to_continuous as prepo

raw_data = pd.read_csv('flag_data/flag.data')
## 1
areadata = raw_data['4']
areadata = np.array(areadata).reshape(-1, 1)
mean = SimpleImputer(missing_values=0, strategy='mean').fit_transform(areadata)
median = SimpleImputer(missing_values=0,
                       strategy='median').fit_transform(areadata)
most_freq = SimpleImputer(missing_values=0,
                          strategy='most_frequent').fit_transform(areadata)
constant = SimpleImputer(missing_values=0,
                         strategy='constant').fit_transform(areadata)

plt.hist(areadata, 100)
plt.show()
plt.subplot(221)
plt.hist(mean, 100)
plt.subplot(222)
plt.hist(median, 100)
plt.subplot(223)
plt.hist(most_freq, 100)
plt.subplot(224)
plt.hist(constant, 100)
plt.show()

## 2
imp = IterativeImputer(missing_values=np.nan)
plt.hist(imp.fit_transform(prepo.dataout))
plt.show()
Example #22
                            subject_dict[ID][ses][(atlas, est, clust, _k,
                                                   smooth, hpass)]['topology'])
                    vect_all.append(np.concatenate(vects, axis=1))
                    del vects
                X_top = np.swapaxes(np.hstack(vect_all), 0, 1)

                Y = np.array(id_list)
                try:
                    df_summary.at[i, 'grid'] = (atlas, est, clust, _k, smooth,
                                                hpass)
                    bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))]
                    for m in set(bad_ixs):
                        if (X_top.shape[0] -
                                bad_ixs.count(m)) / X_top.shape[0] < 0.50:
                            X_top = np.delete(X_top, m, axis=1)
                    imp = IterativeImputer(max_iter=50, random_state=42)
                    X_top = imp.fit_transform(X_top)
                    scaler = StandardScaler()
                    X_top = scaler.fit_transform(X_top)
                    discr_stat_val, rdf = discr_stat(X_top, Y)
                    df_summary.at[i, 'discriminability'] = discr_stat_val
                    print(discr_stat_val)
                    #print(rdf)
                    del discr_stat_val
                    i += 1
                except:
                    i += 1
                    continue
    elif modality == 'dwi':
        gen_hyperparams = ['est', 'clust', '_k']
        for col in cols:
Example #23
def load_both_data(project, metric):
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()
    understand_df = understand_df.drop_duplicates(cols[4:len(cols)])
    understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1]

    commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv'
    commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path)
    commit_guru_file_level_df[
        'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"')
    commit_guru_file_level_df = commit_guru_file_level_df[
        commit_guru_file_level_df['file_name'].str.contains('.java')]
    commit_guru_file_level_df[
        'Name'] = commit_guru_file_level_df.file_name.str.rsplit(
            '/', 1).str[1].str.split('.').str[0].str.replace('/', '.')
    commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name',
                                                               axis=1)

    df = understand_df.merge(commit_guru_file_level_df,
                             how='left',
                             on=['commit_hash', 'Name'])

    cols = df.columns.tolist()
    cols.remove('Bugs')
    cols.append('Bugs')
    df = df[cols]
    file_names = df.Name

    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)


#     df.dropna(inplace=True)
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)

    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    cols = X.columns
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    imp_mean = IterativeImputer(random_state=0)
    X = imp_mean.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    if metric == 'process':
        X = X[[
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ]]
    elif metric == 'product':
        X = X.drop([
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ],
                   axis=1)
    else:
        X = X
    X['Name'] = file_names
    X['Bugs'] = y

    return X
    ),
    make_pipeline(Nystroem(kernel="polynomial", degree=2, random_state=0),
                  Ridge(alpha=1e3)),
    KNeighborsRegressor(n_neighbors=15),
]
score_iterative_imputer = pd.DataFrame()
# iterative imputer is sensible to the tolerance and
# dependent on the estimator used internally.
# we tuned the tolerance to keep this example run with limited computational
# resources while not changing the results too much compared to keeping the
# stricter default value for the tolerance parameter.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(random_state=0,
                         estimator=impute_estimator,
                         max_iter=25,
                         tol=tol),
        br_estimator,
    )
    score_iterative_imputer[
        impute_estimator.__class__.__name__] = cross_val_score(
            estimator,
            X_missing,
            y_missing,
            scoring="neg_mean_squared_error",
            cv=N_SPLITS)

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=["Original", "SimpleImputer", "IterativeImputer"],
    axis=1,
Example #25
# minimum instances proportion to allow feature
minimum_valued_instances_proportion = 0.80

X = pd.read_excel(r'C:\Temp\learning\clean.xlsx')
y = pd.read_excel(r'C:\Temp\learning\happiness.xlsx')

# keep only the instances that actually have a target value
no_target_value_instances = y.notna().iloc[:, 1]

X = X[no_target_value_instances]
y = y[no_target_value_instances]


X.dropna(thresh=len(X) * minimum_valued_instances_proportion, axis=1, inplace=True)

imp = IterativeImputer(max_iter=2, random_state=123)
start = time.time()

imp.fit(X.drop(X.columns[[0]], axis=1))

end = time.time()
print(end - start)

X_values = imp.transform(X.drop(X.columns[[0]], axis=1))


X.iloc[:,1:] = X_values

X.to_excel(r'C:\Temp\learning\cleaner.xlsx', index=False)
y.to_excel(r'C:\Temp\learning\targeter.xlsx', index=False)
Example #26
 def impute_all(self, df, regressor=None, **regr_kwargs):
     im = IterativeImputer(estimator=regressor)
     dffin = im.fit_transform(df)
     return dffin
Example #27
import pytest

import numpy as np
from scipy import sparse

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal

from sklearn.experimental import enable_iterative_imputer  # noqa

from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

IMPUTERS = [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)

Example #28
random.seed(r)

print('\n############### Evaluate Best Model ###############')

# ## Read in Data

train_test = pd.read_json('data/train-test.json')
train_test_labels = train_test[['label']]
train_test = train_test.drop('label', axis='columns')

hold = pd.read_json('data/holdout.json')
hold_labels = hold[['label']]
hold = hold.drop('label', axis='columns')

# ### Impute Data
imp = IterativeImputer(max_iter=100, random_state=r)

X_train_test = imp.fit_transform(train_test.values)
y_train_test = train_test_labels.values.ravel()

X_hold = imp.transform(hold.values)
y_hold = hold_labels.values.ravel()

# ### Augment Data
#if smote_ratio > 0:
#    smote = SMOTE(
#                sampling_strategy='all',
#                random_state=1337,
#                k_neighbors=5,
#                n_jobs=1
#            )
# creating surrogates for missing data
for col in df:
    if df[col].isna().sum() != 0:
        df[col + '_surrogate'] = df[col].isna().astype(int)

#check new columns created
df.head()

#Impute missing values
num_nulls = pd.DataFrame({"Number of Nulls": df.isnull().sum()})
impute_cols = list(num_nulls[num_nulls["Number of Nulls"] != 0].index)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(missing_values=np.nan,
                       max_iter=10,
                       verbose=0,
                       random_state=100)
df[impute_cols] = imp.fit_transform(df[impute_cols])
df
df.describe()

#Check for missing values
df.isnull().sum()
df.info()
df.describe()

###
### HANDLE SKEWED DATA
###

#Get float cols
def data_preprocessing(dat: pd.DataFrame,
                       art='C',
                       y=None,
                       logger=None,
                       remove=True):
    """
    Encoding + remove columns with more than 1/2 na if remove==True + remove columns with all na + imputation
    if art == 'C', will do LabelEncoding first for the target column
    ================
    Parameter:
    ================
    dat - type of DataFrame
    art - type of string
        either 'C' for classification or 'R' for regression; indicates the type of problem
    y - type of string
        the name of the target column; if None, the last column of the data set is used as the target
        (only a single label column is considered)
    logger - type of Logger
    remove - type of boolean
        whether to remove columns in which more than half of the values are NA
    =================
    Output
    =================
    dat - type of Dataframe 
        the dataframe after preprocessing
    cols - type of list of string
        the name of the numerical columns
    """
    if logger == None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.info('Start data preprocessing')
    # replace original indices with default ones
    dat = dat.reset_index(drop=True)

    if art == 'C':
        logger.info('Start to label target feature y for classification task')
        dat.iloc[:, -1] = LabelEncoder().fit_transform(dat.iloc[:, -1])
        logger.info('End with label encoding the target feature')
    if remove:
        # remove columns with more than 1/2 NA
        removed_cols = dat.columns[dat.isna().sum() / len(dat) >= .5].to_list()
        dat = dat.loc[:, dat.isna().sum() / len(dat) < .5]
        logger.info(
            'Following features are removed from the dataframe because more than half of their values are NA: %s'
            % removed_cols)
    # Encoding
    oe = OneHotEncoder(drop='first')
    # get categorical columns
    if y:
        dat_y = dat[[y]]
        cols = dat.columns.to_list()
        cols.remove(y)
        dat_x = dat[cols]
    else:
        dat_y = dat[[dat.columns[-1]]]
        dat_x = dat[dat.columns[:-1]]
    dat_categ = dat_x.select_dtypes(include=['object'])
    # get kterm of categ features
    for i in dat_categ.columns:
        # save output to dat
        tmp = dat_x[i].value_counts()
        dat_x[i + '_kterm'] = dat_x[i].map(lambda x: tmp[x]
                                           if x in tmp.index else 0)
    # float columns including the k term cols
    dat_numeric = dat_x.select_dtypes(
        include=['float32', 'float64', 'int32', 'int64'])
    # onehot encoding and label encoding
    dat_categ_onehot = dat_categ.iloc[:,
                                      dat_categ.apply(lambda x: len(x.unique())
                                                      ).values < 8]
    dat_categ_label = dat_categ.iloc[:,
                                     dat_categ.apply(lambda x: len(x.unique())
                                                     ).values >= 8]
    flag_onehot = False
    flag_label = False
    # oe
    if dat_categ_onehot.shape[1] > 0:
        logger.info(
            'Start to do onehot to the following categoric features: %s' %
            (str(dat_categ_onehot.columns.to_list())))
        dat_onehot = pd.DataFrame(
            oe.fit_transform(dat_categ_onehot.astype(str)).toarray(),
            columns=oe.get_feature_names(dat_categ_onehot.columns))
        logger.info('End with onehot')
        flag_onehot = True
    else:
        dat_onehot = None
    # le
    if dat_categ_label.shape[1] > 0:
        logger.info(
            'Start to do label encoding to the following categoric features: %s'
            % (str(dat_categ_label.columns.to_list())))
        dat_categ_label = dat_categ_label.fillna('NULL')
        dat_label = pd.DataFrame(columns=dat_categ_label.columns)
        for i in dat_categ_label.columns:
            dat_label[i] = LabelEncoder().fit_transform(
                dat_categ_label[i].astype(str))
        flag_label = True
        logger.info('End with label encoding')
    else:
        dat_label = None
    # scaling
    # combine
    dat_new = pd.DataFrame()
    if flag_onehot and flag_label:
        dat_new = pd.concat([dat_numeric, dat_onehot, dat_label], axis=1)
    elif flag_onehot:
        dat_new = pd.concat([dat_numeric, dat_onehot], axis=1)
    elif flag_label:
        dat_new = pd.concat([dat_numeric, dat_label], axis=1)
    else:
        dat_new = dat_numeric
    dat_new = pd.concat([dat_new, dat_y], axis=1)
    # imputation
    dat_new = dat_new.dropna(axis=1, how='all')
    if dat_new.isna().sum().sum() > 0:
        logger.info(
            'Nan value exist, start to fill na with iterative imputer: ' +
            str(dat_new.isna().sum().sum()))
        # include na value, impute with iterative Imputer or simple imputer
        columns = dat_new.columns
        imp = IterativeImputer(max_iter=10, random_state=0)
        # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        dat_new = imp.fit_transform(dat_new)
        dat_new = pd.DataFrame(dat_new, columns=columns)
    dat_numeric = dat_new.iloc[:, :-1].select_dtypes(
        include=['float32', 'float64', 'int32', 'int64'])
    logger.info('End with filling nan')
    return dat_new, dat_numeric.columns
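A hypothetical call to data_preprocessing (raw_df below is a placeholder for any DataFrame whose last column holds the label):

# Placeholder usage: returns the preprocessed frame and the numeric column names
clean_df, numeric_cols = data_preprocessing(raw_df, art='C')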