Example #1
    def features_ETC(self,
                     sort=False,
                     N=0,
                     n_estimators=1000,
                     max_features='auto',
                     min_samples_split=2,
                     usedata=1):
        n_sample = self.numData
        ind = list(range(n_sample))
        np.random.shuffle(ind)
        if not isinstance(max_features,
                          six.string_types) and max_features is not None:
            max_features = max(1,
                               min(max_features, int(self.data["X"].shape[1])))
        etc = ETC(n_estimators=n_estimators,
                  max_features=max_features,
                  min_samples_split=min_samples_split).fit(
                      self.data["X"][ind[:int(usedata * n_sample)]],
                      self.data["Y"][ind[:int(usedata * n_sample)]])
        f = etc.feature_importances_
        if not N:
            N = len(f)

        if sort:
            return nlargest(N, [(f[i], i) for i in range(len(f))])
        else:
            return [(f[i], i) for i in range(len(f))]
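For reference, here is the same ranking idea as a stand-alone sketch outside the class; the synthetic dataset and the top-5 cutoff are illustrative assumptions, not part of the original code:

from heapq import nlargest

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Toy data standing in for self.data["X"] / self.data["Y"].
X, y = make_classification(n_samples=500, n_features=20, random_state=0)

etc = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X, y)
f = etc.feature_importances_

# Top-5 (importance, column index) pairs, mirroring the sort=True branch.
print(nlargest(5, [(f[i], i) for i in range(len(f))]))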
Example #2
    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is None:
            max_features = int(X.shape[1]**float(self.max_features))
            self.estimator = ETC(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True)

        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self
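The grow-by-n_iter logic above (used again in Examples #4 and #7) relies on scikit-learn's warm_start mechanism. A minimal sketch of that mechanism on its own, with toy data as an assumption:

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

X, y = make_classification(n_samples=300, random_state=0)

# With warm_start=True, raising n_estimators and refitting grows the
# existing forest instead of retraining it from scratch.
clf = ExtraTreesClassifier(n_estimators=10, warm_start=True, random_state=0)
clf.fit(X, y)

clf.n_estimators += 10
clf.fit(X, y)
print(len(clf.estimators_))  # 20 trees after the second fit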
Example #3
def main():
    # Get the clean datasets
    x, y, xt, feats, sample = readData()

    # Try out different models
    xg_class_params = {"objective": "binary:logistic", "eval_metric": "auc",
                       "booster": "gbtree", "eta": 0.01, "max_depth": 14,
                       "min_child_weight": 10, "subsample": 0.66,
                       #"colsample_bytree": 0.7,
                       "colsample_bylevel": 0.3,
                       "thread": 1, "silent": 1, "seed": 221}
    xg_class_params2 = {"objective": "binary:logistic", "eval_metric": "auc",
                        "booster": "gbtree", "eta": 0.02, "max_depth": 5,
                        "min_child_weight": 10, "subsample": 0.66,
                        #"colsample_bytree": 0.7,
                        "colsample_bylevel": 0.3,
                        "thread": 1, "silent": 1, "seed": 221}
    rf1 = RF(n_estimators=1000, max_features=50, criterion='entropy',
             min_samples_split=40, max_depth=30, min_samples_leaf=2,
             n_jobs=10, verbose=0, random_state=42)
    etc1 = ETC(n_estimators=500, max_features=90, criterion='entropy',
               min_samples_split=20, max_depth=25, min_samples_leaf=10,
               n_jobs=10, verbose=0, random_state=42)
    xgb1 = XGC(xg_class_params, num_rounds=550)
    xgb2 = XGC(xg_class_params2, num_rounds=600)
    xgb_bag = bagger(xgb2, num_bags=3, bag_fraction=0.75)

    # EVALUATE a model
    score = crossValidate(etc1, x, y, folds=5, runs=1)
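readData, XGC, bagger, and crossValidate are project-local helpers that are not shown. A plausible minimal crossValidate, assuming numpy inputs and that it returns the mean ROC-AUC over repeated stratified folds, might look like:

import numpy as np
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def crossValidate(model, x, y, folds=5, runs=1):
    # Mean ROC-AUC over `runs` repetitions of `folds`-fold stratified CV.
    scores = []
    for run in range(runs):
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=run)
        for tr, va in skf.split(x, y):
            m = clone(model).fit(x[tr], y[tr])
            scores.append(roc_auc_score(y[va], m.predict_proba(x[va])[:, 1]))
    return np.mean(scores)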
Example #4
    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETC(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth, min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
                max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True
            )

        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y, sample_weight=sample_weight)
        self.estimator = tmp
        return self
Example #5
    def fit_predict(self, data_fit, data_predict):
        clf = ETC(criterion='gini',
                  max_features=self.p['nfeatures'],
                  max_depth=self.p['depth'],
                  n_estimators=self.p['ntrees'],
                  random_state=self.p['seed'])
        clf.fit(data_fit.x, data_fit.y)
        yhat = clf.predict_proba(data_predict.x)[:, 1]
        return data_predict.ids, yhat
Example #6
def etccv(n_estimators, min_samples_split):
    return cross_val_score(AdaBoostClassifier(ETC(
        min_samples_split=int(min_samples_split), random_state=2, n_jobs=-1),
                                              algorithm="SAMME",
                                              n_estimators=int(n_estimators)),
                           train,
                           train_labels,
                           scoring='roc_auc',
                           cv=5).mean()
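etccv is an objective function for the bayes_opt package (see also Example #17). A hedged sketch of how such an objective is typically wired up, assuming train/train_labels are defined as above and using the newer bayes_opt API (older releases expose results via res['max'] instead of .max):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=etccv,
    pbounds={'n_estimators': (200, 800), 'min_samples_split': (2, 8)},
    random_state=1)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # {'target': best score, 'params': {...}}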
Example #7
    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is None:
            max_features = int(X.shape[1]**float(self.max_features))
            if self.criterion not in ("gini", "entropy"):
                raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                                 "%s" % self.criterion)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            self.oob_score = check_for_bool(self.oob_score)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            self.estimator = ETC(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True)

        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self
Example #8
    def etc_ccv(self, n_estimators, top_args):
        _transition_model = ETC(n_estimators=int(n_estimators), random_state=0)
        _transition_model.fit(self.X, self.y)
        _transition_model_select = SelectFromModel(_transition_model,
                                                   prefit=True,
                                                   threshold=top_args)
        _transition_model_x = _transition_model_select.transform(self.X)

        # If there are no features to select, return 0 as the accuracy.
        if _transition_model_x.shape[1] == 0:
            return 0

        # score = make_scorer(self.loss_function_2, greater_is_better=True)
        # af = AffinityPropagation().fit(self.X)
        # labels = af.labels_
        # val = metrics.metrics.adjusted_mutual_info_score(self.y, )
        # val = cross_validate(
        # AffinityPropagation(),
        # X=_transition_model_x,
        # y=self.y,
        # scoring=score,
        # cv=2
        # )

        score = make_scorer(self.loss_function_2, greater_is_better=True)
        val = cross_validate(ETC(n_estimators=int(n_estimators),
                                 random_state=0),
                             X=_transition_model_x,
                             y=self.y,
                             scoring=score,
                             cv=2)

        self.cumulative_objective_function.append({
            "score": val['test_score'].mean(),
            "n_estimators": n_estimators,
            "top_args": _transition_model_x.shape[1],
            "importance_cutoff": top_args,
        })

        return val['test_score'].mean()
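The SelectFromModel step used above, shown stand-alone; the synthetic data and the 'median' threshold are illustrative assumptions:

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=400, n_features=30, n_informative=5,
                           random_state=0)

forest = ExtraTreesClassifier(n_estimators=200, random_state=0).fit(X, y)
selector = SelectFromModel(forest, prefit=True, threshold='median')
X_sel = selector.transform(X)
print(X.shape, '->', X_sel.shape)  # keeps features above median importance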
Example #9
    def fit_predict(self, dfit, dpre, tournament):
        clf = ETC(criterion='gini',
                  max_features=self.p['nfeatures'],
                  max_depth=self.p['depth'],
                  n_estimators=self.p['ntrees'],
                  random_state=self.p['seed'],
                  n_jobs=-1)
        clf.fit(dfit.x, dfit.y[tournament])
        yhat = clf.predict_proba(dpre.x)[:, 1]
        return dpre.ids, yhat
Example #10
def trainClassifier(xTrain, yTrain):
    # learner = LR(penalty='l2')
    # learner = SVM()
    # learner = DT()
    # learner = RF()
    learner = ETC()
    # learner = ADA(n_estimators=200)
    # learner = G(n_estimators=100)
    learner.fit(xTrain, yTrain)
    return learner
Example #11
    def optimize(self):
        self.gp_params = {"alpha": 1e-5}
        self.etc_0 = BayesianOptimization(
            self.etc_ccv, {
                'n_estimators': (1000, 1000),
                'top_args': (self.min_importance, self.max_importance)
            })

        self.etc_0.maximize(n_iter=self.epochs, **self.gp_params)

        print('selecting best performance parameters ...')
        selected_parameters = sorted(self.etc_0.res,
                                     key=lambda i: i['target'])[-1]

        self.forest = ETC(
            n_estimators=int(selected_parameters['params']['n_estimators']),
            random_state=0)

        self.forest.fit(self.X, self.y)
        self._selected_features_model = SelectFromModel(
            self.forest,
            prefit=True,
            threshold=selected_parameters['params']['top_args'])

        self.parameters = pd.DataFrame({
            "score": [i['score'] for i in self.cumulative_objective_function],
            "n_estimators": [i['n_estimators']
                             for i in self.cumulative_objective_function],
            "top_args": [i['top_args']
                         for i in self.cumulative_objective_function],
            "importance_cutoff": [i['importance_cutoff']
                                  for i in self.cumulative_objective_function],
        })
        # print(json.dumps(self.cumulative_objective_function, indent=4))

        self.x_t_selected = self._selected_features_model.transform(self.X)

        self.x_selected = pd.DataFrame(
            data=self.x_t_selected,
            index=self.X.index,
            columns=self.X.columns[
                self._selected_features_model.get_support()])

        self.importances = pd.DataFrame({
            'Gene': self.x_selected.columns,
            'importance': self.forest.feature_importances_[
                self._selected_features_model.get_support()],
        })
Example #12
def Model_rec(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=42)
    scaler = MinMaxScaler()
    column_names = X_train.columns.values
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)  # reuse the training-set scaling; never refit on test data
    X_train = pd.DataFrame(data=X_train, columns=column_names)
    X_test = pd.DataFrame(data=X_test, columns=column_names)
    forest = ETC(n_estimators=250, max_depth=10, random_state=np.random)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)
    return (forest, scaler, X_test, y_test)
Example #13
train_y = np.array(
    train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)

cv_scores = []
i = 0

for dev_index, val_index in KF.split(train_df, train_y):
    result_dict = {}

    dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
    # filter the features
    dev_X, val_X = dev_set[features].values, val_set[features].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    et = ETC(n_estimators=2000, random_state=0)
    et.fit(dev_X, dev_y)
    preds = et.predict_proba(val_X)

    # save the predictions for future use
    pickl_file = store + 'et2000-5fold-out-' + str(i) + '.pickle'
    with open(pickl_file, 'wb') as fileObject:
        pickle.dump(preds, fileObject)

    loss = log_loss(val_y, preds)

    cv_scores.append(loss)
    i += 1
    print('loss for turn ' + str(i) + ' is ' + str(loss))
Example #14
print("logloss", logloss(Yval, y_pred_prob2))

#Estimation on testing data
y_pred_prob = gmean_pred(clfs2, Xtest)
test_data['XGB_pred'] = y_pred_prob

#######################ExtraTrees################################
print('\n ################Extra Trees########################')
y_pred = Y * 0
y_pred_prob = Y * 0

clfs3 = []
for train, test in kf:
    X_train, X_test = X[train, :], X[test, :]
    y_train, y_test = Y[train], Y[test]
    clf = ETC(n_estimators=100, n_jobs=7)
    clf.fit(X_train, y_train)
    clfs3.append(clf)
    y_pred_test = clf.predict(X_test)
    y_pred_prob_test = clf.predict_proba(X_test)[:, 1]
    print("Iteration - logloss", logloss(Y[test], y_pred_prob_test))

clf3 = combine_classifier(clfs3)
#Training performance evaluation
y_pred = clf3.predict(X)
y_pred_prob = clf3.predict_proba(X)[:, 1]
print('Training Results:')
print(confusion_matrix(Y, y_pred))
print("logloss", logloss(Y, y_pred_prob))

train_data['ETC_pred'] = clf3.predict_proba(X)[:, 1]
Example #15
df_validation_label = df_validation.loc[:,"label"]
df_validation_label

df_test_statement = df_test.loc[:,"statement"]
df_test_statement

df_test_label = df_test.loc[:,"label"]
df_test_label

# Training the model  
from sklearn.ensemble import BaggingClassifier as BRC
from sklearn.ensemble import ExtraTreesClassifier as ETC
pipeline = Pipeline([
        ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
        ('clf', BRC(base_estimator=ETC(n_estimators=30), n_estimators=100,
                    bootstrap_features=True, oob_score=True, max_features=7))
    ])
pipeline.fit(df_training_statement, df_training_label)

predicted_labels = pipeline.predict(df_validation_statement)
predicted_labels

accuracy = pipeline.score(df_validation_statement, df_validation_label)
accuracy

predicted_labels_test = pipeline.predict(df_test_statement)
predicted_labels_test

accuracy_test = pipeline.score(df_test_statement, df_test_label)
accuracy_test
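A single train/validation split can be noisy; here is a sketch of scoring the same pipeline with 5-fold cross-validation on the training data, assuming the frames defined above:

from sklearn.model_selection import cross_val_score

cv_acc = cross_val_score(pipeline, df_training_statement, df_training_label,
                         cv=5, scoring='accuracy')
print(cv_acc.mean(), cv_acc.std())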
Example #16
inpFile = open("data/training_data.txt", "r")

# Extract the rest of the data so that we can parse it
training_data = np.genfromtxt("data/training_data.txt",
                              delimiter="|",
                              skip_header=1)
test_data = np.genfromtxt("data/testing_data.txt",
                          delimiter="|",
                          skip_header=1)

X = training_data[:, :1000]
Y = training_data[:, 1000]

# Various Classifiers
dtc_min_samples_leaf = DTC(min_samples_leaf=15)
etc = ETC()
gbc = GBC()
rfc = RFC()
dtc_max_depth = DTC(max_depth=8)
nb = BernoulliNB()
svc = SVC()
lr = LR()
abc = ABC()
bc = BC()
'''
inv_doc_freq = np.zeros(1000)
for i in range(len(inv_doc_freq)):
    total = sum(X[:, i])
    if total == 0:
        inv_doc_freq[i] = 0
    else:
Example #17
    # ETC
    etcBO = BayesianOptimization(etccv, {
        'n_estimators': (200, 800),
        'min_samples_split': (2, 8)
    })
    print('-' * 53)
    etcBO.maximize()
    print('-' * 53)
    print('Final Results')
    print('ETC: %f' % etcBO.res['max']['max_val'])

    # # MAKING SUBMISSION
    etc = ETC(
        n_estimators=int(etcBO.res['max']['max_params']['n_estimators']),
        min_samples_split=int(
            etcBO.res['max']['max_params']['min_samples_split']),
        random_state=2,
        n_jobs=-1)
    score = cross_val_score(etc, train, train_labels,
                            scoring='roc_auc', cv=5).mean()

    etc.fit(train, train_labels)
    preds = etc.predict_proba(test)[:, 1]
    print('Prediction Complete')
    submission = pd.DataFrame(preds, index=test_labels, columns=['target'])
    submission.to_csv('../output/extratrees_autotune.csv')
Example #18
import utils
import pickle

from os.path import isfile

from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

filename = '/usr/src/app/sentiment/models/pickles/BernoulliNB.pickle'

if not isfile(filename):

    train, test = train_test_split(utils.read_data(), test_size=0.2)

    train_embeddings = utils.combined_embeddings(train['text'].tolist())
    test_embeddings = utils.combined_embeddings(test['text'].tolist())

    clf = ETC(n_estimators=100)
    clf.fit(train_embeddings, train['sentiment'])

    prediction = clf.predict(test_embeddings)
    report = classification_report(test['sentiment'], prediction)
    print(report)

    with open(filename, 'wb') as f:
        pickle.dump(clf, f)

else:
    print('Already Trained!')
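The counterpart to the dump above, reloading the pickled classifier for scoring; the sample sentence, and the assumption that utils.combined_embeddings accepts any list of strings (as it does for train['text'].tolist() above), are illustrative:

with open(filename, 'rb') as f:
    clf = pickle.load(f)

sample = utils.combined_embeddings(['an example sentence to score'])
print(clf.predict(sample))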
Example #19
# KNeighbors Classifier
kn_cls = KNNc(n_neighbors=41, weights='uniform', algorithm='brute',
              metric='chebyshev')

# Ridge Classifier
rd_cls = RdC(fit_intercept=False, class_weight=None, solver='lsqr',
             random_state=5)

# Random Forest Classifier
rf_cls = RFC(n_estimators=200, max_depth=10, min_samples_split=2,
             min_samples_leaf=3, max_features=None, class_weight=None,
             criterion='entropy', random_state=5)

# Extra Trees Classifier
et_cls = ETC(criterion='entropy', min_impurity_decrease=0.0, bootstrap=True,
             max_features=None, n_estimators=100, max_depth=None,
             min_samples_split=3, min_samples_leaf=2, max_leaf_nodes=20,
             class_weight=None, random_state=5)

# Gradient Boosting Classifier
gb_cls = GBC(loss='deviance', max_features=None, learning_rate=0.125,
             n_estimators=150, min_samples_split=2, min_samples_leaf=20,
             max_depth=5, min_impurity_decrease=0.20, max_leaf_nodes=10,
             random_state=5)

# Isolation Forest
if_cls = IFc(random_state=5)
if_param = {'n_estimators': [100, 200, 300],
            'contamination': [0.05, 0.1, 0.2],
            'max_features': [0.5, 0.75, 1.0],
            'bootstrap': [True, False],
            'behaviour': ['new']}
Example #20
words = inpFile.readline().rstrip().split("|")

# Extract the rest of the data so that we can parse it
training_data = np.genfromtxt("data/training_data.txt",
                              delimiter="|",
                              skip_header=1)
test_data = np.genfromtxt("data/testing_data.txt",
                          delimiter="|",
                          skip_header=1)

X = training_data[:, :1000]
Y = training_data[:, 1000]

# Various Classifiers
dtc_min_samples_leaf = DTC(min_samples_leaf=15)
etc = ETC()
gbc = GBC()
rfc = RFC()
dtc_max_depth = DTC(max_depth=8)
nb = BernoulliNB()
svc = SVC()

# Split Training Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3333)

# Compare individual classifiers
dtc_min_samples_leaf.fit(X_train, Y_train)
print "dtc_min_samples_leaf"
prediction1_train = dtc_min_samples_leaf.predict(X_train)
prediction1 = dtc_min_samples_leaf.predict(X_test)
print(accuracy_score(prediction1, Y_test))
Example #21
def nn_vs_out_of_the_box_model(all_data=None,
                               test_box_id=1000,
                               prop_test=0.1,
                               predictor_params=None,
                               box_file=None,
                               min_test_cases=5,
                               max_test_cases=100):
    """
    Tests accuracy of nearest neighbor predictor for a specific box
    :param all_data:
    :param test_id:
    :param prop_test:
    :param window:
    :return:
    """
    # ------------PRELIMINARY SET UP-----------------------------------
    data_test_bx = all_data[all_data.box_id ==
                            test_box_id]  # select data for this box only
    train_df, test_df = prepare_data_for_training_testing(
        data=all_data,
        box_id=test_box_id,
        min_test_cases=min_test_cases,
        prop_test=prop_test,
        max_test_cases=max_test_cases)

    # ----------------CREATE MODEL OBJECT-----------------------------
    num_boxes = len(list(all_data.box_id.unique()))

    etc = ETC(n_estimators=100)
    clf = pred.ImputationNearestNeighbor(
        data=train_df,
        target=predictor_params['target'],
        neighbors=predictor_params['neighbors'],
        how=predictor_params['how'],
        time_window=predictor_params['time-window'],
        direction=predictor_params['direction'],
        out_of_box_model=etc,
        pred_features=PREDICTION_FEATURES)

    clf.generate_box_metadata(box_file=box_file)
    box_lat_lon = [
        data_test_bx[data_test_bx.box_id == test_box_id].lat.values[0],
        data_test_bx[data_test_bx.box_id == test_box_id].lon.values[0]
    ]

    # --------------TEST MODEL PERFORMANCE OUT OF THE BOX---------------------------------------------------
    results_out = compute_metrics_power_state(
        model_object=clf,
        test_data=test_df,
        box_id=test_box_id,
        xy=box_lat_lon,
        model_type='out',
    )

    # --------------TEST MODEL PERFORMANCE NEAREST NEIGHBOR-----------------------------------------------
    results_nearest = compute_metrics_power_state(model_object=clf,
                                                  test_data=test_df,
                                                  box_id=test_box_id,
                                                  xy=box_lat_lon,
                                                  model_type='nn')

    # --------------TEST MODEL PERFORMANCE MAJORITY CLASSIFIER------------------------------------------
    results_majority = compute_metrics_power_state(model_object=clf,
                                                   test_data=test_df,
                                                   box_id=test_box_id,
                                                   xy=box_lat_lon,
                                                   model_type='major')

    # --------------TEST MODEL PERFORMANCE RANDOM-------------------------------------------------------
    results_random = compute_metrics_power_state(model_object=clf,
                                                 test_data=test_df,
                                                 box_id=test_box_id,
                                                 xy=box_lat_lon,
                                                 model_type='rand')

    return results_nearest, results_out, results_majority, results_random
Example #22
scores = cross_val_score(LR, X, y, cv=3, scoring='roc_auc')
print(scores)
print(np.mean(scores))

#random forest
from sklearn.ensemble import ExtraTreesClassifier as ETC
RF = RF(n_estimators=100, random_state=1)
RF.fit(X_train, y_train)

predicted_probs = RF.predict_proba(X_test)
predicted_probs = ["%f" % x[1] for x in predicted_probs]
print(RF.score(X_test, y_test))

# feature_importances_ is always computed after fitting; the old
# compute_importances flag no longer exists in scikit-learn.
forest = ETC(n_estimators=100, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
print "feature importances:"
print importances

pl.figure()
pl.title("feature importances")
pl.bar(xrange(9),
       importances[indices],
       color="b",
Example #23
    #label of training dataset
    DEFAULTER = np.array(df['DEFAULTER'])

    #stacking features together in a matrix
    X = np.column_stack((AMOUNT, VAR_1, VAR_2, DUE_MORTGAGE, VALUE, DCL,
                         REASON, OCC, TJOB, CL_COUNT, CL_COUNT, RATIO))

    #setting Y as the label
    Y = DEFAULTER

    # use Imputer() to fill in the missing values with strategy='mean'
    imputer = Imputer(copy=False)
    transformed_X = imputer.fit_transform(X)

    # fit the model on the training dataset; the model is a BaggingClassifier with ExtraTreesClassifier as its base estimator
    model = BRC(base_estimator=ETC(n_estimators=30),
                n_estimators=100,
                bootstrap_features=True,
                oob_score=True,
                max_features=7)
    model.fit(transformed_X, Y)
    '''
    #crossvalidating the model using RepeatedStratifiedKFold
    model = BRC(base_estimator=ETC(n_estimators=30), n_estimators=100,bootstrap_features=True,oob_score=True,max_features = 7)
    kfold = KFold()
    result = cross_val_score(model, transformed_X, Y, cv=kfold, scoring = 'roc_auc')
    print(result.mean())
    '''

    f = r'F:/Analyticity2018/test.csv'  # path of the test file
    df = pd.read_csv(f)  # creating a pandas dataframe