def prep_data(data_set):
    if data_set == 1:
        data = pd.read_excel(
            r'C:\\Users\\cbroe\\OneDrive\\Skrivebord\\Stuff\\School\\bachelor\\Python\\Bachelor\\output.xlsx'
        )
        data = data.iloc[13750:86190]
        lon, lat = fs.get_feature_selection_rows(data_set, 6)
        X = data.filter(lon)
        y = data.filter(['lon_rad', 'lat_rad'])
        return data, X, y
    elif data_set == 2:
        data = pd.read_excel(
            r'C:\\Users\\cbroe\\OneDrive\\Skrivebord\\Stuff\\School\\bachelor\\Python\\Bachelor\\output2.xlsx'
        )
        data = data.iloc[0:3700]
        lon, lat = fs.get_feature_selection_rows(data_set, 5)
        X = data.filter(lon)
        y = data.filter(['Long', 'Lat'])
        return data, X, y
def plot_score_log_reg_embedded(X, y):
    # c = 2050, s = 0.963
    # threshold = 0.514, shape = 16
    logistic_regression = log_reg_train(X, y, C=2050, max_iter=27)
    print(logistic_regression.n_iter_)
    print(logistic_regression.coef_)
    # Avoid shadowing the built-in range()
    thresholds = np.linspace(0, logistic_regression.coef_.max(), 20)
    scores = []
    scores_embedded = []
    for threshold in thresholds:
        score = cross_val_score(logistic_regression, X, y, cv=5,
                                scoring='accuracy')
        select_model, X_embedded = FeatureSelection.select_by_model(
            logistic_regression, X, y, threshold=threshold)
        score_embedded = cross_val_score(logistic_regression, X_embedded, y,
                                         cv=5, scoring='accuracy')
        scores.append(score.mean())
        scores_embedded.append(score_embedded.mean())
    print(thresholds[scores_embedded.index(max(scores_embedded))],
          max(scores_embedded))
    plt.plot(thresholds, scores)
    plt.plot(thresholds, scores_embedded)
    plt.show()
    select_model, X_embedded = FeatureSelection.select_by_model(
        logistic_regression, X, y, threshold=0.514)
    print(X_embedded.shape)
def validation_core(self, i, x, y, model, feature_count):
    # self is required: the body uses the class's fold-splitting and scoring helpers
    (foldTrainX, foldTrainY, foldValidationX, foldValidationY) = \
        self.__splitDataFold(x, y, i)
    mutualInformationTable = FeatureSelection.byMutualInformation(foldTrainX, foldTrainY)
    words = [word for word, _ in mutualInformationTable[:feature_count]]
    (xNewTrain, xNewValidation) = FeatureSelection.Featurize(foldTrainX, foldValidationX, words)
    model.fit(xNewTrain, foldTrainY)
    return self.__countCorrect(model.predict(xNewValidation), foldValidationY)
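# Hedged usage sketch (not from the original codebase): validation_core handles one fold,
# so a k-fold driver for the mutual-information variant can mirror validateByFrequency
# below and sum the per-fold correct counts. The method name and the use of self.k are
# assumptions introduced for illustration.
def validateByMutualInformation(self, x, y, model, feature_count):
    totalCorrect = 0
    for i in range(self.k):
        totalCorrect += self.validation_core(i, x, y, model, feature_count)
    return totalCorrect / len(x)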
def main():
    # Define constants
    datadir = 'data/IS11_speaker_state'   # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    # datadir = 'data/IS09_emotion'       # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    n_features = 1500       # Totally 382 features for IS09_emotion; 4354 for IS11_speaker_state
    fs_method = 'none'      # 'univ', 'rfe', 'lasso', 'l1norm', 'fdr' or 'none'
    C = 0.01                # Penalty factor for RFE-SVM feature selection
    rfe_step = 1            # Step for RFE-SVM feature selection
    n_speakers = 10         # There are 10 speakers in EmoDB
    svm_pred = []
    svm_true = []

    print('Start train and test ...')
    trn_file = datadir + '/' + 'cheavd_trn_fs.mat'
    tst_file = datadir + '/' + 'cheavd_tst_fs.mat'

    # Load trn and tst data
    trn_data = sio.loadmat(trn_file)
    tst_data = sio.loadmat(tst_file)

    # Retrieve training and test data
    x_train = np.array(trn_data['x'], dtype='float32')
    y_train = np.array(trn_data['y'].ravel(), dtype='int32')
    x_test = np.array(tst_data['x'], dtype='float32')
    y_test = np.array(tst_data['y'].ravel(), dtype='int32')

    if fs_method == 'rfe':
        # Select features by RFE-SVM, removing rfe_step feature(s) per iteration
        x_train, x_test = fs.rfe_select_features(x_train, y_train, x_test,
                                                 n_fs=n_features, penalty=C,
                                                 step=rfe_step)
    elif fs_method == 'l1norm':
        # Select features by L1-norm
        x_train, x_test = fs.l1norm_select_features(x_train, y_train, x_test,
                                                    penalty=C)
    elif fs_method == 'lasso':
        # Select features by LASSO
        x_train, x_test = fs.lasso_select_features(x_train, y_train, x_test,
                                                   alpha=0.001)
    elif fs_method == 'univ':
        # Select features by univariate statistics
        x_train, x_test = fs.univ_select_features(x_train, y_train, x_test,
                                                  n_fs=n_features)
    elif fs_method == 'fdr':
        # Select features by FDR
        x_train, x_test = fs.fdr_select_features(x_train, y_train, x_test,
                                                 n_fs=n_features)
    elif fs_method == 'none':
        pass
    print('No. of selected features = %d, ' % x_train.shape[1], end='')

    # Train an SVM classifier
    svc = SVC(C=1, gamma='auto', kernel='rbf')
    svc.fit(x_train, y_train)

    # Test the SVM classifier
    svm_pred = svc.predict(x_test)
    n_classes = np.max(y_train) + 1
    print('Overall SVM accuracy for %d classes with %d features/class: %.2f%%' %
          (n_classes, n_features, get_accuracy(svm_pred, y_test)))
def retrieve_feature_name(self):
    """Retrieve the names of the features for the nodes."""
    feature_stat_df = FeatureSelection.FeatureStatus().feature_stat
    feature_name_list = feature_stat_df['feature'].tolist()
    return feature_name_list
def start(self):
    # Perform some logging
    self.jlogger.info("Starting job with job id {}".format(self.job_id))
    self.jlogger.debug("Job Config: {}".format(self.config))
    self.jlogger.debug("Job Other Data: {}".format(self.job_data))

    try:
        rud.ReadUserData(self)
        fg.FeatureGeneration(self, is_train=True)
        pp.Preprocessing(self, is_train=True)
        fs.FeatureSelection(self, is_train=True)
        fe.FeatureExtraction(self, is_train=True)
        clf.Classification(self)
        cv.CrossValidation(self)
        tsg.TestSetGeneration(self)
        tspp.TestSetPreprocessing(self)
        tsprd.TestSetPrediction(self)
        job_success_status = True
    except:
        job_success_status = False
        helper.update_running_job_status(self.job_id, "Errored")
        self.jlogger.exception("Exception occurred in ML Job {} ".format(self.job_id))

    return job_success_status
def main():
    X_train, X_test, y_train, y_test = FeatureSelection.main(
        './kickstarter-projects/ks-projects-201801.csv')
    X_train_d, X_test_d = _create_dummy_features(X_train, X_test)
    X_train, X_test = _create_label_encoding(X_train, X_test)

    dt = decisionTree(X_train_d, X_test_d, y_train, y_test)
    lgr = logisticRegrs(X_train_d, X_test_d, y_train, y_test)
    rf = randomForest(X_train_d, X_test_d, y_train, y_test)
    gb = gradientBooster(X_train_d, X_test_d, y_train, y_test)
    ada = adaBooster(X_train_d, X_test_d, y_train, y_test)
    bagging_ensemble = bagging(X_train_d, X_test_d, y_train, y_test)

    rf = RandomForestClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=25, n_estimators=100,
                                random_state=0,
                                class_weight='balanced_subsample')
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=1000, random_state=0,
                                class_weight='balanced')
    gb = GradientBoostingClassifier(loss='exponential', n_estimators=100,
                                    random_state=0)
    ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100,
                             learning_rate=0.5)
    estimators = [('dt', dt), ('rf', rf), ('ada', ada), ('gb', gb)]
    voting_ensemble = voting(X_train_d, X_test_d, y_train, y_test, estimators)
def validateByFrequency(self, x, y, model):
    totalCorrect = 0
    for i in range(self.k):
        (foldTrainX, foldTrainY, foldValidationX, foldValidationY) = \
            self.__splitDataFold(x, y, i)
        frequencyTable = FeatureSelection.byFrequency(foldTrainX)
        words = [word for word, _ in frequencyTable[:10]]
        print('For fold %d/%d, choose words:' % (i + 1, self.k))
        print(words)
        (xNewTrain, xNewValidation) = FeatureSelection.Featurize(
            foldTrainX, foldValidationX, words)
        model.fit(xNewTrain, foldTrainY)
        totalCorrect += self.__countCorrect(model.predict(xNewValidation),
                                            foldValidationY)
    accuracy = totalCorrect / len(x)
    return accuracy
def main():
    sf = FS.Select(Sequence=False, Random=True, Cross=False)  # select the way you want to process searching
    sf.ImportDF(prepareData(), label='is_trade')
    sf.ImportLossFunction(modelscore, direction='descend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['used', 'instance_id', 'item_property_list',
                               'context_id', 'context_timestamp',
                               'predict_category_property', 'is_trade']
    sf.InitialFeatures(['item_category_list', 'item_price_level',
                        'item_sales_level', 'item_collected_level',
                        'item_pv_level', 'day'])
    sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves=6,
                                 n_estimators=5000, max_depth=3,
                                 learning_rate=0.05, n_jobs=8)
    sf.logfile = 'record.log'
    sf.run(validation)
def main():
    sf = FS.Select(Sequence=True, Random=True, Cross=True)
    sf.ImportDF(prepareData(), label='Survived')
    sf.ImportLossFunction(modelscore, direction='ascend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['Survived']
    sf.InitialFeatures([])
    sf.PotentialAdd = ['Pclass']
    # sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves=6, n_estimators=5000,
    #                              max_depth=3, learning_rate=0.05, n_jobs=1)
    sf.clf = LogisticRegression()
    sf.logfile = 'record.log'
    sf.run(validation)
def featureSelect():
    """
    Desc : Feature Selection
    """
    print('\n ********** Feature Selector ***********')
    # fileName = '/nobackup/anikgaik/search/features/Train_Features/Train_Features.csv'
    # writeFile = '/nobackup/anikgaik/search/features/Train_Features/Final_train_Feature.csv'
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Generated_Features.csv'
    writeFile = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Selected_Features.csv'

    oFS = FS.featureSelect()
    oFS.loadfeature(fileName)
    oFS.printAll()
    oFS.selectFeatures()
    oFS.generateNewfeature(fileName, writeFile)
def run(is_changing_n=False, number_to_ngram=3, number_of_features=220,
        fs_method=FEATURE_SELECTION_MOST_COMMON, c_method=CLASSIFIER_ONE_CLASS_SVM):
    # set param for final calcs
    all_ans = []

    # Feature extraction
    out_0_file = Path(out_0_path)
    if is_changing_n:
        fe.export_to_csv_all_users(number_to_ngram)
    if not out_0_file.exists():
        fe.export_to_csv_all_users(number_to_ngram)

    for user_number in range(0, 5):
        # Feature selection
        FeatureSelection.select_features(number_of_features, fs_method, user_number)
        # Classifier
        ans = Classifier.classify(number_of_features, c_method)
        all_ans.append(ans)

    print("""
    ** FINAL SCORE : {} **
    """.format(sum(all_ans) / len(all_ans)))
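# Hedged driver sketch (assumed, not part of the original script): rerun the pipeline
# for a few n-gram sizes and compare the reported final scores; the chosen sizes are
# illustrative only.
if __name__ == '__main__':
    for n in (2, 3, 4):
        run(is_changing_n=True, number_to_ngram=n)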
def main():
    train_X, train_Y, test_X, test_Y = data.load_ICICI()
    start_time = time.clock()
    clf = Ridge()
    features = SelectKBest(f_regression)
    train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
        clf, train_X, train_Y, test_X, test_Y)
    k_value = [best_K]
    hyper_parameters = hyper.main(Ridge, 1)
    hyper_parameters['Kbest__k'] = k_value
    pipeline = Pipeline([('Kbest', features), ('model', clf)])
    rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
    rand_search.fit(train_X, train_Y)

    best_accu = 1000
    b_i = 0
    predictions = []
    for i in range(len(test_Y) / 3):
        b_pred = rand_search.predict(test_X) - (test_Y[i] - rand_search.predict(test_X)[i])
        RMSE = math.sqrt(mean_squared_error(test_Y, b_pred))
        if RMSE < best_accu:
            best_accu = RMSE
            b_i = i
            predictions = b_pred[1:]
    test_Y = test_Y[:-1]
    print b_i, best_accu
    # predictions = predictions - diff  # normalizing

    end_time = time.clock() - start_time
    filename = "..\Speed\ICICI_Ridge_time.txt"
    target = open(filename, 'w')
    target.write(str(end_time))

    MSE = mean_squared_error(test_Y, predictions)
    RMSE = math.sqrt(MSE)
    filename = "..\RMSE\ICICI_Ridge_rmse.txt"
    target = open(filename, 'w')
    target.write(str(RMSE))
    print "ICICI RIDGE BEST", RMSE

    ICICI = ["ICICI"]
    Graphs_plotting.line_graph(5000, test_Y, predictions, "RidgeBEST", ICICI[0])
def get_data_splits(self, cv_method):
    cv_data_splits = []
    x = self.ml_pipeline.data.values
    y = self.ml_pipeline.data_labels.values.ravel()

    i = 1
    for train_index, test_index in cv_method.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        x_train_pd = pd.DataFrame(x_train)
        x_test_pd = pd.DataFrame(x_test)

        ppp_ml_pipeline = MLPipeline.MLPipeline(self.ml_pipeline.job_id)
        ppp_ml_pipeline.x_train = x_train_pd
        ppp_ml_pipeline.y_train = y_train
        ppp_ml_pipeline.x_test = x_test_pd
        ppp_ml_pipeline.y_test = y_test

        pp = ppp.Preprocessing(ppp_ml_pipeline, is_train=False)
        pp.preprocess_data()

        fs = pfs.FeatureSelection(ppp_ml_pipeline, is_train=False)
        fs.perform_feature_selection()

        fe = pfe.FeatureExtraction(ppp_ml_pipeline, is_train=False)
        fe.perform_feature_extraction()

        self.jlogger.info("Cross validation split number {}".format(i))
        self.jlogger.info("XTrain Shape: {}".format(ppp_ml_pipeline.x_train.shape))
        self.jlogger.info("XTest Shape: {}".format(ppp_ml_pipeline.x_test.shape))
        self.jlogger.info("YTrain Shape: {}".format(ppp_ml_pipeline.y_train.shape))
        self.jlogger.info("YTest Shape: {}".format(ppp_ml_pipeline.y_test.shape))

        cv_data_splits.append((ppp_ml_pipeline.x_train, ppp_ml_pipeline.x_test,
                               ppp_ml_pipeline.y_train, ppp_ml_pipeline.y_test))
        i += 1

    return cv_data_splits
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data
    generateDataDrivenFeats(trainSet, trainAndBSData, es)
    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(0, len(trainSet), 1)]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData), 1)]

    # Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(
        train_feats, y_train, [i for i in range(0, len(trainSet), 1)], es)

    # Calculate inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)
    y_pred_prob = model.predict_proba(x_test)

    # Keep bootstrap samples whose most likely class exceeds the confidence threshold
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # update none to confidence vector
def do_preprocessing(pos_path, neg_path, selected_DB, is_bigrams, k=None,
                     method=None, features_space=None):
    f2r = FileToReview.FileToReview(pos_path, neg_path, selected_DB)
    pos_reviews, neg_reviews = f2r.buildReviewMatrix()

    # Get a new instance for preprocessing.
    # The new instance needs to know where the positive and negative review
    # directories are, as well as the database number.
    prep = Preprocessing(pos_path, neg_path, selected_DB, pos_reviews,
                         neg_reviews, is_bigrams)

    # Extract positive and negative vocabularies
    prep.extract_vocabulary()
    # Get the extracted vocabularies in dictionary (json) format
    vocabs = prep.get_v()

    nb_neg_review = prep.get_nb_neg_review()
    nb_pos_review = prep.get_nb_pos_review()

    # Get a new instance for term-frequency processing.
    # The new instance needs to know where the positive and negative review
    # directories are, as well as the database number.
    tfp = TermFrequencyProcessing.TermFrequencyProcessing(pos_path, neg_path, selected_DB)
    tfp.compute_terms_frequency(vocabs)
    # print(tfp.get_overall_terms_frequency())
    # print(tfp.get_reviews_info())
    T = tfp.get_overall_terms_frequency()

    fs = FeatureSelection.FeatureSelection(T, nb_neg_review, nb_pos_review)

    if not features_space:
        features_space = fs.build_features_space(k, method)
        reduced_vocabs = fs.reduce_vocabs(vocabs, features_space)
        return vocabs, reduced_vocabs, fs, features_space

    reduced_vocabs = fs.reduce_vocabs(vocabs, features_space)
    return vocabs, reduced_vocabs, fs
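# Hedged usage sketch (the paths, k and method values are assumptions, not from the
# original code): build the feature space once on the training corpus, then pass it
# back in unchanged so the test corpus is reduced to the same vocabulary.
train_vocabs, train_reduced, train_fs, features_space = do_preprocessing(
    'train/pos', 'train/neg', selected_DB=1, is_bigrams=False, k=1000, method='MI')
test_vocabs, test_reduced, test_fs = do_preprocessing(
    'test/pos', 'test/neg', selected_DB=1, is_bigrams=False,
    features_space=features_space)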
def main():
    PotentialAdd = ['min_query_time_gap_after', 'hour', 'shop_score_delivery',
                    'min_query_time_gap_before_user_item',
                    'shop_id_smooth_query_rate', 'min_query_time_gap_before',
                    'shop_score_description', 'item_sales_level',
                    'shop_query_count', 'user_star_level', 'user_age_level',
                    'item_sales_query_rate', 'item_query_count',
                    'shop_score_service', 'shop_review_positive_rate',
                    'item_price_level', 'min_query_time_gap_after_user_item']
    '''
    PotentialAdd = []
    '''
    sf = FS.Select(Sequence=True, Random=True, Cross=False,
                   PotentialAdd=PotentialAdd)  # select the way you want to process searching
    sf.ImportDF(prepareData(), label='is_trade')
    sf.ImportLossFunction(modelscore, direction='descend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['instance_id', 'item_id', 'item_brand_id',
                               'item_city_id', 'user_id', 'context_id',
                               'shop_id', 'item_category_0', 'time',
                               'context_timestamp', 'item_property_list',
                               'predict_category_property',
                               'item_category_list', 'is_trade', 'day']
    sf.InitialFeatures(['item_price_level', 'item_sales_level',
                        'item_collected_level', 'min_query_time_gap_after',
                        'min_query_time_gap_before_user_item',
                        'min_query_time_gap_after_user_item', 'hour',
                        'item_category_1', 'shop_score_service',
                        'user_age_level', 'user_star_level',
                        'context_page_id', 'min_query_time_gap_before',
                        'shop_query_count', 'item_sales_count'])
    # sf.InitialFeatures(['item_price_level', 'item_sales_level',
    #                     'item_collected_level', 'item_pv_level'])
    sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves=6,
                                 n_estimators=5000, max_depth=3,
                                 learning_rate=0.05, n_jobs=8)
    sf.logfile = 'record.log'
    sf.run(validation)
def eval_model(folds, models, metric='roc_curve', dummy=False):
    fprs = {}
    tprs = {}
    aucs = {}
    fold = 0
    for train, test in folds:
        X_train, X_test, y_train, y_test = FeatureSelection.main(
            './kickstarter-projects/ks-projects-201801.csv',
            split='strat_k_fold', train=train, test=test)
        if dummy:
            X_train, X_test = _create_dummy_features(X_train, X_test)
        else:
            X_train, X_test = _create_label_encoding(X_train, X_test)

        for name, model in models.items():
            model_ = model.fit(X_train, y_train)
            predictions = model_.predict_proba(X_test)
            fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
            auc_score = auc(fpr, tpr)
            # Accumulate per-fold results instead of overwriting each model's dict
            fprs.setdefault(name, {})[fold] = fpr
            tprs.setdefault(name, {})[fold] = tpr
            aucs.setdefault(name, {})[fold] = auc_score
        fold += 1
    # Return all three, matching the unpacking in main() below
    return fprs, tprs, aucs
def main():
    splitter = create_strat_k_folds('./kickstarter-projects/ks-projects-201801.csv', 3)
    gridsearch_model = joblib.load('./ModelHyperparm/rfParams.pkl')

    rf = RandomForestClassifier(**gridsearch_model.best_params_,
                                n_estimators=100, random_state=0,
                                class_weight='balanced_subsample')
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=1000, random_state=0,
                                class_weight='balanced')
    gb = GradientBoostingClassifier(loss='exponential', n_estimators=100,
                                    random_state=0)
    ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100,
                             learning_rate=0.5)
    estimators = [('dt', dt), ('rf', rf), ('ada', ada), ('gb', gb)]
    voting_ensemble = VotingClassifier(estimators, voting='soft')

    models = {'Random Forest': rf,
              'Decision Tree': dt,
              'Gradient Booster': gb,
              'AdaBooster': ada,
              'Voting Ensemble': voting_ensemble}

    fprs, tprs, aucs = eval_model(splitter, models, dummy=True)
    auc_scores = get_auc_score(aucs)

    X_train, X_test, y_train, y_test = FeatureSelection.main(
        "./kickstarter-projects/ks-projects-201801.csv")
    X_train, X_test = _create_dummy_features(X_train, X_test)

    voting_ensemble.fit(X_train, y_train)
    predictions = voting_ensemble.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    auc_score = auc(fpr, tpr)
    get_roc_curve(fpr, tpr, 'Voting Ensemble', auc_score)

    predictions = voting_ensemble.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predictions)
    print(conf_matrix)
    print("F1:", f1_score(y_test, predictions))
    print("Recall:", recall_score(y_test, predictions))
    print("Precision:", precision_score(y_test, predictions))
def testDiabeteData():
    # TEST WITH DIABETES DATA FROM UCI
    url = "pima-indians-diabetes.data"
    print url

    headers = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
    df = loadCSV(url, headers)
    # df = df.iloc[::2]
    print "First few samples: \n", df.head(5)

    # Scale data
    df[['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']] = \
        ut.minMaxScale(df[['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']])
    # df = df[['preg', 'plas', 'pres', 'mass', 'pedi', 'age', 'class']]

    array = df.values
    [nrow, ncol] = array.shape
    print "First few samples: \n", df.head(5)
    # print "Dimension: ", array.shape

    X = array[:, 0:(ncol - 1)]
    Y = array[:, (ncol - 1)]

    # kfold = getKfolds(5)
    # testClassifers(X, Y, kfold)
    # k = 5
    # runClassifiers(X, Y, k)

    # Feature Selection
    m = ncol - 1  # Number of features
    print "\nFeature Selection by RFE (smaller is better): "
    fs.recursiveFeatureElimination(X, Y, m)
    print "\nFeature Selection by kBest (higher values mean higher dependency): "
    fs.selectKBest(X, Y, m)
    print "\nFeature Selection by extraTree (higher is better): "
    fs.extraTrees(X, Y)
def main(useAnnotatorWeighing=True):
    '''
    This script runs the experiments by training on a train set and testing on a test set.
    It also allows bootstrapping (which is hard coded in this script as well).
    Configure your model settings by modifying the ExperimentSettings object in the script.

    The output of these models are annotated files in the output folder, which can be
    evaluated (in metrics) using testEval.py
    '''
    # Making folders from config
    # cfg.makeFolders()

    # Here, you can specify the feature sets you would like to use. It is arranged in an
    # array of arrays, to enable combinations
    features = [["DSM+1"]]
    # features = [["CONCEPTS"]]  # ['BOW'],
    # features = [["CONCEPTS"]]

    # If you want anything set differently than default, please change the corresponding
    # parameter in es (ExperimentSettings)
    es = ExperimentSettings()
    # es.fs_varianceFilter = True
    # es.bootstrap = True
    # es.ss_prototyping = True
    # es.weighInterAnnot = False
    # es.ml_algorithm = 'RF'
    # remove these!
    # es.removeDeniedConcepts = False
    # es.splitFamilyConcepts = False
    # es.splitUncertainConcepts = False

    # Reading the train/test data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are
    # defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(bootstrap_data,
                                                es.removeDeniedConcepts,
                                                es.splitDeniedConcepts,
                                                es.removeUncertainConcepts,
                                                es.splitUncertainConcepts,
                                                es.removeFamilyConcepts,
                                                es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)
        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test data!')

        y_train = [d.severity for d in train_data]

        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_data, y_train) = m.get_bootstrapped_trainset(
                train_data, y_train, bootstrap_data, es, estimator, th_bs=0.6)

        concatenated_data = []
        concatenated_data.extend(train_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)
        train_feats = featurized[0:len(train_data)]
        test_feats = featurized[len(train_data):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, [i for i in range(len(train_data))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_data, train_bucket, es.weighInterAnnot)

        model = m.train(estimator, x_train, y_train, weights_train, model=None)
        y_pred = m.test(x_test, estimator=model)
        # print(y_pred)

        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data, outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
def main():
    # Define constants
    optimizer = 'adam'          # Can be 'adam', 'sgd', or 'rmsprop'
    activation = 'relu'         # Can be 'sigmoid', 'tanh', 'softplus', 'softsign', 'relu'
    datadir = 'data/IS09_emotion'        # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    # datadir = 'data/IS11_speaker_state'  # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    n_hiddens = [100, 100]
    n_features = 100            # Totally 382 features for IS09_emotion; 4354 for IS11_speaker_state
    fs_method = 'none'          # 'univ', 'rfe', 'lasso', 'l1norm' or 'none'
    C = 0.01
    rfe_step = 1
    n_epochs = 10
    bat_size = 10

    print('Start train and test ...')
    trn_file = datadir + '/' + 'cheavd_trn_fs.mat'
    tst_file = datadir + '/' + 'cheavd_tst_fs.mat'

    # Load trn and tst data
    trn_data = sio.loadmat(trn_file)
    tst_data = sio.loadmat(tst_file)

    # Retrieve training and test data
    x_train = np.array(trn_data['x'], dtype='float32')
    y_train = np.array(trn_data['y'].ravel(), dtype='int32')
    y_train_ohe = np_utils.to_categorical(y_train)
    x_test = np.array(tst_data['x'], dtype='float32')
    y_test = np.array(tst_data['y'].ravel(), dtype='int32')

    if fs_method == 'rfe':
        # Select features by RFE-SVM, removing rfe_step feature(s) per iteration
        x_train, x_test = fs.rfe_select_features(x_train, y_train, x_test,
                                                 n_fs=n_features, penalty=C,
                                                 step=rfe_step)
    elif fs_method == 'l1norm':
        # Select features by L1-norm
        x_train, x_test = fs.l1norm_select_features(x_train, y_train, x_test, penalty=C)
    elif fs_method == 'lasso':
        # Select features by LASSO
        x_train, x_test = fs.lasso_select_features(x_train, y_train, x_test, alpha=0.001)
    elif fs_method == 'univ':
        # Select features by univariate statistics
        x_train, x_test = fs.univ_select_features(x_train, y_train, x_test, n_fs=n_features)
    elif fs_method == 'fdr':
        # Select features by FDR
        x_train, x_test = fs.fdr_select_features(x_train, y_train, x_test, n_fs=n_features)
    elif fs_method == 'none':
        pass
    print('No. of selected features = %d, ' % x_train.shape[1], end='')

    # Train DNN
    model = train_dnn(x_train, y_train_ohe, n_hiddens, optimizer, activation,
                      n_epochs, bat_size)

    # Test DNN
    train_acc, dummy, dummy = test_dnn(x_train, y_train, model)
    print('Training accuracy: %.2f%% ' % (train_acc * 100), end='', flush=True)
    test_acc, dummy, dummy = test_dnn(x_test, y_test, model)
    print('Test accuracy: %.2f%% ' % (test_acc * 100))
testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

print("Train with all 5 features")
model.fit(xTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xTest)
testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

############################
import FeatureSelection

print('### Get the Frequency Table')
frequencyTable = FeatureSelection.byFrequency(xTrainRaw)
print('Top 10')
for i in range(10):
    print(frequencyTable[i])

#############################
print('### Get the Mutual Information Table')
mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRaw, yTrain)
print('Top 10')
for i in range(10):
    print(mutualInformationTable[i])

#############################
""" This script will read all the emails and it will train the classifier """ import os from Email import * from FeatureSelection import * from NaiveBayesClassifier import * trainPath = "dataset" trainSet_emails = [] #create an email for every file we read for f in os.listdir(trainPath): fileName = trainPath+'/'+f e = Email() if "spm" in fileName: e.setCategory("SPAM") else: e.setCategory("HAM") e.read(fileName) #insert the email we created to a collection of emails trainSet_emails.append(e) #select features from our training set(automatic feature selection) fs = FeatureSelection(trainSet_emails) fs.selectFeatures() #create a naive bayes classifier and train it nb = NaiveBayesClassifier() nb.setEmails(trainSet_emails) nb.train()
(xRaw, yRaw) = Assignment1Support.LoadRawData(kDataPath)
# (xTrainRaw, yTrainRaw, xTestRaw, yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
(xTrainRawOriginal, yTrainRawOriginal, xTestRawOriginal, yTestRawOriginal) = \
    Assignment1Support.TrainTestSplit(xRaw, yRaw)
(xTrainRaw, yTrainRaw) = AddNoise.MakeProblemHarder(xTrainRawOriginal, yTrainRawOriginal)
(xTestRaw, yTestRaw) = AddNoise.MakeProblemHarder(xTestRawOriginal, yTestRawOriginal)

(xTrain, xTest) = Assignment1Support.Featurize(xTrainRaw, xTestRaw)
yTrain = yTrainRaw
yTest = yTestRaw

### Get the Mutual Information Words as features
import FeatureSelection

print('### Get the Mutual Information features')
mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRaw, yTrain)
words = [word for word, _ in mutualInformationTable[:295]]
(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

print('### Merge the features')
xTrain = np.hstack([xTrain, xNewTrain])
xTest = np.hstack([xTest, xNewTest])

import RandomForest

############################
print("========== Building one Model and output the accuracy ==========")
model = RandomForest.RandomForest(num_trees=10, min_to_split=2,
                                  use_bagging=True, restrict_features=20)
print("### Training with Random Forest")
model.fit(xTrain, yTrain)
# ========================= Classifying the metadata =============================== #
from DataPrep import data_return

X_data, class_names = data_return()
X_data = np.nan_to_num(X_data)

clf = SGDClassifier()
model = clf.fit(X_data, DataPrep.train_data[1])

# TODO: This has to be done on test_data
op = model.predict(X_data[None, 0, :])

accuracy = 0
Y_pred = {}
for i in range(len(DataPrep.train_data[1])):
    accuracy += int(model.predict(X_data[None, i, :]) == DataPrep.train_data[1][i])
    Y_pred[i] = model.predict(X_data[None, i, :])[0]
accuracy /= len(DataPrep.train_data[1])

cm = confusion_matrix(DataPrep.train_data[1], list(Y_pred.values()))
plt.figure()
confusion_matrix = FeatureSelection.plot_confusion_matrix(cm, classes=class_names,
                                                          normalize=True,
                                                          title='Normalized confusion matrix')
def main():
    # ================================ Linear Regression Baseline ================================
    LinearRegression_baseline.main()  # All the datasets are loaded within the function itself
    # =============================================================================================

    # =============================================================================================
    models_baseline.main()
    # =============================================================================================

    models = np.array([LinearRegression, DecisionTreeRegressor, KNeighborsRegressor,
                       Ridge, MLPRegressor, RandomForestRegressor, ElasticNet])
    # models_names = ["LinearRegression", "DecisionTreeRegressor", "KNeighborsRegressor",
    #                 "Ridge", "MLPRegressor", "RandomForestRegressor", "ElasticNet"]
    models_names = ["LR", "DTR", "KNR", "Ridge", "MLPR", "RFR", "EN"]
    stock = ["ICICI", "TATA", "VEDL", "REDDY"]
    model_count = models.shape[0]
    fig_no = np.zeros([4 * model_count])
    features = SelectKBest(f_regression)
    # train_X1, train_Y1, test_X1, test_Y1 = FeatureSelection.main(train_X, train_Y, test_X, test_Y)
    rmse = []
    # k_value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

    # ********************************* ICICI data ****************************************
    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_ICICI()  # Load the data
        start_time = time.clock()
        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[i] = i
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)
        end_time = time.clock() - start_time  # Calculate the speed

        filename = "..\Speed\ICICI_" + "{}".format(models_names[i]) + "_time.txt"
        target = open(filename, 'w')  # Write the speed to a file
        target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)  # Calculate RMSE
        filename = "..\RMSE\ICICI_" + "{}".format(models_names[i]) + "_rmse.txt"
        target = open(filename, 'w')  # Store RMSE to a file
        target.write(str(RMSE))
        print "ICICI ", models_names[i], " ", RMSE
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[i], test_Y, predictions, models_names[i], stock[0])
    Graphs_plotting.bar_chart(101, rmse, models_names, stock[0], model_count)

    rmse = []
    # ********************************* TATA data *****************************************
    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_TATA()
        start_time = time.clock()
        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[model_count + i] = model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)
        end_time = time.clock() - start_time

        filename = "..\Speed\TATA_" + "{}".format(models_names[i]) + "_time.txt"
        target = open(filename, 'w')
        target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = "..\RMSE\TATA_" + "{}".format(models_names[i]) + "_rmse.txt"
        target = open(filename, 'w')
        target.write(str(RMSE))
        print "TATA ", models_names[i], " ", RMSE
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[model_count + i], test_Y, predictions,
                                   models_names[i], stock[1])
    Graphs_plotting.bar_chart(102, rmse, models_names, stock[1], model_count)

    rmse = []
    # ********************************* VEDL data *****************************************
    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_VEDL()
        start_time = time.clock()
        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[2 * model_count + i] = 2 * model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)
        end_time = time.clock() - start_time

        filename = "..\Speed\VEDL_" + "{}".format(models_names[i]) + "_time.txt"
        target = open(filename, 'w')
        target.write(str(end_time))

        # if (models[i] in [MLPRegressor, KNeighborsRegressor, RandomForestRegressor]):
        #     predictions = predictions - (predictions[0] - test_Y[0])  # normalizing the predicted values
        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = "..\RMSE\VEDL_" + "{}".format(models_names[i]) + "_rmse.txt"
        target = open(filename, 'w')
        target.write(str(RMSE))
        print "VEDANTA ", models_names[i], " ", RMSE
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[2 * model_count + i], test_Y, predictions,
                                   models_names[i], stock[2])
    Graphs_plotting.bar_chart(103, rmse, models_names, stock[2], model_count)

    rmse = []
    # ********************************* REDDY data ****************************************
    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_REDDY()
        start_time = time.clock()
        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[3 * model_count + i] = 3 * model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)
        end_time = time.clock() - start_time

        filename = "..\Speed\REDDY_" + "{}".format(models_names[i]) + "_time.txt"
        target = open(filename, 'w')
        target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = "..\RMSE\REDDY_" + "{}".format(models_names[i]) + "_rmse.txt"
        target = open(filename, 'w')
        target.write(str(RMSE))
        print "DR REDDY ", models_names[i], " ", RMSE
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[3 * model_count + i], test_Y, predictions,
                                   models_names[i], stock[3])
    Graphs_plotting.bar_chart(103, rmse, models_names, stock[3], model_count)
(xRaw, yRaw) = Assignment1Support.LoadRawData(kDataPath)
(xTrainRaw, yTrainRaw, xTestRaw, yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
yTrain = yTrainRaw
yTest = yTestRaw

print('========== Debug on raw data =========')
num_trees = 10
min_to_split = 12
use_bagging = True
restrict_features = 70

print("========== Preprocess the Data ==========")
(xTrainRawNormalize, xTestRawNormalize) = FeatureSelection.preprocess(xTrainRaw, xTestRaw)

print('========== Merge Features ==========')
print('Use 5 Hand Craft Words as Features')
(xTrainHand, xTestHand, featuresName) = FeatureSelection.hand_craft_features(
    xTrainRaw, xTestRaw, 2)

print('Use 70 Mutual Information Words as Features')
model = RandomForest.RandomForest(num_trees=num_trees, min_to_split=min_to_split,
                                  use_bagging=use_bagging,
                                  restrict_features=restrict_features)
mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRawNormalize, yTrain)
words = [word for word, _ in mutualInformationTable[:70]]
(xTrainMI, xTestMI) = FeatureSelection.Featurize(xTrainRawNormalize,
                                                 xTestRawNormalize, words)
def evalCrossval(estimator, data, es=ExperimentSettings(), nFolds=10,
                 printTree=False, verbose=False, random_seed=44):
    '''
    Calculate the average cross-validation score on the split train data to evaluate
    the performance of trained models.

    @param estimator: the machine learning estimator
    @param data: the annotated training data
    @param nFolds: number of folds in k-fold cross validation
    '''
    # scores = cross_validation.cross_val_score(estimator, feats_train, labels_train,
    #                                           scoring='mean_absolute_error', cv=nFolds, verbose=1)
    # print("Average cross validation score (mean absolute error): ", np.average(scores))

    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True,
                                             random_state=es.random_seed)
    min_max_scalar = MinMaxScaler()
    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    utils.out('Generated primary features!')
    # _, vocab = getFeats(data, ['MED'])
    # print(vocab)

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Making an 'inner data' set, a copy of the original data
        # (makes sure we do not modify the original data)
        innerData = copy(data)
        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features).
        # These features should be generated within the loop, because some clustering
        # might happen between samples (e.g. to determine which questions are 'regular')
        trainSet = [copy(innerData[idx]) for idx in train_bucket]
        generateDataDrivenFeats(trainSet, innerData, es)
        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(innerData)

        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = [featurized[idx] for idx in train_bucket]
        test_feats = [featurized[idx] for idx in test_bucket]

        # Do feature selection on train data
        y_train = [labels[idx] for idx in train_bucket]
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, train_bucket, es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        # Calculate inter-annotator weighting.
        weights_train = getWeights(data, train_bucket, es.weighInterAnnot)

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, y_train, weights_train, model=None)

        # for part in model.estimators_:
        #     graph = export_graphviz(part, out_file=None, feature_names=vectorizer.feature_names_)
        #     selFeats = utils.find_between(graph, 'label="', 'gini')

        # Output the importance of features
        try:
            indices = np.argsort(model.feature_importances_)[::-1]
            featImportances = [[vectorizer.feature_names_[x], model.feature_importances_[x]]
                               for x in indices]
        except:
            featImportances = None

        y_pred = test(x_test, model)
        # print(y_pred)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, [0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, [0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/",
                               model, fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportances)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
def eval_bootstrapped_crossVal(estimator, data, bootstrap_data, es=ExperimentSettings(),
                               nFolds=10, printTree=False, verbose=False, th_bs=0.6,
                               random_seed=44):
    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True,
                                             random_state=es.random_seed)
    min_max_scalar = MinMaxScaler()
    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    generatePrimaryFeats(bootstrap_data, es)
    utils.out('Generated primary features!')

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Making an 'inner data' set, a copy of the original data
        # (makes sure we do not modify the original data)
        trainAndTestData = copy(data)
        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features).
        # These features should be generated within the loop, because some clustering
        # might happen between samples (e.g. to determine which questions are 'regular')
        trainData = [copy(trainAndTestData[idx]) for idx in train_bucket]
        y_train = [labels[idx] for idx in train_bucket]

        (new_train_data, new_y_train) = get_bootstrapped_trainset(
            trainData, y_train, bootstrap_data, es, estimator, th_bs)

        testData = [copy(trainAndTestData[idx]) for idx in test_bucket]
        allData = new_train_data + testData

        generateDataDrivenFeats(new_train_data, allData, es)
        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(allData)

        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = featurized[0:len(new_train_data)]
        test_feats = featurized[len(new_train_data):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, new_y_train, es)
        train_feats, new_y_train, new_train_bucket = ss.runSampleSelection(
            train_feats, new_y_train, [i for i in range(len(new_train_data))], es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        new_weights_train = getWeights(new_train_data, new_train_bucket, es.weighInterAnnot)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, new_y_train, new_weights_train, model=None)

        # Output the importance of features
        indices = np.argsort(model.feature_importances_)[::-1]
        featImportance = [[vectorizer.feature_names_[x], model.feature_importances_[x]]
                          for x in indices]

        y_pred = test(x_test, model)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, [0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, [0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/",
                               model, fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportance)

    return save_results(vectorizer, metrics, confMat, es, nFolds)