def explore_final_model():
    # https://github.com/gameofdimension/xgboost_explainer/blob/master/xgboost_explainer_demo.ipynb
    nr_labels = len(y)
    value_counts = y.value_counts()
    perc_per_label = {k: round(100 * v / float(nr_labels), 2) for k, v in value_counts.items()}
    print('value counts:', y.value_counts())
    print('perc per label:', perc_per_label)

    model = pickle.load(open(filename_model, "rb"))
    model_feature_names = model.attr('feature_names').split('|')
    index_to_class = json.loads(model.attr('index_to_class'))
    print(index_to_class)
    classes = [index_to_class[k] for k in sorted(index_to_class.keys())]
    print(classes)
    print('eli5 explain weights (gain):\n', eli5.format_as_text(eli5.explain_weights(model, top=10)))  # gain

    df_test = pd.read_json(open(test_filename, "r"))
    df_test = df_test.head(5)
    feature_extractor = FeatureExtractor(df_test)
    X_test, X_test_featurenames = feature_extractor.get_features_pred_instances(df_test, model_feature_names)
    print(X_test)
    print(set(X_test.dtypes))
    # print(X_test.iloc[0])
    print(eli5.format_as_text(eli5.explain_prediction(model, X_test.head(1),
                                                      target_names=classes, top=10,
                                                      feature_names=X_test_featurenames)))
def explain(model_path):
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")

    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path, binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()

    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)
    pd.set_option('display.max_rows', None)
    explain = eli5.explain_weights(model, top=None)
    explain = eli5.format_as_text(explain)
    print(explain)

    feature_names = []
    column_names = ["qid", "ql", "qr"]
    #reader = pd.read_csv(in_path, sep="\t", dtype="str", names=column_names, chunksize=100)
    reader = pd.read_csv(sys.stdin, sep="\t", dtype="str", names=column_names, chunksize=1)
    first_chunk = True
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"],
        tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model,
        ner_dict, syn_dict)
    for data in reader:
        _ = data.fillna("", inplace=True)
        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]

        y_preds = model.predict_proba(X_features, ntree_limit=model.best_ntree_limit)
        y_preds = [o[1] for o in y_preds]

        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)
        #if first_chunk:
        #    data.to_csv(in_path + ".predict", header=True, sep="\t", mode="w")
        #    first_chunk = False
        #else:
        #    data.to_csv(in_path + ".predict", header=False, sep="\t", mode="a")
        data.to_csv(sys.stdout, header=False, sep="\t")

        explain = eli5.explain_prediction(model, X_features.iloc[0])
        explain = eli5.format_as_text(explain)
        print(explain)
        print(X_features.iloc[0])
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1', feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            )),
            TargetExplanation('class2', feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    assert str(df) == ('to      class2  class1\n'
                       'from                  \n'
                       'class2     1.5     2.5\n'
                       'class1     3.5     4.5')
    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
def result():
    tweet = str(request.data)
    explain = explain_prediction(gnb, tweet, vec=tfid,
                                 target_names=['known weird', 'less weird'])
    return str(format_as_text(explain))
def visualise_feature_importance(model, title_variable, x_test, y_test):
    print(
        eli5.format_as_text(
            eli5.explain_weights(
                PermutationImportance(model, random_state=42).fit(x_test, y_test))))
    importances = model.feature_importances_
    labels = [
        "X{} - {:4.1f}%".format(i, importances[i - 1] * 100) for i in range(1, 8)
    ]
    patches, texts = plt.pie(importances,
                             wedgeprops=dict(width=0.5),
                             startangle=90,
                             radius=1.2)
    plt.legend(patches,
               labels,
               prop={'size': 12},
               bbox_to_anchor=(0.74, 0.5),
               loc="center right",
               fontsize=8)
    plt.title("Feature Importance for {}".format(title_variable))
    plt.savefig("plots/fi{}.png".format(title_variable))
    plt.show()
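# A minimal sketch (not part of the original function above) of plotting the permutation
# importances themselves rather than model.feature_importances_, labelled with real feature
# names instead of the hard-coded "X1..X7" strings; `feature_names` is an assumed list
# aligned with the columns of x_test.
def plot_permutation_importance(model, x_test, y_test, feature_names, title_variable):
    perm = PermutationImportance(model, random_state=42).fit(x_test, y_test)
    order = perm.feature_importances_.argsort()[::-1]  # most important first
    plt.barh([feature_names[i] for i in order][::-1],
             perm.feature_importances_[order][::-1],
             xerr=perm.feature_importances_std_[order][::-1])
    plt.title("Permutation Importance for {}".format(title_variable))
    plt.tight_layout()
    plt.show()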
def performance_measurement(crf_model, x, y, g_sentences):
    """Utilizes different functions to measure the model's performance and saves the
    results to files for review."""
    # Cross-validating the model
    cross_val_predictions = cross_val_predict(estimator=crf_model, X=x, y=y, cv=5)
    report = flat_classification_report(y_pred=cross_val_predictions, y_true=y)

    file = open(
        f'results/performance_measurement_results_{datetime.datetime.today().date()}.txt',
        'a', encoding='utf-8')
    file.seek(0)
    file.truncate()

    print2both('created on:', str(datetime.datetime.today().date()), '\n', file=file)
    print2both('flat_classification_report:\n\n', report, '\n\n', file=file)
    print2both('cross_val_predict:\n\n', cross_val_predictions, '\n\n', file=file)

    # Showing the weights assigned to each feature
    print2both('eli5.explain_weights(crf, top=100):\n\n',
               eli5.format_as_text(eli5.explain_weights(crf_model, top=100)),
               '\n\n', file=file)

    file.close()

    # Saving the potentially correct and the incorrect classifications in separate CSV files for review
    categorize_predictions(gold_sents=g_sentences, y_hat=cross_val_predictions, y_actual=y)
def SGD():
    train_text, test_text, ytrain, ytest = train_test_split(df['description'],
                                                            df['category'],
                                                            random_state=42)
    word_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                      strip_accents='unicode',
                                      analyzer='word',
                                      token_pattern=r'\w{1,}',
                                      ngram_range=(1, 8))
    word_vectorizer.fit(train_text)
    char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                      strip_accents='unicode',
                                      analyzer='char',
                                      ngram_range=(1, 5))
    char_vectorizer.fit(train_text)

    sgd_cls = SGDClassifier(max_iter=2)
    sgd_cls.fit(word_vectorizer.transform(train_text), ytrain)
    print(
        eli5.format_as_text(eli5.explain_weights(sgd_cls, vec=word_vectorizer)))
    print(
        eli5.format_as_text(
            eli5.explain_prediction(
                sgd_cls,
                df['description'][df['points'] <= 81].values[0],
                vec=word_vectorizer)))

    X = hstack([
        word_vectorizer.transform(train_text),
        char_vectorizer.transform(train_text)
    ])
    sgd_cls = SGDClassifier(max_iter=2)
    sgd_cls.fit(X, ytrain)
    predict = sgd_cls.predict(
        hstack([
            word_vectorizer.transform(test_text),
            char_vectorizer.transform(test_text)
        ]))
    acc = np.mean(ytest == np.around(predict))
    print('Accuracy: {0:.3}'.format(acc))
def process_xgb():
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = {
        'colsample_bytree': 0.055,
        'colsample_bylevel': 0.4,
        'gamma': 1.5,
        'learning_rate': 0.01,
        'max_depth': 5,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'min_child_weight': 10,
        'n_estimators': 1800,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'eval_metric': 'rmse',
        'subsample': 0.7,
        'silent': True,
        'seed': 7,
    }

    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    use_regressor = True
    use_regressor = False

    for fold in range(folds):
        x1, x2, y1, y2 = model_selection.train_test_split(train[col],
                                                          np.log1p(train.target.values),
                                                          test_size=0.0010,
                                                          random_state=fold)
        if use_regressor:
            p = params
            model = xgb.XGBRegressor(colsample_bytree=p['colsample_bytree'],
                                     colsample_bylevel=p['colsample_bylevel'],
                                     gamma=p['gamma'],
                                     learning_rate=p['learning_rate'],
                                     max_depth=p['max_depth'],
                                     objective=p['objective'],
                                     booster=p['booster'],
                                     min_child_weight=p['min_child_weight'],
                                     n_estimators=p['n_estimators'],
                                     reg_alpha=p['reg_alpha'],
                                     reg_lambda=p['reg_lambda'],
                                     eval_metric=p['eval_metric'],
                                     subsample=p['subsample'],
                                     silent=1,
                                     n_jobs=-1,
                                     early_stopping_rounds=100,
                                     random_state=7,
                                     nthread=-1)
            model.fit(x1, y1)
            score = np.sqrt(mean_squared_error(y2, model.predict(x2)))
            test['target'] += np.expm1(model.predict(test[col]))
        else:
            xg_valid = xgb.DMatrix(x2, label=y2)
            xg_train = xgb.DMatrix(x1, label=y1)
            model = xgb.train(params, xg_train, params['n_estimators'])
            score = np.sqrt(mean_squared_error(y2, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    test['target'] /= folds
    test.loc[test_ref.target > 0, 'target'] = test_ref[test_ref.target > 0].target.values
    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    explain = False
    #explain = True
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
def interpret_model(model: Pipeline, all_features: list):
    log.debug("All features: {}".format(all_features))

    # Explain the model
    log.info(
        eli5.format_as_text(
            eli5.explain_weights(model.named_steps["model"],
                                 feature_names=all_features)))
def show_feature_importance(self):
    print("\n\n\n\n+++++++++++++++++++++++++++++++")
    print('Calculating feature importance for model ', self.name, "...")
    perm = PermutationImportance(self.get_eli5_model(), random_state=1).fit(
        self.x_test, self.y_test)
    print(self.name, 'model feature importance')
    print(
        eli5.format_as_text(
            eli5.explain_weights(perm, feature_names=self.feature_names)))
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1', feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            )),
            TargetExplanation('class2', feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    expected = pd.DataFrame([
        {'from': 'class2', 'to': 'class2', 'coef': 1.5},
        {'from': 'class2', 'to': 'class1', 'coef': 2.5},
        {'from': 'class1', 'to': 'class2', 'coef': 3.5},
        {'from': 'class1', 'to': 'class1', 'coef': 4.5},
    ], columns=['from', 'to', 'coef'])
    assert df.equals(expected)
    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
def computePermutationImportance(data_test, target_test, clf):
    perm = PermutationImportance(clf, random_state=1).fit(data_test, target_test)
    permString = (eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data_test.columns.tolist())))
    permString = permString.split('\n', 9)[-1]
    all_rows = permString.split("\n")
    all_cols = [row.split(' ') for row in all_rows]
    all_cols.pop(0)
    fimp = [row[0] for row in all_cols]
    errot = [row[2] for row in all_cols]
    name = [row[4] for row in all_cols]
    dfvals = pd.DataFrame(list(zip(fimp, errot, name)), columns=['A', 'B', 'C'])
    fname = os.path.join(tree_evaluations_out,
                         str(classifierName) + '_permutations_importances.csv')
    dfvals.to_csv(fname, index=False)
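# A minimal alternative sketch (not from the original code): instead of re-parsing the
# text rendering above with string splits, eli5 can emit the same explanation as a
# DataFrame directly via eli5.format_as_dataframe (typically columns such as
# feature/weight/std). The function name and out_csv default below are assumptions.
def computePermutationImportanceDf(data_test, target_test, clf,
                                   out_csv='permutation_importances.csv'):
    perm = PermutationImportance(clf, random_state=1).fit(data_test, target_test)
    expl = eli5.explain_weights(perm, feature_names=data_test.columns.tolist())
    df = eli5.format_as_dataframe(expl)  # structured output, no text parsing needed
    df.to_csv(out_csv, index=False)
    return df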
def permutation_importance(clf, X, Y, features, random_state=42, scoring=None):
    from eli5 import explain_weights, format_as_text
    from eli5.sklearn import PermutationImportance

    # Extract the classifier object from the clf multilearn object
    clf.verbose = False  # Turn verbose off after this to tidy prints

    # Calculate feature importances
    # TODO - how to pick out label from clf to print feature importances and pdp's for specified label
    perm = PermutationImportance(clf, random_state=random_state, scoring=scoring).fit(X, Y)
    print(format_as_text(explain_weights(perm, feature_names=features),
                         show=['feature_importances']))

    clf.verbose = True  # reset
    return
def permutation_importance(dataset, Processing_Unit):
    data = dataset
    y = data.author
    X = data.drop("author", axis=1)
    if Processing_Unit == "FUNCTION":
        X = X.drop("function", axis=1)

    feature_names = [i for i in data.columns if data[i].dtype in [np.int64]]
    X = data[feature_names]
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    my_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X, train_y)

    perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
    #print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))
    w = open(settings.aux_perm, 'w')
    w.truncate(0)
    w.write(
        eli5.format_as_text(
            eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))
def _train(self):
    """
    Build the model with the experiment configuration represented by this object
    """
    self._logger.debug("---Building model for %s", self._signature)
    assert self._regression_inputs
    xdata, ydata = self._regression_inputs.get_xy_data(
        self._regression_inputs.inputs_split["training"])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self._regressor.fit(xdata, ydata)
    self._logger.debug("---Model built")

    # From Simone
    expl = eli5.xgboost.explain_weights_xgboost(
        self._regressor, top=None)  # feature_names= XXX self.feature_names XXX
    expl_weights = eli5.format_as_text(expl)
    self._logger.debug("---Features Importance Computed")  # OK
    target = open(
        os.path.join(self._experiment_directory, "explanations.txt"), 'w')
    target.write(expl_weights)
    target.close()
print("Decided Tree Classification") clf = DecisionTreeClassifier() clf = clf.fit(X_train,y_train) y_pred = clf.predict(X_test) print('Accuracy: ',accuracy_score(y_test,y_pred)) scores = cross_val_score(clf,X,y,cv=5) print('Scores: ',scores) print('Final Score: ',scores.mean()) import eli5 from eli5.sklearn import PermutationImportance perm = PermutationImportance(clf,random_state=1).fit(X_test,y_test) print(eli5.format_as_text(eli5.explain_weights(perm))) "Support Vector Classification" print("Support Vector Classification") from sklearn.svm import SVC clf = SVC() clf = clf.fit(X_train,y_train) y_pred = clf.predict(X_test) print('Accuracy: ',accuracy_score(y_test,y_pred)) scores = cross_val_score(clf,X,y,cv=5) print("Scores: ",scores) print('Final Score: ',scores.mean())
        accuracy_score(yTest, valid_pred_GBoost)))
print('Accuracy of Mnb classifier on test set: {:.3f}'.format(
    accuracy_score(yTest, valid_pred_Mnb)))

# save the model to disk
from joblib import dump
dump(tfidf_logit_pipeline, './models/LinearRegression-model.joblib')
dump(tfidf_logit_pipeline_RandomForestClassifier,
     './models/RandomForestClassifier-model.joblib')
dump(tfidf_logit_pipeline_SVC, './models/SVC-model.joblib')
dump(tfidf_logit_pipeline_GBoost, './models/GBoost-model.joblib')
dump(tfidf_logit_pipeline_Mnb, './models/Mnb-model.joblib')

cm_lrc = confusion_matrix(yTest, valid_pred_SVC)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lrc, annot=True, linewidths=0.5, linecolor="gray", fmt=".0f", ax=ax)
plt.title('Confusion matrix of SVC')
plt.ylabel('True label')
plt.xlabel('Predicted label')

# showing weights
import eli5
print(eli5.format_as_text(eli5.explain_weights(tfidf_logit_pipeline)))
plt.show()
X = data[base_features]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = RandomForestRegressor(n_estimators=50, random_state=1).fit(train_X, train_y)

# show data
print("Data sample:")
print(data.head())

# Show permutation importance
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))

############
### Creating new features
############
data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2 = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'abs_lat_change', 'abs_lon_change'
]
X = data[features_2]
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(
    -1]  # [::-1] reverses the ascending result of argsort, indices of arrays sorted
rankedFeatures = features[rankedFeatureIds]
numRanks = 15
# RF: At 15 (score:0.786) and 30 max scores. Score declines with decreasing features
# LR: at 10 scores were less than full-set but at 15 features, scores halved.
# With PermutationImportance RF performs better than LR
featuresTopNRanks = list(rankedFeatures[0:numRanks])
featuresToDrop = list(rankedFeatures[numRanks:-1])
print('Selected features \n', featuresTopNRanks)
print('features to drop:\n', repr(featuresToDrop))

# print('Feature importance in accordance to weights\n')
# print(features[permFeatureRanks])

# only for printing in readable format
permExpWghts = eli5.explain_weights(
    perm, feature_names=X_train.columns.to_list())
permFeatureRanksText = eli5.format_as_text(permExpWghts)  # only for printing
print(permFeatureRanksText)

# based on importance, select only top 10 columns for building model
dataDFp = dataDF.copy()
fdataDFp = dataDFp[featuresTopNRanks]
X_train, X_test, y_train, y_test = train_test_split(fdataDFp,
                                                    outcomeVarDF,
                                                    test_size=0.2,
                                                    random_state=42)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print('RF Model fitted with ' + repr(numRanks) + ' features\n RF feature importances\n')
important_features = pd.Series(data=clf.feature_importances_[0:numRanks],
    'random_state': [0],
}

# Instantiate the grid search model
hyperp_srch = GridSearchCV(estimator=rf_model,
                           param_grid=group_param,
                           cv=5,
                           return_train_score=False)
hyperp_srch.fit(x_train, y_train)
#print(hyperp_srch.best_params_)
best_hyper = hyperp_srch.best_estimator_

rf_model = RandomForestClassifier(**best_hyper.get_params())
rf_model.fit(x_train, y_train)
y_pred_train = rf_model.predict(x_train)
y_pred_val = rf_model.predict(x_val)
## End

print('Classification Report: \n')
print(classification_report(y_val, y_pred_val))
print('\nConfusion Matrix: \n')
print(confusion_matrix(y_val, y_pred_val))

permutation = PermutationImportance(rf_model, random_state=2).fit(x_train, y_train)
eli5.explain_weights(permutation, feature_names=x.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(permutation, feature_names=x.columns.tolist())))
##################################################################################
## Random Forest Regressor for permutation importance
rf = RandomForestRegressor(n_estimators=100,
                           n_jobs=-1,
                           oob_score=True,
                           bootstrap=True,
                           random_state=42)
rf.fit(X_train, Y_train)
perm = PermutationImportance(rf, random_state=1).fit(X_validation, Y_validation)

results.write('\n\n\nRANDOM FOREST REGRESSOR PERMUTATION IMPORTANCE\n\n\n')
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))
results.write(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))

##########################################################
#### CREATE SHADOW MODEL IN FORM OF RULE FIT ALGORITHM ###
##########################################################
rf = RuleFit()
rf.fit(X_train, [int(i) for i in Y_train],
       feature_names=[
           'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'DiabetesPedigreeFunction', 'Age'
       ])
def process(self, inputs):
    max_features = self._campaign_configuration['FeatureSelection']['max_features']

    # setting parameters for XGBoost design space exploration
    xgboost_parameters = copy.deepcopy(self._campaign_configuration)
    xgboost_parameters['General']['techniques'] = ['XGBoost']
    xgboost_parameters['General']['run_num'] = 1
    local_root_directory = self._campaign_configuration['General']['output']
    for token in self._prefix:
        local_root_directory = os.path.join(local_root_directory, token)
    xgboost_parameters['General']['output'] = local_root_directory
    del xgboost_parameters['FeatureSelection']

    model_building_var = model_building.model_building.ModelBuilding(0)

    if 'XGBoost' not in xgboost_parameters:
        # default parameters if not provided in the ini file
        xgboost_parameters['XGBoost'] = {}
        xgboost_parameters['XGBoost']['min_child_weight'] = [1, 3]
        xgboost_parameters['XGBoost']['gamma'] = [0, 1]
        xgboost_parameters['XGBoost']['n_estimators'] = [50, 100, 150, 250]
        xgboost_parameters['XGBoost']['learning_rate'] = [0.01, 0.05, 0.1]
        xgboost_parameters['XGBoost']['max_depth'] = [1, 2, 3, 5, 9, 13]

    best_conf = model_building_var.process(xgboost_parameters, inputs,
                                           int(self._campaign_configuration['General']['j']))

    # best_conf is an XGBoost configuration experiment
    xgb_regressor = best_conf.get_regressor()

    # top = None means all
    expl = eli5.xgboost.explain_weights_xgboost(xgb_regressor,
                                                feature_names=inputs.x_columns,
                                                top=max_features,
                                                importance_type='gain')
    # text version
    expl_weights = eli5.format_as_text(expl)
    self._logger.debug("XGBoost feature scores:\n%s", str(expl_weights))

    df = eli5.format_as_dataframe(expl)  # data frame version
    xgb_sorted_features = df['feature'].values.tolist()  # features list
    features_sig = df['weight'].values.tolist()  # significance score weights

    cumulative_significance = 0
    tolerance = self._campaign_configuration['FeatureSelection']['XGBoost_tolerance']
    index = 0
    while cumulative_significance < tolerance and index < len(features_sig):
        cumulative_significance = cumulative_significance + features_sig[index]
        index = index + 1
    feat_res = xgb_sorted_features[0:index]

    self._logger.info("XGBoost selected features: %s", str(feat_res))
    data = inputs
    data.x_columns = feat_res
    return data
def perm_import(model, features, X_val, y_val):
    perm = PermutationImportance(model, random_state=1).fit(X_val, y_val)
    # eli5.show_weights(perm, feature_names=features)
    print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=features)))
#%%
#parseNER('Data/Train/DrugBank/Aciclovir_ddi.xml')
#buildTrainTestNER()

if __name__ == "__main__":
    train_x, train_y, test_x, test_y, testfull = prepareTrainTestforTraining()
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=10,
                               all_possible_transitions=True)
    crf.fit(train_x, train_y)
    weight_explined = eli5.format_as_text(eli5.explain_weights(crf, top=30))
    labels = list(crf.classes_)
    labels.remove('O')
    labels
    y_pred = crf.predict(test_x)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_classification_report(test_y,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))
    classification_report = metrics.flat_classification_report(
        test_y, y_pred, labels=sorted_labels, digits=3)
train_y = train.SalePrice
train_X = train[train.columns.difference(['SalePrice', 'Id'])].values

preprocessing = Pipeline([('impute',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='most_frequent')),
                          ('onehot', OneHotEncoder())])

#visualizer = rank2d(preprocessing.fit_transform(train_X).todense(), train_y)
#plt.show()

y_test = train.SalePrice
X_text = train[train.columns.difference(['SalePrice', 'Id'])].values

pipeline = Pipeline([('preprocessing', preprocessing),
                     ('tree',
                      DecisionTreeRegressor(criterion='mse',
                                            random_state=1,
                                            max_leaf_nodes=100))])

# Fit Model
pipeline.fit(train_X, train_y)
predictions = pipeline.predict(X_text)
print(r2_score(y_test, predictions))

import eli5
print(eli5.format_as_text(eli5.explain_weights(pipeline.named_steps['tree'])))
#count_vectorizer.fit(data["text"])
#feature_names = count_vectorizer.get_feature_names()
#tokens_with_weights = sorted(zip(classifier.coef_[0], feature_names))[:20]

#counts = count_vectorizer.fit_transform(data["text"].values)
#feature_names = count_vectorizer.get_feature_names()

kf = KFold(3)
for train_index, test_index in kf.split(data):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_targets = data.iloc[train_index, 1].values
    train_values = data.iloc[train_index, 0].values
    test_targets = data.iloc[test_index, 1].values
    test_values = data.iloc[test_index, 0].values
    vec = TfidfVectorizer(stop_words=skt.ENGLISH_STOP_WORDS)
    clf = LogisticRegressionCV()
    pipe = make_pipeline(vec, clf)
    pipe.fit(train_values, train_targets)
    print_report(pipe, test_values, test_targets)
    print(eli5.format_as_text(
        eli5.explain_weights(clf, vec=vec,
                             target_names=("UNRELIABLE", "RELIABLE"))))

#counts = count_vectorizer.fit_transform(data["text"].values)
#classifier.fit(counts, data.iloc[:, 1].values)

#pizza_data = pd.read_csv("pizzagate.csv")
#test_counts = count_vectorizer.transform(pizza_data["text"].values)
#print(pizza_data)
#print(classifier.classes_)
#print(classifier.coef_)
#print(classifier.predict_proba(test_counts))
auc(fpr, tpr)

# Number of correct predictions (TP+TN, normalized)
rf.score(X_test, y_test)

"""Permutation importance"""

perm = PermutationImportance(rf, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

# Code to display in the IDE (method 1)
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=X_test.columns.tolist())))

# Code to display in the IDE (method 2)
perm = PermutationImportance(rf, random_state=1).fit(X_test, y_test)
html_obj = eli5.show_weights(perm, feature_names=X_test.columns.tolist())
with open('permutation-importance.htm', 'wb') as f:
    f.write(html_obj.data.encode("UTF-8"))

# Sensitivity analysis
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

from collections import Counter

# def print_transitions(trans_features):
#     for (label_from, label_to), weight in trans_features:
#         print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
#
# print("Top likely transitions:")
# print_transitions(Counter(crf.transition_features_).most_common(20))
#
# print("\nTop unlikely transitions:")
# print_transitions(Counter(crf.transition_features_).most_common()[-20:])
#
#
# def print_state_features(state_features):
#     for (attr, label), weight in state_features:
#         print("%0.6f %-8s %s" % (weight, label, attr))
#
# print("Top positive:")
# print_state_features(Counter(crf.state_features_).most_common(30))
#
# print("\nTop negative:")
# print_state_features(Counter(crf.state_features_).most_common()[-30:])

# eli5.show_weights(crf, top=30)
expl = eli5.explain_weights(crf, top=5)
print(eli5.format_as_text(expl))
#
crf.fit(X_train, y_train)

labels = list(crf.classes_)
#labels.remove('O')
print(labels)

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

print(metrics.flat_classification_report(y_test, y_pred,
                                         labels=sorted_labels, digits=3))

import sys
sys.exit()

print(eli5.format_as_text(eli5.explain_weights(crf, top=30)))

'''
eli5.show_weights(crf, top=5, show=['transition_features'])

eli5.show_weights(crf, top=10, targets=['O', 'B-ORG', 'I-ORG'])

eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])

expl = eli5.explain_weights(crf, top=5, targets=['O', 'B-LOC', 'I-LOC'])
print(eli5.format_as_text(expl))
'''
trainEntries, testEntries, \
    trainLabels, testLabels, \
    _, testOrigText = train_test_split(allEntries, allLabels, allOrigText)

pipeCV.fit(trainEntries, trainLabels)
pred = pipeCV.predict(testEntries)

for i in range(len(pred)):
    if pred[i] != testLabels[i]:
        key = (testLabels[i], pred[i])
        if key not in confSampleDict:
            confSampleDict[key] = []
        predExplan = eli5.format_as_text(
            eli5.explain_prediction(clfCV,
                                    testEntries[i],
                                    top=TOP_K_MODEL_PRED_FEATURES,
                                    vec=vectorizer))
        confSampleDict[key].append(
            (testOrigText[i], testEntries[i], pred[i], predExplan))

cm = confusion_matrix(testLabels, pred) / (CV_FOLDS * len(testLabels))
if confMatrix is None:
    confMatrix = cm
else:
    confMatrix += cm

f1 = f1_score(testLabels, pred, average=AVG_TYPE)
multiClassF1Avg += f1 / CV_FOLDS

recall = recall_score(testLabels, pred, average=AVG_TYPE)