def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    '''
    Extend the training set with bootstrap (unannotated) documents that the current model
    labels with high confidence.

    @param trainSet: the annotated training documents
    @param y_train: severity labels for the training documents
    @param bootstrap_data: unannotated documents to pseudo-label
    @param es: the ExperimentSettings object with the model configuration
    @param estimator: the machine learning estimator
    @param th_bs: probability threshold above which a pseudo-labelled document is added
    @return: (new_train_set, new_y_train), the extended training set and its labels
    '''
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)
    train_feats = [featurized[idx] for idx in range(len(trainSet))]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData))]

    # Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(
        train_feats, y_train, [i for i in range(len(trainSet))], es)

    # Calculate inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)
    y_pred_prob = model.predict_proba(x_test)

    # Keep only the bootstrap documents whose highest class probability exceeds the threshold.
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # TODO: update none to confidence vector
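
# Illustrative sketch (not part of the original pipeline): the selection rule at the end of
# get_bootstrapped_trainset keeps an unannotated document only when the classifier's highest
# class probability exceeds th_bs. The hypothetical helper below isolates that rule; it relies
# on the module-level numpy import (np) and expects a (n_samples, n_classes) probability
# matrix such as the output of predict_proba().
def _select_confident_pseudolabels(y_pred_prob, threshold):
    '''Return (sample index, predicted class) pairs whose top probability exceeds threshold.'''
    selected = []
    for i, probs in enumerate(y_pred_prob):
        if np.max(probs) > threshold:
            selected.append((i, int(np.argmax(probs))))
    return selected
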
def main(useAnnotatorWeighing=True):
    '''
    This script runs the experiments by training on a train set and testing on a test set.
    It also allows bootstrapping (which is hard-coded in this script as well).

    Configure your model settings by modifying the ExperimentSettings object in the script.

    The output of these models are annotated files in the output folder, which can be
    evaluated (in metrics) using testEval.py.
    '''
    # Making folders from config
    # cfg.makeFolders()

    # Here you can specify the feature sets you would like to use. They are arranged as an
    # array of arrays, to enable combinations of feature types.
    features = [["DSM+1"]]
    # features = [["CONCEPTS"]]  # ['BOW'],

    # If you want anything set differently than the default, change the corresponding
    # parameter in es (ExperimentSettings).
    es = ExperimentSettings()
    # es.fs_varianceFilter = True
    # es.bootstrap = True
    # es.ss_prototyping = True
    # es.weighInterAnnot = False
    # es.ml_algorithm = 'RF'
    # remove these!
    # es.removeDeniedConcepts = False
    # es.splitFamilyConcepts = False
    # es.splitUncertainConcepts = False

    # Reading the train/test data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                        es.removeUncertainConcepts, es.splitUncertainConcepts,
                                        es.removeFamilyConcepts, es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                       es.removeUncertainConcepts, es.splitUncertainConcepts,
                                       es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                                es.removeUncertainConcepts, es.splitUncertainConcepts,
                                                es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over the different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)
        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_data, y_train) = m.get_bootstrapped_trainset(train_data, y_train, bootstrap_data,
                                                                es, estimator, th_bs=0.6)

        concatenated_data = []
        concatenated_data.extend(train_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[0:len(train_data)]
        test_feats = featurized[len(train_data):len(featurized)]
        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, [i for i in range(len(train_data))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_data, train_bucket, es.weighInterAnnot)

        model = m.train(estimator, x_train, y_train, weights_train, model=None)
        y_pred = m.test(x_test, estimator=model)
        # print(y_pred)

        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data, outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
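
# Usage sketch (hypothetical values): the script is typically run end-to-end by calling main().
# Bootstrapping or a different learner is enabled on the ExperimentSettings object before the
# feature loop, e.g.:
#
#     es = ExperimentSettings()
#     es.bootstrap = True        # requires cfg.PATH_UNANNOTATED / cfg.PATH_PREPROCESSED_UNANNOTATED
#     es.ml_algorithm = 'RF'
#
# The predictions are written as annotated files under cfg.PATH_OUTPUT/<featTypes>/ and can be
# evaluated with testEval.py, as noted in the docstring of main().
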
def evalCrossval(estimator, data, es=ExperimentSettings(), nFolds=10, printTree=False,
                 verbose=False, random_seed=44):
    '''
    Calculate the average cross-validation score on the split train data to evaluate the
    performance of the trained models.

    @param estimator: the machine learning estimator
    @param data: the annotated documents used for cross-validation
    @param es: the ExperimentSettings object with the model configuration
    @param nFolds: number of folds in k-fold cross-validation
    '''
    # scores = cross_validation.cross_val_score(estimator, feats_train, labels_train,
    #                                           scoring='mean_absolute_error', cv=nFolds, verbose=1)
    # print("Average cross validation score (mean absolute error): ", np.average(scores))

    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()
    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    utils.out('Generated primary features!')

    # _, vocab = getFeats(data, ['MED'])
    # print(vocab)

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Make an 'inner data' set with a copy of the original data (makes sure we do not
        # modify the original data).
        innerData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features).
        # These features should be generated within the loop, because some clustering might
        # happen between samples (e.g. to determine which questions are 'regular').
        trainSet = [copy(innerData[idx]) for idx in train_bucket]
        generateDataDrivenFeats(trainSet, innerData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Derive the values for the train set, also generating the vocabulary.
        featurized = featurize(innerData)

        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = [featurized[idx] for idx in train_bucket]
        test_feats = [featurized[idx] for idx in test_bucket]

        # Do feature selection on train data
        y_train = [labels[idx] for idx in train_bucket]
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train,
                                                                   train_bucket, es)

        vectorizer = DictVectorizer()

        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for the test data.
        x_test = vectorizer.transform(test_feats)

        y_test = [labels[idx] for idx in test_bucket]

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        # Calculate inter-annotator weighting.
        weights_train = getWeights(data, train_bucket, es.weighInterAnnot)

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, y_train, weights_train, model=None)

        # for part in model.estimators_:
        #     graph = export_graphviz(part, out_file=None, feature_names=vectorizer.feature_names_)
        #     selFeats = utils.find_between(graph, 'label="', 'gini')

        # Output the importance of features
        try:
            indices = np.argsort(model.feature_importances_)[::-1]
            featImportances = [[vectorizer.feature_names_[x], model.feature_importances_[x]]
                               for x in indices]
        except AttributeError:
            # The estimator does not expose feature_importances_ (e.g. non-tree models).
            featImportances = None

        y_pred = test(x_test, model)
        # print(y_pred)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                               fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportances)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
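
# Usage sketch (hypothetical): cross-validating a single configuration on the annotated training
# data only, assuming the data has already been read and concept-preprocessed as in main():
#
#     es = ExperimentSettings()
#     es.featTypes = ['CONCEPTS']
#     evalCrossval(m.getEstimator(es), train_data, es, nFolds=10, verbose=True)
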
def eval_bootstrapped_crossVal(estimator, data, bootstrap_data, es=ExperimentSettings(), nFolds=10,
                               printTree=False, verbose=False, th_bs=0.6, random_seed=44):
    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()
    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    generatePrimaryFeats(bootstrap_data, es)
    utils.out('Generated primary features!')

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Make an 'inner data' set with a copy of the original data (makes sure we do not
        # modify the original data).
        trainAndTestData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features).
        # These features should be generated within the loop, because some clustering might
        # happen between samples (e.g. to determine which questions are 'regular').
        trainData = [copy(trainAndTestData[idx]) for idx in train_bucket]
        y_train = [labels[idx] for idx in train_bucket]

        (new_train_data, new_y_train) = get_bootstrapped_trainset(trainData, y_train,
                                                                  bootstrap_data, es, estimator, th_bs)

        testData = [copy(trainAndTestData[idx]) for idx in test_bucket]
        allData = new_train_data + testData

        generateDataDrivenFeats(new_train_data, allData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Derive the values for the train set, also generating the vocabulary.
        featurized = featurize(allData)

        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = featurized[0:len(new_train_data)]
        test_feats = featurized[len(new_train_data):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, new_y_train, es)
        train_feats, new_y_train, new_train_bucket = ss.runSampleSelection(
            train_feats, new_y_train, [i for i in range(len(new_train_data))], es)

        vectorizer = DictVectorizer()

        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for the test data.
        x_test = vectorizer.transform(test_feats)

        y_test = [labels[idx] for idx in test_bucket]

        new_weights_train = getWeights(new_train_data, new_train_bucket, es.weighInterAnnot)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, new_y_train, new_weights_train, model=None)

        # Output the importance of features
        indices = np.argsort(model.feature_importances_)[::-1]
        featImportance = [[vectorizer.feature_names_[x], model.feature_importances_[x]]
                          for x in indices]

        y_pred = test(x_test, model)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                               fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportance)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
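
# Usage sketch (hypothetical): the bootstrapped variant runs the same stratified folds, but each
# fold's train split is first extended with confidently pseudo-labelled unannotated documents:
#
#     eval_bootstrapped_crossVal(m.getEstimator(es), train_data, bootstrap_data, es,
#                                nFolds=10, th_bs=0.6)
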
def runForExperimentSettings(features, es):
    # Reading the train/test data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                        es.removeUncertainConcepts, es.splitUncertainConcepts,
                                        es.removeFamilyConcepts, es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                       es.removeUncertainConcepts, es.splitUncertainConcepts,
                                       es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                                es.removeUncertainConcepts, es.splitUncertainConcepts,
                                                es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over the different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)
        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        # The else branch keeps train_data/y_train untouched, so that more than one featType
        # can be run in a single call.
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_datac, y_trainc) = m.get_bootstrapped_trainset(train_data, y_train, bootstrap_data,
                                                                  es, estimator, th_bs=0.6)
        else:
            train_datac = train_data
            y_trainc = y_train

        concatenated_data = []
        concatenated_data.extend(train_datac)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_datac, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[0:len(train_datac)]
        test_feats = featurized[len(train_datac):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_trainc, es)
        train_feats, y_trainc, train_bucket = ss.runSampleSelection(
            train_feats, y_trainc, [i for i in range(len(train_datac))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_datac, train_bucket, es.weighInterAnnot)

        model = m.train(estimator, x_train, y_trainc, weights_train, model=None)
        y_pred = m.test(x_test, estimator=model)
        # print(y_pred)

        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        utils.genOutput(data=test_data, outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
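
# Usage sketch (hypothetical feature combinations): runForExperimentSettings mirrors main() but
# takes the feature sets and settings as arguments, so several configurations can be generated
# in one call; the nested lists allow feature-type combinations, e.g.:
#
#     es = ExperimentSettings()
#     runForExperimentSettings([['CONCEPTS'], ['CONCEPTS', 'BOW']], es)
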