Example #1
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator,
                              th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = featurized[:len(trainSet)]
    test_feats = featurized[len(trainSet):]

    # Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(
        train_feats, y_train, list(range(len(trainSet))), es)

    # Calculate inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket,
                               es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    y_pred_prob = model.predict_proba(x_test)
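    # Self-training: adopt only bootstrap samples whose top predicted
    # probability clears th_bs, labelling them with their argmax class.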
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # TODO: update None to a confidence vector
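The loop above is plain self-training: a bootstrap sample is adopted into the training set only when its top predicted probability clears th_bs, and it is labelled with its argmax class (this assumes class labels coincide with the predict_proba column indices, which holds for severities 0-3). A minimal standalone sketch of the same step, assuming scikit-learn is available; every name below is illustrative rather than part of this codebase:

import numpy as np
from sklearn.linear_model import LogisticRegression

def self_train_once(x_labeled, y_labeled, x_unlabeled, threshold=0.6):
    # Fit on the labelled pool, then score the unlabelled pool.
    clf = LogisticRegression(max_iter=1000).fit(x_labeled, y_labeled)
    probs = clf.predict_proba(x_unlabeled)
    # Adopt only samples predicted above the confidence threshold.
    keep = probs.max(axis=1) > threshold
    new_x = np.vstack([x_labeled, x_unlabeled[keep]])
    # clf.classes_ maps probability columns back to class labels.
    new_y = np.concatenate([y_labeled, clf.classes_[probs[keep].argmax(axis=1)]])
    return new_x, new_y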
Example #2
def main(useAnnotatorWeighing=True):
    '''
    Runs the experiments by training on a train set and testing on a test
    set. Bootstrapping is also supported (hard-coded in this script).
    Configure your model by modifying the ExperimentSettings object below.

    The models write annotated files to the output folder, which can be
    evaluated (in metrics) using testEval.py.
    '''

    # Making folders from config
    # cfg.makeFolders()

    # Specify the feature sets to use here. It is a list of lists, so that
    # feature types can be combined in a single set.
    features = [["DSM+1"]]
    #     features = [["CONCEPTS"]]
    #     features = [["BOW"]]

    # If you want anything set differently from the defaults, change the
    # corresponding parameter in es (ExperimentSettings).
    es = ExperimentSettings()
    #     es.fs_varianceFilter = True
    #     es.bootstrap = True
    #     es.ss_prototyping = True
    #     es.weighInterAnnot = False
    #     es.ml_algorithm='RF'
    #remove these!
    #     es.removeDeniedConcepts=False
    #     es.splitFamilyConcepts=False
    #     es.splitUncertainConcepts=False

    # Reading the train/test_data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Modify the concepts according to the configured segmentation settings
    # (PERFORM ONLY ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        if es.bootstrap:
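            # NOTE: rebinding train_data/y_train here means only one featType
            # can be bootstrapped per run (Example #5 copies instead).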
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_data, y_train) = m.get_bootstrapped_trainset(train_data,
                                                                y_train,
                                                                bootstrap_data,
                                                                es,
                                                                estimator,
                                                                th_bs=0.6)

        concatenated_data = []
        concatenated_data.extend(train_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[:len(train_data)]
        test_feats = featurized[len(train_data):]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, list(range(len(train_data))), es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_data, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator, x_train, y_train, weights_train, model=None)

        y_pred = m.test(x_test, estimator=model)
        #         print(y_pred)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
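Note how the pipeline above fits DictVectorizer and MinMaxScaler on the training split only and merely transforms the test split, so no test information leaks into the vocabulary or the scaling ranges. A runnable toy sketch of that pattern, assuming scikit-learn; the feature dicts are made up:

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler

train_feats = [{"BOW_pain": 2, "CONCEPT_anxiety": 1}, {"BOW_sleep": 1}]
test_feats = [{"BOW_pain": 1, "BOW_unseen": 3}]  # unseen keys are dropped

vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(train_feats)  # learns the vocabulary
x_test = vectorizer.transform(test_feats)        # reuses it, no refit on test

min_max_scalar = MinMaxScaler()
x_train = min_max_scalar.fit_transform(x_train.toarray())  # ranges from train
x_test = min_max_scalar.transform(x_test.toarray())        # may leave [0, 1]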
Example #3
def evalCrossval(estimator,
                 data,
                 es=ExperimentSettings(),
                 nFolds=10,
                 printTree=False,
                 verbose=False,
                 random_seed=44):
    '''
    Calculate the average cross-validation score on stratified splits of
    the data to evaluate the performance of trained models.
    @param estimator: the machine learning estimator
    @param data: annotated documents carrying a severity label
    @param es: ExperimentSettings controlling features, selection and scaling
    @param nFolds: number of folds in k-fold cross-validation
    '''

    # scores = cross_validation.cross_val_score(estimator, feats_train, labels_train, scoring='mean_absolute_error', cv=nFolds, verbose=1)
    # print("Average cross validation score (mean absolute error): ", np.average(scores))

    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels,
                                             n_folds=nFolds,
                                             shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()

    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    utils.out('Generated primary features!')

    #_, vocab = getFeats(data, ['MED'])
    #print(vocab)

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Make an 'inner data' set: a copy of the original data (makes sure
        # we do not modify the original data).
        innerData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features)
        # These features should be generated within the loop, because some clustering might happen between samples (e.g. to determine which questions are 'regular')
        trainSet = [copy(innerData[idx]) for idx in train_bucket]
        generateDataDrivenFeats(trainSet, innerData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(innerData)
        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = [featurized[idx] for idx in train_bucket]
        test_feats = [featurized[idx] for idx in test_bucket]

        # Do feature selection on train data
        y_train = [labels[idx] for idx in train_bucket]
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, train_bucket, es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        # Calculate inter-annotator weighting.
        weights_train = getWeights(data, train_bucket, es.weighInterAnnot)

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, y_train, weights_train, model=None)

        #for part in model.estimators_:
        #graph = export_graphviz(part, out_file=None, feature_names=vectorizer.feature_names_)
        #selFeats = utils.find_between(graph, 'label="','gini')

        # Output the importance of features (only available for estimators
        # exposing feature_importances_, e.g. tree ensembles).
        try:
            indices = np.argsort(model.feature_importances_)[::-1]
            featImportances = [[
                vectorizer.feature_names_[x], model.feature_importances_[x]
            ] for x in indices]
        except AttributeError:
            featImportances = None

        y_pred = test(x_test, model)
        #print(y_pred)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(
                cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportances)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
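The snippet uses the pre-0.18 sklearn.cross_validation.StratifiedKFold(labels, n_folds=...) API. The same fold-and-accumulate pattern with the current sklearn.model_selection API, on synthetic data with illustrative names, looks roughly like this:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

rng = np.random.RandomState(44)
x = rng.rand(100, 5)
y = np.arange(100) % 4  # four severity classes, 25 of each

conf_mat = np.zeros((4, 4), dtype=int)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=44)
for train_idx, test_idx in skf.split(x, y):
    model = RandomForestClassifier(random_state=44).fit(x[train_idx], y[train_idx])
    y_pred = model.predict(x[test_idx])
    # Accumulate a single confusion matrix over all folds, as confMat above.
    conf_mat += confusion_matrix(y[test_idx], y_pred, labels=[0, 1, 2, 3])
print(conf_mat)  # rows: actual severity, columns: predicted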
Example #4
def eval_bootstrapped_crossVal(estimator,
                               data,
                               bootstrap_data,
                               es=ExperimentSettings(),
                               nFolds=10,
                               printTree=False,
                               verbose=False,
                               th_bs=0.6,
                               random_seed=44):
    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels,
                                             n_folds=nFolds,
                                             shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()

    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    generatePrimaryFeats(bootstrap_data, es)
    utils.out('Generated primary features!')

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # Make an 'inner data' set: a copy of the original data (makes sure
        # we do not modify the original data).
        trainAndTestData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features)
        # These features should be generated within the loop, because some clustering might happen between samples (e.g. to determine which questions are 'regular')
        trainData = [copy(trainAndTestData[idx]) for idx in train_bucket]
        y_train = [labels[idx] for idx in train_bucket]

        (new_train_data,
         new_y_train) = get_bootstrapped_trainset(trainData, y_train,
                                                  bootstrap_data, es,
                                                  estimator, th_bs)

        testData = [copy(trainAndTestData[idx]) for idx in test_bucket]
        allData = new_train_data + testData
        generateDataDrivenFeats(new_train_data, allData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(allData)
        # Get the featurized documents; allData is ordered train-then-test.

        train_feats = featurized[:len(new_train_data)]
        test_feats = featurized[len(new_train_data):]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, new_y_train, es)
        train_feats, new_y_train, new_train_bucket = ss.runSampleSelection(
            train_feats, new_y_train, list(range(len(new_train_data))), es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        new_weights_train = getWeights(new_train_data, new_train_bucket,
                                       es.weighInterAnnot)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator,
                      x_train,
                      new_y_train,
                      new_weights_train,
                      model=None)
        # Output the importance of features (only available for estimators
        # exposing feature_importances_, e.g. tree ensembles).
        try:
            indices = np.argsort(model.feature_importances_)[::-1]
            featImportance = [[
                vectorizer.feature_names_[x], model.feature_importances_[x]
            ] for x in indices]
        except AttributeError:
            featImportance = None

        y_pred = test(x_test, model)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(
                cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportance)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
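Both evaluation functions rank features with np.argsort(model.feature_importances_)[::-1] and map the indices back to names through vectorizer.feature_names_. The same idea in isolation, on toy data with made-up feature names:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer

feats = [{"a": 1.0, "b": 0.0}, {"a": 0.0, "b": 2.0},
         {"a": 1.0, "b": 1.0}, {"a": 0.0, "b": 3.0}]
y = [0, 1, 0, 1]

vec = DictVectorizer()
x = vec.fit_transform(feats)
model = RandomForestClassifier(random_state=0).fit(x, y)

# Pair each importance with its feature name, most important first.
indices = np.argsort(model.feature_importances_)[::-1]
ranked = [(vec.feature_names_[i], model.feature_importances_[i]) for i in indices]
print(ranked)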
Example #5
def runForExperimentSettings(features, es):

    # Reading the train/test_data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Modify the concepts according to the configured segmentation settings
    # (PERFORM ONLY ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        # The else branch avoids overwriting train_data/y_train; without it,
        # only one featType could be run per invocation.
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_datac,
             y_trainc) = m.get_bootstrapped_trainset(train_data,
                                                     y_train,
                                                     bootstrap_data,
                                                     es,
                                                     estimator,
                                                     th_bs=0.6)
        else:
            train_datac = train_data
            y_trainc = y_train

        concatenated_data = []
        concatenated_data.extend(train_datac)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_datac, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[:len(train_datac)]
        test_feats = featurized[len(train_datac):]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_trainc, es)
        train_feats, y_trainc, train_bucket = ss.runSampleSelection(
            train_feats, y_trainc, list(range(len(train_datac))), es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_datac, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator,
                        x_train,
                        y_trainc,
                        weights_train,
                        model=None)

        y_pred = m.test(x_test, estimator=model)
        #         print(y_pred)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
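A driver for runForExperimentSettings might look like the following; this is a hypothetical sketch that reuses the field names from the commented-out settings in main(), not a snippet from the codebase:

# Hypothetical driver; ExperimentSettings fields mirror the commented-out
# examples in main() above.
es = ExperimentSettings()
es.bootstrap = True      # also read and self-label the unannotated pool
es.ml_algorithm = 'RF'   # as in the commented settings

# One full train/predict/output pass per feature-set combination.
features = [["DSM+1"], ["CONCEPTS"], ["BOW", "CONCEPTS"]]
runForExperimentSettings(features, es)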