示例#1
0
def runExperiments(features, es, logFile):
    """Run one experiment per feature combination and log the outcome.

    The training data is read and concept-preprocessed once. When
    ``es.bootstrap`` is set, the unannotated data is read and preprocessed
    the same way. For every entry of ``features`` either an SVM parameter
    sweep (``es.svmParamSweep``) or a 10-fold cross validation is executed,
    and every ``name: value`` entry of the outcome is written to ``logFile``.

    Args:
        features: iterable of feature-type lists; each inner list is joined
            with ``','`` for the log header, so it must contain strings.
        es: experiment settings object. Its ``featTypes`` attribute is
            overwritten on every iteration.
        logFile: object exposing ``write``; receives all log lines.
    """
    n_folds = 10

    def _log_scores(scores):
        # One "name: value" line per entry of the outcome mapping.
        for key in scores:
            logFile.write(str(key) + ": " + str(scores[key]) + '\n')

    # Load the annotated training data.
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Segmentation switches, in the positional order conceptPreprocessing
    # is called with. The preprocessing must only be performed once.
    segmentation = (
        es.removeDeniedConcepts,
        es.splitDeniedConcepts,
        es.removeUncertainConcepts,
        es.splitUncertainConcepts,
        es.removeFamilyConcepts,
        es.splitFamilyConcepts,
    )
    data = m.conceptPreprocessing(data, *segmentation)

    if es.bootstrap:
        # The unannotated pool gets the very same concept treatment.
        bootstrap_data = m.conceptPreprocessing(
            utils.readData(cfg.PATH_UNANNOTATED,
                           cfg.PATH_PREPROCESSED_UNANNOTATED),
            *segmentation)

    # One run per feature combination.
    for feat_combo in features:
        logFile.write('Executing for ' + ','.join(feat_combo) + ' model.\n')
        es.featTypes = feat_combo

        if es.svmParamSweep:
            outcome = m.param_sweep_svm(data,
                                        es,
                                        gammaSweep=False,
                                        nFolds=n_folds,
                                        verbose=False,
                                        random_seed=44)
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                outcome = m.eval_bootstrapped_crossVal(estimator,
                                                       data,
                                                       bootstrap_data,
                                                       es,
                                                       n_folds,
                                                       printTree=False)
            else:
                outcome = m.evalCrossval(estimator,
                                         data,
                                         es,
                                         n_folds,
                                         printTree=False)

        _log_scores(outcome)
示例#2
0
def main(useAnnotatorWeighing=True):
    """Train on the train set, predict the test set and write annotated files.

    For every configured feature combination a model is trained on the
    (optionally bootstrapped) training data, the test documents receive a
    predicted severity in their ``predSev`` attribute, and the annotated
    test documents are written to ``cfg.PATH_OUTPUT + <feature names> + '/'``
    through ``utils.genOutput``. According to the original notes, these
    output files can then be evaluated with testEval.py.

    Configure the model by editing the ExperimentSettings object below;
    bootstrapping is driven by ``es.bootstrap``.

    Args:
        useAnnotatorWeighing: not read anywhere in this function; kept so
            existing callers keep working. The training weights are derived
            from ``es.weighInterAnnot`` instead.
    """

    # Making folders from config
    # cfg.makeFolders()

    # Feature sets to run; a list of lists so that feature types can be
    # combined within one model (e.g. [["CONCEPTS"], ["BOW"]]).
    features = [["DSM+1"]]

    # Change anything that should differ from the defaults on `es`, e.g.:
    #     es.fs_varianceFilter = True
    #     es.bootstrap = True
    #     es.ss_prototyping = True
    #     es.weighInterAnnot = False
    #     es.ml_algorithm = 'RF'
    es = ExperimentSettings()

    # Reading the train/test data into arrays
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Segmentation switches, in the positional order conceptPreprocessing
    # is called with. The concept modifications must only be performed once.
    segmentation = (
        es.removeDeniedConcepts,
        es.splitDeniedConcepts,
        es.removeUncertainConcepts,
        es.splitUncertainConcepts,
        es.removeFamilyConcepts,
        es.splitFamilyConcepts,
    )
    train_data = m.conceptPreprocessing(train_data, *segmentation)
    test_data = m.conceptPreprocessing(test_data, *segmentation)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(bootstrap_data, *segmentation)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        # Keep the bootstrapped set in per-iteration variables. Rebinding
        # train_data / y_train here would make every following feature set
        # start from already-augmented data instead of the original set.
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            fit_data, fit_labels = m.get_bootstrapped_trainset(train_data,
                                                               y_train,
                                                               bootstrap_data,
                                                               es,
                                                               estimator,
                                                               th_bs=0.6)
        else:
            fit_data, fit_labels = train_data, y_train

        # Train and test documents are featurized together and split again
        # afterwards by position.
        concatenated_data = []
        concatenated_data.extend(fit_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(fit_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        n_fit = len(fit_data)
        train_feats = featurized[:n_fit]
        test_feats = featurized[n_fit:]

        # Feature and sample selection are done on the train data only
        train_feats = fs.runFeatureSelection(train_feats, fit_labels, es)
        train_feats, fit_labels, train_bucket = ss.runSampleSelection(
            train_feats, fit_labels, list(range(len(fit_data))), es)

        # The vectorizer (and scaler) are fitted on train, applied to test
        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(fit_data, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator,
                        x_train,
                        fit_labels,
                        weights_train,
                        model=None)

        # Store each prediction on its test document before writing output
        y_pred = m.test(x_test, estimator=model)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
示例#3
0
def main(useAnnotatorWeighing=True):
    """Cross-validate every configured feature set on the training data.

    For each feature combination this runs either an SVM parameter sweep
    (when ``es.svmParamSweep`` is set) or a 10-fold cross validation
    (bootstrapped when ``es.bootstrap`` is set) and prints every
    ``name: value`` entry of the outcome. No annotated files are written
    here. The original author notes that the standard deviation over the
    folds is reported for each metric as well.

    Configure the model by editing the ExperimentSettings overrides below.

    Args:
        useAnnotatorWeighing: not read inside this function.
    """
    n_folds = 10

    # Making folders from config
    # cfg.makeFolders()

    # Feature sets to evaluate; a list of lists so that feature types can be
    # combined in a single model.
    # Options listed by the original author:
    #   'CONCEPTS', 'DSM+1', 'DSM', 'DSM_HIER', 'MED', 'BOW', 'BOW_ANSWERS',
    #   'CATEGORICAL_QUESTIONSET', 'QUESTIONSET', 'WORD_VECTOR',
    #   'WORD_VECTOR_ANSWERS', 'CONCEPT_VECTOR', 'DIST_WORDVECTOR',
    #   'DIST_CONCEPTVECTOR', 'CONCEPT_CLUSTERS', 'PREAMBLE_CLUSTERS'
    # NOTE(review): ["DSM+2"] is listed twice, so that configuration is
    # evaluated twice - confirm whether this is intended.
    features = [
        ["DSM+2"],
        ["BOW"],
        ["DSM+1"],
        ["DSM"],
        ["SNOMED"],
        ["SNOMED+1"],
        ["DSM+2"],
        ["CONCEPTS"],
    ]

    # Everything that should differ from the ExperimentSettings defaults.
    # Further switches exist on the settings object (e.g. es.ml_algorithm,
    # es.removeUncertainConcepts, es.fs_confidence, es.fs_chiSquare,
    # es.fs_varianceThreshold, es.fs_informationGain, es.fs_confidenceTopK).
    es = ExperimentSettings()
    overrides = (
        ('fs_varianceFilter', True),
        ('bootstrap', False),
        ('ss_prototyping', False),
        ('weighInterAnnot', False),
        ('removeDeniedConcepts', False),
        ('splitDeniedConcepts', False),
        ('splitUncertainConcepts', False),
        ('splitFamilyConcepts', False),
    )
    for attr_name, attr_value in overrides:
        setattr(es, attr_name, attr_value)

    # Load the annotated training data.
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Segmentation switches, in the positional order conceptPreprocessing
    # is called with. The preprocessing must only be performed once.
    segmentation = (
        es.removeDeniedConcepts,
        es.splitDeniedConcepts,
        es.removeUncertainConcepts,
        es.splitUncertainConcepts,
        es.removeFamilyConcepts,
        es.splitFamilyConcepts,
    )
    data = m.conceptPreprocessing(data, *segmentation)

    if es.bootstrap:
        # The unannotated pool gets the very same concept treatment.
        bootstrap_data = m.conceptPreprocessing(
            utils.readData(cfg.PATH_UNANNOTATED,
                           cfg.PATH_PREPROCESSED_UNANNOTATED),
            *segmentation)

    # One run per feature combination.
    for feat_combo in features:
        utils.out('Executing for ' + ','.join(feat_combo) + ' model.')
        es.featTypes = feat_combo

        if es.svmParamSweep:
            outcome = m.param_sweep_svm(data,
                                        es,
                                        gammaSweep=False,
                                        nFolds=n_folds,
                                        verbose=False,
                                        random_seed=44)
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                outcome = m.eval_bootstrapped_crossVal(estimator,
                                                       data,
                                                       bootstrap_data,
                                                       es,
                                                       n_folds,
                                                       printTree=False)
            else:
                outcome = m.evalCrossval(estimator,
                                         data,
                                         es,
                                         n_folds,
                                         printTree=False)

        # Both branches report their outcome in the same way.
        for key in outcome:
            print(str(key) + ":", outcome[key])
示例#4
0
def runForExperimentSettings(features, es):
    """Train, predict and write annotated test output per feature set.

    Train and test data are read and concept-preprocessed once (plus the
    unannotated data when ``es.bootstrap`` is set). For each entry of
    ``features`` a model is trained on the (optionally bootstrapped)
    training data, each test document gets its prediction stored in
    ``predSev``, and the test documents are written through
    ``utils.genOutput`` to ``cfg.PATH_OUTPUT + <feature names> + '/'``.

    Args:
        features: iterable of feature-type lists; each inner list is joined
            with ``','`` for logging and for the output folder name.
        es: experiment settings object. Its ``featTypes`` attribute is
            overwritten on every iteration.
    """
    # Load train and test documents.
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Segmentation switches, in the positional order conceptPreprocessing
    # is called with. The concept modifications must only be performed once.
    segmentation = (
        es.removeDeniedConcepts,
        es.splitDeniedConcepts,
        es.removeUncertainConcepts,
        es.splitUncertainConcepts,
        es.removeFamilyConcepts,
        es.splitFamilyConcepts,
    )
    train_data = m.conceptPreprocessing(train_data, *segmentation)
    test_data = m.conceptPreprocessing(test_data, *segmentation)

    # The unannotated pool is only needed when bootstrapping is enabled.
    if es.bootstrap:
        bootstrap_data = m.conceptPreprocessing(
            utils.readData(cfg.PATH_UNANNOTATED,
                           cfg.PATH_PREPROCESSED_UNANNOTATED),
            *segmentation)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # One model per feature combination.
    for feat_combo in features:
        utils.out('Executing for ' + ','.join(feat_combo) + ' model.')
        es.featTypes = feat_combo

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        gold_labels = [doc.severity for doc in train_data]

        # The set actually used for fitting lives in its own variables, so
        # train_data stays untouched for the next feature combination.
        if not es.bootstrap:
            fit_data = train_data
            fit_labels = gold_labels
        else:
            m.generatePrimaryFeats(bootstrap_data, es)
            fit_data, fit_labels = m.get_bootstrapped_trainset(
                train_data,
                gold_labels,
                bootstrap_data,
                es,
                estimator,
                th_bs=0.6)

        # Fit and test documents are featurized together and split again
        # afterwards by position.
        combined = []
        combined.extend(fit_data)
        combined.extend(test_data)

        m.generateDataDrivenFeats(fit_data, combined, es)

        all_feats = m.featurize(combined)

        n_fit = len(fit_data)
        fit_feats = all_feats[:n_fit]
        test_feats = all_feats[n_fit:]

        # Feature and sample selection are done on the train data only.
        fit_feats = fs.runFeatureSelection(fit_feats, fit_labels, es)
        fit_feats, fit_labels, train_bucket = ss.runSampleSelection(
            fit_feats, fit_labels, list(range(n_fit)), es)

        # The vectorizer (and scaler) are fitted on train, applied to test.
        x_train = vectorizer.fit_transform(fit_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        sample_weights = m.getWeights(fit_data, train_bucket,
                                      es.weighInterAnnot)

        fitted = m.train(estimator,
                         x_train,
                         fit_labels,
                         sample_weights,
                         model=None)

        # Store each prediction on its test document before writing output.
        predictions = m.test(x_test, estimator=fitted)
        for idx, doc in enumerate(test_data):
            doc.predSev = predictions[idx]

        target_dir = cfg.PATH_OUTPUT + ','.join(feat_combo) + '/'
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        utils.genOutput(data=test_data,
                        outDir=target_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')