예제 #1
0
def main():
	data_paths = {
		'test': ('../data/test_data/*', '../data/reference_standard_for_test_data/concepts/'),
		'train': ('../data/concept_assertion_relation_training_data/merged/txt/*', '../data/concept_assertion_relation_training_data/merged/concept')
	}
	
	for type, paths in data_paths.items():
		full_path = lambda f: os.path.join(os.path.dirname(os.path.realpath(__file__)), f)
		args_txt = full_path(paths[0])
		args_ref = full_path(paths[1])
	
		txt_files = glob.glob(args_txt)
		ref_files = os.listdir(args_ref)
		ref_files = map(lambda f: os.path.join(args_ref, f), ref_files)

		txt_files_map = helper.map_files(txt_files)
		ref_files_map = helper.map_files(ref_files)
		
		files = []
		for k in txt_files_map:
			if k in ref_files_map:
				files.append((txt_files_map[k], ref_files_map[k]))
		
		labels = {}
		for txt, ref in files:
			txt = read_txt(txt)
			for r in read_con(ref, txt):
				for r in r:
					if r not in labels:
						labels[r] = 0
					labels[r] += 1
					
		print type, labels
예제 #2
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        help="Files containing predictions",
                        dest="txt",
                        default=os.path.join(BASE_DIR, 'data/predictions/*'))

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(BASE_DIR, 'data'))

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    parser.add_argument("-e",
                        help="Do error analysis",
                        dest="error",
                        action='store_true')

    # Parse command line arguments
    args = parser.parse_args()

    # Is output destination specified
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)

    ref_directory = args.ref

    ref_files = os.listdir(ref_directory)
    ref_files = map(lambda f: os.path.join(args.ref, f), ref_files)
    ref_files_map = helper.map_files(ref_files)

    files = []
    for k in txt_files_map:
        if k in ref_files_map:
            files.append((txt_files_map[k], ref_files_map[k]))

    print files

    # Useful for error analysis
    text = []

    # One list of all labels
    pred_labels = []
    gold_labels = []

    # txt <- predicted labels
    # ref <- actual labels
    for txt, ref in files:

        # A note that represents the model's predictions
        pnote = Note()
        pnote.read(txt)

        # A note that is the actual concept labels
        gnote = Note()
        gnote.read(ref)

        # Accumulate all predictions
        pred_labels += pnote.label_list()
        gold_labels += gnote.label_list()

        # Collect text for error analysis
        text += pnote.text_list()

    # Compute results
    evaluate(pred_labels, gold_labels, out=args.output)

    # Error analysis
    if args.error:
        print '\n\n\n'
        error_analysis(text, pred_labels, gold_labels)
예제 #3
0
파일: train.py 프로젝트: aussina/CliNER
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/txt/*')
    )

    parser.add_argument("-c",
        dest = "con",
        help = "The files that contain the labels for the training examples",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*')
    )

    parser.add_argument("-m",
        dest = "model",
        help = "Path to the model that should be generated",
        default = os.path.join(os.getenv('CLINER_DIR'), 'models/run.model')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-g",
        dest = "grid",
        help = "A flag indicating whether to perform a grid search",
        action = "store_true"
    )

    parser.add_argument("-no-crf",
        dest = "nocrf",
        help = "A flag indicating whether to use crfsuite for pass one.",
        action = "store_true"
    )

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf


    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)


    # data format
    format = args.format


    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)


    # Collect training data file paths
    txt_files_map = helper.map_files(txt_files) # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    training_list = []                          # ex. training_list =  [ ('record-13.txt', 'record-13.con') ]
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))


    # display file names (for user to see data was properly located)
    print '\n', training_list, '\n'


    # Train the model
    train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
예제 #4
0
def main():
    """Train a model from paired text and concept files.

    Command-line options:
        -t  glob of text files with the training examples
        -c  glob of concept files with the labels
        -m  path of the model to generate
        -d  features removed from the model's enabled feature set
        -e  features to use; replaces the enabled set and so overrides -d
        --no-svm / --no-lin / --no-crf
            leave the matching libml flag out of the model type

    The -t/-c/-m defaults are resolved relative to this script's directory.
    """
    here = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
        default=os.path.join(
            here,
            '../data/concept_assertion_relation_training_data/merged/txt/*'))

    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=os.path.join(
            here,
            '../data/concept_assertion_relation_training_data/merged/concept/*'))

    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
        default=os.path.join(here, '../models/awesome.model'))

    parser.add_argument(
        "-d",
        dest="disabled_features",
        help="The features that should not be used",
        nargs="+",
        default=None)

    parser.add_argument(
        "-e",
        dest="enabled_features",
        help="The features that should be used. This option trumps -d",
        nargs="+",
        default=None)

    parser.add_argument(
        "--no-svm",
        dest="no_svm",
        action="store_true",
        help="Disable SVM model generation")

    parser.add_argument(
        "--no-lin",
        dest="no_lin",
        action="store_true",
        help="Disable LIN model generation")

    parser.add_argument(
        "--no-crf",
        dest="no_crf",
        action="store_true",
        help="Disable CRF model generation")

    args = parser.parse_args()

    txt_files_map = helper.map_files(glob.glob(args.txt))
    con_files_map = helper.map_files(glob.glob(args.con))

    # Pair each text file with the concept file mapped to the same key
    training_list = [(txt_files_map[key], con_files_map[key])
                     for key in txt_files_map
                     if key in con_files_map]

    # OR together the libml flags of every model kind that was not disabled
    model_type = 0
    if not args.no_svm:
        model_type |= libml.SVM
    if not args.no_lin:
        model_type |= libml.LIN
    if not args.no_crf:
        model_type |= libml.CRF

    # Get data and labels from files
    data = []
    labels = []
    for txt_path, con_path in training_list:
        datum = read_txt(txt_path)
        data.extend(datum)
        labels.extend(read_con(con_path, datum))

    # Train a model on the data and labels
    model = Model(filename=args.model, type=model_type)

    if args.disabled_features is not None:
        model.enabled_features = model.enabled_features - Set(
            args.disabled_features)

    # Applied last so that -e wins over -d
    if args.enabled_features is not None:
        model.enabled_features = Set(args.enabled_features)

    model.train(data, labels)
예제 #5
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        help="Text files that were used to generate predictions",
        dest="txt",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*'))

    parser.add_argument(
        "-c",
        help=
        "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest="con",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/'))

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(
            os.getenv('CLINER_DIR'),
            'data/reference_standard_for_test_data/concepts/'))

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]

    # List of gold data
    ref_files = glob.glob(os.path.join(args.ref, wildcard))
    ref_files_map = helper.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.con, wildcard))
    pred_files_map = helper.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append(
                (txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard

    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)

    if len(files) == 0:
        exit("No files to be evaluated")

    for txt, annotations, gold in files:

        # Read predictions and gols standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt, gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TO DO: i need to generate a cumulative total accross all of the files
        #modify my functions slightly and have it return the number of true positive and etc...
        #then call generate results

        exactResults = evaluate(deepcopy(referenceSpans),
                                deepcopy(predictedSpans),
                                exactMatch=True,
                                reportSeperately=False)

        inexactResults = evaluate(deepcopy(referenceSpans),
                                  deepcopy(predictedSpans),
                                  exactMatch=False,
                                  reportSeperately=False)

        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]

        inexactResults = evaluate(deepcopy(referenceSpans),
                                  deepcopy(predictedSpans),
                                  exactMatch=False,
                                  reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans),
                                     deepcopy(predictedSpans),
                                     exactMatch=False,
                                     reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan,
                                      MatrixInexactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans),
                                   deepcopy(predictedSpans),
                                   exactMatch=True,
                                   reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan,
                                      MatrixExactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                             falseNegativesExactSpan,
                                             falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                               falseNegativesInexactSpan,
                                               falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans
    #TO DO: number of FP, FN, TP is not same between exact and inexact.

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact', confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)

    #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
예제 #6
0
def main():
    """Validate the command line, pair training files and train a model.

    Command-line options:
        -t                    glob of text files (required)
        -c                    glob of annotation files (required)
        -m                    path where the model is stored (required)
        -f                    data format, one of Note.supportedFormats()
                              (required)
        -g                    perform a grid search
        -no-crf               sets ``nocrf``; ``train`` gets is_crf=False
        -discontiguous_spans  enable the third/clustering pass
        -umls_disambiguation  passed to ``train`` as ``disambiguate``

    Every validation failure is reported on stderr and ends the process with
    a non-zero exit status.
    """
    parser = argparse.ArgumentParser()

    def fail(message):
        # Report a usage error on stderr, followed by a blank line, and stop
        print >> sys.stderr, message
        print >> sys.stderr, ''
        sys.exit(1)

    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
    )

    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
    )

    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
    )

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")

    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")

    parser.add_argument(
        "-discontiguous_spans",
        dest="third",
        help="A flag indicating whether to have third/clustering pass",
        action="store_true")

    parser.add_argument(
        "-umls_disambiguation",
        dest="umls_disambiguation",
        action="store_true",
        help=
        "A flag indicating wheter to disambiguate CUI id for detected entities in semeval format",
    )

    # Disabled option, kept for reference:
    # parser.add_argument("-unlabeled",
    #     dest = "unlabeled",
    #     help = "Path to dir containing unlabelled data used for unsupervised methods",
    # )

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf
    third = args.third

    # Error check: Ensure that file paths are specified
    if not args.txt:
        fail('\n\tError: Must provide text files')
    if not args.con:
        fail('\n\tError: Must provide annotations for text files')
    if not args.model:
        fail('\n\tError: Must provide valid path to store model')

    # An empty dirname means the model goes into the current directory
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        fail('\n\tError: Model dir does not exist: %s' % modeldir)

    if "PY4J_DIR_PATH" not in os.environ and third is True:
        sys.exit(
            "please set environ var PY4J_DIR_PATH to the dir of the folder containg py4j<version>.jar"
        )

    # A missing format is an error: report it on stderr with a failing status
    if not args.format:
        print >> sys.stderr, '\n\tERROR: must provide "format" argument\n'
        sys.exit(1)
    data_format = args.format

    if third is True and data_format == "i2b2":
        sys.exit("i2b2 formatting does not support disjoint spans")

    # Must specify output format
    if data_format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        sys.exit(1)

    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Collect training data file paths, keyed by helper.map_files
    txt_files_map = helper.map_files(txt_files)
    con_files_map = helper.map_files(con_files)

    # ex. training_list =  [ ('record-13.txt', 'record-13.con') ]
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map
                     if k in con_files_map]

    # Train the model
    train(training_list,
          args.model,
          data_format,
          is_crf=is_crf,
          grid=args.grid,
          third=third,
          disambiguate=args.umls_disambiguation)
예제 #7
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help = "Files containing predictions",
        dest = "txt",
        default = os.path.join(BASE_DIR, 'data/predictions/*')
    )

    parser.add_argument("-r",
        help = "The directory that contains reference gold standard concept files",
        dest = "ref",
        default = os.path.join(BASE_DIR, 'data')
    )

    parser.add_argument("-o",
        help = "Write the evaluation to a file rather than STDOUT",
        dest = "output",
        default = None
    )

    parser.add_argument("-e",
        help = "Do error analysis",
        dest = "error",
        action = 'store_true'
    )

    # Parse command line arguments
    args = parser.parse_args()


    # Is output destination specified
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout


    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)


    ref_directory = args.ref


    ref_files = os.listdir(ref_directory)
    ref_files = map(lambda f: os.path.join(args.ref, f), ref_files)
    ref_files_map = helper.map_files(ref_files)

    files = []
    for k in txt_files_map:
        if k in ref_files_map:
            files.append((txt_files_map[k], ref_files_map[k]))


    print files


    # Useful for error analysis
    text = []

    # One list of all labels
    pred_labels = []
    gold_labels = []

    # txt <- predicted labels
    # ref <- actual labels
    for txt, ref in files:

        # A note that represents the model's predictions
        pnote = Note()
        pnote.read( txt )

        # A note that is the actual concept labels
        gnote = Note()
        gnote.read( ref )

        # Accumulate all predictions
        pred_labels += pnote.label_list()
        gold_labels += gnote.label_list()

        # Collect text for error analysis
        text += pnote.text_list()


    # Compute results
    evaluate(pred_labels, gold_labels, out=args.output)


    # Error analysis
    if args.error:
        print '\n\n\n'
        error_analysis(text, pred_labels, gold_labels)
예제 #8
0
def main():
    """Train a model from paired text and concept files.

    Command-line options:
        -t  glob of text files with the training examples
        -c  glob of concept files with the labels
        -m  path of the model to generate
        -d  features removed from the model's enabled feature set
        -e  features to use; replaces the enabled set and so overrides -d
        --no-svm / --no-lin / --no-crf
            leave the matching libml flag out of the model type

    The -t/-c/-m defaults are resolved relative to this script's directory.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    default_txt = os.path.join(
        base_dir, "../data/concept_assertion_relation_training_data/merged/txt/*")
    default_con = os.path.join(
        base_dir, "../data/concept_assertion_relation_training_data/merged/concept/*")
    default_model = os.path.join(base_dir, "../models/awesome.model")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
        default=default_txt,
    )
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=default_con,
    )
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
        default=default_model,
    )
    parser.add_argument(
        "-d",
        dest="disabled_features",
        help="The features that should not be used",
        nargs="+",
        default=None,
    )
    parser.add_argument(
        "-e",
        dest="enabled_features",
        help="The features that should be used. This option trumps -d",
        nargs="+",
        default=None,
    )
    parser.add_argument(
        "--no-svm", dest="no_svm", action="store_true",
        help="Disable SVM model generation")
    parser.add_argument(
        "--no-lin", dest="no_lin", action="store_true",
        help="Disable LIN model generation")
    parser.add_argument(
        "--no-crf", dest="no_crf", action="store_true",
        help="Disable CRF model generation")

    args = parser.parse_args()

    txt_by_key = helper.map_files(glob.glob(args.txt))
    con_by_key = helper.map_files(glob.glob(args.con))

    # Pair each text file with the concept file mapped to the same key
    training_list = [
        (txt_by_key[key], con_by_key[key])
        for key in txt_by_key
        if key in con_by_key
    ]

    # OR together the libml flags of every model kind that was not disabled
    model_kinds = 0
    if not args.no_svm:
        model_kinds |= libml.SVM
    if not args.no_lin:
        model_kinds |= libml.LIN
    if not args.no_crf:
        model_kinds |= libml.CRF

    # Get data and labels from files
    data = []
    labels = []
    for txt_file, con_file in training_list:
        datum = read_txt(txt_file)
        data.extend(datum)
        labels.extend(read_con(con_file, datum))

    # Train a model on the data and labels
    model = Model(filename=args.model, type=model_kinds)

    if args.disabled_features is not None:
        model.enabled_features = model.enabled_features - Set(args.disabled_features)

    # Applied last so that -e wins over -d
    if args.enabled_features is not None:
        model.enabled_features = Set(args.enabled_features)

    model.train(data, labels)
예제 #9
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help = "Text files that were used to generate predictions",
        dest = "txt",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*')
    )

    parser.add_argument("-c",
        help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest = "con",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/')
    )

    parser.add_argument("-r",
        help = "The directory that contains reference gold standard concept files",
        dest = "ref",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/reference_standard_for_test_data/concepts/')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-o",
        help = "Write the evaluation to a file rather than STDOUT",
        dest = "output",
        default = None
    )

    # Parse command line arguments
    args = parser.parse_args()
    format = args.format


    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout


    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)


    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]


    # List of gold data
    ref_files = glob.glob( os.path.join(args.ref, wildcard) )
    ref_files_map = helper.map_files(ref_files)


    # List of predictions
    pred_files = glob.glob( os.path.join(args.con, wildcard) )
    pred_files_map = helper.map_files(pred_files)


    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))


    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard


    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)



    for txt, annotations, gold in files:

        # Read predictions and gols standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt,        gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TO DO: i need to generate a cumulative total accross all of the files
        #modify my functions slightly and have it return the number of true positive and etc...
        #then call generate results

        exactResults =  evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False)

        inexactResults =  evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False)


        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]


        inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                 falseNegativesExactSpan,
                                 falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                 falseNegativesInexactSpan,
                                 falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans
    #TO DO: number of FP, FN, TP is not same between exact and inexact.

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact'  , confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)


        #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
예제 #10
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(os.getenv('CLINER_DIR'),
                                             'data/train/txt/*'))

    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*'))

    parser.add_argument("-m",
                        dest="model",
                        help="Path to the model that should be generated",
                        default=os.path.join(os.getenv('CLINER_DIR'),
                                             'models/run.model'))

    parser.add_argument("-f",
                        dest="format",
                        help="Data format ( " +
                        ' | '.join(Note.supportedFormats()) + " )",
                        default='i2b2')

    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")

    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf

    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # data format
    format = args.format

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Collect training data file paths
    txt_files_map = helper.map_files(
        txt_files)  # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    training_list = [
    ]  # ex. training_list =  [ ('record-13.txt', 'record-13.con') ]
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # display file names (for user to see data was properly located)
    print '\n', training_list, '\n'

    # Train the model
    train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
예제 #11
0
def main():
    """Score predicted concept files against the reference standard.

    Options:
        -t  glob of the test text files used to generate predictions
        -c  directory holding prediction sub-directories (svm, crf, lin)
        -r  directory of reference (gold standard) concept files
        -o  file to write the evaluation to (default: STDOUT)

    For every ``svm`` / ``crf`` / ``lin`` sub-directory of ``-c`` a confusion
    matrix over ``Model.labels`` is accumulated (rows: reference label,
    columns: predicted label) and written out, followed by per-label and
    averaged precision, recall, specificity and F1.
    """
    here = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help="Test files that were used to generate predictions",
        dest="txt",
        default=os.path.join(here, '../data/test_data/*')
    )

    # Help text names "crf" to match the sub-directories accepted below.
    parser.add_argument("-c",
        help="The directory that contains predicted concept files organized into subdirectories for svm, lin, crf",
        dest="con",
        default=os.path.join(here, '../data/test_predictions/')
    )

    parser.add_argument("-r",
        help="The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(here, '../data/reference_standard_for_test_data/concepts/')
    )

    parser.add_argument("-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None
    )

    args = parser.parse_args()

    # Keep the handle in a local so that a file we opened ourselves is
    # closed on every exit path (sys.stdout is left alone).
    out = open(args.output, "w") if args.output else sys.stdout
    try:
        txt_files_map = helper.map_files(glob.glob(args.txt))
        ref_files_map = helper.map_files(
            [os.path.join(args.ref, f) for f in os.listdir(args.ref)])

        labels = Model.labels

        for con_directory in os.listdir(args.con):
            directory_name = os.path.basename(con_directory)

            # Only the known classifier output directories are evaluated.
            if directory_name not in ("svm", "crf", "lin"):
                continue

            prediction_dir = os.path.join(args.con, con_directory)
            con_files_map = helper.map_files(
                [os.path.join(prediction_dir, f)
                 for f in os.listdir(prediction_dir)])

            # A record is scored only if text, prediction and reference
            # files are all present.
            files = [(txt_files_map[k], con_files_map[k], ref_files_map[k])
                     for k in txt_files_map
                     if k in con_files_map and k in ref_files_map]

            # Compute the confusion matrix: confusion[reference][predicted]
            confusion = [[0] * len(labels) for _ in labels]
            for txt_path, con_path, ref_path in files:
                txt = read_txt(txt_path)
                predicted = read_con(con_path, txt)
                reference = read_con(ref_path, txt)
                # NOTE(review): zip stops at the shorter input, so any
                # length mismatch between prediction and reference is
                # silently ignored -- confirm that this is intended.
                for pred_seq, ref_seq in zip(predicted, reference):
                    for pred, ref in zip(pred_seq, ref_seq):
                        confusion[labels[ref]][labels[pred]] += 1

            # Display the confusion matrix
            print >>out, ""
            print >>out, ""
            print >>out, ""
            print >>out, "================"
            print >>out, directory_name.upper() + " RESULTS"
            print >>out, "================"
            print >>out, ""
            print >>out, "Confusion Matrix"
            pad = max(len(l) for l in labels) + 6
            print >>out, "%s %s" % (' ' * pad, "\t".join(labels.keys()))
            for act, act_v in labels.items():
                row = "\t".join(
                    [str(confusion[act_v][pre_v]) for _, pre_v in labels.items()])
                print >>out, "%s %s" % (act.rjust(pad), row)
            print >>out, ""

            # Compute the analysis stuff
            precision = []
            recall = []
            specificity = []
            f1 = []

            print >>out, "Analysis"
            # Four columns, matching the four values printed on each row.
            print >>out, " " * pad, "Precision\tRecall\tSpecificity\tF1"

            for lab, lab_v in labels.items():
                others = [v for _, v in labels.items() if v != lab_v]
                tp = confusion[lab_v][lab_v]
                fp = sum(confusion[v][lab_v] for v in others)
                fn = sum(confusion[lab_v][v] for v in others)
                tn = sum(confusion[v1][v2] for v1 in others for v2 in others)
                # The tiny epsilon keeps labels with no observations from
                # dividing by zero.
                precision.append(float(tp) / (tp + fp + 1e-100))
                recall.append(float(tp) / (tp + fn + 1e-100))
                specificity.append(float(tn) / (tn + fp + 1e-100))
                f1.append(float(2 * tp) / (2 * tp + fp + fn + 1e-100))
                print >>out, "%s %.4f\t%.4f\t%.4f\t%.4f" % (
                    lab.rjust(pad), precision[-1], recall[-1],
                    specificity[-1], f1[-1])

            print >>out, "--------"

            # Unweighted (macro) average over all labels.
            precision = sum(precision) / len(precision)
            recall = sum(recall) / len(recall)
            specificity = sum(specificity) / len(specificity)
            f1 = sum(f1) / len(f1)

            print >>out, "Average: %.4f\t%.4f\t%.4f\t%.4f" % (
                precision, recall, specificity, f1)
    finally:
        if out is not sys.stdout:
            out.close()
예제 #12
0
def main():
    """Evaluate predicted concept labels against the gold standard.

    Command-line options:
        -t  glob of the test text files used to generate predictions
        -c  directory holding prediction sub-directories (svm, crf, lin)
        -r  directory of reference (gold standard) concept files
        -o  file to write the evaluation to (default: STDOUT)

    Each ``svm`` / ``crf`` / ``lin`` sub-directory found under ``-c`` gets
    its own report: a confusion matrix indexed by ``Model.labels`` (row =
    reference label, column = predicted label), then precision, recall,
    specificity and F1 per label and averaged over labels.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        help="Test files that were used to generate predictions",
        dest="txt",
        default=os.path.join(script_dir, '../data/test_data/*'))

    # The help text lists "crf", the directory name actually accepted below.
    parser.add_argument(
        "-c",
        help=
        "The directory that contains predicted concept files organized into subdirectories for svm, lin, crf",
        dest="con",
        default=os.path.join(script_dir, '../data/test_predictions/'))

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(
            script_dir,
            '../data/reference_standard_for_test_data/concepts/'))

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    args = parser.parse_args()

    # Track whether we own the stream so an opened report file is always
    # closed, while sys.stdout is never closed.
    owns_stream = bool(args.output)
    stream = open(args.output, "w") if owns_stream else sys.stdout

    try:
        txt_files = glob.glob(args.txt)
        ref_files = [os.path.join(args.ref, name)
                     for name in os.listdir(args.ref)]

        txt_files_map = helper.map_files(txt_files)
        ref_files_map = helper.map_files(ref_files)

        labels = Model.labels

        for con_directory in os.listdir(args.con):
            directory_name = os.path.basename(con_directory)

            # Skip anything that is not a known classifier output directory.
            if directory_name not in ("svm", "crf", "lin"):
                continue

            con_root = os.path.join(args.con, con_directory)
            con_files = [os.path.join(con_root, name)
                         for name in os.listdir(con_root)]
            con_files_map = helper.map_files(con_files)

            # Only records with text, prediction and reference are scored.
            files = []
            for key in txt_files_map:
                if key in con_files_map and key in ref_files_map:
                    files.append((txt_files_map[key], con_files_map[key],
                                  ref_files_map[key]))

            # Compute the confusion matrix: confusion[reference][predicted]
            confusion = [[0] * len(labels) for _ in labels]
            for txt_file, con_file, ref_file in files:
                txt = read_txt(txt_file)
                predicted_rows = read_con(con_file, txt)
                reference_rows = read_con(ref_file, txt)
                # NOTE(review): zip truncates to the shorter sequence, so
                # mismatched prediction/reference lengths go unnoticed --
                # confirm that this is acceptable.
                for pred_row, ref_row in zip(predicted_rows, reference_rows):
                    for pred_label, ref_label in zip(pred_row, ref_row):
                        confusion[labels[ref_label]][labels[pred_label]] += 1

            # Display the confusion matrix
            print >> stream, ""
            print >> stream, ""
            print >> stream, ""
            print >> stream, "================"
            print >> stream, directory_name.upper() + " RESULTS"
            print >> stream, "================"
            print >> stream, ""
            print >> stream, "Confusion Matrix"
            pad = max(len(l) for l in labels) + 6
            print >> stream, "%s %s" % (' ' * pad, "\t".join(labels.keys()))
            for act, act_v in labels.items():
                cells = [str(confusion[act_v][pre_v])
                         for _, pre_v in labels.items()]
                print >> stream, "%s %s" % (act.rjust(pad), "\t".join(cells))
            print >> stream, ""

            # Compute the analysis stuff
            precision = []
            recall = []
            specificity = []
            f1 = []

            print >> stream, "Analysis"
            # The header names all four metrics that each row prints.
            print >> stream, " " * pad, "Precision\tRecall\tSpecificity\tF1"

            for lab, lab_v in labels.items():
                rest = [v for _, v in labels.items() if v != lab_v]
                tp = confusion[lab_v][lab_v]
                fp = sum(confusion[v][lab_v] for v in rest)
                fn = sum(confusion[lab_v][v] for v in rest)
                tn = sum(confusion[v1][v2] for v1 in rest for v2 in rest)
                # The epsilon avoids ZeroDivisionError for unseen labels.
                precision.append(float(tp) / (tp + fp + 1e-100))
                recall.append(float(tp) / (tp + fn + 1e-100))
                specificity.append(float(tn) / (tn + fp + 1e-100))
                f1.append(float(2 * tp) / (2 * tp + fp + fn + 1e-100))
                print >> stream, "%s %.4f\t%.4f\t%.4f\t%.4f" % (
                    lab.rjust(pad), precision[-1], recall[-1],
                    specificity[-1], f1[-1])

            print >> stream, "--------"

            # Unweighted (macro) average over all labels.
            precision = sum(precision) / len(precision)
            recall = sum(recall) / len(recall)
            specificity = sum(specificity) / len(specificity)
            f1 = sum(f1) / len(f1)

            print >> stream, "Average: %.4f\t%.4f\t%.4f\t%.4f" % (
                precision, recall, specificity, f1)
    finally:
        if owns_stream:
            stream.close()