Example #1
File: train.py | Project: aussina/CliNER
def train(training_list, model_path, format, is_crf=True, grid=False):

    # Read the data into a Note object
    notes = []
    for txt, con in training_list:
        note_tmp = Note(format)       # Create Note
        note_tmp.read(txt, con)       # Read data into Note
        notes.append(note_tmp)        # Add the Note to the list


    # file names
    if not notes:
        print 'Error: Cannot train on 0 files. Terminating train.'
        return 1


    # Create a Machine Learning model
    model = Model(is_crf=is_crf)


    # Train the model using the Note's data
    model.train(notes, grid)


    # Pickle dump
    print 'pickle dump'
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)
Example #2
    def find_note(self, name):
        note = None
        aux = self.db.get_note(name)
        self.db.close_db()

        print type(aux), aux
        if aux:
            note = Note(aux[0][1], aux[0][3])
            note.id = aux[0][0]
            note.creation_date = aux[0][2]
            return note

        return note
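
From the indexing above, each row returned by get_note appears to carry (id, name, creation_date, content). A minimal sketch of that assumption, with a stubbed-in Note class and placeholder values:

class Note(object):
    """Stub mirroring the constructor order used in find_note above;
    the real class lives in the surrounding project."""
    def __init__(self, name, content):
        self.name = name
        self.content = content

# Hypothetical row shape for self.db.get_note(name), inferred from the
# aux[0][i] indexing above: (id, name, creation_date, content).
aux = [(7, 'groceries', '2015-01-02', 'milk, eggs')]

note = Note(aux[0][1], aux[0][3])
note.id = aux[0][0]
note.creation_date = aux[0][2]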
Example #3
def train(training_list,
          model_path,
          format,
          is_crf=True,
          grid=False,
          third=False,
          disambiguate=False):
    """
    train()

    Purpose: Train a model for given clinical data.

    @param training_list  list of (txt,con) file path tuples (training instances)
    @param model_path     string filename of where to pickle model object
    @param format         concept file data format (ex. i2b2, semeval)
    @param is_crf         whether first pass should use CRF classifier
    @param grid           whether second pass should perform grid search
    @param third          whether to perform third/clustering pass
    @param disambiguate   whether to disambiguate CUIs for detected entities (semeval only)
    """

    # Read the data into a Note object
    notes = []
    for txt, con in training_list:

        note_tmp = Note(format)  # Create Note
        note_tmp.read(txt, con)  # Read data into Note
        notes.append(note_tmp)  # Add the Note to the list

    # file names
    if not notes:
        print 'Error: Cannot train on 0 files. Terminating train.'
        return 1

    # Create a Machine Learning model
    model = Model(is_crf=is_crf)

    # disambiguation
    if format == "semeval" and disambiguate is True and enabled["UMLS"] != None:
        model.set_cui_freq(cui_disambiguation.calcFreqOfCuis(training_list))

    # Train the model using the Note's data
    model.train(notes, grid, do_third=third)

    # Pickle dump
    print '\nserializing model to %s\n' % model_path
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)

    # return trained model
    return model
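
A minimal call sketch for the train() API documented above; the file paths are placeholders and assume the usual CliNER data/ layout:

training_list = [('data/train/txt/record-13.txt',
                  'data/train/con/record-13.con')]

model = train(training_list,
              model_path='models/run.model',
              format='i2b2',
              is_crf=True,   # first pass uses the CRF classifier
              grid=False)    # no grid search on the second pass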
Example #4
File: predict.py | Project: aussina/CliNER
def predict(files, model_path, output_dir, format):

    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)



    # Load model
    model = Model.load(model_path)


    # Tell user if not predicting
    if not files:
        print >>sys.stderr, "\n\tNote: You did not supply any input files\n"
        exit()


    # For each file, predict concept labels
    n = len(files)
    for i,txt in enumerate(sorted(files)):

        # Read the data into a Note object
        note = Note(format)
        note.read(txt)


        print '-' * 30
        print '\n\t%d of %d' % (i+1,n)
        print '\t', txt, '\n'


        # Predict concept labels
        labels = model.predict(note)

        # Get predictions in proper format
        extension = note.getExtension()
        output = note.write(labels)

        #print output

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension
        out_path = os.path.join(output_dir, fname)

        # Output the concept predictions
        print '\n\nwriting to: ', out_path
        with open(out_path, 'w') as f:
            print >>f, output
        print
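
A matching call sketch for predict() above, again with placeholder paths:

import glob

files = glob.glob('data/test_data/*.txt')
predict(files,
        model_path='models/run.model',
        output_dir='data/test_predictions',
        format='i2b2')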
Example #5
def train(training_list, model_path, format, is_crf=True, grid=False):

    # Read the data into a Note object
    notes = []
    for txt, con in training_list:
        note_tmp = Note(format)  # Create Note
        note_tmp.read(txt, con)  # Read data into Note
        notes.append(note_tmp)  # Add the Note to the list

    # file names
    if not notes:
        print 'Error: Cannot train on 0 files. Terminating train.'
        return 1

    # Create a Machine Learning model
    model = Model(is_crf=is_crf)

    # Train the model using the Note's data
    model.train(notes, grid)

    # Pickle dump
    print 'pickle dump'
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)
Example #6
File: predict.py | Project: aussina/CliNER
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-i",
        dest = "input",
        help = "The input files to predict",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*')
    )

    parser.add_argument("-o",
        dest = "output",
        help = "The directory to write the output",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_predictions')
    )

    parser.add_argument("-m",
        dest = "model",
        help = "The model to use for prediction",
        default = os.path.join(os.getenv('CLINER_DIR'), 'models/run.model')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-crf",
        dest = "with_crf",
        help = "Specify where to find crfsuite",

      default = None
    )

    args = parser.parse_args()


    # Parse arguments
    files = glob.glob(args.input)
    helper.mkpath(args.output)
    format = args.format


    # Predict
    predict(files, args.model, args.output, format=format)
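
Note that the defaults above evaluate os.path.join(os.getenv('CLINER_DIR'), ...) when the parser is built, which raises a TypeError if CLINER_DIR is unset. A guard such as the following sketch (not part of the original) avoids the confusing traceback:

import os
import sys

# os.getenv('CLINER_DIR') returns None when the variable is unset,
# and os.path.join(None, ...) then raises TypeError.
if os.getenv('CLINER_DIR') is None:
    sys.exit('Error: please set the CLINER_DIR environment variable')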
Example #7
File: train.py | Project: aussina/CliNER
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/txt/*')
    )

    parser.add_argument("-c",
        dest = "con",
        help = "The files that contain the labels for the training examples",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*')
    )

    parser.add_argument("-m",
        dest = "model",
        help = "Path to the model that should be generated",
        default = os.path.join(os.getenv('CLINER_DIR'), 'models/run.model')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-g",
        dest = "grid",
        help = "A flag indicating whether to perform a grid search",
        action = "store_true"
    )

    parser.add_argument("-no-crf",
        dest = "nocrf",
        help = "A flag indicating whether to use crfsuite for pass one.",
        action = "store_true"
    )

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf


    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)


    # data format
    format = args.format


    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)


    # Collect training data file paths
    txt_files_map = helper.map_files(txt_files) # ex. {'record-13': 'record-13.txt'}
    con_files_map = helper.map_files(con_files)

    training_list = []                          # ex. training_list =  [ ('record-13.txt', 'record-13.con') ]
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))


    # display file names (for user to see data was properly located)
    print '\n', training_list, '\n'


    # Train the model
    train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
Example #8
def main():

    # Argument Parser
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-txt",
        dest="txt",
        help="The files that contain the training examples",
    )

    parser.add_argument(
        "-annotations",
        dest="annotations",
        help="The files that contain the labels for the training examples",
    )

    parser.add_argument(
        "-out",
        dest="out",
        default=None,
        help="Directory to output data",
    )

    parser.add_argument(
        "-format",
        dest="format",
        help="Output format (%s)" % str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()

    # Parse arguments
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure text file is specified
    if not txt:
        print >> sys.stderr, '\n\tError: Must supply text file'
        print >> sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >> sys.stderr, '\n\tError: Given text file does not exist'
        print >> sys.stderr
        exit(1)

    # Ensure annotations are specified
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >> sys.stderr, '\n\tError: Must supply annotations'
        print >> sys.stderr
        exit(2)
    elif not os.path.exists(annotations):
        print >> sys.stderr, '\n\tError: Given annotation file does not exist'
        print >> sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >> sys.stderr, '\n\tError: annotation must be a supported format'
        print >> sys.stderr, '\t\t(.%s)' % str(' or .'.join(extensions))
        print >> sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >> sys.stderr, '\n\tError: Must specify supported output format'
        print >> sys.stderr, '\t\t(%s)' % str(' or '.join(
            Note.supportedFormats()))
        print >> sys.stderr
        exit(3)

    # Automatically find the input file format
    in_extension = os.path.splitext(annotations)[1][1:]
    for f, ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt, annotations)

    # Convert data to standard format
    internal_output = in_note.write_standard()

    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    #print internal_output

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt, tmp_file)

    # Output data
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)

    # Clean up (the with-block above already closed the output file)
    os.remove(tmp_file)
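
Stripped of argparse and error checking, the conversion round-trip in main() above reduces to the following sketch; the Note class comes from the CliNER package, and the formats and file names are placeholders:

import os
import sys
import tempfile

in_note = Note('i2b2')                       # input format, inferred from the extension
in_note.read('record-13.txt', 'record-13.con')

os_handle, tmp_file = tempfile.mkstemp(suffix='format_temp')
with open(tmp_file, 'w') as f:
    f.write(in_note.write_standard())        # project-internal interchange format
os.close(os_handle)

out_note = Note('semeval')                   # desired output format
out_note.read_standard('record-13.txt', tmp_file)
sys.stdout.write(out_note.write())

os.remove(tmp_file)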
Example #9
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        help="Text files that were used to generate predictions",
        dest="txt",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*'))

    parser.add_argument(
        "-c",
        help=
        "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest="con",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/'))

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(
            os.getenv('CLINER_DIR'),
            'data/reference_standard_for_test_data/concepts/'))

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]

    # List of gold data
    ref_files = glob.glob(os.path.join(args.ref, wildcard))
    ref_files_map = helper.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.con, wildcard))
    pred_files_map = helper.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append(
                (txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard

    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)

    if len(files) == 0:
        exit("No files to be evaluated")

    for txt, annotations, gold in files:

        # Read predictions and gold standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt, gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TODO: generate a cumulative total across all of the files;
        #modify the functions slightly to return the number of true positives etc.,
        #then call generate results

        exactResults = evaluate(deepcopy(referenceSpans),
                                deepcopy(predictedSpans),
                                exactMatch=True,
                                reportSeperately=False)

        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]

        inexactResults = evaluate(deepcopy(referenceSpans),
                                  deepcopy(predictedSpans),
                                  exactMatch=False,
                                  reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans),
                                     deepcopy(predictedSpans),
                                     exactMatch=False,
                                     reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan,
                                      MatrixInexactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans),
                                   deepcopy(predictedSpans),
                                   exactMatch=True,
                                   reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan,
                                      MatrixExactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                             falseNegativesExactSpan,
                                             falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                               falseNegativesInexactSpan,
                                               falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TODO: ensure the number of FP, FN, TP equals the number of predicted spans
    #TODO: the number of FP, FN, TP is not the same between exact and inexact

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact', confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)

    #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
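
For reference, generateResultsForExactSpans above presumably implements the standard precision/recall/F1 definitions over the accumulated counts; this self-contained sketch is an assumption about the helper, not its actual source:

def prf(tp, fn, fp):
    """Standard precision/recall/F1 from TP/FN/FP counts."""
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall    = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f_score   = (2 * precision * recall / (precision + recall)
                 if (precision + recall) else 0.0)
    return {'Precision': precision, 'Recall': recall, 'F Score': f_score}

print prf(tp=90, fn=10, fp=30)   # precision 0.75, recall 0.90, F ~ 0.82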
Example #10
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
    )

    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
    )

    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
    )

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")

    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")

    parser.add_argument(
        "-discontiguous_spans",
        dest="third",
        help="A flag indicating whether to have third/clustering pass",
        action="store_true")

    parser.add_argument(
        "-umls_disambiguation",
        dest="umls_disambiguation",
        action="store_true",
        help=
        "A flag indicating whether to disambiguate CUI ids for detected entities in semeval format",
    )
    """
    parser.add_argument("-unlabeled",
        dest = "unlabeled",
        help = "Path to dir containing unlabelled data used for unsupervised methods",
    )
    """

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf
    third = args.third

    # Error check: Ensure that file paths are specified
    if not args.txt:
        print >> sys.stderr, '\n\tError: Must provide text files'
        print >> sys.stderr, ''
        exit(1)
    if not args.con:
        print >> sys.stderr, '\n\tError: Must provide annotations for text files'
        print >> sys.stderr, ''
        exit(1)
    if not args.model:
        print >> sys.stderr, '\n\tError: Must provide valid path to store model'
        print >> sys.stderr, ''
        exit(1)
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        print >> sys.stderr, '\n\tError: Model dir does not exist: %s' % modeldir
        print >> sys.stderr, ''
        exit(1)

    if "PY4J_DIR_PATH" not in os.environ and args.third is True:
        exit(
            "please set environ var PY4J_DIR_PATH to the dir of the folder containg py4j<version>.jar"
        )

    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # data format
    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    if third is True and args.format == "i2b2":
        exit("i2b2 formatting does not support disjoint spans")

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Collect training data file paths
    txt_files_map = helper.map_files(
        txt_files)  # ex. {'record-13': 'record-13.txt'}
    con_files_map = helper.map_files(con_files)

    training_list = []  # ex. training_list = [ ('record-13.txt', 'record-13.con') ]
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # Train the model
    train(training_list,
          args.model,
          format,
          is_crf=is_crf,
          grid=args.grid,
          third=third,
          disambiguate=args.umls_disambiguation)
Example #11
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help = "Text files that were used to generate predictions",
        dest = "txt",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*')
    )

    parser.add_argument("-c",
        help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest = "con",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/')
    )

    parser.add_argument("-r",
        help = "The directory that contains reference gold standard concept files",
        dest = "ref",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/reference_standard_for_test_data/concepts/')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-o",
        help = "Write the evaluation to a file rather than STDOUT",
        dest = "output",
        default = None
    )

    # Parse command line arguments
    args = parser.parse_args()
    format = args.format


    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout


    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)


    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]


    # List of gold data
    ref_files = glob.glob( os.path.join(args.ref, wildcard) )
    ref_files_map = helper.map_files(ref_files)


    # List of predictions
    pred_files = glob.glob( os.path.join(args.con, wildcard) )
    pred_files_map = helper.map_files(pred_files)


    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))


    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard


    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)



    for txt, annotations, gold in files:

        # Read predictions and gold standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt,        gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TODO: generate a cumulative total across all of the files;
        #modify the functions slightly to return the number of true positives etc.,
        #then call generate results

        exactResults =  evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False)

        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]


        inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                             falseNegativesExactSpan,
                                             falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                               falseNegativesInexactSpan,
                                               falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TODO: ensure the number of FP, FN, TP equals the number of predicted spans
    #TODO: the number of FP, FN, TP is not the same between exact and inexact

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact'  , confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)


    #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(os.getenv('CLINER_DIR'),
                                             'data/train/txt/*'))

    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*'))

    parser.add_argument("-m",
                        dest="model",
                        help="Path to the model that should be generated",
                        default=os.path.join(os.getenv('CLINER_DIR'),
                                             'models/run.model'))

    parser.add_argument("-f",
                        dest="format",
                        help="Data format ( " +
                        ' | '.join(Note.supportedFormats()) + " )",
                        default='i2b2')

    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")

    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf

    # A list of text    file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # data format
    format = args.format

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Collect training data file paths
    txt_files_map = helper.map_files(
        txt_files)  # ex. {'record-13': 'record-13.txt'}
    con_files_map = helper.map_files(con_files)

    training_list = []  # ex. training_list = [ ('record-13.txt', 'record-13.con') ]
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # display file names (for user to see data was properly located)
    print '\n', training_list, '\n'

    # Train the model
    train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
Example #13
def main():

    # Argument Parser
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
    )

    parser.add_argument("-a",
        dest = "annotations",
        help = "The files that contain the labels for the training examples",
    )

    parser.add_argument("-o",
        dest = "out",
        default = None,
        help = "Directory to output data",
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Output format (%s)"%str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()


    # Parse arguments
    txt         = args.txt
    annotations = args.annotations
    out_file    = args.out
    format      = args.format


    # Ensure text file is specified
    if not txt:
        print >>sys.stderr, '\n\tError: Must supply text file'
        print >>sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >>sys.stderr, '\n\tError: Given text file does not exist'
        print >>sys.stderr
        exit(1)

    # Ensure annotations are specified
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >>sys.stderr, '\n\tError: Must supply annotations'
        print >>sys.stderr
        exit(2)
    elif not os.path.exists(annotations):
        print >>sys.stderr, '\n\tError: Given annotation file does not exist'
        print >>sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >>sys.stderr, '\n\tError: annotation must be a supported format'
        print >>sys.stderr, '\t\t(.%s)' %str(' or .'.join(extensions) )
        print >>sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >>sys.stderr, '\n\tError: Must specify supported output format'
        print >>sys.stderr, '\t\t(%s)' %str(' or '.join(Note.supportedFormats()))
        print >>sys.stderr
        exit(3)


    # Automatically find the input file format
    in_extension =  os.path.splitext(annotations)[1][1:]
    for f,ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt,annotations)


    # Convert data to standard format
    internal_output = in_note.write_standard()

    os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    #print internal_output

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt,tmp_file)


    # Output data
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)


    # Clean up (the with-block above already closed the output file)
    os.remove(tmp_file)
Example #14
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        dest="input",
        help="The input files to predict",
    )

    parser.add_argument(
        "-o",
        dest="output",
        help="The directory to write the output",
    )

    parser.add_argument(
        "-m",
        dest="model",
        help="The model to use for prediction",
    )

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument("-crf",
                        dest="with_crf",
                        help="Specify where to find crfsuite",
                        default=None)

    parser.add_argument(
        "-discontiguous_spans",
        dest="third",
        help="A flag indicating whether to have third/clustering pass",
        action="store_true")

    parser.add_argument(
        "-umls_disambiguation",
        dest="disambiguate",
        help=
        "A flag indicating whether to disambiguate CUI ID for identified entities in semeval",
        action="store_true")

    args = parser.parse_args()

    # Error check: Ensure that file paths are specified
    if not args.input:
        print >> sys.stderr, '\n\tError: Must provide text files\n'
        exit(1)
    if not args.output:
        print >> sys.stderr, '\n\tError: Must provide output directory\n'
        exit(1)
    if not args.model:
        print >> sys.stderr, '\n\tError: Must provide path to model\n'
        exit(1)
    if not os.path.exists(args.model):
        print >> sys.stderr, '\n\tError: Model does not exist: %s\n' % args.model
        exit(1)

    # Parse arguments
    files = glob.glob(args.input)
    helper.mkpath(args.output)

    third = args.third

    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    if third is True and args.format == "i2b2":
        exit("i2b2 formatting does not support disjoint spans")

    # Tell user if not predicting
    if not files:
        print >> sys.stderr, "\n\tNote: You did not supply any input files\n"
        exit()

    # Predict
    predict(files,
            args.model,
            args.output,
            format=format,
            third=third,
            disambiguate=args.disambiguate)
Example #15
def predict(files,
            model_path,
            output_dir,
            format,
            third=False,
            disambiguate=False):

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Load model
    model = Model.load(model_path)

    # Tell user if not predicting
    if not files:
        print >> sys.stderr, "\n\tNote: You did not supply any input files\n"
        exit()

    if enabled["UMLS"] is not None and disambiguate is True:
        from disambiguation import cui_disambiguation

    # For each file, predict concept labels
    n = len(files)
    for i, txt in enumerate(sorted(files)):

        note = Note(format)
        note.read(txt)

        # Output file
        extension = note.getExtension()
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension
        out_path = os.path.join(output_dir, fname)
        #if os.path.exists(out_path):
        #    print '\tWARNING: prediction file already exists (%s)' % out_path
        #    continue

        if format == "semevaL":
            note.setFileName(os.path.split(txt)[-1])

        # Predict concept labels
        labels = model.predict(note, third)

        # Get predictions in proper format
        output = note.write(labels)

        # TODO: make a flag to enable or disable looking up concept ids.
        if format == "semeval":

            print "\nencoding concept ids"
            if enabled["UMLS"] is not None and disambiguate is True:
                output = cui_disambiguation.disambiguate(
                    output, txt, model.get_cui_freq())

        # Output the concept predictions
        print '\n\nwriting to: ', out_path
        with open(out_path, 'w') as f:
            print >> f, output
        print
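
Finally, a call sketch for the extended predict() in Example #15; the flags mirror the command-line options shown earlier and the paths are placeholders:

import glob

predict(glob.glob('data/test_data/*.txt'),
        model_path='models/run.model',
        output_dir='data/test_predictions',
        format='semeval',      # CUI disambiguation only applies to semeval output
        third=True,            # enable the third/clustering pass
        disambiguate=True)     # look up CUIs via UMLS, when enabled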