def predict(files, model_path, output_dir, format): # Must specify output format if format not in Note.supportedFormats(): print >>sys.stderr, '\n\tError: Must specify output format' print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) print >>sys.stderr, '' exit(1) # Load model model = Model.load(model_path) # Tell user if not predicting if not files: print >>sys.stderr, "\n\tNote: You did not supply any input files\n" exit() # For each file, predict concept labels n = len(files) for i,txt in enumerate(sorted(files)): # Read the data into a Note object note = Note(format) note.read(txt) print '-' * 30 print '\n\t%d of %d' % (i+1,n) print '\t', txt, '\n' # Predict concept labels labels = model.predict(note) # Get predictions in proper format extension = note.getExtension() output = note.write(labels) #print output # Output file fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension out_path = os.path.join(output_dir, fname) # Output the concept predictions print '\n\nwriting to: ', out_path with open(out_path, 'w') as f: print >>f, output print
def main():
    """Convert an annotated note from one supported format to another.

    Command-line arguments:
        -txt          path to the text file (required, must exist)
        -annotations  path to the annotation file (required, must exist and
                      have one of Note.supportedFormatExtensions())
        -out          path the converted data is written to; standard
                      output is used when it is omitted
        -format       output format, one of Note.supportedFormats()

    The input format is inferred from the annotation file's extension.
    The data is first written in the internal standard format to a
    temporary file, then re-read into a Note of the requested format.

    Exit status: 1 for a text-file problem, 2 for an annotation-file
    problem, 3 for a missing or unsupported output format.
    """

    # Argument Parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-txt",
        dest="txt",
        help="The files that contain the training examples",
    )
    parser.add_argument(
        "-annotations",
        dest="annotations",
        help="The files that contain the labels for the training examples",
    )
    parser.add_argument(
        "-out",
        dest="out",
        default=None,
        help="Directory to output data",
    )
    parser.add_argument(
        "-format",
        dest="format",
        help="Output format (%s)" % str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure the text file is specified and exists
    if not txt:
        print >> sys.stderr, '\n\tError: Must supply text file'
        print >> sys.stderr
        sys.exit(1)
    elif not os.path.exists(txt):
        print >> sys.stderr, '\n\tError: Given text file does not exist'
        print >> sys.stderr
        sys.exit(1)

    # Ensure the annotation file is specified, exists and has a known extension
    extensions = Note.supportedFormatExtensions()
    in_extension = os.path.splitext(annotations)[1][1:] if annotations else ''
    if not annotations:
        print >> sys.stderr, '\n\tError: Must supply annotations'
        print >> sys.stderr
        sys.exit(2)
    elif not os.path.exists(annotations):
        # The original tested `txt` here, so a missing annotation file
        # was never reported.
        print >> sys.stderr, '\n\tError: Given annotation file does not exist'
        print >> sys.stderr
        sys.exit(2)
    elif in_extension not in extensions:
        print >> sys.stderr, '\n\tError: annotation must be a supported format'
        print >> sys.stderr, '\t\t(.%s)' % str(' or .'.join(extensions))
        print >> sys.stderr
        sys.exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >> sys.stderr, '\n\tError: Must specify supported output format'
        print >> sys.stderr, '\t\t(%s)' % str(' or '.join(
            Note.supportedFormats()))
        print >> sys.stderr
        sys.exit(3)

    # Automatically find the input file format.  As before, the last
    # format whose extension matches wins.
    in_format = None
    for fmt, ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = fmt
    if in_format is None:
        # Guard against the extension list and the format table disagreeing,
        # which previously surfaced as an unbound-variable error.
        print >> sys.stderr, '\n\tError: annotation must be a supported format'
        print >> sys.stderr
        sys.exit(2)

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt, annotations)

    # Convert data to standard format
    internal_output = in_note.write_standard()

    # NOTE(review): tmp_dir is not defined in this function; it is assumed
    # to be a module-level name -- confirm.
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    # Only the path is needed; release the descriptor right away so it
    # cannot leak if anything below fails.
    os.close(os_handle)

    try:
        with open(tmp_file, 'w') as tmp_f:
            tmp_f.write(internal_output)

        # Read internal standard data into new note with given output format
        out_note = Note(format)
        out_note.read_standard(txt, tmp_file)

        # Output data
        out = out_note.write()
        if out_file:
            with open(out_file, 'w') as out_f:
                out_f.write(out)
        else:
            sys.stdout.write(out)
    finally:
        # Clean up the temporary file even when conversion fails
        os.remove(tmp_file)
def main():
    """Convert an annotated note from one supported format to another.

    Command-line arguments:
        -t  path to the text file (required, must exist)
        -a  path to the annotation file (required, must exist and have
            one of Note.supportedFormatExtensions())
        -o  path the converted data is written to; standard output is
            used when it is omitted
        -f  output format, one of Note.supportedFormats()

    The input format is inferred from the annotation file's extension.
    The data is first written in the internal standard format to a
    temporary file, then re-read into a Note of the requested format.

    Exit status: 1 for a text-file problem, 2 for an annotation-file
    problem, 3 for a missing or unsupported output format.
    """

    # Argument Parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
    )
    parser.add_argument("-a",
        dest = "annotations",
        help = "The files that contain the labels for the training examples",
    )
    parser.add_argument("-o",
        dest = "out",
        default = None,
        help = "Directory to output data",
    )
    parser.add_argument("-f",
        dest = "format",
        help = "Output format (%s)"%str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure the text file is specified and exists
    if not txt:
        print >>sys.stderr, '\n\tError: Must supply text file'
        print >>sys.stderr
        sys.exit(1)
    elif not os.path.exists(txt):
        print >>sys.stderr, '\n\tError: Given text file does not exist'
        print >>sys.stderr
        sys.exit(1)

    # Ensure the annotation file is specified, exists and has a known extension
    extensions = Note.supportedFormatExtensions()
    in_extension = os.path.splitext(annotations)[1][1:] if annotations else ''
    if not annotations:
        print >>sys.stderr, '\n\tError: Must supply annotations'
        print >>sys.stderr
        sys.exit(2)
    elif not os.path.exists(annotations):
        # The original tested `txt` here, so a missing annotation file
        # was never reported.
        print >>sys.stderr, '\n\tError: Given annotation file does not exist'
        print >>sys.stderr
        sys.exit(2)
    elif in_extension not in extensions:
        print >>sys.stderr, '\n\tError: annotation must be a supported format'
        print >>sys.stderr, '\t\t(.%s)' %str(' or .'.join(extensions) )
        print >>sys.stderr
        sys.exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >>sys.stderr, '\n\tError: Must specify supported output format'
        print >>sys.stderr, '\t\t(%s)' %str(' or '.join(Note.supportedFormats()))
        print >>sys.stderr
        sys.exit(3)

    # Automatically find the input file format.  As before, the last
    # format whose extension matches wins.
    in_format = None
    for fmt,ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = fmt
    if in_format is None:
        # Guard against the extension list and the format table disagreeing,
        # which previously surfaced as an unbound-variable error.
        print >>sys.stderr, '\n\tError: annotation must be a supported format'
        print >>sys.stderr
        sys.exit(2)

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt,annotations)

    # Convert data to standard format
    internal_output = in_note.write_standard()

    # NOTE(review): tmp_dir is not defined in this function; it is assumed
    # to be a module-level name -- confirm.
    os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    # Only the path is needed; release the descriptor right away so it
    # cannot leak if anything below fails.
    os.close(os_handle)

    try:
        with open(tmp_file, 'w') as tmp_f:
            tmp_f.write(internal_output)

        # Read internal standard data into new note with given output format
        out_note = Note(format)
        out_note.read_standard(txt,tmp_file)

        # Output data
        out = out_note.write()
        if out_file:
            with open(out_file, 'w') as out_f:
                out_f.write(out)
        else:
            sys.stdout.write(out)
    finally:
        # Clean up the temporary file even when conversion fails
        os.remove(tmp_file)
def predict(files, model_path, output_dir, format, third=False, disambiguate=False): # Must specify output format if format not in Note.supportedFormats(): print >> sys.stderr, '\n\tError: Must specify output format' print >> sys.stderr, '\tAvailable formats: ', ' | '.join( Note.supportedFormats()) print >> sys.stderr, '' exit(1) # Load model model = Model.load(model_path) # Tell user if not predicting if not files: print >> sys.stderr, "\n\tNote: You did not supply any input files\n" exit() if enabled["UMLS"] is not None and disambiguate is True: from disambiguation import cui_disambiguation # For each file, predict concept labels n = len(files) for i, txt in enumerate(sorted(files)): note = Note(format) note.read(txt) # Output file extension = note.getExtension() fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension out_path = os.path.join(output_dir, fname) #if os.path.exists(out_path): # print '\tWARNING: prediction file already exists (%s)' % out_path # continue if format == "semevaL": note.setFileName(os.path.split(txt)[-1]) # Predict concept labels labels = model.predict(note, third) # Get predictions in proper format output = note.write(labels) # TODO: make a flag to enable or disable looking up concept ids. if format == "semeval": print "\nencoding concept ids" if enabled["UMLS"] is not None and disambiguate is True: output = cui_disambiguation.disambiguate( output, txt, model.get_cui_freq()) # Output the concept predictions print '\n\nwriting to: ', out_path with open(out_path, 'w') as f: print >> f, output print