def build_seqsinfo_from_seqfile(self, seq_file, data_parser_options, num_seqs=numpy.inf):
    """prepare and process sequences to disk and return an info dictionary about the parsed sequences

       Args:
           seq_file: string representing the path to the sequence file
           data_parser_options: dictionary containing options to be passed to the
                                :func:`read_file` method of the :class:`DataFileParser` class
           num_seqs: integer, maximum number of sequences to read from the file
                     (default numpy.inf -- read the whole file)
    """
    seq_representer = self.seq_representer
    # create working directory
    corpus_name = "reference_corpus_" + generate_datetime_str()
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = False
    # build seqs_info by parsing the sequences from file iteratively
    seqs_info = {}
    counter = 1
    dparser = DataFileParser()
    for seq in self.get_seqs_from_file(seq_file, dparser, data_parser_options):
        # fall back to the running counter when a sequence carries no id
        seq_id = seq.id if hasattr(seq, "id") else counter
        seqs_info.update(
            seq_representer.prepare_seqs(
                {seq_id: seq},
                corpus_name,
                working_dir,
                unique_id,
                log_progress=False,
            )
        )
        print("{} sequences have been processed".format(counter))
        if counter >= num_seqs:
            break
        counter += 1
    return seqs_info
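# Usage sketch (not part of the original module): a minimal, hedged example
# of driving build_seqsinfo_from_seqfile. The workflow instance, the file
# path, and the parser-option keys below are illustrative assumptions;
# consult DataFileParser.read_file for the options it actually accepts.
def example_build_seqsinfo_from_seqfile(workflow):
    """Prepare at most 500 sequences from a hypothetical training file."""
    data_parser_options = {
        "header": "main",    # assumed key: how the column header is declared
        "y_ref": True,       # assumed key: file carries reference labels
        "column_sep": "\t",  # assumed key: column separator in the file
    }
    seqs_info = workflow.build_seqsinfo_from_seqfile(
        "train_data.txt", data_parser_options, num_seqs=500
    )
    print("prepared {} sequences".format(len(seqs_info)))
    return seqs_info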
def build_seqsinfo(self, seq_file):
    """prepare and process sequences to disk and return an info dictionary about the parsed sequences

       Args:
           seq_file: string representing the path to the sequence file
    """
    seq_representer = self.seq_representer
    # create working directory
    corpus_name = "reference_corpus_" + generate_datetime_str()
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = False
    # build seqs_info by parsing the sequences from file iteratively
    seqs_info = {}
    counter = 1
    for seq in self.get_seqs_from_file(seq_file):
        # fall back to the running counter when a sequence carries no id
        seq_id = seq.id if hasattr(seq, "id") else counter
        seqs_info.update(
            seq_representer.prepare_seqs(
                {seq_id: seq},
                corpus_name,
                working_dir,
                unique_id,
                log_progress=False,
            )
        )
        print("{} sequences have been processed".format(counter))
        counter += 1
    return seqs_info
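# The decoding paths in use_model/eval_model below slice sequences into
# fixed-size windows (start_ind/stop_ind). A compact standalone sketch of
# that batching idiom, with a hypothetical helper name:
def iter_seqsinfo_batches(seqs_info, batch_size=1000):
    """Yield sub-dictionaries of seqs_info, batch_size entries at a time."""
    seqs_id = list(seqs_info.keys())
    for start_ind in range(0, len(seqs_id), batch_size):
        batch_ids = seqs_id[start_ind:start_ind + batch_size]
        yield {seq_id: seqs_info[seq_id] for seq_id in batch_ids}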
def use_model(self, savedmodel_dir, options):
    """use a trained model for decoding and performance measure evaluation

       Args:
           savedmodel_dir: string representing the path to the saved model directory
           options: dictionary of decoding/evaluation options (e.g. 'seqbatch_size',
                    'model_eval', 'metric', 'exclude_states', 'seqs_info' or 'seq_file',
                    'file_name', 'sep', 'beam_size')
    """
    # load learned models
    model_dir = savedmodel_dir
    # revive/generate learned model
    crf_model = self.get_learned_crf(model_dir)
    # parse the arguments in options
    seqbatch_size = options.get("seqbatch_size")
    if not seqbatch_size:
        seqbatch_size = 1000
    # check if model evaluation is requested
    model_eval = options.get("model_eval")
    if model_eval:
        evaluator = SeqDecodingEvaluator(crf_model.model)
        perf_metric = options.get("metric")
        if not perf_metric:
            perf_metric = "f1"
        exclude_states = options.get("exclude_states")
        if not exclude_states:
            exclude_states = []
    if options.get("seqs_info"):
        # decode already-prepared sequences in fixed-size batches
        seqs_info = options.get("seqs_info")
        seqs_id = list(seqs_info.keys())
        start_ind = 0
        stop_ind = seqbatch_size
        while start_ind < len(seqs_id):
            batch_seqsinfo = {
                seq_id: seqs_info[seq_id]
                for seq_id in seqs_id[start_ind:stop_ind]
            }
            seqs_pred = crf_model.decode_seqs(
                "viterbi",
                model_dir,
                seqs_info=batch_seqsinfo,
                file_name=options.get("file_name"),
                sep=options.get("sep"),
                beam_size=options.get("beam_size"),
            )
            if model_eval:
                Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                if start_ind == 0:
                    taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                else:
                    taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
            # TODO: adjust the batch size to the number of remaining sequences
            start_ind += seqbatch_size
            stop_ind += seqbatch_size
    elif options.get("seq_file"):
        # parse sequences from file and decode them in batches
        flag = False
        seq_file = options.get("seq_file")
        data_parser_options = options.get("data_parser_options")
        num_seqs = options.get("num_seqs")
        if not num_seqs:
            num_seqs = numpy.inf
        # the folder name where intermediary sequences and data are stored
        procseqs_foldername = "processed_seqs_" + generate_datetime_str()
        dparser = DataFileParser()
        seqs_dict = {}
        bcounter = 1
        seq_counter = 1
        for seq in self.get_seqs_from_file(seq_file, dparser, data_parser_options):
            seqs_dict[seq_counter] = seq
            if bcounter >= seqbatch_size:
                seqs_pred = crf_model.decode_seqs(
                    "viterbi",
                    model_dir,
                    seqs_dict=seqs_dict,
                    procseqs_foldername=procseqs_foldername,
                    file_name=options.get("file_name"),
                    sep=options.get("sep"),
                    beam_size=options.get("beam_size"),
                )
                bcounter = 0
                seqs_dict.clear()
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    # initialize the confusion matrix on the first batch,
                    # accumulate on subsequent batches
                    if not flag:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                        flag = True
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
            if seq_counter >= num_seqs:
                break
            bcounter += 1
            seq_counter += 1
        if len(seqs_dict):
            # decode the remaining sequences
            seqs_pred = crf_model.decode_seqs(
                "viterbi",
                model_dir,
                seqs_dict=seqs_dict,
                procseqs_foldername=procseqs_foldername,
                file_name=options.get("file_name"),
                sep=options.get("sep"),
                beam_size=options.get("beam_size"),
            )
            if model_eval:
                Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                if flag:
                    taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
                else:
                    taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
    if model_eval:
        performance = evaluator.get_performance_metric(
            taglevel_perf, perf_metric, exclude_states=exclude_states
        )
        return {perf_metric: performance, "taglevel_confusion_matrix": taglevel_perf}
    return {}
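# Usage sketch (not part of the original module): evaluating a saved model
# on a sequence file through use_model. The option keys mirror what the
# method reads above; the model directory, file names, and parser options
# are hypothetical placeholders.
def example_use_model(workflow):
    """Decode a hypothetical test file and report tag-level F1."""
    options = {
        "seq_file": "test_data.txt",                # hypothetical test file
        "data_parser_options": {"header": "main"},  # assumed parser options
        "seqbatch_size": 1000,
        "model_eval": True,
        "metric": "f1",
        "exclude_states": [],
        "file_name": "decoded_seqs.txt",            # decoding output file
        "sep": "\t",
    }
    results = workflow.use_model("saved_model_dir", options)
    print("f1 = {}".format(results.get("f1")))
    return results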
def eval_model(self, savedmodel_dir, options):
    """evaluate a trained model by decoding sequences and computing performance measures

       Args:
           savedmodel_dir: string representing the path to the saved model directory
           options: dictionary of decoding/evaluation options (see :func:`use_model`)
    """
    # load learned models
    model_dir = savedmodel_dir
    # revive/generate learned model
    crf_model = self.get_learned_crf(model_dir)
    # parse the arguments in options
    seqbatch_size = options.get("seqbatch_size")
    if not seqbatch_size:
        seqbatch_size = 1000
    # check if model evaluation is requested
    model_eval = options.get("model_eval")
    if model_eval:
        evaluator = SeqDecodingEvaluator(crf_model.model)
        perf_metric = options.get("metric")
        if not perf_metric:
            perf_metric = "f1"
        exclude_states = options.get("exclude_states")
        if not exclude_states:
            exclude_states = []
    if options.get("seqs_info"):
        # decode already-prepared sequences in fixed-size batches
        seqs_info = options.get("seqs_info")
        seqs_id = list(seqs_info.keys())
        start_ind = 0
        stop_ind = seqbatch_size
        while start_ind < len(seqs_id):
            batch_seqsinfo = {
                seq_id: seqs_info[seq_id]
                for seq_id in seqs_id[start_ind:stop_ind]
            }
            seqs_pred = crf_model.decode_seqs(
                "viterbi",
                model_dir,
                seqs_info=batch_seqsinfo,
                file_name=options.get("file_name"),
                sep=options.get("sep"),
                beam_size=options.get("beam_size"),
            )
            if model_eval:
                Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                if start_ind == 0:
                    taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                else:
                    taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
            # TODO: adjust the batch size to the number of remaining sequences
            start_ind += seqbatch_size
            stop_ind += seqbatch_size
    elif options.get("seq_file"):
        # parse sequences from file and decode them in batches
        flag = False
        seq_file = options.get("seq_file")
        # the folder name where intermediary sequences and data are stored
        procseqs_foldername = "processed_seqs_" + generate_datetime_str()
        seqs_dict = {}
        bcounter = 1
        seq_counter = 1
        for seq in self.get_seqs_from_file(seq_file):
            seqs_dict[seq_counter] = seq
            if bcounter >= seqbatch_size:
                seqs_pred = crf_model.decode_seqs(
                    "viterbi",
                    model_dir,
                    seqs_dict=seqs_dict,
                    procseqs_foldername=procseqs_foldername,
                    file_name=options.get("file_name"),
                    sep=options.get("sep"),
                    beam_size=options.get("beam_size"),
                )
                bcounter = 0
                seqs_dict.clear()
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    # initialize the confusion matrix on the first batch,
                    # accumulate on subsequent batches
                    if not flag:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                        flag = True
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
            bcounter += 1
            seq_counter += 1
        if len(seqs_dict):
            # decode the remaining sequences
            seqs_pred = crf_model.decode_seqs(
                "viterbi",
                model_dir,
                seqs_dict=seqs_dict,
                procseqs_foldername=procseqs_foldername,
                file_name=options.get("file_name"),
                sep=options.get("sep"),
                beam_size=options.get("beam_size"),
            )
            if model_eval:
                Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                if flag:
                    taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
                else:
                    taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
    if model_eval:
        performance = evaluator.get_performance_metric(
            taglevel_perf, perf_metric, exclude_states=exclude_states
        )
        return {perf_metric: performance, "taglevel_confusion_matrix": taglevel_perf}
    return {}
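# Both evaluation paths above fold per-batch confusion matrices into a
# running total, using a flag to special-case the first batch. A compact
# standalone sketch of that accumulation pattern (names are illustrative):
def accumulate_confusion(evaluator, batched_Y_seqs):
    """Sum per-state confusion matrices across batches of gold/pred labels."""
    taglevel_perf = None
    for Y_seqs_dict in batched_Y_seqs:
        conf = evaluator.compute_states_confmatrix(Y_seqs_dict)
        # the first batch initializes the total, later batches accumulate
        taglevel_perf = conf if taglevel_perf is None else taglevel_perf + conf
    return taglevel_perf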