Example #1
0
    def build_seqsinfo_from_seqfile(self, seq_file, data_parser_options, num_seqs=numpy.inf):
        """prepares and process sequences to disk and return info dictionary about the parsed sequences

           Args:
               seq_file: string representing the path to the sequence file
               data_parser_options: dictionary containing options to be passed
                                    to :func:`read_file` method of :class:`DataFileParser` class
               num_seqs: integer, maximum number of sequences to read from file
                         (default numpy.inf -- means read all file)

        """
        representer = self.seq_representer
        # set up a fresh working directory for this parsing run
        corpus_name = "reference_corpus_" + generate_datetime_str()
        working_dir = create_directory("working_dir", self.root_dir)
        self.working_dir = working_dir
        unique_id = False
        seqs_info = {}
        dparser = DataFileParser()
        # parse sequences one at a time, accumulating their info dictionaries
        for pos, seq in enumerate(self.get_seqs_from_file(seq_file, dparser, data_parser_options), 1):
            # prefer the sequence's own id; fall back to its running position
            seq_id = getattr(seq, 'id', pos)
            prepared = representer.prepare_seqs({seq_id: seq}, corpus_name, working_dir, unique_id, log_progress=False)
            seqs_info.update(prepared)
            print("{} sequences have been processed".format(pos))
            if pos >= num_seqs:
                break

        return seqs_info
Example #2
0
 def build_seqsinfo(self, seq_file):
     """Parse all sequences from *seq_file*, persist them via the sequence representer and return their info dictionary."""
     representer = self.seq_representer
     # fresh working directory for this corpus
     corpus_name = "reference_corpus_" + generate_datetime_str()
     working_dir = create_directory("working_dir", self.root_dir)
     self.working_dir = working_dir
     unique_id = False
     seqs_info = {}
     # process sequences one by one, keyed by their own id or running position
     for pos, seq in enumerate(self.get_seqs_from_file(seq_file), 1):
         seq_id = getattr(seq, "id", pos)
         prepared = representer.prepare_seqs(
             {seq_id: seq},
             corpus_name,
             working_dir,
             unique_id,
             log_progress=False,
         )
         seqs_info.update(prepared)
         print("{} sequences have been processed".format(pos))
     return seqs_info
Example #3
0
    def use_model(self, savedmodel_dir, options):
        """use trained model for decoding and performance measure evaluation

           Args:
               savedmodel_dir: string, directory of a previously trained/saved model
               options: dictionary of decoding options; keys read here include
                        'seqbatch_size', 'model_eval', 'metric', 'exclude_states',
                        'seqs_info' OR 'seq_file' (with 'data_parser_options',
                        'num_seqs'), plus 'file_name', 'sep', 'beam_size'

           Returns:
               dict mapping the metric name to its value plus the tag-level
               confusion matrix when evaluation was performed, else empty dict
        """
        # load learned models
        model_dir = savedmodel_dir
        # revive/generate learned model
        crf_model = self.get_learned_crf(model_dir)

        # parse the arguments in options
        seqbatch_size = options.get("seqbatch_size")
        if not seqbatch_size:
            seqbatch_size = 1000
        # check if model evaluation is requested
        model_eval = options.get('model_eval')
        if model_eval:
            evaluator = SeqDecodingEvaluator(crf_model.model)
            perf_metric = options.get('metric')
            if not perf_metric:
                perf_metric = 'f1'
            exclude_states = options.get('exclude_states')
            if not exclude_states:
                exclude_states = []
        # confusion matrix accumulated across batches; stays None until the first
        # batch is evaluated -- fixes an UnboundLocalError that occurred when
        # model_eval was requested but neither 'seqs_info' nor 'seq_file' was given
        taglevel_perf = None

        if options.get('seqs_info'):
            # decode already-prepared sequences in batches of seqbatch_size
            seqs_info = options.get('seqs_info')
            seqs_id = list(seqs_info.keys())
            start_ind = 0
            stop_ind = seqbatch_size
            while start_ind < len(seqs_id):
                batch_seqsinfo = {seq_id: seqs_info[seq_id] for seq_id in seqs_id[start_ind:stop_ind]}
                seqs_pred = crf_model.decode_seqs("viterbi", model_dir, seqs_info=batch_seqsinfo,
                                                  file_name=options.get('file_name'), sep=options.get('sep'),
                                                  beam_size=options.get('beam_size'))
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    if taglevel_perf is None:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
                start_ind += seqbatch_size
                stop_ind += seqbatch_size

        elif options.get('seq_file'):
            # parse sequences from file and decode them in batches as they arrive
            seq_file = options.get('seq_file')
            data_parser_options = options.get("data_parser_options")
            num_seqs = options.get("num_seqs")
            if not num_seqs:
                num_seqs = numpy.inf
            # the folder name where intermediary sequences and data are stored
            procseqs_foldername = "processed_seqs_" + generate_datetime_str()
            dparser = DataFileParser()
            seqs_dict = {}
            bcounter = 1
            seq_counter = 1
            for seq in self.get_seqs_from_file(seq_file, dparser, data_parser_options):
                seqs_dict[seq_counter] = seq
                if bcounter >= seqbatch_size:
                    seqs_pred = crf_model.decode_seqs("viterbi", model_dir, seqs_dict=seqs_dict,
                                                      procseqs_foldername=procseqs_foldername, file_name=options.get('file_name'),
                                                      sep=options.get('sep'), beam_size=options.get('beam_size'))
                    bcounter = 0
                    seqs_dict.clear()
                    if model_eval:
                        Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                        # None-check replaces the original flag/seq_counter
                        # bookkeeping for detecting the first evaluated batch
                        if taglevel_perf is None:
                            taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                        else:
                            taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
                if seq_counter >= num_seqs:
                    break
                bcounter += 1
                seq_counter += 1
            if len(seqs_dict):
                # decode the remaining sequences
                seqs_pred = crf_model.decode_seqs("viterbi", model_dir, seqs_dict=seqs_dict,
                                                  procseqs_foldername=procseqs_foldername, file_name=options.get('file_name'),
                                                  sep=options.get('sep'), beam_size=options.get('beam_size'))
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    if taglevel_perf is None:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(Y_seqs_dict)
        if model_eval and taglevel_perf is not None:
            performance = evaluator.get_performance_metric(taglevel_perf, perf_metric, exclude_states=exclude_states)
            return({perf_metric:performance, 'taglevel_confusion_matrix':taglevel_perf})
        return({})
Example #4
0
    def eval_model(self, savedmodel_dir, options):
        """use trained model for decoding and optional performance evaluation

           Args:
               savedmodel_dir: string, directory of a previously trained/saved model
               options: dictionary of decoding options; keys read here include
                        'seqbatch_size', 'model_eval', 'metric', 'exclude_states',
                        'seqs_info' OR 'seq_file', plus 'file_name', 'sep',
                        'beam_size'

           Returns:
               dict mapping the metric name to its value plus the tag-level
               confusion matrix when evaluation was performed, else empty dict
        """
        # load learned models
        model_dir = savedmodel_dir
        # revive/generate learned model
        crf_model = self.get_learned_crf(model_dir)

        # parse the arguments in kwargs
        seqbatch_size = options.get("seqbatch_size")
        if not seqbatch_size:
            seqbatch_size = 1000
        # check if model evaluation is requested
        model_eval = options.get("model_eval")
        if model_eval:
            evaluator = SeqDecodingEvaluator(crf_model.model)
            perf_metric = options.get("metric")
            if not perf_metric:
                perf_metric = "f1"
            exclude_states = options.get("exclude_states")
            if not exclude_states:
                exclude_states = []
        # confusion matrix accumulated across batches; stays None until the first
        # batch is evaluated -- fixes an UnboundLocalError that occurred when
        # model_eval was requested but neither 'seqs_info' nor 'seq_file' was given
        taglevel_perf = None

        if options.get("seqs_info"):
            # decode already-prepared sequences in batches of seqbatch_size
            seqs_info = options.get("seqs_info")
            seqs_id = list(seqs_info.keys())
            start_ind = 0
            stop_ind = seqbatch_size
            while start_ind < len(seqs_id):
                batch_seqsinfo = {
                    seq_id: seqs_info[seq_id] for seq_id in seqs_id[start_ind:stop_ind]
                }
                seqs_pred = crf_model.decode_seqs(
                    "viterbi",
                    model_dir,
                    seqs_info=batch_seqsinfo,
                    file_name=options.get("file_name"),
                    sep=options.get("sep"),
                    beam_size=options.get("beam_size"),
                )
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    if taglevel_perf is None:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(
                            Y_seqs_dict
                        )
                start_ind += seqbatch_size
                stop_ind += seqbatch_size

        elif options.get("seq_file"):
            # parse sequences from file and decode them in batches as they arrive
            seq_file = options.get("seq_file")
            # the folder name where intermediary sequences and data are stored
            procseqs_foldername = "processed_seqs_" + generate_datetime_str()
            seqs_dict = {}
            bcounter = 1
            seq_counter = 1
            for seq in self.get_seqs_from_file(seq_file):
                seqs_dict[seq_counter] = seq
                if bcounter >= seqbatch_size:
                    seqs_pred = crf_model.decode_seqs(
                        "viterbi",
                        model_dir,
                        seqs_dict=seqs_dict,
                        procseqs_foldername=procseqs_foldername,
                        file_name=options.get("file_name"),
                        sep=options.get("sep"),
                        beam_size=options.get("beam_size"),
                    )
                    bcounter = 0
                    seqs_dict.clear()
                    if model_eval:
                        Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                        # None-check replaces the original flag/seq_counter
                        # bookkeeping for detecting the first evaluated batch
                        if taglevel_perf is None:
                            taglevel_perf = evaluator.compute_states_confmatrix(
                                Y_seqs_dict
                            )
                        else:
                            taglevel_perf += evaluator.compute_states_confmatrix(
                                Y_seqs_dict
                            )
                bcounter += 1
                seq_counter += 1
            if len(seqs_dict):
                # decode the remaining sequences
                seqs_pred = crf_model.decode_seqs(
                    "viterbi",
                    model_dir,
                    seqs_dict=seqs_dict,
                    procseqs_foldername=procseqs_foldername,
                    file_name=options.get("file_name"),
                    sep=options.get("sep"),
                    beam_size=options.get("beam_size"),
                )
                if model_eval:
                    Y_seqs_dict = self.map_pred_to_ref_seqs(seqs_pred)
                    if taglevel_perf is None:
                        taglevel_perf = evaluator.compute_states_confmatrix(Y_seqs_dict)
                    else:
                        taglevel_perf += evaluator.compute_states_confmatrix(
                            Y_seqs_dict
                        )
        if model_eval and taglevel_perf is not None:
            performance = evaluator.get_performance_metric(
                taglevel_perf, perf_metric, exclude_states=exclude_states
            )
            return {
                perf_metric: performance,
                "taglevel_confusion_matrix": taglevel_perf,
            }
        return {}