예제 #1
0
    def traineval_folds(self, data_split, **kwargs):
        """Train one model per fold and evaluate it on the fold's splits.

        Args:
            data_split: dictionary mapping a fold identifier to a dictionary
                holding the sequence ids under the ``"train"`` key and,
                optionally, under the ``"test"`` key
            kwargs: keyword options. The ones read here are:

                - ``optimization_options`` (required): dictionary with a
                  ``"method"`` key; forwarded to ``self.train_model``
                - ``load_info_fromdisk``: non-negative ``int`` forwarded to
                  ``self.build_crf_model``; anything else falls back to 10
                - ``file_name``: when given, it is prefixed with the fold
                  name before each evaluation

                The (updated) dictionary is forwarded to ``self.use_model``.

        Returns:
            list with one entry per evaluated split: the value returned by
            ``self.use_model`` extended with the ``"fold_name"`` and
            ``"model_dir"`` keys.

        Raises:
            ValueError: if ``optimization_options`` is missing or empty.
        """
        default_info_fromdisk = 10
        seqs_info = self.seqs_info

        models_info = []
        # corpus directory is two levels above the global features directory
        ref_corpusdir = os.path.dirname(
            os.path.dirname(seqs_info[1]["globalfeatures_dir"])
        )

        info_fromdisk = kwargs.get("load_info_fromdisk")
        # exact type check on purpose: booleans must not be accepted as counts
        if type(info_fromdisk) is not int or info_fromdisk < 0:
            info_fromdisk = default_info_fromdisk

        # get optimization options
        optimization_options = kwargs.get("optimization_options")
        if not optimization_options:
            # raising a plain string is itself a TypeError; use a real exception
            raise ValueError("optimization_options need to be specified !!")
        # these methods require full parsing when building the model
        full_parsing = optimization_options["method"] in {"COLLINS-PERCEPTRON", "SAPO"}

        # check if file name is specified
        file_name = kwargs.get("file_name")
        for fold in data_split:
            # "train" must come first: it produces the model evaluated on both splits
            for dtype in ("train", "test"):
                fold_seqs_id = data_split[fold].get(dtype)
                if dtype == "train":
                    crf_model = self.build_crf_model(
                        fold_seqs_id, "f{}".format(fold), info_fromdisk, full_parsing
                    )
                    # get the directory of the trained model
                    savedmodel_dir = self.train_model(
                        fold_seqs_id, crf_model, optimization_options
                    )
                if not fold_seqs_id:
                    # nothing to evaluate for this split
                    continue

                # evaluate on the current data fold
                fold_name = "{}_f{}".format(dtype, fold)
                kwargs["seqs_info"] = {
                    seq_id: seqs_info[seq_id] for seq_id in fold_seqs_id
                }
                if file_name:
                    # add the fold name as prefix
                    kwargs["file_name"] = fold_name + "_" + file_name

                res = self.use_model(savedmodel_dir, kwargs)
                res["fold_name"] = fold_name
                res["model_dir"] = savedmodel_dir
                models_info.append(res)
        # save workflow trainer instance on disk
        ReaderWriter.dump_data(self, os.path.join(ref_corpusdir, "workflow_trainer"))
        return models_info
예제 #2
0
 def save(self, folder_dir):
     """Dump the scaling info and the scaling method on disk.

        Each item is written through ``ReaderWriter.dump_data`` to a file
        inside ``folder_dir`` named after its key.

        Args:
            folder_dir: string representing directory where files are pickled/dumped
     """
     # (file name, object) pairs, written in this order
     to_dump = (
         ('AS_scalinginfo', self.scaling_info),
         ('AS_method', self.method),
     )
     for fname, payload in to_dump:
         target_path = os.path.join(folder_dir, fname)
         ReaderWriter.dump_data(payload, target_path)
예제 #3
0
    def seq_parsing_workflow(self, split_options, **kwargs):
        """Parse sequences, extract their features and split them into folds.

        Args:
            split_options: options forwarded to ``self.split_dataset``
            kwargs: keyword options. The ones read here are:

                - ``seq_file``: sequence file to parse; takes precedence
                  over ``seqs`` when both are given
                - ``data_parser_options``: forwarded to
                  ``self.build_seqsinfo_from_seqfile`` (file input only)
                - ``num_seqs``: number of sequences to read from the file;
                  a missing/falsy value means read the whole file
                - ``seqs``: sequences already loaded in memory
                - ``full_parsing``: forwarded as ``dump_gfeat_perboundary``
                  to the global features extraction

        Returns:
            the data split produced by ``self.split_dataset``; it is also
            dumped on disk under the name ``data_split`` in the directory two
            levels above the global features directory of sequence ``1``.

        Raises:
            ValueError: if neither ``seq_file`` nor ``seqs`` is provided.
        """
        # fail fast: without a source of sequences ``seqs_info`` cannot be built
        seq_file = kwargs.get("seq_file")
        seqs = kwargs.get("seqs")
        if not seq_file and not seqs:
            raise ValueError("either seq_file or seqs need to be specified !!")

        # get attribute extractor
        attr_extractor = self.aextractor_obj
        # get feature extractor
        f_extractor = self.fextractor_obj

        # create sequence representer
        seq_representer = SeqsRepresenter(attr_extractor, f_extractor)
        # set before building seqs_info, as in the original statement order
        self.seq_representer = seq_representer

        if seq_file:
            # get the data file parser options
            data_parser_options = kwargs.get("data_parser_options")
            # default: read the whole file
            num_seqs = kwargs.get("num_seqs") or numpy.inf
            # build the seqs_info by parsing the sequences from file iteratively
            seqs_info = self.build_seqsinfo_from_seqfile(
                seq_file, data_parser_options, num_seqs=num_seqs
            )
        else:
            seqs_info = self.build_seqsinfo_from_seqs(seqs)

        seqs_id = list(seqs_info.keys())
        self.seqs_id = seqs_id

        # preprocess and generate attributes in case of segments with length >1
        # or in case of scaling of continuous attributes is needed
        seq_representer.preprocess_attributes(seqs_id, seqs_info)

        # check if we want to generate global features per boundary too
        # this is mostly used in perceptron/search based training
        full_parsing = kwargs.get("full_parsing")
        # extract global features F(X,Y)
        seq_representer.extract_seqs_globalfeatures(
            seqs_id, seqs_info, dump_gfeat_perboundary=full_parsing
        )

        # save the link to seqs_info and seq_representer as instance variables
        # because the seqs_info and seq_representer is updated
        self.seqs_info = seqs_info
        self.seq_representer = seq_representer

        # split dataset according to the specified split options
        data_split = self.split_dataset(seqs_info, split_options)

        # save the datasplit dictionary on disk
        gfeatures_dir = seqs_info[1]["globalfeatures_dir"]
        ref_corpusdir = os.path.dirname(os.path.dirname(gfeatures_dir))
        ReaderWriter.dump_data(data_split, os.path.join(ref_corpusdir, "data_split"))
        return data_split
예제 #4
0
    def seq_parsing_workflow(self, seqs, split_options):
        """Prepare in-memory sequences for learning and split them into folds.

        Args:
            seqs: indexable collection of sequences; the element at position
                ``i`` is registered under the id ``i + 1``
            split_options: options forwarded to ``self.split_dataset``

        Returns:
            the data split produced by ``self.split_dataset``; it is also
            dumped on disk under the name ``data_split`` in the directory two
            levels above the global features directory of sequence ``1``.
        """
        # working directory is created under the trainer's root directory
        workspace = create_directory("working_dir", self.root_dir)
        self.working_dir = workspace

        # map every sequence to a unique id (1-based position)
        id_to_seq = {pos + 1: seqs[pos] for pos in range(len(seqs))}
        seqs_id = list(id_to_seq)
        self.seqs_id = seqs_id

        # attribute extractor, then the feature extractor built on its description
        attr_extractor = self.aextractor_class()
        attr_scaling = self.scaling_method
        f_extractor = self.fextractor_class(
            self.template_xy, self.template_y, attr_extractor.attr_desc
        )

        # sequence representer ties both extractors together
        representer = SeqsRepresenter(attr_extractor, f_extractor)

        # seqs_info holds the information about the parsed sequences
        seqs_info = representer.prepare_seqs(
            id_to_seq, "reference_corpus", workspace, True
        )

        # preprocess and generate attributes in case of segments with length >1
        # or in case scaling of attributes is needed
        representer.preprocess_attributes(seqs_id, seqs_info, method=attr_scaling)

        # extract global features F(X,Y); the flag is set for these two methods
        needs_percep = self.optimization_options["method"] in {
            "COLLINS-PERCEPTRON",
            "SAPO",
        }
        representer.extract_seqs_globalfeatures(seqs_id, seqs_info, needs_percep)

        # keep links to seqs_info and the representer on the instance
        self.seqs_info = seqs_info
        self.seq_representer = representer

        # split dataset according to the specified split options
        folds = self.split_dataset(seqs_info, split_options)

        # persist the split next to the reference corpus
        corpus_root = os.path.dirname(
            os.path.dirname(seqs_info[1]["globalfeatures_dir"])
        )
        ReaderWriter.dump_data(folds, os.path.join(corpus_root, "data_split"))
        return folds
예제 #5
0
 def traineval_folds(self, data_split, **kwargs):
     """Train one model per fold and evaluate it on the fold's splits.

        Args:
            data_split: dictionary mapping a fold identifier to a dictionary
                holding the sequence ids under the 'train' key and,
                optionally, under the 'test' key
            kwargs: keyword options; ``load_info_fromdisk`` and ``file_name``
                are read here and the (updated) dictionary is forwarded to
                ``self.eval_model``

        Returns:
            list with one entry per evaluated split: the value returned by
            ``self.eval_model`` extended with 'fold_name' and 'model_dir'
     """
     all_seqs_id = self.seqs_id
     representer = self.seq_representer
     seqs_info = self.seqs_info
     repr_cls = self.model_repr_class
     crf_cls = self.model_class
     results = []
     corpus_root = os.path.dirname(os.path.dirname(seqs_info[1]['globalfeatures_dir']))

     # anything that is not a non-negative int falls back to 10
     disk_setting = kwargs.get('load_info_fromdisk')
     if type(disk_setting) != int or disk_setting < 0:
         disk_setting = 10

     # optional base name for the evaluation output files
     base_fname = kwargs.get('file_name')
     for fold in data_split:
         # 'train' runs first: it produces the model evaluated on both splits
         for dtype in ('train', 'test'):
             ids = data_split[fold].get(dtype)
             if dtype == 'train':
                 # model representation from the training sequences
                 model_repr = representer.create_model(ids, seqs_info, repr_cls, self.filter_obj)
                 # active features are extracted for every sequence
                 representer.extract_seqs_modelactivefeatures(all_seqs_id, seqs_info, model_repr, "f{}".format(fold))
                 # CRF model built on top of the representation
                 crf_model = crf_cls(model_repr, representer, seqs_info, load_info_fromdisk=disk_setting)
                 # directory of the trained model
                 savedmodel_dir = self.train_model(ids, crf_model)
             if not ids:
                 # nothing to evaluate for this split
                 continue

             fold_name = '{}_f{}'.format(dtype, fold)
             kwargs['seqs_info'] = {sid: seqs_info[sid] for sid in ids}
             if base_fname:
                 # prefix the file name with the fold name
                 kwargs['file_name'] = fold_name + "_" + base_fname

             outcome = self.eval_model(savedmodel_dir, kwargs)
             outcome['fold_name'] = fold_name
             outcome['model_dir'] = savedmodel_dir
             results.append(outcome)
     # save workflow trainer instance on disk
     ReaderWriter.dump_data(self, os.path.join(corpus_root, 'workflow_trainer'))
     return results
예제 #6
0
    def traineval_folds(self, data_split, **kwargs):
        """Train one model per fold and evaluate it on the fold's splits.

        Args:
            data_split: dictionary mapping a fold identifier to a dictionary
                holding the sequence ids under the 'train' key and,
                optionally, under the 'test' key
            kwargs: keyword options. The ones read here are:

                - ``optimization_options`` (required): dictionary with a
                  'method' key; forwarded to ``self.train_model``
                - ``load_info_fromdisk``: non-negative ``int`` forwarded to
                  ``self.build_crf_model``; anything else falls back to 10
                - ``file_name``: when given, it is prefixed with the fold
                  name before each evaluation

                The (updated) dictionary is forwarded to ``self.use_model``.

        Returns:
            list with one entry per evaluated split: the value returned by
            ``self.use_model`` extended with 'fold_name' and 'model_dir'.

        Raises:
            ValueError: if ``optimization_options`` is missing or empty.
        """
        seqs_info = self.seqs_info

        models_info = []
        # corpus directory is two levels above the global features directory
        ref_corpusdir = os.path.dirname(os.path.dirname(seqs_info[1]['globalfeatures_dir']))

        info_fromdisk = kwargs.get('load_info_fromdisk')
        # exact type check on purpose: booleans must not be accepted as counts
        if type(info_fromdisk) is not int or info_fromdisk < 0:
            info_fromdisk = 10

        # get optimization options
        optimization_options = kwargs.get("optimization_options")
        if not optimization_options:
            # a bare string cannot be raised (it triggers a TypeError instead)
            raise ValueError("optimization_options need to be specified !!")
        # these methods require full parsing when building the model
        full_parsing = optimization_options['method'] in {'COLLINS-PERCEPTRON', 'SAPO'}

        # check if file name is specified
        file_name = kwargs.get('file_name')
        for fold in data_split:
            # 'train' must come first: it produces the model evaluated on both splits
            for dtype in ('train', 'test'):
                fold_seqs_id = data_split[fold].get(dtype)
                if dtype == 'train':
                    crf_model = self.build_crf_model(fold_seqs_id, "f{}".format(fold), info_fromdisk, full_parsing)
                    # get the directory of the trained model
                    savedmodel_dir = self.train_model(fold_seqs_id, crf_model, optimization_options)
                if not fold_seqs_id:
                    # nothing to evaluate for this split
                    continue

                # evaluate on the current data fold
                fold_name = '{}_f{}'.format(dtype, fold)
                kwargs['seqs_info'] = {seq_id: seqs_info[seq_id] for seq_id in fold_seqs_id}
                if file_name:
                    # add the fold name as prefix
                    kwargs['file_name'] = fold_name + "_" + file_name

                res = self.use_model(savedmodel_dir, kwargs)
                res['fold_name'] = fold_name
                res['model_dir'] = savedmodel_dir
                models_info.append(res)
        # save workflow trainer instance on disk
        ReaderWriter.dump_data(self, os.path.join(ref_corpusdir, 'workflow_trainer'))
        return models_info
예제 #7
0
    def traineval_folds(self, data_split, meval=True, sep=" "):
        """Train one model per fold and decode its train and test sequences.

        Args:
            data_split: dictionary mapping a fold identifier to a dictionary
                holding the sequence ids under the 'train' key and,
                optionally, under the 'test' key
            meval: when truthy, evaluation file names are passed to
                ``self.eval_model``; otherwise ``None`` is passed instead
            sep: separator forwarded to ``self.eval_model``

        Returns:
            list with the value returned by ``self.train_model`` for each fold
        """
        all_seqs_id = self.seqs_id
        representer = self.seq_representer
        seqs_info = self.seqs_info
        repr_cls = self.model_repr_class
        crf_cls = self.model_class
        trained = []
        corpus_root = os.path.dirname(os.path.dirname(seqs_info[1]['globalfeatures_dir']))

        # evaluation file names are only supplied when evaluation is requested
        train_fname, test_fname = (
            ("modeleval_train.txt", "modeleval_test.txt") if meval else (None, None)
        )

        for fold in data_split:
            train_ids = data_split[fold]['train']
            # model representation from the training sequences
            model_repr = representer.create_model(train_ids, seqs_info, repr_cls, self.filter_obj)
            # active features are extracted for every sequence
            representer.extract_seqs_modelactivefeatures(all_seqs_id, seqs_info, model_repr, "f{}".format(fold))

            # CRF model built on top of the representation
            crf_model = crf_cls(model_repr, representer, seqs_info, load_info_fromdisk=4)
            model_info = self.train_model(train_ids, crf_model)

            # decode the training sequences
            train_info = {sid: seqs_info[sid] for sid in train_ids}
            self.eval_model(model_info, {'seqs_info': train_info},
                            train_fname, "dec_trainseqs_fold_{}.txt".format(fold), sep=sep)

            # decode the test sequences when the fold has any
            test_ids = data_split[fold].get('test')
            if test_ids:
                test_info = {sid: seqs_info[sid] for sid in test_ids}
                self.eval_model(model_info, {'seqs_info': test_info},
                                test_fname, "dec_testseqs_fold_{}.txt".format(fold), sep=sep)

            trained.append(model_info)
        # save workflow trainer instance on disk
        ReaderWriter.dump_data(self, os.path.join(corpus_root, 'workflow_trainer'))
        return trained
예제 #8
0
    def seq_parsing_workflow(self, seq_file, split_options):
        """Parse a sequence file, extract features and split into folds.

        Args:
            seq_file: sequence file forwarded to ``self.build_seqsinfo``
            split_options: options forwarded to ``self.split_dataset``

        Returns:
            the data split produced by ``self.split_dataset``; it is also
            dumped on disk under the name ``data_split`` in the directory two
            levels above the global features directory of sequence ``1``.
        """
        # attribute extractor, then the feature extractor built on its description
        attr_extractor = self.aextractor_class()
        attr_scaling = self.scaling_method
        f_extractor = self.fextractor_class(
            self.template_xy, self.template_y, attr_extractor.attr_desc
        )

        # the representer is stored before the sequence file is parsed
        representer = SeqsRepresenter(attr_extractor, f_extractor)
        self.seq_representer = representer

        # build the seqs_info by parsing the sequences from file iteratively
        seqs_info = self.build_seqsinfo(seq_file)
        seqs_id = list(seqs_info)
        self.seqs_id = seqs_id

        # preprocess and generate attributes in case of segments with length >1
        # or in case scaling of attributes is needed
        representer.preprocess_attributes(seqs_id, seqs_info, method=attr_scaling)

        # extract global features F(X,Y); the flag is set for these two methods
        needs_percep = self.optimization_options["method"] in {
            "COLLINS-PERCEPTRON",
            "SAPO",
        }
        representer.extract_seqs_globalfeatures(seqs_id, seqs_info, needs_percep)

        # keep links to seqs_info and the representer on the instance
        self.seqs_info = seqs_info
        self.seq_representer = representer

        # split dataset according to the specified split options
        folds = self.split_dataset(seqs_info, split_options)

        # persist the split next to the reference corpus
        corpus_root = os.path.dirname(
            os.path.dirname(seqs_info[1]["globalfeatures_dir"])
        )
        ReaderWriter.dump_data(folds, os.path.join(corpus_root, "data_split"))
        return folds