def traineval_folds(self, data_split, **kwargs):
    """train and evaluate a model on the different dataset splits"""
    seqs_info = self.seqs_info
    models_info = []
    ref_corpusdir = os.path.dirname(
        os.path.dirname(seqs_info[1]["globalfeatures_dir"])
    )
    info_fromdisk = kwargs.get("load_info_fromdisk")
    # use a large number so that computed data is always loaded from disk
    # rather than kept in memory
    if not isinstance(info_fromdisk, int) or info_fromdisk < 0:
        info_fromdisk = 10
    # get optimization options
    optimization_options = kwargs.get("optimization_options")
    if not optimization_options:
        raise ValueError("optimization_options need to be specified!")
    full_parsing = False
    if optimization_options["method"] in {"COLLINS-PERCEPTRON", "SAPO"}:
        full_parsing = True
    # check if a file name is specified
    file_name = kwargs.get("file_name")
    for fold in data_split:
        for dtype in ("train", "test"):
            fold_seqs_id = data_split[fold].get(dtype)
            if dtype == "train":
                crf_model = self.build_crf_model(
                    fold_seqs_id, "f{}".format(fold), info_fromdisk, full_parsing
                )
                # get the directory of the trained model
                savedmodel_dir = self.train_model(
                    fold_seqs_id, crf_model, optimization_options
                )
            if fold_seqs_id:
                # evaluate on the current data fold
                fold_name = "{}_f{}".format(dtype, fold)
                fold_seqs_info = {
                    seq_id: seqs_info[seq_id] for seq_id in fold_seqs_id
                }
                kwargs["seqs_info"] = fold_seqs_info
                if file_name:
                    # add the fold name as a prefix
                    kwargs["file_name"] = fold_name + "_" + file_name
                res = self.use_model(savedmodel_dir, kwargs)
                res["fold_name"] = fold_name
                res["model_dir"] = savedmodel_dir
                models_info.append(res)
    # save the workflow trainer instance on disk
    ReaderWriter.dump_data(self, os.path.join(ref_corpusdir, "workflow_trainer"))
    return models_info
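
# Usage sketch: how traineval_folds might be driven end to end. This helper is
# hypothetical (not part of the original module); it assumes `workflow` is an
# instance of the class above and `data_split` was returned by
# seq_parsing_workflow. The "SGA" method name and the optimizer option keys
# are illustrative assumptions, not a confirmed API.
def _example_traineval_folds(workflow, data_split):
    optimization_options = {"method": "SGA", "num_epochs": 5}
    models_info = workflow.traineval_folds(
        data_split,
        optimization_options=optimization_options,
        load_info_fromdisk=10,
        file_name="decoded_seqs.txt",
    )
    # each entry records which fold it came from and where the model was saved
    for res in models_info:
        print(res["fold_name"], res["model_dir"])
    return models_info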

def save(self, folder_dir):
    """save relevant info about the scaler on disk

    Args:
        folder_dir: string representing the directory where files are pickled/dumped
    """
    save_info = {
        "AS_scalinginfo": self.scaling_info,
        "AS_method": self.method,
    }
    for name in save_info:
        ReaderWriter.dump_data(save_info[name], os.path.join(folder_dir, name))
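
# Reverse-operation sketch: restoring a scaler saved by `save`. This is a
# hypothetical helper; it assumes ReaderWriter.read_data is the reading
# counterpart of the ReaderWriter.dump_data call used above.
def _example_load_scaler_info(folder_dir):
    scaling_info = ReaderWriter.read_data(os.path.join(folder_dir, "AS_scalinginfo"))
    method = ReaderWriter.read_data(os.path.join(folder_dir, "AS_method"))
    return scaling_info, method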

def seq_parsing_workflow(self, split_options, **kwargs):
    """prepare and parse sequences to be later used in the learning framework"""
    # get attribute extractor
    attr_extractor = self.aextractor_obj
    # get feature extractor
    f_extractor = self.fextractor_obj
    # create sequence representer
    seq_representer = SeqsRepresenter(attr_extractor, f_extractor)
    self.seq_representer = seq_representer
    # check if a sequence file is passed
    if kwargs.get("seq_file"):
        seq_file = kwargs.get("seq_file")
        # get the data file parser options
        data_parser_options = kwargs.get("data_parser_options")
        num_seqs = kwargs.get("num_seqs")
        if not num_seqs:
            # by default read the whole file
            num_seqs = numpy.inf
        # build seqs_info by parsing the sequences from file iteratively
        seqs_info = self.build_seqsinfo_from_seqfile(
            seq_file, data_parser_options, num_seqs=num_seqs
        )
    elif kwargs.get("seqs"):
        seqs = kwargs.get("seqs")
        seqs_info = self.build_seqsinfo_from_seqs(seqs)
    else:
        raise ValueError("either a 'seq_file' or a 'seqs' keyword argument must be passed")
    seqs_id = list(seqs_info.keys())
    self.seqs_id = seqs_id
    # preprocess and generate attributes for segments with length > 1
    # or when scaling of continuous attributes is needed
    seq_representer.preprocess_attributes(seqs_id, seqs_info)
    # check if we also want to generate global features per boundary;
    # this is mostly used in perceptron/search-based training
    full_parsing = kwargs.get("full_parsing")
    # extract global features F(X, Y)
    seq_representer.extract_seqs_globalfeatures(
        seqs_id, seqs_info, dump_gfeat_perboundary=full_parsing
    )
    # keep references to seqs_info and seq_representer as instance variables
    # because both are updated during extraction
    self.seqs_info = seqs_info
    self.seq_representer = seq_representer
    # split the dataset according to the specified split options
    data_split = self.split_dataset(seqs_info, split_options)
    # save the data split dictionary on disk
    gfeatures_dir = seqs_info[1]["globalfeatures_dir"]
    ref_corpusdir = os.path.dirname(os.path.dirname(gfeatures_dir))
    ReaderWriter.dump_data(data_split, os.path.join(ref_corpusdir, "data_split"))
    return data_split
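
# Usage sketch: parsing sequences from a file and splitting them into folds.
# This helper is hypothetical; the split_options keys ("method", "k_fold"),
# the file path, and the data_parser_options contents are illustrative
# assumptions rather than a confirmed API.
def _example_seq_parsing(workflow):
    split_options = {"method": "cross_validation", "k_fold": 5}
    data_split = workflow.seq_parsing_workflow(
        split_options,
        seq_file="datasets/train.txt",
        data_parser_options={"header": "main", "column_sep": "\t"},
        full_parsing=False,
    )
    return data_split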

def seq_parsing_workflow(self, seqs, split_options):
    """prepare sequences to be used in the learning framework"""
    # create the working directory
    corpus_name = "reference_corpus"
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = True
    # create a sequence dictionary mapping each sequence to a unique id
    seqs_dict = {i + 1: seqs[i] for i in range(len(seqs))}
    seqs_id = list(seqs_dict.keys())
    self.seqs_id = seqs_id
    # initialize the attribute extractor
    attr_extractor = self.aextractor_class()
    # create the feature extractor
    scaling_method = self.scaling_method
    fextractor_class = self.fextractor_class
    f_extractor = fextractor_class(
        self.template_xy, self.template_y, attr_extractor.attr_desc
    )
    # create the sequence representer
    seq_representer = SeqsRepresenter(attr_extractor, f_extractor)
    # get seqs_info holding the information about the parsed sequences
    seqs_info = seq_representer.prepare_seqs(
        seqs_dict, corpus_name, working_dir, unique_id
    )
    # preprocess and generate attributes for segments with length > 1
    # or when scaling of attributes is needed
    seq_representer.preprocess_attributes(seqs_id, seqs_info, method=scaling_method)
    # extract global features F(X, Y)
    percep_training = False
    if self.optimization_options["method"] in {"COLLINS-PERCEPTRON", "SAPO"}:
        percep_training = True
    seq_representer.extract_seqs_globalfeatures(seqs_id, seqs_info, percep_training)
    # keep references to seqs_info and seq_representer as instance variables
    self.seqs_info = seqs_info
    self.seq_representer = seq_representer
    # split the dataset according to the specified split options
    data_split = self.split_dataset(seqs_info, split_options)
    # save the data split dictionary on disk
    gfeatures_dir = seqs_info[1]["globalfeatures_dir"]
    ref_corpusdir = os.path.dirname(os.path.dirname(gfeatures_dir))
    ReaderWriter.dump_data(data_split, os.path.join(ref_corpusdir, "data_split"))
    return data_split
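
# Usage sketch: passing already-constructed sequences instead of a file.
# This helper is hypothetical; it assumes `seqs` is a list of sequence
# objects accepted by SeqsRepresenter.prepare_seqs, and the split_options
# keys ("method", "num_splits", "proportion") are assumptions.
def _example_inmemory_parsing(workflow, seqs):
    split_options = {"method": "random", "num_splits": 5, "proportion": 0.8}
    return workflow.seq_parsing_workflow(seqs, split_options)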

def traineval_folds(self, data_split, **kwargs):
    """train and evaluate a model on the different dataset splits"""
    seqs_id = self.seqs_id
    seq_representer = self.seq_representer
    seqs_info = self.seqs_info
    model_repr_class = self.model_repr_class
    model_class = self.model_class
    models_info = []
    ref_corpusdir = os.path.dirname(
        os.path.dirname(seqs_info[1]["globalfeatures_dir"])
    )
    info_fromdisk = kwargs.get("load_info_fromdisk")
    # use a large number so that computed data is always loaded from disk
    # rather than kept in memory
    if not isinstance(info_fromdisk, int) or info_fromdisk < 0:
        info_fromdisk = 10
    # check if a file name is specified
    file_name = kwargs.get("file_name")
    for fold in data_split:
        for dtype in ("train", "test"):
            fold_seqs_id = data_split[fold].get(dtype)
            if dtype == "train":
                # create a model using the sequences assigned for training
                model_repr = seq_representer.create_model(
                    fold_seqs_id, seqs_info, model_repr_class, self.filter_obj
                )
                # extract the model active features for each sequence
                seq_representer.extract_seqs_modelactivefeatures(
                    seqs_id, seqs_info, model_repr, "f{}".format(fold)
                )
                # create a CRF model
                crf_model = model_class(
                    model_repr,
                    seq_representer,
                    seqs_info,
                    load_info_fromdisk=info_fromdisk,
                )
                # get the directory of the trained model
                savedmodel_dir = self.train_model(fold_seqs_id, crf_model)
            if fold_seqs_id:
                # evaluate on the current data fold
                fold_name = "{}_f{}".format(dtype, fold)
                fold_seqs_info = {
                    seq_id: seqs_info[seq_id] for seq_id in fold_seqs_id
                }
                kwargs["seqs_info"] = fold_seqs_info
                if file_name:
                    # add the fold name as a prefix
                    kwargs["file_name"] = fold_name + "_" + file_name
                res = self.eval_model(savedmodel_dir, kwargs)
                res["fold_name"] = fold_name
                res["model_dir"] = savedmodel_dir
                models_info.append(res)
    # save the workflow trainer instance on disk
    ReaderWriter.dump_data(self, os.path.join(ref_corpusdir, "workflow_trainer"))
    return models_info

def traineval_folds(self, data_split, meval=True, sep=" "):
    """train and evaluate a model on the different dataset splits"""
    seqs_id = self.seqs_id
    seq_representer = self.seq_representer
    seqs_info = self.seqs_info
    model_repr_class = self.model_repr_class
    model_class = self.model_class
    models_info = []
    ref_corpusdir = os.path.dirname(
        os.path.dirname(seqs_info[1]["globalfeatures_dir"])
    )
    if meval:
        traineval_fname = "modeleval_train.txt"
        testeval_fname = "modeleval_test.txt"
    else:
        traineval_fname = None
        testeval_fname = None
    for fold in data_split:
        trainseqs_id = data_split[fold]["train"]
        # create a model using the sequences assigned for training
        model_repr = seq_representer.create_model(
            trainseqs_id, seqs_info, model_repr_class, self.filter_obj
        )
        # extract the model active features for each sequence
        seq_representer.extract_seqs_modelactivefeatures(
            seqs_id, seqs_info, model_repr, "f{}".format(fold)
        )
        # create a CRF model
        crf_model = model_class(
            model_repr, seq_representer, seqs_info, load_info_fromdisk=4
        )
        # get the directory of the trained model
        savedmodel_info = self.train_model(trainseqs_id, crf_model)
        # evaluate on the training data
        trainseqs_info = {seq_id: seqs_info[seq_id] for seq_id in trainseqs_id}
        self.eval_model(
            savedmodel_info,
            {"seqs_info": trainseqs_info},
            traineval_fname,
            "dec_trainseqs_fold_{}.txt".format(fold),
            sep=sep,
        )
        # evaluate on the test data
        testseqs_id = data_split[fold].get("test")
        if testseqs_id:
            testseqs_info = {seq_id: seqs_info[seq_id] for seq_id in testseqs_id}
            self.eval_model(
                savedmodel_info,
                {"seqs_info": testseqs_info},
                testeval_fname,
                "dec_testseqs_fold_{}.txt".format(fold),
                sep=sep,
            )
        models_info.append(savedmodel_info)
    # save the workflow trainer instance on disk
    ReaderWriter.dump_data(self, os.path.join(ref_corpusdir, "workflow_trainer"))
    return models_info
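
# Usage sketch (hypothetical helper): evaluate each fold with tab-separated
# output. Per the code above, evaluation summaries for all folds accumulate in
# the shared modeleval_train.txt / modeleval_test.txt files, while decoded
# sequences are written to per-fold dec_*seqs_fold_{fold}.txt files.
def _example_traineval_with_eval_files(workflow, data_split):
    return workflow.traineval_folds(data_split, meval=True, sep="\t")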

def seq_parsing_workflow(self, seq_file, split_options):
    """prepare sequences to be used in the learning framework"""
    # initialize the attribute extractor
    attr_extractor = self.aextractor_class()
    # create the feature extractor
    scaling_method = self.scaling_method
    fextractor_class = self.fextractor_class
    f_extractor = fextractor_class(
        self.template_xy, self.template_y, attr_extractor.attr_desc
    )
    # create the sequence representer
    seq_representer = SeqsRepresenter(attr_extractor, f_extractor)
    self.seq_representer = seq_representer
    # build seqs_info by parsing the sequences from file iteratively
    seqs_info = self.build_seqsinfo(seq_file)
    seqs_id = list(seqs_info.keys())
    self.seqs_id = seqs_id
    # preprocess and generate attributes for segments with length > 1
    # or when scaling of attributes is needed
    seq_representer.preprocess_attributes(seqs_id, seqs_info, method=scaling_method)
    # extract global features F(X, Y)
    percep_training = False
    if self.optimization_options["method"] in {"COLLINS-PERCEPTRON", "SAPO"}:
        percep_training = True
    seq_representer.extract_seqs_globalfeatures(seqs_id, seqs_info, percep_training)
    # keep references to seqs_info and seq_representer as instance variables
    self.seqs_info = seqs_info
    self.seq_representer = seq_representer
    # split the dataset according to the specified split options
    data_split = self.split_dataset(seqs_info, split_options)
    # save the data split dictionary on disk
    gfeatures_dir = seqs_info[1]["globalfeatures_dir"]
    ref_corpusdir = os.path.dirname(os.path.dirname(gfeatures_dir))
    ReaderWriter.dump_data(data_split, os.path.join(ref_corpusdir, "data_split"))
    return data_split