def build_seqsinfo_from_seqs(self, seqs):
    """Process the given sequences to disk and return an info dictionary
    describing the parsed sequences.

    Args:
        seqs: list of sequences that are instances of :class:`SequenceStruct` class
    """
    # set up the working directory that will hold the serialized corpus
    corpus_name = "reference_corpus"
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = True
    # map each sequence to an identifier: prefer the sequence's own `id`
    # attribute when present, otherwise fall back to its 1-based position
    seqs_dict = {}
    for position, seq in enumerate(seqs, 1):
        key = seq.id if hasattr(seq, "id") else position
        seqs_dict[key] = seq
    # delegate parsing/serialization to the sequence representer and
    # return the resulting seqs_info dictionary
    return self.seq_representer.prepare_seqs(
        seqs_dict, corpus_name, working_dir, unique_id
    )
def build_seqsinfo_from_seqfile(self, seq_file, data_parser_options, num_seqs=numpy.inf):
    """Parse sequences from a file, process them to disk one at a time, and
    return an info dictionary describing the parsed sequences.

    Args:
        seq_file: string representing the path to the sequence file
        data_parser_options: dictionary containing options to be passed to
                             :func:`read_file` method of :class:`DataFileParser` class
        num_seqs: integer, maximum number of sequences to read from file
                  (default numpy.inf -- means read all file)
    """
    seq_representer = self.seq_representer
    # a timestamped corpus name keeps repeated runs from clashing on disk
    corpus_name = "reference_corpus_" + generate_datetime_str()
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = False
    seqs_info = {}
    dparser = DataFileParser()
    seq_stream = self.get_seqs_from_file(seq_file, dparser, data_parser_options)
    # process sequences one by one so the whole file never has to fit in memory
    for count, seq in enumerate(seq_stream, 1):
        key = seq.id if hasattr(seq, "id") else count
        parsed = seq_representer.prepare_seqs(
            {key: seq}, corpus_name, working_dir, unique_id, log_progress=False
        )
        seqs_info.update(parsed)
        print("{} sequences have been processed".format(count))
        # stop once the requested number of sequences has been consumed
        if count >= num_seqs:
            break
    return seqs_info
def build_seqsinfo(self, seq_file):
    """Parse all sequences from a file, process them to disk one at a time,
    and return an info dictionary describing the parsed sequences.

    Args:
        seq_file: string representing the path to the sequence file
    """
    seq_representer = self.seq_representer
    # a timestamped corpus name keeps repeated runs from clashing on disk
    corpus_name = "reference_corpus_" + generate_datetime_str()
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = False
    seqs_info = {}
    # process sequences one by one so the whole file never has to fit in memory
    for count, seq in enumerate(self.get_seqs_from_file(seq_file), 1):
        key = seq.id if hasattr(seq, "id") else count
        parsed = seq_representer.prepare_seqs(
            {key: seq}, corpus_name, working_dir, unique_id, log_progress=False
        )
        seqs_info.update(parsed)
        print("{} sequences have been processed".format(count))
    return seqs_info
def build_model(model_type, template_config, num_seqs):
    """Build, parse data for, and train a CRF model of the requested type.

    Args:
        model_type: string, one of 'HOCRFAD', 'HOCRF', 'HOSemiCRFAD',
                    'HOSemiCRF', 'FirstOrderCRF'
        template_config: callable returning the (template_XY, template_Y) pair
        num_seqs: integer, number of sequences to generate for training

    Returns:
        tuple of (workflow, trained crf model, data_split dictionary)

    Raises:
        ValueError: if ``model_type`` is not one of the supported model names
    """
    if model_type == 'HOCRFAD':
        modelrepr_class = HOCRFADModelRepresentation
        model_class = HOCRFAD
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOCRF':
        modelrepr_class = HOCRFModelRepresentation
        model_class = HOCRF
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOSemiCRFAD':
        modelrepr_class = HOSemiCRFADModelRepresentation
        model_class = HOSemiCRFAD
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOSemiCRF':
        modelrepr_class = HOSemiCRFModelRepresentation
        model_class = HOSemiCRF
        fextractor_class = HOFeatureExtractor
    elif model_type == 'FirstOrderCRF':
        modelrepr_class = FirstOrderCRFModelRepresentation
        model_class = FirstOrderCRF
        fextractor_class = FOFeatureExtractor
    else:
        # BUG FIX: previously an unknown model_type fell through the chain,
        # leaving the three class variables unbound and causing a confusing
        # NameError far below -- fail fast with an explicit error instead
        raise ValueError("unsupported model_type: {}".format(model_type))

    # init attribute extractor
    attr_extractor = AttributeExtractor()
    # load templates
    template_XY, template_Y = template_config()
    # init feature extractor
    fextractor = fextractor_class(template_XY, template_Y, attr_extractor.attr_desc)
    # generate data
    seqs = seq_generator.generate_seqs(num_seqs)
    # use all passed data as training data -- no splitting
    data_split_options = {'method': 'none'}
    # no feature filter
    fe_filter = None
    working_dir = create_directory('wd', current_dir)
    workflow = GenericTrainingWorkflow(attr_extractor, fextractor, fe_filter,
                                       modelrepr_class, model_class,
                                       working_dir)
    # since we are going to train using perceptron based methods
    full_parsing = True
    data_split = workflow.seq_parsing_workflow(data_split_options,
                                               seqs=seqs,
                                               full_parsing=full_parsing)
    # build and return a CRFs model
    # folder name will be f_0 as fold 0
    trainseqs_id = data_split[0]['train']
    crf_m = workflow.build_crf_model(trainseqs_id, "f_0", full_parsing=full_parsing)
    return (workflow, crf_m, data_split)
def seq_parsing_workflow(self, seqs, split_options):
    """preparing sequences to be used in the learning framework"""
    # working directory that will hold the serialized corpus
    corpus_name = "reference_corpus"
    working_dir = create_directory("working_dir", self.root_dir)
    self.working_dir = working_dir
    unique_id = True

    # assign 1-based identifiers to the sequences
    seqs_dict = {position: seq for position, seq in enumerate(seqs, 1)}
    seqs_id = list(seqs_dict.keys())
    self.seqs_id = seqs_id

    # build the attribute extractor, feature extractor and sequence representer
    attr_extractor = self.aextractor_class()
    scaling_method = self.scaling_method
    f_extractor = self.fextractor_class(
        self.template_xy, self.template_y, attr_extractor.attr_desc
    )
    seq_representer = SeqsRepresenter(attr_extractor, f_extractor)

    # parse/serialize the sequences and collect their info dictionary
    seqs_info = seq_representer.prepare_seqs(
        seqs_dict, corpus_name, working_dir, unique_id
    )
    # preprocess/generate attributes (segments of length > 1, optional scaling)
    seq_representer.preprocess_attributes(seqs_id, seqs_info, method=scaling_method)

    # extract global features F(X,Y); perceptron-style optimizers need the
    # per-sequence global features kept around
    percep_training = self.optimization_options["method"] in {
        "COLLINS-PERCEPTRON",
        "SAPO",
    }
    seq_representer.extract_seqs_globalfeatures(seqs_id, seqs_info, percep_training)

    # keep references on the instance for later stages of the workflow
    self.seqs_info = seqs_info
    self.seq_representer = seq_representer

    # split the dataset per the requested options and persist the split
    data_split = self.split_dataset(seqs_info, split_options)
    gfeatures_dir = seqs_info[1]["globalfeatures_dir"]
    ref_corpusdir = os.path.dirname(os.path.dirname(gfeatures_dir))
    ReaderWriter.dump_data(data_split, os.path.join(ref_corpusdir, "data_split"))
    return data_split
def profile_test(model_type, scaling_method, optimization_options, run_config, test_type):
    """Run :func:`test_crfs` under cProfile and dump the profiling stats to
    a ``profile_out`` file inside a ``profiling`` directory under root_dir."""
    import cProfile
    out_file = os.path.join(create_directory('profiling', root_dir), "profile_out")
    # runctx needs the callable in globals and its arguments in locals
    cProfile.runctx(
        'test_crfs(model_type, scaling_method, optimization_options, run_config, test_type)',
        {'test_crfs': test_crfs},
        {
            'model_type': model_type,
            'scaling_method': scaling_method,
            'optimization_options': optimization_options,
            'run_config': run_config,
            'test_type': test_type,
        },
        filename=out_file,
    )
def test_workflow(self, seqs):
    """Run the full sequence-processing workflow end to end, testing
    scenarios of mixing different templates.

    Args:
        seqs: list of sequences to parse, featurize and build a CRF model from

    Side effects: stores the representer, ids, info dict, model and CRF model
    on the instance (``_seq_representer``, ``_seqs_id``, ``_seqs_info``,
    ``_crf_model``, ``_model``).
    """
    corpus_name = "reference_corpus"
    working_dir = create_directory("working_dir", self.root_dir)
    self._working_dir = working_dir
    unique_id = True
    seqs_dict = {}
    templateY = self.template_Y
    templateXY = self.template_XY
    modelrepr_class = self.model_repr_class
    model_class = self.model_class
    fextractor_class = self.fextractor_class
    scaling_method = self.scaling_method
    # build the attribute/feature extraction pipeline
    attr_extractor = NERSegmentAttributeExtractor()
    f_extractor = fextractor_class(templateXY, templateY, attr_extractor.attr_desc)
    seq_representer = SeqsRepresenter(attr_extractor, f_extractor)
    # map each sequence to a 1-based id, deep-copying so the workflow cannot
    # mutate the caller's sequences.
    # BUG FIX: the original indexed seqs[i - 1] while keying on i + 1, which
    # rotated the mapping (id 1 received the *last* sequence) -- copy the
    # sequence at the matching position instead
    for position, seq in enumerate(seqs, 1):
        seqs_dict[position] = deepcopy(seq)
    seqs_info = seq_representer.prepare_seqs(seqs_dict, corpus_name, working_dir, unique_id)
    seqs_id = list(seqs_info.keys())
    # preprocess attributes (segment generation / optional scaling), then
    # extract global and model active features
    seq_representer.preprocess_attributes(seqs_id, seqs_info, method=scaling_method)
    seq_representer.extract_seqs_globalfeatures(seqs_id, seqs_info)
    model = seq_representer.create_model(seqs_id, seqs_info, modelrepr_class, self.filter_obj)
    seq_representer.extract_seqs_modelactivefeatures(seqs_id, seqs_info, model, "", learning=True)
    crf_model = model_class(model, seq_representer, seqs_info)
    # keep references for later inspection by the test harness
    self._seq_representer = seq_representer
    self._seqs_id = seqs_id
    self._seqs_info = seqs_info
    self._crf_model = crf_model
    self._model = model