def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')
def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    fh.close()
    log_info('Model loaded successfully.')
    return model
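A minimal round-trip sketch for the two methods above. The class name `Model`, the assumption that `load_from_file` is exposed as a static method, and the idea that `file_stream` switches to GZip for a `.gz` suffix are all inferred from the surrounding code, not confirmed by it:

# Hypothetical usage: save a trained model and restore it later.
# Assumes `model` is a trained Model instance and that file_stream()
# applies GZip compression when the file name ends in '.gz'.
model.save_to_file('classifier.pickle.gz')
restored = Model.load_from_file('classifier.pickle.gz')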
def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if
    configured to via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    train = DataSet()
    train.load_from_arff(filename, encoding)
    if self.train_part < 1:
        train = train.subset(0, int(round(self.train_part * len(train))), copy=False)
    return train
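To make the `train_part` stripping concrete: with `train_part = 0.5` and a 1,000-instance data set, the subset call above reduces to the following (a worked example of the arithmetic, not additional code in the class):

# train_part = 0.5, len(train) == 1000:
# int(round(0.5 * 1000)) == 500, so instances 0..499 are kept.
train = train.subset(0, 500, copy=False)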
def train_on_data(self, train):
    """\
    Train model on the specified training data set (which must be a loaded
    DataSet object).
    """
    log_info('Preparing data set...')
    self.data_headers = train.get_headers()
    train_vect = self.__vectorize(train)
    train_classes = self.get_classes(train)
    # if all the training data have the same class, use a dummy classifier
    if train.get_attrib(self.class_attr).num_values == 1:
        self.feature_filter = None
        self.classifier = DummyClassifier()
    # filter features
    log_info('Filtering...')
    train_filt = self.__filter_features(train_vect, train_classes)
    # train the classifier
    log_info('Training...')
    if self.use_weights:
        self.classifier.fit(train_filt, train_classes, sample_weight=train.inst_weights)
    else:
        self.classifier.fit(train_filt, train_classes)
    self.classifier_trained = True
    log_info('Training done.')
def apply_to(self, filename=None, string=None, language=None, selector=None):
    """
    Apply the whole scenario to a file or to a string (which should be
    readable by the first block of the scenario). If processing a string,
    return the result.
    """
    if filename is not None:
        # the first block is supposed to be a reader which creates the document
        log_info('Processing ' + filename)
        log_info('Applying block 1/' + str(len(self.blocks)) + ': ' +
                 self.blocks[0].__class__.__name__)
        doc = self.blocks[0].process_document(filename)
        # apply all other blocks
        for block_no, block in enumerate(self.blocks[1:], start=2):
            log_info('Applying block ' + str(block_no) + '/' +
                     str(len(self.blocks)) + ': ' + block.__class__.__name__)
            block.process_document(doc)
    elif string is not None:
        # check if we know the target language and selector
        language = language or self.global_args.get('language')
        selector = selector or self.global_args.get('selector', '')
        # the first block is supposed to be a reader which creates the document
        fh = StringIO(string)
        doc = self.blocks[0].process_document(fh)
        # apply all other blocks
        for block_no, block in enumerate(self.blocks[1:], start=2):
            log_info('Applying block ' + str(block_no) + '/' +
                     str(len(self.blocks)) + ': ' + block.__class__.__name__)
            block.process_document(doc)
        # return the sentence text of all bundles for the specified language and selector
        return "\n".join([b.get_zone(language, selector).sentence for b in doc.bundles])
    else:
        raise ScenarioException('Filename or input string must be set!')
def process_document(self, filename):
    """\
    Read a Tecto-Template file and return its contents as a Document object.
    """
    fh = file_stream(filename, encoding=self.encoding)
    doc = Document(filename)
    for line in fh:
        bundle = doc.create_bundle()
        zone = bundle.create_zone(self.language, self.selector)
        ttree = zone.create_ttree()
        self.parse_line(line, ttree)
        log_info('Parsed a tree with %d nodes.' % len(ttree.get_descendants()))
    fh.close()
    return doc
def load_blocks(self):
    "Load all blocks into memory, finding and creating class objects."
    self.blocks = []
    for block_no, block_data in enumerate(self.scenario_data, start=1):
        # create the block name and import it
        if '.' in block_data["block"]:
            class_subpath, class_name = block_data["block"].rsplit('.', 1)
            class_subpath += '.'
        else:
            class_subpath, class_name = '', block_data["block"]
        class_package = 'pytreex.block.' + class_subpath + class_name.lower()
        log_info('Loading block ' + str(block_no) + '/' +
                 str(len(self.scenario_data)) + ': ' + class_name)
        exec('import ' + class_package)
        class_obj = getattr(sys.modules[class_package], class_name)
        # create the block object
        args = self.global_args.copy()
        args.update(block_data.get("args", {}))
        self.blocks.append(class_obj(self, args))
        # load models etc.
        self.blocks[-1].load()
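The `exec('import ' + class_package)` line works, but the same dynamic import can be written with the standard-library `importlib` module, avoiding `exec` entirely; a sketch of that variant, reusing the `class_package` and `class_name` variables from above:

import importlib

# Import the block's module by its dotted name and fetch the class;
# equivalent to exec('import ' + class_package) plus the sys.modules lookup.
module = importlib.import_module(class_package)
class_obj = getattr(module, class_name)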
def run_on_cluster(self):
    # split input files for different jobs
    job_files = [self.input_files[i::self.jobs] for i in xrange(self.jobs)]
    jobs = [Job(name=self.JOB_NAME_PREFIX + self.scenario.name)]
    work_dir = jobs[0].work_dir
    for jobnum in xrange(1, self.jobs):
        jobs.append(Job(name=self.JOB_NAME_PREFIX + self.scenario.name +
                        '-' + str(jobnum).zfill(2), work_dir=work_dir))
    log_info('Creating jobs ...')
    for job, files in zip(jobs, job_files):
        job.header += "from treex.core.run import Run\n"
        args = [self.scenario.file_path] + [os.path.abspath(file_path) for file_path in files]
        job.code = "run = Run(" + str(args) + ")\nrun.run()\n"
    log_info('Submitting jobs ...')
    for job in jobs:
        job.submit()
    log_info('Waiting for jobs ...')
    for job in jobs:
        job.wait(poll_delay=10)
    log_info('All jobs done.')
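For concreteness, the `header` and `code` strings assembled above mean that each cluster job ends up executing a small script of this shape (all paths hypothetical):

# Hypothetical contents of one generated job script:
from treex.core.run import Run

run = Run(['/home/user/scenario.yaml',
           '/home/user/data/input-00.txt',
           '/home/user/data/input-05.txt'])
run.run()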
def process_subtree(self, amrnode):
    # progress depth-first
    for child in amrnode.get_children():
        self.process_subtree(child)
    # #Separ is "+"
    if amrnode.concept == '#Separ':
        val = 0
        for child in amrnode.get_children():
            num = self.get_numeric_value(child)
            if num is None:
                continue
            val += num
            self.rehang_children_and_remove(child)
        amrnode.concept = str(val)
        log_info('Separ: ' + amrnode.concept)
        return
    # / is "/"
    if amrnode.concept in ['/', '#Slash']:
        children = amrnode.get_children(ordered=True)
        if len(children) == 2 and all([self.get_numeric_value(c) is not None for c in children]):
            val = self.get_numeric_value(children[0]) / float(self.get_numeric_value(children[1]))
            amrnode.concept = str(val)
            log_info('/: ' + amrnode.concept)
            self.rehang_children_and_remove(children[0])
            self.rehang_children_and_remove(children[1])
        return
    # check if we are a number, normalize our concept name
    val = self.get_numeric_value(amrnode)
    if val is not None:
        # any numeric children = '*'
        for child in amrnode.get_children(preceding_only=True):
            num = self.get_numeric_value(child)
            if num is not None:
                val *= num
                self.rehang_children_and_remove(child)
                log_info('Number child: ' + str(num))
        log_info('Number: ' + amrnode.concept)
        amrnode.concept = str(val)
def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
    """\
    Read training data, split them and train the individual models
    (in cluster jobs).
    """
    # load the entire data set
    train = self.load_training_set(train_file, encoding)
    self.data_headers = train.get_headers()
    # train a backoff model
    log_info('Training a backoff model...')
    self.backoff_model = self.__train_backoff_model(train)
    # split it
    log_info('Split...')
    train_split = train.split(eval(self.divide_func), keep_copy=False)
    jobs = []
    model_files = {}
    # save training files and create training jobs
    for key, subset in train_split.iteritems():
        fn = re.sub(r'(.arff(.gz)?)?$', '-' + key + '.arff.gz', train_file)
        fn = os.path.join(work_dir, os.path.basename(fn))
        subset.save_to_arff(fn, encoding)
        job, model_file = Model.create_training_job(self.config, work_dir, fn,
                                                    memory=memory, encoding=encoding)
        jobs.append(job)
        model_files[key] = model_file
    # submit the training jobs and wait for all of them
    log_info('Submitting training jobs...')
    for job in jobs:
        job.submit()
    log_info('Waiting for jobs...')
    for job in jobs:
        job.wait()
    # load all models
    log_info('Training complete. Assembling model files...')
    for key, model_file in model_files.iteritems():
        self.models[key] = Model.load_from_file(model_file)
    self.trained = True
    log_info('Training done.')
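The `eval(self.divide_func)` call expects the configuration to hold a string that evaluates to a single-argument function mapping a data instance to its split key. The instance interface is not shown here, so the following configuration value (including the `config` dict and the 'formeme' attribute name) is purely a hypothetical illustration:

# Hypothetical divide_func configuration value: split the training
# data by an assumed 'formeme' attribute of each instance.
config['divide_func'] = "lambda inst: inst['formeme']"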