# Imports assumed from the padatious package layout; TrainData and lines_hash
# live in padatious.train_data and padatious.util respectively.
import multiprocessing as mp
from functools import partial
from os import mkdir
from os.path import join, isfile, isdir, splitext

import padatious
from padatious.train_data import TrainData
from padatious.util import lines_hash

# _train_and_save(obj, cache, data, print_updates) is a module-level helper
# (defined alongside this class) that trains obj on data and saves the result
# into cache; it must be a top-level function so multiprocessing can pickle it.


class TrainingManager(object):
    """
    Manages multithreaded training of either Intents or Entities

    Args:
        cls (Type[Trainable]): Class to wrap
        cache_dir (str): Place to store cache files
    """

    def __init__(self, cls, cache_dir):
        self.cls = cls
        self.cache = cache_dir
        self.objects = []
        self.objects_to_train = []
        self.train_data = TrainData()

    def add(self, name, lines, reload_cache=False):
        # Hash the training lines together with the library's major.minor
        # version; if the stored hash matches, reuse the cached object,
        # otherwise queue it for retraining.
        hash_fn = join(self.cache, name + '.hash')
        old_hsh = None
        if isfile(hash_fn):
            with open(hash_fn, 'rb') as g:
                old_hsh = g.read()
        min_ver = splitext(padatious.__version__)[0]
        new_hsh = lines_hash([min_ver] + lines)
        if reload_cache or old_hsh != new_hsh:
            self.objects_to_train.append(self.cls(name=name, hsh=new_hsh))
        else:
            self.objects.append(
                self.cls.from_file(name=name, folder=self.cache))
        self.train_data.add_lines(name, lines)

    def load(self, name, file_name, reload_cache=False):
        with open(file_name) as f:
            self.add(name, f.read().split('\n'), reload_cache)

    def remove(self, name):
        self.objects = [i for i in self.objects if i.name != name]
        self.objects_to_train = [i for i in self.objects_to_train
                                 if i.name != name]
        self.train_data.remove_lines(name)

    def train(self, debug=True, single_thread=False, timeout=20):
        if not isdir(self.cache):
            mkdir(self.cache)

        train = partial(
            _train_and_save, cache=self.cache,
            data=self.train_data, print_updates=debug
        )

        if single_thread:
            for i in self.objects_to_train:
                train(i)
        else:
            # Train in multiple processes to disk
            pool = mp.Pool()
            try:
                pool.map_async(train, self.objects_to_train).get(timeout)
            except mp.TimeoutError:
                # AsyncResult.get() raises multiprocessing.TimeoutError (not
                # the builtin TimeoutError) when the timeout expires
                if debug:
                    print('Some objects timed out while training')
            finally:
                pool.close()

        # Load saved objects from disk
        for obj in self.objects_to_train:
            try:
                self.objects.append(
                    self.cls.from_file(name=obj.name, folder=self.cache))
            except IOError:
                # The object never finished training, so it was never saved
                if debug:
                    print('Took too long to train', obj.name)
        self.objects_to_train = []
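# An alternative revision of the same class follows. It differs only in
# train(): instead of one map_async() call bounded by a timeout, each object
# is submitted with apply_async() and every result is waited on individually,
# so an exception raised in a worker propagates directly to the caller.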
class TrainingManager(object):
    """
    Manages multithreaded training of either Intents or Entities

    Args:
        cls (Type[Trainable]): Class to wrap
        cache_dir (str): Place to store cache files
    """

    def __init__(self, cls, cache_dir):
        self.cls = cls
        self.cache = cache_dir
        self.objects = []
        self.objects_to_train = []
        self.train_data = TrainData()

    def add(self, name, lines, reload_cache=False):
        hash_fn = join(self.cache, name + '.hash')
        old_hsh = None
        if isfile(hash_fn):
            with open(hash_fn, 'rb') as g:
                old_hsh = g.read()
        min_ver = splitext(padatious.__version__)[0]
        new_hsh = lines_hash([min_ver] + lines)
        if reload_cache or old_hsh != new_hsh:
            self.objects_to_train.append(self.cls(name=name, hsh=new_hsh))
        else:
            self.objects.append(
                self.cls.from_file(name=name, folder=self.cache))
        self.train_data.add_lines(name, lines)

    def load(self, name, file_name, reload_cache=False):
        with open(file_name) as f:
            self.add(name, f.read().split('\n'), reload_cache)

    def remove(self, name):
        self.objects = [i for i in self.objects if i.name != name]
        self.objects_to_train = [
            i for i in self.objects_to_train if i.name != name
        ]
        self.train_data.remove_lines(name)

    def train(self, debug=True, single_thread=False):
        if not isdir(self.cache):
            mkdir(self.cache)

        def args(i):
            return i, self.cache, self.train_data, debug

        if single_thread:
            for i in self.objects_to_train:
                _train_and_save(*args(i))
        else:
            # Train in multiple processes to disk
            pool = mp.Pool()
            try:
                results = [
                    pool.apply_async(_train_and_save, args(i))
                    for i in self.objects_to_train
                ]
                for i in results:
                    i.get()
            finally:
                pool.close()

        # Load saved objects from disk
        for obj in self.objects_to_train:
            self.objects.append(
                self.cls.from_file(name=obj.name, folder=self.cache))
        self.objects_to_train = []
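# For illustration only: a minimal sketch of the interface TrainingManager
# expects from the wrapped class, inferred from the calls above and from the
# assumption that the _train_and_save helper calls train() followed by save().
# MockTrainable and the cache path below are hypothetical stand-ins, not part
# of padatious.
import os


class MockTrainable(object):
    def __init__(self, name, hsh=b''):
        self.name = name
        self.hsh = hsh

    @classmethod
    def from_file(cls, name, folder):
        # Restore a previously saved object from the cache folder
        obj = cls(name)
        with open(os.path.join(folder, name + '.hash'), 'rb') as f:
            obj.hsh = f.read()
        return obj

    def train(self, train_data):
        # A real implementation fits a model on train_data here
        pass

    def save(self, folder):
        # Persist enough state for from_file() to restore the object; the
        # stored hash is what add() compares against on the next run
        with open(os.path.join(folder, self.name + '.hash'), 'wb') as f:
            f.write(self.hsh)


# Typical driving code (assuming TrainingManager above is importable):
#     manager = TrainingManager(MockTrainable, 'cache_dir')
#     manager.add('greet', ['hello', 'hi there'])
#     manager.train(single_thread=True)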