def _load_archive(self, path, extract_path):
    from zipfile import ZipFile, ZIP_DEFLATED
    try:
        zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
        allfile = zfile.namelist()
        # ====== validate extract_path ====== #
        if not os.path.isdir(extract_path):
            raise ValueError('extract_path must be a folder, but "{}" '
                             'is not a folder.'.format(extract_path))
        extract_path = os.path.join(
            extract_path, os.path.basename(path).replace('.zip', ''))
        # found an extracted dir with matching content, just reuse it
        if os.path.isdir(extract_path) and \
                set(os.listdir(extract_path)) == set(allfile):
            self._set_path(extract_path)
            return
        # ====== decompress everything ====== #
        if not os.path.exists(extract_path):
            os.mkdir(extract_path)
        maxlen = max(len(i) for i in allfile)
        progbar = Progbar(len(allfile))
        for i, f in enumerate(allfile):
            zfile.extract(f, path=extract_path)
            progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
            progbar.update(i + 1)
        # ====== finally set path ====== #
        self._set_path(extract_path)
    except IOError as e:
        raise IOError('Error loading archived dataset, path:{}, '
                      'error:{}.'.format(path, e))
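# Example (a minimal sketch; `ds`, the Dataset instance, and the paths are
# hypothetical -- only `_load_archive` itself is defined above):
#   >>> ds._load_archive('/tmp/mnist.zip', extract_path='/tmp')
# The archive is unpacked into '/tmp/mnist'; if that folder already exists
# and holds exactly the archive's members, it is reused without unzipping.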
def fit(self, texts, vocabulary=None):
    """Fit the tokenizer: update word counts, document frequencies, and
    the dictionary from the given documents.

    Parameters
    ----------
    texts : iterator of unicode
        an iterator, generator, or list of unicode strings.
    """
    texts = self._validate_texts(texts)
    word_counts = self._word_counts
    word_docs = self._word_docs
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== start processing ====== #
    # arbitrary initial target; the progress bar is grown on the fly
    # below while documents keep streaming in
    prog = Progbar(target=1208)
    start_time = timeit.default_timer()
    for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
        total_docs_tokens = 0
        seen_words = {}
        # update word -> count
        for token in doc:
            total_docs_tokens += 1
            word_counts[token] += 1
            # update word -> number of documents containing it
            if token not in seen_words:
                seen_words[token] = 1
                word_docs[token] += 1
        # track the longest document
        if total_docs_tokens > self.__longest_document[-1]:
            self.__longest_document = [doc, total_docs_tokens]
        # print progress
        if self.print_progress:
            prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs, len(word_counts))
            prog.add(1)
            if prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # ====== print summary of the process ====== #
    if self.print_progress:
        prog.target = nb_docs
        prog.update(nb_docs)
    processing_time = timeit.default_timer() - start_time
    print('Processed %d docs, %d tokens in %.2f seconds.' %
          (nb_docs, len(word_counts), processing_time))
    self.nb_docs += nb_docs
    # ====== sorting ====== #
    self._refresh_dictionary()
    return self
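# Example (a minimal sketch; the `Tokenizer` constructor arguments are
# assumptions, only `fit` itself is defined above):
#   >>> tk = Tokenizer()
#   >>> tk.fit([u'the cat sat', u'the dog barked'])
# `fit` returns `self`, so calls can be chained, and it may be called
# repeatedly: counts accumulate across calls and the dictionary is
# re-sorted by `_refresh_dictionary` each time.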
def archive(self):
    from zipfile import ZipFile, ZIP_DEFLATED
    path = self.archive_path
    zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)
    # collect the unique data files backing this dataset
    # (.values() instead of .itervalues() for Python-3 compatibility)
    files = set(_[-1] for _ in self._data_map.values())
    progbar = Progbar(len(files), title='Archiving:')
    maxlen = max(len(os.path.basename(i)) for i in files)
    for i, f in enumerate(files):
        # store each file under its basename only
        zfile.write(f, os.path.basename(f))
        progbar.title = ('Archiving: %-' + str(maxlen) + 's') % os.path.basename(f)
        progbar.update(i + 1)
    zfile.close()
    return path
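# Example (sketch; `ds` is a loaded Dataset instance, hypothetical):
#   >>> path = ds.archive()  # writes every backing file into ds.archive_path
#   >>> print(path)          # the resulting .zip, loadable via _load_archive
# Note that only the basename of each file is stored, so two data files with
# the same basename in different folders would collide inside the zip.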
def predict_proba(self, *args):
    self._auto_create_inputs(args)
    self._create_function()
    n = 0
    nb_samples = args[0].shape[0]
    batch_size = self._batch_size
    prediction = []
    prog = Progbar(target=nb_samples, title='Predicting')
    while n < nb_samples:
        end = min(n + batch_size, nb_samples)
        x = [i[n:end] for i in args]
        x = self._functions['pred'](*x)
        # min-max rescale each row to [0, 1] ...
        _min = np.min(x, axis=-1)[:, None]
        _max = np.max(x, axis=-1)[:, None]
        x = (x - _min) / (_max - _min)
        # ... then normalize so each row sums to 1
        x = x / x.sum(-1)[:, None]
        prediction.append(x)
        n = end
        prog.update(n)
    return np.concatenate(prediction, axis=0)
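# Example (sketch; `model` and `X` are hypothetical):
#   >>> proba = model.predict_proba(X)   # shape: (nb_samples, nb_classes)
#   >>> np.allclose(proba.sum(-1), 1.)   # rows sum to 1
# The min-max rescaling maps raw scores into [0, 1] before normalizing, so
# the output behaves like a distribution even for unbounded predictions
# (beware: a constant row yields 0/0, i.e. NaN, in the rescaling step).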
def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
    """
    Parameters
    ----------
    mode : 'binary', 'tfidf', 'count', 'freq', 'seq'
        'binary': 0/1 matrix marking which tokens appear in each document.
        'tfidf': tf-idf weighted document-term matrix.
        'count': raw token counts per document.
        'freq': token counts normalized by document length.
        'seq': sequences of token indices, padded/truncated to `maxlen`.
    token_not_found : 'ignore', 'raise', a token string, or an integer
        how to handle tokens missing from the dictionary: skip them,
        raise a RuntimeError, or substitute a given token (specified
        by string or by index).
    """
    # ====== check arguments ====== #
    texts = self._validate_texts(texts)
    # ====== check mode ====== #
    mode = str(mode)
    if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
        raise ValueError('The "mode" argument must be: "seq", "binary", '
                         '"count", "freq", or "tfidf".')
    # ====== check token_not_found ====== #
    if not isinstance(token_not_found, Number) and \
            not is_string(token_not_found):
        raise ValueError('token_not_found can be: "ignore", "raise"'
                         ', an integer of token index, or a string '
                         'representing a token.')
    if isinstance(token_not_found, Number):
        token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
        # a token string: substitute by its index in the dictionary
        token_not_found = int(self.dictionary[token_not_found])
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== Initialize variables ====== #
    dictionary = self.dictionary
    results = []
    # ====== preprocess arguments ====== #
    if is_string(end_document):
        end_document = int(dictionary[end_document])
    elif isinstance(end_document, Number):
        end_document = int(end_document)
    # ====== processing ====== #
    if hasattr(texts, '__len__'):
        target_len = len(texts)
        auto_adjust_len = False
    else:
        # arbitrary initial target, adjusted on the fly below
        target_len = 1208
        auto_adjust_len = True
    prog = Progbar(target=target_len)
    for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
        vec = []
        for x in doc:
            idx = dictionary.get(x, -1)
            # found the token in the dictionary
            if idx >= 0:
                vec.append(idx)
            # token not found in the dictionary
            elif token_not_found == 'ignore':
                continue
            elif token_not_found == 'raise':
                raise RuntimeError(
                    'Cannot find token: "%s" in dictionary' % x)
            elif isinstance(token_not_found, int):
                vec.append(token_not_found)
        # append end-of-document token
        if end_document is not None:
            vec.append(end_document)
        # add the final results
        results.append(vec)
        # print progress
        if self.print_progress:
            prog.title = "[Transforming] %d docs" % nb_docs
            prog.add(1)
            if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # end the process
    if self.print_progress and auto_adjust_len:
        prog.target = nb_docs
        prog.update(nb_docs)
    # ====== pad the sequence ====== #
    # just transform into sequences of token indices
    if mode == 'seq':
        maxlen = self.longest_document_length if maxlen is None \
            else int(maxlen)
        results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                                padding=padding, truncating=truncating,
                                value=value)
    # transform into document-term matrix
    else:
        X = np.zeros(shape=(len(results), self.nb_words))
        for i, seq in enumerate(results):
            if mode == 'binary':
                X[i, seq] = 1
            elif mode == 'freq':
                length = len(seq)
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n / float(length)
            elif mode == 'count':
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n
            elif mode == 'tfidf':
                count = freqcount(seq)
                for tok, n in count.items():
                    tf = 1 + np.log(n)
                    docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
                    idf = np.log(1. + self.nb_docs / (1. + docs_freq))
                    X[i, tok] = tf * idf
        results = X
    return results
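# Example (sketch; `tk` is a fitted Tokenizer and `docs` a list of unicode
# strings, both hypothetical):
#   >>> seq = tk.transform(docs, mode='seq', maxlen=20, padding='post')
#   >>> bow = tk.transform(docs, mode='count')   # (nb_docs, nb_words) matrix
#   >>> tfidf = tk.transform(docs, mode='tfidf')
# The tf-idf weights follow the code above: tf = 1 + log(n) and
# idf = log(1 + nb_docs / (1 + docs_freq)) for each token in a document.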
class ProgressMonitor(Callback):

    ''' Print a formatted progress bar while a training Task is running.

    Parameters
    ----------
    name : str
        name of the task to monitor; this callback only reacts to events
        whose `event_name` matches this name.
    format : str
        format for the results, using the new Python style placeholders
        (e.g. {0}, {1}, {:.4f}, ...) and not the old %-style (%s, %d, ...).
    tracking : list or dict
        list of [(index, postprocessing_func)] or a dictionary, tracking
        information at the given index of the return value during
        `batch_end`, then postprocessing and printing it in `epoch_end`.

    Example
    -------
    >>> t = training.Task(dataset=ds, batch_size=512)
    >>> t.set_callback(training.ProgressMonitor(name='Test',
    ...     format='Result: {:.4f}',
    ...     tracking={1: lambda x: sum(x)}))
    >>> t.run()
    # Result: 52751.29 98/98 [=======================================] - 0s

    Note
    ----
    This callback requires `samples_size` to be specified in the **kwargs
    of the record.
    '''

    def __init__(self, name, format='', tracking=[]):
        super(ProgressMonitor, self).__init__()
        self.name = name
        self._history = []
        self._prog = Progbar(100, title='')
        # ====== format ====== #
        # count how many placeholders the format string expects
        self._format_results = 0
        for i in _PLACEHOLDER:
            self._format_results += len(i.findall(format))
        self._format = format
        # ====== one-time tracking at epoch_end ====== #
        if isinstance(tracking, dict):
            tracking = tracking.items()
        self.tracking = [(int(i), j) for i, j in tracking if callable(j)]
        self._tracking_history = defaultdict(list)

    @property
    def _saveable_variables(self):
        return {'_format': self._format,
                '_format_results': self._format_results,
                '_history': [],
                'tracking': self.tracking,
                '_tracking_history': defaultdict(list),
                '_prog': Progbar(100, title=''),
                'name': self.name}

    def epoch_start(self):
        # reset the start time of the progress bar
        if self.name == self.event_name:
            self._prog.start = time.time()

    def batch_end(self):
        # do nothing if this is not the monitored task
        if self.name != self.event_name or 'samples_size' not in self:
            return
        samples_size = self['samples_size']
        # ====== title ====== #
        r = self.results if isinstance(self.results, (tuple, list)) \
            else (self.results,)
        # unwrap 0-d numpy arrays into Python scalars
        r = [i.tolist() if isinstance(i, np.ndarray) and i.ndim == 0 else i
             for i in r]
        # ====== tracking ====== #
        for i, j in self.tracking:
            self._tracking_history[i].append(r[i])
        r = r[:self._format_results]
        self._history.append(r)
        title = (self._format.format(*r)
                 if self._format_results else self._format)
        # title
        self._prog.title = 'Name:%-8s,Epoch:%2d,' % \
            (self.name[:8], self.nb_epoch) + title
        # progress (float division so it also works under Python 2)
        n = round(((self.nb_samples % samples_size) / float(samples_size)) * 100)
        self._prog.update(min(int(n), 99))

    def epoch_end(self):
        # do nothing if this is not the monitored task
        if self.name != self.event_name:
            return
        # risky move: get the mean of all results
        if self._format_results:
            r = np.mean(self._history, axis=0).tolist()
            title = self._format.format(*r)
        else:
            title = self._format
        # reset
        self._history = []
        # title
        self._prog.title = 'Name:%-8s,Epoch:%2d,' % (
            self.event_name, self.nb_epoch) + title
        # always 100% at the end of epoch
        self._prog.target = 100
        self._prog.update(100)
        # tracking
        for i, f in self.tracking:
            r = self._tracking_history[i]
            r = f(r)
            print('Tracking name-"%s" at location-%d:' % (self.name, i))
            print(r)
        self._tracking_history = defaultdict(list)