def compute_splits(self, data, plot=True):
    """Compute the bucket split points for the values in ``data``.

    The method
      1. counts how often each value of ``data`` occurs and stores the
         counts in ``self._len2cnt`` and the sorted distinct values in
         ``self._lengths``;
      2. seeds ``self._splits`` by cutting the sorted multiset of values
         into ``self.k`` nearly equal chunks and taking each chunk's maximum;
      3. nudges the splits downward and then upward (see the comments below);
      4. calls ``self.reindex()``, then ``self.recenter()`` repeatedly until
         the splits compare equal to their previous snapshot, and finally
         ``self.reindex()`` again.

    ``self.len2cnt``, ``self.lengths``, ``self[...]``, ``len(self)``,
    ``self != ...``, ``reindex``, ``recenter`` and ``size`` are defined
    outside this block; presumably they are thin views over
    ``self._len2cnt`` / ``self._lengths`` / ``self._splits`` -- confirm
    against the class definition.

    Args:
        data: iterable of hashable, orderable values (presumably sequence
            lengths, judging by the error message -- confirm with callers).
        plot: not used by this method; kept so existing callers keep working.

    Returns:
        ``self._splits`` as it stands after the final ``reindex()``.

    Raises:
        ValueError: if ``data`` holds fewer distinct values than ``self.k``.
    """
    len2cnt = Counter(data)

    # Refuse to build more buckets than there are distinct values.
    if len(len2cnt) < self.k:
        raise ValueError('Trying to sort %d lengths into %d buckets' % (len(len2cnt), self.k))

    self._len2cnt = len2cnt
    self._lengths = sorted(self.len2cnt)

    # Seed the splits evenly: every value repeated by its count, sorted,
    # cut into k chunks; each chunk's largest value is the initial split.
    expanded = sorted(len2cnt.elements())
    self._splits = [np.max(chunk) for chunk in np.array_split(expanded, self.k)]

    # Downward pass, from the last split to index 1: lower a split while it
    # is above the smallest value and is either not greater than its left
    # neighbour or not an observed value.
    # NOTE(review): lowering self[idx] cannot make it exceed self[idx - 1],
    # so an out-of-order split drops to the smallest value here and is
    # raised again by the upward pass. Behaviour is kept as found; confirm
    # whether self[idx - 1] was the intended target.
    for idx in range(len(self) - 1, 0, -1):
        while self[idx] > self.lengths[0] and (
                self[idx] <= self[idx - 1] or self[idx] not in self.len2cnt):
            self[idx] -= 1

    # Upward pass over the interior splits (the last split is left alone):
    # raise a split while it is below the largest value and is either not
    # greater than its left neighbour or not an observed value.
    for idx in range(1, len(self) - 1):
        while self[idx] < self.lengths[-1] and (
                self[idx] <= self[idx - 1] or self[idx] not in self.len2cnt):
            self[idx] += 1

    self.reindex()

    # Re-center until a pass leaves the splits unchanged.
    old_splits = None
    if self.verbose:
        print(color_pattern('Initial # of tokens in buckets:', str(self.size()), 'bright_red'), file=sys.stderr)
    while self != old_splits:
        old_splits = list(self)
        self.recenter()
    if self.verbose:
        print(color_pattern('Final # of tokens in buckets:', str(self.size()), 'bright_white'), file=sys.stderr)

    self.reindex()
    return self._splits
def print_accuracy(self, accumulators, time, prefix='Train'):
    """Write a one-line, colorized metrics summary to stderr.

    ``accumulators`` and ``time`` are handed to
    ``self.process_accumulators``; the mapping it returns must provide the
    keys ``Loss``, ``TS``, ``SS`` and ``Seq_rate`` with values that accept
    float format specs, because they are substituted into the line
    template below. ``prefix`` fills the leading ``{0}`` slot of the line.

    Returns None.
    """
    metrics = self.process_accumulators(accumulators, time=time)

    # (label, format pattern, color) for each field, in display order.
    field_specs = (
        ('Loss:', '{Loss:7.3f}', 'bright_red'),
        ('TS:', '{TS:5.2f}%', 'bright_cyan'),
        ('SS:', '{SS:5.2f}%', 'bright_green'),
        ('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'),
    )
    fields = [color_pattern(label, pattern, color)
              for label, pattern, color in field_specs]

    # The template is formatted only after colorizing, so the placeholders
    # pass through color_pattern/ctext as literal text first.
    template = ctext('{0} ', 'bold') + ' | '.join(fields)
    print(template.format(prefix, **metrics), file=sys.stderr)