def main(input_file: str, output_file: str, checking_file: str, keep_rate: float):
    keep_rate = float(keep_rate)
    _gen = Random.get_np_generator(12345)
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    if checking_file:
        checking_sents = list(yield_sents(ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        # collect keys
        hit_keys = set()
        for one_check_sent in checking_sents:
            tok_key = ''.join(one_check_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            hit_keys.add(tok_key)
        # filter
        filtered_sents = []
        for one_input_sent in input_sents:
            tok_key = ''.join(one_input_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            if tok_key not in hit_keys:
                filtered_sents.append(one_input_sent)
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample
    if keep_rate < 1.:
        sample_sents = [s for r, s in zip(rstream, filtered_sents) if r < keep_rate]
    elif keep_rate > 10:
        sample_sents = [z for z in filtered_sents]
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    if os.path.exists(output_file):
        assert False, f"File exists: {output_file}, delete it first!"
    if output_file:
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(f"Read {input_file}, check {checking_file}, output {output_file}, stat:")
    OtherHelper.printd(stat)
def _proj_grads(self, flattened_grads):
    _shuffle = self.conf.shuffle_losses
    if _shuffle:
        _gen = Random.get_generator('loss')
    _rates = self.conflicting_change_rates
    # --
    all_g = []
    for i, cur_g in enumerate(flattened_grads):
        new_g = cur_g.clone()
        other_idxes = list(range(len(flattened_grads)))
        if _shuffle:
            _gen.shuffle(other_idxes)
        for j in other_idxes:
            other_g = flattened_grads[j]
            rate = _rates[i][j]
            if rate > 0.:
                _dot = (new_g * other_g).sum()
                _other_s2 = (other_g * other_g).sum()
                _offset = (_dot / _other_s2) * other_g
                new_g.sub_(rate * ((_dot < 0).float() * _offset))
                # -- just checking!
                if BK.get_value(_dot).item() < 0:
                    zlog(f"Here! _dot<0 as _dot={_dot}, _off={_dot / _other_s2}")
                # --
        all_g.append(new_g)
    ret = BK.stack(all_g, 0).sum(0)  # [*]
    return ret
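# Hedged illustration (not part of the codebase): a minimal standalone numpy sketch of the
# projection step used in _proj_grads above (PCGrad-style conflict removal); the helper name
# `proj_conflicting` is made up for this example.
import numpy as np

def proj_conflicting(g_i, g_j, rate=1.0):
    # If g_i conflicts with g_j (negative dot product), subtract (rate times) the projection
    # of g_i onto g_j; otherwise leave g_i unchanged.
    dot = float(np.dot(g_i, g_j))
    if dot >= 0.:
        return g_i
    return g_i - rate * (dot / float(np.dot(g_j, g_j))) * g_j

# e.g. [1, 0] vs [-1, 1]: dot = -1 (conflict); after projection the dot becomes 0.
# print(proj_conflicting(np.array([1., 0.]), np.array([-1., 1.])))  # -> [0.5 0.5]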
def yield_train_yielder(self):
    all_yielders = []
    all_svs = []
    all_inner_rates = []
    for group_name, group_datasets in self.datasets["train"].items():
        all_yielders.append([z.yield_batches() for z in group_datasets])
        one_inner_rates = np.asarray([(len(z.items) ** z.conf.group_sample_alpha) for z in group_datasets])
        all_inner_rates.append(one_inner_rates / one_inner_rates.sum())  # inner sample
        all_svs.append(self.train_sample_svs[group_name])
    _gen = Random.get_generator('stream')
    _n_groups = len(all_svs)
    while True:
        # choose the outer
        if len(all_svs) == 1:
            cur_gidx = 0  # simply 1
        else:
            pvals = np.asarray([z.value for z in all_svs])
            pvals = pvals / pvals.sum()
            cur_gidx = _gen.choice(_n_groups, p=pvals)  # choose group
        # choose the inner
        pvals2 = all_inner_rates[cur_gidx]
        if len(pvals2) == 1:
            cur_iidx = 0
        else:
            cur_iidx = _gen.choice(len(pvals2), p=pvals2)  # choose inner one
        # choose that one!
        chosen_yielder = all_yielders[cur_gidx][cur_iidx]
        yield chosen_yielder
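# Hedged illustration (not part of the codebase): how the `group_sample_alpha` exponent above
# turns dataset sizes into inner sampling probabilities; alpha=1 keeps size-proportional
# sampling, alpha=0 samples the datasets uniformly.
import numpy as np

def inner_sample_probs(sizes, alpha):
    rates = np.asarray(sizes, dtype=np.float64) ** alpha
    return rates / rates.sum()

# print(inner_sample_probs([100000, 1000], 1.0))  # ~[0.99, 0.01]
# print(inner_sample_probs([100000, 1000], 0.5))  # ~[0.91, 0.09]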
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    # input
    with zopen(conf.input) as fd:
        lines = list(fd)
        if conf.skip_blank:
            lines = [z for z in lines if not str.isspace(z)]  # drop blank lines
    # shuffle?
    origin_len = len(lines)
    if conf.shuffle_times > 0 or conf.shuffle:
        _t = max(1, conf.shuffle_times)  # at least once!
        _gen = Random.get_generator('')
        for _ in range(_t):
            _gen.shuffle(lines)
    # sample?
    final_size = int(0.999 + (conf.rate * origin_len if conf.rate <= 1. else conf.rate))
    out_lines = lines[:final_size]
    # output
    if conf.output:
        with zopen(conf.output, 'w') as fd2:
            for line in out_lines:
                fd2.write(line)
    # --
    zlog(f"Sample({conf.rate}) {conf.input}=>{conf.output}: {origin_len}=>{len(out_lines)}")
def iter_arg_choices(m: List, repeat=True, shuffle=True, max_num=-1):
    _gen = Random.get_generator("tune")
    # --
    idx = 0
    # expand fully
    args_pool = None
    if not repeat:
        args_pool = [[]]
        for cur_items in m:
            new_pool = []
            for a in args_pool:
                for one_idx in range(len(cur_items)):
                    new_pool.append(a + [one_idx])
            args_pool = new_pool
        # --
        zlog("** Arrange non-repeat iter, sized %d." % len(args_pool))
        if shuffle:
            for _ in range(10):
                _gen.shuffle(args_pool)
        else:
            args_pool.reverse()  # later using pop
    while True:
        if idx == max_num:
            break
        if repeat:
            sel_idxes = [_gen.randint(len(one)) for one in m]
        else:
            if len(args_pool) > 0:
                sel_idxes = args_pool.pop()
            else:
                break
        # -----
        yield sel_idxes  # return selection idxes
        idx += 1
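# Hedged usage sketch (assumed, not from the codebase): iterating hyper-parameter combinations
# with iter_arg_choices; each yielded item is a list of indexes into the per-argument choice lists.
# choices = [[0.1, 0.01], ["adam", "sgd"], [128, 256, 512]]
# for sel_idxes in iter_arg_choices(choices, repeat=False, shuffle=True, max_num=5):
#     args = [opts[i] for opts, i in zip(choices, sel_idxes)]
#     print(sel_idxes, args)  # e.g. [1, 0, 2] -> [0.01, 'adam', 512]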
def __init__(self, cl_helper: CLHelper, ii: int, frames: List):
    super().__init__()
    # --
    self.frames = frames.copy()  # save a copy for shuffle
    self.cl_helper = cl_helper
    self.ii = ii
    self.r_ii = self.cl_helper.cl_rank_idx[ii]
    # --
    self._gen = Random.get_generator('stream')
    self.p = 0  # which point
    self.p_ret = 0  # how many have been returned?
def do_presample(insts: List, s: float, shuffle: bool, reverse: bool):
    assert s > 0
    if s < 1.:
        s = len(insts) * s
    s = int(s + 0.99999)
    # --
    ret_idxes = list(range(len(insts)))
    if reverse:
        ret_idxes = list(reversed(ret_idxes))
    if shuffle:
        _gen = Random.get_generator('presample')
        _gen.shuffle(ret_idxes)
    ret_idxes = ret_idxes[:s]
    return [insts[z] for z in ret_idxes], ret_idxes
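# Hedged usage sketch (assumed, not from the codebase): `s` is either a fraction (s < 1: keep
# that ratio of the instances, rounded up) or an absolute count (s >= 1).
# subset, idxes = do_presample(all_insts, 0.1, shuffle=True, reverse=False)   # ~10% random subset
# head, idxes2 = do_presample(all_insts, 100, shuffle=False, reverse=False)   # first 100 items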
def yield_batches(self, stream_item, loop: bool, filter_f=None):
    conf = self.conf
    _gen = Random.get_generator('stream')
    _bucket_shuffle_times = conf.bucket_shuffle_times
    if filter_f is None:
        filter_f = lambda x: True  # no drop
    # --
    # prepare
    buckets = self._put_buckets(stream_item)
    orig_counts = [len(b) for b in buckets]
    pvals = np.asarray(orig_counts) / sum(orig_counts)  # for sample!
    arrangers = []
    for b_items in buckets:
        # first shuffle
        for _ in range(_bucket_shuffle_times):
            _gen.shuffle(b_items)
        # get arranger
        input_stream = IterStreamer(b_items, restartable=True)
        arranger = BatchArranger(
            input_stream, bsize=conf.batch_size, maxi_bsize=conf.batch_maxi_bsize,
            batch_size_f=self.batch_size_f, dump_detectors=(lambda x: not filter_f(x)),
            sorting_keyer=(lambda x: len(x)), shuffle_batches_times=_bucket_shuffle_times)
        arranger.restart()
        arrangers.append(arranger)
    # go!!
    _len_buckets = len(buckets)
    while True:
        choice = _gen.choice(_len_buckets, p=pvals)  # choose a bucket
        chosen_arranger = arrangers[choice]
        items, _eos = chosen_arranger.next_and_check()
        if _eos:
            if loop:  # simply restart it
                chosen_arranger.restart()
            else:  # clear this pval; note: pvals are not re-adjusted at every batch, but that probably does not matter!
                pvals[choice] = 0.
                _remain = pvals.sum().item()
                if _remain <= 0.:
                    break  # finished!!
                pvals = pvals / _remain
        else:
            yield items
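# Hedged illustration (not part of the codebase): the renormalization step above; once a bucket
# is exhausted in non-loop mode, its probability is zeroed and the remaining probabilities are
# rescaled to sum to 1.
import numpy as np

def drop_bucket(pvals, choice):
    pvals = np.array(pvals, dtype=np.float64)
    pvals[choice] = 0.
    remain = pvals.sum()
    return None if remain <= 0. else pvals / remain  # None means all buckets are finished

# print(drop_bucket([0.5, 0.3, 0.2], 1))  # -> [0.714..., 0.0, 0.285...]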
def __init__(self, base_streamers: List[Streamer], stop_sidx=-1, ratios: List[SupportsFloat] = None, verbose=True):
    super().__init__(base_streamers)
    # --
    if ratios is None:  # by default all 1
        ratios = [1.] * self._num_streamers
    assert self._num_streamers > 0 and self._num_streamers == len(ratios)
    self._ratios = ratios
    self._stop_sidx = stop_sidx
    self._random_sampler = Random.stream(STREAMER_RANDOM_GEN.random_sample)
    # status
    self._cur_idx = self._num_streamers - 1
    self._cur_ratio = 0.
    self._cur_counts = [0] * self._num_streamers
    self.verbose = verbose
def _my_get_params_init(conf: NIConf, shape: Union[List[int], Tuple[int]], init: Union[str, object], lookup: bool):
    # shape is a tuple of dims
    assert init in ["default", "random", "glorot", "ortho", "gaussian", "zeros"], f"Unknown init method {init}"
    poss_scale = conf.init_scale_l if lookup else conf.init_scale_nl
    if len(shape) == 1:  # set bias to 0
        return np.zeros((shape[0],))
    else:
        # get defaults
        if init == "default":
            init = conf.init_def_l if lookup else conf.init_def_nl
        _gen = Random.get_generator("param")
        # specifics
        if init == "glorot":
            if lookup:  # special for lookups
                shape_g = (shape[-1],)  # fan-out for lookup
            else:
                shape_g = shape
            w0 = _gen.random_sample(shape)  # [0,1)
            w0 = (w0 - 0.5) * (2 * (np.sqrt(3.0 * len(shape_g) / (sum(shape_g)))))
            return w0 * poss_scale
        elif init == "random":
            w0 = _gen.random_sample(shape)  # [0,1)
            w0 = (w0 - 0.5) * 2
            return w0 * poss_scale
        elif init == "gaussian":
            w0 = _randn_clip(_gen, shape, 2.)  # clip to [-2, 2]
            return w0 * poss_scale
        elif init == "ortho":
            # todo(note): always assume init square matrices
            assert len(shape) == 2 and (shape[0] % shape[1] == 0 or shape[1] % shape[0] == 0), f"Bad shape {shape} for ortho_init!"
            orig_num = shape[0] // shape[1]
            if orig_num == 0:
                num = shape[1] // shape[0]
            else:
                num = orig_num
            if num == 1:
                w0 = _ortho_weight(_gen, shape[1])
            else:
                w0 = np.concatenate([_ortho_weight(_gen, shape[1]) for _ in range(num)])
            if orig_num == 0:  # reverse it!
                w0 = np.transpose(w0)
            return w0 * poss_scale
        elif init == "zeros":
            return np.zeros(shape)
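# Hedged illustration (not part of the codebase): the "glorot" branch above draws from
# U(-limit, limit) with limit = sqrt(3 * ndim / sum(shape_g)); for a 2-d weight this reduces to
# the usual Glorot/Xavier-uniform bound sqrt(6 / (fan_in + fan_out)).
import numpy as np

def glorot_uniform(shape, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    limit = np.sqrt(3.0 * len(shape) / sum(shape))
    return rng.uniform(-limit, limit, size=shape)

# e.g. shape (512, 256): limit = sqrt(6 / 768) ~= 0.088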
def _get_grads(self, params, flatten: bool, drop_whole=0., drop_partial=0.):
    grads = [p.grad.detach().clone() if p.grad is not None else BK.zeros(p.shape) for p in params]
    if drop_whole > 0.:
        _gen = Random.get_generator('loss')
        _mask = (_gen.random(len(grads)) < drop_whole)
        grads = [(g * 0. if m else g) for g, m in zip(grads, _mask)]
    if drop_partial > 0.:
        grads = [g * (BK.rand(g.shape) < drop_partial).float() for g in grads]
    if flatten:
        return BK.concat([z.flatten() for z in grads], 0)
    else:
        return grads
def filter_embed(self, wv: 'WordVectors', init_nohit=None, scale=1.0, assert_all_hit=False):
    if init_nohit is None:  # auto decide by wv
        init_nohit = np.mean([np.std(z) for z in wv.vecs]).item()
        zlog(f"Auto decide init_nohit={init_nohit}")
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        _generator = Random.get_generator("vocab")
        # get_nohit = lambda s: (_generator.random_sample((s,)).astype(np.float32)-0.5) * (2*init_nohit)
        get_nohit = lambda s: _generator.standard_normal(s) * init_nohit
    #
    ret = []
    res = defaultdict(int)
    embed_size = wv.get_emb_size()
    # for w in self.keys():  # todo(+N): once a bug!
    for w in self.full_i2w:
        hit, norm_name, norm_w = wv.norm_until_hit(w)
        if hit:
            value = np.asarray(wv.get_vec(norm_w, norm_name=False), dtype=np.float32)
            res[norm_name] += 1
        else:
            value = get_nohit(embed_size)
            # value = np.zeros((embed_size,), dtype=np.float32)
            res["no-hit"] += 1
        ret.append(value)
    # --
    if assert_all_hit:
        assert res["no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {res['no-hit']}"
    zret = np.asarray(ret, dtype=np.float32) * scale
    zlog(f"Filter pre-trained embed {self}->{zret.shape}: {res}, no-hit is inited with {init_nohit}.")
    return zret
    c_stream = CacheStreamer(i_stream, shuffle_times=cache_shuffle_times)
    return c_stream

# especially for training
def train_prep_stream(in_stream: Streamer, tconf: TConf):
    # for training, we get all the sentences!
    assert tconf.train_stream_mode == "sent", "Currently we only support sent training!"
    sent_stream = FListWrapperStreamer(
        in_stream,
        lambda d: [x for x in yield_sents([d])
                   if len(x) <= tconf.train_max_length and len(x) >= tconf.train_min_length
                   and (len(x.events) > 0 or next(_BS_sample_stream) > tconf.train_skip_noevt_rate)])  # filter out certain sents!
    if tconf.train_stream_reshuffle_times > 0:  # reshuffle for sents
        sent_stream = ShuffleStreamer(sent_stream, shuffle_bsize=tconf.train_stream_reshuffle_bsize,
                                      shuffle_times=tconf.train_stream_reshuffle_times)
    return sent_stream

# function to get BatchArranger
_BK_gen = Random.get_generator("train")
_BS_sample_stream = Random.stream(_BK_gen.random_sample)

def batch_stream(in_stream: Streamer, tconf: TConf, training: bool):
    _sent_counter = lambda d: len(list(yield_sents([d])))
    _tok_counter = lambda d: sum(len(s) for s in yield_sents([d]))
    _frame_counter = lambda d: sum(len(s.events) for s in yield_sents([d]))
    _ftok_counter = lambda d: sum(max(1, len(s.events)) * len(s) for s in yield_sents([d]))
    batch_size_f_map = {"sent": _sent_counter, "tok": _tok_counter, "frame": _frame_counter, "ftok": _ftok_counter}
    if training:
        batch_size_f = batch_size_f_map[tconf.train_count_mode]
        b_stream = BatchArranger(in_stream, bsize=tconf.train_batch_size, maxi_bsize=tconf.train_maxibatch_size,
                                 batch_size_f=batch_size_f, dump_detectors=None, single_detectors=None,
                                 sorting_keyer=lambda x: len(x), shuffle_batches_times=tconf.train_batch_shuffle_times)
    else:
        batch_size_f = batch_size_f_map[tconf.test_count_mode]
        b_stream = BatchArranger(in_stream, bsize=tconf.test_batch_size, maxi_bsize=1, batch_size_f=batch_size_f,
def run(self):
    conf = self.conf
    last_report_uidx, last_dev_uidx = 0, 0
    # --
    if conf.valid_first:  # valid before training
        self.validate()
    # --
    _lrate_warmup_factor, _lrate_warmup_steps = self.lrate_warmup_factor, self.lrate_warmup_steps
    _skip_batch = conf.skip_batch
    _gen0 = Random.get_generator("train")
    _gen = Random.stream(_gen0.random_sample)
    # --
    _accu_checker = 0
    _accu_batch = conf.accu_batch
    # --
    # start before loop
    self.adjust_scheduled_values()
    # loop
    act_lrate = None
    while True:  # loop over and over
        _train_stream = self.get_train_stream()  # we may change train_stream!!
        # --
        if _train_stream.is_inactive():  # check to avoid restart after load_progress
            _train_stream.restart()
        insts, _eos = _train_stream.next_and_check()
        if _eos:  # end of epoch
            zlog(f"End of epoch at {self.tp.current_suffix(False)}: Current act_lrate is {act_lrate}.",
                 func="plain", timed=True)
            if conf.valid_epoch:
                last_dev_uidx = self.tp.uidx
                self.validate()
                # todo(+N): do we need to adjust sv at a finer-grained level?
                self.adjust_scheduled_values()  # adjust after validation
            if self._finished():
                break
            self.tp.update_eidx(1)
            continue
        # skip batch?
        if _skip_batch > 0 and next(_gen) < _skip_batch:
            continue
        if self.train_discard_batch_f(insts):
            continue  # discard this batch due to some specific reasons (like noevt)
        # run fb (possibly split batch)
        self.fb_batch(insts, 1. / _accu_batch)
        self.tp.update_iidx(len(insts))
        # ==
        # only update for certain accu fb runs
        _accu_checker += 1
        if _accu_checker % _accu_batch == 0:
            self.tp.update_uidx(1)
            cur_uidx = self.tp.uidx
            # get the effective lrate and update
            act_lrate = float(self.lrate.value)  # start with the lrate.value
            if cur_uidx < _lrate_warmup_steps:  # linear increase
                act_lrate *= (cur_uidx / _lrate_warmup_steps)
            else:  # decrease
                act_lrate *= _lrate_warmup_factor * (cur_uidx ** conf.lrate_decrease_alpha)
            self._run_update(act_lrate, 1.)
            # --
            # report on training process
            if conf.flag_verbose and (cur_uidx - last_report_uidx) >= conf.report_ufreq:
                zlog(f"Report at {self.tp.current_suffix(False)}: Current act_lrate is {act_lrate}.",
                     func="plain", timed=True)
                self._run_train_report()
                last_report_uidx = cur_uidx
            # valid?
            if (cur_uidx - last_dev_uidx) >= conf.valid_ufreq:
                last_dev_uidx = self.tp.uidx
                self.validate()
                # todo(+N): do we need to adjust sv at a finer-grained level?
                self.adjust_scheduled_values()  # adjust after validation
                if self._finished():
                    break
    # =====
    zlog(f"Finish training because of: {self._reach_ends()}", func="plain")
    zlog(f"zzzzzfinal: After training, the best point is: {self.tp.info_best()}.", func="report")
#
from typing import Union, Iterable, List, Callable, SupportsFloat
from msp2.utils import zfatal, Random, Constants, zlog

# =====
# basic

STREAMER_RANDOM_GEN = Random.get_generator('stream')

# basic streamer
class Streamer:
    def __init__(self):
        self.eos = None  # by default, EOS is None
        # status
        self._count = 0
        self._max_count = 0
        self._restart_times = 0
        self._active = False
        self._stack = []

    def __repr__(self):
        return f"{self.__class__.__name__}(A={self._active},R={self._restart_times},C={self._count})"

    def __iter__(self):
        self.restart()  # for convenience
        return self

    def __next__(self):
        one = self.next()