def fit_once(fit_files):
    # First fit a simple model: y ~ Gaussian(a*x + b, sigma); xenc is not included here since that would be too large.
    with utils.Timer(tag="Fit-length-once", print_date=True):
        # 1. collect lengths
        with utils.zopen(fit_files[0]) as f0, utils.zopen(fit_files[1]) as f1:
            # todo(warn): plus one for the <eos> tokens
            x = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f0]
            y = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f1]
        utils.zcheck_matched_length(x, y, _forced=True)
        ll = len(x)
        x1, y1 = np.array(x, dtype=np.float32).reshape((-1, 1)), np.array(y, dtype=np.float32)
        # 2. fit the linear model
        try:
            # todo(warn)
            regr = linear_model.LinearRegression()
            regr.fit(x1, y1)
            a, b = float(regr.coef_[0]), float(regr.intercept_)
        except Exception:
            utils.zlog("Linear regression failed, skipping it.")
            a, b = 1., 0.
        # 3. fit sigma from the residuals
        x1 = x1.reshape((-1,))
        errors = a*x1 + b - y1
        mu = np.mean(errors)
        sigma = np.sqrt(((errors - mu)**2).mean())
        ret = (a, b, sigma, mu)
        del x, y, x1, y1
        utils.zlog("Fitted length with %s sentences and got %s." % (ll, ret), func="score")
        return ret
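# Illustrative sketch (not part of the original code): the same two-step fit as above on toy
# data, using only numpy + scikit-learn. The name demo_fit_length is hypothetical; it shows a
# least-squares fit of y ~ a*x + b followed by estimating a Gaussian sigma over the residuals.
def demo_fit_length():
    import numpy as np
    from sklearn import linear_model
    rng = np.random.RandomState(0)
    src_lens = rng.randint(5, 50, size=1000).astype(np.float32)       # toy source lengths
    tgt_lens = 1.1 * src_lens + 2.0 + rng.normal(0., 2., size=1000)   # noisy toy target lengths
    regr = linear_model.LinearRegression()
    regr.fit(src_lens.reshape(-1, 1), tgt_lens)
    a, b = float(regr.coef_[0]), float(regr.intercept_)
    residuals = a * src_lens + b - tgt_lens
    mu = residuals.mean()
    sigma = np.sqrt(((residuals - mu) ** 2).mean())
    return a, b, sigma, mu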
def __init__(self, name, mm=None):
    self.name = name
    self.loss = defaultdict(float)
    # tiny epsilons so per-sentence/per-word averages are safe before anything is recorded
    self.sents = 1e-6
    self.words = 1e-6
    self.updates = 0
    self.timer = utils.Timer("")
    self._mm = mm
def reset(self):
    self.loss = defaultdict(float)
    # reset to the same epsilons as __init__
    self.sents = 1e-6
    self.words = 1e-6
    self.updates = 0
    self.timer = utils.Timer("")
    # if self._mm is not None: self._mm.stat_clear()
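# Illustrative sketch (not part of the original code): a minimal stand-alone version of the
# stat-tracking pattern above. The class name DemoTrainStat and the record() helper are
# hypothetical; the point is the epsilon denominators that keep per-word averages well defined
# even before the first update.
from collections import defaultdict

class DemoTrainStat(object):
    def __init__(self, name):
        self.name = name
        self.loss = defaultdict(float)
        self.sents = 1e-6
        self.words = 1e-6
        self.updates = 0

    def record(self, loss_map, n_sents, n_words):
        for k, v in loss_map.items():
            self.loss[k] += v
        self.sents += n_sents
        self.words += n_words
        self.updates += 1

    def avg_loss_per_word(self, key):
        return self.loss[key] / self.words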
def _validate_len(self, dev_iter):
    # negated mean squared error of the predicted lengths (higher is better)
    count = 0
    loss = 0.
    with utils.Timer(tag="VALID-LEN", print_date=True) as et:
        utils.zlog("With lg as %s." % (self._mm.lg.obtain_params(),))
        for insts in dev_iter.arrange_batches():
            ys = [i[1] for i in insts]
            ylens = np.asarray([len(_y) for _y in ys])
            count += len(ys)
            Model.new_graph()
            self._mm.refresh(False)
            preds = self._mm.predict_length(insts)
            loss += np.sum((preds - ylens)**2)
    return -loss / count
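# Illustrative sketch (not part of the original code): the validation score above is a negated
# mean squared error over predicted lengths, so that larger values mean better length prediction
# when selecting models. demo_neg_mse_len is a hypothetical name; preds/golds stand in for
# predict_length outputs and reference lengths.
def demo_neg_mse_len(preds, golds):
    import numpy as np
    preds = np.asarray(preds, dtype=np.float32)
    golds = np.asarray(golds, dtype=np.float32)
    return -float(np.mean((preds - golds) ** 2))

# e.g. demo_neg_mse_len([10, 12, 8], [11, 12, 7]) -> roughly -(1 + 0 + 1) / 3 ~= -0.667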
def main():
    # init
    opts = mt_args.init("test")
    looping = opts["loop"]
    # 1. data
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
    # -- usually no need for test[1] here, but keep it for convenience ...
    if not looping:
        dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
        test_iter = get_arranger(opts["test"], dicts, multis=False, shuffling_corpus=False, shuflling_buckets=False,
                                 sort_prior=[0], batch_size=opts["test_batch_size"], maxibatch_size=-1,
                                 max_len=utils.Constants.MAX_V, min_len=0, one_len=opts["max_len"]+1, shuffling0=False)
    # 2. model
    mm = []
    for mn in opts["models"]:
        x = s2sModel(opts, source_dict, target_dict, None)  # rebuild from opts, thus use the same opts when testing
        x.load(mn)
        mm.append(x)
    if len(mm) == 0:
        utils.zlog("No models specified, must be testing mode?", func="warn")
        mm.append(s2sModel(opts, source_dict, target_dict, None))  # no loading, only for testing
    # 3. decode
    if not looping:
        utils.zlog("=== Start to decode ===", func="info")
        with utils.Timer(tag="Decoding", print_date=True):
            mt_decode(opts["decode_way"], test_iter, mm, target_dict, opts, opts["output"])
        utils.zlog("=== End decoding, write to %s ===" % opts["output"], func="info")
        # todo(warn) forward-compatible evaluation
        if len(opts["test"]) > 1:
            gold = opts["test"][1]
        else:
            gold = opts["gold"][0]
        mt_eval.evaluate(opts["output"], gold, opts["eval_metric"])
    else:
        ot = Outputter(opts)
        while True:
            utils.zlog("Enter the src to translate:")
            line = sys.stdin.readline()
            if len(line) == 0:
                break
            # prepare one instance
            one_words = line.strip().split()
            one_idxes = Vocab.w2i(source_dict, one_words, add_eos=True, use_factor=False)
            one_inst = TextInstance([one_words], [one_idxes])
            rs = mt_decode(opts["decode_way"], [one_inst], mm, target_dict, opts, opts["output"])
            utils.zlog(ot.format(rs[0], target_dict, False, False))
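# Illustrative sketch (not part of the original code): several loaded models are handed to the
# decoder above; one common way to combine such an ensemble is to average the per-step
# distributions before picking the next token. This is a generic sketch with a hypothetical
# name (demo_ensemble_step), not the actual mt_decode implementation.
def demo_ensemble_step(step_probs):
    # step_probs: list of numpy arrays, one per model, each of shape (vocab_size,)
    import numpy as np
    avg = np.mean(np.stack(step_probs, axis=0), axis=0)   # average the per-model distributions
    return int(np.argmax(avg))                            # greedy pick over the averaged scores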
def main():
    # init
    opts = mt_args.init("rerank")
    # special readings from args for re-ranking mode:
    # only accept spaced (multi-mode) nbest files for the targets & non-multi files for the golds
    # 1. data (only accepting nbest files)
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
    dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
    test_iter = get_arranger_simple(opts["test"], dicts, multis=[False] + [True for _ in opts["test"][1:]],
                                    batch_size=opts["test_batch_size"])
    gold_iter = get_arranger_simple(opts["gold"], [target_dict for _ in opts["gold"]], multis=False,
                                    batch_size=opts["test_batch_size"])
    utils.zcheck_matched_length(test_iter, gold_iter)
    # 2. model
    mm = []
    try:
        for mn in opts["models"]:
            x = mt_mt.s2sModel(opts, source_dict, target_dict, None)  # rebuild from opts, thus use the same opts when testing
            try:
                x.load(mn)
            except Exception:
                utils.zlog("Load model error %s!" % mn, func="warn")
            mm.append(x)
    except Exception:
        pass
    # 3. analysis
    if len(mm) == 0:
        utils.zlog("No models specified, only analysing!", func="warn")
        num_test = len(opts["test"]) - 1
        golds = []
        srcs = []
        preds = [[] for _ in range(num_test)]
        for one in gold_iter.arrange_batches():
            golds += one
        for one in test_iter.arrange_batches():
            for zz in one:
                zzs = zz.extract()
                srcs.append(zzs[0])
                for i in range(num_test):
                    preds[i].append(zzs[i+1])
        Analyzer.analyse(srcs, golds, preds, kbests=opts["rr_analysis_kbests"])
    # 4. rerank
    else:
        utils.zlog("=== Start to rerank ===", func="info")
        with utils.Timer(tag="Reranking", print_date=True):
            mt_decode(None, test_iter, mm, target_dict, opts, opts["output"], gold_iter=gold_iter)
        utils.zlog("=== End reranking, write to %s ===" % opts["output"], func="info")
        mt_eval.evaluate(opts["output"], opts["gold"][0], opts["eval_metric"])
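# Illustrative sketch (not part of the original code): conceptually, re-ranking an n-best list
# means re-scoring every hypothesis with the loaded model(s) and keeping the best one per source
# sentence; the original code delegates this to mt_decode with gold_iter supplied. demo_rerank
# and score_fns are hypothetical names used only for this sketch.
def demo_rerank(nbest, score_fns):
    # nbest: list of hypothesis token lists for one source sentence
    # score_fns: list of callables, each mapping a hypothesis to a (log-)score
    def total_score(hyp):
        return sum(fn(hyp) for fn in score_fns)
    return max(nbest, key=total_score)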