Пример #1
0
 def fit_once(fit_files):
     """Fit a linear-Gaussian length model y = a*x + b + N(mu, sigma^2).

     Reads two parallel files (source, target), transforms each sentence
     length with LinearGaussain.trans_len (+1 for the <eos> token), fits a
     linear regression target-length ~ source-length, then estimates the
     mean/stddev of the residuals.

     :param fit_files: pair of file paths (source, target), opened via utils.zopen
     :return: tuple (a, b, sigma, mu) of floats
     """
     with utils.Timer(tag="Fit-length-once", print_date=True):
         # 1. collect transformed lengths from the two parallel files
         # todo(warn): plus one for the <eos> tokens
         with utils.zopen(fit_files[0]) as f0, utils.zopen(fit_files[1]) as f1:
             x = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f0]
             y = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f1]
         utils.zcheck_matched_length(x, y, _forced=True)
         ll = len(x)
         x1, y1 = np.array(x, dtype=np.float32).reshape((-1,1)), np.array(y, dtype=np.float32)
         # 2. fit linear model; on failure fall back to the identity mapping
         try:    # todo(warn)
             regr = linear_model.LinearRegression()
             regr.fit(x1, y1)
             a, b = float(regr.coef_[0]), float(regr.intercept_)
         except Exception:
             # narrow except: do not swallow SystemExit/KeyboardInterrupt
             utils.zlog("Cannot linear-regression, skip that.")
             a, b = 1., 0.
         # 3. fit residual statistics: mu = mean error, sigma = its stddev
         x1 = x1.reshape((-1,))
         errors = a*x1+b - y1
         mu = np.mean(errors)
         sigma = np.sqrt(((errors - mu)**2).mean())
         ret = (a, b, sigma, mu)
         del x, y, x1, y1    # release the large arrays before returning
         utils.zlog("Fitting Length with %s sentences and get %s." % (ll, ret), func="score")
     return ret
Пример #2
0
 def __init__(self, name, mm=None):
     """Initialize a named training-statistics recorder.

     :param name: label for this recorder
     :param mm: optional model object whose stats are cleared on reset
     """
     self.name = name
     self._mm = mm
     # per-key loss accumulator; missing keys start at 0.0
     self.loss = defaultdict(float)
     self.updates = 0
     # tiny positive seeds so averages never divide by zero
     self.sents = 1e-6
     self.words = 1e-6
     self.timer = utils.Timer("")
Пример #3
0
 def reset(self):
     """Reset accumulated statistics and restart the timer.

     Also clears the attached model's own statistics when one was given.
     """
     # fix: original had a duplicated assignment (self.loss = self.loss = ...)
     self.loss = defaultdict(float)
     # NOTE(review): __init__ seeds sents/words with 1e-6 but reset uses 1e-5 —
     # confirm the difference is intentional (kept as-is to preserve behavior)
     self.sents = 1e-5
     self.words = 1e-5
     self.updates = 0
     self.timer = utils.Timer("")
     #
     if self._mm is not None:
         self._mm.stat_clear()
Пример #4
0
 def _validate_len(self, dev_iter):
     """Score length prediction on the dev set.

     Accumulates the squared error between predicted and gold target
     lengths over all batches and returns the negative mean squared error
     (higher is better, for use as a validation score).
     """
     n_sents = 0
     total_sq_err = 0.
     with utils.Timer(tag="VALID-LEN", print_date=True) as et:
         utils.zlog("With lg as %s." % (self._mm.lg.obtain_params(), ))
         for batch in dev_iter.arrange_batches():
             # gold lengths of the target side (index 1) of each instance
             gold_lens = np.asarray([len(inst[1]) for inst in batch])
             n_sents += len(gold_lens)
             Model.new_graph()
             self._mm.refresh(False)
             predicted = self._mm.predict_length(batch)
             total_sq_err += np.sum((predicted - gold_lens)**2)
     return -total_sq_err / n_sents
Пример #5
0
def main():
    """Entry point for decoding/testing: batch-decode a test set, or
    interactively translate lines from stdin when opts["loop"] is set."""
    # init
    opts = mt_args.init("test")
    looping = opts["loop"]
    # 1. data: vocabularies for source and target sides
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
    # -- here usually no need for test[1], but for convenience ...
    if not looping:
        # one source dict plus one target dict per extra test file
        dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
        test_iter = get_arranger(opts["test"], dicts, multis=False, shuffling_corpus=False, shuflling_buckets=False, sort_prior=[0], batch_size=opts["test_batch_size"], maxibatch_size=-1, max_len=utils.Constants.MAX_V, min_len=0, one_len=opts["max_len"]+1, shuffling0=False)
    # 2. model: load every model listed; fall back to a fresh one if none
    mm = []
    for mn in opts["models"]:
        x = s2sModel(opts, source_dict, target_dict, None)     # rebuild from opts, thus use the same opts when testing
        x.load(mn)
        mm.append(x)
    if len(mm) == 0:
        utils.zlog("No models specified, must be testing mode?", func="warn")
        mm.append(s2sModel(opts, source_dict, target_dict, None))      # no loading, only for testing
    # 3. decode
    if not looping:
        # batch mode: decode the whole test set, then evaluate against gold
        utils.zlog("=== Start to decode ===", func="info")
        with utils.Timer(tag="Decoding", print_date=True):
            mt_decode(opts["decode_way"], test_iter, mm, target_dict, opts, opts["output"])
        utils.zlog("=== End decoding, write to %s ===" % opts["output"], func="info")
        # todo(warn) forward-compatible evaluation
        if len(opts["test"]) > 1:
            gold = opts["test"][1]
        else:
            gold = opts["gold"][0]
        mt_eval.evaluate(opts["output"], gold, opts["eval_metric"])
    else:
        # interactive mode: read one source sentence per line until EOF
        ot = Outputter(opts)
        while True:
            utils.zlog("Enter the src to translate:")
            line = sys.stdin.readline()
            if len(line)==0:
                # EOF (empty read) terminates the loop
                break
            # prepare one instance from the raw line
            one_words = line.strip().split()
            one_idxes = Vocab.w2i(source_dict, one_words, add_eos=True, use_factor=False)
            one_inst = TextInstance([one_words], [one_idxes])
            rs = mt_decode(opts["decode_way"], [one_inst], mm, target_dict, opts, opts["output"])
            utils.zlog(ot.format(rs[0], target_dict, False, False))
Пример #6
0
def main():
    """Entry point for reranking mode: rescore nbest hypotheses with the
    loaded models, or (when no model loads) only analyse the nbest lists
    against the gold references."""
    # init
    opts = mt_args.init("rerank")
    # special readings from args for re-ranking mode
    # only accept spaced (multi-mode) nbest files for target & non-multi for golds
    # 1. data (only accepting nbest files)
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(
        opts["dicts"][1])
    dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
    test_iter = get_arranger_simple(opts["test"],
                                    dicts,
                                    multis=[False] +
                                    [True for _ in opts["test"][1:]],
                                    batch_size=opts["test_batch_size"])
    gold_iter = get_arranger_simple(opts["gold"],
                                    [target_dict for _ in opts["gold"]],
                                    multis=False,
                                    batch_size=opts["test_batch_size"])
    utils.zcheck_matched_length(test_iter, gold_iter)
    # 2. model: best-effort loading; a failed load is logged, not fatal
    mm = []
    try:
        for mn in opts["models"]:
            x = mt_mt.s2sModel(
                opts, source_dict, target_dict,
                None)  # rebuild from opts, thus use the same opts when testing
            try:
                x.load(mn)
            except Exception:
                # narrow except: keep best-effort behavior but don't swallow
                # SystemExit/KeyboardInterrupt
                utils.zlog("Load model error %s!" % mn, func="warn")
            mm.append(x)
    except Exception as e:
        # fix: was a bare `except: pass` that hid every failure silently;
        # still best-effort (fall through to analysis mode), but log it
        utils.zlog("Model building error %s!" % e, func="warn")
    # 3. analysis (no usable models): collect srcs/golds/preds and analyse
    if len(mm) == 0:
        utils.zlog("No models specified, only analysing!", func="warn")
        num_test = len(opts["test"]) - 1
        golds = []
        srcs = []
        preds = [[] for _ in range(num_test)]
        for one in gold_iter.arrange_batches():
            golds += one
        for one in test_iter.arrange_batches():
            for zz in one:
                zzs = zz.extract()
                srcs.append(zzs[0])
                # zzs[0] is the source; zzs[1:] are the nbest lists per test file
                for i in range(num_test):
                    preds[i].append(zzs[i + 1])
        Analyzer.analyse(srcs, golds, preds, kbests=opts["rr_analysis_kbests"])
    # 4. rerank with the loaded models, then evaluate the output
    else:
        utils.zlog("=== Start to rerank ===", func="info")
        with utils.Timer(tag="Reranking", print_date=True):
            mt_decode(None,
                      test_iter,
                      mm,
                      target_dict,
                      opts,
                      opts["output"],
                      gold_iter=gold_iter)
        utils.zlog("=== End reranking, write to %s ===" % opts["output"],
                   func="info")
        mt_eval.evaluate(opts["output"], opts["gold"][0], opts["eval_metric"])