def validate_model(model, valset, refset, search_option, name, reader, option,
                   **variables):
    """Score the model on a validation set and save it when BLEU improves.

    The validation set is translated with ``translate`` using
    ``search_option`` as keyword arguments, scored against ``refset`` with
    ``bleu``, and the result is printed together with the training counters.
    If the score is strictly greater than ``option["bleu"]``, that entry is
    updated in place and ``save_model`` is called.

    ``variables`` must contain the keys "step", "epoch" and "global_step";
    the whole mapping is forwarded to ``save_model`` unchanged.
    """
    hypotheses = translate(model, valset, **search_option)
    score = bleu(hypotheses, refset)

    # Look the counters up in a fixed order: step, epoch, global_step.
    step, epoch, global_step = [
        variables[key] for key in ("step", "epoch", "global_step")
    ]

    report = "global_step: %d, epoch: %d, step: %d, bleu: %2.4f" % (
        global_step, epoch, step, score)
    print(report)

    # Only a strict improvement over the recorded best triggers a save.
    if score > option["bleu"]:
        option["bleu"] = score
        save_model(model, name, reader, option, **variables)
Пример #2
0
# coding=utf-8

from metric import bleu, otem, utem

# Demo: compare BLEU, OTEM and UTEM on one reference and three candidates.
ref = "You should learn to use the computer ."

# Raw candidate sentences, in the order they are reported.
sentences = [
    "You should learn to use the car .",
    "You should learn to use the the computer .",
    "You should learn to use the .",
]

print('Reference: {}'.format(ref))
for number, sentence in enumerate(sentences, 1):
    print('Candidate{}: {}'.format(number, sentence))

# The metric functions are called with tokenised input: each candidate is
# wrapped in a one-element list, the reference in a doubly nested list.
refs = [[ref.split()]]
can1, can2, can3 = [[sentence.split()] for sentence in sentences]

# One output line per metric; OTEM is the only one given an extra argument.
for label, metric, extra in (('BLEU', bleu, {}),
                             ('OTEM', otem, {'n': 1}),
                             ('UTEM', utem, {})):
    values = [metric(candidate, refs, **extra)
              for candidate in (can1, can2, can3)]
    print('{} {} vs {} vs {}'.format(label, *values))
Пример #3
0
def train(args):
    option = getoption()

    if os.path.exists(args.model):
        option, params = loadmodel(args.model)
        init = False
    else:
        init = True

    override(option, args)
    print_option(option)

    # set seed
    numpy.random.seed(option["seed"])

    if option["ref"]:
        references = loadreferences(option["ref"])
    else:
        references = None

    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    pathname, basename = os.path.split(args.model)
    modelname = getfilename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")
    batch = option["batch"]
    sortk = option["sort"] or 1
    shuffle = option["seed"] if option["shuffle"] else None
    reader = textreader(option["corpus"], shuffle)
    processor = [getlen, getlen]
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    skipstream(reader, option["count"][1])
    epoch = option["epoch"]
    maxepoch = option["maxepoch"]

    model = rnnsearch(**option)

    if init:
        uniform(model.parameter, -0.08, 0.08)
    else:
        set_variables(model.parameter, params)

    print "parameters:", parameters(model.parameter)

    # tuning option
    toption = {}
    toption["algorithm"] = option["optimizer"]
    toption["variant"] = option["variant"]
    toption["constraint"] = ("norm", option["norm"])
    toption["norm"] = True
    trainer = optimizer(model, **toption)
    alpha = option["alpha"]

    # beamsearch option
    doption = {}
    doption["beamsize"] = option["beamsize"]
    doption["normalize"] = option["normalize"]
    doption["maxlen"] = option["maxlen"]
    doption["minlen"] = option["minlen"]

    best_score = option["bleu"]
    unk_sym = option["unk"]
    eos_sym = option["eos"]
    count = option["count"][0]
    totcost = option["cost"]

    for i in range(epoch, maxepoch):
        for data in stream:
            xdata, xmask = processdata(data[0], svocab, unk_sym, eos_sym)
            ydata, ymask = processdata(data[1], tvocab, unk_sym, eos_sym)

            t1 = time.time()
            cost, norm = trainer.optimize(xdata, xmask, ydata, ymask)
            trainer.update(alpha=alpha)
            t2 = time.time()

            count += 1

            cost = cost * ymask.shape[1] / ymask.sum()
            totcost += cost / math.log(2)
            print i + 1, count, cost, norm, t2 - t1

            # autosave
            if count % option["freq"] == 0:
                model.option["indices"] = reader.get_indices()
                model.option["bleu"] = best_score
                model.option["cost"] = totcost
                model.option["count"] = [count, reader.count]
                serialize(autoname, model)

            if count % option["vfreq"] == 0:
                if option["validation"] and references:
                    trans = translate(model, option["validation"], **doption)
                    bleu_score = bleu(trans, references)
                    print "bleu: %2.4f" % bleu_score
                    if bleu_score > best_score:
                        best_score = bleu_score
                        model.option["indices"] = reader.get_indices()
                        model.option["bleu"] = best_score
                        model.option["cost"] = totcost
                        model.option["count"] = [count, reader.count]
                        serialize(bestname, model)

            if count % option["sfreq"] == 0:
                n = len(data[0])
                ind = numpy.random.randint(0, n)
                sdata = data[0][ind]
                tdata = data[1][ind]
                xdata = xdata[:, ind:ind + 1]
                hls = beamsearch(model, xdata)
                if len(hls) > 0:
                    best, score = hls[0]
                    print sdata
                    print tdata
                    print " ".join(best[:-1])
                else:
                    print sdata
                    print tdata
                    print "warning: no translation"

        print "--------------------------------------------------"

        if option["validation"] and references:
            trans = translate(model, option["validation"], **doption)
            bleu_score = bleu(trans, references)
            print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score)
            if bleu_score > best_score:
                best_score = bleu_score
                model.option["indices"] = reader.get_indices()
                model.option["bleu"] = best_score
                model.option["cost"] = totcost
                model.option["count"] = [count, reader.count]
                serialize(bestname, model)

        print "averaged cost: ", totcost / count
        print "--------------------------------------------------"

        # early stopping
        if i + 1 >= option["stop"]:
            alpha = alpha * option["decay"]

        count = 0
        totcost = 0.0
        stream.reset()

        # update autosave
        model.option["epoch"] = i + 1
        model.option["alpha"] = alpha
        model.option["indices"] = reader.get_indices()
        model.option["bleu"] = best_score
        model.option["cost"] = totcost
        model.option["count"] = [0, 0]
        serialize(autoname, model)

    print "best(bleu): %2.4f" % best_score

    stream.close()
Пример #4
0
def decode(args):
    """Translate sentences read from stdin with one or more rnnsearch models.

    Every path in ``args.model`` is loaded into its own variable scope
    (``rnnsearch_<i>``) and all models are passed together to
    ``beamsearch``. Vocabulary and special symbols are taken from the first
    model.

    Exactly one line is written to stdout per input line:

    * no hypotheses returned: an empty line;
    * without ``args.oracle``: the first hypothesis;
    * with ``args.oracle``: ``first ||| first score ||| best index |||
      best ||| best score``, where "best" is the hypothesis with the highest
      smoothed BLEU against the reference at the same line index.

    A progress line ``<count> <score> <seconds>`` is written to stderr for
    every input, where ``score`` is the score of the first hypothesis
    (``-10000.0`` when there is none).

    Args:
        args: parsed command-line arguments; reads ``model``, ``maxlen``,
            ``minlen``, ``beamsize``, ``normalize``, ``arithmetic`` and
            ``oracle``.
    """
    models = []

    for index, path in enumerate(args.model):
        option, params = load_model(path)
        scope = "rnnsearch_%d" % index
        model = rnnsearch(scope=scope, **option)
        set_variables(get_variables_with_prefix(scope), params)
        models.append(model)

    # use the first model
    svocabs, _ = models[0].option["vocabulary"]
    svocab, _ = svocabs
    unk_sym = models[0].option["unk"]
    eos_sym = models[0].option["eos"]

    # keyword arguments forwarded to beamsearch
    option = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize,
        "arithmetic": args.arithmetic,
    }

    if args.oracle:
        references = load_references(args.oracle)
    else:
        references = None

    count = 0

    # iter(readline, "") stops at EOF and reads one line at a time, like the
    # explicit readline loop it replaces
    for line in iter(sys.stdin.readline, ""):
        seq, _ = convert_data([line], svocab, unk_sym, eos_sym)
        t1 = time.time()
        tlist = beamsearch(models, seq, **option)
        t2 = time.time()

        if len(tlist) == 0:
            # Still emit a line so that output stays aligned with input.
            score = -10000.0
            sys.stdout.write("\n")
        elif references is None:
            best, score = tlist[0]
            sys.stdout.write(" ".join(best[:-1]))
            sys.stdout.write("\n")
        else:
            # Log the score of the first hypothesis, as in the non-oracle
            # branch; the search loop below must not overwrite it.
            score = tlist[0][1]
            best_ind = 0
            best_bleu = 0

            # find the best translation according to oracle
            for i, (trans, _) in enumerate(tlist):
                bleu_score = bleu([trans[:-1]], [references[count]],
                                  smoothing=True)
                if bleu_score > best_bleu:
                    best_bleu = bleu_score
                    best_ind = i

            fields = [
                " ".join(tlist[0][0][:-1]),
                str(tlist[0][1]),
                str(best_ind),
                " ".join(tlist[best_ind][0][:-1]),
                str(tlist[best_ind][1]),
            ]

            sys.stdout.write(" ||| ".join(fields))
            sys.stdout.write("\n")

        count = count + 1
        sys.stderr.write(str(count) + " ")
        sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
Пример #5
0
def train(args):
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        option = opt
        init = False
    else:
        init = True

    if args.initialize:
        init_params = load_model(args.initialize)
        init_params = init_params[1]
        restore = True
    else:
        restore = False

    override(option, args_to_dict(args))
    print_option(option)

    # load references
    if option["references"]:
        references = load_references(option["references"])
    else:
        references = None

    if args.skip_val:
        references = None

    criterion = option["criterion"]

    if criterion == "mrt":
        sys.stderr.write("warning: In MRT mode, batch is set to 1\n")

    # input corpus
    batch = option["batch"] if criterion == "mle" else 1
    sortk = option["sort"] or 1 if criterion == "mle" else 1
    shuffle = option["seed"] if option["shuffle"] else None
    reader = textreader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    skip_stream(reader, option["count"][1])
    epoch = option["epoch"]
    maxepoch = option["maxepoch"]

    # create model
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)
    # set seed
    numpy.random.seed(option["seed"])
    model = rnnsearch(initializer=initializer, regularizer=regularizer,
                      **option)

    variables = None

    if restore:
        matched, not_matched = match_variables(ops.trainable_variables(),
                                               init_params)
        if args.finetune:
            variables = not_matched
            if not variables:
                raise RuntimeError("no variables to finetune")

    if not init:
        set_variables(ops.trainable_variables(), params)

    if restore:
        restore_variables(matched, not_matched)

    print "parameters:", count_parameters(ops.trainable_variables())

    # tuning option
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizer
    trainer = optimizer(model, **tune_opt)

    # beamsearch option
    search_opt = {}
    search_opt["beamsize"] = option["beamsize"]
    search_opt["normalize"] = option["normalize"]
    search_opt["maxlen"] = option["maxlen"]
    search_opt["minlen"] = option["minlen"]

    # vocabulary and special symbol
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs
    unk_sym = option["unk"]
    eos_sym = option["eos"]

    # summary
    count = option["count"][0]
    totcost = option["cost"]
    best_score = option["bleu"]
    alpha = option["alpha"]
    sharp = option["sharp"]

    for i in range(epoch, maxepoch):
        for data in stream:
            xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
            ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)

            if criterion == "mrt":
                refs = []

                for item in data[1]:
                    item = item.split()
                    item = [unk_sym if word not in tvocab else word
                            for word in item]
                    refs.append(" ".join(item))

                t1 = time.time()

                # sample from model
                nsample = option["sample"] - len(refs)
                xdata = numpy.repeat(xdata, nsample, 1)
                xmask = numpy.repeat(xmask, nsample, 1)
                maxlen = int(1.5 * len(ydata))
                examples = batchsample(model, xdata, xmask, maxlen)
                space = build_sample_space(refs, examples)
                score = numpy.zeros((len(space),), "float32")

                refs = [ref.split() for ref in refs]

                for j in range(len(space)):
                    example = space[j].split()
                    score[j] = 1.0 - bleu([example], [refs], smoothing=True)

                ydata, ymask = convert_data(space, tvocab, unk_sym, eos_sym)
                cost, norm = trainer.optimize(xdata[:, 0:1], xmask[:, 0:1],
                                              ydata, ymask, score, sharp)
                trainer.update(alpha=alpha)
                t2 = time.time()

                totcost += cost
                count += 1
                t = t2 - t1
                ac = totcost / count
                print i + 1, count, len(space), cost, norm, ac, t
            else:
                t1 = time.time()
                cost, norm = trainer.optimize(xdata, xmask, ydata, ymask)
                trainer.update(alpha = alpha)
                t2 = time.time()

                count += 1
                cost = cost * ymask.shape[1] / ymask.sum()
                totcost += cost / math.log(2)
                print i + 1, count, cost, norm, t2 - t1

            # autosave
            if count % option["freq"] == 0:
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(autoname, option)

            if count % option["vfreq"] == 0:
                if option["validation"] and references:
                    trans = translate(model, option["validation"],
                                      **search_opt)
                    bleu_score = bleu(trans, references)
                    print "bleu: %2.4f" % bleu_score
                    if bleu_score > best_score:
                        best_score = bleu_score
                        option["indices"] = reader.get_indices()
                        option["bleu"] = best_score
                        option["cost"] = totcost
                        option["count"] = [count, reader.count]
                        serialize(bestname, option)

            if count % option["sfreq"] == 0:
                n = len(data[0])
                ind = numpy.random.randint(0, n)
                sdata = data[0][ind]
                tdata = data[1][ind]
                xdata = xdata[:, ind : ind + 1]
                xmask = xmask[:, ind : ind + 1]
                hls = beamsearch(model, xdata, xmask)
                best, score = hls[0]
                print sdata
                print tdata
                print " ".join(best[:-1])


        print "--------------------------------------------------"

        if option["validation"] and references:
            trans = translate(model, option["validation"], **search_opt)
            bleu_score = bleu(trans, references)
            print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score)
            if bleu_score > best_score:
                best_score = bleu_score
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(bestname, option)

        print "averaged cost: ", totcost / count
        print "--------------------------------------------------"

        # early stopping
        if i + 1 >= option["stop"]:
            alpha = alpha * option["decay"]

        count = 0
        totcost = 0.0
        stream.reset()

        # update autosave
        option["epoch"] = i + 1
        option["alpha"] = alpha
        option["indices"] = reader.get_indices()
        option["bleu"] = best_score
        option["cost"] = totcost
        option["count"] = [0, 0]
        serialize(autoname, option)

    print "best(bleu): %2.4f" % best_score

    stream.close()