示例#1
0
import numpy as np
from dataset import minibatches

# Demo: repeatedly stream the same (inputs, targets) pair through
# minibatches and print every yielded batch.
inputs = np.array([1, 2, 3, 4])
targets = np.array([1, 2, 3, 4])

for epoch in range(5):
    print(f"{epoch}:c")
    # A fresh generator is needed each pass — generators are single-use.
    for x_batch, y_batch in minibatches(inputs, targets, batch_size=1):
        print(x_batch, y_batch)
示例#2
0
    def inference(self, tst, output, quiet=False):
        """Run the model over test set ``tst`` and write visualizations to ``output``.

        Depending on ``self.config.mode`` the model predicts at sentence
        level or at token level; depending on the ``show_svg`` /
        ``show_matrix`` flags each sentence is rendered as SVG/HTML, as an
        alignment matrix, or as raw vectors.  When ``tst.annotated`` is
        true, accuracy/precision/recall/F statistics are accumulated in a
        ``Score`` and a summary line is written to stderr at the end.

        Args:
            tst: test dataset, consumable by ``minibatches``; also exposes
                ``annotated`` plus the ``nunk_*``/``ndiv_*``/``nsrc``/``ntgt``
                counters used in the final report.
            output: writable file-like object receiving visualizations.
            quiet: forwarded to ``Visualize.print_vectors``.
        """
        if self.config.show_svg:
            output.write("<html>\n<body>\n")
        score = Score()
        n_sents = 0
        # NOTE(review): the original computed an unused `nbatches` and an
        # unused enumerate() index (which also shadowed builtin `iter`);
        # both removed.
        for (src_batch, tgt_batch, raw_src_batch, raw_tgt_batch,
             sign_src_batch, sign_tgt_batch, sign_batch, len_src_batch,
             len_tgt_batch) in minibatches(tst, self.config.batch_size):
            # Dropout/learning-rate-style last argument is 0.0: evaluation mode.
            fd = self.get_feed_dict(src_batch, tgt_batch, sign_src_batch,
                                    sign_tgt_batch, sign_batch, len_src_batch,
                                    len_tgt_batch, 0.0)

            if self.config.mode == "sentence":
                # Sentence-level prediction: one output per sentence pair.
                out_batch, last_src_batch, last_tgt_batch = self.sess.run(
                    [self.output, self.last_src, self.last_tgt], feed_dict=fd)
                if tst.annotated:
                    score.add_batch(out_batch, sign_batch)
                for i_sent in range(len(out_batch)):
                    n_sents += 1
                    v = Visualize(output, n_sents, raw_src_batch[i_sent],
                                  raw_tgt_batch[i_sent], out_batch[i_sent])
                    # Last-layer vectors are only shown on demand.
                    last_src = []
                    last_tgt = []
                    if self.config.show_last:
                        last_src = last_src_batch[i_sent]
                        last_tgt = last_tgt_batch[i_sent]
                    v.print_vectors(last_src,
                                    last_tgt,
                                    aggr_src=[],
                                    aggr_tgt=[],
                                    align=[],
                                    quiet=quiet)
            else:
                # Token-level prediction: fetch alignments, per-token
                # aggregations and cosine similarity in one session run.
                align_batch, aggr_src_batch, aggr_tgt_batch, out_src_batch, out_tgt_batch, last_src_batch, \
                    last_tgt_batch, sim_batch = self.sess.run([self.align, self.aggregation_src,
                                                               self.aggregation_tgt, self.out_src, self.out_tgt,
                                                               self.last_src, self.last_tgt, self.cos_similarity],
                                                              feed_dict=fd)
                if tst.annotated:
                    # Score source and target tokens separately.
                    score.add_batch_tokens(aggr_src_batch, sign_src_batch,
                                           len_src_batch)
                    score.add_batch_tokens(aggr_tgt_batch, sign_tgt_batch,
                                           len_tgt_batch)
                for i_sent in range(len(align_batch)):
                    n_sents += 1
                    v = Visualize(output, n_sents, raw_src_batch[i_sent],
                                  raw_tgt_batch[i_sent], sim_batch[i_sent])
                    if self.config.show_svg:
                        v.print_svg(aggr_src_batch[i_sent],
                                    aggr_tgt_batch[i_sent],
                                    align_batch[i_sent])
                    elif self.config.show_matrix:
                        v.print_matrix(aggr_src_batch[i_sent],
                                       aggr_tgt_batch[i_sent],
                                       align_batch[i_sent])
                    else:
                        # Plain vector dump; each component is only filled
                        # in when its corresponding show_* flag is set.
                        last_src = []
                        last_tgt = []
                        aggr_src = []
                        aggr_tgt = []
                        align = []
                        if self.config.show_last:
                            last_src = last_src_batch[i_sent]
                            last_tgt = last_tgt_batch[i_sent]
                        if self.config.show_aggr:
                            aggr_src = aggr_src_batch[i_sent]
                            aggr_tgt = aggr_tgt_batch[i_sent]
                        if self.config.show_align:
                            align = align_batch[i_sent]
                        v.print_vectors(last_src,
                                        last_tgt,
                                        aggr_src,
                                        aggr_tgt,
                                        align,
                                        quiet=quiet)

        if tst.annotated:
            score.update()
            # Percentages of unknown and divergent words on each side.
            unk_s = float(100) * tst.nunk_src / tst.nsrc
            unk_t = float(100) * tst.nunk_tgt / tst.ntgt
            div_s = float(100) * tst.ndiv_src / tst.nsrc
            div_t = float(100) * tst.ndiv_tgt / tst.ntgt
            sys.stderr.write(
                'TEST words={}/{} %div={:.2f}/{:.2f} %unk={:.2f}/{:.2f} (A{:.4f},P{:.4f},R{:.4f},F{:.4f})'
                ' (TP:{},TN:{},FP:{},FN:{})\n'.format(tst.nsrc, tst.ntgt,
                                                      div_s, div_t, unk_s,
                                                      unk_t, score.A, score.P,
                                                      score.R, score.F,
                                                      score.TP, score.TN,
                                                      score.FP, score.FN))

        if self.config.show_svg:
            output.write("</body>\n</html>\n")
示例#3
0
    def inference(self, tst):
        """Run the model over test set ``tst`` and print visualizations.

        For each minibatch the alignment, sentence representations and
        cosine similarity are fetched; each sentence pair is rendered as
        SVG, matrix or vectors according to the ``show_*`` config flags.
        If ``tst.annotated``, scores are accumulated and a summary is
        written to stderr; a final similar/total count is always reported.

        NOTE(review): converted Python 2 ``print`` statements to the
        ``print()`` function for consistency with the rest of the file,
        and removed an unused ``nbatches`` and an unused ``enumerate``
        index that shadowed builtin ``iter``.
        """
        if self.config.show_svg: print("<html>\n<body>")
        score = Score()
        n_pos = 0  # examples predicted as similar (sim > 0)
        n_sents = 0

        for (src_batch, tgt_batch, ali_batch, ali_src_batch,
             ali_tgt_batch, sim_batch, raw_src_batch, raw_tgt_batch,
             len_src_batch, len_tgt_batch) in minibatches(
                 tst, self.config.batch_size):
            # Last argument 0.0: evaluation mode (no learning/dropout).
            fd = self.get_feed_dict(src_batch, tgt_batch, ali_batch,
                                    ali_src_batch, ali_tgt_batch, sim_batch,
                                    len_src_batch, len_tgt_batch, 0.0)

            align, snt_src, snt_tgt, align_src, align_tgt, sim = self.sess.run(
                [
                    self.align, self.snt_src, self.snt_tgt, self.align_src,
                    self.align_tgt, self.cos_similarity
                ],
                feed_dict=fd)
            # Count predictions with strictly positive similarity.
            n_pos += sum(np.greater(sim, np.zeros_like(sim)))

            if tst.annotated:
                if self.config.error == 'lse':
                    # 'lse' error mode scores src and tgt alignments jointly.
                    score.add_batch(
                        np.concatenate([align_src, align_tgt], 1),
                        np.concatenate([ali_src_batch, ali_tgt_batch], 1), sim,
                        sim_batch, 0.0, 0.0, 0.0)
                else:
                    score.add_batch(align, ali_batch, sim, sim_batch, 0.0, 0.0,
                                    0.0)

            for i_sent in range(len(align)):
                n_sents += 1
                v = Visualize(n_sents, src_batch[i_sent], tgt_batch[i_sent],
                              raw_src_batch[i_sent], raw_tgt_batch[i_sent],
                              sim[i_sent], align[i_sent], align_src[i_sent],
                              align_tgt[i_sent], snt_src[i_sent],
                              snt_tgt[i_sent], self.config.mark_unks)
                if self.config.show_svg: v.print_svg()
                elif self.config.show_matrix: v.print_matrix()
                else:
                    v.print_vectors(self.config.show_sim,
                                    self.config.show_align)

        if tst.annotated:
            curr_time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
            sys.stderr.write('{} TEST ({})'.format(curr_time,
                                                   score.summarize()))
            # Percentage of unknown words on each side.
            unk_s = float(100) * tst.nunk_src / tst.nsrc
            unk_t = float(100) * tst.nunk_tgt / tst.ntgt
            sys.stderr.write(
                ' Test set: words={}/{} %ones={:.2f} pair={} unpair={} delete={} extend={} replace={} %unk={:.2f}/{:.2f}\n'
                .format(tst.nsrc, tst.ntgt, 100.0 * tst.nones / tst.nlnks,
                        tst.npair, tst.nunpair, tst.ndelete, tst.nextend,
                        tst.nreplace, unk_s, unk_t))

        if self.config.show_svg: print("</body>\n</html>")
        sys.stderr.write(
            "Predicted similar {} out of {} examples {:.2f}%\n".format(
                n_pos, n_sents, 100.0 * n_pos / n_sents))
示例#4
0
    def run_epoch(self, train, dev, lr):
        """Train one epoch on ``train`` and optionally evaluate on ``dev``.

        Runs every minibatch of ``train`` through the train op at learning
        rate ``lr``, logging an intermediate report every
        ``config.report_every`` iterations, then (if ``dev`` is not None)
        computes validation loss and scores.  Epoch statistics are stored
        on ``self.config`` and the session is saved.

        Args:
            train: training dataset (consumable by ``minibatches``; exposes
                ``nunk_*``/``ndiv_*``/``nsrc``/``ntgt`` counters).
            dev: validation dataset, or None to skip validation.
            lr: learning rate for this epoch.

        Returns:
            Tuple ``(VLOSS, curr_epoch)``: mean validation loss (0.0 when
            ``dev`` is None) and the epoch number just completed.
        """
        #######################
        # learn on trainset ###
        #######################
        nbatches = (len(train) + self.config.batch_size -
                    1) // self.config.batch_size
        curr_epoch = self.config.last_epoch + 1
        # training loss, accumulated over the whole epoch
        TLOSS = 0.0
        # intermediate loss (average over [config.report_every] iterations)
        ILOSS = 0.0
        tscore = Score()
        iscore = Score()
        ini_time = time.time()
        # NOTE(review): loop index renamed from `iter` (shadowed the builtin)
        # to `batch_idx`; commented-out debug prints removed.
        for batch_idx, (src_batch, tgt_batch, raw_src_batch, raw_tgt_batch, sign_src_batch,
                        sign_tgt_batch, sign_batch, len_src_batch, len_tgt_batch) in \
                enumerate(minibatches(train, self.config.batch_size)):
            fd = self.get_feed_dict(src_batch, tgt_batch, sign_src_batch,
                                    sign_tgt_batch, sign_batch, len_src_batch,
                                    len_tgt_batch, lr)
            if self.config.mode == "sentence":
                _, loss, out = self.sess.run(
                    [self.train_op, self.loss, self.output], feed_dict=fd)
                tscore.add_batch(out, sign_batch)
                iscore.add_batch(out, sign_batch)
            else:
                # Token-level mode: score src and tgt tokens separately,
                # feeding both the epoch-level and intermediate scorers.
                _, loss, aggr_src, aggr_tgt, last_src, last_tgt = self.sess.run(
                    [
                        self.train_op, self.loss, self.aggregation_src,
                        self.aggregation_tgt, self.last_src, self.last_tgt
                    ],
                    feed_dict=fd)
                tscore.add_batch_tokens(aggr_src, sign_src_batch,
                                        len_src_batch)
                tscore.add_batch_tokens(aggr_tgt, sign_tgt_batch,
                                        len_tgt_batch)
                iscore.add_batch_tokens(aggr_src, sign_src_batch,
                                        len_src_batch)
                iscore.add_batch_tokens(aggr_tgt, sign_tgt_batch,
                                        len_tgt_batch)
            TLOSS += loss
            ILOSS += loss

            if (batch_idx + 1) % self.config.report_every == 0:
                curr_time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
                iscore.update()
                ILOSS = ILOSS / self.config.report_every
                sys.stdout.write(
                    '{} Epoch {} Iteration {}/{} loss:{:.4f} (A{:.4f},P{:.4f},R{:.4f},F{:.4f})\n'
                    .format(curr_time, curr_epoch, batch_idx + 1, nbatches,
                            ILOSS, iscore.A, iscore.P, iscore.R, iscore.F))
                # Reset intermediate accumulators for the next window.
                ILOSS = 0.0
                iscore = Score()

        TLOSS = TLOSS / nbatches
        tscore.update()
        curr_time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
        sys.stdout.write(
            '{} Epoch {} TRAIN loss={:.4f} (A{:.4f},P{:.4f},R{:.4f},F{:.4f}) lr={:.4f}'
            .format(curr_time, curr_epoch, TLOSS, tscore.A, tscore.P, tscore.R,
                    tscore.F, lr))
        # Percentages of unknown and divergent words on each side.
        unk_src = float(100) * train.nunk_src / train.nsrc
        unk_tgt = float(100) * train.nunk_tgt / train.ntgt
        div_src = float(100) * train.ndiv_src / train.nsrc
        div_tgt = float(100) * train.ndiv_tgt / train.ntgt
        sys.stdout.write(
            ' Train set: words={}/{} %div={:.2f}/{:.2f} %unk={:.2f}/{:.2f}\n'.
            format(train.nsrc, train.ntgt, div_src, div_tgt, unk_src, unk_tgt))

        ##########################
        # evaluate over devset ###
        ##########################
        VLOSS = 0.0
        if dev is not None:
            nbatches = (len(dev) + self.config.batch_size -
                        1) // self.config.batch_size
            # iterate over dataset
            VLOSS = 0
            vscore = Score()
            # NOTE(review): the enumerate() index was unused here — removed.
            for (src_batch, tgt_batch, raw_src_batch, raw_tgt_batch,
                 sign_src_batch, sign_tgt_batch, sign_batch,
                 len_src_batch, len_tgt_batch) in minibatches(
                     dev, self.config.batch_size):
                # Last argument 0.0: evaluation mode (no learning rate).
                fd = self.get_feed_dict(src_batch, tgt_batch, sign_src_batch,
                                        sign_tgt_batch, sign_batch,
                                        len_src_batch, len_tgt_batch, 0.0)
                if self.config.mode == "sentence":
                    loss, out = self.sess.run([self.loss, self.output],
                                              feed_dict=fd)
                    vscore.add_batch(out, sign_batch)
                else:
                    loss, aggr_src, aggr_tgt = self.sess.run([
                        self.loss, self.aggregation_src, self.aggregation_tgt
                    ],
                                                             feed_dict=fd)
                    vscore.add_batch_tokens(aggr_src, sign_src_batch,
                                            len_src_batch)
                    vscore.add_batch_tokens(aggr_tgt, sign_tgt_batch,
                                            len_tgt_batch)
                # append single value which is a mean of losses of the n examples in the batch
                VLOSS += loss
            vscore.update()
            VLOSS = VLOSS / nbatches
            sys.stdout.write(
                '{} Epoch {} VALID loss={:.4f} (A{:.4f},P{:.4f},R{:.4f},F{:.4f})'
                .format(curr_time, curr_epoch, VLOSS, vscore.A, vscore.P,
                        vscore.R, vscore.F))
            unk_src = float(100) * dev.nunk_src / dev.nsrc
            unk_tgt = float(100) * dev.nunk_tgt / dev.ntgt
            div_src = float(100) * dev.ndiv_src / dev.nsrc
            div_tgt = float(100) * dev.ndiv_tgt / dev.ntgt
            # NOTE(review): the original passed five extra arguments (VLOSS,
            # vscore.A/P/R/F) beyond the format string's six placeholders;
            # .format() silently ignored them — removed.
            sys.stdout.write(
                ' Valid set words={}/{} %div={:.2f}/{:.2f} %unk={:.2f}/{:.2f}\n'
                .format(dev.nsrc, dev.ntgt, div_src, div_tgt, unk_src,
                        unk_tgt))

        ##################################
        # keep record of current epoch ###
        ##################################
        self.config.tloss = TLOSS
        self.config.tA = tscore.A
        self.config.tP = tscore.P
        self.config.tR = tscore.R
        self.config.tF = tscore.F
        self.config.time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
        self.config.seconds = "{:.2f}".format(time.time() - ini_time)
        self.config.last_epoch += 1
        self.save_session(self.config.last_epoch)
        if dev is not None:
            self.config.vloss = VLOSS
            self.config.vA = vscore.A
            self.config.vP = vscore.P
            self.config.vR = vscore.R
            self.config.vF = vscore.F
        self.config.write_config()
        return VLOSS, curr_epoch
示例#5
0
    def run_epoch(self, train, dev, lr):
        """Train one epoch on `train` and optionally evaluate on `dev`.

        Runs every minibatch of `train` through the train op at learning
        rate `lr`, reporting an intermediate Score summary every
        ``config.report_every`` iterations; training statistics are stored
        on ``self.config`` and the session is saved before validation.
        If `dev` is not None, validation loss/scores are computed and also
        stored on ``self.config``.

        Args:
            train: training dataset (consumable by ``minibatches``; exposes
                ``nunk_*``/``nones``/``nlnks``/``npair``/... counters).
            dev: validation dataset, or None to skip validation.
            lr: learning rate for this epoch.

        Returns:
            (vscore.average_loss, curr_epoch).
            NOTE(review): when `dev` is None, `vscore` has had no batches
            added, so `average_loss` reflects an empty Score — confirm the
            Score class defines a sane default in that case.
        """
        #######################
        # learn on trainset ###
        #######################
        # Number of minibatches, rounded up.
        nbatches = (len(train) + self.config.batch_size -
                    1) // self.config.batch_size
        curr_epoch = self.config.last_epoch + 1
        tscore = Score()  # training scores, accumulated over the whole epoch
        iscore = Score()  # intermediate scores, reset every report_every iters
        ini_time = time.time()
        for iter, (src_batch, tgt_batch, ali_batch, ali_src_batch,
                   ali_tgt_batch, sim_batch, raw_src_batch, raw_tgt_batch,
                   len_src_batch, len_tgt_batch) in enumerate(
                       minibatches(train, self.config.batch_size)):
            fd = self.get_feed_dict(src_batch, tgt_batch, ali_batch,
                                    ali_src_batch, ali_tgt_batch, sim_batch,
                                    len_src_batch, len_tgt_batch, lr)
            # One training step: fetch total/word/sentence losses plus the
            # predicted alignments and cosine similarity for scoring.
            _, loss, wloss, sloss, align, align_src, align_tgt, sim = self.sess.run(
                [
                    self.train_op, self.loss, self.wloss, self.sloss,
                    self.align, self.align_src, self.align_tgt,
                    self.cos_similarity
                ],
                feed_dict=fd)
            if self.config.error == 'lse':
                # 'lse' error mode: score src and tgt alignments jointly by
                # concatenating predictions and references along axis 1.
                tscore.add_batch(
                    np.concatenate([align_src, align_tgt], 1),
                    np.concatenate([ali_src_batch, ali_tgt_batch], 1), sim,
                    sim_batch, loss, wloss, sloss)
                iscore.add_batch(
                    np.concatenate([align_src, align_tgt], 1),
                    np.concatenate([ali_src_batch, ali_tgt_batch], 1), sim,
                    sim_batch, loss, wloss, sloss)
            else:
                tscore.add_batch(align, ali_batch, sim, sim_batch, loss, wloss,
                                 sloss)
                iscore.add_batch(align, ali_batch, sim, sim_batch, loss, wloss,
                                 sloss)

            # Periodic progress report; intermediate scorer restarts fresh.
            if (iter + 1) % self.config.report_every == 0:
                curr_time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
                sys.stderr.write('{} Epoch {} Iteration {}/{} ({})\n'.format(
                    curr_time, curr_epoch, iter + 1, nbatches,
                    iscore.summarize()))
                iscore = Score()

        curr_time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
        sys.stderr.write('{} Epoch {} TRAIN lr={:.4f} ({})'.format(
            curr_time, curr_epoch, lr, tscore.summarize()))
        # Percentage of unknown words on each side.
        unk_src = float(100) * train.nunk_src / train.nsrc
        unk_tgt = float(100) * train.nunk_tgt / train.ntgt
        sys.stderr.write(
            ' Train set: words={}/{} %ones={:.2f} pair={} unpair={} delete={} extend={} replace={} %unk={:.2f}/{:.2f}\n'
            .format(train.nsrc, train.ntgt, 100.0 * train.nones / train.nlnks,
                    train.npair, train.nunpair, train.ndelete, train.nextend,
                    train.nreplace, unk_src, unk_tgt))
        # keep records of training results on the config, then checkpoint
        self.config.tloss = tscore.average_loss
        self.config.tA = tscore.A
        self.config.tP = tscore.P
        self.config.tR = tscore.R
        self.config.tF = tscore.F
        self.config.time = time.strftime("[%Y-%m-%d_%X]", time.localtime())
        self.config.seconds = "{:.2f}".format(time.time() - ini_time)
        self.config.last_epoch += 1
        self.save_session(self.config.last_epoch)

        ##########################
        # evaluate over devset ###
        ##########################
        vscore = Score()  # validation score
        if dev is not None:
            nbatches = (len(dev) + self.config.batch_size -
                        1) // self.config.batch_size
            # iterate over dataset (no train_op: evaluation only, lr 0.0)
            for iter, (src_batch, tgt_batch, ali_batch, ali_src_batch,
                       ali_tgt_batch, sim_batch, raw_src_batch, raw_tgt_batch,
                       len_src_batch, len_tgt_batch) in enumerate(
                           minibatches(dev, self.config.batch_size)):
                fd = self.get_feed_dict(src_batch, tgt_batch, ali_batch,
                                        ali_src_batch, ali_tgt_batch,
                                        sim_batch, len_src_batch,
                                        len_tgt_batch, 0.0)
                loss, wloss, sloss, align, align_src, align_tgt, sim = self.sess.run(
                    [
                        self.loss, self.wloss, self.sloss, self.align,
                        self.align_src, self.align_tgt, self.cos_similarity
                    ],
                    feed_dict=fd)
                if self.config.error == 'lse':
                    vscore.add_batch(
                        np.concatenate([align_src, align_tgt], 1),
                        np.concatenate([ali_src_batch, ali_tgt_batch], 1), sim,
                        sim_batch, loss, wloss, sloss)
                else:
                    vscore.add_batch(align, ali_batch, sim, sim_batch, loss,
                                     wloss, sloss)
            # NOTE(review): `curr_time` here is the timestamp taken after
            # training, not a fresh one — the VALID line reuses it.
            sys.stderr.write('{} Epoch {} VALID ({})'.format(
                curr_time, curr_epoch, vscore.summarize()))
            unk_s = float(100) * dev.nunk_src / dev.nsrc
            unk_t = float(100) * dev.nunk_tgt / dev.ntgt
            sys.stderr.write(
                ' Valid set: words={}/{} %ones={:.2f} pair={} unpair={} delete={} extend={} replace={} %unk={:.2f}/{:.2f}\n'
                .format(dev.nsrc, dev.ntgt, 100.0 * dev.nones / dev.nlnks,
                        dev.npair, dev.nunpair, dev.ndelete, dev.nextend,
                        dev.nreplace, unk_s, unk_t))
            # keep records of validation results on the config
            self.config.vloss = vscore.average_loss
            self.config.vA = vscore.A
            self.config.vP = vscore.P
            self.config.vR = vscore.R
            self.config.vF = vscore.F

        self.config.write_config()
        return vscore.average_loss, curr_epoch