Exemplo n.º 1
0
    def write(self, filename="-", weights=None):
        '''Write the templates and every non-negligible feature weight.

        filename -- output path; "-" (the default) writes to standard output.
        weights  -- mapping from feature string to weight; defaults to
                    self.weights.

        Features are written in sorted order as "feature<TAB>weight" lines;
        weights whose magnitude is at most 1e-3 are skipped.  When
        self.unk > 0, a last line that starts with a space lists the sorted
        known words.  Progress messages go to the logs stream.  A file opened
        by this method is always closed before returning, also on error.
        '''

        if weights is None:
            weights = self.weights

        opened_here = filename != "-"
        if opened_here:
            outfile = open(filename, "wt")
        else:
            outfile = sys.stdout
            filename = "STDOUT"  # from here on only used in the log message

        try:
            self.print_templates(outfile)

            mytime = Mytime()

            print >> logs, "sorting %d features..." % len(weights),
            feats = sorted(weights)
            # logged right after the sort so it also shows up for empty weights
            print >> logs, "done in %.2lf seconds." % mytime.period()
            print >> logs, "writing features to %s..." % filename

            nonzero = 0
            for f in feats:
                v = weights[f]
                if math.fabs(v) > 1e-3:
                    print >> outfile, "%s\t%.5lf" % (f, v)
                    nonzero += 1

            if self.unk > 0:  # print known words
                # the leading " " marks this line as the known-words line
                print >> outfile, " " + " ".join(sorted(self.knowns))

            # nonzero counts written lines, not the number of features visited
            print >> logs, "%d nonzero feature instances written in %.2lf seconds." % \
                  (nonzero, mytime.period())
        finally:
            # never close sys.stdout, only a file this method opened itself
            if opened_here:
                outfile.close()
Exemplo n.º 2
0
    def read_weights(self, filename, infertemplates=False):
        '''Load feature weights (and an optional known-words line) from filename.

        Each weight line looks like "s0t-q0t=LRB-</s>=>LEFT     3.8234": a
        feature string and its value separated by whitespace.  A line whose
        first character is a space holds the known words and ends the scan.

        The infertemplates argument is recomputed from self.templates, so the
        value passed in by the caller is not consulted.
        '''

        source = self.read_templates(filename)

        # templates are inferred from the weights when self.templates is empty
        infertemplates = len(self.templates) == 0
        if infertemplates:
            print >> logs, "will infer templates from weights..."

        mytime = Mytime()
        lineno = 0  # stays 0 when there is nothing to read
        if source is not None:
            print >> logs, "reading feature weights from %s\t" % filename,
            for lineno, raw in enumerate(source, 1):
                if lineno % 200000 == 0:
                    print >> logs, "%d lines read..." % lineno,

                if raw[0] == " ":
                    # known-words line: record the words and stop reading
                    words = raw.split()
                    self.knowns = set(words)
                    print >> logs, "\n%d known words read." % len(self.knowns)
                    # original note: set in case the caller forgot; the exact
                    # value (1 or anything else) does not matter
                    self.unk = 1
                    break

                name, value = raw.split()
                self.weights[name] = float(value)

                if infertemplates:
                    # the template is the part of the feature before the first "="
                    template = name.split("=", 1)[0]
                    self.add_template(template, 1)  # one occurrence

        summary = (len(self.weights), lineno, mytime.period())
        print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % \
              summary

        self.print_autoevals()
Exemplo n.º 3
0
    def write(self, filename="-", weights=None):
        '''Write the templates and every non-negligible weight, grouped by action.

        filename -- output path; "-" (the default) writes to standard output.
        weights  -- mapping action -> {feature: value}; defaults to
                    self.weights.  Feature keys have the form "<tid>=<rest>",
                    where <tid> is an integer index into self.list_templates.

        Each written line is "<template>=<rest>=><action><TAB><value>".
        Values whose magnitude is at most 1e-3 are skipped.  A file opened by
        this method is always closed before returning, also on error.
        '''

        if weights is None:
            weights = self.weights

        opened_here = filename != "-"
        if opened_here:
            outfile = open(filename, "wt")
        else:
            outfile = sys.stdout
            filename = "STDOUT"  # careful overriding

        try:
            self.print_templates(outfile)

            mytime = Mytime()

            nonzero = 0
            for action, feats in weights.iteritems():
                for f in sorted(feats):
                    # values may be wrapper objects; convert once per feature
                    v = float(feats[f])
                    if math.fabs(v) > 1e-3:  # TODO
                        tid, feat = f.split("=", 1)
                        # translate the numeric template id back to its name
                        print >> outfile, "%s=%s=>%s\t%.5lf" % (
                            self.list_templates[int(tid)][0], feat, action, v)
                        nonzero += 1

            # nonzero counts written lines, not the number of features visited
            print >> logs, "%d nonzero feature instances written in %.2lf seconds." % \
                  (nonzero, mytime.period())
        finally:
            # never close sys.stdout, only a file this method opened itself
            if opened_here:
                outfile.close()
Exemplo n.º 4
0
    def read_weights(self, filename, infertemplates=False):
        '''Load feature weights from filename into self.weights.

        Instances are like "s0t-q0t=LRB-</s>=>LEFT     3.8234": a feature
        string and its value separated by whitespace.

        The storage layout depends on Model.doublehash:
          1 -- self.weights[action][key], where key is either a tuple
               (tid, part, ...) when FLAGS.tuplefeats is set or the feature
               string without its "=>action" suffix;
          2 -- self.weights[action][tid][f];
          otherwise -- flat self.weights[feat].

        The infertemplates argument is recomputed from self.templates, so the
        value passed in by the caller is not consulted.
        '''

        # read_templates returns the stream to read weights from (may be None)
        infile = self.read_templates(filename)

        # overrides the argument: infer when at most one template is known
        infertemplates = len(self.templates) <= 1
        if infertemplates:
            print >> logs, "will infer templates from weights..."

        mytime = Mytime()
        i = 0  # stays 0 when there is nothing to read
        if infile is not None:
            print >> logs, "reading feature weights from %s\t" % filename,
            for i, line in enumerate(infile, 1):
                if i % 200000 == 0:
                    print >> logs, "%d lines read..." % i,

                # exactly two whitespace-separated fields are expected per line
                feat, weight = line.split()
                weight = WVector.value_class(
                    float(weight))  # in case of mydouble

                if FLAGS.use_template_id:
                    # replace the template name by its id from self.templates
                    template, instance = feat.split("=", 1)
                    tid = self.templates[template]
                    feat = "%d=%s" % (tid, instance)

                # NOTE(review): `instance` and `tid` are bound only inside the
                # use_template_id branch above, yet the tuplefeats branch and
                # the doublehash == 2 branch below read them -- presumably those
                # modes require FLAGS.use_template_id; confirm.
                if Model.doublehash == 1:
                    if FLAGS.tuplefeats:
                        f, action = instance.rsplit("=>", 1)
                        #action = Model.mapnames[action]
                        # feature parts are "|"-separated
                        fs = tuple(f.split("|"))
                        if FLAGS.integerize:
                            fs = tuple(map(Vocab.str2id, fs))
                        self.weights[action][(tid, ) + fs] = weight
                    else:
                        f, action = feat.rsplit("=>", 1)
                        #action = Model.mapnames[action]
                        self.weights[action][f] = weight

                elif Model.doublehash == 2:
                    f, action = instance.rsplit("=>", 1)
                    action = Model.mapnames[action]
                    self.weights[action][tid][f] = weight
                else:
                    self.weights[feat] = weight

                if infertemplates:
                    # NOTE(review): when use_template_id is set, feat was
                    # rewritten above, so this passes the numeric id text
                    # rather than the template name -- confirm this is intended.
                    self.add_template(feat.split("=", 1)[0],
                                      1)  ## one occurrence

        print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % \
              (len(self.weights), i, mytime.period())

        self.print_autoevals()
Exemplo n.º 5
0
            yield raw_input()
    except:
        return


# diagnostic / progress output goes to standard error
logs = sys.stderr

from collections import defaultdict

from svector import Vector
from model import Model
from deptree import DepTree, DepVal

from mytime import Mytime

# module-level timer, created at import time
mytime = Mytime()

import gflags as flags

# shared flag registry; the definitions below register this module's flags
FLAGS = flags.FLAGS

flags.DEFINE_integer("beam", 1, "beam width", short_name="b")
flags.DEFINE_integer(
    "leftbeam", 1000, "leftptrs beam width"
)  # number of left items (predictors to be combined w/ current)
flags.DEFINE_integer("kbest", 0, "kbest", short_name="k")
flags.DEFINE_boolean("forest", False, "dump the forest")
flags.DEFINE_boolean("earlystop", False,
                     "try early stop (compared with gold seq)")
flags.DEFINE_integer("debuglevel",
                     0,
Exemplo n.º 6
0
    def train(self):
        '''Run perceptron training for self.iter passes, evaluating on dev each time.

        Every iteration does one pass over the training data, optionally
        switches to averaged weights (self.avg) for the dev evaluation, logs
        the statistics, and records the best dev score seen so far.  On a new
        best the weights are dumped at once, or, when FLAGS.dump_last is set,
        copied to self.bestweights and dumped once after the last iteration.

        If no iteration beats the initial score of 0 (or self.iter < 1), a
        note is logged and the "peaked at" summary and the final dump are
        skipped.
        '''

        start_mem = memory()

        starttime = time.time()

        print >> logs, "starting perceptron at", time.ctime()

        best_prec = 0
        # Pre-bind the names used by the final report so that it cannot fail
        # with a NameError when no iteration improves (or none is run at all).
        best_it = None
        best_wlen = 0
        best_time = 0.0
        it = 0
        for it in xrange(1, self.iter + 1):

            print >> logs, "iteration %d starts..............%s" % (
                it, time.ctime())

            curr_mem = memory()
            iterstarttime = time.time()
            self.decoder.num_edges = 0
            num_updates, early_updates, num_steps = self.one_pass_on_train()

            print >> logs, "memory usage at iter %d: extra %s, total %s" % (
                it, human(memory(curr_mem)), human(memory(start_mem)))
            if FLAGS.debuglevel >= 1:
                print >> logs, "weights=", self.weights

            curr_mem = memory()

            print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
                it, time.ctime())

            avgtime = 0
            timer = Mytime()
            if self.avg:
                # evaluate with the averaged weights; undone by reset_avg below
                self.weights.set_avg(self.c)
                avgtime += timer.gap()
                if FLAGS.debuglevel >= 1:
                    print >> logs, "avgweights=", self.weights

            prec = self.eval_on_dev()

            print >> logs, "eval on dev took %.1f seconds." % timer.gap()

            # {8} and {9} (num_steps, num_edges) are passed but not shown
            print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}{7}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h, root={10:.1%}"\
                  .format(it, num_updates, prec, len(self.weights), early_updates,
                          (time.time() - iterstarttime)/3600,
                          (time.time() - starttime)/3600.,
                          "+" if prec > best_prec else "",
                          num_steps, self.decoder.num_edges,
                          prec.root())
            logs.flush()

            if prec > best_prec:
                best_prec = prec
                best_it = it
                best_wlen = len(self.weights)
                best_time = time.time() - starttime
                print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(
                    it, prec)
                if not FLAGS.dump_last:
                    self.dump(self.weights)
                else:
                    # keep a private copy; self.weights keeps changing
                    self.bestweights = self.weights.deepcopy()

            if self.avg:
                timer = Mytime()
                self.weights.reset_avg(self.c)  # restore weights
                t = timer.gap()
                print >> logs, "avg weights (set/reset) took %.1f+%.1f=%.1f seconds." % (
                    avgtime, t, avgtime + t)

            gc.collect()

            if FLAGS.mydouble:
                from mydouble import counts
                print >> logs, "mydouble usage and freed: %d %d" % counts()

        if best_it is None:
            # best_prec is still the plain initial 0, which has no details()
            print >> logs, "no iteration improved on the initial dev score; no weights were dumped."
        else:
            print >> logs, "peaked at iteration {0}: {1} ({3:.1f}h), |bestw|= {2}.".format(
                best_it, best_prec, best_wlen, best_time / 3600)
            print >> logs, best_prec.details()
        print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)"  % \
              (it, time.ctime(), (time.time() - starttime)/3600.)

        # self.bestweights is only assigned above after an improvement
        if FLAGS.dump_last and best_it is not None:
            self.dump(self.bestweights)
Exemplo n.º 7
0
    def read_weights_and_insert_different_noise(self,
                                                filename,
                                                noise_info,
                                                infertemplates=False):
        # ADDED CODE HERE
        '''Load feature weights from filename, perturbing each one with noise.

        Instances are like "s0t-q0t=LRB-</s>=>LEFT     3.8234": a feature
        string and its value separated by whitespace.

        noise_info is a dict with the keys 'method', 'mu', 'sigma' and
        'noise_file_path'.  The array loaded from 'noise_file_path' supplies
        one coefficient per weight line; the noise for a line is
        coefficient * numpy.random.randn() + mu.  Method 'a' adds the noise to
        the weight, method 'm' multiplies the weight by it; any other method
        leaves the weight unchanged.

        The infertemplates argument is recomputed from self.templates, so the
        value passed in by the caller is not consulted.
        '''
        """
            noise_info = {
                'method': FLAGS.noise_method,
                'mu': FLAGS.mu,
                'sigma': FLAGS.sigma,
                'noise_file_path': FLAGS.noise_file_path,
        }
        """
        method = noise_info['method']
        mu = noise_info['mu']
        # NOTE(review): sigma is read here but not used anywhere below; the
        # per-line coefficient from the noise file scales the noise instead.
        sigma = noise_info['sigma']
        noise_file_path = noise_info['noise_file_path']
        noises_vector = numpy.load(noise_file_path)

        # read_templates returns the stream to read weights from (may be None)
        infile = self.read_templates(filename)

        # overrides the argument: infer when at most one template is known
        infertemplates = len(self.templates) <= 1
        if infertemplates:
            print >> logs, "will infer templates from weights..."

        mytime = Mytime()
        i = 0  # stays 0 when there is nothing to read
        if infile is not None:
            print >> logs, "reading feature weights from %s\t" % filename,
            # for i, line in enumerate(infile, 1):
            # zip stops at the shorter input, so weight lines beyond the
            # length of noises_vector are not read
            for i, (line, noises_vector_coeff) in enumerate(
                    zip(infile, noises_vector), 1):

                if i % 200000 == 0:
                    print >> logs, "%d lines read..." % i,

                # exactly two whitespace-separated fields are expected per line
                feat, weight = line.split()
                weight = float(
                    weight
                )  # WVector.value_class(float(weight)) # in case of mydouble
                # scaled standard-normal sample, shifted by mu
                noise_ = float(
                    noises_vector_coeff) * numpy.random.randn() + float(mu)

                if method == 'a':
                    # additive noise
                    weight = weight + noise_
                elif method == 'm':
                    # multiplicative noise
                    weight = weight * noise_

                if FLAGS.use_template_id:
                    # replace the template name by its id from self.templates
                    template, instance = feat.split("=", 1)
                    tid = self.templates[template]
                    feat = "%d=%s" % (tid, instance)

                # NOTE(review): `instance` and `tid` are bound only inside the
                # use_template_id branch above, yet the tuplefeats branch and
                # the doublehash == 2 branch below read them -- presumably those
                # modes require FLAGS.use_template_id; confirm.
                if Model.doublehash == 1:
                    if FLAGS.tuplefeats:
                        f, action = instance.rsplit("=>", 1)
                        # action = Model.mapnames[action]
                        # feature parts are "|"-separated
                        fs = tuple(f.split("|"))
                        self.weights[action][(tid, ) + fs] = weight
                    else:
                        f, action = feat.rsplit("=>", 1)
                        # action = Model.mapnames[action]
                        self.weights[action][f] = weight

                elif Model.doublehash == 2:
                    f, action = instance.rsplit("=>", 1)
                    action = Model.mapnames[action]
                    self.weights[action][tid][f] = weight
                else:
                    self.weights[feat] = weight

                if infertemplates:
                    # NOTE(review): when use_template_id is set, feat was
                    # rewritten above, so this passes the numeric id text
                    # rather than the template name -- confirm this is intended.
                    self.add_template(feat.split("=", 1)[0],
                                      1)  ## one occurrence

        print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % \
                       (len(self.weights), i, mytime.period())

        self.print_autoevals()