Example #1
def tabulate():
    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    inputfiles = []
    for input in inputs:
        if os.path.isdir(input):
            inputfiles.extend(
                os.path.join(input, name) for name in os.listdir(input))
        else:
            inputfiles.append(input)
    inputfiles = [file(inputfile) for inputfile in inputfiles]

    global fsum, esum, allsum, xsum, gram
    fsum = {}  # c(lhs, french)
    esum = {}  # c(lhs, english)
    allsum = 0.0  # c(*)
    xsum = {}  # c(lhs)
    gram = {}

    # read in all rules with matching english sides at the same time.
    # this way, we can sum only those english sides that ever appeared
    # with a french side that passes the filter.

    for rules in read_rule_blocks(inputfiles):
        flag = False
        blocksum = 0.
        for r in rules:
            scores = r.scores
            weight = scores[0]
            allsum += weight
            blocksum += weight
            xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight
            if ffilter is None or ffilter.match(r.f):
                # there used to be a shortcut here -- if fsum.has_key(r.f)
                #fsum[(r.lhs,r.f)] = fsum.get((r.lhs,r.f), 0.0) + weight
                fsum[r.f] = fsum.get(r.f, 0.0) + weight
                if r in gram:
                    gram[r] += r
                else:
                    gram[r] = r
                flag = True
            if log.level >= 1 and count % interval == 0:
                sys.stderr.write(
                    "time: %f, memory: %s, rules in: %d, rules counted: %d\n" %
                    (monitor.cpu(), monitor.memory(), count, len(gram)))

            count += 1
        if flag:
            ewordsnorm = rules[0].e.handle()
            if ewordsnorm in esum:
                sys.stderr.write("warning: files not sorted properly\n")
            esum[ewordsnorm] = blocksum
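
A minimal sketch of what the read_rule_blocks() helper used above could look like, assuming one rule per input line and a hypothetical parse_rule() that exposes the same .e attribute; this is an illustration, not the project's implementation.

import itertools

def read_rule_blocks_sketch(inputfiles, parse_rule):
    # Yield lists of rules that share the same English side.  This relies on
    # each file being sorted by English side, which is why tabulate() warns
    # when files are "not sorted properly".
    for inputfile in inputfiles:
        rules = (parse_rule(line) for line in inputfile)
        for _, block in itertools.groupby(rules, key=lambda r: r.e.handle()):
            yield list(block)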
Example #2
    flags.DEFINE_boolean("uniqstat", False, "print uniq states stat info")
    flags.DEFINE_boolean("seq", False, "print action sequence")
    flags.DEFINE_string("sim", None, "simulate action sequences from FILE", short_name="s")

    flags.DEFINE_boolean("profile", False, "profile")

    flags.DEFINE_boolean("output", True, "output parsed results (turn it off for timing data)")
    flags.DEFINE_boolean("early", False, "use early update")

    flags.DEFINE_string("fakemem", None, "read in a file to occupy memory")

    argv = FLAGS(sys.argv)

    from monitor import memory, human

    start_mem = memory()

    if FLAGS.fakemem:
        s = Model(FLAGS.fakemem)
        t = Model(FLAGS.fakemem)
        print >> logs, "memory usage after read in fake: ", human(memory(start_mem))

    if FLAGS.weights is None:
        if not FLAGS.sim:
            print >> logs, "Error: must specify a weights file" + str(FLAGS)
            sys.exit(1)
        else:
            model = None # can simulate w/o a model
    else:
        model = Model(FLAGS.weights) #FLAGS.model, FLAGS.weights)
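
A hedged usage sketch of the flag handling above, assuming the flags/FLAGS objects come from Google's python-gflags package (an assumption about this codebase, not confirmed by the snippet):

import sys
import gflags

FLAGS = gflags.FLAGS
gflags.DEFINE_string("weights", None, "weights file")
gflags.DEFINE_boolean("early", False, "use early update")

def parse_flags(argv):
    # FLAGS(argv) parses the defined flags and returns the leftover
    # positional arguments (with argv[0] first).
    try:
        return FLAGS(argv)
    except gflags.FlagsError, e:
        sys.stderr.write("%s\nUsage: %s [flags]\n%s\n" % (e, argv[0], FLAGS))
        sys.exit(1)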
Example #3
                        "simulate action sequences from FILE",
                        short_name="s")

    flags.DEFINE_boolean("profile", False, "profile")

    flags.DEFINE_boolean(
        "output", True, "output parsed results (turn it off for timing data)")
    flags.DEFINE_boolean("early", False, "use early update")

    flags.DEFINE_string("fakemem", None, "read in a file to occupy memory")

    argv = FLAGS(sys.argv)

    from monitor import memory, human

    start_mem = memory()

    if FLAGS.fakemem:
        s = Model(FLAGS.fakemem)
        t = Model(FLAGS.fakemem)
        print >> logs, "memory usage after read in fake: ", human(
            memory(start_mem))

    if FLAGS.weights is None:
        if not FLAGS.sim:
            print >> logs, "Error: must specify a weights file" + str(FLAGS)
            sys.exit(1)
        else:
            model = None  # can simulate w/o a model
    else:
        model = Model(FLAGS.weights)  #FLAGS.model, FLAGS.weights)
Example #4
    def train(self):

        start_mem = memory()       

        starttime = time.time()

        if FLAGS.finaldump:
            Perceptron.best_weights = self.decoder.model.new_weights()

##        model name	: Intel(R) Xeon(R) CPU           W3570  @ 3.20GHz

        print >> logs, "%d CPUs at %s %s" % (cpu_count(),
                                             os.popen("cat /proc/cpuinfo|grep [GM]Hz").readlines()[0].strip().split(":")[-1],
                                             os.popen("cat /proc/cpuinfo|grep [GM]Hz").readlines()[-1].strip().split(":")[-1])
        
        print >> logs, "starting perceptron at", time.ctime()        

        best_prec = 0
        acc_steps = 0
        for it in xrange(1, self.iter+1):
            Perceptron.curr = it 

            #ram change
            #print >> logs, "iteration %d starts..............%s" % (it, time.ctime())

            curr_mem = memory() # outside of multi

            # ram change
            #print >> logs, "memory usage at iter %d before pool: %s" % (it, human(memory(start_mem)))

            iterstarttime = time.time()

            if Perceptron.shuffle:
                self.shuffle_train()

            if not Perceptron.singletrain:
                pool = Pool(processes=self.ncpus)
            pool_time = time.time() - iterstarttime

            num_updates, early_updates, total_steps, bad_updates = 0, 0, 0, 0
##            new_allweights, new_weights = self.decoder.model.new_weights(), self.decoder.model.new_weights()

            # ram change
            #print >> logs, "memory usage at iter %d after pool: %s" % (it, human(memory(start_mem)))

            tt= time.time()
            # ram change
            #print >> logs, "before para time...", tt
            results = map(self.train_worker, self.trainchunks) if Perceptron.singletrain else \
                          pool.map(self.train_worker, self.trainchunks, chunksize=1)

            if FLAGS.mydouble:
                print >> logs, \
                      "|w|=", len(Perceptron.weights), "|avgw|=", len(Perceptron.weights) if FLAGS.avg else 0, \
                      "|dw|=", len(results[0][-1])

            print >> logs, "after para time...", time.time()
            compute_time = time.time() - tt

            copy_time = 0
            para_times = []
            for dtime, size, (_num_updates, _early_updates, _steps, _bad_updates), _weights in results:

                num_updates += _num_updates
                early_updates += _early_updates
                total_steps += _steps
                bad_updates += _bad_updates
                
                factor = size / self.trainsize * Perceptron.learning_rate

                tt = time.time()
                if not Perceptron.singletrain: # singletrain: updated in place in one_pass_on_train()
                    Perceptron.weights.iaddc(_weights, factor)

                del _weights #, _allweights
                    
                copy_time += time.time() - tt

                para_times.append(dtime)

            del results
            
            if not Perceptron.singletrain:
                pool.close()
                pool.join()

            # ram change
            #print >> logs, "gc can't reach", gc.collect()

            #print >> logs, "pool_time= %.1f s, compute_walltime= %.1f s, compute_cputime= %.1f (%s), copy_time= %.1f s" \
            #     % (pool_time, compute_time, sum(para_times), " ".join("%.1f" % x for x in para_times), copy_time)

            #print >> logs, "memory usage at iter %d after fork: %s" % (it, human(memory(start_mem)))

            if not Perceptron.singletrain: # N.B.: in non-multiproc mode, self.c is updated in place
                Perceptron.c += self.trainsize / self.ncpus

            #print >> logs, "self.c=", Perceptron.c
            ##print >> logs, "w =", Perceptron.weights
            
            iterendtime = time.time()

            #print >> logs, "memory usage at iter %d: extra %s, total %s" % (it,
            #                                                                human(memory(curr_mem)),
            #                                                                human(memory(start_mem)))
            #if FLAGS.debuglevel >= 1:
            #   print >> logs, "weights=", Perceptron.weights

            curr_mem = memory()
                                                            
            #print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (it, time.ctime())

            #ram adding
            #self.decoder.model.weights = Perceptron.weights # OK if noavg; see above
            #Parser.State.model.weights = Perceptron.weights # multiprocessing: State.model is static

            #prec_before_avg = self.eval_on_dev()
            #print ("Eval on dev before averaging weights, iteration:"+str(it)+" prec:",str(prec_before_avg))

##            avgweights = self.avg_weights() if self.avg else Perceptron.weights
            if self.avg:
##                Perceptron.weights.set_step(Perceptron.c)
                Perceptron.weights.set_avg(Perceptron.c)
            #    if FLAGS.debuglevel >= 1:
            #        print >> logs, "avgweights=", self.weights
                
            avgendtime = time.time()
            #print >> logs, "avg weights (trim) took %.1f seconds." % (avgendtime - iterendtime)
##            avgweights = self.decoder.model.new_weights()

            self.decoder.model.weights = Perceptron.weights # OK if noavg; see above
            Parser.State.model.weights = Perceptron.weights # multiprocessing: State.model is static

            prec = self.eval_on_dev()
            print ("Eval on dev without averaging weights, iteration:"+str(it)+" prec:",str(prec))

            #print >> logs, "eval on dev took %.1f seconds." % (time.time() - avgendtime)

            acc_steps += total_steps
            #print >> logs, "at iter {0}, updates {1} (early {4}, er {10:.1f}%), dev {2}{7}, |w| {3}, time {5:.3f}h acctime {6:.3f}h; steps {8} cover {9:.1f}% accsteps {11}; bad {12} br {13:.1f}%"\
            #      .format(it, num_updates, prec, len(Perceptron.weights), early_updates, \
            #             (time.time() - iterstarttime)/3600,
            #              (time.time() - starttime)/3600.,
            #               "+" if prec > best_prec else "",
            #              total_steps, 100.0*total_steps/Perceptron.trainsteps,
            #              100.*early_updates/num_updates,
            #              acc_steps,
            #              bad_updates, 100.*bad_updates/num_updates) # 13 elements
            logs.flush()

            if prec > best_prec:
                best_prec = prec
                best_it = it
                best_wlen = len(Perceptron.weights)
                if not FLAGS.finaldump:
                    #print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(it, prec)
                    self.dump(Perceptron.weights, it)
                else:
                    Perceptron.best_weights = Perceptron.weights.copy()

            if self.avg:
                Perceptron.weights.reset_avg(Perceptron.c) # restore non-avg

            print >> logs, "gc can't reach", gc.collect()

            logs.flush() # for hpc


        #print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(best_it, best_prec, best_wlen)
        if FLAGS.finaldump:
            #print >> logs, "Dumping best weights..."
            self.dump(Perceptron.best_weights, best_it)
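
Perceptron.weights.set_avg(c) / reset_avg(c) above appear to implement the standard lazy-averaging trick for the averaged perceptron. A minimal dict-based sketch of that idea (an assumption about the unseen weight-vector class, not the project's code):

class AveragedWeights(object):
    def __init__(self):
        self.w = {}   # current weights
        self.wa = {}  # step-weighted sum of updates

    def update(self, feat, delta, c):
        # c is the global update counter (Perceptron.c in the code above)
        self.w[feat] = self.w.get(feat, 0.0) + delta
        self.wa[feat] = self.wa.get(feat, 0.0) + c * delta

    def set_avg(self, c):
        # switch w in place to the averaged weights: w - wa / c
        for feat in self.w:
            self.w[feat] -= self.wa.get(feat, 0.0) / float(c)

    def reset_avg(self, c):
        # undo set_avg(), restoring the raw (non-averaged) weights
        for feat in self.w:
            self.w[feat] += self.wa.get(feat, 0.0) / float(c)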
Example #5
    if opts.french_parse_file:
        french_parse_file = open(makefilename(opts.french_parse_file), "w")
    else:
        french_parse_file = None

    if opts.english_parse_file:
        english_parse_file = open(makefilename(opts.english_parse_file), "w")
    else:
        english_parse_file = None

    if not opts.parallel or parallel.rank != parallel.master:
        thedecoder = make_decoder()
        if log.level >= 1:
            gc.collect()
            log.write("all structures loaded, memory=%s\n" % (monitor.memory(),))

    def process(sent):
        goal = thedecoder.translate(sent)

        thedecoder.process_output(sent, goal)

        if goal is None:
            log.writeln("warning: parse failure")
            return None

        if opts.forest_dir:
            forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
            forest_file.write(
                forest.forest_to_json(
                    goal, fwords=sent.words, mode="english", models=thedecoder.models, weights=thedecoder.weights
Example #6
    def process(sent):
        # Need to add an flen attribute that gives the length of the input sentence.
        # In the lattice-decoding case, we have to make a guess.
        distance = sent.compute_distance()
        sent.flen = distance.get((0, sent.n - 1),
                                 None)  # could be missing if n == 0

        theoracle.input(sent)

        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None: raise Exception("parse failure")
        except Exception:
            import traceback
            log.write("decoder raised exception: %s" %
                      "".join(traceback.format_exception(*sys.exc_info())))
            decoder_errors += 1
            if decoder_errors >= 3:
                log.write(
                    "decoder failed too many times, passing exception through!\n"
                )
                raise
            else:
                return

        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        best_vector, best = decoder.get_nbest(goal, 1)[0]
        best_mvector = theoracle.clean(best_vector)
        best_ovector = theoracle.finish(best_vector, best)
        best_loss = theoracle.make_weights(
            additive="sentence").dot(best_ovector)
        log.write("best hyp: %s %s cost=%s loss=%s\n" %
                  (" ".join(sym.tostring(e) for e in best), best_vector,
                   thedecoder.weights.dot(best_mvector), best_loss))

        # Set up quadratic program
        qp = maxmargin.QuadraticProgram()
        cur_instance = ForestInstance(sent.id, goal)
        qp.add_instance(cur_instance)

        if opts.parallel:
            while MPI.COMM_WORLD.Iprobe(tag=1, source=MPI.ANY_SOURCE):
                log.writeln("received update...\n")
                recv_instance = MPI.COMM_WORLD.recv(tag=1,
                                                    source=MPI.ANY_SOURCE)
                log.writeln("received update for %s" %
                            (recv_instance.instance_id, ))
                # need to check for duplicate instances?
                qp.add_instance(recv_instance)

        # Add cached hyps
        if cache_hyps:
            for instance in qp.instances:
                hyps = hyp_cache[instance.instance_id]
                if len(hyps) > 0:
                    log.writeln("retrieved %d cached hyps for %s" %
                                (len(hyps), instance.instance_id))
                for hyp in hyps:
                    instance.add_hyp(hyp)

        # Make oracle weight vector
        oweights = theoracle.make_weights(additive="sentence")
        oweights *= -1

        # Make vector of learning rates
        # We have to be careful to assign a learning rate to every possible feature
        # This is not very efficient
        feats = set()
        for item in goal.bottomup():
            for ded in item.deds:
                feats.update(ded.dcost)
        for instance in qp.instances:
            for hyp in instance.hyps:
                feats.update(hyp.mvector)
        learning_rates = svector.Vector()
        for feat in feats:
            learning_rates[feat] = compute_feature_learning_rate(feat)
        if log.level >= 3:
            log.writeln("learning rate vector: %s" % learning_rates)

        qp.optimize(thedecoder.weights, oweights, learning_rate=learning_rates)

        thedecoder.weights.compact()
        log.write("feature weights: %s\n" %
                  (thedecoder.weights * watch_features))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        sumweights_helper += nweights * qp.delta_mweights()
        nweights += 1

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates
            for instance in qp.instances:
                """u = svector.Vector(instance.hope.mvector)
                for hyp in instance.hyps:
                    u -= hyp.alpha*hyp.mvector
                sum_updates2 += u*u"""
                for hyp in instance.hyps:
                    if hyp is not instance.hope:  # hyp = instance.hope is a non-update
                        u = instance.hope.mvector - hyp.mvector
                        sum_updates2 += hyp.alpha * (u * u)
                        n_updates += hyp.alpha

            #log.write("sum of squared updates: %s\n" % (" ".join("%s=%s" % (f,sum_updates2[f]) for f in watch_features)))
            log.write("feature learning rates: %s\n" %
                      (" ".join("%s=%s" % (f, compute_feature_learning_rate(f))
                                for f in watch_features)))

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.Test()]

            # transmit updates to other nodes
            # make a plain Instance (without forest)
            # we used to designate a hope translation,
            #send_instance = maxmargin.Instance(cur_instance.hyps, hope=cur_instance.hope, instance_id=cur_instance.sentid)
            # but now are letting the other node choose.
            send_instance = maxmargin.Instance(cur_instance.hyps,
                                               instance_id=cur_instance.sentid)

            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(
                        MPI.COMM_WORLD.isend(send_instance, dest=node, tag=1))

        # save all hyps for next time
        if cache_hyps:
            epsilon = 0.01
            for instance in qp.instances:
                hyps = hyp_cache[instance.instance_id]
                for hyp in instance.hyps:
                    for hyp1 in hyps:
                        if ((hyp.mvector - hyp1.mvector).normsquared() <= epsilon and
                                (hyp.ovector - hyp1.ovector).normsquared() <= epsilon):
                            break
                    else:
                        if log.level >= 2:
                            log.writeln("add hyp to cache: %s" % hyp)
                        hyps.append(hyp)

        theoracle.update(best_ovector)
        sent.score_comps = best_ovector

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
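
The nweights / sumweights_helper bookkeeping above supports weight averaging via an identity of the form avg = w_n - helper / n, since the helper accumulates i * (delta w_i); the exact indexing depends on how nweights is initialized, which this snippet does not show. A dict-based sketch of how the averaged vector could be recovered (not this project's own averaging code):

def averaged_weights(weights, sumweights_helper, nweights):
    # avg[f] = w_n[f] - helper[f] / n, where helper accumulates i * delta_w_i
    avg = dict(weights)
    for feat, helper_val in sumweights_helper.items():
        avg[feat] = avg.get(feat, 0.0) - helper_val / float(nweights)
    return avg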
Example #7
        if log.level >= 1:
            log.write("Reading configuration from %s\n" % opts.config)
        execfile(opts.config)

    if len(args) >= 1 and args[0] != "-":
        input_file = file(args[0], "r")
    else:
        input_file = sys.stdin

    if len(args) >= 2 and args[1] != "-":
        output_file = file(args[1], "w")
    else:
        output_file = sys.stdout

    gc.collect()
    if log.level >= 1:
        log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
        log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))

    sents = sgml.read_raw(input_file)
    for sent in sents:
        mark = sent.getmark()
        if mark is not None:
            (tag, attrs) = mark
            if tag == "seg":
                sent.unmark()
                dattrs = sgml.attrs_to_dict(attrs)
                sent.meta = attrs
        extract_grammar(sent)

Example #8
            count = {}
            files = []
            for words in input:
                line = " ".join(words)
                if opts.key:
                    key = " ".join(words[keystart:keystop])
                else:
                    key = line
                if opts.parallel and myhash.myhash(key, modulus) != residue:
                    continue
                count[line] = count.get(line, 0) + 1
                if len(count) >= opts.max_types:
                    if opts.verbose:
                        sys.stderr.write(
                            "writing counts to temporary file (memory=%s)\n" %
                            monitor.memory())
                    keys = count.keys()
                    keys.sort()
                    f = tempfile.TemporaryFile()
                    for key in keys:
                        f.write("%s\t%s\n" % (count[key], key))
                    f.seek(0)
                    files.append(f)
                    count = {}
                    del keys

        fileinput.close()
        sys.stderr.write("merging %d files to output (memory=%s)\n" %
                         (len(files), monitor.memory()))

        heap = []
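
The snippet is cut off just after heap = []; a hedged sketch of the merge step it leads into, combining the sorted temporary count files into one sorted stream the way sort | uniq -c would (the exact output format here is an assumption):

import heapq

def merge_counts(files, out):
    def entries(f):
        # each temp-file line is "<count>\t<key>", written in sorted key order
        for line in f:
            cnt, key = line.rstrip("\n").split("\t", 1)
            yield key, int(cnt)

    prev_key, total = None, 0
    for key, cnt in heapq.merge(*[entries(f) for f in files]):
        if prev_key is not None and key != prev_key:
            out.write("%s\t%s\n" % (total, prev_key))
            total = 0
        prev_key, total = key, total + cnt
    if prev_key is not None:
        out.write("%s\t%s\n" % (total, prev_key))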
Example #9
    def process(sent):
        # Need to add an flen attribute that gives the length of the input sentence.
        # In the lattice-decoding case, we have to make a guess.
        distance = sent.compute_distance()
        sent.flen = distance.get((0,sent.n-1), None) # could be missing if n == 0

        theoracle.input(sent)
        
        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None: raise Exception("parse failure")
        except Exception:
            import traceback
            log.write("decoder raised exception: %s" % "".join(traceback.format_exception(*sys.exc_info())))
            decoder_errors += 1
            if decoder_errors >= 3:
                log.write("decoder failed too many times, passing exception through!\n")
                raise
            else:
                return

        goal.rescore(theoracle.models, thedecoder.weights, add=True)
            
        best_vector, best = decoder.get_nbest(goal, 1)[0]
        best_mvector = theoracle.clean(best_vector)
        best_ovector = theoracle.finish(best_vector, best)
        best_loss = theoracle.make_weights(additive="sentence").dot(best_ovector)
        log.write("best hyp: %s %s cost=%s loss=%s\n"  % (" ".join(sym.tostring(e) for e in best), best_vector, thedecoder.weights.dot(best_mvector), best_loss))

        # Set up quadratic program
        qp = maxmargin.QuadraticProgram()
        cur_instance = ForestInstance(sent.id, goal)
        qp.add_instance(cur_instance)

        if opts.parallel:
            while MPI.COMM_WORLD.Iprobe(tag=1, source=MPI.ANY_SOURCE):
                log.writeln("received update...\n")
                recv_instance = MPI.COMM_WORLD.recv(tag=1, source=MPI.ANY_SOURCE)
                log.writeln("received update for %s" % (recv_instance.instance_id,))
                # need to check for duplicate instances?
                qp.add_instance(recv_instance)

        # Add cached hyps
        if cache_hyps:
            for instance in qp.instances:
                hyps = hyp_cache[instance.instance_id]
                if len(hyps) > 0:
                    log.writeln("retrieved %d cached hyps for %s" % (len(hyps), instance.instance_id))
                for hyp in hyps:
                    instance.add_hyp(hyp)

        # Make oracle weight vector
        oweights = theoracle.make_weights(additive="sentence")
        oweights *= -1

        # Make vector of learning rates
        # We have to be careful to assign a learning rate to every possible feature
        # This is not very efficient
        feats = set()
        for item in goal.bottomup():
            for ded in item.deds:
                feats.update(ded.dcost)
        for instance in qp.instances:
            for hyp in instance.hyps:
                feats.update(hyp.mvector)
        learning_rates = svector.Vector()
        for feat in feats:
            learning_rates[feat] = compute_feature_learning_rate(feat)
        if log.level >= 3:
            log.writeln("learning rate vector: %s" % learning_rates)

        qp.optimize(thedecoder.weights, oweights, learning_rate=learning_rates)

        thedecoder.weights.compact()
        log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        sumweights_helper += nweights * qp.delta_mweights()
        nweights += 1

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates
            for instance in qp.instances:
                """u = svector.Vector(instance.hope.mvector)
                for hyp in instance.hyps:
                    u -= hyp.alpha*hyp.mvector
                sum_updates2 += u*u"""
                for hyp in instance.hyps:
                    if hyp is not instance.hope: # hyp = instance.hope is a non-update
                        u = instance.hope.mvector - hyp.mvector
                        sum_updates2 += hyp.alpha*(u*u)
                        n_updates += hyp.alpha

            #log.write("sum of squared updates: %s\n" % (" ".join("%s=%s" % (f,sum_updates2[f]) for f in watch_features)))
            log.write("feature learning rates: %s\n" % (" ".join("%s=%s" % (f,compute_feature_learning_rate(f)) for f in watch_features)))

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.Test()]

            # transmit updates to other nodes
            # make a plain Instance (without forest)
            # we used to designate a hope translation,
            #send_instance = maxmargin.Instance(cur_instance.hyps, hope=cur_instance.hope, instance_id=cur_instance.sentid)
            # but now are letting the other node choose.
            send_instance = maxmargin.Instance(cur_instance.hyps, instance_id=cur_instance.sentid)

            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(MPI.COMM_WORLD.isend(send_instance, dest=node, tag=1))

        # save all hyps for next time
        if cache_hyps:
            epsilon = 0.01
            for instance in qp.instances:
                hyps = hyp_cache[instance.instance_id]
                for hyp in instance.hyps:
                    for hyp1 in hyps:
                        if (hyp.mvector-hyp1.mvector).normsquared() <= epsilon and (hyp.ovector-hyp1.ovector).normsquared() <= epsilon:
                            break
                    else:
                        if log.level >= 2:
                            log.writeln("add hyp to cache: %s" % hyp)
                        hyps.append(hyp)

        theoracle.update(best_ovector)
        sent.score_comps = best_ovector

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Example #10
    if opts.french_parse_file:
        french_parse_file = open(makefilename(opts.french_parse_file), "w")
    else:
        french_parse_file = None

    if opts.english_parse_file:
        english_parse_file = open(makefilename(opts.english_parse_file), "w")
    else:
        english_parse_file = None

    if not opts.parallel or parallel.rank != parallel.master:
        thedecoder = make_decoder()
        if log.level >= 1:
            gc.collect()
            log.write("all structures loaded, memory=%s\n" %
                      (monitor.memory(), ))

    def process(sent):
        goal = thedecoder.translate(sent)

        thedecoder.process_output(sent, goal)

        if goal is None:
            log.writeln("warning: parse failure")
            return None

        if opts.forest_dir:
            forest_file = gzip.open(
                os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
            forest_file.write(
                forest.forest_to_json(goal,
Example #11
    def train(self):

        start_mem = memory()

        starttime = time.time()

        print >> logs, "starting perceptron at", time.ctime()

        best_prec = 0
        for it in xrange(1, self.iter + 1):

            print >> logs, "iteration %d starts..............%s" % (
                it, time.ctime())

            curr_mem = memory()
            iterstarttime = time.time()
            num_updates, early_updates = self.one_pass_on_train()
            iterendtime = time.time()

            print >> logs, "memory usage at iter %d: extra %s, total %s" % (
                it, human(memory(curr_mem)), human(memory(start_mem)))
            curr_mem = memory()

            print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
                it, time.ctime())
            avgweights = self.avg_weights() if self.avg else self.weights
            avgendtime = time.time()
            print >> logs, "avg weights (trim) took %.1f seconds." % (
                avgendtime - iterendtime)
            if FLAGS.debuglevel >= 2:
                print >> logs, "avg w=", avgweights
            self.decoder.model.weights = avgweights
            prec = self.eval_on_dev()
            print >> logs, "eval on dev took %.1f seconds." % (time.time() -
                                                               avgendtime)


            print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h"\
                  .format(it, num_updates, prec, len(avgweights), early_updates, \
                          (time.time() - iterstarttime)/3600, (time.time() - starttime)/3600.)
            logs.flush()

            if prec > best_prec:
                best_prec = prec
                best_it = it
                best_wlen = len(avgweights)
                print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(
                    it, prec)
                self.dump(avgweights)

            self.decoder.model.weights = self.weights  # restore non-avg

            del avgweights
            gc.collect()

            if FLAGS.mydouble:
                from mydouble import counts
                print >> logs, "mydouble usage and freed: %d %d" % counts()

        print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(
            best_it, best_prec, best_wlen)
        print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)"  % \
              (it, time.ctime(), (time.time() - starttime)/3600.)
Example #12
            # the input. The output is sorted, just like
            # sort | uniq -c.
            count = {}
            files = []
            for words in input:
                line = " ".join(words)
                if opts.key:
                    key = " ".join(words[keystart:keystop])
                else:
                    key = line
                if opts.parallel and myhash.myhash(key,modulus) != residue:
                    continue
                count[line] = count.get(line, 0)+1
                if len(count) >= opts.max_types:
                    if opts.verbose:
                        sys.stderr.write("writing counts to temporary file (memory=%s)\n" % monitor.memory())
                    keys = count.keys()
                    keys.sort()
                    f = tempfile.TemporaryFile()
                    for key in keys:
                        f.write("%s\t%s\n" % (count[key], key))
                    f.seek(0)
                    files.append(f)
                    count = {}
                    del keys

        fileinput.close()
        sys.stderr.write("merging %d files to output (memory=%s)\n" % (len(files), monitor.memory()))

        heap = []
        for f in files:
Example #13
    def train(self):

        start_mem = memory()

        starttime = time.time()

        print >> logs, "starting perceptron at", time.ctime()

        best_prec = 0
        for it in xrange(1, self.iter + 1):

            print >> logs, "iteration %d starts..............%s" % (
                it, time.ctime())

            curr_mem = memory()
            iterstarttime = time.time()
            self.decoder.num_edges = 0
            num_updates, early_updates, num_steps = self.one_pass_on_train()
            iterendtime = time.time()

            print >> logs, "memory usage at iter %d: extra %s, total %s" % (
                it, human(memory(curr_mem)), human(memory(start_mem)))
            if FLAGS.debuglevel >= 1:
                print >> logs, "weights=", self.weights

            curr_mem = memory()

            print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
                it, time.ctime())
            ##            avgweights = self.avg_weights() if self.avg else self.weights

            avgtime = 0
            timer = Mytime()
            if self.avg:
                ##                print >> logs, "    w=", self.weights
                ##                print >> logs, " ".join(map(str, [x.get_step() for x in self.weights.values()]))
                self.weights.set_avg(self.c)
                avgtime += timer.gap()
                if FLAGS.debuglevel >= 1:
                    print >> logs, "avgweights=", self.weights

            prec = self.eval_on_dev()

            print >> logs, "eval on dev took %.1f seconds." % timer.gap()

            print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}{7}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h, root={10:.1%}"\
                  .format(it, num_updates, prec, len(self.weights), early_updates,
                          (time.time() - iterstarttime)/3600,
                          (time.time() - starttime)/3600.,
                          "+" if prec > best_prec else "",
                          num_steps, self.decoder.num_edges,
                          prec.root())
            logs.flush()

            if prec > best_prec:
                best_prec = prec
                best_it = it
                best_wlen = len(self.weights)
                best_time = time.time() - starttime
                print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(
                    it, prec)
                if not FLAGS.dump_last:
                    self.dump(self.weights)
                else:
                    self.bestweights = self.weights.deepcopy()

            if self.avg:
                timer = Mytime()
                self.weights.reset_avg(self.c)  # restore weights
                t = timer.gap()
                print >> logs, "avg weights (set/reset) took %.1f+%.1f=%.1f seconds." % (
                    avgtime, t, avgtime + t)


##            self.decoder.model.weights = self.weights # restore non-avg

##            del avgweights
            gc.collect()

            if FLAGS.mydouble:
                from mydouble import counts
                print >> logs, "mydouble usage and freed: %d %d" % counts()

        print >> logs, "peaked at iteration {0}: {1} ({3:.1f}h), |bestw|= {2}.".format(
            best_it, best_prec, best_wlen, best_time / 3600)
        print >> logs, best_prec.details()
        print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)"  % \
              (it, time.ctime(), (time.time() - starttime)/3600.)

        if FLAGS.dump_last:
            self.dump(self.bestweights)
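
Mytime is not defined in this snippet; a hedged sketch of a timer with the gap() behavior used above (seconds since the previous call), purely as an illustration of the assumed interface:

import time

class Mytime(object):
    def __init__(self):
        self.last = time.time()

    def gap(self):
        # seconds elapsed since construction or the previous gap() call
        now = time.time()
        elapsed = now - self.last
        self.last = now
        return elapsed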
Example #14
def main():

    if FLAGS.sim is not None:
        sequencefile = open(FLAGS.sim)

    parser = Parser(model, b=FLAGS.beam)

    print >> logs, "memory usage before parsing: ", human(memory(start_mem))

    totalscore = 0
    totalstates = 0
    totaluniq = 0
    totaledges = 0
    totaltime = 0

    totalprec = DepVal()    
    totaloracle = DepVal()

    print >> logs, "gc.collect unreachable: %d" % gc.collect()

    if FLAGS.manual_gc:
        gc.disable()
    
    i = 0
    gctime = 0
    for i, line in enumerate(shell_input(), 1):

        if FLAGS.manual_gc and i % FLAGS.gc == 0:
            print >> logs, "garbage collection...",
            tt = time.time()
            print >> logs, "gc.collect unreachable: %d" % gc.collect()
            tt = time.time() - tt
            print >> logs, "took %.1f seconds" % tt
            gctime += tt

        line = line.strip()
        if line[0]=="(":
            # input is a gold tree (so that we can evaluate)
            reftree = DepTree.parse(line)
            sentence = DepTree.sent # assigned in DepTree.parse()            
        else:
            # input is word/tag list
            reftree = None
            sentence = [tuple(x.rsplit("/", 1)) for x in line.split()]   # split by default returns list            
            DepTree.sent = sentence

        if FLAGS.debuglevel >= 1:
            print >> logs, sentence
            print >> logs, reftree

        mytime.zero()
        
        if FLAGS.sim is not None: # simulation, not parsing
            actions = map(int, sequencefile.readline().split())
            goal, feats = parser.simulate(actions, sentence) #if model is None score=0
            print >> logs, feats
            score, tree = goal.score, goal.top()
            (nstates, nedges, nuniq) = (0, 0, 0)
        else:
            # real parsing
            if True: #FLAGS.earlystop:
                refseq = reftree.seq() if reftree is not None else None
                tree, myseq, score, _ = parser.try_parse(sentence, refseq, update=False)
                if FLAGS.early:
                    print >> logs, "ref=", refseq
                    print >> logs, "myt=", myseq

                    refseq = refseq[:len(myseq)] # truncate
                    _, reffeats = parser.simulate(refseq, sentence) 
                    _, myfeats = parser.simulate(myseq, sentence)
                    print >> logs, "+feats", reffeats
                    print >> logs, "-feats", myfeats
                    
                nstates, nedges, nuniq = parser.stats()
            else:
                goal = parser.parse(sentence)
                nstates, nedges, nuniq = parser.stats()

##        score, tree = goal.score, goal.top()
#        score, tree = mytree
            
        dtime = mytime.period()

        if not FLAGS.early and not FLAGS.profile:
            if FLAGS.forest:
                parser.dumpforest(i)
            elif FLAGS.output:
                if not FLAGS.kbest:
                    print tree
                else:
                    stuff = parser.beams[-1][:FLAGS.kbest]
                    print "sent.%d\t%d" % (i, len(stuff))
                    for state in stuff:
                        print "%.2f\t%s" % (state.score, state.tree())
                    print
                    
            if FLAGS.oracle:
                oracle, oracletree = parser.forestoracle(reftree)
                totaloracle += oracle

        prec = DepTree.compare(tree, reftree) # OK if either is None

        searched = sum(x.derivation_count() for x in parser.beams[-1]) if FLAGS.forest else 0
        print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
              "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
              .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                      ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
        if FLAGS.seq:
            actions = goal.all_actions()
            print >> logs, " ".join(actions)
            check = simulate(actions, sentence, model) #if model is None score=0
            checkscore = check.score
            checktree = check.top()
            print >> logs, checktree
            checkprec = checktree.evaluate(reftree)
            print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % (tree == checktree, score == checkscore, prec == checkprec)
            print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
                  (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)

        totalscore += score
        totalstates += nstates
        totaledges += nedges
        totaluniq += nuniq
        totaltime += dtime

        totalprec += prec

    if i == 0:
        print >> logs, "Error: empty input."
        sys.exit(1)

    if FLAGS.featscache:
        print >> logs, "feature constructions: tot= %d shared= %d (%.2f%%)" % (State.tot, State.shared, State.shared / State.tot * 100)

    print >> logs, "beam= {b}, avg {a} sents,\tmodelcost= {c:.2f}\tprec= {p:.2%}" \
          "\tstates= {ns:.1f} (uniq {uq:.1f})\tedges= {ne:.1f}\ttime= {t:.4f}\n{d:s}" \
          .format(b=FLAGS.b, a=i, c=totalscore/i, p=totalprec.prec(), 
                  ns=totalstates/i, uq=totaluniq/i, ne=totaledges/i, t=totaltime/i, 
                  d=totalprec.details())
    
    if FLAGS.uniqstat:
        for i in sorted(uniqstats):
            print >> logs, "%d\t%.1lf\t%d\t%d" % \
                  (i, sum(uniqstats[i]) / len(uniqstats[i]), \
                   min(uniqstats[i]), max(uniqstats[i]))

    if FLAGS.oracle:
        print >> logs, "oracle= ", totaloracle

    if FLAGS.manual_gc:
        print >> logs, "garbage collection took %.1f seconds" % gctime

    print >> logs, "memory usage after parsing: ", human(memory(start_mem))
    if FLAGS.mydouble:
        from mydouble import counts
        print >> logs, "mydouble usage and freed: %d %d" % counts()
Example #15
def main(argv=None):
	'''Call this from the command-line to create a 
	pre-computed binary data array for later use'''
	if argv is None:
		argv = sys.argv

	parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
								"\n\nNote: -d,-s,-a, and -p are mutually exclusive")
	parser.add_option("-d", "--data-array", 
					action="store_true", default=False,
					dest="da", help="Compile file into data array (default)")
	parser.add_option("-s", "--suffix-array", 
					action="store_true", default=False,
					dest="sa", help="Compile file into suffix array")
	parser.add_option("-a", "--alignment", 
					action="store_true", default=False,
					dest="a", help="Compile file into alignment")
	parser.add_option("-l", "--lexical", 
					action="store_true", default=False,
					dest="l", help="Compile file into lex file")
	parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
					dest="lex_args", help="Compute lex file from data",
					metavar="<f file> <e file>")
	parser.add_option("-p", "--parse", 
					action="store_true", default=False,
					dest="p", help="Compile file into parse")
	parser.add_option("-b", "--binary-infile", 
					action="store_true", default=False,
					dest="bin", help="Input file is binary (default: text)")
	parser.add_option("-t", "--text-outfile", 
					action="store_true", default=False,
					dest="text", help="Output file is text (default: binary)")
	parser.add_option("-e", "--enhanced-outfile", 
					action="store_true", default=False,
					dest="enhanced", help="Output file is enhanced text (default: binary)")
	parser.add_option("-r", action="store", nargs=7,
					dest="precomp_args", help="Precompute collocations (Hiero only)", 
					metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
	(options, args) = parser.parse_args()

	filetype_opts =  [options.da, options.sa, options.a, options.p]

	if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2:
		parser.print_help()
		sys.exit(1)

	(infilename, outfilename) = args
	if options.bin:
		bin = " binary"
	else:
		bin = ""

	start_time = monitor.cpu()
	if options.precomp_args:
		if options.bin:
			obj = precomputation.Precomputation(infilename, from_binary=True)
		else:
			keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
			precomp_opts = {} 
			sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
			for pair in options.precomp_args:
				(key, val) = pair.split("=")
				if key in keys:
					keys.remove(key)
					if key != "sa":
						val = int(val)
					precomp_opts[key] = val
				else:
					sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key)
					return 1
			sa = csuf.SuffixArray(precomp_opts["sa"], True)
			obj = precomputation.Precomputation(infilename, sa, 
				precompute_rank=precomp_opts["rank1"], 
				precompute_secondary_rank=precomp_opts["rank2"], 
				max_length=precomp_opts["max-len"], 
				max_nonterminals=precomp_opts["max-nt"], 
				train_max_initial_size=precomp_opts["max-size"], 
				train_min_gap_size=precomp_opts["min-gap"])
	elif options.sa:
		sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin))
		obj = csuf.SuffixArray(infilename, options.bin)
	elif options.a:
		sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin))
		obj = calignment.Alignment(infilename, options.bin)
	elif options.p:
		sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin))
		obj = parse.ParseArray(infilename, options.bin)
	elif options.l:
		sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin))
		obj = clex.CLex(infilename, options.bin)
	elif options.lex_args:
		ffile = options.lex_args[0]
		efile = options.lex_args[1]
		sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
		fsarray = csuf.SuffixArray(ffile, True)
		earray = cdat.DataArray(efile, True)
		aarray = calignment.Alignment(infilename, True)
		obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
	else:
		sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin))
		obj = cdat.DataArray(infilename, options.bin)

	sys.stderr.write("  Total time for read: %f\n" % (monitor.cpu() - start_time))
	start_time = monitor.cpu()
	if options.text:
		sys.stderr.write("Writing text file %s...\n" % outfilename)
		obj.write_text(outfilename)
	elif options.enhanced:
		sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
		obj.write_enhanced(outfilename)
	else:
		sys.stderr.write("Writing binary file %s...\n" % outfilename)
		obj.write_binary(outfilename)
	sys.stderr.write("Finished.\n")
	sys.stderr.write("  Total time for write: %f\n" % (monitor.cpu() - start_time))

	mem_use = float(monitor.memory())
	metric = "B"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "KB"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "MB"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "GB"
	sys.stderr.write("  Memory usage: %.1f%s\n" % (mem_use, metric))
Example #16
        if opts.output_dir is not None and len(gram) >= opts.dump_size:
            if opts.parallel:
                name = "%04d.%04d" % (opts.parallel[0], n_dump)
            else:
                name = "%04d" % n_dump
            dump_rules(gram, opts.output_dir, name)
            dumped += len(gram)
            gram = {}
            n_dump += 1

        if log.level >= 1 and count % slice == 0:
            sys.stderr.write("time: %f, sentences in: %d (%.1f/sec), " %
                             (time.time() - start_time, count, slice /
                              (time.time() - prev_time)))
            sys.stderr.write("rules out: %d+%d\n" % (dumped, len(gram)))
            sys.stderr.write("memory: %s\n" % monitor.memory())
            prev_time = time.time()

        count += 1

    if opts.output_dir is not None:
        if opts.parallel:
            name = "%04d.%04d" % (opts.parallel[0], n_dump)
        else:
            name = "%04d" % n_dump
        dump_rules(gram, opts.output_dir, name)
    else:
        dump_rules(gram, output_file)
    """if opts.output_forests:
        pickler.dump(sym.alphabet)"""
Example #17
File: mira.py Project: isi-nlp/sbmt
    def process(sent):
        global alphas

        if online_learning:
            updates.clear()
            alphas.clear()

        theoracle.input(sent)

        log.write("done preparing\n")

        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None:
                raise Exception("parse failure")
        except Exception:
            import traceback

            log.writeln(
                "decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info())))
            )
            decoder_errors += 1
            if decoder_errors >= 100:
                log.write("decoder failed too many times, passing exception through!\n")
                raise
            else:
                return

        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write(
            "best hyp: %s %s cost=%s score=%s\n"
            % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore)
        )

        goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

        assert (
            sent.id not in updates
        )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

        updates[sent.id] = [(svector.Vector(), 0.0)]
        alphas[sent.id] = [max_learning_rate]

        if opts.parallel:
            while True:
                if mpi.world.iprobe(tag=1):
                    (sentid, vscores) = mpi.world.recv(tag=1)
                    log.write("received update for %s\n" % (sentid,))

                    if sentid in updates:  # see comment above
                        log.write("ignoring update for %s\n" % (sentid,))
                        continue  # drop this update on the floor

                    updates[sentid] = vscores
                    alphas[sentid] = [max_learning_rate] + [0.0] * (len(vscores) - 1)
                    # since the first update is zero, the alphas & updates
                    # are still consistent with weights
                else:
                    break

        def oracle(weights):
            hyps = get_hyps(sent, goal, weights)
            return [(goldv - hypv, goldscore - hypscore) for (hypv, hyp, hypscore) in hyps]

        thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates, alphas, {sent.id: oracle})

        remove_zeros(thedecoder.weights)
        log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))
        log.write("weight norm: %s\n" % (math.sqrt(thedecoder.weights.normsquared())))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        for sentid in updates:
            for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                apply_update(sumweights_helper, nweights * alpha * v)
        nweights += 1

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates, feature_scales
            for sentid in updates:
                u = svector.Vector()
                for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                    u += alpha / max_learning_rate * v
                sum_updates2 += u * u
                n_updates += 1

            try:
                default_feature_scale = 1.0 / compute_variance(0, n_updates)
            except ZeroDivisionError:
                default_feature_scale = 0.0  # pseudoinverse
            feature_scales = collections.defaultdict(lambda: default_feature_scale)
            for feat in sum_updates2:
                try:
                    feature_scales[feat] = 1.0 / compute_variance(sum_updates2[feat], n_updates)
                except ZeroDivisionError:
                    feature_scales[feat] = 0.0  # pseudoinverse

            log.write(
                "feature scales: %s\n"
                % (" ".join("%s=%s" % (f, feature_scales[f]) for f in watch_features if f in feature_scales))
            )

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.test()]

            # transmit updates to other nodes
            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(mpi.world.isend(value=(sent.id, updates[sent.id]), dest=node, tag=1))

        bestv = theoracle.finish(bestv, best)
        theoracle.update(bestv)
        sent.score_comps = bestv

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Example #18
    def train(self):

        start_mem = memory()

        starttime = time.time()
        ##        model name	: Intel(R) Xeon(R) CPU           W3570  @ 3.20GHz

        print >> logs, "%d CPUs at %s %s" % (
            cpu_count(), os.popen("cat /proc/cpuinfo|grep GHz").readlines()
            [-1].strip().split(":")[-1], os.popen("cat /proc/cpuinfo|grep MHz")
            .readlines()[-1].strip().split(":")[-1])

        print >> logs, "starting perceptron at", time.ctime()

        best_prec = 0
        for it in xrange(1, self.iter + 1):

            print >> logs, "iteration %d starts..............%s" % (
                it, time.ctime())

            curr_mem = memory()  # outside of multi

            print >> logs, "memory usage at iter %d before pool: %s" % (
                it, human(memory(start_mem)))

            iterstarttime = time.time()

            if Perceptron.shuffle:
                self.shuffle_train()

            if not FLAGS.singletrain:
                pool = Pool(processes=self.ncpus)
            pool_time = time.time() - iterstarttime

            num_updates, early_updates = 0, 0
            ##            new_allweights, new_weights = self.decoder.model.new_weights(), self.decoder.model.new_weights()

            print >> logs, "memory usage at iter %d after pool: %s" % (
                it, human(memory(start_mem)))

            tt = time.time()
            print >> logs, "before para time...", tt
            results = map(self.train_worker, self.trainchunks) if FLAGS.singletrain else \
                          pool.map(self.train_worker, self.trainchunks, chunksize=1)

            if FLAGS.mydouble:
                print >> logs, "mydouble usage and freed: %d %d" % counts(), \
                      "|w|=", len(Perceptron.weights), "|avgw|=", len(Perceptron.allweights) if FLAGS.avg else 0, \
                      "|dw|=", len(results[0][-1][0])

            print >> logs, "after para time...", time.time()
            compute_time = time.time() - tt

            copy_time = 0
            para_times = []
            for dtime, size, (_num_updates, _early_updates), (_weights, _allweights) in results:

                num_updates += _num_updates
                early_updates += _early_updates

                factor = size / self.trainsize  # not exactly uniform (if not equal-size split)!

                tt = time.time()
                if not FLAGS.singletrain:
                    Perceptron.weights.iaddc(_weights, factor)
#                 print _weights
#                 print new_weights
#                 print

                if self.avg:
                    if FLAGS.naiveavg:
                        Perceptron.allweights.iaddc(_allweights, factor)
                    else:
                        Perceptron.allweights.iaddc(_allweights, factor)

                del _weights, _allweights

                copy_time += time.time() - tt

                para_times.append(dtime)

            del results

            if not FLAGS.singletrain:
                pool.close()
                pool.join()
##            else:
##                del self.delta_weights, self.delta_allweights # not in process

            print >> logs, "gc can't reach", gc.collect()

            print >> logs, "pool_time= %.1f s, compute_walltime= %.1f s, compute_cputime= %.1f (%s), copy_time= %.1f s" \
                  % (pool_time, compute_time, sum(para_times), " ".join("%.1f" % x for x in para_times), copy_time)

            print >> logs, "memory usage at iter %d after fork: %s" % (
                it, human(memory(start_mem)))

            if not FLAGS.singletrain:  # N.B.: in non-multiproc mode, self.c is updated
                Perceptron.c += self.trainsize / self.ncpus
                print >> logs, "self.c=", Perceptron.c

#             if self.avg:
#                 Perceptron.allweights = new_allweights
#            Perceptron.weights, Decoder.model.weights = new_weights, new_weights

##            num_updates, early_updates = self.one_pass_on_train() # old single-cpu
            iterendtime = time.time()

            print >> logs, "memory usage at iter %d: extra %s, total %s" % (
                it, human(memory(curr_mem)), human(memory(start_mem)))
            if FLAGS.debuglevel >= 1:
                print >> logs, "weights=", Perceptron.weights

            curr_mem = memory()

            print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
                it, time.ctime())

            avgweights = self.avg_weights() if self.avg else Perceptron.weights
            if FLAGS.avg and FLAGS.debuglevel >= 1:
                print >> logs, "avgweights=", avgweights

            avgendtime = time.time()
            print >> logs, "avg weights (trim) took %.1f seconds." % (
                avgendtime - iterendtime)
            if FLAGS.debuglevel >= 2:
                print >> logs, "avg w=", avgweights


##            avgweights = self.decoder.model.new_weights()

            self.decoder.model.weights = avgweights  # OK if noavg; see above
            Parser.State.model.weights = avgweights  # multiprocessing: State.model is static

            prec = self.eval_on_dev()
            print >> logs, "eval on dev took %.1f seconds." % (time.time() -
                                                               avgendtime)


            print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h"\
                  .format(it, num_updates, prec, len(avgweights), early_updates, \
                          (time.time() - iterstarttime)/3600, (time.time() - starttime)/3600.)
            logs.flush()

            if prec > best_prec:
                best_prec = prec
                best_it = it
                best_wlen = len(avgweights)
                print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(
                    it, prec)
                self.dump(avgweights)

            self.decoder.model.weights = Perceptron.weights  # restore non-avg
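            # (dev evaluation above used the averaged weights; training in the next
            # iteration presumably has to continue from the raw, non-averaged vector,
            # hence the restore here before avgweights is discarded.)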

            del avgweights
            print >> logs, "gc can't reach", gc.collect()

            if FLAGS.mydouble:
                print >> logs, "mydouble usage and freed: %d %d ------------------------" % counts(
                )

            logs.flush()  # for hpc

        print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(
            best_it, best_prec, best_wlen)
        print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)"  % \
              (it, time.ctime(), (time.time() - starttime)/3600.)
Exemplo n.º 19
0
            pickler.clear_memo()

        if opts.output_dir is not None and len(gram) >= opts.dump_size:
            if opts.parallel:
                name = "%04d.%04d" % (opts.parallel[0], n_dump)
            else:
                name = "%04d" % n_dump
            dump_rules(gram, opts.output_dir, name)
            dumped += len(gram)
            gram = {}
            n_dump += 1

        if log.level >= 1 and count%slice == 0:
            sys.stderr.write("time: %f, sentences in: %d (%.1f/sec), " % (time.time()-start_time, count, slice/(time.time()-prev_time)))
            sys.stderr.write("rules out: %d+%d\n" % (dumped, len(gram)))
            sys.stderr.write("memory: %s\n" % monitor.memory())
            prev_time = time.time()

        count += 1

    if opts.output_dir is not None:
        if opts.parallel:
            name = "%04d.%04d" % (opts.parallel[0], n_dump)
        else:
            name = "%04d" % n_dump
        dump_rules(gram, opts.output_dir, name)
    else:
        dump_rules(gram, output_file)

    """if opts.output_forests:
        pickler.dump(sym.alphabet)"""
Exemplo n.º 20
0
    def process(sent):
        goal = thedecoder.translate(sent)

        thedecoder.process_output(sent, goal)

        if goal is None:
            log.writeln("warning: parse failure")
            return None

        if opts.forest_dir:
            forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
            forest_file.write(
                forest.forest_to_json(
                    goal, fwords=sent.words, mode="english", models=thedecoder.models, weights=thedecoder.weights
                )
            )
            forest_file.close()

        if opts.rule_posterior_dir:
            rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
            beta = 1.0
            insides = goal.compute_inside(thedecoder.weights, beta=beta)
            outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
            z = insides[id(goal)]
            for item in goal.bottomup():
                for ded in item.deds:
                    c = outsides[id(item)]
                    c += thedecoder.weights.dot(ded.dcost)
                    c += sum(insides[id(ant)] for ant in ded.ants)
                    c -= z
                    rule_posterior_file.write(
                        "%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c))
                    )
                    ded.dcost["posterior"] = c
            rule_posterior_file.close()
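            # A rough reading of the loop above, assuming costs are negative log
            # probabilities: for an edge (ded) under item, the posterior cost is
            #     outside(item) + cost(ded) + sum(inside(ant) for ant in ded.ants) - Z
            # so cost.prob(c) ~ P(edge is used in a derivation | sentence).  A tiny
            # sanity check with hypothetical values:
            #     c = 2.0 + 0.5 + 1.5 - 3.0   # = 1.0 in -log space
            #     cost.prob(c)                # ~= exp(-1.0) ~= 0.37
            # The exact base and sign conventions of cost.prob are an assumption here.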
            max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
            goal.reweight(svector.Vector("posterior=1"))
            max_posterior = goal.viterbi_deriv()

            def show(ded, antvalues):
                if ded.rule:
                    value = rule.subst(ded.rule.erhs, antvalues)
                else:
                    value = antvalues[0]
                return ("[%.3f" % cost.prob(ded.dcost["posterior"]),) + value + ("]",)

            value = max_posterior.value(show)
            s = " ".join(value)
            max_posterior_file.write("%s\n" % s)

            max_posterior_file.close()

        outputs = get_nbest(goal, n_best, ambiguity_limit)

        if n_best_file:
            for (v, e) in outputs:
                e = " ".join(e)
                # n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
                n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
            n_best_file.flush()

        (bestv, best) = outputs[0]

        if french_parse_file:
            french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
            french_parse_file.flush()
        if english_parse_file:
            english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
            english_parse_file.flush()

        if log.level >= 1:
            gc.collect()
            log.write("  done decoding, memory=%s\n" % monitor.memory())
            log.write("  features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

        sent.ewords = best
        return sent
Exemplo n.º 21
0
    def process(sent):
        global alphas

        if online_learning:
            updates.clear()
            alphas.clear()

        theoracle.input(sent)

        log.write("done preparing\n")

        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None: raise Exception("parse failure")
        except Exception:
            import traceback
            log.writeln(
                "decoder raised exception: %s %s" %
                (sent, "".join(traceback.format_exception(*sys.exc_info()))))
            decoder_errors += 1
            if decoder_errors >= 100:
                log.write(
                    "decoder failed too many times, passing exception through!\n"
                )
                raise
            else:
                return

        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(
            sym.tostring(e)
            for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))

        goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

        assert (
            sent.id not in updates
        )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

        updates[sent.id] = [(svector.Vector(), 0.)]
        alphas[sent.id] = [max_learning_rate]

        if opts.parallel:
            while True:
                if mpi.world.iprobe(tag=1):
                    (sentid, vscores) = mpi.world.recv(tag=1)
                    log.write("received update for %s\n" % (sentid, ))

                    if sentid in updates:  # see comment above
                        log.write("ignoring update for %s\n" % (sentid, ))
                        continue  # drop this update on the floor

                    updates[sentid] = vscores
                    alphas[sentid] = [max_learning_rate] + [0.] * (len(vscores) - 1)
                    # since the first update is zero, the alphas & updates
                    # are still consistent with weights
                else:
                    break
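
        # Sketch of the (assumed) asynchronous protocol above: every node streams its
        # per-sentence updates to the other nodes with non-blocking isends (tag=1),
        # and here drains whatever has already arrived before running its own
        # cutting-plane update.  An incoming update for a sentence this node has
        # already touched is dropped on the floor, since applying it would first
        # require undoing this node's own update for that sentence.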

        def oracle(weights):
            hyps = get_hyps(sent, goal, weights)
            return [(goldv - hypv, goldscore - hypscore)
                    for (hypv, hyp, hypscore) in hyps]

        thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates,
                                                   alphas, {sent.id: oracle})
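
        # The oracle() closure above supplies cutting_plane with margin constraints of
        # the form (goldv - hypv, goldscore - hypscore): a feature-vector difference
        # and the loss by which the hypothesis should be separated from the oracle
        # translation.  A minimal stand-in with one hypothesis, assuming
        # svector.Vector accepts a feature string (as at svector.Vector("posterior=1")
        # elsewhere in this file), might look like:
        #     goldv, hypv = svector.Vector("lm=1"), svector.Vector("lm=3")
        #     constraints = [(goldv - hypv, 0.2)]   # hypothetical loss of 0.2
        # cutting_plane is then expected to choose step sizes (alphas) for these
        # constraints subject to the max_learning_rate cap; its exact signature is
        # not shown here.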

        remove_zeros(thedecoder.weights)
        log.write("feature weights: %s\n" %
                  (thedecoder.weights * watch_features))
        log.write("weight norm: %s\n" %
                  (math.sqrt(thedecoder.weights.normsquared())))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        for sentid in updates:
            for (v, score), alpha in itertools.izip(updates[sentid],
                                                    alphas[sentid]):
                apply_update(sumweights_helper, nweights * alpha * v)
        nweights += 1
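        # Presumably the averaged weights are later recovered lazily from this helper,
        # e.g. as
        #     avgweights = weights - sumweights_helper / nweights
        # which is the usual trick for averaging a perceptron without touching every
        # feature on every update.  The recovery code lives elsewhere, so this is only
        # a sketch of the intent.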

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates, feature_scales
            for sentid in updates:
                u = svector.Vector()
                for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                    u += alpha / max_learning_rate * v
                sum_updates2 += u * u
                n_updates += 1

            try:
                default_feature_scale = 1. / compute_variance(0, n_updates)
            except ZeroDivisionError:
                default_feature_scale = 0.  # pseudoinverse
            feature_scales = collections.defaultdict(
                lambda: default_feature_scale)
            for feat in sum_updates2:
                try:
                    feature_scales[feat] = 1. / compute_variance(
                        sum_updates2[feat], n_updates)
                except ZeroDivisionError:
                    feature_scales[feat] = 0.  # pseudoinverse

            log.write(
                "feature scales: %s\n" %
                (" ".join("%s=%s" % (f, feature_scales[f])
                          for f in watch_features if f in feature_scales)))
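
            # In effect each feature's scale looks like an inverse variance of the
            # updates it has received (1 / compute_variance(...)), i.e. a diagonal,
            # per-feature learning-rate correction in the spirit of adaptive methods
            # such as AdaGrad: features with wildly varying updates get damped.
            # compute_variance itself is defined elsewhere, so this reading is an
            # assumption.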

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.test()]

            # transmit updates to other nodes
            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(
                        mpi.world.isend(value=(sent.id, updates[sent.id]),
                                        dest=node,
                                        tag=1))

        bestv = theoracle.finish(bestv, best)
        theoracle.update(bestv)
        sent.score_comps = bestv

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Exemplo n.º 22
0
    execfile(configfilename)

    opts, args = optparser.parse_args(args=sys.argv[2:])

    maxmargin.watch_features = watch_features

    theoracle = oracle.Oracle(order=4,
                              variant=opts.bleuvariant,
                              oracledoc_size=10)
    thedecoder = make_decoder()
    thelearner = Learner()
    weight_stack = []

    if log.level >= 1:
        gc.collect()
        log.write("all structures loaded, memory=%s\n" % (monitor.memory()))

    comm = MPI.Comm.Get_parent()
    log.prefix = '[%s] ' % (comm.Get_rank(), )

    instances = []
    while True:
        msg = comm.recv()

        if msg[0] == 'train':
            sent = msg[1]
            goal = process(sent)
            instances.append(ForestInstance(sent.id, goal))

            while comm.Iprobe(tag=1):
                msg = comm.recv(tag=1)
Exemplo n.º 23
0
    def process(sent):
        goal = thedecoder.translate(sent)

        thedecoder.process_output(sent, goal)

        if goal is None:
            return None

        if opts.forest_dir:
            forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
            forest_file.write(forest.forest_to_json(goal, fwords=sent.fwords, mode='english', models=thedecoder.models, weights=thedecoder.weights))
            forest_file.close()

        if opts.rule_posterior_dir:
            rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
            beta = 1.
            insides = goal.compute_inside(thedecoder.weights, beta=beta)
            outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
            z = insides[id(goal)]
            for item in goal.bottomup():
                for ded in item.deds:
                    c = outsides[id(item)]
                    c += thedecoder.weights.dot(ded.dcost)
                    c += sum(insides[id(ant)] for ant in ded.ants)
                    c -= z
                    rule_posterior_file.write("%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c)))
                    ded.dcost['posterior'] = c
            rule_posterior_file.close()
            max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
            goal.reweight(svector.Vector('posterior=1'))
            max_posterior = goal.viterbi_deriv()

            def show(ded, antvalues):
                if ded.rule:
                    value = ded.rule.e.subst((), antvalues)
                else:
                    value = antvalues[0]
                return ("[%.3f" % cost.prob(ded.dcost['posterior']),) + value + ("]",)
            value = max_posterior.value(show)
            s = " ".join((sym.tostring(e) if type(e) is int else e) for e in value)
            max_posterior_file.write("%s\n" % s)

            max_posterior_file.close()

        outputs = get_nbest(goal, n_best, ambiguity_limit)

        if n_best_file:
            for (v,e) in outputs:
                e = " ".join(sym.tostring(w) for w in e)
                #n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
                n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
            n_best_file.flush()

        (bestv,best) = outputs[0]

        if french_parse_file:
            french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
            french_parse_file.flush()
        if english_parse_file:
            english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
            english_parse_file.flush()

        if log.level >= 1:
            gc.collect()
            log.write("  done decoding, memory=%s\n" % monitor.memory())
            log.write("  features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

        sent.ewords = [sym.tostring(e) for e in best]
        return sent
Exemplo n.º 24
0
            refreader(thereader(infile), [file(fn) for fn in reffilenames]))

        output_file = sys.stdout
    else:
        insents = []  # dummy

    oraclemodel = oracle.OracleModel(4, variant=opts.bleuvariant)

    if not opts.parallel or parallel.rank != parallel.master:
        thedecoder = make_decoder()
        oraclemodels = [oraclemodel, oracle.WordCounter()]

        if log.level >= 1:
            gc.collect()
            log.write("all structures loaded, memory=%s\n" %
                      (monitor.memory()))

        updates = collections.defaultdict(list)
        decoder_errors = 0

    def process(sent):
        oraclemodel.input(sent)
        log.write("done preparing\n")
        try:
            goal = thedecoder.translate(sent)
        except Exception:
            import traceback
            log.writeln("decoder raised exception: %s" %
                        "".join(traceback.format_exception(*sys.exc_info())))
            global decoder_errors
            decoder_errors += 1
Exemplo n.º 25
0
            SITE_COUNT = {}
            SITE_CHECK = 0
            SITE_CHECK_SUCCESS = 0

        if FLAGS.model == 'PY' and FLAGS.discount > 0:
            SAMPLER.update_rule_size_tables()

        for s in timed(samples):
            #print(s)
            s.sample()

        logger.writeln('iteration time: %s sec' % (time.time() - iter_start))
        logger.writeln(
            '%s rules, %s rule types, loglikelihood: %s' %
            (SAMPLER.nsamples(), SAMPLER.ntypes(), SAMPLER.likelihood()))
        logger.writeln('memory: %s' % memory())
        logger.writeln('resident memory: %s' % resident())

        if FLAGS.type:
            logger.writeln(
                '%s sampling operations in total, distribution of number of sites: %s'
                % (sum(SITE_COUNT.values()), SITE_COUNT))
            logger.writeln(
                '%s sites: %s singleton sites, %s (2-10) sites, %s (>10) sites'
                % (sum(k * v for k, v in SITE_COUNT.items()),
                   sum(k * v for k, v in SITE_COUNT.items() if k == 1),
                   sum(k * v for k, v in SITE_COUNT.items() if 2 <= k <= 10),
                   sum(k * v for k, v in SITE_COUNT.items() if k > 10)))

            logger.writeln('site checks: %s, success: %s' %
                           (SITE_CHECK, SITE_CHECK_SUCCESS))
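
            # Reading of the tallies above (an assumption about SITE_COUNT's layout):
            # SITE_COUNT maps "number of sites touched by one sampling operation" to
            # "how many operations had that many sites", so e.g. {1: 70, 3: 10} means
            # 80 operations covering 70*1 + 10*3 = 100 sites, 70 of them singletons.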
Exemplo n.º 27
0
        if len(rules) >= 100000:
            log.write("sentence %s has %s rules\n" % (li+1, len(rules)))
            log.write("input: %s\n" % line.rstrip())

        if not combiner:
            # simple version
            for r in rules:
                print "%s\t%s" % (r, r.scores)
        else:
            # do some combining before writing out
            for r in rules:
                existing = gram.get(r, None)
                if existing is not None:
                    existing.scores += r.scores
                else:
                    del r.fpos
                    del r.epos
                    del r.span
                    gram[r] = r

            if len(gram) >= 100000:
                log.write("dumping...\n")
                for r in gram:
                    print "%s\t%s" % (r, r.scores)
                gram.clear()
                log.write("memory: %s\n" % monitor.memory())

    if combiner:
        for r in gram:
            print "%s\t%s" % (r, r.scores)