Exemplo n.º 1
0
def print_aggregate(cnts):
    for k in cnts:
        print k
        tot = 0
        for v, k2 in dictsort(cnts[k]): tot += v
        for v, k2 in dictsort(cnts[k]):
            print "\t", percent(v, tot), k2
Exemplo n.º 2
0
def print_aggregate(cnts):
    for k in cnts:
        print k
        tot = 0
        for v, k2 in dictsort(cnts[k]):
            tot += v
        for v, k2 in dictsort(cnts[k]):
            print "\t", percent(v, tot), k2
Exemplo n.º 3
0
def print_aggregate_compare(cnts, cntsmore):
    """
    Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs.
    """
    cntscopy = copy.deepcopy(cnts)
    for k in cnts:
        print k
        for k2 in cnts[k].keys():
            cntscopy[k][k2] = (1. * cnt[k][k2]/cntsmore[k][k2], cnts[k][k2], cntsmore[k][k2])
        maxperc = dictsort(cntscopy[k])[0][0][0]
        for v, k2 in dictsort(cntscopy[k]):
            # The second column (v[0]/maxperc) is a score for how good this hyperparam is.
            print "\t", k2, "\t", "%.2f" % (v[0]/maxperc), "\t", percent(v[1], v[2], rev=True)
Exemplo n.º 4
0
def print_aggregate_compare(cnts, cntsmore):
    """
    Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs.
    """
    cntscopy = copy.deepcopy(cnts)
    for k in cnts:
        print k
        for k2 in cnts[k].keys():
            cntscopy[k][k2] = (1. * cnt[k][k2] / cntsmore[k][k2], cnts[k][k2],
                               cntsmore[k][k2])
        maxperc = dictsort(cntscopy[k])[0][0][0]
        for v, k2 in dictsort(cntscopy[k]):
            # The second column (v[0]/maxperc) is a score for how good this hyperparam is.
            print "\t", k2, "\t", "%.2f" % (v[0] / maxperc), "\t", percent(
                v[1], v[2], rev=True)
    from vocabulary import wordmap, wordform, language
    from targetvocabulary import targetmap

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
            continue
        if w1 not in targetmap():
            print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
            continue
        for l2 in targetmap()[w1]:
            totcnt = 0
            for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt
            print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])]

    print >> sys.stderr, "REVERSE MAP NOW"

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
            continue
        if w1 not in targetmap(name="reverse"):
            print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
            continue
        for l2 in targetmap(name="reverse")[w1]:
    from targetvocabulary import targetmap

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % ` wordmap().str(w1) `
            continue
        if w1 not in targetmap():
            print >> sys.stderr, "Skipping %s, not a source word in targetmap" % ` wordmap(
            ).str(w1) `
            continue
        for l2 in targetmap()[w1]:
            totcnt = 0
            for cnt, w2 in dictsort(targetmap()[w1][l2]):
                totcnt += cnt
            print wordmap().str(w1), l2, [
                (percent(cnt, totcnt), wordform(w2))
                for cnt, w2 in dictsort(targetmap()[w1][l2])
            ]

    print >> sys.stderr, "REVERSE MAP NOW"

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % ` wordmap().str(w1) `
            continue
    import w2w.corpora
    import string

    from common.mydict import sort as dictsort

    from collections import defaultdict
    wordfreq = defaultdict(int)
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for w in readwords(f1): wordfreq[(l1,w)] += 1
        for w in readwords(f2): wordfreq[(l2,w)] += 1

    for l, f in w2w.corpora.monocorpora_filenames():
        assert 0

    # Drop words below the minimum frequency and the "*UNKNOWN*" placeholder.
    # The two conditions are merged into a single delete: an entry matching
    # both used to be deleted twice, and the second `del` raised KeyError.
    # Iterate over a snapshot of the keys because entries are removed while
    # looping.
    for (l, w) in list(wordfreq.keys()):
        if (wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]
                or w == "*UNKNOWN*"):
            del wordfreq[(l, w)]

    import w2w.vocabulary
    import common.idmap

    wordfreqkeys = [key for cnt, key in dictsort(wordfreq)]

#    for k in wordfreq.keys():
#        print k
    v = common.idmap.IDmap([(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys, allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"], unknown_key=(None, "*UNKNOWN*"))
    w2w.vocabulary.write(v)
Exemplo n.º 8
0
    import examples, sys
    import graph
    import numpy as N
    from vocabulary import labelmap
    ODIM = labelmap.len
    from common.mydict import sort as dictsort
    for l in sys.stdin:
        e = examples._example_from_string(l)
        (x, y) = e
        if HYPERPARAMETERS["locally normalize"]:
            targety = N.array([y])
        else:
            targety = N.zeros(ODIM)
            targety[y] = 1.
        if HLAYERS == 2:
            o = graph.validatefn([x.data], targety, w1[x.indices], b1, wh, bh,
                                 w2, b2)
            (kl, softmax, argmax, prehidden1, prehidden2) = o
        else:
            o = graph.validatefn([x.data], targety, w1[x.indices], b1, w2, b2)
            (kl, softmax, argmax, prehidden) = o

        assert softmax.shape[0] == 1
        softmax = softmax[0]
        prs = {}
        for i in range(softmax.shape[0]):
            prs[labelmap.str(i)] = softmax[i]
        print dictsort(prs)[:3]
#        print argmax, softmax
Exemplo n.º 9
0
    from collections import defaultdict
    wordfreq = defaultdict(int)
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for w in readwords(f1):
            wordfreq[(l1, w)] += 1
        for w in readwords(f2):
            wordfreq[(l2, w)] += 1

    for l, f in w2w.corpora.monocorpora_filenames():
        assert 0

    # Drop words below the minimum frequency and the "*UNKNOWN*" placeholder.
    # The two conditions are merged into a single delete: an entry matching
    # both used to be deleted twice, and the second `del` raised KeyError.
    # Iterate over a snapshot of the keys because entries are removed while
    # looping.
    for (l, w) in list(wordfreq.keys()):
        if (wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]
                or w == "*UNKNOWN*"):
            del wordfreq[(l, w)]

    import w2w.vocabulary
    import common.idmap

    wordfreqkeys = [key for cnt, key in dictsort(wordfreq)]

    #    for k in wordfreq.keys():
    #        print k
    v = common.idmap.IDmap(
        [(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys,
        allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"],
        unknown_key=(None, "*UNKNOWN*"))
    w2w.vocabulary.write(v)