Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None,
                        help="random seed")
    parser.add_argument("--random", dest="random", action="store_true", default=False)
    parser.add_argument("--freq", dest="most_frequent", action="store_true", default=False)
    parser.add_argument("--cvn", metavar="INT", type=int, default=10)
    parser.add_argument("langs", metavar="LANGS", default=None)
    parser.add_argument("f1", metavar="LANGS2 PREFIX", default=None)
    parser.add_argument("f2", metavar="LANGS2 PREFIX", default=None)
    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    langs = list(load_json_stream(open(args.langs)))
    mat = np.zeros((2, 2), dtype=np.int32)
    for cvi in range(args.cvn):
        fp1 = args.f1.format(cvi)
        fp2 = args.f2.format(cvi)
        sys.stderr.write("processsing {} and {}\n".format(fp1, fp2))
        filled_langs1 = list(load_json_stream(open(fp1)))
        filled_langs2 = list(load_json_stream(open(fp2)))
        mat += eval_mv(filled_langs1, filled_langs2, langs)
    print(mat)
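    # McNemar's test (statsmodels) on the pooled 2x2 disagreement matrix;
    # exact=False selects the chi-squared approximation over the exact binomial test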
    bunch = mcnemar(mat, exact=False)
    print("mcnemar\t{}".format(bunch))
Example #2
def main():
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("--type", metavar="POINT_TYPE", default="theta")
    parser.add_argument("--output", metavar="IMG", default=None)
    parser.add_argument("dumps", metavar="DUMP", default=None)
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()

    burnin = 51

    fid2struct = load_json_file(args.flist)
    langs = {}
    for lang in load_json_stream(open(args.langs)):
        if lang["source"] == "APiCS":
            langs[lang["name"]] = lang

    # stats = np.zeros(len(bin_flist))
    # points = []
    fcount = defaultdict(int)
    samples = 0
    total = 0
    rtotal = 0

    stream = load_json_stream(open(args.dumps))
    for i in xrange(burnin):
        stream.next()
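    # the first `burnin` draws were skipped above, so the statistics below are
    # collected only from post-burn-in samples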
    for dump in stream:
        # lang_num = len(dump['mixlist'])
        for creole in dump['mixlist']:
            catvect = langs[creole["langname"]]["catvect_filled"]
            total += 1
            for j, val in enumerate(creole["assignments"]):
                if val == 0:
                    rtotal += 1
                    vid = catvect[j]
                    flabel = fid2struct[j]["name"] + "\t" + fid2struct[j]["vid2label"][vid]
                    fcount[flabel] += 1
        samples += 1

    total = float(total)
    rtotal = float(rtotal)
    _sorted = sorted(fcount.keys(), key=lambda x: fcount[x], reverse=True)
    cum = 0
    for flabel in _sorted:
        cum += fcount[flabel]
        sys.stdout.write("%d\t%f\t%f\t%s\n" % (fcount[flabel], fcount[flabel] / total, cum / rtotal, flabel))
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument("walslangs", metavar="WALS_LANGS", default=None)
    parser.add_argument("walsfeatures", metavar="WALS_FEATURES", default=None)
    parser.add_argument("apicslangs", metavar="APiCS_LANGS", default=None)
    parser.add_argument("merged", metavar="MERGED", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()

    wals_langs = {}
    for lang in load_json_stream(open(args.walslangs)):
        wals_langs[lang["name"]] = lang
    fid2struct = load_json_file(args.walsfeatures)

    apics_langs = {}
    for lang in load_json_stream(open(args.apicslangs)):
        apics_langs[lang["name"]] = lang

    # count features used in apics
    feature2count = defaultdict(float)
    for name, lang in apics_langs.iteritems():
        for wals_id, v in lang["features"].iteritems():
            feature2count[wals_id] += 1

    # shrink features
    fid2struct2 = []
    for struct in fid2struct:
        if struct["wals_id"] in feature2count:
            struct["idx"] = len(fid2struct2)
            fid2struct2.append(struct)
    fid2struct = fid2struct2
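    # fid2struct now holds only the WALS features attested at least once in
    # APiCS; "idx" records each feature's new position in the shrunken list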

    # shrink features property of each WALS language
    for name in wals_langs.keys():
        lang = wals_langs[name]
        lang["source"] = "WALS"
        lang["orig_features"] = copy.copy(lang["features"])
        for wals_id in lang["features"].keys():
            if wals_id not in feature2count:
                del lang["features"][wals_id]

    with codecs.getwriter("utf-8")(open(args.merged, 'w')) as f:
        for _l in (apics_langs, wals_langs):
            for name, lang in _l.iteritems():
                f.write("%s\n" % json.dumps(lang))

    with codecs.getwriter("utf-8")(open(args.flist, 'w')) as f:
        f.write("%s\n" % json.dumps(fid2struct))
Example #4
def main():
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None,
                        help="random seed")
    parser.add_argument("--cv", dest="cv", metavar="INT", type=int, default=5,
                        help="N-fold cross-validation")
    parser.add_argument("_in", metavar="INPUT", help="input")
    parser.add_argument("_out", metavar="OUTPUT", help="output")
    args = parser.parse_args()

    sys.stderr.write("%d-fold cross validation\n" % args.cv)

    if args.seed is not None:
        random.seed(args.seed)

    langs = []
    cvns = []
    for i, lang in enumerate(load_json_stream(open(args._in))):
        langs.append(lang)
        cvns.append(i % args.cv)
    random.shuffle(cvns)
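    # fold ids were assigned round-robin (i % cv), so fold sizes differ by at
    # most one; the shuffle randomizes which language lands in which fold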

    with codecs.getwriter("utf-8")(open(args._out, 'w')) as f:
        for lang, cvn in zip(langs, cvns):
            lang["cvn"] = cvn
            f.write("%s\n" % json.dumps(lang))
Example #5
def main(src, fpath, dst, cvmap_file, cvn):
    langs = [lang for lang in load_json_stream(open(src))]
    flist = load_json_file(fpath)
    cvmap = load_json_file(cvmap_file)

    name2lang = {}
    for lang in langs:
        lang["annotation"]["features_orig"] = copy.copy(
            lang["annotation"]["features"])
        lang["catvect_orig"] = copy.copy(lang["catvect"])
        lang["cv"] = cvn
        name2lang[lang["annotation"]["name"]] = lang

    name2fstruct = {}
    for fstruct in flist:
        name2fstruct[fstruct["annotation"]["name"]] = fstruct

    for lname, fname in cvmap[cvn]:
        lang = name2lang[lname]
        fstruct = name2fstruct[fname]
        lang["catvect"][fstruct["fid"]] = -1
        del lang["annotation"]["features"][fname]

    with open(dst, 'w') as f:
        for lang in langs:
            f.write("%s\n" % json.dumps(lang))
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument("--type", metavar="POINT_TYPE", default="theta")
    parser.add_argument("--output", metavar="IMG", default=None)
    parser.add_argument("dumps", metavar="LANG", default=None)
    args = parser.parse_args()

    fsize=24
    subdiv=8
    burnin = 51

    assignments = np.zeros(3, dtype=np.int)
    samples = 0

    stream = load_json_stream(open(args.dumps))
    for i in xrange(burnin):
        stream.next()
    for dump in stream:
        assignments += dump["assignments_summary"]
        samples += 1

    assignments = fix_order(assignments)
    _sum = float(sum(assignments))
    sys.stdout.write("%d samples\n" % samples)
    sys.stdout.write("%f\t%f\t%f\n" % (assignments[0] / _sum, assignments[1] / _sum, assignments[2] / _sum))
Example #7
def main():
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None,
                        help="random seed")
    parser.add_argument("src", metavar="SOURCE", default=None)
    parser.add_argument("dst", metavar="DESTINATION", default=None)
    parser.add_argument("cvn", metavar="INT", default=None)
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)

    src, dst = args.src, args.dst
    cvn = int(args.cvn)
    langs = list(load_json_stream(open(src)))

    filled_list = []
    for lang in langs:
        for name, v in lang["annotation"]["features"].items():
            filled_list.append((lang["annotation"]["name"], name))
    random.shuffle(filled_list)

    # N-fold cross-validation
    cell_size = len(filled_list) // cvn
    cell_size2 = len(filled_list) % cvn
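    # the remainder is spread over the first cell_size2 folds; e.g. with 13
    # pairs and cvn == 5: cell_size == 2, cell_size2 == 3, fold sizes 3, 3, 3, 2, 2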

    cvmap = [[] for i in range(cvn)]
    for i in range(cvn):
        cell_start = cell_size * i + min(i, cell_size2)
        cell_len = cell_size + (i < cell_size2)
        for j in range(cell_start, cell_start + cell_len):
            cvmap[i].append(filled_list[j])

    with open(dst, 'w') as f:
        f.write(json.dumps(cvmap))
Example #8
def main():
    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        dest="seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    parser.add_argument("--random",
                        dest="random",
                        action="store_true",
                        default=False)
    parser.add_argument("--freq",
                        dest="most_frequent",
                        action="store_true",
                        default=False)
    parser.add_argument("langs", metavar="LANGS", default=None)
    parser.add_argument("f1",
                        metavar="DUMMY_OR_LANGS_FILLED_OR_LANGS_HIDDEN",
                        default=None)
    parser.add_argument("f2",
                        metavar="FLIST_OR_DUMMY_OR_LANGS_HIDDEN",
                        default=None)
    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    langs = list(load_json_stream(open(args.langs)))
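    # three evaluation modes: --random presumably guesses each hidden value at
    # random, --freq backs off to the most frequent value, and the default
    # scores the filled-in languages in f1 against the gold annotations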
    if args.random:
        flist = load_json_file(args.f2)
        total, correct = eval_random(flist, langs)
    elif args.most_frequent:
        hidelist = list(load_json_stream(open(args.f1)))
        flist = load_json_file(args.f2)
        total, correct = eval_most_frequent(flist, hidelist, langs)
    else:
        filled_langs = list(load_json_stream(open(args.f1)))
        total, correct = eval_mv(filled_langs, langs)
    sys.stdout.write("%f\t%d\t%d\n" % (float(correct) / total, correct, total))
Example #9
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("langs_all", metavar="INPUT", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("langs", metavar="OUTPUT", default=None)
    args = parser.parse_args()

    fid2struct = load_json_file(args.flist)
    fsize = len(fid2struct)

    fname = "Ongoing creolization of pidgins"
    vnames = ["Not applicable (because the language is not a pidgin)", "Widespread"]

    fstruct = None
    for fstruct2 in fid2struct:
        if fstruct2["name"] == fname:
            fstruct = fstruct2
            break
    if not fstruct:
        sys.stderr.write("No such feature found\n")
        exit(1)
    vids = []
    for vname in vnames:
        if vname not in fstruct["label2vid"]:
            sys.stderr.write("No such feature value found\n")
            exit(1)
        vid = fstruct["label2vid"][vname]
        vids.append(vid)
    fid = str(fstruct["fid"])

    sys.stderr.write("fid, vid: %s %s\n" % (fid, vids))

    lang_total, lang_survived = 0, 0
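    # a language is dropped only if it is explicitly annotated as a pidgin
    # without widespread ongoing creolization; languages lacking the feature survive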
    with codecs.getwriter("utf-8")(open(args.langs, "w")) as out:
        for lang in load_json_stream(open(args.langs_all)):
            lang_total += 1
            survived = True
            if lang["source"] == "APiCS":
                if fid in lang["apics_features"]:
                    if lang["apics_features"][fid][0][0] not in vids:
                        sys.stderr.write("remove %s (pidgins: %s)\n" % (lang["name"], lang["apics_features"][fid][0][0]))
                        survived = False
                else:
                    sys.stderr.write("keep %s (feature missed)\n" % lang["name"])
                    # survived = False
            if survived:
                lang_survived += 1
                out.write("%s\n" % json.dumps(lang))

    sys.stderr.write("language thresholding: %d -> %d\n" % (lang_total, lang_survived))
Example #10
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("langs_in", metavar="LANGS_IN", default=None)
    parser.add_argument("flist_in", metavar="FLIST_IN", default=None)
    parser.add_argument("langs_out", metavar="LANGS_OUT", default=None)
    args = parser.parse_args()

    fid2struct = load_json_file(args.flist_in)
    with codecs.getwriter("utf-8")(open(args.langs_out, 'w')) as f:
        for lang in load_json_stream(open(args.langs_in)):
            lang["catvect"] = create_cat_vect(fid2struct, lang["features"])
            if "features_filled" in lang:
                lang["catvect_filled"] = create_cat_vect(fid2struct, lang["features_filled"])
            f.write("%s\n" % json.dumps(lang))
Example #11
def main(orig, src, fpath, dst):
    fid2struct = load_json_file(fpath)

    with open(src) as fin:
        fin.readline() # ignore the header
        with codecs.getwriter("utf-8")(open(dst, 'w')) as fout:
            for lang, l in zip(load_json_stream(open(orig)), fin):
                lang["features_filled"] = {}
                l = l.rstrip()
                a = l.split("\t")
                label = a.pop(0)
                for fid, v in enumerate(a):
                    wals_id = fid2struct[fid]["wals_id"]
                    lang["features_filled"][wals_id] = int(v)
                    assert(wals_id not in lang["features"] or lang["features"][wals_id] == int(v))
                fout.write("%s\n" % json.dumps(lang))
Example #12
def main(orig, src, fpath, dst):
    langs = list(load_json_stream(open(orig)))
    flist = load_json_file(fpath)

    for lang in langs:
        lang["counted_features"] = [Counter() for feature in flist]
        lang["annotation"]["features_filled"] = {}
        for fstruct in flist:
            lang["annotation"]["features_filled"][fstruct["annotation"]
                                                  ["name"]] = -1

    for fpath in glob.glob(src + ".*"):
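        # each src.* file holds one imputed sample per language; tally how
        # often each feature value appears so a majority vote can be taken below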
        sys.stderr.write("processing {}\n".format(fpath))
        with open(fpath) as fin:
            fin.readline()  # ignore the header
            for lang, l in zip(langs, fin):
                l = l.rstrip()
                a = l.split("\t")
                label = a.pop(0)
                for fid, v in enumerate(a):
                    lang["counted_features"][fid][int(v)] += 1

    for lang in langs:
        binsize = 0
        xfreq = Counter()
        for fid, (fstruct, counts) in enumerate(zip(flist, lang["counted_features"])):
            if fstruct["type"] == "bin":
                size = 2
            else:
                size = len(fstruct["annotation"]["vid2label"])
            name = fstruct["annotation"]["name"]
            maxv, maxvv = -1, -1
            for i in range(size):
                xfreq[binsize + i] += counts[i]
                # if lang["xfreq"][binsize+i] >= maxvv:
                if counts[i] >= maxvv:
                    maxvv = counts[i]
                    maxv = i
                lang["annotation"]["features_filled"][name] = maxv
            binsize += size
        del lang["counted_features"]
        lang["xfreq"] = [xfreq[i] for i in range(binsize)]

    with open(dst, 'w') as fout:
        for lang in langs:
            fout.write("%s\n" % json.dumps(lang))
Example #13
def main(src, fpath, dst):
    flist = load_json_file(fpath)
    langs = list(load_json_stream(open(src, "r")))

    counts = {}
    for fstruct in flist:
        if fstruct["type"] == "count":
            counts[fstruct["fid"]] = fstruct

    with open(dst, 'w') as f:
        rv = "\t".join([fstruct["annotation"]["name"] for fstruct in flist])
        f.write(rv + "\n")

        for i, lang in enumerate(langs):
            catvect = list(lang["catvect"])
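            # negative category ids encode missing values; write them as "NA"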
            f.write("L{}\t{}\n".format(
                i, "\t".join(map(lambda x: str(x)
                                 if x >= 0 else "NA", catvect))))
Example #14
def main(orig, src, fpath, dst):
    flist = load_json_file(fpath)

    with open(src) as fin:
        fin.readline()  # ignore the header
        with open(dst, 'w') as fout:
            for lang, line in zip(load_json_stream(open(orig)), fin):
                line = line.rstrip()
                a = line.split("\t")
                a.pop(0)  # lang id
                catvect = list(map(lambda x: int(x), a))
                lang["catvect_filled"] = catvect
                lang["annotation"]["features_filled"] = {}
                for fid, v in enumerate(catvect):
                    name = flist[fid]["annotation"]["name"]
                    lang["annotation"]["features_filled"][name] = v
                    assert (name not in lang["annotation"]["features"]
                            or lang["annotation"]["features"][name] == v)
                fout.write("%s\n" % json.dumps(lang))
Example #15
def main():
    parser = ArgumentParser()
    parser.add_argument("--type", metavar="POINT_TYPE", default="theta")
    parser.add_argument("--output", metavar="IMG", default=None)
    parser.add_argument("dumps", metavar="LANG", default=None)
    args = parser.parse_args()

    fsize=24
    subdiv=8
    burnin = 51

    stats = np.zeros(3)
    points = []
    samples = 0

    stream = load_json_stream(open(args.dumps))
    for i in xrange(burnin):
        stream.next()
    for dump in stream:
        if args.type == "feature":
            for j, mu in enumerate(dump['mus']):
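                # normalize the log weights with logsumexp to obtain a proper
                # probability vector without floating-point underflow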
                _sum = logsumexp(mu)
                probs = np.exp(mu - _sum)
                probs2 = fix_order(probs)
                points.append(probs2)
                stats += probs2
        elif args.type == "lang":
            for creole in dump['mixlist']:
                etas = np.array(creole['etas'])
                _sum = logsumexp(etas)
                probs = np.exp(etas - _sum)
                probs2 = fix_order(probs)
                points.append(probs2)
                stats += probs2
        samples += 1

    _sum = float(sum(stats))
    sys.stdout.write("%d samples\n" % samples)
    sys.stdout.write("%f\t%f\t%f\n" % (stats[0] / _sum, stats[1] / _sum, stats[2] / _sum))
Example #16
def main(src, fpath, dst, fpath2):
    fid2struct = load_json_file(fpath)

    with codecs.getwriter("utf-8")(open(dst, 'w')) as f:
        for i, lang in enumerate(load_json_stream(open(src))):
            rv = ""
            for struct in fid2struct:
                flen = len(struct["vid2label"])
                _arr = ["0"] * flen
                wals_id = struct["wals_id"]
                v = lang["features_filled"][wals_id]
                _arr[v] = "1"
                rv += "".join(_arr)
            lang["bin"] = rv
            f.write("%s\n" % json.dumps(lang))

    flist_bin = []
    for struct in fid2struct:
        name = struct["name"]
        for v in struct["vid2label"]:
            flist_bin.append("%s\t%s" % (name, v))
    with codecs.getwriter("utf-8")(open(fpath2, 'w')) as f:
        f.write("%s\n" % json.dumps(flist_bin))
Example #17
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("--lthres", dest="lthres", metavar="FLOAT", type=float, default=0.0,
                        help="eliminate languages with higher rate of missing values [0,1]")
    parser.add_argument("langs_all", metavar="INPUT", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("langs", metavar="OUTPUT", default=None)
    args = parser.parse_args()

    fid2struct = load_json_file(args.flist)
    fsize = len(fid2struct)
    sys.stderr.write("%d featurs\n" % fsize)

    lang_total, lang_survived = 0, 0
    with codecs.getwriter("utf-8")(open(args.langs, "w")) as out:
        for lang in load_json_stream(open(args.langs_all)):
            lang_total += 1
            if float(len(lang["features"])) / fsize >= args.lthres:
                lang_survived += 1
                out.write("%s\n" % json.dumps(lang))

    sys.stderr.write("language thresholding: %d -> %d\n" % (lang_total, lang_survived))
Example #18
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None,
                        help="random seed")
    parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=20000,
                        help="number of dimensions")
    parser.add_argument("-t", "--type", dest="mtype", metavar="MODEL_TYPE", default="mono",
                        help="model type (mono or fact)")
    parser.add_argument("--dump", default=None)
    parser.add_argument("langs", metavar="LANG", default=None)
    # parser.add_argument("fid2struct", metavar="FLIST", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("sources", metavar="SOURCES", default=None)
    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    creoles = {}
    with codecs.getreader("utf-8")(open(args.sources)) as f:
        for line in f:
            line = line.rstrip()
            creole, lexifier, substrate = line.split("\t")
            creoles[creole] = {
                "lexifier": lexifier,
                "substrate": substrate,
            }
    langs = {}
    for lang in load_json_stream(open(args.langs)):
        if lang["source"] == "APiCS":
            langs[lang["name"]] = lang
        else:
            langs[lang["wals_code"]] = lang

    fid2struct = load_json_file(args.flist)

    # **TODO** pass command-line args
    if args.mtype == 'mono':
        cm = CreoleMixtureDiscrete(fid2struct, alpha_a=1.0, alpha_u=1.0)
    elif args.mtype == 'fact':
        cm = CreoleMixtureDiscreteFactored(fid2struct, gamma_f=10.0, gamma_c=10.0, alpha_u=1.0)
    else:
        sys.stderr.write("unsupported model\n")
        exit(1)

    objs = []
    for creole, obj in creoles.iteritems():
        if creole not in langs:
            sys.stderr.write("creole %s not found in the lang list\n" % creole)
            continue
        clang = langs[creole]
        if obj["lexifier"] not in langs:
            sys.stderr.write("lexifier %s not found in the lang list\n" % obj["lexifier"])
            continue
        llang = langs[obj["lexifier"]]
        if obj["substrate"] not in langs:
            sys.stderr.write("substrate %s not found in the lang list\n" % obj["substrate"])
            continue
        slang = langs[obj["substrate"]]
        clang_cat = clang["catvect_filled"]
        llang_cat = llang["catvect_filled"]
        slang_cat = slang["catvect_filled"]
        obj = cm.add_mix(clang_cat, llang_cat, slang_cat, langname=creole)
        objs.append({
            "obj": obj,
            # "creole": evaluator.cat2bin(np.array(clang["catvect_filled"])),
            # "lexifier": evaluator.cat2bin(np.array(llang["catvect_filled"])),
            # "substrate": evaluator.cat2bin(np.array(slang["catvect_filled"])),
        })

    sys.stderr.write("%d creoles\n" % len(cm.mixlist))

    cm.init_mcmc()
    sys.stderr.write("0\tlog marginal: %f\n" % cm.log_marginal())
    # print cm.niw.L
    sys.stdout.write("%s\n" % cm.serialize(_iter=0))
    temp = 2.0
    tempstep = temp / (args._iter * 0.75)
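    # linear annealing: the temperature would decay from 2.0 to a floor of 0.1
    # over the first 75% of iterations, but sample() is called with temp=1.0
    # below, so the schedule is effectively disabled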
    for _iter in xrange(args._iter):
        temp -= tempstep
        if temp <= 0.1:
            temp = 0.1
        # print >>sys.stderr, temp
        # cm.sample(temp=temp)
        cm.sample(temp=1.0)

        sys.stderr.write("%d\tlog marginal: %f\n" % (_iter + 1, cm.log_marginal()))
        if (_iter + 1) % 100 == 0:
            sys.stdout.write("%s\n" % cm.serialize(_iter=_iter + 1))
            # print cm.niw.L

    if args.dump:
        if args.dump == "-":
            f = codecs.getwriter("utf-8")(sys.stdout)
        else:
            f = codecs.getwriter("utf-8")(open(args.dump, "w"))
        rv = []
        for obj_base in objs:
            obj = obj_base["obj"]
            rv.append({
                "creole": obj["creole"],
                "lexifier": obj["lexifier"],
                "substrate": obj["substrate"],
                "assignments": obj["assignments"].tolist(),
            })
        f.write("%s\n" % json.dumps(rv))
        f.close()
Example #19
def main():
    # sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None,
                        help="random seed")
    # parser.add_argument("--sidx", metavar="IDX", type=int, default=0,
    #                     help="i-th sample of leaf states (-1: last sample)")
    # parser.add_argument("--npriors", metavar="NODE_PRIORS", default=None, help="priors for nodes (json)")
    parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000,
                        help="# of iterations")
    # parser.add_argument("--resume_if", action="store_true", default=False,
    #                     help="resume training if the output exists")
    parser.add_argument("model", metavar="FILE", default=None,
                        help="resume training from model dump")
    parser.add_argument("flist", metavar="FLIST", default=None)
    # parser.add_argument("trees", metavar="TREES", default=None, help="merged trees (pkl)")
    parser.add_argument("langs", metavar="LANG", default=None) # **HACK**
    # parser.add_argument("samples", metavar="SAMPLES", default=None, help="parameter states (json stream)")
    parser.add_argument("out", metavar="OUT", default=None, help="out (pkl)")
    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    # if args.resume_if:
    #     if os.path.isfile(args.out + ".current"):
    #         args.resume = args.out + ".current"
    #     elif os.path.isfile(args.out + ".best"):
    #         args.resume = args.out + ".best"
    # if args.resume:
    flist = load_json_file(args.flist)
    for fid, fnode in enumerate(flist):
        if fnode["annotation"]["fullname"] == "81A Order of Subject, Object and Verb":
            wals_id = fnode["annotation"]["name"]
            T = fnode["size"] # len(fnode["vid2label"])
            break
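    # wals_id and T (the number of categorical states for the CTMC) now refer
    # to WALS feature 81A, the order of subject, object and verb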
    # "label2vid": {"1 SOV": 0, "2 SVO": 1, "3 VSO": 2, "6 OSV": 5, "4 VOS": 3, "5 OVS": 4, "7 No dominant order": 6}
    # fval = 0
    # j_start, T = ibp.bmap(fid)

    sys.stderr.write("loading model from %s\n" % args.model)
    spec = pickle.load(open(args.model, "rb"), encoding="latin-1")
    trees = spec["trees"]

    
    # HACK
    from train_bin_ctmc import register_node
    langs = list(load_json_stream(open(args.langs, "r")))
    idx2id = {}
    for i, lang in enumerate(langs):
        if "glottocode" in lang:
            idx2id[i] = lang["glottocode"] + ":" + lang["annotation"]["name"]
    id2node = {}
    glottocode2node = {}
    for tree in trees:
        register_node(tree, id2node, glottocode2node)
    for i, lang in enumerate(langs):
        if i in idx2id:
            _id = idx2id[i]
            if _id in id2node:
                node = id2node[_id]
                node.lang = lang
    
    # sampler = spec["sampler"]
    # if "logprob" not in spec:
    #     logprob = sampler.logprob(trees)
    # else:
    #     logprob = spec["logprob"]
    # sys.stderr.write("iter {}\t{}\n".format(spec["iter"], logprob))
    # _start = spec["iter"] + 1
    # else:
    _start = 1
    #     trees = load(open(args.trees, 'rb'))
    #     # trees2 = []
    #     # for tree in trees:
    #     #     if tree.is_isolate is False:
    #     #         trees2.append(tree)
    #     # trees = trees2
    #     sys.stderr.write("{} trees\n".format(len(trees)))

    #     node_priors = None
    #     if args.npriors is not None:
    #         prior_specs = load_json_file(args.npriors)
    #         node_priors = create_node_priors(prior_specs)
    
    #     langs = list(load_json_stream(open(args.langs)))
    #     with open(args.samples, 'r') as f:
    #         for i, sample in enumerate(load_json_stream(f)):
    #             if i == args.sidx:
    #                 break
    for tree in trees:
        update_state(tree, [wals_id])

    sampler = CTMC_Sampler(1, states=[T], ctmc_scale=0.00005)
    sampler.init_trees(trees, sample_dates=False)
    sys.stderr.write("iter 0\t{}\n".format(sampler.logprob(trees)))
    for _iter in six.moves.range(_start, args._iter + 1):
        sampler.sample(_iter=_iter)
        logprob = sampler.logprob(trees)
        sys.stderr.write("iter {}\t{}\n".format(_iter, logprob))
        with open(args.out + ".current", "wb") as f:
            pickle.dump({ "sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob }, f)
    with open(args.out + ".final", "wb") as f:
        pickle.dump({ "sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob }, f)
Example #20
def main():
    parser = ArgumentParser()
    parser.add_argument("--plot_type", dest="plot_type", metavar="INT", type=int, default=0)
    parser.add_argument("--pc1", dest="pc1", metavar="INT", type=int, default=0)
    parser.add_argument("--pc2", dest="pc2", metavar="INT", type=int, default=1)
    parser.add_argument("--kde", dest="do_kde", action="store_true", default=False)
    parser.add_argument("--output", metavar="IMG", default=None)
    parser.add_argument("langs", metavar="LANG", default=None)
    args = parser.parse_args()

    langs = list(load_json_stream(open(args.langs)))
    # flist = load_json_file(sys.argv[2])
    dims = len(langs[0]["bin"])

    X = extract_mat(langs)
    pca, X_transformed = do_pca(X)

    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 6), dpi=120)

    # import matplotlib as mpl
    # mpl.rcParams['font.family'] = 'Nimbus Roman No9 L'
    import matplotlib.font_manager as font_manager
    path = '/usr/share/fonts/truetype/msttcorefonts/Times_New_Roman.ttf'
    fontprop = font_manager.FontProperties(fname=path)

    p1, p2 = args.pc1, args.pc2  # first and second PCs (zero-based numbering)
    plot_type = args.plot_type # 0: both, 1: creole, 2: non-creole, 3: none
    # plt.xlim((-5, 4))
    # plt.ylim((-4, 3))
    plt.xlim((-4, 4))
    plt.ylim((-4, 4))
    plt.xticks(range(-4, 5), fontproperties=fontprop, size="25")
    plt.yticks(range(-4, 5), fontproperties=fontprop, size="25")

    plt.xlabel("PC%d (%2.1f%%)" % (p1 + 1, pca.explained_variance_ratio_[p1] * 100), fontproperties=fontprop, size="25")
    plt.ylabel("PC%d (%2.1f%%)" % (p2 + 1, pca.explained_variance_ratio_[p2] * 100), fontproperties=fontprop, size="25")
    plot_langs(langs, X_transformed, plt, p1, p2, plot_type=plot_type)
    plt.legend()

    if args.do_kde:
        val = []
        for i, lang in enumerate(langs):
            x, y = X_transformed[i, p1], X_transformed[i, p2]
            if plot_type == 1 and lang["source"] == "APiCS":
                val.append((x, y))
            elif plot_type == 2 and lang["source"] == "WALS":
                val.append((x, y))
        val = np.array(val).T
        # val = np.vstack((X_transformed[:, p1], X_transformed[:, p2]))
        kernel = gaussian_kde(val)
        xmin, xmax = plt.xlim()
        ymin, ymax = plt.ylim()
        _X, _Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = np.vstack([_X.ravel(), _Y.ravel()])
        Z = np.reshape(kernel(positions).T, _X.shape)
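        # evaluate the fitted 2-D kernel density on a 100x100 grid and draw it
        # as a background heat map behind the PCA scatter plot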
        # http://matplotlib.org/users/colormaps.html
        plt.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])
        # plt.imshow(np.rot90(Z), cmap=plt.cm.hot_r, extent=[xmin, xmax, ymin, ymax])
        # plt.imshow(np.rot90(Z), cmap=plt.cm.afmhot_r, extent=[xmin, xmax, ymin, ymax])


    # plt.title('PCA')

    # plt.xlim([-2.5, 1.5])
    # plt.ylim([-1.5, 2.5])

    if args.output:
        plt.savefig(args.output, format="pdf", transparent=False, bbox_inches="tight")
        # plt.savefig(args.output, format="png", transparent=False, dpi=160)
    plt.show()
Example #21
def main():
    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        dest="seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    # parser.add_argument("--sidx", metavar="IDX", type=int, default=0,
    #                     help="i-th sample of leaf states (-1: last sample)")
    parser.add_argument("--npriors",
                        metavar="NODE_PRIORS",
                        default=None,
                        help="priors for nodes (json)")
    parser.add_argument("-i",
                        "--iter",
                        dest="_iter",
                        metavar="INT",
                        type=int,
                        default=1000,
                        help="# of iterations")
    parser.add_argument("--resume_if",
                        action="store_true",
                        default=False,
                        help="resume training if the output exists")
    parser.add_argument("--resume",
                        metavar="FILE",
                        default=None,
                        help="resume training from model dump")
    parser.add_argument("--save_interval",
                        metavar="INT",
                        type=int,
                        default=-1,
                        help="save interval")
    # parser.add_argument("--merged", action="store_true", default=False,
    #                     help="use merged langs instead of a single sample")
    parser.add_argument("--has_bias",
                        action="store_true",
                        default=False,
                        help="0th item of z is a bias term (always 1)")
    parser.add_argument("--surface",
                        dest="latent",
                        action="store_false",
                        default=True,
                        help="use surface features")
    parser.add_argument("--surface_state_limit",
                        dest="max_states",
                        type=int,
                        default=-1,
                        help="maximum number of distinct states of surface features (trimming for speed)")
    parser.add_argument("trees",
                        metavar="TREES",
                        default=None,
                        help="merged trees (pkl)")
    parser.add_argument("langs", metavar="LANG", default=None)
    # parser.add_argument("samples", metavar="SAMPLES", default=None, help="parameter states (json stream)")
    parser.add_argument("out", metavar="OUT", default=None, help="out (pkl)")
    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    if args.resume_if:
        if os.path.isfile(args.out + ".current"):
            args.resume = args.out + ".current"
        elif os.path.isfile(args.out + ".best"):
            args.resume = args.out + ".best"
    if args.resume:
        sys.stderr.write("loading model from %s\n" % args.resume)
        spec = load(open(args.resume, "rb"))
        trees = spec["trees"]
        sampler = spec["sampler"]
        if "logprob" not in spec:
            logprob = sampler.logprob(trees)
        else:
            logprob = spec["logprob"]
        sys.stderr.write("iter {}\t{}\n".format(spec["iter"], logprob))
        _start = spec["iter"] + 1
    else:
        _start = 1
        trees = load(open(args.trees, 'rb'))
        sys.stderr.write("{} trees\n".format(len(trees)))

        node_priors = None
        if args.npriors is not None:
            prior_specs = load_json_file(args.npriors)
            node_priors = create_node_priors(prior_specs)

        langs = list(load_json_stream(open(args.langs, "r")))
        # with open(args.samples, 'r') as f:
        #     if args.merged:
        #         sample = list(load_json_stream(f))
        #     else:
        #         for i, sample in enumerate(load_json_stream(f)):
        #             if i == args.sidx:
        #                 break
        K, states = attach_sample(trees,
                                  langs,
                                  node_priors,
                                  has_bias=args.has_bias,
                                  latent=args.latent,
                                  max_states=args.max_states)

        sampler = CTMC_Sampler(K, states=states, ctmc_scale=0.00005)
        sampler.init_trees(trees)
        sys.stderr.write("iter 0\t{}\n".format(sampler.logprob(trees)))

    _iter = _start - 1
    if args.save_interval >= 0 and _iter % args.save_interval == 0:
        with open(args.out + ".{}".format(_iter), "wb") as f:
            dump(
                {
                    "sampler": sampler,
                    "trees": trees,
                    "iter": _iter,
                    "logprob": logprob
                }, f)
    for _iter in range(_start, args._iter + 1):
        sampler.sample(_iter=_iter)
        logprob = sampler.logprob(trees)
        sys.stderr.write("iter {}\t{}\n".format(_iter, logprob))
        if args.save_interval >= 0 and _iter % args.save_interval == 0:
            with open(args.out + ".{}".format(_iter), "wb") as f:
                dump(
                    {
                        "sampler": sampler,
                        "trees": trees,
                        "iter": _iter,
                        "logprob": logprob
                    }, f)
        with open(args.out + ".current", "wb") as f:
            dump(
                {
                    "sampler": sampler,
                    "trees": trees,
                    "iter": _iter,
                    "logprob": logprob
                }, f)
    with open(args.out + ".final", "wb") as f:
        dump(
            {
                "sampler": sampler,
                "trees": trees,
                "iter": _iter,
                "logprob": logprob
            }, f)
Example #22
def main():
    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    parser.add_argument("--K", metavar="INT", type=int, default=3, help="K")
    parser.add_argument("-i",
                        "--iter",
                        dest="_iter",
                        metavar="INT",
                        type=int,
                        default=1000,
                        help="# of iterations")
    parser.add_argument("--output",
                        dest="output",
                        metavar="FILE",
                        default=None,
                        help="save the model to the specified path")
    parser.add_argument('--bins', type=str, default=None)
    parser.add_argument('--bins_iter', type=int, default=500)
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()
    sys.stderr.write("args\t{}\n".format(args))

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    flist = load_json_file(args.flist)
    langs = list(load_json_stream(open(args.langs)))
    mat, mvs = create_mat(langs, flist)

    adm = Admixture(mat, flist, K=args.K, mvs=mvs)
    ll = adm.logprob()
    sys.stderr.write("iter 0: {}\n".format(ll))
    ll_max = ll
    for _iter in range(args._iter):
        adm.sample()
        ll = adm.logprob()
        sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll))
        sys.stderr.flush()
        if args.output is not None:
            with open(args.output + ".current", "wb") as f:
                obj = {"model": adm, "iter": _iter + 1, "ll": ll}
                pickle.dump(obj, f)
        if ll > ll_max:
            ll_max = ll
            if args.output is not None:
                shutil.copyfile(args.output + ".current", args.output + ".best")
    if args.output is not None:
        with open(args.output + ".final", "wb") as f:
            obj = {"model": adm, "iter": _iter + 1, "ll": ll}
            pickle.dump(obj, f)

    if args.bins is not None:
        # Zs = [np.copy(adm.Z)]
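        # bincount each row of adm.Z (per-feature component assignments) into a
        # K-bin histogram, then sum the histograms over the extra samples below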
        bins = [
            np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K),
                                axis=1,
                                arr=adm.Z)
        ]
        for i in range(args.bins_iter):
            adm.sample()
            # Zs.append(np.copy(adm.Z))
            bins.append(
                np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K),
                                    axis=1,
                                    arr=adm.Z))
        # Zs = np.vstack(Zs)
        # bins = np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K), axis=1, arr=Zs)
        bins = np.dstack(bins).sum(axis=2)
        with open(args.bins, 'w') as f:
            f.write("{}\n".format(json.dumps(bins.tolist())))
Example #23
def main():
    # sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    parser.add_argument("--fid", metavar="INT", type=int, default=-1)
    parser.add_argument("--only_alphas",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore v and h")
    parser.add_argument("--drop_hs",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore v")
    parser.add_argument("--burnin",
                        metavar="INT",
                        type=int,
                        default=1000,
                        help="# of iterations")
    parser.add_argument("--samples",
                        metavar="INT",
                        type=int,
                        default=500,
                        help="save interval")
    parser.add_argument("--interval",
                        metavar="INT",
                        type=int,
                        default=5,
                        help="sampling interval")
    parser.add_argument("--alpha",
                        metavar="FLOAT",
                        type=float,
                        default=-1.0,
                        help="parameter alpha")
    parser.add_argument("--K", metavar="INT", type=int, default=100, help="K")
    parser.add_argument('--norm_sigma',
                        type=float,
                        default=5.0,
                        help='standard deviation of Gaussian prior for u')
    parser.add_argument('--gamma_shape',
                        type=float,
                        default=1.0,
                        help='shape of Gamma prior for v and h')
    parser.add_argument('--gamma_scale',
                        type=float,
                        default=0.001,
                        help='scale of Gamma prior for v and h')
    parser.add_argument("--output",
                        dest="output",
                        metavar="FILE",
                        default=None,
                        help="save the model to the specified path")
    parser.add_argument("--resume",
                        metavar="FILE",
                        default=None,
                        help="resume training from model dump")
    parser.add_argument("--resume_if",
                        action="store_true",
                        default=False,
                        help="resume training if the output exists")
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("aggregated", metavar="FLIST", default=None)
    args = parser.parse_args()
    sys.stderr.write("args\t{}\n".format(args))

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    flist = load_json_file(args.flist)

    # offset = 0
    # if args.resume_if:
    #     if os.path.isfile(args.output + ".current"):
    #         args.resume = args.output + ".current"
    #     elif os.path.isfile(args.output + ".best"):
    #         args.resume = args.output + ".best"
    # if args.resume:
    #     sys.stderr.write("loading model from {}\n".format(args.resume))
    #     spec = pickle.load(open(args.resume, "rb"))
    #     mda = spec["model"]
    #     sys.stderr.write("iter {}\n".format(spec["iter"] + 1))
    #     if args.cv:
    #         eval_cvlist(mda)
    #     offset = spec["iter"] + 1
    # else:
    langs = list(load_json_stream(open(args.langs)))
    sys.stderr.write("building hnet\n")
    hnet = WeightedNeighborGraph(langs)

    if args.fid >= 0:
        fstruct = flist[args.fid]
        vec, mvs, size = create_vec(langs, fstruct, args.fid)

        al = CategoricalAutologistic(vec,
                                     size,
                                     hnet=hnet,
                                     mvs=mvs,
                                     only_alphas=args.only_alphas,
                                     drop_hs=args.drop_hs,
                                     norm_sigma=args.norm_sigma,
                                     gamma_shape=args.gamma_shape,
                                     gamma_scale=args.gamma_scale)
    else:
        als = []
        for fid, fstruct in enumerate(flist):
            vec, mvs, size = create_vec(langs, fstruct, fid)
            al = CategoricalAutologistic(vec,
                                         size,
                                         hnet=hnet,
                                         mvs=mvs,
                                         only_alphas=args.only_alphas,
                                         drop_hs=args.drop_hs,
                                         norm_sigma=args.norm_sigma,
                                         gamma_shape=args.gamma_shape,
                                         gamma_scale=args.gamma_scale)
            als.append(al)
        al = CategoricalAutologisticGroup(als)

    sys.stderr.write("iter 0\n")
    offset = 0
    for _iter in range(args.burnin):
        al.sample()
        offset += 1
        # ll = mda.calc_loglikelihood()
        sys.stderr.write("iter {}\n".format(offset))
        sys.stderr.flush()
    if args.output is not None:
        with open(args.output, "wb") as f:
            obj = {"model": al, "iter": offset}
            pickle.dump(obj, f)
    results = []
    results.append(get_result(al))
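    # thin the chain: keep one sample every `interval` sweeps to reduce
    # autocorrelation between the stored results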
    while len(results) < args.samples:
        for _iter in range(args.interval):
            al.sample()
            offset += 1
            sys.stderr.write("iter {}\n".format(offset))
            sys.stderr.flush()
        results.append(get_result(al))
    if args.aggregated == "-":
        f = sys.stdout
    else:
        f = open(args.aggregated, "w")
    aggregated = aggregate_results(results, al, flist, args.fid)
    f.write("%s\n" % json.dumps(aggregated))
Example #24
def main():
    parser = ArgumentParser()
    parser.add_argument("--lthres", dest="lthres", metavar="FLOAT", type=float, default=0.0,
                        help="eliminate trees with higher rate of missing values [0,1]")
    parser.add_argument("--npriors", metavar="NODE_PRIORS", default=None, help="priors for nodes (json)")
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("tree", metavar="TREE", default=None)
    parser.add_argument("out", metavar="OUTPUT", default=None)
    args = parser.parse_args()

    trees = load(open(args.tree, 'rb'))
    langs = list(load_json_stream(open(args.langs)))

    # a glottocode corresponds to one or more WALS languages
    glotto_code2lang = {}
    for lang in langs:
        if "glottocode" in lang and lang["glottocode"]:
            if lang["glottocode"] in glotto_code2lang:
                glotto_code2lang[lang["glottocode"]].append(lang)
            else:
                glotto_code2lang[lang["glottocode"]] = [lang]
        else:
            sys.stderr.write(u"dropping lang without glottocode: {}\n".format(lang["annotation"]["name"]))

    # try to keep nodes specified by priors
    registered_codes = {}
    if args.npriors is not None:
        prior_specs = load_json_file(args.npriors)
        for spec in prior_specs:
            registered_codes[spec["glottocode"]] = True

    for tree in trees:
        remove_unclassified(tree)
    for tree in trees:
        attach_lang(tree, glotto_code2lang)

    ucount, total = 0, 0
    for code, langlist in glotto_code2lang.items():
        for lang in langlist:
            total += 1
            if "used" in lang and lang["used"] == True:
                ucount += 1
                del lang["used"]
            else:
                sys.stderr.write("glottocode never appeared in trees: {}\n".format(code))
    sys.stderr.write("{} (out of {}) languages in the trees\n".format(ucount, total))
        
    trees2 = []
    for tree in trees:
        c = shrink_tree(tree, registered_codes)
        if c > 0:
            trees2.append(tree)
    sys.stderr.write("# of trees: {} -> {}\n".format(len(trees), len(trees2)))
    trees = trees2

    if args.lthres > 0.0:
        trees2 = []
        for tree in trees:
            c = get_feature_coverage(tree)
            if c >= args.lthres:
                trees2.append(tree)
        sys.stderr.write("# of trees ({} thres): {} -> {}\n".format(args.lthres, len(trees), len(trees2)))
        trees = trees2

    isolate = 0
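    # isolates (single-node trees) carry no branch information, so their dates
    # and states are frozen; for proper trees, dates are propagated to children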
    for tree in trees:
        tree.is_root = True
        tree.parent = None
        if len(tree.children) <= 0:
            tree.is_isolate = True
            isolate += 1
            tree.date = 0.0
            tree.is_date_frozen = True
            tree.is_state_frozen = True
        else:
            tree.is_isolate = False
            tree.is_date_frozen = False
            tree.is_state_frozen = False
            for child in tree.children:
                set_date(child)
    sys.stderr.write("# of isolate trees: {} (out of {})\n".format(isolate, len(trees)))
            
    # for tree in trees:
    #     sys.stdout.write(tree.name + "\n")
    with open(args.out, "wb") as f:
        dump(trees, f)
Example #25
def main():
    parser = ArgumentParser()
    parser.add_argument("--input", metavar="FILE", default=None)
    parser.add_argument("--burnin", metavar="INT", type=int, default=0,
                        help="# of burn-in iterations")
    parser.add_argument("--interval", metavar="INT", type=int, default=1,
                        help="pick up one per # samples")
    parser.add_argument("--update", action="store_true", default=False,
                        help="update features (for MVI)")
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()

    flist = load_json_file(args.flist)
    langs = list(load_json_stream(open(args.langs)))
    P = len(flist)
    L = len(langs)

    count = 0
    xfreq = []
    for l in range(L):
        xfreq.append([None] * P)
    zfreq = None

    if args.input is None or args.input == "-":
        f = sys.stdin
    elif args.input.endswith(".bz2"):
        import bz2
        f = bz2.open(args.input, "r")
    else:
        f = open(args.input, "r")

    for langdat in load_json_stream(f):
        sys.stderr.write("+")
        sys.stderr.flush()
        if langdat["iter"] >= args.burnin and langdat["iter"] % args.interval == 0:
            count += 1
            if zfreq is None:
                zfreq = np.zeros((L, len(langdat["z"])), dtype=np.int32)
            zfreq += np.array(langdat["z"]).T
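            # accumulate, over the retained samples, per-language counts of
            # component assignments (z) and of each observed feature value (x)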
            for l in range(L):
                for p in range(P):
                    v = langdat["x"][l][p]
                    if xfreq[l][p] is None:
                        if flist[p]["type"] == "bin":
                            xfreq[l][p] = [0, 0]
                        elif flist[p]["type"] == "cat":
                            xfreq[l][p] = [0] * flist[p]["size"]
                        elif flist[p]["type"] == "count":
                            xfreq[l][p] = Counter()
                    xfreq[l][p][v] += 1
    if args.input is not None and args.input != "-":
        f.close()
    sys.stderr.write("\n")

    for p in range(P):
        if flist[p]["type"] == "count":
            for l, lang in enumerate(langs):
                maxv = max(xfreq[l][p].keys())
                vlist = [0] * (maxv + 1)
                for k, v in xfreq[l][p].items():
                    vlist[k] = v
                xfreq[l][p] = vlist
    for l, lang in enumerate(langs):
        lang["count"] = count
        lang["xfreq"] = xfreq[l]
        lang["zfreq"] = zfreq[l].tolist()
        if args.update:
            for p, fnode in enumerate(flist):
                v = int(np.argmax(np.array(lang["xfreq"][p])))
                lang["catvect_filled"][p] = v
                lang["annotation"]["features_filled"][fnode["annotation"]["name"]] = v
        sys.stdout.write("{}\n".format(json.dumps(lang)))
Example #26
def main():
    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    parser.add_argument("--bias",
                        action="store_true",
                        default=False,
                        help="bias term in Z")
    parser.add_argument("--only_alphas",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore v and h")
    parser.add_argument("--drop_vs",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore h")
    parser.add_argument("--drop_hs",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore v")
    parser.add_argument("-i",
                        "--iter",
                        dest="_iter",
                        metavar="INT",
                        type=int,
                        default=1000,
                        help="# of iterations")
    parser.add_argument("--save_interval",
                        metavar="INT",
                        type=int,
                        default=-1,
                        help="save interval")
    parser.add_argument("--K", metavar="INT", type=int, default=100, help="K")
    parser.add_argument('--norm_sigma',
                        type=float,
                        default=5.0,
                        help='standard deviation of Gaussian prior for u')
    parser.add_argument('--gamma_shape',
                        type=float,
                        default=1.0,
                        help='shape of Gamma prior for v and h')
    parser.add_argument('--gamma_scale',
                        type=float,
                        default=0.001,
                        help='scale of Gamma prior for v and h')
    parser.add_argument("--hmc_l", metavar="INT", type=int, default=10)
    parser.add_argument('--hmc_epsilon',
                        type=float,
                        default=0.05,
                        help='HMC epsilon')
    parser.add_argument("--maxanneal", metavar="INT", type=int, default=0)
    parser.add_argument("--cv",
                        action="store_true",
                        default=False,
                        help="some features are intentionally hidden (but kept as \"catvect_orig\")")
    parser.add_argument("--output",
                        dest="output",
                        metavar="FILE",
                        default=None,
                        help="save the model to the specified path")
    parser.add_argument("--resume",
                        metavar="FILE",
                        default=None,
                        help="resume training from model dump")
    parser.add_argument("--resume_if",
                        action="store_true",
                        default=False,
                        help="resume training if the output exists")
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()
    sys.stderr.write("args\t{}\n".format(args))

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    flist = load_json_file(args.flist)

    offset = 0
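    # If --resume_if is given, pick up an existing checkpoint, preferring the
    # most recent state (.current) over the best-scoring one (.best).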
    if args.resume_if:
        if os.path.isfile(args.output + ".current"):
            args.resume = args.output + ".current"
        elif os.path.isfile(args.output + ".best"):
            args.resume = args.output + ".best"
    if args.resume:
        sys.stderr.write("loading model from {}\n".format(args.resume))
        with open(args.resume, "rb") as f:
            spec = pickle.load(f)
        mda = spec["model"]
        sys.stderr.write("iter {}: {}\n".format(spec["iter"] + 1, spec["ll"]))
        if args.cv:
            eval_cvlist(mda)
        offset = spec["iter"] + 1
    else:
        langs = list(load_json_stream(open(args.langs)))
        mat, mvs = create_mat(langs, flist)

        sys.stderr.write("building vnet\n")
        vnet = create_vnet(langs)
        sys.stderr.write("building hnet\n")
        hnet = create_hnet(langs)
        mda = MatrixDecompositionAutologistic(mat,
                                              flist,
                                              vnet=vnet,
                                              hnet=hnet,
                                              K=args.K,
                                              mvs=mvs,
                                              bias=args.bias,
                                              only_alphas=args.only_alphas,
                                              drop_vs=args.drop_vs,
                                              drop_hs=args.drop_hs,
                                              norm_sigma=args.norm_sigma,
                                              gamma_shape=args.gamma_shape,
                                              gamma_scale=args.gamma_scale,
                                              hmc_l=args.hmc_l,
                                              hmc_epsilon=args.hmc_epsilon)
        if args.cv:
            mda.cvlist = create_cvlist(langs)
        mda.init_with_clusters()
        sys.stderr.write("iter 0: {}\n".format(mda.calc_loglikelihood()))
        if args.cv:
            eval_cvlist(mda)
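    # MCMC loop: sample one iteration, log the likelihood, checkpoint the
    # current state, and keep a copy of the best-scoring one.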
    ll_max = -np.inf
    for _iter in range(offset, args._iter):
        mda.sample(_iter=_iter, maxanneal=args.maxanneal)
        ll = mda.calc_loglikelihood()
        sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll))
        sys.stderr.flush()
        if args.cv:
            cv_result = eval_cvlist(mda)
            sys.stderr.flush()
        if (args.output is not None and args.save_interval > 0
                and (_iter + 1) % args.save_interval == 0):
            with open(args.output + ".{}".format(_iter), "wb") as f:
                obj = {"model": mda, "iter": _iter, "ll": ll}
                if args.cv:
                    obj["cv_result"] = cv_result
                pickle.dump(obj, f)
        if args.output is not None:
            with open(args.output + ".current", "wb") as f:
                obj = {"model": mda, "iter": _iter, "ll": ll}
                if args.cv:
                    obj["cv_result"] = cv_result
                pickle.dump(obj, f)
        if ll > ll_max:
            ll_max = ll
            if args.output is not None:
                # keep a copy of the best-scoring checkpoint
                shutil.copyfile(args.output + ".current", args.output + ".best")
    if args.output is not None:
        with open(args.output + ".final", "wb") as f:
            obj = {"model": mda, "iter": _iter, "ll": ll}
            if args.cv:
                obj["cv_result"] = cv_result
            pickle.dump(obj, f)
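The checkpoints written above are plain pickles of a dict holding "model", "iter", and "ll" (plus "cv_result" under --cv). A minimal sketch for inspecting one offline, assuming the project's model classes are importable so pickle can resolve them; the path is hypothetical:

import pickle

with open("model.best", "rb") as f:  # hypothetical checkpoint path
    spec = pickle.load(f)
print("iter {}: {}".format(spec["iter"], spec["ll"]))
mda = spec["model"]  # the trained MatrixDecompositionAutologistic instance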
Example No. 27
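# NOTE: this snippet assumes the same imports as the previous example, with
# WeightedNeighborGraph in place of the create_vnet/create_hnet helpers.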
def main():
    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seed",
                        metavar="INT",
                        type=int,
                        default=None,
                        help="random seed")
    parser.add_argument("--bias",
                        action="store_true",
                        default=False,
                        help="bias term in Z")
    parser.add_argument("--only_alphas",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore v and h")
    parser.add_argument("--drop_hs",
                        action="store_true",
                        default=False,
                        help="autologistic: ignore h")
    parser.add_argument("-i",
                        "--iter",
                        dest="_iter",
                        metavar="INT",
                        type=int,
                        default=1000,
                        help="# of iterations")
    parser.add_argument("--save_interval",
                        metavar="INT",
                        type=int,
                        default=-1,
                        help="save interval")
    parser.add_argument("--K", metavar="INT", type=int, default=100, help="K")
    parser.add_argument('--norm_sigma',
                        type=float,
                        default=5.0,
                        help='standard deviation of Gaussian prior for u')
    parser.add_argument('--gamma_shape',
                        type=float,
                        default=1.0,
                        help='shape of Gamma prior for v and h')
    parser.add_argument('--gamma_scale',
                        type=float,
                        default=0.001,
                        help='scale of Gamma prior for v and h')
    parser.add_argument("--hmc_l", metavar="INT", type=int, default=10)
    parser.add_argument('--hmc_epsilon',
                        type=float,
                        default=0.05,
                        help='HMC epsilon')
    parser.add_argument("--maxanneal", metavar="INT", type=int, default=0)
    parser.add_argument("--output",
                        dest="output",
                        metavar="FILE",
                        default=None,
                        help="save the model to the specified path")
    parser.add_argument("--resume",
                        metavar="FILE",
                        default=None,
                        help="resume training from model dump")
    parser.add_argument("--resume_if",
                        action="store_true",
                        default=False,
                        help="resume training if the output exists")
    parser.add_argument('--bins', type=str, default=None)
    parser.add_argument('--bins_iter', type=int, default=100)
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()
    sys.stderr.write("args\t{}\n".format(args))

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    flist = load_json_file(args.flist)

    offset = 0
    if args.resume_if:
        if os.path.isfile(args.output + ".current"):
            args.resume = args.output + ".current"
        elif os.path.isfile(args.output + ".best"):
            args.resume = args.output + ".best"
    if args.resume:
        sys.stderr.write("loading model from {}\n".format(args.resume))
        with open(args.resume, "rb") as f:
            spec = pickle.load(f)
        mda = spec["model"]
        sys.stderr.write("iter {}: {}\n".format(spec["iter"] + 1, spec["ll"]))
        offset = spec["iter"] + 1
    else:
        langs = list(load_json_stream(open(args.langs)))
        mat, mvs = create_mat(langs, flist)

        sys.stderr.write("building hnet\n")
        hnet = WeightedNeighborGraph(langs)
        mda = MatrixDecompositionAutologistic(
            mat,
            flist,
            hnet=hnet,
            K=args.K,
            mvs=mvs,
            bias=args.bias,
            only_alphas=args.only_alphas,
            drop_hs=args.drop_hs,
            norm_sigma=args.norm_sigma,
            # const_h = 0.03253780242472478,
            gamma_shape=args.gamma_shape,
            gamma_scale=args.gamma_scale,
            hmc_l=args.hmc_l,
            hmc_epsilon=args.hmc_epsilon)
        mda.init_with_clusters()
        sys.stderr.write("iter 0: {}\n".format(mda.calc_loglikelihood()))
    ll_max = -np.inf
    for _iter in range(offset, args._iter):
        mda.sample(_iter=_iter, maxanneal=args.maxanneal)
        ll = mda.calc_loglikelihood()
        sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll))
        sys.stderr.flush()
        if (args.output is not None and args.save_interval > 0
                and (_iter + 1) % args.save_interval == 0):
            with open(args.output + ".{}".format(_iter), "wb") as f:
                obj = {"model": mda, "iter": _iter, "ll": ll}
                pickle.dump(obj, f)
        if args.output is not None:
            with open(args.output + ".current", "wb") as f:
                obj = {"model": mda, "iter": _iter, "ll": ll}
                pickle.dump(obj, f)
        if ll > ll_max:
            ll_max = ll
            if args.output is not None:
                # keep a copy of the best-scoring checkpoint
                shutil.copyfile(args.output + ".current", args.output + ".best")
    if args.output is not None:
        with open(args.output + ".final", "wb") as f:
            obj = {"model": mda, "iter": _iter, "ll": ll}
            pickle.dump(obj, f)

    if args.bins is not None:
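        # draw additional posterior samples and average Z, W, and h over them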
        zmats = [np.copy(mda.zmat)]
        wmats = [np.copy(mda.wmat)]
        hkss = [np.copy(mda.hks)]
        for i in range(args.bins_iter):
            mda.sample()
            zmats.append(np.copy(mda.zmat))
            wmats.append(np.copy(mda.wmat))
            hkss.append(np.copy(mda.hks))
        # posterior means over the collected samples
        avg_zmat = np.mean(zmats, axis=0)
        avg_wmat = np.mean(wmats, axis=0)
        avg_hks = np.mean(hkss, axis=0)
        with open(args.bins, 'w') as f:
            f.write("{}\n".format(
                json.dumps({
                    "avg_zmat": avg_zmat.tolist(),
                    "avg_wmat": avg_wmat.tolist(),
                    "avg_hks": avg_hks.tolist(),
                })))
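The --bins file written above holds a single JSON object with the averaged statistics. A minimal sketch for reading it back, assuming numpy and a hypothetical file name:

import json
import numpy as np

with open("bins.json") as f:  # hypothetical path passed as --bins
    stats = json.load(f)
avg_zmat = np.array(stats["avg_zmat"])  # averaged latent matrix Z
avg_wmat = np.array(stats["avg_wmat"])  # averaged weight matrix W
avg_hks = np.array(stats["avg_hks"])    # averaged per-component h values
print(avg_zmat.shape, avg_wmat.shape, avg_hks.shape)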