def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--random", dest="random", action="store_true", default=False) parser.add_argument("--freq", dest="most_frequent", action="store_true", default=False) parser.add_argument("--cvn", metavar="INT", type=int, default=10) parser.add_argument("langs", metavar="LANGS", default=None) parser.add_argument("f1", metavar="LANGS2 PREFIX", default=None) parser.add_argument("f2", metavar="LANGS2 PREFIX", default=None) args = parser.parse_args() if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) langs = list(load_json_stream(open(args.langs))) mat = np.zeros((2, 2), dtype=np.int32) for cvi in range(args.cvn): fp1 = args.f1.format(cvi) fp2 = args.f2.format(cvi) sys.stderr.write("processsing {} and {}\n".format(fp1, fp2)) filled_langs1 = list(load_json_stream(open(fp1))) filled_langs2 = list(load_json_stream(open(fp2))) mat += eval_mv(filled_langs1, filled_langs2, langs) print(mat) bunch = mcnemar(mat, exact=False) print("mcnemar\t{}".format(bunch))
def main():
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("--type", metavar="POINT_TYPE", default="theta")
    parser.add_argument("--output", metavar="IMG", default=None)
    parser.add_argument("dumps", metavar="DUMP", default=None)
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    args = parser.parse_args()
    burnin = 51
    fid2struct = load_json_file(args.flist)
    langs = {}
    for lang in load_json_stream(open(args.langs)):
        if lang["source"] == "APiCS":
            langs[lang["name"]] = lang
    # stats = np.zeros(len(bin_flist))
    # points = []
    fcount = defaultdict(int)
    samples = 0
    total = 0
    rtotal = 0
    stream = load_json_stream(open(args.dumps))
    for i in xrange(burnin):
        stream.next()
    for dump in stream:
        # lang_num = len(dump['mixlist'])
        for creole in dump['mixlist']:
            catvect = langs[creole["langname"]]["catvect_filled"]
            total += 1
            for j, val in enumerate(creole["assignments"]):
                if val == 0:
                    rtotal += 1
                    vid = catvect[j]
                    flabel = fid2struct[j]["name"] + "\t" + fid2struct[j]["vid2label"][vid]
                    fcount[flabel] += 1
        samples += 1
    total = float(total)
    rtotal = float(rtotal)
    _sorted = sorted(fcount.keys(), key=lambda x: fcount[x], reverse=True)
    cum = 0
    for flabel in _sorted:
        cum += fcount[flabel]
        sys.stdout.write("%d\t%f\t%f\t%s\n" % (fcount[flabel], fcount[flabel] / total, cum / rtotal, flabel))

def main(): parser = ArgumentParser() parser.add_argument("walslangs", metavar="WALS_LANGS", default=None) parser.add_argument("walsfeatures", metavar="WALS_FEATURES", default=None) parser.add_argument("apicslangs", metavar="APiCS_LANGS", default=None) parser.add_argument("merged", metavar="MERGED", default=None) parser.add_argument("flist", metavar="FLIST", default=None) args = parser.parse_args() wals_langs = {} for lang in load_json_stream(open(args.walslangs)): wals_langs[lang["name"]] = lang fid2struct = load_json_file(args.walsfeatures) apics_langs = {} for lang in load_json_stream(open(args.apicslangs)): apics_langs[lang["name"]] = lang # count features used in apics feature2count = defaultdict(float) for name, lang in apics_langs.iteritems(): for wals_id, v in lang["features"].iteritems(): feature2count[wals_id] += 1 # shrink features fid2struct2 = [] for struct in fid2struct: if struct["wals_id"] in feature2count: struct["idx"] = len(fid2struct2) fid2struct2.append(struct) fid2struct = fid2struct2 # shrink features property of each WALS language for name in wals_langs.keys(): lang = wals_langs[name] lang["source"] = "WALS" lang["orig_features"] = copy.copy(lang["features"]) for wals_id in lang["features"].keys(): if wals_id not in feature2count: del lang["features"][wals_id] with codecs.getwriter("utf-8")(open(args.merged, 'w')) as f: for _l in (apics_langs, wals_langs): for name, lang in _l.iteritems(): f.write("%s\n" % json.dumps(lang)) with codecs.getwriter("utf-8")(open(args.flist, 'w')) as f: f.write("%s\n" % json.dumps(fid2struct))
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--cv", dest="cv", metavar="INT", type=int, default=5, help="N-fold cross-validation") parser.add_argument("_in", metavar="INPUT", help="input") parser.add_argument("_out", metavar="OUTPUT", help="output") args = parser.parse_args() sys.stderr.write("%d-fold cross validation\n" % args.cv) if args.seed is not None: random.seed(args.seed) langs = [] cvns = [] for i, lang in enumerate(load_json_stream(open(args._in))): langs.append(lang) cvns.append(i % args.cv) random.shuffle(cvns) with codecs.getwriter("utf-8")(open(args._out, 'w')) as f: for lang, cvn in zip(langs, cvns): lang["cvn"] = cvn f.write("%s\n" % json.dumps(lang))
def main(src, fpath, dst, cvmap_file, cvn):
    langs = [lang for lang in load_json_stream(open(src))]
    flist = load_json_file(fpath)
    cvmap = load_json_file(cvmap_file)
    name2lang = {}
    for lang in langs:
        lang["annotation"]["features_orig"] = copy.copy(lang["annotation"]["features"])
        lang["catvect_orig"] = copy.copy(lang["catvect"])
        lang["cv"] = cvn
        name2lang[lang["annotation"]["name"]] = lang
    name2fstruct = {}
    for fstruct in flist:
        name2fstruct[fstruct["annotation"]["name"]] = fstruct
    for lname, fname in cvmap[cvn]:
        lang = name2lang[lname]
        fstruct = name2fstruct[fname]
        lang["catvect"][fstruct["fid"]] = -1
        del lang["annotation"]["features"][fname]
    with open(dst, 'w') as f:
        for lang in langs:
            f.write("%s\n" % json.dumps(lang))

def main(): parser = ArgumentParser() parser.add_argument("--type", metavar="POINT_TYPE", default="theta") parser.add_argument("--output", metavar="IMG", default=None) parser.add_argument("dumps", metavar="LANG", default=None) args = parser.parse_args() fsize=24 subdiv=8 burnin = 51 assignments = np.zeros(3, dtype=np.int) samples = 0 stream = load_json_stream(open(args.dumps)) for i in xrange(burnin): stream.next() for dump in stream: assignments += dump["assignments_summary"] samples += 1 assignments = fix_order(assignments) _sum = float(sum(assignments)) sys.stdout.write("%d samples\n" % samples) sys.stdout.write("%f\t%f\t%f\n" % (assignments[0] / _sum, assignments[1] / _sum, assignments[2] / _sum))
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("src", metavar="SOURCE", default=None) parser.add_argument("dst", metavar="DESTINATION", default=None) parser.add_argument("cvn", metavar="INT", default=None) args = parser.parse_args() if args.seed is not None: random.seed(args.seed) src, dst = args.src, args.dst cvn = int(args.cvn) langs = list(load_json_stream(open(src))) filled_list = [] for lang in langs: for name, v in lang["annotation"]["features"].items(): filled_list.append((lang["annotation"]["name"], name)) random.shuffle(filled_list) # N-fold cross-validation cell_size = len(filled_list) // cvn cell_size2 = len(filled_list) % cvn cvmap = [[] for i in range(cvn)] for i in range(cvn): cell_start = cell_size * i + min(i, cell_size2) cell_len = cell_size + (i < cell_size2) for j in range(cell_start, cell_start + cell_len): cvmap[i].append(filled_list[j]) with open(dst, 'w') as f: f.write(json.dumps(cvmap))
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--random", dest="random", action="store_true", default=False) parser.add_argument("--freq", dest="most_frequent", action="store_true", default=False) parser.add_argument("langs", metavar="LANGS", default=None) parser.add_argument("f1", metavar="DUMMY_OR_LANGS_FILLED_OR_LANGS_HIDDEN", default=None) parser.add_argument("f2", metavar="FLIST_OR_DUMMY_OR_LANGS_HIDDEN", default=None) args = parser.parse_args() if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) langs = list(load_json_stream(open(args.langs))) if args.random: flist = load_json_file(args.f2) total, correct = eval_random(flist, langs) elif args.most_frequent: hidelist = list(load_json_stream(open(args.f1))) flist = load_json_file(args.f2) total, correct = eval_most_frequent(flist, hidelist, langs) else: filled_langs = list(load_json_stream(open(args.f1))) total, correct = eval_mv(filled_langs, langs) sys.stdout.write("%f\t%d\t%d\n" % (float(correct) / total, correct, total))
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("langs_all", metavar="INPUT", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("langs", metavar="OUTPUT", default=None)
    args = parser.parse_args()
    fid2struct = load_json_file(args.flist)
    fsize = len(fid2struct)
    fname = "Ongoing creolization of pidgins"
    vnames = ["Not applicable (because the language is not a pidgin)", "Widespread"]
    fstruct = None
    for fstruct2 in fid2struct:
        if fstruct2["name"] == fname:
            fstruct = fstruct2
            break
    if not fstruct:
        sys.stderr.write("No such feature found\n")
        exit(1)
    vids = []
    for vname in vnames:
        if vname not in fstruct["label2vid"]:
            sys.stderr.write("No such feature value found\n")
            exit(1)
        vid = fstruct["label2vid"][vname]
        vids.append(vid)
    fid = str(fstruct["fid"])
    sys.stderr.write("fid, vid: %s %s\n" % (fid, vids))
    lang_total, lang_survived = 0, 0
    with codecs.getwriter("utf-8")(open(args.langs, "w")) as out:
        for lang in load_json_stream(open(args.langs_all)):
            lang_total += 1
            survived = True
            if lang["source"] == "APiCS":
                if fid in lang["apics_features"]:
                    if lang["apics_features"][fid][0][0] not in vids:
                        sys.stderr.write("remove %s (pidgins: %s)\n" % (lang["name"], lang["apics_features"][fid][0][0]))
                        survived = False
                else:
                    sys.stderr.write("keep %s (feature missed)\n" % lang["name"])
                    # survived = False
            if survived:
                lang_survived += 1
                out.write("%s\n" % json.dumps(lang))
    sys.stderr.write("language thresholding: %d -> %d\n" % (lang_total, lang_survived))

def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("langs_in", metavar="LANGS_IN", default=None)
    parser.add_argument("flist_in", metavar="FLIST_IN", default=None)
    parser.add_argument("langs_out", metavar="LANGS_OUT", default=None)
    args = parser.parse_args()
    fid2struct = load_json_file(args.flist_in)
    with codecs.getwriter("utf-8")(open(args.langs_out, 'w')) as f:
        for lang in load_json_stream(open(args.langs_in)):
            lang["catvect"] = create_cat_vect(fid2struct, lang["features"])
            if "features_filled" in lang:
                lang["catvect_filled"] = create_cat_vect(fid2struct, lang["features_filled"])
            f.write("%s\n" % json.dumps(lang))

def main(orig, src, fpath, dst):
    fid2struct = load_json_file(fpath)
    with open(src) as fin:
        fin.readline()  # ignore the header
        with codecs.getwriter("utf-8")(open(dst, 'w')) as fout:
            for lang, l in zip(load_json_stream(open(orig)), fin):
                lang["features_filled"] = {}
                l = l.rstrip()
                a = l.split("\t")
                label = a.pop(0)
                for fid, v in enumerate(a):
                    wals_id = fid2struct[fid]["wals_id"]
                    lang["features_filled"][wals_id] = int(v)
                    assert(wals_id not in lang["features"] or lang["features"][wals_id] == int(v))
                fout.write("%s\n" % json.dumps(lang))

def main(orig, src, fpath, dst):
    langs = list(load_json_stream(open(orig)))
    flist = load_json_file(fpath)
    for lang in langs:
        lang["counted_features"] = [Counter() for feature in flist]
        lang["annotation"]["features_filled"] = {}
        for fstruct in flist:
            lang["annotation"]["features_filled"][fstruct["annotation"]["name"]] = -1
    for fpath in glob.glob(src + ".*"):
        sys.stderr.write("processing {}\n".format(fpath))
        with open(fpath) as fin:
            fin.readline()  # ignore the header
            for lang, l in zip(langs, fin):
                l = l.rstrip()
                a = l.split("\t")
                label = a.pop(0)
                for fid, v in enumerate(a):
                    lang["counted_features"][fid][int(v)] += 1
    for lang in langs:
        binsize = 0
        xfreq = Counter()
        for fid, (fstruct, counts) in enumerate(zip(flist, lang["counted_features"])):
            if fstruct["type"] == "bin":
                size = 2
            else:
                size = len(fstruct["annotation"]["vid2label"])
            name = fstruct["annotation"]["name"]
            maxv, maxvv = -1, -1
            for i in range(size):
                xfreq[binsize + i] += counts[i]
                # if lang["xfreq"][binsize+i] >= maxvv:
                if counts[i] >= maxvv:
                    maxvv = counts[i]
                    maxv = i
            lang["annotation"]["features_filled"][name] = maxv
            binsize += size
        del lang["counted_features"]
        lang["xfreq"] = [xfreq[i] for i in range(binsize)]
    with open(dst, 'w') as fout:
        for lang in langs:
            fout.write("%s\n" % json.dumps(lang))

def main(src, fpath, dst):
    flist = load_json_file(fpath)
    langs = list(load_json_stream(open(src, "r")))
    counts = {}
    for fstruct in flist:
        if fstruct["type"] == "count":
            counts[fstruct["fid"]] = fstruct
    with open(dst, 'w') as f:
        rv = "\t".join([fstruct["annotation"]["name"] for fstruct in flist])
        f.write(rv + "\n")
        for i, lang in enumerate(langs):
            catvect = list(lang["catvect"])
            f.write("L{}\t{}\n".format(i, "\t".join(map(lambda x: str(x) if x >= 0 else "NA", catvect))))

def main(orig, src, fpath, dst):
    flist = load_json_file(fpath)
    with open(src) as fin:
        fin.readline()  # ignore the header
        with open(dst, 'w') as fout:
            for lang, line in zip(load_json_stream(open(orig)), fin):
                line = line.rstrip()
                a = line.split("\t")
                a.pop(0)  # lang id
                catvect = list(map(lambda x: int(x), a))
                lang["catvect_filled"] = catvect
                lang["annotation"]["features_filled"] = {}
                for fid, v in enumerate(catvect):
                    name = flist[fid]["annotation"]["name"]
                    lang["annotation"]["features_filled"][name] = v
                    assert (name not in lang["annotation"]["features"] or lang["annotation"]["features"][name] == v)
                fout.write("%s\n" % json.dumps(lang))

def main(): parser = ArgumentParser() parser.add_argument("--type", metavar="POINT_TYPE", default="theta") parser.add_argument("--output", metavar="IMG", default=None) parser.add_argument("dumps", metavar="LANG", default=None) args = parser.parse_args() fsize=24 subdiv=8 burnin = 51 stats = np.zeros(3) points = [] samples = 0 stream = load_json_stream(open(args.dumps)) for i in xrange(burnin): stream.next() for dump in stream: if args.type == "feature": for j, mu in enumerate(dump['mus']): _sum = logsumexp(mu) probs = np.exp(mu - _sum) probs2 = fix_order(probs) points.append(probs2) stats += probs2 elif args.type == "lang": for creole in dump['mixlist']: etas = np.array(creole['etas']) _sum = logsumexp(etas) probs = np.exp(etas - _sum) probs2 = fix_order(probs) points.append(probs2) stats += probs2 samples += 1 _sum = float(sum(stats)) sys.stdout.write("%d samples\n" % samples) sys.stdout.write("%f\t%f\t%f\n" % (stats[0] / _sum, stats[1] / _sum, stats[2] / _sum))
def main(src, fpath, dst, fpath2):
    fid2struct = load_json_file(fpath)
    with codecs.getwriter("utf-8")(open(dst, 'w')) as f:
        for i, lang in enumerate(load_json_stream(open(src))):
            rv = ""
            for struct in fid2struct:
                flen = len(struct["vid2label"])
                _arr = ["0"] * flen
                wals_id = struct["wals_id"]
                v = lang["features_filled"][wals_id]
                _arr[v] = "1"
                rv += "".join(_arr)
            lang["bin"] = rv
            f.write("%s\n" % json.dumps(lang))
    flist_bin = []
    for struct in fid2struct:
        name = struct["name"]
        for v in struct["vid2label"]:
            flist_bin.append("%s\t%s" % (name, v))
    with codecs.getwriter("utf-8")(open(fpath2, 'w')) as f:
        f.write("%s\n" % json.dumps(flist_bin))

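# Tiny illustration (hypothetical feature, not from the data) of the binarization
# above: each categorical value is expanded into a one-hot block of "0"/"1"
# characters, and the per-feature blocks are concatenated into lang["bin"].
vid2label = ["SOV", "SVO", "VSO"]  # three possible values for one feature
v = 1                              # observed value id
_arr = ["0"] * len(vid2label)
_arr[v] = "1"
print("".join(_arr))               # -> "010"
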
def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("--lthres", dest="lthres", metavar="FLOAT", type=float, default=0.0,
                        help="eliminate languages with higher rate of missing values [0,1]")
    parser.add_argument("langs_all", metavar="INPUT", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("langs", metavar="OUTPUT", default=None)
    args = parser.parse_args()
    fid2struct = load_json_file(args.flist)
    fsize = len(fid2struct)
    sys.stderr.write("%d features\n" % fsize)
    lang_total, lang_survived = 0, 0
    with codecs.getwriter("utf-8")(open(args.langs, "w")) as out:
        for lang in load_json_stream(open(args.langs_all)):
            lang_total += 1
            if float(len(lang["features"])) / fsize >= args.lthres:
                lang_survived += 1
                out.write("%s\n" % json.dumps(lang))
    sys.stderr.write("language thresholding: %d -> %d\n" % (lang_total, lang_survived))

def main():
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed")
    parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=20000, help="number of iterations")
    parser.add_argument("-t", "--type", dest="mtype", metavar="MODEL_TYPE", default="mono", help="model type (mono or fact)")
    parser.add_argument("--dump", default=None)
    parser.add_argument("langs", metavar="LANG", default=None)
    # parser.add_argument("fid2struct", metavar="FLIST", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("sources", metavar="SOURCES", default=None)
    args = parser.parse_args()
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    creoles = {}
    with codecs.getreader("utf-8")(open(args.sources)) as f:
        for line in f:
            line = line.rstrip()
            creole, lexifier, substrate = line.split("\t")
            creoles[creole] = {
                "lexifier": lexifier,
                "substrate": substrate,
            }
    langs = {}
    for lang in load_json_stream(open(args.langs)):
        if lang["source"] == "APiCS":
            langs[lang["name"]] = lang
        else:
            langs[lang["wals_code"]] = lang
    fid2struct = load_json_file(args.flist)

    # **TODO** pass command-line args
    if args.mtype == 'mono':
        cm = CreoleMixtureDiscrete(fid2struct, alpha_a=1.0, alpha_u=1.0)
    elif args.mtype == 'fact':
        cm = CreoleMixtureDiscreteFactored(fid2struct, gamma_f=10.0, gamma_c=10.0, alpha_u=1.0)
    else:
        sys.stderr.write("unsupported model\n")
        exit(1)

    objs = []
    for creole, obj in creoles.iteritems():
        if creole not in langs:
            sys.stderr.write("creole %s not found in the lang list\n" % creole)
            continue
        clang = langs[creole]
        if obj["lexifier"] not in langs:
            sys.stderr.write("lexifier %s not found in the lang list\n" % obj["lexifier"])
            continue
        llang = langs[obj["lexifier"]]
        if obj["substrate"] not in langs:
            sys.stderr.write("substrate %s not found in the lang list\n" % obj["substrate"])
            continue
        slang = langs[obj["substrate"]]
        clang_cat = clang["catvect_filled"]
        llang_cat = llang["catvect_filled"]
        slang_cat = slang["catvect_filled"]
        obj = cm.add_mix(clang_cat, llang_cat, slang_cat, langname=creole)
        objs.append({
            "obj": obj,
            # "creole": evaluator.cat2bin(np.array(clang["catvect_filled"])),
            # "lexifier": evaluator.cat2bin(np.array(llang["catvect_filled"])),
            # "substrate": evaluator.cat2bin(np.array(slang["catvect_filled"])),
        })
    sys.stderr.write("%d creoles\n" % len(cm.mixlist))

    cm.init_mcmc()
    sys.stderr.write("0\tlog marginal: %f\n" % cm.log_marginal())
    # print cm.niw.L
    sys.stdout.write("%s\n" % cm.serialize(_iter=0))
    temp = 2.0
    tempstep = temp / (args._iter * 0.75)
    for _iter in xrange(args._iter):
        temp -= tempstep
        if temp <= 0.1:
            temp = 0.1
        # print >>sys.stderr, temp
        # cm.sample(temp=temp)
        cm.sample(temp=1.0)
        sys.stderr.write("%d\tlog marginal: %f\n" % (_iter + 1, cm.log_marginal()))
        if (_iter + 1) % 100 == 0:
            sys.stdout.write("%s\n" % cm.serialize(_iter=_iter + 1))
            # print cm.niw.L
    if args.dump:
        if args.dump == "-":
            f = codecs.getwriter("utf-8")(sys.stdout)
        else:
            f = codecs.getwriter("utf-8")(open(args.dump, "w"))
        rv = []
        for obj_base in objs:
            obj = obj_base["obj"]
            rv.append({
                "creole": obj["creole"],
                "lexifier": obj["lexifier"],
                "substrate": obj["substrate"],
                "assignments": obj["assignments"].tolist(),
            })
        f.write("%s\n" % json.dumps(rv))
        f.close()

def main():
    # sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed")
    # parser.add_argument("--sidx", metavar="IDX", type=int, default=0,
    #                     help="i-th sample of leaf states (-1: last sample)")
    # parser.add_argument("--npriors", metavar="NODE_PRIORS", default=None, help="priors for nodes (json)")
    parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000, help="# of iterations")
    # parser.add_argument("--resume_if", action="store_true", default=False,
    #                     help="resume training if the output exists")
    parser.add_argument("model", metavar="FILE", default=None, help="resume training from model dump")
    parser.add_argument("flist", metavar="FLIST", default=None)
    # parser.add_argument("trees", metavar="TREES", default=None, help="merged trees (pkl)")
    parser.add_argument("langs", metavar="LANG", default=None)  # **HACK**
    # parser.add_argument("samples", metavar="SAMPLES", default=None, help="parameter states (json stream)")
    parser.add_argument("out", metavar="OUT", default=None, help="out (pkl)")
    args = parser.parse_args()
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    # if args.resume_if:
    #     if os.path.isfile(args.out + ".current"):
    #         args.resume = args.out + ".current"
    #     elif os.path.isfile(args.out + ".best"):
    #         args.resume = args.out + ".best"
    # if args.resume:
    flist = load_json_file(args.flist)
    for fid, fnode in enumerate(flist):
        if fnode["annotation"]["fullname"] == "81A Order of Subject, Object and Verb":
            wals_id = fnode["annotation"]["name"]
            T = fnode["size"]  # len(fnode["vid2label"])
            break
    # "label2vid": {"1 SOV": 0, "2 SVO": 1, "3 VSO": 2, "6 OSV": 5, "4 VOS": 3, "5 OVS": 4, "7 No dominant order": 6}
    # fval = 0
    # j_start, T = ibp.bmap(fid)

    sys.stderr.write("loading model from %s\n" % args.model)
    spec = pickle.load(open(args.model, "rb"), encoding="latin-1")
    trees = spec["trees"]

    # HACK
    from train_bin_ctmc import register_node
    langs = list(load_json_stream(open(args.langs, "r")))
    idx2id = {}
    for i, lang in enumerate(langs):
        if "glottocode" in lang:
            idx2id[i] = lang["glottocode"] + ":" + lang["annotation"]["name"]
    id2node = {}
    glottocode2node = {}
    for tree in trees:
        register_node(tree, id2node, glottocode2node)
    for i, lang in enumerate(langs):
        if i in idx2id:
            _id = idx2id[i]
            if _id in id2node:
                node = id2node[_id]
                node.lang = lang

    # sampler = spec["sampler"]
    # if "logprob" not in spec:
    #     logprob = sampler.logprob(trees)
    # else:
    #     logprob = spec["logprob"]
    # sys.stderr.write("iter {}\t{}\n".format(spec["iter"], logprob))
    # _start = spec["iter"] + 1
    # else:
    _start = 1
    # trees = load(open(args.trees, 'rb'))
    # # trees2 = []
    # # for tree in trees:
    # #     if tree.is_isolate is False:
    # #         trees2.append(tree)
    # # trees = trees2
    # sys.stderr.write("{} trees\n".format(len(trees)))
    # node_priors = None
    # if args.npriors is not None:
    #     prior_specs = load_json_file(args.npriors)
    #     node_priors = create_node_priors(prior_specs)
    # langs = list(load_json_stream(open(args.langs)))
    # with open(args.samples, 'r') as f:
    #     for i, sample in enumerate(load_json_stream(f)):
    #         if i == args.sidx:
    #             break
    for tree in trees:
        update_state(tree, [wals_id])
    sampler = CTMC_Sampler(1, states=[T], ctmc_scale=0.00005)
    sampler.init_trees(trees, sample_dates=False)
    sys.stderr.write("iter 0\t{}\n".format(sampler.logprob(trees)))
    for _iter in six.moves.range(_start, args._iter + 1):
        sampler.sample(_iter=_iter)
        logprob = sampler.logprob(trees)
        sys.stderr.write("iter {}\t{}\n".format(_iter, logprob))
        with open(args.out + ".current", "wb") as f:
            pickle.dump({"sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob}, f)
    with open(args.out + ".final", "wb") as f:
        pickle.dump({"sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob}, f)

def main(): parser = ArgumentParser() parser.add_argument("--plot_type", dest="plot_type", metavar="INT", type=int, default=0) parser.add_argument("--pc1", dest="pc1", metavar="INT", type=int, default=0) parser.add_argument("--pc2", dest="pc2", metavar="INT", type=int, default=1) parser.add_argument("--kde", dest="do_kde", action="store_true", default=False) parser.add_argument("--output", metavar="IMG", default=None) parser.add_argument("langs", metavar="LANG", default=None) args = parser.parse_args() langs = list(load_json_stream(open(args.langs))) # flist = load_json_file(sys.argv[2]) dims = len(langs[0]["bin"]) X = extract_mat(langs) pca, X_transformed = do_pca(X) import matplotlib.pyplot as plt plt.figure(figsize=(8, 6), dpi=120) # import matplotlib as mpl # mpl.rcParams['font.family'] = 'Nimbus Roman No9 L' import matplotlib.font_manager as font_manager path = '/usr/share/fonts/truetype/msttcorefonts/Times_New_Roman.ttf' fontprop = font_manager.FontProperties(fname=path) p1, p2 = args.pc1, args.pc2 # first and second PCs (zero-based numbering) plot_type = args.plot_type # 0: both, 1: creole, 2: non-creole, 3: none # plt.xlim((-5, 4)) # plt.ylim((-4, 3)) plt.xlim((-4, 4)) plt.ylim((-4, 4)) plt.xticks(range(-4, 5), fontproperties=fontprop, size="25") plt.yticks(range(-4, 5), fontproperties=fontprop, size="25") plt.xlabel("PC%d (%2.1f%%)" % (p1 + 1, pca.explained_variance_ratio_[p1] * 100), fontproperties=fontprop, size="25") plt.ylabel("PC%d (%2.1f%%)" % (p2 + 1, pca.explained_variance_ratio_[p2] * 100), fontproperties=fontprop, size="25") plot_langs(langs, X_transformed, plt, p1, p2, plot_type=plot_type) plt.legend() if args.do_kde: val = [] for i, lang in enumerate(langs): x, y = X_transformed[i, p1], X_transformed[i, p2] if plot_type == 1 and lang["source"] == "APiCS": val.append((x, y)) elif plot_type == 2 and lang["source"] == "WALS": val.append((x, y)) val = np.array(val).T # val = np.vstack((X_transformed[:, p1], X_transformed[:, p2])) kernel = gaussian_kde(val) xmin, xmax = plt.xlim() ymin, ymax = plt.ylim() _X, _Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] positions = np.vstack([_X.ravel(), _Y.ravel()]) Z = np.reshape(kernel(positions).T, _X.shape) # http://matplotlib.org/users/colormaps.html plt.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax]) # plt.imshow(np.rot90(Z), cmap=plt.cm.hot_r, extent=[xmin, xmax, ymin, ymax]) # plt.imshow(np.rot90(Z), cmap=plt.cm.afmhot_r, extent=[xmin, xmax, ymin, ymax]) # plt.title('PCA') # plt.xlim([-2.5, 1.5]) # plt.ylim([-1.5, 2.5]) if args.output: plt.savefig(args.output, format="pdf", transparent=False, bbox_inches="tight") # plt.savefig(args.output, format="png", transparent=False, dpi=160) plt.show()
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", dest="seed", metavar="INT", type=int, default=None, help="random seed") # parser.add_argument("--sidx", metavar="IDX", type=int, default=0, # help="i-th sample of leaf states (-1: last sample)") parser.add_argument("--npriors", metavar="NODE_PRIORS", default=None, help="priors for nodes (json)") parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000, help="# of iterations") parser.add_argument("--resume_if", action="store_true", default=False, help="resume training if the output exists") parser.add_argument("--resume", metavar="FILE", default=None, help="resume training from model dump") parser.add_argument("--save_interval", metavar="INT", type=int, default=-1, help="save interval") # parser.add_argument("--merged", action="store_true", default=False, # help="use merged langs instead of a single sample") parser.add_argument("--has_bias", action="store_true", default=False, help="0th item of z is a bias term (always 1)") parser.add_argument("--surface", dest="latent", action="store_false", default=True, help="use surface features") parser.add_argument( "--surface_state_limit", dest="max_states", type=int, default=-1, help= "maximum number of distinct states of surface features (trimming for speed)" ) parser.add_argument("trees", metavar="TREES", default=None, help="merged trees (pkl)") parser.add_argument("langs", metavar="LANG", default=None) # parser.add_argument("samples", metavar="SAMPLES", default=None, help="parameter states (json stream)") parser.add_argument("out", metavar="OUT", default=None, help="out (pkl)") args = parser.parse_args() if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) if args.resume_if: if os.path.isfile(args.out + ".current"): args.resume = args.out + ".current" elif os.path.isfile(args.out + ".best"): args.resume = args.out + ".best" if args.resume: sys.stderr.write("loading model from %s\n" % args.resume) spec = load(open(args.resume, "rb")) trees = spec["trees"] sampler = spec["sampler"] if "logprob" not in spec: logprob = sampler.logprob(trees) else: logprob = spec["logprob"] sys.stderr.write("iter {}\t{}\n".format(spec["iter"], logprob)) _start = spec["iter"] + 1 else: _start = 1 trees = load(open(args.trees, 'rb')) sys.stderr.write("{} trees\n".format(len(trees))) node_priors = None if args.npriors is not None: prior_specs = load_json_file(args.npriors) node_priors = create_node_priors(prior_specs) langs = list(load_json_stream(open(args.langs, "r"))) # with open(args.samples, 'r') as f: # if args.merged: # sample = list(load_json_stream(f)) # else: # for i, sample in enumerate(load_json_stream(f)): # if i == args.sidx: # break K, states = attach_sample(trees, langs, node_priors, has_bias=args.has_bias, latent=args.latent, max_states=args.max_states) sampler = CTMC_Sampler(K, states=states, ctmc_scale=0.00005) sampler.init_trees(trees) sys.stderr.write("iter 0\t{}\n".format(sampler.logprob(trees))) _iter = _start - 1 if args.save_interval >= 0 and _iter % args.save_interval == 0: with open(args.out + ".{}".format(_iter), "wb") as f: dump( { "sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob }, f) for _iter in range(_start, args._iter + 1): sampler.sample(_iter=_iter) logprob = sampler.logprob(trees) sys.stderr.write("iter {}\t{}\n".format(_iter, logprob)) if args.save_interval >= 0 and _iter % args.save_interval == 0: with open(args.out + ".{}".format(_iter), "wb") as f: dump( { "sampler": sampler, "trees": 
trees, "iter": _iter, "logprob": logprob }, f) with open(args.out + ".current", "wb") as f: dump( { "sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob }, f) with open(args.out + ".final", "wb") as f: dump( { "sampler": sampler, "trees": trees, "iter": _iter, "logprob": logprob }, f)
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--K", metavar="INT", type=int, default=3, help="K") parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000, help="# of iterations") parser.add_argument("--output", dest="output", metavar="FILE", default=None, help="save the model to the specified path") parser.add_argument('--bins', type=str, default=None) parser.add_argument('--bins_iter', type=int, default=500) parser.add_argument("langs", metavar="LANG", default=None) parser.add_argument("flist", metavar="FLIST", default=None) args = parser.parse_args() sys.stderr.write("args\t{}\n".format(args)) if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) flist = load_json_file(args.flist) langs = list(load_json_stream(open(args.langs))) mat, mvs = create_mat(langs, flist) adm = Admixture(mat, flist, K=args.K, mvs=mvs) ll = adm.logprob() sys.stderr.write("iter 0: {}\n".format(ll)) ll_max = ll for _iter in range(args._iter): adm.sample() ll = adm.logprob() sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll)) sys.stderr.flush() if args.output is not None: with open(args.output + ".current", "wb") as f: obj = {"model": adm, "iter": _iter + 1, "ll": ll} pickle.dump(obj, f) if ll > ll_max: ll_max = ll shutil.copyfile(args.output + ".current", args.output + ".best") if args.output is not None: with open(args.output + ".final", "wb") as f: obj = {"model": adm, "iter": _iter + 1, "ll": ll} pickle.dump(obj, f) if args.bins is not None: # Zs = [np.copy(adm.Z)] bins = [ np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K), axis=1, arr=adm.Z) ] for i in range(args.bins_iter): adm.sample() # Zs.append(np.copy(adm.Z)) bins.append( np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K), axis=1, arr=adm.Z)) # Zs = np.vstack(Zs) # bins = np.apply_along_axis(lambda x: np.bincount(x, minlength=adm.K), axis=1, arr=Zs) bins = np.dstack(bins).sum(axis=2) with open(args.bins, 'w') as f: f.write("{}\n".format(json.dumps(bins.tolist())))
def main():
    # sys.stderr = codecs.getwriter("utf-8")(sys.stderr)
    parser = ArgumentParser()
    parser.add_argument("-s", "--seed", metavar="INT", type=int, default=None, help="random seed")
    parser.add_argument("--fid", metavar="INT", type=int, default=-1)
    parser.add_argument("--only_alphas", action="store_true", default=False, help="autologistic: ignore v and h")
    parser.add_argument("--drop_hs", action="store_true", default=False, help="autologistic: ignore v")
    parser.add_argument("--burnin", metavar="INT", type=int, default=1000, help="# of iterations")
    parser.add_argument("--samples", metavar="INT", type=int, default=500, help="save interval")
    parser.add_argument("--interval", metavar="INT", type=int, default=5, help="sampling interval")
    parser.add_argument("--alpha", metavar="FLOAT", type=float, default=-1.0, help="parameter alpha")
    parser.add_argument("--K", metavar="INT", type=int, default=100, help="K")
    parser.add_argument('--norm_sigma', type=float, default=5.0, help='standard deviation of Gaussian prior for u')
    parser.add_argument('--gamma_shape', type=float, default=1.0, help='shape of Gamma prior for v and h')
    parser.add_argument('--gamma_scale', type=float, default=0.001, help='scale of Gamma prior for v and h')
    parser.add_argument("--output", dest="output", metavar="FILE", default=None, help="save the model to the specified path")
    parser.add_argument("--resume", metavar="FILE", default=None, help="resume training from model dump")
    parser.add_argument("--resume_if", action="store_true", default=False, help="resume training if the output exists")
    parser.add_argument("langs", metavar="LANG", default=None)
    parser.add_argument("flist", metavar="FLIST", default=None)
    parser.add_argument("aggregated", metavar="FLIST", default=None)
    args = parser.parse_args()
    sys.stderr.write("args\t{}\n".format(args))
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    flist = load_json_file(args.flist)
    # offset = 0
    # if args.resume_if:
    #     if os.path.isfile(args.output + ".current"):
    #         args.resume = args.output + ".current"
    #     elif os.path.isfile(args.output + ".best"):
    #         args.resume = args.output + ".best"
    # if args.resume:
    #     sys.stderr.write("loading model from {}\n".format(args.resume))
    #     spec = pickle.load(open(args.resume, "rb"))
    #     mda = spec["model"]
    #     sys.stderr.write("iter {}\n".format(spec["iter"] + 1))
    #     if args.cv:
    #         eval_cvlist(mda)
    #     offset = spec["iter"] + 1
    # else:
    langs = list(load_json_stream(open(args.langs)))
    sys.stderr.write("building hnet\n")
    hnet = WeightedNeighborGraph(langs)
    if args.fid >= 0:
        fstruct = flist[args.fid]
        vec, mvs, size = create_vec(langs, fstruct, args.fid)
        al = CategoricalAutologistic(vec, size, hnet=hnet, mvs=mvs,
                                     only_alphas=args.only_alphas, drop_hs=args.drop_hs,
                                     norm_sigma=args.norm_sigma,
                                     gamma_shape=args.gamma_shape, gamma_scale=args.gamma_scale)
    else:
        als = []
        for fid, fstruct in enumerate(flist):
            vec, mvs, size = create_vec(langs, fstruct, fid)
            al = CategoricalAutologistic(vec, size, hnet=hnet, mvs=mvs,
                                         only_alphas=args.only_alphas, drop_hs=args.drop_hs,
                                         norm_sigma=args.norm_sigma,
                                         gamma_shape=args.gamma_shape, gamma_scale=args.gamma_scale)
            als.append(al)
        al = CategoricalAutologisticGroup(als)
    sys.stderr.write("iter 0\n")
    offset = 0
    for _iter in range(args.burnin):
        al.sample()
        offset += 1
        # ll = mda.calc_loglikelihood()
        sys.stderr.write("iter {}\n".format(offset))
        sys.stderr.flush()
    if args.output is not None:
        with open(args.output, "wb") as f:
            obj = {"model": al, "iter": offset}
            pickle.dump(obj, f)

    results = []
    results.append(get_result(al))
    while len(results) < args.samples:
        for _iter in range(args.interval):
            al.sample()
            offset += 1
        sys.stderr.write("iter {}\n".format(offset))
        sys.stderr.flush()
        results.append(get_result(al))
    if args.aggregated == "-":
        f = sys.stdout
    else:
        f = open(args.aggregated, "w")
    aggregated = aggregate_results(results, al, flist, args.fid)
    f.write("%s\n" % json.dumps(aggregated))

def main(): parser = ArgumentParser() parser.add_argument("--lthres", dest="lthres", metavar="FLOAT", type=float, default=0.0, help="eliminate trees with higher rate of missing values [0,1]") parser.add_argument("--npriors", metavar="NODE_PRIORS", default=None, help="priors for nodes (json)") parser.add_argument("langs", metavar="LANG", default=None) parser.add_argument("tree", metavar="TREE", default=None) parser.add_argument("out", metavar="OUTPUT", default=None) args = parser.parse_args() trees = load(open(args.tree, 'rb')) langs = list(load_json_stream(open(args.langs))) # a glottocode corresponds to one or more WALS languages glotto_code2lang = {} for lang in langs: if "glottocode" in lang and lang["glottocode"]: if lang["glottocode"] in glotto_code2lang: glotto_code2lang[lang["glottocode"]].append(lang) else: glotto_code2lang[lang["glottocode"]] = [lang] else: sys.stderr.write(u"dropping lang without glottocode: {}\n".format(lang["annotation"]["name"])) # try to keep nodes specified by priors registered_codes = {} if args.npriors is not None: prior_specs = load_json_file(args.npriors) for spec in prior_specs: registered_codes[spec["glottocode"]] = True for tree in trees: remove_unclassified(tree) for tree in trees: attach_lang(tree, glotto_code2lang) ucount, total = 0, 0 for code, langlist in glotto_code2lang.items(): for lang in langlist: total += 1 if "used" in lang and lang["used"] == True: ucount += 1 del lang["used"] else: sys.stderr.write("glottocode never appeared in trees: {}\n".format(code)) sys.stderr.write("{} (out of {}) languages in the trees\n".format(ucount, total)) trees2 = [] for tree in trees: c = shrink_tree(tree, registered_codes) if c > 0: trees2.append(tree) sys.stderr.write("# of trees: {} -> {}\n".format(len(trees), len(trees2))) trees = trees2 if args.lthres > 0.0: trees2 = [] for tree in trees: c = get_feature_coverage(tree) if c >= args.lthres: trees2.append(tree) sys.stderr.write("# of trees ({} thres): {} -> {}\n".format(args.lthres, len(trees), len(trees2))) trees = trees2 isolate = 0 for tree in trees: tree.is_root = True tree.parent = None if len(tree.children) <= 0: tree.is_isolate = True isolate += 1 tree.date = 0.0 tree.is_date_frozen = True tree.is_state_frozen = True else: tree.is_isolate = False tree.is_date_frozen = False tree.is_state_frozen = False for child in tree.children: set_date(child) sys.stderr.write("# of isolate trees: {} (out of {})\n".format(isolate, len(trees))) # for tree in trees: # sys.stdout.write(tree.name + "\n") with open(args.out, "wb") as f: dump(trees, f)
def main(): parser = ArgumentParser() parser.add_argument("--input", metavar="FILE", default=None) parser.add_argument("--burnin", metavar="INT", type=int, default=0, help="# of burn-in iterations") parser.add_argument("--interval", metavar="INT", type=int, default=1, help="pick up one per # samples") parser.add_argument("--update", action="store_true", default=False, help="update features (for MVI)") parser.add_argument("langs", metavar="LANG", default=None) parser.add_argument("flist", metavar="FLIST", default=None) args = parser.parse_args() flist = load_json_file(args.flist) langs = list(load_json_stream(open(args.langs))) P = len(flist) L = len(langs) count = 0 xfreq = [] for l in range(L): xfreq.append([None] * P) zfreq = None if args.input is None or args.input == "-": f = sys.stdin elif args.input.endswith(".bz2"): import bz2 f = bz2.open(args.input, "r") else: f = open(args.input, "r") for langdat in load_json_stream(f): sys.stderr.write("+") sys.stderr.flush() if langdat["iter"] >= args.burnin and langdat["iter"] % args.interval == 0: count += 1 if zfreq is None: zfreq = np.zeros((L, len(langdat["z"])), dtype=np.int32) zfreq += np.array(langdat["z"]).T for l in range(L): for p in range(P): v = langdat["x"][l][p] if xfreq[l][p] is None: if flist[p]["type"] == "bin": xfreq[l][p] = [0, 0] elif flist[p]["type"] == "cat": xfreq[l][p] = [0] * flist[p]["size"] elif flist[p]["type"] == "count": xfreq[l][p] = Counter() xfreq[l][p][v] += 1 if args.input is not None and args.input != "-": f.close() sys.stderr.write("\n") for p in range(P): if flist[p]["type"] == "count": for l, lang in enumerate(langs): maxv = max(xfreq[l][p].keys()) vlist = [0] * (maxv + 1) for k, v in xfreq[l][p].items(): vlist[k] = v xfreq[l][p] = vlist for l, lang in enumerate(langs): lang["count"] = count lang["xfreq"] = xfreq[l] lang["zfreq"] = zfreq[l].tolist() if args.update: for p, fnode in enumerate(flist): v = int(np.argmax(np.array(lang["xfreq"][p]))) lang["catvect_filled"][p] = v lang["annotation"]["features_filled"][fnode["annotation"]["name"]] = v sys.stdout.write("{}\n".format(json.dumps(lang)))
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--bias", action="store_true", default=False, help="bias term in Z") parser.add_argument("--only_alphas", action="store_true", default=False, help="autologistic: ignore v and h") parser.add_argument("--drop_vs", action="store_true", default=False, help="autologistic: ignore h") parser.add_argument("--drop_hs", action="store_true", default=False, help="autologistic: ignore v") parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000, help="# of iterations") parser.add_argument("--save_interval", metavar="INT", type=int, default=-1, help="save interval") parser.add_argument("--K", metavar="INT", type=int, default=100, help="K") parser.add_argument('--norm_sigma', type=float, default=5.0, help='standard deviation of Gaussian prior for u') parser.add_argument('--gamma_shape', type=float, default=1.0, help='shape of Gamma prior for v and h') parser.add_argument('--gamma_scale', type=float, default=0.001, help='scale of Gamma prior for v and h') parser.add_argument("--hmc_l", metavar="INT", type=int, default=10) parser.add_argument('--hmc_epsilon', type=float, default=0.05, help='HMC epsilon') parser.add_argument("--maxanneal", metavar="INT", type=int, default=0) parser.add_argument( "--cv", action="store_true", default=False, help= "some features are intentionally hidden (but kept as \"catvect_orig\")" ) parser.add_argument("--output", dest="output", metavar="FILE", default=None, help="save the model to the specified path") parser.add_argument("--resume", metavar="FILE", default=None, help="resume training from model dump") parser.add_argument("--resume_if", action="store_true", default=False, help="resume training if the output exists") parser.add_argument("langs", metavar="LANG", default=None) parser.add_argument("flist", metavar="FLIST", default=None) args = parser.parse_args() sys.stderr.write("args\t{}\n".format(args)) if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) flist = load_json_file(args.flist) offset = 0 if args.resume_if: if os.path.isfile(args.output + ".current"): args.resume = args.output + ".current" elif os.path.isfile(args.output + ".best"): args.resume = args.output + ".best" if args.resume: sys.stderr.write("loading model from {}\n".format(args.resume)) spec = pickle.load(open(args.resume, "rb")) mda = spec["model"] sys.stderr.write("iter {}: {}\n".format(spec["iter"] + 1, spec["ll"])) if args.cv: eval_cvlist(mda) offset = spec["iter"] + 1 else: langs = list(load_json_stream(open(args.langs))) mat, mvs = create_mat(langs, flist) sys.stderr.write("building vnet\n") vnet = create_vnet(langs) sys.stderr.write("building hnet\n") hnet = create_hnet(langs) mda = MatrixDecompositionAutologistic(mat, flist, vnet=vnet, hnet=hnet, K=args.K, mvs=mvs, bias=args.bias, only_alphas=args.only_alphas, drop_vs=args.drop_vs, drop_hs=args.drop_hs, norm_sigma=args.norm_sigma, gamma_shape=args.gamma_shape, gamma_scale=args.gamma_scale, hmc_l=args.hmc_l, hmc_epsilon=args.hmc_epsilon) if args.cv: mda.cvlist = create_cvlist(langs) mda.init_with_clusters() sys.stderr.write("iter 0: {}\n".format(mda.calc_loglikelihood())) if args.cv: eval_cvlist(mda) ll_max = -np.inf for _iter in range(offset, args._iter): mda.sample(_iter=_iter, maxanneal=args.maxanneal) ll = mda.calc_loglikelihood() sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll)) sys.stderr.flush() if args.cv: cv_result = 
eval_cvlist(mda) sys.stderr.flush() if args.save_interval >= 0 and (_iter + 1) % args.save_interval == 0: with open(args.output + ".{}".format(_iter), "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} if args.cv: obj["cv_result"] = cv_result pickle.dump(obj, f) if args.output is not None: with open(args.output + ".current", "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} if args.cv: obj["cv_result"] = cv_result pickle.dump(obj, f) if ll > ll_max: ll_max = ll shutil.copyfile(args.output + ".current", args.output + ".best") if args.output is not None: with open(args.output + ".final", "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} if args.cv: obj["cv_result"] = cv_result pickle.dump(obj, f)
def main(): parser = ArgumentParser() parser.add_argument("-s", "--seed", metavar="INT", type=int, default=None, help="random seed") parser.add_argument("--bias", action="store_true", default=False, help="bias term in Z") parser.add_argument("--only_alphas", action="store_true", default=False, help="autologistic: ignore v and h") parser.add_argument("--drop_hs", action="store_true", default=False, help="autologistic: ignore h") parser.add_argument("-i", "--iter", dest="_iter", metavar="INT", type=int, default=1000, help="# of iterations") parser.add_argument("--save_interval", metavar="INT", type=int, default=-1, help="save interval") parser.add_argument("--K", metavar="INT", type=int, default=100, help="K") parser.add_argument('--norm_sigma', type=float, default=5.0, help='standard deviation of Gaussian prior for u') parser.add_argument('--gamma_shape', type=float, default=1.0, help='shape of Gamma prior for v and h') parser.add_argument('--gamma_scale', type=float, default=0.001, help='scale of Gamma prior for v and h') parser.add_argument("--hmc_l", metavar="INT", type=int, default=10) parser.add_argument('--hmc_epsilon', type=float, default=0.05, help='HMC epsilon') parser.add_argument("--maxanneal", metavar="INT", type=int, default=0) parser.add_argument("--output", dest="output", metavar="FILE", default=None, help="save the model to the specified path") parser.add_argument("--resume", metavar="FILE", default=None, help="resume training from model dump") parser.add_argument("--resume_if", action="store_true", default=False, help="resume training if the output exists") parser.add_argument('--bins', type=str, default=None) parser.add_argument('--bins_iter', type=int, default=100) parser.add_argument("langs", metavar="LANG", default=None) parser.add_argument("flist", metavar="FLIST", default=None) args = parser.parse_args() sys.stderr.write("args\t{}\n".format(args)) if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) flist = load_json_file(args.flist) offset = 0 if args.resume_if: if os.path.isfile(args.output + ".current"): args.resume = args.output + ".current" elif os.path.isfile(args.output + ".best"): args.resume = args.output + ".best" if args.resume: sys.stderr.write("loading model from {}\n".format(args.resume)) spec = pickle.load(open(args.resume, "rb")) mda = spec["model"] sys.stderr.write("iter {}: {}\n".format(spec["iter"] + 1, spec["ll"])) offset = spec["iter"] + 1 else: langs = list(load_json_stream(open(args.langs))) mat, mvs = create_mat(langs, flist) sys.stderr.write("building hnet\n") hnet = WeightedNeighborGraph(langs) mda = MatrixDecompositionAutologistic( mat, flist, hnet=hnet, K=args.K, mvs=mvs, bias=args.bias, only_alphas=args.only_alphas, drop_hs=args.drop_hs, norm_sigma=args.norm_sigma, # const_h = 0.03253780242472478, gamma_shape=args.gamma_shape, gamma_scale=args.gamma_scale, hmc_l=args.hmc_l, hmc_epsilon=args.hmc_epsilon) mda.init_with_clusters() sys.stderr.write("iter 0: {}\n".format(mda.calc_loglikelihood())) ll_max = -np.inf for _iter in range(offset, args._iter): mda.sample(_iter=_iter, maxanneal=args.maxanneal) ll = mda.calc_loglikelihood() sys.stderr.write("iter {}: {}\n".format(_iter + 1, ll)) sys.stderr.flush() if args.save_interval >= 0 and (_iter + 1) % args.save_interval == 0: with open(args.output + ".{}".format(_iter), "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} if args.output is not None: with open(args.output + ".current", "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} pickle.dump(obj, f) if ll > 
ll_max: ll_max = ll shutil.copyfile(args.output + ".current", args.output + ".best") if args.output is not None: with open(args.output + ".final", "wb") as f: obj = {"model": mda, "iter": _iter, "ll": ll} pickle.dump(obj, f) if args.bins is not None: zmats = [np.copy(mda.zmat)] wmats = [np.copy(mda.wmat)] hkss = [np.copy(mda.hks)] for i in range(args.bins_iter): mda.sample() zmats.append(np.copy(mda.zmat)) wmats.append(np.copy(mda.wmat)) hkss.append(np.copy(mda.hks)) avg_zmat = np.sum(zmats, axis=0) / float(len(zmats)) avg_wmat = np.sum(wmats, axis=0) / float(len(wmats)) avg_hks = np.sum(hkss, axis=0) / float(len(hkss)) with open(args.bins, 'w') as f: f.write("{}\n".format( json.dumps({ "avg_zmat": avg_zmat.tolist(), "avg_wmat": avg_wmat.tolist(), "avg_hks": avg_hks.tolist(), })))