def writeout_cluster_to_word_map(mapping, output_f_v, output_f_rep, replace_ids=True, one_hot=False):
    """Write a cluster-to-words mapping as two parallel text files.

    For every word in every cluster, one line is written to *output_f_v*
    (the word) and the corresponding line to *output_f_rep* (that word's
    cluster representation).

    :param mapping: dict of cluster id -> iterable of words
    :param output_f_v: path of the vocabulary output file (one word per line)
    :param output_f_rep: path of the representation output file
    :param replace_ids: if True, represent each cluster by a consecutive
        1-based integer id (original cluster keys are discarded)
    :param one_hot: if True (and ``replace_ids`` is False), represent each
        cluster by a one-hot vector of length ``len(mapping)``
    """
    with open(output_f_v, "w") as out_word, open(output_f_rep, "w") as out_rep:
        if replace_ids:
            # Re-number clusters consecutively from 1, regardless of their keys.
            for new_c_id, w_set in enumerate(mapping.values(), start=1):
                for w in w_set:
                    out_word.write(w + "\n")
                    out_rep.write("{}\n".format(new_c_id))
        elif one_hot:
            for c, w_set in enumerate(mapping.values()):
                # BUGFIX: build the vector under a fresh name instead of
                # clobbering the `one_hot` parameter; also hoist it out of
                # the word loop since it only depends on the cluster index.
                vec = np.zeros(len(mapping), 'int')
                vec[c] = 1
                for w in w_set:
                    out_word.write(w + "\n")
                    out_rep.write("{}\n".format(nparr_to_str(vec)))
def posttype_txt_plain(posttype_f, vocab_f, output_f_v, output_f_rep, threedim=False, vocab_r=None):
    """
    Create two txt files: a vocabulary with one word per line, and the
    representation vectors, one per line.

    Words whose representation contains NaN are skipped entirely (written
    to neither file), so the two files stay line-aligned.

    :param threedim: npy posttypes file contains 3 dimensions, i.e. an extra
        dim. for syn. fun. (requires ``vocab_r``)
    :param vocab_r: relation vocabulary; must be given iff ``threedim``
    """
    # The two options must be used together; bail out otherwise.
    if threedim and not vocab_r:
        sys.exit("Missing rel. vocabulary.")
    if vocab_r and not threedim:
        sys.exit("Use rel. vocabulary?")
    _, _, reps = plain_posttype_txt(posttype_f, vocab_f, threedim, vocab_r)
    with open(output_f_v, "w") as vocab_out, open(output_f_rep, "w") as rep_out:
        for word, vec in reps:
            # Only emit entries whose vector is fully defined.
            if not np.isnan(np.sum(vec)):
                vocab_out.write("{}\n".format(word))
                rep_out.write("{}\n".format(nparr_to_str(vec)))
def writeout_cluster_to_word_map(mapping, output_f_v, output_f_rep, replace_ids=True, one_hot=False):
    """Dump a cluster-to-words mapping into two parallel text files: one word
    per line in *output_f_v*, that word's cluster representation on the
    matching line of *output_f_rep*.

    NOTE(review): an identical definition of this function appears earlier in
    this file; consider removing one copy.

    :param replace_ids: represent clusters by consecutive 1-based integer ids
    :param one_hot: (when ``replace_ids`` is False) represent clusters by
        one-hot vectors of length ``len(mapping)``
    """
    with open(output_f_v, "w") as out_word, open(output_f_rep, "w") as out_rep:
        if replace_ids:
            cluster_idx = 0
            for words in mapping.values():
                # Original cluster keys are ignored; clusters are renumbered.
                cluster_idx += 1
                for word in words:
                    out_word.write(word + "\n")
                    out_rep.write("{}\n".format(cluster_idx))
        elif one_hot:
            n_clusters = len(mapping)
            for idx, words in enumerate(mapping.values()):
                for word in words:
                    out_word.write(word + "\n")
                    indicator = np.zeros(n_clusters, 'int')
                    indicator[idx] = 1
                    out_rep.write("{}\n".format(nparr_to_str(indicator)))
# obtain model parameters n_states, n_obs, _, _, _, omit_class_cond, omit_emis_cond = read_params_from_path(path) lemmas = args.use_lemmas eval_spec_rel = args.synfunc lr = False # load model params_fixed = (np.load("{}ip.npy".format(path)), np.load("{}tp.npy".format(path)), np.load("{}fp.npy".format(path)), np.load("{}ep.npy".format(path))) # prepare sents for decoding sents = ConllCorpus(infile, howbig=1000000, lemmas=lemmas, eval_spec_rels=eval_spec_rel, dirname=path, lr=lr) sents.prepare_trees() h = HMRTM(n_states, n_obs, R=len(sents.r_dict), params=params_fixed, writeout=False, dirname=path, omit_class_cond=omit_class_cond, omit_emis_cond=omit_emis_cond) if eval_spec_rel else \ HMTM(n_states, n_obs, params=params_fixed, writeout=False, dirname=path) with open(args.outfile, "w") as out: for tree in sents.train: # obtain posteriors for all nodes node_to_rep = h.posterior_decode(tree, cont=True) # get words for node in tree.get_nonroots(): out.write( "{} {}\n".format(sents.x_dict.get_label_name(node.name), nparr_to_str(node_to_rep[node.index]))) out.write("\n")
def write_fig_data(reps, ws, outfile):
    """Write word/representation pairs to *outfile*, one "word vector" line
    per pair. ``ws`` and ``reps`` are consumed in lockstep; the shorter of
    the two determines how many lines are written.
    """
    with open(outfile, "w") as sink:
        sink.writelines(
            "{} {}\n".format(word, nparr_to_str(vec))
            for word, vec in zip(ws, reps)
        )
def format(word, nparray):
    """Return one output line: the word, a space, the serialized vector,
    and a trailing newline.

    NOTE(review): this shadows the builtin ``format``; the name is kept so
    existing callers keep working.
    """
    serialized = nparr_to_str(nparray)
    return "{} {}\n".format(word, serialized)