) argparser.add_argument( "--rec_model_type", type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian" ) args = argparser.parse_args() pred_arg_pos = args.word_types.split("_") learning_rate = args.lr use_pretrained_wordrep = False if args.pt_rep: use_pretrained_wordrep = True pt_word_rep = { l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep) } dp = DataProcessor(pred_arg_pos) x_data, y_s_data, w_ind, c_ind, w_h_map, w_oov, c_oov = dp.make_data(args.train_file, relaxed=args.use_relaxation) rev_w_ind = {ind: word for word, ind in w_ind.items()} rev_c_ind = {ind: concept for concept, ind in c_ind.items()} init_hyp_strengths = None if args.rec_model_type == "multinomial": init_hyp_strengths = numpy.zeros((len(c_ind), len(w_ind))) for word in w_h_map: word_ind = w_ind[word] if word in w_ind else 0 for concept in w_h_map[word]: concept_ind = c_ind[concept] if concept in c_ind else 0 init_hyp_strengths[concept_ind][word_ind] = 1.0 if len(w_oov) != 0: print >> sys.stderr, "Regarding %d words as OOV" % (len(w_oov))
from event_ae import EventAE
from process_data import DataProcessor

# Raise the recursion limit; presumably needed for deep structures built by
# EventAE / DataProcessor below -- TODO confirm against those modules.
sys.setrecursionlimit(10000)

# Model hyperparameters: one slot per predicate/argument position.
num_args = 2
num_slots = num_args + 1
hyp_hidden_size = 50
learning_rate = 0.01
wc_hidden_sizes = [50] * num_slots
cc_hidden_sizes = [50] * num_args
max_iter = 10
num_procs = int(sys.argv[2])

dp = DataProcessor()
x_data, y_s_data, w_ind, c_ind, w_h_map = dp.make_data(sys.argv[1])

# Persist the word and concept index maps for later evaluation runs.
# Fixed: Python 2 "print >> f, ..." replaced with print(..., file=f), and the
# files are now closed deterministically via context managers.
with codecs.open("vocab.txt", "w", "utf-8") as vocab_file:
    for w, ind in w_ind.items():
        print(w, ind, file=vocab_file)
with codecs.open("ont.txt", "w", "utf-8") as ont_file:
    for c, ind in c_ind.items():
        print(c, ind, file=ont_file)

# Reverse lookups: index -> word / concept.
rev_w_ind = {ind: word for word, ind in w_ind.items()}
rev_c_ind = {ind: concept for concept, ind in c_ind.items()}

# Fixed: zip() is an iterator in Python 3, so it must be materialized before
# random.sample/len, and "/" would produce a float sample size -- use floor
# division to keep the original 10% integer sample.
train_data = list(zip(x_data, y_s_data))
sanity_test_data = random.sample(train_data, len(train_data) // 10)
# Evaluation-time command-line options (pretrained-embedding option retained
# below in commented-out form, as in the training script).
#argparser.add_argument('--pt_rep', type=str, help="File containing pretrained embeddings")
argparser.add_argument('--use_em', help="Use EM (Default is False)", action='store_true')
argparser.set_defaults(use_em=False)
argparser.add_argument('--use_nce', help="Use NCE for estimating encoding probability. (Default is False)", action='store_true')
argparser.set_defaults(use_nce=False)
argparser.add_argument('--hyp_model_type', type=str, help="Hypernymy model (weighted_prod, linlayer, tanhlayer)", default="weighted_prod")
argparser.add_argument('--wc_pref_model_type', type=str, help="Word-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--cc_pref_model_type', type=str, help="Concept-concept preference model (weighted_prod, linlayer, tanhlayer)", default="tanhlayer")
argparser.add_argument('--rec_model_type', type=str, help="Reconstruction model (gaussian, multinomial)", default="gaussian")
argparser.add_argument('--param_iter', type=int, help="Iteration of learned param to use (default 1)", default=1)
args = argparser.parse_args()

use_relaxation = args.use_relaxation
pred_arg_pos = args.word_types.split("_")

# Build test data; OOV handling is off so unseen words surface as-is.
dp = DataProcessor(pred_arg_pos)
x_data, y_s_data, w_ind, c_ind, _, _, _ = dp.make_data(
    args.test_file, relaxed=args.use_relaxation, handle_oov=False
)

# Network sizing is derived from the number of word types requested.
num_slots = len(pred_arg_pos)
num_args = num_slots - 1
hyp_hidden_size = 20
wc_hidden_sizes = [20] * num_slots
cc_hidden_sizes = [20] * num_args

#use_pretrained_wordrep = False
#if args.pt_rep:
#  print >>sys.stderr, "Using pretrained word representations from %s"%(args.pt_rep)
#  use_pretrained_wordrep = True
#  pt_word_rep = {l.split()[0]: numpy.asarray([float(f) for f in l.strip().split()[1:]]) for l in gzip.open(args.pt_rep)}

# Vocabulary and ontology files written out by the training run.
train_vocab_file = codecs.open(args.vocab_file, "r", "utf-8")
train_ont_file = codecs.open(args.ont_file, "r", "utf-8")