def init_static_dialog_agent(args) : print "reading in Ontology" ont = Ontology.Ontology(sys.argv[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "reading in Lexicon" lex = Lexicon.Lexicon(ont, sys.argv[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) load_parser_from_file = False if len(args) > 4 : if args[4].lower() == 'true' : load_parser_from_file = True if load_parser_from_file : parser = load_model('static_parser') grounder.parser = parser grounder.ontology = parser.ontology else : print "instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10, safety=True) print "instantiating Generator" generator = Generator.Generator(ont, lex, learner, parser, beam_width=sys.maxint, safety=True) print "instantiating DialogAgent" static_policy = StaticDialogPolicy.StaticDialogPolicy() A = StaticDialogAgent(parser, generator, grounder, static_policy, None, None) if not load_parser_from_file : print "reading in training data" D = A.read_in_utterance_action_pairs(args[3]) if len(args) > 4 and args[4] == "both": print "training parser and generator jointly from actions" converged = A.jointly_train_parser_and_generator_from_utterance_action_pairs( D, epochs=10, parse_beam=30, generator_beam=10) else: print "training parser from actions" converged = A.train_parser_from_utterance_action_pairs( D, epochs=10, parse_beam=30) print "theta: "+str(parser.learner.theta) save_model(parser, 'static_parser') return A
def init_pomdp_dialog_agent(args) : print "Reading in Ontology" ont = Ontology.Ontology(args[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "Reading in Lexicon" lex = Lexicon.Lexicon(ont, args[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "Instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "Instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "Instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) load_models_from_file = False if len(args) > 4 : if args[4].lower() == 'true' : load_models_from_file = True if load_models_from_file : parser = load_model('pomdp_parser') grounder.parser = parser grounder.ontology = parser.ontology else : print "Instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10) print "Instantiating DialogAgent" if load_models_from_file : agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=True) else : agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=False) if not load_models_from_file : print "reading in data and training parser from actions" D = agent.read_in_utterance_action_pairs(args[3]) converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30) print "theta: "+str(parser.learner.theta) save_model(parser, 'pomdp_parser') #print 'Parser ontology : ', parser.ontology.preds return agent
def create_pomdp_dialog_agent(self, parser_file=None):
    """Create a PomdpDialogAgent backed by a CKY parser and a KT-DQ policy.

    parser_file: pickle name used both to load the parser (when
        self.load_models_from_file) and to save a freshly trained one.

    Returns the constructed PomdpDialogAgent (with parser retraining disabled).
    """
    # Deep-copy so parser training cannot mutate the shared ontology/lexicon.
    ont = copy.deepcopy(self.ont)
    lex = copy.deepcopy(self.lex)
    print "instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)
    if self.load_models_from_file:
        parser = load_model(parser_file)
    else:
        parser = CKYParser.CKYParser(ont, lex, use_language_model=True)
        # Set parser hyperparams to best known values for training
        parser.max_multiword_expression = 2  # max span of a multi-word expression to be considered during tokenization
        parser.max_new_senses_per_utterance = 3  # max number of new word senses that can be induced on a training example
        parser.max_cky_trees_per_token_sequence_beam = 100  # for tokenization of an utterance, max cky trees considered
        parser.max_hypothesis_categories_for_unknown_token_beam = 3  # for unknown token, max syntax categories tried
        # Train parser
        d = parser.read_in_paired_utterance_semantics(self.parser_train_file)
        converged = parser.train_learner_on_semantic_forms(d, 10, reranker_beam=10)
        save_model(parser, parser_file)
    # Set parser hyperparams to best known values for test time
    # (tighter beams/senses than training; applied to loaded parsers as well).
    parser.max_multiword_expression = 2  # max span of a multi-word expression to be considered during tokenization
    parser.max_new_senses_per_utterance = 2  # max number of new word senses that can be induced on a training example
    parser.max_cky_trees_per_token_sequence_beam = 1000  # for tokenization of an utterance, max cky trees considered
    parser.max_hypothesis_categories_for_unknown_token_beam = 2  # for unknown token, max syntax categories tried
    grounder.parser = parser
    grounder.ontology = parser.ontology
    print "Instantiating DialogAgent"
    if self.load_models_from_file:
        # Restore a previously trained dialog policy and freeze it.
        policy = load_model('ktdq_policy')
        policy.untrained = False
        policy.training = False
    else:
        # Fresh policy that will learn during interaction.
        knowledge = Knowledge()
        policy = PomdpKtdqPolicy(knowledge)
        policy.untrained = True
        policy.training = True
    agent = PomdpDialogAgent(parser, grounder, policy, None, None)
    agent.retrain_parser = False
    return agent
def init_dialog_agent(args): print "Reading in Ontology" ont = Ontology.Ontology(args[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "Reading in Lexicon" lex = Lexicon.Lexicon(ont, args[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "Instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "Instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "Instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) print "Instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10) parser = load_model('parser') grounder.parser = parser grounder.ontology = parser.ontology print "Instantiating DialogAgent" agent = PomdpDialogAgent(parser, grounder, None, None) #print "reading in data and training parser from actions" #D = agent.read_in_utterance_action_pairs(args[3]) #converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30) #print "theta: "+str(parser.learner.theta) #save_model(parser, 'parser') #print 'Parser ontology : ', parser.ontology.preds return agent
# Top-level setup: load the ontology and lexicon named on the command line,
# build a grounder, and restore a pre-trained parser from disk.
print "reading in Ontology"
ont = Ontology.Ontology(sys.argv[1])
print "predicates: " + str(ont.preds)
print "types: " + str(ont.types)
print "entries: " + str(ont.entries)
print "reading in Lexicon"
lex = Lexicon.Lexicon(ont, sys.argv[2])
print "surface forms: " + str(lex.surface_forms)
print "categories: " + str(lex.categories)
print "semantic forms: " + str(lex.semantic_forms)
print "entries: " + str(lex.entries)
print "instantiating KBGrounder"
grounder = KBGrounder.KBGrounder(ont)
# The block below is the original training path, kept for reference; the
# script now loads an already-trained parser pickle instead.
#print "instantiating Parser"
#parser = CKYParser.CKYParser(ont, lex, use_language_model=True)
## Set parser hyperparams to best known values for training
#parser.max_multiword_expression = 2  # max span of a multi-word expression to be considered during tokenization
#parser.max_new_senses_per_utterance = 2  # max number of new word senses that can be induced on a training example
#parser.max_cky_trees_per_token_sequence_beam = 1000  # for tokenization of an utterance, max cky trees considered
#parser.max_hypothesis_categories_for_unknown_token_beam = 5  # for unknown token, max syntax categories tried
#parser.max_expansions_per_non_terminal = 5  # max number of backpointers stored per nonterminal per cell in CKY chart
#d = parser.read_in_paired_utterance_semantics(sys.argv[3])
#converged = parser.train_learner_on_semantic_forms(d, 10, reranker_beam=10)
#if not converged:
#raise AssertionError("Training failed to converge to correct values.")
#save_model(parser, 'parser')
parser = load_model('parsers/parser_1000')
def main():
    """Build (or reuse) a pickled grounder and spin up the dialog Server.

    All configuration comes from module-level FLAGS_* values. When
    FLAGS_load_grounder != 1, a fresh KBGrounder is built from the pickled
    parser and written to <client_dir>/grounder.pickle; otherwise the Server
    is pointed at an existing pickle.
    """
    # Load parameters from command line.
    parser_fn = FLAGS_parser_fn
    word_embeddings_fn = FLAGS_word_embeddings_fn
    kb_static_facts_fn = FLAGS_kb_static_facts_fn
    kb_perception_source_dir = FLAGS_kb_perception_source_dir
    kb_perception_feature_dir = FLAGS_kb_perception_feature_dir
    active_test_set = [int(oidx) for oidx in FLAGS_active_test_set.split(',')]
    active_train_set = [
        int(oidx) for oidx in FLAGS_active_train_set.split(',')
    ] if FLAGS_active_train_set is not None else None
    server_spin_time = FLAGS_server_spin_time
    cycles_per_user = FLAGS_cycles_per_user
    client_dir = FLAGS_client_dir
    log_dir = FLAGS_log_dir
    data_dir = FLAGS_data_dir
    write_classifiers = FLAGS_write_classifiers
    load_grounder = FLAGS_load_grounder
    num_dialogs = FLAGS_num_dialogs
    init_phase = FLAGS_init_phase

    # Load the parser from file.
    print "main: loading parser from file..."
    with open(parser_fn, 'rb') as f:
        p = pickle.load(f)
    # Word embeddings are not pickled with the parser; re-attach them here.
    p.lexicon.wv = p.lexicon.load_word_embeddings(word_embeddings_fn)
    print "main: ... done"

    # Create a new labels.pickle that erases the labels of the active training set for test purposes.
    full_annotation_fn = os.path.join(kb_perception_source_dir, 'full_annotations.pickle')
    if os.path.isfile(full_annotation_fn):
        print "main: creating new labels.pickle that blinds the active training set for this test..."
        with open(full_annotation_fn, 'rb') as f:
            fa = pickle.load(f)
        # presumably fa maps object index -> per-predicate label list — TODO confirm
        with open(os.path.join(kb_perception_source_dir, 'labels.pickle'), 'wb') as f:
            labels = []
            for oidx in fa:
                if active_train_set is None or oidx not in active_train_set:
                    for pidx in range(len(fa[oidx])):
                        labels.append((pidx, oidx, fa[oidx][pidx]))
            pickle.dump(labels, f)
        print "main: ... done"

    # Instantiate a grounder.
    grounder_fn = os.path.join(client_dir, 'grounder.pickle')
    if load_grounder != 1:
        print "main: instantiating grounder..."
        g = KBGrounder.KBGrounder(p, kb_static_facts_fn, kb_perception_source_dir,
                                  kb_perception_feature_dir, active_test_set)
        if write_classifiers:
            print "main: and writing grounder perception classifiers to file..."
            g.kb.pc.commit_changes()  # save classifiers to disk
        print "main: writing grounder to pickle..."
        with open(grounder_fn, 'wb') as f:
            pickle.dump(g, f)
        print "main: ... done"

    # Start the Server.
    print "main: instantiated server..."
    s = Server(active_train_set, grounder_fn, server_spin_time, cycles_per_user,
               client_dir, log_dir, data_dir, num_dialogs, init_phase)
    print "main: ... done"
    print "main: spinning server..."
    s.spin()  # blocks; serves dialogs until externally stopped
def main():
    """Dump an HTML report of perceptual-classifier training data and test results.

    Reads FLAGS_* configuration, rebuilds the blinded labels.pickle, loads the
    pickled parser, builds a KBGrounder, and writes a two-part HTML table to
    FLAGS_outfile: per-predicate train-object votes, then per-predicate
    classifier responses on the active test set.
    """
    # Load parameters from command line.
    parser_fn = FLAGS_parser_fn
    kb_static_facts_fn = FLAGS_kb_static_facts_fn
    kb_perception_source_dir = FLAGS_kb_perception_source_dir
    kb_perception_feature_dir = FLAGS_kb_perception_feature_dir
    # NOTE(review): object indices are kept as strings here, unlike the int()
    # conversion used by the other entry points — confirm intentional.
    active_train_set = [str(oidx) for oidx in FLAGS_active_train_set.split(',')] \
        if FLAGS_active_train_set is not None else None
    active_test_set = [str(oidx) for oidx in FLAGS_active_test_set.split(',')]
    outfile = FLAGS_outfile

    # Create a new labels.pickle that erases the labels of the active training set for test purposes.
    full_annotation_fn = os.path.join(kb_perception_source_dir, 'full_annotations.pickle')
    if os.path.isfile(full_annotation_fn):
        print "main: creating new labels.pickle that blinds the active training set for this test..."
        with open(full_annotation_fn, 'rb') as f:
            fa = pickle.load(f)
        with open(os.path.join(kb_perception_source_dir, 'labels.pickle'), 'wb') as f:
            labels = []
            for oidx in fa:
                if active_train_set is None or oidx not in active_train_set:
                    for pidx in range(len(fa[oidx])):
                        labels.append((pidx, oidx, fa[oidx][pidx]))
            pickle.dump(labels, f)

    with open(parser_fn, 'rb') as f:
        p = pickle.load(f)
    g = KBGrounder.KBGrounder(p, kb_static_facts_fn, kb_perception_source_dir,
                              kb_perception_feature_dir, active_test_set)

    # Start dumping HTML.
    table_format = "<table border=1px cellspacing=1px cellpadding=1px>"
    with open(outfile, 'wb') as f:
        f.write("<p><b>Train object data</b>")
        f.write(table_format +
                "<tr><th>predicate</th><th>positive</th><th>negative</th></tr>")
        preds = g.kb.perceptual_preds
        w = 3  # images per row in the HTML grid
        for pidx in range(len(preds)):
            f.write("<tr><td>" + preds[pidx] + "</td>")
            # Majority-vote each training object's label for this predicate.
            pairs = []
            oidx_votes = {}
            for pjdx, oidx, l in g.kb.pc.labels:
                if pjdx == pidx and oidx not in g.kb.pc.active_test_set:
                    if oidx not in oidx_votes:
                        oidx_votes[oidx] = []
                    oidx_votes[oidx].append(1 if l else -1)
            for oidx in oidx_votes:
                s = sum(oidx_votes[oidx])
                if s > 0:
                    pairs.append((oidx, 1, oidx_votes[oidx].count(1), oidx_votes[oidx].count(-1)))
                elif s < 0:
                    pairs.append((oidx, -1, oidx_votes[oidx].count(1), oidx_votes[oidx].count(-1)))
                # ties (s == 0) are dropped entirely
            # Emit the positive column, then the negative column.
            for label in [1, -1]:
                f.write("<td>")
                c = 0
                f.write("<table><tr>")
                for oidx, l, pos_v, neg_v in pairs:
                    if l == label:
                        f.write("<td><img width=\"200px\" height=\"200px\" " +
                                "src=\"../www/images/objects/oidx_" + str(oidx) + ".jpg\">")
                        f.write("<br/>(" + str(pos_v) + ", " + str(neg_v) + ")</td>")
                        c += 1
                        if c == w:
                            f.write("</tr><tr>")
                            c = 0
                f.write("</tr></table>")
                f.write("</td>")
            f.write("</tr>")
        f.write("</table></p>")
        f.write("<hr>")

        f.write("<p><b>Test object results</b>")
        f.write(table_format + "<tr><th>predicate</th>")
        for idx in range(len(active_test_set)):
            f.write("<th>" + str(idx + 1) + "</th>")
        f.write("</tr>")

        # Run each trained classifier on each object in the test set.
        for pidx in range(len(preds)):
            f.write("<tr><td>" + preds[pidx] + "</td>")
            if g.kb.pc.classifiers[pidx] is not None:
                oidx_pos = {}
                for oidx in active_test_set:
                    q = (preds[pidx], "oidx_" + str(oidx))
                    pos, neg = g.kb.query(q)
                    oidx_pos[oidx] = pos
                # Normalize positive responses into a distribution over test objects.
                # NOTE(review): divides by the raw sum — will raise on s == 0
                # (and is integer division in py2 if pos values are ints); confirm
                # query() returns nonzero floats.
                s = sum([oidx_pos[oidx] for oidx in oidx_pos.keys()])
                oidx_d = {oidx: oidx_pos[oidx] / s for oidx in oidx_pos.keys()}
                for oidx, pos in sorted(oidx_pos.items(), key=operator.itemgetter(1), reverse=True):
                    f.write("<td><img width=\"200px\" height=\"200px\" " +
                            "src=\"../www/images/objects/oidx_" + str(oidx) + ".jpg\"><br/>" +
                            str(pos) + "<br/>" + str(oidx_d[oidx]) + "</td>")
            else:
                for _ in range(len(active_test_set)):
                    f.write("<td> </td>")
            f.write("</tr>")
        f.write("</table></p>")
def main():
    """Run the interactive agent over keyboard, server, or robot IO.

    Three modes, chosen by FLAGS_init_phase / FLAGS_bbc_demo:
      * normal operation: run num_dialogs command dialogs with an Agent,
        persisting per-user results when a uid is given;
      * bbc_demo == 1: scripted demo that teaches a 'rattling' classifier and
        executes a single pointing command on the robot;
      * init_phase > 0: only collect command rephrasings from the user.
    """
    # Load parameters from command line.
    parser_fn = FLAGS_parser_fn
    word_embeddings_fn = FLAGS_word_embeddings_fn
    io_type = FLAGS_io_type
    grounder_fn = FLAGS_grounder_fn
    active_train_set = [
        int(oidx) for oidx in FLAGS_active_train_set.split(',')
    ] if FLAGS_active_train_set is not None else None
    # KB settings are only needed when building a grounder from scratch.
    kb_static_facts_fn = None
    kb_perception_source_dir = None
    kb_perception_feature_dir = None
    active_test_set = None
    if grounder_fn is None:
        kb_static_facts_fn = FLAGS_kb_static_facts_fn
        kb_perception_source_dir = FLAGS_kb_perception_source_dir
        kb_perception_feature_dir = FLAGS_kb_perception_feature_dir
        active_test_set = [
            int(oidx) for oidx in FLAGS_active_test_set.split(',')
        ]
    write_classifiers = FLAGS_write_classifiers
    uid = FLAGS_uid
    data_dir = FLAGS_data_dir
    client_dir = FLAGS_client_dir
    spin_time = FLAGS_spin_time
    num_dialogs = FLAGS_num_dialogs
    init_phase = FLAGS_init_phase
    max_syn_qs = FLAGS_max_syn_qs
    max_opp_qs = FLAGS_max_opp_qs
    image_path = FLAGS_image_path
    bbc_demo = FLAGS_bbc_demo
    no_clarify = FLAGS_no_clarify.split(',') if FLAGS_no_clarify is not None else None
    assert io_type == 'keyboard' or io_type == 'server' or io_type == 'robot'
    assert io_type != 'server' or (uid is not None and client_dir is not None
                                   and data_dir is not None)
    assert io_type != 'robot' or image_path is not None

    if grounder_fn is None:
        # Load the parser from file.
        print "main: loading parser from file..."
        with open(parser_fn, 'rb') as f:
            p = pickle.load(f)
        # Embeddings are not pickled with the parser; re-attach them here.
        p.lexicon.wv = p.lexicon.load_word_embeddings(word_embeddings_fn)
        print "main: ... done"

        # Create a new labels.pickle that erases the labels of the active training set for test purposes.
        full_annotation_fn = os.path.join(kb_perception_source_dir, 'full_annotations.pickle')
        if os.path.isfile(full_annotation_fn):
            print "main: creating new labels.pickle that blinds the active training set for this test..."
            with open(full_annotation_fn, 'rb') as f:
                fa = pickle.load(f)
            with open(os.path.join(kb_perception_source_dir, 'labels.pickle'), 'wb') as f:
                labels = []
                for oidx in fa:
                    if active_train_set is None or oidx not in active_train_set:
                        for pidx in range(len(fa[oidx])):
                            labels.append((pidx, oidx, fa[oidx][pidx]))
                pickle.dump(labels, f)
            print "main: ... done"

        # Instantiate a grounder.
        print "main: instantiating grounder..."
        g = KBGrounder.KBGrounder(p, kb_static_facts_fn, kb_perception_source_dir,
                                  kb_perception_feature_dir, active_test_set)
        if write_classifiers:
            print "main: and writing grounder perception classifiers to file..."
            g.kb.pc.commit_changes()  # save classifiers to disk
        print "main: ... done"
    else:
        # Load a grounder from file
        print "main: loading grounder from file..."
        with open(grounder_fn, 'rb') as f:
            g = pickle.load(f)
        print "main: ... done"
        # Grab a reference to the parser from the loaded grounder.
        p = g.parser

    # Instantiate an input/output
    print "main: instantiating IO..."
    use_shorter_utterances = False
    if io_type == 'keyboard':
        io = IO.KeyboardIO()
    elif io_type == 'server':
        io = IO.SeverIO(uid, client_dir, spin_time=spin_time)
    elif io_type == 'robot':
        # includes some hard-coded expectations like 2 tables, 8 training objects
        if len(active_train_set) == 8:
            # All are train objects
            table_oidxs = {
                1: active_train_set[0:4],
                2: active_train_set[4:8],
                3: None
            }
        else:
            # Table 1 test objects, Table 2 train objects
            table_oidxs = {1: active_test_set[:], 2: active_train_set[:]}
        rospy.init_node('phm_node')
        print "WARNING: ensure robot is facing Table 2 on startup!"
        io = IO.RobotIO(table_oidxs, 2, image_path)
        use_shorter_utterances = True
    else:
        io = None  # won't be executed due to asserts
    print "main: ... done"

    # Normal operation.
    if init_phase == 0 and bbc_demo != 1:
        # Instantiate an Agent.
        print "main: instantiating Agent..."
        a = Agent.Agent(p, g, io, active_train_set, no_clarify=no_clarify,
                        use_shorter_utterances=use_shorter_utterances,
                        word_neighbors_to_consider_as_synonyms=max_syn_qs,
                        max_perception_subdialog_qs=max_opp_qs)
        print "main: ... done"

        # Start a dialog.
        perception_labels_requested = []
        action_confirmed_per_dialog = []
        utterances_by_role_per_dialog = []
        for _ in range(num_dialogs):
            print "main: running command dialog..."
            action_confirmed, user_utterances_by_role = a.start_action_dialog(
                perception_labels_requested=perception_labels_requested)
            action_confirmed_per_dialog.append(action_confirmed)
            utterances_by_role_per_dialog.append(user_utterances_by_role)
            print "main: ... done; got action " + str(action_confirmed)

            # Write out new information gleaned from this user after every dialog.
            if uid is not None:
                # DEBUG
                print "main: writing new information from dialog(s) to file..."
                fn = os.path.join(data_dir, uid + ".pickle")
                d = [
                    action_confirmed_per_dialog, utterances_by_role_per_dialog,
                    a.new_perceptual_labels, a.perceptual_pred_synonymy
                ]
                with open(fn, 'wb') as f:
                    pickle.dump(d, f)
                print "main: ... done; wrote data d = " + str(d)

    # Ask for pointing commands.
    elif bbc_demo == 1:
        print "main: instantiating Agent..."
        a = Agent.Agent(p, g, io, active_train_set, no_clarify=no_clarify,
                        use_shorter_utterances=use_shorter_utterances,
                        word_neighbors_to_consider_as_synonyms=max_syn_qs,
                        max_perception_subdialog_qs=max_opp_qs)
        print "main: ... done"

        # Hand-teach the 'rattling' predicate: lexical entries plus a classifier
        # trained from a fixed set of object labels.
        print "main: updating lexicon with 'rattling'"
        a.add_new_perceptual_lexical_entries('rattling', False, None)
        a.parser.type_raise_bare_nouns()
        a.parser.theta.update_probabilities()
        print "main: ... done"
        print "main: training 'rattling' classifier"
        g.kb.pc.update_classifiers(['rattling'], [], [], [])
        perception_pidx = g.kb.pc.predicates.index('rattling')
        upidxs = [perception_pidx] * 8
        uoidxs = [5, 14, 4, 27, 0, 30, 1, 31]  # hard-coded demo object ids
        ulabels = [0, 1, 0, 0, 0, 0, 1, 1]  # hard-coded labels for those objects
        g.kb.pc.update_classifiers([], upidxs, uoidxs, ulabels)
        print "main: ... done"

        print "main: starting bbc phase dialog..."
        io.say_to_user("What should I do?")
        cmd = io.get_from_user()
        # NOTE(review): `p` (the parser) and, below, `g` (the grounder) are
        # shadowed by the token list and grounding results here — confirm
        # neither is needed again afterwards.
        p = cmd.split()
        # NOTE(review): `p[1] == 'points'` looks like it was meant to be
        # `p[0] == 'points'` — confirm intended command forms.
        if (p[0] == 'point' or p[1] == 'points') and (p[1] == 'to' or p[1] == '2'):
            gps, _ = a.parse_and_ground_utterance(' '.join(p[2:]))
            for g, conf in gps:
                selected_oidx = a.parser.ontology.preds[g.idx]
                print selected_oidx, conf  # DEBUG
            # Take the last grounding tied with the top confidence.
            g, top_conf = gps[0]
            for g, conf in gps:
                if conf == top_conf:
                    selected_oidx = a.parser.ontology.preds[g.idx]
            oidx = int(selected_oidx.split('_')[1])  # e.g. 'oidx_1' -> 1
            # Find which table holds the selected object, face it, and point.
            ttid = None
            for tid in a.io.table_oidxs:
                if a.io.table_oidxs[tid] is not None and oidx in a.io.table_oidxs[tid]:
                    ttid = tid
            if ttid is not None:
                a.io.face_table(ttid)
                a.io.point(a.io.table_oidxs[ttid].index(oidx))
                a.io.point(-1)  # retract arm
        print "main: ... done"

    # Just ask the user for a few rephrases of the command.
    else:
        print "main: starting init phase dialog..."
        for nd in range(num_dialogs):
            io.say_to_user("What should I do?")
            _ = io.get_from_user()
            for ip in range(init_phase - 1):
                io.say_to_user("What's another way you could phrase that command?")
                _ = io.get_from_user()
            io.perform_action({'action': 'init_phase'})
        print "main: ... done"
def main():
    """Retrain parser and perceptual classifiers from aggregated user conversations.

    Pipeline: load aggregated dialog pickles -> load base parser and base KB
    labels -> vote on new perceptual predicates and synonymy -> add lexical
    entries -> retrain classifiers -> induce utterance/grounding pairs ->
    alternate grounding->semantics and semantics->parser training for
    FLAGS_epochs epochs, pickling the parser each epoch. Returns 1 on a
    missing base-KB file, otherwise None.
    """
    # Load parameters from command line.
    agg_fns = FLAGS_agg_fns.split(',')
    parser_fn = FLAGS_parser_fn
    embeddings_fn = FLAGS_embeddings_fn
    parser_outfile = FLAGS_parser_outfile
    parser_base_pairs_fn = FLAGS_parser_base_pairs_fn
    only_use_base_pairs = True if FLAGS_only_use_base_pairs == 1 else False
    kb_static_facts_fn = FLAGS_kb_static_facts_fn
    kb_perception_feature_dir = FLAGS_kb_perception_feature_dir
    kb_perception_source_base_dir = FLAGS_kb_perception_source_base_dir
    kb_perception_source_target_dir = FLAGS_kb_perception_source_target_dir
    active_test_set = [int(oidx) for oidx in FLAGS_active_test_set.split(',')]
    only_bare_nouns = True if FLAGS_only_bare_nouns == 1 else False
    training_log_fn = FLAGS_training_log_fn
    full_pairs_log_fn = FLAGS_full_pairs_log_fn
    epochs = FLAGS_epochs
    use_condor = FLAGS_use_condor
    condor_target_dir = FLAGS_condor_target_dir
    condor_parser_script_dir = FLAGS_condor_parser_script_dir
    condor_grounder_script_dir = FLAGS_condor_grounder_script_dir
    assert not use_condor or (condor_target_dir is not None
                              and condor_parser_script_dir is not None
                              and condor_grounder_script_dir is not None)

    # Load the aggregate information from file
    print "main: loading aggregate conversation files..."
    agg_all_utterances = []
    agg_role_utterances_role_chosen_pairs = []
    agg_perceptual_labels = []
    agg_perceptual_synonymy = []
    for agg_fn in agg_fns:
        print "main: ... loading from '" + agg_fn + "'"
        with open(agg_fn, 'rb') as f:
            _agg_all_utterances, _agg_role_utterances_role_chosen_pairs, _agg_perceptual_labels,\
                _agg_perceptual_synonymy = pickle.load(f)
            agg_all_utterances.extend(_agg_all_utterances)
            agg_role_utterances_role_chosen_pairs.extend(
                _agg_role_utterances_role_chosen_pairs)
            agg_perceptual_labels.extend(_agg_perceptual_labels)
            agg_perceptual_synonymy.extend(_agg_perceptual_synonymy)
    print "... done"

    # Load a grounder from file
    print "main: loading base parser from file..."
    with open(parser_fn, 'rb') as f:
        p = pickle.load(f)
    p.lexicon.wv = None
    if embeddings_fn is not None:
        print "main: ... adding embeddings"
        p.lexicon.wv = p.lexicon.load_word_embeddings(embeddings_fn)
    print "main: ... done"

    # Load parser base pairs, if any.
    print "main: loading base parser pairs from file..."
    if parser_base_pairs_fn is not None:
        parser_base_pairs = p.read_in_paired_utterance_semantics(
            parser_base_pairs_fn)
    else:
        parser_base_pairs = []
    print "main: ... done"

    # Copy the base grounder labels.pickle and predicates.pickle into the target directory.
    print "main: copying base KB perception labels and pickles to target dir..."
    base_labels_fn = os.path.join(kb_perception_source_base_dir, "labels.pickle")
    base_pickles_fn = os.path.join(kb_perception_source_base_dir, "predicates.pickle")
    if os.path.isfile(base_labels_fn):
        os.system(
            "cp " + base_labels_fn + " " +
            os.path.join(kb_perception_source_target_dir, "labels.pickle"))
    else:
        print "ERROR: file not found '" + base_labels_fn + "'"
        return 1
    if os.path.isfile(base_pickles_fn):
        os.system(
            "cp " + base_pickles_fn + " " +
            os.path.join(kb_perception_source_target_dir, "predicates.pickle"))
    else:
        print "ERROR: file not found '" + base_pickles_fn + "'"
        return 1
    print "main: ... done"

    # Instantiate a new grounder with the base parser and with perception source at the target dir.
    print "main: instantiating grounder..."
    g = KBGrounder.KBGrounder(p, kb_static_facts_fn, kb_perception_source_target_dir,
                              kb_perception_feature_dir, active_test_set)
    print "main: ... done"

    # Instantiate vestigial input/output
    print "main: instantiating basic IO..."
    io = IO.KeyboardIO()
    print "main: ... done"

    # Instantiate an Agent.
    print "main: instantiating Agent..."
    a = Agent.Agent(p, g, io, None)
    print "main: ... done"

    # Open logfile.
    log_f = open(training_log_fn, 'w')

    # Look through aggregated labels to identify good perceptual candidates.
    # preds_by_label[pred][oidx] holds a net vote (+1 per positive label, -1 per negative).
    preds_by_label = {}
    for pred, oidx, l in agg_perceptual_labels:
        if pred not in preds_by_label:
            preds_by_label[pred] = {}
        if oidx not in preds_by_label[pred]:
            preds_by_label[pred][oidx] = 0
        preds_by_label[pred][oidx] += 1 if l else -1
    # print "main: preds_by_label: " + str(preds_by_label)
    preds_by_oidx_label = {}
    for pred in preds_by_label:
        preds_by_oidx_label[pred] = {True: [], False: []}
        for oidx in preds_by_label[pred]:
            if preds_by_label[pred][oidx] > 0:
                preds_by_oidx_label[pred][True].append(oidx)
            elif preds_by_label[pred][oidx] < 0:
                preds_by_oidx_label[pred][False].append(oidx)
    # print "main: preds_by_oidx_label: " + str(preds_by_oidx_label)
    # Predicates with at least one net-positive object are classifier candidates.
    preds_w_pos = [
        pred for pred in preds_by_oidx_label
        if len(preds_by_oidx_label[pred][True]) > 0
    ]
    print "main: preds_w_pos: " + str(preds_w_pos)

    # Analyze synonymy votes and decide which pairs to treat as synonymous.
    synonymy_votes = {
    }  # maps from tuples of preds to the sum of votes for and against their being synonymous
    for predi, predj, v in agg_perceptual_synonymy:
        if (predi, predj) in synonymy_votes:
            key = (predi, predj)
        elif (predj, predi) in synonymy_votes:
            key = (predj, predi)
        else:
            key = (predi, predj)
            synonymy_votes[key] = 0
        synonymy_votes[key] += 1 if v else -1
    print "main: synonymy votes: " + str(synonymy_votes)
    synonymy_candidates = {
        key: synonymy_votes[key]
        for key in synonymy_votes.keys() if synonymy_votes[key] > 0
    }
    print "main: synonymy candidates: " + str(synonymy_candidates)

    # Decide based on synonymy and pred labels which lexicon entries to add (similar to procedure in Agent.py,
    # but based on voting instead of single-user feedback.)
    all_preds = list(
        set([pred for pred, _, _ in agg_perceptual_labels] +
            [pred for pred, _ in synonymy_votes.keys()] +
            [pred for _, pred in synonymy_votes.keys()]))
    # NOTE(review): the `> 0` below applies to `(pred in preds_w_pos or len([...]))`,
    # not to the len() alone — works in py2 (True > 0) but looks unintended; confirm.
    preds = [
        pred for pred in all_preds
        if pred not in a.parser.lexicon.surface_forms and
        (pred in preds_w_pos or len([
            synp for synp in preds_w_pos
            if (pred, synp) in synonymy_candidates or
            (synp, pred) in synonymy_candidates
        ])) > 0
    ]
    print "main: preds to consider: " + str(preds)

    # Collect the utterances each candidate predicate appears in.
    utterances_with_pred = {}
    for pred in all_preds:
        utterances_with_pred[pred] = []
        for u in agg_all_utterances:
            if pred in a.parser.tokenize(u):
                utterances_with_pred[pred].append(u)
    # print "main: utterances with preds: " + str(utterances_with_pred)

    # Iterate over predicates to identify likely adjectives (those left of other already-known predicates).
    # This process should repeat until no further adjectives are found (allowing chaining unseen adjs).
    # Afterwards, any predicate not flagged as an adjective is probably a noun (no percept neighbors to the right).
    pred_is_perc = {pred: False for pred in preds}
    new_perceptual_adds = True
    known_perc_preds = [
        tk for tk in a.parser.lexicon.surface_forms if a.is_token_perceptual(tk)
    ]
    while new_perceptual_adds:
        new_perceptual_adds = False
        print "main: checking for new adjectives and nouns..."
        for pred in preds:
            if not pred_is_perc[pred] and len(utterances_with_pred[pred]) > 0:
                syn = get_syn_from_candidates(a, pred, synonymy_candidates)
                if only_bare_nouns:
                    # Add bare nouns, later type-raise.
                    a.add_new_perceptual_lexical_entries(pred, False, syn)
                    print "main: added noun for '" + pred + "'"
                    if syn is not None:
                        print "main: ... with known synonym '" + a.parser.lexicon.surface_forms[
                            syn[0]] + "'"
                    log_f.write("added noun entry for '" + pred + "' with synonym " +
                                str(a.parser.lexicon.surface_forms[syn[0]]
                                    if syn is not None else None) + "\n")
                # Turkers tend to use malformed language, so add all new preds as both adjectives and nouns.
                else:
                    # NOTE(review): `if True:` permanently disables the positional
                    # adjective/noun heuristic in the else branch below — confirm.
                    if True:
                        pred_is_perc[pred] = True
                        new_perceptual_adds = True
                        ont_pred = a.add_new_perceptual_lexical_entries(
                            pred, True, syn)
                        a.add_new_perceptual_lexical_entries(
                            pred, False, syn, ont_pred)
                        print "main: added noun and adjective for '" + pred + "'"
                        if syn is not None:
                            print "main: ... with known synonym '" + a.parser.lexicon.surface_forms[
                                syn[0]] + "'"
                        log_f.write("added adjective and noun entry for '" + pred +
                                    "' with synonym " +
                                    str(a.parser.lexicon.surface_forms[syn[0]]
                                        if syn is not None else None) + "\n")
                    # Determine whether each predicate is mostly behaving like a noun or adjective before adding.
                    else:
                        # Just count how often a pred is 'acting' like an adjective or noun based on position.
                        la = ln = 0
                        for u in utterances_with_pred[pred]:
                            tks = a.parser.tokenize(u)
                            tkidx = tks.index(pred)
                            if tkidx < len(tks) - 1 and (
                                    tks[tkidx + 1] in known_perc_preds or
                                    tks[tkidx + 1] in all_preds or
                                    tks[tkidx + 1] not in a.parser.lexicon.surface_forms):
                                la += 1
                            elif tkidx == len(tks) - 1 or tks[
                                    tkidx + 1] in a.parser.lexicon.surface_forms:
                                ln += 1
                        la /= float(len(utterances_with_pred[pred]))
                        ln /= float(len(utterances_with_pred[pred]))
                        if la > 0.5:
                            pred_is_perc[pred] = True
                            new_perceptual_adds = True
                            a.add_new_perceptual_lexical_entries(pred, True, syn)
                            print "main: added adjective '" + pred + "'"
                            if syn is not None:
                                print "main: ... with known synonym '" + a.parser.lexicon.surface_forms[
                                    syn[0]] + "'"
                            log_f.write("added adjective '" + pred + "' with synonym " +
                                        str(syn) + "\n")
                        elif ln > 0.5:
                            pred_is_perc[pred] = True
                            new_perceptual_adds = True
                            a.add_new_perceptual_lexical_entries(pred, False, syn)
                            print "main: added noun '" + pred + "'"
                            if syn is not None:
                                print "main: ... with known synonym '" + a.parser.lexicon.surface_forms[
                                    syn[0]] + "'"
                            log_f.write("added noun '" + pred + "' with synonym " +
                                        str(syn) + "\n")
    if only_bare_nouns:
        a.parser.type_raise_bare_nouns()  # should only affect new nouns
        a.parser.theta.update_probabilities()  # because the above adds new entries
    print "main: ... done"

    # Retrain perceptual classifiers from aggregated labels.
    upidxs = []
    uoidxs = []
    ulabels = []
    for pred, oidx, label in agg_perceptual_labels:
        if pred in a.grounder.kb.pc.predicates:
            pidx = a.grounder.kb.pc.predicates.index(pred)
            upidxs.append(pidx)
            uoidxs.append(oidx)
            ulabels.append(1 if label else 0)
    print("main: updating predicate classifiers with " + str(len(upidxs)) +
          " new labels across " + str(len(set(upidxs))) + " predicates...")
    a.grounder.kb.pc.update_classifiers([], upidxs, uoidxs, ulabels)
    log_f.write("updated classifiers with " + str(len(upidxs)) +
                " new labels across " + str(len(set(upidxs))) + " predicates...\n")
    print "main: ... done"

    # Write new classifiers to file.
    print "main: committing grouder classifiers to file..."
    g.kb.pc.commit_changes()  # save classifiers to disk
    print "main: ... done"

    # Induce pairs from agg data.
    print "main: ... creating induced pairs from aggregated conversations..."
    for action_confirmed, user_utterances_by_role in agg_role_utterances_role_chosen_pairs:
        new_i_pairs = a.induce_utterance_grounding_pairs_from_conversation(
            user_utterances_by_role, action_confirmed)
        a.induced_utterance_grounding_pairs.extend(new_i_pairs)
    print "main: ...... done; induced " + str(
        len(a.induced_utterance_grounding_pairs)) + " pairs"
    log_f.write("induced " + str(len(a.induced_utterance_grounding_pairs)) +
                " utterance/grounding pairs\n")

    # DEBUG - write the Agent out to file for use by other scripts
    with open("agent.temp.pickle", 'wb') as f:
        pickle.dump(a, f)
    # END DEBUG

    # Iterate inducing new pairs using most up-to-date parser and training for single epoch.
    # Each of these stages can be distributed over the UT Condor system for more linear-time computation.
    print "main: training parser by alternative grounding->semantics and semantics->parser training steps..."
    fplfn = open(full_pairs_log_fn, 'w')
    for epoch in range(epochs):
        # Get grounding->semantics pairs
        if not only_use_base_pairs:
            print "main: ... getting utterance/semantic form pairs from induced utterance/grounding pairs..."
            utterance_semantic_grounding_triples = a.get_semantic_forms_for_induced_pairs(
                1, 10, verbose=1, use_condor=use_condor,
                condor_target_dir=condor_target_dir,
                condor_script_dir=condor_grounder_script_dir)
            print("main: ...... got " +
                  str(len(utterance_semantic_grounding_triples)) +
                  " utterance/semantics " +
                  "pairs from induced utterance/grounding pairs")
            log_f.write("epoch " + str(epoch) + ": got " +
                        str(len(utterance_semantic_grounding_triples)) +
                        " utterance/semantic pairs\n")
            # Write out induced pairs to logfile(s) for later inspection and qualitative analysis.
            fplfn.write("epoch " + str(epoch) + ":\n\n" + '\n\n'.join([
                '\n'.join([
                    x, a.parser.print_parse(y, True),
                    a.parser.print_parse(z, False)
                ]) for x, y, z in utterance_semantic_grounding_triples
            ]) + '\n\n')
        else:
            utterance_semantic_grounding_triples = []

        # Write the new parser to file.
        print "main: writing current re-trained parser to file..."
        with open(parser_outfile + "." + str(epoch), 'wb') as f:
            pickle.dump(p, f)
        print "main: ... done"

        # Train parser on utterances->semantics pairs
        print "main: ... re-training parser on pairs induced from aggregated conversations..."
        utterance_semantic_pairs = [
            [x, y] for x, y, _ in utterance_semantic_grounding_triples
        ]
        perf = []  # filled in by train_learner_on_semantic_forms via perf_log
        a.parser.train_learner_on_semantic_forms(
            parser_base_pairs + utterance_semantic_pairs, epochs=1,
            epoch_offset=epoch, reranker_beam=1, verbose=2,
            use_condor=use_condor, condor_target_dir=condor_target_dir,
            condor_script_dir=condor_parser_script_dir, perf_log=perf)
        log_f.write(
            "epoch " + str(epoch) + ": parser trained on " + str(perf[0][0]) +
            " examples and " + "failed on " + str(perf[0][1]) + " out of " +
            str(len(parser_base_pairs) + len(utterance_semantic_pairs)) + "\n")

    # Write the final parser to file.
    print "main: writing current re-trained parser to file..."
    with open(parser_outfile + ".final", 'wb') as f:
        pickle.dump(p, f)
    print "main: ... done"
    fplfn.close()
    print "main: ... done"

    # Close logfile.
    log_f.close()