def main(argv):
    """Entry point for the simple SLG baseline.

    Trains ``SimpleSLG`` on the training set, generates an utterance for
    every target-role utterance in the test set, and dumps the results as
    JSON to ``--outfile``.
    """
    parser = argparse.ArgumentParser(description='Simple SLG baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True,
                        help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/...')
    parser.add_argument('--outfile', dest='outfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with SAP output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['GUIDE', 'TOURIST'], required=True,
                        help='Target role')
    args = parser.parse_args()

    sap = SimpleSLG()

    # One training instance per utterance spoken by the target role.
    trainset = dataset_walker.dataset_walker(
        args.trainset, dataroot=args.dataroot, labels=True,
        translations=True, task='SLG', roletype=args.roletype.lower())
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance = {'semantic_tags': log_utter['semantic_tags'],
                            'speech_act': log_utter['speech_act']}
                sap.add_instance(instance, translations)
    sap.train()
    sys.stderr.write('Done\n')

    # NOTE(review): task_type is 'SAP' although this script is the SLG
    # baseline (the dataset walkers use task='SLG') -- confirm downstream
    # consumers expect this value before changing it.
    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SAP'
    output['role_type'] = args.roletype

    start_time = time.time()
    testset = dataset_walker.dataset_walker(
        args.testset, dataroot=args.dataroot, labels=False,
        translations=True, task='SLG', roletype=args.roletype.lower())
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        this_session = {"session_id": call.log["session_id"],
                        "utterances": []}
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance = {'semantic_tags': log_utter['semantic_tags'],
                            'speech_act': log_utter['speech_act']}
                slg_result = {'utter_index': log_utter['utter_index'],
                              'generated': sap.generate(instance)}
                this_session['utterances'].append(slg_result)
        output['sessions'].append(this_session)
    sys.stderr.write('Done\n')

    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time

    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)
    sys.stderr.write('Done\n')
def main(argv):
    """Run the msiip_nsvc_tracker over a dataset and write tracker output.

    Reads thresholds and modes from the command line, configures logging
    from the shared config file, tracks every session, and dumps the
    result as JSON to --trackfile.
    """
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--model_dir', dest='model_dir', action='store', required=True, metavar='PATH',
                        help='model dir')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    # Fix: help text said "ration threshold".
    parser.add_argument('--ratio_thres', dest='ratio_thres', type=float, action='store', default=0.8,
                        help='ratio threshold')
    parser.add_argument('--value_prob', dest='value_prob', type=float, action='store', default=0.8,
                        help='output value prob threshold')
    parser.add_argument('--slot_prob', dest='slot_prob', type=float, action='store', default=0.6,
                        help='output slot prob threshold')
    parser.add_argument('--STCMode', dest='STCMode', action='store', default='hr',
                        help='STC mode, high precision or high recall')
    parser.add_argument('--BSMode', dest='BSMode', action='store', default='enhance',
                        help='Belief State mode: max, average or enhance')
    parser.add_argument('--BSAlpha', dest='BSAlpha', type=float, action='store', default=0.0,
                        help='Belief State average history alpha')
    args = parser.parse_args()

    # Read the shared configuration file.
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__), '../config/msiip_simple.cfg')])

    # Set up logging: one dated log file per run script.
    log_level_key = config.get('logging', 'level')
    run_code_name = os.path.basename(sys.argv[0])[0:-3]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__), '../../output/logs',
            '%s_%s.log' % (run_code_name,
                           time.strftime('%Y-%m-%d', time.localtime(time.time())))),
        level=GetLogLevel(log_level_key),
        format='%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    tracker = msiip_nsvc_tracker(tagsets, args.model_dir,
                                 ratio_thres=args.ratio_thres,
                                 slot_prob_thres=args.slot_prob,
                                 value_prob_thres=args.value_prob,
                                 mode=args.STCMode,
                                 bs_mode=args.BSMode,
                                 bs_alpha=args.BSAlpha)
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, _) in call:
            sys.stderr.write('%d:%d\n' % (call.log['session_id'], utter['utter_index']))
            tracker_result = tracker.addUtter(utter)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
        track["sessions"].append(this_session)

    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time

    # 'with' guarantees the file is closed even if json.dump raises.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
def main(): parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) track_file = open(args.trackfile, "wb") track = {"sessions":[]} track["dataset"] = args.dataset start_time = time.time() tracker = Tracker() for call in dataset : this_session = {"session-id":call.log["session-id"], "turns":[]} tracker.reset() for turn, _ in call : tracker_turn = tracker.addTurn(turn) this_session["turns"].append(tracker_turn) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall-time"] = elapsed_time json.dump(track, track_file,indent=4)
def main(argv): parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file') args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=False) tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets() track_file = open(args.trackfile, "wb") track = {"sessions":[]} track["dataset"] = args.dataset start_time = time.time() tracker = BaselineTracker(tagsets) for call in dataset: this_session = {"session_id":call.log["session_id"], "utterances":[]} tracker.reset() for (utter,_) in call: sys.stderr.write('%d:%d\n'%(call.log['session_id'], utter['utter_index'])) tracker_result = tracker.addUtter(utter) if tracker_result is not None: this_session["utterances"].append(copy.deepcopy(tracker_result)) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track['wall_time'] = elapsed_time json.dump(track, track_file, indent=4) track_file.close()
def main(argv): install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) utils_dirname = os.path.join(install_path,'lib') sys.path.append(utils_dirname) from dataset_walker import dataset_walker parser = argparse.ArgumentParser(description='Check the validity of a tracker output object.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True, help='File containing score JSON') parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True, help='JSON Ontology file') args = parser.parse_args() sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=False) tracker_output = json.load(open(args.scorefile)) tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets() checker = TrackChecker(sessions, tracker_output, tagsets) checker.check() checker.print_errors()
def main(argv): parser = argparse.ArgumentParser(description="find slot property.") parser.add_argument( "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze" ) parser.add_argument( "--dataroot", dest="dataroot", action="store", required=True, metavar="PATH", help="Will look for corpus in <destroot>/<dataset>/...", ) parser.add_argument("output", help="output slot property file") args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True) slot_count_dic = find_slot_property(dataset) output = codecs.open(args.output, "w", "utf-8") sorted_slot_count = sorted(slot_count_dic.items(), key=lambda x: x[1][1], reverse=True) for slot, count in sorted_slot_count: print >> output, "%s\t%d\t%d" % (slot, count[0], count[1]) output.close()
def main(argv):
    """Run the MSIIP ensemble tracker over existing per-tracker log files.

    Combines the trackers listed in --config (found under --LogBaseDir)
    and writes the ensembled track as JSON to --trackfile.
    """
    parser = argparse.ArgumentParser(description='MSIIP ensemble tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--LogBaseDir', dest='LogBaseDir', action='store', required=True,
                        help='The base directory that contains the log files')
    parser.add_argument('--config', dest='config', action='store', required=True,
                        help='Config file, indicate log files and weight')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    parser.add_argument('--value_prob', dest='value_prob', type=float, action='store', default=0.4,
                        help='output value prob threshold')
    parser.add_argument('--slot_prob', dest='slot_prob', type=float, action='store', default=0.5,
                        help='output slot prob threshold')
    args = parser.parse_args()

    # Read the shared configuration file.
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__), '../config/msiip_simple.cfg')])

    # Set up logging: one dated log file per run script.
    log_level_key = config.get('logging', 'level')
    run_code_name = os.path.basename(sys.argv[0])[0:-3]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__), '../../output/logs',
            '%s_%s.log' % (run_code_name,
                           time.strftime('%Y-%m-%d', time.localtime(time.time())))),
        level=GetLogLevel(log_level_key),
        format='%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    tracker = msiip_ensemble_tracker(tagsets, dataset, args.LogBaseDir, args.config,
                                     args.slot_prob, args.value_prob)
    track = tracker.ensemble()

    # 'with' guarantees the file is closed even if json.dump raises.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
def main(argv): # 读取配置文件 InitConfig() config = GetConfig() config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')]) # 设置logging log_level_key = config.get('logging','level') run_code_name = os.path.basename(sys.argv[0])[0:-3] ''' logging.basicConfig(filename = os.path.join(os.path.dirname(__file__), '../../output/logs', '%s_%s.log' %(run_code_name,time.strftime('%Y-%m-%d',time.localtime(time.time())))), \ level = GetLogLevel(log_level_key), format = '%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s') ''' parser = argparse.ArgumentParser(description='find stop words.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('stop_words_file', help='output stop words file') args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True) stop_words_count = find_stop_words(dataset) output = codecs.open(args.stop_words_file, 'w', 'utf-8') sorted_stop_words = sorted(stop_words_count.items(), key=lambda x:x[1], reverse=True) for word,count in sorted_stop_words: print >>output, '%s\t%d' %(word, count) output.close()
def main(argv): parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.") parser.add_argument( "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze" ) parser.add_argument( "--dataroot", dest="dataroot", action="store", required=True, metavar="PATH", help="Will look for corpus in <destroot>/<dataset>/...", ) args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False) start_time = time.time() tracker = conversation_extractor() for call in dataset: this_session = {"session_id": call.log["session_id"], "utterances": []} tracker.reset() for (utter, _) in call: sys.stderr.write("%d:%d\n" % (call.log["session_id"], utter["utter_index"])) tracker.addUtter(utter) end_time = time.time() elapsed_time = end_time - start_time
def main(argv): parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file') args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True) tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets() track_file = open(args.trackfile, "wb") out_json = {'dataset':args.dataset, 'utterances':[]} extractor = sub_utters_extractor() for call in dataset: for (log_utter, label_utter) in call: sys.stderr.write('%d:%d\n'%(call.log['session_id'], log_utter['utter_index'])) print '%d:%d'%(call.log['session_id'], log_utter['utter_index']) (transcript, sub_utters_list, sub_tag_list, speech_acts) = extractor.addUtter(log_utter,label_utter) if transcript: item = {} item['transcript'] = transcript item['sub_utters_list'] = sub_utters_list item['sub_tag_list'] = sub_tag_list item['speech_acts'] = speech_acts out_json['utterances'].append(item) json.dump(out_json, track_file, indent=4)
def main(argv):
    """Run the association-rule tracker over a dataset and write JSON output."""
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--ar', dest='association_rules', action='store', required=True, metavar='JSON_FILE',
                        help='association_rules')
    parser.add_argument('--stm', dest='semantic_tagger_model', action='store', required=True,
                        help='semantic_tagger_model')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    parser.add_argument('--pt', dest='prob_threshold', type=float, action='store', default=0.8,
                        help='prob_threshold')
    # Fix: help text said "exact mode of fuzz mode".
    parser.add_argument('--exact', dest='exact', action='store_true',
                        help='exact mode or fuzzy mode')
    args = parser.parse_args()

    # Read the shared configuration file.
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__), '../config/msiip_simple.cfg')])

    # Set up logging: one dated log file per run script.
    log_level_key = config.get('logging', 'level')
    run_code_name = os.path.basename(sys.argv[0])[0:-3]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__), '../../output/logs',
            '%s_%s.log' % (run_code_name,
                           time.strftime('%Y-%m-%d', time.localtime(time.time())))),
        level=GetLogLevel(log_level_key),
        format='%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    if args.exact:
        mode = 'exact'
    else:
        mode = 'fuzzy'
    tracker = association_rule_tracker(tagsets, args.association_rules,
                                       args.semantic_tagger_model,
                                       args.prob_threshold, mode)
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, _) in call:
            sys.stderr.write('%d:%d\n' % (call.log['session_id'], utter['utter_index']))
            tracker_result = tracker.addUtter(utter)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
        track["sessions"].append(this_session)

    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time

    # 'with' guarantees the file is closed even if json.dump raises.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
def main(): parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--labelfile',dest='labelfile',action='store',required=True,metavar='TXT', help='File with 2-way prediction results') #parser.add_argument('--methodfile',dest='methodfile',action='store',required=False,metavar='TXT', # help='File with method prediction results') parser.add_argument('--topK',dest='topK',action='store',type=int, help='get topK accuracy') args = parser.parse_args() global topK topK = args.topK head, body = fio.readMatrix(args.labelfile, True) rank_index = head.index('rank_H3') labels = [item[rank_index] for item in body] #head, body = fio.readMatrix(args.labelfile, True) #labels = [item[0] for item in body] dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) track_file = open(args.trackfile, "wb") track = {"sessions":[]} track["dataset"] = args.dataset start_time = time.time() turn_count = -1 tracker = Tracker() for call in dataset : this_session = {"session-id":call.log["session-id"], "turns":[]} tracker.reset() for turn, _ in call : turn_count = turn_count + 1 rank = labels[turn_count] tracker_turn = tracker.addTurn(turn, rank) this_session["turns"].append(tracker_turn) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall-time"] = elapsed_time json.dump(track, track_file,indent=4)
def main(argv): print argv parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.") parser.add_argument( "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze" ) parser.add_argument( "--dataroot", dest="dataroot", action="store", required=True, metavar="PATH", help="Will look for corpus in <destroot>/<dataset>/...", ) parser.add_argument( "--trackfile", dest="trackfile", action="store", required=True, metavar="JSON_FILE", help="File to write with tracker output", ) parser.add_argument( "--ontology", dest="ontology", action="store", metavar="JSON_FILE", required=True, help="JSON Ontology file" ) # args = parser.parse_args() args = parser.parse_args(argv) dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False) tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets() track_file = open(args.trackfile, "wb") track = {"sessions": []} track["dataset"] = args.dataset start_time = time.time() tracker = NaiveEnsembleBasedTrackerWithNBest(tagsets, nameOfODictPickle="dictOutput.pic") for call in dataset: this_session = {"session_id": call.log["session_id"], "utterances": []} tracker.reset() for (utter, _) in call: sys.stderr.write("%d:%d\n" % (call.log["session_id"], utter["utter_index"])) tracker_result = tracker.addUtter(utter, call) if tracker_result is not None: this_session["utterances"].append(tracker_result) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall_time"] = elapsed_time json.dump(track, track_file, indent=4) track_file.close()
def main(argv):
    """Extract topic sub-segments from a labelled dataset.

    A sub-segment is flushed whenever a new segment starts (target_bio ==
    'B') and once more at the end of each session.  Writes the collected
    sub-segments to --trackfile and prints per-topic slot counts.
    """
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    args = parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    # Global id for sub-segments, and per-topic counters of how often each
    # slot appears in a frame label.
    sub_seg_counter = 0
    topic_slot_counter = {}
    for topic in tagsets:
        topic_slot_counter[topic] = defaultdict(int)

    extractor = sub_segment_extractor()
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "sub_segments": []}
        extractor.reset()
        for (log_utter, label_utter) in call:
            sys.stderr.write('%d:%d\n' % (call.log['session_id'], log_utter['utter_index']))
            # A 'B' tag starts a new segment: flush the one accumulated so far.
            if log_utter['segment_info']['target_bio'] == 'B':
                if not extractor.is_empty:
                    sub_segment = extractor.state
                    sub_segment['id'] = sub_seg_counter
                    sub_seg_counter += 1
                    this_session['sub_segments'].append(sub_segment)
                    for slot in sub_segment['frame_label']:
                        topic_slot_counter[sub_segment['topic']][slot] += 1
            extractor.addUtter(log_utter, label_utter)
        # Flush the trailing segment of the session (no slot counting here,
        # mirroring the original behaviour).
        if not extractor.is_empty:
            sub_segment = extractor.state
            sub_segment['id'] = sub_seg_counter
            sub_seg_counter += 1
            this_session['sub_segments'].append(sub_segment)
        track["sessions"].append(this_session)

    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time

    # Fix: use 'with' so the file is closed even if json.dump raises.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
    print(json.dumps(topic_slot_counter, indent=4))
def main(): parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.") parser.add_argument( "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze" ) parser.add_argument( "--dataroot", dest="dataroot", action="store", required=True, metavar="PATH", help="Will look for corpus in <destroot>/<dataset>/...", ) parser.add_argument( "--trackfile", dest="trackfile", action="store", required=True, metavar="JSON_FILE", help="File to write with tracker output", ) parser.add_argument( "--focus", dest="focus", action="store", nargs="?", default="False", const="True", help="Use focus node tracker" ) args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) track_file = open(args.trackfile, "wb") track = {"sessions": []} track["dataset"] = args.dataset start_time = time.time() if args.focus.lower() == "true": tracker = FocusTracker() elif args.focus.lower() == "false": tracker = Tracker() else: raise RuntimeError, "Dont recognize focus=%s (must be True or False)" % (args.focus) for call in dataset: this_session = {"session-id": call.log["session-id"], "turns": []} tracker.reset() for turn, _ in call: tracker_turn = tracker.addTurn(turn) this_session["turns"].append(tracker_turn) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall-time"] = elapsed_time json.dump(track, track_file, indent=4)
def main() : parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--focus',dest='focus',action='store',nargs='?',default="False",const="True", help='Use focus node tracker') args = parser.parse_args() #dataset文件中有多少对话,dataset就有多少对话,dataset是一个dataset_walker类的对象 dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) track_file = open(args.trackfile, "wb") track = {"sessions":[]} track["dataset"] = args.dataset start_time = time.time() if args.focus.lower() == "true": tracker = FocusTracker() elif args.focus.lower() == "false": tracker = Tracker() else: raise RuntimeError,'Dont recognize focus=%s (must be True or False)' % (args.focus) for call in dataset : #把dataset中的对话一个个拿出来按照话轮处理 this_session = {"session-id":call.log["session-id"], "turns":[]} #tracker的reset操作:self.hyps = {"goal-labels":{},"method-label":{}, "requested-slots":{}} tracker.reset() #对每一个对话按照其中的话轮开始跟中用户的对话目的 #跟踪对话状态:dialog state 或者说是 dialog hypthesis for turn, _ in call : #所以关于对于对话状态跟踪都在addTurn函数里面处理 #最核心的代码,核心算法部分。 #turn是个字典,键值有3种,output,turn-index,input #################################### tracker_turn = tracker.addTurn(turn) #################################### this_session["turns"].append(tracker_turn) #把当前对话所有话轮的用户目的的跟中结果放在this_session,添加到最终的结果集合track中。 track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall-time"] = elapsed_time json.dump(track, track_file,indent=4)
def main() : print_gplv3() parser = argparse.ArgumentParser(description='HWU Rule-based Dialog State Tracker Baseline V2.0\n by Zhuoran Wang\t [email protected]\n This version extends the work in (Wang & Lemon, SigDial 2013).',\ formatter_class=RawTextHelpFormatter) parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True, help='The ontology to use') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--original',dest='original',action='store',required=False,metavar='TRUE/FALSE', help='Use the original version presented in (Wang & Lemon, SigDial 2013)') args = parser.parse_args() dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) original = False if args.original and args.original.lower() == "true" : original = True load_ontology(args.ontology) track_file = open(args.trackfile, "wb") track = {"sessions":[]} track["dataset"] = args.dataset start_time = time.time() tracker = HWU_Tracker() for call in dataset : this_session = {"session-id":call.log["session-id"], "turns":[]} tracker.reset() for turn, _ in call : tracker_turn = tracker.addTurn(turn,original) this_session["turns"].append(tracker_turn) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track["wall-time"] = elapsed_time json.dump(track, track_file,indent=4)
def main(): parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--goal_area',dest='goal_area',action='store',required=False,metavar='TXT', help='File with goal_area prediction results') parser.add_argument('--goal_food',dest='goal_food',action='store',required=False,metavar='TXT', help='File with goal_food prediction results') parser.add_argument('--goal_name',dest='goal_name',action='store',required=False,metavar='TXT', help='File with goal_name prediction results') parser.add_argument('--goal_pricerange',dest='goal_pricerange',action='store',required=False,metavar='TXT', help='File with goal_pricerange prediction results') parser.add_argument('--topK',dest='topK',action='store',type=int, help='get topK accuracy') args = parser.parse_args() global topK topK = args.topK for goal in [args.goal_area, args.goal_food, args.goal_name, args.goal_pricerange]: if goal == None: continue head, body = fio.readMatrix(goal, True) dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot) turn_count = -1 nbest_count = 0 nbest = [] for call in dataset : for turn, _ in call : turn_count = turn_count + 1 n_asr_live = len(turn['input']['live']['asr-hyps']) combinedgoals = getNbest(body[nbest_count:nbest_count + n_asr_live][:], topK) nbest.append(combinedgoals) nbest_count = nbest_count + n_asr_live fio.writeMatrix(goal+'.'+str(topK)+".combine", nbest, head)
def main(argv): # # CMD LINE ARGS # install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) utils_dirname = os.path.join(install_path,'lib') sys.path.append(utils_dirname) from dataset_walker import dataset_walker list_dir = os.path.join(install_path,'config') parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze, for example train1 or test2 or train3a') parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--datafile',dest='datafile',action='store',required=True,metavar='JSON_FILE', help='File to write output') parser.add_argument('--label',dest='label',action='store',required=True,metavar='BOOL', help='load labels') args = parser.parse_args() label = False if args.label.lower() == 'true': label = True dataset = dataset_walker(args.dataset,dataroot=args.dataroot,labels=label) datafile = open(args.datafile, "wb") data = {"sessions":[]} data["dataset"] = args.dataset vector = vectorizer() for call in dataset : this_session = {"session-id":call.log["session-id"], "turns":[]} vector.reset() for turn, labels in call : data_point = vector.addTurn(turn,labels) this_session["turns"].append(data_point) data["sessions"].append(this_session) json.dump(data, datafile,indent=4) datafile.close()
def constructDoc2vec(nameDataS2V=["dstc4_train","dstc4_dev"], dataPath="data", NameLearnedFile="LearnedDoc2Vec.d2v"):
    """Train a gensim Doc2Vec model over the transcripts of the given dialog
    datasets and save it to NameLearnedFile.

    nameDataS2V     -- list of source dialogue dataset names (dataset_walker names)
    dataPath        -- dataroot passed to dataset_walker
    NameLearnedFile -- output file name for the learned doc2vec model

    Each utterance becomes one TaggedDocument whose tag is
    "s<session_id>u<utter_index>" and whose words are the regularised
    bag-of-words of the transcript.
    """
    #nameDataS2V list of source dialogue data
    #Name of the file for learned doc2vecs.
    #label=s<sessionID>u(utteranceIndex)
    #words given bow
    print "Start to construct doc2vec from given dialogs."
    print nameDataS2V
    #Make input to doc to vec
    #-Load data
    dataS2V=[]
    for nameData in nameDataS2V:
        dataS2V.append(dataset_walker.dataset_walker(nameData,dataroot=dataPath,labels=True))
    lSentences=[]
    for dataset in dataS2V:
        print dataset
        for call in dataset:
            sid=call.log["session_id"]
            for (uttr,_) in call:
                uid=uttr["utter_index"]
                # Tag format: s<sessionID>u<utteranceIndex>
                label="s"+str(sid)+"u"+str(uid)
                #print label
                # NOTE(review): the double-underscore attribute access only
                # resolves if this function lives inside the
                # LSTMWithBOWTracker class body (Python name mangling) —
                # confirm against the full file.
                words=LSTMWithBOWTracker.__getRegurelisedBOW(copy.copy(uttr["transcript"]))
                #print words
                lSentences.append(TaggedDocument(words=words,tags=[label]))
    #Learn
    #reference: http://rare-technologies.com/doc2vec-tutorial/
    # Cap the worker count at the tracker's configured maximum.
    numMaxCPU=multiprocessing.cpu_count()
    if numMaxCPU > LSTMWithBOWTracker.D2V_MAXNUMCPU:
        print "As number of CPU is exceeded, it rescaled into " + str(LSTMWithBOWTracker.D2V_MAXNUMCPU)
        numMaxCPU=LSTMWithBOWTracker.D2V_MAXNUMCPU
    print "Lern doc2vec with " + str(numMaxCPU)+" CPUs."
    model = Doc2Vec(size=LSTMWithBOWTracker.D2V_VECTORSIZE,workers=numMaxCPU,min_count=0)
    # use fixed learning rate
    model.build_vocab(lSentences)
    # Manual epoch loop with a manually decayed learning rate (the decay is
    # applied after each epoch and min_alpha pinned to it so gensim does not
    # decay further within an epoch).
    for epoch in range(LSTMWithBOWTracker.D2V_MAXITERATION):
        model.train(lSentences)
        model.alpha *= 0.995 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
        pass
        print str(epoch)+ "/"+ str(LSTMWithBOWTracker.D2V_MAXITERATION) + "(epoch/max epoch)"
    model.save(NameLearnedFile)
    print "Doc2Vec was constructed with:"
    print dataS2V
    print ", stored to " + NameLearnedFile
def main(argv):
    """Validate a system-output JSON file for the SAP/SLG task against the
    corresponding dataset and print any errors found."""
    # Make <install>/lib importable so dataset_walker can be loaded.
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(os.path.join(install_path, 'lib'))
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(description='Check the validity of a system output for SAP task.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/...')
    parser.add_argument('--jsonfile',dest='jsonfile',action='store',metavar='JSON_FILE',required=True, help='File containing JSON output')
    parser.add_argument('--roletype',dest='roletype',action='store',choices=['GUIDE', 'TOURIST'],required=True, help='Target role')
    args = parser.parse_args()

    corpus = dataset_walker(args.dataset, dataroot=args.dataroot, labels=False,
                            task='SLG', roletype=args.roletype.lower())
    with open(args.jsonfile) as json_fp:
        submission = json.load(json_fp)

    validator = TrackChecker(corpus, submission, args.roletype)
    validator.check()
    validator.print_errors()
def main(argv): #TODO implementation #Confirmation hypothesis about data tagsets = ontology_reader.OntologyReader("scripts/config/ontology_dstc4.json").get_tagsets() datasetTrain = dataset_walker.dataset_walker("dstc4_train",dataroot="data",labels=True) datasetDev = dataset_walker.dataset_walker("dstc4_dev",dataroot="data",labels=True) print "Calculate statics of dialog. " #-Is number of value in each slot is always 1 if it exist? i.e., it does not contain multiple value? #-There are many multiple value isEnumerateMultiValueCase=True isEnumerateMultiSlotCase=True countMultipleValueInOneSlot=0 # maxSlotValueTrain={} countMultipleSlot=0 for call in datasetTrain: for (uttr,label) in call: if "frame_label" in label: if isEnumerateMultiSlotCase: if len(label["frame_label"].keys()) > 1: print label["frame_label"].keys() countMultipleSlot+=1 for slot in label["frame_label"].keys(): if isEnumerateMultiValueCase: if slot not in maxSlotValueTrain: maxSlotValueTrain[slot]=len(label["frame_label"][slot]) else: if maxSlotValueTrain[slot] < len(label["frame_label"][slot]): maxSlotValueTrain[slot] = len(label["frame_label"][slot]) if len(label["frame_label"][slot]) > 1: print "slot=" + slot + ":", print label["frame_label"][slot] countMultipleValueInOneSlot+=1 for call in datasetDev: for (uttr,label) in call: if "frame_label" in label: if isEnumerateMultiSlotCase: if len(label["frame_label"].keys()) > 1: print label["frame_label"].keys() countMultipleSlot+=1 for slot in label["frame_label"].keys(): if isEnumerateMultiValueCase: if slot not in maxSlotValueTrain: maxSlotValueTrain[slot]=len(label["frame_label"][slot]) else: if maxSlotValueTrain[slot] < len(label["frame_label"][slot]): maxSlotValueTrain[slot] = len(label["frame_label"][slot]) if len(label["frame_label"][slot]) > 1: print "slot=" + slot + ":", print label["frame_label"][slot] countMultipleValueInOneSlot+=1 if isEnumerateMultiValueCase: print "Number of multiple value situation = " + str(countMultipleValueInOneSlot) 
avr=0.0 for slot in maxSlotValueTrain.keys(): avr+=(float)(maxSlotValueTrain[slot]) avr/=float(len(maxSlotValueTrain.keys())) maxSlotValueTrain["AverageNumber"]=int(round(avr)) print "Number of max slot value per slot:" print maxSlotValueTrain if isEnumerateMultiSlotCase: print "Number of multiple slot situation = " + str(countMultipleSlot) #-How many OOV case? #-Train -> dev: 1195, Dev->Train: 4789 #-With additional text normalizing, Train -> dev: 937, Dev->Train: 3643 #-With additional normalization Train -> dev: 831, Dev->Train: 3237 isCountNumberofOOVCase=False dictVocabInTrain={} dictVocabInDev={} numberOfOOVCaseInTrain2Dev=0 numberOfOOVCaseInDev2Train=0 if isCountNumberofOOVCase: for call in datasetTrain: for (uttr,label) in call: trans=uttr["transcript"] transt=re.sub("\,","",trans) transt=re.sub("\?","",transt) transt=re.sub("\.","",transt) transt=re.sub("(%.+ )?","",transt) #Additional normalize transt=re.sub("(%.+$)?","",transt) transt=re.sub("%","",transt) transt=re.sub("(-|~)"," ",transt) transt=re.sub("\!","",transt) transt=re.sub("'"," ",transt) transt=re.sub("\"","",transt) # transt=re.sub("/","",transt) transt=re.sub("[1-9]+","Replacedval",transt) transt=transt.lower() words=transt.split(" ") for word in words: #Additional normalization lmtr=nltk.stem.wordnet.WordNetLemmatizer() word=lmtr.lemmatize(word) dictVocabInTrain[word]=0 for call in datasetDev: for (uttr,label) in call: trans=uttr["transcript"] transt=re.sub("\,","",trans) transt=re.sub("\?","",transt) transt=re.sub("\.","",transt) transt=re.sub("(%.+ )?","",transt) #Additional normalize transt=re.sub("(%.+$)?","",transt) transt=re.sub("%","",transt) transt=re.sub("(-|~)"," ",transt) transt=re.sub("\!","",transt) transt=re.sub("'"," ",transt) transt=re.sub("\"","",transt) # transt=re.sub("/","",transt) transt=re.sub("[1-9]+","Replacedval",transt) transt=transt.lower() words=transt.split(" ") for word in words: #Additional normalization lmtr=nltk.stem.wordnet.WordNetLemmatizer() 
word=lmtr.lemmatize(word) if word not in dictVocabInTrain: print word.encode("utf-8") numberOfOOVCaseInTrain2Dev+=1 print "Number of OOV case in Train -> Dev situation = " + str(numberOfOOVCaseInTrain2Dev) print "\n\n\n\n\n" for call in datasetDev: for (uttr,label) in call: trans=uttr["transcript"] transt=re.sub("\,","",trans) transt=re.sub("\?","",transt) transt=re.sub("\.","",transt) transt=re.sub("(%.+ )?","",transt) #Additional normalize transt=re.sub("(%.+$)?","",transt) transt=re.sub("%","",transt) transt=re.sub("(-|~)"," ",transt) transt=re.sub("\!","",transt) transt=re.sub("'"," ",transt) transt=re.sub("\"","",transt) # transt=re.sub("/","",transt) transt=re.sub("[1-9]+","Replacedval",transt) transt=transt.lower() words=transt.split(" ") for word in words: #Additional normalization lmtr=nltk.stem.wordnet.WordNetLemmatizer() word=lmtr.lemmatize(word) dictVocabInDev[word]=0 for call in datasetTrain: for (uttr,label) in call: trans=uttr["transcript"] transt=re.sub("\,","",trans) transt=re.sub("\?","",transt) transt=re.sub("\.","",transt) transt=re.sub("(%.+ )?","",transt) #Additional normalize transt=re.sub("(%.+$)?","",transt) transt=re.sub("%","",transt) transt=re.sub("(-|~)"," ",transt) transt=re.sub("\!","",transt) transt=re.sub("'"," ",transt) transt=re.sub("\"","",transt) # transt=re.sub("/","",transt) transt=re.sub("[1-9]+","Replacedval",transt) transt=transt.lower() words=transt.split(" ") for word in words: #Additional normalization lmtr=nltk.stem.wordnet.WordNetLemmatizer() word=lmtr.lemmatize(word) if word not in dictVocabInDev: print word.encode("utf-8") numberOfOOVCaseInDev2Train+=1 print "Number of OOV case in Dev -> Train situation = " + str(numberOfOOVCaseInDev2Train) #-How many frame_label are unseen between train and dev data? 
#-So many, train -> dev 96/313 (unseen/all in dev), dev -> train 346/563 (unseen/all in train) isCountUnseenframeLabel=False dictTopicSlotValueTrain=[] numUnseenframeLabel=0 alreadychecked=[] dictTopicSlotValueDev={} if isCountUnseenframeLabel: for call in datasetTrain: for (uttr,label) in call: if "frame_label" in label: for slot in label["frame_label"].keys(): for value in label["frame_label"][slot]: dictTopicSlotValueTrain.append(slot+value) for call in datasetDev: for (uttr,label) in call: if "frame_label" in label: for slot in label["frame_label"].keys(): for value in label["frame_label"][slot]: dictTopicSlotValueDev[(slot+value)]=0 if (slot+value) not in dictTopicSlotValueTrain: if (slot+value) not in alreadychecked: numUnseenframeLabel+=1 alreadychecked.append((slot+value)) print "Number of Unseen label train -> dev = " + str(numUnseenframeLabel) print "Ratio (unseen/all in dev) = " + str(numUnseenframeLabel) + "/" + str(len(dictTopicSlotValueDev.keys())) dictTopicSlotValueDev=[] numUnseenframeLabel=0 alreadychecked=[] dictTopicSlotValueTrain={} for call in datasetDev: for (uttr,label) in call: if "frame_label" in label: for slot in label["frame_label"].keys(): for value in label["frame_label"][slot]: dictTopicSlotValueDev.append(slot+value) for call in datasetTrain: for (uttr,label) in call: if "frame_label" in label: for slot in label["frame_label"].keys(): for value in label["frame_label"][slot]: dictTopicSlotValueTrain[(slot+value)]=0 if (slot+value) not in dictTopicSlotValueDev: if (slot+value) not in alreadychecked: numUnseenframeLabel+=1 alreadychecked.append((slot+value)) print "Number of Unseen label dev -> train = " + str(numUnseenframeLabel) print "Ratio (unseen/all in train) = " + str(numUnseenframeLabel) + "/" + str(len(dictTopicSlotValueTrain.keys()))
def main():
    """Run the selected dialog state tracker over a dataset and write the
    per-turn tracker output (plus wall time) to a JSON track file.

    The tracker implementation is chosen with --tracker; unknown names now
    raise a ValueError instead of silently falling through (which previously
    produced a NameError when 'tracker' was first used).
    """
    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot', dest='dataroot', action='store', required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--focus', dest='focus', action='store', nargs='?',
                        default="False", const="True",
                        help='Use focus node tracker')
    parser.add_argument(
        '--config', dest='config', action='store', required=True,
        metavar='TRUE/FALSE',
        help='The path of the config folder containing the .flist files')
    parser.add_argument('--tracker', dest='tracker', action='store',
                        nargs='?', default="LearnedTracker",
                        help='Tracker to use')
    parser.add_argument('--ontology', dest='ontology', action='store',
                        metavar='JSON_FILE', required=True,
                        help='The ontology to use')
    args = parser.parse_args()

    # Opens data set file and stores it in dataset object
    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            config_folder=args.config)

    track = {"sessions": [], "dataset": args.dataset}
    start_time = time.time()

    # Dispatch tables replace the original if/elif chain.  The second group
    # of trackers takes the parsed ontology as its only constructor argument
    # (the ontology is now loaded once, instead of once per branch).
    plain_trackers = {
        "tracker": Tracker,
        "focustracker": FocusTracker,
    }
    ontology_trackers = {
        "customtracker": CustomTracker,
        "berttracker": BertTracker,
        "learnedtracker": LearnedTracker,
        "bandittracker": BanditTracker,
        "bandittrackertf": BanditTrackerTF,
        "simpletracker": SimpleTracker,
    }
    name = args.tracker.lower()
    if name in plain_trackers:
        tracker = plain_trackers[name]()
    elif name in ontology_trackers:
        with open(args.ontology) as ontology_file:
            ontology = json.load(ontology_file)
        tracker = ontology_trackers[name](ontology)
    else:
        raise ValueError("Unknown tracker: %r" % args.tracker)

    # Iterates over every call in the dataset
    for call in dataset:
        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()
        # Iterates over every turn in a call
        for turn, _ in call:
            # Adds the turn to the tracker
            tracker_turn = tracker.addTurn(turn, call.log["session-id"])
            this_session["turns"].append(tracker_turn)
        track["sessions"].append(this_session)

    track["wall-time"] = time.time() - start_time

    # fix: the track file was never closed; 'with' guarantees it is flushed.
    with open(args.trackfile, "w") as track_file:
        json.dump(track, track_file, indent=4)
def errorAnalysis(argv):
    """Run the NaiveEnsemble tracker over a labelled dataset and print, per
    utterance, where the tracker output diverges from the reference frame
    labels (missing slots, missing values, redundant value counts).

    argv -- command-line argument list passed straight to argparse.
    """
    print "ERROR ANALYSIS OF NAIVEENSEMBLER"
    print argv
    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot', dest='dataroot', action='store', required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology', dest='ontology', action='store',
                        metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    #args = parser.parse_args()
    args = parser.parse_args(argv)
    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()
    tracker = NaiveEnsembleBasedTrackerWithNBest(
        tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"],
                        "utterances": []}
        tracker.reset()
        for (utter, label) in call:
            #- pre-processing 2 (Japanese "mae shori" = preprocessing)
            # A 'B' BIO tag marks the start of a new sub-dialogue segment.
            if utter['segment_info']['target_bio'] == 'B':
                print "\n -----New sub-dialogue----------------------------------------------------"
            print "s:" + str(call.log['session_id']) + " u:" + str(
                utter['utter_index'])
            print "Input=" + utter["transcript"]
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
                #
                print "Tracker's output:"
                print tracker_result
                # Compare the reference frame label against the tracker
                # output slot by slot and value by value.
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        if (slot not in tracker_result["frame_label"]):
                            # Whole slot missing from the tracker output.
                            print "-slot [" + slot + "] is not exsisted in output"
                            for value in label["frame_label"][slot]:
                                print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
                        else:
                            if len(label["frame_label"][slot]) != len(
                                    tracker_result["frame_label"][slot]):
                                #In case value in output, but redundant
                                print "-slot [" + slot + "] include repudant values"
                            for value in label["frame_label"][slot]:
                                #In case value not in output
                                if (value not in tracker_result["frame_label"]
                                        [slot]):
                                    print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time
    # NOTE(review): 'track' (including wall_time) is assembled but never
    # written to args.trackfile here — presumably intentional for
    # analysis-only runs; confirm against callers.
def main(argv):
    """Score belief-tracker output against the labelled dataset.

    Builds a grid of statistics (accuracy, probability calibration, MRR,
    update counts, ROC) for every combination of schedule, label scheme and
    state component (per-slot goals, joint goals, requested slots, method),
    accumulates them over every turn of every session, and writes the
    results as CSV.  Optionally dumps ROC curve data per statistic.
    """
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')
    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,
                        help='File containing score JSON')
    parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True,
                        help='File to write with CSV scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,
                        help='JSON Ontology file')
    parser.add_argument('--rocdump',dest='rocdump',action='store',metavar='FILE_STEM',
                        help='If present, use this file stem to write out ROC plot data: filestem.<schedule>.<slot>.<type>.csv, where type is either roc (which contains the ROC curve coordinates) or scores (which contains the raw scores used to compute the ROC curves).')
    args = parser.parse_args()
    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    tracker_output = json.load(open(args.scorefile))
    ontology = json.load(open(args.ontology))
    slots_informable = ontology["informable"].keys()
    slots_requestable = ontology["requestable"]
    csvfile = open(args.csv,'w')
    # what stats are there?
    # Each entry is (component-tuple, (schedule, label_scheme), stat object).
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Probs, Stat_MRR, Stat_Updates, Stat_ROC]
    for schedule in SCHEDULES:
        for label_scheme in LABEL_SCHEMES:
            for component in ['goal','requested', 'method', 'all']:
                if component == 'goal' :
                    for slot in slots_informable + ['all','joint','joint_independent'] :
                        for stat_class in stat_classes:
                            stats.append((('goal', slot), (schedule, label_scheme), stat_class()))
                elif component == 'requested' :
                    # requested-slot stats are only defined for label scheme "a"
                    if label_scheme != "a" :
                        continue
                    for slot in slots_requestable + ['all'] :
                        for stat_class in stat_classes:
                            stats.append((('requested', slot), (schedule, label_scheme), stat_class()))
                elif component == 'method' :
                    for stat_class in stat_classes:
                        stats.append((('method',), (schedule, label_scheme), stat_class()))
                elif component == 'all' :
                    for stat_class in stat_classes:
                        stats.append((('all',), (schedule, label_scheme), stat_class()))
    turn_counter = 0.0
    for session_num, (session_tracker, session) in enumerate(zip(tracker_output['sessions'], sessions)):
        for _, _, stat_class in stats:
            stat_class.newDialog()
        session_id = session.log['session-id']
        try:
            # these are the set of slots 'mentioned so far', i.e. for schedule2
            S = defaultdict(lambda : set([]))
            S_requested = set([])
            session_length = len(session)
            goal_labels_b, method_labels_b = misc.LabelsB(session, ontology)
            method_schedule_2 = False # whether schedule 2 is active for method
            for turn_num, ((log_turn,label_turn),_tracker_turn) in enumerate(zip(session,session_tracker['turns'])):
                turn_counter += 1.0
                # Update the slots mentioned so far (schedule 2 gating).
                S_new = misc.S(log_turn, ontology)
                for slot in S_new :
                    S[slot] = S[slot].union(S_new[slot])
                # remove just informed slots from S_requested
                S_requested = S_requested.difference(misc.SysInformed(log_turn))
                # add in ones from slu hyps
                S_requested = S_requested.union(set(misc.S_requested(log_turn)))
                # Normalise the tracker's per-slot goal distributions; slots
                # the tracker omitted get the trivial [(None, 1.0)] dist.
                tracker_goal_labels = _tracker_turn["goal-labels"]
                for slot in slots_informable:
                    if slot in tracker_goal_labels :
                        tracker_goal_labels[slot] = normalise_dist(tracker_goal_labels[slot].items(), (session_id, turn_num, "goal."+slot))
                    else :
                        tracker_goal_labels[slot] = [(None, 1.0)]
                # prepare for joint goals scoring:
                tracker_goal_joint_labels = "independent"
                if "goal-labels-joint" in _tracker_turn :
                    tracker_goal_joint_labels = _tracker_turn["goal-labels-joint"]
                if tracker_goal_joint_labels != "independent" :
                    # tracker_goal_joint_labels must be a list of joint hyps
                    tracker_goal_joint_labels = [(hyp["slots"], hyp["score"]) for hyp in tracker_goal_joint_labels]
                    tracker_goal_joint_labels = normalise_dist(tracker_goal_joint_labels, (session_id, turn_num, "goal.joint"))
                # also gather the correct joint label
                true_goal_joint = None
                for slot in label_turn["goal-labels"]:
                    if true_goal_joint == None :
                        true_goal_joint = {}
                    true_goal_joint[slot] = label_turn["goal-labels"][slot]
                # ... and the scheme-"b" variant of the joint label.
                true_goal_joint_b = None
                for slot in goal_labels_b[turn_num]:
                    if true_goal_joint_b == None :
                        true_goal_joint_b = {}
                    true_goal_joint_b[slot] = goal_labels_b[turn_num][slot]
                # Requested slots: convert each scalar confidence into a
                # normalised True/False distribution.
                tracker_requested_slots = _tracker_turn["requested-slots"]
                for slot in tracker_requested_slots:
                    dist = [(True, tracker_requested_slots[slot]), (False,1.0-tracker_requested_slots[slot])]
                    tracker_requested_slots[slot] = normalise_dist(dist, (session_id, turn_num, "requested."+slot))
                tracker_method_label = normalise_dist(_tracker_turn["method-label"].items(), (session_id, turn_num,"method"))
                # for method schedule 2, work out whether any slu-hyp has been given
                # which informs the method:
                if not method_schedule_2 :
                    mact = log_turn["output"]["dialog-acts"]
                    for slu_hyp in log_turn["input"]["live"]["slu-hyps"] :
                        user_act = slu_hyp["slu-hyp"]
                        method_label = misc.MethodLabel(user_act, mact)
                        if method_label != "none" :
                            method_schedule_2 = True
                            break
                # Feed this turn into every applicable statistic.
                for component, (schedule, label_scheme), stat_class in stats:
                    if component[0] == "goal" and (component[1] == "joint" or component[1] == "joint_independent"):
                        if schedule == 2:
                            # calculate schedule2 applicability
                            applies = False
                            for slot in slots_informable:
                                if len(S[slot]) > 0:
                                    applies = True
                                    break
                            if not applies :
                                continue
                        this_true_label = true_goal_joint
                        if label_scheme == "b" :
                            this_true_label = true_goal_joint_b
                        if tracker_goal_joint_labels == "independent" or component[1] == "joint_independent" :
                            stat_class.add(tracker_goal_labels, this_true_label, (session_id, turn_num, component, schedule, label_scheme), independent=True)
                        else :
                            stat_class.add(tracker_goal_joint_labels, this_true_label, (session_id, turn_num, component, schedule, label_scheme))
                    if (component[0] == "goal" or component[0] == "all") and (len(component)==1 or ("joint" not in component[1])) :
                        if component[0] == "all" or component[1] == "all" :
                            slots = slots_informable[:]
                        else :
                            slots = [component[1]]
                        for slot in slots:
                            # schedule 2 only scores slots mentioned so far
                            if schedule ==2 and len(S[slot]) == 0 :
                                continue
                            dist = tracker_goal_labels[slot]
                            true_label = None
                            if slot in label_turn["goal-labels"] :
                                true_label = label_turn["goal-labels"][slot]
                            if label_scheme == "b" :
                                true_label = None
                                if slot in goal_labels_b[turn_num] :
                                    true_label = goal_labels_b[turn_num][slot]
                            stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
                    if component[0] == "requested" or component[0] == "all" :
                        if component[0] == "all" or component[1] == "all":
                            slots = slots_requestable[:]
                        else :
                            slots = [component[1]]
                        for slot in slots:
                            if schedule ==2 and (slot not in S_requested):
                                continue
                            # default: slot not requested with certainty
                            dist = [(False,1.0), (True,0.0)]
                            if slot in tracker_requested_slots :
                                dist = tracker_requested_slots[slot]
                            true_label = (slot in label_turn["requested-slots"])
                            stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
                    if component[0] == "method" or component[0] == "all":
                        if schedule == 2 and not method_schedule_2:
                            continue # no slu hyp informing the method has been given yet.
                        dist = tracker_method_label
                        true_label = label_turn["method-label"]
                        if label_scheme == "b" :
                            true_label = method_labels_b[turn_num]
                        stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
        except KeyboardInterrupt :
            raise
        except:
            # A broken session should not abort the whole evaluation; report
            # it and carry on with the next session.
            traceback.print_exc(file=sys.stdout)
            print "While scoring " + str(session_id)
    # output to csv
    print >>csvfile,( "state_component, stat, schedule, label_scheme, N, result")
    for stat in stats:
        component, (schedule, label_scheme), stat_class = stat
        results = stat_class.results()
        for stat_subname, N, result in results:
            if result == None :
                result = "-"
            else :
                result = "%.7f"%result
            print >>csvfile,( "%s, %s, %i, %s, %i, %s"%(".".join(component), stat_subname, schedule, label_scheme, N, result))
        if isinstance(stat_class, Stat_ROC) and (args.rocdump):
            # Dump raw ROC coordinates and raw scores for this statistic.
            rocfile = args.rocdump + '.schedule' + str(schedule) + str(label_scheme)+'.' + (".".join(component)) + '.roc.csv'
            scoresfile = args.rocdump + '.schedule' + str(schedule) + str(label_scheme)+'.' + (".".join(component)) + '.scores.csv'
            stat_class.DumpROCToFile(rocfile)
            stat_class.DumpScoresToFile(scoresfile)
    print >>csvfile,'basic,total_wall_time,,,,%s' % (tracker_output['wall-time'])
    print >>csvfile,'basic,sessions,,,,%s' % (len(sessions))
    print >>csvfile,'basic,turns,,,,%i' % (int(turn_counter))
    print >>csvfile,'basic,wall_time_per_turn,,,,%s' % (tracker_output['wall-time'] / turn_counter)
    print >>csvfile,'basic,dataset,,,,%s' % (tracker_output['dataset'] )
    csvfile.close()
def main(argv):
    """Evaluate SLU system output against the labelled dataset.

    Computes precision/recall statistics for the speech-act and
    semantic-tagging subtasks, restricted to utterances of the role selected
    by --roletype, and writes the results as CSV to --scorefile.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    from stat_classes import Stat_Precision_Recall
    from eval_func import eval_acts, eval_semantics
    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLU system.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        metavar='PATH', required=True,
                        help='look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--jsonfile', dest='jsonfile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='File containing JSON output')
    parser.add_argument('--ontology', dest='ontology', action='store',
                        metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        required=True, choices=['GUIDE', 'TOURIST'],
                        help='Target role')
    parser.add_argument('--scorefile', dest='scorefile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='File to write with CSV scoring data')
    args = parser.parse_args()
    sessions = dataset_walker(
        args.dataset, dataroot=args.dataroot, labels=True)
    system_output = json.load(open(args.jsonfile))
    # stats[subtask][schedule] -> Stat_Precision_Recall accumulator.
    stats = {}
    stats['semantic_tagged'] = {}
    stats['semantic_tagged']['detection'] = Stat_Precision_Recall()
    stats['semantic_tagged']['class'] = Stat_Precision_Recall()
    stats['semantic_tagged']['all'] = Stat_Precision_Recall()
    stats['speech_act'] = {}
    stats['speech_act']['act'] = Stat_Precision_Recall()
    stats['speech_act']['all'] = Stat_Precision_Recall()
    for session, track_session in zip(sessions, system_output["sessions"]):
        # Keep only the utterances spoken by the target role; the system
        # output is assumed to contain exactly those utterances, in order.
        log_utter_list = []
        label_utter_list = []
        for log_utter, translations, label_utter in session:
            if (args.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide') or (args.roletype == 'TOURIST' and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)
        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(
                log_utter_list, label_utter_list, track_session["utterances"]):
            for subtask in stats:
                if subtask == 'speech_act':
                    # Compare predicted speech acts against the reference.
                    ref_sa_list = label_utter['speech_act']
                    pred_sa_list = track_utter['speech_act']
                    eval_acts(ref_sa_list, pred_sa_list, stats[subtask])
                elif subtask == 'semantic_tagged':
                    # Reference tags are stored per-segment; join them into
                    # one tagged string before comparison.
                    ref_tagged = ' '.join(label_utter['semantic_tagged'])
                    pred_tagged = track_utter['semantic_tagged']
                    eval_semantics(ref_tagged, pred_tagged, stats[subtask])
    csvfile = open(args.scorefile, 'w')
    print >> csvfile, ("task, subtask, schedule, stat, N, result")
    for subtask in stats:
        for schedule in stats[subtask]:
            for measure, N, result in stats[subtask][schedule].results():
                print >>csvfile, ("%s, %s, %s, %s, %i, %s" % (
                    'SLU', subtask, schedule, measure, N, result))
    csvfile.close()
def main(argv):
    """CNN baseline for the DSTC5 SAP task.

    Loads train/test dialogs, builds (utterance, dialog-context, speaker,
    speech-act label, index, context-label) tuples, converts them to padded
    integer inputs plus binarized multi-labels, splits them by speaker role
    (tourist vs guide), and runs run_slu_task() once per role.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    args = parser.parse_args()
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    # Training side uses the raw transcripts.  The "context" for an
    # utterance is the previous speaker's whole run of utterances, joined
    # with ' <pause> '; the context label is that run's last speech-act set.
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            # Flatten the speech acts to sorted, de-duplicated
            # 'ACT_attribute' strings.
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # Speaker changed: freeze the previous run as the new context.
            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None
            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]
            last_speaker = log_utter['speaker']
            train_utters += [
                (transcript, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]
    sys.stderr.write('Done\n')
    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    # Test side mirrors the training loop but uses the top translation
    # hypothesis instead of the transcript (empty string when missing).
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                # NOTE(review): broad except deliberately maps any missing /
                # malformed translation to the empty string.
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None
            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]
            last_speaker = log_utter['speaker']
            test_utters += [
                (translation, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]
    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")
    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    # build vocabulary (from the padded training context utterances only)
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))
    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                     vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                    vocabulary)
    # build labels (binarizer fitted on train+test so both share the
    # same label index space)
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)
    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'guide'
    ]
    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'guide'
    ]
    # Only the training indices are shuffled; test order is preserved.
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]
    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim,
        embedding=params['embedding'])
    # Train/evaluate once per speaker role.
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels,
                 guide_test_inputs, guide_test_ctx_inputs,
                 guide_test_labels, guide_test_ctx_labels)
    print("")
def main(argv):
    """Score SLU decoder output against a labelled dataset.

    Reads decoder hypotheses (--decodefile), computes SLU metrics (F-score,
    ICE) and, by running the baseline focus tracker over the same hypotheses,
    belief-tracking accuracy.  Writes the tracker output as JSON
    (--trackerfile) and all metric values as CSV (--scorefile).
    """
    #
    # CMD LINE ARGS
    #
    # Make <install>/lib importable so dataset_walker can be loaded.
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')  # NOTE(review): unused below
    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--decodefile',dest='decodefile',action='store',metavar='JSON_FILE',required=True, help='File containing decoder output JSON')
    parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True, help='File to write with CSV scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True, help='JSON Ontology file')
    parser.add_argument('--trackerfile',dest='trackerfile',action='store',metavar='JSON_FILE',required=True, help='Tracker JSON file for output')
    args = parser.parse_args()
    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    decode_results = json.load(open(args.decodefile))
    ontology = json.load(open(args.ontology))
    # SLU-level metrics keyed by name, evaluated per turn.
    metrics = { "tophyp":Fscore(ontology), "ice":ICE(ontology) }
    belief_metrics = { "accuracy":BeliefAccuracy(ontology) }
    # we run the baseline focus tracker on the output of the SLU
    tracker = baseline.FocusTracker()
    tracker_output = {"sessions":[],"wall-time":0.0}
    tracker_output["dataset"] = args.dataset
    for call, decode_session in zip(sessions, decode_results["sessions"]):
        tracker.reset()
        this_session = {"session-id":call.log["session-id"], "turns":[]}
        for (log_turn, label), decode_result in zip(call, decode_session["turns"]):
            true_label = label["semantics"]["json"]
            slu_hyps = decode_result["slu-hyps"]
            # Best hypothesis first.
            slu_hyps.sort(key=lambda x:-x["score"])
            total_p = sum([x["score"] for x in slu_hyps])
            # Renormalise if scores sum to more than 1 (warn only when the
            # excess is beyond floating-point tolerance).
            if total_p > 1.0 :
                if total_p > 1.00001 :
                    print "Warning: total_p =",total_p,"> 1.0- renormalising."
                for slu_hyp in slu_hyps:
                    slu_hyp["score"] = slu_hyp["score"]/total_p
            for metric in metrics.values():
                metric.add_turn(true_label, slu_hyps, log_turn, label)
            # for passing to tracker
            this_turn = { "input":{"live":{"slu-hyps":slu_hyps}}, "output":log_turn["output"] }
            goal_hyps = tracker.addTurn(this_turn)
            for belief_metric in belief_metrics.values():
                belief_metric.add_turn(goal_hyps, label)
            this_session["turns"].append(goal_hyps)
        tracker_output["sessions"].append(this_session)
    tracker_file = open(args.trackerfile, "wb")
    json.dump(tracker_output, tracker_file, indent=4)
    tracker_file.close()
    csv_file = open(args.csv, "wb")
    # Collect (name, value) rows from all metrics, then emit them sorted.
    output = []
    for key, metric in metrics.items():
        this_output = metric.output()
        for this_key, value in this_output.items():
            output.append(( key + ","+ this_key, value))
    for key, belief_metric in belief_metrics.items():
        this_output = belief_metric.output()
        key = "belief_"+key
        for this_key, value in this_output.items():
            output.append((key + ","+ this_key, value))
    output.sort(key=lambda x:x[0])
    for key, value in output:
        # Column-align values; negative numbers get one less pad for the sign.
        w = 35
        if value < 0 :
            w = w-1
        metric_name = (key+",").ljust(w)
        csv_file.write(metric_name + ("%.5f"%value)+"\n")
    csv_file.close()
def main(argv):
    """Train the simple SLU baseline on --trainset, decode --testset, and
    write predictions to --outfile as JSON.

    Only utterances spoken by --roletype are processed.  For each test
    utterance the top translation hypothesis is tagged by the trained model
    and the predicted semantics are projected back onto the original
    transcript via the translation word alignment.
    """
    parser = argparse.ArgumentParser(description='Simple SLU baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--modelfile', dest='modelfile', action='store', required=True, metavar='MODEL_FILE', help='File to write with trained model')
    parser.add_argument('--outfile', dest='outfile', action='store', required=True, metavar='JSON_FILE', help='File to write with SLU output')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE', 'TOURIST'], required=True, help='Target role')
    args = parser.parse_args()

    slu = SimpleSLU()
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            # Train only on utterances spoken by the target role.
            if (log_utter['speaker'] == 'Guide' and args.roletype == 'GUIDE') or (log_utter['speaker'] == 'Tourist' and args.roletype == 'TOURIST'):
                slu.add_instance(log_utter['transcript'], label_utter['speech_act'], label_utter['semantic_tagged'])
    sys.stderr.write('Done\n')
    slu.train(args.modelfile)

    projection = DirectLabelProjection()
    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SLU'
    output['role_type'] = args.roletype

    start_time = time.time()
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=False, translations=True)
    # BUG FIX: this message previously said 'training' although the test set
    # is being loaded here (matches the sibling SLG script's wording).
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        for (log_utter, translations, label_utter) in call:
            if (log_utter['speaker'] == 'Guide' and args.roletype == 'GUIDE') or (log_utter['speaker'] == 'Tourist' and args.roletype == 'TOURIST'):
                slu_result = {'utter_index': log_utter['utter_index']}
                if len(translations['translated']) > 0:
                    top_hyp = translations['translated'][0]['hyp']
                    pred_act, pred_semantic = slu.pred(top_hyp)
                    # Merge per-token act labels of the form '<act>_<attr>'
                    # into a mapping {act: [attributes...]}, deduplicated.
                    combined_act = {}
                    for act_label in reduce(operator.add, pred_act):
                        m = re.match('^([^_]+)_(.+)$', act_label)
                        act = m.group(1)
                        attr = m.group(2)
                        if act not in combined_act:
                            combined_act[act] = []
                        if attr not in combined_act[act]:
                            combined_act[act].append(attr)
                    slu_result['speech_act'] = []
                    for act in combined_act:
                        attr = combined_act[act]
                        slu_result['speech_act'].append({'act': act, 'attributes': attr})
                    # Project the semantics predicted on the translation back
                    # onto the original transcript using the word alignment.
                    align = translations['translated'][0]['align']
                    projected = projection.project(log_utter['transcript'], top_hyp, align, pred_semantic)
                    slu_result['semantic_tagged'] = projection.convert_to_tagged_utter(projected)
                else:
                    # No translation hypothesis: emit the raw transcript
                    # untagged and no speech acts.
                    slu_result['semantic_tagged'] = log_utter['transcript']
                    slu_result['speech_act'] = []
                this_session['utterances'].append(slu_result)
        output['sessions'].append(this_session)
    # Consistency with the sibling SLG script, which reports Done after
    # the decoding loop as well.
    sys.stderr.write('Done\n')

    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time
    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)
    sys.stderr.write('Done\n')
def main(argv):
    """CNN baseline driver for the DSTC5 SAP task.

    Loads train/dev/test sets, builds a character-level vocabulary and
    padded index inputs, binarizes the multi-label speech acts, splits
    utterances by speaker (tourist vs guide), loads pre-trained embeddings,
    and runs the SLU task once per speaker role.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()
    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(trainset, devset, testset)
    # Dev data is folded into training.
    train_utters += dev_utters
    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")
    # build vocabulary (character-level: each utterance becomes a char list)
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))
    # build input (indices into the vocabulary)
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    # build labels: utter[2] holds the speech-act label set; fit the
    # binarizer on train+test so both share one label space.
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    # split speakers into two sets (utter[1] is the speaker role)
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide']
    # Shuffle training order only; test order is preserved.
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)
    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])
    # Run once per speaker role.
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels,
                 guide_test_inputs, guide_test_labels)
    print("")
def main(argv):
    """Score frame-label tracker output against labelled sessions.

    Two evaluation schedules are used: schedule 1 scores the tracked frame
    at every utterance, schedule 2 scores only the final frame of each
    segment (flushed when a new segment begins, and once more at session
    end).  Results are written as CSV to --scorefile.
    """
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    from stat_classes import Stat_Accuracy, Stat_Frame_Precision_Recall
    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',metavar='JSON_FILE',required=True,help='File containing tracker JSON output')
    parser.add_argument('--scorefile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,help='File to write with JSON scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')
    args = parser.parse_args()
    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tracker_output = json.load(open(args.trackfile))
    ontology = OntologyReader(args.ontology)
    # One stat object per (topic, slot) x schedule x stat class; ('all','all')
    # aggregates over everything.
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Frame_Precision_Recall]
    for schedule in SCHEDULES:
        for stat_class in stat_classes:
            stats.append((('all', 'all'), schedule, stat_class()))
        for topic in ontology.get_topics():
            for slot in ontology.get_slots(topic) + ['all']:
                for stat_class in stat_classes:
                    stats.append(((topic, slot), schedule, stat_class()))
    utter_counter = 0.0
    for session, track_session in zip(sessions, tracker_output["sessions"]):
        prev_ref_frame = None
        prev_track_frame = None
        prev_topic = None
        for (log_utter, translations, label_utter), track_utter in zip(session, track_session["utterances"]):
            utter_counter += 1.0
            if log_utter['segment_info']['target_bio'] == 'B':
                # Beginning of a new segment: score the previous segment's
                # final frame under schedule 2 before moving on.
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
                for (topic, slot), schedule, stat_class in stats:
                    if schedule == 2:
                        if topic == 'all':
                            stat_class.add(prev_track_frame, prev_ref_frame)
                        elif prev_topic == topic:
                            if slot == 'all':
                                stat_class.add(prev_track_frame, prev_ref_frame)
                            else:
                                if slot in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                                elif slot in prev_track_frame and slot not in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                                elif slot not in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})
            elif log_utter['segment_info']['target_bio'] == 'I':
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
            elif log_utter['segment_info']['target_bio'] == 'O':
                ref_frame = None
                track_frame = None
            # Schedule 1: score the current frame at every utterance.
            for (topic, slot), schedule, stat_class in stats:
                if schedule == 1:
                    if topic == 'all':
                        stat_class.add(track_frame, ref_frame)
                    elif log_utter['segment_info']['topic'] == topic:
                        if slot == 'all':
                            stat_class.add(track_frame, ref_frame)
                        else:
                            if slot in track_frame and slot in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: ref_frame[slot]})
                            elif slot in track_frame and slot not in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: []})
                            elif slot not in track_frame and slot in ref_frame:
                                stat_class.add({slot: []}, {slot: ref_frame[slot]})
            prev_ref_frame = ref_frame
            prev_track_frame = track_frame
            prev_topic = log_utter['segment_info']['topic']
        # End-of-session flush: score the last segment under schedule 2.
        for (topic, slot), schedule, stat_class in stats:
            if schedule == 2:
                if topic == 'all':
                    stat_class.add(prev_track_frame, prev_ref_frame)
                elif prev_topic == topic:
                    if slot == 'all':
                        stat_class.add(prev_track_frame, prev_ref_frame)
                    else:
                        if slot in prev_track_frame and slot in prev_ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                        # BUG FIX: these two guards previously tested the
                        # loop-local track_frame/ref_frame instead of the
                        # prev_* frames they add from (copy-paste error from
                        # the schedule-1 block).
                        elif slot in prev_track_frame and slot not in prev_ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                        elif slot not in prev_track_frame and slot in prev_ref_frame:
                            stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})
    csvfile = open(args.scorefile, 'w')
    print >> csvfile, ("topic, slot, schedule, stat, N, result")
    for stat in stats:
        (topic, slot), schedule, stat_class = stat
        results = stat_class.results()
        for stat_subname, N, result in results:
            if result == None:
                result = "-"
            else:
                result = "%.7f"%result
            print >>csvfile,("%s, %s, %i, %s, %i, %s"%(topic, slot, schedule, stat_subname, N, result))
    print >>csvfile,'basic,total_wall_time,,,,%s' % (tracker_output['wall_time'])
    print >>csvfile,'basic,sessions,,,,%s' % (len(sessions))
    print >>csvfile,'basic,utterances,,,,%i' % (int(utter_counter))
    print >>csvfile,'basic,wall_time_per_utterance,,,,%s' % (tracker_output['wall_time'] / utter_counter)
    print >>csvfile,'basic,dataset,,,,%s' % (tracker_output['dataset'] )
    csvfile.close()
local_recall_avg = 0.0 if recall_count != 0: local_recall_avg = local_recall / recall_count precision += local_precision_avg recall += local_recall_avg predictions.append(predicted) eval_str = '\n[%d]\npredicted: %s\nactual: %s\nLocal Precision: %f\nLocal Recall: %f\n' % ( j, str(predicted), str(actual), local_precision_avg, local_recall_avg, ) writer.write(eval_str) print 'Accuracy: %f' % (correct * 1.0 / len(model_predictions)) print 'Precision: %f' % (precision / len(model_predictions)) print 'Recall: %f' % (recall / len(model_predictions)) if __name__ == '__main__': dataset = dataset_walker("dstc2_dev", dataroot=data_folder, labels=True) my_user_sim = UserSim() my_user_sim.generate_training_data(dataset, context_turn_num=3) my_user_sim.train_and_test(training_percent=0.7, training=True, testing=True)
def learn(self, pathdataset=["dstc4_train"], Pathdataroot="data", numberOfHiddenUnit=20, EPOCHS_PER_CYCLE=10, CYCLES=40, weightdecayw=0.01):
    # Train the LSTM tracker: build input/output index dictionaries from the
    # ontology and datasets, pickle them, convert each (sub)dialog into
    # vector sequences, and train a PyBrain LSTM with RProp-, snapshotting
    # the network each cycle.
    # NOTE(review): pathdataset is a mutable default argument — shared across
    # calls if ever mutated; confirm callers never append to it.
    print "Start learning LSTM, and make dictionary file"
    #Construct dictionary: variable name -> corresponding index of element in i/o vector
    print "Star make dictionary: variable name -> corresponding index of element in i/o vector"
    self.dictOut = {}  #"TOPIC_SLOT_VALUE" -> corresponding index of element
    self.dictIn = {}  #"SPEAKER_{val}"or"TOPIC_{val}","WORD_{word}" "BIO_{BIO}", "CLASS_{slot,value}", ""{defined label}-> corresponding index of element
    #-target vector dictionary: one output element per topic_slot_value triple
    index = 0
    totalNumSlot = 0
    for topic in self.tagsets.keys():
        for slot in self.tagsets[topic].keys():
            totalNumSlot += 1
            for value in self.tagsets[topic][slot]:
                self.dictOut[topic + "_" + slot + "_" + value] = index
                index += 1
    print "totalNumSlot:" + str(totalNumSlot)
    print "outputSize:" + str(len(self.dictOut.keys()))
    #-input dictionry
    dataset = []
    for pathdat in pathdataset:
        dataset.append(dataset_walker.dataset_walker(pathdat, dataroot=Pathdataroot, labels=False))
    #--(sub input vector 1) Class features i.e., Slot and value ratio (Similar to base line)
    index = 0
    for topic in self.tagsets.keys():
        for slot in self.tagsets[topic].keys():
            if ("CLASS_" + slot) not in self.dictIn:
                self.dictIn["CLASS_" + slot] = index
                index += 1
            for value in self.tagsets[topic][slot]:
                if ("CLASS_" + value) not in self.dictIn:
                    self.dictIn["CLASS_" + value] = index
                    index += 1
    self.TOTALSIZEOFCLASSFeature = index
    f = open(self.FileNameofNumClassFeature, "wb")
    pickle.dump(self.TOTALSIZEOFCLASSFeature, f)
    f.close()
    #--(sub input vector 2) Sentence features: either bag-of-words indices
    # harvested from the datasets, or fixed-size doc2vec slots.
    if not self.isUseSentenceRepresentationInsteadofBOW:
        index = 0
        for elemDataset in dataset:
            for call in elemDataset:
                for (uttr, _) in call:
                    #General info1 (CLASS; this feature must be rejistered at first)
                    if ("SPEAKER_" + uttr["speaker"]) not in self.dictIn:
                        self.dictIn["SPEAKER_" + uttr["speaker"]] = index
                        index += 1
                    if ("TOPIC_" + uttr["segment_info"]["topic"]) not in self.dictIn:
                        self.dictIn["TOPIC_" + uttr["segment_info"]["topic"]] = index
                        index += 1
                    #General info2
                    #-BIO
                    if ("BIO_" + uttr['segment_info']['target_bio']) not in self.dictIn:
                        self.dictIn["BIO_" + uttr['segment_info']['target_bio']] = index
                        index += 1
                    #BOW: words from utterances outside the main task ("O")
                    # are skipped when the ignore flag is set.
                    if LSTMWithBOWTracker.isIgnoreUtterancesNotRelatedToMainTask:
                        if not (uttr['segment_info']['target_bio'] == "O"):
                            #-BOW
                            splitedtrans = self.__getRegurelisedBOW(uttr["transcript"])
                            for word in splitedtrans:
                                if ("WORD_" + word) not in self.dictIn:
                                    self.dictIn["WORD_" + word] = index
                                    index += 1
        self.TOTALSIZEOFSENTENCEFeature = index
        f = open(self.FileNameofNumSentenceFeature, "wb")
        pickle.dump(self.TOTALSIZEOFSENTENCEFeature, f)
        f.close()
    elif self.isUseSentenceRepresentationInsteadofBOW:
        # Doc2vec representation: paragraph vector plus averaged word vector,
        # each D2V_VECTORSIZE elements.
        index = 0
        for i in range(0, LSTMWithBOWTracker.D2V_VECTORSIZE):
            self.dictIn[str(index) + "thElemPV"] = index
            index += 1
        index = 0
        for i in range(0, LSTMWithBOWTracker.D2V_VECTORSIZE):
            self.dictIn[str(index) + "thAvrWord"] = index
            index += 1
        assert self.D2V_VECTORSIZE == LSTMWithBOWTracker.D2V_VECTORSIZE, "D2V_VECTORSIZE is restrected to be same over the class"
    else:
        assert False, "Unexpected block"
    #--(sub input vector 3) Features M1s defined
    index = 0
    if self.isEnableToUseM1sFeature:
        rejisteredFeatures = self.__rejisterM1sInputFeatureLabel(self.tagsets, dataset)
        for rFeature in rejisteredFeatures:
            assert rFeature not in self.dictIn, rFeature + " already registered in input vector. Use different label name. "
            self.dictIn[rFeature] = index
            index += 1
        self.TOTALSIZEOFM1DEFINEDFeature = index
        f = open(self.FileNameofNumM1Feature, "wb")
        pickle.dump(self.TOTALSIZEOFM1DEFINEDFeature, f)
        f.close()
    print "inputSize:" + str(len(self.dictIn.keys()))
    # Sanity checks pinning expected dictionary layout for this ontology.
    assert self.dictIn["CLASS_INFO"] == 0, "Unexpected index CLASS_INFO should has value 0"
    assert self.dictIn["CLASS_Fort Siloso"] == 334, "Unexpected index CLASS_Fort Siloso should has value 334"
    # NOTE(review): the check is == 1344 but the message says 1611 — the
    # message looks stale; confirm which value is intended.
    assert self.dictIn["CLASS_Yunnan"] == 1344, "Unexpected index CLASS_Yunnan should has value 1611"
    #--write dictionaries for later decoding
    fileObject = open('dictInput.pic', 'w')
    pickle.dump(self.dictIn, fileObject)
    fileObject.close()
    fileObject = open('dictOutput.pic', 'w')
    pickle.dump(self.dictOut, fileObject)
    fileObject.close()
    #Build RNN frame work
    print "Start learning Network"
    #Capability of network is: (30 hidden units can represents 1048576 relations) wherease (10 hidden units can represents 1024)
    #Same to Henderson (http://www.aclweb.org/anthology/W13-4073)?
    net = buildNetwork(len(self.dictIn.keys()), numberOfHiddenUnit, len(self.dictOut.keys()), hiddenclass=LSTMLayer, outclass=SigmoidLayer, outputbias=False, recurrent=True)
    #Train network
    #-convert training data into sequence of vector
    convDataset = []  #[call][uttr][input,targetvec]
    iuttr = 0
    convCall = []
    for elemDataset in dataset:
        for call in elemDataset:
            for (uttr, label) in call:
                if self.isIgnoreUtterancesNotRelatedToMainTask:
                    if uttr['segment_info']['target_bio'] == "O":
                        continue
                #-input
                convInput = self._translateUtteranceIntoInputVector(uttr, call)
                #-output
                convOutput = [0.0] * len(self.dictOut.keys())  #Occured:+1, Not occured:0
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        for value in label["frame_label"][slot]:
                            convOutput[self.dictOut[uttr["segment_info"]["topic"] + "_" + slot + "_" + value]] = 1
                #-post proccess: start a new sub-dialog sequence at each
                # segment boundary ("B") when splitting is enabled.
                if self.isSeparateDialogIntoSubDialog:
                    if uttr['segment_info']['target_bio'] == "B":
                        if len(convCall) > 0:
                            convDataset.append(convCall)
                        convCall = []
                convCall.append([convInput, convOutput])
                #print "Converted utterance" + str(iuttr)
                iuttr += 1
            if not self.isSeparateDialogIntoSubDialog:
                if len(convCall) > 0:
                    convDataset.append(convCall)
                convCall = []
    #Online learning
    trainer = RPropMinusTrainer(net, weightdecay=weightdecayw)
    EPOCHS = EPOCHS_PER_CYCLE * CYCLES
    for i in xrange(CYCLES):
        #Shuffle order of sequences each cycle
        ds = SequentialDataSet(len(self.dictIn.keys()), len(self.dictOut.keys()))
        datInd = range(0, len(convDataset))
        random.shuffle(datInd)  #Backpropergation already implemeted data shuffling, however though RpropMinus don't.
        for ind in datInd:
            ds.newSequence()
            for convuttr in convDataset[ind]:
                ds.addSample(convuttr[0], convuttr[1])
        #Evaluation and Train
        epoch = (i + 1) * EPOCHS_PER_CYCLE
        print "\r epoch {}/{} Error={}".format(epoch, EPOCHS, trainer.testOnData(dataset=ds))
        stdout.flush()
        trainer.trainOnDataset(dataset=ds, epochs=EPOCHS_PER_CYCLE)
        # Snapshot after every cycle so training can be resumed/inspected.
        NetworkWriter.writeToFile(trainer.module, "LSTM_" + "Epoche" + str(i + 1) + ".rnnw")
    NetworkWriter.writeToFile(trainer.module, "LSTM.rnnw")
def main(argv):
    """Score SLU decoder output and derived belief tracking (formatted twin
    of the unformatted evaluator elsewhere in this file).

    Computes F-score/ICE on the decoder hypotheses, feeds them through the
    baseline focus tracker for belief accuracy, writes tracker output JSON
    and a sorted CSV of all metric values.
    """
    #
    # CMD LINE ARGS
    #
    # Make <install>/lib importable so dataset_walker can be loaded.
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path, 'config')  # NOTE(review): unused below
    parser = argparse.ArgumentParser(
        description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot', dest='dataroot', action='store', metavar='PATH',
        required=True, help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--decodefile', dest='decodefile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='File containing decoder output JSON')
    parser.add_argument('--scorefile', dest='csv', action='store',
                        metavar='CSV_FILE', required=True,
                        help='File to write with CSV scoring data')
    parser.add_argument('--ontology', dest='ontology', action='store',
                        metavar='JSON_FILE', required=True,
                        help='JSON Ontology file')
    parser.add_argument('--trackerfile', dest='trackerfile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='Tracker JSON file for output')
    args = parser.parse_args()
    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    decode_results = json.load(open(args.decodefile))
    ontology = json.load(open(args.ontology))
    # SLU-level metrics keyed by name, evaluated per turn.
    metrics = {"tophyp": Fscore(ontology), "ice": ICE(ontology)}
    belief_metrics = {"accuracy": BeliefAccuracy(ontology)}
    # we run the baseline focus tracker on the output of the SLU
    tracker = baseline.FocusTracker()
    tracker_output = {"sessions": [], "wall-time": 0.0}
    tracker_output["dataset"] = args.dataset
    for call, decode_session in zip(sessions, decode_results["sessions"]):
        tracker.reset()
        this_session = {"session-id": call.log["session-id"], "turns": []}
        for (log_turn, label), decode_result in zip(call, decode_session["turns"]):
            true_label = label["semantics"]["json"]
            slu_hyps = decode_result["slu-hyps"]
            # Best hypothesis first.
            slu_hyps.sort(key=lambda x: -x["score"])
            total_p = sum([x["score"] for x in slu_hyps])
            # Renormalise if scores sum to more than 1 (warn only when the
            # excess is beyond floating-point tolerance).
            if total_p > 1.0:
                if total_p > 1.00001:
                    print "Warning: total_p =", total_p, "> 1.0- renormalising."
                for slu_hyp in slu_hyps:
                    slu_hyp["score"] = slu_hyp["score"] / total_p
            for metric in metrics.values():
                metric.add_turn(true_label, slu_hyps, log_turn, label)
            # for passing to tracker
            this_turn = {
                "input": {
                    "live": {
                        "slu-hyps": slu_hyps
                    }
                },
                "output": log_turn["output"]
            }
            goal_hyps = tracker.addTurn(this_turn)
            for belief_metric in belief_metrics.values():
                belief_metric.add_turn(goal_hyps, label)
            this_session["turns"].append(goal_hyps)
        tracker_output["sessions"].append(this_session)
    tracker_file = open(args.trackerfile, "wb")
    json.dump(tracker_output, tracker_file, indent=4)
    tracker_file.close()
    csv_file = open(args.csv, "wb")
    # Collect (name, value) rows from all metrics, then emit them sorted.
    output = []
    for key, metric in metrics.items():
        this_output = metric.output()
        for this_key, value in this_output.items():
            output.append((key + "," + this_key, value))
    for key, belief_metric in belief_metrics.items():
        this_output = belief_metric.output()
        key = "belief_" + key
        for this_key, value in this_output.items():
            output.append((key + "," + this_key, value))
    output.sort(key=lambda x: x[0])
    for key, value in output:
        # Column-align values; negative numbers get one less pad for the sign.
        w = 35
        if value < 0:
            w = w - 1
        metric_name = (key + ",").ljust(w)
        csv_file.write(metric_name + ("%.5f" % value) + "\n")
    csv_file.close()
def main(argv):
    """Run the hand-crafted baseline vectorizer over a dataset and dump the
    per-turn data points as JSON to --datafile."""
    #
    # CMD LINE ARGS
    #
    # Make <install>/lib importable before pulling in dataset_walker.
    root_dir = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(os.path.join(root_dir, 'lib'))
    from dataset_walker import dataset_walker
    config_dir = os.path.join(root_dir, 'config')  # kept for parity; unused

    arg_parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    arg_parser.add_argument(
        '--dataset', dest='dataset', action='store', metavar='DATASET',
        required=True,
        help='The dataset to analyze, for example train1 or test2 or train3a')
    arg_parser.add_argument(
        '--dataroot', dest='dataroot', action='store', required=True,
        metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
    arg_parser.add_argument(
        '--datafile', dest='datafile', action='store', required=True,
        metavar='JSON_FILE', help='File to write output')
    arg_parser.add_argument(
        '--label', dest='label', action='store', required=True,
        metavar='BOOL', help='load labels')
    args = arg_parser.parse_args()

    # --label is a free-form string; anything but 'true' (case-insensitive)
    # means "no labels".
    load_labels = args.label.lower() == 'true'
    walker = dataset_walker(args.dataset, dataroot=args.dataroot,
                            labels=load_labels)

    out_file = open(args.datafile, "wb")
    payload = {"sessions": []}
    payload["dataset"] = args.dataset
    vec = vectorizer()
    for call in walker:
        session_record = {"session-id": call.log["session-id"], "turns": []}
        vec.reset()  # vectorizer state is per-session
        for turn, labels in call:
            session_record["turns"].append(vec.addTurn(turn, labels))
        payload["sessions"].append(session_record)
    json.dump(payload, out_file, indent=4)
    out_file.close()
def main(argv):
    """Simple SLG baseline: train on --trainset, generate sentences for the
    target role's utterances in --testset, and dump results to --outfile."""
    parser = argparse.ArgumentParser(description='Simple SLG baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True,
                        help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/...')
    parser.add_argument('--outfile', dest='outfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with SLG output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['GUIDE', 'TOURIST'], required=True,
                        help='Target role')
    args = parser.parse_args()

    role = args.roletype.lower()
    slg = SimpleSLG()

    # Training pass: feed every target-role utterance to the generator.
    trainset = dataset_walker.dataset_walker(
        args.trainset, dataroot=args.dataroot, labels=True,
        translations=True, task='SLG', roletype=role)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for log_utter, translations, label_utter in call:
            if log_utter['speaker'].lower() != role:
                continue
            training_instance = {
                'semantic_tags': log_utter['semantic_tags'],
                'speech_act': log_utter['speech_act']
            }
            slg.add_instance(training_instance, translations)
    slg.train()
    sys.stderr.write('Done\n')

    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SLG'
    output['role_type'] = args.roletype

    start_time = time.time()
    testset = dataset_walker.dataset_walker(
        args.testset, dataroot=args.dataroot, labels=False,
        translations=True, task='SLG', roletype=role)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        session_record = {"session_id": call.log["session_id"],
                          "utterances": []}
        for log_utter, translations, label_utter in call:
            if log_utter['speaker'].lower() != role:
                continue
            test_instance = {
                'semantic_tags': log_utter['semantic_tags'],
                'speech_act': log_utter['speech_act']
            }
            session_record['utterances'].append({
                'utter_index': log_utter['utter_index'],
                'generated': slg.generate(test_instance)
            })
        output['sessions'].append(session_record)
    sys.stderr.write('Done\n')

    end_time = time.time()
    output['wall_time'] = end_time - start_time
    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)
    sys.stderr.write('Done\n')
def main(argv):
    """Score pilot-task output (SLU / SAP / SLG / EES) against dataset labels
    and write per-subtask precision/recall (or BLEU/AM-FM) rows as CSV.
    """
    # Make <install>/lib importable so dataset_walker can be loaded.
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLU system.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument(
        '--dataroot', dest='dataroot', action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--pilotfile', dest='pilotfile', action='store', metavar='JSON_FILE', required=True, help='File containing JSON output')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True, help='JSON Ontology file')
    parser.add_argument('--pilottask', dest='pilottask', action='store', choices=['SLU', 'SAP', 'SLG', 'EES'], required=True, help='Target task')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE', 'TOURIST'], required=True, help='Target role')
    parser.add_argument('--scorefile', dest='scorefile', action='store', metavar='JSON_FILE', required=True, help='File to write with CSV scoring data')
    args = parser.parse_args()

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    system_output = json.load(open(args.pilotfile))
    # NOTE(review): tagsets is loaded but never referenced below; kept as-is.
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    # One Stat accumulator per (subtask, schedule), chosen by the target task.
    stats = {}
    if args.pilottask == 'SLU':
        stats['semantic_tagged'] = {}
        stats['semantic_tagged']['detection'] = Stat_Precision_Recall()
        stats['semantic_tagged']['class'] = Stat_Precision_Recall()
        stats['semantic_tagged']['all'] = Stat_Precision_Recall()
    if args.pilottask == 'SLU' or args.pilottask == 'SAP':
        stats['speech_act'] = {}
        stats['speech_act']['act'] = Stat_Precision_Recall()
        stats['speech_act']['all'] = Stat_Precision_Recall()
    if args.pilottask == 'SLG' or args.pilottask == 'EES':
        stats['utt_transcriptions'] = {}
        stats['utt_transcriptions']['all'] = Stat_BLEU_AM_FM()

    for session, track_session in zip(sessions, system_output["sessions"]):
        session_id = session.log['session_id']
        # Keep only the turns spoken by the requested role, in order, so they
        # line up with the system output's utterance list below.
        log_utter_list = []
        label_utter_list = []
        for log_utter, label_utter in session:
            if (args.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide') or (args.roletype == 'TOURIST' and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)
        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(
                log_utter_list, label_utter_list, track_session["utterances"]):
            for subtask in stats:
                if subtask == 'speech_act':
                    ref_sa_list = label_utter['speech_act']
                    pred_sa_list = track_utter['speech_act']
                    eval_acts(ref_sa_list, pred_sa_list, stats[subtask])
                elif subtask == 'semantic_tagged':
                    ref_tagged = ' '.join(label_utter['semantic_tagged'])
                    pred_tagged = track_utter['semantic_tagged']
                    eval_semantics(ref_tagged, pred_tagged, stats[subtask])
                elif subtask == 'utt_transcriptions':
                    ref = log_utter['transcript']
                    pred = track_utter['generated_sentence']
                    eval_utt(ref, pred, stats[subtask])

    # Emit one CSV row per (task, subtask, schedule, measure).
    # NOTE: Python 2 `print >> file` syntax -- this script requires Python 2.
    csvfile = open(args.scorefile, 'w')
    print >> csvfile, ("task, subtask, schedule, stat, N, result")
    for subtask in stats:
        for schedule in stats[subtask]:
            for measure, N, result in stats[subtask][schedule].results():
                print >> csvfile, (
                    "%s, %s, %s, %s, %i, %s" % (args.pilottask, subtask, schedule, measure, N, result))
    csvfile.close()
def main(argv):
    """Score DSTC4 belief-tracker output against frame labels and write
    per-(topic, slot, schedule) accuracy and precision/recall rows as CSV.

    Schedule 1 scores every in-segment turn; schedule 2 scores the final
    state of a segment (triggered when the next segment begins, and once
    more at session end).  Python 2 only (`print >> file` syntax).
    """
    # Make <install>/lib importable so dataset_walker can be loaded.
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')
    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',metavar='JSON_FILE',required=True,help='File containing tracker JSON output')
    parser.add_argument('--scorefile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,help='File to write with JSON scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')
    #args = parser.parse_args()
    args = parser.parse_args(argv)

    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    tracker_output = json.load(open(args.trackfile))

    ontology = OntologyReader(args.ontology)
    tagsets = ontology.get_tagsets()

    # Build one (topic, slot) x schedule x stat-class accumulator triple per
    # combination, plus an ('all', 'all') aggregate for each schedule.
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Precision_Recall]
    for schedule in SCHEDULES:
        for stat_class in stat_classes:
            stats.append((('all', 'all'), schedule, stat_class()))
        for topic in ontology.get_topics():
            for slot in ontology.get_slots(topic) + ['all']:
                for stat_class in stat_classes:
                    stats.append(((topic, slot), schedule, stat_class()))

    utter_counter = 0.0
    for session, track_session in zip(sessions, tracker_output["sessions"]):
        session_id = session.log['session_id']
        prev_ref_frame = None
        prev_track_frame = None
        for (log_utter, translations, label_utter) in []:
            pass  # (placeholder removed -- see loop below)
        for (log_utter, label_utter), track_utter in zip(session, track_session["utterances"]):
            utter_counter += 1.0
            if log_utter['segment_info']['target_bio'] == 'B':
                # Beginning of a new segment: score the *previous* segment's
                # final frame under schedule 2 before starting the new one.
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
                for (topic, slot), schedule, stat_class in stats:
                    if schedule == 2:
                        if topic == 'all':
                            stat_class.add(prev_track_frame, prev_ref_frame)
                        # NOTE(review): prev_topic is unset if the very first
                        # turn of a session is 'B' -- presumably the first turn
                        # always hits the topic == 'all' branch first; confirm.
                        elif prev_topic == topic:
                            if slot == 'all':
                                stat_class.add(prev_track_frame, prev_ref_frame)
                            else:
                                if slot in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                                elif slot in prev_track_frame and slot not in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                                elif slot not in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})
            elif log_utter['segment_info']['target_bio'] == 'I':
                # Inside a segment: just carry the current frames forward.
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
            elif log_utter['segment_info']['target_bio'] == 'O':
                # Outside any segment: nothing to score for this turn.
                ref_frame = None
                track_frame = None
            # Schedule 1: score the current frame at every turn.
            for (topic, slot), schedule, stat_class in stats:
                if schedule == 1:
                    if topic == 'all':
                        stat_class.add(track_frame, ref_frame)
                    elif log_utter['segment_info']['topic'] == topic:
                        if slot == 'all':
                            stat_class.add(track_frame, ref_frame)
                        else:
                            if slot in track_frame and slot in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: ref_frame[slot]})
                            elif slot in track_frame and slot not in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: []})
                            elif slot not in track_frame and slot in ref_frame:
                                stat_class.add({slot: []}, {slot: ref_frame[slot]})
            prev_ref_frame = ref_frame
            prev_track_frame = track_frame
            prev_topic = log_utter['segment_info']['topic']
        # Session ended: score the last segment's final frame (schedule 2).
        for (topic, slot), schedule, stat_class in stats:
            if schedule == 2:
                if topic == 'all':
                    stat_class.add(prev_track_frame, prev_ref_frame)
                elif prev_topic == topic:
                    if slot == 'all':
                        stat_class.add(prev_track_frame, prev_ref_frame)
                    else:
                        if slot in prev_track_frame and slot in prev_ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                        # NOTE(review): these two branches test track_frame /
                        # ref_frame but add prev_track_frame / prev_ref_frame.
                        # After the loop, prev_* == current *, so the result is
                        # the same here -- but the inconsistency is fragile.
                        elif slot in track_frame and slot not in ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                        elif slot not in track_frame and slot in ref_frame:
                            stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})

    # Emit one CSV row per accumulator result, then summary rows.
    csvfile = open(args.scorefile,'w')
    print >> csvfile,("topic, slot, schedule, stat, N, result")
    for stat in stats:
        (topic, slot), schedule, stat_class = stat
        results = stat_class.results()
        for stat_subname, N, result in results:
            if result == None:
                result = "-"
            else:
                result = "%.7f"%result
            print >>csvfile,("%s, %s, %i, %s, %i, %s"%(topic, slot, schedule, stat_subname, N, result))
    print >>csvfile,'basic,total_wall_time,,,,%s' % (tracker_output['wall_time'])
    print >>csvfile,'basic,sessions,,,,%s' % (len(sessions))
    print >>csvfile,'basic,utterances,,,,%i' % (int(utter_counter))
    print >>csvfile,'basic,wall_time_per_utterance,,,,%s' % (tracker_output['wall_time'] / utter_counter)
    print >>csvfile,'basic,dataset,,,,%s' % (tracker_output['dataset'] )
    csvfile.close()
def main(argv):
    """Score DSTC2/3 belief-tracker output: mark each tracker hypothesis
    correct/incorrect against the accumulated SLU labels, accumulate
    per-(slot-group, schedule) statistics, and write them as CSV.

    Python 2 only (`print >> file`, `raise E, msg` syntax).
    """
    #
    # CMD LINE ARGS
    #
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    # Scorer version string is read from the VERSION file at the install root.
    version_filename = os.path.join(install_path,'VERSION')
    f = open(version_filename)
    scorer_version = f.readline().strip()
    f.close()
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')
    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze, for example train1 or test2 or train3a')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True, help='File containing score JSON')
    parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True, help='File to write with CSV scoring data')
    parser.add_argument('--markfile',dest='markfile',action='store',metavar='JSON_FILE', help='Optional: re-write scorefile with scoring mark-up (for debugging and checking scoring process)')
    parser.add_argument('--csvdetail',dest='csvdetail',action='store',metavar='CSV_FILE', help='Optional: output a CSV file showing how each turn was scored (for error analysis)')
    parser.add_argument('--rocbins',dest='rocbins',action='store',metavar='INT',default=10000,type=int, help='ROC bins to use (default 10000). Lower numbers make the script run faster, but produce less accurate ROC results.')
    parser.add_argument('--rocdump',dest='rocdump',action='store',metavar='FILE_STEM', help='If present, use this file stem to write out ROC plot data: filestem.<schedule>.<slot>.<type>.csv, where type is either roc (which contains the ROC curve coordinates) or scores (which contains the raw scores used to compute the ROC curves).')
    args = parser.parse_args()
    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    tracker_results = json.load(open(args.scorefile))
    csvfile = open(args.csv,'w')
    if (args.csvdetail):
        detail = []
        detail.append( ['session-id','turn-index','slot-group','label','instantiated-label','oracle-label', 'schedule1','schedule2','schedule3','top-hyp'] )
    # One dict of Stat objects per (meta_slot, schedule).  NOTE(review): all
    # stat types except 'accuracy' are commented out, yet the --rocdump branch
    # below still indexes stats[...]['roc'] -- using --rocdump as the code
    # stands would raise KeyError.
    stats = {}
    for meta_slot in SLOT_GROUPS + ['joint','all']:
        stats[meta_slot] = {}
        for schedule in SCHEDULES:
            stats[meta_slot][schedule] = {
                'accuracy': Stat_Accuracy(),
                #'mrr': Stat_MRR(),
                #'roc': Stat_ROC(bins=args.rocbins),
                #'l2': Stat_L2(),
                #'avgp': Stat_AverageProb(),
                #'nonempty': Stat_NonEmpty(),
                #'hypcount': Stat_HypCount(),
                }
    # mark each label hyp as being correct or not
    # also indicate whether each utt is on each kind of "schedule"
    turn_counter = 0
    session_counter = 0
    for session_tracker,session in zip(tracker_results['sessions'],sessions):
        session_counter += 1
        session_id = session.log['session-id']
        # offlist_flag[slot] is True while no correct value has been observed
        # yet for that slot in this session (i.e. "none of the above" is right).
        offlist_flag = {}
        for meta_slot in SLOT_GROUPS + ['joint']:
            offlist_flag[meta_slot] = True
        # labels[slot_group] maps a pairset key to its True/False label,
        # accumulated over the turns seen so far.
        labels = {}
        for grounded_slot in SLOT_GROUPS:
            labels[grounded_slot] = {}
        turn_index = 0
        for (log_turn,label_turn),tracker_turn in zip(session,session_tracker['turns']):
            turn_index += 1
            turn_counter += 1
            # check if this was a start-over
            if (log_turn['restart'] == True):
                for grounded_slot in SLOT_GROUPS:
                    labels[grounded_slot] = {}
                for meta_slot in SLOT_GROUPS + ['joint']:
                    offlist_flag[meta_slot] = True
            # accumulate labels
            for slu_label_entry in label_turn['slu-labels']:
                try:
                    slot_group,pairset_key = _MakePairsetKey(slu_label_entry['slots'])
                except ScoreError as e:
                    raise RuntimeError,'Problem with label file: %s' % (e.msg)
                if (pairset_key not in labels[slot_group]):
                    labels[slot_group][pairset_key] = slu_label_entry['label']
                elif (labels[slot_group][pairset_key] == False and slu_label_entry['label'] == True):
                    # the old label was False but the new label is True
                    # change label to True
                    labels[slot_group][pairset_key] = True
            for meta_slot in SLOT_GROUPS + ['joint']:
                # check if tracker guessed anything at all for this slot
                if (meta_slot not in tracker_turn):
                    tracker_turn[meta_slot] = {}
                    tracker_turn[meta_slot]['hyps'] = []
                tracker_turn_slot = tracker_turn[meta_slot]#the hypes of the slot
                # sort traker_hyps (in case they weren't sorted already)
                tracker_turn_slot['hyps'].sort(key=lambda x:x['score'],reverse=True)
                #print >>sys.stderr,'Working on session %s, turn %s, slot %s' % (
                #    session.log['session-id'],
                #    log_turn['turn-index'],
                #    meta_slot)
                # check which schedules this turn is on
                if (meta_slot != 'joint'):
                    tracker_turn_slot['schedules'] = GetTurnSchedulesGroundedSlot(log_turn,meta_slot,args.dataset)
                else:
                    tracker_turn_slot['schedules'] = GetTurnSchedulesJoint(tracker_turn)
                # check whether ANY correct value has been observed yet (offlist_flag)
                if (offlist_flag[meta_slot] == True):
                    if (meta_slot != 'joint'):
                        offlist_flag[meta_slot] = AreAllItemsIncorrectGroundedSlot(labels[meta_slot])
                    else:
                        offlist_flag[meta_slot] = AreAllItemsIncorrectJoint(tracker_turn)
                tracker_turn_slot['offlist_flag'] = offlist_flag[meta_slot]
                # compute offlist score: the probability mass NOT claimed by
                # any hypothesis; negative scores are clamped to zero first.
                offlist_score = 1.0
                total = 0.0
                for i,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
                    if (tracker_hyp['score'] < 0.0):
                        print >>sys.stderr,'WARNING: Score is less than 0.0 (%s); changing to 0.0 (session %s, turn %s, slot %s, hyp %s)' % (
                            tracker_hyp['score'],
                            session.log['session-id'],
                            log_turn['turn-index'],
                            meta_slot,
                            i)
                        tracker_hyp['score'] = 0.0
                    offlist_score -= tracker_hyp['score']
                    total += tracker_hyp['score']
                if (offlist_score < 0.0):
                    # Scores summed past 1.0: renormalize and zero the offlist mass.
                    print >>sys.stderr,'WARNING: Scores sum to more than 1.0 (%s); normalizing and setting offlist_score to 0.0 (session %s, turn %s, slot %s)' % (
                        1.0 - offlist_score,
                        session.log['session-id'],
                        log_turn['turn-index'],
                        meta_slot)
                    offlist_score = 0.0
                    for tracker_hyp in tracker_turn_slot['hyps']:
                        tracker_hyp['score'] = tracker_hyp['score'] / total
                tracker_turn_slot['offlist_score'] = offlist_score
                # assign correctness values to labels
                for i,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
                    try:
                        tracker_hyp['label'] = AssignLabelToTrackerHyp(tracker_hyp['slots'],labels,meta_slot,tracker_turn)
                    except ScoreError as e:
                        # BUG(review): missing '>>' -- this prints the stderr
                        # file object and the message to STDOUT, not stderr.
                        print sys.stderr,'WARNING: %s (session %s, turn %s, slot %s, hyp %s); assigning incorrect' % (e.msg, session.log['session-id'], log_turn['turn-index'], meta_slot, i)
                        tracker_hyp['label'] = False
                # for convenience, compute a list of True/False values that describe correctness
                tracker_turn_slot['all-hyps'] = []
                tracker_turn_slot['all-hyps'].append( {
                    'hyp': None,
                    'label': tracker_turn_slot['offlist_flag'],
                    'score': tracker_turn_slot['offlist_score'],
                    })
                for hyp_index,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
                    tracker_turn_slot['all-hyps'].append( {
                        'hyp': tracker_hyp,
                        'label': tracker_hyp['label'],
                        'score': tracker_hyp['score'],
                        })
                tracker_turn_slot['all-hyps'].sort(key=lambda x: x['score'],reverse=True)
                # compute stats according to each schedule
                for schedule in SCHEDULES:
                    if ( tracker_turn_slot['schedules'][schedule] ):
                        for stat_type in stats[meta_slot][schedule]:
                            stats[meta_slot][schedule][stat_type].AddTurn(tracker_turn_slot,log_turn)
                            if (meta_slot in SLOT_GROUPS):
                                stats['all'][schedule][stat_type].AddTurn(tracker_turn_slot,log_turn)
                # save details
                # ['session-id','turn-index','slot-group','label','schedule1','schedule2','schedule3','top-hyp']
                if (args.csvdetail):
                    if (tracker_turn_slot['all-hyps'][0]['hyp'] == None):
                        top_hyp = None
                    else:
                        top_hyp_array = []
                        for k in sorted(tracker_turn_slot['all-hyps'][0]['hyp']['slots'].keys()):
                            top_hyp_array.append( [k,tracker_turn_slot['all-hyps'][0]['hyp']['slots'][k]] )
                        top_hyp = ';'.join( [ '='.join([k,str(v)]) for k,v in top_hyp_array] )
                    # is top instantiated hyp correct?
                    for tracker_hyp in tracker_turn_slot['all-hyps']:
                        if (tracker_hyp['hyp'] != None):
                            instantiated_label = tracker_hyp['label']
                            break
                    else:
                        instantiated_label = None
                    # oracle: correct if ANY instantiated hyp is labelled True
                    for tracker_hyp in tracker_turn_slot['all-hyps']:
                        if (tracker_hyp['hyp'] == None):
                            continue
                        if (tracker_hyp['label'] == True):
                            oracle_label = True
                            break
                    else:
                        oracle_label = False
                    detail.append([
                        session_id,
                        log_turn['turn-index'],
                        meta_slot,
                        tracker_turn_slot['all-hyps'][0]['label'],
                        instantiated_label,
                        oracle_label,
                        tracker_turn_slot['schedules']['schedule1'],
                        tracker_turn_slot['schedules']['schedule2'],
                        tracker_turn_slot['schedules']['schedule3'],
                        top_hyp,
                        ])
            # handle last turn and restart turns for schedule3
            if (len(session.log['turns']) == turn_index or # this is the last turn
                session.log['turns'][turn_index]['restart'] == True): # next turn is a restart
                any_slot_found = False
                for grounded_slot in SLOT_GROUPS:
                    if (len(labels[grounded_slot]) > 0):
                        any_slot_found = True
                        tracker_turn_slot = tracker_turn[grounded_slot]
                        tracker_turn_slot['schedules']['schedule3'] = True
                        for stat_type in stats[grounded_slot]['schedule3']:
                            stats[grounded_slot]['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)
                            stats['all']['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)
                if (any_slot_found):
                    tracker_turn_slot = tracker_turn['joint']
                    tracker_turn_slot['schedules']['schedule3'] = True
                    for stat_type in stats['joint']['schedule3']:
                        stats['joint']['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)
    # compute stats according to each schedule
    for meta_slot in sorted(SLOT_GROUPS) + ['joint','all']:
        for schedule in sorted(SCHEDULES):
            if (args.rocdump):
                # NOTE(review): 'roc' is commented out of the stats dict above;
                # this branch raises KeyError as the code stands.
                rocfile = args.rocdump + '.' + schedule + '.' + meta_slot + '.roc.csv'
                stats[meta_slot][schedule]['roc'].DumpROCToFile(rocfile)
                rawfile = args.rocdump + '.' + schedule + '.' + meta_slot + '.scores.csv'
                stats[meta_slot][schedule]['roc'].DumpScoresToFile(rawfile)
            for stat_type in sorted(stats[meta_slot][schedule]):
                R = stats[meta_slot][schedule][stat_type].Result()
                N = stats[meta_slot][schedule][stat_type].N
                for name,r in sorted(R,key=lambda x:x[0]):
                    if (name == ''):
                        print_name = stat_type
                    else:
                        print_name = '%s.%s' % (stat_type,name)
                    print >>csvfile,'%s,%s,%s,%s,%s' % (meta_slot,schedule,print_name,N,r)
    print >>csvfile,'basic,,total_wall_time,,%s' % (tracker_results['wall-time'])
    print >>csvfile,'basic,,sessions,,%s' % (session_counter)
    print >>csvfile,'basic,,turns,,%s' % (turn_counter)
    print >>csvfile,'basic,,wall_time_per_turn,,%s' % (tracker_results['wall-time'] / turn_counter)
    print >>csvfile,'basic,,dataset,,%s' % (tracker_results['dataset'] )
    print >>csvfile,'basic,,scorer_version,,%s' % (scorer_version )
    csvfile.close()
    # optional: save label file
    if (args.markfile):
        f = open(args.markfile,'w')
        json.dump(tracker_results,f,indent=2)
        f.close()
    # optional: save csvdetail file
    if (args.csvdetail):
        f = open(args.csvdetail,'w')
        for row in detail:
            print >>f,','.join( [str(x) for x in row] )
        f.close()
# NOTE(review): the following six statements are the tail of a truncated
# plotting function (they use `cm` and `plt`, neither defined here) --
# presumably a confusion-matrix plot helper whose `def` line was lost;
# restore the enclosing definition before running this file.
thresh = cm.max() / 2.  # color threshold: light text on dark cells
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

if __name__ == '__main__':
    # Load the DSTC2 dev set with labels from ./data.
    dataset = dataset_walker("dstc2_dev", dataroot="data", labels=True)
    # Slot inventories: requestable extends informable with contact fields.
    informable = ['area', 'food', 'name', 'pricerange']
    requestable = informable + ['addr', 'phone', 'postcode', 'signature']
    # Dialog-act inventories for each side of the conversation.
    # NOTE(review): 'welcomemsg' appears twice in machineActs -- likely a
    # copy-paste duplicate; confirm before deduplicating.
    machineActs = [
        'affirm', 'bye', 'canthear', 'confirm-domain', 'negate', 'repeat',
        'reqmore', 'welcomemsg', 'canthelp', 'canthelp.missing_slot_value',
        'canthelp.exception', 'expl-conf', 'impl-conf', 'inform', 'offer',
        'request', 'select', 'welcomemsg'
    ]
    userActs = [
        'ack', 'affirm', 'bye', 'hello', 'help', 'negate', 'null', 'repeat',
        'reqalts', 'reqmore', 'restart', 'silence', 'thankyou', 'confirm',
        'deny', 'inform', 'request'
    ]
def main(argv):
    """CNN baseline for the DSTC5 SAP task: load train/dev/test sets, build a
    character vocabulary and multi-label targets (category / attribute /
    speech act), split by speaker role, and run the SLU task per role.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)
    # Dev data is folded into training.
    train_utters += dev_utters

    context_case = 1
    # TODO: build the previous-labels context here!
    # 1) the previous N speech acts (no speaker distinction)
    # 2) all speech acts (n of them) from the other speaker's utterances in
    #    the previous turn
    if context_case == 1:
        pass
    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary (character-level: utter[0] is split into characters)
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels -- utter[3]=category, utter[4]=attribute, utter[5]=speech act
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    # Binarizers are fit on train+test so both splits share one label space.
    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)
    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)
    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(
        train_labels_category)
    test_labels_category = label_binarizer_category.transform(
        test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets -- utter[1] is the speaker role
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]
    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'guide'
    ]

    # Shuffle training order only; test order is preserved.
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[
        tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr,
                            tourist_train_labels_sa)
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category,
                          guide_train_labels_attr, guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr, tourist_test_labels_sa)
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim,
        embedding=params['embedding'])

    # Train/evaluate one model per speaker role.
    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)
def main(argv):
    """End-to-end pilot-task driver: collect per-role concatenated turns from
    the dataset, then query each registered team's system over a websocket
    for every configured task (SLU / SAP / SLG / EES) and role, scoring the
    responses and writing one CSV of results per team/task/role.

    Relies on module-level configuration: dir_output, info_teams, logger,
    formatter, MAXBYTESLOG, BACKUPCOUNT, MAX_TIMEOUT, and the fnt* helpers.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        help='The directory where to find the data [default: data]',
        default='data')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    args = parser.parse_args()

    if not os.path.exists(dir_output):
        print("...creating " + dir_output)
        os.makedirs(dir_output)

    dataset = args.dataset

    # Save first the information, then later we start calling the team
    info_for_tasks = fntInitStruct(dataset)
    sessions = ds.dataset_walker(dataset, dataroot=args.dataroot, labels=True)
    print('Collecting information for all pilot tasks')
    for session in sessions:
        session_id = session.log['session_id']
        info_for_tasks = fntInitSession(info_for_tasks, session_id)
        utter_span = [
        ]  # Saves the utterance indexes for different concatenated turns
        # Saves the temporal information along several continuous turns for a given user
        concatenated_info = {
            'transcripts': [],
            'speech_acts': [],
            'slots': [],
            'uniq_sa': {}
        }
        utter_index = 0
        # now iterate through turns
        current_role = session.log['utterances'][0]['speaker'].upper()
        # NOTE(review): loop variable `id` shadows the builtin and is unused.
        for id, (log_utter, label_utter) in enumerate(session):
            # We concatenate the utterance + semantic slots + speech_acts
            if log_utter['speaker'].upper() == current_role:
                utter_span.append(str(log_utter['utter_index']))
                concatenated_info = fntConcatInfo(
                    concatenated_info, log_utter['transcript'],
                    ' '.join(label_utter['semantic_tagged']),
                    label_utter['speech_act'])
            else:
                # Change the user: flush the accumulated span, then start a
                # new one seeded with the current utterance.
                info_for_tasks = fntAddUtterance(
                    info_for_tasks, session_id, utter_index,
                    '_'.join(utter_span), current_role,
                    concatenated_info['speech_acts'],
                    ' '.join(concatenated_info['slots']),
                    ' '.join(concatenated_info['transcripts']))
                concatenated_info = {
                    'transcripts': [],
                    'speech_acts': [],
                    'slots': [],
                    'uniq_sa': {}
                }
                # Restart the process for the next user
                utter_index += 1
                current_role = log_utter['speaker'].upper()
                utter_span = []
                utter_span.append(str(log_utter['utter_index']))
                concatenated_info = fntConcatInfo(
                    concatenated_info, log_utter['transcript'],
                    ' '.join(label_utter['semantic_tagged']),
                    label_utter['speech_act'])
        # Flush the final span of the session.
        info_for_tasks = fntAddUtterance(
            info_for_tasks, session_id, utter_index, '_'.join(utter_span),
            current_role, concatenated_info['speech_acts'],
            ' '.join(concatenated_info['slots']),
            ' '.join(concatenated_info['transcripts']))

    # Now we start the process of asking information for each team
    for team in info_teams:
        # Configuration of logger for each team
        genlog_fh = logging.handlers.RotatingFileHandler(
            dir_output + '/' + team + '.log',
            mode='a',
            maxBytes=MAXBYTESLOG,
            backupCount=BACKUPCOUNT,
            encoding="latin-1")  # save up to 1 GB before rotation
        genlog_fh.setLevel(logging.DEBUG)
        genlog_fh.setFormatter(formatter)
        logger.addHandler(genlog_fh)
        logger.info('Processing team ' + team)
        url = info_teams[team]['url']
        logger.info('Connecting to ' + url + ' for ' + team)
        # websocket.enableTrace(True)  # To check the content of each data send to the server
        ws = websocket.create_connection(url)
        ws.settimeout(MAX_TIMEOUT)
        stats = {}
        for pilottask in info_teams[team]['tasks']:
            logger.info('Doing task: ' + pilottask)
            for roletype in info_teams[team]['roles']:
                # Fresh accumulators for the subtasks this pilot task needs.
                if pilottask == 'SLU':
                    stats['semantic_tagged'] = {}
                    stats['semantic_tagged'][
                        'detection'] = Stat_Precision_Recall()
                    stats['semantic_tagged']['class'] = Stat_Precision_Recall()
                    stats['semantic_tagged']['all'] = Stat_Precision_Recall()
                if pilottask == 'SLU' or pilottask == 'SAP':
                    stats['speech_act'] = {}
                    stats['speech_act']['act'] = Stat_Precision_Recall()
                    stats['speech_act']['all'] = Stat_Precision_Recall()
                if pilottask == 'SLG' or pilottask == 'EES':
                    stats['utt_transcriptions'] = {}
                    stats['utt_transcriptions']['all'] = Stat_BLEU_AM_FM()
                logger.info('Doing role: ' + roletype)
                for session_id in info_for_tasks['sessions']:
                    logger.info('Processing session: ' + str(session_id))
                    for n_utt in sorted(info_for_tasks['sessions'][session_id]
                                        ['utterances']):
                        utterance = info_for_tasks['sessions'][session_id][
                            'utterances'][n_utt]
                        logger.info('utterance: ' + str(n_utt))
                        if pilottask == 'SLU':
                            # INPUT: The user's utterance,
                            # OUTPUT: The current user's slots and speech acts
                            if utterance['role_type'] == roletype:
                                ref_sa_list = utterance['speech_act']
                                ref_tagged = utterance['semantic_tagged']
                            else:
                                ref_sa_list = []
                                ref_tagged = ''
                            jsonMsg = fntCreateJSONMessage(
                                info_for_tasks['dataset'], session_id, n_utt,
                                roletype, utterance['role_type'], pilottask,
                                utterance['transcript'], None, None, None)
                            pred_sa_list, pred_tagged = fntSendMessage(
                                ws, pilottask, jsonMsg)
                            eval_acts(ref_sa_list, pred_sa_list,
                                      stats['speech_act'])
                            eval_semantics(ref_tagged, pred_tagged,
                                           stats['semantic_tagged'])
                        elif pilottask == 'SAP':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: The user's utterance + speech acts and semantic tags for the current user + semantic tags for the next user
                            # OUTPUT: The next user's speech acts
                            if utterance['role_type'] == roletype:
                                if n_utt + 1 in info_for_tasks['sessions'][
                                        session_id][
                                            'utterances']:  # Check there is a next turn
                                    ref_sa_list = info_for_tasks['sessions'][
                                        session_id]['utterances'][
                                            n_utt + 1]['speech_act']
                                    jsonMsg = fntCreateJSONMessage(
                                        info_for_tasks['dataset'], session_id,
                                        n_utt, roletype,
                                        utterance['role_type'], pilottask,
                                        utterance['transcript'],
                                        utterance['speech_act'],
                                        utterance['semantic_tagged'],
                                        info_for_tasks['sessions'][session_id]
                                        ['utterances'][n_utt + 1]['semantic_tagged'])
                                    pred_sa_list = fntSendMessage(
                                        ws, pilottask, jsonMsg)
                                    eval_acts(ref_sa_list, pred_sa_list,
                                              stats['speech_act'])
                        elif pilottask == 'SLG':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: Speech acts and semantic tags for the current user
                            # OUTPUT: The user's utterance
                            if utterance['role_type'] == roletype:
                                ref = info_for_tasks['sessions'][session_id][
                                    'utterances'][n_utt]['transcript']
                                jsonMsg = fntCreateJSONMessage(
                                    info_for_tasks['dataset'], session_id,
                                    n_utt, roletype, utterance['role_type'],
                                    pilottask, None, utterance['speech_act'],
                                    utterance['semantic_tagged'], None)
                                pred = fntSendMessage(ws, pilottask, jsonMsg)
                                eval_utt(ref, pred,
                                         stats['utt_transcriptions'])
                        elif pilottask == 'EES':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: The current user's utterance
                            # OUTPUT: The next user's utterance
                            if utterance['role_type'] == roletype:
                                if n_utt + 1 in info_for_tasks['sessions'][
                                        session_id][
                                            'utterances']:  # Check there is a next turn
                                    ref = info_for_tasks['sessions'][
                                        session_id]['utterances'][
                                            n_utt + 1]['transcript']
                                    jsonMsg = fntCreateJSONMessage(
                                        info_for_tasks['dataset'], session_id,
                                        n_utt, roletype,
                                        utterance['role_type'], pilottask,
                                        utterance['transcript'], None, None,
                                        None)
                                    pred = fntSendMessage(
                                        ws, pilottask, jsonMsg)
                                    eval_utt(ref, pred,
                                             stats['utt_transcriptions'])
                # Save the final results in a CSV file
                with codecs.open(
                        dir_output + '/' + team + '_' + pilottask + '_' +
                        roletype + '.csv', 'w', 'utf-8') as f:
                    f.write("task, subtask, schedule, stat, N, result\n")
                    for subtask in stats:
                        for schedule in stats[subtask]:
                            for measure, N, result in stats[subtask][
                                    schedule].results():
                                f.write("%s, %s, %s, %s, %i, %s\n" %
                                        (pilottask, subtask, schedule,
                                         measure, N, result))
        logger.info('Closing the connection with team ' + team)
        logger.removeHandler(genlog_fh)
        ws.close()
def main(argv):
    """Run the simple hand-crafted dialog state tracker baseline.

    Walks every session of --dataset, maintains a per-slot state from the
    top live SLU hypothesis of each turn, computes a joint hypothesis per
    turn, and writes the tracker output as JSON to --trackfile.

    With --null, a single "None of the above" state is emitted for every
    turn; with --ignorescores, every per-slot score is forced to 1.0.
    """
    # Make the project's lib/ directory importable before pulling in
    # dataset_walker (the corpus iterator lives there, not on sys.path).
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze, for example train1 or test2 or train3a')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='scorefile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--null', dest='null', action='store_true',
                        help='Always output "None of the above" for all slots with score 1.0')
    parser.add_argument('--ignorescores', dest='ignorescores', action='store_true',
                        help='Ignore score in data; always use a score of 1.0 (nop if --null also specified)')
    args = parser.parse_args()

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)
    start_time = time.time()

    r = {
        'sessions': [],
        'dataset': args.dataset,
    }
    for session in sessions:
        r['sessions'].append({
            'turns': [],
            'session-id': session.log['session-id'],
        })
        state = _InitState()
        if args.null:
            # "None of the above" mode: one empty state shared by all turns.
            state['joint'] = {'hyps': []}
        for turn_index, (log_turn, scratch) in enumerate(session):
            if args.null:
                r['sessions'][-1]['turns'].append(state)
                continue
            # Initialize the state on the first turn or a restart; otherwise
            # carry the previous turn's state forward (deep copy so each
            # turn's entry in the output stays independent).
            if log_turn['restart'] or turn_index == 0:
                state = _InitState()
            else:
                state = copy.deepcopy(state)
            r['sessions'][-1]['turns'].append(state)
            if len(log_turn['input']['live']['slu-hyps']) == 0:
                # No recognition results this turn; keep state unchanged.
                continue
            # This baseline only considers the top SLU hypothesis.
            slu_hyp = log_turn['input']['live']['slu-hyps'][0]
            joint = {}
            joint_scores = []
            for slot in SLOTS:
                for act_hyp in slu_hyp['slu-hyp']:
                    # Collect the slot/value pairs belonging to this slot group.
                    this_pairset = {}
                    for found_slot, val in act_hyp['slots']:
                        if found_slot.startswith(slot):
                            this_pairset[found_slot] = val
                    if len(this_pairset) == 0:
                        continue
                    # (The original wrapped the lines below in a no-op
                    # `if True:`; removed.)
                    score = slu_hyp['score'] if not args.ignorescores else 1.0
                    state[slot]['hyps'] = [{
                        # Raw score kept temporarily; stripped from the
                        # output after the joint score is computed.
                        'score-save': slu_hyp['score'],
                        'score': score,
                        'slots': this_pairset,
                    }]
                if len(state[slot]['hyps']) > 0:
                    joint_scores.append(state[slot]['hyps'][0]['score'])
                    for my_slot, my_val in state[slot]['hyps'][0]['slots'].items():
                        joint[my_slot] = my_val
            # The joint hypothesis merges all per-slot hypotheses; its
            # score is the mean of the contributing per-slot scores.
            state['joint'] = {'hyps': []}
            if len(joint_scores) > 0:
                state['joint']['hyps'].append({
                    'score': sum(joint_scores) / len(joint_scores),
                    'slots': joint,
                })
        # Strip the bookkeeping 'score-save' field from the final output.
        for turn in r['sessions'][-1]['turns']:
            for slots_entry in turn.values():
                for hyp_entry in slots_entry['hyps']:
                    if 'score-save' in hyp_entry:
                        del hyp_entry['score-save']

    end_time = time.time()
    r['wall-time'] = end_time - start_time
    # Context manager so the file is closed even if json.dump raises.
    with open(args.scorefile, 'w') as f:
        json.dump(r, f, indent=2)
def main(argv):
    """Print speech-act label statistics for the DSTC5 SAP task.

    For the selected speaker role this counts (a) how often consecutive
    utterances by the same speaker carry the same speech-act label string
    and (b) the fraction of utterances carrying more than one label, then
    loads the test set so its first translated instances can be inspected.
    """
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide', 'tourist'], required=True, help='speaker')
    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    last_speaker = args.roletype
    last_sa_label_str = None
    total = 0
    same = 0
    multilabel_utter_cnt = 0
    utter_cnt = 0
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                # Other role: break the consecutive same-speaker chain.
                last_sa_label_str = None
            else:
                transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
                speech_act = label_utter['speech_act']
                sa_label_list = []
                for sa in speech_act:
                    sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
                if len(sa_label_list) > 1:
                    multilabel_utter_cnt += 1
                utter_cnt += 1
                sa_label_str = '|'.join(sa_label_list)
                if log_utter['speaker'] == last_speaker:
                    total += 1
                    if last_sa_label_str is None or sa_label_str == last_sa_label_str:
                        same += 1
                # sa_label_list = sorted(set(sa_label_list))
                # train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
                last_sa_label_str = sa_label_str
                last_speaker = log_utter['speaker']
    sys.stderr.write('Done\n')
    print("same/total=ratio: %d/%d=%.4f" % (same, total, 1.0 * same / total))
    print("multi_label/total=ratio: %d/%d=%.4f" %
          (multilabel_utter_cnt, utter_cnt, (1.0 * multilabel_utter_cnt / utter_cnt)))

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            # Use the top translation hypothesis; fall back to an empty
            # string when it is missing.  (Was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    # NOTE(review): train_utters is never populated (the append above is
    # commented out in the original), so the first pprint shows [].
    pprint(train_utters[:2])
    pprint(test_utters[:2])
def errorAnalysis(argv):
    """Run the NaiveEnsemble tracker with labels and print a per-utterance
    error analysis comparing its frame-label output against the reference.

    For every tracker output, reports slots missing from the output,
    values missing from present slots, and slots whose value counts
    differ from the reference.  (Python 2 script; prints to stdout.)
    """
    print "ERROR ANALYSIS OF NAIVEENSEMBLER"
    print argv
    parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.")
    parser.add_argument("--dataset", dest="dataset", action="store", metavar="DATASET", required=True,
                        help="The dataset to analyze")
    parser.add_argument("--dataroot", dest="dataroot", action="store", required=True, metavar="PATH",
                        help="Will look for corpus in <destroot>/<dataset>/...")
    parser.add_argument("--trackfile", dest="trackfile", action="store", required=True, metavar="JSON_FILE",
                        help="File to write with tracker output")
    parser.add_argument("--ontology", dest="ontology", action="store", metavar="JSON_FILE", required=True,
                        help="JSON Ontology file")
    # args = parser.parse_args()
    args = parser.parse_args(argv)
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()
    tracker = NaiveEnsembleBasedTrackerWithNBest(tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, label) in call:
            # pre-processing (translated; original note: "-mae shori2")
            # A 'B' tag marks the beginning of a new sub-dialogue segment.
            if utter["segment_info"]["target_bio"] == "B":
                print "\n -----New sub-dialogue----------------------------------------------------"
                print "s:" + str(call.log["session_id"]) + " u:" + str(utter["utter_index"])
            print "Input=" + utter["transcript"]
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
                # print "Tracker's output:"
                print tracker_result
                if "frame_label" in label:
                    # Compare the tracker's frame label against the reference.
                    for slot in label["frame_label"].keys():
                        if slot not in tracker_result["frame_label"]:
                            # Whole slot missing: every reference value is missing too.
                            print "-slot [" + slot + "] is not exsisted in output"
                            for value in label["frame_label"][slot]:
                                print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
                        else:
                            if len(label["frame_label"][slot]) != len(tracker_result["frame_label"][slot]):
                                # In case value in output, but repudant
                                print "-slot [" + slot + "] include repudant values"
                            for value in label["frame_label"][slot]:
                                # In case value not in output
                                if value not in tracker_result["frame_label"][slot]:
                                    print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    # NOTE(review): `track` and `wall_time` are computed but never written
    # anywhere by this function — confirm whether output was intended.
    track["wall_time"] = elapsed_time
def main(argv): print argv parser = argparse.ArgumentParser( description='Simple hand-crafted dialog state tracker baseline.') parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze') parser.add_argument( '--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...') parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE', help='File to write with tracker output') parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True, help='JSON Ontology file') #args = parser.parse_args() args = parser.parse_args(argv) dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False) tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets() track_file = open(args.trackfile, "wb") track = {"sessions": []} track["dataset"] = args.dataset start_time = time.time() tracker = NaiveEnsembleBasedTrackerWithNBest( tagsets, nameOfODictPickle="dictOutput.pic") for call in dataset: this_session = {"session_id": call.log["session_id"], "utterances": []} tracker.reset() for (utter, _) in call: sys.stderr.write('%d:%d\n' % (call.log['session_id'], utter['utter_index'])) tracker_result = tracker.addUtter(utter, call) if tracker_result is not None: this_session["utterances"].append(tracker_result) track["sessions"].append(this_session) end_time = time.time() elapsed_time = end_time - start_time track['wall_time'] = elapsed_time json.dump(track, track_file, indent=4) track_file.close()
def main():
    """Command-line entry point for the YARBUS rule-based tracker.

    Runs the tracker over every dialog of --dataset (or a single session
    when --session_id is given, which also enables verbose tracing) and
    writes the per-turn tracker output as JSON to --trackfile.
    """
    #print_gplv3()
    parser = argparse.ArgumentParser(
        description='YARBUS Rule-based Dialog State Tracker Baseline V1.0\n by Jeremy Fix\t [email protected] \n and Hervé Frezza-Buet\t [email protected]\n',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='The ontology to use')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--thr_belief', dest='thr_belief', action='store', required=False, default=0.0, type=float,
                        help='Sets the threshold below which the hypothesis in the belief are removed')
    parser.add_argument('--rule_set', dest='rule_set', action='store', required=False, default=31, type=int,
                        help='Specifies which rule set to use, an int in [0, 31]')
    parser.add_argument('--session_id', dest='session_id', action='store', required=False, metavar='voip-...',
                        help='A particular session id to run on')
    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
    session_id = None
    if args.session_id:
        session_id = args.session_id
        print 'Running on session_id ' + session_id
    # Verbose tracing is enabled only when a single session was requested.
    verbose = (session_id != None)
    ontology = load_ontology(args.ontology)
    # NOTE(review): the handle stays open for the whole run and is never
    # explicitly closed; the dump at the end relies on interpreter exit
    # to flush/close.
    track_file = open(args.trackfile, "wb")
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()
    print("Yarbus will prune its belief with a threshold of %f ; to change this, check out the option --thr_belief" % args.thr_belief)
    tracker = YARBUS_Tracker(ontology["informable"].keys(), args.thr_belief, args.rule_set)
    nb_dialogs = len(dataset)
    print("%i dialogs to process" % nb_dialogs)
    for index_call, call in enumerate(dataset):
        # When a specific session was requested, skip all others.
        if (session_id and session_id != call.log["session-id"]):
            continue
        if (verbose):
            print("Processing session : " + call.log["session-id"])
        else:
            # Single-line progress indicator when processing everything.
            sys.stdout.write('\r Processing dialog %i / %i' % (index_call + 1, nb_dialogs))
            sys.stdout.flush()
        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()
        for index_turn, (turn, _) in enumerate(call):
            if (verbose):
                print("*" * 10 + " Turn " + str(index_turn + 1) + " " + "*" * 10)
            tracker_turn = tracker.addTurn(turn, verbose)
            this_session["turns"].append(tracker_turn)
        track["sessions"].append(this_session)
    sys.stdout.write('\n')
    end_time = time.time()
    elapsed_time = end_time - start_time
    track["wall-time"] = elapsed_time
    json.dump(track, track_file, indent=4)
def main(argv):
    """Collect frequency statistics over a labeled dataset.

    Counts speech-act categories and attributes, semantic tag name/value
    pairs, and (per completed sub-segment) ontology topic/slot/value
    occurrences, then dumps the counters as JSON to --trackfile.
    """
    parser = argparse.ArgumentParser(description='Stat information about the data.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,
                        help='JSON Ontology file')
    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()
    track_file = open(args.trackfile, "wb")
    # NOTE(review): `track` and `start_time` are initialized but never
    # used below — likely leftovers from the tracker template.
    track = {"sessions":[]}
    track["dataset"] = args.dataset
    start_time = time.time()
    # Counter skeleton: one nested {count, detail} dict per category.
    out_json = {}
    out_json['speech_act_cat_dic'] = {}
    out_json['speech_act_attr_dic'] = {}
    out_json['semantic_tags_dic'] = {}
    out_json['ontology_dic'] = {}
    # Pre-create a counter slot for every topic/slot in the ontology.
    for topic in tagsets:
        out_json['ontology_dic'][topic] = {'count':0, 'detail':{}}
        for slot, values in tagsets[topic].items():
            out_json['ontology_dic'][topic]['detail'][slot] = {'count':0, 'detail':{}}
            #for value in values:
            #    out_json['ontology_dic'][topic]['detail'][slot]['detail'][value] = 0
    extractor = sub_segment_extractor()
    for call in dataset:
        extractor.reset()
        for (log_utter, label_utter) in call:
            # Progress trace: session:utterance.
            sys.stderr.write('%d:%d\n'%(call.log['session_id'], log_utter['utter_index']))
            # speech act: count categories, and attributes both per-category
            # and globally.
            for sa in label_utter['speech_act']:
                act = sa['act'].strip()
                attrs = sa['attributes']
                if act not in out_json['speech_act_cat_dic']:
                    out_json['speech_act_cat_dic'][act] = {'count':0, 'detail':{}}
                out_json['speech_act_cat_dic'][act]['count'] += 1
                for attr in attrs:
                    attr = attr.strip()
                    if attr not in out_json['speech_act_cat_dic'][act]['detail']:
                        out_json['speech_act_cat_dic'][act]['detail'][attr] = 1
                    else:
                        out_json['speech_act_cat_dic'][act]['detail'][attr] += 1
                    if attr not in out_json['speech_act_attr_dic']:
                        out_json['speech_act_attr_dic'][attr] = 1
                    else:
                        out_json['speech_act_attr_dic'][attr] += 1
            # semantic: count tag name/value pairs.
            for semantic_tag in label_utter['semantic_tagged']:
                sem_tags = extract_semantic_tags(semantic_tag)
                for sem_tag in sem_tags:
                    name = sem_tag['name']
                    value = sem_tag['value']
                    if name not in out_json['semantic_tags_dic']:
                        out_json['semantic_tags_dic'][name] = {'count':0, 'detail':{}}
                    out_json['semantic_tags_dic'][name]['count'] += 1
                    if value not in out_json['semantic_tags_dic'][name]['detail']:
                        out_json['semantic_tags_dic'][name]['detail'][value] = 1
                    else:
                        out_json['semantic_tags_dic'][name]['detail'][value] += 1
            # NOTE(review): this assignment is immediately shadowed inside
            # the branch below and otherwise unused.
            topic = log_utter['segment_info']['topic']
            # frame label: a 'B' tag marks a new sub-segment, so flush the
            # counters for the sub-segment accumulated so far.
            if log_utter['segment_info']['target_bio'] == 'B':
                if not extractor.is_empty:
                    sub_segment = extractor.state
                    topic = sub_segment['topic']
                    out_json['ontology_dic'][topic]['count'] += 1
                    for slot, value_list in sub_segment['frame_label'].items():
                        out_json['ontology_dic'][topic]['detail'][slot]['count'] += 1
                        for t_value in value_list:
                            t_value = t_value.strip()
                            if t_value not in out_json['ontology_dic'][topic]['detail'][slot]['detail']:
                                out_json['ontology_dic'][topic]['detail'][slot]['detail'][t_value] = 1
                            else:
                                out_json['ontology_dic'][topic]['detail'][slot]['detail'][t_value] += 1
            extractor.addUtter(log_utter,label_utter)
        # NOTE(review): the final sub-segment of each call (not followed by
        # a 'B' utterance) appears never to be flushed into the counters —
        # confirm whether that is intended.
    json.dump(out_json, track_file, indent=4)
    track_file.close()
def main(argv):
    """CNN baseline for the DSTC5 SAP task with windowed utterance context.

    Builds train/test instances (English transcripts for training, top
    translation hypotheses for testing), converts them to padded,
    vocabulary-indexed, context-windowed inputs, splits them by speaker
    role, and trains/evaluates one sequence model per role.
    """
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    # ---- training instances: tokenized English transcripts ----
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            transcript_contexts += [transcript]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
    sys.stderr.write('Done\n')

    # ---- testing instances: top translation hypotheses ----
    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            # Fall back to an empty utterance when no translation exists.
            # (Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt.)
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            transcript_contexts += [translation]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list, log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary (from the training side; test shares it)
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build index inputs for both splits
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # multi-hot labels over the union of train and test label sets
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split instances by speaker role; only training order is shuffled
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide']
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)
    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim,
                                                   embedding=params['embedding'])

    # one model per role
    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)
    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)
    print("")
def main(argv):
    """Convert a dataset into SAP-pilot-task input/label JSON files.

    For each session, builds guide- and tourist-side views: the target
    role's own utterances are reduced to semantic tags only (inputs) with
    speech acts as labels, while the other role's utterances keep
    transcript, tags and speech acts.  Writes sap.<role>.{in,label}.json
    into the session's data directory.
    """
    parser = argparse.ArgumentParser(description='Dataset Converter for SAP pilot task.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The target dataset to be converted')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/...')
    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True, translations=False)
    for call in dataset:
        session_id = call.log["session_id"]
        input_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}
        output_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}
        input_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}
        output_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}
        for (log_utter, _, label_utter) in call:
            speaker = log_utter['speaker']
            utter_index = log_utter['utter_index']
            transcript = log_utter['transcript']
            speech_act = label_utter['speech_act']
            # Decode the BIO-tagged transcript into a flat list of
            # {main, attributes, mention} semantic tags.
            mention_words = []
            curr_cat = None
            curr_attrs = None
            semantic_tags = []
            for semantic_tagged in label_utter['semantic_tagged']:
                # Renamed from `parser`, which shadowed the argparse parser above.
                tag_parser = SemanticTagParser(False)
                tag_parser.feed(semantic_tagged)
                for word, (bio, cat, attrs) in zip(tag_parser.get_word_seq(), tag_parser.get_word_tag_seq()):
                    if bio == 'I':
                        # Continuation of the current mention.
                        mention_words.append(word)
                    else:
                        # Leaving a mention ('B' or 'O'): flush it first.
                        if curr_cat is not None:
                            semantic_tags.append({
                                u'main': curr_cat,
                                u'attributes': curr_attrs,
                                u'mention': ' '.join(mention_words)
                            })
                            mention_words = []
                            curr_cat = None
                            curr_attrs = None
                        if bio == 'B':
                            mention_words = [word]
                            curr_cat = cat
                            curr_attrs = {}
                            for key, value in attrs:
                                curr_attrs[key] = value
            # Flush a mention still open at the end of the utterance.
            if curr_cat is not None:
                semantic_tags.append({
                    u'main': curr_cat,
                    u'attributes': curr_attrs,
                    u'mention': ' '.join(mention_words)
                })
            if speaker == 'Guide':
                # Guide input: own tags only; guide label: own speech acts;
                # the full utterance goes to the tourist's input context.
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })
            elif speaker == 'Tourist':
                # Symmetric handling for the tourist role.
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })
        # Sessions live in zero-padded per-session directories.
        path = os.path.join(os.path.abspath(args.dataroot), '%03d' % (session_id,))
        with open(os.path.join(path, 'sap.guide.in.json'), 'w') as fp:
            json.dump(input_guide, fp)
        with open(os.path.join(path, 'sap.guide.label.json'), 'w') as fp:
            json.dump(output_guide, fp)
        with open(os.path.join(path, 'sap.tourist.in.json'), 'w') as fp:
            json.dump(input_tourist, fp)
        with open(os.path.join(path, 'sap.tourist.label.json'), 'w') as fp:
            json.dump(output_tourist, fp)
def main():
    """Run the HWU rule-based tracker over a dataset and write its
    per-turn output as JSON to --trackfile."""
    print_gplv3()
    parser = argparse.ArgumentParser(
        description='HWU Rule-based Dialog State Tracker Baseline V2.0\n by Zhuoran Wang\t [email protected]\n This version extends the work in (Wang & Lemon, SigDial 2013).',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='The ontology to use')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--original', dest='original', action='store', required=False, metavar='TRUE/FALSE',
                        help='Use the original version presented in (Wang & Lemon, SigDial 2013)')
    parser.add_argument('--config', dest='config', action='store', required=True, metavar='TRUE/FALSE',
                        help='The path of the config folder containing the .flist files')
    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, config_folder=args.config)
    # --original switches the tracker back to the SigDial 2013 behavior.
    original = False
    if args.original and args.original.lower() == "true":
        original = True
    load_ontology(args.ontology)
    track = {"sessions": [], "dataset": args.dataset}
    start_time = time.time()
    tracker = HWU_Tracker()
    for call in dataset:
        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()
        for turn, _ in call:
            tracker_turn = tracker.addTurn(turn, original)
            this_session["turns"].append(tracker_turn)
        track["sessions"].append(this_session)
    end_time = time.time()
    track["wall-time"] = end_time - start_time
    # Open the output only when there is something to write, and close it
    # deterministically (the original held the handle open for the whole
    # run and never closed it).
    with open(args.trackfile, "w") as track_file:
        json.dump(track, track_file, indent=4)
def main(argv):
    """Train and evaluate the CNN speech-act classifier for one role.

    Loads tokenized training transcripts and translated test utterances
    for the requested speaker role, builds padded vocabulary-indexed
    tensors, trains SluConvNet with a multi-label loss, evaluates each
    epoch with a trained decision threshold, and writes the final
    per-utterance predictions to pred_result_<role>.txt.
    """
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide', 'tourist'], required=True, help='speaker')
    args = parser.parse_args()

    threshold_predictor = None

    # ---- training instances: tokenized English transcripts ----
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    # ---- testing instances: top translation hypotheses ----
    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            # Fall back to an empty utterance when no translation exists.
            # (Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt.)
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]
    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel'] == "true"

    # build vocabulary (from the training side; test shares it)
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build index inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)
    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # multi-hot labels over the union of train and test label sets
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # shuffle the training data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    # Retained for the (currently disabled) validation split below.
    num_validation = int(validation_split * train_inputs.shape[0])
    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels
    x_test = test_inputs
    y_test = test_labels

    # construct pytorch data loaders
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True,
                                         num_workers=4, pin_memory=False)
    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False,
                                        num_workers=4, pin_memory=False)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim,
                                                   embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])
    if torch.cuda.is_available():
        model = model.cuda()

    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()  # training mode (apply dropout etc.)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()
            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()
            loss = loss_fn(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print("current loss: %.4f" % loss)
        model.eval()  # evaluation mode
        # Re-fit the decision threshold on the training data each epoch.
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test,
                                                 multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training: final evaluation with the default threshold
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))
    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
def main(argv):
    """Score SLG system output against reference transcripts.

    Reads the tracker/system JSON (--jsonfile), walks the labelled dataset
    (--dataset under --dataroot), accumulates BLEU/AM-FM statistics over the
    generated utterances of the target role (--roletype), and writes one CSV
    row per (task, subtask, schedule, stat) to --scorefile.
    """
    # Make the project's lib/ directory importable; the helpers below live
    # there, not on the default path.
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    from eval_func import eval_utt
    from stat_classes import Stat_BLEU_AM_FM

    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLG system.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        metavar='PATH', required=True,
                        help='look for corpus in <destroot>/...')
    parser.add_argument('--jsonfile', dest='jsonfile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='File containing JSON output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        required=True, choices=['GUIDE', 'TOURIST'],
                        help='Target role')
    parser.add_argument('--scorefile', dest='scorefile', action='store',
                        metavar='JSON_FILE', required=True,
                        help='File to write with CSV scoring data')
    args = parser.parse_args()

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot,
                              labels=True, task='SLG',
                              roletype=args.roletype.lower())
    # FIX: close the system-output file deterministically instead of leaking
    # the handle from json.load(open(...)).
    with open(args.jsonfile) as json_file:
        system_output = json.load(json_file)

    stats = {}
    stats['generated'] = {}
    stats['generated']['all'] = Stat_BLEU_AM_FM('cn')

    for session, track_session in zip(sessions, system_output["sessions"]):
        # Collect only the target role's utterances; their order matches the
        # tracker's per-session "utterances" list, which is zipped below.
        log_utter_list = []
        label_utter_list = []
        for log_utter, translations, label_utter in session:
            if (args.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide') \
                    or (args.roletype == 'TOURIST'
                        and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)
        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(
                log_utter_list, label_utter_list,
                track_session["utterances"]):
            for subtask in stats:
                if subtask == 'generated':
                    ref = label_utter['transcript']
                    pred = track_utter['generated']
                    eval_utt(ref, pred, stats[subtask])

    # FIX: use a context manager and file.write() instead of the
    # Python-2-only "print >> csvfile" statement; the emitted bytes
    # (header + rows, each newline-terminated) are unchanged.
    with open(args.scorefile, 'w') as csvfile:
        csvfile.write("task, subtask, schedule, stat, N, result\n")
        for subtask in stats:
            for schedule in stats[subtask]:
                for measure, N, result in stats[subtask][schedule].results():
                    csvfile.write("%s, %s, %s, %s, %i, %s\n" % (
                        'SLG', subtask, schedule, measure, N, result))
def main(argv):
    """Train and run the simple SAP (speech-act prediction) baseline.

    Trains a SimpleSAP model on --trainset, saves it to --modelfile, then
    predicts speech acts for every --roletype utterance in --testset and
    writes the tracker-format JSON to --outfile.
    """
    parser = argparse.ArgumentParser(description='Simple SAP baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True,
                        help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/...')
    parser.add_argument('--modelfile', dest='modelfile', action='store',
                        required=True, metavar='MODEL_FILE',
                        help='File to write with trained model')
    parser.add_argument('--outfile', dest='outfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with SAP output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['GUIDE', 'TOURIST'], required=True,
                        help='Target role')
    args = parser.parse_args()

    sap = SimpleSAP()
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True, translations=True,
                                             task='SAP',
                                             roletype=args.roletype.lower())
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        # Rolling context features, mutated as we walk the dialog; the other
        # speaker's turns update the "prev_*" context, the target speaker's
        # turns become training instances.
        instance = {
            'prev_turn_act': None,
            'curr_semantic_tags': None,
            'prev_semantic_tags': None,
            'dist_from_prev_turn': 0
        }
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance['curr_semantic_tags'] = log_utter['semantic_tags']
                instance['dist_from_prev_turn'] += 1
                # deepcopy: the stored instance must not see later mutations.
                sap.add_instance(copy.deepcopy(instance),
                                 label_utter['speech_act'])
            else:
                instance['prev_turn_act'] = log_utter['speech_act']
                instance['dist_from_prev_turn'] = 0
                instance['prev_semantic_tags'] = log_utter['semantic_tags']
    sys.stderr.write('Done\n')
    sap.train(args.modelfile)

    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SAP'
    output['role_type'] = args.roletype

    start_time = time.time()
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=False, translations=True,
                                            task='SAP',
                                            roletype=args.roletype.lower())
    sys.stderr.write('Loading testing instances ... ')
    # Hoisted: labels look like "<act>_<attribute>"; compile once, not per
    # label inside the loops.
    act_label_pattern = re.compile('^([^_]+)_(.+)$')
    for call in testset:
        this_session = {"session_id": call.log["session_id"],
                        "utterances": []}
        instance = {
            'prev_turn_act': None,
            'curr_semantic_tags': None,
            'prev_semantic_tags': None,
            'dist_from_prev_turn': 0
        }
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                sap_result = {'utter_index': log_utter['utter_index']}
                instance['curr_semantic_tags'] = log_utter['semantic_tags']
                instance['dist_from_prev_turn'] += 1
                pred_act = sap.pred(copy.deepcopy(instance))
                # Merge predicted "<act>_<attr>" labels into act -> [attrs].
                combined_act = {}
                # FIX: initializer [] so an empty prediction list no longer
                # raises TypeError from reduce().
                for act_label in reduce(operator.add, pred_act, []):
                    m = act_label_pattern.match(act_label)
                    if m is None:
                        # FIX: skip malformed labels instead of crashing on
                        # m.group() of None.
                        continue
                    act = m.group(1)
                    attr = m.group(2)
                    if act not in combined_act:
                        combined_act[act] = []
                    if attr not in combined_act[act]:
                        combined_act[act].append(attr)
                sap_result['speech_act'] = []
                for act in combined_act:
                    attr = combined_act[act]
                    sap_result['speech_act'].append({
                        'act': act,
                        'attributes': attr
                    })
                this_session['utterances'].append(sap_result)
            else:
                instance['prev_turn_act'] = log_utter['speech_act']
                instance['dist_from_prev_turn'] = 0
                instance['prev_semantic_tags'] = log_utter['semantic_tags']
        output['sessions'].append(this_session)
    sys.stderr.write('Done\n')

    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time

    # FIX: JSON is text; open in text mode ("w", not "wb") so json.dump
    # also works under Python 3 (identical output under Python 2 here).
    with open(args.outfile, "w") as of:
        json.dump(output, of, indent=4)
    sys.stderr.write('Done\n')
for val in ontology[key][slot]: add_words(val) # TODO for slot in ["food"]: add_words(slot) for val in ontology[key][slot]: add_words(val) # TODO for slot in ["name"]: add_words(slot) for val in ontology[key][slot]: add_words(val) # include asr words and slu words appeared in data set dataset = dataset_walker.dataset_walker(dataset_name, dataroot=dataroot, labels=True) add_words("asr") add_words("slots") add_words("act") for call in dataset: for turn, labelJson in call: asrs = turn["input"]["live"]["asr-hyps"] # 1best add_words(asrs[0]["asr-hyp"]) # 2best - nbest # TODO for asr in asrs[1:]: add_words(asr["asr-hyp"])
def main(argv):
    """Train and run the simple SLU (spoken language understanding) baseline.

    Trains a SimpleSLU model on --trainset, saves it to --modelfile, then
    predicts speech acts and semantic tags for every --roletype utterance in
    --testset (from the top translation hypothesis, projected back onto the
    original transcript) and writes the tracker-format JSON to --outfile.
    """
    parser = argparse.ArgumentParser(description='Simple SLU baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True,
                        help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--modelfile', dest='modelfile', action='store',
                        required=True, metavar='MODEL_FILE',
                        help='File to write with trained model')
    parser.add_argument('--outfile', dest='outfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with SLU output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['GUIDE', 'TOURIST'], required=True,
                        help='Target role')
    args = parser.parse_args()

    slu = SimpleSLU()
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if (log_utter['speaker'] == 'Guide'
                    and args.roletype == 'GUIDE') \
                    or (log_utter['speaker'] == 'Tourist'
                        and args.roletype == 'TOURIST'):
                slu.add_instance(log_utter['transcript'],
                                 label_utter['speech_act'],
                                 label_utter['semantic_tagged'])
    sys.stderr.write('Done\n')
    slu.train(args.modelfile)

    projection = DirectLabelProjection()

    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SLU'
    output['role_type'] = args.roletype

    start_time = time.time()
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=False, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    # Hoisted: labels look like "<act>_<attribute>"; compile once, not per
    # label inside the loops.
    act_label_pattern = re.compile('^([^_]+)_(.+)$')
    for call in testset:
        this_session = {"session_id": call.log["session_id"],
                        "utterances": []}
        for (log_utter, translations, label_utter) in call:
            if (log_utter['speaker'] == 'Guide'
                    and args.roletype == 'GUIDE') \
                    or (log_utter['speaker'] == 'Tourist'
                        and args.roletype == 'TOURIST'):
                slu_result = {'utter_index': log_utter['utter_index']}
                if len(translations['translated']) > 0:
                    # Predict from the top translation hypothesis.
                    top_hyp = translations['translated'][0]['hyp']
                    pred_act, pred_semantic = slu.pred(top_hyp)
                    # Merge predicted "<act>_<attr>" labels into
                    # act -> [attrs].
                    combined_act = {}
                    # FIX: initializer [] so an empty prediction list no
                    # longer raises TypeError from reduce().
                    for act_label in reduce(operator.add, pred_act, []):
                        m = act_label_pattern.match(act_label)
                        if m is None:
                            # FIX: skip malformed labels instead of crashing
                            # on m.group() of None.
                            continue
                        act = m.group(1)
                        attr = m.group(2)
                        if act not in combined_act:
                            combined_act[act] = []
                        if attr not in combined_act[act]:
                            combined_act[act].append(attr)
                    slu_result['speech_act'] = []
                    for act in combined_act:
                        attr = combined_act[act]
                        slu_result['speech_act'].append(
                            {'act': act, 'attributes': attr})
                    # Project semantic tags predicted on the translation back
                    # onto the original transcript via the alignment.
                    align = translations['translated'][0]['align']
                    projected = projection.project(log_utter['transcript'],
                                                   top_hyp, align,
                                                   pred_semantic)
                    slu_result['semantic_tagged'] = \
                        projection.convert_to_tagged_utter(projected)
                else:
                    # No translation available: emit the raw transcript
                    # untagged and no speech acts.
                    slu_result['semantic_tagged'] = log_utter['transcript']
                    slu_result['speech_act'] = []
                this_session['utterances'].append(slu_result)
        output['sessions'].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time

    # FIX: JSON is text; open in text mode ("w", not "wb") so json.dump
    # also works under Python 3 (identical output under Python 2 here).
    with open(args.outfile, "w") as of:
        json.dump(output, of, indent=4)
    sys.stderr.write('Done\n')
def gen_baseline(dataset_name, dataroot, tagged=False):
    # Run the neural tracker over an entire dataset and dump the per-turn
    # states as a baseline JSON file ("baseline_<dataset>_{tagged,dlstm}.json").
    # Depends on module-level offline_config_dict, ModTracker, and the
    # genTurnData_* / gen_resdata / del_none_val / tag_to_val helpers, none
    # of which are visible here.
    res = {'dataset': dataset_name, 'sessions': []}
    dataset = dataset_walker.dataset_walker(dataset_name, dataroot=dataroot,
                                            labels=True)
    # Build the tracker config from the offline config; force CPU context.
    mod_config_dict = {
        'context_type': 'cpu',
        'nn_type': offline_config_dict["nn_type"],
        'model_dir': offline_config_dict["model_dir"]
    }
    # Turn-level network types are batched; word-level ones are not.
    if mod_config_dict['nn_type'] in [
            'doublelstm', 'reslstm', 'matlstm', 'cnnlstm', 'cnncnnlstm'
    ]:
        mod_config_dict['batch_size'] = 32
    mod_tracker = ModTracker(config_dict=mod_config_dict)
    start_time = time.time()
    # decide how to process data: each nn_type implies a processing level
    # (turn vs word) and, for turn-level, a feature encoding.
    if mod_config_dict['nn_type'] in ['bowlstm']:
        level = 'turn'
        feature_type = 'bow'
    elif mod_config_dict['nn_type'] in ['reslstm', 'matlstm', 'cnnlstm']:
        level = 'turn'
        feature_type = 'bowbow'
    elif mod_config_dict['nn_type'] in ['doublelstm', 'cnncnnlstm']:
        level = 'turn'
        feature_type = 'sentbow'
    else:
        level = 'word'
    # process by word-level dialogue
    if level == 'word':
        for call in dataset:
            res_dialogue = dict()
            res_dialogue["session-id"] = call.log["session-id"]
            res_dialogue["turns"] = list()
            # fileDatas holds one deep-copied snapshot of the dialogue prefix
            # per turn, so the tracker scores every incremental prefix.
            fileDatas = []
            tag_dicts = []
            fileData = {}
            fileData["turns"] = []
            for turn, labelJson in call:
                if tagged:
                    turnData = genTurnData_nbest_tagged(turn, labelJson)
                    tag_dicts.append(turnData["tag_dict"])
                else:
                    turnData = genTurnData_nbest(turn, labelJson)
                fileData["turns"].append(turnData)
                fileDatas.append(copy.deepcopy(fileData))
            # One tracker output per dialogue prefix (i.e. per turn).
            tracker_outputs = mod_tracker.get_batch_new_state(fileDatas)
            for i in xrange(len(tracker_outputs)):
                del_none_val(tracker_outputs[i])
                if tagged:
                    # Map tag placeholders back to their surface values.
                    tag_to_val(tracker_outputs[i], tag_dicts[i])
                res_dialogue["turns"].append(tracker_outputs[i])
            res["sessions"].append(res_dialogue)
            print "processed dialogue no.:", len(res["sessions"])
    # process by turn-level dialogue
    elif level == 'turn':
        batch_size = mod_tracker.batch_size
        # fileDatas_all: presumably a pair of parallel per-dialogue lists
        # (e.g. tagged and untagged variants) — TODO confirm against
        # gen_resdata's contract.
        fileDatas_all = gen_resdata(dataset, 'nbest_tagged')
        # fileDatas_all = []
        # for call in dataset:
        #     fileData = {}
        #     fileData["turns"] = []
        #     fileData["session-id"] = call.log["session-id"]
        #     for turn, labelJson in call:
        #         turnData = genTurnData_nbest(turn, labelJson)
        #         fileData["turns"].append(turnData)
        #     fileDatas_all.append(fileData)
        batch_num = int(math.ceil(len(fileDatas_all[0]) / float(batch_size)))
        for j in xrange(batch_num):
            # Slice both parallel lists into the same mini-batch window.
            fileDatas0 = fileDatas_all[0][batch_size * j:batch_size * (j + 1)]
            fileDatas1 = fileDatas_all[1][batch_size * j:batch_size * (j + 1)]
            fileDatas = []
            fileDatas.append(fileDatas0)
            fileDatas.append(fileDatas1)
            tracker_outputs = mod_tracker.get_turn_batch_state(
                fileDatas, feature_type)
            for i in xrange(len(fileDatas[0])):
                res_dialogue = dict()
                res_dialogue["session-id"] = fileDatas[0][i]["session-id"]
                res_dialogue["turns"] = tracker_outputs[i]
                for turn_output in res_dialogue["turns"]:
                    del_none_val(turn_output)
                res["sessions"].append(res_dialogue)
                print "processed dialogue no.:", len(res["sessions"])
    end_time = time.time()
    res['wall-time'] = end_time - start_time
    if tagged:
        baseline_json_file = 'baseline_%s_tagged.json' % dataset_name
    else:
        baseline_json_file = 'baseline_%s_dlstm.json' % dataset_name
    # NOTE(review): file handle is never closed and JSON is written in binary
    # mode — fine on CPython 2, but worth revisiting if this moves to Python 3.
    json.dump(res, open(baseline_json_file, 'wb'), indent=4)