示例#1
0
def main(argv):
    """Train the simple SLG baseline on a training set, generate utterances
    for a test set, and write the results to a JSON file.

    The output object has keys 'sessions', 'dataset', 'task_type',
    'role_type' and 'wall_time'.
    """
    parser = argparse.ArgumentParser(description='Simple SLG baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='Will look for corpus in <destroot>/...')
    parser.add_argument('--outfile', dest='outfile', action='store', required=True, metavar='JSON_FILE',  help='File to write with SLG output')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE',  'TOURIST'], required=True,  help='Target role')

    args = parser.parse_args()

    sap = SimpleSLG()

    # Collect training instances: only utterances spoken by the target role.
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True, task='SLG', roletype=args.roletype.lower())
    sys.stderr.write('Loading training instances ... ')

    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance = {'semantic_tags': log_utter['semantic_tags'], 'speech_act': log_utter['speech_act']}
                sap.add_instance(instance, translations)

    sap.train()
    sys.stderr.write('Done\n')

    output = {'sessions': []}
    output['dataset'] = args.testset
    # This script runs the SLG task (the walkers are created with task='SLG');
    # the previous value 'SAP' was a copy-paste error from the SAP baseline.
    output['task_type'] = 'SLG'
    output['role_type'] = args.roletype
    start_time = time.time()

    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=False, translations=True, task='SLG', roletype=args.roletype.lower())
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}

        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance = {'semantic_tags': log_utter['semantic_tags'], 'speech_act': log_utter['speech_act']}

                slg_result = {'utter_index': log_utter['utter_index'], 'generated': sap.generate(instance)}
                this_session['utterances'].append(slg_result)

        output['sessions'].append(this_session)
    sys.stderr.write('Done\n')

    # Wall time covers test-set generation only (training is excluded).
    output['wall_time'] = time.time() - start_time

    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)

    sys.stderr.write('Done\n')
示例#2
0
def main(argv):
	"""Run the MSIIP NSVC dialog state tracker over a dataset and write
	the track (one entry per session) to a JSON file.
	"""
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--model_dir',dest='model_dir',action='store',required=True,metavar='PATH', help='model dir')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')
	parser.add_argument('--ratio_thres',dest='ratio_thres',type=float,action='store',default=0.8,help='ration threshold')
	parser.add_argument('--value_prob',dest='value_prob',type=float,action='store',default=0.8,help='output value prob threshold')
	parser.add_argument('--slot_prob',dest='slot_prob',type=float,action='store',default=0.6,help='output slot prob threshold')
	parser.add_argument('--STCMode',dest='STCMode',action='store',default='hr',help='STC mode, high precision or high recall')
	parser.add_argument('--BSMode',dest='BSMode',action='store',default='enhance',help='Belief State mode: max, average or enhance')
	parser.add_argument('--BSAlpha',dest='BSAlpha',type=float,action='store',default=0.0,help='Belief State average history alpha')
	args = parser.parse_args()

	# Read the shared configuration file.
	InitConfig()
	config = GetConfig()
	config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')])

	# Set up logging: one log file per script per day.
	log_level_key = config.get('logging','level')
	run_code_name = os.path.basename(sys.argv[0])[0:-3]  # script name without '.py'
	logging.basicConfig(filename = os.path.join(os.path.dirname(__file__), '../../output/logs', '%s_%s.log' %(run_code_name,time.strftime('%Y-%m-%d',time.localtime(time.time())))), \
    					level = GetLogLevel(log_level_key), 
    					format = '%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	tracker = msiip_nsvc_tracker(tagsets, args.model_dir,
								ratio_thres = args.ratio_thres, 
								slot_prob_thres = args.slot_prob, 
								value_prob_thres = args.value_prob, 
								mode = args.STCMode, 
								bs_mode = args.BSMode, 
								bs_alpha = args.BSAlpha)
	for call in dataset:
		this_session = {"session_id":call.log["session_id"], "utterances":[]}
		tracker.reset()  # clear per-session tracker state
		for (utter,_) in call:
			# Progress trace: "<session_id>:<utter_index>".
			sys.stderr.write('%d:%d\n'%(call.log['session_id'], utter['utter_index']))
			tracker_result = tracker.addUtter(utter)
			if tracker_result is not None:
				this_session["utterances"].append(tracker_result)
		track["sessions"].append(this_session)
	track['wall_time'] = time.time() - start_time

	# 'with' guarantees the track file is closed even if json.dump raises.
	with open(args.trackfile, "wb") as track_file:
		json.dump(track, track_file, indent=4)
示例#3
0
def main():
	"""Run the simple tracker over a dataset and write the track as JSON.

	The track object has keys 'sessions', 'dataset' and 'wall-time'.
	"""
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
						help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
						help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE',
						help='File to write with tracker output')
	
	args = parser.parse_args()
	dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	tracker = Tracker()
	
	for call in dataset :
		this_session = {"session-id":call.log["session-id"], "turns":[]}
		tracker.reset()  # clear per-dialog tracker state
		for turn, _ in call :
			tracker_turn = tracker.addTurn(turn)
			this_session["turns"].append(tracker_turn)
		
		track["sessions"].append(this_session)
	track["wall-time"] = time.time() - start_time

	# The original opened the file without ever closing it; 'with' makes the
	# flush/close explicit and exception-safe.
	with open(args.trackfile, "wb") as track_file:
		json.dump(track, track_file, indent=4)
示例#4
0
def main(argv):
    """Run the baseline tracker over a dataset and write the track as JSON.

    Each non-None tracker result is deep-copied before being stored so a
    later tracker mutation cannot retroactively change recorded output.
    """
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')

    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions":[]}
    track["dataset"]  = args.dataset
    start_time = time.time()

    tracker = BaselineTracker(tagsets)
    for call in dataset:
        this_session = {"session_id":call.log["session_id"], "utterances":[]}
        tracker.reset()  # clear per-session tracker state
        for (utter,_) in call:
            # Progress trace: "<session_id>:<utter_index>".
            sys.stderr.write('%d:%d\n'%(call.log['session_id'], utter['utter_index']))
            tracker_result = tracker.addUtter(utter)
            if tracker_result is not None:
                this_session["utterances"].append(copy.deepcopy(tracker_result))
        track["sessions"].append(this_session)

    track['wall_time'] = time.time() - start_time

    # 'with' guarantees the track file is closed even if json.dump raises.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
def main(argv):
	"""Validate a tracker output JSON file against a dataset and ontology."""
	install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	utils_dirname = os.path.join(install_path,'lib')

	# Make the project's shared 'lib' directory importable before using it.
	sys.path.append(utils_dirname)
	from dataset_walker import dataset_walker
	
	parser = argparse.ArgumentParser(description='Check the validity of a tracker output object.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,
                        help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,
                        help='File containing score JSON')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,
                        help='JSON Ontology file')

	args = parser.parse_args()

	sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
	# Close the score file deterministically instead of leaking the handle
	# (the original 'json.load(open(...))' never closed it).
	with open(args.scorefile) as score_file:
		tracker_output = json.load(score_file)

	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	checker = TrackChecker(sessions, tracker_output, tagsets)
	checker.check()
	checker.print_errors()
示例#6
0
def main(argv):
    """Count slot properties over a labelled dataset and write a TSV report.

    Each output line is '<slot>\t<count0>\t<count1>', sorted by the second
    count in descending order.
    """
    parser = argparse.ArgumentParser(description="find slot property.")
    parser.add_argument(
        "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze"
    )
    parser.add_argument(
        "--dataroot",
        dest="dataroot",
        action="store",
        required=True,
        metavar="PATH",
        help="Will look for corpus in <destroot>/<dataset>/...",
    )
    parser.add_argument("output", help="output slot property file")

    args = parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)

    slot_count_dic = find_slot_property(dataset)

    # 'with' guarantees the UTF-8 output file is closed even on error.
    with codecs.open(args.output, "w", "utf-8") as output:
        sorted_slot_count = sorted(slot_count_dic.items(), key=lambda x: x[1][1], reverse=True)
        for slot, count in sorted_slot_count:
            print >> output, "%s\t%d\t%d" % (slot, count[0], count[1])
示例#7
0
def main(argv):
	"""Combine several trackers' log files into one ensemble track and
	write the result as a JSON track file.
	"""
	parser = argparse.ArgumentParser(description='MSIIP ensemble tracker.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--LogBaseDir',dest='LogBaseDir',action='store',required=True,help='The base directory that contains the log files')
	parser.add_argument('--config',dest='config',action='store',required=True,help='Config file, indicate log files and weight')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')
	parser.add_argument('--value_prob',dest='value_prob',type=float,action='store',default=0.4,help='output value prob threshold')
	parser.add_argument('--slot_prob',dest='slot_prob',type=float,action='store',default=0.5,help='output slot prob threshold')
	args = parser.parse_args()

	# Read the shared configuration file.
	InitConfig()
	config = GetConfig()
	config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')])

	# Set up logging: one log file per script per day.
	log_level_key = config.get('logging','level')
	run_code_name = os.path.basename(sys.argv[0])[0:-3]  # script name without '.py'
	logging.basicConfig(filename = os.path.join(os.path.dirname(__file__), '../../output/logs', '%s_%s.log' %(run_code_name,time.strftime('%Y-%m-%d',time.localtime(time.time())))), \
    					level = GetLogLevel(log_level_key), 
    					format = '%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	tracker = msiip_ensemble_tracker(tagsets, dataset, args.LogBaseDir, args.config, args.slot_prob, args.value_prob)
	track = tracker.ensemble()

	# 'with' guarantees the track file is closed even if json.dump raises.
	with open(args.trackfile, "wb") as track_file:
		json.dump(track, track_file, indent=4)
示例#8
0
def main(argv):
	"""Find stop words in a labelled dataset and write them, with counts,
	to a UTF-8 file sorted by frequency (descending).
	"""
	# Read the shared configuration file.
	InitConfig()
	config = GetConfig()
	config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')])

	# Logging is configured elsewhere for this script; only the level key
	# and the script name are derived here.
	log_level_key = config.get('logging','level')
	run_code_name = os.path.basename(sys.argv[0])[0:-3]  # script name without '.py'

	parser = argparse.ArgumentParser(description='find stop words.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('stop_words_file', help='output stop words file')
	
	args = parser.parse_args()

	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)

	stop_words_count = find_stop_words(dataset)

	# 'with' guarantees the UTF-8 output file is closed even on error.
	with codecs.open(args.stop_words_file, 'w', 'utf-8') as output:
		sorted_stop_words = sorted(stop_words_count.items(), key=lambda x:x[1], reverse=True)
		for word,count in sorted_stop_words:
			print >>output, '%s\t%d' %(word, count)
示例#9
0
def main(argv):
    """Feed every utterance of a dataset to a conversation_extractor.

    The extractor is driven purely for its side effects; this function
    writes nothing itself beyond a progress trace on stderr.
    """
    parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.")
    parser.add_argument(
        "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze"
    )
    parser.add_argument(
        "--dataroot",
        dest="dataroot",
        action="store",
        required=True,
        metavar="PATH",
        help="Will look for corpus in <destroot>/<dataset>/...",
    )

    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)

    tracker = conversation_extractor()
    for call in dataset:
        tracker.reset()  # clear per-session extractor state
        for (utter, _) in call:
            # Progress trace: "<session_id>:<utter_index>".
            sys.stderr.write("%d:%d\n" % (call.log["session_id"], utter["utter_index"]))
            tracker.addUtter(utter)
示例#10
0
def main(argv):
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')

	args = parser.parse_args()
	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	track_file = open(args.trackfile, "wb")
	out_json = {'dataset':args.dataset, 'utterances':[]}
	extractor = sub_utters_extractor()
	for call in dataset:
		for (log_utter, label_utter) in call:
			sys.stderr.write('%d:%d\n'%(call.log['session_id'], log_utter['utter_index']))
			print '%d:%d'%(call.log['session_id'], log_utter['utter_index'])
			(transcript, sub_utters_list, sub_tag_list, speech_acts) = extractor.addUtter(log_utter,label_utter)
			if transcript:
				item = {}
				item['transcript'] = transcript
				item['sub_utters_list'] = sub_utters_list
				item['sub_tag_list'] = sub_tag_list
				item['speech_acts'] = speech_acts
				out_json['utterances'].append(item)
	json.dump(out_json, track_file, indent=4)
示例#11
0
def main(argv):
	"""Run the association-rule dialog state tracker over a dataset and
	write the track as a JSON file.
	"""
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--ar',dest='association_rules',action='store',required=True,metavar='JSON_FILE', help='association_rules')
	parser.add_argument('--stm',dest='semantic_tagger_model',action='store',required=True, help='semantic_tagger_model')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')
	parser.add_argument('--pt',dest='prob_threshold',type=float,action='store',default=0.8,help='prob_threshold')
	parser.add_argument('--exact',dest='exact',action='store_true',help='exact mode of fuzz mode')
	args = parser.parse_args()

	# Read the shared configuration file.
	InitConfig()
	config = GetConfig()
	config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')])

	# Set up logging: one log file per script per day.
	log_level_key = config.get('logging','level')
	run_code_name = os.path.basename(sys.argv[0])[0:-3]  # script name without '.py'
	logging.basicConfig(filename = os.path.join(os.path.dirname(__file__), '../../output/logs', '%s_%s.log' %(run_code_name,time.strftime('%Y-%m-%d',time.localtime(time.time())))), \
    					level = GetLogLevel(log_level_key), 
    					format = '%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	# --exact selects exact matching; the default is fuzzy matching.
	mode = 'exact' if args.exact else 'fuzzy'
	tracker = association_rule_tracker(tagsets, args.association_rules, args.semantic_tagger_model, args.prob_threshold, mode)
	for call in dataset:
		this_session = {"session_id":call.log["session_id"], "utterances":[]}
		tracker.reset()  # clear per-session tracker state
		for (utter,_) in call:
			# Progress trace: "<session_id>:<utter_index>".
			sys.stderr.write('%d:%d\n'%(call.log['session_id'], utter['utter_index']))
			tracker_result = tracker.addUtter(utter)
			if tracker_result is not None:
				this_session["utterances"].append(tracker_result)
		track["sessions"].append(this_session)
	track['wall_time'] = time.time() - start_time

	# 'with' guarantees the track file is closed even if json.dump raises.
	with open(args.trackfile, "wb") as track_file:
		json.dump(track, track_file, indent=4)
示例#12
0
def main():
	"""Run the tracker with externally predicted ranks and write the track.

	Reads per-turn 'rank_H3' predictions from --labelfile (one row per
	turn, in dataset order) and passes each rank to tracker.addTurn.
	"""
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
						help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
						help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE',
						help='File to write with tracker output')
	parser.add_argument('--labelfile',dest='labelfile',action='store',required=True,metavar='TXT',
						help='File with 2-way prediction results')
	parser.add_argument('--topK',dest='topK',action='store',type=int, help='get topK accuracy')
	
	args = parser.parse_args()
	
	# topK is shared with helpers elsewhere in this module via the global.
	global topK
	topK = args.topK
	
	# Extract the per-turn rank predictions from the 'rank_H3' column.
	head, body = fio.readMatrix(args.labelfile, True)
	rank_index = head.index('rank_H3')
	labels = [item[rank_index] for item in body]
	
	dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	# Global turn counter: label rows are aligned with turns across all calls.
	turn_count = -1
	
	tracker = Tracker()
	
	for call in dataset :
		this_session = {"session-id":call.log["session-id"], "turns":[]}
		tracker.reset()  # clear per-dialog tracker state
		for turn, _ in call :
			turn_count = turn_count + 1
			
			rank = labels[turn_count]
			
			tracker_turn = tracker.addTurn(turn, rank)
			this_session["turns"].append(tracker_turn)
		
		track["sessions"].append(this_session)
	track["wall-time"] = time.time() - start_time

	# The original opened the file without ever closing it; 'with' makes
	# the flush/close explicit and exception-safe.
	with open(args.trackfile, "wb") as track_file:
		json.dump(track, track_file,indent=4)
def main(argv):
    print argv

    parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.")
    parser.add_argument(
        "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze"
    )
    parser.add_argument(
        "--dataroot",
        dest="dataroot",
        action="store",
        required=True,
        metavar="PATH",
        help="Will look for corpus in <destroot>/<dataset>/...",
    )
    parser.add_argument(
        "--trackfile",
        dest="trackfile",
        action="store",
        required=True,
        metavar="JSON_FILE",
        help="File to write with tracker output",
    )
    parser.add_argument(
        "--ontology", dest="ontology", action="store", metavar="JSON_FILE", required=True, help="JSON Ontology file"
    )

    # args = parser.parse_args()
    args = parser.parse_args(argv)
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track_file = open(args.trackfile, "wb")
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    tracker = NaiveEnsembleBasedTrackerWithNBest(tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, _) in call:
            sys.stderr.write("%d:%d\n" % (call.log["session_id"], utter["utter_index"]))
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track["wall_time"] = elapsed_time

    json.dump(track, track_file, indent=4)

    track_file.close()
示例#14
0
def main(argv):
	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')

	args = parser.parse_args()
	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	track_file = open(args.trackfile, "wb")
	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	sub_seg_counter = 0
	topic_slot_counter = {}
	for topic in tagsets:
		topic_slot_counter[topic] = defaultdict(int)

	extractor = sub_segment_extractor()
	for call in dataset:
		this_session = {"session_id":call.log["session_id"], "sub_segments":[]}
		extractor.reset()
		for (log_utter, label_utter) in call:
			sys.stderr.write('%d:%d\n'%(call.log['session_id'], log_utter['utter_index']))
			if log_utter['segment_info']['target_bio'] == 'B':
				if not extractor.is_empty:
					sub_segment = extractor.state
					sub_segment['id'] = sub_seg_counter
					sub_seg_counter += 1
					this_session['sub_segments'].append(sub_segment)
					for slot in sub_segment['frame_label']:
						topic_slot_counter[sub_segment['topic']][slot] += 1

			extractor.addUtter(log_utter,label_utter)
		if not extractor.is_empty:
			sub_segment = extractor.state
			sub_segment['id'] = sub_seg_counter
			sub_seg_counter += 1
			this_session['sub_segments'].append(sub_segment)
		track["sessions"].append(this_session)
	end_time = time.time()
	elapsed_time = end_time - start_time
	track['wall_time'] = elapsed_time

	json.dump(track, track_file, indent=4)

	track_file.close()

	print json.dumps(topic_slot_counter, indent = 4)
def main():
    """Run the baseline or focus tracker over a dataset and write the track.

    --focus selects FocusTracker ("true") or plain Tracker ("false");
    any other value raises RuntimeError.
    """
    parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.")
    parser.add_argument(
        "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze"
    )
    parser.add_argument(
        "--dataroot",
        dest="dataroot",
        action="store",
        required=True,
        metavar="PATH",
        help="Will look for corpus in <destroot>/<dataset>/...",
    )
    parser.add_argument(
        "--trackfile",
        dest="trackfile",
        action="store",
        required=True,
        metavar="JSON_FILE",
        help="File to write with tracker output",
    )
    parser.add_argument(
        "--focus", dest="focus", action="store", nargs="?", default="False", const="True", help="Use focus node tracker"
    )
    args = parser.parse_args()
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    if args.focus.lower() == "true":
        tracker = FocusTracker()
    elif args.focus.lower() == "false":
        tracker = Tracker()
    else:
        # Call-style raise works in both Python 2 and 3 (the old
        # 'raise E, msg' statement form is Python-2-only).
        raise RuntimeError("Dont recognize focus=%s (must be True or False)" % (args.focus))
    for call in dataset:
        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()  # clear per-dialog tracker state
        for turn, _ in call:
            tracker_turn = tracker.addTurn(turn)
            this_session["turns"].append(tracker_turn)

        track["sessions"].append(this_session)
    track["wall-time"] = time.time() - start_time

    # The original opened the file without ever closing it; 'with' makes
    # the flush/close explicit and exception-safe.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file, indent=4)
示例#16
0
def main() :
    """Run the baseline or focus tracker over every dialog in the dataset
    and write the accumulated track as JSON.

    For each call, the tracker is reset and then fed the call's turns one
    by one; each turn's hypothesis is appended to the session record.
    """
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--focus',dest='focus',action='store',nargs='?',default="False",const="True",
                        help='Use focus node tracker')
    args = parser.parse_args()
    # One dataset_walker entry per dialog found in the dataset file.
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
    track = {"sessions":[]}
    track["dataset"]  = args.dataset
    start_time = time.time()

    if args.focus.lower() == "true":
        tracker = FocusTracker()
    elif args.focus.lower() == "false":
        tracker = Tracker()
    else:
        # Call-style raise works in both Python 2 and 3 (the old
        # 'raise E, msg' statement form is Python-2-only).
        raise RuntimeError('Dont recognize focus=%s (must be True or False)' % (args.focus))
    for call in dataset :
        this_session = {"session-id":call.log["session-id"], "turns":[]}

        # Reset per-dialog tracker state before processing its turns.
        tracker.reset()

        # Track the dialog state hypothesis turn by turn; addTurn holds
        # the core tracking logic.
        for turn, _ in call :
            tracker_turn = tracker.addTurn(turn)
            this_session["turns"].append(tracker_turn)
        
        # Store this dialog's per-turn results in the overall track.
        track["sessions"].append(this_session)
    track["wall-time"] = time.time() - start_time

    # The original opened the file without ever closing it; 'with' makes
    # the flush/close explicit and exception-safe.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file,indent=4)
示例#17
0
def main() :
    """Run the HWU rule-based tracker over a dataset and write the track.

    --original TRUE selects the original (Wang & Lemon, SigDial 2013)
    variant; any other value (or omission) uses the extended V2.0 rules.
    """
    print_gplv3()
    
    parser = argparse.ArgumentParser(description='HWU Rule-based Dialog State Tracker Baseline V2.0\n  by Zhuoran Wang\t [email protected]\n  This version extends the work in (Wang & Lemon, SigDial 2013).',\
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True,
                        help='The ontology to use')
    parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--original',dest='original',action='store',required=False,metavar='TRUE/FALSE',
                        help='Use the original version presented in (Wang & Lemon, SigDial 2013)')
    
    args = parser.parse_args()
    
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)
    
    # Only the literal (case-insensitive) string "true" enables the
    # original variant; anything else keeps the default.
    original = False
    if args.original and args.original.lower() == "true" :
        original = True
    
    load_ontology(args.ontology)
    
    track = {"sessions":[]}
    track["dataset"]  = args.dataset
    start_time = time.time()
    tracker = HWU_Tracker()
    
    for call in dataset :
        this_session = {"session-id":call.log["session-id"], "turns":[]}
        tracker.reset()  # clear per-dialog tracker state
        for turn, _ in call :
            tracker_turn = tracker.addTurn(turn,original)
            this_session["turns"].append(tracker_turn)
        
        track["sessions"].append(this_session)
    track["wall-time"] = time.time() - start_time

    # The original opened the file without ever closing it; 'with' makes
    # the flush/close explicit and exception-safe.
    with open(args.trackfile, "wb") as track_file:
        json.dump(track, track_file,indent=4)
示例#18
0
def main():
    """Combine per-goal SLU prediction files into top-K n-best lists.

    For each supplied goal prediction file, walks the dataset turn by turn,
    groups the prediction rows belonging to that turn's live ASR hypotheses,
    combines them into a single top-K list via getNbest(), and writes the
    result to '<goal_file>.<topK>.combine'.
    """
    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')

    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')

    parser.add_argument('--goal_area', dest='goal_area', action='store', required=False, metavar='TXT',
                        help='File with goal_area prediction results')
    parser.add_argument('--goal_food', dest='goal_food', action='store', required=False, metavar='TXT',
                        help='File with goal_food prediction results')
    parser.add_argument('--goal_name', dest='goal_name', action='store', required=False, metavar='TXT',
                        help='File with goal_name prediction results')
    parser.add_argument('--goal_pricerange', dest='goal_pricerange', action='store', required=False, metavar='TXT',
                        help='File with goal_pricerange prediction results')
    parser.add_argument('--topK', dest='topK', action='store', type=int, help='get topK accuracy')

    args = parser.parse_args()

    # getNbest() reads the module-level topK, so publish the CLI value there.
    global topK
    topK = args.topK

    for goal in [args.goal_area, args.goal_food, args.goal_name, args.goal_pricerange]:
        if goal is None:  # was `== None`; identity comparison is the correct idiom
            continue

        head, body = fio.readMatrix(goal, True)

        dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot)

        # Prediction rows are stored one per ASR hypothesis, in turn order;
        # nbest_count tracks the current read position within `body`.
        nbest_count = 0
        nbest = []

        for call in dataset:
            for turn, _ in call:
                n_asr_live = len(turn['input']['live']['asr-hyps'])
                # Slice out this turn's hypothesis rows (the original added a
                # redundant [:] copy of the slice) and combine them.
                combinedgoals = getNbest(body[nbest_count:nbest_count + n_asr_live], topK)
                nbest.append(combinedgoals)
                nbest_count += n_asr_live

        fio.writeMatrix(goal + '.' + str(topK) + ".combine", nbest, head)
示例#19
0
def main(argv):
    """Dump dataset turns (and optionally labels) through a vectorizer to JSON.

    Walks every call in the requested dataset, feeds each turn to a
    vectorizer instance, and writes the collected per-session data points
    to the JSON file named by --datafile.
    """
    #
    # CMD LINE ARGS
    #
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze, for example train1 or test2 or train3a')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--datafile', dest='datafile', action='store', required=True, metavar='JSON_FILE',
                        help='File to write output')
    parser.add_argument('--label', dest='label', action='store', required=True, metavar='BOOL',
                        help='load labels')

    args = parser.parse_args()

    # --label is a TRUE/FALSE string; anything other than 'true' means False.
    # (The original assignment was tab-indented inside a space-indented block,
    # which is an IndentationError under Python 3.)
    label = args.label.lower() == 'true'

    dataset = dataset_walker(args.dataset, dataroot=args.dataroot, labels=label)

    data = {"sessions": [], "dataset": args.dataset}

    vector = vectorizer()

    for call in dataset:
        this_session = {"session-id": call.log["session-id"], "turns": []}
        vector.reset()
        for turn, labels in call:
            data_point = vector.addTurn(turn, labels)
            this_session["turns"].append(data_point)

        data["sessions"].append(this_session)

    # json.dump emits text, so open in text mode ("wb" breaks under Python 3)
    # and let the context manager guarantee the file is closed.
    with open(args.datafile, "w") as datafile:
        json.dump(data, datafile, indent=4)
 def constructDoc2vec(nameDataS2V=["dstc4_train","dstc4_dev"], dataPath="data", NameLearnedFile="LearnedDoc2Vec.d2v"):
     """Build and save a Doc2Vec model over the utterances of the given dialogs.

     Each utterance becomes one TaggedDocument whose tag encodes its session
     and utterance index ("s<sessionID>u<utteranceIndex>") and whose words are
     the regularised bag-of-words of the transcript.

     :param nameDataS2V: list of source dialog dataset names to learn from.
         NOTE(review): mutable default argument -- harmless here because it is
         never mutated, but callers should not rely on its identity.
     :param dataPath: root directory containing the dialog corpora.
     :param NameLearnedFile: output path for the learned Doc2Vec model.

     NOTE(review): the double-underscore call below is subject to name
     mangling, so it only resolves if this function is itself defined inside
     LSTMWithBOWTracker -- confirm against the enclosing file.
     """
     #nameDataS2V list of source dialogue data
     #Name of the file for learned doc2vecs.
     
     #label=s<sessionID>u(utteranceIndex)
     #words given bow
     
     print "Start to construct doc2vec from given dialogs."
     print nameDataS2V
     #Make input to doc to vec
     #-Load data
     dataS2V=[]
     for nameData in nameDataS2V:
         dataS2V.append(dataset_walker.dataset_walker(nameData,dataroot=dataPath,labels=True))
     lSentences=[]
     for dataset in dataS2V:
         print dataset
         for call in dataset:
             sid=call.log["session_id"]
             for (uttr,_) in call:
                 uid=uttr["utter_index"]
                 # Tag uniquely identifies this utterance across all datasets.
                 label="s"+str(sid)+"u"+str(uid)#
                 #print label
                 # copy.copy presumably protects the stored transcript from
                 # in-place edits by the BOW regulariser -- TODO confirm.
                 words=LSTMWithBOWTracker.__getRegurelisedBOW(copy.copy(uttr["transcript"]))
                 #print words
                 lSentences.append(TaggedDocument(words=words,tags=[label]))
     #Learn
     #reference: http://rare-technologies.com/doc2vec-tutorial/
     # Cap the worker count at the tracker's configured maximum.
     numMaxCPU=multiprocessing.cpu_count()
     if numMaxCPU > LSTMWithBOWTracker.D2V_MAXNUMCPU:
         print "As number of  CPU is exceeded, it rescaled into " + str(LSTMWithBOWTracker.D2V_MAXNUMCPU)
         numMaxCPU=LSTMWithBOWTracker.D2V_MAXNUMCPU
     print "Lern doc2vec with " + str(numMaxCPU)+" CPUs."
     model = Doc2Vec(size=LSTMWithBOWTracker.D2V_VECTORSIZE,workers=numMaxCPU,min_count=0)  # use fixed learning rate
     model.build_vocab(lSentences)
     # Manual annealing loop: shrink alpha by 0.5% per epoch and pin
     # min_alpha to it so each train() pass runs at a fixed learning rate.
     for epoch in range(LSTMWithBOWTracker.D2V_MAXITERATION):
         model.train(lSentences)
         model.alpha *= 0.995# decrease the learning rate
         model.min_alpha = model.alpha  # fix the learning rate, no decay    pass
         print str(epoch)+ "/"+ str(LSTMWithBOWTracker.D2V_MAXITERATION) + "(epoch/max epoch)"
     model.save(NameLearnedFile)
     print "Doc2Vec was constructed with:" 
     print dataS2V
     print ", stored to " + NameLearnedFile    
示例#21
0
def main(argv):
    """Check the validity of a system output file for the SLG/SAP task.

    Loads the reference sessions and the system's JSON output, then runs
    TrackChecker over them and prints any errors found.
    """
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(description='Check the validity of a system output for SAP task.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/...')
    parser.add_argument('--jsonfile', dest='jsonfile', action='store', metavar='JSON_FILE', required=True, help='File containing JSON output')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE', 'TOURIST'], required=True, help='Target role')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=False, task='SLG', roletype=args.roletype.lower())
    # Close the output file promptly instead of leaking the handle
    # (the original json.load(open(...)) never closed it).
    with open(args.jsonfile) as json_file:
        system_output = json.load(json_file)

    checker = TrackChecker(sessions, system_output, args.roletype)
    checker.check()
    checker.print_errors()
def main(argv):
    #TODO implementation
    #Confirmation hypothesis about data
    tagsets = ontology_reader.OntologyReader("scripts/config/ontology_dstc4.json").get_tagsets()
    datasetTrain = dataset_walker.dataset_walker("dstc4_train",dataroot="data",labels=True)
    datasetDev = dataset_walker.dataset_walker("dstc4_dev",dataroot="data",labels=True)
    print "Calculate statics of dialog. "
    #-Is number of value in each slot is always 1 if it exist? i.e., it does not contain multiple value?
    #-There are many multiple value
    isEnumerateMultiValueCase=True
    isEnumerateMultiSlotCase=True       
    countMultipleValueInOneSlot=0
    #
    maxSlotValueTrain={}
    countMultipleSlot=0
    for call in datasetTrain:
        for (uttr,label) in call:
            if "frame_label" in label:
                if isEnumerateMultiSlotCase:
                    if len(label["frame_label"].keys()) > 1:
                        print label["frame_label"].keys()
                        countMultipleSlot+=1
                for slot in label["frame_label"].keys():
                    if isEnumerateMultiValueCase:
                        if slot not in maxSlotValueTrain:
                            maxSlotValueTrain[slot]=len(label["frame_label"][slot])
                        else:
                            if maxSlotValueTrain[slot] < len(label["frame_label"][slot]):
                                maxSlotValueTrain[slot] = len(label["frame_label"][slot])
                        if len(label["frame_label"][slot]) > 1:
                            print "slot=" + slot + ":",
                            print label["frame_label"][slot]
                            countMultipleValueInOneSlot+=1                            
    
    for call in datasetDev:
        for (uttr,label) in call:
            if "frame_label" in label:
                if isEnumerateMultiSlotCase:
                    if len(label["frame_label"].keys()) > 1:
                        print label["frame_label"].keys()
                        countMultipleSlot+=1
                for slot in label["frame_label"].keys():
                    if isEnumerateMultiValueCase:
                        if slot not in maxSlotValueTrain:
                            maxSlotValueTrain[slot]=len(label["frame_label"][slot])
                        else:
                            if maxSlotValueTrain[slot] < len(label["frame_label"][slot]):
                                maxSlotValueTrain[slot] = len(label["frame_label"][slot])
                        if len(label["frame_label"][slot]) > 1:
                            print "slot=" + slot + ":",
                            print label["frame_label"][slot]
                            countMultipleValueInOneSlot+=1
    if isEnumerateMultiValueCase:
        print "Number of multiple value situation = " + str(countMultipleValueInOneSlot)
        avr=0.0
        for slot in maxSlotValueTrain.keys():
            avr+=(float)(maxSlotValueTrain[slot])
        avr/=float(len(maxSlotValueTrain.keys()))
        maxSlotValueTrain["AverageNumber"]=int(round(avr))
        print "Number of max slot value per slot:"
        print maxSlotValueTrain
        
    if isEnumerateMultiSlotCase:
        print "Number of multiple slot situation = " + str(countMultipleSlot)        
    #-How many OOV case?
    #-Train -> dev: 1195, Dev->Train: 4789
    #-With additional text normalizing, Train -> dev: 937, Dev->Train: 3643
    #-With additional normalization Train -> dev: 831, Dev->Train: 3237
    isCountNumberofOOVCase=False
    dictVocabInTrain={}
    dictVocabInDev={}
    numberOfOOVCaseInTrain2Dev=0
    numberOfOOVCaseInDev2Train=0
    if isCountNumberofOOVCase:
        for call in datasetTrain:
            for (uttr,label) in call:
                trans=uttr["transcript"]
                transt=re.sub("\,","",trans)
                transt=re.sub("\?","",transt)
                transt=re.sub("\.","",transt)
                transt=re.sub("(%.+ )?","",transt)
                #Additional normalize
                transt=re.sub("(%.+$)?","",transt)
                transt=re.sub("%","",transt)
                transt=re.sub("(-|~)"," ",transt)
                transt=re.sub("\!","",transt)
                transt=re.sub("'"," ",transt)
                transt=re.sub("\"","",transt)
                #
                transt=re.sub("/","",transt)
                transt=re.sub("[1-9]+","Replacedval",transt)
                transt=transt.lower()
                                    
                words=transt.split(" ")
                for word in words:
                    #Additional normalization
                    lmtr=nltk.stem.wordnet.WordNetLemmatizer()
                    word=lmtr.lemmatize(word)

                    dictVocabInTrain[word]=0
        for call in datasetDev:
            for (uttr,label) in call:
                trans=uttr["transcript"]
                transt=re.sub("\,","",trans)
                transt=re.sub("\?","",transt)
                transt=re.sub("\.","",transt)
                transt=re.sub("(%.+ )?","",transt)
                #Additional normalize
                transt=re.sub("(%.+$)?","",transt)
                transt=re.sub("%","",transt)
                transt=re.sub("(-|~)"," ",transt)
                transt=re.sub("\!","",transt)
                transt=re.sub("'"," ",transt)
                transt=re.sub("\"","",transt)
                #
                transt=re.sub("/","",transt)
                transt=re.sub("[1-9]+","Replacedval",transt)
                transt=transt.lower()
                
                words=transt.split(" ")
                for word in words:
                    #Additional normalization
                    lmtr=nltk.stem.wordnet.WordNetLemmatizer()
                    word=lmtr.lemmatize(word)

                    if word not in dictVocabInTrain:
                        print word.encode("utf-8")
                        numberOfOOVCaseInTrain2Dev+=1
        print "Number of OOV case in Train -> Dev situation = " + str(numberOfOOVCaseInTrain2Dev)
        print "\n\n\n\n\n"
        for call in datasetDev:
            for (uttr,label) in call:
                trans=uttr["transcript"]
                transt=re.sub("\,","",trans)
                transt=re.sub("\?","",transt)
                transt=re.sub("\.","",transt)
                transt=re.sub("(%.+ )?","",transt)
                #Additional normalize
                transt=re.sub("(%.+$)?","",transt)
                transt=re.sub("%","",transt)
                transt=re.sub("(-|~)"," ",transt)
                transt=re.sub("\!","",transt)
                transt=re.sub("'"," ",transt)
                transt=re.sub("\"","",transt)
                #
                transt=re.sub("/","",transt)
                transt=re.sub("[1-9]+","Replacedval",transt)                    
                transt=transt.lower()

                words=transt.split(" ")
                for word in words:
                    #Additional normalization
                    lmtr=nltk.stem.wordnet.WordNetLemmatizer()
                    word=lmtr.lemmatize(word)
                    
                    dictVocabInDev[word]=0
        for call in datasetTrain:
            for (uttr,label) in call:
                trans=uttr["transcript"]
                transt=re.sub("\,","",trans)
                transt=re.sub("\?","",transt)
                transt=re.sub("\.","",transt)
                transt=re.sub("(%.+ )?","",transt)
                #Additional normalize
                transt=re.sub("(%.+$)?","",transt)
                transt=re.sub("%","",transt)
                transt=re.sub("(-|~)"," ",transt)
                transt=re.sub("\!","",transt)
                transt=re.sub("'"," ",transt)
                transt=re.sub("\"","",transt)
                #
                transt=re.sub("/","",transt)
                transt=re.sub("[1-9]+","Replacedval",transt)
                transt=transt.lower()

                words=transt.split(" ")
                for word in words:
                    #Additional normalization
                    lmtr=nltk.stem.wordnet.WordNetLemmatizer()
                    word=lmtr.lemmatize(word)

                    if word not in dictVocabInDev:
                        print word.encode("utf-8")
                        numberOfOOVCaseInDev2Train+=1            
        print "Number of OOV case in Dev -> Train situation = " + str(numberOfOOVCaseInDev2Train)
        
    #-How many frame_label are unseen between train and dev data?
    #-So many, train -> dev 96/313 (unseen/all in dev), dev -> train 346/563 (unseen/all in train)
    isCountUnseenframeLabel=False
    dictTopicSlotValueTrain=[]
    numUnseenframeLabel=0
    alreadychecked=[]
    dictTopicSlotValueDev={}
    if isCountUnseenframeLabel:
        for call in datasetTrain:
            for (uttr,label) in call:
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        for value in label["frame_label"][slot]:
                            dictTopicSlotValueTrain.append(slot+value)
        for call in datasetDev:
            for (uttr,label) in call:
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        for value in label["frame_label"][slot]:
                            dictTopicSlotValueDev[(slot+value)]=0
                            if (slot+value) not in dictTopicSlotValueTrain:
                                if (slot+value) not in alreadychecked:
                                    numUnseenframeLabel+=1
                                    alreadychecked.append((slot+value))
        print "Number of Unseen label train -> dev = " + str(numUnseenframeLabel)
        print "Ratio (unseen/all in dev) = " + str(numUnseenframeLabel) + "/" + str(len(dictTopicSlotValueDev.keys()))

        dictTopicSlotValueDev=[]
        numUnseenframeLabel=0
        alreadychecked=[]
        dictTopicSlotValueTrain={}
        for call in datasetDev:
            for (uttr,label) in call:
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        for value in label["frame_label"][slot]:
                            dictTopicSlotValueDev.append(slot+value)
        for call in datasetTrain:
            for (uttr,label) in call:
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        for value in label["frame_label"][slot]:
                            dictTopicSlotValueTrain[(slot+value)]=0
                            if (slot+value) not in dictTopicSlotValueDev:
                                if (slot+value) not in alreadychecked:
                                    numUnseenframeLabel+=1
                                    alreadychecked.append((slot+value))
        print "Number of Unseen label dev -> train = " + str(numUnseenframeLabel)
        print "Ratio (unseen/all in train) = " + str(numUnseenframeLabel) + "/" + str(len(dictTopicSlotValueTrain.keys()))
示例#23
0
def main():
    """Run the selected dialog state tracker over a dataset and dump its output.

    Parses the CLI arguments, instantiates the tracker named by --tracker
    (ontology-backed trackers get the parsed --ontology JSON), feeds every
    turn of every call to it, and writes the collected per-session results
    plus wall time to --trackfile as JSON.
    """
    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',
                        dest='trackfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--focus',
                        dest='focus',
                        action='store',
                        nargs='?',
                        default="False",
                        const="True",
                        help='Use focus node tracker')
    parser.add_argument(
        '--config',
        dest='config',
        action='store',
        required=True,
        metavar='TRUE/FALSE',
        help='The path of the config folder containing the .flist files')
    parser.add_argument('--tracker',
                        dest='tracker',
                        action='store',
                        nargs='?',
                        default="LearnedTracker",
                        help='Tracker to use')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='The ontology to use')

    args = parser.parse_args()

    # Opens data set file and stores it in dataset object
    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            config_folder=args.config)

    # Choose the tracker implementation.  Trackers other than the two
    # hand-crafted baselines all take the parsed ontology JSON.
    name = args.tracker.lower()
    if name == "tracker":
        tracker = Tracker()
    elif name == "focustracker":
        tracker = FocusTracker()
    else:
        ontology_trackers = {
            "customtracker": CustomTracker,
            "berttracker": BertTracker,
            "learnedtracker": LearnedTracker,
            "bandittracker": BanditTracker,
            "bandittrackertf": BanditTrackerTF,
            "simpletracker": SimpleTracker,
        }
        if name not in ontology_trackers:
            # The original fell through and crashed later with a confusing
            # NameError on `tracker`; fail fast with a clear message instead.
            raise ValueError("Unknown tracker: %s" % args.tracker)
        # Close the ontology file promptly instead of leaking the handle.
        with open(args.ontology) as ontology_file:
            ontology = json.load(ontology_file)
        tracker = ontology_trackers[name](ontology)

    track = {"sessions": [], "dataset": args.dataset}

    start_time = time.time()

    # Open the track file up front (as the original did, so a bad path fails
    # early) and let the context manager close it -- the original never did.
    with open(args.trackfile, "w") as track_file:
        # Iterates over every call in the dataset
        for call in dataset:
            this_session = {"session-id": call.log["session-id"], "turns": []}
            tracker.reset()

            # Iterates over every turn in a call
            for turn, _ in call:
                tracker_turn = tracker.addTurn(turn, call.log["session-id"])
                this_session["turns"].append(tracker_turn)

            track["sessions"].append(this_session)

        track["wall-time"] = time.time() - start_time

        json.dump(track, track_file, indent=4)
def errorAnalysis(argv):
    """Run NaiveEnsembleBasedTrackerWithNBest over a labeled dataset and print,
    per utterance, which gold frame-label slots/values are missing from or
    duplicated in the tracker's output.

    :param argv: raw CLI argument list handed to argparse.

    NOTE(review): --trackfile is parsed and `track` (sessions + wall_time) is
    assembled, but nothing is ever written to the file here -- confirm
    whether a final json.dump was intended.
    NOTE: the misspellings "exsisted"/"repudant" below are in runtime output
    strings and are preserved verbatim.
    """
    print "ERROR ANALYSIS OF NAIVEENSEMBLER"
    print argv

    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',
                        dest='trackfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='JSON Ontology file')

    #args = parser.parse_args()
    args = parser.parse_args(argv)
    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    tracker = NaiveEnsembleBasedTrackerWithNBest(
        tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, label) in call:
            # Preprocessing ("mae shori" step 2): a 'B' BIO tag marks the
            # start of a new sub-dialogue segment.
            if utter['segment_info']['target_bio'] == 'B':
                print "\n -----New sub-dialogue----------------------------------------------------"
            print "s:" + str(call.log['session_id']) + " u:" + str(
                utter['utter_index'])
            print "Input=" + utter["transcript"]
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
                #
                print "Tracker's output:"
                print tracker_result
                # Compare the tracker output against the gold frame label.
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        if (slot not in tracker_result["frame_label"]):
                            # Entire gold slot missing -> every value is missing.
                            print "-slot [" + slot + "] is not exsisted in output"
                            for value in label["frame_label"][slot]:
                                print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
                        else:
                            if len(label["frame_label"][slot]) != len(
                                    tracker_result["frame_label"][slot]):
                                #In case value in output, but redundant
                                print "-slot [" + slot + "] include repudant values"
                            for value in label["frame_label"][slot]:
                                #In case value not in output
                                if (value not in tracker_result["frame_label"]
                                    [slot]):
                                    print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time
示例#25
0
def main(argv):
    """Score belief-tracker output against a labelled dataset and write CSV stats.

    Loads the tracker JSON (--trackfile), walks the labelled sessions in
    lockstep, and accumulates one stat object per (state component, schedule,
    label scheme) combination, then dumps every result as CSV rows plus a few
    'basic' summary rows (wall time, session/turn counts).

    NOTE: SCHEDULES, LABEL_SCHEMES, the Stat_* classes, normalise_dist and the
    misc helpers are defined elsewhere in this file. Python 2 syntax
    (`print >>`) — do not run under Python 3 without porting.
    """
    
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')

    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')

    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,
                        help='File containing score JSON')
    parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True,
                        help='File to write with CSV scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,
                        help='JSON Ontology file')
    parser.add_argument('--rocdump',dest='rocdump',action='store',metavar='FILE_STEM',
                        help='If present, use this file stem to write out ROC plot data: filestem.<schedule>.<slot>.<type>.csv, where type is either roc (which contains the ROC curve coordinates) or scores (which contains the raw scores used to compute the ROC curves).')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    tracker_output = json.load(open(args.scorefile))
    ontology = json.load(open(args.ontology))
    
    slots_informable  = ontology["informable"].keys()
    slots_requestable = ontology["requestable"]
    
    csvfile = open(args.csv,'w')
    
    # what stats are there?
    # Each entry is ((component, [slot]), (schedule, label_scheme), stat_instance).
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Probs, Stat_MRR, Stat_Updates, Stat_ROC]
    
    for schedule in SCHEDULES:
        for label_scheme in LABEL_SCHEMES:
            for component in ['goal','requested', 'method', 'all']:
                if component == 'goal' :
                    # per-slot goals plus the aggregated 'all' and the two joint variants
                    for slot in slots_informable + ['all','joint','joint_independent'] :
                        for stat_class in stat_classes:
                            stats.append((('goal', slot), (schedule, label_scheme), stat_class()))
                
                        
                elif component == 'requested' :
                    # requested slots are only scored under label scheme "a"
                    if label_scheme != "a" :
                        continue
                    for slot in slots_requestable + ['all'] :
                        for stat_class in stat_classes:
                            stats.append((('requested', slot), (schedule, label_scheme), stat_class()))
                            
                elif component == 'method' :
                    for stat_class in stat_classes:
                        stats.append((('method',), (schedule, label_scheme), stat_class()))
                            
                elif component == 'all' :
                    for stat_class in stat_classes:
                        stats.append((('all',), (schedule, label_scheme), stat_class()))
                     
    
    turn_counter = 0.0
    
    for session_num, (session_tracker, session) in enumerate(zip(tracker_output['sessions'], sessions)):
        
      for _, _, stat_class in stats:
          stat_class.newDialog()
            
      session_id = session.log['session-id']
      try:
        
        # these are the set of slots 'mentioned so far', i.e. for schedule2
        S = defaultdict(lambda : set([]))
        S_requested = set([])
        
        session_length = len(session)
        
        goal_labels_b, method_labels_b = misc.LabelsB(session, ontology)
        method_schedule_2 = False # whether schedule 2 is active for method
        
        for turn_num, ((log_turn,label_turn),_tracker_turn) in enumerate(zip(session,session_tracker['turns'])):
            turn_counter += 1.0
            S_new = misc.S(log_turn, ontology)
            
            for slot in S_new :
                S[slot] = S[slot].union(S_new[slot])
                
            # remove just informed slots from S_requested
            S_requested = S_requested.difference(misc.SysInformed(log_turn))
            # add in ones from slu hyps
            S_requested = S_requested.union(set(misc.S_requested(log_turn)))

            # normalise the tracker's per-slot goal distributions; slots the
            # tracker did not report default to all mass on None
            tracker_goal_labels = _tracker_turn["goal-labels"]
            for slot in slots_informable:
                if slot in tracker_goal_labels :
                    tracker_goal_labels[slot] = normalise_dist(tracker_goal_labels[slot].items(), (session_id, turn_num, "goal."+slot))
                else :
                    tracker_goal_labels[slot] = [(None, 1.0)]
            
            
            # prepare for joint goals scoring:
            tracker_goal_joint_labels = "independent"
            if "goal-labels-joint" in _tracker_turn :
                tracker_goal_joint_labels = _tracker_turn["goal-labels-joint"]
                
            if tracker_goal_joint_labels != "independent" :
                # tracker_goal_joint_labels must be a list of joint hyps
                tracker_goal_joint_labels = [(hyp["slots"], hyp["score"]) for hyp in tracker_goal_joint_labels]
                tracker_goal_joint_labels = normalise_dist(tracker_goal_joint_labels, (session_id, turn_num, "goal.joint"))
            
            # also gather the correct joint label
            true_goal_joint = None
            for slot in label_turn["goal-labels"]:
                if true_goal_joint == None :
                    true_goal_joint = {}
                true_goal_joint[slot] = label_turn["goal-labels"][slot]
            
            # same, but under label scheme "b"
            true_goal_joint_b = None
            for slot in goal_labels_b[turn_num]:
                if true_goal_joint_b == None :
                    true_goal_joint_b = {}
                true_goal_joint_b[slot] = goal_labels_b[turn_num][slot]
            
            
            # expand each requested-slot confidence into a (True, p)/(False, 1-p) distribution
            tracker_requested_slots = _tracker_turn["requested-slots"]
            for slot in tracker_requested_slots:
                dist = [(True, tracker_requested_slots[slot]), (False,1.0-tracker_requested_slots[slot])]
                tracker_requested_slots[slot] = normalise_dist(dist, (session_id, turn_num, "requested."+slot))
            
            tracker_method_label = normalise_dist(_tracker_turn["method-label"].items(), (session_id, turn_num,"method"))
            
            # for method schedule 2, work out whether any slu-hyp has been given
            # which informs the method:
            
            if not method_schedule_2 :
                mact = log_turn["output"]["dialog-acts"]
                for slu_hyp in log_turn["input"]["live"]["slu-hyps"] :
                    user_act = slu_hyp["slu-hyp"]
                    method_label = misc.MethodLabel(user_act, mact)
                    if method_label != "none" :
                        method_schedule_2 = True
                        break
                    
            # feed this turn into every stat whose schedule applies
            for component, (schedule, label_scheme), stat_class in stats:
                if component[0] == "goal" and (component[1] == "joint" or  component[1] == "joint_independent"):
                    if schedule == 2:
                        # calculate schedule2 applicability
                        applies = False
                        for slot in slots_informable:
                            if len(S[slot]) > 0:
                                applies = True
                                break
                        if not applies :
                            continue
                        
                    this_true_label = true_goal_joint
                    if label_scheme == "b" :
                        this_true_label = true_goal_joint_b
                    
                    if tracker_goal_joint_labels == "independent" or component[1] == "joint_independent" :
                        stat_class.add(tracker_goal_labels, this_true_label, (session_id, turn_num, component, schedule, label_scheme), independent=True)
                    else :
                        stat_class.add(tracker_goal_joint_labels, this_true_label, (session_id, turn_num, component, schedule, label_scheme))
                
                if (component[0] == "goal" or component[0] == "all") and (len(component)==1 or ("joint" not in component[1])) :
                    if component[0] == "all" or component[1] == "all" :
                        slots = slots_informable[:]
                    else :
                        slots = [component[1]]
                    for slot in slots:
                        # schedule 2 only scores slots mentioned so far
                        if schedule ==2 and len(S[slot]) == 0 :
                            continue
                        dist = tracker_goal_labels[slot]
                        
                        true_label = None
                        if slot in label_turn["goal-labels"] :
                            true_label = label_turn["goal-labels"][slot]
                            
                        if label_scheme == "b" :
                            true_label = None
                            if slot in goal_labels_b[turn_num] :
                                true_label = goal_labels_b[turn_num][slot]
                            
                        stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
                
                
                if component[0] == "requested" or component[0] == "all" :
                    if  component[0] == "all" or  component[1] == "all":
                        slots = slots_requestable[:]
                    else :
                        slots = [component[1]]
                    for slot in slots:
                        if schedule ==2 and (slot not in S_requested):
                            continue
                        # default: definitely not requested
                        dist =  [(False,1.0), (True,0.0)]
                        if  slot in tracker_requested_slots :
                            dist = tracker_requested_slots[slot]
                        
                        true_label = (slot in label_turn["requested-slots"])                        
                        stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
                        
                        
                if component[0] == "method" or component[0] == "all":
                    if schedule == 2 and not method_schedule_2:
                        continue # no slu hyp informing the method has been given yet.
                    dist = tracker_method_label
                    true_label =  label_turn["method-label"]
                    if label_scheme == "b" :
                        true_label = method_labels_b[turn_num]
                        
                        
                    stat_class.add(dist, true_label, (session_id, turn_num, component, schedule, label_scheme))
      except KeyboardInterrupt :
          raise
      except:
          # log the failing session and carry on scoring the rest
          traceback.print_exc(file=sys.stdout)
          print "While scoring " + str(session_id)
    # output to csv
    print >>csvfile,( "state_component, stat, schedule, label_scheme, N, result")
    
    for stat in stats:
        component, (schedule, label_scheme), stat_class = stat
        results = stat_class.results()
        for stat_subname, N, result in results:
            if result == None :
                result = "-"
            else :
                result = "%.7f"%result
            print >>csvfile,( "%s, %s, %i, %s, %i, %s"%(".".join(component), stat_subname, schedule, label_scheme, N, result))
        if isinstance(stat_class, Stat_ROC) and (args.rocdump):
            rocfile = args.rocdump + '.schedule' + str(schedule) + str(label_scheme)+'.' + (".".join(component)) + '.roc.csv'
            scoresfile = args.rocdump + '.schedule' + str(schedule) + str(label_scheme)+'.' + (".".join(component)) + '.scores.csv'
            stat_class.DumpROCToFile(rocfile)
            stat_class.DumpScoresToFile(scoresfile)
        
    # 'basic' summary rows
    print >>csvfile,'basic,total_wall_time,,,,%s' % (tracker_output['wall-time'])
    print >>csvfile,'basic,sessions,,,,%s' % (len(sessions))
    print >>csvfile,'basic,turns,,,,%i' % (int(turn_counter))
    print >>csvfile,'basic,wall_time_per_turn,,,,%s' % (tracker_output['wall-time'] / turn_counter)
    print >>csvfile,'basic,dataset,,,,%s' % (tracker_output['dataset'] )

    csvfile.close()
示例#26
0
def main(argv):
    """Score SLU output (speech acts and semantic tags) against labels.

    Walks the labelled sessions alongside the system's JSON output, keeps only
    the utterances spoken by the requested role, feeds each one to the
    eval_acts / eval_semantics scorers, and writes the accumulated
    precision/recall figures to a CSV file.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(os.path.join(install_path, 'lib'))
    from dataset_walker import dataset_walker
    from stat_classes import Stat_Precision_Recall
    from eval_func import eval_acts, eval_semantics

    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLU system.')
    parser.add_argument('--dataset', dest='dataset', required=True,
                        action='store', metavar='DATASET',
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', required=True,
                        action='store', metavar='PATH',
                        help='look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--jsonfile', dest='jsonfile', required=True,
                        action='store', metavar='JSON_FILE',
                        help='File containing JSON output')
    parser.add_argument('--ontology', dest='ontology', required=True,
                        action='store', metavar='JSON_FILE',
                        help='JSON Ontology file')
    parser.add_argument('--roletype', dest='roletype', required=True,
                        action='store', choices=['GUIDE', 'TOURIST'],
                        help='Target role')
    parser.add_argument('--scorefile', dest='scorefile', required=True,
                        action='store', metavar='JSON_FILE',
                        help='File to write with CSV scoring data')

    args = parser.parse_args()

    sessions = dataset_walker(
        args.dataset, dataroot=args.dataroot, labels=True)

    system_output = json.load(open(args.jsonfile))

    # one precision/recall accumulator per subtask and schedule
    stats = {
        'semantic_tagged': {
            'detection': Stat_Precision_Recall(),
            'class': Stat_Precision_Recall(),
            'all': Stat_Precision_Recall(),
        },
        'speech_act': {
            'act': Stat_Precision_Recall(),
            'all': Stat_Precision_Recall(),
        },
    }

    # argparse restricts roletype to GUIDE/TOURIST, so this mapping is total
    target_speaker = {'GUIDE': 'Guide', 'TOURIST': 'Tourist'}[args.roletype]

    for session, track_session in zip(sessions, system_output["sessions"]):
        # keep only the utterances spoken by the target role
        kept_logs = []
        kept_labels = []
        for log_utter, translations, label_utter in session:
            if log_utter['speaker'] == target_speaker:
                kept_logs.append(log_utter)
                kept_labels.append(label_utter)

        # score each kept utterance against the system's hypothesis for it
        for log_utter, label_utter, track_utter in zip(
                kept_logs, kept_labels, track_session["utterances"]):
            for subtask in stats:
                if subtask == 'speech_act':
                    eval_acts(label_utter['speech_act'],
                              track_utter['speech_act'], stats[subtask])
                elif subtask == 'semantic_tagged':
                    eval_semantics(' '.join(label_utter['semantic_tagged']),
                                   track_utter['semantic_tagged'],
                                   stats[subtask])

    csvfile = open(args.scorefile, 'w')
    csvfile.write("task, subtask, schedule, stat, N, result\n")
    for subtask in stats:
        for schedule in stats[subtask]:
            for measure, N, result in stats[subtask][schedule].results():
                csvfile.write("%s, %s, %s, %s, %i, %s\n" % (
                    'SLU', subtask, schedule, measure, N, result))
    csvfile.close()
示例#27
0
def main(argv):
    """Prepare DSTC5 train/test instances with dialogue context and run the CNN SLU task.

    Walks the train set using transcripts and the test set using English
    translation hypotheses, pairs each utterance with the other speaker's most
    recent cumulated block as context, binarizes the speech-act labels, splits
    everything by speaker role (tourist vs guide), loads pre-trained word
    embeddings, and hands both role-specific splits to run_slu_task.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # each instance: (utterance, context string, speaker, sa labels,
    #                 utter index, context sa labels)
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        # context_* accumulate the current speaker's run of utterances;
        # context_utter_str / context_label hold the PREVIOUS speaker's block
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            # flatten speech acts into sorted, de-duplicated ACT_attribute labels
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            # speaker changed: freeze the cumulated run as the new context
            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            train_utters += [
                (transcript, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            # test side uses the top translation hypothesis instead of the
            # transcript; missing/empty translations fall back to ''
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    # NOTE(review): the train loop resets this to '<PAD/>'
                    # here, not '' — confirm whether the mismatch is intended
                    context_utter_str = ''
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            test_utters += [
                (translation, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary (from the padded training context utterances)
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input (word-id matrices for utterances and their contexts)
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                     vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels: multi-hot encode both the utterance labels and the
    # context labels with a single binarizer fitted on train + test
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]

    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[2].lower() == 'guide'
    ]

    # shuffle the index lists so the training order is randomized; the same
    # permutation is applied to inputs, contexts and labels below
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # run the task once per role
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels, guide_test_inputs,
                 guide_test_ctx_inputs, guide_test_labels,
                 guide_test_ctx_labels)

    print("")
示例#28
0
def main(argv):
    """Score SLU decoder output, run the baseline focus tracker on it, and
    score the resulting beliefs.

    Writes two files: the tracker's belief output as JSON (--trackerfile) and
    a sorted metric CSV (--scorefile). Fscore, ICE, BeliefAccuracy and the
    baseline module are defined elsewhere. Python 2 syntax (print statements,
    text written to files opened 'wb') — do not run under Python 3 unported.
    """
    #
    # CMD LINE ARGS
    #
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')
    
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path,'config')

    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--decodefile',dest='decodefile',action='store',metavar='JSON_FILE',required=True,
                        help='File containing decoder output JSON')
    parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True,
                        help='File to write with CSV scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,
                        help='JSON Ontology file')
    parser.add_argument('--trackerfile',dest='trackerfile',action='store',metavar='JSON_FILE',required=True,
                        help='Tracker JSON file for output')
    
    
    args = parser.parse_args()

    sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
    decode_results = json.load(open(args.decodefile))
    ontology = json.load(open(args.ontology))

    # SLU-level metrics, computed on the decoder hypotheses directly
    metrics = {
        "tophyp":Fscore(ontology),
        "ice":ICE(ontology)
    }
    
    # belief-level metrics, computed on the tracker's output
    belief_metrics = {
        "accuracy":BeliefAccuracy(ontology)
    }
    
    # we run the baseline focus tracker on the output of the SLU
    tracker = baseline.FocusTracker()
    tracker_output = {"sessions":[],"wall-time":0.0}
    tracker_output["dataset"]  = args.dataset
    
    for call, decode_session in zip(sessions, decode_results["sessions"]):
        tracker.reset()
        this_session = {"session-id":call.log["session-id"], "turns":[]}
        for (log_turn, label), decode_result in zip(call, decode_session["turns"]):
       
            true_label = label["semantics"]["json"]
            # best-first hypotheses; renormalise if the mass exceeds 1.0,
            # warning only when clearly beyond floating-point noise
            slu_hyps = decode_result["slu-hyps"]
            slu_hyps.sort(key=lambda x:-x["score"])
            total_p = sum([x["score"] for x in slu_hyps])
            if total_p > 1.0 :
                if total_p > 1.00001 :
                    print "Warning: total_p =",total_p,"> 1.0- renormalising."
                for slu_hyp in slu_hyps:
                    slu_hyp["score"] = slu_hyp["score"]/total_p
            
            
            for metric in metrics.values():
                metric.add_turn(true_label, slu_hyps, log_turn, label)
                
            # for passing to tracker
            # (minimal turn dict shaped like a live log turn)
            this_turn = {
                    "input":{"live":{"slu-hyps":slu_hyps}},
                    "output":log_turn["output"]
            }
            goal_hyps = tracker.addTurn(this_turn)
            for belief_metric in belief_metrics.values():
                belief_metric.add_turn(goal_hyps, label)
            
            
            this_session["turns"].append(goal_hyps)
            
            
        tracker_output["sessions"].append(this_session)
    
    tracker_file = open(args.trackerfile, "wb")
    json.dump(tracker_output, tracker_file, indent=4)
    tracker_file.close()
      
    csv_file = open(args.csv, "wb")
    
    
    # collect (name, value) rows from all metrics, then write them sorted by name
    output = []
    
    for key, metric in metrics.items():
        this_output =  metric.output()
        for this_key, value in this_output.items():
            output.append(( key + ","+ this_key, value))
            
    for key, belief_metric in belief_metrics.items():
        this_output =  belief_metric.output()
        key = "belief_"+key
        for this_key, value in this_output.items():
            output.append((key + ","+ this_key, value))

    output.sort(key=lambda x:x[0])
    for key, value in output:
        # pad the name column; negative values need one extra char for the sign
        w = 35
        if value < 0 :
            w = w-1
        metric_name = (key+",").ljust(w)    
        csv_file.write(metric_name + ("%.5f"%value)+"\n")
    
    csv_file.close()
示例#29
0
def main(argv):
    """Train the simple SLU baseline and decode a test set.

    Trains SimpleSLU on transcripts of the target role, then for each
    test utterance predicts speech acts and semantic tags from the
    top-ranked English translation and projects the tags back onto the
    original transcript via the translation alignment.  Predictions are
    written as JSON to --outfile.
    """
    parser = argparse.ArgumentParser(description='Simple SLU baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--modelfile', dest='modelfile', action='store', required=True, metavar='MODEL_FILE',  help='File to write with trained model')
    parser.add_argument('--outfile', dest='outfile', action='store', required=True, metavar='JSON_FILE',  help='File to write with SLU output')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE',  'TOURIST'], required=True,  help='Target role')

    args = parser.parse_args()

    slu = SimpleSLU()

    # Train only on utterances spoken by the requested role.
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if (log_utter['speaker'] == 'Guide' and args.roletype == 'GUIDE') or (log_utter['speaker'] == 'Tourist' and args.roletype == 'TOURIST'):
                slu.add_instance(log_utter['transcript'], label_utter['speech_act'], label_utter['semantic_tagged'])
    sys.stderr.write('Done\n')

    slu.train(args.modelfile)

    projection = DirectLabelProjection()

    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SLU'
    output['role_type'] = args.roletype
    start_time = time.time()

    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=False, translations=True)
    # Bug fix: this message previously said 'training' while loading the test set.
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        for (log_utter, translations, label_utter) in call:
            if (log_utter['speaker'] == 'Guide' and args.roletype == 'GUIDE') or (log_utter['speaker'] == 'Tourist' and args.roletype == 'TOURIST'):
                slu_result = {'utter_index': log_utter['utter_index']}
                if len(translations['translated']) > 0:
                    # Decode from the top-ranked translation hypothesis only.
                    top_hyp = translations['translated'][0]['hyp']
                    pred_act, pred_semantic = slu.pred(top_hyp)

                    # Predicted labels look like "ACT_attribute";
                    # merge the attributes of each act into one entry.
                    combined_act = {}
                    for act_label in reduce(operator.add, pred_act):
                        m = re.match('^([^_]+)_(.+)$', act_label)
                        act = m.group(1)
                        attr = m.group(2)
                        if act not in combined_act:
                            combined_act[act] = []
                        if attr not in combined_act[act]:
                            combined_act[act].append(attr)

                    slu_result['speech_act'] = []
                    for act in combined_act:
                        attr = combined_act[act]
                        slu_result['speech_act'].append({'act': act, 'attributes': attr})

                    # Project the tags predicted on the translation back onto
                    # the source-language transcript using the word alignment.
                    align = translations['translated'][0]['align']

                    projected = projection.project(log_utter['transcript'], top_hyp, align, pred_semantic)
                    slu_result['semantic_tagged'] = projection.convert_to_tagged_utter(projected)
                else:
                    # No translation available: emit the raw transcript untagged.
                    slu_result['semantic_tagged'] = log_utter['transcript']
                    slu_result['speech_act'] = []
                this_session['utterances'].append(slu_result)
        output['sessions'].append(this_session)

    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time

    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)

    sys.stderr.write('Done\n')
示例#30
0
def main(argv):
    """CNN baseline for the DSTC5 SAP task.

    Loads train/dev/test corpora, builds a character vocabulary and
    multi-hot speech-act labels, splits the data by speaker role, and
    trains/evaluates one model per role (tourist, then guide).
    """
    arg_parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    arg_parser.add_argument('--trainset', dest='trainset', action='store',
                            metavar='TRAINSET', required=True, help='')
    arg_parser.add_argument('--devset', dest='devset', action='store',
                            metavar='DEVSET', required=True, help='')
    arg_parser.add_argument('--testset', dest='testset', action='store',
                            metavar='TESTSET', required=True, help='')
    arg_parser.add_argument('--dataroot', dest='dataroot', action='store',
                            required=True, metavar='PATH', help='')
    args = arg_parser.parse_args()

    # Hyper-parameters live in a plain-text config file.
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    def walk(name):
        # Every corpus is loaded with labels and translations enabled.
        return dataset_walker.dataset_walker(name,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)

    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(
        walk(args.trainset), walk(args.devset), walk(args.testset))

    # Fold the dev portion into the training data.
    train_utters += dev_utters

    max_sent_len = int(params['max_sent_len'])

    # Character-level inputs, padded to a fixed length; the vocabulary is
    # built from the padded training utterances only.
    train_chars = [list(u[0]) for u in train_utters]
    train_padded = data_helpers.pad_sentences(train_chars, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(train_padded)
    print("vocabulary size: %d" % len(vocabulary))

    train_inputs = data_helpers.build_input_data(train_padded, vocabulary)

    test_chars = [list(u[0]) for u in test_utters]
    test_padded = data_helpers.pad_sentences(test_chars, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_padded, vocabulary)

    # Multi-hot speech-act labels; the binarizer is fit on the union so
    # train and test share a single label space.
    sa_train = [u[2] for u in train_utters]
    sa_test = [u[2] for u in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train + sa_test)

    train_labels = label_binarizer.transform(sa_train)
    test_labels = label_binarizer.transform(sa_test)

    # Partition utterance indices by speaker role (utter[1]).
    def role_indices(utterances, role):
        return [i for i, u in enumerate(utterances) if u[1].lower() == role]

    tourist_train_idx = role_indices(train_utters, 'tourist')
    guide_train_idx = role_indices(train_utters, 'guide')
    tourist_test_idx = role_indices(test_utters, 'tourist')
    guide_test_idx = role_indices(test_utters, 'guide')

    # Only the training order is shuffled; test order stays stable.
    np.random.shuffle(tourist_train_idx)
    np.random.shuffle(guide_train_idx)

    # Pre-trained character/word embeddings for the vocabulary.
    embedding_matrix = data_helpers.load_embedding(
        vocabulary,
        embedding_dim=int(params['embedding_dim']),
        embedding=params['embedding'])

    # Tourist model first, then guide, as in the original pipeline.
    for train_idx, test_idx in ((tourist_train_idx, tourist_test_idx),
                                (guide_train_idx, guide_test_idx)):
        run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                     train_inputs[train_idx], train_labels[train_idx],
                     test_inputs[test_idx], test_labels[test_idx])

    print("")
示例#31
0
def main(argv):
    """Score belief-tracker output against reference frame labels.

    Compares each tracked frame to the labelled frame under two schedules
    (1: every utterance; 2: at segment boundaries and session end), broken
    down per (topic, slot) and overall, and writes a CSV of results plus
    timing figures to --scorefile.
    """
    install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path,'lib')

    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    from stat_classes import Stat_Accuracy, Stat_Frame_Precision_Recall

    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,help='The dataset to analyze')
    parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',dest='trackfile',action='store',metavar='JSON_FILE',required=True,help='File containing tracker JSON output')
    parser.add_argument('--scorefile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,help='File to write with JSON scoring data')
    parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tracker_output = json.load(open(args.trackfile))

    ontology = OntologyReader(args.ontology)

    # One accumulator per (topic, slot) x schedule x stat class, plus an
    # ('all', 'all') aggregate for each schedule.
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Frame_Precision_Recall]

    for schedule in SCHEDULES:
        for stat_class in stat_classes:
            stats.append((('all', 'all'), schedule, stat_class()))

        for topic in ontology.get_topics():
            for slot in ontology.get_slots(topic) + ['all']:
                for stat_class in stat_classes:
                    stats.append(((topic, slot), schedule, stat_class()))

    utter_counter = 0.0

    for session, track_session in zip(sessions, tracker_output["sessions"]):
        prev_ref_frame = None
        prev_track_frame = None
        prev_topic = None

        for (log_utter, translations, label_utter), track_utter in zip(session, track_session["utterances"]):
            utter_counter += 1.0

            if log_utter['segment_info']['target_bio'] == 'B':
                # Beginning of a new segment: schedule-2 stats are credited
                # with the final state of the segment that just ended.
                # NOTE(review): if the previous utterance was 'O', the prev_*
                # frames are None and a matching (topic, slot) pair would hit
                # `slot in None`; presumably 'B' never directly follows a
                # topic-matching 'O' in the corpus — verify.
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']

                for (topic, slot), schedule, stat_class in stats:
                    if schedule == 2:
                        if topic == 'all':
                            stat_class.add(prev_track_frame, prev_ref_frame)
                        elif prev_topic == topic:
                            if slot == 'all':
                                stat_class.add(prev_track_frame, prev_ref_frame)
                            else:
                                if slot in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                                elif slot in prev_track_frame and slot not in prev_ref_frame:
                                    stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                                elif slot not in prev_track_frame and slot in prev_ref_frame:
                                    stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})

            elif log_utter['segment_info']['target_bio'] == 'I':
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
            elif log_utter['segment_info']['target_bio'] == 'O':
                # Outside any target segment: nothing to compare this turn.
                ref_frame = None
                track_frame = None

            # Schedule 1: score every utterance against its current frame.
            for (topic, slot), schedule, stat_class in stats:
                if schedule == 1:
                    if topic == 'all':
                        stat_class.add(track_frame, ref_frame)
                    elif log_utter['segment_info']['topic'] == topic:
                        if slot == 'all':
                            stat_class.add(track_frame, ref_frame)
                        else:
                            if slot in track_frame and slot in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: ref_frame[slot]})
                            elif slot in track_frame and slot not in ref_frame:
                                stat_class.add({slot: track_frame[slot]}, {slot: []})
                            elif slot not in track_frame and slot in ref_frame:
                                stat_class.add({slot: []}, {slot: ref_frame[slot]})

            prev_ref_frame = ref_frame
            prev_track_frame = track_frame
            prev_topic = log_utter['segment_info']['topic']

        # End of session: close out the final segment for schedule 2.
        for (topic, slot), schedule, stat_class in stats:
            if schedule == 2:
                if topic == 'all':
                    stat_class.add(prev_track_frame, prev_ref_frame)
                elif prev_topic == topic:
                    if slot == 'all':
                        stat_class.add(prev_track_frame, prev_ref_frame)
                    else:
                        if slot in prev_track_frame and slot in prev_ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: prev_ref_frame[slot]})
                        # Bug fix: the next two branches previously tested the
                        # loop-local track_frame/ref_frame (possibly None here)
                        # instead of the previous-segment frames, unlike the
                        # parallel code in the 'B' branch above.
                        elif slot in prev_track_frame and slot not in prev_ref_frame:
                            stat_class.add({slot: prev_track_frame[slot]}, {slot: []})
                        elif slot not in prev_track_frame and slot in prev_ref_frame:
                            stat_class.add({slot: []}, {slot: prev_ref_frame[slot]})

    csvfile = open(args.scorefile, 'w')
    print >> csvfile, ("topic, slot, schedule, stat, N, result")

    for stat in stats:
        (topic, slot), schedule, stat_class = stat

        results = stat_class.results()
        for stat_subname, N, result in results:
            # Stats with no observations report None; print a dash for them.
            if result is None:
                result = "-"
            else:
                result = "%.7f"%result
            print >>csvfile,("%s, %s, %i, %s, %i, %s"%(topic, slot, schedule, stat_subname, N, result))

    print >>csvfile,'basic,total_wall_time,,,,%s' % (tracker_output['wall_time'])
    print >>csvfile,'basic,sessions,,,,%s' % (len(sessions))
    print >>csvfile,'basic,utterances,,,,%i' % (int(utter_counter))
    print >>csvfile,'basic,wall_time_per_utterance,,,,%s' % (tracker_output['wall_time'] / utter_counter)
    print >>csvfile,'basic,dataset,,,,%s' % (tracker_output['dataset'] )

    csvfile.close()
示例#32
0
                local_recall_avg = 0.0
                if recall_count != 0:
                    local_recall_avg = local_recall / recall_count

                precision += local_precision_avg
                recall += local_recall_avg

                predictions.append(predicted)

                eval_str = '\n[%d]\npredicted: %s\nactual: %s\nLocal Precision: %f\nLocal Recall: %f\n' % (
                    j,
                    str(predicted),
                    str(actual),
                    local_precision_avg,
                    local_recall_avg,
                )
                writer.write(eval_str)

        print 'Accuracy: %f' % (correct * 1.0 / len(model_predictions))
        print 'Precision: %f' % (precision / len(model_predictions))
        print 'Recall: %f' % (recall / len(model_predictions))


if __name__ == '__main__':
    # Build the simulator's training corpus from the labelled dstc2 dev set,
    # then run a combined train/evaluate pass on a 70/30 split.
    dev_calls = dataset_walker("dstc2_dev", dataroot=data_folder, labels=True)
    simulator = UserSim()
    simulator.generate_training_data(dev_calls, context_turn_num=3)
    simulator.train_and_test(training_percent=0.7, training=True, testing=True)
    def learn(self,
              pathdataset=["dstc4_train"],
              Pathdataroot="data",
              numberOfHiddenUnit=20,
              EPOCHS_PER_CYCLE=10,
              CYCLES=40,
              weightdecayw=0.01):
        """Build the input/output feature dictionaries from the corpus and
        train an LSTM tracker with RProp-, checkpointing the network each
        cycle.

        Args:
            pathdataset: list of dataset names to walk for training data.
                NOTE(review): mutable default list — only read here, but
                confirm no caller mutates it.
            Pathdataroot: root directory passed to dataset_walker.
            numberOfHiddenUnit: size of the LSTM hidden layer.
            EPOCHS_PER_CYCLE: epochs trained between two checkpoints.
            CYCLES: number of checkpoint cycles (total epochs =
                EPOCHS_PER_CYCLE * CYCLES).
            weightdecayw: weight-decay passed to RPropMinusTrainer.

        Side effects: writes pickled feature sizes and dictionaries
        ('dictInput.pic', 'dictOutput.pic') and network snapshots
        ('LSTM_EpocheN.rnnw', 'LSTM.rnnw') to the working directory.
        """
        print "Start learning LSTM, and make dictionary file"
        #Construct dictionary: variable name -> corresponding index of element in i/o vector
        print "Star make dictionary: variable name -> corresponding index of element in i/o vector"
        self.dictOut = {
        }  #"TOPIC_SLOT_VALUE" -> corresponding index of element
        self.dictIn = {
        }  #"SPEAKER_{val}"or"TOPIC_{val}","WORD_{word}" "BIO_{BIO}", "CLASS_{slot,value}", ""{defined label}-> corresponding  index of element
        #-target vector dictionary: one output unit per (topic, slot, value)
        # triple found in the ontology tagsets.
        index = 0
        totalNumSlot = 0
        for topic in self.tagsets.keys():
            for slot in self.tagsets[topic].keys():
                totalNumSlot += 1
                for value in self.tagsets[topic][slot]:
                    self.dictOut[topic + "_" + slot + "_" + value] = index
                    index += 1
        print "totalNumSlot:" + str(totalNumSlot)
        print "outputSize:" + str(len(self.dictOut.keys()))
        #-input dictionry
        # Walk each requested dataset without labels; the calls are reused
        # below both for vocabulary building and for training conversion.
        dataset = []
        for pathdat in pathdataset:
            dataset.append(
                dataset_walker.dataset_walker(pathdat,
                                              dataroot=Pathdataroot,
                                              labels=False))
        #--(sub input vector 1) Class features i.e., Slot and value ratio (Similar to base line)
        index = 0
        for topic in self.tagsets.keys():
            for slot in self.tagsets[topic].keys():
                if ("CLASS_" + slot) not in self.dictIn:
                    self.dictIn["CLASS_" + slot] = index
                    index += 1
                for value in self.tagsets[topic][slot]:
                    if ("CLASS_" + value) not in self.dictIn:
                        self.dictIn["CLASS_" + value] = index
                        index += 1
        self.TOTALSIZEOFCLASSFeature = index
        f = open(self.FileNameofNumClassFeature, "wb")
        pickle.dump(self.TOTALSIZEOFCLASSFeature, f)
        f.close()
        #--(sub input vector 2) Sentence features
        # NOTE(review): `index` restarts at 0 for each sub-vector, so dictIn
        # entries of different sub-vectors share index ranges — presumably an
        # offset is applied when the actual input vector is assembled
        # (in _translateUtteranceIntoInputVector); verify.
        if not self.isUseSentenceRepresentationInsteadofBOW:
            index = 0
            for elemDataset in dataset:
                for call in elemDataset:
                    for (uttr, _) in call:
                        #General info1 (CLASS; this feature must be rejistered at first)
                        if ("SPEAKER_" + uttr["speaker"]) not in self.dictIn:
                            self.dictIn["SPEAKER_" + uttr["speaker"]] = index
                            index += 1
                        if ("TOPIC_" + uttr["segment_info"]["topic"]
                            ) not in self.dictIn:
                            self.dictIn["TOPIC_" +
                                        uttr["segment_info"]["topic"]] = index
                            index += 1
                        #General info2
                        #-BIO
                        if ("BIO_" + uttr['segment_info']['target_bio']
                            ) not in self.dictIn:
                            self.dictIn[
                                "BIO_" +
                                uttr['segment_info']['target_bio']] = index
                            index += 1

                        #BOW vocabulary, optionally skipping out-of-task
                        #('O') utterances.
                        if LSTMWithBOWTracker.isIgnoreUtterancesNotRelatedToMainTask:
                            if not (uttr['segment_info']['target_bio'] == "O"):
                                #-BOW
                                splitedtrans = self.__getRegurelisedBOW(
                                    uttr["transcript"])
                                for word in splitedtrans:
                                    if ("WORD_" + word) not in self.dictIn:
                                        self.dictIn["WORD_" + word] = index
                                        index += 1
            self.TOTALSIZEOFSENTENCEFeature = index
            f = open(self.FileNameofNumSentenceFeature, "wb")
            pickle.dump(self.TOTALSIZEOFSENTENCEFeature, f)
            f.close()
        elif self.isUseSentenceRepresentationInsteadofBOW:
            # Doc2Vec-style representation: one slot per paragraph-vector
            # element plus one per averaged-word-vector element.
            index = 0
            for i in range(0, LSTMWithBOWTracker.D2V_VECTORSIZE):
                self.dictIn[str(index) + "thElemPV"] = index
                index += 1
            index = 0
            for i in range(0, LSTMWithBOWTracker.D2V_VECTORSIZE):
                self.dictIn[str(index) + "thAvrWord"] = index
                index += 1
            assert self.D2V_VECTORSIZE == LSTMWithBOWTracker.D2V_VECTORSIZE, "D2V_VECTORSIZE is restrected to be same over the class"
        else:
            assert False, "Unexpected block"
        #--(sub input vector 3) Features M1s defined
        index = 0
        if self.isEnableToUseM1sFeature:
            rejisteredFeatures = self.__rejisterM1sInputFeatureLabel(
                self.tagsets, dataset)
            for rFeature in rejisteredFeatures:
                assert rFeature not in self.dictIn, rFeature + " already registered in input vector. Use different label name. "
                self.dictIn[rFeature] = index
                index += 1
            self.TOTALSIZEOFM1DEFINEDFeature = index
            f = open(self.FileNameofNumM1Feature, "wb")
            pickle.dump(self.TOTALSIZEOFM1DEFINEDFeature, f)
            f.close()

        print "inputSize:" + str(len(self.dictIn.keys()))
        # Sanity checks that the dictionary layout matches the one the rest
        # of the pipeline was built against.
        assert self.dictIn[
            "CLASS_INFO"] == 0, "Unexpected index CLASS_INFO should has value 0"
        assert self.dictIn[
            "CLASS_Fort Siloso"] == 334, "Unexpected index CLASS_Fort Siloso should has value 334"
        # NOTE(review): this assert checks 1344 but its message says 1611 —
        # one of the two is stale; confirm which value is intended.
        assert self.dictIn[
            "CLASS_Yunnan"] == 1344, "Unexpected index CLASS_Yunnan should has value 1611"
        #--write the dictionaries so decoding can reuse the same layout
        fileObject = open('dictInput.pic', 'w')
        pickle.dump(self.dictIn, fileObject)
        fileObject.close()
        fileObject = open('dictOutput.pic', 'w')
        pickle.dump(self.dictOut, fileObject)
        fileObject.close()

        #Build RNN frame work
        print "Start learning Network"
        #Capability of network is: (30 hidden units can represents 1048576 relations) wherease (10 hidden units can represents 1024)
        #Same to Henderson (http://www.aclweb.org/anthology/W13-4073)?
        net = buildNetwork(len(self.dictIn.keys()),
                           numberOfHiddenUnit,
                           len(self.dictOut.keys()),
                           hiddenclass=LSTMLayer,
                           outclass=SigmoidLayer,
                           outputbias=False,
                           recurrent=True)

        #Train network
        #-convert training data into sequence of vector
        convDataset = []  #[call][uttr][input,targetvec]
        iuttr = 0
        convCall = []
        for elemDataset in dataset:
            for call in elemDataset:
                for (uttr, label) in call:
                    if self.isIgnoreUtterancesNotRelatedToMainTask:
                        if uttr['segment_info']['target_bio'] == "O":
                            continue
                    #-input
                    convInput = self._translateUtteranceIntoInputVector(
                        uttr, call)
                    #-output: multi-hot vector over (topic, slot, value)
                    convOutput = [0.0] * len(
                        self.dictOut.keys())  #Occured:+1, Not occured:0
                    if "frame_label" in label:
                        for slot in label["frame_label"].keys():
                            for value in label["frame_label"][slot]:
                                convOutput[self.dictOut[
                                    uttr["segment_info"]["topic"] + "_" +
                                    slot + "_" + value]] = 1
                    #-post proccess: optionally cut sequences at segment
                    # boundaries ('B') instead of call boundaries.
                    if self.isSeparateDialogIntoSubDialog:
                        if uttr['segment_info']['target_bio'] == "B":
                            if len(convCall) > 0:
                                convDataset.append(convCall)
                            convCall = []
                    convCall.append([convInput, convOutput])
                    #print "Converted utterance" + str(iuttr)
                    iuttr += 1
                if not self.isSeparateDialogIntoSubDialog:
                    if len(convCall) > 0:
                        convDataset.append(convCall)
                    convCall = []
        #Online learning
        trainer = RPropMinusTrainer(net, weightdecay=weightdecayw)
        EPOCHS = EPOCHS_PER_CYCLE * CYCLES
        for i in xrange(CYCLES):
            #Shuffle order: rebuild the sequential dataset each cycle with
            #the calls in a new random order.
            ds = SequentialDataSet(len(self.dictIn.keys()),
                                   len(self.dictOut.keys()))
            datInd = range(0, len(convDataset))
            random.shuffle(
                datInd
            )  #Backpropergation already implemeted data shuffling, however though RpropMinus don't.
            for ind in datInd:
                ds.newSequence()
                for convuttr in convDataset[ind]:
                    ds.addSample(convuttr[0], convuttr[1])
            #Evaluation and Train
            epoch = (i + 1) * EPOCHS_PER_CYCLE
            print "\r epoch {}/{} Error={}".format(
                epoch, EPOCHS, trainer.testOnData(dataset=ds))
            stdout.flush()
            trainer.trainOnDataset(dataset=ds, epochs=EPOCHS_PER_CYCLE)
            # Checkpoint both a per-cycle snapshot and the rolling latest.
            NetworkWriter.writeToFile(
                trainer.module, "LSTM_" + "Epoche" + str(i + 1) + ".rnnw")
            NetworkWriter.writeToFile(trainer.module, "LSTM.rnnw")
示例#34
0
def main(argv):
    #
    # CMD LINE ARGS
    #
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path, 'config')

    parser = argparse.ArgumentParser(
        description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        metavar='PATH',
        required=True,
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--decodefile',
                        dest='decodefile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File containing decoder output JSON')
    parser.add_argument('--scorefile',
                        dest='csv',
                        action='store',
                        metavar='CSV_FILE',
                        required=True,
                        help='File to write with CSV scoring data')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='JSON Ontology file')
    parser.add_argument('--trackerfile',
                        dest='trackerfile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='Tracker JSON file for output')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset,
                              dataroot=args.dataroot,
                              labels=True)
    decode_results = json.load(open(args.decodefile))
    ontology = json.load(open(args.ontology))

    metrics = {"tophyp": Fscore(ontology), "ice": ICE(ontology)}

    belief_metrics = {"accuracy": BeliefAccuracy(ontology)}

    # we run the baseline focus tracker on the output of the SLU
    tracker = baseline.FocusTracker()
    tracker_output = {"sessions": [], "wall-time": 0.0}
    tracker_output["dataset"] = args.dataset

    for call, decode_session in zip(sessions, decode_results["sessions"]):
        tracker.reset()
        this_session = {"session-id": call.log["session-id"], "turns": []}
        for (log_turn, label), decode_result in zip(call,
                                                    decode_session["turns"]):

            true_label = label["semantics"]["json"]
            slu_hyps = decode_result["slu-hyps"]
            slu_hyps.sort(key=lambda x: -x["score"])
            total_p = sum([x["score"] for x in slu_hyps])
            if total_p > 1.0:
                if total_p > 1.00001:
                    print "Warning: total_p =", total_p, "> 1.0- renormalising."
                for slu_hyp in slu_hyps:
                    slu_hyp["score"] = slu_hyp["score"] / total_p

            for metric in metrics.values():
                metric.add_turn(true_label, slu_hyps, log_turn, label)

            # for passing to tracker
            this_turn = {
                "input": {
                    "live": {
                        "slu-hyps": slu_hyps
                    }
                },
                "output": log_turn["output"]
            }
            goal_hyps = tracker.addTurn(this_turn)
            for belief_metric in belief_metrics.values():
                belief_metric.add_turn(goal_hyps, label)

            this_session["turns"].append(goal_hyps)

        tracker_output["sessions"].append(this_session)

    tracker_file = open(args.trackerfile, "wb")
    json.dump(tracker_output, tracker_file, indent=4)
    tracker_file.close()

    csv_file = open(args.csv, "wb")

    output = []

    for key, metric in metrics.items():
        this_output = metric.output()
        for this_key, value in this_output.items():
            output.append((key + "," + this_key, value))

    for key, belief_metric in belief_metrics.items():
        this_output = belief_metric.output()
        key = "belief_" + key
        for this_key, value in this_output.items():
            output.append((key + "," + this_key, value))

    output.sort(key=lambda x: x[0])
    for key, value in output:
        w = 35
        if value < 0:
            w = w - 1
        metric_name = (key + ",").ljust(w)
        csv_file.write(metric_name + ("%.5f" % value) + "\n")

    csv_file.close()
示例#35
0
def main(argv):
    """Run the hand-crafted dialog state tracker baseline over a dataset.

    Walks every session of the requested dataset, feeds each turn through the
    vectorizer, and dumps the collected per-session data points as JSON.
    """
    #
    # CMD LINE ARGS
    #
    # lib/ holds dataset_walker and is not on the default import path.
    root = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(os.path.join(root, 'lib'))
    from dataset_walker import dataset_walker
    list_dir = os.path.join(root, 'config')

    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset', dest='dataset', action='store',
                        metavar='DATASET', required=True,
                        help='The dataset to analyze, for example train1 or test2 or train3a')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--datafile', dest='datafile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write output')
    parser.add_argument('--label', dest='label', action='store',
                        required=True, metavar='BOOL',
                        help='load labels')

    args = parser.parse_args()

    # Only the literal string 'true' (any case) switches label loading on.
    use_labels = args.label.lower() == 'true'

    walker = dataset_walker(args.dataset,
                            dataroot=args.dataroot,
                            labels=use_labels)

    with open(args.datafile, "wb") as datafile:
        data = {"sessions": [], "dataset": args.dataset}

        vector = vectorizer()

        for call in walker:
            session = {"session-id": call.log["session-id"], "turns": []}
            # The vectorizer carries state across turns; reset per session.
            vector.reset()
            for turn, labels in call:
                session["turns"].append(vector.addTurn(turn, labels))

            data["sessions"].append(session)

        json.dump(data, datafile, indent=4)
示例#36
0
def main(argv):
    """Train the simple SLG baseline and generate utterances for a test set.

    Collects (semantic_tags, speech_act) -> translation training instances
    for the chosen role, trains the generator, then writes per-session
    generated utterances (plus wall-clock time) to a JSON file.
    """
    parser = argparse.ArgumentParser(description='Simple SLG baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='Will look for corpus in <destroot>/...')
    parser.add_argument('--outfile', dest='outfile', action='store', required=True, metavar='JSON_FILE', help='File to write with SLG output')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['GUIDE', 'TOURIST'], required=True, help='Target role')

    args = parser.parse_args()

    role = args.roletype.lower()
    generator = SimpleSLG()

    train_walker = dataset_walker.dataset_walker(args.trainset,
                                                 dataroot=args.dataroot,
                                                 labels=True,
                                                 translations=True,
                                                 task='SLG',
                                                 roletype=role)
    sys.stderr.write('Loading training instances ... ')

    # Only utterances spoken by the target role are training material.
    for call in train_walker:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == role:
                generator.add_instance(
                    {'semantic_tags': log_utter['semantic_tags'],
                     'speech_act': log_utter['speech_act']},
                    translations)

    generator.train()
    sys.stderr.write('Done\n')

    output = {
        'sessions': [],
        'dataset': args.testset,
        'task_type': 'SLG',
        'role_type': args.roletype,
    }
    start_time = time.time()

    test_walker = dataset_walker.dataset_walker(args.testset,
                                                dataroot=args.dataroot,
                                                labels=False,
                                                translations=True,
                                                task='SLG',
                                                roletype=role)
    sys.stderr.write('Loading testing instances ... ')
    for call in test_walker:
        session = {"session_id": call.log["session_id"], "utterances": []}

        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == role:
                instance = {
                    'semantic_tags': log_utter['semantic_tags'],
                    'speech_act': log_utter['speech_act']
                }

                session['utterances'].append({
                    'utter_index': log_utter['utter_index'],
                    'generated': generator.generate(instance),
                })

        output['sessions'].append(session)
    sys.stderr.write('Done\n')

    # Report total generation wall time alongside the results.
    output['wall_time'] = time.time() - start_time

    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)

    sys.stderr.write('Done\n')
示例#37
0
def main(argv):
    """Evaluate SLU/SAP/SLG/EES system output against dataset labels.

    Aligns the system's JSON output with the labelled sessions (restricted
    to the target role), accumulates per-subtask statistics, and writes the
    scores as CSV to ``--scorefile``.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    # lib/ holds dataset_walker, which is not on the default import path.
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker

    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLU system.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        metavar='PATH',
        required=True,
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--pilotfile',
                        dest='pilotfile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File containing JSON output')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='JSON Ontology file')
    parser.add_argument('--pilottask',
                        dest='pilottask',
                        action='store',
                        choices=['SLU', 'SAP', 'SLG', 'EES'],
                        required=True,
                        help='Target task')
    parser.add_argument('--roletype',
                        dest='roletype',
                        action='store',
                        choices=['GUIDE', 'TOURIST'],
                        required=True,
                        help='Target role')
    parser.add_argument('--scorefile',
                        dest='scorefile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File to write with CSV scoring data')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset,
                              dataroot=args.dataroot,
                              labels=True)

    # Close the input handle deterministically instead of leaking it.
    with open(args.pilotfile) as pilot_in:
        system_output = json.load(pilot_in)

    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    # Each pilot task is scored with its own set of statistics.
    stats = {}
    if args.pilottask == 'SLU':
        stats['semantic_tagged'] = {}
        stats['semantic_tagged']['detection'] = Stat_Precision_Recall()
        stats['semantic_tagged']['class'] = Stat_Precision_Recall()
        stats['semantic_tagged']['all'] = Stat_Precision_Recall()

    if args.pilottask == 'SLU' or args.pilottask == 'SAP':
        stats['speech_act'] = {}
        stats['speech_act']['act'] = Stat_Precision_Recall()
        stats['speech_act']['all'] = Stat_Precision_Recall()

    if args.pilottask == 'SLG' or args.pilottask == 'EES':
        stats['utt_transcriptions'] = {}
        stats['utt_transcriptions']['all'] = Stat_BLEU_AM_FM()

    for session, track_session in zip(sessions, system_output["sessions"]):
        session_id = session.log['session_id']

        # Keep only the utterances spoken by the target role; the system
        # output contains entries for those utterances only.
        log_utter_list = []
        label_utter_list = []

        for log_utter, label_utter in session:
            if (args.roletype == 'GUIDE' and log_utter['speaker']
                    == 'Guide') or (args.roletype == 'TOURIST'
                                    and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)

        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(
                log_utter_list, label_utter_list, track_session["utterances"]):
            for subtask in stats:
                if subtask == 'speech_act':
                    ref_sa_list = label_utter['speech_act']
                    pred_sa_list = track_utter['speech_act']
                    eval_acts(ref_sa_list, pred_sa_list, stats[subtask])
                elif subtask == 'semantic_tagged':
                    ref_tagged = ' '.join(label_utter['semantic_tagged'])
                    pred_tagged = track_utter['semantic_tagged']
                    eval_semantics(ref_tagged, pred_tagged, stats[subtask])
                elif subtask == 'utt_transcriptions':
                    ref = log_utter['transcript']
                    pred = track_utter['generated_sentence']
                    eval_utt(ref, pred, stats[subtask])

    # Plain file writes instead of the Python-2-only ``print >> file``
    # statement; output is byte-identical and works on Python 2 and 3.
    with open(args.scorefile, 'w') as csvfile:
        csvfile.write("task, subtask, schedule, stat, N, result\n")

        for subtask in stats:
            for schedule in stats[subtask]:
                for measure, N, result in stats[subtask][schedule].results():
                    csvfile.write(
                        "%s, %s, %s, %s, %i, %s\n" %
                        (args.pilottask, subtask, schedule, measure, N, result))
def main(argv):
    """Score a belief tracker's JSON output against dataset labels.

    Accumulates accuracy and precision/recall per (topic, slot) pair under
    each evaluation schedule (schedule 1: every utterance; schedule 2: the
    final frame of each segment) and writes the results as CSV.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    list_dir = os.path.join(install_path, 'config')

    parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', metavar='PATH', required=True, help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile', dest='trackfile', action='store', metavar='JSON_FILE', required=True, help='File containing tracker JSON output')
    parser.add_argument('--scorefile', dest='scorefile', action='store', metavar='JSON_FILE', required=True, help='File to write with JSON scoring data')
    parser.add_argument('--ontology', dest='ontology', action='store', metavar='JSON_FILE', required=True, help='JSON Ontology file')

    args = parser.parse_args(argv)

    sessions = dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tracker_output = json.load(open(args.trackfile))

    ontology = OntologyReader(args.ontology)
    tagsets = ontology.get_tagsets()

    # One (accuracy, precision/recall) pair per schedule, for 'all' and for
    # every (topic, slot) combination in the ontology.
    stats = []
    stat_classes = [Stat_Accuracy, Stat_Precision_Recall]

    for schedule in SCHEDULES:
        for stat_class in stat_classes:
            stats.append((('all', 'all'), schedule, stat_class()))

        for topic in ontology.get_topics():
            for slot in ontology.get_slots(topic) + ['all']:
                for stat_class in stat_classes:
                    stats.append(((topic, slot), schedule, stat_class()))

    def _score(stat_class, topic, slot, frame_topic, track_frame, ref_frame):
        # Feed one (tracked, reference) frame pair into stat_class,
        # restricted to the requested topic/slot.  This single helper
        # replaces the three hand-copied scoring chains of the original,
        # one of which had drifted out of sync (it tested membership in
        # track_frame/ref_frame instead of the prev_* frames).
        if topic == 'all':
            stat_class.add(track_frame, ref_frame)
        elif frame_topic == topic:
            if slot == 'all':
                stat_class.add(track_frame, ref_frame)
            elif slot in track_frame and slot in ref_frame:
                stat_class.add({slot: track_frame[slot]}, {slot: ref_frame[slot]})
            elif slot in track_frame:
                stat_class.add({slot: track_frame[slot]}, {slot: []})
            elif slot in ref_frame:
                stat_class.add({slot: []}, {slot: ref_frame[slot]})

    utter_counter = 0.0

    for session, track_session in zip(sessions, tracker_output["sessions"]):
        session_id = session.log['session_id']

        prev_ref_frame = None
        prev_track_frame = None
        # Initialized per session: the original left prev_topic undefined
        # until the first turn completed, risking a NameError (or a stale
        # topic from the previous session) at the first segment boundary.
        prev_topic = None

        for (log_utter, label_utter), track_utter in zip(session, track_session["utterances"]):
            utter_counter += 1.0

            bio = log_utter['segment_info']['target_bio']
            if bio == 'B':
                # Beginning of a new segment: score the previous segment's
                # final frames under schedule 2, then pick up the new frames.
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']

                for (topic, slot), schedule, stat_class in stats:
                    if schedule == 2:
                        _score(stat_class, topic, slot, prev_topic,
                               prev_track_frame, prev_ref_frame)

            elif bio == 'I':
                ref_frame = label_utter['frame_label']
                track_frame = track_utter['frame_label']
            elif bio == 'O':
                ref_frame = None
                track_frame = None

            # Schedule 1 scores the current frames on every utterance.
            for (topic, slot), schedule, stat_class in stats:
                if schedule == 1:
                    _score(stat_class, topic, slot,
                           log_utter['segment_info']['topic'],
                           track_frame, ref_frame)

            prev_ref_frame = ref_frame
            prev_track_frame = track_frame
            prev_topic = log_utter['segment_info']['topic']

        # End of session: the last segment never hits a 'B' boundary, so
        # score its final frames under schedule 2 here.  (Bug fix: the
        # original mixed track_frame/ref_frame into this block where it
        # meant prev_track_frame/prev_ref_frame.)
        for (topic, slot), schedule, stat_class in stats:
            if schedule == 2:
                _score(stat_class, topic, slot, prev_topic,
                       prev_track_frame, prev_ref_frame)

    # Plain writes instead of the Python-2-only ``print >> file`` syntax;
    # output is byte-identical and works on Python 2 and 3.
    with open(args.scorefile, 'w') as csvfile:
        csvfile.write("topic, slot, schedule, stat, N, result\n")

        for (topic, slot), schedule, stat_class in stats:
            for stat_subname, N, result in stat_class.results():
                if result is None:
                    result = "-"
                else:
                    result = "%.7f" % result
                csvfile.write("%s, %s, %i, %s, %i, %s\n" % (topic, slot, schedule, stat_subname, N, result))

        csvfile.write('basic,total_wall_time,,,,%s\n' % (tracker_output['wall_time'],))
        csvfile.write('basic,sessions,,,,%s\n' % (len(sessions),))
        csvfile.write('basic,utterances,,,,%i\n' % (int(utter_counter),))
        csvfile.write('basic,wall_time_per_utterance,,,,%s\n' % (tracker_output['wall_time'] / utter_counter,))
        csvfile.write('basic,dataset,,,,%s\n' % (tracker_output['dataset'],))
示例#39
0
def main(argv):
	#
	# CMD LINE ARGS
	# 
	install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	utils_dirname = os.path.join(install_path,'lib')
	version_filename = os.path.join(install_path,'VERSION')
	f = open(version_filename)
	scorer_version = f.readline().strip()
	f.close()
	sys.path.append(utils_dirname)
	from dataset_walker import dataset_walker
	list_dir = os.path.join(install_path,'config')

	parser = argparse.ArgumentParser(description='Evaluate output from a belief tracker.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
						help='The dataset to analyze, for example train1 or test2 or train3a')
	parser.add_argument('--dataroot',dest='dataroot',action='store', metavar='PATH', required=True,
						help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='scorefile',action='store',metavar='JSON_FILE',required=True,
						help='File containing score JSON')
	parser.add_argument('--scorefile',dest='csv',action='store',metavar='CSV_FILE',required=True,
						help='File to write with CSV scoring data')
	parser.add_argument('--markfile',dest='markfile',action='store',metavar='JSON_FILE',
						help='Optional: re-write scorefile with scoring mark-up (for debugging and checking scoring process)')
	parser.add_argument('--csvdetail',dest='csvdetail',action='store',metavar='CSV_FILE',
						help='Optional: output a CSV file showing how each turn was scored (for error analysis)')
	parser.add_argument('--rocbins',dest='rocbins',action='store',metavar='INT',default=10000,type=int,
						help='ROC bins to use (default 10000).  Lower numbers make the script run faster, but produce less accurate ROC results.')
	parser.add_argument('--rocdump',dest='rocdump',action='store',metavar='FILE_STEM',
						help='If present, use this file stem to write out ROC plot data: filestem.<schedule>.<slot>.<type>.csv, where type is either roc (which contains the ROC curve coordinates) or scores (which contains the raw scores used to compute the ROC curves).')
	args = parser.parse_args()

	sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)	
	tracker_results = json.load(open(args.scorefile))

	csvfile = open(args.csv,'w')
	if (args.csvdetail):
		detail = []
		detail.append( ['session-id','turn-index','slot-group','label','instantiated-label','oracle-label',
						'schedule1','schedule2','schedule3','top-hyp'] )
	
	stats = {}
	for meta_slot in SLOT_GROUPS + ['joint','all']:
		stats[meta_slot] = {}
		for schedule in SCHEDULES:
			stats[meta_slot][schedule] = {
				'accuracy': Stat_Accuracy(),
				#'mrr':	  Stat_MRR(),
				#'roc':	  Stat_ROC(bins=args.rocbins),
				#'l2':	   Stat_L2(),
				#'avgp':	 Stat_AverageProb(),
				#'nonempty': Stat_NonEmpty(),
				#'hypcount': Stat_HypCount(),
				}

	# mark each label hyp as being correct or not
	# also indicate whether each utt is on each kind of "schedule"
	turn_counter = 0
	session_counter = 0
	for session_tracker,session in zip(tracker_results['sessions'],sessions):
		session_counter += 1
		session_id = session.log['session-id']
		offlist_flag = {}
		for meta_slot in SLOT_GROUPS + ['joint']:
			offlist_flag[meta_slot] = True
		labels = {}
		for grounded_slot in SLOT_GROUPS:
			labels[grounded_slot] = {}
		turn_index = 0
		for (log_turn,label_turn),tracker_turn in zip(session,session_tracker['turns']):
			turn_index += 1
			turn_counter += 1
			# check if this was a start-over
			if (log_turn['restart'] == True):
				for grounded_slot in SLOT_GROUPS:
					labels[grounded_slot] = {}
				for meta_slot in SLOT_GROUPS + ['joint']:
					offlist_flag[meta_slot] = True

			# accumulate labels
			for slu_label_entry in label_turn['slu-labels']:
				try:
					slot_group,pairset_key = _MakePairsetKey(slu_label_entry['slots'])
				except ScoreError as e:
					raise RuntimeError,'Problem with label file: %s' % (e.msg)
				if (pairset_key not in labels[slot_group]):
					labels[slot_group][pairset_key] = slu_label_entry['label']
				elif (labels[slot_group][pairset_key] == False and slu_label_entry['label'] == True):
					# the old label was False but the new label is True
					# change label to True
					labels[slot_group][pairset_key] = True

			for meta_slot in SLOT_GROUPS + ['joint']:				
				# check if tracker guessed anything at all for this slot 
				if (meta_slot not in tracker_turn):
					tracker_turn[meta_slot] = {}
					tracker_turn[meta_slot]['hyps'] = []
				tracker_turn_slot = tracker_turn[meta_slot]#the hypes of the slot

				# sort traker_hyps (in case they weren't sorted already)
				tracker_turn_slot['hyps'].sort(key=lambda x:x['score'],reverse=True)

				#print >>sys.stderr,'Working on session %s, turn %s, slot %s' % (
				#	session.log['session-id'],
				#	log_turn['turn-index'],
				#	meta_slot)

				# check which schedules this turn is on
				if (meta_slot != 'joint'):
					tracker_turn_slot['schedules'] = GetTurnSchedulesGroundedSlot(log_turn,meta_slot,args.dataset)
				else:
					tracker_turn_slot['schedules'] = GetTurnSchedulesJoint(tracker_turn)

				# check whether ANY correct value has been observed yet (offlist_flag)
				if (offlist_flag[meta_slot] == True):
					if (meta_slot != 'joint'):
						offlist_flag[meta_slot] = AreAllItemsIncorrectGroundedSlot(labels[meta_slot])
					else:
						offlist_flag[meta_slot] = AreAllItemsIncorrectJoint(tracker_turn)
				tracker_turn_slot['offlist_flag'] = offlist_flag[meta_slot]

				# compute offlist score
				offlist_score = 1.0
				total = 0.0
				for i,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
					if (tracker_hyp['score'] < 0.0):
						print >>sys.stderr,'WARNING: Score is less than 0.0 (%s); changing to 0.0 (session %s, turn %s, slot %s, hyp %s)' % (
							tracker_hyp['score'],
							session.log['session-id'],
							log_turn['turn-index'],
							meta_slot,
							i)
						tracker_hyp['score'] = 0.0
					offlist_score -= tracker_hyp['score']
					total += tracker_hyp['score']
				if (offlist_score < 0.0):
					print >>sys.stderr,'WARNING: Scores sum to more than 1.0 (%s); normalizing and setting offlist_score to 0.0 (session %s, turn %s, slot %s)' % (
						1.0 - offlist_score,
						session.log['session-id'],
						log_turn['turn-index'],
						meta_slot)
					offlist_score = 0.0
					for tracker_hyp in tracker_turn_slot['hyps']:
						tracker_hyp['score'] = tracker_hyp['score'] / total
				tracker_turn_slot['offlist_score'] = offlist_score

				# assign correctness values to labels 
				for i,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
					try:
						tracker_hyp['label'] = AssignLabelToTrackerHyp(tracker_hyp['slots'],labels,meta_slot,tracker_turn)
					except ScoreError as e:
						print sys.stderr,'WARNING: %s (session %s, turn %s, slot %s, hyp %s); assigning incorrect' % (e.msg,
																										   session.log['session-id'],
																										   log_turn['turn-index'],
																										   meta_slot,
																										   i)
						tracker_hyp['label'] = False

				# for convenience, compute a list of True/False values that describe correctness
				tracker_turn_slot['all-hyps'] = []
				tracker_turn_slot['all-hyps'].append( {
						'hyp': None, 
						'label': tracker_turn_slot['offlist_flag'],
						'score': tracker_turn_slot['offlist_score'],
				})
				for hyp_index,tracker_hyp in enumerate(tracker_turn_slot['hyps']):
					tracker_turn_slot['all-hyps'].append( {
							'hyp': tracker_hyp,
							'label': tracker_hyp['label'],
							'score': tracker_hyp['score'],
							})
				tracker_turn_slot['all-hyps'].sort(key=lambda x: x['score'],reverse=True)

				# compute stats according to each schedule
				for schedule in SCHEDULES:
					if ( tracker_turn_slot['schedules'][schedule] ):
						for stat_type in stats[meta_slot][schedule]:
							stats[meta_slot][schedule][stat_type].AddTurn(tracker_turn_slot,log_turn)				
							if (meta_slot in SLOT_GROUPS):
								stats['all'][schedule][stat_type].AddTurn(tracker_turn_slot,log_turn)

				# save details
				# ['session-id','turn-index','slot-group','label','schedule1','schedule2','schedule3','top-hyp'] 
				if (args.csvdetail):
					if (tracker_turn_slot['all-hyps'][0]['hyp'] == None):
						top_hyp = None
					else:
						top_hyp_array = []
						for k in sorted(tracker_turn_slot['all-hyps'][0]['hyp']['slots'].keys()):
							top_hyp_array.append( [k,tracker_turn_slot['all-hyps'][0]['hyp']['slots'][k]] )
						top_hyp = ';'.join( [ '='.join([k,str(v)]) for k,v in top_hyp_array] )
					# is top instantiated hyp correct?
					for tracker_hyp in tracker_turn_slot['all-hyps']:
						if (tracker_hyp['hyp'] != None):
							instantiated_label = tracker_hyp['label']
							break
					else:
						instantiated_label = None

					for tracker_hyp in tracker_turn_slot['all-hyps']:
						if (tracker_hyp['hyp'] == None):
							continue
						if (tracker_hyp['label'] == True):
							oracle_label = True
							break
					else:
						oracle_label = False

					detail.append([
							session_id,
							log_turn['turn-index'],
							meta_slot,
							tracker_turn_slot['all-hyps'][0]['label'],
							instantiated_label,
							oracle_label,
							tracker_turn_slot['schedules']['schedule1'],
							tracker_turn_slot['schedules']['schedule2'],
							tracker_turn_slot['schedules']['schedule3'],
							top_hyp,
							])
						
			# handle last turn and restart turns for schedule3
			if (len(session.log['turns']) == turn_index or			 # this is the last turn
				session.log['turns'][turn_index]['restart'] == True):  # next turn is a restart
				any_slot_found = False
				for grounded_slot in SLOT_GROUPS:
					if (len(labels[grounded_slot]) > 0):
						any_slot_found = True
						tracker_turn_slot = tracker_turn[grounded_slot]
						tracker_turn_slot['schedules']['schedule3'] = True
						for stat_type in stats[grounded_slot]['schedule3']:
							stats[grounded_slot]['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)
							stats['all']['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)
				if (any_slot_found):
					tracker_turn_slot = tracker_turn['joint']
					tracker_turn_slot['schedules']['schedule3'] = True
					for stat_type in stats['joint']['schedule3']:
						stats['joint']['schedule3'][stat_type].AddTurn(tracker_turn_slot,log_turn)

	# compute stats according to each schedule
	for meta_slot in sorted(SLOT_GROUPS) + ['joint','all']:
		for schedule in sorted(SCHEDULES):
			if (args.rocdump):
				rocfile = args.rocdump + '.' + schedule + '.' + meta_slot + '.roc.csv'
				stats[meta_slot][schedule]['roc'].DumpROCToFile(rocfile)
				rawfile = args.rocdump + '.' + schedule + '.' + meta_slot + '.scores.csv'
				stats[meta_slot][schedule]['roc'].DumpScoresToFile(rawfile)
			for stat_type in sorted(stats[meta_slot][schedule]):
				R = stats[meta_slot][schedule][stat_type].Result()
				N = stats[meta_slot][schedule][stat_type].N
				for name,r in sorted(R,key=lambda x:x[0]):
					if (name == ''):
						print_name = stat_type
					else:
						print_name = '%s.%s' % (stat_type,name)
					print >>csvfile,'%s,%s,%s,%s,%s' % (meta_slot,schedule,print_name,N,r)

	print >>csvfile,'basic,,total_wall_time,,%s' % (tracker_results['wall-time'])
	print >>csvfile,'basic,,sessions,,%s' % (session_counter)
	print >>csvfile,'basic,,turns,,%s' % (turn_counter)
	print >>csvfile,'basic,,wall_time_per_turn,,%s' % (tracker_results['wall-time'] / turn_counter)
	print >>csvfile,'basic,,dataset,,%s' % (tracker_results['dataset'] )
	print >>csvfile,'basic,,scorer_version,,%s' % (scorer_version )
	
	csvfile.close()

	# optional: save label file
	if (args.markfile):
		f = open(args.markfile,'w')
		json.dump(tracker_results,f,indent=2)
		f.close()

	# optional: save csvdetail file
	if (args.csvdetail):
		f = open(args.csvdetail,'w')
		for row in detail:
			print >>f,','.join( [str(x) for x in row] )
		f.close()
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j,
                 i,
                 cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Walk the DSTC2 dev set with gold labels attached.
    dataset = dataset_walker("dstc2_dev", dataroot="data", labels=True)

    # Slots the user can constrain the search with (informable slots).
    informable = ['area', 'food', 'name', 'pricerange']
    # Slots the user can ask the system about: the informables plus
    # contact/identification details.
    requestable = informable + ['addr', 'phone', 'postcode', 'signature']
    # Dialog act types the machine side can emit.
    # NOTE(review): 'welcomemsg' appears twice in this list — looks like an
    # unintended duplicate; confirm before relying on the list's length or
    # using it to index act positions.
    machineActs = [
        'affirm', 'bye', 'canthear', 'confirm-domain', 'negate', 'repeat',
        'reqmore', 'welcomemsg', 'canthelp', 'canthelp.missing_slot_value',
        'canthelp.exception', 'expl-conf', 'impl-conf', 'inform', 'offer',
        'request', 'select', 'welcomemsg'
    ]

    # Dialog act types the user side can emit.
    userActs = [
        'ack', 'affirm', 'bye', 'hello', 'help', 'negate', 'null', 'repeat',
        'reqalts', 'reqmore', 'restart', 'silence', 'thankyou', 'confirm',
        'deny', 'inform', 'request'
    ]
示例#41
0
def main(argv):
    """CNN baseline driver for the DSTC5 SAP task (multi-task variant).

    Loads train/dev/test dialog sessions, builds character-level padded
    inputs and multi-label binarized targets (category / attribute /
    speech-act), splits utterances by speaker role (tourist vs. guide),
    loads pre-trained embeddings and runs the SLU task once per role.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # All three dataset walkers load labels and translations.
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)

    # The dev set is folded into the training data.
    train_utters += dev_utters

    context_case = 1
    # TODO: write the code that builds the previous-labels context here!
    # 1) previous N speech acts (no speaker distinction)
    # 2) all speech acts of the other speaker's utterances in the previous turn (n acts)
    # NOTE(review): both branches below are currently empty placeholders.
    if context_case == 1:

        pass

    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    # Character-level tokenization: utter[0] is the utterance text.
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    # Tuple layout (inferred from the indices used here — confirm against
    # data_helpers.load_dstc5_dataset_multitask): [3]=category labels,
    # [4]=attribute labels, [5]=speech-act labels.
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    # Fit each binarizer on train+test so both splits share one label space.
    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)

    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)

    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(
        train_labels_category)
    test_labels_category = label_binarizer_category.transform(
        test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    # utter[1] is the speaker role string ('tourist' or 'guide').
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    # Shuffle training order only; test order stays fixed for evaluation.
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[
        tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr, tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category, guide_train_labels_attr,
                          guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr, tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # Train/evaluate the SLU model once per speaker role.
    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)
Example #42
0
def main(argv):
    """End-to-end evaluation client for DSTC5 pilot tasks (SLU/SAP/SLG/EES).

    First pass: walk the dataset and build `info_for_tasks`, concatenating
    consecutive utterances of the same speaker into single "turns".
    Second pass: for each registered team, open a websocket connection,
    send each turn as a JSON message for every (task, role) combination,
    score the predictions, and write per-task CSV result files.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    sys.path.append(utils_dirname)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        help='The directory where to find the data [default: data]',
        default='data')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    args = parser.parse_args()

    # dir_output is a module-level global (defined outside this view).
    if not os.path.exists(dir_output):
        print("...creating " + dir_output)
        os.makedirs(dir_output)

    dataset = args.dataset
    # Save first the information, then later we start calling the team
    info_for_tasks = fntInitStruct(dataset)
    sessions = ds.dataset_walker(dataset, dataroot=args.dataroot, labels=True)
    print('Collecting information for all pilot tasks')
    for session in sessions:
        session_id = session.log['session_id']
        info_for_tasks = fntInitSession(info_for_tasks, session_id)

        utter_span = [
        ]  # Saves the utterance indexes for different concatenated turns
        # Saves the temporal information along several continuous turns for a given user
        concatenated_info = {
            'transcripts': [],
            'speech_acts': [],
            'slots': [],
            'uniq_sa': {}
        }
        utter_index = 0
        # now iterate through turns
        current_role = session.log['utterances'][0]['speaker'].upper()
        for id, (log_utter, label_utter) in enumerate(session):
            # We concatenate the utterance + semantic slots + speech_acts
            if log_utter['speaker'].upper() == current_role:
                # Same speaker as before: extend the current concatenated turn.
                utter_span.append(str(log_utter['utter_index']))
                concatenated_info = fntConcatInfo(
                    concatenated_info, log_utter['transcript'],
                    ' '.join(label_utter['semantic_tagged']),
                    label_utter['speech_act'])
            else:  # Change the user
                # Speaker changed: flush the accumulated turn, then start a
                # fresh one with the current utterance.
                info_for_tasks = fntAddUtterance(
                    info_for_tasks, session_id, utter_index,
                    '_'.join(utter_span), current_role,
                    concatenated_info['speech_acts'],
                    ' '.join(concatenated_info['slots']),
                    ' '.join(concatenated_info['transcripts']))
                concatenated_info = {
                    'transcripts': [],
                    'speech_acts': [],
                    'slots': [],
                    'uniq_sa': {}
                }
                # Restart the process for the next user
                utter_index += 1
                current_role = log_utter['speaker'].upper()
                utter_span = []
                utter_span.append(str(log_utter['utter_index']))
                concatenated_info = fntConcatInfo(
                    concatenated_info, log_utter['transcript'],
                    ' '.join(label_utter['semantic_tagged']),
                    label_utter['speech_act'])
        # Flush the final (still-open) concatenated turn of the session.
        info_for_tasks = fntAddUtterance(
            info_for_tasks, session_id, utter_index, '_'.join(utter_span),
            current_role, concatenated_info['speech_acts'],
            ' '.join(concatenated_info['slots']),
            ' '.join(concatenated_info['transcripts']))

    # Now we start the process of asking information for each team
    for team in info_teams:
        # Configuration of logger for each team
        genlog_fh = logging.handlers.RotatingFileHandler(
            dir_output + '/' + team + '.log',
            mode='a',
            maxBytes=MAXBYTESLOG,
            backupCount=BACKUPCOUNT,
            encoding="latin-1")  # save up to 1 GB before rotation
        genlog_fh.setLevel(logging.DEBUG)
        genlog_fh.setFormatter(formatter)
        logger.addHandler(genlog_fh)
        logger.info('Processing team ' + team)
        url = info_teams[team]['url']
        logger.info('Connecting to ' + url + ' for ' + team)
        # websocket.enableTrace(True)  # To check the content of each data send to the server
        ws = websocket.create_connection(url)
        ws.settimeout(MAX_TIMEOUT)
        stats = {}
        for pilottask in info_teams[team]['tasks']:
            logger.info('Doing task: ' + pilottask)
            for roletype in info_teams[team]['roles']:
                # (Re)initialize the statistics relevant to this task;
                # entries from a previous task may remain in `stats` but are
                # overwritten before use.
                if pilottask == 'SLU':
                    stats['semantic_tagged'] = {}
                    stats['semantic_tagged'][
                        'detection'] = Stat_Precision_Recall()
                    stats['semantic_tagged']['class'] = Stat_Precision_Recall()
                    stats['semantic_tagged']['all'] = Stat_Precision_Recall()

                if pilottask == 'SLU' or pilottask == 'SAP':
                    stats['speech_act'] = {}
                    stats['speech_act']['act'] = Stat_Precision_Recall()
                    stats['speech_act']['all'] = Stat_Precision_Recall()

                if pilottask == 'SLG' or pilottask == 'EES':
                    stats['utt_transcriptions'] = {}
                    stats['utt_transcriptions']['all'] = Stat_BLEU_AM_FM()

                logger.info('Doing role: ' + roletype)

                for session_id in info_for_tasks['sessions']:
                    logger.info('Processing session: ' + str(session_id))
                    for n_utt in sorted(info_for_tasks['sessions'][session_id]
                                        ['utterances']):
                        utterance = info_for_tasks['sessions'][session_id][
                            'utterances'][n_utt]
                        logger.info('utterance: ' + str(n_utt))
                        if pilottask == 'SLU':
                            # INPUT: The user's utterance,
                            # OUTPUT: The current user's slots and speech acts
                            if utterance['role_type'] == roletype:
                                ref_sa_list = utterance['speech_act']
                                ref_tagged = utterance['semantic_tagged']
                            else:
                                ref_sa_list = []
                                ref_tagged = ''

                            jsonMsg = fntCreateJSONMessage(
                                info_for_tasks['dataset'], session_id, n_utt,
                                roletype, utterance['role_type'], pilottask,
                                utterance['transcript'], None, None, None)
                            pred_sa_list, pred_tagged = fntSendMessage(
                                ws, pilottask, jsonMsg)
                            eval_acts(ref_sa_list, pred_sa_list,
                                      stats['speech_act'])
                            eval_semantics(ref_tagged, pred_tagged,
                                           stats['semantic_tagged'])

                        elif pilottask == 'SAP':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: The user's utterance + speech acts and semantic tags for the current user + semantic tags for the next user
                            # OUTPUT: The next user's speech acts
                            if utterance['role_type'] == roletype:
                                if n_utt + 1 in info_for_tasks['sessions'][
                                        session_id][
                                            'utterances']:  # Check there is a next turn
                                    ref_sa_list = info_for_tasks['sessions'][
                                        session_id]['utterances'][
                                            n_utt + 1]['speech_act']
                                    jsonMsg = fntCreateJSONMessage(
                                        info_for_tasks['dataset'], session_id,
                                        n_utt, roletype,
                                        utterance['role_type'], pilottask,
                                        utterance['transcript'],
                                        utterance['speech_act'],
                                        utterance['semantic_tagged'],
                                        info_for_tasks['sessions'][session_id]
                                        ['utterances'][n_utt +
                                                       1]['semantic_tagged'])
                                    pred_sa_list = fntSendMessage(
                                        ws, pilottask, jsonMsg)
                                    eval_acts(ref_sa_list, pred_sa_list,
                                              stats['speech_act'])

                        elif pilottask == 'SLG':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: Speech acts and semantic tags for the current user
                            # OUTPUT: The user's utterance
                            if utterance['role_type'] == roletype:
                                ref = info_for_tasks['sessions'][session_id][
                                    'utterances'][n_utt]['transcript']
                                jsonMsg = fntCreateJSONMessage(
                                    info_for_tasks['dataset'], session_id,
                                    n_utt, roletype, utterance['role_type'],
                                    pilottask, None, utterance['speech_act'],
                                    utterance['semantic_tagged'], None)
                                pred = fntSendMessage(ws, pilottask, jsonMsg)
                                eval_utt(ref, pred,
                                         stats['utt_transcriptions'])

                        elif pilottask == 'EES':
                            # Here we need to concatenate the turns for a given role user + the slots for the following user
                            # INPUT: The current user's utterance
                            # OUTPUT: The next user's utterance
                            if utterance['role_type'] == roletype:
                                if n_utt + 1 in info_for_tasks['sessions'][
                                        session_id][
                                            'utterances']:  # Check there is a next turn
                                    ref = info_for_tasks['sessions'][
                                        session_id]['utterances'][
                                            n_utt + 1]['transcript']
                                    jsonMsg = fntCreateJSONMessage(
                                        info_for_tasks['dataset'], session_id,
                                        n_utt, roletype,
                                        utterance['role_type'], pilottask,
                                        utterance['transcript'], None, None,
                                        None)
                                    pred = fntSendMessage(
                                        ws, pilottask, jsonMsg)
                                    eval_utt(ref, pred,
                                             stats['utt_transcriptions'])

                # Save the final results in a CSV file
                with codecs.open(
                        dir_output + '/' + team + '_' + pilottask + '_' +
                        roletype + '.csv', 'w', 'utf-8') as f:
                    f.write("task, subtask, schedule, stat, N, result\n")
                    for subtask in stats:
                        for schedule in stats[subtask]:
                            for measure, N, result in stats[subtask][
                                    schedule].results():
                                f.write("%s, %s, %s, %s, %i, %s\n" %
                                        (pilottask, subtask, schedule, measure,
                                         N, result))

        logger.info('Closing the connection with team ' + team)
        # Remove the per-team log handler before moving to the next team.
        logger.removeHandler(genlog_fh)
        ws.close()
Example #43
0
def main(argv):
	"""Simple hand-crafted dialog state tracker baseline.

	For every session in --dataset, keeps only the top live SLU hypothesis
	of each turn: for each slot group in SLOTS the matching slot/value
	pairs of the last matching act hypothesis are stored, and a 'joint'
	hypothesis is formed whose score is the mean of the per-slot scores.
	The resulting track is written as JSON to --trackfile.

	With --null, every turn gets the same empty state (score 1.0 "none of
	the above"); with --ignorescores, per-slot scores are forced to 1.0.
	"""
	#
	# CMD LINE ARGS
	#
	install_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	utils_dirname = os.path.join(install_path,'lib')
	sys.path.append(utils_dirname)
	from dataset_walker import dataset_walker
	list_dir = os.path.join(install_path,'config')

	parser = argparse.ArgumentParser(description='Simple hand-crafted dialog state tracker baseline.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True,
						help='The dataset to analyze, for example train1 or test2 or train3a')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH',
						help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='scorefile',action='store',required=True,metavar='JSON_FILE',
						help='File to write with tracker output')
	parser.add_argument('--null',dest='null',action='store_true',
						help='Always output "None of the above" for all slots with score 1.0')
	parser.add_argument('--ignorescores',dest='ignorescores',action='store_true',
						help='Ignore score in data; always use a score of 1.0 (nop if --null also specified)')
	args = parser.parse_args()

	sessions = dataset_walker(args.dataset,dataroot=args.dataroot,labels=False)
	start_time = time.time()
	r = {
		'sessions': [],
		'dataset': args.dataset,
		}

	for session in sessions:
		r['sessions'].append( { 'turns': [], 'session-id': session.log['session-id'], } )
		state = _InitState()
		if args.null:
			state['joint'] = { 'hyps': [], }
		for turn_index,(log_turn,scratch) in enumerate(session):
			if args.null:
				# --null: emit the same empty state object for every turn.
				r['sessions'][-1]['turns'].append(state)
				continue
			# check whether to initialize state or copy
			if log_turn['restart'] or turn_index == 0:
				state = _InitState()
			else:
				state = copy.deepcopy(state)
			r['sessions'][-1]['turns'].append(state)
			if len(log_turn['input']['live']['slu-hyps']) == 0:
				# no recognition results; skip
				continue
			slu_hyp = log_turn['input']['live']['slu-hyps'][0]
			joint = {}
			joint_scores = []
			for slot in SLOTS:
				for act_hyp in slu_hyp['slu-hyp']:
					# Collect the slot/value pairs belonging to this slot group.
					this_pairset = {}
					for found_slot,val in act_hyp['slots']:
						if found_slot.startswith(slot):
							this_pairset[found_slot] = val
					if len(this_pairset) == 0:
						continue
					# Only a single hypothesis is kept per slot; the last
					# matching act hypothesis wins. (The dead `if True:`
					# wrapper around this assignment has been removed.)
					score = slu_hyp['score'] if not args.ignorescores else 1.0
					state[slot]['hyps'] = [ {
							'score-save': slu_hyp['score'],
							'score': score,
							'slots': this_pairset,
							} ]
				if len(state[slot]['hyps']) > 0:
					joint_scores.append( state[slot]['hyps'][0]['score'] )
					for (my_slot,my_val) in state[slot]['hyps'][0]['slots'].items():
						joint[my_slot] = my_val
			# Joint hypothesis: union of per-slot pairs, mean of their scores.
			state['joint'] = { 'hyps': [], }
			if len(joint_scores) > 0:
				state['joint']['hyps'].append( {
						'score': sum(joint_scores) / len(joint_scores),
						'slots': joint,
						} )
		# Strip the 'score-save' bookkeeping field before serialization.
		for turn in r['sessions'][-1]['turns']:
			for slots_entry in turn.values():
				for hyp_entry in slots_entry['hyps']:
					if 'score-save' in hyp_entry:
						del hyp_entry['score-save']
	end_time = time.time()
	elapsed_time = end_time - start_time
	r['wall-time'] = elapsed_time

	# Context manager guarantees the track file is closed even on error.
	with open(args.scorefile,'w') as f:
		json.dump(r,f,indent=2)
Example #44
0
def main(argv):
    """Speech-act label statistics for the DSTC5 SAP task.

    Over the training set, counts how often consecutive utterances of the
    target role carry the same speech-act label string, and how many
    utterances carry more than one act_attribute label. Then loads the
    test set into (translation, speaker, sorted labels) tuples and prints
    the first two of each list.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')
    parser.add_argument('--roletype',
                        dest='roletype',
                        action='store',
                        choices=['guide', 'tourist'],
                        required=True,
                        help='speaker')

    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')

    last_speaker = args.roletype
    last_sa_label_str = None
    total = 0  # consecutive same-speaker utterances of the target role
    same = 0  # ... whose label string did not change
    multilabel_utter_cnt = 0  # utterances with more than one act_attr label
    utter_cnt = 0

    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                # Other speaker interrupts: reset the label history.
                last_sa_label_str = None
            else:
                # transcript is tokenized but unused in this statistics pass.
                transcript = data_helpers.tokenize_and_lower(
                    log_utter['transcript'])
                speech_act = label_utter['speech_act']
                # Flatten each act into one "act_attribute" label per attribute.
                sa_label_list = [
                    '%s_%s' % (sa['act'], attr)
                    for sa in speech_act
                    for attr in sa['attributes']
                ]

                if len(sa_label_list) > 1:
                    multilabel_utter_cnt += 1
                utter_cnt += 1

                sa_label_str = '|'.join(sa_label_list)
                if log_utter['speaker'] == last_speaker:
                    total += 1
                    if last_sa_label_str is None or sa_label_str == last_sa_label_str:
                        same += 1

                last_sa_label_str = sa_label_str
            last_speaker = log_utter['speaker']
    sys.stderr.write('Done\n')

    # Guard against empty datasets to avoid ZeroDivisionError.
    if total > 0:
        print("same/total=ratio: %d/%d=%.4f" %
              (same, total, 1.0 * same / total))
    if utter_cnt > 0:
        print("multi_label/total=ratio: %d/%d=%.4f" %
              (multilabel_utter_cnt, utter_cnt,
               (1.0 * multilabel_utter_cnt / utter_cnt)))

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except (KeyError, IndexError, TypeError):
                # No translation hypothesis available for this utterance.
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = [
                '%s_%s' % (sa['act'], attr)
                for sa in speech_act
                for attr in sa['attributes']
            ]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])
def errorAnalysis(argv):
    """Run the NaiveEnsemble tracker on a labeled dataset (Python 2).

    For every utterance the tracker output is compared against the gold
    frame_label and mismatches (missing slots/values, redundant values)
    are printed to stdout.

    NOTE(review): --trackfile is required by the parser but the collected
    `track` dict is never written to it here — verify whether a json.dump
    was intended.
    """
    print "ERROR ANALYSIS OF NAIVEENSEMBLER"
    print argv

    parser = argparse.ArgumentParser(description="Simple hand-crafted dialog state tracker baseline.")
    parser.add_argument(
        "--dataset", dest="dataset", action="store", metavar="DATASET", required=True, help="The dataset to analyze"
    )
    parser.add_argument(
        "--dataroot",
        dest="dataroot",
        action="store",
        required=True,
        metavar="PATH",
        help="Will look for corpus in <destroot>/<dataset>/...",
    )
    parser.add_argument(
        "--trackfile",
        dest="trackfile",
        action="store",
        required=True,
        metavar="JSON_FILE",
        help="File to write with tracker output",
    )
    parser.add_argument(
        "--ontology", dest="ontology", action="store", metavar="JSON_FILE", required=True, help="JSON Ontology file"
    )

    # args = parser.parse_args()
    args = parser.parse_args(argv)
    # labels=True: gold annotations are needed for the error analysis below.
    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    tracker = NaiveEnsembleBasedTrackerWithNBest(tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, label) in call:
            # -mae shori2
            # 'B' marks the beginning of a new sub-dialogue segment.
            if utter["segment_info"]["target_bio"] == "B":
                print "\n -----New sub-dialogue----------------------------------------------------"
            print "s:" + str(call.log["session_id"]) + " u:" + str(utter["utter_index"])
            print "Input=" + utter["transcript"]
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
                #
                print "Tracker's output:"
                print tracker_result
                # Compare the tracker output against the gold frame label.
                if "frame_label" in label:
                    for slot in label["frame_label"].keys():
                        if slot not in tracker_result["frame_label"]:
                            print "-slot [" + slot + "] is not exsisted in output"
                            for value in label["frame_label"][slot]:
                                print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
                        else:
                            if len(label["frame_label"][slot]) != len(tracker_result["frame_label"][slot]):
                                # In case value in output, but repudant
                                print "-slot [" + slot + "] include repudant values"
                            for value in label["frame_label"][slot]:
                                # In case value not in output
                                if value not in tracker_result["frame_label"][slot]:
                                    print "-value [" + value + "] of slot [" + slot + "] is not exsisted in output"
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track["wall_time"] = elapsed_time
def main(argv):
    """Run the NaiveEnsemble tracker over an unlabeled dataset (Python 2).

    Collects per-session tracker outputs and writes the resulting track,
    including wall time, as JSON to --trackfile.
    """
    print argv

    parser = argparse.ArgumentParser(
        description='Simple hand-crafted dialog state tracker baseline.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',
                        dest='trackfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='JSON Ontology file')

    #args = parser.parse_args()
    args = parser.parse_args(argv)
    # labels=False: this is a pure tracking run, no gold annotations needed.
    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            labels=False)
    tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

    # NOTE(review): 'wb' with json.dump is Python 2 usage; under Python 3
    # json.dump writes str and would need a text-mode file.
    track_file = open(args.trackfile, "wb")
    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()

    tracker = NaiveEnsembleBasedTrackerWithNBest(
        tagsets, nameOfODictPickle="dictOutput.pic")
    for call in dataset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}
        tracker.reset()
        for (utter, _) in call:
            # Progress trace: session_id:utter_index on stderr.
            sys.stderr.write('%d:%d\n' %
                             (call.log['session_id'], utter['utter_index']))
            tracker_result = tracker.addUtter(utter, call)
            if tracker_result is not None:
                this_session["utterances"].append(tracker_result)
        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track['wall_time'] = elapsed_time

    json.dump(track, track_file, indent=4)

    track_file.close()
Example #47
0
def main():
    """Run the YARBUS rule-based dialog state tracker over a dataset.

    Walks every dialog in --dataset (or only the one named by --session_id),
    feeds each turn to a YARBUS_Tracker configured from --thr_belief and
    --rule_set, and writes the accumulated track as JSON to --trackfile.
    """
    #print_gplv3()

    parser = argparse.ArgumentParser(
        description='YARBUS Rule-based Dialog State Tracker Baseline V1.0\n  by Jeremy Fix\t [email protected] \n  and Hervé Frezza-Buet\t [email protected]\n',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='The ontology to use')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',
                        dest='trackfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument(
        '--thr_belief',
        dest='thr_belief',
        action='store',
        required=False,
        default=0.0,
        type=float,
        help=
        'Sets the threshold below which the hypothesis in the belief are removed'
    )
    parser.add_argument(
        '--rule_set',
        dest='rule_set',
        action='store',
        required=False,
        default=31,
        type=int,
        help='Specifies which rule set to use, an int in [0, 31]')
    parser.add_argument('--session_id',
                        dest='session_id',
                        action='store',
                        required=False,
                        metavar='voip-...',
                        help='A particular session id to run on')

    args = parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot)
    session_id = None
    if args.session_id:
        session_id = args.session_id
        # Parenthesized call form: valid and identical under Python 2 and 3
        # (the original used the Python-2-only 'print' statement here).
        print('Running on session_id ' + session_id)
    # Per-turn diagnostics are only printed when a single session is targeted.
    verbose = (session_id is not None)

    ontology = load_ontology(args.ontology)

    track = {"sessions": []}
    track["dataset"] = args.dataset
    start_time = time.time()
    print(
        "Yarbus will prune its belief with a threshold of %f ; to change this, check out the option --thr_belief"
        % args.thr_belief)
    tracker = YARBUS_Tracker(ontology["informable"].keys(), args.thr_belief,
                             args.rule_set)

    nb_dialogs = len(dataset)
    print("%i dialogs to process" % nb_dialogs)
    for index_call, call in enumerate(dataset):
        # When a specific session was requested, skip all the others.
        if (session_id and session_id != call.log["session-id"]):
            continue
        if (verbose):
            print("Processing session : " + call.log["session-id"])
        else:
            # Rewrite the same console line as a progress indicator.
            sys.stdout.write('\r Processing dialog %i / %i' %
                             (index_call + 1, nb_dialogs))
            sys.stdout.flush()

        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()  # tracker state must not leak across dialogs
        for index_turn, (turn, _) in enumerate(call):
            if (verbose):
                print("*" * 10 + " Turn " + str(index_turn + 1) + " " +
                      "*" * 10)
            tracker_turn = tracker.addTurn(turn, verbose)
            this_session["turns"].append(tracker_turn)

        track["sessions"].append(this_session)
    sys.stdout.write('\n')
    end_time = time.time()
    elapsed_time = end_time - start_time
    track["wall-time"] = elapsed_time

    # Text mode ("w", not "wb"): json.dump emits str, which cannot be written
    # to a binary-mode file under Python 3.  'with' also closes the handle,
    # which the original leaked.
    with open(args.trackfile, "w") as track_file:
        json.dump(track, track_file, indent=4)
示例#48
0
def main(argv):
	"""Collect corpus statistics for a dataset and dump them as JSON.

	Counts, over every labeled utterance in --dataset:
	  * speech-act categories and their attributes,
	  * semantic tag names and their values,
	  * ontology topic/slot/value frequencies per sub-segment.
	The resulting dictionary is written to --trackfile.
	"""
	parser = argparse.ArgumentParser(description='Stat information about the data.')
	parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The dataset to analyze')
	parser.add_argument('--dataroot',dest='dataroot',action='store',required=True,metavar='PATH', help='Will look for corpus in <destroot>/<dataset>/...')
	parser.add_argument('--trackfile',dest='trackfile',action='store',required=True,metavar='JSON_FILE', help='File to write with tracker output')
	parser.add_argument('--ontology',dest='ontology',action='store',metavar='JSON_FILE',required=True,help='JSON Ontology file')

	args = parser.parse_args()
	dataset = dataset_walker.dataset_walker(args.dataset,dataroot=args.dataroot,labels=True)
	tagsets = ontology_reader.OntologyReader(args.ontology).get_tagsets()

	track_file = open(args.trackfile, "wb")
	# NOTE(review): 'track' and 'start_time' are never used below — only
	# out_json is written to track_file.  Confirm whether they are leftovers.
	track = {"sessions":[]}
	track["dataset"]  = args.dataset
	start_time = time.time()

	# Accumulators keyed by statistic family; each leaf holds a 'count'
	# plus a 'detail' dict of per-value counts.
	out_json = {}
	out_json['speech_act_cat_dic'] = {}
	out_json['speech_act_attr_dic'] = {}
	out_json['semantic_tags_dic'] = {}
	out_json['ontology_dic'] = {}

	# Pre-build a zeroed counter for every topic/slot pair in the ontology.
	for topic in tagsets:
		out_json['ontology_dic'][topic] = {'count':0, 'detail':{}}
		for slot, values in tagsets[topic].items():
			out_json['ontology_dic'][topic]['detail'][slot] = {'count':0, 'detail':{}}
			#for value in values:
			#	out_json['ontology_dic'][topic]['detail'][slot]['detail'][value] = 0	



	extractor = sub_segment_extractor()

	for call in dataset:
		extractor.reset()
		for (log_utter, label_utter) in call:
			sys.stderr.write('%d:%d\n'%(call.log['session_id'], log_utter['utter_index']))
			# speech act: count each act category, and each attribute both
			# per-act and globally.
			for sa in label_utter['speech_act']:
				act = sa['act'].strip()
				attrs = sa['attributes']
				if act not in out_json['speech_act_cat_dic']:
					out_json['speech_act_cat_dic'][act] = {'count':0, 'detail':{}}
				out_json['speech_act_cat_dic'][act]['count'] += 1
				for attr in attrs:
					attr = attr.strip()
					if attr not in out_json['speech_act_cat_dic'][act]['detail']:
						out_json['speech_act_cat_dic'][act]['detail'][attr] = 1
					else:
						out_json['speech_act_cat_dic'][act]['detail'][attr] += 1
					if attr not in out_json['speech_act_attr_dic']:
						out_json['speech_act_attr_dic'][attr] = 1
					else:
						out_json['speech_act_attr_dic'][attr] += 1

			# semantic tags: count tag names and each name's values.
			for semantic_tag in label_utter['semantic_tagged']:
				sem_tags = extract_semantic_tags(semantic_tag)
				for sem_tag in sem_tags:
					name = sem_tag['name']
					value = sem_tag['value']
					if name not in out_json['semantic_tags_dic']:
						out_json['semantic_tags_dic'][name] = {'count':0, 'detail':{}}
					out_json['semantic_tags_dic'][name]['count'] += 1
					if value not in out_json['semantic_tags_dic'][name]['detail']:
						out_json['semantic_tags_dic'][name]['detail'][value] = 1
					else:
						out_json['semantic_tags_dic'][name]['detail'][value] += 1

			topic = log_utter['segment_info']['topic']
			# frame label: a 'B' tag marks the start of a new sub-segment, so
			# flush the counts of the sub-segment that just ended.
			# NOTE(review): 'is_empty' is read as an attribute, not called —
			# confirm it is a property; if it is a method this test is always
			# truthy.  Also, the last sub-segment of each call is never
			# flushed into the counts — confirm that is intentional.
			if log_utter['segment_info']['target_bio'] == 'B':
				if not extractor.is_empty:
					sub_segment = extractor.state
					topic = sub_segment['topic']
					out_json['ontology_dic'][topic]['count'] += 1

					for slot, value_list in sub_segment['frame_label'].items():
						out_json['ontology_dic'][topic]['detail'][slot]['count'] += 1
						for t_value in value_list:
							t_value = t_value.strip()
							if t_value not in out_json['ontology_dic'][topic]['detail'][slot]['detail']:
								out_json['ontology_dic'][topic]['detail'][slot]['detail'][t_value] = 1
							else:
								out_json['ontology_dic'][topic]['detail'][slot]['detail'][t_value] += 1
			extractor.addUtter(log_utter,label_utter)

		

	json.dump(out_json, track_file, indent=4)

	track_file.close()
示例#49
0
def main(argv):
    """CNN baseline for the DSTC5 SAP task.

    Loads training utterances (English transcripts) and testing utterances
    (top translation hypothesis of each utterance), builds a shared
    vocabulary and multi-label speech-act targets, windows the inputs into
    contexts, splits the data by speaker role (tourist / guide), and runs
    the SLU sequence task once per role.
    """
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    # ---- training utterances: use the manual transcript ----
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            transcript_contexts += [transcript]

            # Each utterance gets a sorted, de-duplicated list of
            # "<act>_<attribute>" multi-label targets.
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'])]

    sys.stderr.write('Done\n')

    # ---- testing utterances: use the top translation hypothesis ----
    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                # Best-effort: utterances without a translation hypothesis
                # fall back to the empty string.  'except Exception' (not a
                # bare 'except') lets KeyboardInterrupt/SystemExit propagate.
                translation = ''
            transcript_contexts += [translation]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary from the (padded) training sentences only
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input: map padded tokens to vocabulary indices
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context (ctx_len utterances per window)
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # binarize the multi-label targets over the union of train+test labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets; utter[1] holds the speaker role
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    # shuffle training order only; test order is kept stable
    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings for the vocabulary
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # one run per speaker role
    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)

    print("")
示例#50
0
def main(argv):
    """Convert an SAP-task dataset into per-session input/label JSON files.

    For every session, builds four views: the Guide's inputs/labels and the
    Tourist's inputs/labels.  A speaker's own turns contribute
    (semantic_tags -> speech_act) pairs, while the *other* speaker's turns
    are passed through verbatim (transcript + tags + acts) as context.
    Files are written under <dataroot>/<session_id>/sap.<role>.{in,label}.json.
    """
    parser = argparse.ArgumentParser(description='Dataset Converter for SAP pilot task.')
    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The target dataset to be converted')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='Will look for corpus in <destroot>/...')

    args = parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True, translations=False)

    for call in dataset:
        session_id = call.log["session_id"]

        input_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}
        output_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}

        input_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}
        output_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}

        for (log_utter, _, label_utter) in call:
            speaker = log_utter['speaker']
            utter_index = log_utter['utter_index']
            transcript = log_utter['transcript']

            speech_act = label_utter['speech_act']

            # Running state for the BIO walk: the words of the currently open
            # mention plus its category and attributes.
            mention_words = []
            curr_cat = None
            curr_attrs = None

            semantic_tags = []

            for semantic_tagged in label_utter['semantic_tagged']:
                # Renamed from 'parser' to stop shadowing the argparse parser
                # defined above.
                tag_parser = SemanticTagParser(False)
                tag_parser.feed(semantic_tagged)

                # Close the open mention on every non-'I' tag; re-open on 'B'.
                for word, (bio, cat, attrs) in zip(tag_parser.get_word_seq(), tag_parser.get_word_tag_seq()):
                    if bio == 'I':
                        mention_words.append(word)
                    else:
                        if curr_cat is not None:
                            semantic_tags.append({
                                u'main': curr_cat,
                                u'attributes': curr_attrs,
                                u'mention': ' '.join(mention_words)
                            })

                        mention_words = []
                        curr_cat = None
                        curr_attrs = None

                        if bio == 'B':
                            mention_words = [word]
                            curr_cat = cat
                            curr_attrs = {}
                            for key, value in attrs:
                                curr_attrs[key] = value

                # Flush a mention still open at the end of the tag sequence.
                # NOTE(review): curr_cat is not reset after this flush, so a
                # mention open at the end of one tagged string could be
                # emitted again by the next — confirm this is intended.
                if curr_cat is not None:
                    semantic_tags.append({
                        u'main': curr_cat,
                        u'attributes': curr_attrs,
                        u'mention': ' '.join(mention_words)
                    })

            if speaker == 'Guide':
                # Guide's own turn: tags are the Guide's input, acts its label;
                # the Tourist sees the full turn as context.
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })
            elif speaker == 'Tourist':
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })

        # Session directories are zero-padded 3-digit ids under dataroot.
        path = os.path.join(os.path.abspath(args.dataroot), '%03d' % (session_id,))

        with open(os.path.join(path, 'sap.guide.in.json'), 'w') as fp:
            json.dump(input_guide, fp)
        with open(os.path.join(path, 'sap.guide.label.json'), 'w') as fp:
            json.dump(output_guide, fp)
        with open(os.path.join(path, 'sap.tourist.in.json'), 'w') as fp:
            json.dump(input_tourist, fp)
        with open(os.path.join(path, 'sap.tourist.label.json'), 'w') as fp:
            json.dump(output_tourist, fp)
示例#51
0
def main():
    """Run the HWU rule-based dialog state tracker over a dataset.

    Parses the command line, walks every dialog, feeds each turn to the
    tracker (optionally in its original SigDial-2013 mode), and writes the
    accumulated track as JSON to --trackfile.
    """
    print_gplv3()

    parser = argparse.ArgumentParser(
        description='HWU Rule-based Dialog State Tracker Baseline V2.0\n  by Zhuoran Wang\t [email protected]\n  This version extends the work in (Wang & Lemon, SigDial 2013).',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument('--ontology',
                        dest='ontology',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='The ontology to use')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        required=True,
        metavar='PATH',
        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--trackfile',
                        dest='trackfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with tracker output')
    parser.add_argument(
        '--original',
        dest='original',
        action='store',
        required=False,
        metavar='TRUE/FALSE',
        help=
        'Use the original version presented in (Wang & Lemon, SigDial 2013)')
    parser.add_argument(
        '--config',
        dest='config',
        action='store',
        required=True,
        metavar='TRUE/FALSE',
        help='The path of the config folder containing the .flist files')

    args = parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            config_folder=args.config)

    # --original TRUE (case-insensitive) selects the SigDial-2013 variant.
    original = False
    if args.original and args.original.lower() == "true":
        original = True

    # Loads the ontology into module-level state used by the tracker.
    load_ontology(args.ontology)

    track = {"sessions": [], "dataset": args.dataset}
    start_time = time.time()
    tracker = HWU_Tracker()

    for call in dataset:
        this_session = {"session-id": call.log["session-id"], "turns": []}
        tracker.reset()  # tracker state must not leak across dialogs
        for turn, _ in call:
            tracker_turn = tracker.addTurn(turn, original)
            this_session["turns"].append(tracker_turn)

        track["sessions"].append(this_session)
    end_time = time.time()
    elapsed_time = end_time - start_time
    track["wall-time"] = elapsed_time

    # 'with' guarantees the track file is flushed and closed (the original
    # code leaked the handle).
    with open(args.trackfile, "w") as track_file:
        json.dump(track, track_file, indent=4)
示例#52
0
def main(argv):
    """Train and evaluate a CNN speech-act classifier (DSTC5 SAP task, PyTorch).

    Training uses English transcripts, testing uses the top translation
    hypothesis; only utterances spoken by --roletype are kept.  A
    multi-label CNN is trained with MultiLabelSoftMarginLoss, evaluated
    after every epoch with a learned decision threshold, and the final
    predictions are dumped to pred_result_<role>.txt.
    """
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide',  'tourist'], required=True,  help='speaker')

    args = parser.parse_args()

    # ---- training utterances: manual transcripts of the target role ----
    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])

            # Sorted, de-duplicated "<act>_<attribute>" multi-label targets.
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    # ---- testing utterances: top translation hypothesis of the target role ----
    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except Exception:
                # Best-effort: utterances without a translation hypothesis get
                # the empty string.  'except Exception' replaces the original
                # bare 'except' so KeyboardInterrupt/SystemExit propagate.
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel']=="true"

    # build vocabulary from padded training sentences
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs: token -> vocabulary index
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)

    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # binarize multi-label targets over the union of train+test label sets
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels+sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # shuffle the training data; the validation split below is currently
    # disabled (num_validation retained for the commented-out variant)
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])

    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels

    x_test = test_inputs
    y_test = test_labels

    # construct pytorch data loaders (train shuffled, test in order)
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4,
                                         pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4,
                                         pin_memory=False)


    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # build model; move to GPU when available
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])

    if torch.cuda.is_available():
        model = model.cuda()
    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()   # set the model to training mode (apply dropout etc)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()

            loss = loss_fn(preds, labels)

            # standard PyTorch step: clear grads, backprop, update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()        # set the model to evaluation mode
        # Re-fit the decision threshold on the training set each epoch
        # (the dead 'threshold_predictor = None' initialization was removed:
        # this assignment always runs before any use).
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training: final evaluation without the learned threshold
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
示例#53
0
def main(argv):
    """Score SLG system output against reference transcripts.

    Loads the labeled dataset and the system's JSON output, pairs each
    target-role utterance with the system's generated text, accumulates
    BLEU / AM-FM statistics, and writes the results as CSV rows to
    --scorefile.
    """
    install_path = os.path.abspath(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    utils_dirname = os.path.join(install_path, 'lib')

    # Project helpers live in <install>/lib, which is not on sys.path by
    # default when this script is run directly.
    sys.path.append(utils_dirname)
    from dataset_walker import dataset_walker
    from eval_func import eval_utt
    from stat_classes import Stat_BLEU_AM_FM

    parser = argparse.ArgumentParser(
        description='Evaluate output from an SLG system.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        metavar='PATH',
                        required=True,
                        help='look for corpus in <destroot>/...')
    parser.add_argument('--jsonfile',
                        dest='jsonfile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File containing JSON output')
    parser.add_argument('--roletype',
                        dest='roletype',
                        action='store',
                        required=True,
                        choices=['GUIDE', 'TOURIST'],
                        help='Target role')
    parser.add_argument('--scorefile',
                        dest='scorefile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File to write with CSV scoring data')

    args = parser.parse_args()

    sessions = dataset_walker(args.dataset,
                              dataroot=args.dataroot,
                              labels=True,
                              task='SLG',
                              roletype=args.roletype.lower())

    # 'with' closes the handle; the original json.load(open(...)) leaked it.
    with open(args.jsonfile) as json_fp:
        system_output = json.load(json_fp)

    # One statistic accumulator per subtask/schedule pair.
    stats = {}
    stats['generated'] = {}
    stats['generated']['all'] = Stat_BLEU_AM_FM('cn')

    for session, track_session in zip(sessions, system_output["sessions"]):
        # Keep only the utterances spoken by the target role, in order, so
        # they align one-to-one with the system's output utterances.
        log_utter_list = []
        label_utter_list = []

        for log_utter, translations, label_utter in session:
            if (args.roletype == 'GUIDE' and log_utter['speaker']
                    == 'Guide') or (args.roletype == 'TOURIST'
                                    and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)

        # now iterate through turns, scoring reference vs. generated text
        for log_utter, label_utter, track_utter in zip(
                log_utter_list, label_utter_list, track_session["utterances"]):
            for subtask in stats:
                if subtask == 'generated':
                    ref = label_utter['transcript']
                    pred = track_utter['generated']
                    eval_utt(ref, pred, stats[subtask])

    # Plain writes replace the Python-2-only 'print >> file' syntax; each row
    # is '\n'-terminated, so the output bytes are unchanged.
    with open(args.scorefile, 'w') as csvfile:
        csvfile.write("task, subtask, schedule, stat, N, result\n")
        for subtask in stats:
            for schedule in stats[subtask]:
                for measure, N, result in stats[subtask][schedule].results():
                    csvfile.write(
                        "%s, %s, %s, %s, %i, %s\n" %
                        ('SLG', subtask, schedule, measure, N, result))
示例#54
0
def main(argv):
    """Simple SAP (speech-act prediction) baseline.

    Trains a SimpleSAP model on the training set, then predicts a speech
    act for every target-role utterance in the test set and writes the
    predictions to a JSON file in the challenge output format.
    """
    parser = argparse.ArgumentParser(description='Simple SAP baseline.')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='The training dataset')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='Will look for corpus in <destroot>/...')
    parser.add_argument('--modelfile',
                        dest='modelfile',
                        action='store',
                        required=True,
                        metavar='MODEL_FILE',
                        help='File to write with trained model')
    parser.add_argument('--outfile',
                        dest='outfile',
                        action='store',
                        required=True,
                        metavar='JSON_FILE',
                        help='File to write with SAP output')
    parser.add_argument('--roletype',
                        dest='roletype',
                        action='store',
                        choices=['GUIDE', 'TOURIST'],
                        required=True,
                        help='Target role')

    args = parser.parse_args()

    sap = SimpleSAP()

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True,
                                             task='SAP',
                                             roletype=args.roletype.lower())
    sys.stderr.write('Loading training instances ... ')

    for call in trainset:
        # Rolling context features, reset per session: the other speaker's
        # most recent speech act, the current and previous utterances'
        # semantic tags, and how many consecutive target-role utterances
        # have occurred since the other speaker last spoke.
        instance = {
            'prev_turn_act': None,
            'curr_semantic_tags': None,
            'prev_semantic_tags': None,
            'dist_from_prev_turn': 0
        }
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                instance['curr_semantic_tags'] = log_utter['semantic_tags']
                instance['dist_from_prev_turn'] += 1

                # deepcopy so later in-place updates of `instance` cannot
                # mutate the features already handed to the model
                sap.add_instance(copy.deepcopy(instance),
                                 label_utter['speech_act'])
            else:
                instance['prev_turn_act'] = log_utter['speech_act']
                instance['dist_from_prev_turn'] = 0
            instance['prev_semantic_tags'] = log_utter['semantic_tags']
    sys.stderr.write('Done\n')

    sap.train(args.modelfile)

    output = {'sessions': []}
    output['dataset'] = args.testset
    output['task_type'] = 'SAP'
    output['role_type'] = args.roletype
    start_time = time.time()

    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=False,
                                            translations=True,
                                            task='SAP',
                                            roletype=args.roletype.lower())
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        this_session = {"session_id": call.log["session_id"], "utterances": []}

        # Same rolling context features as in the training pass above.
        instance = {
            'prev_turn_act': None,
            'curr_semantic_tags': None,
            'prev_semantic_tags': None,
            'dist_from_prev_turn': 0
        }

        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() == args.roletype.lower():
                sap_result = {'utter_index': log_utter['utter_index']}

                instance['curr_semantic_tags'] = log_utter['semantic_tags']
                instance['dist_from_prev_turn'] += 1

                pred_act = sap.pred(copy.deepcopy(instance))
                # pred_act is flattened with the Python 2 builtin `reduce`
                # (list-of-lists -> list); each label has the shape
                # 'ACT_ATTRIBUTE' and is split so attributes are grouped
                # under their act, without duplicates.
                combined_act = {}
                for act_label in reduce(operator.add, pred_act):
                    m = re.match('^([^_]+)_(.+)$', act_label)
                    act = m.group(1)
                    attr = m.group(2)
                    if act not in combined_act:
                        combined_act[act] = []
                    if attr not in combined_act[act]:
                        combined_act[act].append(attr)

                sap_result['speech_act'] = []
                for act in combined_act:
                    attr = combined_act[act]
                    sap_result['speech_act'].append({
                        'act': act,
                        'attributes': attr
                    })

                this_session['utterances'].append(sap_result)
            else:
                instance['prev_turn_act'] = log_utter['speech_act']
                instance['dist_from_prev_turn'] = 0
            instance['prev_semantic_tags'] = log_utter['semantic_tags']

        output['sessions'].append(this_session)
    sys.stderr.write('Done\n')

    end_time = time.time()
    elapsed_time = end_time - start_time
    output['wall_time'] = elapsed_time

    # 'wb' mode with json.dump is Python 2 specific (str is bytes there)
    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)

    sys.stderr.write('Done\n')
示例#55
0
                for val in ontology[key][slot]:
                    add_words(val)
            # TODO
            for slot in ["food"]:
                add_words(slot)
                for val in ontology[key][slot]:
                    add_words(val)
            # TODO
            for slot in ["name"]:
                add_words(slot)
                for val in ontology[key][slot]:
                    add_words(val)

# include asr words and slu words appeared in data set
dataset = dataset_walker.dataset_walker(dataset_name,
                                        dataroot=dataroot,
                                        labels=True)
# seed the vocabulary with these literal tokens
# (add_words is defined elsewhere in this file)
add_words("asr")
add_words("slots")
add_words("act")
for call in dataset:
    for turn, labelJson in call:
        asrs = turn["input"]["live"]["asr-hyps"]

        # 1best hypothesis
        add_words(asrs[0]["asr-hyp"])

        # 2best - nbest hypotheses
        # TODO
        for asr in asrs[1:]:
            add_words(asr["asr-hyp"])
示例#56
0
def main(argv):
    """Simple SLU baseline.

    Trains a SimpleSLU model on the training set, then runs it over the
    top translation hypothesis of every target-role utterance in the test
    set, projects the predicted semantic tags back onto the original
    transcript via the word alignment, and writes the results to JSON.
    """
    parser = argparse.ArgumentParser(description='Simple SLU baseline.')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True,
                        help='The training dataset')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True,
                        help='The test dataset')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH',
                        help='Will look for corpus in <destroot>/<dataset>/...')
    parser.add_argument('--modelfile', dest='modelfile', action='store',
                        required=True, metavar='MODEL_FILE',
                        help='File to write with trained model')
    parser.add_argument('--outfile', dest='outfile', action='store',
                        required=True, metavar='JSON_FILE',
                        help='File to write with SLU output')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['GUIDE', 'TOURIST'], required=True,
                        help='Target role')

    args = parser.parse_args()

    role = args.roletype
    slu = SimpleSLU()

    # --- training pass -------------------------------------------------
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            is_target = ((log_utter['speaker'] == 'Guide' and role == 'GUIDE')
                         or (log_utter['speaker'] == 'Tourist'
                             and role == 'TOURIST'))
            if is_target:
                slu.add_instance(log_utter['transcript'],
                                 label_utter['speech_act'],
                                 label_utter['semantic_tagged'])
    sys.stderr.write('Done\n')

    slu.train(args.modelfile)

    projection = DirectLabelProjection()

    output = {
        'sessions': [],
        'dataset': args.testset,
        'task_type': 'SLU',
        'role_type': role,
    }
    start_time = time.time()

    # --- testing pass --------------------------------------------------
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=False,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        session_out = {"session_id": call.log["session_id"], "utterances": []}
        for (log_utter, translations, label_utter) in call:
            # only utterances spoken by the requested role are tagged
            if not ((log_utter['speaker'] == 'Guide' and role == 'GUIDE') or
                    (log_utter['speaker'] == 'Tourist' and role == 'TOURIST')):
                continue
            slu_result = {'utter_index': log_utter['utter_index']}
            hyps = translations['translated']
            if len(hyps) > 0:
                top_hyp = hyps[0]['hyp']
                pred_act, pred_semantic = slu.pred(top_hyp)

                # flatten pred_act (list of label lists; `reduce` is the
                # Python 2 builtin) and group the 'ACT_ATTRIBUTE' labels
                # by act, keeping each attribute once
                grouped = {}
                for act_label in reduce(operator.add, pred_act):
                    m = re.match('^([^_]+)_(.+)$', act_label)
                    attrs = grouped.setdefault(m.group(1), [])
                    if m.group(2) not in attrs:
                        attrs.append(m.group(2))

                slu_result['speech_act'] = [
                    {'act': act_name, 'attributes': grouped[act_name]}
                    for act_name in grouped
                ]

                # map tags predicted on the translation back onto the
                # original transcript using the word alignment
                align = hyps[0]['align']
                projected = projection.project(log_utter['transcript'],
                                               top_hyp, align, pred_semantic)
                slu_result['semantic_tagged'] = \
                    projection.convert_to_tagged_utter(projected)
            else:
                # no translation available: emit the transcript untagged
                slu_result['semantic_tagged'] = log_utter['transcript']
                slu_result['speech_act'] = []
            session_out['utterances'].append(slu_result)
        output['sessions'].append(session_out)

    end_time = time.time()
    output['wall_time'] = end_time - start_time

    # 'wb' mode with json.dump is Python 2 specific (str is bytes there)
    with open(args.outfile, "wb") as of:
        json.dump(output, of, indent=4)

    sys.stderr.write('Done\n')
示例#57
0
def gen_baseline(dataset_name, dataroot, tagged=False):
    """Run the neural tracker over a dataset and dump baseline results.

    Walks the named dataset, feeds each dialogue to a ModTracker built
    from the module-level ``offline_config_dict``, and writes the per-turn
    tracker outputs to ``baseline_<dataset>_tagged.json`` (tagged=True) or
    ``baseline_<dataset>_dlstm.json``.

    dataset_name -- dataset identifier understood by dataset_walker
    dataroot     -- root directory holding the corpus
    tagged       -- if True, use the value-tagged turn representation and
                    map tags back to values in the tracker output
    """
    res = {'dataset': dataset_name, 'sessions': []}
    dataset = dataset_walker.dataset_walker(dataset_name,
                                            dataroot=dataroot,
                                            labels=True)
    # Tracker config derived from the offline config, forcing CPU inference.
    mod_config_dict = {
        'context_type': 'cpu',
        'nn_type': offline_config_dict["nn_type"],
        'model_dir': offline_config_dict["model_dir"]
    }
    if mod_config_dict['nn_type'] in [
            'doublelstm', 'reslstm', 'matlstm', 'cnnlstm', 'cnncnnlstm'
    ]:
        mod_config_dict['batch_size'] = 32

    mod_tracker = ModTracker(config_dict=mod_config_dict)
    start_time = time.time()

    # decide how to process data: the processing level and feature
    # representation depend on the network architecture
    if mod_config_dict['nn_type'] in ['bowlstm']:
        level = 'turn'
        feature_type = 'bow'
    elif mod_config_dict['nn_type'] in ['reslstm', 'matlstm', 'cnnlstm']:
        level = 'turn'
        feature_type = 'bowbow'
    elif mod_config_dict['nn_type'] in ['doublelstm', 'cnncnnlstm']:
        level = 'turn'
        feature_type = 'sentbow'
    else:
        level = 'word'

    # process by word-level dialogue
    if level == 'word':
        for call in dataset:
            res_dialogue = dict()
            res_dialogue["session-id"] = call.log["session-id"]
            res_dialogue["turns"] = list()

            # fileDatas accumulates a deep copy of every growing prefix of
            # the dialogue, so the tracker sees turns 1..i at step i
            fileDatas = []
            tag_dicts = []

            fileData = {}
            fileData["turns"] = []
            for turn, labelJson in call:
                if tagged:
                    turnData = genTurnData_nbest_tagged(turn, labelJson)
                    tag_dicts.append(turnData["tag_dict"])
                else:
                    turnData = genTurnData_nbest(turn, labelJson)
                fileData["turns"].append(turnData)
                fileDatas.append(copy.deepcopy(fileData))

            tracker_outputs = mod_tracker.get_batch_new_state(fileDatas)
            for i in xrange(len(tracker_outputs)):
                del_none_val(tracker_outputs[i])
                if tagged:
                    # translate tag placeholders back to their real values
                    tag_to_val(tracker_outputs[i], tag_dicts[i])
                res_dialogue["turns"].append(tracker_outputs[i])
            res["sessions"].append(res_dialogue)
            print "processed dialogue no.:", len(res["sessions"])

    # process by turn-level dialogue
    elif level == 'turn':

        batch_size = mod_tracker.batch_size

        # NOTE(review): fileDatas_all appears to hold two parallel lists
        # that are batched in lockstep below — confirm against gen_resdata
        fileDatas_all = gen_resdata(dataset, 'nbest_tagged')
        # fileDatas_all = []
        # for call in dataset:
        #     fileData = {}
        #     fileData["turns"] = []
        #     fileData["session-id"] = call.log["session-id"]
        #     for turn, labelJson in call:
        #         turnData = genTurnData_nbest(turn, labelJson)
        #         fileData["turns"].append(turnData)
        #     fileDatas_all.append(fileData)

        batch_num = int(math.ceil(len(fileDatas_all[0]) / float(batch_size)))
        for j in xrange(batch_num):
            fileDatas0 = fileDatas_all[0][batch_size * j:batch_size * (j + 1)]
            fileDatas1 = fileDatas_all[1][batch_size * j:batch_size * (j + 1)]
            fileDatas = []
            fileDatas.append(fileDatas0)
            fileDatas.append(fileDatas1)
            tracker_outputs = mod_tracker.get_turn_batch_state(
                fileDatas, feature_type)

            for i in xrange(len(fileDatas[0])):
                res_dialogue = dict()
                res_dialogue["session-id"] = fileDatas[0][i]["session-id"]
                res_dialogue["turns"] = tracker_outputs[i]
                for turn_output in res_dialogue["turns"]:
                    del_none_val(turn_output)
                res["sessions"].append(res_dialogue)
            print "processed dialogue no.:", len(res["sessions"])

    end_time = time.time()
    res['wall-time'] = end_time - start_time
    if tagged:
        baseline_json_file = 'baseline_%s_tagged.json' % dataset_name
    else:
        baseline_json_file = 'baseline_%s_dlstm.json' % dataset_name
    # NOTE(review): the file handle passed to json.dump is never closed;
    # consider wrapping in a with-block ('wb' is Python 2 specific here)
    json.dump(res, open(baseline_json_file, 'wb'), indent=4)