def __tokenize(self, utter, semantic_tagged=None):
    """Tokenize an utterance, optionally aligning semantic tags to words.

    Without semantic_tagged, returns [(word, None), ...] for the NLTK
    tokenization of utter.  With semantic_tagged (a list of tagged
    strings), returns [(word, tag), ...] pairs, or None when the tagged
    text cannot be aligned character-for-character with the transcript.
    """
    if semantic_tagged is None:
        return [(word, None) for word in nltk.word_tokenize(utter)]

    raw_parser = SemanticTagParser(False)
    tag_parser = SemanticTagParser(False)

    raw_parser.feed(' '.join(nltk.word_tokenize(utter)))
    tag_parser.feed(' '.join(semantic_tagged))

    # Once markup is stripped, both parses must yield the same character
    # sequence; otherwise the tagging cannot be trusted -> None.
    if raw_parser.get_chr_seq() != tag_parser.get_chr_seq():
        return None

    # Merge word boundaries from both tokenizations so the tagged parse
    # is re-tokenized consistently with the raw transcript.
    merged_space_seq = [a or b for a, b in zip(raw_parser.get_chr_space_seq(),
                                               tag_parser.get_chr_space_seq())]
    word_seq = tag_parser.tokenize(merged_space_seq)
    tag_seq = tag_parser.get_word_tag_seq()
    return [(w, t) for w, t in zip(word_seq, tag_seq)]
def main(argv):
    """Convert a corpus into SAP pilot task input/label JSON files.

    For every session in the dataset, writes four files into the
    session's directory under the dataroot:

      sap.guide.in.json      -- what a Guide-side SAP tracker may see
      sap.guide.label.json   -- Guide speech acts to be predicted
      sap.tourist.in.json    -- what a Tourist-side SAP tracker may see
      sap.tourist.label.json -- Tourist speech acts to be predicted

    For the tracked role only utter_index, speaker and semantic tags are
    exposed; the counterpart role's utterances are given in full
    (transcript + semantic tags + speech acts).
    """
    # Renamed from `parser`: the argparse object was shadowed by the
    # SemanticTagParser instances created per utterance below.
    arg_parser = argparse.ArgumentParser(
        description='Dataset Converter for SAP pilot task.')
    arg_parser.add_argument('--dataset', dest='dataset', action='store',
                            metavar='DATASET', required=True,
                            help='The target dataset to be converted')
    arg_parser.add_argument('--dataroot', dest='dataroot', action='store',
                            required=True, metavar='PATH',
                            # fixed help text: the option is --dataroot,
                            # not <destroot>
                            help='Will look for corpus in <dataroot>/...')
    args = arg_parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=False)

    for call in dataset:
        session_id = call.log["session_id"]

        input_guide = {u'session_id': session_id, u'utterances': [],
                       u'roletype': u'Guide'}
        output_guide = {u'session_id': session_id, u'utterances': [],
                        u'roletype': u'Guide'}
        input_tourist = {u'session_id': session_id, u'utterances': [],
                         u'roletype': u'Tourist'}
        output_tourist = {u'session_id': session_id, u'utterances': [],
                          u'roletype': u'Tourist'}

        for (log_utter, _, label_utter) in call:
            speaker = log_utter['speaker']
            utter_index = log_utter['utter_index']
            transcript = log_utter['transcript']
            speech_act = label_utter['speech_act']

            # Decode BIO-style tagging into a flat list of
            # {main, attributes, mention} objects.
            mention_words = []
            curr_cat = None
            curr_attrs = None
            semantic_tags = []
            for semantic_tagged in label_utter['semantic_tagged']:
                tag_parser = SemanticTagParser(False)
                tag_parser.feed(semantic_tagged)
                for word, (bio, cat, attrs) in zip(
                        tag_parser.get_word_seq(),
                        tag_parser.get_word_tag_seq()):
                    if bio == 'I':
                        # inside a mention: keep accumulating words
                        mention_words.append(word)
                    else:
                        # 'B' or 'O' terminates any open mention
                        if curr_cat is not None:
                            semantic_tags.append({
                                u'main': curr_cat,
                                u'attributes': curr_attrs,
                                u'mention': ' '.join(mention_words)
                            })
                            mention_words = []
                            curr_cat = None
                            curr_attrs = None
                        if bio == 'B':
                            mention_words = [word]
                            curr_cat = cat
                            curr_attrs = {}
                            for key, value in attrs:
                                curr_attrs[key] = value
            # flush a mention still open at the end of the utterance
            if curr_cat is not None:
                semantic_tags.append({
                    u'main': curr_cat,
                    u'attributes': curr_attrs,
                    u'mention': ' '.join(mention_words)
                })

            if speaker == 'Guide':
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })
            elif speaker == 'Tourist':
                input_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'semantic_tags': semantic_tags
                })
                output_tourist[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speech_act': speech_act
                })
                input_guide[u'utterances'].append({
                    u'utter_index': utter_index,
                    u'speaker': speaker,
                    u'transcript': transcript,
                    u'semantic_tags': semantic_tags,
                    u'speech_act': speech_act
                })

        # session directories are zero-padded 3-digit ids
        path = os.path.join(os.path.abspath(args.dataroot),
                            '%03d' % (session_id,))
        with open(os.path.join(path, 'sap.guide.in.json'), 'w') as fp:
            json.dump(input_guide, fp)
        with open(os.path.join(path, 'sap.guide.label.json'), 'w') as fp:
            json.dump(output_guide, fp)
        with open(os.path.join(path, 'sap.tourist.in.json'), 'w') as fp:
            json.dump(input_tourist, fp)
        with open(os.path.join(path, 'sap.tourist.label.json'), 'w') as fp:
            json.dump(output_tourist, fp)
def eval_semantics(ref_tagged, pred_tagged, stat_semantics): parser_ref = SemanticTagParser() parser_pred = SemanticTagParser() try: parser_ref.feed(ref_tagged) ref_chr_seq = parser_ref.get_chr_seq() ref_space_seq = parser_ref.get_chr_space_seq() parser_pred.feed(pred_tagged) pred_chr_seq = parser_pred.get_chr_seq() pred_space_seq = parser_pred.get_chr_space_seq() if ref_chr_seq != pred_chr_seq: raise merged_space_seq = [ x or y for x, y in zip(ref_space_seq, pred_space_seq) ] parser_ref.tokenize(merged_space_seq) parser_pred.tokenize(merged_space_seq) ref_word_tag_seq = parser_ref.get_word_tag_seq() pred_word_tag_seq = parser_pred.get_word_tag_seq() for ref_tuple, pred_tuple in zip(ref_word_tag_seq, pred_word_tag_seq): ref_bio, ref_tag, ref_attrs = ref_tuple pred_bio, pred_tag, pred_attrs = pred_tuple pred_obj = None ref_obj = None if pred_bio is not None: pred_obj = {'bio': pred_bio} if ref_bio is not None: ref_obj = {'bio': ref_bio} if 'detection' in stat_semantics: stat_semantics['detection'].add(pred_obj, ref_obj) if pred_obj is not None and pred_tag is not None: pred_obj['tag'] = pred_tag if ref_obj is not None and ref_tag is not None: ref_obj['tag'] = ref_tag if 'class' in stat_semantics: stat_semantics['class'].add(pred_obj, ref_obj) if pred_obj is not None and pred_attrs is not None: for (s, v) in pred_attrs: if v != 'NONE': pred_obj[s] = v if ref_obj is not None and ref_attrs is not None: for (s, v) in ref_attrs: if v != 'NONE': ref_obj[s] = v if 'all' in stat_semantics: stat_semantics['all'].add(pred_obj, ref_obj) parser_ref.close() parser_pred.close() except HTMLParseError, err: print "HTMLParseError: %s" % err
def eval_semantics(ref_tagged, pred_tagged, stat_semantics): parser_ref = SemanticTagParser() parser_pred = SemanticTagParser() try: parser_ref.feed(ref_tagged) ref_chr_seq = parser_ref.get_chr_seq() ref_space_seq = parser_ref.get_chr_space_seq() parser_pred.feed(pred_tagged) pred_chr_seq = parser_pred.get_chr_seq() pred_space_seq = parser_pred.get_chr_space_seq() if ref_chr_seq != pred_chr_seq: raise merged_space_seq = [ x or y for x, y in zip(ref_space_seq, pred_space_seq)] parser_ref.tokenize(merged_space_seq) parser_pred.tokenize(merged_space_seq) ref_word_tag_seq = parser_ref.get_word_tag_seq() pred_word_tag_seq = parser_pred.get_word_tag_seq() for ref_tuple, pred_tuple in zip(ref_word_tag_seq, pred_word_tag_seq): ref_bio, ref_tag, ref_attrs = ref_tuple pred_bio, pred_tag, pred_attrs = pred_tuple pred_obj = None ref_obj = None if pred_bio is not None: pred_obj = {'bio': pred_bio} if ref_bio is not None: ref_obj = {'bio': ref_bio} if 'detection' in stat_semantics: stat_semantics['detection'].add(pred_obj, ref_obj) if pred_obj is not None and pred_tag is not None: pred_obj['tag'] = pred_tag if ref_obj is not None and ref_tag is not None: ref_obj['tag'] = ref_tag if 'class' in stat_semantics: stat_semantics['class'].add(pred_obj, ref_obj) if pred_obj is not None and pred_attrs is not None: for (s, v) in pred_attrs: if v != 'NONE': pred_obj[s] = v if ref_obj is not None and ref_attrs is not None: for (s, v) in ref_attrs: if v != 'NONE': ref_obj[s] = v if 'all' in stat_semantics: stat_semantics['all'].add(pred_obj, ref_obj) parser_ref.close() parser_pred.close() except HTMLParseError, err: print "HTMLParseError: %s" % err
def check(self):
    """Validate the loaded tracker output against the dataset.

    Checks top-level metadata (dataset, session count, wall_time,
    task_type, role_type) and then every session/utterance spoken by the
    tracked role.  Problems are collected via
    self.add_error((context...), message) rather than raised, so all
    issues are reported in one pass.
    """
    # first check the top-level stuff
    if len(self.sessions.datasets) != 1:
        self.add_error(("top level",), "tracker output should be over a single dataset")
    if "dataset" not in self.tracker_output:
        self.add_error(("top level",), "trackfile should specify its dataset")
    elif self.sessions.datasets[0] != self.tracker_output["dataset"]:
        self.add_error(("top level",), "datasets do not match")
    if len(self.tracker_output["sessions"]) != len(self.sessions):
        self.add_error(("top level",), "number of sessions does not match")
    if "wall_time" not in self.tracker_output:
        self.add_error(("top level",), "wall_time should be included")
    else:
        wall_time = self.tracker_output["wall_time"]
        if type(wall_time) != type(0.0):
            self.add_error(("top level",), "wall_time must be a float")
        elif wall_time <= 0.0:
            self.add_error(("top level",), "wall_time must be positive")
    if "task_type" not in self.tracker_output:
        self.add_error(("top level",), "task_type should be specified")
    elif self.tracker_output['task_type'] != 'SLU':
        self.add_error(("top level",), "task_type does not match")
    if "role_type" not in self.tracker_output:
        self.add_error(("top level",), "role_type should be specified")
    elif self.tracker_output['role_type'] != self.roletype:
        self.add_error(("top level",), "role_type does not match")

    for session, track_session in zip(self.sessions, self.tracker_output["sessions"]):
        session_id = session.log["session_id"]

        # check session id
        if session_id != track_session["session_id"]:
            self.add_error((session_id,), "session-id does not match")

        # keep only utterances spoken by the tracked role
        # (self.roletype is upper-case 'GUIDE'/'TOURIST', while the log
        # speaker field is mixed-case 'Guide'/'Tourist')
        log_utter_list = []
        label_utter_list = []
        for log_utter, _, label_utter in session:
            if (self.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide') or (self.roletype == 'TOURIST' and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)

        # check number of utterances
        if len(log_utter_list) != len(track_session["utterances"]):
            self.add_error((session_id,), "number of utterances spoken by %s does not match" % (self.roletype,))

        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(log_utter_list, label_utter_list, track_session["utterances"]):
            # check utter index
            if log_utter['utter_index'] != track_utter['utter_index']:
                self.add_error((session_id, "utterance", log_utter['utter_index'], track_utter['utter_index']), "utter_index does not match")

            # 'speech_act' must be a list of objects, each with an 'act'
            # from the tagset's categories and 'attributes' from its
            # attribute list
            if 'speech_act' not in track_utter:
                self.add_error((session_id, "utterance", log_utter['utter_index']), "no speech_act key in utterance")
            else:
                if type(track_utter['speech_act']) != types.ListType:
                    self.add_error((session_id, "utterance", log_utter['utter_index']), "a value for 'speech_act' key should be a list of objects")
                else:
                    for act_obj in track_utter['speech_act']:
                        if 'act' not in act_obj:
                            self.add_error((session_id, "utterance", log_utter['utter_index']), "no act key in speech_act")
                        else:
                            if act_obj['act'] not in self.tagsets['speech_act']['category']:
                                self.add_error((session_id, 'utterance', log_utter['utter_index'], act_obj['act']), "do not recognise speech act category")
                        if 'attributes' not in act_obj:
                            self.add_error((session_id, "utterance", log_utter['utter_index']), "no attributes key in speech_act")
                        else:
                            for attr in act_obj['attributes']:
                                if attr not in self.tagsets['speech_act']['attribute']:
                                    self.add_error((session_id, 'utterance', log_utter['utter_index'], attr), "do not recognise speech act attribute")

            # 'semantic_tagged' must be a string whose untagged text
            # equals the transcript, with known categories/attributes
            if 'semantic_tagged' not in track_utter:
                self.add_error((session_id, "utterance", log_utter['utter_index']), "no semantic_tagged key in utterance")
            else:
                if type(track_utter['semantic_tagged']) != types.StringType and type(track_utter['semantic_tagged']) != types.UnicodeType:
                    self.add_error((session_id, "utterance", log_utter['utter_index'], type(track_utter['semantic_tagged'])), "a value for 'semantic_tagged' key should be a string")
                else:
                    try:
                        parser_ref = SemanticTagParser()
                        parser_ref.feed(log_utter['transcript'])
                        parser_pred = SemanticTagParser()
                        parser_pred.feed(track_utter['semantic_tagged'])

                        # stripping the markup must leave the raw
                        # character sequence of the transcript untouched
                        if parser_ref.get_chr_seq() != parser_pred.get_chr_seq():
                            self.add_error((session_id, 'utterance', log_utter['utter_index'], log_utter['transcript'], track_utter['semantic_tagged']), "raw utterance has changed")

                        for bio, tag, attrs in parser_pred.get_word_tag_seq():
                            if tag is not None:
                                tag = tag.upper()
                                if tag not in self.tagsets['semantic']:
                                    self.add_error((session_id, 'utterance', log_utter['utter_index'], tag), "do not recognise semantic category")
                                elif attrs is not None:
                                    for s, v in attrs:
                                        s = s.upper().strip()
                                        v = v.upper().strip()
                                        # empty values are normalised to
                                        # the sentinel 'NONE'
                                        if len(v) == 0:
                                            v = 'NONE'
                                        if s not in self.tagsets['semantic'][tag]:
                                            # an unknown attribute type
                                            # only matters when a real
                                            # value was supplied
                                            if v is not None and v != 'NONE':
                                                self.add_error((session_id, 'utterance', log_utter['utter_index'], tag, s), "do not recognise semantic attribute type")
                                        elif v not in self.tagsets['semantic'][tag][s]:
                                            self.add_error((session_id, 'utterance', log_utter['utter_index'], tag, s, v), "do not recognise semantic attribute value")
                    except HTMLParseError, err:
                        self.add_error((session_id, 'utterance', log_utter['utter_index'], track_utter['semantic_tagged']), "do not parse the tagged utterance")
def check(self):
    """Validate the tracker output file against the dataset sessions.

    Verifies top-level fields (dataset, session count, wall_time,
    task_type, role_type), then walks each session's utterances for the
    tracked role, checking speech acts and semantic tagging against
    self.tagsets.  All problems are recorded through
    self.add_error((context...), message); nothing is raised.
    """
    # first check the top-level stuff
    if len(self.sessions.datasets) != 1:
        self.add_error(("top level", ), "tracker output should be over a single dataset")
    if "dataset" not in self.tracker_output:
        self.add_error(("top level", ), "trackfile should specify its dataset")
    elif self.sessions.datasets[0] != self.tracker_output["dataset"]:
        self.add_error(("top level", ), "datasets do not match")
    if len(self.tracker_output["sessions"]) != len(self.sessions):
        self.add_error(("top level", ), "number of sessions does not match")
    if "wall_time" not in self.tracker_output:
        self.add_error(("top level", ), "wall_time should be included")
    else:
        wall_time = self.tracker_output["wall_time"]
        if type(wall_time) != type(0.0):
            self.add_error(("top level", ), "wall_time must be a float")
        elif wall_time <= 0.0:
            self.add_error(("top level", ), "wall_time must be positive")
    if "task_type" not in self.tracker_output:
        self.add_error(("top level", ), "task_type should be specified")
    elif self.tracker_output['task_type'] != 'SLU':
        self.add_error(("top level", ), "task_type does not match")
    if "role_type" not in self.tracker_output:
        self.add_error(("top level", ), "role_type should be specified")
    elif self.tracker_output['role_type'] != self.roletype:
        self.add_error(("top level", ), "role_type does not match")

    for session, track_session in zip(self.sessions, self.tracker_output["sessions"]):
        session_id = session.log["session_id"]

        # check session id
        if session_id != track_session["session_id"]:
            self.add_error((session_id, ), "session-id does not match")

        # collect only the utterances spoken by the tracked role
        # (self.roletype uses upper case, the log speaker mixed case)
        log_utter_list = []
        label_utter_list = []
        for log_utter, _, label_utter in session:
            if (self.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide') or (self.roletype == 'TOURIST' and log_utter['speaker'] == 'Tourist'):
                log_utter_list.append(log_utter)
                label_utter_list.append(label_utter)

        # check number of utterances
        if len(log_utter_list) != len(track_session["utterances"]):
            self.add_error((session_id, ), "number of utterances spoken by %s does not match" % (self.roletype, ))

        # now iterate through turns
        for log_utter, label_utter, track_utter in zip(log_utter_list, label_utter_list, track_session["utterances"]):
            # check utter index
            if log_utter['utter_index'] != track_utter['utter_index']:
                self.add_error((session_id, "utterance", log_utter['utter_index'], track_utter['utter_index']), "utter_index does not match")

            # speech_act: list of {act, attributes} objects drawn from
            # the speech-act tagset
            if 'speech_act' not in track_utter:
                self.add_error((session_id, "utterance", log_utter['utter_index']), "no speech_act key in utterance")
            else:
                if type(track_utter['speech_act']) != types.ListType:
                    self.add_error((session_id, "utterance", log_utter['utter_index']), "a value for 'speech_act' key should be a list of objects")
                else:
                    for act_obj in track_utter['speech_act']:
                        if 'act' not in act_obj:
                            self.add_error((session_id, "utterance", log_utter['utter_index']), "no act key in speech_act")
                        else:
                            if act_obj['act'] not in self.tagsets['speech_act']['category']:
                                self.add_error((session_id, 'utterance', log_utter['utter_index'], act_obj['act']), "do not recognise speech act category")
                        if 'attributes' not in act_obj:
                            self.add_error((session_id, "utterance", log_utter['utter_index']), "no attributes key in speech_act")
                        else:
                            for attr in act_obj['attributes']:
                                if attr not in self.tagsets['speech_act']['attribute']:
                                    self.add_error((session_id, 'utterance', log_utter['utter_index'], attr), "do not recognise speech act attribute")

            # semantic_tagged: a (unicode) string; once parsed its raw
            # text must match the transcript and its tags the tagset
            if 'semantic_tagged' not in track_utter:
                self.add_error((session_id, "utterance", log_utter['utter_index']), "no semantic_tagged key in utterance")
            else:
                if type(track_utter['semantic_tagged']) != types.StringType and type(track_utter['semantic_tagged']) != types.UnicodeType:
                    self.add_error((session_id, "utterance", log_utter['utter_index'], type(track_utter['semantic_tagged'])), "a value for 'semantic_tagged' key should be a string")
                else:
                    try:
                        parser_ref = SemanticTagParser()
                        parser_ref.feed(log_utter['transcript'])
                        parser_pred = SemanticTagParser()
                        parser_pred.feed(track_utter['semantic_tagged'])

                        # the untagged character sequences must agree
                        if parser_ref.get_chr_seq() != parser_pred.get_chr_seq():
                            self.add_error((session_id, 'utterance', log_utter['utter_index'], log_utter['transcript'], track_utter['semantic_tagged']), "raw utterance has changed")

                        for bio, tag, attrs in parser_pred.get_word_tag_seq():
                            if tag is not None:
                                tag = tag.upper()
                                if tag not in self.tagsets['semantic']:
                                    self.add_error((session_id, 'utterance', log_utter['utter_index'], tag), "do not recognise semantic category")
                                elif attrs is not None:
                                    for s, v in attrs:
                                        s = s.upper().strip()
                                        v = v.upper().strip()
                                        # empty values normalise to the
                                        # 'NONE' sentinel
                                        if len(v) == 0:
                                            v = 'NONE'
                                        if s not in self.tagsets['semantic'][tag]:
                                            # unknown attribute type only
                                            # reported for real values
                                            if v is not None and v != 'NONE':
                                                self.add_error((session_id, 'utterance', log_utter['utter_index'], tag, s), "do not recognise semantic attribute type")
                                        elif v not in self.tagsets['semantic'][tag][s]:
                                            self.add_error((session_id, 'utterance', log_utter['utter_index'], tag, s, v), "do not recognise semantic attribute value")
                    except HTMLParseError, err:
                        self.add_error((session_id, 'utterance', log_utter['utter_index'], track_utter['semantic_tagged']), "do not parse the tagged utterance")