Пример #1
0
    def __tokenize(self, utter, semantic_tagged=None):
        """Split ``utter`` into a list of ``(word, tag)`` pairs.

        Args:
            utter: the raw utterance text.
            semantic_tagged: optional iterable of tagged segment strings; they
                are joined with single spaces before being parsed.

        Returns:
            * When ``semantic_tagged`` is ``None``: one ``(token, None)`` pair
              per token returned by ``nltk.word_tokenize``.
            * Otherwise: the words and word tags of the tagged text, tokenized
              on the ``or``-merge of the space sequences of the NLTK-segmented
              utterance and of the tagged text.
            * ``None`` when the two texts do not yield the same character
              sequence.
        """
        if semantic_tagged is None:
            return [(token, None) for token in nltk.word_tokenize(utter)]

        plain_parser = SemanticTagParser(False)
        markup_parser = SemanticTagParser(False)

        plain_text = ' '.join(nltk.word_tokenize(utter))
        markup_text = ' '.join(semantic_tagged)

        plain_parser.feed(plain_text)
        markup_parser.feed(markup_text)

        plain_chars = plain_parser.get_chr_seq()
        plain_spaces = plain_parser.get_chr_space_seq()

        markup_chars = markup_parser.get_chr_seq()
        markup_spaces = markup_parser.get_chr_space_seq()

        if plain_chars == markup_chars:
            # A word boundary is kept wherever either text has one.
            boundaries = [
                left or right
                for left, right in zip(plain_spaces, markup_spaces)]

            words = markup_parser.tokenize(boundaries)
            tags = markup_parser.get_word_tag_seq()
            return list(zip(words, tags))

        # The tagged text does not spell out the same characters as the
        # utterance, so no alignment is attempted.
        return None
Пример #2
0
    def __tokenize(self, utter, semantic_tagged=None):
        """Pair every word of ``utter`` with its semantic tag.

        With no ``semantic_tagged`` argument the utterance is tokenized by
        ``nltk.word_tokenize`` and every token is paired with ``None``.

        Otherwise two ``SemanticTagParser`` instances are fed the
        NLTK-segmented utterance and the space-joined ``semantic_tagged``
        segments.  If both report the same character sequence, their space
        sequences are merged element-wise with ``or``, the tagged parser is
        tokenized on the merged sequence, and its words are zipped with its
        word tags.

        Returns the list of ``(word, tag)`` pairs, or ``None`` if the
        character sequences differ.
        """
        pairs = None

        if semantic_tagged is None:
            pairs = [(tok, None) for tok in nltk.word_tokenize(utter)]
        else:
            raw_side = SemanticTagParser(False)
            tagged_side = SemanticTagParser(False)

            raw_text = ' '.join(nltk.word_tokenize(utter))
            tagged_text = ' '.join(semantic_tagged)

            raw_side.feed(raw_text)
            tagged_side.feed(tagged_text)

            raw_chars = raw_side.get_chr_seq()
            raw_gaps = raw_side.get_chr_space_seq()

            tagged_chars = tagged_side.get_chr_seq()
            tagged_gaps = tagged_side.get_chr_space_seq()

            if raw_chars == tagged_chars:
                # Keep a gap wherever at least one of the two texts has one.
                gaps = [a or b for a, b in zip(raw_gaps, tagged_gaps)]

                words = tagged_side.tokenize(gaps)
                word_tags = tagged_side.get_word_tag_seq()

                pairs = list(zip(words, word_tags))

        return pairs
Пример #3
0
def _extract_semantic_tags(tagged_segments):
    """Return the semantic-tag mentions found in ``tagged_segments``.

    Each segment is parsed by its own ``SemanticTagParser``.  A word whose BIO
    marker is ``'B'`` opens a mention, following ``'I'`` words extend it, and
    any other marker closes it.  Every closed mention becomes a dict with the
    keys ``main`` (the category), ``attributes`` (a dict built from the
    parser's attribute pairs) and ``mention`` (the space-joined words).
    """
    semantic_tags = []

    def emit(cat, attrs, words):
        semantic_tags.append({
            u'main': cat,
            u'attributes': attrs,
            u'mention': ' '.join(words)
        })

    for segment in tagged_segments:
        tag_parser = SemanticTagParser(False)
        tag_parser.feed(segment)

        # Mention state is local to one segment.  Carrying it over made the
        # first non-'I' word of the next segment emit the previous mention a
        # second time.
        mention_words = []
        curr_cat = None
        curr_attrs = None

        word_tags = zip(tag_parser.get_word_seq(),
                        tag_parser.get_word_tag_seq())
        for word, (bio, cat, attrs) in word_tags:
            if bio == 'I':
                mention_words.append(word)
                continue

            # Any non-'I' word ends the mention that was open, if any.
            if curr_cat is not None:
                emit(curr_cat, curr_attrs, mention_words)

            mention_words = []
            curr_cat = None
            curr_attrs = None

            if bio == 'B':
                mention_words = [word]
                curr_cat = cat
                curr_attrs = dict(attrs)

        # A mention that runs to the end of the segment is still open here.
        if curr_cat is not None:
            emit(curr_cat, curr_attrs, mention_words)

    return semantic_tags


def main(argv):
    """Convert a dataset into per-session JSON files for the SAP pilot task.

    For every session of the dataset four files are written into
    ``<dataroot>/<session_id formatted as %03d>``:

    * ``sap.guide.in.json`` / ``sap.tourist.in.json``: the input for each
      role.  Utterances spoken by that role carry ``utter_index``,
      ``speaker`` and ``semantic_tags``; utterances spoken by the other role
      additionally carry ``transcript`` and ``speech_act``.
    * ``sap.guide.label.json`` / ``sap.tourist.label.json``: ``utter_index``
      and ``speech_act`` for the utterances spoken by that role.

    Utterances whose speaker is neither ``'Guide'`` nor ``'Tourist'`` are
    skipped.

    Args:
        argv: accepted for the usual ``main(sys.argv)`` calling convention.
            NOTE(review): it is not forwarded to argparse, so options are
            always read from ``sys.argv`` -- confirm this is intended.
    """
    arg_parser = argparse.ArgumentParser(description='Dataset Converter for SAP pilot task.')
    arg_parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', required=True, help='The target dataset to be converted')
    arg_parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='Will look for corpus in <dataroot>/...')

    args = arg_parser.parse_args()

    dataset = dataset_walker.dataset_walker(args.dataset, dataroot=args.dataroot, labels=True, translations=False)

    for call in dataset:
        session_id = call.log["session_id"]

        input_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}
        output_guide = {u'session_id': session_id, u'utterances': [], u'roletype': u'Guide'}

        input_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}
        output_tourist = {u'session_id': session_id, u'utterances': [], u'roletype': u'Tourist'}

        for (log_utter, _, label_utter) in call:
            speaker = log_utter['speaker']
            utter_index = log_utter['utter_index']
            transcript = log_utter['transcript']

            speech_act = label_utter['speech_act']
            semantic_tags = _extract_semantic_tags(label_utter['semantic_tagged'])

            if speaker == 'Guide':
                own_input, own_label, other_input = input_guide, output_guide, input_tourist
            elif speaker == 'Tourist':
                own_input, own_label, other_input = input_tourist, output_tourist, input_guide
            else:
                continue

            # The speaking role has to predict its own speech act, so its
            # input omits both the transcript and the speech act.
            own_input[u'utterances'].append({
                u'utter_index': utter_index,
                u'speaker': speaker,
                u'semantic_tags': semantic_tags
            })
            own_label[u'utterances'].append({
                u'utter_index': utter_index,
                u'speech_act': speech_act
            })
            # The other role sees the complete utterance as context.
            other_input[u'utterances'].append({
                u'utter_index': utter_index,
                u'speaker': speaker,
                u'transcript': transcript,
                u'semantic_tags': semantic_tags,
                u'speech_act': speech_act
            })

        path = os.path.join(os.path.abspath(args.dataroot), '%03d' % (session_id,))

        outputs = (
            ('sap.guide.in.json', input_guide),
            ('sap.guide.label.json', output_guide),
            ('sap.tourist.in.json', input_tourist),
            ('sap.tourist.label.json', output_tourist),
        )
        for filename, payload in outputs:
            with open(os.path.join(path, filename), 'w') as fp:
                json.dump(payload, fp)
Пример #4
0
def eval_semantics(ref_tagged, pred_tagged, stat_semantics):
    """Score a predicted semantic tagging against the reference tagging.

    Both tagged strings are parsed with ``SemanticTagParser`` and then
    re-tokenized on the element-wise ``or`` of their space sequences, so the
    two word/tag sequences are compared word by word.

    Args:
        ref_tagged: the reference utterance with semantic tag markup.
        pred_tagged: the predicted utterance with semantic tag markup.
        stat_semantics: mapping that may hold accumulators under the keys
            ``'detection'``, ``'class'`` and ``'all'``.  For every word each
            accumulator that is present receives ``add(pred_obj, ref_obj)``,
            where an object is ``None`` for a word without a BIO marker and
            otherwise a dict holding ``bio`` (detection), plus ``tag``
            (class), plus every attribute whose value is not ``'NONE'`` (all).

    Raises:
        ValueError: if the two strings do not contain the same character
            sequence, i.e. the prediction altered the utterance text.

    An ``HTMLParseError`` raised while parsing is caught and printed; the
    words scored before the error remain in the accumulators.
    """
    parser_ref = SemanticTagParser()
    parser_pred = SemanticTagParser()
    try:
        parser_ref.feed(ref_tagged)
        ref_chr_seq = parser_ref.get_chr_seq()
        ref_space_seq = parser_ref.get_chr_space_seq()

        parser_pred.feed(pred_tagged)
        pred_chr_seq = parser_pred.get_chr_seq()
        pred_space_seq = parser_pred.get_chr_space_seq()

        if ref_chr_seq != pred_chr_seq:
            # A bare ``raise`` here had no active exception to re-raise and
            # surfaced as an unrelated error; state the real problem instead.
            raise ValueError(
                "reference and predicted utterances differ in their "
                "character sequences")

        merged_space_seq = [
            x or y for x, y in zip(ref_space_seq, pred_space_seq)
        ]

        parser_ref.tokenize(merged_space_seq)
        parser_pred.tokenize(merged_space_seq)

        ref_word_tag_seq = parser_ref.get_word_tag_seq()
        pred_word_tag_seq = parser_pred.get_word_tag_seq()

        for ref_tuple, pred_tuple in zip(ref_word_tag_seq, pred_word_tag_seq):
            ref_bio, ref_tag, ref_attrs = ref_tuple
            pred_bio, pred_tag, pred_attrs = pred_tuple

            pred_obj = {'bio': pred_bio} if pred_bio is not None else None
            ref_obj = {'bio': ref_bio} if ref_bio is not None else None

            # NOTE(review): the same dict objects are extended in place
            # between the three add() calls below -- confirm that add() does
            # not keep a reference to them.
            if 'detection' in stat_semantics:
                stat_semantics['detection'].add(pred_obj, ref_obj)

            if pred_obj is not None and pred_tag is not None:
                pred_obj['tag'] = pred_tag
            if ref_obj is not None and ref_tag is not None:
                ref_obj['tag'] = ref_tag

            if 'class' in stat_semantics:
                stat_semantics['class'].add(pred_obj, ref_obj)

            # Attributes set to 'NONE' are treated as absent.
            for obj, attrs in ((pred_obj, pred_attrs), (ref_obj, ref_attrs)):
                if obj is not None and attrs is not None:
                    for (s, v) in attrs:
                        if v != 'NONE':
                            obj[s] = v

            if 'all' in stat_semantics:
                stat_semantics['all'].add(pred_obj, ref_obj)

        parser_ref.close()
        parser_pred.close()
    except HTMLParseError as err:
        print("HTMLParseError: %s" % err)
Пример #5
0
def eval_semantics(ref_tagged, pred_tagged, stat_semantics):
    """Accumulate word-level semantic tagging statistics for one utterance.

    The reference and the predicted tagged strings are each fed to a
    ``SemanticTagParser``.  Both parsers are then tokenized on the
    element-wise ``or`` of their space sequences, and their word tag
    sequences are walked in lockstep.

    Args:
        ref_tagged: the reference utterance with semantic tag markup.
        pred_tagged: the predicted utterance with semantic tag markup.
        stat_semantics: mapping of optional accumulators.  For every word,
            ``add(pred_obj, ref_obj)`` is called on whichever of these keys
            is present:

            * ``'detection'`` -- objects hold only ``bio``;
            * ``'class'`` -- objects additionally hold ``tag`` when set;
            * ``'all'`` -- objects additionally hold every attribute whose
              value is not ``'NONE'``.

            An object is ``None`` when the word has no BIO marker.

    Raises:
        ValueError: if the predicted string does not contain the same
            character sequence as the reference string.

    An ``HTMLParseError`` raised by the parsers is caught and printed rather
    than propagated.
    """
    parser_ref = SemanticTagParser()
    parser_pred = SemanticTagParser()
    try:
        parser_ref.feed(ref_tagged)
        ref_chr_seq = parser_ref.get_chr_seq()
        ref_space_seq = parser_ref.get_chr_space_seq()

        parser_pred.feed(pred_tagged)
        pred_chr_seq = parser_pred.get_chr_seq()
        pred_space_seq = parser_pred.get_chr_space_seq()

        if ref_chr_seq != pred_chr_seq:
            # The previous bare ``raise`` ran with no exception being handled,
            # so it failed with an error unrelated to the actual mismatch.
            raise ValueError(
                "reference and predicted utterances differ in their "
                "character sequences")

        merged_space_seq = [
            x or y for x, y in zip(ref_space_seq, pred_space_seq)]

        parser_ref.tokenize(merged_space_seq)
        parser_pred.tokenize(merged_space_seq)

        ref_word_tag_seq = parser_ref.get_word_tag_seq()
        pred_word_tag_seq = parser_pred.get_word_tag_seq()

        for ref_tuple, pred_tuple in zip(ref_word_tag_seq, pred_word_tag_seq):
            ref_bio, ref_tag, ref_attrs = ref_tuple
            pred_bio, pred_tag, pred_attrs = pred_tuple

            pred_obj = None
            ref_obj = None

            if pred_bio is not None:
                pred_obj = {'bio': pred_bio}
            if ref_bio is not None:
                ref_obj = {'bio': ref_bio}

            # NOTE(review): pred_obj / ref_obj are mutated after being handed
            # to the accumulators -- verify add() does not retain them.
            if 'detection' in stat_semantics:
                stat_semantics['detection'].add(pred_obj, ref_obj)

            if pred_obj is not None and pred_tag is not None:
                pred_obj['tag'] = pred_tag
            if ref_obj is not None and ref_tag is not None:
                ref_obj['tag'] = ref_tag

            if 'class' in stat_semantics:
                stat_semantics['class'].add(pred_obj, ref_obj)

            # 'NONE' marks an attribute as unset, so it is left out.
            if pred_obj is not None and pred_attrs is not None:
                for (s, v) in pred_attrs:
                    if v != 'NONE':
                        pred_obj[s] = v

            if ref_obj is not None and ref_attrs is not None:
                for (s, v) in ref_attrs:
                    if v != 'NONE':
                        ref_obj[s] = v

            if 'all' in stat_semantics:
                stat_semantics['all'].add(pred_obj, ref_obj)

        parser_ref.close()
        parser_pred.close()
    except HTMLParseError as err:
        print("HTMLParseError: %s" % err)
Пример #6
0
    def check(self):
        """Validate the tracker output against the reference sessions.

        Problems are reported through ``self.add_error(location, message)``,
        where ``location`` is a tuple naming where the problem was found; the
        method itself returns nothing.  Checked, in order:

        * top-level fields: a single dataset, a matching ``dataset``, the
          session count, a positive float ``wall_time``, ``task_type`` equal
          to ``'SLU'`` and ``role_type`` equal to ``self.roletype``;
        * per session: the session id and the number of utterances spoken by
          the role under evaluation;
        * per utterance: ``utter_index``, the shape and vocabulary of
          ``speech_act``, and that ``semantic_tagged`` is a string which
          parses, yields the same character sequence as the transcript and
          only uses categories, attribute types and attribute values found in
          ``self.tagsets['semantic']``.
        """
        report = self.add_error
        output = self.tracker_output
        top = ("top level",)

        # --- top-level fields -------------------------------------------
        if len(self.sessions.datasets) != 1:
            report(top, "tracker output should be over a single dataset")

        if "dataset" not in output:
            report(top, "trackfile should specify its dataset")
        elif self.sessions.datasets[0] != output["dataset"]:
            report(top, "datasets do not match")

        if len(output["sessions"]) != len(self.sessions):
            report(top, "number of sessions does not match")

        if "wall_time" not in output:
            report(top, "wall_time should be included")
        else:
            wall_time = output["wall_time"]
            # Exact type test: only a plain float is accepted.
            if type(wall_time) != type(0.0):
                report(top, "wall_time must be a float")
            elif wall_time <= 0.0:
                report(top, "wall_time must be positive")

        if "task_type" not in output:
            report(top, "task_type should be specified")
        elif output['task_type'] != 'SLU':
            report(top, "task_type does not match")

        if "role_type" not in output:
            report(top, "role_type should be specified")
        elif output['role_type'] != self.roletype:
            report(top, "role_type does not match")

        # --- sessions ---------------------------------------------------
        for session, track_session in zip(self.sessions, output["sessions"]):
            session_id = session.log["session_id"]
            if session_id != track_session["session_id"]:
                report((session_id,), "session-id does not match")

            # Keep only the utterances spoken by the role being evaluated.
            own_utters = [
                (log_utter, label_utter)
                for log_utter, _, label_utter in session
                if (self.roletype == 'GUIDE' and log_utter['speaker'] == 'Guide')
                or (self.roletype == 'TOURIST' and log_utter['speaker'] == 'Tourist')
            ]

            if len(own_utters) != len(track_session["utterances"]):
                report((session_id,), "number of utterances spoken by %s does not match" % (self.roletype,))

            # --- utterances ---------------------------------------------
            for (log_utter, label_utter), track_utter in zip(own_utters, track_session["utterances"]):
                utter_index = log_utter['utter_index']
                where = (session_id, "utterance", utter_index)

                if utter_index != track_utter['utter_index']:
                    report(where + (track_utter['utter_index'],), "utter_index does not match")

                if 'speech_act' not in track_utter:
                    report(where, "no speech_act key in utterance")
                elif type(track_utter['speech_act']) != types.ListType:
                    report(where, "a value for 'speech_act' key should be a list of objects")
                else:
                    for act_obj in track_utter['speech_act']:
                        if 'act' not in act_obj:
                            report(where, "no act key in speech_act")
                        elif act_obj['act'] not in self.tagsets['speech_act']['category']:
                            report(where + (act_obj['act'],), "do not recognise speech act category")

                        if 'attributes' not in act_obj:
                            report(where, "no attributes key in speech_act")
                        else:
                            for attr in act_obj['attributes']:
                                if attr not in self.tagsets['speech_act']['attribute']:
                                    report(where + (attr,), "do not recognise speech act attribute")

                if 'semantic_tagged' not in track_utter:
                    report(where, "no semantic_tagged key in utterance")
                    continue

                tagged = track_utter['semantic_tagged']
                if type(tagged) not in (types.StringType, types.UnicodeType):
                    report(where + (type(tagged),), "a value for 'semantic_tagged' key should be a string")
                    continue

                try:
                    parser_ref = SemanticTagParser()
                    parser_ref.feed(log_utter['transcript'])

                    parser_pred = SemanticTagParser()
                    parser_pred.feed(tagged)

                    # Tagging may add markup but must not alter the text.
                    if parser_ref.get_chr_seq() != parser_pred.get_chr_seq():
                        report(where + (log_utter['transcript'], tagged), "raw utterance has changed")

                    for bio, tag, attrs in parser_pred.get_word_tag_seq():
                        if tag is None:
                            continue

                        tag = tag.upper()
                        if tag not in self.tagsets['semantic']:
                            report(where + (tag,), "do not recognise semantic category")
                        elif attrs is not None:
                            for slot, value in attrs:
                                slot = slot.upper().strip()
                                value = value.upper().strip()

                                # An empty value counts as 'NONE'.
                                if not value:
                                    value = 'NONE'

                                if slot not in self.tagsets['semantic'][tag]:
                                    # An unknown attribute type is tolerated
                                    # as long as it carries no value.
                                    if value != 'NONE':
                                        report(where + (tag, slot), "do not recognise semantic attribute type")
                                elif value not in self.tagsets['semantic'][tag][slot]:
                                    report(where + (tag, slot, value), "do not recognise semantic attribute value")

                except HTMLParseError:
                    report(where + (tagged,), "do not parse the tagged utterance")
Пример #7
0
    def check(self):
        """Check ``self.tracker_output`` for consistency with the dataset.

        Nothing is returned; each problem is passed to
        ``self.add_error(location, message)`` with ``location`` being a tuple
        that identifies the offending place (``("top level",)``, a session id,
        or a session id followed by ``"utterance"``, the utterance index and
        any offending values).

        The checks cover the top-level fields (``dataset``, ``sessions``
        count, ``wall_time``, ``task_type``, ``role_type``), then for every
        session its id and utterance count for the evaluated role, and for
        every utterance its ``utter_index``, ``speech_act`` list and
        ``semantic_tagged`` string, which must parse, yield the same
        character sequence as the transcript, and use only entries of
        ``self.tagsets``.
        """
        add_error = self.add_error
        tracker_output = self.tracker_output
        top_level = ("top level", )

        # Top-level fields.
        if len(self.sessions.datasets) != 1:
            add_error(top_level,
                      "tracker output should be over a single dataset")

        if "dataset" not in tracker_output:
            add_error(top_level, "trackfile should specify its dataset")
        elif self.sessions.datasets[0] != tracker_output["dataset"]:
            add_error(top_level, "datasets do not match")

        if len(tracker_output["sessions"]) != len(self.sessions):
            add_error(top_level, "number of sessions does not match")

        if "wall_time" not in tracker_output:
            add_error(top_level, "wall_time should be included")
        else:
            wall_time = tracker_output["wall_time"]
            # Exact type test: only a plain float is accepted.
            if type(wall_time) != type(0.0):
                add_error(top_level, "wall_time must be a float")
            elif wall_time <= 0.0:
                add_error(top_level, "wall_time must be positive")

        if "task_type" not in tracker_output:
            add_error(top_level, "task_type should be specified")
        elif tracker_output['task_type'] != 'SLU':
            add_error(top_level, "task_type does not match")

        if "role_type" not in tracker_output:
            add_error(top_level, "role_type should be specified")
        elif tracker_output['role_type'] != self.roletype:
            add_error(top_level, "role_type does not match")

        # Sessions, pairwise with the tracker's sessions.
        for session, track_session in zip(self.sessions,
                                          tracker_output["sessions"]):
            session_id = session.log["session_id"]
            if session_id != track_session["session_id"]:
                add_error((session_id, ), "session-id does not match")

            # Only utterances spoken by the evaluated role are compared.
            role_utters = [
                (log_utter, label_utter)
                for log_utter, _, label_utter in session
                if (self.roletype == 'GUIDE'
                    and log_utter['speaker'] == 'Guide')
                or (self.roletype == 'TOURIST'
                    and log_utter['speaker'] == 'Tourist')
            ]

            if len(role_utters) != len(track_session["utterances"]):
                add_error(
                    (session_id, ),
                    "number of utterances spoken by %s does not match" %
                    (self.roletype, ))

            # Utterances, pairwise with the tracker's utterances.
            for (log_utter, label_utter), track_utter in zip(
                    role_utters, track_session["utterances"]):
                utter_index = log_utter['utter_index']
                location = (session_id, "utterance", utter_index)

                if utter_index != track_utter['utter_index']:
                    add_error(location + (track_utter['utter_index'], ),
                              "utter_index does not match")

                if 'speech_act' not in track_utter:
                    add_error(location, "no speech_act key in utterance")
                elif type(track_utter['speech_act']) != types.ListType:
                    add_error(
                        location,
                        "a value for 'speech_act' key should be a list of objects")
                else:
                    for act_obj in track_utter['speech_act']:
                        if 'act' not in act_obj:
                            add_error(location, "no act key in speech_act")
                        elif act_obj['act'] not in self.tagsets[
                                'speech_act']['category']:
                            add_error(location + (act_obj['act'], ),
                                      "do not recognise speech act category")

                        if 'attributes' not in act_obj:
                            add_error(location,
                                      "no attributes key in speech_act")
                        else:
                            for attr in act_obj['attributes']:
                                if attr not in self.tagsets['speech_act'][
                                        'attribute']:
                                    add_error(
                                        location + (attr, ),
                                        "do not recognise speech act attribute")

                if 'semantic_tagged' not in track_utter:
                    add_error(location, "no semantic_tagged key in utterance")
                    continue

                tagged = track_utter['semantic_tagged']
                if type(tagged) not in (types.StringType, types.UnicodeType):
                    add_error(
                        location + (type(tagged), ),
                        "a value for 'semantic_tagged' key should be a string")
                    continue

                try:
                    parser_ref = SemanticTagParser()
                    parser_ref.feed(log_utter['transcript'])

                    parser_pred = SemanticTagParser()
                    parser_pred.feed(tagged)

                    # The tagged string must spell out the same characters
                    # as the transcript.
                    if parser_ref.get_chr_seq() != parser_pred.get_chr_seq():
                        add_error(
                            location + (log_utter['transcript'], tagged),
                            "raw utterance has changed")

                    for bio, tag, attrs in parser_pred.get_word_tag_seq():
                        if tag is None:
                            continue

                        tag = tag.upper()
                        if tag not in self.tagsets['semantic']:
                            add_error(location + (tag, ),
                                      "do not recognise semantic category")
                        elif attrs is not None:
                            for name, val in attrs:
                                name = name.upper().strip()
                                val = val.upper().strip()

                                # A blank value is treated as 'NONE'.
                                if not val:
                                    val = 'NONE'

                                if name not in self.tagsets['semantic'][tag]:
                                    # Unknown attribute types only matter
                                    # when they actually carry a value.
                                    if val != 'NONE':
                                        add_error(
                                            location + (tag, name),
                                            "do not recognise semantic attribute type")
                                elif val not in self.tagsets['semantic'][tag][
                                        name]:
                                    add_error(
                                        location + (tag, name, val),
                                        "do not recognise semantic attribute value")

                except HTMLParseError:
                    add_error(location + (tagged, ),
                              "do not parse the tagged utterance")