Example No. 1
from swda import CorpusReader  # swda.py, Chris Potts' SwDA corpus reader


class swda_reader(object):
    def __init__(self):
        self.corpus = CorpusReader('swda')

    def transcript_reader(self):
        def is_neg(tag):
            # statement-opinion / statement-non-opinion
            return tag in ('sv', 'sd')

        def is_pos(tag):
            # yes-no, wh- and rhetorical questions
            return tag in ('qy', 'qw', 'qh')

        pos_data = []
        neg_data = []
        for trans in self.corpus.iter_transcripts():
            pool = []
            for utt in trans.utterances:
                if utt.damsl_act_tag() == '+':
                    # '+' marks a continuation of an earlier utterance: keep pooling
                    pool.append(utt)
                else:
                    pool.append(utt)
                    if is_neg(utt.damsl_act_tag()):
                        neg_data.append(pool)
                    elif is_pos(utt.damsl_act_tag()):
                        pos_data.append(pool)
                    pool = []
                '''
                if is_neg(utt.damsl_act_tag()) or is_pos(utt.damsl_act_tag()):
                    print(utt.pos_words())
                    print(utt.damsl_act_tag())
                '''
        return pos_data, neg_data
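One way to exercise the class above (a sketch, assuming swda.py and the unpacked 'swda' CSV directory sit alongside the script):

reader = swda_reader()
pos_data, neg_data = reader.transcript_reader()
print(len(pos_data), 'question pools,', len(neg_data), 'statement pools')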
Example No. 2
import re

from nltk.tokenize import word_tokenize
from swda import CorpusReader


def get_swda_utterances(swda_dir):
    corpus = CorpusReader(swda_dir)
    c = 0
    last_utterances = dict()

    for trans in corpus.iter_transcripts(display_progress=True):
        last_utterances["A"] = []
        last_utterances["B"] = []
        for utt in trans.utterances:
            utt_temp = re.sub(r'\(|\)|-|\{.+? |\}|\[|\]|\+|#|/|<.+?>|,', "", utt.text.lower())
            utt_tokens = word_tokenize(re.sub("<|>", "", utt_temp))
            if utt.damsl_act_tag() != "+":
                last_utterances[utt.caller].append((c, utt.damsl_act_tag() + "/%s_%s_%s" % (utt.conversation_no, utt.caller, c), utt_tokens))
                c += 1
            else:
                try:
                    prev = last_utterances[utt.caller].pop()
                    new = (prev[0], prev[1], prev[2] + utt_tokens)
                    last_utterances[utt.caller].append(new)
                except IndexError:
                    # RW: for some reason, Chris Potts' Corpus Reader gives us utterances with a "+" tag
                    # although there is no previous utterance of the same speaker to complete.
                    # Looking at the original data, there seems to be a bug in his Corpus Reader that
                    # skips some material at the beginning (e.g. the beginning of conv. no 3554).
                    print(utt.conversation_no)
        utterances = last_utterances["A"] + last_utterances["B"]
        utterances = sorted(utterances, key=lambda t: t[0])
        for tpl in utterances:
            if tpl[2]:
                yield tpl[1:]
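A hedged driver for the generator above (it assumes the SwDA CSVs live in a local 'swda' directory and that NLTK's 'punkt' tokenizer data is installed):

# Print the first few (tag/id, tokens) pairs yielded by the generator.
for i, (tag_id, tokens) in enumerate(get_swda_utterances('swda')):
    print(tag_id, tokens)
    if i >= 4:
        break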
Example No. 3
from collections import defaultdict
from operator import itemgetter

from swda import CorpusReader


def swda_education_region():
    """Create a count dictionary relating education and region."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Iterate through the transcripts; display_progress=True tracks progress:
    for trans in corpus.iter_transcripts(display_progress=True):
        d[(trans.from_caller_education, trans.from_caller_dialect_area)] += 1
        d[(trans.to_caller_education, trans.to_caller_dialect_area)] += 1
    # Turn d into a list of tuples via d.items(), sort it on the second
    # member (index 1) of those tuples, largest first, and print the results:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print(key, val)
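If the counts are needed programmatically rather than printed, a small variant (hypothetical name, same aggregation as above) can return them instead:

def swda_education_region_counts():
    # Same counting as above, but returning the sorted (key, count) pairs.
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for trans in corpus.iter_transcripts(display_progress=False):
        d[(trans.from_caller_education, trans.from_caller_dialect_area)] += 1
        d[(trans.to_caller_education, trans.to_caller_dialect_area)] += 1
    return sorted(d.items(), key=itemgetter(1), reverse=True)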
Example No. 5
import os
import pickle

from swda import CorpusReader


def load_swda_data():
    # Cache the parsed corpus as a pickle so later runs can skip the slow CSV parse.
    if not os.path.exists("../helper_files/swda_data.pkl"):
        corpus = CorpusReader("../data/switchboard-corpus/swda")
        excluded_tags = ["x", "+"]
        conversations = []
        labels = []
        print("Loading swda transcripts, this might take a while")
        for transcript in corpus.iter_transcripts():
            utterances, utterance_labels = process_transcript_txt(
                transcript, excluded_tags)  # helper defined elsewhere in the project
            conversations.append(utterances)
            labels.append(utterance_labels)

        with open("../helper_files/swda_data.pkl", "wb") as f:
            pickle.dump((conversations, labels), f)
    else:
        with open("../helper_files/swda_data.pkl", "rb") as f:
            conversations, labels = pickle.load(f)

    return conversations, labels
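A minimal call site for the cached loader above (sketch; it assumes the relative ../data and ../helper_files paths of the original project layout exist):

conversations, labels = load_swda_data()
print(len(conversations), 'conversations loaded')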
Example No. 6
from swda import CorpusReader


def load_swda_corpus_data(swda_directory):
    # Assumes get_partition() and the train/test/valid id sets are defined
    # elsewhere in this module (see the partition_source URL below).
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)
    conversations = []

    for transcript in corpus_reader.iter_transcripts(display_progress=False):
        name = 'sw' + str(transcript.conversation_no)

        conv = {
            "name": name,
            "utterances": [],
            "partition_name": get_partition(name)
        }

        for j, utterance in enumerate(transcript.utterances):
            utt = {
                "text": " ".join(utterance.text_words(filter_disfluency=True)),
                "act_tag": utterance.act_tag,
                "damsl_act_tag": utterance.damsl_act_tag(),
                "caller": utterance.caller,
            }

            #utt_text = " ".join(utterance.text_words(filter_disfluency=True))
            #print("[{}] {}".format(j, utt_text))
            #print("\t==>", utterance.act_tag, utterance.damsl_act_tag())

            conv["utterances"].append(utt)

        conversations.append(conv)

    corpus = {
        "partition_source": "https://github.com/Franck-Dernoncourt/naacl2016",
        "train_ids": list(train_set_idx),
        "test_ids": list(test_set_idx),
        "dev_set_ids": list(valid_set_idx),
        "conversations": conversations
    }

    return corpus
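A hedged usage sketch for the loader above (the directory name 'swda' is an assumption):

corpus = load_swda_corpus_data('swda')
print(len(corpus['conversations']), 'conversations')
first = corpus['conversations'][0]
print(first['name'], first['partition_name'], len(first['utterances']))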
Example No. 7
    def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                      ranges, errorLog):
        """Writes files to a target folder with the mappings
        from words in utterances to corresponding POS tags.
        """
        if errorLog:
            errorLog = open(errorLog, 'w')
        corpus = CorpusReader(corpus_path, metadata_path)

        folder = None
        corpus_file = None
        for trans in corpus.iter_transcripts():

            # print "iterating",trans.conversation_no
            if not trans.has_pos():
                continue
            # print "has pos"
            if ranges and not trans.conversation_no in ranges:
                continue
            # print "in range"
            # just look at transcripts WITHOUT trees as a complement to the
            # models above
            if trans.has_trees():
                continue
            end = trans.swda_filename.rfind("/")
            start = trans.swda_filename.rfind("/", 0, end)
            c_folder = trans.swda_filename[start + 1:end]
            if c_folder != folder:
                # for now splitting the maps by folder
                folder = c_folder
                if corpus_file:
                    corpus_file.close()
                corpus_file = open(
                    target_folder_path +
                    "/POS_map_{0}.csv.text".format(folder), 'w')
                wordPOSMapList = POSMapCorpus(False, errorLog)
                print "new map for folder", folder

            translist = trans.utterances
            translength = len(translist)
            count = 0

            # iterating through transcript utterance by utterance
            while count < translength:
                utt = trans.utterances[count]
                words = utt.text_words()
                wordPOSMap = []
                if len(utt.pos) == 0:  # no POS
                    wordPOSMap.append((utt, []))  # just dummy value
                    wordPOSMapList.append(trans.conversation_no,
                                          utt.transcript_index,
                                          list(wordPOSMap))
                    errormessage = "WARNING: NO POS for file/utt: " +\
                        str(utt.swda_filename) + " " + utt.caller + "." + \
                        str(utt.utterance_index) + "." + \
                        str(utt.subutterance_index) + " " + utt.text
                    # print errormessage
                    # raw_input()
                else:
                    # indices for which POS we're at
                    j = 0
                    possibleComment = False  # can have comments, flag
                    mistranscribe = False
                    word = words[0]
                    # loop until no more words left to be matched in utterance
                    while len(words) > 0:
                        word = words[0]
                        # print "top WORD:" + word
                        if not mistranscribe:
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "",
                                              word)
                            wordtest = wordtest.replace("(", "").\
                                replace(")", "").replace("/", "")
                        match = False
                        POSIndices = []

                        if (possibleComment or word[0:1] in [
                                "{", "}", "-"
                        ] or word in [
                                "/", ".", ",", "]"
                        ] or wordtest == "" or any([
                                x in word
                                for x in
                            ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                        ])):
                            # no tree equivalent for {D } type annotations
                            if (word[0:1] == "-" or
                                    any([x in word for x in
                                         ["*", "<<", "<+", "[[", "<"]])) \
                                    and not possibleComment:
                                possibleComment = True
                            if possibleComment:
                                # print "match COMMENT!:" + word
                                # raw_input()
                                POSIndices = []
                                match = True
                                if (any([x in word for x in [">>", "]]", "))",
                                                             ">"]]) or
                                        word[0] == "-") \
                                        and not word == "->":
                                    # turn off comment
                                    possibleComment = False
                                if (">>" in word or "]]" in word
                                        or "))" in word or ">" in word and
                                        not word == "->"):  # turn off comment
                                    possibleComment = False
                                    #del words[0]
                            wordPOSMap.append((word, POSIndices))
                            POSIndices = []
                            match = True
                            # print "match annotation!:" + word
                            del words[0]  # word is consumed
                            if len(words) > 0:
                                word = words[0]
                                wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "",
                                                  word)
                                wordtest = wordtest.replace("(", "")
                                wordtest = wordtest.replace(")", "")
                            else:
                                break
                            continue  # carry on to next word
                        else:
                            myPOS = utt.regularize_pos_lemmas()
                            while j < len(myPOS):
                                pos = myPOS[j][0]  # pair of (word,POS)
                                # print "j number of pos : " + str(len(myPOS))
                                # print "j loop word : " + word
                                # print "j loop wordtest : " + wordtest
                                # print "j pos : " + str(j) + " " + str(pos)
                                # raw_input()
                                breaker = False
                                if wordtest == pos or word == pos:  # exact match
                                    POSIndices.append(j)
                                    wordPOSMap.append((word, POSIndices))
                                    # print "match!:" + word + " in file/utt: "\
                                    # + str(utt.swda_filename) + \
                                    # str(utt.transcript_index))
                                    del words[0]  # word is consumed
                                    if len(words) > 0:
                                        word = words[0]  # next word
                                        wordtest = re.sub(
                                            r"[\.\,\?\/\)\(\"\!\\]", "", word)
                                        wordtest = wordtest.replace("(", "").\
                                            replace(")", "").replace("/", "")
                                    POSIndices = []
                                    j += 1  # increment lead number
                                    match = True
                                    breaker = True
                                    # raw_input()
                                    break
                                elif (pos in wordtest or pos in word) \
                                        and not pos in [",", "."]:
                                    # substring relation
                                    testpos = pos
                                    POSIndices.append(j)
                                    j += 1
                                    if wordtest[-1] == "-" and \
                                            pos == wordtest[0:-1]:
                                        wordPOSMap.append((word, POSIndices))
                                        del words[0]  # remove word
                                        # print "match!:" + word + " in \
                                        # file/utt: " + str(utt.swda_filename) \
                                        #+ str(utt.transcript_index)
                                        if len(words) > 0:
                                            word = words[0]
                                            wordtest = re.sub(
                                                r"[\.\,\?\/\)\(\"\!\\]", "",
                                                word)
                                            wordtest = wordtest.\
                                                replace("(", "").\
                                                replace(")", "").\
                                                replace("/", "")
                                            POSIndices = []
                                        match = True
                                        breaker = True
                                        break
                                    for k in range(j, j + 3):
                                        if (k >= len(myPOS)):
                                            breaker = True
                                            break
                                        if (testpos + myPOS[k][0]) in wordtest\
                                                or (testpos + myPOS[k][0]) in word:
                                            testpos += myPOS[k][0]
                                            POSIndices.append(k)
                                            j += 1
                                            # concatenation
                                            if testpos == wordtest or \
                                                    testpos == word:  # matched
                                                wordPOSMap.append(
                                                    (word, POSIndices))
                                                del words[0]  # remove word
                                                # print "match!:" +\
                                                # word + " in file/utt: " + \
                                                # str(utt.swda_filename) +\
                                                # str(utt.transcript_index))
                                                if len(words) > 0:
                                                    word = words[0]
                                                    wordtest = re.sub(
                                                        r"[\.\,\?\/\)\(\"\!\\]",
                                                        "", word)
                                                    wordtest = wordtest.\
                                                        replace("(", "")
                                                    wordtest = wordtest.\
                                                        replace(")", "")
                                                POSIndices = []
                                                j = k + 1
                                                match = True
                                                breaker = True
                                                break
                                else:
                                    j += 1  # otherwise go on
                                if breaker:
                                    break
                                if match:
                                    break

                        # could not match word! Could be a mistranscription
                        if not match:
                            # print "false checking other options"
                            # print j
                            # print word
                            # print wordtest
                            if not mistranscribe:
                                mistranscribe = True
                                for pair in possibleMistranscription:
                                    if pair[0] == wordtest:
                                        wordtest = pair[1]
                                        break  # matched
                                if wordtest[-1] == "-":  # partial words
                                    wordtest = wordtest[0:-1]
                                if "'" in wordtest:
                                    wordtest = wordtest.replace("'", "")
                                if len(wordPOSMap) > 0:
                                    found = False
                                    for n in range(
                                            len(wordPOSMap) - 1, -1, -1):
                                        if len(wordPOSMap[n][1]) > 0:
                                            j = wordPOSMap[n][1][-1] + 1
                                            # print j
                                            found = True
                                            break
                                    if not found:
                                        # if not possible go back to
                                        # the beginning!
                                        j = 0
                                else:
                                    j = 0
                                # print j
                            else:
                                mistranscribe = False
                                wordPOSMap.append((word, POSIndices))
                                errormessage = "WARNING: no/partial POS \
                                mapping for ''"                                                + words[0] + "'' in file/utt:"\
                                    + str(utt.swda_filename) + "-" + \
                                    str(utt.transcript_index) + \
                                    "POSSIBLE COMMENT = " + \
                                    str(possibleComment)
                                del words[0]  # remove word
                                if len(words) > 0:
                                    word = words[0]
                                    wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]",
                                                      "", word)
                                    wordtest = wordtest.replace("(", "").\
                                        replace(")", "").replace("/", "")
                                # print errormessage
                                if errorLog:
                                    errorLog.write("possible wrong POS : " +
                                                   errormessage + "\n")
                                # raw_input()

                    # end of while loop (words)
                    if not len(wordPOSMap) == len(utt.text_words()):
                        print "Error "
                        print "Length mismatch in file/utt: " + \
                            str(utt.swda_filename) + str(utt.transcript_index)
                        print utt.text_words()
                        print wordPOSMap
                        raw_input()

                    wordPOSMapList.append(trans.conversation_no,
                                          str(utt.transcript_index),
                                          list(wordPOSMap))
                    # print "\nadded POSmap " + str(trans.swda_filename) + \
                    #"." + str(utt.transcript_index) + "\n"
                    csv_string = '"' + str(wordPOSMap) + '"'

                    corpus_file.write('"' + str(utt.conversation_no) + '"\t' +
                                      str(utt.transcript_index) + '\t' +
                                      csv_string + "\n")

                count += 1

        corpus_file.close()
        if errorLog:
            errorLog.close()
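The rows written above are tab-separated: a quoted conversation number, a transcript index, and a quoted Python-literal list of (word, POS-index list) pairs. A hypothetical reader for such a map file, assuming exactly that layout (not part of the original code), might look like:

import ast

def read_pos_map(path):
    # Yields (conversation_no, transcript_index, [(word, [POS indices]), ...]) per row.
    with open(path) as f:
        for line in f:
            conv_no, transcript_index, mapping = line.rstrip('\n').split('\t', 2)
            yield conv_no.strip('"'), int(transcript_index), ast.literal_eval(mapping.strip('"'))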
Example No. 8
    def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                      ranges, errorLog):
        """Writes files to a target folder with the mappings
        from words in utterances to tree nodes in trees.
        """

        if errorLog:
            errorLog = open(errorLog, 'w')
        corpus = CorpusReader(corpus_path, metadata_path)
        # Iterate through all transcripts
        incorrectTrees = 0
        folder = None
        corpus_file = None

        for trans in corpus.iter_transcripts():

            # print "iterating",trans.conversation_no
            if not trans.has_pos():
                continue
            # print "has pos"
            if ranges and not trans.conversation_no in ranges:
                continue
            # print "in range"
            # just look at transcripts WITH trees as a complement to the
            # models below
            if not trans.has_trees():
                continue
            end = trans.swda_filename.rfind("/")
            start = trans.swda_filename.rfind("/", 0, end)
            c_folder = trans.swda_filename[start + 1:end]
            if c_folder != folder:
                # for now splitting the maps by folder
                folder = c_folder
                if corpus_file:
                    corpus_file.close()
                corpus_file = open(
                    target_folder_path +
                    "/Tree_map_{0}.csv.text".format(folder), 'w')
                wordTreeMapList = TreeMapCorpus(False, errorLog)
                print "new map for folder", folder

            translist = trans.utterances
            translength = len(translist)
            count = 0

            # iterating through transcript utterance by utterance
            # create a list of tuples, i.e. a map from each word to the
            # index (or indices, possibly none) of the relevant leaf/leaves
            # of a given tree, e.g. utt.trees[0].leaves()[0] would be the pair (0, 0)
            while count < translength:
                utt = trans.utterances[count]
                words = utt.text_words()
                wordTreeMap = []  # [((word), (List of LeafIndices))]
                forwardtrack = 0
                backtrack = 0
                continued = False
                # print "\n COUNT" + str(count)
                # print utt.damsl_act_tag()
                if len(utt.trees) == 0 or utt.damsl_act_tag() == "x":
                    wordTreeMap.append((utt, []))  # just dummy value
                    # errormessage = "WARNING: NO TREE for file/utt: " +\
                    # str(utt.swda_filename) + " " + utt.caller + "." +  \
                    # str(utt.utterance_index) + "." + \
                    #str(utt.subutterance_index) + " " + utt.text
                    # print(errormessage)
                    count += 1
                    continue
                    # raw_input()

                # indices for which tree and leaf we're at:
                i = 0  # tree
                j = 0  # leaf
                # initialise pairs of trees and ptb pairs
                trees = []
                for l in range(0, len(utt.trees)):
                    trees.append(
                        (utt.ptb_treenumbers[l], count, l, utt.trees[l]))
                # print "TREES = "
                # for tree in trees:
                #    print tree
                origtrees = list(trees)
                origcount = count
                # overcoming the problem of previous utterances contributing
                # to the tree at this utterance, we need to add the words from
                # the previous utt add in all the words from previous utterance
                # with a dialogue act tag/or the same tree?
                # check that the last tree in the previous utterance
                # is the same as the previous one
                previousUttSame = trans.previous_utt_same_speaker(utt)
                # print previousUttSame
                lastTreeMap = None
                if previousUttSame:
                    # print "search for previous full act utt
                    # for " + str(utt.swda_filename) +
                    # str(utt.transcript_index)
                    lastTreeMap = wordTreeMapList.get_treemap(
                        trans, previousUttSame)
                    if ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                        (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                        # print "no last tree map, backwards searching"
                        while previousUttSame and \
                            ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                             (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                            previousUttSame = trans.previous_utt_same_speaker(
                                previousUttSame)  # go back one more
                            lastTreeMap = wordTreeMapList.get_treemap(
                                trans, previousUttSame)
                            if previousUttSame:
                                pass
                                # print previousUttSame.transcript_index

                    if not lastTreeMap:
                        pass
                        # print "no last treemap found for:"
                        # print utt.swda_filename
                        # print utt.transcript_index

                if lastTreeMap and \
                        (utt.damsl_act_tag() == "+" or
                         (len(lastTreeMap.treebank_numbers) > 0
                          and lastTreeMap.treebank_numbers[-1] ==
                          utt.ptb_treenumbers[0])):
                    continued = True
                    # might have to backtrack
                    # now checking for wrong trees
                    lastPTB = lastTreeMap.treebank_numbers
                    lastIndexes = lastTreeMap.transcript_numbers
                    lastTreesTemp = lastTreeMap.get_trees(trans)
                    lastTrees = []
                    for i in range(0, len(lastPTB)):
                        lastTrees.append([
                            lastPTB[i], lastIndexes[i][0], lastIndexes[i][1],
                            lastTreesTemp[i]
                        ])
                    if not (lastPTB[-1] == utt.ptb_treenumbers[0]):
                        # print "not same, need to correct!"
                        # print words
                        # print trees
                        # print "last one"
                        # print previousUttSame.text_words()
                        # print lastTrees
                        if utt.ptb_treenumbers[0] - lastPTB[-1] > 1:
                            # backtrack and redo the antecedent
                            count = count - (count - lastIndexes[-1][0])
                            utt = previousUttSame
                            words = utt.text_words()
                            mytrees = []
                            for i in range(0, len(lastTrees) - 1):
                                mytrees.append(lastTrees[i])
                            trees = mytrees + [origtrees[0]]
                            # print "\n(1)backtrack to with new trees:"
                            backtrack = 1
                            # print utt.transcript_index
                            # print words
                            # print trees
                            # raw_input()
                        # alternatively, this utt's tree may be further back
                        # than its antecedent's, a rare mistake
                        elif utt.ptb_treenumbers[0] < lastTrees[-1][0]:
                            # continue with this utterance and trees
                            # (if there are any), but replace its first
                            # tree with its antecedent's last one
                            forwardtrack = 1
                            trees = [lastTrees[-1]] + origtrees[1:]
                            # print "\n(2)replacing first one to lasttreemap's:"
                            # print words
                            # print trees
                            # raw_input()

                    if backtrack != 1:  # we should have no match
                        found_treemap = False
                        # resetting
                        # for t in wordTreeMapList.keys():
                        #        print t
                        #        print wordTreeMapList[t]
                        for t in range(len(lastTreeMap) - 1, -1, -1):
                            # print lastTreeMap[t][1]
                            # if there is a leafIndices for the
                            # word being looked at, gets last mapped one
                            if len(lastTreeMap[t][1]) > 0:
                                # print "last treemapping of last
                                # caller utterance =
                                # " + str(lastTreeMap[t][1][-1])
                                j = lastTreeMap[t][1][-1][1] + 1
                                found_treemap = True
                                # print "found last mapping, j -1 = " + str(j-1)
                                # raw_input()
                                break
                        if not found_treemap:
                            pass
                            # print "NO matched last TREEMAP found for \
                            # previous Utt Same Speaker of " + \
                            # str(trans.swda_filename) + " " + \
                            # str(utt.transcript_index)
                            # print lastTreeMap
                            # for tmap in wordTreeMapList.keys():
                            #    print tmap
                            #    print wordTreeMapList[tmap]
                            # raw_input()

                possibleComment = False  # can have comments, flag
                mistranscribe = False
                LeafIndices = []  # possibly empty list of leaf indices
                word = words[0]
                # loop until no more words left to be matched in utterance
                while len(words) > 0:
                    # print "top WORD:" + word
                    if not mistranscribe:
                        wordtest = re.sub(r"[\.\,\?\"\!]", "", word)
                        wordtest = wordtest.replace("(", "").replace(")", "")
                    match = False
                    LeafIndices = []  # possibly empty list of leaf indices
                    if (possibleComment or word[0:1] in [
                            "{", "}", "-"
                    ] or word in ["/", ".", ",", "]"] or wordtest == "" or any(
                        [
                            x in word for x in
                            ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                        ])):
                        # no tree equivalent for {D } type annotations
                        if (word[0:1] == "-" or
                                any([x in word for x in
                                     ["*", "<<", "<+", "[[", "<"]])) \
                                and not possibleComment:
                            possibleComment = True
                        if possibleComment:
                            #print("match COMMENT!:" + word)
                            # raw_input()
                            LeafIndices = []
                            match = True
                            #wordTreeMap.append((word, LeafIndices))
                            if any([x in word for x in [">>", "]]", ">"]]) or \
                                    word[0] == "-":  # turn off comment
                                possibleComment = False
                                #del words[0]
                        # LeadIndices will be null here
                        wordTreeMap.append((word, LeafIndices))
                        LeafIndices = []
                        match = True
                        # print "match annotation!:" + word
                        del words[0]  # word is consumed, should always be one
                        if len(words) > 0:
                            word = words[0]
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word)
                            wordtest = wordtest.replace("(", "")
                            wordtest = wordtest.replace(")", "")
                        else:
                            break
                        continue
                        # carry on to next word without updating indices?
                    else:
                        while i < len(trees):
                            # print "i number of trees :" + str(len(utt.trees))
                            # print "i tree number :" + str(i)
                            # print "i loop word :" + word
                            tree = trees[i][3]
                            # print "looking at ptb number " + str(trees[i][0])
                            # print "looking at index number " \
                            #+ str(trees[i][1])+","+str(trees[i][2])
                            while j < len(tree.leaves()):
                                leaf = tree.leaves()[j]
                                # print "j number of leaves : " \
                                #+ str(len(tree.leaves()))
                                # print "j loop word : " + word
                                # print "j loop wordtest : " + wordtest
                                # print "j leaf : " + str(j) + " " + leaf
                                breaker = False
                                # exact match
                                if wordtest == leaf or word == leaf:
                                    LeafIndices.append((i, j))
                                    wordTreeMap.append((word, LeafIndices))
                                    # print("match!:" + word + " " + \
                                    # str(utt.swda_filename) + " " + \
                                    # utt.caller + "." +  \
                                    # str(utt.utterance_index) + \
                                    # "." + str(utt.subutterance_index))
                                    del words[0]  # word is consumed
                                    if len(words) > 0:
                                        word = words[0]  # next word
                                        wordtest = re.sub(
                                            r"[\.\,\?\/\)\(\"\!]", "", word)
                                        wordtest = wordtest.replace("(", "")
                                        wordtest = wordtest.replace(")", "")
                                    LeafIndices = []
                                    j += 1  # increment loop to next leaf
                                    match = True
                                    breaker = True
                                    # raw_input()
                                    break
                                elif leaf in wordtest or \
                                        leaf in word and not leaf == ",":
                                    testleaf = leaf
                                    LeafIndices.append((i, j))
                                    j += 1
                                    for k in range(j, j + 3):  # 3 beyond
                                        if (k >= len(tree.leaves())):
                                            j = 0
                                            i += 1
                                            #breaker = True
                                            breaker = True
                                            break  # got to next tree
                                        if (testleaf + tree.leaves()[k]) \
                                                in wordtest or (testleaf +
                                                                tree.leaves()[k])\
                                                in word:
                                            testleaf += tree.leaves()[k]
                                            LeafIndices.append((i, k))
                                            j += 1
                                            # concatenation
                                            if testleaf == wordtest or \
                                                    testleaf == word:  # word matched
                                                wordTreeMap.append(
                                                    (word, LeafIndices))
                                                del words[0]  # remove word
                                                # print "match!:" + word +\
                                                #str(utt.swda_filename) + " "\
                                                # + utt.caller + "." +  \
                                                # str(utt.utterance_index) +\
                                                # "." + \
                                                # str(utt.subutterance_index))
                                                if len(words) > 0:
                                                    word = words[0]
                                                    wordtest = re.sub(
                                                        r"[\.\,\?\/\)\(\"\!]",
                                                        "", word)
                                                    wordtest = wordtest.\
                                                        replace("(", "")
                                                    wordtest = wordtest.\
                                                        replace(")", "")
                                                # reinitialise leaves
                                                LeafIndices = []
                                                j = k + 1
                                                match = True
                                                breaker = True
                                                # raw_input()
                                                break
                                else:
                                    # otherwise go on
                                    j += 1
                                if breaker:
                                    break
                                if match:
                                    break
                            if j >= len(tree.leaves()):
                                j = 0
                                i += 1
                            if match:
                                break

                    # could not match word! try mistranscriptions first:
                    if not match:
                        if not mistranscribe:  # one final stab at matching!
                            mistranscribe = True
                            for pair in possibleMistranscription:
                                if pair[0] == wordtest:
                                    wordtest = pair[1]
                                    if len(wordTreeMap) > 0:
                                        if len(wordTreeMap[-1][1]) > 0:
                                            i = wordTreeMap[-1][1][-1][0]
                                            j = wordTreeMap[-1][1][-1][1]
                                        else:
                                            # go back to beginning of
                                            # tree search
                                            i = 0
                                            j = 0
                                    else:
                                        i = 0  # go back to beginning
                                        j = 0
                                    break  # matched
                        elif continued:
                            # possible lack of matching up of words in
                            # previous utterance same caller and same
                            # tree// not always within same tree!!
                            errormessage = "Possible bad start for \
                            CONTINUED UTT ''"                                              + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + "\n " + utt.caller + \
                                "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print errormessage
                            if not errorLog is None:
                                errorLog.write(errormessage + "\n")
                            # raw_input()
                            if backtrack == 1:
                                backtrack += 1
                            elif backtrack == 2:
                                # i.e. we've done two loops and
                                # still haven't found it, try the other way
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = [lastTrees[-1]] + origtrees[1:]
                                # print "\nSECOND PASS(2)replacing \
                                # first one to lasttreemap's:"
                                # print words
                                # print trees
                                backtrack += 1
                                # mistranscribe = False #TODO perhaps needed
                                wordTreeMap = []
                                # switch to forward track this is
                                # the only time we want to try
                                # from the previous mapped leaf in the
                                # other tree
                                foundTreemap = False
                                for t in range(len(lastTreeMap) - 1, -1, -1):
                                    # backwards iteration through words
                                    # print lastTreeMap[t][1]
                                    if len(lastTreeMap[t][1]) > 0:
                                        # print "last treemapping of last \
                                        # caller utterance = " + \
                                        # str(lastTreeMap[t][1][-1])
                                        j = lastTreeMap[t][1][-1][1] + 1
                                        foundTreemap = True
                                        # print "found last mapping, j = " \
                                        #+ str(j)
                                        # raw_input()
                                        # break when last tree
                                        # mapped word from this caller is found
                                        break
                                    if not foundTreemap:
                                        # print "NO matched last TREEMAP found\
                                        # for previous Utt Same Speaker of " + \
                                        # str(utt.swda_filename) + " " + \
                                        # utt.caller + "." +  \
                                        # str(utt.utterance_index) + "." +\
                                        #  str(utt.subutterance_index)
                                        j = 0
                                        # for tmap in wordTreeMapList.keys():
                                        #    print tmap
                                        #    print wordTreeMapList[tmap]
                                        # raw_input()
                                i = 0  # go back to first tree
                                continue
                            elif forwardtrack == 1:
                                forwardtrack += 1
                            elif forwardtrack == 2:
                                count = count - (count - lastIndexes[-1][0])
                                utt = previousUttSame
                                words = utt.text_words()
                                word = words[0]
                                mytrees = []
                                for i in range(0, len(lastTrees) - 1):
                                    mytrees.append(lastTrees[i])
                                trees = mytrees + [origtrees[0]]
                                # print "\nSECOND PASS(1)backtrack to \
                                # with new trees:"
                                # print utt.transcript_index
                                # print words
                                # print trees
                                forwardtrack += 1
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            elif forwardtrack == 3 or backtrack == 3:
                                # if this hasn't worked reset to old trees
                                # print "trying final reset"
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = origtrees
                                forwardtrack = 0
                                backtrack = 0
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            else:
                                pass
                                # print "resetting search"
                                # raw_input()
                            # unless forward tracking now,
                            # just go back to beginning
                            i = 0  # go back to beginning of tree search
                            j = 0
                        else:
                            mistranscribe = False
                            LeafIndices = []
                            wordTreeMap.append((word, LeafIndices))
                            errormessage = "WARNING: 440 no/partial tree \
                            mapping for ''"                                            + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + " \n" + utt.caller\
                                + "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print utt.text_words()
                            del words[0]  # remove word
                            # for trip in wordTreeMap:
                            #    print "t",trip
                            if len(words) > 0:
                                word = words[0]
                                wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "",
                                                  word)
                                wordtest = wordtest.replace("(", "")
                                wordtest = wordtest.replace(")", "")
                            # print errormessage
                            if errorLog:
                                errorLog.write("possible wrong tree mapping:" +
                                               errormessage + "\n")
                            raw_input()
                # end of while loop (words)
                mytreenumbers = []
                for treemap in trees:
                    # the whole list but the tree
                    mytreenumbers.append(treemap[:-1])
                if not len(utt.text_words()) == len(wordTreeMap):
                    print "ERROR. uneven lengths!"
                    print utt.text_words()
                    print wordTreeMap
                    print trans.swda_filename
                    print utt.transcript_index
                    raw_input()
                    count += 1
                    continue
                # add the treemap
                wordTreeMapList.append(trans.conversation_no,
                                       utt.transcript_index,
                                       tuple(mytreenumbers),
                                       tuple(wordTreeMap))
                count += 1
            # rewrite after each transcript
            filedict = defaultdict(str)
            for key in wordTreeMapList.keys():
                csv_string = '"' + str(list(wordTreeMapList[key])) + '"'
                mytreenumbers = wordTreeMapList[key].transcript_numbers
                myptbnumbers = wordTreeMapList[key].treebank_numbers
                tree_list_string = '"'
                for i in range(0, len(mytreenumbers)):
                    treemap = [myptbnumbers[i]] + mytreenumbers[i]
                    tree_list_string += str(treemap) + ";"
                tree_list_string = tree_list_string[:-1] + '"'
                filename = '"' + key[0:key.rfind(':')] + '"'
                transindex = key[key.rfind(':') + 1:]
                filedict[int(transindex)] = filename \
                    + "\t" + transindex + '\t' + csv_string + "\t" \
                    + tree_list_string + "\n"
            for key in sorted(filedict.keys()):
                corpus_file.write(filedict[key])

            wordTreeMapList = TreeMapCorpus(False, errorLog)  # reset each time
        print "\n" + str(incorrectTrees) + " incorrect trees"
        corpus_file.close()
        if not errorLog is None:
            errorLog.close()
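For orientation, each wordTreeMap entry built above pairs a surface token with the (tree index, leaf index) positions it was matched to, while annotation tokens map to an empty list. A toy illustration of the shape (invented values, not corpus data):

word_tree_map = [
    ('{F', []),          # annotation marker: no tree equivalent
    ('uh,', [(0, 0)]),   # matched to leaf 0 of the utterance's first tree
    ('okay', [(0, 1)]),  # matched to leaf 1
    ('/', []),           # slash-unit boundary: no leaf
]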
Example No. 9
remove_file(data_dir, dev_set_file, utterance_only_flag)

# Create a temporary directory and unzip the archived data
with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir:
    print('Created temporary directory', tmp_dir)

    zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'swda_archive.zip'),
                               'r')
    zip_file.extractall(tmp_dir)
    zip_file.close()

    # Corpus object for iterating over the whole corpus in .csv format
    corpus = CorpusReader(tmp_dir)

    # Process each transcript
    for transcript in corpus.iter_transcripts(display_progress=False):

        # Process the utterances and create a dialogue object
        dialogue = process_transcript(transcript, excluded_tags,
                                      excluded_chars)

        # Append all utterances to full_set text file
        dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                         utterance_only_flag, 'a+')

        # Determine which set this dialogue belongs to (training, test or validation)
        set_dir = ''
        set_file = ''
        if dialogue.conversation_id in train_split:
            set_dir = 'train'
            set_file = train_set_file
Example No. 10
import argparse
import json
import re

from swda import CorpusReader


def main():
    cmdline_parser = argparse.ArgumentParser(description=__doc__)
    cmdline_parser.add_argument('--swda_basedir',
                                required=True,
                                help='SWDA basedir')
    cmdline_parser.add_argument('--model_json',
                                required=True,
                                help='output model json file')
    args = cmdline_parser.parse_args()

    all_utterances = set()
    corpus = CorpusReader(args.swda_basedir)
    for trans in corpus.iter_transcripts(display_progress=False):
        list_utterance = []
        for utt in trans.utterances:
            tokens = utt.pos_lemmas(wn_lemmatize=False)
            list_word = []
            for token in tokens:
                # skip punctuation by checking the POS tag.
                if not re.match(r'^[a-zA-Z]', token[1]):
                    continue
                list_word.append(token[0].lower())
            if not list_word:
                # ignore empty utterance
                continue
            utterance = ' '.join(list_word)
            if len(utterance) > 140:
                # Amazon has a limit of 140 characters for slot values
                continue
            list_utterance.append(utterance)
        all_utterances |= set(list_utterance)
        # only keep first 1000 unique utterances
        if len(all_utterances) > 1000:
            break
    print('\nextracted {} unique utterances'.format(len(all_utterances)))

    language_model = {
        'invocationName':
        'lab two',
        'intents': [{
            'name': 'ConverseIntent',
            'slots': [{
                'name': 'Text',
                'type': 'TEXT'
            }],
            'samples': ['{Text}']
        }],
        'types': [{
            'name':
            'TEXT',
            'values': [{
                'name': {
                    'value': utt
                }
            } for utt in all_utterances]
        }]
    }

    interaction_model = {'interactionModel': {'languageModel': language_model}}

    with open(args.model_json, 'w') as f:
        json.dump(interaction_model, f, indent=2)
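The listing ends inside main(); presumably the original script closes with the usual entry-point guard (sketch, not shown in the source):

if __name__ == '__main__':
    main()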