def get_swda_utterances(swda_dir):
    """Yield ``(label, tokens)`` pairs for the SwDA corpus in *swda_dir*.

    Each utterance text is lower-cased, stripped of transcription markup and
    tokenized with ``word_tokenize``.  An utterance whose DAMSL tag is ``"+"``
    is a continuation: its tokens are appended to the most recent utterance
    of the same caller instead of being emitted on their own.

    The label has the form ``"<damsl_tag>/<conversation_no>_<caller>_<n>"``
    where ``n`` counts the non-continuation utterances seen so far across the
    whole corpus.  Results are yielded transcript by transcript, ordered by
    ``n``; utterances that end up with no tokens are dropped.
    """
    # Compiled once: both patterns are applied to every utterance.
    markup = re.compile(r'\(|\)|-|\{.+? |\}|\[|\]|\+|#|/|<.+?>|,')
    angle_brackets = re.compile("<|>")

    corpus = CorpusReader(swda_dir)
    c = 0
    for trans in corpus.iter_transcripts(display_progress=True):
        last_utterances = {"A": [], "B": []}
        for utt in trans.utterances:
            utt_temp = markup.sub("", utt.text.lower())
            utt_tokens = word_tokenize(angle_brackets.sub("", utt_temp))
            tag = utt.damsl_act_tag()
            pending = last_utterances[utt.caller]
            if tag != "+":
                label = tag + "/%s_%s_%s" % (utt.conversation_no,
                                             utt.caller, c)
                pending.append((c, label, utt_tokens))
                c += 1
            elif pending:
                # Merge the continuation into the caller's last utterance.
                prev = pending.pop()
                pending.append((prev[0], prev[1], prev[2] + utt_tokens))
            else:
                # RW: for some reason, Chris Potts' Corpus Reader gives us
                # utterances with a "+" tag although there is no previous
                # utterance of the same speaker to complete.  Looking at the
                # original data, there seems to be a bug in his Corpus Reader
                # that skips some stuff in the beginning (e.g. the beginning
                # of conv. no 3554).  Report the conversation and move on.
                print(utt.conversation_no)

        utterances = last_utterances["A"] + last_utterances["B"]
        utterances.sort(key=lambda t: t[0])
        for tpl in utterances:
            if tpl[2]:
                yield tpl[1:]
def get_dialog_acts(dset_root):
    """Count how often each dialogue-act tag occurs in a SwDA corpus.

    Args:
        dset_root: corpus directory handed to ``CorpusReader``.

    Returns:
        A ``Counter`` mapping every ``utt.act_tag`` to its number of
        occurrences over all utterances.
    """
    cr = CorpusReader(dset_root)
    return Counter(utt.act_tag for utt in cr.iter_utterances())
def tag_counts():
    """Gather and print counts of the tags.

    Reads the corpus in the ``swda`` directory, counts ``utt.act_tag`` over
    all utterances and prints one ``"<tag> <count>"`` line per tag, most
    frequent first.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest.  The formatted
    # single-argument print gives the same output under Python 2 and 3.
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print("%s %s" % (key, val))
def count_matches():
    """Determine how many utterances have a single precisely matching tree.

    Only utterances with exactly one tree are considered.  Prints the number
    of perfect matches and their share of the considered utterances as a
    percentage (0.0 when no utterance qualifies).
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else:
                d['mismatch'] += 1
    total = sum(d.values())
    # Guard the division (an empty selection used to raise
    # ZeroDivisionError) and scale to a real percentage, which is what the
    # message claims to report.
    percent = 100.0 * d['match'] / total if total else 0.0
    print("match: %s (%s percent)" % (d['match'], percent))
def Atag():
    """Assign a 1-based integer id to every DAMSL act tag in the corpus.

    Reads the corpus in the ``swda`` directory, prints the resulting mapping
    and returns it.

    Returns:
        defaultdict(int): tag -> id.  Ids follow the dict's iteration order;
        looking up an unknown tag yields 0, because the mapping is a
        ``defaultdict(int)``.
    """
    corpus = CorpusReader('swda')
    actTag = defaultdict(int)
    for utt in corpus.iter_utterances(display_progress=True):
        # The original wrote ``actTag[tag] +1``, an expression whose value was
        # thrown away; its only effect was to register the tag.  Do that
        # explicitly.
        actTag.setdefault(utt.damsl_act_tag(), 0)
    for i, key in enumerate(list(actTag), 1):
        actTag[key] = i
    print(actTag)
    return actTag
def preprocess_data():
    """Count DAMSL act tags, save the counts and return a tag -> rank map.

    Counts ``utt.damsl_act_tag()`` over the corpus at ``swda_path`` and
    writes one ``"<tag> <count>"`` line per tag to ``tags_file``, most
    frequent first.

    Returns:
        dict: each tag mapped to its 0-based position in that ranking.
    """
    counts = defaultdict(int)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        counts[utt.damsl_act_tag()] += 1
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    # The context manager closes the file even if a write fails.
    with open(tags_file, 'w') as f:
        for tag, count in ranked:
            f.write('%s %d\n' % (tag, count))
    return {tag: rank for rank, (tag, _) in enumerate(ranked)}
def swda_education_region():
    """Create a count dictionary relating education and region.

    For every transcript of the corpus in ``swda`` both callers contribute
    one count to their ``(education, dialect_area)`` pair.  The pairs are
    printed with their counts, most frequent first.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Iterate through the transcripts; display_progress=True tracks progress:
    for trans in corpus.iter_transcripts(display_progress=True):
        d[(trans.from_caller_education, trans.from_caller_dialect_area)] += 1
        d[(trans.to_caller_education, trans.to_caller_dialect_area)] += 1
    # Sort the (key, count) pairs on the count, largest first, and print
    # them.  The formatted single-argument print gives the same output under
    # Python 2 and 3.
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print("%s %s" % (key, val))
def act_tags_and_text():
    """
    Create a CSV file named swda-acttags-and-text.csv in which each
    utterance utt has its own row consisting of

      utt.damsl_act_tag(), and clean-text utterance

    The clean text is the space-joined result of
    utt.text_words(filter_disfluency=True).  This data can be used for
    training a speechAct classifier.
    """
    corpus = CorpusReader('swda')
    # The context manager flushes and closes the file; the original handed an
    # anonymous open() to csv.writer and never closed it.
    with open('swda-acttags-and-text.csv', 'wt') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['DamslActTag', 'Text'])
        for utt in corpus.iter_utterances(display_progress=True):
            clean_words = utt.text_words(filter_disfluency=True)
            csvwriter.writerow([utt.damsl_act_tag(), " ".join(clean_words)])
def process_data(tags):
    """Turn every corpus utterance into a list of word vectors plus a label.

    Words listed in ``except_words`` are dropped and the rest lower-cased.
    Every distinct word gets one ``random_vector(vector_size)``, created the
    first time the word is seen.

    Args:
        tags: mapping from DAMSL act tag to label.

    Returns:
        tuple: ``(x, y)`` where ``x[i]`` is the vector list of utterance
        ``i`` and ``y[i]`` its label.
    """
    embedding = {}
    features, labels = [], []
    for utt in CorpusReader(swda_path).iter_utterances():
        tokens = []
        for raw in utt.pos_words():
            if raw in except_words:
                continue
            tokens.append(raw.lower())
        vectors = []
        for token in tokens:
            if token not in embedding:
                embedding[token] = random_vector(vector_size)
            vectors.append(embedding[token])
        label = tags[utt.damsl_act_tag()]
        features.append(vectors)
        labels.append(label)
    return (features, labels)
def process_data(tags):
    """Build per-utterance random word vectors and their act-tag labels.

    Args:
        tags: mapping from DAMSL act tag to label.

    Returns:
        tuple: ``(x, y)``, parallel lists with one entry per utterance of
        the corpus at ``swda_path``.
    """
    vectors_by_word = {}

    def vector_for(word):
        # One vector per distinct word, created on first use.
        if word not in vectors_by_word:
            vectors_by_word[word] = random_vector(vector_size)
        return vectors_by_word[word]

    x, y = [], []
    reader = CorpusReader(swda_path)
    for utt in reader.iter_utterances():
        kept = [w for w in utt.pos_words() if w not in except_words]
        lowered = [w.lower() for w in kept]
        encoded = [vector_for(w) for w in lowered]
        y_value = tags[utt.damsl_act_tag()]
        x.append(encoded)
        y.append(y_value)
    return (x, y)
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in which each
    utterance utt has its own row consisting of just

      utt.act_tag, utt.damsl_act_tag(), and utt.trees[0].node

    restricting attention to cases in which utt.tree_is_perfect_match()
    holds.
    """
    corpus = CorpusReader('swda')
    # The context manager flushes and closes the file; the original handed an
    # anonymous open() to csv.writer and never closed it.
    with open('swda-actags-and-rootlabels.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
        for utt in corpus.iter_utterances(display_progress=True):
            if utt.tree_is_perfect_match():
                csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(),
                                    utt.trees[0].node])
class swda_reader(object):
    """Groups SwDA utterances into two example sets by DAMSL act tag."""

    def __init__(self):
        self.corpus = CorpusReader('swda')

    def transcript_reader(self):
        """Collect utterance groups ending in selected act tags.

        Within a transcript, utterances are accumulated until one whose tag
        is not ``'+'`` arrives.  That group goes to the negative set when
        the closing tag is ``'sv'`` or ``'sd'``, to the positive set when it
        is ``'qy'``, ``'qw'`` or ``'qh'``, and is discarded otherwise.

        Returns:
            tuple: ``(pos_data, neg_data)``, two lists of utterance lists.
        """
        negative_tags = ('sv', 'sd')
        positive_tags = ('qy', 'qw', 'qh')

        pos_data = []
        neg_data = []
        for trans in self.corpus.iter_transcripts():
            group = []
            for utt in trans.utterances:
                tag = utt.damsl_act_tag()
                group.append(utt)
                if tag == '+':
                    # Continuation: keep collecting.
                    continue
                if tag in negative_tags:
                    neg_data.append(group)
                elif tag in positive_tags:
                    pos_data.append(group)
                group = []
        return pos_data, neg_data
def preprocess():
    """Return stemmed, stopword-filtered token lists for every utterance.

    Each utterance text of the corpus in ``swda`` is lower-cased and split
    on whitespace.  The characters ``? . , -`` are removed from every word;
    words found in ``corpus/stopword`` are dropped and the rest are stemmed
    with ``PorterStemmer``.  Finally, tokens occurring at most 10 times in
    the whole corpus are removed.

    Returns:
        list: one list of tokens per utterance, in corpus order.
    """
    stemmer = PorterStemmer()
    corpus = CorpusReader('swda')
    # Close the stopword file deterministically instead of leaking it.
    with open("corpus/stopword", 'r') as stopword_file:
        stoplist = set(line.strip() for line in stopword_file)

    def strip_punctuation(word):
        # Portable stand-in for the Python-2-only ``word.translate(None, "?.,-")``.
        for char in "?.,-":
            word = word.replace(char, "")
        return word

    texts = []
    for utt in corpus.iter_utterances(display_progress=True):
        tokens = []
        for word in utt.text.lower().split():
            bare = strip_punctuation(word)
            if bare not in stoplist:
                tokens.append(stemmer.stem(bare.strip()))
        texts.append(tokens)

    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    return [[token for token in text if frequency[token] > 10]
            for text in texts]
def load_dataset_OLD():
    """Load SwDA utterances grouped by the splits defined in ``SwDA``.

    Every utterance becomes a dict with keys ``basename``, ``words``
    (space-joined ``utt.pos_words()``) and ``label`` (DAMSL act tag).
    Utterances without words are skipped.  An utterance is added to every
    split whose entry in ``SwDA`` contains its basename.  Summary counts are
    printed.

    Returns:
        defaultdict(list): split name -> list of utterance dicts.
    """
    corpus = CorpusReader('swda')
    data = defaultdict(list)
    not_found_set = set()
    skipp_count = 0
    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }
        if len(d["words"]) < 1:
            skipp_count += 1
            continue
        not_found = True
        for splitname in SwDA:
            if d["basename"] in SwDA[splitname]:
                not_found = False
                data[splitname].append(d)
        if not_found:
            not_found_set.add(d["basename"])
    print("not found count:", len(not_found_set))
    print("skipp count:", skipp_count)
    print("label counts:")
    for k, v in data.items():
        print("\t{} size:".format(k), len(v))
    return data
def load_dataset():
    """Load every non-empty SwDA utterance as a small record.

    Returns:
        list: dicts with keys ``basename``, ``words`` (space-joined
        ``utt.pos_words()``) and ``label`` (DAMSL act tag), in corpus order.
        The number of skipped, wordless utterances is printed.
    """
    records = []
    skipped = 0
    reader = CorpusReader('swda')
    for utt in reader.iter_utterances(display_progress=False):
        record = dict(
            basename=get_basename(utt),
            words=" ".join(utt.pos_words()),
            label=utt.damsl_act_tag(),
        )
        if record["words"]:
            records.append(record)
        else:
            skipped += 1
    print("skipp count:", skipped)
    return records
def process_data(model, tags):
    """Convert corpus utterances to word-vector lists and labels.

    Args:
        model: mapping from word to a vector object offering ``tolist()``.
        tags: mapping from DAMSL act tag to label.

    Returns:
        tuple: ``(x, y)`` where ``x[i]`` holds the vectors of the words of
        utterance ``i`` that are present in ``model`` and ``y[i]`` is the
        utterance's label.  Words absent from ``model`` are left out and
        listed in a printed report.
    """
    x = []
    y = []
    model_cache = {}  # word -> vector converted with tolist(), done once
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        for word in wordlist:
            if word in model:
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    # Single-argument print calls give the same output under Python 2 and 3.
    print('Complete. The following words are not converted: ')
    print(list(non_modeled))
    return (x, y)
def load_swda_data():
    """Return the SwDA conversations and labels, using a pickle cache.

    When ``../helper_files/swda_data.pkl`` exists it is loaded.  Otherwise
    every transcript is run through ``process_transcript_txt`` (excluding
    the tags ``x`` and ``+``) and the result is pickled to that path.

    Returns:
        tuple: ``(conversations, labels)``.
    """
    if os.path.exists("../helper_files/swda_data.pkl"):
        with open("../helper_files/swda_data.pkl", "rb") as cache:
            conversations, labels = pickle.load(cache)
        return conversations, labels

    reader = CorpusReader("../data/switchboard-corpus/swda")
    skip_tags = ["x", "+"]
    conversations, labels = [], []
    print("Loading swda transcripts, this might take a while")
    for transcript in reader.iter_transcripts():
        processed = process_transcript_txt(transcript, skip_tags)
        utterances, utterance_labels = processed
        conversations.append(utterances)
        labels.append(utterance_labels)

    with open("../helper_files/swda_data.pkl", "wb") as cache:
        pickle.dump((conversations, labels), cache)
    return conversations, labels
def load_swda_corpus_data(swda_directory):
    """Load the SwDA corpus into a plain dict structure.

    Args:
        swda_directory: corpus directory handed to ``CorpusReader``.

    Returns:
        dict: the partition source URL, the train/test/dev id lists and a
        ``conversations`` list.  Each conversation carries its name
        (``'sw'`` + conversation number), its partition name and its
        utterances (text, act tag, DAMSL act tag and caller).
    """
    print('Loading SwDA Corpus...')
    reader = CorpusReader(swda_directory)

    def describe(utterance):
        # Plain-dict view of one utterance.
        return {
            "text": " ".join(utterance.text_words(filter_disfluency=True)),
            "act_tag": utterance.act_tag,
            "damsl_act_tag": utterance.damsl_act_tag(),
            "caller": utterance.caller,
        }

    conversations = []
    for transcript in reader.iter_transcripts(display_progress=False):
        name = 'sw' + str(transcript.conversation_no)
        partition = get_partition(name)
        conversations.append({
            "name": name,
            "utterances": [describe(u) for u in transcript.utterances],
            "partition_name": partition,
        })

    return {
        "partition_source": "https://github.com/Franck-Dernoncourt/naacl2016",
        "train_ids": list(train_set_idx),
        "test_ids": list(test_set_idx),
        "dev_set_ids": list(valid_set_idx),
        "conversations": conversations,
    }
from swda import CorpusReader
from utilities import *
import nltk

nltk.download('averaged_perceptron_tagger')

# Which batch to process: train, test, val or dev
batch_name = 'dev'
resource_dir = 'data/'
file_path = "{0}{1}_text.txt".format(resource_dir, batch_name)

corpus = CorpusReader('switchboard_data/')

# Dialogue act tags left out of the generated text file
excluded_tags = ['x', '+']

# Convert the switchboard csv's of this batch to a text file
process_batch_to_txt_file(corpus, resource_dir, batch_name,
                          excluded_tags=excluded_tags)

print("Processing file: ", file_path)
text_data = read_file(file_path)

# Every line is "<sentence>|<label>": collect the two columns separately
sentences = []
labels = []
for line in text_data:
    fields = line.split("|")
    sentences.append(fields[0])
    labels.append(fields[1])

# Generate tokenised utterances
utterances = []
import nltk, datetime, matplotlib, math, random, copy, sys, os
import dill as pickle
from collections import defaultdict, Counter
from swda import CorpusReader
import numpy as np
import scipy as sc
import scipy.spatial.distance as dis
import Levenshtein as LD
from scipy import stats
from multiprocessing import Pool

# Corpus location and the caller metadata table that ships with it.
corpus = CorpusReader('./data/dialogue_corpora/swb/SwDA/swda')
caller_metafile = './data/dialogue_corpora/swb/SwDA/swda/call_con_tab.csv'

# Word list ACW
ACW = [
    'alright',
    'gotcha',
    'huh',
    'mm-hm',
    'okay',
    'right',
    'uh-huh',
    'yeah',
    'yep',
    'yes',
    'yup',
]
# Word list FP
FP = ['uh', 'um', 'mm']

_proc = 1
_chunksize = 10000


def CalcSimWrap(_):
    """Return ``(Cos(_[1], _[2]), _[0])`` for one indexable work item."""
    left = _[1]
    right = _[2]
    similarity = Cos(left, right)
    return (similarity, _[0])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
svakulenko
12 Feb 2017

Generate CSV for process mining the conversations
'''
import csv
from swda import CorpusReader

corpus = CorpusReader('swda')


def main():
    """Write one ``conversation_no;caller;act_tag`` row per utterance.

    The rows go to ``swda.csv`` in the current directory, transcript by
    transcript, using ``;`` as the delimiter.
    """
    with open('swda.csv', 'wb') as out:
        writer = csv.writer(out, delimiter=';')
        for transcript in corpus.iter_transcripts():
            for message in transcript.utterances:
                row = [
                    str(transcript.conversation_no),
                    message.caller,
                    message.act_tag,
                ]
                writer.writerow(row)


if __name__ == '__main__':
    main()
def main():
    """Extract utterances from SwDA and write an interaction model JSON.

    Command line arguments:
        --swda_basedir: corpus directory handed to ``CorpusReader``.
        --model_json:   path of the JSON file to write.

    Every utterance is reduced to the lower-cased words whose POS tag starts
    with a letter.  Empty utterances and utterances longer than 140
    characters are dropped.  The unique utterances become the values of the
    ``TEXT`` slot type of the generated model.
    """
    cmdline_parser = argparse.ArgumentParser(description=__doc__)
    cmdline_parser.add_argument('--swda_basedir',
                                required=True,
                                help='SWDA basedir')
    cmdline_parser.add_argument('--model_json',
                                required=True,
                                help='output model json file')
    args = cmdline_parser.parse_args()

    all_utterances = set()
    corpus = CorpusReader(args.swda_basedir)
    for trans in corpus.iter_transcripts(display_progress=False):
        list_utterance = []
        for utt in trans.utterances:
            tokens = utt.pos_lemmas(wn_lemmatize=False)
            # skip punctuation by checking the POS tag.
            list_word = [token[0].lower() for token in tokens
                         if re.match(r'^[a-zA-Z]', token[1])]
            if not list_word:
                # ignore empty utterance
                continue
            utterance = ' '.join(list_word)
            if len(utterance) > 140:
                # Amazon has a limit of 140 character for slot values
                continue
            list_utterance.append(utterance)
        all_utterances.update(list_utterance)
        # Stop reading once more than 1000 unique utterances are collected.
        # The check runs after a whole transcript has been merged, so the
        # final set can hold more than 1000 entries.
        if len(all_utterances) > 1000:
            break

    print('\nextracted {} unique utterances'.format(len(all_utterances)))

    language_model = {
        'invocationName': 'lab two',
        'intents': [{
            'name': 'ConverseIntent',
            'slots': [{
                'name': 'Text',
                'type': 'TEXT'
            }],
            'samples': ['{Text}']
        }],
        'types': [{
            'name': 'TEXT',
            'values': [{
                'name': {
                    'value': utt
                }
            } for utt in all_utterances]
        }]
    }
    interaction_model = {'interactionModel': {'languageModel': language_model}}

    # Close the output file deterministically; the original passed an
    # anonymous open() to json.dump and never closed it.
    with open(args.model_json, 'w') as model_file:
        json.dump(interaction_model, model_file, indent=2)
# write_to_file (word -> POS index maps)
#
# Purpose:
#   Iterates over the transcripts of CorpusReader(corpus_path, metadata_path)
#   and, for every transcript that has POS annotation (trans.has_pos()) but
#   NO trees (trans.has_trees() is false), aligns each word returned by
#   utt.text_words() with indices into utt.regularize_pos_lemmas().  The
#   alignments are written to
#   "<target_folder_path>/POS_map_<folder>.csv.text", where <folder> is the
#   last directory component of trans.swda_filename; a new file is opened
#   whenever that folder changes.
#
# Parameters:
#   corpus_path, metadata_path: passed unchanged to CorpusReader.
#   target_folder_path: directory that receives the map files.
#   ranges: when truthy, only transcripts whose conversation_no is in it are
#       processed.
#   errorLog: when truthy, a path that is opened for writing and receives
#       "possible wrong POS : ..." lines; it is closed at the end.
#
# Output line format (one per utterance):
#   "<conversation_no>"<TAB><transcript_index><TAB>"<str(list of (word, [POS indices]))>"
#
# Matching outline, as far as the code shows:
#   - Annotation-like tokens (those starting with "{", "}" or "-", containing
#     markers such as "<", ">", "*", "[", "+", "#", "=", or empty once
#     punctuation is removed) are recorded with an empty index list.
#     possibleComment keeps following tokens in that state until a closing
#     marker is seen.
#   - Other words are compared with the POS entries from position j: first
#     for an exact match, then as a substring that may be completed by
#     concatenating the next entries (range(j, j + 3)).
#   - An unmatched word is retried once (mistranscribe flag) after being
#     replaced via possibleMistranscription, losing a trailing "-" and any
#     apostrophes.  If it still fails it is recorded with the indices
#     gathered so far and reported to errorLog.
#
# NOTE(review): Python 2 code (print statements, raw_input()).  raw_input()
#   blocks for keyboard input when the map length differs from
#   len(utt.text_words()).
# NOTE(review): corpus_file is still None at the final corpus_file.close()
#   if no transcript passes the filters -- presumably an AttributeError;
#   confirm.
# NOTE(review): this file region is whitespace-collapsed in the source at
#   hand.  Nesting and the backslash-continued string literals cannot be
#   recovered with certainty, so the code below is left byte-identical.
def write_to_file(self, corpus_path, metadata_path, target_folder_path, ranges, errorLog): """Writes files to a target folder with the mappings from words in utterances to corresponding POS tags. """ if errorLog: errorLog = open(errorLog, 'w') corpus = CorpusReader(corpus_path, metadata_path) folder = None corpus_file = None for trans in corpus.iter_transcripts(): # print "iterating",trans.conversation_no if not trans.has_pos(): continue # print "has pos" if ranges and not trans.conversation_no in ranges: continue # print "in range" # just look at transcripts WITHOUT trees as compliment to the # above models if trans.has_trees(): continue end = trans.swda_filename.rfind("/") start = trans.swda_filename.rfind("/", 0, end) c_folder = trans.swda_filename[start + 1:end] if c_folder != folder: # for now splitting the maps by folder folder = c_folder if corpus_file: corpus_file.close() corpus_file = open( target_folder_path + "/POS_map_{0}.csv.text".format(folder), 'w') wordPOSMapList = POSMapCorpus(False, errorLog) print "new map for folder", folder translist = trans.utterances translength = len(translist) count = 0 # iterating through transcript utterance by utterance while count < translength: utt = trans.utterances[count] words = utt.text_words() wordPOSMap = [] if len(utt.pos) == 0: # no POS wordPOSMap.append((utt, [])) # just dummy value wordPOSMapList.append(trans.conversation_no, utt.transcript_index, list(wordPOSMap)) errormessage = "WARNING: NO POS for file/utt: " +\ str(utt.swda_filename) + " " + utt.caller + "." + \ str(utt.utterance_index) + "." 
+ \ str(utt.subutterance_index) + " " + utt.text # print errormessage # raw_input() else: # indices for which POS we're at j = 0 possibleComment = False # can have comments, flag mistranscribe = False word = words[0] # loop until no more words left to be matched in utterance while len(words) > 0: word = words[0] # print "top WORD:" + word if not mistranscribe: wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.replace("(", "").\ replace(")", "").replace("/", "") match = False POSIndices = [] if (possibleComment or word[0:1] in [ "{", "}", "-" ] or word in [ "/", ".", ",", "]" ] or wordtest == "" or any([ x in word for x in ["<", ">", "*", "[", "+", "]]", "...", "#", "="] ])): # no tree equivalent for {D } type annotations if (word[0:1] == "-" or any([x in word for x in ["*", "<<", "<+", "[[", "<"]])) \ and not possibleComment: possibleComment = True if possibleComment: # print "match COMMENT!:" + word # raw_input() POSIndices = [] match = True if (any([x in word for x in [">>", "]]", "))", ">"]]) or word[0] == "-") \ and not word == "->": # turn off comment possibleComment = False if (">>" in word or "]]" in word or "))" in word or ">" in word and not word == "->"): # turn off comment possibleComment = False #del words[0] wordPOSMap.append((word, POSIndices)) POSIndices = [] match = True # print "match annotation!:" + word del words[0] # word is consumed if len(words) > 0: word = words[0] wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.replace("(", "") wordtest = wordtest.replace(")", "") else: break continue # carry on to next word else: myPOS = utt.regularize_pos_lemmas() while j < len(myPOS): pos = myPOS[j][0] # pair of (word,POS) # print "j number of pos : " + str(len(myPOS)) # print "j loop word : " + word # print "j loop wordtest : " + wordtest # print "j pos : " + str(j) + " " + str(pos) # raw_input() breaker = False if wordtest == pos or word == pos: # exact match POSIndices.append(j) wordPOSMap.append((word, 
POSIndices)) # print "match!:" + word + " in file/utt: "\ # + str(utt.swda_filename) + \ # str(utt.transcript_index)) del words[0] # word is consumed if len(words) > 0: word = words[0] # next word wordtest = re.sub( r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.replace("(", "").\ replace(")", "").replace("/", "") POSIndices = [] j += 1 # increment lead number match = True breaker = True # raw_input() break elif (pos in wordtest or pos in word) \ and not pos in [",", "."]: # substring relation testpos = pos POSIndices.append(j) j += 1 if wordtest[-1] == "-" and \ pos == wordtest[0:-1]: wordPOSMap.append((word, POSIndices)) del words[0] # remove word # print "match!:" + word + " in \ # file/utt: " + str(utt.swda_filename) \ #+ str(utt.transcript_index) if len(words) > 0: word = words[0] wordtest = re.sub( r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.\ replace("(", "").\ replace(")", "").\ replace("/", "") POSIndices = [] match = True breaker = True break for k in range(j, j + 3): if (k >= len(myPOS)): breaker = True break if (testpos + myPOS[k][0]) in wordtest\ or (testpos + myPOS[k][0]) in word: testpos += myPOS[k][0] POSIndices.append(k) j += 1 # concatenation if testpos == wordtest or \ testpos == word: # matched wordPOSMap.append( (word, POSIndices)) del words[0] # remove word # print "match!:" +\ # word + " in file/utt: " + \ # str(utt.swda_filename) +\ # str(utt.transcript_index)) if len(words) > 0: word = words[0] wordtest = re.sub( r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.\ replace("(", "") wordtest = wordtest.\ replace(")", "") POSIndices = [] j = k + 1 match = True breaker = True break else: j += 1 # otherwise go on if breaker: break if match: break # could not match word! 
Could be mistransription if not match: # print "false checking other options" # print j # print word # print wordtest if not mistranscribe: mistranscribe = True for pair in possibleMistranscription: if pair[0] == wordtest: wordtest = pair[1] break # matched if wordtest[-1] == "-": # partial words wordtest = wordtest[0:-1] if "'" in wordtest: wordtest = wordtest.replace("'", "") if len(wordPOSMap) > 0: found = False for n in range( len(wordPOSMap) - 1, -1, -1): if len(wordPOSMap[n][1]) > 0: j = wordPOSMap[n][1][-1] + 1 # print j found = True break if not found: # if not possible go back to # the beginning! j = 0 else: j = 0 # print j else: mistranscribe = False wordPOSMap.append((word, POSIndices)) errormessage = "WARNING: no/partial POS \ mapping for ''" + words[0] + "'' in file/utt:"\ + str(utt.swda_filename) + "-" + \ str(utt.transcript_index) + \ "POSSIBLE COMMENT = " + \ str(possibleComment) del words[0] # remove word if len(words) > 0: word = words[0] wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "", word) wordtest = wordtest.replace("(", "").\ replace(")", "").replace("/", "") # print errormessage if errorLog: errorLog.write("possible wrong POS : " + errormessage + "\n") # raw_input() # end of while loop (words) if not len(wordPOSMap) == len(utt.text_words()): print "Error " print "Length mismatch in file/utt: " + \ str(utt.swda_filename) + str(utt.transcript_index) print utt.text_words() print wordPOSMap raw_input() wordPOSMapList.append(trans.conversation_no, str(utt.transcript_index), list(wordPOSMap)) # print "\nadded POSmap " + str(trans.swda_filename) + \ #"." + str(utt.transcript_index) + "\n" csv_string = '"' + str(wordPOSMap) + '"' corpus_file.write('"' + str(utt.conversation_no) + '"\t' + str(utt.transcript_index) + '\t' + csv_string + "\n") count += 1 corpus_file.close() if errorLog: errorLog.close()
# write_to_file (word -> tree leaf maps)
#
# Purpose:
#   Iterates over the transcripts of CorpusReader(corpus_path, metadata_path)
#   and, for every transcript that has POS annotation (trans.has_pos()) AND
#   trees (trans.has_trees()), aligns each word returned by utt.text_words()
#   with (tree index, leaf index) pairs into the utterance's trees.  The
#   alignments are written to
#   "<target_folder_path>/Tree_map_<folder>.csv.text", where <folder> is the
#   last directory component of trans.swda_filename; a new file is opened
#   whenever that folder changes.
#
# Parameters:
#   corpus_path, metadata_path: passed unchanged to CorpusReader.
#   target_folder_path: directory that receives the map files.
#   ranges: when truthy, only transcripts whose conversation_no is in it are
#       processed.
#   errorLog: when truthy, a path that is opened for writing and receives the
#       mapping warnings; it is closed at the end.
#
# Output line format (built from the keys of the TreeMapCorpus and written
# sorted by integer transcript index):
#   "<key up to its last ':'>"<TAB><transcript index><TAB>
#   "<str(list of (word, [(tree, leaf), ...]))>"<TAB>
#   "<[ptb number] + transcript numbers, joined by ';'>"
#
# Matching outline, as far as the code shows:
#   - Utterances without trees, or with DAMSL tag "x", are skipped.
#   - Each tree is tracked as a tuple
#     (ptb_treenumber, utterance count, tree position, tree).
#   - When the utterance continues an earlier one by the same speaker (tag
#     "+", or sharing its first PTB tree number with the last tree of the
#     earlier map), the search starts after the last leaf mapped there.  The
#     backtrack / forwardtrack counters drive retries with alternative tree
#     lists when the PTB numbers disagree.
#   - Annotation-like tokens are recorded with an empty leaf list.
#     possibleComment keeps following tokens in that state until a closing
#     marker is seen.
#   - Other words are compared with tree.leaves() from position (i, j):
#     first for an exact match, then as a substring that may be completed by
#     concatenating the next leaves (range(j, j + 3)).
#   - An unmatched word is retried once (mistranscribe flag) using
#     possibleMistranscription.  If it still fails it is recorded with an
#     empty leaf list and reported to errorLog.
#   - A map whose length differs from len(utt.text_words()) is printed and
#     not added.
#
# NOTE(review): Python 2 code (print statements, raw_input()).  raw_input()
#   blocks for keyboard input; one call follows the "possible wrong tree
#   mapping" log write -- whether it is guarded by `if errorLog` cannot be
#   told from the collapsed text; confirm.
# NOTE(review): incorrectTrees is set to 0 and never changed in the visible
#   code, so the closing "incorrect trees" message always reports 0.
# NOTE(review): corpus_file is still None at the final corpus_file.close()
#   if no transcript passes the filters -- presumably an AttributeError;
#   confirm.
# NOTE(review): this file region is whitespace-collapsed in the source at
#   hand.  Nesting and the backslash-continued string literals cannot be
#   recovered with certainty, so the code below is left byte-identical.
def write_to_file(self, corpus_path, metadata_path, target_folder_path, ranges, errorLog): """Writes files to a target folder with the mappings from words in utterances to tree nodes in trees. """ if errorLog: errorLog = open(errorLog, 'w') corpus = CorpusReader(corpus_path, metadata_path) # Iterate through all transcripts incorrectTrees = 0 folder = None corpus_file = None for trans in corpus.iter_transcripts(): # print "iterating",trans.conversation_no if not trans.has_pos(): continue # print "has pos" if ranges and not trans.conversation_no in ranges: continue # print "in range" # just look at transcripts WITH trees as compliment to the # below models if not trans.has_trees(): continue end = trans.swda_filename.rfind("/") start = trans.swda_filename.rfind("/", 0, end) c_folder = trans.swda_filename[start + 1:end] if c_folder != folder: # for now splitting the maps by folder folder = c_folder if corpus_file: corpus_file.close() corpus_file = open( target_folder_path + "/Tree_map_{0}.csv.text".format(folder), 'w') wordTreeMapList = TreeMapCorpus(False, errorLog) print "new map for folder", folder translist = trans.utterances translength = len(translist) count = 0 # iterating through transcript utterance by utterance # create list of tuples i.e. map from word to the index(ices) # (possibly multiple or null) of the relevant leaf/ves # of a given tree i.e. utt.tree[0].leaves[0] would be a pair (0,0)) while count < translength: utt = trans.utterances[count] words = utt.text_words() wordTreeMap = [] # [((word), (List of LeafIndices))] forwardtrack = 0 backtrack = 0 continued = False # print "\n COUNT" + str(count) # print utt.damsl_act_tag() if len(utt.trees) == 0 or utt.damsl_act_tag() == "x": wordTreeMap.append((utt, [])) # just dummy value # errormessage = "WARNING: NO TREE for file/utt: " +\ # str(utt.swda_filename) + " " + utt.caller + "." + \ # str(utt.utterance_index) + "." 
+ \ #str(utt.subutterance_index) + " " + utt.text # print(errormessage) count += 1 continue # raw_input() # indices for which tree and leaf we're at: i = 0 # tree j = 0 # leaf # initialise pairs of trees and ptb pairs trees = [] for l in range(0, len(utt.trees)): trees.append( (utt.ptb_treenumbers[l], count, l, utt.trees[l])) # print "TREES = " # for tree in trees: # print tree origtrees = list(trees) origcount = count # overcoming the problem of previous utterances contributing # to the tree at this utterance, we need to add the words from # the previous utt add in all the words from previous utterance # with a dialogue act tag/or the same tree? # check that the last tree in the previous utterance # is the same as the previous one previousUttSame = trans.previous_utt_same_speaker(utt) # print previousUttSame lastTreeMap = None if previousUttSame: # print "search for previous full act utt # for " + str(utt.swda_filename) + # str(utt.transcript_index) lastTreeMap = wordTreeMapList.get_treemap( trans, previousUttSame) if ((not lastTreeMap) or (len(lastTreeMap) == 0) or (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])): # print "no last tree map, backwards searching" while previousUttSame and \ ((not lastTreeMap) or (len(lastTreeMap) == 0) or (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])): previousUttSame = trans.previous_utt_same_speaker( previousUttSame) # go back one more lastTreeMap = wordTreeMapList.get_treemap( trans, previousUttSame) if previousUttSame: pass # print previousUttSame.transcript_index if not lastTreeMap: pass # print "no last treemap found for:" # print utt.swda_filename # print utt.transcript_index if lastTreeMap and \ (utt.damsl_act_tag() == "+" or (len(lastTreeMap.treebank_numbers) > 0 and lastTreeMap.treebank_numbers[-1] == utt.ptb_treenumbers[0])): continued = True # might have to backtrack # now checking for wrong trees lastPTB = lastTreeMap.treebank_numbers lastIndexes = lastTreeMap.transcript_numbers lastTreesTemp = 
lastTreeMap.get_trees(trans) lastTrees = [] for i in range(0, len(lastPTB)): lastTrees.append([ lastPTB[i], lastIndexes[i][0], lastIndexes[i][1], lastTreesTemp[i] ]) if not (lastPTB[-1] == utt.ptb_treenumbers[0]): # print "not same, need to correct!" # print words # print trees # print "last one" # print previousUttSame.text_words() # print lastTrees if utt.ptb_treenumbers[0] - lastPTB[-1] > 1: # backtrack and redo the antecedent count = count - (count - lastIndexes[-1][0]) utt = previousUttSame words = utt.text_words() mytrees = [] for i in range(0, len(lastTrees) - 1): mytrees.append(lastTrees[i]) trees = mytrees + [origtrees[0]] # print "\n(1)backtrack to with new trees:" backtrack = 1 # print utt.transcript_index # print words # print trees # raw_input() # alternately, this utt's tree may be further back # than its antecdent's, rare mistake elif utt.ptb_treenumbers[0] < lastTrees[-1][0]: # continue with this utterance and trees # (if there are any), but replace its first # tree with its antecdents last one forwardtrack = 1 trees = [lastTrees[-1]] + origtrees[1:] # print "\n(2)replacing first one to lasttreemap's:" # print words # print trees # raw_input() if backtrack != 1: # we should have no match found_treemap = False # resetting # for t in wordTreeMapList.keys(): # print t # print wordTreeMapList[t] for t in range(len(lastTreeMap) - 1, -1, -1): # print lastTreeMap[t][1] # if there is a leafIndices for the # word being looked at, gets last mapped one if len(lastTreeMap[t][1]) > 0: # print "last treemapping of last # caller utterance = # " + str(lastTreeMap[t][1][-1]) j = lastTreeMap[t][1][-1][1] + 1 found_treemap = True # print "found last mapping, j -1 = " + str(j-1) # raw_input() break if not found_treemap: pass # print "NO matched last TREEMAP found for \ # previous Utt Same Speaker of " + \ # str(trans.swda_filename) + " " + \ # str(utt.transcript_index) # print lastTreeMap # for tmap in wordTreeMapList.keys(): # print tmap # print wordTreeMapList[tmap] 
# raw_input() possibleComment = False # can have comments, flag mistranscribe = False LeafIndices = [] # possibly empty list of leaf indices word = words[0] # loop until no more words left to be matched in utterance while len(words) > 0: # print "top WORD:" + word if not mistranscribe: wordtest = re.sub(r"[\.\,\?\"\!]", "", word) wordtest = wordtest.replace("(", "").replace(")", "") match = False LeafIndices = [] # possibly empty list of leaf indices if (possibleComment or word[0:1] in [ "{", "}", "-" ] or word in ["/", ".", ",", "]"] or wordtest == "" or any( [ x in word for x in ["<", ">", "*", "[", "+", "]]", "...", "#", "="] ])): # no tree equivalent for {D } type annotations if (word[0:1] == "-" or any([x in word for x in ["*", "<<", "<+", "[[", "<"]])) \ and not possibleComment: possibleComment = True if possibleComment: #print("match COMMENT!:" + word) # raw_input() LeafIndices = [] match = True #wordTreeMap.append((word, LeafIndices)) if any([x in word for x in [">>", "]]", ">"]]) or \ word[0] == "-": # turn off comment possibleComment = False #del words[0] # LeadIndices will be null here wordTreeMap.append((word, LeafIndices)) LeafIndices = [] match = True # print "match annotation!:" + word del words[0] # word is consumed, should always be one if len(words) > 0: word = words[0] wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word) wordtest = wordtest.replace("(", "") wordtest = wordtest.replace(")", "") else: break continue # carry on to next word without updating indices? 
else: while i < len(trees): # print "i number of trees :" + str(len(utt.trees)) # print "i tree number :" + str(i) # print "i loop word :" + word tree = trees[i][3] # print "looking at ptb number " + str(trees[i][0]) # print "looking at index number " \ #+ str(trees[i][1])+","+str(trees[i][2]) while j < len(tree.leaves()): leaf = tree.leaves()[j] # print "j number of leaves : " \ #+ str(len(tree.leaves())) # print "j loop word : " + word # print "j loop wordtest : " + wordtest # print "j leaf : " + str(j) + " " + leaf breaker = False # exact match if wordtest == leaf or word == leaf: LeafIndices.append((i, j)) wordTreeMap.append((word, LeafIndices)) # print("match!:" + word + " " + \ # str(utt.swda_filename) + " " + \ # utt.caller + "." + \ # str(utt.utterance_index) + \ # "." + str(utt.subutterance_index)) del words[0] # word is consumed if len(words) > 0: word = words[0] # next word wordtest = re.sub( r"[\.\,\?\/\)\(\"\!]", "", word) wordtest = wordtest.replace("(", "") wordtest = wordtest.replace(")", "") LeafIndices = [] j += 1 # increment loop to next leaf match = True breaker = True # raw_input() break elif leaf in wordtest or \ leaf in word and not leaf == ",": testleaf = leaf LeafIndices.append((i, j)) j += 1 for k in range(j, j + 3): # 3 beyond if (k >= len(tree.leaves())): j = 0 i += 1 #breaker = True breaker = True break # got to next tree if (testleaf + tree.leaves()[k]) \ in wordtest or (testleaf + tree.leaves()[k])\ in word: testleaf += tree.leaves()[k] LeafIndices.append((i, k)) j += 1 # concatenation if testleaf == wordtest or \ testleaf == word: # word matched wordTreeMap.append( (word, LeafIndices)) del words[0] # remove word # print "match!:" + word +\ #str(utt.swda_filename) + " "\ # + utt.caller + "." + \ # str(utt.utterance_index) +\ # "." 
+ \ # str(utt.subutterance_index)) if len(words) > 0: word = words[0] wordtest = re.sub( r"[\.\,\?\/\)\(\"\!]", "", word) wordtest = wordtest.\ replace("(", "") wordtest = wordtest.\ replace(")", "") # reinitialise leaves LeafIndices = [] j = k + 1 match = True breaker = True # raw_input() break else: # otherwise go on j += 1 if breaker: break if match: break if j >= len(tree.leaves()): j = 0 i += 1 if match: break # could not match word! try mistranscriptions first: if not match: if not mistranscribe: # one final stab at matching! mistranscribe = True for pair in possibleMistranscription: if pair[0] == wordtest: wordtest = pair[1] if len(wordTreeMap) > 0: if len(wordTreeMap[-1][1]) > 0: i = wordTreeMap[-1][1][-1][0] j = wordTreeMap[-1][1][-1][1] else: # go back to beginning of # tree search i = 0 j = 0 else: i = 0 # go back to beginning j = 0 break # matched elif continued: # possible lack of matching up of words in # previous utterance same caller and same # tree// not always within same tree!! errormessage = "Possible bad start for \ CONTINUED UTT ''" + words[0] + "'' in file/utt: "\ + str(utt.swda_filename) + "\n " + utt.caller + \ "." + str(utt.utterance_index) + "." + \ str(utt.subutterance_index) + \ "POSSIBLE COMMENT = " + str(possibleComment) # print errormessage if not errorLog is None: errorLog.write(errormessage + "\n") # raw_input() if backtrack == 1: backtrack += 1 elif backtrack == 2: # i.e. 
we've done two loops and # still haven't found it, try the other way count = origcount utt = trans.utterances[count] words = utt.text_words() word = words[0] trees = [lastTrees[-1]] + origtrees[1:] # print "\nSECOND PASS(2)replacing \ # first one to lasttreemap's:" # print words # print trees backtrack += 1 # mistranscribe = False #TODO perhaps needed wordTreeMap = [] # switch to forward track this is # the only time we want to try # from the previous mapped leaf in the # other tree foundTreemap = False for t in range(len(lastTreeMap) - 1, -1, -1): # backwards iteration through words # print lastTreeMap[t][1] if len(lastTreeMap[t][1]) > 0: # print "last treemapping of last \ # caller utterance = " + \ # str(lastTreeMap[t][1][-1]) j = lastTreeMap[t][1][-1][1] + 1 foundTreemap = True # print "found last mapping, j = " \ #+ str(j) # raw_input() # break when last tree # mapped word from this caller is found break if not foundTreemap: # print "NO matched last TREEMAP found\ # for previous Utt Same Speaker of " + \ # str(utt.swda_filename) + " " + \ # utt.caller + "." + \ # str(utt.utterance_index) + "." 
+\ # str(utt.subutterance_index) j = 0 # for tmap in wordTreeMapList.keys(): # print tmap # print wordTreeMapList[tmap] # raw_input() i = 0 # go back to first tree continue elif forwardtrack == 1: forwardtrack += 1 elif forwardtrack == 2: count = count - (count - lastIndexes[-1][0]) utt = previousUttSame words = utt.text_words() word = words[0] mytrees = [] for i in range(0, len(lastTrees) - 1): mytrees.append(lastTrees[i]) trees = mytrees + [origtrees[0]] # print "\nSECOND PASS(1)backtrack to \ # with new trees:" # print utt.transcript_index # print words # print trees forwardtrack += 1 # mistranscribe = False #TODO maybe needed wordTreeMap = [] # raw_input() elif forwardtrack == 3 or backtrack == 3: # if this hasn't worked reset to old trees # print "trying final reset" count = origcount utt = trans.utterances[count] words = utt.text_words() word = words[0] trees = origtrees forwardtrack = 0 backtrack = 0 # mistranscribe = False #TODO maybe needed wordTreeMap = [] # raw_input() else: pass # print "resetting search" # raw_input() # unless forward tracking now, # just go back to beginning i = 0 # go back to beginning of tree search j = 0 else: mistranscribe = False LeafIndices = [] wordTreeMap.append((word, LeafIndices)) errormessage = "WARNING: 440 no/partial tree \ mapping for ''" + words[0] + "'' in file/utt: "\ + str(utt.swda_filename) + " \n" + utt.caller\ + "." + str(utt.utterance_index) + "." 
+ \ str(utt.subutterance_index) + \ "POSSIBLE COMMENT = " + str(possibleComment) # print utt.text_words() del words[0] # remove word # for trip in wordTreeMap: # print "t",trip if len(words) > 0: word = words[0] wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word) wordtest = wordtest.replace("(", "") wordtest = wordtest.replace(")", "") # print errormessage if errorLog: errorLog.write("possible wrong tree mapping:" + errormessage + "\n") raw_input() # end of while loop (words) mytreenumbers = [] for treemap in trees: # the whole list but the tree mytreenumbers.append(treemap[:-1]) if not len(utt.text_words()) == len(wordTreeMap): print "ERROR. uneven lengths!" print utt.text_words() print wordTreeMap print trans.swda_filename print utt.transcript_index raw_input() count += 1 continue # add the treemap wordTreeMapList.append(trans.conversation_no, utt.transcript_index, tuple(mytreenumbers), tuple(wordTreeMap)) count += 1 # rewrite after each transcript filedict = defaultdict(str) for key in wordTreeMapList.keys(): csv_string = '"' + str(list(wordTreeMapList[key])) + '"' mytreenumbers = wordTreeMapList[key].transcript_numbers myptbnumbers = wordTreeMapList[key].treebank_numbers tree_list_string = '"' for i in range(0, len(mytreenumbers)): treemap = [myptbnumbers[i]] + mytreenumbers[i] tree_list_string += str(treemap) + ";" tree_list_string = tree_list_string[:-1] + '"' filename = '"' + key[0:key.rfind(':')] + '"' transindex = key[key.rfind(':') + 1:] filedict[int(transindex)] = filename \ + "\t" + transindex + '\t' + csv_string + "\t" \ + tree_list_string + "\n" for key in sorted(filedict.keys()): corpus_file.write(filedict[key]) wordTreeMapList = TreeMapCorpus(False, errorLog) # reset each time print "\n" + str(incorrectTrees) + " incorrect trees" corpus_file.close() if not errorLog is None: errorLog.close()
remove_file(data_dir, train_set_file, utterance_only_flag) remove_file(data_dir, test_set_file, utterance_only_flag) remove_file(data_dir, val_set_file, utterance_only_flag) remove_file(data_dir, dev_set_file, utterance_only_flag) # Create a temporary directory and unzip the archived data with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir: print('Created temporary directory', tmp_dir) zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'swda_archive.zip'), 'r') zip_file.extractall(tmp_dir) zip_file.close() # Corpus object for iterating over the whole corpus in .csv format corpus = CorpusReader(tmp_dir) # Process each transcript for transcript in corpus.iter_transcripts(display_progress=False): # Process the utterances and create a dialogue object dialogue = process_transcript(transcript, excluded_tags, excluded_chars) # Append all utterances to full_set text file dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue, utterance_only_flag, 'a+') # Determine which set this dialogue belongs to (training, test or validation) set_dir = '' set_file = ''
def __init__(self, corpus_dir='swda'):
    """Attach a ``CorpusReader`` for the given corpus directory.

    Args:
        corpus_dir: Value handed to ``CorpusReader``. Defaults to
            ``'swda'``, the location that used to be hard-coded, so
            callers that construct the object with no arguments get
            exactly the same reader as before.
    """
    # The reader is stored on the instance as ``self.corpus``.
    self.corpus = CorpusReader(corpus_dir)