示例#1
0
def get_swda_utterances(swda_dir):
    corpus = CorpusReader(swda_dir)
    c = 0
    last_utterances = dict()

    for trans in  corpus.iter_transcripts(display_progress=True):
            last_utterances["A"] = []
            last_utterances["B"] = []
            for utt in trans.utterances:
                utt_temp = re.sub(r'\(|\)|-|\{.+? |\}|\[|\]|\+|#|/|<.+?>|,', "", utt.text.lower())
                utt_tokens = word_tokenize(re.sub("<|>", "", utt_temp))
                if utt.damsl_act_tag() != "+":
                    last_utterances[utt.caller].append((c, utt.damsl_act_tag() + "/%s_%s_%s" % (utt.conversation_no, utt.caller, c), utt_tokens))
                    c += 1
                else:
                    try:
                        prev = last_utterances[utt.caller].pop()
                        new = (prev[0], prev[1], prev[2] + utt_tokens)
                        last_utterances[utt.caller].append(new)
                    except IndexError:
                        pass
                        # RW: for some reason, Chris Potts' Corpus Reader gives us utterances with a "+" tag although
                        # there is no previous utterance of the same speaker to complete.
                        # Looking at the originial data, there seems to be a bug in his Corpus Reader that skips some
                        # stuff in the beginning for some reason (e.g. the beginning of conv. no 3554.
                        print utt.conversation_no
            utterances = last_utterances["A"] + last_utterances["B"]
            utterances = sorted(utterances, key= lambda t: t[0])
            for tpl in utterances:
                if tpl[2]:
                    yield tpl[1:]
示例#2
0
def get_dialog_acts(dset_root):
    cr = CorpusReader(dset_root)
    act_tags = Counter()
    i = 0
    for utt in cr.iter_utterances():
        # print(utt.keys())
        # act_tags.append(utt.act_tag)
        act_tags.update([utt.act_tag])
    return act_tags
示例#3
0
def tag_counts():
    """Gather and print counts of the tags."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print key, val
示例#4
0
def tag_counts():
    """Gather and print counts of the tags."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print key, val
示例#5
0
def count_matches():
    """Determine how many utterances have a single precisely matching tree."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else: 
                d['mismatch'] += 1
    print "match: %s (%s percent)" % (d['match'], d['match']/float(sum(d.values())))
示例#6
0
def count_matches():
    """Determine how many utterances have a single precisely matching tree."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else: 
                d['mismatch'] += 1
    print "match: %s (%s percent)" % (d['match'], d['match']/float(sum(d.values())))
示例#7
0
def Atag():
        corpus = CorpusReader('swda')
        actTag = defaultdict(int)
        for utt in corpus.iter_utterances(display_progress=True):
            actTag[utt.damsl_act_tag()] +1
        i=1
        for key in actTag.keys():
            actTag[key] = i
            i=i+1
        print actTag
        return actTag
def preprocess_data():
    act_tags = defaultdict(lambda: 0)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        act_tags[utt.damsl_act_tag()] += 1
    act_tags = act_tags.iteritems()
    act_tags = sorted(act_tags, key=itemgetter(1), reverse=True)
    f = open(tags_file, 'w')
    for k, v in act_tags:
        f.write('%s %d\n' % (k, v))
    f.close()
    return dict([(act_tags[i][0], i) for i in xrange(len(act_tags))])
def preprocess_data():
    act_tags = defaultdict(lambda: 0)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        act_tags[utt.damsl_act_tag()] += 1
    act_tags = act_tags.iteritems()
    act_tags = sorted(act_tags, key=itemgetter(1), reverse=True)
    f = open(tags_file, 'w')
    for k, v in act_tags:
        f.write('%s %d\n' % (k, v))
    f.close()
    return dict([(act_tags[i][0], i) for i in xrange(len(act_tags))])
示例#10
0
def swda_education_region():
    """Create a count dictionary relating education and region."""    
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Iterate throught the transcripts; display_progress=True tracks progress:
    for trans in corpus.iter_transcripts(display_progress=True):
        d[(trans.from_caller_education, trans.from_caller_dialect_area)] += 1
        d[(trans.to_caller_education, trans.to_caller_dialect_area)] += 1
    # Turn d into a list of tuples as d.items(), sort it based on the
    # second (index 1 member) of those tuples, largest first, and
    # print out the results:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print key, val
示例#11
0
def swda_education_region():
    """Create a count dictionary relating education and region."""    
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Iterate throught the transcripts; display_progress=True tracks progress:
    for trans in corpus.iter_transcripts(display_progress=True):
        d[(trans.from_caller_education, trans.from_caller_dialect_area)] += 1
        d[(trans.to_caller_education, trans.to_caller_dialect_area)] += 1
    # Turn d into a list of tuples as d.items(), sort it based on the
    # second (index 1 member) of those tuples, largest first, and
    # print out the results:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print key, val
示例#12
0
def act_tags_and_text():
    """
    Create a CSV file named swda-actags-and-text.csv in
    which each utterance utt has its own row consisting of

      utt.damsl_act_tag(), and clean-text utterance

    This data can be used for training a speechAct classifier
    """
    csvwriter = csv.writer(open('swda-acttags-and-text.csv', 'wt'))
    csvwriter.writerow(['DamslActTag', 'Text'])
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances(display_progress=True):
        clean_words = utt.text_words(filter_disfluency=True)
        csvwriter.writerow([utt.damsl_act_tag(), " ".join(clean_words)])
def process_data(tags):
    x = []
    y = []
    model = {}
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        words = [w.lower() for w in utt.pos_words() if w not in except_words]
        for word in words:
            if word not in model:
                model[word] = random_vector(vector_size)
        words = [model[w] for w in words]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    return (x, y)
def process_data(tags):
    x = []
    y = []
    model= {}
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        words = [w.lower() for w in utt.pos_words() if w not in except_words]
        for word in words:
            if word not in model:
                model[word] = random_vector(vector_size)
        words = [model[w] for w in words]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    return (x, y)
示例#15
0
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in
    which each utterance utt has its own row consisting of just

      utt.act_tag, utt.damsl_act_tag(), and utt.trees[0].node

    restricting attention to cases in which utt has a single,
    perfectly matching tree associated with it.
    """
    csvwriter = csv.writer(open('swda-actags-and-rootlabels.csv', 'w'))
    csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
    corpus = CorpusReader('swda')    
    for utt in corpus.iter_utterances(display_progress=True):
        if utt.tree_is_perfect_match():
            csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(), utt.trees[0].node])
示例#16
0
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in
    which each utterance utt has its own row consisting of just

      utt.act_tag, utt.damsl_act_tag(), and utt.trees[0].node

    restricting attention to cases in which utt has a single,
    perfectly matching tree associated with it.
    """
    csvwriter = csv.writer(open('swda-actags-and-rootlabels.csv', 'w'))
    csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
    corpus = CorpusReader('swda')    
    for utt in corpus.iter_utterances(display_progress=True):
        if utt.tree_is_perfect_match():
            csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(), utt.trees[0].node])
示例#17
0
class swda_reader(object):
	def __init__(self):
		self.corpus = CorpusReader('swda')
	def transcript_reader(self):
		def is_neg(tag):
			if (tag == 'sv') or (tag == 'sd'):
				return True
			return False
		def is_pos(tag):
			if (tag == 'qy') or (tag == 'qw') or (tag == 'qh'):
				return True
			return False
		pos_data = []
		neg_data = []
		for trans in self.corpus.iter_transcripts():
			pool = []
			for utt in trans.utterances:
				if utt.damsl_act_tag() == '+':
					pool.append(utt)
				else:
					if len(pool) > 0:
						pool.append(utt)
					else:
						pool = [utt]
					if is_neg(utt.damsl_act_tag()):
                                        	neg_data.append(pool)
					elif is_pos(utt.damsl_act_tag()):
						pos_data.append(pool)
					pool = []
				'''
				if is_neg(utt.damsl_act_tag()) or is_pos(utt.damsl_act_tag()):
					print utt.pos_words()
					print utt.damsl_act_tag()
				'''
		return pos_data,neg_data
示例#18
0
def preprocess():
    stemmer = PorterStemmer()
    corpus = CorpusReader('swda')
    stoplist =set([line.strip() for line in open("corpus/stopword", 'r')])
    frequency = defaultdict(int)
    corpusDict = [[[stemmer.stem(word.translate(None, "?.,-").strip())
         for word in utt.text.lower().split() if word.translate(None, "?.,-") not in stoplist],utt.damsl_act_tag()]
         for utt in corpus.iter_utterances(display_progress=True)]
    texts =[]
    for i in corpusDict:
        texts.append(i[0])

    for text in texts:
       for token in text:
           frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 10]  for text in texts]
    return texts
示例#19
0
def load_dataset_OLD():
    corpus = CorpusReader('swda')
    data = defaultdict(list)
    N = 221616

    not_found_set = []
    found = []
    skipp_count = 0
    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }

        if len(d["words"]) < 1:
            #print("skipping ... ")
            skipp_count += 1
            #print(utt.text_words())
            continue

        not_found = True
        for splitname in SwDA:
            if d["basename"] in SwDA[splitname]:
                not_found = False
                data[splitname].append(d)
                found.append(d["basename"])

        if not_found:
            not_found_set.append(d["basename"])

    not_found_set = set(not_found_set)
    print("not found count:", len(not_found_set))
    print("skipp count:", skipp_count)
    #for name in not_found_set:
    #    print(name)

    print("label counts:")
    for k, v in data.items():
        print("\t{} size:".format(k), len(v))

    # 1115 seen dialogs, 19 unseen dialogs.
    size = len(set(found))
    #assert size == 1115 + 19, "{} != 1115 + 19; difference = {}".format(size, 1115 + 19 - size)

    return data
示例#20
0
def load_dataset():
    corpus = CorpusReader('swda')
    data = []
    skipp_count = 0

    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }

        if len(d["words"]) < 1:
            skipp_count += 1
            continue
        data.append(d)

    print("skipp count:", skipp_count)
    return data
def process_data(model, tags):
    x = []
    y = []
    model_cache = {}
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        for word in wordlist:
            if word in model:
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    print 'Complete. The following words are not converted: '
    print list(non_modeled)
    return (x, y)
示例#22
0
def load_swda_data():

    if not os.path.exists("../helper_files/swda_data.pkl"):
        corpus = CorpusReader("../data/switchboard-corpus/swda")
        excluded_tags = ["x", "+"]
        conversations = []
        labels = []
        print("Loading swda transcripts, this might take a while")
        for transcript in corpus.iter_transcripts():
            utterances, utterance_labels = process_transcript_txt(
                transcript, excluded_tags)
            conversations.append(utterances)
            labels.append(utterance_labels)

        with open("../helper_files/swda_data.pkl", "wb") as f:
            pickle.dump((conversations, labels), f)
    else:
        with open("../helper_files/swda_data.pkl", "rb") as f:
            conversations, labels = pickle.load(f)

    return conversations, labels
def process_data(model, tags):
    x = []
    y = []
    model_cache = {}
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        for word in wordlist:
            if word in model:
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    print 'Complete. The following words are not converted: '
    print list(non_modeled)
    return (x, y)
示例#24
0
def load_swda_corpus_data(swda_directory):
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)
    conversations = []

    for transcript in corpus_reader.iter_transcripts(display_progress=False):
        name = 'sw' + str(transcript.conversation_no)

        conv = {
            "name": name,
            "utterances": [],
            "partition_name": get_partition(name)
        }

        for j, utterance in enumerate(transcript.utterances):
            utt = {
                "text": " ".join(utterance.text_words(filter_disfluency=True)),
                "act_tag": utterance.act_tag,
                "damsl_act_tag": utterance.damsl_act_tag(),
                "caller": utterance.caller,
            }

            #utt_text = " ".join(utterance.text_words(filter_disfluency=True))
            #print("[{}] {}".format(j, utt_text))
            #print("\t==>", utterance.act_tag, utterance.damsl_act_tag())

            conv["utterances"].append(utt)

        conversations.append(conv)

    corpus = {
        "partition_source": "https://github.com/Franck-Dernoncourt/naacl2016",
        "train_ids": list(train_set_idx),
        "test_ids": list(test_set_idx),
        "dev_set_ids": list(valid_set_idx),
        "conversations": conversations
    }

    return corpus
from swda import CorpusReader
from utilities import *
import nltk
nltk.download('averaged_perceptron_tagger')

batch_name = 'dev'  # train, test, val or dev
resource_dir = 'data/'
file_path = resource_dir + batch_name + "_text.txt"
corpus = CorpusReader('switchboard_data/')

# Excluded dialogue act tags
excluded_tags = ['x', '+']

# Process switchboard csv's to text
process_batch_to_txt_file(corpus,
                          resource_dir,
                          batch_name,
                          excluded_tags=excluded_tags)

print("Processing file: ", file_path)
text_data = read_file(file_path)

# Split into labels and sentences
sentences = []
labels = []
for line in text_data:
    sentences.append(line.split("|")[0])
    labels.append(line.split("|")[1])

# Generate tokenised utterances
utterances = []
示例#26
0
import nltk, datetime, matplotlib, math, random, copy, sys, os
import dill as pickle

from collections import defaultdict, Counter
from swda import CorpusReader
import numpy as np
import scipy as sc
import scipy.spatial.distance as dis

import Levenshtein as LD
from scipy import stats
from multiprocessing import Pool

corpus = CorpusReader('./data/dialogue_corpora/swb/SwDA/swda')
caller_metafile = './data/dialogue_corpora/swb/SwDA/swda/call_con_tab.csv'

ACW = [
    'alright', 'gotcha', 'huh', 'mm-hm', 'okay', 'right', 'uh-huh', 'yeah',
    'yep', 'yes', 'yup'
]

FP = ['uh', 'um', 'mm']

_proc = 1
_chunksize = 10000


def CalcSimWrap(_):
    return (Cos(_[1], _[2]), _[0])

示例#27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
svakulenko
12 Feb 2017

Generate CSV for process mining the conversations
'''
import csv

from swda import CorpusReader
corpus = CorpusReader('swda')


def main():
    with open('swda.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';')
        # iterate over transcripts
        for trans in corpus.iter_transcripts():
            # iterate over messages
            # print trans.conversation_no
            for utt in trans.utterances:
                spamwriter.writerow(
                    [str(trans.conversation_no), utt.caller, utt.act_tag])


if __name__ == '__main__':
    main()
示例#28
0
def main():
    cmdline_parser = argparse.ArgumentParser(description=__doc__)
    cmdline_parser.add_argument('--swda_basedir',
                                required=True,
                                help='SWDA basedir')
    cmdline_parser.add_argument('--model_json',
                                required=True,
                                help='output model json file')
    args = cmdline_parser.parse_args()

    all_utterances = set()
    corpus = CorpusReader(args.swda_basedir)
    for trans in corpus.iter_transcripts(display_progress=False):
        list_utterance = []
        for utt in trans.utterances:
            tokens = utt.pos_lemmas(wn_lemmatize=False)
            list_word = []
            for token in tokens:
                # skip punctuation by checking the POS tag.
                if not re.match(r'^[a-zA-Z]', token[1]):
                    continue
                list_word.append(token[0].lower())
            if not list_word:
                # ignore empty utterance
                continue
            utterance = ' '.join(list_word)
            if len(utterance) > 140:
                # Amazon has a limit of 140 character for slot values
                continue
            list_utterance.append(utterance)
        all_utterances |= set(list_utterance)
        # only keep first 1000 unique utterances
        if len(all_utterances) > 1000:
            break
    print('\nextracted {} unique utterances'.format(len(all_utterances)))

    language_model = {
        'invocationName':
        'lab two',
        'intents': [{
            'name': 'ConverseIntent',
            'slots': [{
                'name': 'Text',
                'type': 'TEXT'
            }],
            'samples': ['{Text}']
        }],
        'types': [{
            'name':
            'TEXT',
            'values': [{
                'name': {
                    'value': utt
                }
            } for utt in all_utterances]
        }]
    }

    interaction_model = {'interactionModel': {'languageModel': language_model}}

    json.dump(interaction_model, open(args.model_json, 'w'), indent=2)
    def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                      ranges, errorLog):
        """Writes files to a target folder with the mappings
        from words in utterances to corresponding POS tags.
        """
        if errorLog:
            errorLog = open(errorLog, 'w')
        corpus = CorpusReader(corpus_path, metadata_path)

        folder = None
        corpus_file = None
        for trans in corpus.iter_transcripts():

            # print "iterating",trans.conversation_no
            if not trans.has_pos():
                continue
            # print "has pos"
            if ranges and not trans.conversation_no in ranges:
                continue
            # print "in range"
            # just look at transcripts WITHOUT trees as compliment to the
            # above models
            if trans.has_trees():
                continue
            end = trans.swda_filename.rfind("/")
            start = trans.swda_filename.rfind("/", 0, end)
            c_folder = trans.swda_filename[start + 1:end]
            if c_folder != folder:
                # for now splitting the maps by folder
                folder = c_folder
                if corpus_file:
                    corpus_file.close()
                corpus_file = open(
                    target_folder_path +
                    "/POS_map_{0}.csv.text".format(folder), 'w')
                wordPOSMapList = POSMapCorpus(False, errorLog)
                print "new map for folder", folder

            translist = trans.utterances
            translength = len(translist)
            count = 0

            # iterating through transcript utterance by utterance
            while count < translength:
                utt = trans.utterances[count]
                words = utt.text_words()
                wordPOSMap = []
                if len(utt.pos) == 0:  # no POS
                    wordPOSMap.append((utt, []))  # just dummy value
                    wordPOSMapList.append(trans.conversation_no,
                                          utt.transcript_index,
                                          list(wordPOSMap))
                    errormessage = "WARNING: NO POS for file/utt: " +\
                        str(utt.swda_filename) + " " + utt.caller + "." + \
                        str(utt.utterance_index) + "." + \
                        str(utt.subutterance_index) + " " + utt.text
                    # print errormessage
                    # raw_input()
                else:
                    # indices for which POS we're at
                    j = 0
                    possibleComment = False  # can have comments, flag
                    mistranscribe = False
                    word = words[0]
                    # loop until no more words left to be matched in utterance
                    while len(words) > 0:
                        word = words[0]
                        # print "top WORD:" + word
                        if not mistranscribe:
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "",
                                              word)
                            wordtest = wordtest.replace("(", "").\
                                replace(")", "").replace("/", "")
                        match = False
                        POSIndices = []

                        if (possibleComment or word[0:1] in [
                                "{", "}", "-"
                        ] or word in [
                                "/", ".", ",", "]"
                        ] or wordtest == "" or any([
                                x in word
                                for x in
                            ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                        ])):
                            # no tree equivalent for {D } type annotations
                            if (word[0:1] == "-" or
                                    any([x in word for x in
                                         ["*", "<<", "<+", "[[", "<"]])) \
                                    and not possibleComment:
                                possibleComment = True
                            if possibleComment:
                                # print "match COMMENT!:" + word
                                # raw_input()
                                POSIndices = []
                                match = True
                                if (any([x in word for x in [">>", "]]", "))",
                                                             ">"]]) or
                                        word[0] == "-") \
                                        and not word == "->":
                                    # turn off comment
                                    possibleComment = False
                                if (">>" in word or "]]" in word
                                        or "))" in word or ">" in word and
                                        not word == "->"):  # turn off comment
                                    possibleComment = False
                                    #del words[0]
                            wordPOSMap.append((word, POSIndices))
                            POSIndices = []
                            match = True
                            # print "match annotation!:" + word
                            del words[0]  # word is consumed
                            if len(words) > 0:
                                word = words[0]
                                wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]", "",
                                                  word)
                                wordtest = wordtest.replace("(", "")
                                wordtest = wordtest.replace(")", "")
                            else:
                                break
                            continue  # carry on to next word
                        else:
                            myPOS = utt.regularize_pos_lemmas()
                            while j < len(myPOS):
                                pos = myPOS[j][0]  # pair of (word,POS)
                                # print "j number of pos : " + str(len(myPOS))
                                # print "j loop word : " + word
                                # print "j loop wordtest : " + wordtest
                                # print "j pos : " + str(j) + " " + str(pos)
                                # raw_input()
                                breaker = False
                                if wordtest == pos or word == pos:  # exact match
                                    POSIndices.append(j)
                                    wordPOSMap.append((word, POSIndices))
                                    # print "match!:" + word + " in file/utt: "\
                                    # + str(utt.swda_filename) + \
                                    # str(utt.transcript_index))
                                    del words[0]  # word is consumed
                                    if len(words) > 0:
                                        word = words[0]  # next word
                                        wordtest = re.sub(
                                            r"[\.\,\?\/\)\(\"\!\\]", "", word)
                                        wordtest = wordtest.replace("(", "").\
                                            replace(")", "").replace("/", "")
                                    POSIndices = []
                                    j += 1  # increment lead number
                                    match = True
                                    breaker = True
                                    # raw_input()
                                    break
                                elif (pos in wordtest or pos in word) \
                                        and not pos in [",", "."]:
                                    # substring relation
                                    testpos = pos
                                    POSIndices.append(j)
                                    j += 1
                                    if wordtest[-1] == "-" and \
                                            pos == wordtest[0:-1]:
                                        wordPOSMap.append((word, POSIndices))
                                        del words[0]  # remove word
                                        # print "match!:" + word + " in \
                                        # file/utt: " + str(utt.swda_filename) \
                                        #+ str(utt.transcript_index)
                                        if len(words) > 0:
                                            word = words[0]
                                            wordtest = re.sub(
                                                r"[\.\,\?\/\)\(\"\!\\]", "",
                                                word)
                                            wordtest = wordtest.\
                                                replace("(", "").\
                                                replace(")", "").\
                                                replace("/", "")
                                            POSIndices = []
                                        match = True
                                        breaker = True
                                        break
                                    for k in range(j, j + 3):
                                        if (k >= len(myPOS)):
                                            breaker = True
                                            break
                                        if (testpos + myPOS[k][0]) in wordtest\
                                                or (testpos + myPOS[k][0]) in word:
                                            testpos += myPOS[k][0]
                                            POSIndices.append(k)
                                            j += 1
                                            # concatenation
                                            if testpos == wordtest or \
                                                    testpos == word:  # matched
                                                wordPOSMap.append(
                                                    (word, POSIndices))
                                                del words[0]  # remove word
                                                # print "match!:" +\
                                                # word + " in file/utt: " + \
                                                # str(utt.swda_filename) +\
                                                # str(utt.transcript_index))
                                                if len(words) > 0:
                                                    word = words[0]
                                                    wordtest = re.sub(
                                                        r"[\.\,\?\/\)\(\"\!\\]",
                                                        "", word)
                                                    wordtest = wordtest.\
                                                        replace("(", "")
                                                    wordtest = wordtest.\
                                                        replace(")", "")
                                                POSIndices = []
                                                j = k + 1
                                                match = True
                                                breaker = True
                                                break
                                else:
                                    j += 1  # otherwise go on
                                if breaker:
                                    break
                                if match:
                                    break

                        # could not match word! Could be mistransription
                        if not match:
                            # print "false checking other options"
                            # print j
                            # print word
                            # print wordtest
                            if not mistranscribe:
                                mistranscribe = True
                                for pair in possibleMistranscription:
                                    if pair[0] == wordtest:
                                        wordtest = pair[1]
                                        break  # matched
                                if wordtest[-1] == "-":  # partial words
                                    wordtest = wordtest[0:-1]
                                if "'" in wordtest:
                                    wordtest = wordtest.replace("'", "")
                                if len(wordPOSMap) > 0:
                                    found = False
                                    for n in range(
                                            len(wordPOSMap) - 1, -1, -1):
                                        if len(wordPOSMap[n][1]) > 0:
                                            j = wordPOSMap[n][1][-1] + 1
                                            # print j
                                            found = True
                                            break
                                    if not found:
                                        # if not possible go back to
                                        # the beginning!
                                        j = 0
                                else:
                                    j = 0
                                # print j
                            else:
                                mistranscribe = False
                                wordPOSMap.append((word, POSIndices))
                                errormessage = "WARNING: no/partial POS \
                                mapping for ''"                                                + words[0] + "'' in file/utt:"\
                                    + str(utt.swda_filename) + "-" + \
                                    str(utt.transcript_index) + \
                                    "POSSIBLE COMMENT = " + \
                                    str(possibleComment)
                                del words[0]  # remove word
                                if len(words) > 0:
                                    word = words[0]
                                    wordtest = re.sub(r"[\.\,\?\/\)\(\"\!\\]",
                                                      "", word)
                                    wordtest = wordtest.replace("(", "").\
                                        replace(")", "").replace("/", "")
                                # print errormessage
                                if errorLog:
                                    errorLog.write("possible wrong POS : " +
                                                   errormessage + "\n")
                                # raw_input()

                    # end of while loop (words)
                    if not len(wordPOSMap) == len(utt.text_words()):
                        print "Error "
                        print "Length mismatch in file/utt: " + \
                            str(utt.swda_filename) + str(utt.transcript_index)
                        print utt.text_words()
                        print wordPOSMap
                        raw_input()

                    wordPOSMapList.append(trans.conversation_no,
                                          str(utt.transcript_index),
                                          list(wordPOSMap))
                    # print "\nadded POSmap " + str(trans.swda_filename) + \
                    #"." + str(utt.transcript_index) + "\n"
                    csv_string = '"' + str(wordPOSMap) + '"'

                    corpus_file.write('"' + str(utt.conversation_no) + '"\t' +
                                      str(utt.transcript_index) + '\t' +
                                      csv_string + "\n")

                count += 1

        corpus_file.close()
        if errorLog:
            errorLog.close()
    def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                      ranges, errorLog):
        """Writes files to a target folder with the mappings
        from words in utterances to tree nodes in trees.
        """

        if errorLog:
            errorLog = open(errorLog, 'w')
        corpus = CorpusReader(corpus_path, metadata_path)
        # Iterate through all transcripts
        incorrectTrees = 0
        folder = None
        corpus_file = None

        for trans in corpus.iter_transcripts():

            # print "iterating",trans.conversation_no
            if not trans.has_pos():
                continue
            # print "has pos"
            if ranges and not trans.conversation_no in ranges:
                continue
            # print "in range"
            # just look at transcripts WITH trees as compliment to the
            # below models
            if not trans.has_trees():
                continue
            end = trans.swda_filename.rfind("/")
            start = trans.swda_filename.rfind("/", 0, end)
            c_folder = trans.swda_filename[start + 1:end]
            if c_folder != folder:
                # for now splitting the maps by folder
                folder = c_folder
                if corpus_file:
                    corpus_file.close()
                corpus_file = open(
                    target_folder_path +
                    "/Tree_map_{0}.csv.text".format(folder), 'w')
                wordTreeMapList = TreeMapCorpus(False, errorLog)
                print "new map for folder", folder

            translist = trans.utterances
            translength = len(translist)
            count = 0

            # iterating through transcript utterance by utterance
            # create list of tuples i.e. map from word to the index(ices)
            # (possibly multiple or null) of the relevant leaf/ves
            # of a given tree i.e. utt.tree[0].leaves[0] would be a pair (0,0))
            while count < translength:
                utt = trans.utterances[count]
                words = utt.text_words()
                wordTreeMap = []  # [((word), (List of LeafIndices))]
                forwardtrack = 0
                backtrack = 0
                continued = False
                # print "\n COUNT" + str(count)
                # print utt.damsl_act_tag()
                if len(utt.trees) == 0 or utt.damsl_act_tag() == "x":
                    wordTreeMap.append((utt, []))  # just dummy value
                    # errormessage = "WARNING: NO TREE for file/utt: " +\
                    # str(utt.swda_filename) + " " + utt.caller + "." +  \
                    # str(utt.utterance_index) + "." + \
                    #str(utt.subutterance_index) + " " + utt.text
                    # print(errormessage)
                    count += 1
                    continue
                    # raw_input()

                # indices for which tree and leaf we're at:
                i = 0  # tree
                j = 0  # leaf
                # initialise pairs of trees and ptb pairs
                trees = []
                for l in range(0, len(utt.trees)):
                    trees.append(
                        (utt.ptb_treenumbers[l], count, l, utt.trees[l]))
                # print "TREES = "
                # for tree in trees:
                #    print tree
                origtrees = list(trees)
                origcount = count
                # overcoming the problem of previous utterances contributing
                # to the tree at this utterance, we need to add the words from
                # the previous utt add in all the words from previous utterance
                # with a dialogue act tag/or the same tree?
                # check that the last tree in the previous utterance
                # is the same as the previous one
                previousUttSame = trans.previous_utt_same_speaker(utt)
                # print previousUttSame
                lastTreeMap = None
                if previousUttSame:
                    # print "search for previous full act utt
                    # for " + str(utt.swda_filename) +
                    # str(utt.transcript_index)
                    lastTreeMap = wordTreeMapList.get_treemap(
                        trans, previousUttSame)
                    if ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                        (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                        # print "no last tree map, backwards searching"
                        while previousUttSame and \
                            ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                             (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                            previousUttSame = trans.previous_utt_same_speaker(
                                previousUttSame)  # go back one more
                            lastTreeMap = wordTreeMapList.get_treemap(
                                trans, previousUttSame)
                            if previousUttSame:
                                pass
                                # print previousUttSame.transcript_index

                    if not lastTreeMap:
                        pass
                        # print "no last treemap found for:"
                        # print utt.swda_filename
                        # print utt.transcript_index

                if lastTreeMap and \
                        (utt.damsl_act_tag() == "+" or
                         (len(lastTreeMap.treebank_numbers) > 0
                          and lastTreeMap.treebank_numbers[-1] ==
                          utt.ptb_treenumbers[0])):
                    continued = True
                    # might have to backtrack
                    # now checking for wrong trees
                    lastPTB = lastTreeMap.treebank_numbers
                    lastIndexes = lastTreeMap.transcript_numbers
                    lastTreesTemp = lastTreeMap.get_trees(trans)
                    lastTrees = []
                    for i in range(0, len(lastPTB)):
                        lastTrees.append([
                            lastPTB[i], lastIndexes[i][0], lastIndexes[i][1],
                            lastTreesTemp[i]
                        ])
                    if not (lastPTB[-1] == utt.ptb_treenumbers[0]):
                        # print "not same, need to correct!"
                        # print words
                        # print trees
                        # print "last one"
                        # print previousUttSame.text_words()
                        # print lastTrees
                        if utt.ptb_treenumbers[0] - lastPTB[-1] > 1:
                            # backtrack and redo the antecedent
                            count = count - (count - lastIndexes[-1][0])
                            utt = previousUttSame
                            words = utt.text_words()
                            mytrees = []
                            for i in range(0, len(lastTrees) - 1):
                                mytrees.append(lastTrees[i])
                            trees = mytrees + [origtrees[0]]
                            # print "\n(1)backtrack to with new trees:"
                            backtrack = 1
                            # print utt.transcript_index
                            # print words
                            # print trees
                            # raw_input()
                        # alternately, this utt's tree may be further back
                        # than its antecdent's, rare mistake
                        elif utt.ptb_treenumbers[0] < lastTrees[-1][0]:
                            # continue with this utterance and trees
                            # (if there are any), but replace its first
                            # tree with its antecdents last one
                            forwardtrack = 1
                            trees = [lastTrees[-1]] + origtrees[1:]
                            # print "\n(2)replacing first one to lasttreemap's:"
                            # print words
                            # print trees
                            # raw_input()

                    if backtrack != 1:  # we should have no match
                        found_treemap = False
                        # resetting
                        # for t in wordTreeMapList.keys():
                        #        print t
                        #        print wordTreeMapList[t]
                        for t in range(len(lastTreeMap) - 1, -1, -1):
                            # print lastTreeMap[t][1]
                            # if there is a leafIndices for the
                            # word being looked at, gets last mapped one
                            if len(lastTreeMap[t][1]) > 0:
                                # print "last treemapping of last
                                # caller utterance =
                                # " + str(lastTreeMap[t][1][-1])
                                j = lastTreeMap[t][1][-1][1] + 1
                                found_treemap = True
                                # print "found last mapping, j -1 = " + str(j-1)
                                # raw_input()
                                break
                        if not found_treemap:
                            pass
                            # print "NO matched last TREEMAP found for \
                            # previous Utt Same Speaker of " + \
                            # str(trans.swda_filename) + " " + \
                            # str(utt.transcript_index)
                            # print lastTreeMap
                            # for tmap in wordTreeMapList.keys():
                            #    print tmap
                            #    print wordTreeMapList[tmap]
                            # raw_input()

                possibleComment = False  # can have comments, flag
                mistranscribe = False
                LeafIndices = []  # possibly empty list of leaf indices
                word = words[0]
                # loop until no more words left to be matched in utterance
                while len(words) > 0:
                    # print "top WORD:" + word
                    if not mistranscribe:
                        wordtest = re.sub(r"[\.\,\?\"\!]", "", word)
                        wordtest = wordtest.replace("(", "").replace(")", "")
                    match = False
                    LeafIndices = []  # possibly empty list of leaf indices
                    if (possibleComment or word[0:1] in [
                            "{", "}", "-"
                    ] or word in ["/", ".", ",", "]"] or wordtest == "" or any(
                        [
                            x in word for x in
                            ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                        ])):
                        # no tree equivalent for {D } type annotations
                        if (word[0:1] == "-" or
                                any([x in word for x in
                                     ["*", "<<", "<+", "[[", "<"]])) \
                                and not possibleComment:
                            possibleComment = True
                        if possibleComment:
                            #print("match COMMENT!:" + word)
                            # raw_input()
                            LeafIndices = []
                            match = True
                            #wordTreeMap.append((word, LeafIndices))
                            if any([x in word for x in [">>", "]]", ">"]]) or \
                                    word[0] == "-":  # turn off comment
                                possibleComment = False
                                #del words[0]
                        # LeadIndices will be null here
                        wordTreeMap.append((word, LeafIndices))
                        LeafIndices = []
                        match = True
                        # print "match annotation!:" + word
                        del words[0]  # word is consumed, should always be one
                        if len(words) > 0:
                            word = words[0]
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word)
                            wordtest = wordtest.replace("(", "")
                            wordtest = wordtest.replace(")", "")
                        else:
                            break
                        continue
                        # carry on to next word without updating indices?
                    else:
                        while i < len(trees):
                            # print "i number of trees :" + str(len(utt.trees))
                            # print "i tree number :" + str(i)
                            # print "i loop word :" + word
                            tree = trees[i][3]
                            # print "looking at ptb number " + str(trees[i][0])
                            # print "looking at index number " \
                            #+ str(trees[i][1])+","+str(trees[i][2])
                            while j < len(tree.leaves()):
                                leaf = tree.leaves()[j]
                                # print "j number of leaves : " \
                                #+ str(len(tree.leaves()))
                                # print "j loop word : " + word
                                # print "j loop wordtest : " + wordtest
                                # print "j leaf : " + str(j) + " " + leaf
                                breaker = False
                                # exact match
                                if wordtest == leaf or word == leaf:
                                    LeafIndices.append((i, j))
                                    wordTreeMap.append((word, LeafIndices))
                                    # print("match!:" + word + " " + \
                                    # str(utt.swda_filename) + " " + \
                                    # utt.caller + "." +  \
                                    # str(utt.utterance_index) + \
                                    # "." + str(utt.subutterance_index))
                                    del words[0]  # word is consumed
                                    if len(words) > 0:
                                        word = words[0]  # next word
                                        wordtest = re.sub(
                                            r"[\.\,\?\/\)\(\"\!]", "", word)
                                        wordtest = wordtest.replace("(", "")
                                        wordtest = wordtest.replace(")", "")
                                    LeafIndices = []
                                    j += 1  # increment loop to next leaf
                                    match = True
                                    breaker = True
                                    # raw_input()
                                    break
                                elif leaf in wordtest or \
                                        leaf in word and not leaf == ",":
                                    testleaf = leaf
                                    LeafIndices.append((i, j))
                                    j += 1
                                    for k in range(j, j + 3):  # 3 beyond
                                        if (k >= len(tree.leaves())):
                                            j = 0
                                            i += 1
                                            #breaker = True
                                            breaker = True
                                            break  # got to next tree
                                        if (testleaf + tree.leaves()[k]) \
                                                in wordtest or (testleaf +
                                                                tree.leaves()[k])\
                                                in word:
                                            testleaf += tree.leaves()[k]
                                            LeafIndices.append((i, k))
                                            j += 1
                                            # concatenation
                                            if testleaf == wordtest or \
                                                    testleaf == word:  # word matched
                                                wordTreeMap.append(
                                                    (word, LeafIndices))
                                                del words[0]  # remove word
                                                # print "match!:" + word +\
                                                #str(utt.swda_filename) + " "\
                                                # + utt.caller + "." +  \
                                                # str(utt.utterance_index) +\
                                                # "." + \
                                                # str(utt.subutterance_index))
                                                if len(words) > 0:
                                                    word = words[0]
                                                    wordtest = re.sub(
                                                        r"[\.\,\?\/\)\(\"\!]",
                                                        "", word)
                                                    wordtest = wordtest.\
                                                        replace("(", "")
                                                    wordtest = wordtest.\
                                                        replace(")", "")
                                                # reinitialise leaves
                                                LeafIndices = []
                                                j = k + 1
                                                match = True
                                                breaker = True
                                                # raw_input()
                                                break
                                else:
                                    # otherwise go on
                                    j += 1
                                if breaker:
                                    break
                                if match:
                                    break
                            if j >= len(tree.leaves()):
                                j = 0
                                i += 1
                            if match:
                                break

                    # could not match word! try mistranscriptions first:
                    if not match:
                        if not mistranscribe:  # one final stab at matching!
                            mistranscribe = True
                            for pair in possibleMistranscription:
                                if pair[0] == wordtest:
                                    wordtest = pair[1]
                                    if len(wordTreeMap) > 0:
                                        if len(wordTreeMap[-1][1]) > 0:
                                            i = wordTreeMap[-1][1][-1][0]
                                            j = wordTreeMap[-1][1][-1][1]
                                        else:
                                            # go back to beginning of
                                            # tree search
                                            i = 0
                                            j = 0
                                    else:
                                        i = 0  # go back to beginning
                                        j = 0
                                    break  # matched
                        elif continued:
                            # possible lack of matching up of words in
                            # previous utterance same caller and same
                            # tree// not always within same tree!!
                            errormessage = "Possible bad start for \
                            CONTINUED UTT ''"                                              + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + "\n " + utt.caller + \
                                "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print errormessage
                            if not errorLog is None:
                                errorLog.write(errormessage + "\n")
                            # raw_input()
                            if backtrack == 1:
                                backtrack += 1
                            elif backtrack == 2:
                                # i.e. we've done two loops and
                                # still haven't found it, try the other way
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = [lastTrees[-1]] + origtrees[1:]
                                # print "\nSECOND PASS(2)replacing \
                                # first one to lasttreemap's:"
                                # print words
                                # print trees
                                backtrack += 1
                                # mistranscribe = False #TODO perhaps needed
                                wordTreeMap = []
                                # switch to forward track this is
                                # the only time we want to try
                                # from the previous mapped leaf in the
                                # other tree
                                foundTreemap = False
                                for t in range(len(lastTreeMap) - 1, -1, -1):
                                    # backwards iteration through words
                                    # print lastTreeMap[t][1]
                                    if len(lastTreeMap[t][1]) > 0:
                                        # print "last treemapping of last \
                                        # caller utterance = " + \
                                        # str(lastTreeMap[t][1][-1])
                                        j = lastTreeMap[t][1][-1][1] + 1
                                        foundTreemap = True
                                        # print "found last mapping, j = " \
                                        #+ str(j)
                                        # raw_input()
                                        # break when last tree
                                        # mapped word from this caller is found
                                        break
                                    if not foundTreemap:
                                        # print "NO matched last TREEMAP found\
                                        # for previous Utt Same Speaker of " + \
                                        # str(utt.swda_filename) + " " + \
                                        # utt.caller + "." +  \
                                        # str(utt.utterance_index) + "." +\
                                        #  str(utt.subutterance_index)
                                        j = 0
                                        # for tmap in wordTreeMapList.keys():
                                        #    print tmap
                                        #    print wordTreeMapList[tmap]
                                        # raw_input()
                                i = 0  # go back to first tree
                                continue
                            elif forwardtrack == 1:
                                forwardtrack += 1
                            elif forwardtrack == 2:
                                count = count - (count - lastIndexes[-1][0])
                                utt = previousUttSame
                                words = utt.text_words()
                                word = words[0]
                                mytrees = []
                                for i in range(0, len(lastTrees) - 1):
                                    mytrees.append(lastTrees[i])
                                trees = mytrees + [origtrees[0]]
                                # print "\nSECOND PASS(1)backtrack to \
                                # with new trees:"
                                # print utt.transcript_index
                                # print words
                                # print trees
                                forwardtrack += 1
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            elif forwardtrack == 3 or backtrack == 3:
                                # if this hasn't worked reset to old trees
                                # print "trying final reset"
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = origtrees
                                forwardtrack = 0
                                backtrack = 0
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            else:
                                pass
                                # print "resetting search"
                                # raw_input()
                            # unless forward tracking now,
                            # just go back to beginning
                            i = 0  # go back to beginning of tree search
                            j = 0
                        else:
                            mistranscribe = False
                            LeafIndices = []
                            wordTreeMap.append((word, LeafIndices))
                            errormessage = "WARNING: 440 no/partial tree \
                            mapping for ''"                                            + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + " \n" + utt.caller\
                                + "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print utt.text_words()
                            del words[0]  # remove word
                            # for trip in wordTreeMap:
                            #    print "t",trip
                            if len(words) > 0:
                                word = words[0]
                                wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "",
                                                  word)
                                wordtest = wordtest.replace("(", "")
                                wordtest = wordtest.replace(")", "")
                            # print errormessage
                            if errorLog:
                                errorLog.write("possible wrong tree mapping:" +
                                               errormessage + "\n")
                            raw_input()
                # end of while loop (words)
                mytreenumbers = []
                for treemap in trees:
                    # the whole list but the tree
                    mytreenumbers.append(treemap[:-1])
                if not len(utt.text_words()) == len(wordTreeMap):
                    print "ERROR. uneven lengths!"
                    print utt.text_words()
                    print wordTreeMap
                    print trans.swda_filename
                    print utt.transcript_index
                    raw_input()
                    count += 1
                    continue
                # add the treemap
                wordTreeMapList.append(trans.conversation_no,
                                       utt.transcript_index,
                                       tuple(mytreenumbers),
                                       tuple(wordTreeMap))
                count += 1
            # rewrite after each transcript
            filedict = defaultdict(str)
            for key in wordTreeMapList.keys():
                csv_string = '"' + str(list(wordTreeMapList[key])) + '"'
                mytreenumbers = wordTreeMapList[key].transcript_numbers
                myptbnumbers = wordTreeMapList[key].treebank_numbers
                tree_list_string = '"'
                for i in range(0, len(mytreenumbers)):
                    treemap = [myptbnumbers[i]] + mytreenumbers[i]
                    tree_list_string += str(treemap) + ";"
                tree_list_string = tree_list_string[:-1] + '"'
                filename = '"' + key[0:key.rfind(':')] + '"'
                transindex = key[key.rfind(':') + 1:]
                filedict[int(transindex)] = filename \
                    + "\t" + transindex + '\t' + csv_string + "\t" \
                    + tree_list_string + "\n"
            for key in sorted(filedict.keys()):
                corpus_file.write(filedict[key])

            wordTreeMapList = TreeMapCorpus(False, errorLog)  # reset each time
        print "\n" + str(incorrectTrees) + " incorrect trees"
        corpus_file.close()
        if not errorLog is None:
            errorLog.close()
示例#31
0
remove_file(data_dir, train_set_file, utterance_only_flag)
remove_file(data_dir, test_set_file, utterance_only_flag)
remove_file(data_dir, val_set_file, utterance_only_flag)
remove_file(data_dir, dev_set_file, utterance_only_flag)

# Create a temporary directory and unzip the archived data
with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir:
    print('Created temporary directory', tmp_dir)

    zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'swda_archive.zip'),
                               'r')
    zip_file.extractall(tmp_dir)
    zip_file.close()

    # Corpus object for iterating over the whole corpus in .csv format
    corpus = CorpusReader(tmp_dir)

    # Process each transcript
    for transcript in corpus.iter_transcripts(display_progress=False):

        # Process the utterances and create a dialogue object
        dialogue = process_transcript(transcript, excluded_tags,
                                      excluded_chars)

        # Append all utterances to full_set text file
        dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                         utterance_only_flag, 'a+')

        # Determine which set this dialogue belongs to (training, test or validation)
        set_dir = ''
        set_file = ''
示例#32
0
	def __init__(self):
		self.corpus = CorpusReader('swda')