# Extract the SWDA zip archive into a scratch directory and convert each
# transcript into the project's plain-text dataset files, routing every
# dialogue into the train/test/validation split it belongs to.
# NOTE(review): this fragment is truncated — the final `elif ... val_split:`
# branch has no body in this view; restore the missing lines before running.
with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir:
    print('Created temporary directory', tmp_dir)

    # Unpack the corpus archive into the temporary directory.
    # NOTE(review): prefer `with zipfile.ZipFile(...) as zip_file:` so the
    # handle is closed even if extractall raises — confirm before changing.
    zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'swda_archive.zip'),
                               'r')
    zip_file.extractall(tmp_dir)
    zip_file.close()

    # Corpus object for iterating over the whole corpus in .csv format
    corpus = CorpusReader(tmp_dir)

    # Process each transcript
    for transcript in corpus.iter_transcripts(display_progress=False):

        # Process the utterances and create a dialogue object
        # (process_transcript and the excluded_* collections are defined
        # elsewhere in this file — semantics not visible here)
        dialogue = process_transcript(transcript, excluded_tags,
                                      excluded_chars)

        # Append all utterances to full_set text file
        dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                         utterance_only_flag, 'a+')

        # Determine which set this dialogue belongs to (training, test or validation)
        set_dir = ''
        set_file = ''
        if dialogue.conversation_id in train_split:
            set_dir = 'train'
            set_file = train_set_file
        elif dialogue.conversation_id in test_split:
            set_dir = 'test'
            set_file = test_set_file
        elif dialogue.conversation_id in val_split:
            # NOTE(review): branch body missing from this view — presumably
            # sets set_dir = 'val' and set_file = val_set_file; verify against
            # the original source before use.
    # Body of a per-transcript loop (the enclosing `for` is outside this
    # view): load one transcript plus its follower/giver ".moves" XML files,
    # build a dialogue object, and route it into its dataset split.
    # NOTE(review): fragment is truncated — the final `elif ... val_split:`
    # branch has no body in this view.

    # Get the id for this transcript
    # (strips the file extension; assumes no '.' elsewhere in the name —
    # TODO confirm)
    transcript_name = str(transcript.split('.')[0])

    # Get the transcript and moves files
    # (rebinds `transcript` from filename string to loaded file contents)
    transcript = load_text_data(os.path.join(archive_dir, 'transcripts',
                                             transcript),
                                verbose=False)
    # Follower ('f') and giver ('g') move annotations for the same dialogue
    moves_f = load_text_data(os.path.join(archive_dir, 'moves',
                                          transcript_name + '.f.moves.xml'),
                             verbose=False)
    moves_g = load_text_data(os.path.join(archive_dir, 'moves',
                                          transcript_name + '.g.moves.xml'),
                             verbose=False)

    # Process the utterances and create a dialogue object
    dialogue = process_transcript(transcript, moves_g, moves_f, excluded_chars,
                                  excluded_tags)

    # Append all utterances to full_set text file
    dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                     utterance_only_flag, 'a+')

    # Determine which set this dialogue belongs to (training, test or validation)
    set_dir = ''
    set_file = ''
    if dialogue.conversation_id in train_split:
        set_dir = 'train'
        set_file = train_set_file
    elif dialogue.conversation_id in test_split:
        set_dir = 'test'
        set_file = test_set_file
    elif dialogue.conversation_id in val_split:
        # NOTE(review): branch body missing from this view — presumably sets
        # set_dir = 'val' and set_file = val_set_file; verify against source.
# ---- Example #3 (scraped-snippet separator; original marker: "示例#3" / "0") ----
# For each transcript name in transcript_list: load the '.trans' transcript
# and its '.dadb' dialogue-act database, build a dialogue object, and route
# it into its dataset split.
# NOTE(review): fragment is truncated — the final `elif ... val_split:`
# branch has no body in this view; restore before running.
# Process each transcript
for transcript in transcript_list:

    # Get the id for this transcript
    # (strips the file extension; assumes no '.' elsewhere in the name —
    # TODO confirm)
    transcript_name = str(transcript.split('.')[0])

    # Get the transcript and database file
    # (rebinds `transcript` from filename string to loaded file contents)
    transcript = load_text_data(os.path.join(archive_dir, 'transcripts',
                                             transcript_name + '.trans'),
                                verbose=False)
    database = load_text_data(os.path.join(archive_dir, 'database',
                                           transcript_name + '.dadb'),
                              verbose=False)

    # Process the utterances and create a dialogue object
    # (da_map presumably maps raw dialogue-act labels to a canonical tag set
    # — defined elsewhere in this file; verify)
    dialogue = process_transcript(transcript, database, da_map, excluded_chars,
                                  excluded_tags)

    # Append all utterances to full_set_file text file
    dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                     utterance_only_flag, 'a+')

    # Determine which set this dialogue belongs to (training, test or validation)
    set_dir = ''
    set_file = ''
    if dialogue.conversation_id in train_split:
        set_dir = 'train'
        set_file = train_set_file
    elif dialogue.conversation_id in test_split:
        set_dir = 'test'
        set_file = test_set_file
    elif dialogue.conversation_id in val_split:
        # NOTE(review): branch body missing from this view — presumably sets
        # set_dir = 'val' and set_file = val_set_file; verify against source.
# ---- Example #4 (scraped-snippet separator; original marker: "示例#4" / "0") ----
    # Body of a per-transcript loop (the enclosing `for` is outside this
    # view): load the turn list plus per-speaker word-unit and dialogue-act
    # XML files for speakers 'a' and 'b', build a dialogue object, and route
    # it into its dataset split.
    # NOTE(review): fragment is truncated — it ends immediately after the
    # `elif ... test_split:` branch; any validation-split handling is missing
    # from this view.

    # Get the turn list, words and DA files for both speakers
    turn_list = load_text_data(
        os.path.join(archive_dir, transcript + '.ldial.xml'))
    a_words_file = load_text_data(
        os.path.join(archive_dir, transcript + '.a.unit.xml'))
    a_da_file = load_text_data(
        os.path.join(archive_dir, transcript + '.a.lturn.xml'))
    b_words_file = load_text_data(
        os.path.join(archive_dir, transcript + '.b.unit.xml'))
    b_da_file = load_text_data(
        os.path.join(archive_dir, transcript + '.b.lturn.xml'))

    # Process the utterances and create a dialogue object
    dialogue = process_transcript(turn_list, a_words_file, a_da_file,
                                  b_words_file, b_da_file, excluded_chars,
                                  excluded_tags)

    # Append all utterances to full_set_file text file
    dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue,
                     utterance_only_flag, 'a+')

    # Determine which set this dialogue belongs to (training, test or validation)
    set_dir = ''
    set_file = ''
    if dialogue.conversation_id in train_split:
        set_dir = 'train'
        set_file = train_set_file
    elif dialogue.conversation_id in test_split:
        set_dir = 'test'
        set_file = test_set_file