# NOTE(review): whitespace-collapsed, TRUNCATED fragment — it ends at an
# `elif ... in val_split:` header with no suite, so it cannot be safely
# re-indented without guessing missing code; kept byte-identical below.
# Visible behavior: extract 'swda_archive.zip' into a TemporaryDirectory,
# iterate transcripts via CorpusReader, build a dialogue per transcript with
# process_transcript, append every dialogue to the full-set file, then route
# each dialogue to train/test (val branch cut off) by conversation_id.
# NOTE(review): `zip_file` is opened without a `with` block — presumably fine
# since `close()` is called, but a context manager would be safer; TODO confirm
# when the full file is restored.
with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir: print('Created temporary directory', tmp_dir) zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'swda_archive.zip'), 'r') zip_file.extractall(tmp_dir) zip_file.close() # Corpus object for iterating over the whole corpus in .csv format corpus = CorpusReader(tmp_dir) # Process each transcript for transcript in corpus.iter_transcripts(display_progress=False): # Process the utterances and create a dialogue object dialogue = process_transcript(transcript, excluded_tags, excluded_chars) # Append all utterances to full_set text file dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue, utterance_only_flag, 'a+') # Determine which set this dialogue belongs to (training, test or validation) set_dir = '' set_file = '' if dialogue.conversation_id in train_split: set_dir = 'train' set_file = train_set_file elif dialogue.conversation_id in test_split: set_dir = 'test' set_file = test_set_file elif dialogue.conversation_id in val_split:
# NOTE(review): whitespace-collapsed, TRUNCATED fragment (ends mid-`elif` with
# no suite); kept byte-identical — re-indenting would require inventing the
# missing val-split branch.
# Visible behavior: derive transcript_name from the filename, load the
# transcript plus the '.f.moves.xml' and '.g.moves.xml' files (presumably the
# two speakers' dialogue-act moves — TODO confirm against the corpus docs),
# build a dialogue with process_transcript, append it to the full-set file,
# then route by conversation_id to train/test (val branch cut off).
# NOTE(review): `transcript` is re-bound from the filename string to the loaded
# file contents — works, but shadowing makes the later code harder to follow.
# Get the id for this transcript transcript_name = str(transcript.split('.')[0]) # Get the transcript and moves files transcript = load_text_data(os.path.join(archive_dir, 'transcripts', transcript), verbose=False) moves_f = load_text_data(os.path.join(archive_dir, 'moves', transcript_name + '.f.moves.xml'), verbose=False) moves_g = load_text_data(os.path.join(archive_dir, 'moves', transcript_name + '.g.moves.xml'), verbose=False) # Process the utterances and create a dialogue object dialogue = process_transcript(transcript, moves_g, moves_f, excluded_chars, excluded_tags) # Append all utterances to full_set text file dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue, utterance_only_flag, 'a+') # Determine which set this dialogue belongs to (training, test or validation) set_dir = '' set_file = '' if dialogue.conversation_id in train_split: set_dir = 'train' set_file = train_set_file elif dialogue.conversation_id in test_split: set_dir = 'test' set_file = test_set_file elif dialogue.conversation_id in val_split:
# NOTE(review): whitespace-collapsed, TRUNCATED fragment (ends mid-`elif` with
# no suite); kept byte-identical to avoid guessing the missing branch body.
# Visible behavior: for each name in transcript_list, load the '.trans'
# transcript and the '.dadb' database file, build a dialogue with
# process_transcript (passing da_map — presumably a dialogue-act label mapping;
# TODO confirm), append it to the full-set file, then route by conversation_id
# to train/test (val branch cut off).
# NOTE(review): unlike the fragment above, the argument order here is
# (excluded_chars, excluded_tags) after the corpus-specific args — verify the
# two process_transcript signatures really differ per corpus script.
# Process each transcript for transcript in transcript_list: # Get the id for this transcript transcript_name = str(transcript.split('.')[0]) # Get the transcript and database file transcript = load_text_data(os.path.join(archive_dir, 'transcripts', transcript_name + '.trans'), verbose=False) database = load_text_data(os.path.join(archive_dir, 'database', transcript_name + '.dadb'), verbose=False) # Process the utterances and create a dialogue object dialogue = process_transcript(transcript, database, da_map, excluded_chars, excluded_tags) # Append all utterances to full_set_file text file dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue, utterance_only_flag, 'a+') # Determine which set this dialogue belongs to (training, test or validation) set_dir = '' set_file = '' if dialogue.conversation_id in train_split: set_dir = 'train' set_file = train_set_file elif dialogue.conversation_id in test_split: set_dir = 'test' set_file = test_set_file elif dialogue.conversation_id in val_split:
# NOTE(review): whitespace-collapsed, TRUNCATED fragment — it is cut off right
# after the test-split assignment (no val branch visible at all); kept
# byte-identical, since any reformatting would have to invent the missing tail.
# Visible behavior: load the '.ldial.xml' turn list plus per-speaker word
# ('.unit.xml') and dialogue-act ('.lturn.xml') files for speakers a and b
# (presumably the two conversation participants — TODO confirm corpus layout),
# build a dialogue with process_transcript, append it to the full-set file,
# then route by conversation_id to train/test.
# Get the turn list, words and DA files for both speakers turn_list = load_text_data( os.path.join(archive_dir, transcript + '.ldial.xml')) a_words_file = load_text_data( os.path.join(archive_dir, transcript + '.a.unit.xml')) a_da_file = load_text_data( os.path.join(archive_dir, transcript + '.a.lturn.xml')) b_words_file = load_text_data( os.path.join(archive_dir, transcript + '.b.unit.xml')) b_da_file = load_text_data( os.path.join(archive_dir, transcript + '.b.lturn.xml')) # Process the utterances and create a dialogue object dialogue = process_transcript(turn_list, a_words_file, a_da_file, b_words_file, b_da_file, excluded_chars, excluded_tags) # Append all utterances to full_set_file text file dialogue_to_file(os.path.join(data_dir, full_set_file), dialogue, utterance_only_flag, 'a+') # Determine which set this dialogue belongs to (training, test or validation) set_dir = '' set_file = '' if dialogue.conversation_id in train_split: set_dir = 'train' set_file = train_set_file elif dialogue.conversation_id in test_split: set_dir = 'test' set_file = test_set_file