class MeetingRecoder(AbstractDataset):

    class Utterance:
        def __init__(self, utterance):
            self.id = utterance.utterance_id
            ## mapping between DAMSL and the tagset used in SWDA
            self.da_tag = DAMSL_TAGSET[utterance.da_tag.strip()] - 1  # index for DAMSL starts from 1
            self.speaker = utterance.speaker
            self.tokens = utterance.original_text
            self.length = len(self.tokens)

    class Dialogue:
        def __init__(self, transcript):
            self.conversation_no = transcript.conversation_id
            self.conversation_length = len(transcript.utterances)
            self.utterances = []
            for utterance in transcript.utterances:
                ## only consider the data subset that can be tagged with MRDA DAMSL tags
                if utterance.da_tag in MRDA_DAMSL_MAP and \
                        MRDA_DAMSL_MAP[utterance.da_tag] != "##" and \
                        MRDA_DAMSL_MAP[utterance.da_tag] in DAMSL_TAGSET:
                    self.utterances.append(MeetingRecoder.Utterance(utterance))

    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = mrda.CorpusReader(dataset_path)
        # train/test splits are standard
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)
        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 20:
                break
            dataset.append(MeetingRecoder.Dialogue(transcript))
        # TODO: exact test/dev split for MRDA; ideally do cross-validation
        if args.truncate_dataset:
            self.train_dataset = dataset[:10]
            self.valid_dataset = dataset[10:15]
            self.test_dataset = dataset[15:20]
        else:
            self.train_dataset = dataset[:45]
            self.valid_dataset = dataset[45:60]
            self.test_dataset = dataset[60:]
        ## create vocabulary from training data (UNKs during test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)
        if args.limit_vocabulary:
            self.vocabulary.truncate()
        ## create character vocabulary
        self.vocabulary.get_character_vocab()
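# The filter in MeetingRecoder.Dialogue.__init__ assumes two lookup tables defined
# elsewhere in this repo: MRDA_DAMSL_MAP, mapping raw MRDA act tags to SWBD-DAMSL
# tags (with "##" marking tags that have no DAMSL counterpart), and DAMSL_TAGSET,
# mapping DAMSL tags to 1-based indices (hence the "- 1" above). A purely
# illustrative sketch of their shape -- these are NOT the project's actual tables:
#
#   MRDA_DAMSL_MAP = {"s": "sd", "qy": "qy", "fg": "##"}
#   DAMSL_TAGSET = {"sd": 1, "sv": 2, "qy": 3}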
class CallHomeEnglish(AbstractDataset):

    class Utterance:
        def __init__(self, id, utterance):
            self.name = "call_home_eng"
            self.id = utterance.utterance_id
            self.speaker = utterance.speaker
            self.tokens = utterance.tokens
            self.length = len(self.tokens)
            self.start_time = utterance.start_time
            self.end_time = utterance.end_time

    class Dialogue:
        def __init__(self, transcript):
            self.id = transcript.conversation_no
            self.utterances = []
            for id, utterance in enumerate(transcript.utterances):
                self.utterances.append(CallHomeEnglish.Utterance(id, utterance))
            self.length = len(self.utterances)

    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = call.CorpusReader(dataset_path)
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = 0
        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(CallHomeEnglish.Dialogue(transcript))
            self.total_length += 1
        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## depending on the task (in args) you can choose to return only the subset annotated for DA
            self.train_dataset = dataset[:140]
            self.valid_dataset = dataset[141:155]
            self.test_dataset = dataset[156:]
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)
        if args.limit_vocabulary:
            self.vocabulary.truncate()
        ## create character vocabulary
        self.vocabulary.get_character_vocab()
class AmericanMeetingCorpus(AbstractDataset):

    class Utterance:
        def __init__(self, id, utterance):
            self.name = "ami"
            self.id = utterance.utterance_id
            self.label = utterance.dialogue_act
            self.speaker = utterance.speaker
            self.tokens = utterance.tokens
            self.length = len(self.tokens)
            self.start_time = utterance.start_time
            self.end_time = utterance.end_time

    class Dialogue:
        def __init__(self, transcript):
            self.id = transcript.conversation_no
            self.utterances = []
            for id, utterance in enumerate(transcript.utterances):
                self.utterances.append(AmericanMeetingCorpus.Utterance(id, utterance))
            self.length = len(self.utterances)

    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = ami.CorpusReader(dataset_path)
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(AMI_DIALOGUE_TAGSET)
        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(AmericanMeetingCorpus.Dialogue(transcript))
        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## depending on the task (in args) you can choose to return only the subset annotated for DA
            self.train_dataset = []
            self.valid_dataset = []
            self.test_dataset = []
            for dialogue in dataset:
                if dialogue.id[:-1] in TRAIN_SPLIT:
                    self.train_dataset.append(dialogue)
                elif dialogue.id[:-1] in DEV_SPLIT:
                    self.valid_dataset.append(dialogue)
                elif dialogue.id[:-1] in TEST_SPLIT:
                    self.test_dataset.append(dialogue)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)
        if args.limit_vocabulary:
            self.vocabulary.truncate()
        ## create character vocabulary
        self.vocabulary.get_character_vocab()
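# The AMI split above assumes TRAIN_SPLIT / DEV_SPLIT / TEST_SPLIT are collections
# of meeting IDs with the session letter stripped, since dialogue.id[:-1] drops the
# final character (e.g. "ES2002a" becomes "ES2002"). A purely illustrative sketch
# of their shape -- these are NOT the project's actual split lists:
#
#   TRAIN_SPLIT = {"ES2002", "ES2005", "IS1000"}
#   DEV_SPLIT = {"ES2011", "IS1008"}
#   TEST_SPLIT = {"ES2004", "TS3003"}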
class SwitchBoard(AbstractDataset):

    class Utterance:
        ## minimum elements all datasets must have: id, length, tokens
        def __init__(self, id, utterance):
            self.name = "swda"
            self.index = utterance.utterance_index
            self.id = id
            self.label = DAMSL_TAGSET[utterance.damsl_act_tag().strip()]  # index for DAMSL starts from 1
            self.speaker = utterance.caller
            # TODO: clean text before processing
            self.tokens = utterance.text_words()
            self.length = len(self.tokens)
            self.pos = utterance.regularize_pos_lemmas()

    class Dialogue:
        ## minimum elements all datasets must have: id, length, utterances
        def __init__(self, transcript):
            self.id = transcript.conversation_no
            ## length of the transcript is not the same as the number of utterances
            self.length = transcript.length
            self.conversation_topic = transcript.topic_description
            self.utterances = []
            for id, utterance in enumerate(transcript.utterances):
                self.utterances.append(SwitchBoard.Utterance(id, utterance))

    def __init__(self, args, dataset_path):
        corpus = swda.CorpusReader(dataset_path)
        self.name = type(self).__name__
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)
        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(SwitchBoard.Dialogue(transcript))
        random.shuffle(dataset)  # shuffles in place; random.shuffle returns None
        ## 1155 transcribed conversations; 1115 / 19 / 21 split
        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## TODO: this split adheres to the numbers reported by Shriberg et al., but ideally cross-validation should be done
            self.train_dataset = dataset[:1115]
            self.valid_dataset = dataset[1115:1134]
            self.test_dataset = dataset[1134:]
        ## create vocabulary from training data (UNKs during test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)
        if args.limit_vocabulary:
            self.vocabulary.truncate()
        ## create character vocabulary
        self.vocabulary.get_character_vocab()
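# Minimal usage sketch. All four loaders share the constructor signature
# (args, dataset_path) and expose train_dataset / valid_dataset / test_dataset plus
# a shared Vocabulary. The argparse flags and the corpus path below are placeholder
# assumptions for illustration; only `truncate_dataset` and `limit_vocabulary` are
# actually read by the constructors above.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--truncate_dataset", action="store_true",
                        help="load only a small prefix of the corpus for quick runs")
    parser.add_argument("--limit_vocabulary", action="store_true",
                        help="truncate the vocabulary built from the training split")
    cli_args = parser.parse_args()

    # Hypothetical corpus directory; point this at the actual SwDA download.
    swda_dataset = SwitchBoard(cli_args, "data/swda")
    print(swda_dataset.name, swda_dataset.label_set_size)
    print(len(swda_dataset.train_dataset),
          len(swda_dataset.valid_dataset),
          len(swda_dataset.test_dataset))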