def load_swda_corpus_data(swda_directory):
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = set()
    tag_occurances = {}

    for transcript in corpus_reader.iter_transcripts(False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append(utterance.text_words(True))
            tag = utterance.damsl_act_tag()
            conversation_tags.append(tag)
            if tag not in tags_seen:
                tags_seen.add(tag)
                tag_occurances[tag] = 1
            else:
                tag_occurances[tag] += 1
        talks.append((conversation_content, conversation_tags))

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    # Replace each tag string with its integer index (sorted for determinism).
    tag_indices = {tag: i for i, tag in enumerate(sorted(list(tags_seen)))}
    for talk in talks:
        talk_tags = talk[1]
        for i, tag in enumerate(talk_tags):
            talk_tags[i] = tag_indices[tag]

    print('Loaded SwDA Corpus.')
    return talks, talk_names, tag_indices, tag_occurances
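# A minimal usage sketch for the loader above, assuming the `swda` package
# from https://github.com/cgpotts/swda is on the path and the corpus CSVs
# sit under 'swda/swda':
from swda import CorpusReader  # dependency used by load_swda_corpus_data

talks, talk_names, tag_indices, tag_occurances = load_swda_corpus_data('swda/swda')
print('{} conversations, {} distinct DAMSL tags'.format(len(talks), len(tag_indices)))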
def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
    # requires: import os, re; from sklearn.model_selection import train_test_split
    self.corpus = CorpusReader(data_dir)
    self.data_dir = data_dir
    self.tokenizer = tokenizer
    self.word2id = word2id
    self.task = task

    self.utt_num = 0
    for utt in self.corpus.iter_utterances():
        self.utt_num += 1

    self.trans_num = 0
    for trans in self.corpus.iter_transcripts():
        self.trans_num += 1

    self.da2num = switchboard_da_mapping()

    # CAUTION: make sure that for each task the seed is the same, s.t. the splits will be the same!
    train_ixs, val_ixs = train_test_split(range(self.trans_num), shuffle=True,
                                          train_size=0.8, random_state=seed)
    val_ixs, test_ixs = train_test_split(val_ixs, shuffle=True,
                                         train_size=0.5, random_state=seed)
    self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs

    self.utt_da_pairs = []
    prev_da = "%"
    for i, utt in enumerate(self.corpus.iter_utterances()):
        sentence = re.sub(r"([+/\}\[\]]|\{\w)", "", utt.text)
        sentence = self.word2id(self.tokenizer(sentence))
        act = utt.damsl_act_tag()
        if act is None:
            act = "%"
        if act == "+":
            act = prev_da
        _, swda_name = os.path.split(utt.swda_filename)
        swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name
        ix = utt.utterance_index
        self.utt_da_pairs.append((sentence, act, swda_name, ix))
        prev_da = act  # remember the act so a "+" (continuation) tag inherits it
def load_swda_corpus_data(swda_directory):
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = {}
    tag_occurances = {}
    num_tags_seen = 0
    X = []
    Y = []

    for transcript in corpus_reader.iter_transcripts(False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append(utterance.text_words(True))
            tag = utterance.damsl_act_tag()
            conversation_tags.append(tag)
            if tag not in tags_seen:
                tags_seen[tag] = num_tags_seen
                num_tags_seen += 1
                tag_occurances[tag] = 1
            else:
                tag_occurances[tag] += 1
        talks.append((conversation_content, conversation_tags))
        X.append(conversation_content)
        Y.append(conversation_tags)

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    # Replace each tag string with the integer index assigned on first sight.
    for talk in talks:
        conversation_tags = talk[1]
        for i in range(len(conversation_tags)):
            conversation_tags[i] = tags_seen[conversation_tags[i]]

    print('Loaded SwDA Corpus.')
    return X, Y, talks, talk_names, tags_seen, tag_occurances
class SwitchboardConverter:
    def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
        self.corpus = CorpusReader(data_dir)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.task = task

        # self.utt_num = 0
        # for utt in self.corpus.iter_utterances():
        #     self.utt_num += 1
        self.trans_num = 1155
        self.deleted_utterances = 0
        self.deleted_tokens = 0
        self.in_tokens = 0
        # for trans in self.corpus.iter_transcripts():
        #     self.trans_num += 1
        self.da2num = switchboard_da_mapping()
        self.stopwords = get_stopwords(data_dir, word2id)

        # CAUTION: make sure that for each task the seed is the same, s.t. the splits will be the same!
        train_ixs, val_ixs = train_test_split(range(self.trans_num), shuffle=True,
                                              train_size=0.8, random_state=seed)
        val_ixs, test_ixs = train_test_split(val_ixs, shuffle=True,
                                             train_size=0.5, random_state=seed)
        self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs

        self.utt_da_pairs = []
        prev_da = "%"
        for i, utt in enumerate(self.corpus.iter_utterances()):
            sentence = self.clean_utt(utt.text)
            if not sentence:
                continue
            self.in_tokens += len(sentence)
            sentence = self.word2id(sentence)
            if len(sentence) == 0:
                continue
            if not self.utt_acceptable(sentence):
                continue
            act = utt.damsl_act_tag()
            if act is None:
                act = "%"
            if act == "+":
                act = prev_da
            _, swda_name = os.path.split(utt.swda_filename)
            swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name
            ix = utt.utterance_index
            self.utt_da_pairs.append((sentence, act, swda_name, ix))
            prev_da = act  # remember the act so a "+" (continuation) tag inherits it

    def clean_utt(self, utterance):
        utterance, cnt = re.subn(r"([+/\}\[\],\-\(\)#]|\{\w)", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"<+.*>+", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\*\w+", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r">[\s\w'?]+$", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\*.+$", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\^\w+$", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"^uh+$", "", utterance); self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"(uh+)", "", utterance); self.deleted_tokens += cnt
        ml = re.search(r"(^\s*\.\s*$)|>", utterance)
        if ml:
            self.deleted_utterances += 1
            return None
        utterance = [w.lower() for w in utterance.split(" ")
                     if len(w) > 0 and not re.search("[Uu][Hh]+", w)]
        return utterance

    def utt_acceptable(self, utt):
        # check whether an utterance is acceptable for perturbation
        if not utt:
            return False
        stop_cnt = 0
        for w in utt:
            if w in self.stopwords:
                stop_cnt += 1
        if (len(utt) - stop_cnt >= 4):  # (float(stop_cnt) / float(len(utt))) < 0.999 and
            return True
        return False

    def draw_rand_sent(self):
        r = random.randint(0, len(self.utt_da_pairs) - 1)
        return self.utt_da_pairs[r]

    def create_vocab(self):
        print("Creating Vocab file for Switchboard")
        cnt = Counter()
        for utt in self.corpus.iter_utterances():
            sentence = re.sub(r"([+/\}\[\]]|\{\w)", "", utt.text)
            sentence = self.tokenizer(sentence)
            for w in sentence:
                cnt[w] += 1
        itos_file = os.path.join(self.data_dir, "itos.txt")
        itosf = open(itos_file, "w")
        for (word, _) in cnt.most_common(25000):
            itosf.write("{}\n".format(word))
        itosf.close()

    # getKeysByValue
    def swda_permute(self, sents, amount, speaker_ixs):
        if amount == 0:
            return []
        permutations = [list(range(len(sents)))]
        segment_permutations = []
        amount = min(amount, factorial(len(sents)) - 1)
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))
        for i in range(amount):
            while True:
                permutation = []
                segm_perm = np.random.permutation(len(segments))
                segment_permutations.append(segm_perm)
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs
                if permutation not in permutations:
                    break
            permutations.append(permutation)
        # the first one is the original, which was included s.t. it won't be generated
        return permutations[1:], segment_permutations

    def speaker_segment_ixs(self, speaker_ixs):
        i = 0
        segment_indices = dict()
        prev_speaker = speaker_ixs[0]
        for j, speaker in enumerate(speaker_ixs):
            if speaker != prev_speaker:
                prev_speaker = speaker
                i += 1
            segment_indices[j] = i
        return segment_indices

    def swda_half_perturb(self, amount, speaker_ixs):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))
        segment_permutations = []
        permutations = [list(segm_ixs.keys())]
        for _ in range(amount):
            speaker = random.randint(0, 1)  # choose one of the speakers
            speaker_to_perm = list(filter(lambda x: (x - speaker) % 2 == 0, segments))
            speaker_orig = list(filter(lambda x: (x - speaker) % 2 != 0, segments))
            # TODO: rename either speaker_ix or speaker_ixs; they are different things, but the names are too close
            if len(speaker_to_perm) < 2:
                return []
            while True:
                permuted_speaker_ix = np.random.permutation(speaker_to_perm).tolist()
                new_segments = [None] * (len(speaker_orig) + len(permuted_speaker_ix))
                if speaker == 0:
                    new_segments[::2] = permuted_speaker_ix
                    new_segments[1::2] = speaker_orig
                else:
                    new_segments[1::2] = permuted_speaker_ix
                    new_segments[::2] = speaker_orig
                segment_permutations.append(new_segments)
                permutation = []
                for segm_ix in new_segments:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs
                if permutation not in permutations:
                    permutations.append(permutation)
                    break
        return permutations, segment_permutations

    def swda_utterance_insertion(self, speaker_ixs, amounts):
        segment_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segment_ixs.values()))
        segment_permutations = []
        permutations = []
        i = 0
        for _ in range(amounts):
            while True:  # actually: do ... while permutation not in permutations
                i_from = random.randint(0, len(segments) - 1)
                i_to = random.randint(0, len(segments) - 2)
                segm_perm = deepcopy(segments)
                rem_elem = segments[i_from]
                segm_perm = segm_perm[0:i_from] + segm_perm[i_from + 1:]
                segm_perm = segm_perm[0:i_to] + [rem_elem] + segm_perm[i_to:]
                permutation = []
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segment_ixs, segm_ix))
                    permutation = permutation + utt_ixs
                if permutation not in permutations:
                    permutations.append(permutation)
                    segment_permutations.append(segm_perm)
                    break
        return permutations, segment_permutations

    def swda_utterance_sampling(self, utterances, acts, speaker_ixs, amount):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))
        permutations = []
        for i in range(amount):
            (sentence, act, swda_name, ix) = self.draw_rand_sent()
            # insert_ix = random.choice(range(len(utterances)))
            while True:
                insert_ix = np.random.choice(range(len(utterances)))
                utt = utterances[insert_ix]
                act_orig = acts[insert_ix]
                if self.utt_acceptable(utt) and act != act_orig:
                    break
            permutations.append((sentence, act, swda_name, ix, insert_ix))
        return permutations

    def convert_dset(self, amounts):
        # create distinct train/validation/test files; they'll correspond
        # to the splits created in the constructor
        train_output_file = os.path.join(self.data_dir, 'train', 'coherency_dset_{}.txt'.format(self.task))
        val_output_file = os.path.join(self.data_dir, 'validation', 'coherency_dset_{}.txt'.format(self.task))
        test_output_file = os.path.join(self.data_dir, 'test', 'coherency_dset_{}.txt'.format(self.task))
        if not os.path.exists(os.path.join(self.data_dir, 'train')):
            os.makedirs(os.path.join(self.data_dir, 'train'))
        if not os.path.exists(os.path.join(self.data_dir, 'validation')):
            os.makedirs(os.path.join(self.data_dir, 'validation'))
        if not os.path.exists(os.path.join(self.data_dir, 'test')):
            os.makedirs(os.path.join(self.data_dir, 'test'))
        trainfile = open(train_output_file, 'w')
        valfile = open(val_output_file, 'w')
        testfile = open(test_output_file, 'w')

        shuffled_path = os.path.join(self.data_dir, "shuffled_{}".format(self.task))
        if not os.path.isdir(shuffled_path):
            os.mkdir(shuffled_path)

        for i, trans in enumerate(tqdm(self.corpus.iter_transcripts(display_progress=False), total=1155)):
            utterances = []
            acts = []
            speaker_ixs = []
            prev_act = "%"
            for utt in trans.utterances:
                sentence = self.clean_utt(utt.text)
                sentence = self.word2id(sentence)
                # print(sentence, " ## DAs: ", utt.act_tag)
                utterances.append(sentence)
                act = utt.damsl_act_tag()
                if act is None:
                    act = "%"
                if act == "+":
                    act = prev_act
                acts.append(self.da2num[act])
                prev_act = act
                if "A" in utt.caller:
                    speaker_ixs.append(0)
                else:
                    speaker_ixs.append(1)

            if self.task == 'up':
                permuted_ixs, segment_perms = self.swda_permute(utterances, amounts, speaker_ixs)
            elif self.task == 'us':
                permuted_ixs = self.swda_utterance_sampling(utterances, acts, speaker_ixs, amounts)
            elif self.task == 'hup':
                permuted_ixs, segment_perms = self.swda_half_perturb(amounts, speaker_ixs)
            elif self.task == 'ui':
                permuted_ixs, segment_perms = self.swda_utterance_insertion(speaker_ixs, amounts)

            swda_fname = os.path.split(trans.swda_filename)[1]
            shuffle_file = os.path.join(shuffled_path, swda_fname)  # [:-4]
            with open(shuffle_file, "w") as f:
                # TODO: analogous to DD, write switchboard name into the file
                csv_writer = csv.writer(f)
                if self.task == 'us':
                    for perm in permuted_ixs:
                        (utt, da, name, ix, insert_ix) = perm
                        row = [name, ix, insert_ix]
                        csv_writer.writerow(row)
                else:
                    for perm in segment_perms:
                        csv_writer.writerow(perm)

            if self.task == 'us':
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    # (sentence, act, swda_name, ix, insert_ix)
                    insert_sent, insert_da, name, ix, insert_ix = p
                    insert_da = self.da2num[insert_da]
                    p_a = deepcopy(acts)
                    p_a[insert_ix] = insert_da
                    pa = " ".join([str(x) for x in p_a])
                    p_u = deepcopy(utterances)
                    p_u[insert_ix] = insert_sent
                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, pa, p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1", pa, p_u, a, u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, pa, p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1", pa, p_u, a, u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, pa, p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1", pa, p_u, a, u))
            else:
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    pa = [acts[i] for i in p]
                    p_a = " ".join([str(x) for x in pa])
                    pu = [utterances[i] for i in p]
                    p_u = str(pu)
                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, p_a, p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1", p_a, p_u, a, u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, p_a, p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1", p_a, p_u, a, u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0", a, u, p_a, p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1", p_a, p_u, a, u))

        trainfile.close()
        valfile.close()
        testfile.close()
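# A minimal driver sketch for the converter above. The tokenizer and word2id
# passed here are illustrative stand-ins (an identity mapping), not the
# project's real vocabulary pipeline; switchboard_da_mapping and get_stopwords
# are assumed to be importable from the same project.
converter = SwitchboardConverter('swda/swda',
                                 tokenizer=lambda s: s.split(),
                                 word2id=lambda ws: ws,  # identity: keep word strings
                                 task='up', seed=42)
converter.convert_dset(amounts=1)  # one utterance permutation per dialogue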
# similarity_processor = BertSimilarity()

QUESTIONS = ["qy", "qw", "qo", "qr"]


def get_similarity(sentence1, sentence2):
    print(sentence1, sentence2)
    return 1  # similarity_processor.get_similarity(sentence1, sentence2)


def is_question(utterance):
    return utterance.damsl_act_tag() in QUESTIONS


if __name__ == '__main__':
    data_dir = "swda/swda"
    scan_range = 5
    cr = CorpusReader(data_dir)
    for dialog in cr.iter_transcripts(display_progress=True):
        for index, utterance in enumerate(dialog.utterances):
            # skip the edges so that index +/- scan_range stays in bounds
            if index < scan_range or index >= len(dialog.utterances) - scan_range:
                continue
            if is_question(utterance):
                for i in range(-scan_range, scan_range):
                    print(get_similarity(utterance.text,
                                         dialog.utterances[index + i].text))
    # pyplot.bar(range(scan_range*2), aggregate)
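# Since BertSimilarity is commented out, the script above always prints 1.
# A cheap stand-in (an assumption, not the project's method) that makes the
# scan produce real numbers is a character-overlap ratio from the standard
# library:
from difflib import SequenceMatcher


def get_similarity(sentence1, sentence2):
    # Stand-in for the commented-out BertSimilarity: a 0..1 character-level
    # overlap ratio, not a semantic similarity.
    return SequenceMatcher(None, sentence1, sentence2).ratio()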
    'ng', 'ny', 'qw^d', 'bd', 'qy^d', 'bf', 'ft', 'ba', 'bh', 'bk', 'fa',
    'fc', 'br', 'qh', 'oo', 'b', 'qw', 'qy', 'h', 't3', 'o', 't1', '^h',
    'aap', '^q', 'x', 'sd', '^2', 'qo', '^g'
]

convert = {
    '+': 'sd',
    'fo_o_fw_"_by_bc': 'sd',
    'oo_co_cc': 'sd',
    'arp_nd': 'no',
    'aap_am': 'sd'
}

# assuming SWDA corpus installed in path-to-project/swda
# url of repo is https://github.com/cgpotts/swda
# preprocessor script for this model is in https://github.com/miyamotost/swda
corpus = CorpusReader('swda/swda')

with open('dataset/swda_datset_training.txt', mode='a') as f1, \
     open('dataset/swda_datset_test.txt', mode='a') as f2:
    for i, trans in enumerate(corpus.iter_transcripts(display_progress=False)):
        speakerids = [pad, pad, pad, pad]
        utts = [pad, pad, pad, pad]
        labels = [pad, pad, pad, pad]
        print('iter: {}'.format(i + 1))
        #
        # speakerid   : utt.caller_no
        # main_topics : trans.topic_description  -- handled as "PAD" for now (ignored)
        # pos         : utt.act_tag              -- handled as "PAD" for now (ignored)
        # utt         : utt.text
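# A small sketch of how the `convert` table above can be applied before label
# lookup. `tag_list` stands for the (truncated) tag list this snippet opens
# with, and the 'sd' fallback for unknown tags is an assumption, not taken
# from the original script.
def normalize_tag(tag, tag_list, convert):
    # Remap merged/rare DAMSL tags (e.g. 'oo_co_cc' -> 'sd') before lookup.
    tag = convert.get(tag, tag)
    return tag if tag in tag_list else 'sd'  # assumed fallback label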
            pos = pos_list[k][1]
            add_token(language, sentence, word, lemma, pos, i, j, k + 1)

    tree = etree.ElementTree(text)
    tree.write(out_file, pretty_print=True, xml_declaration=True, encoding='utf-8')


def add_token(language, sentence, token, lemma, pos, i, j, k):
    """Converts a CONLL-U token to an OPUS-xml 'w'-tag."""
    word = etree.SubElement(sentence, 'w')
    word.text = token
    word.set('id', 'w{}.{}.{}'.format(i, j, k))
    word.set('lem', lemma)
    word.set('tree', pos)


if __name__ == '__main__':
    corpus = CorpusReader('swda/swda')
    for transcript in corpus.iter_transcripts():
        out_file = os.path.splitext(os.path.basename(transcript.swda_filename))[0]
        out_path = 'swda-opus'
        os.makedirs(out_path, exist_ok=True)
        process_single('en', transcript, os.path.join(out_path, out_file + '.xml'))
import logging

from swda.swda import CorpusReader

corpus = CorpusReader('swda/swda')

train_set_idx = [
    'sw2005', 'sw2006', 'sw2008', 'sw2010', 'sw2012', 'sw2015', 'sw2018', 'sw2019',
    'sw2020', 'sw2022', 'sw2024', 'sw2025', 'sw2027', 'sw2028', 'sw2032', 'sw2035',
    'sw2038', 'sw2039', 'sw2040', 'sw2041', 'sw2051', 'sw2060', 'sw2061', 'sw2062',
    'sw2064', 'sw2065', 'sw2073', 'sw2078', 'sw2079', 'sw2085', 'sw2086', 'sw2090',
    'sw2092', 'sw2093', 'sw2094', 'sw2095', 'sw2101', 'sw2102', 'sw2104', 'sw2105',
    'sw2107', 'sw2109', 'sw2110', 'sw2111', 'sw2113', 'sw2120', 'sw2122', 'sw2124',
    'sw2125', 'sw2130', 'sw2137', 'sw2139', 'sw2145', 'sw2149', 'sw2154', 'sw2155',
    'sw2157', 'sw2168', 'sw2171', 'sw2177', 'sw2178', 'sw2180', 'sw2181', 'sw2184',
    'sw2185', 'sw2187', 'sw2190', 'sw2191', 'sw2197', 'sw2205', 'sw2220', 'sw2221',
    'sw2226', 'sw2227', 'sw2228', 'sw2231', 'sw2232', 'sw2234', 'sw2235', 'sw2237',
    'sw2241', 'sw2244', 'sw2247', 'sw2248', 'sw2249', 'sw2252', 'sw2259', 'sw2260',
    'sw2262', 'sw2263', 'sw2264', 'sw2265', 'sw2266', 'sw2268', 'sw2275', 'sw2278',
    'sw2279', 'sw2283', 'sw2285', 'sw2287', 'sw2290', 'sw2292', 'sw2293', 'sw2295',
    'sw2296', 'sw2300', 'sw2301', 'sw2302', 'sw2303', 'sw2304', 'sw2305', 'sw2308',
    'sw2309', 'sw2313', 'sw2314', 'sw2316', 'sw2323', 'sw2324', 'sw2325', 'sw2330',
    'sw2331', 'sw2334', 'sw2336', 'sw2339', 'sw2342', 'sw2344', 'sw2349', 'sw2353',
    'sw2354', 'sw2355', 'sw2362', 'sw2365', 'sw2366', 'sw2368', 'sw2370', 'sw2372',
    'sw2376', 'sw2379', 'sw2380', 'sw2382', 'sw2383', 'sw2386', 'sw2387', 'sw2389',
    'sw2393', 'sw2397', 'sw2405', 'sw2406', 'sw2407', 'sw2413', 'sw2418', 'sw2421',
    'sw2423', 'sw2424', 'sw2426', 'sw2427', 'sw2429', 'sw2431', 'sw2432', 'sw2433',
    'sw2435', 'sw2436', 'sw2437', 'sw2439', 'sw2442', 'sw2445', 'sw2446', 'sw2448',
    'sw2450', 'sw2451', 'sw2452', 'sw2457', 'sw2460', 'sw2465', 'sw2466', 'sw2467',
    'sw2469', 'sw2471', 'sw2472', 'sw2476', 'sw2477', 'sw2478',
def prep_swda():
    """
    Put the conversations into a json format that torchtext can read easily.
    Each "example" is a conversation comprised of a list of utterances and
    a list of dialogue act tags (each the same length).
    """
    log.info("Loading SWDA corpus.")
    if not os.path.isfile(SWDA_CORPUS_DIR):
        with zipfile.ZipFile("swda/swda.zip") as zip_ref:
            zip_ref.extractall('data')
    corpus = CorpusReader(SWDA_CORPUS_DIR)
    corpus = {t.conversation_no: t for t in corpus.iter_transcripts()}

    bert_vocab_file = BERT_VOCAB_FILE.format(BERT_MODEL)
    if not os.path.isfile(bert_vocab_file):
        log.info("Customizing BERT vocab.")
        customize_bert_vocab()
    log.info("Loading BERT vocab/tokenizer.")
    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_vocab_file, never_split=BERT_RESERVED_TOKENS + BERT_CUSTOM_TOKENS)

    log.info("Getting splits.")
    splits_file = SWDA_SPLITS.format('splits')
    if os.path.isfile(splits_file):  # use existing SWDA splits (for reproducibility purposes)
        with open(splits_file) as f:
            splits = json.load(f)
    else:  # save the splits file
        splits = gen_splits(list(corpus.keys()))
        with open(splits_file, 'w') as f:
            json.dump(splits, f)

    def words_to_ints(ws):
        maxvalue = max(vocab.values())
        for w in ws:
            if w not in vocab:
                maxvalue += 1
                vocab[w] = maxvalue
        xs = [vocab[x] for x in ws]
        return xs

    def tag_to_int(tag):
        maxvalue = max(tag_vocab.values()) if tag_vocab else -1
        if tag not in tag_vocab:
            maxvalue += 1
            tag_vocab[tag] = maxvalue
        return tag_vocab[tag]

    def extract_example(transcript):
        """Gets the parts we need from the SWDA utterance object."""
        tags, tags_ints, utts, utts_ints, utts_ints_bert, utts_ints_nl, utts_ints_bert_nl = \
            [], [], [], [], [], [], []
        for utt in transcript.utterances:
            # Regex tokenization
            words = "[SPKR_{}] ".format(utt.caller) + tokenize(utt.text.lower())
            words_nl = remove_laughters(remove_disfluencies(words))
            utts.append(words)
            utts_ints.append(words_to_ints(words.split()))
            utts_ints_nl.append(words_to_ints(words_nl.split()))
            # BERT wordpiece tokenization
            bert_text = "[CLS] [SPKR_{}] ".format(utt.caller) + utt.text
            bert_tokens = bert_tokenizer.tokenize(bert_text)  # list of strings
            utts_ints_bert.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens))
            bert_text_nl = remove_laughters(remove_disfluencies(bert_text))
            bert_tokens_nl = bert_tokenizer.tokenize(bert_text_nl)
            utts_ints_bert_nl.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens_nl))
            # dialogue act tags
            tag = damsl_tag_cluster(utt.act_tag)
            tags.append(tag)
            tags_ints.append(tag_to_int(tag))
        return {'id': transcript.conversation_no,
                'utts': utts,
                'utts_ints': utts_ints,
                'utts_ints_bert': utts_ints_bert,
                'tags': tags,
                'tags_ints': tags_ints,
                'utts_ints_bert_nl': utts_ints_bert_nl,
                'utts_ints_nl': utts_ints_nl}

    log.info("Extracting data and saving splits.")
    for split in splits:
        data = []
        for ex_id in tqdm(splits[split], desc=split):
            data.append(extract_example(corpus[ex_id]))
        with open(SWDA_SPLITS.format(split), 'w') as f:
            json.dump(data, f)
    log.info("Vocab size: {}".format(len(vocab)))
    with open(SWDA_SPLITS.format("vocab"), 'w') as f:
        json.dump(vocab, f)
    log.info("Tag vocab size: {}".format(len(tag_vocab)))
    with open(SWDA_SPLITS.format("tag_vocab"), 'w') as f:
        json.dump(tag_vocab, f)
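# A hedged driver sketch for prep_swda(). The module-level vocab/tag_vocab
# dicts are assumptions: words_to_ints calls max(vocab.values()), so vocab
# must be seeded with at least one entry before the first call; the entry
# names here are hypothetical.
vocab = {'<pad>': 0, '<unk>': 1}  # seeded so max(vocab.values()) is defined
tag_vocab = {}

if __name__ == '__main__':
    prep_swda()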