def bp_encode(text_in, bp_vocab, subword2idx):
    """Encode a piece of text as a flat list of sub-word tokens.

    The text is lower-cased and stripped of surrounding whitespace, split
    into word tokens with ``word_tokenizer``, and every word token is then
    passed to ``subword_tokenize``. The per-word results are concatenated
    in order.

    Args:
        text_in: The raw input string.
        bp_vocab: Forwarded unchanged to ``subword_tokenize``.
        subword2idx: Forwarded unchanged to ``subword_tokenize``.

    Returns:
        A new list holding the items produced by ``subword_tokenize`` for
        each word token, in input order (empty if there are no tokens).
    """
    normalized = text_in.lower().strip()
    return [
        piece
        for word in word_tokenizer(normalized)
        for piece in subword_tokenize(word, bp_vocab, subword2idx)
    ]
def learn_word_vocab(corpus):
    """Count the words of a corpus and key them by spaced-out characters.

    Each text in ``corpus`` is stripped, lower-cased and split with
    ``word_tokenizer``; the resulting tokens are tallied. Every distinct
    word is then wrapped in ``<`` and ``>`` and its characters are joined
    with single spaces, e.g. ``"cat"`` becomes ``"< c a t >"``.
    NOTE(review): this layout looks like the input for a byte-pair-encoding
    merge step -- confirm against the caller.

    Args:
        corpus: An iterable of strings.

    Returns:
        A dict mapping each space-separated, bracketed word to the number
        of times the word occurred in ``corpus``.
    """
    frequencies = Counter()
    for document in corpus:
        frequencies.update(word_tokenizer(document.strip().lower()))

    # Joining with " " puts exactly one space between neighbouring
    # characters and none at the ends, so no trailing strip is required.
    return {
        " ".join("<" + token + ">"): freq
        for token, freq in frequencies.items()
    }
else: tmp_class = "good_joke" tmp_jokes_tuple.append((tmp_class, tmp_joke)) del tmp_row, tmp_joke print("Total of", str(len(tmp_jokes_tuple)), "jokes loaded.") # Process the data. # tmp_jokes_filtered = [] w_counter = Counter() for tmp_class, tmp_joke in tmp_jokes_tuple: tmp_joke = \ tmp_joke.replace("\n", " \n ").replace("\'", " ") tmp_tokens = [ x for x in word_tokenizer(tmp_joke.lower()) if x != ""] if len(tmp_tokens) <= max_len: w_counter.update(tmp_tokens) tmp_jokes_filtered.append((tmp_class, tmp_joke)) del tmp_tokens print("Total of", str(len(tmp_jokes_filtered)), "jokes filtered.") del tmp_jokes_tuple word_counts = [] for word, count in w_counter.items(): tmp_word = "<" + word + ">" tmp_word = "".join([x+" " for x in tmp_word]).strip() word_counts.append((tmp_word, count)) word_counts = dict(word_counts)
for x in tmp_split.split(",") ] convs.append(tmp_ids) q_len = 10 a_len = 10 w_counter = Counter() tmp_corpus = [] tmp_data_tuple = [] for conv in convs: for i in range(len(conv) - 1): tmp_qns = id2line[conv[i]].lower().replace("\\u", " ").replace( "\\i", " ").replace("\n", " ").replace("\t", " ") #tmp_qns = re.sub(r"[^\w\s]", " ", tmp_qns) tmp_qns = [x for x in word_tokenizer(tmp_qns) if x != ""] tmp_ans = id2line[conv[i + 1]].lower().replace("\\u", " ").replace( "\\i", " ").replace("\n", " ").replace("\t", " ") #tmp_ans = re.sub(r"[^\w\s]", " ", tmp_ans) tmp_ans = [x for x in word_tokenizer(tmp_ans) if x != ""] if len(tmp_qns) == 0 or len(tmp_ans) == 0: continue elif len(tmp_qns) <= q_len and len(tmp_ans) <= a_len: w_counter.update(tmp_qns) w_counter.update(tmp_ans) tmp_data_tuple.append((" ".join(tmp_qns), " ".join(tmp_ans))) elapsed_tm = (time.time() - start_tm) / 60 print("Elapsed Time:", str(elapsed_tm), "mins.")