def load_data():
    """Read ``metadata.csv`` and encode every transcript as vocabulary indices.

    The file is looked up under ``Hyper.data_dir`` and decoded as UTF-8.
    Each stripped line is split on ``'|'`` into exactly two fields, a name
    and a text. The text is passed through ``text_normalize``, terminated
    with the end-of-string mark ``'E'``, and mapped character by character
    through the ``char2idx`` table returned by ``load_vocab``.

    Returns:
        A tuple ``(names, lengths, texts)`` of three parallel lists: the
        name field of each line, the length of its encoded text, and the
        encoded text itself (a list of ``char2idx`` values).

    Raises:
        ValueError: If a normalized text still contains an ASCII digit, or
            if a line does not split into exactly two ``'|'`` fields (the
            tuple unpacking fails).
        Exception: If an encoded text is longer than
            ``Hyper.data_max_text_length``.
    """
    char2idx, _ = load_vocab()
    metadata_path = os.path.join(Hyper.data_dir, "metadata.csv")

    names = []
    lengths = []
    texts = []
    with codecs.open(metadata_path, 'r', "utf-8") as f:
        for raw_line in f.readlines():
            fname, transcript = raw_line.strip().split('|')
            # Append the end of string mark.
            normalized = text_normalize(transcript) + 'E'

            # Normalization is expected to have removed all digits; a
            # leftover ASCII digit is treated as an error.
            if any('0' <= ch <= '9' for ch in normalized):
                raise ValueError(
                    "[data]: after text normalize, there should be no digits."
                )

            encoded = [char2idx[ch] for ch in normalized]
            if len(encoded) > Hyper.data_max_text_length:
                raise Exception("[load data]: length of text is out of range")

            names.append(fname)
            lengths.append(len(encoded))
            texts.append(encoded)

    return names, lengths, texts
def process_text(text, padding=False):
    """Normalize ``text`` and encode it as a sequence of vocabulary indices.

    The text is passed through ``text_normalize``, terminated with the
    end-of-string mark ``'E'``, and mapped character by character through
    the ``char2idx`` table returned by ``load_vocab``.

    Args:
        text: Raw input text handed to ``text_normalize``.
        padding: When true, right-pad the encoded sequence with zeros up to
            ``Hyper.data_max_text_length``.

    Returns:
        A list of ``char2idx`` values when ``padding`` is false. Otherwise a
        1-D NumPy array of length ``Hyper.data_max_text_length``; because
        the padding comes from ``np.zeros`` (float64 by default), the array
        takes NumPy's promoted dtype rather than an integer dtype.

    Raises:
        ValueError: If the normalized text still contains an ASCII digit, or
            if ``padding`` is requested and the encoded text is longer than
            ``Hyper.data_max_text_length``.
    """
    char2idx, _ = load_vocab()
    normalized = text_normalize(text) + 'E'  # append the end of string mark

    # Normalization is expected to have removed all digits; a leftover
    # ASCII digit is treated as an error.
    if any('0' <= char <= '9' for char in normalized):
        raise ValueError("[data]: after text normalize, there should be no digits.")

    encoded = [char2idx[char] for char in normalized]
    if not padding:
        return encoded

    max_length = Hyper.data_max_text_length
    if len(encoded) > max_length:
        # Without this guard np.zeros() would receive a negative size and
        # fail with an opaque "negative dimensions are not allowed" error.
        raise ValueError(
            "[process text]: length of text is out of range (%d > %d)"
            % (len(encoded), max_length)
        )

    # Padding dtype is deliberately left as np.zeros' default so the
    # returned array is the same as before for existing callers.
    return np.concatenate((encoded, np.zeros(max_length - len(encoded))))