def _process_training_line(self, syllables, w_bi_labels, output_scheme):
    """Expand one syllable-level training line into character-level arrays.

    Each syllable is split into its characters. The first character of a
    syllable receives that syllable's label and every following character
    receives ``0``. A zero-length syllable still occupies one position,
    represented by a single empty-string character.

    Args:
        syllables: sequence of syllables (each one iterable into characters).
        w_bi_labels: one label per syllable; must be as long as ``syllables``.
        output_scheme: object providing ``encode(labels, syllable_per_char)``,
            which turns the per-character labels into the final target.

    Returns:
        ``((x, len(y)), y)`` where ``x`` is the character indices and the
        character-type indices stacked along axis 0, and ``y`` is whatever
        ``output_scheme.encode`` returned.
    """
    assert len(syllables) == len(w_bi_labels)

    characters, syl4chr, labels = [], [], []

    for syllable, label in zip(syllables, w_bi_labels):
        # An empty syllable is kept as one placeholder character.
        pieces = list(syllable) or [""]
        width = len(pieces)
        piece_labels = [label] + [0] * (width - 1)

        assert len(pieces) == len(piece_labels) == width, \
            "%d vs %d vs %d" % (len(pieces), len(piece_labels), width)

        characters += pieces
        labels += piece_labels
        # Remember, for every character, the syllable it came from.
        syl4chr += [syllable] * width

    y = np.array(labels).astype(int)

    ch_ix = np.array(
        [preprocessing.character2ix(self.dict, ch) for ch in characters]
    ).astype(int)
    ct_ix = np.array(char_type.get_char_type_ix(characters)).astype(int)

    assert len(ch_ix) == len(ct_ix) == len(y)

    x = np.stack((ch_ix, ct_ix), axis=0)
    y = output_scheme.encode(y, syl4chr)

    return (x, len(y)), y
def make_feature(self, txt):
    """Build the character / character-type / syllable feature tensor for ``txt``.

    ``txt`` is tokenized into syllables with
    ``preprocessing.syllable_tokenize``. For every character of every
    syllable three indices are collected: the character index (looked up in
    ``self.ch_dict``), the character-type index, and the index of the
    enclosing syllable (looked up in ``self.sy_dict``), repeated once per
    character.

    Args:
        txt: input text.

    Returns:
        ``(list(txt), (features, seq_lengths))`` where ``features`` is an
        int64 tensor reshaped to ``(1, 3, -1)`` and ``seq_lengths`` is a
        one-element int64 tensor holding the size of that last dimension.
    """
    syllables = preprocessing.syllable_tokenize(txt)
    sy2ix, ch2ix = self.sy_dict, self.ch_dict

    ch_ix, ch_type_ix, syllable_ix = [], [], []

    for syllable in syllables:
        syllable_id = preprocessing.syllable2ix(sy2ix, syllable)
        characters = list(syllable)
        char_ids = [preprocessing.character2ix(ch2ix, ch) for ch in characters]

        ch_ix += char_ids
        ch_type_ix += char_type.get_char_type_ix(characters)
        # Every character carries the index of the syllable it belongs to.
        syllable_ix += [syllable_id] * len(char_ids)

    stacked = np.stack((ch_ix, ch_type_ix, syllable_ix), axis=0)
    features = stacked.reshape((1, 3, -1)).astype(np.int64)
    seq_lengths = np.array([features.shape[-1]], dtype=np.int64)

    tensors = (torch.from_numpy(features), torch.from_numpy(seq_lengths))
    return list(txt), tensors
def make_feature(self, txt):
    """Build the character-index feature tensor for ``txt``.

    Each character of ``txt`` is mapped to an index through
    ``preprocessing.character2ix`` using ``self.dict``.

    Args:
        txt: input text.

    Returns:
        ``(characters, (features, seq_lengths))`` where ``characters`` is
        ``list(txt)``, ``features`` is an int64 tensor reshaped to
        ``(1, -1)`` and ``seq_lengths`` is a one-element int64 tensor holding
        the size of that last dimension.
    """
    characters = list(txt)
    ch_ix = [preprocessing.character2ix(self.dict, c) for c in characters]

    features = np.array(ch_ix, dtype=np.int64).reshape((1, -1))
    length = features.shape[-1]
    seq_lengths = np.array([length], dtype=np.int64)

    tensors = (torch.from_numpy(features), torch.from_numpy(seq_lengths))
    return characters, tensors
def make_feature(self, txt):
    """Build the character / character-type feature tensor for ``txt``.

    Each character of ``txt`` is mapped to an index through
    ``preprocessing.character2ix`` using ``self.dict``; character-type
    indices come from ``char_type.get_char_type_ix``.

    Args:
        txt: input text.

    Returns:
        ``(characters, (features, seq_lengths))`` where ``characters`` is
        ``list(txt)``, ``features`` is an int64 tensor reshaped to
        ``(1, 2, -1)`` and ``seq_lengths`` is a one-element int64 tensor
        holding the size of that last dimension.
    """
    characters = list(txt)

    ch_ix = [preprocessing.character2ix(self.dict, c) for c in characters]
    ch_type_ix = char_type.get_char_type_ix(characters)

    stacked = np.stack((ch_ix, ch_type_ix), axis=0)
    features = stacked.reshape((1, 2, -1)).astype(np.int64)
    seq_lengths = np.array([features.shape[-1]], dtype=np.int64)

    tensors = (torch.from_numpy(features), torch.from_numpy(seq_lengths))
    return characters, tensors
def prepare_syllable_charater_seq_data(files, ch2ix, sy2ix, sampling=10, output_dir=""):
    """Write per-character label / index sequences for training and validation.

    Two files, ``training.txt`` and ``val.txt``, are written into
    ``<output_dir>/best-syllable-crf-and-character-seq-feature-sampling-<n>``.
    Every non-empty input line produces one output line of the form
    ``<char labels>::<char indices>::<syllable indices>``, where the indices
    are space-separated and the labels are concatenated without separator.

    Input lines hold syllables separated by ``~`` (``~~`` is collapsed to a
    single ``~``). The matching line of the label file holds one character
    per syllable. A syllable labelled ``"1"`` marks its first character with
    ``"1"``; all other characters are marked ``"0"``. A syllable that is a
    single space does not use its own label: it is marked ``"1"`` exactly
    when the following syllable is labelled ``"1"``. Empty syllables are
    skipped.

    Args:
        files: pair ``(training, validation)``, each a sliceable sequence of
            paths. The label file for a path is found by replacing ``.txt``
            with ``.label`` in the path.
        ch2ix: mapping handed to ``preprocessing.character2ix``.
        sy2ix: mapping handed to ``preprocessing.syllable2ix``.
        sampling: when truthy, only the first ``sampling`` paths of each
            dataset are processed. The value is also embedded in the output
            directory name; any falsy value (including ``None``) is written
            as ``0``.
        output_dir: parent directory of the generated directory.

    Raises:
        SystemExit: when a line's syllable count differs from the number of
            label characters on the matching label line.
    """
    training, validation = files

    if sampling:
        training = training[:sampling]
        validation = validation[:sampling]

    # ``sampling or 0`` keeps the name formattable when sampling is None.
    output_dir = "%s/best-syllable-crf-and-character-seq-feature-sampling-%d" % (
        output_dir, sampling or 0)

    print("Saving data to %s" % output_dir)
    utils.maybe_create_dir(output_dir)

    for name, dataset in zip(("training", "val"), (training, validation)):
        print("working on : %s" % name)

        # The context manager closes the output file even if a line fails.
        with open("%s/%s.txt" % (output_dir, name), "w") as fout_txt:
            for path in dataset:
                label_path = path.replace(".txt", ".label")

                with open(path, "r") as fin, open(label_path, "r") as flab:
                    # Reported at most once per input file.
                    has_space_problem = False

                    for txt, label in zip(fin, flab):
                        txt = txt.strip().replace("~~", "~")
                        if not txt:
                            continue

                        label = label.strip()
                        syllables = txt.split("~")

                        syllable_indices = [
                            preprocessing.syllable2ix(sy2ix, sy)
                            for sy in syllables
                        ]

                        if len(syllables) != len(label):
                            print(txt, path)
                            print(len(syllables), len(label))
                            print(syllables)
                            print(label)
                            raise SystemExit("xx")

                        label = list(label)

                        chars_idx = []
                        char_labels = []
                        syllable_idx = []

                        for ii, (syllable, six, syl_label) in enumerate(
                                zip(syllables, syllable_indices, label)):
                            if not syllable:
                                continue

                            if syllable == " ":
                                # next syllable is B, then we should also
                                # split this space
                                if label[ii + 1] == "1":
                                    syl_label = "1"
                                else:
                                    syl_label = "0"

                            chs = [
                                preprocessing.character2ix(ch2ix, c)
                                for c in syllable
                            ]
                            total_chs = len(chs)

                            syllable_idx.extend([six] * total_chs)
                            chars_idx.extend(chs)

                            if syl_label == "1":
                                char_labels.extend(
                                    ["1"] + ["0"] * (total_chs - 1))
                            else:
                                char_labels.extend(["0"] * total_chs)

                        assert len(char_labels) == len(chars_idx)

                        # check space problem
                        # NOTE(review): index 3 is presumably the character
                        # index of a space in ch2ix - confirm.
                        if not has_space_problem:
                            for cix, clb in zip(chars_idx, char_labels):
                                if cix == 3 and clb == "0":
                                    has_space_problem = True
                                    print(txt)
                                    break

                        fout_txt.write("%s::%s::%s\n" % (
                            "".join(char_labels),
                            " ".join(np.array(chars_idx).astype(str)),
                            " ".join(np.array(syllable_idx).astype(str)),
                        ))

                    if has_space_problem:
                        print("problem with space in %s" % path)