def main():
    """Train one scoring model per phone from feature vectors and pickle them.

    Reads per-phone feature vectors from ``args.feature_scp``; the first
    element of every vector is taken as the integer phone id and the
    remaining elements as the model input.  Samples without a human score,
    or whose phone symbol disagrees with the human annotation, are skipped
    with a printed message.  The resulting ``{phone id: model}`` dict is
    written to ``args.model``.
    """
    args = get_args()

    # Only the int -> symbol half of the phone table is used here.
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores and the annotated phone for every phone key.
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Group (score, features) pairs by phone id.
    samples_by_phone = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue

        ph = int(feat[0])
        if phone_int2sym is not None and phone_int2sym[ph] != phone_of[ph_key]:
            print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
            continue

        if ph not in samples_by_phone:
            samples_by_phone[ph] = []
        samples_by_phone[ph].append((score_of[ph_key], feat[1:]))

    # Make the dataset more balanced.
    samples_by_phone = add_more_negative_data(samples_by_phone)

    # Train every phone's model in a worker process.
    model_of = {}
    with ProcessPoolExecutor(args.nj) as pool:
        pending = []
        for ph, pairs in samples_by_phone.items():
            pending.append((ph, pool.submit(train_model_for_phone, pairs)))
        for ph, future in pending:
            model_of[ph] = future.result()

    # Persist the trained models.
    with open(args.model, 'wb') as fout:
        pickle.dump(model_of, fout)
def main():
    """Compare predicted phone scores with human scores and print metrics.

    Each line of ``args.predicted`` is expected to hold three tab-separated
    fields: key, predicted score and integer phone id.  Lines without a
    human score, or whose phone symbol disagrees with the human annotation,
    are skipped with a printed message.  Every accepted line is echoed to
    ``args.write`` as ``key, phone, human score, predicted score``.  MSE,
    correlation and a classification report are printed at the end.
    """
    args = get_args()

    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    y_true = []
    y_pred = []
    with open(args.predicted, 'rt') as fin, open(args.write, 'wt') as fout:
        for line in fin:
            fields = line.strip('\n').split('\t')
            key, score, ph = fields
            score = float(score)
            ph = int(ph)

            if key not in score_of:
                print(f'Warning: no human score for {key}')
                continue

            if phone_int2sym is not None:
                if phone_int2sym[ph] != phone_of[key]:
                    print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[key]} ')
                    continue

            human_score = score_of[key]
            y_true.append(human_score)
            y_pred.append(score)
            fout.write(f'{key}\t{ph}\t{human_score:.1f}\t{score:.1f}\n')

    mse = metrics.mean_squared_error(y_true, y_pred)
    corr = np.corrcoef(y_true, y_pred)[0][1]
    print(f'MSE: {mse:.2f}')
    print(f'Corr: {corr:.2f}')
    # NOTE(review): the float-parsed predictions are passed as-is to the
    # classification report -- confirm they are discrete labels.
    print(metrics.classification_report(y_true, y_pred))
def main():
    """Train one model per phone from GOP values and pickle the result.

    Iterates over the entries of ``args.gop_scp``; the i-th item of an
    utterance is keyed as ``'<utterance key>.<i>'`` and must unpack to a
    single ``(phone id, gop)`` pair.  Items without a human score, or whose
    phone symbol disagrees with the human annotation, are skipped with a
    printed message.  The ``{phone id: model}`` dict is written to
    ``args.model``.
    """
    args = get_args()

    # Only the int -> symbol half of the phone table is used here.
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores and the annotated phone for every phone key.
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Group (score, gop) pairs by phone id.
    samples_by_phone = {}
    for key, gops in kaldi_io.read_post_scp(args.gop_scp):
        for i, entry in enumerate(gops):
            # Each entry must contain exactly one (phone, gop) pair.
            (ph, gop), = entry
            ph_key = f'{key}.{i}'

            if ph_key not in score_of:
                print(f'Warning: no human score for {ph_key}')
                continue

            if phone_int2sym is not None:
                if phone_int2sym[ph] != phone_of[ph_key]:
                    print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
                    continue

            if ph not in samples_by_phone:
                samples_by_phone[ph] = []
            samples_by_phone[ph].append((score_of[ph_key], gop))

    # Train every phone's model in a worker process.
    model_of = {}
    with ProcessPoolExecutor(args.nj) as pool:
        pending = []
        for ph, pairs in samples_by_phone.items():
            pending.append((ph, pool.submit(train_model_for_phone, pairs)))
        for ph, future in pending:
            model_of[ph] = future.result()

    # Persist the trained models.
    with open(args.model, 'wb') as fout:
        pickle.dump(model_of, fout)
def main():
    """Fit a degree-2 polynomial regression per phone on GOP values.

    Iterates over the entries of ``args.gop_scp``; the i-th item of an
    utterance is keyed as ``'<utterance key>.<i>'`` and must unpack to a
    single ``(phone id, gop)`` pair.  Items without a human score, or whose
    phone symbol disagrees with the human annotation, are skipped with a
    printed message.  For every phone the fitted ``(coef_, intercept_)``
    tuple is stored, and the ``{phone id: tuple}`` dict is pickled to
    ``args.model``.
    """
    args = get_args()

    # Only the int -> symbol half of the phone table is used here.
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores and the annotated phone for every phone key.
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Group (score, gop) pairs by phone id.
    samples_by_phone = {}
    for key, utt_gops in kaldi_io.read_post_scp(args.gop_scp):
        for i, entry in enumerate(utt_gops):
            # Each entry must contain exactly one (phone, gop) pair.
            (ph, gop), = entry
            ph_key = f'{key}.{i}'

            if ph_key not in score_of:
                print(f'Warning: no human score for {ph_key}')
                continue

            if phone_int2sym is not None:
                if phone_int2sym[ph] != phone_of[ph_key]:
                    print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
                    continue

            samples_by_phone.setdefault(ph, []).append((score_of[ph_key], gop))

    # Train polynomial regression, one model per phone.
    poly = PolynomialFeatures(2)
    model_of = {}
    for ph, pairs in samples_by_phone.items():
        model = LinearRegression()
        targets = np.array([label for label, _ in pairs]).reshape(-1, 1)
        inputs = np.array([gop for _, gop in pairs]).reshape(-1, 1)
        inputs = poly.fit_transform(inputs)
        inputs, targets = balanced_sampling(inputs, targets)
        model.fit(inputs, targets)
        model_of[ph] = (model.coef_, model.intercept_)

    # Persist the fitted parameters.
    with open(args.model, 'wb') as fout:
        pickle.dump(model_of, fout)
def main():
    """Train one random-forest regressor per phone and pickle the models.

    Reads per-phone feature vectors from ``args.feature_scp``; the first
    element of every vector is taken as the integer phone id and the
    remaining elements as the model input.  Samples without a human score,
    or whose phone symbol disagrees with the human annotation, are skipped
    with a printed message.  The ``{phone id: model}`` dict is written to
    ``args.model``.
    """
    args = get_args()

    # Only the int -> symbol half of the phone table is used here.
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores and the annotated phone for every phone key.
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Prepare training data: group (score, features) pairs by phone id.
    train_data_of = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue

        # The phone id must be read unconditionally: it is the grouping key
        # below even when no symbol table is available for the sanity check.
        # (Previously it was only assigned inside the symbol-table branch,
        # leaving ``ph`` unbound or stale when the table was None.)
        ph = int(feat[0])
        if phone_int2sym is not None and phone_int2sym[ph] != phone_of[ph_key]:
            print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
            continue

        # Drop the leading phone id; only the rest is model input.
        train_data_of.setdefault(ph, []).append((score_of[ph_key], feat[1:]))

    # Train models, one per phone.
    model_of = {}
    for ph, pairs in train_data_of.items():
        model = RandomForestRegressor()
        labels = np.array([label for label, _ in pairs]).reshape(-1, 1)
        feat_rows = [feat for _, feat in pairs]
        feats = np.array(feat_rows).reshape(-1, len(feat_rows[0]))
        feats, labels = balanced_sampling(feats, labels)
        # Flatten the label column back to 1-D before fitting.
        labels = labels.ravel()
        model.fit(feats, labels)
        model_of[ph] = model
        print(f'Model of phone {ph} trained.')

    # Write to file.
    with open(args.model, 'wb') as f:
        pickle.dump(model_of, f)
def main():
    """Draw a 2-D t-SNE scatter plot of phone features, coloured by label.

    Reads feature vectors from ``args.feature_scp``; the first element of
    every vector is taken as the integer phone id.  Only phones whose id
    lies in ``[args.min_phone_idx, args.max_phone_idx]`` are kept.  Each
    kept sample is labelled ``'<phone>-<human score>'``, using the phone
    symbol when the symbol table provides one.  At most ``args.samples``
    randomly chosen samples are embedded and plotted, and the figure is
    saved to ``args.output``.
    """
    args = get_args()

    # Only the int -> symbol half of the phone table is used here.
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores (the annotated phones are not needed here).
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Gather the features and their labels.
    wanted_phones = range(args.min_phone_idx, args.max_phone_idx + 1)
    labels = []
    features = []
    for key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if key not in score_of:
            print(f'Warning: no human score for {key}')
            continue

        ph = int(feat[0])
        if ph not in wanted_phones:
            continue

        # Prefer the readable phone symbol when the table has one.
        if phone_int2sym is not None and ph in phone_int2sym:
            ph = phone_int2sym[ph]
        labels.append(f'{ph}-{score_of[key]}')
        features.append(feat[1:])

    # Random sub-sampling to keep the plot manageable.
    candidates = list(zip(features, labels))
    sample_size = min(args.samples, len(labels))
    sampled_pairs = random.sample(candidates, sample_size)
    features, labels = list(zip(*sampled_pairs))

    # Draw scatters: one colour per distinct label.
    label_counter = Counter(labels)
    colors = sns.color_palette("colorblind", len(label_counter))
    embedded = TSNE(n_components=2).fit_transform(features)
    sns_plot = sns.scatterplot(x=embedded[:, 0],
                               y=embedded[:, 1],
                               hue=labels,
                               legend='full',
                               palette=colors)
    figure = sns_plot.get_figure()
    figure.savefig(args.output)