def link_cluster_caller(name, base_path, db_file, name_out_path):
    global log
    x = Utility.load_obj('{}/x.pkl'.format(base_path))
    inverselengthscale = Utility.load_obj('{}/input_sensitivity.pkl'.format(base_path))

    # Grid search over the number of clusters and the neighborhood size
    # (n_neighbors is a fraction of the data set size)
    for n_clusters in xrange(2, 6):
        for mul in [0.025, 0.05, 0.075, 0.1]:
            n_neighbors = int(len(x) * mul)

            title = 'param_n_cluster_{}_n_neighbors_{}x'.format(n_clusters, mul)
            name_out_file = '{}/{}.eps'.format(name_out_path, title)

            log.append(title)
            log.append('n_cluster : {}'.format(n_clusters))
            log.append('n_neighbors for kernel : {}'.format(n_neighbors))

            labels = link_clustering(x, inverselengthscale, n_clusters, n_neighbors)
            plot(x, inverselengthscale, labels, name_out_file, title)
            Utility.save_obj(labels, '{}/{}.pkl'.format(name_out_path, title))

    Utility.write_to_file_line_by_line('{}/{}_log.txt'.format(name_out_path, name), log)

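# Usage sketch for the sweep above (hypothetical paths; assumes base_path already
# contains the pickled latent coordinates 'x.pkl' and the ARD input sensitivities
# 'input_sensitivity.pkl' produced by an earlier GP-LVM training step, and that
# the module-level log list exists):
#
#   log = []
#   link_cluster_caller('tone_0', '/path/to/latent', '/path/to/db.pkl', '/path/to/out')
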
def find_data_point_from_coordinate(filepath, input_sen_path, labels, syllable_data_tag, area):
    # Read the data file
    data_point = Utility.load_obj(filepath)

    # Project onto the two most dominant latent dimensions from the input sensitivity
    input_sen_obj = Utility.load_obj(input_sen_path)
    input_sensitivity = Utility.get_input_sensitivity(input_sen_obj, 3)

    x_coordinate = data_point[:, [input_sensitivity[0], input_sensitivity[1]]]
    x_cor = np.array(x_coordinate)

    # Indices of the data points that fall inside the queried area
    index = DataReader.filter_data(x_cor, area)
    print index

    lab = Utility.load_obj(labels)
    print len(lab)
    print lab[index]

    syllable_tag = Utility.load_obj(syllable_data_tag)
    print len(syllable_tag)
    print syllable_tag

def run_for_voice_data():
    dropbox_path = '/home/h1/decha/Dropbox/'
    output_name, delta_bool, delta2_bool = '02_delta_delta-delta', True, True
    # output_name, delta_bool, delta2_bool = '03_delta', True, False
    # output_name, delta_bool, delta2_bool = '04_no_delta', False, False
    input_dims = 3

    for tone in ['0', '1', '2', '3', '4', '01234']:
        print 'Running Tone : {}'.format(tone)

        if tone == '01234':  # '==', not 'is': identity comparison on strings is unreliable
            data_object_path = '{}/Inter_speech_2016/Syllable_object/01_manual_labeling_object/syllable_all.pickle'.format(dropbox_path)
        else:
            data_object_path = '{}/Inter_speech_2016/Syllable_object/01_manual_labeling_object/syllable_{}.pickle'.format(dropbox_path, tone)
        syllable_management = Utility.load_obj(data_object_path)

        print 'Delta : {}, Delta-Delta : {}'.format(delta_bool, delta2_bool)
        output_path = '{}/Inter_speech_2016/Syllable_object/{}/BGP_LVM/{}_dimentionality/Tone_{}/'.format(dropbox_path, output_name, input_dims, tone)
        print output_path

        Latent_variable_model_Training.execute_Bayesian_GPLVM_training(
            syllable_management,
            Syllable.TRAINING_FEATURE_POLYNOMIAL_2_DEGREE_VOICE,
            input_dims, output_path,
            delta_bool=delta_bool, delta2_bool=delta2_bool)

def analysis(main_path):
    # e.g. main_path = '/work/w13/decha/Inter_speech_2016_workplace/Data/07c-5dims_missing_data_delta_deltadelta/BayesianGPLVMMiniBatch_Missing/Tone_4/'
    gpmodel = Utility.load_obj('{}/GP2dRegression.npy'.format(main_path))
    model = Utility.load_obj('{}/GP_model.npy'.format(main_path))

    # Select the two most sensitive latent dimensions of the GP-LVM
    data = model.X.mean
    input_sensitivity = model.input_sensitivity()
    print input_sensitivity
    index = Utility.get_input_sensitivity(input_sensitivity, 2)
    print index

    x = []
    for i in range(len(data)):
        x.append([data[i, index[0]], data[i, index[1]]])
    x = np.array(x)

    # Predict with the 2D GP regression model and plot the predictive mean
    y = np.array(gpmodel.predict(x)[0])
    print y.shape

    plt.clf()
    plt.scatter(x[:, 0], x[:, 1], c=y, cmap='gray')
    plt.savefig('{}/gpregression.pdf'.format(main_path))

def run_plot_and_latex():
    system_names = [
        '02_delta_delta-delta', '03_delta', '04_no_delta',
        '05_missing_data_no_delta', '06_02_with_3-dimentionality'
    ]
    for output_name in system_names:
        base_path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/{}/BGP_LVM/'.format(output_name)
        object_path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/01_manual_labeling_object/'

        for tone in ['0', '1', '2', '3', '4', '01234']:
            model_path = '{}/Tone_{}/GP_model.npy'.format(base_path, tone)
            data_object = '{}/syllable_{}.pickle'.format(object_path, tone)
            if tone == '01234':
                data_object = '{}/syllable_all.pickle'.format(object_path)

            outpath = '{}/Tone_{}/stress_unstress_plot.eps'.format(base_path, tone)
            GP_LVM_Scatter.plot_scatter(
                Utility.load_obj(model_path),
                Utility.load_obj(data_object),
                outpath)

def add_data_object():
    obj = Utility.load_obj(
        '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/mix_object/current_version/all_vowel_type/syllable_object_01234.pickle')
    name_index = np.array(Utility.load_obj(
        '/work/w13/decha/Inter_speech_2016_workplace/Tonal_projection/11_missing_data/all_vowel_type/input_dims_10/delta-True_delta-delta-True/BayesianGPLVMMiniBatch_Missing_Tone_01234/name_index.npy'))
    model = Utility.load_obj(
        '/work/w13/decha/Inter_speech_2016_workplace/Tonal_projection/11_missing_data/all_vowel_type/input_dims_10/delta-True_delta-delta-True/BayesianGPLVMMiniBatch_Missing_Tone_01234/GP_model.npy')
    data = np.array(model.X.mean)
    print data.shape

    # Attach the latent coordinates to every GPR-labelled syllable, matched by name
    for syl in obj.syllables_list:
        name = syl.name_index
        if 'gpr' not in name:
            continue
        name_position = np.where(name_index == name)
        latent_data = data[name_position][0]
        syl.set_latent_for_single_space(latent_data)

    Utility.save_obj(
        obj,
        '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/mix_object/current_version/all_vowel_type/syllable_object_01234.pickle')

def find_group(tone, v, name):
    # Earlier vowel-group mapping, kept for reference:
    # if 'n' in v:
    #     v = 'vvvn'
    # elif 'sg' in v:
    #     v = 'vvvsg'
    # else:
    #     v = 'vvv'
    group_path = '/work/w13/decha/Inter_speech_2016_workplace/Tonal_projection/06_Tonal_part_projection_noise_reduction-250-iters-opt/{}/input_dims_10/delta-True_delta-delta-True/BGP_LVM_Tone_{}/'.format(v, tone)
    name_index = np.array(Utility.load_obj('{}/name_index.npy'.format(group_path)))
    label = np.array(Utility.load_obj('{}/clustered_label.npy'.format(group_path)))

    if '.' in name:
        print name

    # Fall back to group 3 when the name is not in the clustered index
    if len(label[name_index == name]) == 0:
        print name
        return 3

    return label[name_index == name][0]

def get_latent_data(base_path, names, label_feature, use_input_sensitivity=False, normalize=False):
    model = Utility.load_obj('{}/GP_model.npy'.format(base_path))
    input_sensitivity = model.input_sensitivity()
    latent_data = np.array(model.X.mean)
    name_index = np.array(Utility.load_obj('{}/name_index.npy'.format(base_path)))

    # Look up the latent vector of every requested name, preserving order
    latent_Y = []
    for n in names:
        ind = np.where(name_index == n)
        latent_Y.append(latent_data[ind][0])

    if len(latent_Y) != len(names):
        print 'Unequal data : {}'.format(base_path)
        sys.exit()

    latent_Y = np.array(latent_Y)
    print 'Get input sensitivity {}'.format(input_sensitivity)

    if not use_input_sensitivity:
        input_sensitivity = None

    data = ANN_Executioner_Helper.get_ClassificationDataSet(
        latent_Y, label_feature,
        normalize=normalize, input_sensitivity=input_sensitivity)
    return data

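# Usage sketch (hypothetical path and made-up syllable ids; label_feature and the
# ANN_Executioner_Helper come from elsewhere in this project):
#
#   names = ['tscsdgpr_01_1', 'tscsdgpr_01_2']    # made-up ids for illustration
#   data = get_latent_data('/path/to/Tone_0/', names, label_feature,
#                          use_input_sensitivity=True, normalize=True)
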
def plot_type(plot_type, out_file_path, base_path_list, data_object_path):
    model_path = '{}/GP_model.npy'.format(base_path_list)
    name_index_list = '{}/name_index.npy'.format(base_path_list)

    import os.path
    if not os.path.isfile(model_path):
        return

    GP_LVM_Scatter.plot_scatter(
        Utility.load_obj(model_path),
        Utility.load_obj(data_object_path),
        out_file_path,
        name_index_list=Utility.load_obj(name_index_list),
        label_type=plot_type,
        no_short_duration=True,
        perform_unsupervised=False,
        non_unlabelled_stress=False,
        get_only_gpr_data=False,
        get_only_manual_data=True,
        return_after_dbscan=False)

def normalize_data(db_file, name_out_path, target_type, missing_db_file, missing_type):
    db = Utility.load_obj(db_file)
    missing_db = Utility.load_obj(missing_db_file)

    new_data = []
    for syl in db:
        d = syl['TF'][target_type]['data']

        # Ratio of the consonant part to the whole syllable duration
        dur = 0
        for du in syl['dur']:
            dur = dur + du
        consonant_ratio = float(syl['dur'][0]) / dur  # float() guards against Python 2 integer division

        # Ratio of unvoiced (NaN) frames in the matching entry of the missing-data DB
        missing = None
        for m in missing_db:
            if syl['id'] == m['id']:
                missing = m
                break
        unvoice_frames = np.argwhere(np.isnan(missing['TF'][missing_type]['data']))
        unvoice_frames_ratio = float(len(unvoice_frames)) / float(len(d) - 1)

        d = np.append(d, consonant_ratio)
        d = np.append(d, unvoice_frames_ratio)
        new_data.append(d)

    new_data = np.array(new_data)
    print new_data.shape

    new_db = []
    for idx, syl in enumerate(db):
        syl['TF']['intepolate151_with_consonant_unvoice_ratio'] = dict()
        syl['TF']['intepolate151_with_consonant_unvoice_ratio']['data'] = new_data[idx]
        syl['TF']['intepolate151_with_consonant_unvoice_ratio']['description'] = (
            'intepolate151 plus the ratios of the consonant part and of unvoiced frames in the syllable')
        new_db.append(syl)

    Utility.save_obj(new_db, name_out_path)

def run_training(base_path, db_file, name_out_path):
    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    names = Utility.load_obj(names_file)
    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append(d['id'])

    # Label every name with its stress tag; names in potential_list get class '3'
    label = []
    for nn in names:
        idx = name_list.index(nn)
        if nn in potential_list:
            label.append('3')
        else:
            label.append(db[idx]['stress'])

    out = Utility.load_obj(out_data)
    input_sent = Utility.load_obj(input_sensitivity)
    print 'Input sensitivity', input_sent
    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    label = np.array(map(int, label))

    # Train on the union of classes 2 and 3, projected onto the two most dominant dimensions
    train = np.append(out[label == 2], out[label == 3], axis=0)
    train = np.c_[train[:, most_dominants[0]], train[:, most_dominants[1]]]
    print train.shape

    global kern
    lengthscale = 1 / np.array(input_sent, dtype=float)
    kern = GPy.kern.RBF(
        len(train[0]), ARD=True,
        lengthscale=[lengthscale[most_dominants[0]], lengthscale[most_dominants[1]]])
    print most_dominants

    xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
    plane = np.c_[xx.ravel(), yy.ravel()]
    svm_classifier(train, '', '', '', '', plane, xx, yy)

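# A minimal sketch of the ARD kernel construction above (illustration only; the
# sensitivity values are made up). With lengthscale = 1/sensitivity, a highly
# sensitive latent dimension gets a short lengthscale, i.e. the kernel varies
# quickly along it:
#
#   import GPy
#   import numpy as np
#   sens = np.array([4.0, 0.5])                  # hypothetical input sensitivities
#   k = GPy.kern.RBF(2, ARD=True, lengthscale=1.0 / sens)
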
def call_accuracy(db_file, x_base_file, setting, name):
    db = Utility.load_obj(db_file)
    real = load_real_label(db)

    n_cluster, n_neighbor = find_config(name)
    unstress_list = setting[0]
    stress_list = setting[1]

    x_file = '{}/param_n_cluster_{}_n_neighbors_{}x.pkl'.format(x_base_file, n_cluster, n_neighbor)
    pred = Utility.load_obj(x_file)
    print pred.shape
    print set(pred), setting

    # Remap raw cluster ids to binary stress labels via sentinel values,
    # so overlapping ids cannot be overwritten mid-remap
    for un in unstress_list:
        pred[pred == un] = 555  # unstress
    for st in stress_list:
        pred[pred == st] = 999  # stress
    pred[pred == 999] = 1
    pred[pred == 555] = 0

    if name == '1_non-nasal':
        print set(pred)

    acc = accuracy_score(real, pred)
    f1 = f1_score(real, pred, average=None)
    print 'acc : ', acc
    print 'f1 : ', f1

    global acc_scores
    global f1_scores
    acc_scores[name] = acc
    f1_scores[name] = f1

    result_file = dict()
    result_file['pred'] = pred
    result_file['real'] = real
    result_file['acc'] = acc
    result_file['f1'] = f1
    result_file['name'] = name
    result_file['n_cluster'] = n_cluster
    result_file['n_neighbors'] = n_neighbor
    Utility.save_obj(result_file, '{}/result_file.pkl'.format(x_base_file))

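# A minimal example of the sentinel remapping above, with made-up cluster ids.
# Writing 555/999 first avoids clobbering: a direct 'pred[pred == 2] = 1' pass
# could merge an id with one that is later remapped differently:
#
#   >>> import numpy as np
#   >>> pred = np.array([0, 1, 2, 2, 0])
#   >>> for un in [0]:    pred[pred == un] = 555    # unstress clusters
#   >>> for st in [1, 2]: pred[pred == st] = 999    # stress clusters
#   >>> pred[pred == 999] = 1
#   >>> pred[pred == 555] = 0
#   >>> pred
#   array([0, 1, 1, 1, 0])
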
def load_name_and_label(name_file, label_file):
    global d
    names = Utility.load_obj(name_file)
    labels = Utility.load_obj(label_file)
    for n, lab in zip(names, labels):
        print n, lab
        d[n] = {'stress': lab}

def call_run_dbscan(data_path, inverselengthscale_path, out_base_path):
    x = Utility.load_obj(data_path)
    print x.shape
    inverselengthscale = Utility.load_obj(inverselengthscale_path)

    # Estimate DBSCAN's eps and min_samples from the distance statistics, then cluster
    eps, m = find_distance_stat(x, inverselengthscale, out_base_path)
    run_dbscan(x, inverselengthscale, eps, m, out_base_path)

def perform_unsupervised(out_file_path, base_path_list, data_object_path):
    model_path = '{}/GP_model.npy'.format(base_path_list)

    import os.path
    if not os.path.isfile(model_path):
        return

    plot_result(
        Utility.load_obj(model_path),
        Utility.load_obj(data_object_path),
        out_file_path)

def get_train_and_test_fold(fold_object_path, number_of_fold, tst_fold):
    syls_trn, syls_tst = [], []

    # The held-out fold becomes the test set...
    test_fold_path = '{}{}.pickle'.format(fold_object_path, tst_fold)
    syls_tst = Utility.load_obj(test_fold_path).syllables_list

    # ...and the remaining folds are concatenated into the training set
    for j in range(number_of_fold):
        if j == tst_fold:
            continue
        fold_path = '{}{}.pickle'.format(fold_object_path, j)
        syls_trn += Utility.load_obj(fold_path).syllables_list

    return (SyllableDatabaseManagement(syllable_list=syls_trn),
            SyllableDatabaseManagement(syllable_list=syls_tst))

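# Usage sketch (hypothetical fold directory; assumes pickled fold objects named
# 0.pickle .. 4.pickle, each exposing a syllables_list):
#
#   trn_db, tst_db = get_train_and_test_fold('/path/to/folds/', 5, tst_fold=0)
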
def run_command(feature_type, missing_data, data_object_base_path_name, base_out_path,
                input_dims, tone_list, dur_position, num_sampling, d1, d2):
    deltas = [[d1, d2]]

    output_name_paths = []
    for i, d in enumerate(deltas):
        outp = '{}/input_dims_{}/delta-{}_delta-delta-{}/'.format(base_out_path, input_dims, d[0], d[1])
        output_name_paths.append(outp)

    print 'Missing Data : {}'.format(missing_data)
    print 'Inducing points : 10 percent'

    for idx, output_name in enumerate(output_name_paths):
        delta_bool = deltas[idx][0]
        delta2_bool = deltas[idx][1]

        if missing_data:
            method_name = 'BayesianGPLVMMiniBatch_Missing'
        else:
            method_name = 'BGP_LVM'

        for tone in tone_list:
            print 'Delta : {}, Delta-Delta : {}'.format(delta_bool, delta2_bool)
            data_object_path = '{}{}.pickle'.format(data_object_base_path_name, tone)
            print 'data path ', data_object_path
            syllable_management = Utility.load_obj(data_object_path)

            if len(syllable_management.syllables_list) == 0:
                print 'No syllable in this object database : {}'.format(tone)
                print '-----------------------------------------------------------------'
                continue

            output_path = '{}/{}_Tone_{}/'.format(output_name, method_name, tone)
            Utility.make_directory(output_path)
            print output_path

            # 10% of the data set as inducing points, capped at 500 optimizer iterations
            Latent_variable_model_Training.execute_Bayesian_GPLVM_training(
                syllable_management, feature_type, input_dims, output_path,
                num_sampling=num_sampling, dur_position=dur_position,
                delta_bool=delta_bool, delta2_bool=delta2_bool,
                missing_data=missing_data,
                num_inducing=int(len(syllable_management.syllables_list) * 0.1),
                max_iters=500)

def __init__(self, load_data_object=None, syllable_list=None):
    '''
    Constructor: wrap an existing syllable list, or load one from a pickled data object.
    '''
    self.syllables_list = syllable_list
    if load_data_object is not None:
        self.syllables_list = Utility.load_obj(load_data_object)

def get_data_with_missing_values(self, num_sampling, subtract_typical_contour,
                                 feature_name=None, delta=False, deltadelta=False):
    # Resample the raw contour to a fixed number of points
    x = np.linspace(0, len(self.raw_data), num=num_sampling)
    Y = np.interp(x, np.arange(len(self.raw_data)), self.raw_data)
    data = Y

    if feature_name is not None:
        training_data = np.interp(
            x, np.arange(len(self.training_feature[feature_name])),
            self.training_feature[feature_name])
        data = training_data

    # Unvoiced frames (negative lf0) become missing values
    data[Y < 0] = np.nan

    if subtract_typical_contour:
        typical_tone_path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/Typical_contour/50dims/tone_{}.pickle'.format(self.tone)
        typical_tone_obj = Utility.load_obj(typical_tone_path)
        data = data - typical_tone_obj

    # Optionally append delta and delta-delta tracks computed with np.gradient
    if delta:
        y_delta = np.gradient(data)
        if deltadelta:
            y_delta_delta = np.gradient(y_delta)
            y_delta = np.append(y_delta, y_delta_delta)
        data = np.append(data, y_delta)

    return np.array(data)

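# A minimal example of the np.gradient-based delta tracks used above (values made
# up). np.gradient uses central differences in the interior and one-sided
# differences at the edges:
#
#   >>> import numpy as np
#   >>> y = np.array([0.0, 1.0, 4.0, 9.0])
#   >>> np.gradient(y)
#   array([ 1.,  2.,  4.,  5.])
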
def gen_dct_data(syllable_management_path):
    syl_object = Utility.load_obj(syllable_management_path)
    for syl in syl_object.syllables_list:
        data = syl.get_Y_features(
            Syllable.Training_feature_tonal_part_raw_remove_head_tail_interpolated,
            50, False, False, exp=True, subtract_means=False,
            output=None, missing_data=False)

        # DCT-II gives the coefficients; DCT-III is its inverse (idct is a round-trip check)
        data_dct = dct(data, 2, norm='ortho')
        idct = dct(data_dct, 3, norm='ortho')
        print syl.name_index

        syl.training_feature[Syllable.Training_feature_tonal_part_dct_coeff] = data_dct

    Utility.save_obj(syl_object, syllable_management_path)

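# A minimal round-trip check of the DCT pair used above: with norm='ortho',
# DCT-III is the exact inverse of DCT-II (values made up):
#
#   >>> from scipy.fftpack import dct
#   >>> import numpy as np
#   >>> x = np.array([1.0, 2.0, 3.0, 4.0])
#   >>> c = dct(x, 2, norm='ortho')
#   >>> np.allclose(dct(c, 3, norm='ortho'), x)
#   True
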
def run_training(db_file, name_out_path, n_components, data_type):
    db = Utility.load_obj(db_file)

    Y = []
    names = []
    for syl in db:
        feat = syl['TF'][data_type]['data']
        Y.append(feat)
        names.append(syl['id'])

    Y = np.array(Y)
    print Y.shape

    config = {'n_components': n_components, 'data': Y}
    print config
    m, Y_r = GPy_Interface.pca(config)

    Utility.save_obj(m, '{}/model.pkl'.format(name_out_path))
    Utility.save_obj(Y_r, '{}/pca_reduction_output.pkl'.format(name_out_path))
    Utility.save_obj(names, '{}/names.pkl'.format(name_out_path))
    Utility.save_obj(Y, '{}/training_data.pkl'.format(name_out_path))

def fix_database(db_file, change_list_file, out_file):
    global db
    db = Utility.load_obj(db_file)

    # Parse the change-list file: syllable names to flip, plus a '< threshold' line
    change_list = []
    less_than = None
    for line in Utility.read_file_line_by_line(change_list_file):
        if 'tsc' in line:
            n = Utility.trim(line).replace(' ', '_')
            change_list.append(n)
        elif '<' in line:
            less_than = line.split(' ')[1]

    if (len(change_list) == 0) or (less_than is None):
        raise Exception('Invalid change list file')  # raising a bare string is not allowed

    new_list = change_stress(change_list, less_than)
    Utility.save_obj(new_list, out_file)

def gen_data(db_file, name_out_path):
    out = []
    for syl in Utility.load_obj(db_file):
        y = Syllable.get_normailze_with_missing_data(syl['raw_lf0'], 50, syl['dur'])

        syl['TF'] = dict()
        missing_data = dict()
        missing_data['data'] = y
        missing_data['description'] = (
            'Raw lf0 (first 50 + delta + delta-delta) + duration in frame units '
            '(the last element). Unvoiced frames are marked as missing data.')
        syl['TF']['missing151'] = missing_data
        out.append(syl)

    Utility.save_obj(out, name_out_path)

def remove_duration_data(db_file, name_out_path):
    db = Utility.load_obj(db_file)

    new_data = []
    for syl in db:
        d = syl['TF']['intepolate151normailize']['data']
        new_data.append(d)

    new_data = np.array(new_data)
    print new_data

    # Drop the two duration columns (indices 150 and 151)
    new_data = np.delete(new_data, [150, 151], axis=1)
    print new_data
    print new_data.shape

    new_db = []
    for idx, syl in enumerate(db):
        syl['TF']['intepolate150_normailize_no_duration'] = dict()
        syl['TF']['intepolate150_normailize_no_duration']['data'] = new_data[idx]
        syl['TF']['intepolate150_normailize_no_duration']['description'] = (
            'Normalized version of intepolate151 with the duration columns removed')
        new_db.append(syl)

    Utility.save_obj(new_db, name_out_path)

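# A minimal example of the column removal above (made-up shape). np.delete with
# axis=1 drops the listed columns, here standing in for the two trailing duration
# features:
#
#   >>> import numpy as np
#   >>> np.delete(np.arange(6).reshape(2, 3), [1, 2], axis=1)
#   array([[0],
#          [3]])
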
def set_pre_suc():
    tones = ['01234']
    name_list_path = '/home/h1/decha/Dropbox/python_workspace/Inter_speech_2016/playground/list_file_for_preceeding_suceeding/list_gpr_file/'
    for t in tones:
        path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/mix_object/current_version/all_vowel_type/syllable_object_{}.pickle'.format(t)
        print path
        syl_management = Utility.load_obj(path)

        for syl in syl_management.syllables_list:
            if 'manual' in syl.name_index:
                continue

            # Locate the syllable in its utterance list file, then record its neighbours
            name = syl.name_index.split('_')
            file_tar = '{}/{}/{}.lab'.format(name_list_path, name[2][0], name[2])
            list_file = Utility.read_file_line_by_line(file_tar)
            for idx, l in enumerate(list_file):
                f = Utility.trim(l)
                if f == syl.name_index:
                    preceeding = Utility.trim(list_file[idx - 1])
                    succeeding = Utility.trim(list_file[idx + 1])
                    syl.set_preceeding_succeeding_name_index(preceeding, succeeding)

        Utility.save_obj(syl_management, path)

def normalize_data(db_file, name_out_path):
    db = Utility.load_obj(db_file)

    new_data = []
    for syl in db:
        d = syl['TF']['intepolate151']['data']
        new_data.append(d)

    new_data = np.array(new_data)
    print new_data

    # L2-normalize every feature vector (row-wise)
    X_normalized = preprocessing.normalize(new_data, norm='l2')
    print X_normalized
    print X_normalized.shape

    new_db = []
    for idx, syl in enumerate(db):
        syl['TF']['intepolate151_normalize_by_preprocessing.normalize'] = dict()
        syl['TF']['intepolate151_normalize_by_preprocessing.normalize']['data'] = X_normalized[idx]
        syl['TF']['intepolate151_normalize_by_preprocessing.normalize']['description'] = (
            'preprocessing.normalize version of intepolate151')
        new_db.append(syl)

    Utility.save_obj(new_db, name_out_path)

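# A minimal example of what preprocessing.normalize(norm='l2') does to each row
# (values made up): every feature vector is scaled to unit Euclidean length:
#
#   >>> import numpy as np
#   >>> from sklearn import preprocessing
#   >>> preprocessing.normalize(np.array([[3.0, 4.0]]), norm='l2')
#   array([[ 0.6,  0.8]])
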
def run_data_processor(db_file):
    db = Utility.load_obj(db_file)

    # Collapse stress label '2' into class 1; keep '0' and '1' as-is
    real = []
    for syl in db:
        if syl['stress'] == '2':
            real.append(1)
        elif syl['stress'] in ['0', '1']:
            real.append(int(syl['stress']))
        else:
            print syl['stress']
            real.append(int(syl['stress']))

    real = np.array(real)
    return real

def find_min_y(db_all):
    # Find the lowest F0 value (in Hz) above 150 across the database.
    # Despite its name, max_y tracks the running minimum.
    max_y = 600
    for syl in Utility.load_obj(db_all):
        if len(syl['raw_lf0']) == 0:
            continue
        r = np.array(syl['raw_lf0'])
        r[r < 0] = np.nan  # unvoiced frames
        f0 = np.exp(r)
        # np.nanmin replaces the original min(), which is unreliable when NaN is present
        if np.nanmin(f0) < 150:
            continue
        if np.nanmin(f0) < max_y:
            max_y = np.nanmin(f0)
    print 'min y = ', max_y

def fix():
    base_path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/Tonal_object/remove_all_silence_file/'
    fixed_list_path = '/work/w13/decha/Inter_speech_2016_workplace/Fix_stress_label/fix_list/'

    fixed_list = np.array(load_fix_list(fixed_list_path))

    for v in Utility.list_file(base_path):
        if v.startswith('.'):
            continue
        vowel_path = '{}/{}/'.format(base_path, v)
        for tone in Utility.list_file(vowel_path):
            if tone.startswith('.'):
                continue
            tone_file_path = '{}/{}'.format(vowel_path, tone)
            print tone_file_path
            syl_obj = Utility.load_obj(tone_file_path)

            # Flip the manual stress label (0 <-> 1) for every syllable in the fix list
            for syl in syl_obj.syllables_list:
                if syl.name_index in fixed_list:
                    print syl.name_index, syl.stress_manual
                    if syl.stress_manual == 0:
                        syl.stress_manual = 1
                    else:
                        syl.stress_manual = 0

            Utility.save_obj(syl_obj, tone_file_path)

def normalize_data(db_file, name_out_path):
    db = Utility.load_obj(db_file)

    # Replace NaN (unvoiced) frames with the un_voice constant before scaling
    new_data = []
    for syl in db:
        d = syl['TF']['missing151']['data']
        dd = np.array(d)
        dd[np.argwhere(np.isnan(d))] = un_voice
        new_data.append(dd)

    new_data = np.array(new_data)
    print new_data

    # Scale each feature by its median and IQR, which is robust to outliers
    robust_scaler = RobustScaler()
    Xtr_r = robust_scaler.fit_transform(new_data)
    print Xtr_r
    print Xtr_r.shape

    new_db = []
    for idx, syl in enumerate(db):
        syl['TF']['missing151_robust_scale'] = dict()
        syl['TF']['missing151_robust_scale']['data'] = Xtr_r[idx]
        syl['TF']['missing151_robust_scale']['description'] = (
            'robust_scale version of missing151')
        new_db.append(syl)

    Utility.save_obj(new_db, name_out_path)

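# A minimal sketch of RobustScaler on a column with an outlier (values made up):
# it centers on the median and scales by the interquartile range, so extreme
# values such as the un_voice sentinel do not dominate the scaling the way they
# would with mean/std standardization:
#
#   >>> import numpy as np
#   >>> from sklearn.preprocessing import RobustScaler
#   >>> X = np.array([[1.0], [2.0], [3.0], [100.0]])
#   >>> RobustScaler().fit_transform(X).round(2).ravel()
#   array([-0.06, -0.02,  0.02,  3.82])
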