def compute_and_write(audio_dir, data_dir, features=None):
    """Compute frame-based features for all audio files in a folder.

    Every file directly inside `audio_dir` whose name ends in ``.wav`` or
    ``.mp3`` is loaded as mono audio with ``librosa.load`` and passed to
    each feature extraction function; each result is handed to
    ``utils.write_feature``.

    Args:
        audio_dir (str): where to find audio files
        data_dir (str): where to write features
        features (dict): dictionary with feature extraction functions,
            indexed by feature name. Each function is called as
            ``func(x, sr)`` and should return a 1d-array of frame times
            and a 2d-array of feature frames. The feature name is passed to
            ``utils.write_feature`` as a path component (documented as the
            subdirectory to which feature CSVs are written). Defaults to
            the mfcc, hpcp, melody, beats and onsets extractors.
    """
    if features is None:
        features = {'mfcc': get_mfcc,
                    'hpcp': get_hpcp,
                    'melody': get_melody,
                    'beats': get_beats,
                    'onsets': get_onsets}

    for filename in os.listdir(audio_dir):
        if not filename.endswith(('.wav', '.mp3')):
            continue

        print("Computing features for file {}...".format(filename))
        x, sr = librosa.load(os.path.join(audio_dir, filename), mono=True)

        # Strip only the final extension. The previous
        # ``filename.split('.')[-2]`` returned just the last dot-separated
        # piece before the extension, so 'a.b.wav' became 'b' and distinct
        # files could overwrite each other's features.
        track_id = os.path.splitext(filename)[0]

        for feature, func in features.items():
            t, X = func(x, sr)
            utils.write_feature([t, X], [data_dir, feature, track_id])
def setUp(self):
    """Write a rolled copy of the example track's chroma to a temp folder.

    Points ``pitch.chroma_dir`` at the ``hpcp/`` feature folder, loads the
    chroma of the example track, rolls it by -3 positions along axis 1
    (presumably a pitch-class transposition -- TODO confirm) and writes
    the result with ``utils.write_feature`` under ``self.temp_dir``.

    Sets ``self.example_id`` and ``self.temp_dir`` for use by the tests.
    """
    # NOTE(review): absolute, machine-specific paths; this fixture only
    # runs where this directory layout exists.
    data_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/'
    pitch.chroma_dir = data_dir + 'hpcp/'
    self.example_id = '147526770'

    t, chroma = pitch.get_chroma(self.example_id)
    # Single-argument print with explicit formatting: emits exactly what the
    # old ``print chroma.shape, '\n'`` statement did, but is also valid
    # syntax on Python 3.
    print('%s \n' % (chroma.shape,))

    chroma_trans = np.roll(chroma, -3, axis=1)
    print('%s \n' % (chroma_trans.shape,))

    self.temp_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/temp/'
    utils.write_feature([t, chroma_trans], [self.temp_dir, self.example_id])
def setUp(self):
    """Write a rolled copy of the example track's chroma to a temp folder.

    Points ``pitch.chroma_dir`` at the ``hpcp/`` feature folder, loads the
    chroma of the example track, rolls it by -3 positions along axis 1
    (presumably a pitch-class transposition -- TODO confirm) and writes
    the result with ``utils.write_feature`` under ``self.temp_dir``.

    Sets ``self.example_id`` and ``self.temp_dir`` for use by the tests.
    """
    # NOTE(review): absolute, machine-specific paths; this fixture only
    # runs where this directory layout exists.
    data_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/'
    pitch.chroma_dir = data_dir + 'hpcp/'
    self.example_id = '147526770'

    t, chroma = pitch.get_chroma(self.example_id)
    # Single-argument print with explicit formatting: emits exactly what the
    # old ``print chroma.shape, '\n'`` statement did, but is also valid
    # syntax on Python 3.
    print('%s \n' % (chroma.shape,))

    chroma_trans = np.roll(chroma, -3, axis=1)
    print('%s \n' % (chroma_trans.shape,))

    self.temp_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/temp/'
    utils.write_feature([t, chroma_trans], [self.temp_dir, self.example_id])
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute sum-normalized features for a set of tracks and write them.

    For every track and every entry in `features`, the extraction function
    is called as ``func(track_id, **params)``; the result is flattened,
    divided by its sum, and passed to ``utils.write_feature``.

    Args:
        data_dir (str): where to write features
        track_list (list of str or None): list of file ids. Set to None to
            infer from files in melody_dir and chroma_dir (the
            intersection is used, i.e. only ids present in both folders).
        features (dict): dictionary with (unique) feature names as keys and
            tuples as values, each containing a feature extraction
            function and a parameter dictionary. Each function must return
            a single numpy array, since the result is flattened and
            normalized as one array. Defaults to the pitch-histogram,
            chroma-histogram and harmonisation features listed below.

    Required global variables:
        melody_dir (str): where to find melody data
        chroma_dir (str): where to find chroma data
    """
    if track_list is None:
        melody_ids = {filename.split('.')[0]
                      for filename in os.listdir(melody_dir)}
        chroma_ids = {filename.split('.')[0]
                      for filename in os.listdir(chroma_dir)}
        # Keep only ids available in BOTH folders, as documented. The
        # previous ``set(melody_ids + chroma_ids)`` built the union, so a
        # track missing from one folder was still scheduled.
        # Sorted so the processing order is deterministic.
        track_list = sorted(melody_ids & chroma_ids)

    if features is None:
        features = {
            'pitchhist': (get_pitchhist, {}),
            'pitchhist2': (get_pitchhist2, {}),
            'pitchhist3': (get_pitchhist3, {}),
            'pitchhist3_int': (get_pitchhist3, {'intervals': True,
                                                'diagfactor': 1,
                                                'sqrt': False}),
            'chromahist2': (get_chromahist2, {}),
            'chromahist3': (get_chromahist3, {}),
            'chromahist3_int': (get_chromahist3, {'intervals': True}),
            'harmonisation': (get_harmonisation, {}),
            'harmonisation_int': (get_harmonisation, {'intervals': True}),
        }

    for track_id in track_list:
        print("Computing features for track {}...".format(track_id))

        for feature, (func, params) in features.items():
            # run feature function
            X = func(track_id, **params)

            # normalize (!) to unit sum and flatten
            # NOTE(review): an all-zero X divides by zero here and yields
            # NaNs -- confirm whether extractors can return all zeros.
            X = X.flatten() / np.sum(X)

            # write
            utils.write_feature(X, [data_dir, feature, track_id])
def compute_and_write(data_dir, track_list=None, features=None):
    """Run every configured feature extractor on every track and save it.

    Each extractor is invoked as ``func(track_id, **params)``; its result
    is flattened and handed to ``utils.write_feature`` together with
    ``[data_dir, feature, track_id]``.

    Args:
        data_dir (str): where to write features
        track_list (list or None): file ids to process. When None, the ids
            are taken from the file names found in onsets_dir and
            beats_dir (text before the first dot, duplicates removed).
        features (dict): maps a (unique) feature name to a tuple of
            (extraction function, keyword-parameter dict). When None, the
            tempo, log_norm_ioi, log_norm_ioi_hist, rpvi and npvi features
            are used.

    Required global variables:
        beats_dir (str): where to find beat data
        onsets_dir (str): where to find onset data
    """
    if features is None:
        features = {
            'tempo': (local_tempo, {}),
            'log_norm_ioi': (log_ioi, {'normalize_ioi': True}),
            'log_norm_ioi_hist': (ioi_histogram, {'min_length': -3,
                                                  'max_length': 3,
                                                  'step': 0.5}),
            'rpvi': (raw_pvi, {'normalize_ioi': False}),
            'npvi': (norm_pvi, {'normalize_ioi': False}),
        }

    if track_list is None:
        def _ids_in(folder):
            # id = everything before the first dot of the file name
            return [name.split('.')[0] for name in os.listdir(folder)]

        track_list = list(set(_ids_in(onsets_dir) + _ids_in(beats_dir)))

    for track_id in track_list:
        print("Computing features for track {}...".format(track_id))

        for feature, (extract, kwargs) in features.items():
            # compute, flatten, and write in one go
            flat = extract(track_id, **kwargs).flatten()
            utils.write_feature(flat, [data_dir, feature, track_id])
def compute_and_write(audio_dir, data_dir, features=None):
    """Compute frame-based features for all audio files in a folder.

    Every file directly inside `audio_dir` whose name ends in ``.wav`` or
    ``.mp3`` is loaded as mono audio with ``librosa.load`` and passed to
    each feature extraction function; each result is handed to
    ``utils.write_feature``.

    Args:
        audio_dir (str): where to find audio files
        data_dir (str): where to write features
        features (dict): dictionary with feature extraction functions,
            indexed by feature name. Each function is called as
            ``func(x, sr)`` and should return a 1d-array of frame times
            and a 2d-array of feature frames. The feature name is passed to
            ``utils.write_feature`` as a path component (documented as the
            subdirectory to which feature CSVs are written). Defaults to
            the mfcc, hpcp, melody, beats and onsets extractors.
    """
    if features is None:
        features = {
            'mfcc': get_mfcc,
            'hpcp': get_hpcp,
            'melody': get_melody,
            'beats': get_beats,
            'onsets': get_onsets
        }

    for filename in os.listdir(audio_dir):
        if not filename.endswith(('.wav', '.mp3')):
            continue

        print("Computing features for file {}...".format(filename))
        x, sr = librosa.load(os.path.join(audio_dir, filename), mono=True)

        # Strip only the final extension. The previous
        # ``filename.split('.')[-2]`` returned just the last dot-separated
        # piece before the extension, so 'a.b.wav' became 'b' and distinct
        # files could overwrite each other's features.
        track_id = os.path.splitext(filename)[0]

        for feature, func in features.items():
            t, X = func(x, sr)
            utils.write_feature([t, X], [data_dir, feature, track_id])
# Reduce the full ZCPseKNC feature tables of the human cell-line data to the
# 200 columns ranked best by SelectKBest with the f_classif score, then write
# the positive and negative rows back out as separate CSV files.
posX = np.array(pd.read_csv(
    './human_cell_line/data/ZCPseKNC/human_pos_ZCPseKNC_all_csv.txt',
    header=None))
negX = np.array(pd.read_csv(
    './human_cell_line/data/ZCPseKNC/human_neg_ZCPseKNC_all_csv.txt',
    header=None))

# Positive rows first, negative rows after them.
X = np.concatenate((posX, negX), axis=0)

# Labels: +1 for every positive row, -1 for every negative row.
posy = np.ones(len(posX))
negy = -np.ones(len(negX))
y = np.concatenate((posy, negy))

# (A disabled earlier experiment fitted XGBClassifier on X, y and printed the
# ten features with the highest feature_importances_.)

selector = SelectKBest(f_classif, k=200)
X_new = selector.fit_transform(X, y)

# Split the reduced matrix back into its positive / negative parts.
n_pos = len(posX)
posX_new = X_new[:n_pos]
negX_new = X_new[n_pos:n_pos + len(negX)]

pos_outPath = './human_cell_line/data/ZCPseKNC/human_pos_ZCPseKNC_top200_csv.txt'
neg_outPath = './human_cell_line/data/ZCPseKNC/human_neg_ZCPseKNC_top200_csv.txt'
write_feature(posX_new, pos_outPath, '+1', 'csv')
write_feature(negX_new, neg_outPath, '-1', 'csv')
'./Archaea/archaea_ZPseKNC/archaea_ZPseKNC_neg_XGBC_top10_csv.txt', header=None)
# NOTE(review): the line above is the tail of a call that starts before this
# excerpt; judging by the disabled code further down it is presumably
# ``negX = pd.read_csv(...)`` -- confirm.

# Write several random subsets of the negative samples, one file per size.
# The sizes are 518 scaled by each factor below, truncated to an integer.
# NOTE(review): 518 and 1072 look like the positive / negative sample
# counts -- confirm against the data files.
step = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
split_size = []
for i in step:
    split_size.append(int(518 * i))
# print(split_size)
for s in split_size:
    # Draw s distinct row labels from 0..1071. No random seed is set in the
    # code visible here, so the subsets differ from run to run.
    index = np.random.choice(1072, size=s, replace=False)
    negX_new = negX.loc[index]
    negX_new = negX_new.values
    # print(negX_new.shape)
    neg_outPath = './Archaea/archaea_ZPseKNC/archaea_negtive_split/archaea_ZPseKNC_neg_XGBC_top10_' + str(
        s) + '_csv.txt'
    write_feature(negX_new, neg_outPath, '-1', 'csv')

# Disabled alternative: take growing prefixes of the negatives instead of
# random subsets, writing them in 'svm' format.
# negX = pd.read_csv('./Archaea/archaea_ZPseKNC/archaea_ZPseKNC_neg_XGBC_top10_csv.txt',header=None)
# inc = int(518*0.1) # increment
# # print(inc)
# cnt = int((1072-518)/inc) # number of iterations
# for i in range(cnt+1):
#     negX_new = negX.loc[:517+inc*i] # closed interval (.loc includes the end label)
#     negX_new = negX_new.values
#     # print(negX_new.shape)
#     s = str(518+inc*i)
#     neg_outPath = './Archaea/archaea_ZPseKNC/archaea_negtive_split/archaea_ZPseKNC_neg_XGBC_top10_'+str(s)+'_svm.txt'
#     write_feature(negX_new, neg_outPath, '-1', 'svm')
# Disabled earlier runs, kept for reference.
# gc_na = ZCPseKNC('./Archaea/archaea_neg80.txt',k,w,lamb)
# write_feature(gc_na, './Archaea/ZCPseKNC/neg/archaea_ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

# for k in (1,2,3,4,5):
#     for w in (0.1,0.5,0.9):
#         lamb = 5
#         gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt',k,w,lamb)
#         write_feature(gc_a, './human_cell_line/data/ZCPseKNC/ZCPseKNC_pos'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '+1', 'csv')
#         gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt',k,w,lamb)
#         write_feature(gc_na, './human_cell_line/data/ZCPseKNC/ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

# for k in (1,2,3):
#     w = 0.3
#     lamb = 5
#     gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt',k,w,lamb)
#     write_feature(gc_a, './human_cell_line/data/ZCPseKNC_k123/pos/ZCPseKNC_pos'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '+1', 'csv')
#     gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt',k,w,lamb)
#     write_feature(gc_na, './human_cell_line/data/ZCPseKNC_k123/neg/ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

# Active run: one fixed (k, w, lambda) setting on the human cell-line data.
k, w, lamb = 3, 0.3, 5

# Shared file-name suffix that encodes the parameter setting.
param_tag = '_k_{}_w_{}_lambda_{}_csv.txt'.format(k, w, lamb)

# Positive samples, written with label '+1'.
gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt', k, w, lamb)
write_feature(gc_a,
              './human_cell_line/data/human_pos_ZCPseKNC' + param_tag,
              '+1', 'csv')

# Negative samples, written with label '-1'.
gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt', k, w, lamb)
write_feature(gc_na,
              './human_cell_line/data/human_neg_ZCPseKNC' + param_tag,
              '-1', 'csv')
def compute_and_write(data_dir, track_list=None, features=None):
    """Run every configured feature extractor on every track and save it.

    Each extractor is invoked as ``func(track_id, **params)``; its result
    is flattened and handed to ``utils.write_feature`` together with
    ``[data_dir, feature, track_id]``.

    Args:
        data_dir (str): where to write features
        track_list (list or None): file ids to process. When None, the ids
            are taken from the file names found in onsets_dir and
            beats_dir (text before the first dot, duplicates removed).
        features (dict): maps a (unique) feature name to a tuple of
            (extraction function, keyword-parameter dict). When None, the
            tempo, log_norm_ioi, log_norm_ioi_hist, rpvi and npvi features
            are used.

    Required global variables:
        beats_dir (str): where to find beat data
        onsets_dir (str): where to find onset data
    """
    if features is None:
        features = {
            'tempo': (local_tempo, {}),
            'log_norm_ioi': (log_ioi, {'normalize_ioi': True}),
            'log_norm_ioi_hist': (ioi_histogram, {'min_length': -3,
                                                  'max_length': 3,
                                                  'step': 0.5}),
            'rpvi': (raw_pvi, {'normalize_ioi': False}),
            'npvi': (norm_pvi, {'normalize_ioi': False}),
        }

    if track_list is None:
        def _ids_in(folder):
            # id = everything before the first dot of the file name
            return [name.split('.')[0] for name in os.listdir(folder)]

        track_list = list(set(_ids_in(onsets_dir) + _ids_in(beats_dir)))

    for track_id in track_list:
        print("Computing features for track {}...".format(track_id))

        for feature, (extract, kwargs) in features.items():
            # compute, flatten, and write in one go
            flat = extract(track_id, **kwargs).flatten()
            utils.write_feature(flat, [data_dir, feature, track_id])
# Disabled earlier runs, kept for reference.
# write_feature(gc_a, './Archaea/Zcurve/archaea_pos_Zcurve_k_1_csv.txt', '+1', 'csv')
# gc_na = Zcurve_one('./Archaea/archaea_neg80.txt')
# write_feature(gc_na, './Archaea/Zcurve/archaea_neg_Zcurve_k_1_csv.txt', '-1', 'csv')

# for k in (1,2,3,4,5):
#     gc_a = Zcurve('./human_cell_line/data/human_pos.txt',k)
#     write_feature(gc_a, './human_cell_line/data/Zcurve/human_pos_Zcurve'+'_k_'+str(k)+'_csv.txt', '+1', 'csv')
#     gc_na = Zcurve('./human_cell_line/data/human_neg.txt',k)
#     write_feature(gc_na, './human_cell_line/data/Zcurve/human_neg_Zcurve'+'_k_'+str(k)+'_csv.txt', '-1', 'csv')

# Active run: build one combined feature row per sequence by appending the
# Zcurve features for k = 2 and k = 3 to the k = 1 features.
pos_path = './human_cell_line/data/human_pos.txt'
neg_path = './human_cell_line/data/human_neg.txt'

# Start from the k = 1 rows; they are extended in place below.
gc_a_all = Zcurve(pos_path, 1)
print('gc_a_all', len(gc_a_all), len(gc_a_all[0]))
gc_na_all = Zcurve(neg_path, 1)

for k in (2, 3):
    gc_a = Zcurve(pos_path, k)
    print(k)
    print('gc_a', len(gc_a), len(gc_a[0]))
    # Append this k's features to the matching positive row.
    for row_idx, extra in enumerate(gc_a):
        gc_a_all[row_idx].extend(extra)
    print('gc_a_all', len(gc_a_all), len(gc_a_all[0]))

    gc_na = Zcurve(neg_path, k)
    # Same for the negative rows.
    for row_idx, extra in enumerate(gc_na):
        gc_na_all[row_idx].extend(extra)

write_feature(gc_a_all,
              './human_cell_line/data/Zcurve/human_pos_Zcurve_all_3_csv.txt',
              '+1', 'csv')
write_feature(gc_na_all,
              './human_cell_line/data/Zcurve/human_neg_Zcurve_all_3_csv.txt',
              '-1', 'csv')
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute sum-normalized features for a set of tracks and write them.

    For every track and every entry in `features`, the extraction function
    is called as ``func(track_id, **params)``; the result is flattened,
    divided by its sum, and passed to ``utils.write_feature``.

    Args:
        data_dir (str): where to write features
        track_list (list of str or None): list of file ids. Set to None to
            infer from files in melody_dir and chroma_dir (the
            intersection is used, i.e. only ids present in both folders).
        features (dict): dictionary with (unique) feature names as keys and
            tuples as values, each containing a feature extraction
            function and a parameter dictionary. Each function must return
            a single numpy array, since the result is flattened and
            normalized as one array. Defaults to the pitch-histogram,
            chroma-histogram and harmonisation features listed below.

    Required global variables:
        melody_dir (str): where to find melody data
        chroma_dir (str): where to find chroma data
    """
    if track_list is None:
        melody_ids = {
            filename.split('.')[0] for filename in os.listdir(melody_dir)
        }
        chroma_ids = {
            filename.split('.')[0] for filename in os.listdir(chroma_dir)
        }
        # Keep only ids available in BOTH folders, as documented. The
        # previous ``set(melody_ids + chroma_ids)`` built the union, so a
        # track missing from one folder was still scheduled.
        # Sorted so the processing order is deterministic.
        track_list = sorted(melody_ids & chroma_ids)

    if features is None:
        features = {
            'pitchhist': (get_pitchhist, {}),
            'pitchhist2': (get_pitchhist2, {}),
            'pitchhist3': (get_pitchhist3, {}),
            'pitchhist3_int': (get_pitchhist3, {
                'intervals': True,
                'diagfactor': 1,
                'sqrt': False
            }),
            'chromahist2': (get_chromahist2, {}),
            'chromahist3': (get_chromahist3, {}),
            'chromahist3_int': (get_chromahist3, {
                'intervals': True
            }),
            'harmonisation': (get_harmonisation, {}),
            'harmonisation_int': (get_harmonisation, {
                'intervals': True
            })
        }

    for track_id in track_list:
        print("Computing features for track {}...".format(track_id))

        for feature, (func, params) in features.items():
            # run feature function
            X = func(track_id, **params)

            # normalize (!) to unit sum and flatten
            # NOTE(review): an all-zero X divides by zero here and yields
            # NaNs -- confirm whether extractors can return all zeros.
            X = X.flatten() / np.sum(X)

            # write
            utils.write_feature(X, [data_dir, feature, track_id])
# Tail of a function whose header lies above this excerpt (presumably
# codon_PseKNC, which the __main__ block below calls -- confirm).
        return -1
    # Read the input file line by line and build one feature vector per
    # sequence line.
    with open(fileName,'r') as f:
        for line in f.readlines():
            # Lines starting with '>' are skipped; only the other lines are
            # treated as sequences.
            if line[0] == '>':
                continue
            else:
                line = line.strip()
                featureVector,feature_name = codon_PseKNC_feature(line,k,w,lamb)
                feature_list.append(featureVector)
    # flag selects the return value: 0 -> the list of feature vectors,
    # anything else -> the feature names from the last processed sequence.
    # NOTE(review): feature_name is only assigned inside the loop in the code
    # visible here; unless it is initialised above this excerpt, a file
    # without sequence lines makes the else-branch raise UnboundLocalError.
    if flag==0:
        return feature_list
    else:
        return feature_name


if __name__ == '__main__':
    # Parameter sweep over k, w and lambda on the Archaea data: for every
    # combination, write the positive and negative features in 'svm' format
    # to a file whose name encodes the parameters.
    for k in (1,2,3):
        for w in (0.1,0.5,0.9):
            for lamb in (1,3,5):
                gc_a = codon_PseKNC('./Archaea/archaea_pos80.txt',k,w,lamb)
                write_feature(gc_a, './Archaea/codon_PseKNC/codon_PseKNC_pos_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '+1', 'svm')
                gc_na = codon_PseKNC('./Archaea/archaea_neg80.txt',k,w,lamb)
                write_feature(gc_na, './Archaea/codon_PseKNC/codon_PseKNC_neg_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '-1', 'svm')

    # Disabled single-setting run using Zcurve_PseKNC.
    # k=3
    # w=0.6
    # lamb=3
    # gc_a = Zcurve_PseKNC('./Archaea/archaea_pos80.txt',k,w,lamb)
    # # write_feature(gc_a, './tmp_Zcurve+PseKNC_pos_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '+1', 'svm')
    # # gc_na = Zcurve_PseKNC('./Archaea/archaea_neg80.txt',k,w,lamb)
    # # write_feature(gc_na, './tmp_Zcurve+PseKNC_neg_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '-1', 'svm')
Compute k-mer features for every sequence sample in the file.

    Parameters:
        fileName: name of the sample data file, including its path; the
            data must be in FASTA format
        k: length of the k-mer fragments (only 1 to 5 is accepted)

    Returns:
        A list with one feature vector per sequence line, or -1 when k is
        outside the accepted range.
    '''
    # Reject unsupported k values; callers receive -1 instead of a list.
    if k > 5 or k < 1:
        print('invalide k value!')
        return -1
    feature_list = []
    with open(fileName) as f:
        for line in f.readlines():
            # Lines starting with '>' (FASTA header lines) are skipped.
            if line[0] == '>':
                continue
            else:
                # Strip surrounding whitespace and upper-case the sequence
                # before extracting features.
                seq = line.strip().upper()
                featureVector = kmer_feature(seq, k)
                feature_list.append(featureVector)
    return feature_list


if __name__ == '__main__':
    # For every k from 1 to 5, compute k-mer features of the positive and
    # negative human cell-line samples and write each set as CSV to a file
    # whose name encodes k.
    for k in [1, 2, 3, 4, 5]:
        gc_a = Kmer('./human_cell_line/data/human_pos.txt', k)
        write_feature(
            gc_a, './human_cell_line/data/kmer_me/human_pos_kmer' + '_k_' +
            str(k) + '_csv.txt', '+1', 'csv')
        gc_na = Kmer('./human_cell_line/data/human_neg.txt', k)
        write_feature(
            gc_na, './human_cell_line/data/kmer_me/human_neg_kmer' + '_k_' +
            str(k) + '_csv.txt', '-1', 'csv')