예제 #1
0
def compute_and_write(audio_dir, data_dir, features=None):
    """Extract frame-based features from every audio file in a directory.

    Args:
        audio_dir (str): directory scanned for audio files; only names
            ending in '.wav' or '.mp3' are processed.
        data_dir (str): root location handed to ``utils.write_feature``.
        features (dict): feature extraction functions, indexed by feature
            name. Each function is called as ``func(x, sr)`` with the
            loaded signal and its sample rate and must return a pair:
            a 1d-array of frame times and a 2d-array of feature frames.
            The feature name is passed on as the subdirectory component
            of the output location. When None, the mfcc, hpcp, melody,
            beats and onsets extractors are used.
    """
    if features is None:
        features = {
            'mfcc': get_mfcc,
            'hpcp': get_hpcp,
            'melody': get_melody,
            'beats': get_beats,
            'onsets': get_onsets,
        }

    audio_suffixes = ('.wav', '.mp3')

    for filename in os.listdir(audio_dir):
        # Guard clause: anything that is not a supported audio file is ignored.
        if not filename.endswith(audio_suffixes):
            continue

        print("Computing features for file {}...".format(filename))

        audio_path = os.path.join(audio_dir, filename)
        x, sr = librosa.load(audio_path, mono=True)

        # The id is the second-to-last dot-separated piece of the file name:
        # 'song.wav' -> 'song', but also 'a.b.wav' -> 'b'.
        track_id = filename.split('.')[-2]

        for feature_name, extractor in features.items():
            frame_times, frames = extractor(x, sr)
            utils.write_feature([frame_times, frames],
                                [data_dir, feature_name, track_id])
예제 #2
0
    def setUp(self):
        """Write a rolled copy of one track's chroma to a temp directory.

        Points ``pitch.chroma_dir`` at the 'hpcp/' feature folder, loads the
        chroma of ``self.example_id``, rolls it by -3 positions along axis 1
        (presumably a transposition by three chroma bins -- confirm) and
        writes the result under ``self.temp_dir``.
        """
        data_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/'
        pitch.chroma_dir = data_dir + 'hpcp/'
        self.example_id = '147526770'

        t, chroma = pitch.get_chroma(self.example_id)
        # print() with a single pre-formatted argument works on both
        # Python 2 and Python 3; the old print statement is a SyntaxError
        # on Python 3.
        print('{}\n'.format(chroma.shape))
        chroma_trans = np.roll(chroma, -3, axis=1)
        print('{}\n'.format(chroma_trans.shape))

        # Same location as before, derived from data_dir instead of being
        # spelled out a second time.
        self.temp_dir = data_dir + 'temp/'
        utils.write_feature([t, chroma_trans], [self.temp_dir, self.example_id])
예제 #3
0
    def setUp(self):
        """Write a rolled copy of one track's chroma to a temp directory.

        Points ``pitch.chroma_dir`` at the 'hpcp/' feature folder, loads the
        chroma of ``self.example_id``, rolls it by -3 positions along axis 1
        (presumably a transposition by three chroma bins -- confirm) and
        writes the result under ``self.temp_dir``.
        """
        data_dir = '/Users/Jan/Documents/Work/Cogitch/Data/HookedOnMusic/features/'
        pitch.chroma_dir = data_dir + 'hpcp/'
        self.example_id = '147526770'

        t, chroma = pitch.get_chroma(self.example_id)
        # print() with a single pre-formatted argument works on both
        # Python 2 and Python 3; the old print statement is a SyntaxError
        # on Python 3.
        print('{}\n'.format(chroma.shape))
        chroma_trans = np.roll(chroma, -3, axis=1)
        print('{}\n'.format(chroma_trans.shape))

        # Same location as before, derived from data_dir instead of being
        # spelled out a second time.
        self.temp_dir = data_dir + 'temp/'
        utils.write_feature([t, chroma_trans],
                            [self.temp_dir, self.example_id])
예제 #4
0
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute per-track features and write them as normalized flat vectors.

    Args:
        data_dir (str): root location handed to ``utils.write_feature``.
        track_list (list of str or None): track ids to process. Set to None
            to infer them from the file names in melody_dir and chroma_dir;
            only ids present in BOTH directories are used (the
            intersection). Hidden files (names starting with '.') are
            ignored.
        features (dict or None): dictionary with (unique) feature names as
            keys and tuples as values, each containing a feature extraction
            function and a parameter dictionary. Each function is called as
            ``func(track_id, **params)`` and must return an array that
            supports ``flatten()``. Set to None to use the default
            pitchhist / chromahist / harmonisation feature set.

    Required global variables:
        melody_dir (str): where to find melody data
        chroma_dir (str): where to find chroma data
    """
    if track_list is None:
        id_sets = []
        for source_dir in (melody_dir, chroma_dir):
            # The track id is everything before the first dot of the file
            # name; dot-files such as '.DS_Store' would give an empty id.
            id_sets.append({name.split('.')[0]
                            for name in os.listdir(source_dir)
                            if not name.startswith('.')})
        # Intersection, as documented: a track needs both melody and chroma
        # data. Sorted so the processing order is reproducible.
        track_list = sorted(id_sets[0] & id_sets[1])

    if features is None:
        features = {
            'pitchhist': (get_pitchhist, {}),
            'pitchhist2': (get_pitchhist2, {}),
            'pitchhist3': (get_pitchhist3, {}),
            'pitchhist3_int': (get_pitchhist3, {'intervals': True,
                                                'diagfactor': 1,
                                                'sqrt': False}),
            'chromahist2': (get_chromahist2, {}),
            'chromahist3': (get_chromahist3, {}),
            'chromahist3_int': (get_chromahist3, {'intervals': True}),
            'harmonisation': (get_harmonisation, {}),
            'harmonisation_int': (get_harmonisation, {'intervals': True}),
        }

    for track_id in track_list:

        print("Computing features for track {}...".format(track_id))

        for feature in features:

            # run feature function
            func, params = features[feature]
            X = func(track_id, **params)

            # normalize (!) and flatten: entries are scaled to sum to 1.
            # NOTE: an all-zero X divides by zero and yields NaNs.
            X = X.flatten() / np.sum(X)

            # write
            utils.write_feature(X, [data_dir, feature, track_id])
예제 #5
0
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute per-track rhythm features and write them as flat vectors.

    Args:
        data_dir (str): root location handed to ``utils.write_feature``.
        track_list (list of str or None): track ids to process. Set to None
            to infer them from the file names found in onsets_dir and
            beats_dir (every id seen in either directory is used).
        features (dict or None): dictionary with (unique) feature names as
            keys and tuples as values, each containing a feature extraction
            function and a parameter dictionary. Each function is called as
            ``func(track_id, **params)``; the ``flatten()`` of its result
            is what gets written. Set to None to use the default tempo /
            IOI / PVI feature set.

    Required global variables:
        beats_dir (str): where to find beat data
        onsets_dir (str): where to find onset data
    """
    if features is None:
        features = {
            'tempo': (local_tempo, {}),
            'log_norm_ioi': (log_ioi, {'normalize_ioi': True}),
            'log_norm_ioi_hist': (ioi_histogram,
                                  {'min_length': -3, 'max_length': 3, 'step': 0.5}),
            'rpvi': (raw_pvi, {'normalize_ioi': False}),
            'npvi': (norm_pvi, {'normalize_ioi': False}),
        }

    if track_list is None:
        onset_files = os.listdir(onsets_dir)
        beat_files = os.listdir(beats_dir)
        # The id is the part of each file name before its first dot;
        # duplicates across the two directories collapse in the set.
        stems = [name.split('.')[0] for name in onset_files + beat_files]
        track_list = list(set(stems))

    for track_id in track_list:
        print("Computing features for track {}...".format(track_id))

        for feature_name, (extractor, kwargs) in features.items():
            flat = extractor(track_id, **kwargs).flatten()
            utils.write_feature(flat, [data_dir, feature_name, track_id])
예제 #6
0
def compute_and_write(audio_dir, data_dir, features=None):
    """Run each feature extractor on each audio file found in a folder.

    Args:
        audio_dir (str): folder to scan; only entries whose names end in
            '.wav' or '.mp3' are loaded.
        data_dir (str): root location handed to ``utils.write_feature``.
        features (dict): feature extraction functions, indexed by feature
            name. Every function is called as ``func(x, sr)`` with the
            loaded signal and sample rate, and must return two values: a
            1d-array of frame times and a 2d-array of feature frames.
            The feature name becomes the subdirectory component of the
            output location. Defaults to the mfcc, hpcp, melody, beats and
            onsets extractors when None.
    """
    if features is None:
        features = {
            'mfcc': get_mfcc,
            'hpcp': get_hpcp,
            'melody': get_melody,
            'beats': get_beats,
            'onsets': get_onsets
        }

    for filename in os.listdir(audio_dir):
        is_audio = filename.endswith(('.wav', '.mp3'))
        if not is_audio:
            continue

        print("Computing features for file {}...".format(filename))
        x, sr = librosa.load(os.path.join(audio_dir, filename), mono=True)

        # Second-to-last dot-separated piece of the name, so 'song.wav'
        # gives 'song' while 'a.b.wav' gives 'b'.
        track_id = filename.split('.')[-2]
        for feature, func in features.items():
            t, X = func(x, sr)
            target = [data_dir, feature, track_id]
            utils.write_feature([t, X], target)
예제 #7
0
# Load the positive and negative ZCPseKNC feature tables (no header row)
# and convert them straight to arrays.
posX = np.array(pd.read_csv(
    './human_cell_line/data/ZCPseKNC/human_pos_ZCPseKNC_all_csv.txt',
    header=None))
negX = np.array(pd.read_csv(
    './human_cell_line/data/ZCPseKNC/human_neg_ZCPseKNC_all_csv.txt',
    header=None))
# print(posX.columns[np.where(np.isnan(posX))[1]])

# Positives first, negatives second; the split further down relies on
# this row order.
X = np.vstack((posX, negX))
# print(X.shape)

# Labels: +1 for every positive row, -1 for every negative row.
posy = np.ones(len(posX))
negy = np.full(len(negX), -1.0)
y = np.concatenate((posy, negy))

# feature_importance = XGBClassifier().fit(X,y).feature_importances_
# feature_importance_array = np.array(feature_importance)
# sort_index = list(np.argsort(feature_importance_array))
# for i in range(-1,-11,-1):
#     index = sort_index[i]
#     print('feature_name:'+feature_name[index]+'         '+'importance:'+str(feature_importance[index]))

# Keep the 200 columns that score highest under f_classif.
selector = SelectKBest(f_classif, k=200)
X_new = selector.fit_transform(X, y)
# print(X_new.shape)

# Undo the stacking: the first len(posX) rows are the positives, the rest
# are the negatives.
n_pos = len(posX)
posX_new = X_new[:n_pos]
negX_new = X_new[n_pos:]
# print(posX_new.shape)

pos_outPath = './human_cell_line/data/ZCPseKNC/human_pos_ZCPseKNC_top200_csv.txt'
neg_outPath = './human_cell_line/data/ZCPseKNC/human_neg_ZCPseKNC_top200_csv.txt'
write_feature(posX_new, pos_outPath, '+1', 'csv')
write_feature(negX_new, neg_outPath, '-1', 'csv')
예제 #8
0
    './Archaea/archaea_ZPseKNC/archaea_ZPseKNC_neg_XGBC_top10_csv.txt',
    header=None)
# Sample sizes: 518 scaled by each factor in ``step``, truncated to int.
step = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
split_size = [int(518 * factor) for factor in step]
# print(split_size)

for s in split_size:
    # Draw s distinct row labels out of 0..1071 (no replacement) and pull
    # those rows from negX as a plain array.
    index = np.random.choice(1072, size=s, replace=False)
    negX_new = negX.loc[index].values
    # print(negX_new.shape)
    neg_outPath = (
        './Archaea/archaea_ZPseKNC/archaea_negtive_split/archaea_ZPseKNC_neg_XGBC_top10_'
        + str(s) + '_csv.txt')
    write_feature(negX_new, neg_outPath, '-1', 'csv')

# negX = pd.read_csv('./Archaea/archaea_ZPseKNC/archaea_ZPseKNC_neg_XGBC_top10_csv.txt',header=None)

# inc = int(518*0.1)      # increment
# # print(inc)

# cnt = int((1072-518)/inc)          # number of iterations
# for i in range(cnt+1):
#     negX_new = negX.loc[:517+inc*i]               # closed interval
#     negX_new = negX_new.values
#     # print(negX_new.shape)
#     s = str(518+inc*i)
#     neg_outPath = './Archaea/archaea_ZPseKNC/archaea_negtive_split/archaea_ZPseKNC_neg_XGBC_top10_'+str(s)+'_svm.txt'
#     write_feature(negX_new, neg_outPath, '-1', 'svm')
예제 #9
0
    #             gc_na = ZCPseKNC('./Archaea/archaea_neg80.txt',k,w,lamb)
    #             write_feature(gc_na, './Archaea/ZCPseKNC/neg/archaea_ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

    # for k in (1,2,3,4,5):
    #     for w in (0.1,0.5,0.9):
    #         lamb = 5
    #         gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt',k,w,lamb)
    #         write_feature(gc_a, './human_cell_line/data/ZCPseKNC/ZCPseKNC_pos'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '+1', 'csv')
    #         gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt',k,w,lamb)
    #         write_feature(gc_na, './human_cell_line/data/ZCPseKNC/ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

    # for k in (1,2,3):
    #         w = 0.3
    #         lamb = 5
    #         gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt',k,w,lamb)
    #         write_feature(gc_a, './human_cell_line/data/ZCPseKNC_k123/pos/ZCPseKNC_pos'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '+1', 'csv')
    #         gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt',k,w,lamb)
    #         write_feature(gc_na, './human_cell_line/data/ZCPseKNC_k123/neg/ZCPseKNC_neg'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'_csv.txt', '-1', 'csv')

    # Single parameter setting; the same k / w / lambda tag is appended to
    # both output file names.
    k, w, lamb = 3, 0.3, 5
    param_suffix = ('_k_' + str(k) + '_w_' + str(w) + '_lambda_' +
                    str(lamb) + '_csv.txt')

    # Positive samples, written with label '+1'.
    gc_a = ZCPseKNC('./human_cell_line/data/human_pos.txt', k, w, lamb)
    write_feature(gc_a,
                  './human_cell_line/data/human_pos_ZCPseKNC' + param_suffix,
                  '+1', 'csv')

    # Negative samples, written with label '-1'.
    gc_na = ZCPseKNC('./human_cell_line/data/human_neg.txt', k, w, lamb)
    write_feature(gc_na,
                  './human_cell_line/data/human_neg_ZCPseKNC' + param_suffix,
                  '-1', 'csv')
예제 #10
0
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute frame-based features for all audio files in a folder.

    Args:
        data_dir (str): where to write features
        track_list (str or None): list of file ids. Set to None to infer from
            files in beats_dir and onsets_dir.
        features (dict): dictionary with (unique) feature names as keys and 
            tuples as values, each containing a feature extraction function and a
            parameter dictionary.
            Feature extraction functions can be any function that returns one
                or more 1d or 2d-arrays that share their first dimension.

    Required global variables:
        beats_dir (str): where to find beat data
        onsets_dir (str): where to find onset data
    """
    if features is None:
        features = {
            'tempo': (local_tempo, {}),
            'log_norm_ioi': (log_ioi, {
                'normalize_ioi': True
            }),
            'log_norm_ioi_hist': (ioi_histogram, {
                'min_length': -3,
                'max_length': 3,
                'step': 0.5
            }),
            'rpvi': (raw_pvi, {
                'normalize_ioi': False
            }),
            'npvi': (norm_pvi, {
                'normalize_ioi': False
            })
        }

    if track_list is None:
        onsets_ids = [
            filename.split('.')[0] for filename in os.listdir(onsets_dir)
        ]
        beats_ids = [
            filename.split('.')[0] for filename in os.listdir(beats_dir)
        ]

        track_list = list(set(onsets_ids + beats_ids))

    for track_id in track_list:

        print("Computing features for track {}...".format(track_id))

        for feature in features:

            # run feature function
            func, params = features[feature]
            X = func(track_id, **params)

            # flatten
            X = X.flatten()

            # write
            utils.write_feature(X, [data_dir, feature, track_id])
예제 #11
0
    # write_feature(gc_a, './Archaea/Zcurve/archaea_pos_Zcurve_k_1_csv.txt', '+1', 'csv')
    # gc_na = Zcurve_one('./Archaea/archaea_neg80.txt')
    # write_feature(gc_na, './Archaea/Zcurve/archaea_neg_Zcurve_k_1_csv.txt', '-1', 'csv')

    # for k in (1,2,3,4,5):
    #     gc_a = Zcurve('./human_cell_line/data/human_pos.txt',k)
    #     write_feature(gc_a, './human_cell_line/data/Zcurve/human_pos_Zcurve'+'_k_'+str(k)+'_csv.txt', '+1', 'csv')
    #     gc_na = Zcurve('./human_cell_line/data/human_neg.txt',k)
    #     write_feature(gc_na, './human_cell_line/data/Zcurve/human_neg_Zcurve'+'_k_'+str(k)+'_csv.txt', '-1', 'csv')

    # Start from the k=1 Z-curve features and append the k=2 and k=3
    # features to each sample's row, for positives and negatives alike.
    pos_path = './human_cell_line/data/human_pos.txt'
    neg_path = './human_cell_line/data/human_neg.txt'

    gc_a_all = Zcurve(pos_path, 1)
    print('gc_a_all', len(gc_a_all), len(gc_a_all[0]))
    gc_na_all = Zcurve(neg_path, 1)

    for k in (2, 3):
        gc_a = Zcurve(pos_path, k)
        print(k)
        print('gc_a', len(gc_a), len(gc_a[0]))
        # Rows are matched by position: sample i of this k extends sample i
        # of the accumulated table in place.
        for i, row in enumerate(gc_a):
            gc_a_all[i].extend(row)
        print('gc_a_all', len(gc_a_all), len(gc_a_all[0]))

        gc_na = Zcurve(neg_path, k)
        for i, row in enumerate(gc_na):
            gc_na_all[i].extend(row)

    pos_out = './human_cell_line/data/Zcurve/human_pos_Zcurve_all_3_csv.txt'
    neg_out = './human_cell_line/data/Zcurve/human_neg_Zcurve_all_3_csv.txt'
    write_feature(gc_a_all, pos_out, '+1', 'csv')
    write_feature(gc_na_all, neg_out, '-1', 'csv')
예제 #12
0
def compute_and_write(data_dir, track_list=None, features=None):
    """Compute frame-based features for all audio files in a folder.

    Args:
        data_dir (str): where to write features
        track_list (str or None): list of file ids. Set to None to infer from
            files in melody_dir and chroma_dir (the intersection is used).
        features (dict): dictionary with (unique) feature names as keys and 
            tuples as values, each containing a feature extraction function and a
            parameter dictionary.
            Feature extraction functions can be any function that returns one
                or more 1d or 2d-arrays that share their first dimension.

    Required global variables:
        melody_dir (str): where to find melody data
        chroma_dir (str): where to find chroma data
    """

    if track_list is None:
        melody_ids = [
            filename.split('.')[0] for filename in os.listdir(melody_dir)
        ]
        chroma_ids = [
            filename.split('.')[0] for filename in os.listdir(chroma_dir)
        ]

        track_list = list(set(melody_ids + chroma_ids))

    if features is None:
        features = {
            'pitchhist': (get_pitchhist, {}),
            'pitchhist2': (get_pitchhist2, {}),
            'pitchhist3': (get_pitchhist3, {}),
            'pitchhist3_int': (get_pitchhist3, {
                'intervals': True,
                'diagfactor': 1,
                'sqrt': False
            }),
            'chromahist2': (get_chromahist2, {}),
            'chromahist3': (get_chromahist3, {}),
            'chromahist3_int': (get_chromahist3, {
                'intervals': True
            }),
            'harmonisation': (get_harmonisation, {}),
            'harmonisation_int': (get_harmonisation, {
                'intervals': True
            })
        }

    for track_id in track_list:

        print("Computing features for track {}...".format(track_id))

        for feature in features:

            # run feature function
            func, params = features[feature]
            X = func(track_id, **params)

            # normalize (!) and flatten
            X = X.flatten() / np.sum(X)

            # write
            utils.write_feature(X, [data_dir, feature, track_id])
예제 #13
0
        return -1
    with open(fileName,'r') as f:
        for line in f.readlines():
            if line[0] == '>':
                continue
            else:
                line = line.strip()
                featureVector,feature_name = codon_PseKNC_feature(line,k,w,lamb)
            feature_list.append(featureVector)
    if flag==0:
        return feature_list
    else:
        return feature_name

if __name__ == '__main__':
    # Grid over k, w and lambda: for every combination, compute the
    # codon_PseKNC features of the positive and negative Archaea sets and
    # write each to its own 'svm'-format file tagged with the parameters.
    for k in (1, 2, 3):
        for w in (0.1, 0.5, 0.9):
            for lamb in (1, 3, 5):
                param_tag = ('_k_' + str(k) + '_w_' + str(w) +
                             '_lambda_' + str(lamb) + '.txt')

                gc_a = codon_PseKNC('./Archaea/archaea_pos80.txt', k, w, lamb)
                pos_out = './Archaea/codon_PseKNC/codon_PseKNC_pos_svm' + param_tag
                write_feature(gc_a, pos_out, '+1', 'svm')

                gc_na = codon_PseKNC('./Archaea/archaea_neg80.txt', k, w, lamb)
                neg_out = './Archaea/codon_PseKNC/codon_PseKNC_neg_svm' + param_tag
                write_feature(gc_na, neg_out, '-1', 'svm')

    # k=3
    # w=0.6
    # lamb=3
    # gc_a = Zcurve_PseKNC('./Archaea/archaea_pos80.txt',k,w,lamb)
    # # write_feature(gc_a, './tmp_Zcurve+PseKNC_pos_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '+1', 'svm')
    # # gc_na = Zcurve_PseKNC('./Archaea/archaea_neg80.txt',k,w,lamb)
    # # write_feature(gc_na, './tmp_Zcurve+PseKNC_neg_svm'+'_k_'+str(k)+'_w_'+str(w)+'_lambda_'+str(lamb)+'.txt', '-1', 'svm')
예제 #14
0
    对文件的每条序列样本计算kmer特征
    参数:
        fileName:样本数据的文件名,需要包含文件路径,要求数据格式为fasta格式
        k: kmer片段的长度
    '''
    if k > 5 or k < 1:
        print('invalide k value!')
        return -1
    feature_list = []
    with open(fileName) as f:
        for line in f.readlines():
            if line[0] == '>':
                continue
            else:
                seq = line.strip().upper()
                featureVector = kmer_feature(seq, k)
                feature_list.append(featureVector)
    return feature_list


if __name__ == '__main__':
    # For each k from 1 to 5, compute the k-mer features of the positive
    # and negative human sets and write each to a 'csv'-format file whose
    # name carries the k value.
    for k in range(1, 6):
        k_tag = '_k_' + str(k) + '_csv.txt'

        gc_a = Kmer('./human_cell_line/data/human_pos.txt', k)
        pos_out = './human_cell_line/data/kmer_me/human_pos_kmer' + k_tag
        write_feature(gc_a, pos_out, '+1', 'csv')

        gc_na = Kmer('./human_cell_line/data/human_neg.txt', k)
        neg_out = './human_cell_line/data/kmer_me/human_neg_kmer' + k_tag
        write_feature(gc_na, neg_out, '-1', 'csv')