def gfcc_yaml2df(gfcc_ymlf):
    """Extract per-coefficient GFCC statistics from an Essentia YAML file.

    Parameters
    ----------
    gfcc_ymlf : str
        Path to a YAML file containing a ['lowlevel']['gfcc'] list of
        13-coefficient frames.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with columns gfcc_mean_0..12 and
        gfcc_var_0..12, each rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_gfcc_gender.csv' and
    'str_gfcc_gender.arff'.
    """
    gfcc_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(gfcc_ymlf) as f:
        gfcc_dict = yaml.safe_load(f)

    frames = gfcc_dict['lowlevel']['gfcc']
    for i in range(13):
        # gather the i-th coefficient across all frames, then compute the
        # statistics once (the original recomputed them on every append)
        gfcc_arr = np.array([frame[i] for frame in frames])
        gfcc_df['gfcc_mean_' + str(i)] = [round(np.mean(gfcc_arr), 4)]
        gfcc_df['gfcc_var_' + str(i)] = [round(np.var(gfcc_arr), 4)]

    # convert DataFrame to csv and arff
    # BUGFIX: the original exported the unrelated global `pitch_df` under
    # pitch filenames; export this function's own gfcc_df instead (the
    # combined script later reads 'str_gfcc_gender.csv').
    gfcc_df.to_csv('str_gfcc_gender.csv')
    pandas2arff.pandas2arff(gfcc_df,
                            filename='str_gfcc_gender.arff',
                            wekaname='gender')

    return gfcc_df
Exemplo n.º 2
0
    def subset(self):
        """Draw self.n bootstrap subsets from the CSV and save each as ARFF.

        Reads self.csvPath, then for each of self.n iterations draws a
        stratified bootstrap sample (fraction self.percentage of each
        self.column group, with replacement) and writes it to the boosting
        subset directory as <csvNameFinal><count>_<tamanho>.arff.
        """
        data_frame = pd.read_csv(self.csvPath, sep=',')

        # accept percentages given as e.g. 30 instead of 0.30
        if self.percentage > 1:
            self.percentage = self.percentage / 100

        for count in range(self.n):
            # stratified bootstrap: sample within each class group
            df_sub = data_frame.groupby(
                self.column, as_index=False).apply(lambda x: x.sample(
                    frac=self.percentage, replace=True)).reset_index(drop=True)

            subset = str(count)
            value = str(self.tamanho)
            caminho = str(self.repeticao)
            # save the subset as .arff
            # (dead, commented-out KNN filtering code removed)
            pandas2arff(df_sub,
                        "C:\\TCC\\TCC\\subsets\\boosting\\" + caminho + "\\" +
                        self.csvNameFinal + subset + "_" + value + '.arff',
                        cleanstringdata=False)
Exemplo n.º 3
0
def gfcc_yaml2df(gfcc_ymlf):
    """Extract per-coefficient GFCC statistics from an Essentia YAML file.

    Parameters
    ----------
    gfcc_ymlf : str
        Path to a YAML file containing a ['lowlevel']['gfcc'] list of
        13-coefficient frames.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with columns gfcc_mean_0..12 and
        gfcc_var_0..12, each rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_gfcc_gender.csv' and
    'str_gfcc_gender.arff'.
    """
    gfcc_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(gfcc_ymlf) as f:
        gfcc_dict = yaml.safe_load(f)

    frames = gfcc_dict['lowlevel']['gfcc']
    for i in range(13):
        # gather the i-th coefficient across all frames, then compute the
        # statistics once (the original recomputed them on every append)
        gfcc_arr = np.array([frame[i] for frame in frames])
        gfcc_df['gfcc_mean_' + str(i)] = [round(np.mean(gfcc_arr), 4)]
        gfcc_df['gfcc_var_' + str(i)] = [round(np.var(gfcc_arr), 4)]

    # convert DataFrame to csv and arff
    # BUGFIX: the original exported the unrelated global `pitch_df` under
    # pitch filenames; export this function's own gfcc_df instead (the
    # combined script later reads 'str_gfcc_gender.csv').
    gfcc_df.to_csv('str_gfcc_gender.csv')
    pandas2arff.pandas2arff(gfcc_df,
                            filename='str_gfcc_gender.arff',
                            wekaname='gender')

    return gfcc_df
Exemplo n.º 4
0
    def subset(self):
        """Draw self.n bootstrap subsets from the CSV; save ARFF + metrics.

        For each of self.n iterations, draws a stratified bootstrap sample
        (fraction self.percentage of each self.column group, with
        replacement), writes it as ARFF into the bagging directory, then
        runs Dcol complexity extraction and metrica on it.
        """
        # accept percentages given as e.g. 30 instead of 0.30
        if self.percentage > 1:
            self.percentage = self.percentage / 100

        # read the CSV
        data_frame = pd.read_csv(self.csvPath, sep=',')

        for count in range(self.n):
            # stratified bootstrap: sample within each class group
            df_sub = data_frame.groupby(
                self.column, as_index=False).apply(lambda x: x.sample(
                    frac=self.percentage, replace=True)).reset_index(drop=True)

            # build the shared directory and file stem once per iteration
            # instead of re-concatenating them for every call below
            caminho = str(self.repeticao)
            base_dir = ("C:\\Users\\Mateus\\Documents\\TCC\\" + caminho +
                        "\\bagging\\")
            stem = self.csvNameFinal + str(count) + "_" + str(self.tamanho)

            # save the subset as .arff
            pandas2arff(df_sub, base_dir + stem + '.arff',
                        cleanstringdata=False)
            # data-complexity measures for the subset
            Dcol.DcolI(base_dir, stem + '.arff', stem)
            # summary metrics written next to the subset
            metrica.metrica(df_sub, base_dir + stem + ".txt")
def pitch_yaml2df(pitch_ymlf, stats=('median',)):
    """Extract pitch statistics from an Essentia YAML file into a DataFrame.

    Parameters
    ----------
    pitch_ymlf : str
        Path to a YAML file containing ['tonal']['pitch'][<stat>] values.
    stats : sequence of str, optional
        Statistic names to extract (default: just 'median').

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with one 'pitch_<stat>' column per stat,
        rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_pitch_gender.csv' and
    'str_pitch_gender.arff'.
    """
    pitch_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(pitch_ymlf) as f:
        pitch_dict = yaml.safe_load(f)

    for stat in stats:
        pitch_df['pitch_' + stat] = [round(pitch_dict['tonal']['pitch'][stat], 4)]

    # convert DataFrame to csv and arff
    pitch_df.to_csv('str_pitch_gender.csv')
    pandas2arff.pandas2arff(pitch_df,
                            filename='str_pitch_gender.arff',
                            wekaname='gender')

    return pitch_df
def write_to_file(data_train_dfs,
                  data_test_df,
                  noises,
                  dataset_name,
                  sampling_method="random_features"):
    # Write the RQP test set and one training set per noise level as ARFF
    # files under data/<dataset_name>/<sampling_method>/.
    # NOTE(review): `file_path` is not defined in this function or its
    # parameters — it must be a module-level global, otherwise this raises
    # NameError; possibly `stripped_dataset_name` was intended. Confirm.
    stripped_dataset_name = dataset_name.replace('.arff', '')
    stripped_dataset_name = stripped_dataset_name.replace('regression/', '')
    pandas2arff(data_test_df,
                "data/" + dataset_name + "/" + sampling_method + "/" +
                stripped_dataset_name + "_RQPtest.arff",
                wekaname=file_path + "_RQP_test_data")

    # one training file per noise level, noise value embedded in the name
    for i in range(len(noises)):
        pandas2arff(data_train_dfs[i],
                    "data/" + dataset_name + "/" + sampling_method + "/" +
                    stripped_dataset_name + "_noise_" + str(noises[i]) +
                    "_RQPtrain.arff",
                    wekaname=file_path + "_RQP_training_data")
def mfcc_yaml2df(mfcc_ymlf):
    """Extract MFCC mean/variance features from an Essentia YAML file.

    Parameters
    ----------
    mfcc_ymlf : str
        Path to a YAML file containing ['lowlevel']['mfcc']['mean'] and
        ['lowlevel']['mfcc']['var'] lists of 13 coefficients.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with columns mfcc_mean_0..12 and
        mfcc_var_0..12, each rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_mfcc_gender.csv' and
    'str_mfcc_gender.arff'.
    """
    mfcc_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(mfcc_ymlf) as f:
        mfcc_dict = yaml.safe_load(f)

    # the two original loops over the same range are merged into one
    for i in range(13):
        mfcc_df['mfcc_mean_' + str(i)] = [round(mfcc_dict['lowlevel']['mfcc']['mean'][i], 4)]
        mfcc_df['mfcc_var_' + str(i)] = [round(mfcc_dict['lowlevel']['mfcc']['var'][i], 4)]

    # convert DataFrame to csv and arff
    # BUGFIX: the original exported the unrelated global `pitch_df` under
    # pitch filenames; export this function's own mfcc_df instead (the
    # combined script later reads 'str_mfcc_gender.csv').
    mfcc_df.to_csv('str_mfcc_gender.csv')
    pandas2arff.pandas2arff(mfcc_df,
                            filename='str_mfcc_gender.arff',
                            wekaname='gender')

    return mfcc_df
Exemplo n.º 8
0
def pitch_yaml2df(pitch_ymlf, stats=('median',)):
    """Extract pitch statistics from an Essentia YAML file into a DataFrame.

    Parameters
    ----------
    pitch_ymlf : str
        Path to a YAML file containing ['tonal']['pitch'][<stat>] values.
    stats : sequence of str, optional
        Statistic names to extract (default: just 'median').

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with one 'pitch_<stat>' column per stat,
        rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_pitch_gender.csv' and
    'str_pitch_gender.arff'.
    """
    pitch_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(pitch_ymlf) as f:
        pitch_dict = yaml.safe_load(f)

    for stat in stats:
        pitch_df['pitch_' + stat] = [round(pitch_dict['tonal']['pitch'][stat], 4)]

    # convert DataFrame to csv and arff
    pitch_df.to_csv('str_pitch_gender.csv')
    pandas2arff.pandas2arff(pitch_df,
                            filename='str_pitch_gender.arff',
                            wekaname='gender')

    return pitch_df
def mfcc_yaml2df(mfcc_ymlf):
    """Extract MFCC mean/variance features from an Essentia YAML file.

    Parameters
    ----------
    mfcc_ymlf : str
        Path to a YAML file containing ['lowlevel']['mfcc']['mean'] and
        ['lowlevel']['mfcc']['var'] lists of 13 coefficients.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with columns mfcc_mean_0..12 and
        mfcc_var_0..12, each rounded to 4 decimals.

    Side effects: writes the DataFrame to 'str_mfcc_gender.csv' and
    'str_mfcc_gender.arff'.
    """
    mfcc_df = pd.DataFrame()

    # context manager guarantees the file handle is closed
    with open(mfcc_ymlf) as f:
        mfcc_dict = yaml.safe_load(f)

    # the two original loops over the same range are merged into one
    for i in range(13):
        mfcc_df['mfcc_mean_' + str(i)] = [round(mfcc_dict['lowlevel']['mfcc']['mean'][i], 4)]
        mfcc_df['mfcc_var_' + str(i)] = [round(mfcc_dict['lowlevel']['mfcc']['var'][i], 4)]

    # convert DataFrame to csv and arff
    # BUGFIX: the original exported the unrelated global `pitch_df` under
    # pitch filenames; export this function's own mfcc_df instead (the
    # combined script later reads 'str_mfcc_gender.csv').
    mfcc_df.to_csv('str_mfcc_gender.csv')
    pandas2arff.pandas2arff(mfcc_df,
                            filename='str_mfcc_gender.arff',
                            wekaname='gender')

    return mfcc_df
        instruments = np.append(instruments,'hihat')
    elif inst in crash:
        instruments = np.append(instruments,'crash')
    elif inst in ride:
        instruments = np.append(instruments,'ride')
    elif inst in rim:
        instruments = np.append(instruments,'rim')
    elif inst in snare:
        instruments = np.append(instruments,'snare')
    elif inst in tom:
        instruments = np.append(instruments,'tom')
    else:
        instruments = np.append(instruments,'kick')

# label each row with the instrument class built by the loop above
df['instrument'] = instruments
instruments = np.unique(instruments)

# size of the smallest class — used to balance the dataset
n_samples = min([len(df[df['instrument'] == inst]) for inst in instruments])

# down-sample every class to n_samples rows so all classes are balanced
new_df = pd.DataFrame()
for inst in instruments:
    new_df = pd.concat([new_df,df[df['instrument'] == inst].sample(n_samples)],ignore_index=True)

# split off the last column (presumably sample names — confirm against the
# loading code) from the feature columns
names = new_df.iloc[:,-1:]
new_df = new_df.iloc[:,:-1]

# export the balanced feature table as ARFF and CSV, names separately
import pandas2arff
pandas2arff.pandas2arff(new_df,filename='alt_experiment_7classes_balanced.arff',wekaname = 'instrument')

new_df.to_csv('alt_experiment_7classes_balanced.csv')
names.to_csv('alt_experiment_names_7classes_balanced.csv')
import pandas as pd
#https://github.com/saurabhnagrecha/Pandas-to-ARFF
import pandas2arff

#read csvs with feature values to DataFrames and concatenate to one combined DataFrame
# (iloc[:, 1:] drops the unnamed index column; mfcc/gfcc additionally drop
# their last column, keeping the class label only from the pitch file)
mfcc_df = pd.read_csv('str_mfcc_gender.csv').iloc[:, 1:-1]
gfcc_df = pd.read_csv('str_gfcc_gender.csv').iloc[:, 1:-1]
pitch_df = pd.read_csv('str_pitch_gender.csv').iloc[:, 1:]
df = pd.concat([mfcc_df, gfcc_df, pitch_df], axis=1)

#convert DataFrame with all feature values to csv and arff
df.to_csv('str_mfcc_and_gfcc_and_pitch_gender.csv')
pandas2arff.pandas2arff(df,
                        filename='str_mfcc_and_gfcc_and_pitch_gender.arff',
                        wekaname='gender')
Exemplo n.º 12
0
                   final,
                   left_on='level_0',
                   right_on='movieId')
final_2.head()
# drop the helper columns produced by reset_index() before the merge above
final_2 = final_2.drop(['level_0', 'level_1'], axis=1)

final_2.head()

# full dataset with metadata, plus a metadata-free copy for Hadoop
final_2.to_csv('data/final_data.csv', index=False, encoding='utf-8')
final_2.drop(['genres', 'title', 'movieId'],
             axis=1).to_csv('data/final_hadoop_data.csv',
                            index=False,
                            encoding='utf-8')

# one CSV per genre, with tags one-hot encoded
genres = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]
for i in genres:
    tmp = final_2[final_2.genres.str.contains(i)].drop(
        ['genres', 'title', 'movieId'], axis=1)
    tmp = pd.get_dummies(tmp, columns=['tag'])
    tmp.to_csv('genres/final_' + i + ".csv", index=False, encoding='utf-8')

import pandas2arff
# NOTE(review): bare `reload` is a builtin only in Python 2; Python 3 needs
# importlib.reload — confirm the target interpreter.
reload(pandas2arff)
t = pd.get_dummies(final_2.drop(['genres', 'title', 'movieId'], axis=1),
                   columns=['tag'])
pandas2arff.pandas2arff(t, "foo.arff")
Exemplo n.º 13
0
# quick look at the parkinsons dataset loaded above
print(park.describe())
#print(park.status.unique())
#print(park.class.unique())
#print(park.spread1.value_counts())

# Drop a column
#park.drop("PPE", axis=1, inplace=True)

# set the dtype
#park.DFA = park.DFA.astype(float)

# make column names valid identifiers ('-' -> '_') for downstream tools
park = park.rename(columns=lambda x: x.replace('-', '_'))

# persist the cleaned dataset in HDF5 (blosc-compressed), CSV and ARFF
park.to_hdf('datasets.hdf', 'parkinsons', complib='blosc', complevel=9)
park.to_csv('datasets-parkinsons.csv')
pandas2arff(park, "datasets-parkinsons.arff")

# parkinsons telemonitoring dataset

updrs = pd.read_csv('./parkinsons_updrs.data', header=None)
"""
From: https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names

subject# - Integer that uniquely identifies each subject
age - Subject age
sex - Subject gender '0' - male, '1' - female
test_time - Time since recruitment into the trial. The integer part is the 
number of days since recruitment.
motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated
total_UPDRS - Clinician's total UPDRS score, linearly interpolated
Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of 
            feat_df = pd.concat([feat_df,df],ignore_index=True)


# persist the accumulated feature table, then reload it from its canonical
# location (NOTE(review): the reload path differs from the path just
# written to — confirm both point at the same file)
feat_df.to_csv('alt_test_all.csv')
feat_df = pd.read_csv('../../../alt_test_set_text_files/json_and_csv/alt_test_all.csv')

# feature subset selected for the clustering experiment
feats = ['spectral_contrast_mean_4','spectral_energyband_middle_low_mean','spectral_contrast_mean_5',
 'spectral_energyband_high_mean','spectral_contrast_var_0']

new_feat_df = pd.DataFrame()

# copy only the selected feature columns
for feat in feats:
    new_feat_df[feat] = feat_df[feat]


new_feat_df

feat_df.to_csv('alt_test_clustering1.csv')

import pandas2arff

# export both the full and the reduced feature tables as ARFF
pandas2arff.pandas2arff(feat_df,filename='alt_test_all.arff')
pandas2arff.pandas2arff(new_feat_df,filename='alt_test_clustering1.arff')

# order sample names by their cluster assignment
# NOTE(review): model, X, names and sorted_names come from notebook cells
# not visible here — confirm they are defined before this point.
fit_predict = model.fit_predict(X)

for i in np.argsort(fit_predict):
    sorted_names = np.append(sorted_names,names[i])

# IPython help lookup on the Birch clusterer (notebook artifact)
get_ipython().magic(u'pinfo Birch')
Exemplo n.º 15
0
    words['destressed_pn_list'] = words.pronunciation.apply(filter_stress)
    words['primary_stress_map'] = words.pn_list.apply(stress_map)
    words['secondary_stress_map'] = words.pn_list.apply(stress_map,stress='2')
    words['vowel_map'] = words.destressed_pn_list.apply(phoneme_map,args=(vowels,))
    words['consonant_map'] = words.destressed_pn_list.apply(phoneme_map, args=(consonants,))
    words['vector_map'] = words.destressed_pn_list.apply(iterable_map, args=(vector_map,))
    words['vowel_count'] = words.vowel_map.apply(np.sum)
    words['consonant_count'] = words.consonant_map.apply(np.sum)
    words['type_tag'] = words.word.apply(get_pos_tag)
    words['1st_letter_idx'] = words.word.apply(get_first_letter_idx)
    words['phoneme_length'] = words.pn_list.str.len()
    words['prefix'] = words.word.apply(check_prefix)
    words['suffix'] = words.word.apply(check_suffix)
    #words['prefix_suffix_vector'] = words.
    words['primary_stress_idx'] = words.primary_stress_map.apply(get_stress_position)
    words['stressed_vowel'] = words.pn_list.apply(get_stressed_vowel)

    # Unpack vector map into single columns
    unpacked_vector_map = pd.DataFrame.from_records(words.vector_map.tolist(),columns=vector_map)
    words = pd.concat([words, unpacked_vector_map],axis=1)

    return words

if __name__ == '__main__':
    # build the word-feature table and export selected columns as ARFF
    data_loc = 'asset/training_data.txt'
    words = get_words(data_loc)
    words.head()

    columns = ['word','type_tag','1st_letter_index','phoneme_length','suffix','prefix','primary_stress_idx']
    # BUGFIX: the original indexed undefined `words_df`; the DataFrame
    # built above is `words`.
    # NOTE(review): the feature-engineering code elsewhere creates a
    # '1st_letter_idx' column, not '1st_letter_index' — confirm the column
    # name in `columns` before running.
    pd2a.pandas2arff(words[columns],'weka/word_typetag.arff','word_typetag')
import pandas as pd,numpy as np,os,yaml
#https://github.com/saurabhnagrecha/Pandas-to-ARFF
import pandas2arff

#path to csv file containing sounds' extracted features
pardir = '../experiment_text_files/alt_test_set_text_files/json_and_csv/with_unspecified'

#csv file containing sounds' extracted features
filename = 'alt_test_with_unspecified.csv'

df = pd.read_csv(os.path.join(pardir,filename))
feat_df = pd.DataFrame()

#list of features we intend to keep 
feats = ['spectral_centroid_mean','spectral_entropy_mean','gfcc_mean_2','logattacktime_mean','spectral_skewness_mean',
         'spectral_energyband_middle_low_mean','scvalleys_var_2','effective_duration_mean',
         'pitch_instantaneous_confidence_mean']

# copy only the selected feature columns into the reduced table
for feat in feats:
    feat_df[feat] = df[feat]

# export the 9-feature table as CSV and ARFF
exported_csv_fname = 'alt_test_with_unspecified_9FEATS_nosil'
feat_df.to_csv(exported_csv_fname + '.csv')

exported_arff_fname = 'alt_test_with_unspecified_9FEATS_nosil'
pandas2arff.pandas2arff(feat_df,filename=exported_arff_fname + '.arff',wekaname = 'instrument')
Exemplo n.º 17
0
# explode the per-movie tag collections into one row per (movie, tag)
final['tag'] = final.tag.map(list)
final.head()

s = final['tag'].apply(pd.Series,1).stack()
s.index = s.index.droplevel(-1)  # drop the inner stack level to realign with `final`
s.name = 'tag'
s.head()
del final['tag']
# merge the exploded tags back onto the movie rows
final_2= pd.merge(s.reset_index(), final, left_on='level_0', right_on='movieId')
final_2.head()
# drop the helper columns produced by reset_index()
final_2 = final_2.drop(['level_0', 'level_1'], axis=1)

final_2.head()

# full dataset with metadata, plus a metadata-free copy for Hadoop
final_2.to_csv('data/final_data.csv', index=False,encoding='utf-8')
final_2.drop(['genres','title','movieId'], axis=1).to_csv('data/final_hadoop_data.csv', index=False,encoding='utf-8')



# one CSV per genre, with tags one-hot encoded
genres= ['Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
for i in genres:
    tmp = final_2[final_2.genres.str.contains(i)].drop(['genres','title','movieId'], axis=1)
    tmp = pd.get_dummies(tmp, columns=['tag'])
    tmp.to_csv('genres/final_'+i+".csv", index=False,encoding='utf-8')


import pandas2arff
# NOTE(review): bare `reload` is a builtin only in Python 2; Python 3 needs
# importlib.reload — confirm the target interpreter.
reload(pandas2arff)
t=pd.get_dummies(final_2.drop(['genres','title','movieId'], axis=1), columns=['tag'])
pandas2arff.pandas2arff(t,"foo.arff")
# Build a one-vs-all ("kick" vs everything else) dataset from the 7-class
# feature table and export it as CSV and ARFF.
pardir = '../experiment_text_files/big_train_set_files/json_and_csv/7_classes'
filename = 'big_experiment_7classes_no_silence.csv'

df = pd.read_csv(os.path.join(pardir,filename))
df = df.iloc[:,1:]  # drop the unnamed index column

instrument = 'kick'

inst = df[df['instrument'] == instrument]
not_inst_init = df[df['instrument'] != instrument]

# the class labels other than the target instrument
instruments = np.unique(df[df['instrument']!= instrument]['instrument'])

not_inst = pd.DataFrame()
for i in instruments:
    #sample from each instrument as many samples as needed in order to accumulate a "not instrument" dataframe
    #as big as the "instrument" dataframe
    # BUGFIX: divide by the actual number of other classes instead of the
    # hard-coded 6 (identical result for this 7-class file, but correct
    # for any class count)
    tmp = df[df['instrument'] == i].sample(int(round(inst.shape[0]/len(instruments))))
    # NOTE(review): np.chararray is deprecated; it holds fixed-width byte
    # strings here — kept as-is to preserve the exact exported values.
    tmp_inst = np.chararray(tmp.shape[0],len('not '+instrument))
    tmp_inst[:] = 'not '+instrument
    tmp['instrument'] = tmp_inst
    not_inst = pd.concat([not_inst,tmp])

# combine target and relabelled "not <instrument>" rows
# (removed a dead `df = pd.DataFrame()` that was immediately overwritten)
df = pd.concat([inst,not_inst])

df.to_csv('../experiment_text_files/big_train_set_files/json_and_csv/1classVSall/{instrument}_vs_all_nosil.csv'.format(instrument = instrument))
pandas2arff.pandas2arff(df,filename='../experiment_text_files/big_train_set_files/arff/1classVSall/{instrument}_vs_all_nosil.arff'.format(instrument = instrument),wekaname = 'instrument')

df = pd.read_csv('../experiment_text_files/big_train_set_files/json_and_csv/1classVSall/{instrument}_vs_all_nosil.csv'.format(instrument = instrument))
import pandas as pd
#https://github.com/saurabhnagrecha/Pandas-to-ARFF
import pandas2arff

#read csvs with feature values to DataFrames and concatenate to one combined DataFrame
# (iloc[:,1:] drops the unnamed index column; mfcc/gfcc additionally drop
# their last column, keeping the class label only from the pitch file)
mfcc_df = pd.read_csv('str_mfcc_gender.csv').iloc[:,1:-1]
gfcc_df = pd.read_csv('str_gfcc_gender.csv').iloc[:,1:-1]
pitch_df = pd.read_csv('str_pitch_gender.csv').iloc[:,1:]
df = pd.concat([mfcc_df,gfcc_df,pitch_df], axis=1)

#convert DataFrame with all feature values to csv and arff 
df.to_csv('str_mfcc_and_gfcc_and_pitch_gender.csv')
pandas2arff.pandas2arff(df,filename='str_mfcc_and_gfcc_and_pitch_gender.arff',wekaname = 'gender')