def precision_100percent(train, test):
    """Report, for each tag, the class weight giving the best precision.

    For every tag present in the test groundtruths, each classifier is
    re-fitted for class weights going from 0.0 to 1.0 (excluded) by steps of
    0.01. The weight giving the highest precision for the tag is displayed
    with the matching recall and F-measure.

    ..todo::
        1 Find best clf with default param
        2 vary param of best clf and find best param
        3 use best param and best clf to find recall for 100 percent precision

    :param train: path of the train file, parsed with ``read_file``
    :param test: path of the test file, parsed with ``read_file``
    """
    utils.print_success("Find Recall for best Precision for each tag")
    train = utils.abs_path_file(train)
    test = utils.abs_path_file(test)
    train_features, train_groundtruths = read_file(train)
    test_features, test_groundtruths = read_file(test)
    classifiers = {
        "DecisionTree": DecisionTreeClassifier()
    }
    # The per-class scores returned with average=None follow the order of
    # ``labels``. Passing the sorted tags explicitly guarantees that
    # ``scores[index]`` really is the score of ``tags[index]``; the order of
    # ``list(set(...))`` is arbitrary and must not be used as an index.
    tags = sorted(set(test_groundtruths))
    step = 0.01
    for index, tag in enumerate(tags):
        utils.print_success("Tag " + tag)
        max_precision = 0
        max_recall = 0
        max_f_measure = 0
        max_clf = ""
        max_weight = 0
        for key in classifiers:
            clf = classifiers[key]
            for weight in np.arange(0.0, 1.0, step):
                print("Classifier " + key + " & Weight " + str(weight))
                # Move the cursor one line up and erase it so that the
                # progress message is overwritten at the next iteration
                sys.stdout.write("\033[F")
                sys.stdout.write("\033[K")
                # The weights are hard-coded for the two tags "i" and "s"
                clf.set_params(class_weight={"i": weight, "s": 1 - weight})
                clf.fit(train_features, train_groundtruths)
                predictions = clf.predict(test_features)
                precision = precision_score(
                    test_groundtruths, predictions,
                    labels=tags, average=None)[index]
                # ">=" keeps the last (highest) weight among equal precisions
                if precision >= max_precision:
                    max_precision = precision
                    max_recall = recall_score(
                        test_groundtruths, predictions,
                        labels=tags, average=None)[index]
                    max_f_measure = f1_score(
                        test_groundtruths, predictions,
                        labels=tags, average=None)[index]
                    max_weight = weight
                    max_clf = key
        sys.stdout.write("\033[K")
        utils.print_info("\tClassifier " + str(max_clf))
        utils.print_info("\tPrecision " + str(max_precision))
        utils.print_info("\tRecall " + str(max_recall))
        utils.print_info("\tF-Measure " + str(max_f_measure))
        utils.print_info("\tWeight " + str(max_weight))
def get_gts():
    """Read the gender groundtruths from ``../data/filelist.csv``.

    The first line of the file is skipped. Only the lines containing
    ``,male,`` or ``,female,`` are kept.

    :return: dictionary mapping the first column of each kept line to the
        first character of its fourth column
    """
    # Use the path returned by abs_path_file; the original code discarded it
    # and opened the relative path instead.
    gt_filen = utils.abs_path_file("../data/filelist.csv")
    gts = {}
    with open(gt_filen, "r") as filep:
        next(filep)
        for line in filep:
            if ",male," in line or ",female," in line:
                row = line.split(",")
                gts[row[0]] = row[3][0]
    return gts
def read_file(filename):
    """Read a feature file with lines like: filename,feat1,...,featn,tag

    The first line of the file is skipped. A tag containing "s" is stored as
    True and a tag containing "i" (and no "s") as False. Lines whose tag
    contains neither letter add no groundtruth entry, as before.

    :param filename: path of the file to read
    :return: tuple (filenames, features, groundtruths) where filenames is a
        list of str, features a 2-D float numpy array and groundtruths a
        boolean numpy array
    """
    filename = utils.abs_path_file(filename)
    filenames = []
    rows = []
    labels = []
    with open(filename, "r") as filep:
        next(filep, None)
        for row in filep:
            splitted_row = row.split(",")
            filenames.append(splitted_row[0])
            rows.append(splitted_row[1:-1])
            tag = splitted_row[-1][:-1]
            if "s" in tag:
                labels.append(True)
            elif "i" in tag:
                labels.append(False)
    # Build the arrays once instead of calling np.append for every line
    # (quadratic), and use the builtin float since the np.float alias no
    # longer exists in recent NumPy releases.
    features = np.array(rows).astype(float)
    groundtruths = np.array(labels, dtype=bool)
    return filenames, features, groundtruths
def new_algo_final(indir, file_gts_track): utils.print_success("Approx. time ~6 hours.") # Preprocess arg indir = utils.abs_path_dir(indir) file_gts_track = utils.abs_path_file(file_gts_track) dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "bayle") feat_frame_train = utils.create_dir(dir_tmp + "feat_frame_train") feat_frame_test = utils.create_dir(dir_tmp + "feat_frame_test") outdir_global = utils.create_dir(dir_tmp + "feat_track") feat_train = outdir_global + "train.csv" feat_test = outdir_global + "test.csv" models_dir = utils.create_dir(dir_tmp + "models") loc_feat_testset_dirpath = "features/database2/" filelist_train = "groundtruths/database1.csv" filelist_test = "groundtruths/database2.csv" models_global = utils.create_dir(dir_tmp + "models_track") process_local_feat(indir, file_gts_track, outdir_local=feat_frame_train, out_feat_global=feat_train, train=False) classify.create_models(outdir=models_dir, train_dir=feat_frame_train, separator=",", classifiers="RandomForest") """ Create features at track scale for the train set Features: MFCC + Delta + Double Delta + ngrams + hist """ model_file = "src/tmp/bayle/models/RandomForest/RandomForest.pkl" model_file = "/media/sf_DATA/ReproducibleResearchIEEE2017/src/tmp/bayle/models/RandomForest/RandomForest.pkl" create_track_feat_testset(indir, filelist_train, feat_train, model_file, train=True) # # 15h28m44s to 19h08m28s Done in 13184117ms create_track_feat_testset(loc_feat_testset_dirpath, filelist_test, feat_test, model_file) classify.create_models(outdir=models_global, train_file=feat_train, classifiers="RandomForest") process_results(feat_train, feat_test)
def read_item_tag(filename):
    """Read a tab separated file into a dictionary.

    For each line, the key is the first column without its two leading
    characters and the value is the second column, kept as is.

    :param filename: path of the file to read
    :return: dictionary built from the two first columns
    """
    path = utils.abs_path_file(filename)
    with open(path, "r") as handle:
        rows = (raw.split("\t") for raw in handle)
        return {fields[0][2:]: fields[1] for fields in rows}
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the classifiers on the train set and predict the test set.

    The sets come either from ``data`` (dictionary with the keys
    train_features, train_groundtruths, test_features and test_groundtruths)
    or, when ``data`` is None, from the ``train`` and ``test`` files.

    ``disp`` and ``outfilename`` are not used by this function.

    :return: the predictions of the last classifier for the test features
    """
    utils.print_success("Comparison of differents classifiers")
    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
    }
    for name, model in classifiers.items():
        utils.print_success(name)
        utils.print_info("\tFit")
        model.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = model.predict(test_features)
    return predictions
def process_results(in_fn, out_fn):
    """Convert a tab separated result file into a comma separated one.

    Only every other line of the input is used, starting with the second
    one. For each of them, the part of the first column located before the
    first "_" and the third column are written as ``id,value``.

    :param in_fn: path of the tab separated input file
    :param out_fn: path of the output file, overwritten if it exists
    """
    in_fn = utils.abs_path_file(in_fn)
    # Both files are handled by the with statement so that the output is
    # closed (and flushed) even if a line cannot be parsed
    with open(out_fn, "w") as out_fp, open(in_fn, "r") as filep:
        for index, line in enumerate(filep):
            if index % 2:
                row = line[:-1].split("\t")
                out_fp.write(row[0].split("_")[0] + "," + row[2] + "\n")
def figures1bd(indir, file_gts_track): """Description of figures1bd infile is formated like: /media/sf_github/yann/train/01 - 01 Les Jardins Japonais.wav.mfcc.csv feat1 feat2 ... featn tag1 feat1 feat2 ... featn tag2 ... feat1 feat2 ... featn tag2 0 Input the local extracted features from YAAFE 13 MFCC per frame 186 musical pieces as train set 1 Computes delta and double delta (39 features per frame) 2 Gather global mean (39 features per musical pieces) 3 train on mfcc & deltas (39 feat/frame) to output global predictions 4 Use global preds to compute song and instru n-grams and histogramm which add 70 feat/track lead to a total of 109 feat/track 5 Fit on 109x186 6 predict (or predict_proba) on 41491 track """ # Preprocess arg indir = utils.abs_path_dir(indir) file_gts_track = utils.abs_path_file(file_gts_track) feat_frame_train = "feat_frame_train/" utils.create_dir(feat_frame_train) feat_frame_test = "feat_frame_test/" utils.create_dir(feat_frame_test) outdir_global = "feat_track/" utils.create_dir(outdir_global) feat_train = outdir_global + "train.csv" feat_test = outdir_global + "test.csv" models_dir = "models/" utils.create_dir(models_dir) loc_feat_testset_dirpath = "/media/sf_DATA/Datasets/Simbals/yaafe/results/processed/" filelist_test = "filelist_test.tsv" filelist_train = "filelist_train.tsv" models_global = "models_track/" utils.create_dir(models_global) # process_local_feat(indir, file_gts_track, feat_frame_train, feat_train, train=True) # classify.create_models(outdir=models_dir, train_dir=feat_frame_train, separator=",") # create_track_feat_testset(indir, filelist_train, feat_train, train=True) # 15h28m44s to 19h08m28s Done in 13184117ms # create_track_feat_testset(loc_feat_testset_dirpath, filelist_test, feat_test) # classify.create_models(outdir=models_global, train_file=feat_train) # classify.test_models_parallel( # models_dir=models_global, # out_dir="results/", # test_file=feat_test) # Display results reproduce.plot_results("results/")
def extract_features(files="available.txt"):
    """Extract the features of each song and of its singing voice track.

    Each non-empty line of ``files`` is a path given to ``utils.yaafe``. The
    same path with "nbv-ld" replaced by "sv" is then processed as well.

    :param files: path of the file listing one audio file per line
    """
    utils.print_success("Extracting features")
    files = utils.abs_path_file(files)
    dir_feat = "/media/sf_DATA/ISMIR2017/features/gender/"
    with open(files, "r") as filep:
        for line in filep:
            # Only strip the end of line: line[:-1] cut the last character
            # of the path when the file did not end with a newline
            song_fn = line.rstrip("\r\n")
            if not song_fn:
                # Blank line, nothing to extract
                continue
            utils.yaafe(song_fn, dir_feat + "song/", verbose=True)
            utils.yaafe(song_fn.replace("nbv-ld", "sv"), dir_feat + "sv/", verbose=True)
def read_train_file(filename):
    """Read ONE train file with comma separated lines.

    For each line, the first column is ignored, the last column without its
    final character is the groundtruth and the columns in between are the
    features, kept as strings.

    :param filename: path of the train file
    :return: tuple (features, groundtruths)
    """
    path = utils.abs_path_file(filename)
    features, groundtruths = [], []
    with open(path, "r") as handle:
        for raw in handle:
            fields = raw.split(",")
            tag = fields[-1][:-1]
            groundtruths.append(tag)
            features.append(fields[1:-1])
    return features, groundtruths
def merge_gt_feat(gt_filen, feat_filen, train_filen):
    """
    @brief      Read the files containing ground truths and features and
                merge them to be used for classification

    @param      gt_filen     The ground truths filename
    @param      feat_filen   The features filename
    @param      train_filen  The output filename, overwritten if it exists
    """
    utils.print_success("Adding groundtruth")
    feat_filen = utils.abs_path_file(feat_filen)
    gt_filen = utils.abs_path_file(gt_filen)
    gts = read_gts(gt_filen)
    # The output is opened in the with statement so that it is closed even
    # when an error occurs while reading the features
    with open(train_filen, "w") as output, open(feat_filen, "r") as feat:
        cur_id = ""
        for line in feat:
            if "filename" in line:
                # The id is the first run of 2 to 10 digits of the line
                m = re.search(r"\d{2,10}", line)
                # Without id, forget the previous one so that the following
                # feature lines are not attributed to the wrong file
                cur_id = m.group() if m else ""
            elif len(cur_id) > 1 and "srate" not in line and cur_id in gts:
                # The four last characters of the feature line are replaced
                # by the ground truth
                output.write(
                    str(cur_id) + "," + line[:-4] + gts[cur_id] + "\n")
def read_item_tag(filename):
    """Read a groundtruth file into a dictionary.

    example line:
    filename,tag

    The last character of the second column is dropped.

    :param filename: path of the groundtruth file
    :return: dictionary mapping the first column to the tag
    """
    path = utils.abs_path_file(filename)
    with open(path, "r") as handle:
        rows = (raw.split(",") for raw in handle)
        return {fields[0]: fields[1][:-1] for fields in rows}
def read_file(filename):
    """Read a train/test file.

    example line:
    filename,feat1,feat2,...,featn,tag

    The first column is ignored, the features are converted to float and
    the last character of the tag column is dropped.

    :param filename: path of the train/test file
    :return: tuple (features, groundtruths)
    """
    path = utils.abs_path_file(filename)
    features, groundtruths = [], []
    with open(path, "r") as handle:
        for raw in handle:
            fields = raw.split(",")
            groundtruths.append(fields[-1][:-1])
            features.append(list(map(float, fields[1:-1])))
    return features, groundtruths
def read_preds(filename):
    """Read a prediction file into a dictionary.

    ex file:
    ISRC,tag

    :param filename: path of the prediction file
    :return: dictionary mapping the first column to the second column
        converted to float
    """
    path = utils.abs_path_file(filename)
    with open(path, "r") as handle:
        rows = (raw.split(",") for raw in handle)
        return {fields[0]: float(fields[1]) for fields in rows}
def read_preds(filename):
    """Read a tab separated prediction file into a dictionary of tags.

    Only every other line is used, starting with the second one. The key is
    the last part of the first column after the last "/". The tag is "s"
    when the last column is above 0.5 and "i" otherwise.

    :param filename: path of the prediction file
    :return: dictionary mapping the names to "s" or "i"
    """
    # Open the path returned by abs_path_file: it used to be stored in an
    # unused variable while the raw argument was opened
    filename = utils.abs_path_file(filename)
    predictions = {}
    with open(filename, "r") as filep:
        for index, line in enumerate(filep):
            if index % 2:
                row = line.split("\t")
                name = row[0].split("/")[-1]
                pred = float(row[-1])
                predictions[name] = "s" if pred > 0.5 else "i"
    return predictions
def read_gts(filename):
    """Read the groundtruths of a file made of three-line records.

    The 74 first lines are skipped. Then, in each group of three lines, the
    first one gives the name (text after the last "/", without the final
    character), the second one is ignored and the third one gives the
    groundtruth (text after the last ",", without the final character).

    :param filename: path of the file to read
    :return: dictionary mapping the names to the groundtruths
    """
    path = utils.abs_path_file(filename)
    nb_skipped = 74
    groundtruths = {}
    with open(path, "r") as handle:
        for index, raw in enumerate(handle):
            if index < nb_skipped:
                continue
            # Position of the line inside its three-line record
            position = (index - nb_skipped) % 3
            if position == 0:
                name = raw.split("/")[-1][:-1]
            elif position == 2:
                groundtruths[name] = raw.split(",")[-1][:-1]
    return groundtruths
def read_gts(gt_filen):
    """
    @brief      Load the ground truths file as a Python dictionary

    @param      gt_filen  The ground truths filename

    @return     A dictionary whose keys are the first column of the lines
                containing "male," (which also matches "female,") and whose
                values are their fourth column
    """
    path = utils.abs_path_file(gt_filen)
    tags_by_id = {}
    with open(path, "r") as handle:
        for raw in handle:
            if "male," not in raw:
                continue
            fields = raw.split(",")
            tags_by_id[fields[0]] = fields[3]
    return tags_by_id
def read_data_1(stats_dir, filen):
    """Read a statistics file and display mean and deviation of each line.

    The file contains lines like this:
    GA,0.578947368421,0.631578947368,0.710526315789,0.722222222222
    SVMBFF,0.631578947368,0.684210526316,0.815789473684,0.66666666
    VQMM,0.736842105263,0.842105263158,0.842105263158,0.75,0.61111

    :param stats_dir: directory containing the statistics file
    :param filen: name of the file inside ``stats_dir``; the part between
        the first "_" and the next "_" or "." is used as label in the
        displayed text
    :return: tuple (data, names) like ``read_data_2``: one list of values
        per line and the line name repeated once per value. Nothing was
        returned before, callers ignoring the result are not affected.
    """
    stats_dir = utils.abs_path_dir(stats_dir)
    # Take the label from the bare file name, before it becomes a full path
    label = filen.split(".")[0].split("_")[1].title()
    # Resolve the file inside stats_dir as read_data_2 does. The file name
    # used to be resolved alone and then appended to the directory.
    filen = utils.abs_path_file(stats_dir + filen)
    data = []
    names = []
    with open(filen, "r") as filep:
        for line in filep:
            row = line[:-1].split(",")
            tmp = [float(value) for value in row[1:]]
            names.extend([row[0]] * len(tmp))
            data.append(tmp)
            print(label + " for " + row[0] + " \t= " +
                  "{0:.3f}".format(sum(tmp) / len(tmp)) + " ± " +
                  "{0:.3f}".format(stdev(tmp)))
    return data, names
def read_file_bayle(filename):
    """Read a train/test file and keep the file names.

    example line:
    filename,feat1,feat2,...,featn,tag

    The features are converted to float. The tag is shortened from its end
    until it does not contain any newline or carriage return.

    :param filename: path of the train/test file
    :return: tuple (filenames, features, groundtruths)
    """
    path = utils.abs_path_file(filename)
    filenames, features, groundtruths = [], [], []
    with open(path, "r") as handle:
        for raw in handle:
            fields = raw.split(",")
            filenames.append(fields[0])
            features.append(list(map(float, fields[1:-1])))
            tag = fields[-1]
            while "\n" in tag or "\r" in tag:
                tag = tag[:-1]
            groundtruths.append(tag)
    return filenames, features, groundtruths
def remove_silence(filen, verbose=False):
    """Remove the frames where the singing voice is silent, then save the
    features of the remaining samples.

    Each line of ``filen`` is the path of a song. The path of the matching
    singing voice track is obtained by replacing "nbv-ld" by "sv". The rows
    of the voice track having at least one non zero value are selected in
    both the voice and the song, and only the first channel is kept.

    A file which cannot be read is reported and skipped.

    :param filen: path of the file listing one song per line
    :param verbose: currently not used
    """
    utils.print_success("Removing silence")
    filen = utils.abs_path_file(filen)
    dir_feat = "/media/sf_DATA/ISMIR2017/features/gender/"
    with open(filen, "r") as filep:
        for line in filep:
            print(line)
            # Step 1 Gather samples
            song_fn = line[:-1]
            voice_fn = song_fn.replace("nbv-ld", "sv")
            # After a failed read, go to the next file: carrying on would
            # use undefined samples, or the ones of the previous file
            try:
                voice_samples, _ = sf.read(voice_fn)
            except Exception:
                utils.print_error(
                    "ERROR in identify_singing_voice_gender line 207 in sf.read(voice_fn)"
                )
                continue
            try:
                song_samples, _ = sf.read(song_fn)
            except Exception:
                utils.print_error(
                    "ERROR in identify_singing_voice_gender line 207 in sf.read(song_fn)"
                )
                continue
            # index of rows with at least one non zero value
            idxs = np.any(voice_samples != 0., axis=1)
            # selection of the wanted rows
            # assumes voice and song have the same number of rows, since the
            # voice mask is applied to the song - TODO confirm
            voice_samples_non_zero = voice_samples[idxs, 0]
            voice_samples_non_zero = voice_samples_non_zero.reshape(
                1, len(voice_samples_non_zero))
            song_samples_non_zero = song_samples[idxs, 0]
            song_samples_non_zero = song_samples_non_zero.reshape(
                1, len(song_samples_non_zero))
            # Step 2 Extract features
            save_feat(
                voice_samples_non_zero,
                dir_feat + "sv_nonzero/" + voice_fn.split(os.sep)[-1] + ".mfcc")
            save_feat(
                song_samples_non_zero,
                dir_feat + "song_nonzero/" + song_fn.split(os.sep)[-1] + ".mfcc")
def read_test_file(filename):
    """
    Read ONE test file with content like:

        feat1 feat2 ... featN
        feat1 feat2 ... featN
        ...
        feat1 feat2 ... featN

    The last character of each line is dropped before the conversion of the
    space separated values to float.

    :param filename: path of the test file
    :return: list containing one list of floats per line
    """
    path = utils.abs_path_file(filename)
    features = []
    with open(path, "r") as handle:
        for raw in handle:
            tokens = raw.split(" ")
            tokens[-1] = tokens[-1][:-1]
            features.append([float(token) for token in tokens])
    return features
def create_filelist(kara1k, dir_audio):
    """
    @brief      Creates the list of the audio files to be analyzed.

    The first line of the kara1k file is skipped. A line is kept when its
    columns 6, 7 and 8 all contain "1" and when it contains "male" (which
    also matches "female").

    @param      kara1k     The comma separated kara1k filename
    @param      dir_audio  The directory put in front of each item

    @return     A list of dir_audio + column 1 + "_" + column 2 + "_" +
                column 0 for each kept line
    """
    utils.print_success("Creating file list to be analyzed")
    kara1k = utils.abs_path_file(kara1k)
    dir_audio = utils.abs_path_dir(dir_audio)
    selected = []
    with open(kara1k, "r") as handle:
        next(handle)
        for raw in handle:
            fields = raw.split(",")
            # "male" in the line keeps both the female and the male tracks
            flagged = all("1" in fields[column] for column in (6, 7, 8))
            if not (flagged and "male" in raw):
                continue
            selected.append(dir_audio + fields[1] + "_" + fields[2] + "_" +
                            fields[0])
    return selected
def read_data_2(stats_dir, filen):
    """Read a ";" separated statistics file with lines like: name;value

    The first line of the file is skipped. The values are gathered in lists:
    a new list is started each time a name not seen before is met.

    :param stats_dir: directory containing the statistics file
    :param filen: name of the file inside ``stats_dir``
    :return: tuple (data, names) with the lists of values and the name of
        every line read
    """
    stats_dir = utils.abs_path_dir(stats_dir)
    filen = utils.abs_path_file(stats_dir + filen)
    groups = []
    names = []
    current = []
    with open(filen, "r") as handle:
        next(handle)
        for raw in handle:
            row = raw[:-1].split(";")
            # A name never seen before closes the group being filled
            if row[0] not in names and current:
                groups.append(current)
                current = []
            names.append(row[0])
            current.append(float(row[1]))
    groups.append(current)
    return groups, names
def add_groundtruth(feature_fn, groundtruth_fn, output_fn):
    """Description of add_groundtruth

    Write in output filename the groundtruth merged with corresponding
    features.

    The 74 first lines of the feature file are handled as a header: line 2
    and line 71 are replaced by fixed ``@relation`` and ``@attribute``
    lines, the others are copied. After the header, the lines starting with
    "%" are buffered and written in front of the next feature line whose
    tag is found in the groundtruths. The feature lines without groundtruth
    are dropped along with the buffered lines.

    ..todo:: Error with old_tag not corresponding to filename...

    :param feature_fn: path of the feature file
    :param groundtruth_fn: path of the csv file with lines like: id,tag
    :param output_fn: path of the output file, overwritten if it exists
    """
    utils.print_success("Adding groundtruth")
    feature_fn = utils.abs_path_file(feature_fn)
    groundtruth_fn = utils.abs_path_file(groundtruth_fn)
    if os.path.isfile(output_fn):
        utils.print_warning("Overwritting existing output file: " +
                            utils.abs_path_file(output_fn))
    # Read groundtruth file in memory; the with statement closes the file,
    # which the bare csv.reader(open(...)) never did
    groundtruths = {}
    with open(groundtruth_fn, "r") as gt_file:
        for row in csv.reader(gt_file):
            groundtruths[row[0]] = row[1]
    tags = []
    with open(output_fn, "w") as output, open(feature_fn, "r") as feat:
        tmp_line = ""
        for line_num, line in enumerate(feat, start=1):
            if line_num > 74:
                if line[0] != "%":
                    # Alter feature line with correct tag
                    cur_line = line.split(",")
                    old_tag = cur_line[-1].split("_")[0]
                    if old_tag in groundtruths:
                        new_tag = groundtruths[old_tag]
                        output.write(tmp_line + ",".join(cur_line[:-1]) +
                                     "," + new_tag + "\n")
                        tags.append(new_tag)
                    # Written or not (file not in groundtruth), the
                    # buffered lines belong to this feature line only
                    tmp_line = ""
                else:
                    tmp_line += line
            elif line_num == 2:
                output.write("@relation train_test.arff\n")
            elif line_num == 71:
                # Alter line 71 containing all tag gathered along the way
                # TODO enhance
                output.write("@attribute output {i,s}\n")
            else:
                # Write header
                output.write(line)
    tags = list(set(tags))
    utils.print_warning("TODO Take in account diffents tags than " + str(tags))
    utils.print_success("Groundtruth added")
def cross_validation(train_filename, n_folds, outfilename):
    """Run a stratified cross validation and store the mean scores in json.

    The train file contains lines like: filename,feat1,...,featn,tag

    The cross validation is run ten times. For each run, precision, recall
    and f1 averaged over the folds are appended to the "balanced" lists of
    the "Method 1" entry, which is reset at each call. The other entries of
    an existing ``outfilename`` are kept.

    :param train_filename: path of the train file
    :param n_folds: number of folds given to StratifiedKFold
    :param outfilename: path of the json file to update
    """
    filename = utils.abs_path_file(train_filename)
    features = []
    groundtruths = []
    with open(filename, "r") as filep:
        for line in filep:
            line = line.split(",")
            features.append([float(x) for x in line[1:-1]])
            groundtruths.append(line[-1][:-1])
    features = np.array(features)
    groundtruths = np.array(groundtruths)

    # Init: start from the previous results when they can be read. Only a
    # missing/unreadable file or invalid json is ignored, a bare except
    # would also hide KeyboardInterrupt and programming errors.
    try:
        with open(outfilename, "r") as filep:
            data = json.load(filep)
    except (IOError, ValueError):
        data = {}
    algo_name = "Method 1"
    data[algo_name] = {}
    for distribution in ("uneven", "balanced"):
        data[algo_name][distribution] = {}
        for metric in ("precision", "recall", "f1"):
            data[algo_name][distribution][metric] = {"instru": [], "song": []}

    skf = StratifiedKFold(n_splits=n_folds)
    for i in range(0, 10):
        utils.print_warning("TODO for i in range")
        song_precis = []
        song_recall = []
        song_fmeasu = []
        inst_precis = []
        inst_recall = []
        inst_fmeasu = []
        cur_fold = 0
        for train, test in skf.split(features, groundtruths):
            cur_fold += 1
            utils.print_success("Iteration " + str(i) + "\tFold " + str(cur_fold))
            dataset = {}
            dataset["train_features"] = features[train]
            dataset["train_groundtruths"] = groundtruths[train]
            dataset["test_features"] = features[test]
            dataset["test_groundtruths"] = groundtruths[test]
            predictions = classify(data=dataset)
            # Each score is computed once per fold then split per class.
            # presumably index 0 is the instrumental tag and index 1 the
            # song tag, given the sorted label order - TODO confirm
            precis = precision_score(dataset["test_groundtruths"],
                                     predictions, average=None)
            recall = recall_score(dataset["test_groundtruths"],
                                  predictions, average=None)
            fmeasu = f1_score(dataset["test_groundtruths"],
                              predictions, average=None)
            song_precis.append(precis[1])
            song_recall.append(recall[1])
            song_fmeasu.append(fmeasu[1])
            inst_precis.append(precis[0])
            inst_recall.append(recall[0])
            inst_fmeasu.append(fmeasu[0])
        song_precis = sum(song_precis) / float(len(song_precis))
        song_recall = sum(song_recall) / float(len(song_recall))
        song_fmeasu = sum(song_fmeasu) / float(len(song_fmeasu))
        inst_precis = sum(inst_precis) / float(len(inst_precis))
        inst_recall = sum(inst_recall) / float(len(inst_recall))
        inst_fmeasu = sum(inst_fmeasu) / float(len(inst_fmeasu))
        # Song
        data[algo_name]["balanced"]["precision"]["song"].append(song_precis)
        data[algo_name]["balanced"]["recall"]["song"].append(song_recall)
        data[algo_name]["balanced"]["f1"]["song"].append(song_fmeasu)
        # Instru
        data[algo_name]["balanced"]["precision"]["instru"].append(inst_precis)
        data[algo_name]["balanced"]["recall"]["instru"].append(inst_recall)
        data[algo_name]["balanced"]["f1"]["instru"].append(inst_fmeasu)

    with open(outfilename, "w") as outfile:
        json.dump(data, outfile, indent=2)
def process_local_feat(indir, file_gts_track, outdir_local, out_feat_global, train):
    """Description of process_local_feat

    Add delta and double delta to MFCCs

    For each file of ``indir`` having a track groundtruth, the MFCCs are
    read after the five first lines, the delta and double delta are
    computed and the three of them are written line by line in a file of
    the same name in ``outdir_local``.

    :param indir: directory containing the MFCC files
    :param file_gts_track: file with lines like: name,tag
    :param outdir_local: directory where the local features are written
    :param out_feat_global: only used by the commented-out code
    :param train: if True the last column of each input line is written
        back as frame groundtruth; otherwise ",i" is added for the tracks
        whose groundtruth contains "i" and nothing for the other ones
    """
    utils.print_success("Processing local features")
    # Preprocess arg
    indir = utils.abs_path_dir(indir)
    file_gts_track = utils.abs_path_file(file_gts_track)
    filelist = os.listdir(indir)
    outdir_local = utils.abs_path_dir(outdir_local)
    track_gts = {}
    with open(file_gts_track, "r") as filep:
        for line in filep:
            line = line.split(",")
            # Without train, the groundtruth names lack the file extension
            if train:
                index = line[0]
            else:
                index = line[0] + ".wav.mfcc.csv"
            track_gts[index] = line[1][:-1]
    for index, filename in enumerate(filelist):
        utils.print_progress_start(str(index) + "/" + str(len(filelist)) + " " + filename)
        if filename in track_gts:
            mfccs = []
            groundtruths = []
            with open(indir + filename, "r") as filep:
                # Skip the five first lines of the file
                next(filep)
                next(filep)
                next(filep)
                next(filep)
                next(filep)
                for line in filep:
                    line = line.split(",")
                    # The last column is never part of the MFCCs
                    mfccs.append(str2arr(line[:-1]))
                    if train:
                        groundtruths.append(line[-1][:-1])
            mfccs = np.array(mfccs)
            # NOTE(review): librosa.feature.delta works along the last axis
            # by default while mfccs holds one row per line read - confirm
            # the deltas are computed along the intended axis
            delta_mfcc = librosa.feature.delta(mfccs)
            delta2_mfcc = librosa.feature.delta(mfccs, order=2)
            # Write local features in outdir_local
            with open(outdir_local + filename, "w") as filep:
                gt_to_write = ""
                if "i" in track_gts[filename]:
                    gt_to_write = ",i"
                elif "s" in track_gts[filename]:
                    # postpone frame groundtruth annotation to another function later in the code
                    gt_to_write = ""
                else:
                    utils.print_warning("bayle.py line 231 local frame groundtruth undefined")
                if train:
                    for a, b, c, d in zip(mfccs, delta_mfcc, delta2_mfcc, groundtruths):
                        filep.write(arr2str(a) + "," + arr2str(b) + "," + arr2str(c) + "," + d + "\n")
                else:
                    for a, b, c in zip(mfccs, delta_mfcc, delta2_mfcc):
                        filep.write(arr2str(a) + "," + arr2str(b) + "," + arr2str(c) + gt_to_write + "\n")
            # # Write global features in out_feat_global
            # with open(out_feat_global, "a") as filep:
            #     filep.write(filename + "," +
            #         arr2str(np.mean(mfccs, axis=0)) + "," +
            #         arr2str(np.mean(delta_mfcc, axis=0)) + "," +
            #         arr2str(np.mean(delta2_mfcc, axis=0)) + "," +
            #         track_gts[filename] + "\n")
    utils.print_progress_end()
    utils.print_success("Adding local groundtruths to Songs in Jamendo thanks to Ramona annotations")
    match_feat_with_song_gt(dir_feat=outdir_local, dir_gts="groundtruths/frame_annot_jamendo_ramona/")
    utils.print_success("Done")
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare

    compare multiple classifier and display the best one

    Each classifier is fitted on the train set and evaluated on the test
    set. The weighted precision, recall and f1 are reported first, then the
    same scores for each tag of the train groundtruths.

    :param train: path of the train file, used when ``data`` is None
    :param test: path of the test file, used when ``data`` is None
    :param data: dictionary with the keys train_features,
        train_groundtruths, test_features and test_groundtruths
    :param res_dir: directory of the csv results, used when ``disp`` is
        False
    :param disp: if True print the scores, otherwise append them to
        ``global.csv`` and ``tag_<tag>.csv`` in ``res_dir``
    :param outfilename: if given, file where each line is written as:
        groundtruth,prediction
    :return: the predictions of the last classifier for the test features
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier()
    }
    # The per-tag scores are indexed by position. The tags are sorted and
    # given as ``labels`` to the metrics so that position and tag always
    # match; the order of ``list(set(...))`` gives no such guarantee.
    tags = sorted(set(train_groundtruths))
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)

        if outfilename is not None:
            with open(outfilename, "w") as filep:
                for gt, pred in zip(test_groundtruths, predictions):
                    filep.write(gt + "," + pred + "\n")

        # Global
        global_line = ",".join([
            key,
            str(precision_score(test_groundtruths, predictions, average='weighted')),
            str(recall_score(test_groundtruths, predictions, average='weighted')),
            str(f1_score(test_groundtruths, predictions, average='weighted')),
        ])
        if disp:
            print(global_line)
        else:
            with open(res_dir + "global.csv", "a") as filep:
                filep.write(global_line + ",\n")

        # Local: computed once per classifier, not once per tag
        precision = precision_score(test_groundtruths, predictions,
                                    labels=tags, average=None)
        recall = recall_score(test_groundtruths, predictions,
                              labels=tags, average=None)
        f1 = f1_score(test_groundtruths, predictions,
                      labels=tags, average=None)
        for index, tag in enumerate(tags):
            line = key + "," + str(precision[index]) + "," + str(
                recall[index]) + "," + str(f1[index])
            if disp:
                print(line)
            else:
                with open(res_dir + "tag_" + tag + ".csv", "a") as filep:
                    filep.write(line + ",\n")
    return predictions
def create_track_feat_testset(folder, infile, outfile, model_file, train=False):
    """Description of create_track_feat_testset

    Need to read each test file
    compute deltas on mfcc in the ram
    predict and predict_proba
    generate song and instru ngrams and histograms
    Add the mean of mfcc+deltas
    append 109 features vector in feat_track/feat_test.csv

    :param folder: directory containing the MFCC files
    :param infile: file containing the track groundtruths
    :param outfile: file in which one line per track is appended
    :param model_file: file of the model, loaded with joblib
    :param train: if True the files are comma separated and their five
        first lines are skipped; otherwise they are space separated and
        "_audio_full_mono_22k" is added to the track name
    """
    utils.print_success("Create track feat testset")
    folder = utils.abs_path_dir(folder)
    infile = utils.abs_path_file(infile)
    clf = joblib.load(model_file)
    # NOTE(review): none of the read_gts definitions visible in this file
    # accepts a ``separator`` argument - confirm which one is called
    track_gts = read_gts(infile, separator=",")
    for index, filename in enumerate(track_gts):
        utils.print_progress_start(str(index+1) + "/" + str(len(track_gts)) + " " + filename)
        mfccs = []
        mfccs_1 = []
        extension = ""
        if train:
            extension = ""
        else:
            extension += "_audio_full_mono_22k"
        extension += ".wav.mfcc.csv"
        with open(folder + filename + extension, "r") as filep:
            if train:
                # Skip the five first lines of the file
                next(filep)
                next(filep)
                next(filep)
                next(filep)
                next(filep)
            for line in filep:
                if train:
                    line = line.split(",")
                else:
                    line = line.split(" ")
                # The last column is dropped in both cases
                mfccs_1.append(str2arr(line[:-1]))
                # if train:
                #     mfccs.append(str2arr(line[:-1]))
                # else:
                #     mfccs.append(str2arr(line[0:]))
        mfccs = np.array(mfccs_1)
        # NOTE(review): librosa.feature.delta works along the last axis by
        # default while mfccs holds one row per line read - confirm the
        # deltas are computed along the intended axis
        delta_mfcc = librosa.feature.delta(mfccs)
        delta2_mfcc = librosa.feature.delta(mfccs, order=2)
        # One row per line read: MFCCs, deltas and double deltas side by side
        tmp = np.append(mfccs, delta_mfcc, axis=1)
        features = np.append(tmp, delta2_mfcc, axis=1)
        preds_proba = clf.predict_proba(features)
        # Histogramm
        nb_hist_class = 10
        numbers = column(preds_proba, 0)
        hist_pred = np.histogram(numbers, nb_hist_class)
        # Counts of the histogram divided by their sum
        hist_pred_norm = hist_pred[0] / float(sum(hist_pred[0]))
        ngram_threshold = 0.5
        song_ngram_proba = ngram_proba(local_pred=numbers, threshold=ngram_threshold, above_threshold=True)
        instru_ngram_proba = ngram_proba(local_pred=numbers, threshold=ngram_threshold, above_threshold=False)
        preds = clf.predict(features)
        song_ngram = ngram(preds, "s")
        instru_ngram = ngram(preds, "i")
        with open(outfile, "a") as filep:
            # Only the twelve first characters of the name are written
            filep.write(filename[:12] + "," +
                        arr2str(np.mean(mfccs, axis=0)) + "," +
                        arr2str(np.mean(delta_mfcc, axis=0)) + "," +
                        arr2str(np.mean(delta2_mfcc, axis=0)) + "," +
                        arr2str(hist_pred_norm) + "," +
                        song_ngram_proba + "," +
                        instru_ngram_proba + "," +
                        song_ngram + "," +
                        instru_ngram + "," +
                        track_gts[filename] + "\n")
    utils.print_progress_end()
def cross_validation(train_filename, n_folds, outfilename):
    """Run a stratified cross validation and log the scores of each fold.

    The train file contains lines like: feat1,feat2,...,featn,tag

    For each fold, the weighted precision, recall and f1 and the accuracy
    are printed and appended to precision.txt, recall.txt, f1.txt and
    accuracy.txt in ``../results/gender/`` with lines like: condition;score
    where condition is the name of the train file without directory and
    extension.

    The scores used to be written a first time before the fold loop, with
    variables which were not defined yet, so the function always failed.
    Those writes are now done once per fold, inside the loop.

    :param train_filename: path of the train file
    :param n_folds: number of folds given to StratifiedKFold
    :param outfilename: currently not used
    """
    utils.print_success("Cross validation")
    filename = utils.abs_path_file(train_filename)
    condition = train_filename.split(".")[0].split(os.sep)[-1]
    features = []
    groundtruths = []
    with open(filename, "r") as filep:
        for line in filep:
            line = line[:-1].split(",")
            features.append([float(x) for x in line[0:-1]])
            groundtruths.append(line[-1])
    features = np.array(features)
    groundtruths = np.array(groundtruths)

    skf = StratifiedKFold(n_splits=n_folds)
    i = 0
    cur_fold = 0
    for train, test in skf.split(features, groundtruths):
        cur_fold += 1
        utils.print_success("Iteration " + str(i) + "\tFold " + str(cur_fold))
        dataset = {}
        dataset["train_features"] = features[train]
        dataset["train_groundtruths"] = groundtruths[train]
        dataset["test_features"] = features[test]
        dataset["test_groundtruths"] = groundtruths[test]
        predictions = classify(data=dataset)

        truths = dataset["test_groundtruths"]
        # (result file, displayed label, score), each score computed once
        scores = (
            ("precision", "\tPrecision weighted\t",
             precision_score(truths, predictions, average='weighted')),
            ("recall", "\tRecall weighted\t",
             recall_score(truths, predictions, average='weighted')),
            ("f1", "\tF1 weighted\t",
             f1_score(truths, predictions, average='weighted')),
            ("accuracy", "\tAccuracy\t",
             accuracy_score(truths, predictions)),
        )
        for name, label, score in scores:
            print(label + str(score))
            with open("../results/gender/" + name + ".txt", "a") as filep:
                filep.write(condition + ";" + str(score) + "\n")