def extract_features(in_dir, out_dir, path, verbose=False):
    """Run the harmony-analyser plugins from inside ``in_dir``.

    The analyser jar is copied from ``path`` into ``in_dir``, launched once
    per plugin with ``in_dir`` as working directory, then deleted. The
    ``*.txt`` files present in ``in_dir`` are finally copied to ``out_dir``.

    The previous working directory is restored and the copied jar removed
    even when one of the commands fails.

    :param in_dir: folder in which the analyser is executed
    :param out_dir: destination given to ``cp``; it is resolved while the
        working directory is ``in_dir``
    :param path: folder containing the harmony-analyser jar
    :param verbose: forwarded to ``utils.run_cmd`` for the analyser commands
    """
    in_dir = utils.abs_path_dir(in_dir)
    path = utils.abs_path_dir(path)
    script = "harmony-analyser-script-jar-with-dependencies.jar"
    # Paths are built by plain concatenation, as everywhere in this file
    src = path + script
    dst = in_dir + script
    options = [
        "nnls-chroma:nnls-chroma",
        "nnls-chroma:chordino-tones",
        "nnls-chroma:chordino-labels",
        "qm-vamp-plugins:qm-keydetector",
        "chord_analyser:tps_distance"
        # Disabled plugins:
        # "chord_analyser:chord_complexity_distance",
        # "chroma_analyser:complexity_difference",
        # "chord_analyser:average_chord_complexity_distance"
    ]
    cur_dir = os.getcwd()
    os.chdir(in_dir)
    try:
        shutil.copy(src, dst)
        try:
            for opt in options:
                cmd = "java -jar " + script + " -a " + opt + " -s .wav -t 0.07"
                utils.run_cmd(cmd, verbose)
        finally:
            # Never leave the jar behind in the audio folder
            if os.path.exists(dst):
                os.remove(dst)
        cp_cmd = "cp *.txt " + out_dir
        utils.run_cmd(cp_cmd)
        # utils.run_cmd("rm *.txt")
    finally:
        os.chdir(cur_dir)
def test_models_parallel(models_dir, out_dir, test_dir=None, test_file=None):
    """Test every model of ``models_dir`` in parallel, one process per model.

    Exactly one of ``test_dir`` / ``test_file`` is expected; ``test_dir``
    wins when both are given. When neither is given, or when the model
    folder is empty, a warning is printed and nothing is run.

    Timings noted by the original author:
        17h16m12s DecisionTree done in 16135373ms
        17h25m08s GradientBoosting done in 16671109ms
        18h59m05s RandomForest done in 22307811ms
        18h59m07s AdaBoost done in 22310633ms
        19h18m12s ExtraTrees done in 23455779ms

    :param models_dir: folder whose entries are handed to ``test_model``
    :param out_dir: folder created for the results
    :param test_dir: folder of test files (optional)
    :param test_file: single test file (optional)
    """
    models_dir = utils.abs_path_dir(models_dir) + "/"
    models = os.listdir(models_dir)
    utils.create_dir(out_dir)

    if test_dir is not None:
        test_dir = utils.abs_path_dir(test_dir) + "/"
        test_files = os.listdir(test_dir)
        test_file = None
    elif test_file is not None:
        test_files = None
    else:
        utils.print_warning(
            "TODO Error in arg for test_models_parallel() function")
        # Without any test input there is nothing to hand to the workers
        return

    if not models:
        # multiprocessing.Pool(0) raises ValueError
        utils.print_warning("No model found in " + models_dir)
        return

    partial_test_model = partial(
        test_model,
        models_dir=models_dir,
        test_dir=test_dir,
        out_dir=out_dir,
        test_files=test_files,
        test_file=test_file)
    pool = multiprocessing.Pool(len(models))
    try:
        # One task per model
        pool.map(partial_test_model, models)
    finally:
        pool.close()  # no more tasks will be submitted
        pool.join()  # wait for the worker processes to exit
def match_feat_with_song_gt(dir_feat, dir_gts):
    """Append a song/instrumental tag to every frame of the local features.

    For each groundtruth file of ``dir_gts`` the matching
    ``<name>.wav.mfcc.csv`` file of ``dir_feat`` is rewritten in place: each
    kept frame gets ``,i`` (segment label containing "no") or ``,s``
    appended. Frames located after the end of the last groundtruth segment
    are dropped.

    Groundtruths come from http://www.mathieuramona.com/wp/data/jamendo/ and
    YAAFE features are described at
    http://yaafe.sourceforge.net/features.html
    Author's note: 7041 lines yaafe lab, 326.973 sec ramona.

    :param dir_feat: folder containing the YAAFE csv files (modified)
    :param dir_gts: folder containing the groundtruth files; each row is
        split on spaces, field 1 is read as the segment end time and field 2
        as its label
    """
    utils.print_success("Matching local feat to song/instru groundtruths")
    dir_feat = utils.abs_path_dir(dir_feat)
    dir_gts = utils.abs_path_dir(dir_gts)
    block_size = 1024.
    fech = 22050.
    # Duration of one block in seconds (the name is historical).
    # NOTE(review): frame start times are derived from the block size, not
    # from a step size - confirm this matches the YAAFE extraction settings.
    frame_size_ms = block_size / fech
    filenames = os.listdir(dir_gts)
    nb_files = str(len(filenames))
    for index, filename in enumerate(filenames):
        utils.print_progress_start(str(index) + "/" + nb_files + " " + filename)
        # Gather groundtruths as [segment end, tag suffix]
        groundtruths = []
        with open(dir_gts + filename, "r") as filep:
            for row in filep:
                line = row.split(" ")
                end = float(line[1])
                tag = ",i\n" if "no" in line[2] else ",s\n"
                groundtruths.append([end, tag])
        gt_len = len(groundtruths)
        gt_index = 0
        # Tag the frames, then overwrite the feature file
        tagged_frames = []
        feat_fn = filename.split(".")[0] + ".wav.mfcc.csv"
        with open(dir_feat + feat_fn, "r") as filep:
            for frame_index, line in enumerate(filep):
                if gt_index >= gt_len:
                    break
                if frame_size_ms * frame_index > groundtruths[gt_index][0]:
                    # Current frame starts after the segment end: next segment
                    gt_index += 1
                    if gt_index >= gt_len:
                        break
                tagged_frames.append(
                    line.rstrip("\n") + groundtruths[gt_index][1])
        with open(dir_feat + feat_fn, "w") as filep:
            filep.write("".join(tagged_frames))
    utils.print_progress_end()
def match_feat_with_instru_gt(indir, outdir):
    """Copy every feature file, appending the instrumental tag to each line.

    Applies the instrumental groundtruth to CCmixter and MedleyDB: every line
    of every file of ``indir`` is written to a file of the same name in
    ``outdir`` with `` i`` appended.

    :param indir: folder containing the local feature files
    :param outdir: folder receiving the tagged copies
    """
    utils.print_success("Matching local features to instrumental groundtruths")
    indir = utils.abs_path_dir(indir) + "/"
    outdir = utils.abs_path_dir(outdir) + "/"
    for filename in os.listdir(indir):
        # Both handles are closed even if reading or writing fails
        with open(indir + filename, "r") as filep:
            with open(outdir + filename, "w") as outfile:
                for line in filep:
                    # Only strip the newline: the last line may not have one
                    outfile.write(line.rstrip("\n") + " i\n")
def new_algo_final(indir, file_gts_track):
    """Run the whole frame-to-track classification pipeline.

    Steps, in order:
      1. ``process_local_feat`` on the train features
      2. train a frame-level RandomForest (``classify.create_models``)
      3. build track-scale features for the train and the test set
         (MFCC + Delta + Double Delta + ngrams + hist)
      4. train a track-level RandomForest
      5. ``process_results`` on the train/test track features

    All intermediate files live under ``src/tmp/bayle``.

    :param indir: folder of local (frame) features of the train set
    :param file_gts_track: track-level groundtruth file
    """
    utils.print_success("Approx. time ~6 hours.")

    # Preprocess arg
    indir = utils.abs_path_dir(indir)
    file_gts_track = utils.abs_path_file(file_gts_track)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "bayle")
    feat_frame_train = utils.create_dir(dir_tmp + "feat_frame_train")
    # Created like the other work folders, not used by the steps below
    utils.create_dir(dir_tmp + "feat_frame_test")
    outdir_global = utils.create_dir(dir_tmp + "feat_track")
    feat_train = outdir_global + "train.csv"
    feat_test = outdir_global + "test.csv"
    models_dir = utils.create_dir(dir_tmp + "models")
    loc_feat_testset_dirpath = "features/database2/"
    filelist_train = "groundtruths/database1.csv"
    filelist_test = "groundtruths/database2.csv"
    models_global = utils.create_dir(dir_tmp + "models_track")

    process_local_feat(indir, file_gts_track,
                       outdir_local=feat_frame_train,
                       out_feat_global=feat_train, train=False)
    classify.create_models(outdir=models_dir, train_dir=feat_frame_train,
                           separator=",", classifiers="RandomForest")

    # Frame-level model produced by the call above. It used to be overridden
    # by an absolute path only valid on the author's machine.
    # NOTE(review): assumes create_models stores <name>/<name>.pkl under its
    # outdir, as the original relative path implied - confirm.
    model_file = models_dir + "RandomForest/RandomForest.pkl"

    # Track-scale features for the train set
    create_track_feat_testset(indir, filelist_train, feat_train, model_file,
                              train=True)
    # Track-scale features for the test set
    # Author's timing: 15h28m44s to 19h08m28s, done in 13184117ms
    create_track_feat_testset(loc_feat_testset_dirpath, filelist_test,
                              feat_test, model_file)

    classify.create_models(outdir=models_global, train_file=feat_train,
                           classifiers="RandomForest")
    process_results(feat_train, feat_test)
def plot_isrc_year_distribution(isrc_filename="ISRC_valid.txt", img_outdir=""):
    """Save a histogram of the registration years found in a list of ISRCs.

    The two characters at positions 5-6 of the first field of each row are
    read as a year of the 2000s; a year later than the current one is moved
    back by a century. The figure is written to
    ``<img_outdir>Figure_1_ISRC_year_distribution.png``.

    :param isrc_filename: file read with ``csv.reader``, one ISRC per row
    :param img_outdir: folder receiving the image
    """
    img_outdir = utils.abs_path_dir(img_outdir)

    years = []
    with open(isrc_filename, 'r') as csvfile:
        for record in csv.reader(csvfile):
            registered = 2000 + int(record[0][5:7])
            # A year in the future has to belong to the previous century
            if registered > date.today().year:
                registered -= 100
            years.append(registered)

    axe = plt.subplot(111)
    first_year = min(years)
    last_year = max(years)
    plt.hist(years, bins=range(first_year, last_year + 1, 1), color="#BBBBBB")
    plt.xlabel("Registration years")
    plt.ylabel("ISRC number")
    plt.xlim(first_year - 2, last_year + 2)

    # Only keep the bottom and left axes
    for side in ('top', 'right'):
        axe.spines[side].set_visible(False)
    axe.get_xaxis().tick_bottom()
    axe.get_yaxis().tick_left()

    plt.savefig(img_outdir + "Figure_1_ISRC_year_distribution.png")
    utils.print_success("ISRC year distribution image saved")
def yaafe_feat_extraction(dir_tracks):
    """Extract MFCCs with YAAFE for every file of ``dir_tracks``.

    Equivalent command line, run once per file from inside ``dir_tracks``:
        yaafe -r 22050 -f "mfcc: MFCC blockSize=2048 stepSize=1024"
              --resample -b <features/database1> <file>

    YAAFE is started without a shell, so file names containing spaces or
    shell metacharacters are passed through untouched. Its output is
    discarded. The previous working directory is always restored.

    :param dir_tracks: folder containing the audio files
    """
    import subprocess  # local import: only needed by this function

    utils.print_success("YAAFE features extraction (approx. 8 minutes)")

    # Assert Python version
    if sys.version_info.major != 2:
        utils.print_error("Yaafe needs Python 2 environment")

    # Assert folder exists
    dir_tracks = utils.abs_path_dir(dir_tracks)
    filelist = os.listdir(dir_tracks)
    dir_feat = utils.create_dir(utils.create_dir("features") + "database1")

    yaafe_cmd = [
        "yaafe", "-r", "22050",
        "-f", "mfcc: MFCC blockSize=2048 stepSize=1024",
        "--resample", "-b", dir_feat,
    ]
    nb_files = str(len(filelist))
    dir_current = os.getcwd()
    os.chdir(dir_tracks)
    try:
        with open(os.devnull, "w") as devnull:
            for index, filen in enumerate(filelist):
                utils.print_progress_start(
                    str(index + 1) + "/" + nb_files + " " + filen)
                try:
                    # Exit status is ignored, as it was with os.system
                    subprocess.call(yaafe_cmd + [filen], stdout=devnull,
                                    stderr=subprocess.STDOUT)
                except OSError as err:
                    # The yaafe executable itself could not be started
                    utils.print_error("Unable to run yaafe: " + str(err))
                    break
        utils.print_progress_end()
    finally:
        os.chdir(dir_current)
def add_feat_yaafe(dir_feat, data, ids=None):
    """
    @brief      Reads YAAFE features files and adds the mean MFCCs to data.

    Each file of ``<dir_feat>yaafe/`` is expected to start with 5 header
    lines followed by comma-separated rows of 13 values. The 13 column means
    are stored under the 3-to-9 digit identifier found in the file name.
    Files without identifier or without any data row are skipped.

    @param      dir_feat  The folder containing the songs' features
    @param      data      dict updated in place (key=ids, values=features)
    @param      ids       Optional container of ids to keep (None = all)

    @return     dict (key=ids and values=features)
    """
    nb_header_lines = 5
    dir_feat = utils.abs_path_dir(dir_feat + "yaafe/")
    for filen in os.listdir(dir_feat):
        match = re.search(r"\d{3,9}", filen)
        if match is None:
            # No song identifier in the name: nothing to attach features to
            continue
        new_id = match.group()
        if ids is not None and new_id not in ids:
            continue

        sums = [0.] * 13
        nb_frames = 0
        with open(dir_feat + filen, "r") as filep:
            for line_num, line in enumerate(filep):
                if line_num < nb_header_lines:
                    continue
                for index, val in enumerate(line.split(",")):
                    sums[index] += float(val)
                nb_frames += 1
        if not nb_frames:
            utils.print_warning("No MFCC frame found in " + filen)
            continue
        MFCCs = [total / nb_frames for total in sums]

        if new_id in data:
            # NOTE(review): this nests the MFCC list inside the existing
            # feature list while the else branch stores it flat - kept as is,
            # confirm whether extend() was intended.
            data[new_id].append(MFCCs)
        else:
            data[new_id] = MFCCs
    return data
def add_feat_essentia(dir_feat, data, ids=None):
    """
    @brief      Reads essentia features files and adds them to data.

    For each JSON file of ``<dir_feat>essentia/`` seven descriptors are read
    (3 tonal, 4 rhythm) and stored under the 3-to-9 digit identifier found
    in the file name. A fresh list is built for every file, so songs no
    longer share one accumulated list. Files without identifier are skipped.

    @param      dir_feat  The folder containing the songs' features
    @param      data      dict updated in place (key=ids, values=features)
    @param      ids       Optional container of ids to keep (None = all)

    @return     dict (key=ids and values=features)
    """
    dir_feat = utils.abs_path_dir(dir_feat + "essentia/")
    for filen in os.listdir(dir_feat):
        match = re.search(r"\d{3,9}", filen)
        if match is None:
            # No song identifier in the name: nothing to attach features to
            continue
        new_id = match.group()
        if ids is not None and new_id not in ids:
            continue

        with open(dir_feat + filen) as filep:
            essentia_feat = json.load(filep)
        tonal = essentia_feat["tonal"]
        rhythm = essentia_feat["rhythm"]
        # Non-numeric descriptors (chords_key, chords_scale, key_key,
        # key_scale) are deliberately left out
        features = [
            tonal["chords_changes_rate"],
            tonal["chords_number_rate"],
            tonal["tuning_frequency"],
            rhythm["beats_count"],
            rhythm["bpm"],
            rhythm["danceability"],
            rhythm["onset_rate"],
        ]

        if new_id in data:
            # NOTE(review): this nests the list inside the existing feature
            # list while the else branch stores it flat - kept as is, confirm
            # whether extend() was intended.
            data[new_id].append(features)
        else:
            data[new_id] = features
    return data
def read_train_files(indir, separator=" "):
    """Gather local features and groundtruths from every train song.

    Every file of ``indir`` whose name ends with "csv" is read line by line:
    all fields but the last are converted to floats (features), the last
    field minus its final character is the groundtruth.

    :param indir: folder containing one csv file per train song
    :param separator: field separator used in the files
    :return: (features, groundtruths), two lists of the same length
    """
    utils.print_success("Reading multiple train files")
    indir = utils.abs_path_dir(indir) + "/"

    features, groundtruths = [], []
    csv_names = [name for name in os.listdir(indir) if name.endswith("csv")]
    total = str(len(csv_names))
    for position, csv_name in enumerate(csv_names, 1):
        # Progress shown on a single, constantly rewritten terminal line
        print(str(position) + "/" + total + " " + csv_name)
        sys.stdout.write("\033[F")  # Cursor up one line
        sys.stdout.write("\033[K")  # Clear line
        with open(indir + csv_name, "r") as filep:
            for row in filep:
                fields = row.split(separator)
                features.append([float(value) for value in fields[:-1]])
                groundtruths.append(fields[-1][:-1])
    sys.stdout.write("\033[K")  # Clear line
    return features, groundtruths
def experiments_2_3(vqmm_cmd, codebook_file):
    """Train VQMM models on database1 and test them on database2.

    Working files are written under ``src/tmp/vqmm``: the list of trained
    models (entries whose name contains "NOT" are left out), the list of
    test files with their groundtruth, and the results folder.

    :param vqmm_cmd: VQMM command handed to ``train`` and ``test``
    :param codebook_file: codebook handed to ``train`` and ``test``
    """
    utils.print_success("Experiment 2 & 3 (approx. 6h")
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "vqmm")

    # Train
    dir_models = utils.create_dir(dir_tmp + "models_expe2_3")
    train(vqmm_cmd, codebook_file, dir_models, dir_tmp + "filelist.txt")

    # VQMM needs an explicit file listing the models to use
    models_file = dir_tmp + "models_file_expe2_3.txt"
    kept_models = [name for name in os.listdir(dir_models)
                   if "NOT" not in name]
    with open(models_file, "w") as filep:
        for model_name in kept_models:
            filep.write(dir_models + model_name + "\n")

    # Test: one "<path>\t<groundtruth>" line per test file
    test_dir = utils.abs_path_dir("features/database2/")
    groundtruths = utils.read_groundtruths("groundtruths/database2.csv")
    test_names = os.listdir(test_dir)
    test_list_fn = dir_tmp + "test_file_list.txt"
    with open(test_list_fn, "w") as filep:
        for test_name in test_names:
            song_id = test_name.split("_")[0]
            filep.write(
                test_dir + test_name + "\t" + groundtruths[song_id] + "\n")
    dir_res = utils.create_dir(dir_tmp + "results_expe2_3")
    test(vqmm_cmd, codebook_file, outputdir=dir_res,
         models_file=models_file, testfile=test_list_fn)

    utils.print_success("Experiment 2 & 3 Done processing")
def preprocess_yaafe_features(dir_features="features/database1/"):
    """Write the mean MFCCs of every known track to a single csv file.

    Each YAAFE file starts with 5 header lines followed by comma-separated
    MFCC rows. The 13 column means are written, with the track groundtruth,
    to ``src/tmp/ghosal/database1.csv``. Tracks missing from the groundtruth
    file and files without any MFCC row are skipped.

    :param dir_features: folder containing the YAAFE feature files
    :return: path of the csv file that was written
    """
    utils.print_success("Preprocessing YAAFE's features (approx. 2 minutes)")
    groundtruths = utils.read_groundtruths("groundtruths/database1.csv")
    dir_features = utils.abs_path_dir(dir_features)
    filenames = os.listdir(dir_features)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "ghosal")
    res_file_name = dir_tmp + "database1.csv"
    nb_header_lines = 5
    nb_files = str(len(filenames))
    with open(res_file_name, "w") as res_file:
        res_file.write(
            "filename,MFCC_01,MFCC_02,MFCC_03,MFCC_04,MFCC_05,MFCC_06,MFCC_07,MFCC_08,MFCC_09,MFCC_10,MFCC_11,MFCC_12,MFCC_13,tag\n"
        )
        for index, filename in enumerate(filenames):
            utils.print_progress_start(
                str(index + 1) + "/" + nb_files + " " + filename)
            mfcc_sums = np.zeros(13)
            nb_frames = 0
            # Read-only access is enough: the features are never modified
            with open(dir_features + filename, "r") as filep:
                for line_index, line in enumerate(filep):
                    # Skip 5 first header lines generated by YAAFE
                    if line_index < nb_header_lines:
                        continue
                    mfccs = line.rstrip("\n").split(",")
                    for coef_index, mfcc in enumerate(mfccs):
                        mfcc_sums[coef_index] += float(mfcc)
                    nb_frames += 1
            if not nb_frames:
                # Nothing to average (would otherwise divide by zero)
                continue
            filen = filename.split(".")[0]
            if filen in groundtruths:
                mfcc_str = ["%.15f" % number
                            for number in mfcc_sums / nb_frames]
                res_file.write(filen + "," + ",".join(mfcc_str) + "," +
                               groundtruths[filen] + "\n")
    return res_file_name
def plot_precision_recall(indir, gts_file, outdir):
    """Plot one precision-recall curve per prediction file of ``indir``.

    Every entry of ``indir`` whose name contains ".csv" is read with
    ``read_preds``; only the items also present in the groundtruths are
    kept, "s" being the positive class. The legend shows the average
    precision score and the figure is saved as
    ``<outdir>precision_recall.png``.

    :param indir: folder containing the prediction files
    :param gts_file: groundtruth file read with ``read_item_tag``
    :param outdir: output folder, concatenated as is with the image name
    """
    groundtruths = read_item_tag(gts_file)
    plt.figure(1)

    indir = utils.abs_path_dir(indir)
    for item in os.listdir(indir):
        if ".csv" not in item:
            continue
        isrcs = read_preds(indir + "/" + item)
        known = [isrc for isrc in isrcs if isrc in groundtruths]
        is_song = [groundtruths[isrc] == "s" for isrc in known]
        predictions = [isrcs[isrc] for isrc in known]
        precision, recall, _ = precision_recall_curve(is_song, predictions)
        score = round(average_precision_score(is_song, predictions), 3)
        plt.plot(recall, precision,
                 label=item[:-4] + " (" + str(score) + ")")

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.title('Precision-Recall curve for Algo (AUC)')
    plt.legend(loc='best')
    plt.savefig(outdir + "precision_recall.png", dpi=200, bbox_inches="tight")
    # plt.show()
    plt.close()
    utils.print_success("Precision-Recall curve created in " + outdir)
def run_kea_on_folds(folds_dir):
    """Wrapper running kea on a folder of folds.

    When ``train_test.arff`` exists in ``folds_dir`` it is used both as
    train and test file and the results go to ``results.txt``. Otherwise
    half the number of regular files of the folder is taken as the number
    of folds, and kea is run on each ``train_XX.arff`` / ``test_XX.arff``
    pair, writing ``results_XX.arff``.

    :param folds_dir: folder containing the arff files
    """
    folds_dir = utils.abs_path_dir(folds_dir)
    out_file = folds_dir + "/results.txt"

    single_file = folds_dir + "/train_test.arff"
    if os.path.exists(single_file):
        run_kea(single_file, single_file, out_file)
        return

    nb_files = sum(
        1 for name in os.listdir(folds_dir)
        if os.path.isfile(os.path.join(folds_dir, name)))
    # Run on multiple train/test
    for fold in range(1, int(nb_files / 2) + 1):
        utils.print_success("Train/Test on fold " + str(fold))
        suffix = str(fold).zfill(2) + ".arff"
        run_kea(folds_dir + "/train_" + suffix,
                folds_dir + "/test_" + suffix,
                folds_dir + "/results_" + suffix)
    utils.print_warning("TODO multiprocessing")
def extract_features(dir_audio, dir_feat):
    """Run the YAAFE and essentia extractions on a folder of audio files.

    Files located directly in ``dir_audio`` are all processed; for every
    other entry (treated as a sub-folder) only the files whose name contains
    "ld.wav" are processed.

    :param dir_audio: folder containing audio files and/or sub-folders
    :param dir_feat: folder handed to ``essentia`` for its output
    """
    dir_audio = utils.abs_path_dir(dir_audio)
    dir_feat = utils.abs_path_dir(dir_feat)

    filelist = []
    for elem in os.listdir(dir_audio):
        entry = dir_audio + elem
        if os.path.isfile(entry):
            filelist.append(entry)
            continue
        filelist.extend(
            entry + "/" + filename
            for filename in os.listdir(entry)
            if "ld.wav" in filename)

    # marsyas(dir_feat, filelist)
    total = str(len(filelist))
    for rank, filen in enumerate(filelist, 1):
        utils.print_progress_start(
            str(rank) + "/" + total + " " + filen.split(os.sep)[-1])
        utils.yaafe(filen)
        essentia(dir_feat, filen)
    utils.print_progress_end()
def validate_isrcs(infile="isrc.txt", outfile="ISRC_invalid.txt", indir=None):
    """Validate a list of ISRCs and split it into valid and invalid ones.

    The candidates are either the lines of ``infile`` (each line must only
    contain the ISRC followed by a newline) or, when ``indir`` is given, the
    names of the non-hidden entries of that folder in sorted order. Only the
    first 12 characters of each candidate are checked with
    ``validate_isrc``.

    Valid entries are always written to ``ISRC_valid.txt``; invalid ones to
    ``outfile``, only when there is at least one. The process exits when
    ``infile`` is needed but does not exist.

    :param infile: file containing one ISRC per line (ignored with indir)
    :param outfile: file receiving the invalid ISRCs
    :param indir: optional folder whose entry names are the ISRCs
    """
    if indir:
        indir = utils.abs_path_dir(indir)
        print("Directory to analyse: " + indir)
        # Same listing as a plain `ls` (sorted, hidden entries skipped), but
        # without a shell nor a temporary file in the working directory
        lines = [name + "\n" for name in sorted(os.listdir(indir))
                 if not name.startswith(".")]
    else:
        if not os.path.isfile(infile):
            print("Invalid input file")
            sys.exit()
        with open(os.path.abspath(infile), "r") as isrc_file:
            lines = isrc_file.readlines()

    if os.path.isfile(outfile):
        print("Already existing output file will be overwritten")
    else:
        outfile = os.path.abspath(outfile)

    valid_isrcs = []
    invalid_isrcs = []
    for index, line in enumerate(lines):
        isrc = line[0:12]
        # Progress shown on a single, constantly rewritten terminal line
        print("\t" + str(index) + "\t" + isrc)
        sys.stdout.write("\033[F")  # Cursor up one line
        if validate_isrc(isrc):
            valid_isrcs.append(line)
        else:
            invalid_isrcs.append(line)
    sys.stdout.write("\033[K")  # Clear line

    with open("ISRC_valid.txt", "w") as file_valid:
        file_valid.write("".join(valid_isrcs))
    if invalid_isrcs:
        print(str(len(invalid_isrcs)) + " invalid ISRCs stored in: " + outfile)
        with open(outfile, "w") as file_invalid:
            file_invalid.write("".join(invalid_isrcs))
    else:
        print("All ISRCs are valid")
def preprocess_features(folder):
    """Convert every feature file of ``folder`` with ``convert_feats_files``.

    :param folder: folder containing the train set feature files
    """
    utils.print_success("Preprocessing train set")
    folder = utils.abs_path_dir(folder)
    filelist = os.listdir(folder)
    # Constant middle part of the progress message
    out_of = "/" + str(len(filelist)) + " "
    for position, filename in enumerate(filelist):
        utils.print_progress_start(str(position) + out_of + filename)
        convert_feats_files(folder + filename)
    utils.print_progress_end()
def figures1bd(indir, file_gts_track):
    """Create the working folders and plot the results found in "results/".

    The processing steps themselves are currently disabled (see the comment
    in the body); this is the pipeline they implement, as described by the
    original author.

    Input files are formatted like
    /media/sf_github/yann/train/01 - 01 Les Jardins Japonais.wav.mfcc.csv:
        feat1 feat2 ... featn tag1
        feat1 feat2 ... featn tag2
        ...
        feat1 feat2 ... featn tag2

    0 Input the local extracted features from YAAFE
        13 MFCC per frame
        186 musical pieces as train set
    1 Computes delta and double delta (39 features per frame)
    2 Gather global mean (39 features per musical pieces)
    3 train on mfcc & deltas (39 feat/frame) to output global predictions
    4 Use global preds to compute song and instru n-grams and histogramm
        which add 70 feat/track, lead to a total of 109 feat/track
    5 Fit on 109x186
    6 predict (or predict_proba) on 41491 track

    :param indir: folder of local features (only checked for now)
    :param file_gts_track: track groundtruth file (only checked for now)
    """
    # Preprocess arg
    indir = utils.abs_path_dir(indir)
    file_gts_track = utils.abs_path_file(file_gts_track)

    feat_frame_train = "feat_frame_train/"
    feat_frame_test = "feat_frame_test/"
    outdir_global = "feat_track/"
    models_dir = "models/"
    models_global = "models_track/"
    for work_dir in (feat_frame_train, feat_frame_test, outdir_global,
                     models_dir, models_global):
        utils.create_dir(work_dir)

    # Disabled steps, with feat_train = outdir_global + "train.csv",
    # feat_test = outdir_global + "test.csv",
    # filelist_train = "filelist_train.tsv",
    # filelist_test = "filelist_test.tsv" and loc_feat_testset_dirpath =
    # "/media/sf_DATA/Datasets/Simbals/yaafe/results/processed/":
    # process_local_feat(indir, file_gts_track, feat_frame_train, feat_train, train=True)
    # classify.create_models(outdir=models_dir, train_dir=feat_frame_train, separator=",")
    # create_track_feat_testset(indir, filelist_train, feat_train, train=True)
    # 15h28m44s to 19h08m28s Done in 13184117ms
    # create_track_feat_testset(loc_feat_testset_dirpath, filelist_test, feat_test)
    # classify.create_models(outdir=models_global, train_file=feat_train)
    # classify.test_models_parallel(
    #     models_dir=models_global,
    #     out_dir="results/",
    #     test_file=feat_test)

    # Display results
    reproduce.plot_results("results/")
def plot_roc(indir, gts_file, outdir):
    """Plot one ROC curve per prediction file of ``indir``.

    Every entry of ``indir`` whose name contains ".csv" is read with
    ``read_preds``; only the items also present in the groundtruths are
    kept, "s" being the positive class. The legend shows the ROC AUC. The
    line style depends on the algorithm named in the file (VQMM, SVMBFF,
    GA, first match wins). The figure is saved as ``Figure_3_ROC.png`` and
    ``Figure_3_ROC.eps`` in ``outdir``.

    :param indir: folder containing the prediction files
    :param gts_file: groundtruth file read with ``read_item_tag``
    :param outdir: folder receiving the two images
    """
    groundtruths = read_item_tag(gts_file)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--', label="Random (0.5)")

    # (substring looked for in the file name, matplotlib format string)
    line_styles = (("VQMM", "ro"), ("SVMBFF", "g-"), ("GA", "b:"))

    indir = utils.abs_path_dir(indir)
    for item in os.listdir(indir):
        if ".csv" not in item:
            continue
        isrcs = read_preds(indir + "/" + item)
        known = [isrc for isrc in isrcs if isrc in groundtruths]
        is_song = [groundtruths[isrc] == "s" for isrc in known]
        predictions = [isrcs[isrc] for isrc in known]
        fpr_rf, tpr_rf, _ = roc_curve(is_song, predictions)
        auc = round(roc_auc_score(is_song, predictions), 3)
        label = item[:-4] + " (" + str(auc) + ")"
        color = ""
        for keyword, style in line_styles:
            if keyword in item:
                color = style
                break
        plt.plot(fpr_rf, tpr_rf, color, label=label)

    # Only keep the bottom and left axes
    ax = plt.gca()
    for side in ('right', 'top'):
        ax.spines[side].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    # plt.title('ROC curve for Algo (AUC)')
    plt.legend(loc='best')

    outdir = utils.abs_path_dir(outdir)
    roc_fn = outdir + "Figure_3_ROC.png"
    plt.savefig(roc_fn, dpi=200, bbox_inches="tight")
    plt.savefig(outdir + "Figure_3_ROC.eps")
    # plt.show()
    plt.close()
    utils.print_success("ROC curve successfully created in " + roc_fn)
def merge_files(folder, name):
    """Concatenate every file of ``<folder><name>`` into ``<folder><name>.csv``.

    Files are appended in ``os.listdir`` order and streamed to the output,
    so the merged content is never held in memory as a whole.

    :param folder: parent folder, concatenated as is with ``name``
    :param name: sub-folder to merge, also used as output base name
    """
    utils.print_success("Merging files")
    subfolder = utils.abs_path_dir(folder + name)
    # Listed before the output is created, as before
    filenames = os.listdir(subfolder)
    with open(folder + name + ".csv", "w") as merged:
        for filen in filenames:
            with open(subfolder + filen, "r") as filep:
                merged.writelines(filep)
def test_models(models_dir, test_dir, out_dir):
    """Apply every model of ``models_dir`` to every file of ``test_dir``.

    For each model ``<models_dir>/<model>/<model>.pkl`` is loaded, and for
    each test file the first column of ``predict_proba`` is written, one
    value per line, to ``<out_dir><model>/<test file name>``.

    :param models_dir: folder containing one sub-folder per model
    :param test_dir: folder containing the test files
    :param out_dir: output folder, concatenated as is with the model name
    """
    models_dir = utils.abs_path_dir(models_dir) + "/"
    test_dir = utils.abs_path_dir(test_dir) + "/"
    utils.create_dir(out_dir)
    test_files = os.listdir(test_dir)
    for model in os.listdir(models_dir):
        utils.print_success(model)
        pred_dir = out_dir + model + "/"
        utils.create_dir(pred_dir)
        clf = joblib.load(models_dir + model + "/" + model + ".pkl")
        for position, test_file in enumerate(test_files):
            # Progress shown on a single, constantly rewritten terminal line
            print(str(position) + "\t" + test_file)
            sys.stdout.write("\033[F")
            sys.stdout.write("\033[K")
            probabilities = clf.predict_proba(
                read_test_file(test_dir + test_file))
            with open(pred_dir + test_file, "w") as filep:
                filep.writelines(
                    str(proba[0]) + "\n" for proba in probabilities)
        sys.stdout.write("\033[K")
def plot_clf(indir="res/"):
    """Plot precision, recall and f-measure of every algorithm as bars.

    ``<indir>/global.csv`` contains one line per algorithm: its name
    followed by three comma-separated scores. The figure is saved as
    ``global.png`` in the current working directory.

    :param indir: folder containing ``global.csv``
    """
    indir = utils.abs_path_dir(indir) + "/"
    algos = []
    measure = []
    with open(indir + "global.csv", "r") as filep:
        for line in filep:
            line = line.split(",")
            algos.append(line[0])
            measure.append(tuple(map(float, line[1:4])))

    n_groups = 3
    fig, ax = plt.subplots(figsize=(10, 6))

    index = np.arange(n_groups)
    bar_width = 0.2
    opacity = 0.4
    offset = 0.15
    color = utils.rand_color(len(algos))
    for ind, algo in enumerate(algos):
        print(ind)
        print(tuple(measure[ind]))
        plt.bar(index + bar_width * ind + offset,
                tuple(measure[ind]),
                bar_width,
                alpha=opacity,
                color=color[ind],
                label=algo)

    plt.ylabel('Scores (in %)')
    # Ticks sit under the bars of the last algorithm, as before; computing
    # the index explicitly also covers an empty global.csv
    last_ind = max(len(algos) - 1, 0)
    plt.xticks(index + bar_width * last_ind + offset,
               ('Precision', 'Recall', 'F-Measure'))
    plt.legend()
    plt.ylim(0, 1)

    # spines & axis
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    lgd = ax.legend(loc=9, bbox_to_anchor=(1.1, 1.), frameon=False)
    # lgd = pylab.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
    # ax.legend()
    plt.tight_layout()
    img_name = "global.png"
    # bbox_extra_artists is the savefig keyword that makes the tight
    # bounding box include the legend placed outside of the axes
    plt.savefig(img_name, dpi=200, bbox_extra_artists=[lgd],
                bbox_inches="tight")
def main():
    """Run the whole VQMM experiment.

    1 Preprocess the YAAFE features of database1
    2 Read the groundtruths of database1
    3 Write the file list (path and groundtruth) needed by VQMM
    4 Compile VQMM
    5 Make the codebook on the train set
    6 Run experiment 1, then experiments 2 & 3 (train 200, test 50k), and
      process the results
    """
    # utils.print_success("VQMM (approx. 6h)")

    # 1 Preprocess features
    # YAAFE produces files containing an unusable float format which needs
    # to be transformed into a valid one
    preprocess_features("features/database1/")

    # 2 Read filenames & groundtruths
    groundtruths = {}
    with open("groundtruths/database1.csv", "r") as filep:
        for line in filep:
            fields = line[:-1].split(",")
            groundtruths[fields[0]] = fields[1]

    # 3 VQMM needs a special file containing path & filename along with the
    # groundtruth
    dir_feats = utils.abs_path_dir("features/database1_vqmm/")
    files_list = os.listdir(dir_feats)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "vqmm")
    filenames_gts = dir_tmp + "filelist.txt"
    with open(filenames_gts, "w") as filep:
        for filename in files_list:
            track = filename.split(".")[0]
            filep.write(
                dir_feats + filename + "\t" + groundtruths[track] + "\n")

    # 4 Compile VQMM
    utils.print_success("Compiling VQMM")
    vqmm_cmd = "src/vqmm/vqmm"
    os.system("make -C src/vqmm/src")

    # 5 Create the codebook needed by VQMM
    file_cbk = dir_tmp + "codebook.txt"
    create_cbk(vqmm_cmd, filenames_gts, file_cbk)

    # 6 Launch the experiments
    experiment_1(vqmm_cmd, file_cbk)
    experiments_2_3(vqmm_cmd, file_cbk)
    process_results()
def merge_arff(indir, outfilename):
    """Merge the ARFF files of ``indir`` into a single ARFF file.

    The bextract program from Marsyas generates one output file per audio
    file; this function merges them all in one unique file. Each file goes
    through ``validate_arff``; rejected files are only counted.

    The listing of ``indir`` is read through a single file iterator shared
    by both loops: the first loop stops on the first valid file, from which
    the first 77 lines are copied (header plus data lines), and the second
    loop resumes with the files listed after it, copying lines 75 to 77 of
    each.

    :param indir: folder containing the ARFF files
    :param outfilename: path of the merged ARFF file
    :return: ``outfilename``
    """
    utils.print_success("Preprocessing ARFFs")
    indir = utils.abs_path_dir(indir)
    tmpfilename = "tmp_arff.txt"
    os.system("ls " + indir + " > " + tmpfilename)
    with open(tmpfilename, 'r') as filenames:
        outfn = open(outfilename, 'w')
        cpt_invalid_fn = 0

        # Write first lines of ARFF template file
        for listed in filenames:
            arff_fn = validate_arff(indir + "/" + listed[:-1])
            if not arff_fn:
                cpt_invalid_fn += 1
                continue
            with open(arff_fn, 'r') as template:
                for line_num, line in enumerate(template):
                    if line_num >= 77:
                        break
                    outfn.write(line)
            break

        # Append all remaining arff files to the output file
        cur_file_num = 1
        for listed in filenames:
            arff_fn = validate_arff(indir + "/" + listed[:-1])
            if not arff_fn:
                cpt_invalid_fn += 1
                continue
            cur_file_num += 1
            sys.stdout.write("\r\tAnalysing file\t" + str(cur_file_num))
            sys.stdout.flush()
            with open(arff_fn, 'r') as fname:
                outfn.write("".join(fname.readlines()[74:77]))
        sys.stdout.write('\n')
        sys.stdout.flush()
        outfn.close()
    os.remove(tmpfilename)
    if cpt_invalid_fn:
        utils.print_warning(str(cpt_invalid_fn) + " ARFF with errors found")
    utils.print_success("Preprocessing done")
    return outfilename
def main(args):
    """
    @brief      Main entry point

    Reads ``../data/filelist.csv`` and runs ``extract_features`` on the
    folder ``<in_dir><col1>_<col2>_<col0>`` of every song whose columns 6, 7
    and 8 all contain "1" and whose id (column 0) does not appear in the
    name of a file already present in ``out_dir``.

    @param      args  object with the attributes path, in_dir and out_dir
    """
    path = utils.abs_path_dir(args.path)
    in_dir = utils.abs_path_dir(args.in_dir)
    out_dir = utils.abs_path_dir(args.out_dir)

    # Ids (3 to 9 digits) found in the names of the files already produced
    id_songs_feat_done = set()
    for filen in os.listdir(out_dir):
        if os.path.isfile(out_dir + filen):
            match = re.search(r"\d{3,9}", filen)
            if match:
                # Files without any id are simply not ours
                id_songs_feat_done.add(match.group())

    index = 0
    with open("../data/filelist.csv", "r") as filep:
        for line in filep:
            row = line[:-1].split(",")
            if len(row) < 9:
                # Blank or truncated line
                continue
            # Check if features have been extracted by YAAFE, Marsyas & Essentia
            if "1" in row[6] and "1" in row[7] and "1" in row[8]:
                if row[0] not in id_songs_feat_done:
                    folder = in_dir + row[1] + "_" + row[2] + "_" + row[0]
                    index += 1
                    print(str(index) + " " + folder)
                    extract_features(folder, out_dir, path)
def plot_isrc_country_repartition(isrc_filename="ISRC_valid.txt", img_outdir=""):
    """Save a world map coloured by the number of ISRCs of each country.

    The first two characters of every line of ``isrc_filename`` are counted
    as a country code. Country names of the Natural Earth shapes are
    translated to codes with ``src/wikipedia-iso-country-codes.csv``
    (column 0 mapped to column 1); countries without name match or without
    ISRC are drawn with the value -1. The figure is written to
    ``<img_outdir>Figure_2_ISRC_country_repartition.png``.

    :param isrc_filename: file containing one ISRC per line
    :param img_outdir: folder receiving the image
    """
    img_outdir = utils.abs_path_dir(img_outdir)

    # Gather countries' name along ISO-2 codes
    countries = {}
    with open('src/wikipedia-iso-country-codes.csv', 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            countries[row[0]] = row[1]

    # Number of ISRCs per country code, later mapped to a colour
    colors = {}
    with open(isrc_filename, "r") as filep:
        for row in filep:
            country_code = row[0:2]
            colors[country_code] = colors.get(country_code, 0) + 1

    countries_shp = shpreader.natural_earth(resolution='110m',
                                            category='cultural',
                                            name='admin_0_countries')
    fig, axe = plt.subplots(figsize=(12, 6),
                            subplot_kw={'projection': ccrs.PlateCarree()})
    norm = mpl.colors.Normalize(vmin=0,
                                vmax=float(max(list(colors.values()))))
    cmap = plt.cm.YlOrBr  # or YlGnBu

    for country in shpreader.Reader(countries_shp).records():
        country_name = country.attributes['name_long']
        # -1 when the name or its code is unknown
        color = colors.get(countries.get(country_name), -1)
        axe.add_geometries(country.geometry, ccrs.PlateCarree(),
                           facecolor=cmap(norm(color)), label=country_name)
    axe.outline_patch.set_edgecolor('white')

    cax = fig.add_axes([0.91, 0.2, 0.02, 0.6])
    mpl.colorbar.ColorbarBase(cax, cmap=cmap, norm=norm)
    plt.savefig(img_outdir + "Figure_2_ISRC_country_repartition.png")
    utils.print_success("ISRC country repartition image saved")
def merge_arff(indir, outfilename):
    """Merge the ARFF files of ``indir`` into a single ARFF file.

    The bextract program from Marsyas generates one output file per audio
    file; this function merges them all in one unique file. The first 74
    lines of the first valid file are used as header, then lines 75 to 77
    of every valid file are appended. Each file goes through
    ``validate_arff``; rejected files are counted once and reported.

    :param indir: folder containing the ARFF files
    :param outfilename: path of the merged ARFF file
    :return: ``outfilename``
    """
    utils.print_success("Preprocessing ARFFs")
    indir = utils.abs_path_dir(indir)
    filenames = os.listdir(indir)
    cpt_invalid_fn = 0
    with open(outfilename, 'w') as outfn:
        # Write first lines of ARFF template file.
        # Invalid files are not counted here: the second loop goes through
        # every file again and would count them twice.
        for filename in filenames:
            if not os.path.isfile(indir + filename):
                continue
            new_fn = validate_arff(indir + filename)
            if new_fn:
                with open(new_fn, 'r') as template:
                    for line_num, line in enumerate(template):
                        if line_num >= 74:
                            break
                        outfn.write(line)
                break

        # Append all arff file to the output file
        cur_file_num = 0
        for filename in filenames:
            if not os.path.isfile(indir + filename):
                continue
            new_fn = validate_arff(indir + filename)
            if not new_fn:
                cpt_invalid_fn += 1
                continue
            cur_file_num += 1
            utils.print_progress_start("Analysing file\t" + str(cur_file_num))
            with open(new_fn, 'r') as fname:
                outfn.write("".join(fname.readlines()[74:77]))
        utils.print_progress_end()
    # os.system("rm " + indir + "*.arff")
    if cpt_invalid_fn:
        utils.print_warning(
            str(cpt_invalid_fn) + " ARFF files with errors found")
    return outfilename
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the configured classifiers and return the test set predictions.

    The data either comes from ``data`` (dict with the keys
    "train_features", "train_groundtruths", "test_features" and
    "test_groundtruths") or, when ``data`` is None, from the ``train`` and
    ``test`` files read with ``read_file``. Only a RandomForest is enabled
    at the moment; the predictions of the last fitted classifier are
    returned.

    :param train: train file, used when ``data`` is None
    :param test: test file, used when ``data`` is None
    :param data: already loaded features and groundtruths
    :param res_dir: results folder, created if needed
    :param disp: unused
    :param outfilename: unused
    :return: predictions on the test features
    """
    utils.print_success("Comparison of differents classifiers")
    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    # Other candidates tried by the author: KNeighbors, GaussianProcess,
    # DecisionTree, MLP, AdaBoost, GaussianNB, QDA, SVM, GradientBoosting,
    # ExtraTrees, LogisticRegression, LinearDiscriminantAnalysis
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1),
    }
    for name, clf in classifiers.items():
        utils.print_success(name)
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def read_data_1(stats_dir, filen):
    """Print the mean and standard deviation of every line of a stats file.

    The file contains lines like:
        GA,0.578947368421,0.631578947368,0.710526315789,0.722222222222
        SVMBFF,0.631578947368,0.684210526316,0.815789473684,0.66666666
        VQMM,0.736842105263,0.842105263158,0.842105263158,0.75,0.61111
    The printed measure name is the second "_"-separated part of the file
    name, taken before its first ".".

    NOTE(review): ``data`` and ``names`` are filled but never returned, and
    the file is opened at ``stats_dir + filen`` after ``filen`` went through
    ``utils.abs_path_file`` - confirm both are intended.

    :param stats_dir: folder containing the stats file
    :param filen: name of the stats file
    """
    stats_dir = utils.abs_path_dir(stats_dir)
    filen = utils.abs_path_file(filen)
    data = []
    names = []
    with open(stats_dir + filen, "r") as filep:
        for line in filep:
            row = line[:-1].split(",")
            algo = row[0]
            tmp = [float(value) for value in row[1:]]
            names.extend([algo] * len(tmp))
            data.append(tmp)
            measure = filen.split(".")[0].split("_")[1].title()
            mean = "{0:.3f}".format(sum(tmp) / len(tmp))
            spread = "{0:.3f}".format(stdev(tmp))
            print(measure + " for " + algo + " \t= " + mean + " ± " + spread)
def read_files(dir_features):
    """Read the YAAFE features of database2 and average them per track.

    Every file of ``dir_features`` is read with pandas (space separated).
    When the part of its name located before the first "_" is a known
    groundtruth key, the column means are appended to the features and the
    groundtruth to the groundtruths.

    NOTE(review): the returned ``filenames`` lists every file of the folder,
    including those skipped for lack of groundtruth, so it is not
    necessarily aligned with the two other lists - confirm with the callers.

    :param dir_features: folder containing the feature files
    :return: (filenames, features, groundtruths)
    """
    utils.print_success("Preprocessing YAAFE's features (approx. 20 minutes)")
    tmp_gts = utils.read_groundtruths("groundtruths/database2.csv")
    dir_features = utils.abs_path_dir(dir_features)
    filenames = os.listdir(dir_features)
    # Working folder is created but not used in this function
    dir_tmp = utils.create_dir(utils.create_dir("tmp") + "ghosal")
    features = []
    groundtruths = []
    out_of = "/" + str(len(filenames))
    for rank, filename in enumerate(filenames, 1):
        utils.print_progress_start(str(rank) + out_of)
        # pandas is used here because it was the fastest way to read the csv
        data = pandas.read_csv(dir_features + filename, sep=" ").values
        track = filename.split("_")[0]
        if track not in tmp_gts:
            continue
        groundtruths.append(tmp_gts[track])
        nb_rows = len(data)
        features.append([sum(column) / nb_rows for column in zip(*data)])
    return filenames, features, groundtruths