def initialize_feature_files(base_filename, ext, append=False):
    '''Open one output CSV file and one CSV writer per feature extension.

    base_filename: path + filename stub; each output file is base_filename + '.' + extension
    ext: list of file extensions, i.e. feature types, to open files for
    append: whether to append to existing feature files (or overwrite)

    Returns (files, writer): two dicts keyed by extension, holding the open
    file handles and the corresponding unicsv.UnicodeCSVWriter objects.
    NOTE: callers are responsible for closing the returned file handles.
    '''
    files = {}   # dict of one file handle per extension
    writer = {}  # dict of one CSV writer per extension (fixed: comment previously said 'files')

    # append or write new (will overwrite) -- same idiom as the writer class's open() method
    mode = 'a' if append else 'w'

    for e in ext:
        filename = base_filename + '.' + e
        files[e] = open(filename, mode)
        writer[e] = unicsv.UnicodeCSVWriter(files[e])  #, quoting=csv.QUOTE_ALL)

    return (files, writer)
def open(self, base_filename, ext, append=False):
    '''Open one feature file and CSV writer per feature extension.

    base_filename: path and filename that will be extended by . and feature extension
    ext: list of file extensions i.e. feature types to open files for
    append: whether to append to existing feature files (or overwrite)
    '''
    import unicsv  # unicode csv library (installed via pip install unicsv)

    # append or write new (will overwrite)
    if append:
        mode = 'a'
    else:
        mode = 'w'

    self.ext = ext      # keep extensions
    self.files = {}     # one file handle per extension
    self.writer = {}    # one file writer per extension

    for feature_type in ext:
        handle = open(base_filename + '.' + feature_type, mode)
        self.files[feature_type] = handle
        self.writer[feature_type] = unicsv.UnicodeCSVWriter(handle)  #, quoting=csv.QUOTE_ALL)

    self.isopen = True
def extract_all_files(filelist, path, out_file=None, feature_types=['rp', 'ssd', 'rh'], label=False, append=False, no_extension_check=False, force_resampling=None, out_HDF5=False, log_AudioTypes=True, log_Errors=True, verbose=True):
    """ finds all files of a certain type (e.g. .wav and/or .mp3) in a path and all sub-directories in it
    extracts selected RP feature types and saves them into separate CSV feature files (one per feature type)

    # filelist: list of files for features to be extracted
    # path: absolute path that will be added at beginning of filelist (can be '')
    # out_file: output file name stub for feature files to write (if omitted, features will be returned from function)
    # feature_types: RP feature types to extract. see rp_extract.py
    # label: use subdirectory name as class label
    # no_extension_check: does not check file format via extension. means that decoder is called on ALL files.
    # force_resampling: force a target sampling rate (provided in Hz) when decoding (works with FFMPEG only!)
    # out_HDF5: whether to store as HDF5 file format (otherwise CSV)
    """
    # NOTE: default mutable arg feature_types=[...] is shared across calls; it is only read here, never mutated.
    ext = feature_types

    n = 0            # counting the files being processed
    n_extracted = 0  # counting the files that were actually analyzed
    err = 0          # counting errors
    n_files = len(filelist)

    # initialize filelist_extracted and dict containing all accumulated feature arrays
    filelist_extracted = []
    feat_array = {}

    audio_logwriter = None
    error_logwriter = None
    audio_logwriter_wrote_header = False

    start_time = time.time()

    if out_file:  # only if out_file is specified

        if log_AudioTypes:
            # log of per-file audio metadata (decoder, samplerate, channels, ...)
            log_filename = out_file + '.audiotypes.log'
            audio_logfile = open(log_filename, 'w')  # TODO allow append mode 'a'
            audio_logwriter = unicsv.UnicodeCSVWriter(audio_logfile)  #, quoting=csv.QUOTE_ALL)

        if log_Errors:
            # log of files that raised exceptions during analysis
            err_log_filename = out_file + '.errors.log'
            error_logfile = open(err_log_filename, 'w')  # TODO allow append mode 'a'
            error_logwriter = unicsv.UnicodeCSVWriter(error_logfile)  #, quoting=csv.QUOTE_ALL)

        if out_HDF5:
            FeatureWriter = HDF5FeatureWriter()
        else:
            FeatureWriter = CSVFeatureWriter()
            # CSV files can be opened right away; HDF5 is opened lazily below
            # because it needs to know the feature vector dimension first.
            FeatureWriter.open(out_file, ext, append=append)

    for fil in filelist:  # iterate over all files
        try:
            if n > 0:
                # simple ETA estimate based on average time per file so far
                elaps_time = time.time() - start_time
                remain_time = elaps_time * n_files / n - elaps_time  # n is the number of files done here
            else:
                remain_time = None

            n += 1

            if path:
                filename = path + os.sep + fil
            else:
                filename = fil

            if verbose:
                print '#', n, '/', n_files, '(ETA: ' + timestr(remain_time) + "):", filename

            # read audio file (wav or mp3)
            samplerate, samplewidth, data, decoder = audiofile_read(filename, verbose=verbose, include_decoder=True, no_extension_check=no_extension_check, force_resampling=force_resampling)

            # audio file info
            if verbose:
                print samplerate, "Hz,", data.shape[1], "channel(s),", data.shape[0], "samples"

            # extract features
            # Note: the True/False flags are determined by checking if a feature is listed in 'ext' (see settings above)
            feat = rp.rp_extract(data,
                                 samplerate,
                                 extract_rp=('rp' in ext),      # extract Rhythm Patterns features
                                 extract_ssd=('ssd' in ext),    # extract Statistical Spectrum Descriptor
                                 extract_tssd=('tssd' in ext),  # extract temporal Statistical Spectrum Descriptor
                                 extract_rh=('rh' in ext),      # extract Rhythm Histogram features
                                 extract_trh=('trh' in ext),    # extract temporal Rhythm Histogram features
                                 extract_mvd=('mvd' in ext),    # extract Modulation Frequency Variance Descriptor
                                 spectral_masking=True,
                                 transform_db=True,
                                 transform_phon=True,
                                 transform_sone=True,
                                 fluctuation_strength_weighting=True,
                                 skip_leadin_fadeout=1,
                                 step_width=1,
                                 verbose=verbose)

            # TODO check if ext and feat.keys are consistent

            # WHAT TO USE AS ID (based on filename): 3 choices:
            id = fil  # rel. filename as from find_files  (NOTE: shadows builtin id() within this function)
            # id = filename                # full filename incl. full path
            # id = filename[len(path)+1:]  # relative filename only (extracted from path)

            if out_file:
                # WRITE each feature set to a CSV or HDF5 file
                id2 = None
                if label:
                    # class label = name of the file's parent directory
                    id2 = id.replace("\\", "/").split("/")[-2].strip()
                if out_HDF5 and n_extracted == 0:
                    # for HDF5 we need to know the vector dimension
                    # thats why we cannot open the file earlier
                    FeatureWriter.open(out_file, ext, feat, append=append)  # append not working for now but possibly in future
                FeatureWriter.write_features(id, feat, id2)
            else:
                # IN MEMORY: add the extracted features for 1 file to the array dict accumulating all files
                # TODO: only if we don't have out_file? maybe we want this as a general option
                if feat_array == {}:
                    # for first file, initialize empty array with dimension of the feature set
                    for e in feat.keys():
                        feat_array[e] = np.empty((0, feat[e].shape[0]))
                # store features in array
                for e in feat.keys():
                    feat_array[e] = np.append(feat_array[e], feat[e].reshape(1, -1), axis=0)  # 1 for horizontal vector, -1 means take original dimension

            filelist_extracted.append(id)
            n_extracted += 1

            # write list of analyzed audio files alongsize audio metadata (kHz, bit, etc.)
            if audio_logwriter:
                if not audio_logwriter_wrote_header:
                    # write CSV header
                    log_info = ["filename", "decoder", "samplerate (kHz)", "samplewidth (bit)", "n channels", "n samples"]
                    audio_logwriter.writerow(log_info)
                    audio_logwriter_wrote_header = True
                # samplewidth is in bytes, hence * 8 for bits
                log_info = [filename, decoder, samplerate, samplewidth * 8, data.shape[1], data.shape[0]]
                audio_logwriter.writerow(log_info)

            gc.collect()  # after every file we do garbage collection, otherwise our memory is used up quickly for some reason

        except Exception as e:
            # best-effort batch processing: log the error and continue with the next file
            print "ERROR analysing file: " + fil + ": " + str(e)
            err += 1
            if error_logwriter:
                error_logwriter.writerow([fil, str(e)])

    try:
        if out_file:  # close all output files
            FeatureWriter.close()
            if audio_logwriter:
                audio_logfile.close()
            if error_logwriter:
                error_logfile.close()
    except Exception as e:
        print "ERROR closing the output or log files: " + str(e)

    end_time = time.time()

    if verbose:
        print "FEATURE EXTRACTION FINISHED.", n, "file(s) processed,", n_extracted, "successful. Duration:", timestr(end_time - start_time)

        if err > 0:
            print err, "file(s) had ERRORs during feature extraction.",  # trailing comma: continue on same line
            if log_Errors:
                print "See", err_log_filename
            else:
                print

        if out_file:
            opt_ext = '.h5' if out_HDF5 else ''
            print "Feature file(s):", out_file + "." + str(ext) + opt_ext

    # features are returned in memory only when no output file was requested
    if out_file is None:
        return filelist_extracted, feat_array
if int(district[0:2]) in d: results.append([ county, row[1], row[2], row[3], row[4], row[5], row[6], office, district, party, cand.split(' (')[0].replace(' ', ' '), votes ]) elif office == 'State Senator': d = [ int(x['district']) for x in county_districts if x['office'] == 'State Senate' ] if int(district[0:2]) in d: results.append([ county, row[1], row[2], row[3], row[4], row[5], row[6], office, district, party, cand.split(' (')[0].replace(' ', ' '), votes ]) elif office == 'President' or office == 'U.S. Senator': results.append([ county, row[1], row[2], row[3], row[4], row[5], row[6], office, district, party, cand.split(' (')[0].replace(' ', ' '), votes ]) else: continue with open(filename, 'wb') as outfile: writer = unicsv.UnicodeCSVWriter(outfile) writer.writerow(fixed_cols) writer.writerows(results)