示例#1
0
def initialize_feature_files(base_filename, ext, append=False):
    """Open one output file plus one CSV writer per feature extension.

    base_filename: path/filename stub; '.<extension>' is appended per feature type
    ext: list of feature-type extensions to create files for
    append: append to existing feature files instead of overwriting them

    Returns a tuple (files, writer): two dicts keyed by extension, holding
    the open file handles and the corresponding CSV writers.
    """
    # 'a' appends to existing feature files, 'w' starts fresh (overwrites)
    mode = 'a' if append else 'w'

    files = {}   # one open file handle per extension
    writer = {}  # one CSV writer per extension

    for feature_type in ext:
        handle = open(base_filename + '.' + feature_type, mode)
        files[feature_type] = handle
        writer[feature_type] = unicsv.UnicodeCSVWriter(
            handle)  #, quoting=csv.QUOTE_ALL)

    return (files, writer)
示例#2
0
    def open(self, base_filename, ext, append=False):
        '''
        Open one CSV feature file (with writer) per requested feature type.

        base_filename: path and filename stub that gets extended by . and feature extension
        ext: list of file extensions, i.e. feature types, to open files for
        append: whether to append to existing feature files (or overwrite)
        '''

        import unicsv  # unicode csv library (installed via pip install unicsv)

        self.ext = ext  # remember which feature types were opened

        # append to existing feature files, or write new ones (overwriting)
        if append:
            mode = 'a'
        else:
            mode = 'w'

        self.files = {}   # one open file handle per extension
        self.writer = {}  # one CSV writer per extension

        for feature_type in ext:
            out_name = base_filename + '.' + feature_type
            # note: open() here is the builtin, not this method (class scope
            # is not searched from inside method bodies)
            self.files[feature_type] = open(out_name, mode)
            self.writer[feature_type] = unicsv.UnicodeCSVWriter(
                self.files[feature_type])  #, quoting=csv.QUOTE_ALL)

        self.isopen = True
示例#3
0
def extract_all_files(filelist,
                      path,
                      out_file=None,
                      feature_types=['rp', 'ssd', 'rh'],
                      label=False,
                      append=False,
                      no_extension_check=False,
                      force_resampling=None,
                      out_HDF5=False,
                      log_AudioTypes=True,
                      log_Errors=True,
                      verbose=True):
    """
    Extract the selected RP feature types from every audio file in filelist,
    and either write them to separate CSV/HDF5 feature files (one per feature
    type, when out_file is given) or accumulate and return them in memory.

    # filelist: list of files for features to be extracted
    # path: absolute path that will be added at beginning of filelist (can be '')
    # out_file: output file name stub for feature files to write (if omitted, features will be returned from function)
    # feature_types: RP feature types to extract. see rp_extract.py
    #     NOTE(review): mutable default list -- harmless only as long as it is
    #     never mutated inside this function (it is only read here)
    # label: use subdirectory name as class label
    # append: whether to append to existing feature files (or overwrite)
    # no_extension_check: does not check file format via extension. means that decoder is called on ALL files.
    # force_resampling: force a target sampling rate (provided in Hz) when decoding (works with FFMPEG only!)
    # out_HDF5: whether to store as HDF5 file format (otherwise CSV)
    # log_AudioTypes: write '<out_file>.audiotypes.log' CSV with per-file audio metadata
    # log_Errors: write '<out_file>.errors.log' CSV with one row per failed file
    # verbose: print per-file progress including an ETA estimate

    Returns (filelist_extracted, feat_array) only when out_file is None;
    feat_array is a dict mapping feature type -> 2D array (one row per file).
    """

    ext = feature_types

    n = 0  # counting the files being processed
    n_extracted = 0  # counting the files that were actually analyzed
    err = 0  # counting errors
    n_files = len(filelist)

    # initialize filelist_extracted and dict containing all accumulated feature arrays
    filelist_extracted = []
    feat_array = {}
    audio_logwriter = None
    error_logwriter = None
    audio_logwriter_wrote_header = False

    start_time = time.time()

    if out_file:  # only if out_file is specified

        if log_AudioTypes:
            log_filename = out_file + '.audiotypes.log'
            audio_logfile = open(log_filename,
                                 'w')  # TODO allow append mode 'a'
            audio_logwriter = unicsv.UnicodeCSVWriter(
                audio_logfile)  #, quoting=csv.QUOTE_ALL)

        if log_Errors:
            err_log_filename = out_file + '.errors.log'
            error_logfile = open(err_log_filename,
                                 'w')  # TODO allow append mode 'a'
            error_logwriter = unicsv.UnicodeCSVWriter(
                error_logfile)  #, quoting=csv.QUOTE_ALL)

        if out_HDF5:
            # HDF5 file is NOT opened here: it needs the feature vector
            # dimensions, which are only known after the first extraction
            # (see the FeatureWriter.open call inside the loop below)
            FeatureWriter = HDF5FeatureWriter()
        else:
            FeatureWriter = CSVFeatureWriter()
            FeatureWriter.open(out_file, ext, append=append)

    for fil in filelist:  # iterate over all files
        try:
            # estimate remaining time from average time per file so far
            if n > 0:
                elaps_time = time.time() - start_time
                remain_time = elaps_time * n_files / n - elaps_time  # n is the number of files done here
            else:
                remain_time = None  # no estimate before the first file

            n += 1

            if path:
                filename = path + os.sep + fil
            else:
                filename = fil
            if verbose:
                print '#', n, '/', n_files, '(ETA: ' + timestr(
                    remain_time) + "):", filename

            # read audio file (wav or mp3)
            samplerate, samplewidth, data, decoder = audiofile_read(
                filename,
                verbose=verbose,
                include_decoder=True,
                no_extension_check=no_extension_check,
                force_resampling=force_resampling)

            # audio file info
            if verbose:
                print samplerate, "Hz,", data.shape[
                    1], "channel(s),", data.shape[0], "samples"

            # extract features
            # Note: the True/False flags are determined by checking if a feature is listed in 'ext' (see settings above)

            feat = rp.rp_extract(
                data,
                samplerate,
                extract_rp=('rp' in ext),  # extract Rhythm Patterns features
                extract_ssd=(
                    'ssd' in ext),  # extract Statistical Spectrum Descriptor
                extract_tssd=(
                    'tssd' in ext
                ),  # extract temporal Statistical Spectrum Descriptor
                extract_rh=('rh' in ext),  # extract Rhythm Histogram features
                extract_trh=(
                    'trh'
                    in ext),  # extract temporal Rhythm Histogram features
                extract_mvd=(
                    'mvd' in ext
                ),  # extract Modulation Frequency Variance Descriptor
                spectral_masking=True,
                transform_db=True,
                transform_phon=True,
                transform_sone=True,
                fluctuation_strength_weighting=True,
                skip_leadin_fadeout=1,
                step_width=1,
                verbose=verbose)

            # TODO check if ext and feat.keys are consistent

            # WHAT TO USE AS ID (based on filename): 3 choices:
            id = fil  # rel. filename as from find_files (NOTE: shadows builtin id())
            # id = filename   # full filename incl. full path
            # id = filename[len(path)+1:] # relative filename only (extracted from path)

            if out_file:
                # WRITE each feature set to a CSV or HDF5 file

                id2 = None

                if label:
                    # class label = name of the file's parent directory
                    id2 = id.replace("\\", "/").split("/")[-2].strip()

                if out_HDF5 and n_extracted == 0:
                    # for HDF5 we need to know the vector dimension
                    # thats why we cannot open the file earlier
                    FeatureWriter.open(
                        out_file, ext, feat, append=append
                    )  # append not working for now but possibly in future

                FeatureWriter.write_features(id, feat, id2)
            else:
                # IN MEMORY: add the extracted features for 1 file to the array dict accumulating all files
                # TODO: only if we don't have out_file? maybe we want this as a general option

                if feat_array == {}:  # for first file, initialize empty array with dimension of the feature set
                    for e in feat.keys():
                        feat_array[e] = np.empty((0, feat[e].shape[0]))

                # store features in array (one row appended per file)
                for e in feat.keys():
                    feat_array[e] = np.append(
                        feat_array[e], feat[e].reshape(1, -1), axis=0
                    )  # 1 for horizontal vector, -1 means take original dimension

                filelist_extracted.append(id)

            n_extracted += 1

            # write list of analyzed audio files alongsize audio metadata (kHz, bit, etc.)
            if audio_logwriter:
                if not audio_logwriter_wrote_header:  # write CSV header
                    log_info = [
                        "filename", "decoder", "samplerate (kHz)",
                        "samplewidth (bit)", "n channels", "n samples"
                    ]
                    audio_logwriter.writerow(log_info)
                    audio_logwriter_wrote_header = True

                log_info = [
                    filename, decoder, samplerate, samplewidth * 8,
                    data.shape[1], data.shape[0]
                ]
                audio_logwriter.writerow(log_info)

            gc.collect(
            )  # after every file we do garbage collection, otherwise our memory is used up quickly for some reason

        except Exception as e:
            # per-file errors are logged and counted but do not abort the batch
            print "ERROR analysing file: " + fil + ": " + str(e)
            err += 1
            if error_logwriter:
                error_logwriter.writerow([fil, str(e)])

    try:
        if out_file:  # close all output files
            FeatureWriter.close()

            if audio_logwriter:
                audio_logfile.close()

        if error_logwriter:
            error_logfile.close()

    except Exception as e:
        print "ERROR closing the output or log files: " + str(e)

    end_time = time.time()

    if verbose:
        print "FEATURE EXTRACTION FINISHED.", n, "file(s) processed,", n_extracted, "successful. Duration:", timestr(
            end_time - start_time)
        if err > 0:
            print err, "file(s) had ERRORs during feature extraction.",
            if log_Errors:
                # NOTE(review): if out_file is None, err_log_filename was never
                # assigned and this line would raise NameError -- confirm callers
                # always pass out_file when log_Errors is True
                print "See", err_log_filename
            else:
                print
        if out_file:
            opt_ext = '.h5' if out_HDF5 else ''
            print "Feature file(s):", out_file + "." + str(ext) + opt_ext

    if out_file is None:
        # in-memory mode: return list of processed ids and accumulated features
        return filelist_extracted, feat_array
                    if int(district[0:2]) in d:
                        results.append([
                            county, row[1], row[2], row[3], row[4], row[5],
                            row[6], office, district, party,
                            cand.split(' (')[0].replace('  ', ' '), votes
                        ])
                elif office == 'State Senator':
                    d = [
                        int(x['district']) for x in county_districts
                        if x['office'] == 'State Senate'
                    ]
                    if int(district[0:2]) in d:
                        results.append([
                            county, row[1], row[2], row[3], row[4], row[5],
                            row[6], office, district, party,
                            cand.split(' (')[0].replace('  ', ' '), votes
                        ])
                elif office == 'President' or office == 'U.S. Senator':
                    results.append([
                        county, row[1], row[2], row[3], row[4], row[5], row[6],
                        office, district, party,
                        cand.split(' (')[0].replace('  ', ' '), votes
                    ])
                else:
                    continue

        with open(filename, 'wb') as outfile:
            writer = unicsv.UnicodeCSVWriter(outfile)
            writer.writerow(fixed_cols)
            writer.writerows(results)