def get_statistics_independently(arff_file):
    """Split an ARFF feature file into one ARFF file per statistic suffix.

    For each known statistic name (``max``, ``min``, ``mean``, ...), the
    columns whose attribute name ends with that statistic are extracted,
    the class-label column is re-appended, and the result is written out
    via ``am.create_arff`` next to the input file (or into the current
    working directory when ``arff_file`` has no folder component).

    Parameters
    ----------
    arff_file : str
        Path to the input ARFF file, read with ``am.arff_to_nparray``.
    """
    matrix, labels, relation, attributes = am.arff_to_nparray(arff_file)
    classes = list(set(labels))
    labels = labels.reshape(-1, 1)  # column vector so it concatenates with matrix
    folder, name = os.path.split(arff_file)
    if folder == "":
        folder = os.getcwd()
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]
    for stat in stats_names:
        subname = name.replace(".arff", "_%s" % stat)
        # enumerate() instead of attributes.index(attribute): with duplicate
        # attribute names, .index() always returned the FIRST occurrence, so
        # the wrong columns could be extracted (and each lookup was O(n)).
        indices = [i for i, attribute in enumerate(attributes)
                   if attribute.endswith(stat)]
        submatrix = np.concatenate((matrix[:, indices], labels), axis=-1)
        subheader = np.concatenate(
            (np.array(attributes)[indices], np.array(["Class"])),
            axis=-1).reshape(1, -1)
        am.create_arff(
            np.concatenate((subheader, submatrix), axis=0).tolist(),
            classes, folder, subname, subname)
def get_statistics(features_folder, output_path=None, statistics=None):
    """Compute per-file statistics over class-foldered CSVs and emit an ARFF.

    Walks ``features_folder`` (one sub-directory per class, each holding
    headerless CSV files), computes the requested column-wise statistics for
    every file, and writes one ARFF dataset plus a ``.txt`` manifest listing
    the analyzed files.

    Parameters
    ----------
    features_folder : str
        Root folder: one sub-directory per class with ``.csv`` files inside.
    output_path : str, optional
        Output path prefix; defaults to
        ``features_folder/<basename of features_folder>``.
    statistics : list of str, optional
        Statistic names to compute; defaults to ``["mean"]``.  Valid names:
        max, min, mean, median, std, var, kurt, skew, percentile25,
        percentile50, percentile75.

    Raises
    ------
    ValueError
        If an unknown statistic name is requested (previously an unknown
        name silently reused the prior loop iteration's values, or raised
        NameError, while still emitting header entries).
    """
    # Default changed from the mutable literal ["mean"] to None (shared
    # mutable-default anti-pattern); effective behavior is unchanged.
    if statistics is None:
        statistics = ["mean"]
    # Dispatch table replaces the long if/elif chain and lets us fail loudly
    # on a typo instead of producing silently wrong output.
    stat_funcs = {
        "max": lambda v: np.nanmax(v, axis=0),
        "min": lambda v: np.nanmin(v, axis=0),
        "mean": lambda v: np.nanmean(v, axis=0),
        "median": lambda v: np.nanmedian(v, axis=0),
        "std": lambda v: np.nanstd(v, axis=0),
        "var": lambda v: np.nanvar(v, axis=0),
        "kurt": lambda v: scipy.stats.kurtosis(v, axis=0),
        "skew": lambda v: scipy.stats.skew(v, axis=0),
        "percentile25": lambda v: np.nanpercentile(v, 25, axis=0),
        "percentile50": lambda v: np.nanpercentile(v, 50, axis=0),
        "percentile75": lambda v: np.nanpercentile(v, 75, axis=0),
    }
    unknown = [s for s in statistics if s not in stat_funcs]
    if unknown:
        raise ValueError("Unknown statistic(s): %s" % ", ".join(unknown))
    if output_path is None:
        output_path = os.path.join(features_folder,
                                   os.path.split(features_folder)[1])
    features = os.path.split(features_folder)[1]
    analyzed_files = []
    matrix = []
    header = []  # rebuilt per file; the last build becomes the ARFF header
    # Class names = the non-hidden sub-directories, sorted case-insensitively.
    classes = sorted([
        f for f in os.listdir(features_folder)
        if os.path.isdir(os.path.join(features_folder, f))
        and not f.startswith('.')
    ], key=lambda f: f.lower())
    for class_name in classes:
        files = sorted([
            f for f in os.listdir(os.path.join(features_folder, class_name))
            if os.path.isfile(os.path.join(features_folder, class_name, f))
            and not f.startswith('.') and f[-4:].lower() == ".csv"
        ], key=lambda f: f.lower())
        analyzed_files += ["%s,%s" % (file, class_name) for file in files]
        for feat_file in files:
            df = pandas.read_csv(
                os.path.join(features_folder, class_name, feat_file),
                header=None)
            # CSVs are headerless, so columns are numbered; prefix them with
            # the folder name to get stable attribute names.
            feature_names = ["%s_%s" % (features, num)
                             for num in df.columns.values]
            vals = df.values
            header = []
            data = []
            for statistic in statistics:
                data.append(stat_funcs[statistic](vals))
                header += ["%s_%s" % (name, statistic)
                           for name in feature_names]
            instance = np.concatenate(tuple(data), axis=-1).tolist()
            instance.append(class_name)
            matrix.append(instance)
            print("%s analyzed." % feat_file)
    header.append("Class")
    matrix = [header] + matrix
    am.create_arff(
        matrix, classes,
        os.path.split(output_path)[0],
        os.path.split(output_path)[1] + "_%s" % "_".join(statistics),
        os.path.split(output_path)[1] + "_statistics")
    print("Statistics from %s obtained." % os.path.split(output_path)[1])
    with open(output_path + ".txt", "w+") as files:
        files.write("\n".join(analyzed_files))
def get_statistics_per_category(databaseFolder, processedDataFolder=None):
    """Per-category statistics over visual-feature CSVs, written as ARFF.

    For each feature category (gaze, eye landmarks, head pose, facial
    landmarks, AU intensity, AU presence) this selects the matching columns
    of every CSV under ``databaseFolder/<class>/``, computes eleven
    statistics per selected column, and writes one ARFF file (plus a
    ``.txt`` manifest of the analyzed files) per category into
    ``processedDataFolder`` via ``am.create_arff``.

    Parameters
    ----------
    databaseFolder : str
        Root folder with one sub-directory per class, each holding CSVs
        with a header row (read with ``header='infer'``).
    processedDataFolder : str, optional
        Output folder; defaults to ``"datasets/visual"``.
    """
    if processedDataFolder == None:
        processedDataFolder = "datasets/visual"
    # Class names = the non-hidden sub-directories, sorted case-insensitively.
    classes = sorted([
        f for f in os.listdir(databaseFolder)
        if os.path.isdir(os.path.join(databaseFolder, f))
        and not f.startswith('.')
    ], key=lambda f: f.lower())
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]
    # Category -> selector strings.  A column belongs to a category when its
    # stripped, lower-cased name starts OR ends with any selector string.
    categoryDictionary = {
        "gaze": ["gaze_"],
        "eye_landmarks": ["eye_lmk_"],
        "head": ["pose_"],
        "facial_landmarks": ["x_", "y_"],
        "au_intensity": ["_r"],
        "au_presence": ["_c"]
    }
    for category in categoryDictionary.keys():
        # startFlag stays True only until the first file of this category is
        # processed: the header row (mm_names) is built from that file alone.
        startFlag = True
        analyzedFiles = []
        for className in classes:
            files = sorted([
                f for f in os.listdir(os.path.join(databaseFolder, className))
                if os.path.isfile(os.path.join(databaseFolder, className, f))
                and not f.startswith('.') and f[-4:].lower() == ".csv"
            ], key=lambda f: f.lower())
            analyzedFiles += ["%s,%s" % (file, className) for file in files]
            for feat_file in files:
                mm_feats = []
                mm_names = []
                df = pandas.read_csv(os.path.join(databaseFolder, className,
                                                  feat_file), header='infer')
                feature_names = df.columns.values
                for feat in feature_names:
                    reference = categoryDictionary.get(category)
                    for string in reference:
                        if feat.strip().lower().startswith(string) \
                                or feat.strip().lower().endswith(string):
                            # Feature vector
                            vals = df[feat].values
                            # Run statistics (NaN-tolerant where numpy
                            # provides a nan* variant)
                            maximum = np.nanmax(vals)
                            minimum = np.nanmin(vals)
                            mean = np.nanmean(vals)
                            median = np.nanmedian(vals)
                            std = np.nanstd(vals)
                            var = np.nanvar(vals)
                            kurt = scipy.stats.kurtosis(vals)
                            skew = scipy.stats.skew(vals)
                            percentile25 = np.nanpercentile(vals, 25)
                            percentile50 = np.nanpercentile(vals, 50)
                            percentile75 = np.nanpercentile(vals, 75)
                            # names and feats are kept in stats_names order.
                            names = [
                                feat.strip() + "_" + stat
                                for stat in stats_names
                            ]
                            feats = [
                                maximum, minimum, mean, median, std, var,
                                kurt, skew, percentile25, percentile50,
                                percentile75
                            ]
                            if startFlag:
                                for n in names:
                                    mm_names.append(n)
                            for f in feats:
                                mm_feats.append(f)
                            # Count each column once even if it matches
                            # several selector strings.
                            break
                if startFlag:
                    matrix = [mm_names + ["Class"]]
                    startFlag = False
                matrix.append(mm_feats + [className])
        am.create_arff(matrix, classes, processedDataFolder, category,
                       category)
        print("Analysis of %s acquired." % (category))
        with open(os.path.join(processedDataFolder, "%s.txt" % (category)),
                  "w+") as files:
            files.write("\n".join(analyzedFiles))
def get_statistics(databaseFolder, processedDataFolder=None,
                   outputFileName=None, relationName=None):
    """Compute all eleven statistics for every OpenFace feature column.

    Reads every CSV under ``databaseFolder/<class>/`` (header row inferred),
    skips the first five bookkeeping columns, computes eleven statistics per
    remaining column, and writes a single ARFF dataset plus a ``.txt``
    manifest of the analyzed files.

    Parameters
    ----------
    databaseFolder : str
        Root folder with one sub-directory per class.
    processedDataFolder : str, optional
        Output folder; defaults to ``"datasets/visual"``.
    outputFileName : str, optional
        Base name of the output files; defaults to ``"all"``.
    relationName : str, optional
        ARFF relation name; defaults to ``"all_visual"``.
    """
    if processedDataFolder is None:
        processedDataFolder = "datasets/visual"
    if outputFileName is None:
        outputFileName = "all"
    if relationName is None:
        relationName = "all_visual"
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]
    # Non-hidden class sub-directories, sorted case-insensitively.
    classes = sorted(
        (d for d in os.listdir(databaseFolder)
         if os.path.isdir(os.path.join(databaseFolder, d))
         and not d.startswith('.')),
        key=str.lower)
    header_pending = True  # the ARFF header is built from the first file only
    analyzedFiles = []
    for className in classes:
        class_dir = os.path.join(databaseFolder, className)
        csv_files = sorted(
            (f for f in os.listdir(class_dir)
             if os.path.isfile(os.path.join(class_dir, f))
             and not f.startswith('.') and f[-4:].lower() == ".csv"),
            key=str.lower)
        analyzedFiles.extend("%s,%s" % (f, className) for f in csv_files)
        for feat_file in csv_files:
            frame = pandas.read_csv(os.path.join(class_dir, feat_file),
                                    header='infer')
            row_names = []
            row_feats = []
            # The first five columns are skipped (not statistical features).
            for feat in frame.columns.values[5:]:
                vals = frame[feat].values
                # One value per entry of stats_names, in the same order.
                stat_values = [
                    np.nanmax(vals), np.nanmin(vals), np.nanmean(vals),
                    np.nanmedian(vals), np.nanstd(vals), np.nanvar(vals),
                    scipy.stats.kurtosis(vals), scipy.stats.skew(vals),
                    np.nanpercentile(vals, 25), np.nanpercentile(vals, 50),
                    np.nanpercentile(vals, 75),
                ]
                if header_pending:
                    row_names.extend(feat.strip() + "_" + stat
                                     for stat in stats_names)
                row_feats.extend(stat_values)
            if header_pending:
                matrix = [row_names + ["Class"]]
                header_pending = False
            matrix.append(row_feats + [className])
    am.create_arff(matrix, classes, processedDataFolder, outputFileName,
                   relationName)
    print("Analysis of all OpenFace features acquired.")
    with open(os.path.join(processedDataFolder, outputFileName + ".txt"),
              "w+") as files:
        files.write("\n".join(analyzedFiles))
def get_statistics_per_category(databaseFolder, processedDataFolder=None):
    """Per-category statistics over acoustic-feature CSVs, written as ARFF.

    For each acoustic category (voice, glottal flow, MCEP, HMPDM, HMPDD)
    this selects the matching columns of every CSV under
    ``databaseFolder/<class>/``, computes eleven statistics per selected
    column, sanitizes non-finite values (inf -> its sign, NaN -> 0), and
    writes one ARFF file plus a ``.txt`` manifest per category.

    Parameters
    ----------
    databaseFolder : str
        Root folder with one sub-directory per class, each holding CSVs
        with a header row (read with ``header='infer'``).
    processedDataFolder : str, optional
        Output folder; defaults to ``"datasets/acousticic"``.
    """
    if processedDataFolder is None:
        # NOTE(review): "acousticic" looks like a typo for "acoustic", but it
        # is a runtime path other code may rely on — preserved as-is.
        processedDataFolder = "datasets/acousticic"
    # Non-hidden class sub-directories, sorted case-insensitively.
    classes = sorted(
        (d for d in os.listdir(databaseFolder)
         if os.path.isdir(os.path.join(databaseFolder, d))
         and not d.startswith('.')),
        key=str.lower)
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]
    # Category -> selector strings; a column belongs to a category when its
    # stripped, lower-cased name starts OR ends with any selector string.
    categoryDictionary = {
        "voice": ["f0", "vuv"],
        "glottal_flow": ["naq", "qoq", "h1h2", "psp", "mdq", "peakslope",
                         "rd", "creak"],
        "mcep": ["mcep_"],
        "hmpdm": ["hmpdm_"],
        "hmpdd": ["hmpdd_"],
    }
    for category, selectors in categoryDictionary.items():
        header_pending = True  # header row comes from the first file only
        analyzedFiles = []
        for className in classes:
            class_dir = os.path.join(databaseFolder, className)
            csv_files = sorted(
                (f for f in os.listdir(class_dir)
                 if os.path.isfile(os.path.join(class_dir, f))
                 and not f.startswith('.') and f[-4:].lower() == ".csv"),
                key=str.lower)
            analyzedFiles.extend("%s,%s" % (f, className) for f in csv_files)
            for feat_file in csv_files:
                frame = pandas.read_csv(os.path.join(class_dir, feat_file),
                                        header='infer')
                row_names = []
                row_feats = []
                for feat in frame.columns.values:
                    token = feat.strip().lower()
                    # Each matching column is processed exactly once, even
                    # when it matches several selector strings.
                    if not any(token.startswith(s) or token.endswith(s)
                               for s in selectors):
                        continue
                    vals = frame[feat].values
                    # One value per entry of stats_names, in the same order.
                    stat_values = [
                        np.nanmax(vals), np.nanmin(vals), np.nanmean(vals),
                        np.nanmedian(vals), np.nanstd(vals), np.nanvar(vals),
                        scipy.stats.kurtosis(vals), scipy.stats.skew(vals),
                        np.nanpercentile(vals, 25),
                        np.nanpercentile(vals, 50),
                        np.nanpercentile(vals, 75),
                    ]
                    if header_pending:
                        row_names.extend(feat.strip() + "_" + stat
                                         for stat in stats_names)
                    # Sanitize non-finite values so the ARFF stays numeric:
                    # +/-inf becomes +/-1, NaN becomes 0.
                    for value in stat_values:
                        if np.isinf(value):
                            row_feats.append(np.sign(value))
                        elif np.isnan(value):
                            row_feats.append(0)
                        else:
                            row_feats.append(value)
                if header_pending:
                    matrix = [row_names + ["Class"]]
                    header_pending = False
                matrix.append(row_feats + [className])
        am.create_arff(matrix, classes, processedDataFolder, category,
                       category)
        print("Analysis of %s acquired." % (category))
        with open(os.path.join(processedDataFolder, "%s.txt" % (category)),
                  "w+") as files:
            files.write("\n".join(analyzedFiles))