def remove_and_correct_outliers(data):
    """Replace per-feature outliers with the mean of the remaining values.

    For every feature index ``i`` the values of all utterances are gathered
    and any value lying more than ``factor`` scaled median absolute
    deviations (MAD) away from the median is treated as an outlier.
    Outliers (and values that were already NaN) are replaced by the mean of
    the remaining values; every value is then rounded to 6 decimals and
    written back to ``data[j].values[i]``.

    Args:
        data: Sequence of utterance objects exposing a mutable ``values``
            sequence. It is modified in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    # Scale factor that turns the MAD into an estimate of the standard
    # deviation when the data is normally distributed.
    b_constant = 1.4826
    # Half-width of the accepted band, in scaled MADs (3 was tried before).
    factor = 10

    # An empty data set has no features to process.
    num_features = len(data[0].values) if len(data) else 0

    for i in range(num_features):
        # The helper returns the values of feature i split into two groups
        # (stress / no stress); both groups are needed together here.
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)
        # NOTE(review): the write-back below assumes the concatenated order
        # matches the order of `data` - confirm against the helper.
        f_vals = np.array(d_s + d_ns, dtype=float)
        if f_vals.size == 0:
            continue

        median = np.median(f_vals)
        # np.abs instead of sqrt(x ** 2): same result without the risk of
        # overflow/underflow in the intermediate square.
        med_abs_deviation = np.median(np.abs(f_vals - median))
        threshold = med_abs_deviation * b_constant
        max_range = median + threshold * factor
        min_range = median - threshold * factor

        # Comparisons against NaN are False, exactly as in a scalar loop;
        # errstate only silences the warning older NumPy versions emit.
        with np.errstate(invalid='ignore'):
            outliers = (f_vals < min_range) | (f_vals > max_range)
        f_vals[outliers] = np.nan

        # Mean imputation over the non-missing values. If every value is
        # missing there is nothing to impute from, so the NaNs are kept.
        missing = np.isnan(f_vals)
        if missing.any() and not missing.all():
            f_vals[missing] = f_vals[~missing].mean()

        for j, value in enumerate(f_vals):
            data[j].values[i] = round(value, 6)

    return data
def remove_and_correct_outliers(data):
    """Replace per-feature outliers with the mean of the remaining values.

    For every feature index ``i`` the values of all utterances are gathered
    and any value lying more than ``factor`` scaled median absolute
    deviations (MAD) away from the median is treated as an outlier.
    Outliers (and values that were already NaN) are replaced by the mean of
    the remaining values; every value is then rounded to 6 decimals and
    written back to ``data[j].values[i]``. The total number of detected
    outliers is printed once at the end.

    Args:
        data: Sequence of utterance objects exposing a mutable ``values``
            sequence. It is modified in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    # Scale factor that turns the MAD into an estimate of the standard
    # deviation when the data is normally distributed.
    b_constant = 1.4826
    # Half-width of the accepted band, in scaled MADs (3 was tried before).
    factor = 10
    count = 0  # outliers detected across all features

    # An empty data set has no features to process.
    num_features = len(data[0].values) if len(data) else 0

    for i in range(num_features):
        # The helper returns the values of feature i split into two groups
        # (stress / no stress); both groups are needed together here.
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)
        # NOTE(review): the write-back below assumes the concatenated order
        # matches the order of `data` - confirm against the helper.
        f_vals = np.array(d_s + d_ns, dtype=float)
        if f_vals.size == 0:
            continue

        median = np.median(f_vals)
        # np.abs instead of sqrt(x ** 2): same result without the risk of
        # overflow/underflow in the intermediate square.
        med_abs_deviation = np.median(np.abs(f_vals - median))
        threshold = med_abs_deviation * b_constant
        max_range = median + threshold * factor
        min_range = median - threshold * factor

        # Comparisons against NaN are False, exactly as in a scalar loop;
        # errstate only silences the warning older NumPy versions emit.
        with np.errstate(invalid='ignore'):
            outliers = (f_vals < min_range) | (f_vals > max_range)
        count += int(np.count_nonzero(outliers))
        f_vals[outliers] = np.nan

        # Mean imputation over the non-missing values. If every value is
        # missing there is nothing to impute from, so the NaNs are kept.
        missing = np.isnan(f_vals)
        if missing.any() and not missing.all():
            f_vals[missing] = f_vals[~missing].mean()

        for j, value in enumerate(f_vals):
            data[j].values[i] = round(value, 6)

    # Single-argument form prints the same text under Python 2 and 3; the
    # double spaces reproduce the separators of the original print statement.
    print("Detected  %d  outliers" % count)
    return data
def filter_features(data, n_features=6125, bins=(50, 100, 250, 500, 1000),
                    percentile=75):
    """Select the features whose mutual information is high for every bin size.

    For each feature index the mutual information (MI) between the two
    groups of values returned by the helper is computed once per bin count
    in ``bins``. For each bin count the ``percentile``-th percentile of the
    MI over all features is used as a threshold. A feature is selected only
    if its MI reaches the threshold for every bin count.

    Constant features are not excluded; they are scored like any other.

    Args:
        data: Utterance collection passed through to
            ``utils.get_utterance_values_of_ith_utterance``.
        n_features: Number of feature indices to evaluate (``0`` up to
            ``n_features - 1``). Defaults to the previously hard-coded 6125.
        bins: Bin counts passed to ``calculate_mi``.
        percentile: Percentile of the per-bin MI values used as threshold.

    Returns:
        List of selected feature indices in ascending order.
    """
    if n_features <= 0 or not bins:
        return []

    # mi_by_feature[index][k] is the MI of feature `index` using bins[k].
    mi_by_feature = []
    for index in range(n_features):
        feature_values_no_stress, feature_values_stress, _, _ = (
            utils.get_utterance_values_of_ith_utterance(data, index))
        mi_by_feature.append([
            calculate_mi(feature_values_no_stress, feature_values_stress,
                         n_bins)
            for n_bins in bins
        ])

    # One threshold per bin count, taken over all features.
    thresholds = [
        np.percentile([row[k] for row in mi_by_feature], percentile)
        for k in range(len(bins))
    ]

    # Keep only the features that pass the threshold for every bin count.
    return [
        index
        for index, row in enumerate(mi_by_feature)
        if all(mi >= limit for mi, limit in zip(row, thresholds))
    ]
def filter_features(data, n_features=6125, bins=(50, 100, 250, 500, 1000),
                    percentile=75):
    """Select the features whose mutual information is high for every bin size.

    For each feature index the mutual information (MI) between the two
    groups of values returned by the helper is computed once per bin count
    in ``bins``. For each bin count the ``percentile``-th percentile of the
    MI over all features is used as a threshold. A feature is selected only
    if its MI reaches the threshold for every bin count.

    Constant features are not excluded; they are scored like any other.

    Args:
        data: Utterance collection passed through to
            ``utils.get_utterance_values_of_ith_utterance``.
        n_features: Number of feature indices to evaluate (``0`` up to
            ``n_features - 1``). Defaults to the previously hard-coded 6125.
        bins: Bin counts passed to ``calculate_mi``.
        percentile: Percentile of the per-bin MI values used as threshold.

    Returns:
        List of selected feature indices in ascending order.
    """
    if n_features <= 0 or not bins:
        return []

    # mi_by_feature[index][k] is the MI of feature `index` using bins[k].
    mi_by_feature = []
    for index in range(n_features):
        feature_values_no_stress, feature_values_stress, _, _ = (
            utils.get_utterance_values_of_ith_utterance(data, index))
        mi_by_feature.append([
            calculate_mi(feature_values_no_stress, feature_values_stress,
                         n_bins)
            for n_bins in bins
        ])

    # One threshold per bin count, taken over all features.
    thresholds = [
        np.percentile([row[k] for row in mi_by_feature], percentile)
        for k in range(len(bins))
    ]

    # Keep only the features that pass the threshold for every bin count.
    return [
        index
        for index, row in enumerate(mi_by_feature)
        if all(mi >= limit for mi, limit in zip(row, thresholds))
    ]