def decorrelate_data(input_file,
                     corr_min=DEFAULT_CORR_MIN,
                     corr_max=DEFAULT_CORR_MAX,
                     subtable_limit=DEFAULT_SUBTABLE_LEN,
                     out_file=None):
    """Reduce a table to a non-redundant set of continuous features and save it.

    Steps, in order:
      1. Take the name of ``input_file[0]``, close that handle, and reload
         the table from the name via ``load_table``.
      2. Keep only the domain entries whose ``var_type`` equals
         ``Orange.feature.Type.Continuous`` (``cast_table``).
      3. Pass the result through ``clean_missing_data`` and then
         ``purge_uniform_features``.
      4. Build an attribute distance matrix (``compute_attr_dist_matrix``)
         on the table, or on a random subtable when the table is longer
         than ``subtable_limit``.
      5. Ask ``get_redundant_attrs`` which attributes to keep, restrict the
         full (not sampled) table to them, and write it with ``save_table``.

    Args:
        input_file: Indexable container whose first element is an open
            file-like object exposing ``.name`` and ``.close()``.
        corr_min: Forwarded to ``get_redundant_attrs`` as ``corr_lower``.
        corr_max: Forwarded to ``get_redundant_attrs`` as ``corr_upper``.
        subtable_limit: Table length above which the distance matrix is
            computed on a random subtable of this size instead of the
            whole table.
        out_file: Destination passed to ``save_table``. When ``None``, the
            input name with ``'_decorrelated'`` inserted before its
            extension is used.

    Returns:
        A ``(in_data, out_data)`` tuple: the table as loaded, and the
        table that was saved.
    """
    handle = input_file[0]
    source_name = handle.name
    # The handle is only used to learn the file name; the table is read by name.
    handle.close()
    in_data = load_table(source_name)

    if out_file is None:
        stem, suffix = path.splitext(source_name)
        out_file = stem + '_decorrelated' + suffix

    # Collect the names of the continuous features in domain order.
    continuous_names = []
    for feature in in_data.domain:
        if feature.var_type == Orange.feature.Type.Continuous:
            continuous_names.append(feature.name)

    continuous_only = cast_table(in_data, attr_selector=continuous_names)
    out_data = purge_uniform_features(clean_missing_data(continuous_only))

    # Large tables are sampled before the distance computation.
    sample = (get_random_subtable(out_data, subtable_limit)
              if len(out_data) > subtable_limit
              else out_data)
    distances = compute_attr_dist_matrix(sample)

    # The second element (the dropped attributes) is not used here.
    kept, _dropped = get_redundant_attrs(distances,
                                         corr_lower=corr_min,
                                         corr_upper=corr_max)

    out_data = cast_table(out_data, attr_selector=kept)
    save_table(out_file, out_data)
    return in_data, out_data
# Script fragment: restrict a table to the attributes reported as kept and
# recompute the attribute distance matrix on the result.
# NOTE(review): `in_distance` and `in_data` are not defined in this fragment;
# they must already exist in the surrounding scope -- confirm against caller.
from data_utils import cast_table
from distance_utils import get_redundant_attrs, compute_attr_dist_matrix

# No corr_lower/corr_upper are passed here, so the helper's own defaults
# (if it defines any) apply. `dropped` is not used further in this fragment.
kept, dropped = get_redundant_attrs(in_distance)
# Keep only the attributes listed in `kept`.
out_data = cast_table(in_data, attr_selector=kept)
# Distance matrix of the reduced table.
out_distance = compute_attr_dist_matrix(out_data)