def concatenate(input_files, output=None, max_per=None, max_total=None, do_random=True):
    """Concatenate several table files into one table and save it.

    Args:
        input_files: Open file objects naming the tables to merge. Only
            their ``name`` is used; every handle is closed before loading.
        output: Optional open file object for the result. Only its ``name``
            is used and the handle is closed. When falsy, a file is created
            in the directory of the first input, named by
            ``generate_output_filename``.
        max_per: Optional cap on the length of each input table. A falsy
            value (``None``, ``0``) disables the cap.
        max_total: Optional cap on the length of the concatenated table. A
            falsy value disables the cap.
        do_random: Trim with ``random_trim`` when true, otherwise with
            ``trunc_trim``.

    Returns:
        None. The result is written with ``data_utils.save_table``.
    """
    ap_files = [os.path.abspath(in_file.name) for in_file in input_files]
    for in_file in input_files:
        in_file.close()

    if not output:
        dir_path = os.path.dirname(ap_files[0])
        output_filename = generate_output_filename(ap_files)
        output_path = os.path.join(dir_path, output_filename)
        # Opening in 'w' mode creates (or truncates) the target file; the
        # handle itself is only needed for its name.
        output = open(output_path, 'w')
    output_filename = os.path.abspath(output.name)
    output.close()

    table_trim = random_trim if do_random else trunc_trim

    subtables = []
    for apf in ap_files:
        tab = data_utils.load_table(apf)
        if max_per and len(tab) > int(max_per):
            tab = table_trim(tab, max_per)
        subtables.append(tab)

    # Concatenate the per-table trimmed results. Concatenating the untrimmed
    # tables here would silently ignore max_per.
    concatted = data_utils.concatenate_tables(subtables)
    if max_total and len(concatted) > int(max_total):
        concatted = table_trim(concatted, max_total)

    data_utils.save_table(output_filename, concatted)
def select(input_file, protection_level, classes, class_var, attrfile, output):
    """Filter a table by class value (and optionally protection flag) and save it.

    Args:
        input_file: Sequence whose first element is an open file object.
            Only its ``name`` is used; the handle is closed before loading.
        protection_level: Key looked up in ``in_data.domain``. When truthy,
            instances whose value for it has ``native() == 'True'`` are
            dropped before the class filter is applied.
        classes: Values to keep for ``class_var``. When falsy,
            ``DEFAULT_CLASSES`` is used.
        class_var: Name of the variable to filter on; passed as the keyword
            name to the table's ``filter`` method.
        attrfile: When truthy, passed as ``attr_selector`` to ``cast_table``
            on the input table.
        output: Path the result is saved to. When ``None``, the input file
            name with ``'_selected'`` inserted before the extension is used.

    Returns:
        A tuple ``(in_data, out_data)``: the input table (re-cast when
        ``attrfile`` is given) and the filtered table that was saved.
    """
    input_file_name = input_file[0].name
    input_file[0].close()
    in_data = load_table(input_file_name)

    if output is None:
        base, ext = path.splitext(input_file_name)
        output = base + '_selected' + ext
    if not classes:
        classes = DEFAULT_CLASSES

    out_data = in_data
    if protection_level:
        protection_index = in_data.domain[protection_level]
        unprotected_index = [
            i for i, v in enumerate(in_data)
            if v[protection_index].native() != 'True'
        ]
        out_data = in_data.get_items(unprotected_index)

    # Filter the (possibly protection-filtered) table. Filtering in_data here
    # would discard the result of the protection filter above.
    out_data = out_data.filter(**{class_var: classes})
    out_data = cast_table(out_data, new_class_var=out_data.domain.class_var)

    if attrfile:
        # NOTE(review): this re-casts in_data, not out_data, so attrfile does
        # not affect the saved table. Kept as-is; confirm whether the
        # attribute selection was meant to apply to out_data.
        in_data = cast_table(in_data, attr_selector=attrfile)

    save_table(output, out_data)
    return in_data, out_data
def decorrelate_data(input_file, corr_min=DEFAULT_CORR_MIN, corr_max=DEFAULT_CORR_MAX, subtable_limit=DEFAULT_SUBTABLE_LEN, out_file=None):
    """Reduce a table to a subset of its continuous attributes and save it.

    The input table is restricted to its continuous attributes, passed
    through ``clean_missing_data`` and ``purge_uniform_features``, and then
    cast to the attributes that ``get_redundant_attrs`` reports as kept.

    Args:
        input_file: Sequence whose first element is an open file object.
            Only its ``name`` is used; the handle is closed before loading.
        corr_min: Forwarded to ``get_redundant_attrs`` as ``corr_lower``.
        corr_max: Forwarded to ``get_redundant_attrs`` as ``corr_upper``.
        subtable_limit: When the cleaned table is longer than this, the
            attribute distance matrix is computed on a subtable obtained
            from ``get_random_subtable`` instead of the whole table.
        out_file: Path the result is saved to. When ``None``, the input file
            name with ``'_decorrelated'`` inserted before the extension is
            used.

    Returns:
        A tuple ``(in_data, out_data)``: the table as loaded and the reduced
        table that was saved.
    """
    source = input_file[0]
    source_name = source.name
    source.close()
    in_data = load_table(source_name)

    if out_file is None:
        stem, suffix = path.splitext(source_name)
        out_file = stem + '_decorrelated' + suffix

    # Collect the names of the continuous attributes only.
    continuous_names = []
    for attr in in_data.domain:
        if attr.var_type == Orange.feature.Type.Continuous:
            continuous_names.append(attr.name)

    continuous_only = cast_table(in_data, attr_selector=continuous_names)
    out_data = purge_uniform_features(clean_missing_data(continuous_only))

    # Compute the distance matrix on a bounded-size sample of long tables.
    sample = (
        get_random_subtable(out_data, subtable_limit)
        if len(out_data) > subtable_limit
        else out_data
    )
    distances = compute_attr_dist_matrix(sample)
    kept, _dropped = get_redundant_attrs(
        distances, corr_lower=corr_min, corr_upper=corr_max
    )

    out_data = cast_table(out_data, attr_selector=kept)
    save_table(out_file, out_data)
    return in_data, out_data