def test_results_to_table(tr, keep_metas=True, keep_attrs=False):
    """Convert an Orange test-results object into a data table.

    Adds one new feature per classifier, named 'cls_<classifier name>',
    holding that classifier's predictions: continuous when
    tr.class_values is None, discrete (over tr.class_values) otherwise.

    Args:
        tr: test results; must expose classifier_names, class_values,
            results, and (when examples were saved) examples.
        keep_metas: pass-through to cast_table; keep meta attributes of
            the original example table.
        keep_attrs: when False, drop all original feature columns so only
            the prediction columns remain.

    Returns:
        The table produced by data_utils.cast_table over tr.examples.

    Raises:
        AttributeError: tr has no saved examples but the caller asked to
            keep metas/attrs that would have to come from them.
        NotImplementedError: tr has no saved examples and reconstructing
            a table from scratch is not implemented.
    """
    is_cont = tr.class_values is None  # None marks a continuous class
    new_attrs = {}
    for i, cn in enumerate(tr.classifier_names):
        feat_name = 'cls_' + cn
        if is_cont:
            feat = data_utils.make_c_feature(feat_name)
        else:
            # TODO: untested
            feat = data_utils.make_d_feature(feat_name, tr.class_values)
        new_attrs[feat] = [r.classes[i] for r in tr.results]
    try:
        orig_table = tr.examples
    except AttributeError:
        if keep_metas or keep_attrs:
            raise
        # BUG FIX: previously this branch fell through with orig_table
        # unbound (the "construct table from scratch" TODO), producing a
        # confusing UnboundLocalError at the cast_table call below.
        raise NotImplementedError(
            "test results were collected without save_examples; "
            "constructing a table from scratch is not implemented")
    if not keep_attrs:
        attr_selector = lambda x: False  # reject every original attribute
    else:
        attr_selector = None  # keep everything
    return data_utils.cast_table(orig_table, new_attrs=new_attrs,
                                 attr_selector=attr_selector,
                                 keep_metas=keep_metas)
def select(input_file, protection_level, classes, class_var, attrfile, output):
    """Select rows (and optionally attributes) from a data file.

    Loads the table behind input_file, optionally removes rows whose
    `protection_level` value is the string 'True', keeps only rows whose
    `class_var` value is in `classes`, optionally narrows attributes via
    `attrfile`, and saves the result.

    Args:
        input_file: one-element sequence of an open file object; only its
            name is used (the handle is closed immediately).
        protection_level: name of a boolean-ish feature; rows with native
            value 'True' are dropped. Falsy disables this filter.
        classes: allowed class values; falsy means DEFAULT_CLASSES.
        class_var: name of the variable to filter on (keyword for filter()).
        attrfile: attribute-selector spec forwarded to cast_table for the
            returned in_data; falsy disables it.
        output: output path; None derives '<base>_selected<ext>' from input.

    Returns:
        (in_data, out_data): the loaded table (possibly attr-narrowed) and
        the filtered table that was saved.
    """
    input_file_name = input_file[0].name
    input_file[0].close()
    in_data = load_table(input_file_name)
    if output is None:
        base, ext = path.splitext(input_file_name)
        output = base + '_selected' + ext
    if not classes:
        classes = DEFAULT_CLASSES
    out_data = in_data
    if protection_level:
        protection_var = in_data.domain[protection_level]
        unprotected_index = [i for i, v in enumerate(in_data)
                             if v[protection_var].native() != 'True']
        out_data = in_data.get_items(unprotected_index)
    # BUG FIX: the class filter previously ran on in_data, silently
    # discarding the protection-level filtering done just above.
    kwargs = {class_var: classes}
    out_data = out_data.filter(**kwargs)
    out_data = cast_table(out_data, new_class_var=out_data.domain.class_var)
    if attrfile:
        in_data = cast_table(in_data, attr_selector=attrfile)
    save_table(output, out_data)
    return in_data, out_data
def decorrelate_data(input_file, corr_min=DEFAULT_CORR_MIN, corr_max=DEFAULT_CORR_MAX, subtable_limit=DEFAULT_SUBTABLE_LEN, out_file=None):
    """Drop redundant (highly correlated) continuous features.

    Loads the table behind input_file, restricts it to its continuous
    features, cleans missing values and uniform columns, estimates
    pairwise attribute distances on (at most) a random subtable of
    `subtable_limit` rows, keeps only the non-redundant attributes, and
    saves the result.

    Args:
        input_file: one-element sequence of an open file object; only its
            name is used (the handle is closed immediately).
        corr_min, corr_max: correlation bounds forwarded to
            get_redundant_attrs as corr_lower/corr_upper.
        subtable_limit: max rows used for the distance-matrix estimate.
        out_file: output path; None derives '<base>_decorrelated<ext>'.

    Returns:
        (in_data, out_data): the full loaded table and the saved
        decorrelated table.
    """
    src_name = input_file[0].name
    input_file[0].close()
    in_data = load_table(src_name)
    if out_file is None:
        root, ext = path.splitext(src_name)
        out_file = root + '_decorrelated' + ext
    # Work only on continuous features.
    continuous_names = [feat.name for feat in in_data.domain
                        if feat.var_type == Orange.feature.Type.Continuous]
    working = cast_table(in_data, attr_selector=continuous_names)
    working = purge_uniform_features(clean_missing_data(working))
    # Estimating distances on a bounded sample keeps this tractable.
    sample = (get_random_subtable(working, subtable_limit)
              if len(working) > subtable_limit else working)
    distances = compute_attr_dist_matrix(sample)
    kept, dropped = get_redundant_attrs(distances,
                                        corr_lower=corr_min,
                                        corr_upper=corr_max)
    out_data = cast_table(working, attr_selector=kept)
    save_table(out_file, out_data)
    return in_data, out_data
from data_utils import cast_table from mdp.nodes import FANode from Orange.data import Table, Domain import Orange fa_node = FANode(max_cycles=500, verbose=True) dom_data = cast_table(in_data, attr_selector='d_') dom_stats = Orange.statistics.basic.Domain(dom_data) new_attrs = [] for attr in dom_data.domain.features: attr_c = Orange.feature.Continuous(attr.name + "_n") attr_c.getValueFrom = Orange.classification.ClassifierFromVar(whichVar=attr) transformer = Orange.data.utils.NormalizeContinuous() attr_c.getValueFrom.transformer = transformer transformer.average = dom_stats[attr].avg transformer.span = dom_stats[attr].dev new_attrs.append(attr_c) new_domain = Orange.data.Domain(new_attrs, dom_data.domain.classVar) norm_dom_data = Orange.data.Table(new_domain, dom_data) fa_res = fa_node.execute(norm_dom_data.to_numpy()[0]) out_data = Table(fa_node.A) from stats import dist_stats in_domain = norm_dom_data.domain LATENT_COUNT = min(len(in_domain.attributes)/2, len(fa_node.A)) latent_attrs = [] weights = fa_node.A.transpose() for i in range(LATENT_COUNT):
from data_utils import cast_table
from distance_utils import get_redundant_attrs, compute_attr_dist_matrix

# Split attributes into kept/dropped based on a precomputed attribute
# distance (correlation) matrix, rebuild the table with only the kept
# attributes, then recompute the distance matrix on the reduced table.
# NOTE(review): assumes `in_distance` and `in_data` are defined earlier
# in this script -- confirm upstream.
kept, dropped = get_redundant_attrs(in_distance)
out_data = cast_table(in_data, attr_selector=kept)
out_distance = compute_attr_dist_matrix(out_data)
import data_utils

# Replace the discrete class with a continuous one: map the labels of
# "R_ah_current" to numeric scores and expose them as "C_ah_current".
CLASS_SCORES = {"FA": 0.9, "GA": 0.5}
c_feat = data_utils.get_mapped_c_feature("R_ah_current", "C_ah_current", CLASS_SCORES)
# NOTE(review): assumes `in_data` is defined earlier in this script.
out_data = data_utils.cast_table(in_data, new_class_var=c_feat)
from data_utils import cast_table, make_c_feature
import distance_utils as d_utils

BASE_SCORES = {'FA': 0.9, 'GA':0.7}
SIGMA_WIDTH = 0.2/5

# Rank attributes by Relief score (descending) and keep the top 40.
# NOTE(review): assumes `in_data` is defined earlier in this script.
r_scores = d_utils.get_relief_scores(in_data)
r_scores.sort(key=lambda x: x[1], reverse=True)
narrow_data = cast_table(in_data, attr_selector=[x[0].name for x in r_scores[:40]])

# Add normalized distance features, plus distances to class exemplars
# ('dist_E*' columns) computed from an FA/GA exemplar table.
dist_feats = d_utils.get_norm_dist_features(narrow_data)
exemplary_table = d_utils.get_exemplary_table(narrow_data, ['FA', 'GA'])
exem_dist_feats = d_utils.get_norm_dist_features(exemplary_table, 'dist_E', narrow_data)
out_data = cast_table(narrow_data, new_attrs=dist_feats+exem_dist_feats)

# Raw boost: relative closeness to the FA exemplar vs the GA exemplar,
# in [-1, 1]. Each class gets its own sigmoid fit over its own rows.
get_score_boost = lambda x, rw=None: (x['dist_EGA'] - x['dist_EFA']) / (x['dist_EGA'] + x['dist_EFA'])
get_score_boost_ga = d_utils.get_sigmoid_func(get_score_boost, out_data.filter_ref(R_ah_current='GA'), SIGMA_WIDTH)
get_score_boost_fa = d_utils.get_sigmoid_func(get_score_boost, out_data.filter_ref(R_ah_current='FA'), SIGMA_WIDTH)

score_boost_feat = make_c_feature('score_boost')
def bucketed_score_boost(ex, rw=None):
    # Dispatch to the per-class sigmoid based on the example's label.
    if ex['R_ah_current'] == 'FA':
        return get_score_boost_fa(ex)
    else:
        return get_score_boost_ga(ex)
score_boost_feat.get_value_from = bucketed_score_boost
boosted_data = cast_table(out_data, new_attrs=[score_boost_feat])

total_score_feat = make_c_feature('total_score')