Example #1
0
def test_results_to_table(tr, keep_metas=True, keep_attrs=False):
    """Convert a test-results object into a data table of predictions.

    For every classifier in ``tr`` a new feature named ``'cls_<name>'`` is
    created holding that classifier's predicted class for each result row;
    the features are appended to the original example table via
    ``data_utils.cast_table``.

    Args:
        tr: test-results object exposing ``classifier_names``,
            ``class_values`` (None for a continuous class), ``results``
            and — only when examples were saved during testing —
            ``examples``.
        keep_metas: forwarded to ``cast_table``; keep meta attributes.
        keep_attrs: when False, all original attributes are dropped
            (only the new prediction features remain).

    Returns:
        The table produced by ``data_utils.cast_table``.

    Raises:
        AttributeError: if ``tr`` has no saved examples; reconstructing a
            table from scratch is not implemented yet.
    """
    # class_values is None for a continuous (regression) class, a list of
    # value names for a discrete class.
    is_cont = tr.class_values is None

    new_attrs = {}
    for i, cn in enumerate(tr.classifier_names):
        feat_name = 'cls_' + cn
        if is_cont:
            feat = data_utils.make_c_feature(feat_name)
        else:
            feat = data_utils.make_d_feature(feat_name, tr.class_values)  # TODO: untested
        new_attrs[feat] = [r.classes[i] for r in tr.results]

    try:
        orig_table = tr.examples
    except AttributeError:
        # save_examples was not used on the test results, so there is no
        # original table to extend.  Building one from scratch is still a
        # TODO, so the AttributeError is re-raised unconditionally.
        raise

    # Reject every original attribute when the caller does not want them;
    # None lets cast_table keep its default selection.
    attr_selector = None if keep_attrs else (lambda x: False)

    return data_utils.cast_table(orig_table,
                                 new_attrs=new_attrs,
                                 attr_selector=attr_selector,
                                 keep_metas=keep_metas)
Example #2
0
def select(input_file, protection_level, classes, class_var, attrfile, output):
    """Filter a data table down to selected classes, saving the result.

    Loads the table behind ``input_file``, optionally removes "protected"
    rows, keeps only rows whose ``class_var`` value is in ``classes``,
    re-casts the class variable and saves the result to ``output`` (which
    defaults to ``<input>_selected<ext>``).

    Args:
        input_file: sequence whose first element is an open file object
            (only its name is used; the handle is closed immediately).
        protection_level: name of a boolean-like feature; rows where its
            native value is the string 'True' are dropped. Falsy to skip.
        classes: class values to keep; falls back to DEFAULT_CLASSES.
        class_var: name of the class feature to filter on.
        attrfile: optional attribute selector applied to the returned
            input table.
        output: output path, or None to derive one from the input name.

    Returns:
        Tuple ``(in_data, out_data)`` of the (possibly attr-filtered)
        input table and the saved selection.
    """
    input_file_name = input_file[0].name
    input_file[0].close()
    in_data = load_table(input_file_name)
    if output is None:
        base, ext = path.splitext(input_file_name)
        output = base + '_selected' + ext
    if not classes:
        classes = DEFAULT_CLASSES
    out_data = in_data
    if protection_level:
        protection_var = in_data.domain[protection_level]
        unprotected_index = [i for i, v in enumerate(in_data) if v[protection_var].native() != 'True']
        out_data = in_data.get_items(unprotected_index)
    # BUG FIX: the class filter previously ran on in_data, silently
    # discarding the protection filtering performed above.  Chain it on
    # out_data instead so both filters apply.
    kwargs = {}
    kwargs[class_var] = classes
    out_data = out_data.filter(**kwargs)
    out_data = cast_table(out_data, new_class_var=out_data.domain.class_var)
    if attrfile:
        in_data = cast_table(in_data, attr_selector=attrfile)
    save_table(output, out_data)
    return in_data, out_data
Example #3
0
def decorrelate_data(input_file, corr_min=DEFAULT_CORR_MIN, corr_max=DEFAULT_CORR_MAX, subtable_limit=DEFAULT_SUBTABLE_LEN, out_file=None):
    """Drop redundant (highly correlated) continuous features from a table.

    Loads the table behind ``input_file``, keeps only its continuous
    features, cleans missing data and uniform features, estimates pairwise
    attribute distances on a random subsample of at most
    ``subtable_limit`` rows, removes attributes deemed redundant within
    the ``[corr_min, corr_max]`` band and saves the result (default path:
    ``<input>_decorrelated<ext>``).

    Returns:
        Tuple ``(in_data, out_data)`` of the original loaded table and
        the decorrelated one that was saved.
    """
    src_name = input_file[0].name
    input_file[0].close()
    full_table = load_table(src_name)
    if out_file is None:
        stem, suffix = path.splitext(src_name)
        out_file = stem + '_decorrelated' + suffix
    # Restrict to the continuous features only.
    cont_names = [feat.name for feat in full_table.domain
                  if feat.var_type == Orange.feature.Type.Continuous]
    working = cast_table(full_table, attr_selector=cont_names)
    working = purge_uniform_features(clean_missing_data(working))
    # Distance estimation is done on a bounded random subsample to keep
    # the pairwise computation affordable.
    sample = (get_random_subtable(working, subtable_limit)
              if len(working) > subtable_limit else working)
    distances = compute_attr_dist_matrix(sample)
    kept, dropped = get_redundant_attrs(distances, corr_lower=corr_min, corr_upper=corr_max)
    working = cast_table(working, attr_selector=kept)
    save_table(out_file, working)
    return full_table, working
Example #4
0
from data_utils import cast_table
from mdp.nodes import FANode
from Orange.data import Table, Domain
import Orange

# Factor-analysis pipeline: normalize the 'd_'-selected features of
# in_data (expected to be defined earlier in the file), run MDP's FANode
# on them and inspect the resulting mixing matrix.
fa_node = FANode(max_cycles=500, verbose=True)

# attr_selector='d_' — presumably selects attributes by the 'd_' name
# prefix; behaviour depends on cast_table, TODO confirm.
dom_data = cast_table(in_data, attr_selector='d_')
dom_stats = Orange.statistics.basic.Domain(dom_data)
new_attrs = []
for attr in dom_data.domain.features:
    # One derived '<name>_n' feature per original, computed on the fly
    # from the original attribute through a normalizing transformer.
    attr_c = Orange.feature.Continuous(attr.name + "_n")
    attr_c.getValueFrom = Orange.classification.ClassifierFromVar(whichVar=attr)
    transformer = Orange.data.utils.NormalizeContinuous()
    attr_c.getValueFrom.transformer = transformer
    # Center on the mean and scale by the standard deviation.
    transformer.average = dom_stats[attr].avg
    transformer.span = dom_stats[attr].dev
    new_attrs.append(attr_c)

# Same class variable, normalized features; converting dom_data through
# the new domain materializes the normalized values.
new_domain = Orange.data.Domain(new_attrs, dom_data.domain.classVar)
norm_dom_data = Orange.data.Table(new_domain, dom_data)

fa_res = fa_node.execute(norm_dom_data.to_numpy()[0])
# fa_node.A — presumably the FA mixing matrix (features x factors);
# wrapped in a Table for inspection. TODO confirm against MDP docs.
out_data = Table(fa_node.A)

from stats import dist_stats
in_domain = norm_dom_data.domain
# NOTE(review): '/' is integer division only under Python 2; under
# Python 3 this would make LATENT_COUNT a float — confirm target version.
LATENT_COUNT = min(len(in_domain.attributes)/2, len(fa_node.A))
latent_attrs = []
weights = fa_node.A.transpose()
for i in range(LATENT_COUNT):
from data_utils import cast_table
from distance_utils import get_redundant_attrs, compute_attr_dist_matrix

# Drop redundant attributes using a precomputed distance matrix.
# in_distance and in_data are expected to be defined earlier in the file.
kept, dropped = get_redundant_attrs(in_distance)

# Keep only the non-redundant attributes, then recompute the distance
# matrix on the reduced table for comparison.
out_data = cast_table(in_data, attr_selector=kept)
out_distance = compute_attr_dist_matrix(out_data)
import data_utils

# Numeric score assigned to each discrete class label.
CLASS_SCORES = {"FA": 0.9, "GA": 0.5}

# Build a continuous feature 'C_ah_current' that maps the discrete
# 'R_ah_current' values through CLASS_SCORES (semantics depend on
# data_utils.get_mapped_c_feature — TODO confirm argument order).
c_feat = data_utils.get_mapped_c_feature("R_ah_current", "C_ah_current", CLASS_SCORES)

# Re-cast in_data (expected to be defined earlier) with the mapped
# continuous feature as its new class variable.
out_data = data_utils.cast_table(in_data, new_class_var=c_feat)
Example #7
0
from data_utils import cast_table, make_c_feature
import distance_utils as d_utils

# Baseline score per class label.
BASE_SCORES = {'FA': 0.9, 'GA':0.7}
# Width parameter for the sigmoid score-boost functions below.
SIGMA_WIDTH = 0.2/5

# Rank attributes of in_data (expected to be defined earlier) by ReliefF
# score, best first.
r_scores = d_utils.get_relief_scores(in_data)
r_scores.sort(key=lambda x: x[1], reverse=True)

# Keep only the 40 top-ranked attributes, then derive normalized distance
# features and distances to class-exemplary rows ('dist_E*').
narrow_data = cast_table(in_data, attr_selector=[x[0].name for x in r_scores[:40]])
dist_feats = d_utils.get_norm_dist_features(narrow_data)
exemplary_table = d_utils.get_exemplary_table(narrow_data, ['FA', 'GA'])
exem_dist_feats = d_utils.get_norm_dist_features(exemplary_table, 'dist_E', narrow_data)

out_data = cast_table(narrow_data, new_attrs=dist_feats+exem_dist_feats)

# Relative distance contrast in [-1, 1]: positive when the row is closer
# to the FA exemplar than the GA exemplar.
get_score_boost = lambda x, rw=None: (x['dist_EGA'] - x['dist_EFA']) / (x['dist_EGA'] + x['dist_EFA'])

# Sigmoid-calibrated boost functions, fitted separately on the GA and FA
# subsets of out_data.
get_score_boost_ga = d_utils.get_sigmoid_func(get_score_boost, out_data.filter_ref(R_ah_current='GA'), SIGMA_WIDTH)
get_score_boost_fa = d_utils.get_sigmoid_func(get_score_boost, out_data.filter_ref(R_ah_current='FA'), SIGMA_WIDTH)

score_boost_feat = make_c_feature('score_boost')
def bucketed_score_boost(ex, rw=None):
    """Return the class-specific sigmoid score boost for one example.

    Examples labelled 'FA' (via their 'R_ah_current' value) go through
    the FA-calibrated boost function; every other label goes through the
    GA-calibrated one.
    """
    scorer = get_score_boost_fa if ex['R_ah_current'] == 'FA' else get_score_boost_ga
    return scorer(ex)
# Compute score_boost lazily per example through bucketed_score_boost,
# then materialize a table that includes the new feature.
score_boost_feat.get_value_from = bucketed_score_boost
boosted_data = cast_table(out_data, new_attrs=[score_boost_feat])

total_score_feat = make_c_feature('total_score')