def __make_rule_term_example_table(tableDict, allTerms):
    """Build a classless Orange table marking which terms each rule contains.

    One discrete attribute (values ``const.PRESENT`` / ``const.ABSENT``) is
    created per entry of ``allTerms``.  Three meta attributes carry the rule
    name, the rule's term string and the rule's sequence number.

    tableDict -- mapping whose keys are used, in sorted order, as sequence
                 numbers; each value is indexed by ``const.RULETERMS_KEY``,
                 ``const.RULENAME_KEY`` and ``const.RULETERMS_STR_KEY``.
    allTerms  -- re-iterable collection of terms, one attribute per term.

    Returns the populated ``orange.ExampleTable``.
    """
    import orange
    import constants as const

    term_attrs = [
        orange.EnumVariable(name=str(term),
                            values=[const.PRESENT, const.ABSENT])
        for term in allTerms
    ]

    # Three meta attributes; ids are requested in the same order as before.
    ruleName = orange.StringVariable(const.NAME_ATTR)
    name_id = orange.newmetaid()
    ruleTerms = orange.StringVariable(const.TERMS_ATTR)
    terms_id = orange.newmetaid()
    ruleNumber = orange.FloatVariable(const.SEQ_NUM_ATTR,
                                      startValue=1,
                                      endValue=len(tableDict),
                                      stepValue=1,
                                      numberOfDecimals=0)
    number_id = orange.newmetaid()

    # Classless domain with the rule name/terms/number attached as metas.
    domain = orange.Domain(term_attrs, False)
    domain.addmeta(name_id, ruleName, False)
    domain.addmeta(terms_id, ruleTerms, False)
    domain.addmeta(number_id, ruleNumber, False)

    table = orange.ExampleTable(domain)
    for key in sorted(tableDict.keys()):
        cells = [
            orange.Value(
                attr,
                const.PRESENT
                if term in tableDict[key][const.RULETERMS_KEY]
                else const.ABSENT)
            for attr, term in zip(term_attrs, allTerms)
        ]
        row = orange.Example(domain, cells)
        # [1:-1] strips the surrounding square brackets from the strings.
        row[const.NAME_ATTR] = orange.Value(
            ruleName, tableDict[key][const.RULENAME_KEY][1:-1])
        row[const.TERMS_ATTR] = orange.Value(
            ruleTerms, tableDict[key][const.RULETERMS_STR_KEY][1:-1])
        row[const.SEQ_NUM_ATTR] = orange.Value(ruleNumber, key)
        table.append(row)
    return table
def __call__(self, ex, what=orange.Classifier.GetValue):
    """Classify ``ex`` by combining its ``self.var1`` and ``self.var2`` values.

    The class index is ``ex[self.var1] * self.noValues2 + ex[self.var2]``.
    Depending on ``what`` the result is the class value, a distribution
    with all mass on that value, or the pair ``(value, distribution)``.
    """
    index = ex[self.var1] * self.noValues2 + ex[self.var2]
    if what == orange.Classifier.GetValue:
        return orange.Value(self.classVar, index)

    # Degenerate distribution: the computed value gets probability 1.
    dist = orange.DiscDistribution(self.classVar)
    dist[index] = 1.0
    if what != orange.Classifier.GetProbabilities:
        return (orange.Value(self.classVar, index), dist)
    return dist
def sendpredictions(self):
    """Send a table of predictions on the "Predictions" channel.

    Sends ``None`` when there is no data or no output variable.  Otherwise
    the data is converted to a domain that ends with ``self.outvar`` and
    carries one meta attribute per requested predictor output; the values
    of those metas are computed through ``getValueFrom`` callbacks.
    """
    if not (self.data and self.outvar):
        self.send("Predictions", None)
        return

    is_classification = self.outvar.varType == orange.VarTypes.Discrete

    meta_vars = []
    if is_classification:
        if len(self.selectedClasses):
            # One probability column per (predictor, selected class).
            for clf in self.predictors.values():
                for idx in self.selectedClasses:
                    label = str("%s(%s)" % (clf.name,
                                            str(self.outvar.values[idx])))
                    # Defaults bind idx/clf now (avoids late binding).
                    getter = lambda ex, rw, cindx=idx, c=clf: orange.Value(
                        c(ex, c.GetProbabilities)[cindx])
                    meta_vars.append(
                        orange.FloatVariable(name=label,
                                             getValueFrom=getter))
        if self.showClass:
            # One predicted-class column per predictor.
            for clf in self.predictors.values():
                meta_vars.append(
                    orange.EnumVariable(
                        name=str(clf.name),
                        values=self.outvar.values,
                        getValueFrom=lambda ex, rw, c=clf: orange.Value(
                            c(ex))))
    else:
        # Regression: one predicted-value column per predictor.
        for clf in self.predictors.values():
            meta_vars.append(
                orange.FloatVariable(
                    name="%s" % clf.name,
                    getValueFrom=lambda ex, rw, c=clf: orange.Value(c(ex))))

    classVar = self.outvar
    domain = orange.Domain(self.data.domain.attributes + [classVar])
    domain.addmetas(self.data.domain.getmetas())
    for var in meta_vars:
        domain.addmeta(orange.newmetaid(), var)

    predictions = orange.ExampleTable(domain, self.data)
    if self.doPrediction:
        # The first predictor also fills in the class column.
        first = self.predictors.values()[0]
        for ex in predictions:
            ex[classVar] = first(ex)

    predictions.name = self.data.name
    self.send("Predictions", predictions)
    self.changedFlag = False
def all_discrete_clauses(self, attr, max_card=None):
    """Yield discrete value filters over subsets of the values of ``attr``.

    The candidate values come from ``self.col_to_clauses[attr]``.  With a
    truthy ``max_card`` every combination of 1..``max_card`` values is
    produced; otherwise whatever ``powerset`` yields for the values is used.
    Each yielded item is an ``orange.ValueFilter_discrete`` positioned at
    the attribute's index in ``self.full_table.domain``.
    """
    candidates = self.col_to_clauses[attr]
    attrobj = self.full_table.domain[attr]
    idx = self.full_table.domain.index(attrobj)

    if max_card:
        subsets = (combo
                   for card in xrange(1, max_card + 1)
                   for combo in combinations(candidates, card))
    else:
        subsets = powerset(candidates)

    for subset in subsets:
        wrapped = [orange.Value(attrobj, value) for value in subset]
        yield orange.ValueFilter_discrete(position=idx, values=wrapped)
def __call__(self, example, resultType=orange.GetValue):
    """Combine the member classifiers' predictions for ``example``.

    Probabilities are the element-wise sum of the members' class
    probabilities, normalized by their total.  The crisp value is the class
    predicted by the most members (first one on ties); it need not be the
    class with the highest combined probability.

    Returns the value, the probability list, or ``(value, probabilities)``
    according to ``resultType``.
    """
    from operator import add

    want_probs = (resultType == orange.GetProbabilities
                  or resultType == orange.GetBoth)
    want_value = (resultType == orange.GetValue
                  or resultType == orange.GetBoth)

    if want_probs:
        cprob = [0.] * len(self.domain.classVar.values)
        for member in self.classifiers:
            member_probs = [x for x in member(example, orange.GetProbabilities)]
            cprob = list(map(add, cprob, member_probs))
        norm = sum(cprob)
        cprob = [p / norm for p in cprob]

    if want_value:
        counts = [0] * len(self.domain.classVar.values)
        for member in self.classifiers:
            counts[int(member(example))] += 1
        cvalue = orange.Value(self.domain.classVar, counts.index(max(counts)))

    if resultType == orange.GetValue:
        return cvalue
    if resultType == orange.GetProbabilities:
        return cprob
    return (cvalue, cprob)
def fill_in_rule(self, table, ref_bounds):
    """Add a condition for every column of ``ref_bounds`` the rule lacks.

    table      -- table whose domain resolves column names and positions.
    ref_bounds -- mapping column -> ``None`` (the column gets a discrete
                  filter listing all of the attribute's values) or a
                  ``(min, max)`` pair (the column gets a continuous
                  ``Between`` filter widened by 1 on each side).

    New conditions are appended to ``self.filter.conditions`` in place.
    """
    domain = table.domain

    # Names of the attributes the rule already constrains.
    constrained = set(domain[c.position].name
                      for c in self.filter.conditions)

    for col, bounds in ref_bounds.iteritems():
        if col in constrained:
            continue
        attr = domain[col]
        pos = domain.index(attr)
        if bounds is not None:
            (minv, maxv) = bounds
            cond = orange.ValueFilter_continuous(
                position=pos,
                oper=orange.ValueFilter.Between,
                min=minv - 1,
                max=maxv + 1)
        else:
            every_value = [orange.Value(attr, attr.values[i])
                           for i in range(len(attr.values))]
            cond = orange.ValueFilter_discrete(position=pos,
                                               values=every_value)
        self.filter.conditions.append(cond)
def __call__(self, example, resultType=orange.GetValue):
    """Classify ``example`` by averaging the rules that cover it.

    The class distributions of all covering rules (across every entry of
    ``self.rulesClass``) are summed and divided by the number of covering
    rules; the predicted value is the first class with the largest
    average.  When no rule covers the example, ``self.majorityClassifier``
    supplies both value and distribution.
    """
    class_var = self.data.domain.classVar

    # 1. accumulate the distributions of the covering rules
    num_cover = 0.0
    distribution = [0] * len(class_var.values)
    for rsc in self.rulesClass:
        for rule in rsc.rules.rules:
            if not rule.covers(example):
                continue
            num_cover += 1
            rule_dist = rule(example, orange.GetProbabilities)
            for i in range(len(distribution)):
                distribution[i] += rule_dist[i]

    if num_cover != 0:
        # 2. average and pick the first maximal class
        distribution = [d / num_cover for d in distribution]
        max_index = 0
        for i, d in enumerate(distribution):
            if d > distribution[max_index]:
                max_index = i
        dist = orange.DiscDistribution(distribution)
        value = orange.Value(class_var, class_var.values[max_index])
    else:
        # no rule fires: fall back to the majority classifier
        value, dist = self.majorityClassifier(example, orange.GetBoth)

    # 3. return in the requested form
    if resultType == orange.GetValue:
        return value
    if resultType == orange.GetBoth:
        return (value, dist)
    return dist
def __call__(self, example, result_type=orange.GetValue):
    """Classify ``example`` from ``self.p_class`` and ``self.p_cond``.

    Each class starts from its entry in ``self.p_class`` and is multiplied
    by ``self.p_cond[a][value][c]`` for every attribute ``a`` whose value
    in ``example`` is not special.  The scores are normalized to sum to 1
    when their sum is positive, and the first class with the highest score
    is the prediction.

    Returns the value, the probability list, or ``(value, probabilities)``.
    """
    n_classes = len(self.domain.classVar.values)
    p = list(self.p_class)

    # Per class the factors are still applied in attribute order.
    for a in range(len(self.domain.attributes)):
        if example[a].isSpecial():
            continue
        for c in range(n_classes):
            p[c] *= self.p_cond[a][int(example[a])][c]

    # normalize probabilities to sum to 1
    total = 0.
    for score in p:
        total += score
    if total > 0:
        p = [score / total for score in p]

    best = orange.Value(self.domain.classVar, p.index(max(p)))

    if result_type == orange.GetValue:
        return best
    if result_type == orange.GetProbabilities:
        return p
    return (best, p)
def __rule__(self):
    """Return the (cached) ``SDRule`` describing this group.

    For each attribute/group-id pair the values stored in
    ``self.grouper.id2vals`` become either a discrete filter over those
    values or a continuous ``Between`` filter over the first two stored
    numbers.  The rule is built once and remembered in ``self._rule``.
    """
    if self._rule:
        return self._rule

    conds = []
    for attr, gid in zip(self.attrs, self.group):
        stored = self.grouper.id2vals[attr][gid]
        if attr.var_type == Orange.feature.Type.Discrete:
            wrapped = [orange.Value(attr, v) for v in stored]
            cond = orange.ValueFilter_discrete(
                position=self.grouper.data.domain.index(attr),
                values=wrapped)
        else:
            minv, maxv = stored[0], stored[1]
            cond = Orange.data.filter.ValueFilterContinuous(
                oper=orange.ValueFilter.Between,
                position=self.grouper.data.domain.index(attr),
                min=minv,
                max=maxv)
        conds.append(cond)

    rule = SDRule(self.grouper.data, None, conds, None)
    self._rule = rule
    rule.quality = self.inf
    rule.inf_state = self.inf_state
    return self._rule
def cloneAndAddCondition(self, attribute, value):
    """Return a copy of this rule extended by ``attribute = value``.

    The existing conditions are copied, so this rule is left untouched.
    """
    extended = self.filter.conditions[:]
    position = self.data.domain.attributes.index(attribute)
    new_condition = orange.ValueFilter_discrete(
        position=position,
        values=[orange.Value(attribute, value)])
    extended.append(new_condition)
    return SDRule(self.data, self.targetClass, extended, self.g)
def simplify(self, data=None, cdists=None, ddists=None):
    """Return a copy of this rule with redundant conditions removed.

    Args:
        data: non-filtered data (defaults to ``self.data``)
        cdists: non-filtered continuous statistics for ``data``
        ddists: non-filtered discrete distributions for ``data``

    A discrete condition is dropped when it lists as many distinct values
    as the full data has values with non-zero frequency.  A continuous
    condition is dropped when its range, intersected with the data range,
    is at least as large as the data range; otherwise it is tightened to
    that intersection.

    NOTE(review): a kept continuous condition is the same object as in
    this rule and its min/max are overwritten in place, so ``self`` is
    affected too -- confirm this is intended.
    """
    # Result is unused below; evaluation kept as in the original in case
    # ``self(data)`` / ``self.examples`` have effects callers rely on.
    subset = data and self(data) or self.examples
    data = data or self.data
    ret = self.clone()

    cdists = cdists or Orange.statistics.basic.Domain(data)
    ddists = ddists or Orange.statistics.distribution.Domain(data)

    kept = []
    for old_cond in self.filter.conditions:
        idx = old_cond.position
        attr = data.domain[idx]

        if attr.var_type == Orange.feature.Type.Discrete:
            full_d = ddists[attr.name]
            present = [k for k, v in full_d.items() if v]
            chosen = set(str(attr.values[int(v)]) for v in old_cond.values)
            if len(chosen) == len(present):
                # condition does not restrict anything
                continue
            cond = orange.ValueFilter_discrete(
                position=idx,
                values=[orange.Value(attr, val) for val in chosen])
        else:
            stats = cdists[attr.name]
            data_range = [stats.min, stats.max]
            cond_range = [old_cond.min, old_cond.max]
            bound = r_intersect(data_range, cond_range)
            if r_vol(bound) >= r_vol(data_range):
                # condition spans the whole data range
                continue
            cond = old_cond
            cond.min, cond.max = bound[0], bound[1]

        kept.append(cond)

    ret.quality = self.quality
    ret.filter.conditions = kept
    ret.c_range = list(self.c_range)
    return ret
def __call__(self, example, what=orange.Classifier.GetValue):
    """Classify ``example`` by thresholding the probability at index 1.

    The wrapped classifier's probabilities are returned unchanged for
    ``GetProbabilities``.  Otherwise the value is built from the result of
    ``probs[1] > self.threshold`` and returned alone or with ``probs``.
    """
    probs = self.classifier(example, self.GetProbabilities)
    if what == self.GetProbabilities:
        return probs

    above_threshold = probs[1] > self.threshold
    value = orange.Value(self.classifier.classVar, above_threshold)
    if what != orange.Classifier.GetValue:
        return (value, probs)
    return value
def mergeClassValues(data, value):
    """Re-label ``data`` as a two-class problem: ``value`` versus the rest.

    Examples whose class equals ``value`` get "0" for the new "Selection"
    class, all others get "1".  The returned table holds the original
    attributes followed by the "Selection" variable, with the "0" examples
    first.
    """
    selection = orange.EnumVariable("Selection", values=["0", "1"])
    class_var = data.domain.classVar

    chosen = [value]
    remaining = [v for v in class_var.values if v not in chosen]

    in_class = data.select({class_var.name: chosen})
    out_of_class = data.select({class_var.name: remaining})

    merged_domain = orange.Domain(in_class.domain.attributes + [selection])

    # getValueFrom is swapped between the two conversions so that each
    # subset is converted with its own constant label.
    selection.getValueFrom = lambda ex, what: orange.Value(selection, "0")
    merged = orange.ExampleTable(merged_domain, in_class)
    selection.getValueFrom = lambda ex, what: orange.Value(selection, "1")
    rest = orange.ExampleTable(merged_domain, out_of_class)

    merged.extend(rest)
    return merged
def __call__(self, data, targetClass, num_of_rules=0):
    """Return CN2-SD rules found by the weighted covering algorithm.

    Continuous attributes are first replaced by entropy-discretized
    versions.  Rules are then requested from ``self.rbf`` until the
    returned rule's quality is not positive or ``num_of_rules`` rules were
    collected (0 means no limit).  If the data was discretized, the rules
    are converted back so that they apply to the original data.
    """
    data_discretized = False
    if data.domain.hasContinuousAttributes():
        original_data = data
        data_discretized = True
        discretize = orange.EntropyDiscretization(forceAttribute=True)
        new_domain = [
            discretize(attribute, data)
            if attribute.varType == orange.VarTypes.Continuous
            else attribute
            for attribute in data.domain.attributes
        ]
        data = original_data.select(
            new_domain + [original_data.domain.classVar])

    self.data = data
    self.max_rules = num_of_rules
    rules = []
    tc = orange.Value(data.domain.classVar, targetClass)

    # weighted covering: per-example weight and counter metas
    self.data.addMetaAttribute(self.weightID)
    self.data.addMetaAttribute(self.counter)

    # Constructed as in the original; reassigned before being returned.
    targetClassRule = SDRule(data, targetClass, conditions=[], g=1)

    candidate = self.rbf(data, self.weightID, targetClass, None)
    while True:
        if not candidate.quality > 0:
            break
        if not (self.max_rules == 0 or len(rules) < self.max_rules):
            break
        bestRule = SDRule(self.data, tc, candidate.filter.conditions)
        bestRule.quality = candidate.quality
        self.decreaseExampleWeights(bestRule)
        rules.append(bestRule)
        candidate = self.rbf(data, self.weightID, targetClass, None)

    if data_discretized:
        targetClassRule = SDRule(original_data, targetClass,
                                 conditions=[], g=1)
        # map the rules back onto the non-discretized data
        rules = [rule.getUndiscretized(original_data) for rule in rules]
    else:
        targetClassRule = SDRule(data, targetClass, conditions=[], g=1)

    return SDRules(rules, targetClassRule, "CN2-SD")
def __call__(self, example, resultType = orange.GetValue):
    """Classify ``example`` by majority vote of ``self.classifiers``.

    The value is the first class with the most votes; the returned
    frequencies are the vote counts divided by the number of classifiers.
    """
    votes = [0.] * len(self.domain.classVar.values)
    for member in self.classifiers:
        votes[int(member(example))] += 1

    value = orange.Value(self.domain.classVar, votes.index(max(votes)))

    n_members = len(self.classifiers)
    freq = [count / n_members for count in votes]

    if resultType == orange.GetValue:
        return value
    if resultType == orange.GetProbabilities:
        return freq
    return (value, freq)
def __call__(self, example, what=orange.Classifier.GetValue):
    """Classify ``example`` with the wrapped classifier.

    ``self.classifier.orange_classify(example)`` returns a number that is
    rounded to obtain the predicted class index, i.e. it is treated as the
    probability of the class at index 1.

    The distribution previously assigned that number to whichever class
    was predicted, so a prediction of class 0 with score 0.3 reported
    P(class 0) = 0.3 and P(class 1) = 0.  It now reports P(class 1) =
    score and P(class 0) = 1 - score.

    NOTE(review): this assumes ``self.classVar`` has exactly two values and
    that the score lies in [0, 1] -- confirm against the wrapped model.

    Returns the value, the distribution, or ``(value, distribution)``
    according to ``what``.
    """
    probability = self.classifier.orange_classify(example)
    answer = orange.Value(self.classVar, int(round(probability)))

    probabilities = orange.DiscDistribution(self.classVar)
    probabilities[1] = probability
    probabilities[0] = 1.0 - probability

    if what == orange.Classifier.GetValue:
        return answer
    elif what == orange.Classifier.GetProbabilities:
        return probabilities
    else:
        return answer, probabilities
def lookupFromFunction(attribute, bound, function):
    """
    Constructs ClassifierByExampleTable or ClassifierByLookupTable
    mirroring the given function.

    attribute -- the output variable.
    bound     -- the discrete attributes the function depends on.
    function  -- callable taking a list of value indices (one per bound
                 attribute) and returning the output value.

    If ``lookupFromBound`` can provide a lookup classifier, its table is
    filled by calling ``function`` for every combination of value indices.
    Otherwise an example table with all combinations is built and handed
    to ``orange.LookupLearner``.
    """
    lookup = lookupFromBound(attribute, bound)
    if lookup:
        lookup.lookupTable = [
            orange.Value(attribute, function(attributes))
            for attributes in orngMisc.LimitedCounter(
                [len(attr.values) for attr in bound])
        ]
        return lookup

    # The original referenced an undefined name ``dom`` here (NameError);
    # bind the domain once and use it for both the table and the examples.
    dom = orange.Domain(bound, attribute)
    examples = orange.ExampleTable(dom)
    for attributes in orngMisc.LimitedCounter(
            [len(attr.values) for attr in dom.attributes]):
        examples.append(
            orange.Example(dom, attributes + [function(attributes)]))
    return orange.LookupLearner(examples)
def __call__(self, ex, what=orange.Classifier.GetValue):
    """Classify ``ex`` via ``self.classify`` and return it as ``what`` asks.

    The distribution puts probability 1.0 on the classified value.
    Raises ``ValueError`` when ``what`` is not one of the three
    ``orange.Classifier`` result kinds.
    """
    value = self.classify(ex)
    class_var = ex.domain.classVar
    result = orange.Value(class_var, str(value))
    probs = orange.DiscDistribution(class_var)
    probs[value] = 1.0

    if what == orange.Classifier.GetValue:
        return result
    if what == orange.Classifier.GetProbabilities:
        return probs
    if what == orange.Classifier.GetBoth:
        return result, probs
    raise ValueError("Bad what argument: %s" % repr(what))
def __call__(self, example, result_type=orange.GetValue):
    """Classify ``example`` with ``self.model``.

    GetValue         -- value built from ``self.model.predict``.
    GetProbabilities -- list of the scores from ``self.model.eval_all``,
                        placed at the index of their label in the class
                        variable's values.
    GetBoth          -- ``(value, score)`` of the first ``eval_all`` entry.

    Returns ``None`` when ``eval_all`` yields nothing or ``result_type``
    is not recognised.
    """
    class_var = self.domain.classVar
    if result_type == orange.GetValue:
        return orange.Value(class_var,
                            self.model.predict(extract_features(example)))

    # label -> position, used to order the returned probabilities
    class_map = dict((label, pos)
                     for pos, label in enumerate(class_var.values))

    result = self.model.eval_all(extract_features(example))
    if not len(result) > 0:
        return None

    if result_type == orange.GetProbabilities:
        ordered = [None] * len(result)
        for label, prob in result:
            ordered[class_map[label]] = prob
        return ordered
    if result_type == orange.GetBoth:
        top_label, top_score = result[0][0], result[0][1]
        return (orange.Value(class_var, top_label), top_score)
    return None
def to_rule(self, table, cont_dists=None, disc_dists=None):
    """Return this object's ``SDRule`` over ``table``, building it once.

    @param table       table whose domain resolves the columns
    @param cont_dists  optional mapping attribute name -> continuous stats
    @param disc_dists  optional mapping attribute name -> discrete dist

    A continuous bound is skipped when its intersection with the table's
    range covers more than 99% of that range.  A discrete set is skipped
    when it is empty or has as many values as the attribute's
    distribution.  On every call the rule's quality, score, inf_state and
    c_range are refreshed from this object.
    """
    if not self.rule:
        domain = table.domain
        attrnames = [attr.name for attr in domain]
        if not cont_dists:
            cont_dists = dict(
                zip(attrnames, Orange.statistics.basic.Domain(table)))
        if not disc_dists:
            disc_dists = dict(
                zip(attrnames, Orange.statistics.distribution.Domain(table)))

        conds = []
        for col, bound in zip(self.cols, zip(*self.bbox)):
            attr = domain[col]
            pos = domain.index(attr)
            table_bound = cont_dists[attr.name]
            minv, maxv = r_intersect(bound,
                                     [table_bound.min, table_bound.max])
            table_width = table_bound.max - table_bound.min
            if maxv - minv > 0.99 * table_width:
                continue
            conds.append(
                orange.ValueFilter_continuous(position=pos,
                                              max=bound[1],
                                              min=bound[0]))

        for disc_name, vidxs in self.discretes.iteritems():
            attr = domain[disc_name]
            disc_pos = domain.index(attr)
            # indices beyond the attribute's value list are ignored
            vals = [
                orange.Value(attr, attr.values[int(vidx)])
                for vidx in vidxs
                if int(vidx) < len(attr.values)
            ]
            if not vals or len(vals) == len(disc_dists[attr.name]):
                continue
            conds.append(
                orange.ValueFilter_discrete(position=disc_pos, values=vals))

        self.rule = SDRule(table, None, conditions=conds)

    rule = self.rule
    rule.quality = rule.score = self.error
    rule.inf_state = self.inf_state
    rule.c_range = self.c_range
    return rule
def getFixed(self, original_data):
    """Return an equivalent ``SDRule`` expressed over ``original_data``.

    Each condition of this rule points at a feature that carries
    ``attribute``, ``cond`` and ``value``.  ``==`` and ``!=`` become
    discrete filters, ``<=`` becomes a continuous filter on
    (-infinity, value], and ``>`` the same interval with ``outside=True``.
    Features with any other ``cond`` contribute no condition.
    """
    translated = []
    for c in self.filter.conditions:
        feature = self.data.domain.attributes[c.position]
        position = original_data.domain.attributes.index(feature.attribute)
        op = feature.cond

        if op == '==':
            new_cond = orange.ValueFilter_discrete(
                position=position,
                values=[orange.Value(feature.attribute, feature.value)])
        elif op == '!=':
            others = [
                orange.Value(feature.attribute, value)
                for value in feature.attribute.values
                if value != feature.value
            ]
            new_cond = orange.ValueFilter_discrete(position=position,
                                                   values=others)
        elif op == '<=' or op == '>':
            new_cond = orange.ValueFilter_continuous(
                position=position,
                max=feature.value,
                min=float(-infinity),
                outside=(op == '>'))
        else:
            continue
        translated.append(new_cond)

    return SDRule(original_data, self.targetClass, translated, self.g)
def __call__(self, learndata, testdata = None, weight = None):
    """Learn rules for every class value and return an ``SD_Classifier``.

    The classifier is created over ``testdata`` when it is given (because
    of preprocessing), otherwise over ``learndata``.  ``self.learner`` is
    run once per class value of ``learndata`` and its rules are added to
    the classifier for that class.  ``weight`` is not used here.
    """
    classifier = SD_Classifier(testdata if testdata else learndata)

    class_var = learndata.domain.classVar
    for label in class_var.values:
        targetClass = orange.Value(class_var, label)
        found = self.learner(learndata, targetClass, self.max_rules)
        classifier.addRulesForClass(found, targetClass)

    classifier.name = self.name
    classifier.algorithm = self.algorithm
    return classifier
def __call__(self, example, resultType=orange.GetValue):
    """Classify ``example`` by weighted voting.

    ``self.classifiers`` holds ``(classifier, weight)`` pairs; each
    classifier adds its weight to the class it predicts.  The winning
    index is chosen by ``orngMisc.selectBestIndex``; the probabilities are
    the votes divided by their sum.
    """
    votes = [0.] * len(self.classVar.values)
    for member, weight in self.classifiers:
        votes[int(member(example))] += weight

    best = orngMisc.selectBestIndex(votes)
    value = orange.Value(self.classVar, best)
    if resultType == orange.GetValue:
        return value

    total = sum(votes)
    votes = [v / total for v in votes]
    if resultType != orange.GetProbabilities:
        return (value, votes)
    return votes
def create_clause(table, attr, val, bdists, cmp='='):
    """Build an Orange value filter for ``attr`` from ``val``.

    table  -- table whose domain gives the attribute's position.
    attr   -- the attribute the clause constrains.
    val    -- discrete: a value or list/tuple of values.
              continuous: a number, or a string optionally prefixed with
              one of ``<``, ``<=``, ``>``, ``>=``, ``=`` (e.g. ``"<= 5"``);
              a prefix overrides ``cmp``.
    bdists -- mapping attribute -> object with ``min`` / ``max``; supplies
              the bound that the comparison leaves open.
    cmp    -- comparison used when ``val`` carries no operator prefix.

    Returns a discrete filter for discrete attributes, otherwise a
    continuous ``Between`` filter.  Strict and non-strict comparisons
    produce the same bounds.

    Raises ``ValueError`` if the number cannot be parsed or the comparison
    operator is not supported.
    """
    if attr.varType == Orange.feature.Type.Discrete:
        if not isinstance(val, (list, tuple)):
            val = [val]
        vals = [orange.Value(attr, v) for v in val]
        return orange.ValueFilter_discrete(position=table.domain.index(attr),
                                           values=vals)

    # It may be a discretized continuous condition (e.g., "<= 5").
    # Two-character operators are tried first so "<=" is not read as "<";
    # the original got there only by swallowing the failed float().
    if hasattr(val, 'startswith'):
        for op in ('<=', '>=', '<', '>', '='):
            if val.startswith(op):
                val = float(val.split(op)[1])
                cmp = op
                break
        else:
            val = float(val)
    else:
        val = float(val)

    bdist = bdists[attr]
    minv, maxv = bdist.min, bdist.max
    if cmp in ('>=', '>'):
        minv = val
    elif cmp in ('<=', '<'):
        maxv = val
    elif cmp == '=':
        maxv = minv = val
    else:
        # was a bare ``raise`` with no exception being handled
        raise ValueError("unsupported comparison operator: %r" % (cmp,))

    return Orange.data.filter.ValueFilterContinuous(
        position=table.domain.index(attr),
        oper=orange.ValueFilter.Between,
        min=minv,
        max=maxv)
def __call__(self, example, resultType=orange.GetValue):
    """Predict the response for ``example`` from the fitted coefficients.

    The example is converted to ``self.domain``, imputed and turned into a
    numpy array whose last element is left out of the dot product.  When
    ``self.beta0`` is truthy, ``self.beta[0]`` is the intercept.

    NOTE(review): the distribution is built as ``{1.0: yhat}``; this looks
    like it may have been meant as ``{yhat: 1.0}`` -- kept unchanged,
    confirm with callers.
    """
    ex = orange.Example(self.domain, example)
    ex = self.imputer(ex)
    ex = numpy.array(ex.native())

    if not self.beta0:
        yhat = dot(self.beta, ex[:-1])
    elif len(self.beta) > 1:
        yhat = self.beta[0] + dot(self.beta[1:], ex[:-1])
    else:
        # intercept-only model
        yhat = self.beta[0]
    yhat = orange.Value(yhat)

    if resultType == orange.GetValue:
        return yhat
    dist = orange.ContDistribution({1.0: yhat})
    if resultType == orange.GetProbabilities:
        return dist
    return (yhat, dist)
def dictToCond(d, data):
    """Turn a condition dictionary into an Orange value filter.

    ``d['type'] == 'num'`` yields a continuous ``Between`` filter over
    ``d['vals'][0]`` .. ``d['vals'][1]`` at position ``d['pos']``.  Any
    other type yields a discrete filter over ``d['vals']`` for the
    attribute ``data.domain[d['col']]``.

    NULL hack: a ``None`` entry is replaced by the string 'NULL' or 'None'
    when the attribute has such a value (checked in that order); otherwise
    it is passed through as ``None``.
    """
    if d['type'] == 'num':
        bounds = d['vals']
        return orange.ValueFilter_continuous(
            position=d['pos'],
            oper=orange.ValueFilter.Between,
            min=bounds[0],
            max=bounds[1])

    attr = data.domain[d['col']]

    def _resolve(v):
        if v is None:
            if 'NULL' in attr.values:
                return 'NULL'
            if 'None' in attr.values:
                return 'None'
        return v

    vals = [orange.Value(attr, _resolve(v)) for v in d['vals']]
    return orange.ValueFilter_discrete(position=d['pos'], values=vals)
def __call__(self, table, bound, weightID=0):
    """Construct a variable that is the cartesian product of ``bound``.

    table -- table whose domain resolves the entries of ``bound``.
    bound -- non-empty sequence of attribute references.

    The new variable's values are the bound attributes' values joined with
    "-" (for a single attribute, a copy of its values); its
    ``getValueFrom`` is a lookup classifier mapping every combination to
    the value with the same index.

    Returns ``(variable, quality)``; quality is ``self.measure(variable,
    table)`` when a measure is set, else 0.  Raises ``AttributeError`` for
    an empty ``bound``.
    """
    if not len(bound):
        raise AttributeError("no bound attributes")

    bound = [table.domain[a] for a in bound]
    newVar = orange.EnumVariable("-".join([a.name for a in bound]))

    if len(bound) == 1:
        only = bound[0]
        newVar.values = list(only.values)
        clsfr = orange.ClassifierByLookupTable(newVar, only)
    else:
        import orngMisc
        sizes = [len(a.values) for a in bound]
        for combo in orngMisc.LimitedCounter(sizes):
            label = "-".join(
                [attr.values[v] for attr, v in zip(bound, combo)])
            newVar.values.append(label)
        clsfr = orange.ClassifierByLookupTable(newVar, bound)

    # identity mapping: combination i -> value i of the new variable
    for i in range(len(newVar.values)):
        clsfr.lookupTable[i] = orange.Value(newVar, i)

    newVar.getValueFrom = clsfr

    meas = self.measure(newVar, table) if self.measure else 0
    return newVar, meas
def cloneAndAddCondition(self, attribute, values, used=False, negate=False):
    """Return a rule whose condition on ``attribute`` is set to ``values``.

    Any existing condition at the attribute's position is replaced and
    the conditions are kept sorted by position.  ``values`` may be a single
    value or a list.  When ``values`` is empty or otherwise falsy, this
    rule itself is returned rather than a copy.  ``used`` and ``negate``
    are accepted but not used here.
    """
    existing = list(self.filter.conditions)
    if not values:
        return self
    if not isinstance(values, list):
        values = [values]

    pos = self.data.domain.index(attribute)
    conds = [cond for cond in existing if cond.position != pos]

    wrapped = [orange.Value(attribute, value) for value in values]
    conds.append(
        orange.ValueFilter_discrete(
            position=self.data.domain.index(attribute),
            values=wrapped))
    conds.sort(key=lambda c: c.position)
    return SDRule(self.data, self.targetClass, conds, self.g)
def __call__(self, example, resultType=orange.GetValue):
    """Combine the member classifiers' predictions for ``example``.

    Discrete class: majority vote (first class on ties); probabilities are
    the vote counts divided by the number of classifiers.

    Continuous class: the value is the mean of the members' predictions;
    the distribution gives each predicted value the mean of the mass its
    own member assigned to it (1/len when that lookup fails).

    Returns the value for ``GetValue``, the distribution for
    ``GetProbabilities`` and ``(value, distribution)`` for ``GetBoth``.

    Fix: in the continuous case ``GetProbabilities`` used to return a
    ``(value, distribution)`` tuple because the conditional expression
    bound only to the tuple's second element.
    """
    if self.classVar.varType == orange.VarTypes.Discrete:
        freq = [0.] * len(self.classVar.values)
        for c in self.classifiers:
            freq[int(c(example))] += 1
        index = freq.index(max(freq))
        value = orange.Value(self.classVar, index)
        if resultType == orange.GetValue:
            return value
        n_members = len(self.classifiers)
        freq = [f / n_members for f in freq]
        if resultType == orange.GetProbabilities:
            return freq
        else:
            return (value, freq)

    elif self.classVar.varType == orange.VarTypes.Continuous:
        # Members are asked for both parts whenever probabilities are
        # needed, because the mean value is derived from them as well.
        member_request = (orange.GetBoth
                          if resultType == orange.GetProbabilities
                          else resultType)
        votes = [c(example, member_request) for c in self.classifiers]
        wsum = float(len(self.classifiers))

        if resultType in [orange.GetBoth, orange.GetProbabilities]:
            pred = sum([float(c) for c, p in votes]) / wsum
            from collections import defaultdict
            prob = defaultdict(float)
            for c, p in votes:
                try:
                    prob[float(c)] += p[c] / wsum
                except IndexError:  # p[c] sometimes fails with index error
                    prob[float(c)] += 1.0 / wsum
            prob = orange.ContDistribution(prob)
            if resultType == orange.GetBoth:
                return self.classVar(pred), prob
            return prob
        elif resultType == orange.GetValue:
            pred = sum([float(c) for c in votes]) / wsum
            return self.classVar(pred)
def booleanToOrange(bool, var):
    """Return the ``orange.Value`` of ``var`` named "True" or "False".

    "True" is used when ``bool`` is truthy, "False" otherwise.
    """
    return orange.Value(var, "True" if bool else "False")