def disc_gain_rt(self, index, data):
    '''Compute the gain ratio of a discrete attribute.

    Builds a per-value label histogram for column ``index`` and hands it to
    ``cal_gain_ratio`` together with ``data``.

    Histogram layout passed to ``cal_gain_ratio``::

        {
            attr_value1: {label_a: count, label_b: count},
            attr_value2: {label_a: count, label_b: count},
            ...
        }

    Args:
        index: Position of the attribute within each row; also the key used
            to look up the attribute's declared values in ``self.disc_type``.
        data: Iterable of rows; the last element of a row is its class label.

    Returns:
        ``info_gain / info_measure`` as reported by ``cal_gain_ratio``, or
        ``-1`` when ``cal_gain_ratio`` reports ``info_measure == -1``
        (presumably its "unusable split" marker -- confirm against
        ``cal_gain_ratio``).

    Raises:
        KeyError: If a row holds a value for this attribute that is not
            listed in ``self.disc_type[index]``.
    '''
    # Seed one (possibly empty) histogram per declared value so that values
    # which never occur in ``data`` are still present in the result.
    label_counts = {value: {} for value in self.disc_type[index]}

    for row in data:
        per_label = label_counts[row[index]]
        label = row[-1]
        per_label[label] = per_label.get(label, 0) + 1

    info_gain, info_measure = cal_gain_ratio(label_counts, data)
    if info_measure == -1:
        return -1
    return info_gain / info_measure
def num_gain_rt(self, index, data):
    '''Pick the best binary split threshold for a continuous attribute.

    Rows are sorted by column ``index``. Every value of that column at which
    the class label (last element of the row) differs from the label of the
    previous row becomes a candidate threshold. Each candidate is scored via
    ``binary_sp`` and ``cal_gain_ratio``; the candidate with the largest
    information gain is kept.

    NOTE(review): a second definition of ``num_gain_rt`` with the same body
    follows this one in the file; if both sit in the same scope, the later
    one replaces this copy.

    Args:
        index: Position of the continuous attribute within each row.
        data: Iterable of rows; the last element of a row is its class label.

    Returns:
        A ``(gain_ratio, border)`` tuple: the gain ratio of the winning
        candidate and its threshold value. ``(-1.0, 0.0)`` is returned when
        ``data`` is empty or when no candidate qualifies.
    '''
    ordered = sorted(data, key=itemgetter(index))
    if not ordered:
        # Nothing to split: report the same "no split" result that is
        # produced when no candidate threshold exists.
        return -1.0, 0.0

    # Only values where the label changes between neighbouring rows are
    # considered as thresholds.
    candidates = {
        cur[index]
        for prev, cur in zip(ordered, ordered[1:])
        if cur[-1] != prev[-1]
    }

    # NOTE(review): sys.float_info.min is the smallest *positive* normalized
    # float, not the most negative one, so candidates whose gain does not
    # exceed that tiny positive value are never selected. Kept unchanged
    # because callers may depend on zero-gain splits being rejected.
    max_gain, border, gain_ratio = sys.float_info.min, 0.0, -1.0
    for ctg in candidates:
        # Layout per the original author's note:
        # {'left': {label: count, ...}, 'right': {label: count, ...}}
        left, right = binary_sp(data, ctg, index)
        statisc_dict = {'left': left, 'right': right}
        info_gain, info_measure = cal_gain_ratio(statisc_dict, data)
        if info_measure == -1:
            # cal_gain_ratio flagged this candidate with its -1 marker.
            continue
        if info_gain > max_gain:
            max_gain = info_gain
            border = ctg
            gain_ratio = info_gain / info_measure
    return gain_ratio, border
def num_gain_rt(self, index, data):
    '''Pick the best binary split threshold for a continuous attribute.

    Rows are sorted by column ``index``. Every value of that column at which
    the class label (last element of the row) differs from the label of the
    previous row becomes a candidate threshold. Each candidate is scored via
    ``binary_sp`` and ``cal_gain_ratio``; the candidate with the largest
    information gain is kept.

    NOTE(review): another definition of ``num_gain_rt`` with the same body
    precedes this one in the file; if both sit in the same scope, this later
    definition is the one that takes effect and the earlier copy is dead.

    Args:
        index: Position of the continuous attribute within each row.
        data: Iterable of rows; the last element of a row is its class label.

    Returns:
        A ``(gain_ratio, border)`` tuple: the gain ratio of the winning
        candidate and its threshold value. ``(-1.0, 0.0)`` is returned when
        ``data`` is empty or when no candidate qualifies.
    '''
    ordered = sorted(data, key=itemgetter(index))
    if not ordered:
        # Nothing to split: report the same "no split" result that is
        # produced when no candidate threshold exists.
        return -1.0, 0.0

    # Only values where the label changes between neighbouring rows are
    # considered as thresholds.
    candidates = {
        cur[index]
        for prev, cur in zip(ordered, ordered[1:])
        if cur[-1] != prev[-1]
    }

    # NOTE(review): sys.float_info.min is the smallest *positive* normalized
    # float, not the most negative one, so candidates whose gain does not
    # exceed that tiny positive value are never selected. Kept unchanged
    # because callers may depend on zero-gain splits being rejected.
    max_gain, border, gain_ratio = sys.float_info.min, 0.0, -1.0
    for ctg in candidates:
        # Layout per the original author's note:
        # {'left': {label: count, ...}, 'right': {label: count, ...}}
        left, right = binary_sp(data, ctg, index)
        statisc_dict = {'left': left, 'right': right}
        info_gain, info_measure = cal_gain_ratio(statisc_dict, data)
        if info_measure == -1:
            # cal_gain_ratio flagged this candidate with its -1 marker.
            continue
        if info_gain > max_gain:
            max_gain = info_gain
            border = ctg
            gain_ratio = info_gain / info_measure
    return gain_ratio, border