def one_line(line):
    """Convert one tab-separated record into a ``label<TAB>features`` line.

    The record is split on tabs, every field is stripped, and the list is
    truncated to ``len(FEA.fea_number_dict)`` columns.  The label is read
    from the column registered for ``label_name``; every other column is
    handed to the transformer selected by its configured method.  Truthy
    results are sorted by the integer value of their first element and each
    one is rendered as its elements joined with ``":"``.

    NOTE(review): the original indentation was lost; the extent of the
    ``TimeRecord("initial")`` block is reconstructed -- confirm.

    :param line: raw tab-separated record
    :return: ``"<label>\\t<features>"``; ``"0\\t<max_feature_id_num>:0"`` when
        the label cannot be read; ``"<label>\\t"`` when feature building fails
    """
    with cst.TimeRecord("initial") as _:
        stripped = [field.strip() for field in line.split("\t")]
        fea = stripped[:len(FEA.fea_number_dict)]
        try:
            label_field = fea[FEA.fea_number_dict[label_name] - 1]
            one_lable = str(cst.safe_int(label_field))
        except Exception as e:
            # Dump everything needed to diagnose a malformed record.
            print(e)
            print("%s %s %s" % (len(FEA.fea_number_dict), len(fea), line))
            print(FEA.fea_number_dict)
            print(fea)
            return '\t'.join(["0", max_feature_id_num + ":0"])

    def one_fea(indexed_value):
        """Transform one ``(n, fea_value)`` pair.

        ``n`` is the 1-based column number used to look the feature up.
        Returns ``None`` for the label column.
        """
        n, fea_value = indexed_value
        fea_name = FEA.num_fea_dict[n]
        fc = FEA.fea_conf[fea_name]
        fun_key = {
            "cate": normal,
            "origin": normal,
            "number": normal,
            "none": none,
            "pair": pair
        }
        if fc.name == cst.label_name:
            return None
        handler = fun_key[fc.method.split("#")[0]]
        return handler(fc, fea_value, fea)

    def render(items):
        """Sort by integer id and join each item's elements with ':'."""
        ordered = sorted(items, key=lambda item: int(item[0]))
        return [":".join(map(str, item)) for item in ordered]

    try:
        # Pad short records with zeros up to max_len columns.
        padded = fea + [0] * (max_len - len(fea))
        produced = [one_fea(pair_) for pair_ in enumerate(padded, start=1)]
        rs = [item for item in produced if item]
        # When the last result does not already carry the maximum feature
        # id, a trailing "<max id>:0" token is appended.
        ends_with_max = rs and max_feature_id_num == rs[-1][0]
        tokens = render(rs)
        if not ends_with_max:
            tokens = tokens + [max_feature_id_num + ":0"]
        data_line = " ".join(tokens)
    except Exception as e:
        print(e)
        return '\t'.join([one_lable, ""])
    return '\t'.join([one_lable, data_line])
def process_origin(conf, value, frac):
    """Return the smallest and largest retained value of a feature.

    Values are read from ``FEA.name_values_dict[conf.name]``.  Entries that
    fail ``cst.isfloat`` are dropped, the rest are stripped, sorted
    numerically and passed through ``wash_data``.  When ``conf.filter_threds``
    is set, values lying inside the closed interval
    ``[filter_threds[0], filter_threds[1]]`` are removed.

    :param conf: feature configuration (``name`` and ``filter_threds`` are read)
    :param value: unused here
    :param frac: unused here
    :return: two-element list ``[min, max]``, each rounded to three decimals
        and rendered with ``str(float(...))``
    :raises IndexError: when no value survives the filtering
    """
    raw_values = FEA.name_values_dict[conf.name]
    numeric = [item.strip() for item in raw_values if cst.isfloat(item)]
    numeric.sort(key=float)
    washed = [item for item in numeric if wash_data(item)]

    if conf.filter_threds:
        kept = [
            item for item in washed
            if not (float(conf.filter_threds[0]) <= float(item)
                    <= float(conf.filter_threds[1]))
        ]
    else:
        kept = washed

    extremes = (kept[0], kept[-1])
    return [str(float('%0.3f' % float(edge))) for edge in extremes]
def process_number(conf, value, frac):
    """Compute sorted, de-duplicated split points for a numeric feature.

    Values come from ``FEA.name_values_dict[conf.name]`` and are cleaned the
    same way as in ``process_origin`` (float check, strip, numeric sort,
    ``wash_data``, optional removal of the ``filter_threds`` interval).

    :param conf: feature configuration (``name`` and ``filter_threds`` are read)
    :param value: strategy suffix; ``"freq"`` or ``"dis"`` select the matching
        ``equal_*`` helper, anything else falls back to ``equal_freq``
    :param frac: number of partitions handed to the strategy
    :return: list of split points as strings, rounded to three decimals,
        unique and in ascending numeric order
    """
    raw_values = FEA.name_values_dict[conf.name]
    numeric = sorted([item.strip() for item in raw_values if cst.isfloat(item)],
                     key=float)
    washed = [item for item in numeric if wash_data(item)]
    if conf.filter_threds:
        sorted_v = [
            item for item in washed
            if not (float(conf.filter_threds[0]) <= float(item)
                    <= float(conf.filter_threds[1]))
        ]
    else:
        sorted_v = washed

    def equal_freq(v, frac):
        """Pick every ``len(v) / frac``-th value from index 1, plus the last."""
        picked = [v[index] for index in range(1, len(v), len(v) / frac)]
        picked.append(v[-1])
        return sorted(set(picked), key=float)

    def equal_dis(v, frac):
        """Return ``frac + 1`` evenly spaced points from the low to the high end."""
        low, high = float(v[0]), float(v[-1])
        # A negative lower bound is replaced by 0.
        low = low if low >= 0 else 0
        points = [str(low + i * (high - low) / frac)
                  for i in range(int(frac) + 1)]
        return sorted(remove_dup(points), key=float)

    strategies = {"equal_freq": equal_freq, "equal_dis": equal_dis}
    data_huafen = strategies.get("equal_{key}".format(key=value), None)
    if not data_huafen:
        data_huafen = equal_freq

    rounded = [str(float('%0.3f' % float(point)))
               for point in data_huafen(sorted_v, frac)]
    return sorted(set(rounded), key=float)
def one_conf(conf_and_values):
    """Build, print and return the configuration row for one feature.

    :param conf_and_values: ``(conf, v)`` pair -- the feature configuration
        and the raw values collected for that feature
    :return: comma-separated row
        ``name,method,filter_threds,status,args``
    """
    conf, v = conf_and_values

    def print_conf(conf, values, is_exists=False):
        """Render the row, echo it to stdout and return it.

        With ``is_exists`` set, the stored ``conf.ars`` is emitted instead of
        the freshly computed argument list.
        """
        if conf.method != "none" and conf.name != cst.label_name:
            joined = '#'.join([item.strip() for item in values])
            # Commas would break the CSV row, so they are encoded.
            ars_list = joined.replace(",", "0x32")
        else:
            ars_list = ""
        columns = [
            conf.name,
            conf.method,
            conf.filter_threds if conf.filter_threds else "",
            conf.status,
            conf.ars if is_exists else ars_list,
        ]
        row = ','.join(map(str, columns))
        print(row)
        return row

    if len(conf.ars) > 0:
        # Arguments were already computed; re-emit the row unchanged.
        print("%s %s" % ("conf.ars already exists", conf.ars))
        return print_conf(conf, [], True)

    # e.g. "number freq 5" -> key, value, frac
    key, value, frac = cst.parse_method(conf.method)
    rs = []
    if key != "none":
        # Dispatch to the module-level process_<key> function.
        data_method = globals().get("process_{key}".format(key=key))
        rs = data_method(conf, value, frac, v)
    return print_conf(conf, rs)
def origin(self, fea_name, fea_value):
    """Add a raw-valued feature when the value lies inside its configured range.

    The two entries of the feature's ``arrs_list`` are used as lower and upper
    bound; bounds and value are rounded to three decimals before comparing.

    :param fea_name: key into ``self.fea_conf``
    :param fea_value: raw value; cleaned with ``cst.wash`` before conversion
    :return: result of ``self.add_feature`` when ``low <= value <= high``,
        otherwise ``None``
    """
    cf = self.fea_conf[fea_name]
    low, high = [float('%0.3f' % float(bound)) for bound in cf.arrs_list]
    cleaned = float('%0.3f' % float(cst.wash(fea_value)))
    if not (low <= cleaned <= high):
        return None
    return self.add_feature(fea_name, cleaned)
def read_conf(self):
    """Load the feature configuration file into the lookup tables.

    Every line of ``self.config`` that does not start with ``#`` is split on
    commas into a ``Conf``.  The feature id is the 1-based line number of the
    row, so comment lines still consume an id.

    Populates ``name_conf_dict``, ``name_id_dict``, ``id_name_dict`` and
    resets ``name_values_dict`` to an empty list per feature.

    NOTE(review): the original indentation was lost; the two derived
    dictionaries are assumed to be built once, after the loop -- confirm.
    """
    with codecs.open(self.config, 'r', 'utf8') as f:
        for line_no, row in enumerate(f.readlines(), start=1):
            if row.startswith("#"):
                # Skip comment lines.
                continue
            cf = Conf(*row.split(","))
            method_key = cst.parse_method(cf.method)[0]
            cf.method_arr_list = gen_arr_list_by_method_name(method_key, cf.args)
            self.name_conf_dict[cf.name] = cf
            self.name_id_dict[cf.name] = line_no

    # Reverse index: feature id -> feature name.
    self.id_name_dict = {
        feature_id: name for name, feature_id in self.name_id_dict.items()
    }
    # Initialise the per-feature value list.
    self.name_values_dict = {name: [] for name in self.name_id_dict}
def read_conf(self):
    """Load the feature configuration file into the lookup tables.

    Every line of ``self.config`` that does not start with ``#`` is echoed to
    stdout and split on commas into a ``Conf``.  The feature number is the
    1-based line number of the row, so comment lines still consume a number.

    Populates ``fea_conf``, ``fea_number_dict``, ``num_fea_dict`` and resets
    ``fea_number_value_list`` to an empty list per feature.

    NOTE(review): the original indentation was lost; the two derived
    dictionaries are assumed to be built once, after the loop -- confirm.
    """
    with codecs.open(self.config, 'r', 'utf8') as f:
        for line_no, row in enumerate(f.readlines(), start=1):
            if row.startswith("#"):
                # Skip comment lines.
                continue
            print(row)
            cf = Conf(*row.split(","))
            method_key = cst.parse_method(cf.method)[0]
            cf.arrs_list = gen_cates(method_key, cf.ars)
            self.fea_conf[cf.name] = cf
            self.fea_number_dict[cf.name] = line_no

    # Reverse index: feature number -> feature name.
    self.num_fea_dict = {
        number: name for name, number in self.fea_number_dict.items()
    }
    self.fea_number_value_list = {name: [] for name in self.fea_number_dict}
def process_origin(conf, value, frac, v):
    """Return the smallest and largest retained value among ``v``.

    Entries that fail ``cst.isfloat`` are dropped, the rest are stripped,
    sorted numerically and passed through ``wash_data``.  When
    ``conf.filter_threds`` is set, values lying inside the closed interval
    ``[filter_threds[0], filter_threds[1]]`` are removed.

    :param conf: feature configuration (``filter_threds`` is read)
    :param value: unused here
    :param frac: unused here
    :param v: raw values collected for the feature
    :return: two-element list ``[min, max]``, each rounded to three decimals
        and rendered with ``str(float(...))``
    :raises IndexError: when no value survives the filtering
    """
    numeric = [item.strip() for item in v if cst.isfloat(item)]
    numeric.sort(key=float)
    washed = [item for item in numeric if wash_data(item)]

    if conf.filter_threds:
        kept = [
            item for item in washed
            if not (float(conf.filter_threds[0]) <= float(item)
                    <= float(conf.filter_threds[1]))
        ]
    else:
        kept = washed

    extremes = (kept[0], kept[-1])
    return [str(float('%0.3f' % float(edge))) for edge in extremes]
def read_conf(self):
    """Read ``self.config`` and fill the feature lookup tables.

    Lines starting with ``#`` are ignored.  Every other line is printed,
    split on commas and turned into a ``Conf`` whose ``arrs_list`` is derived
    from its method key and ``ars`` field.  A feature's number is the 1-based
    position of its line in the file (comment lines included in the count).

    Sets ``fea_conf``, ``fea_number_dict``, ``num_fea_dict`` and
    ``fea_number_value_list`` (one empty list per feature).

    NOTE(review): the original indentation was lost; the two derived
    dictionaries are assumed to be built once, after the loop -- confirm.
    """
    with codecs.open(self.config, 'r', 'utf8') as f:
        rows = f.readlines()
        for position, row in enumerate(rows):
            if row.startswith("#"):
                # Skip comment lines.
                continue
            number = position + 1
            print(row)
            cf = Conf(*row.split(","))
            cf.arrs_list = gen_cates(cst.parse_method(cf.method)[0], cf.ars)
            self.fea_conf[cf.name] = cf
            self.fea_number_dict[cf.name] = number

    inverse = {}
    for name in self.fea_number_dict:
        inverse[self.fea_number_dict[name]] = name
    self.num_fea_dict = inverse
    self.fea_number_value_list = {name: list() for name in self.fea_number_dict}
def one_conf(conf):
    """Compute the arguments of one feature and append its row to the output.

    The row ``name,method,filter_threds,status,args`` is appended to
    ``FEAS_RECONSTRUCT_FILE`` and echoed to stdout.  Features whose method is
    ``"none"`` and the label feature are written with an empty argument list.

    :param conf: feature configuration (``name``, ``method``,
        ``filter_threds`` and ``status`` are read)
    :return: ``None``
    """

    def print_conf(conf, values):
        """Write one configuration row to the file and echo the same row."""
        threds = "#".join(conf.filter_threds) if conf.filter_threds else ""
        # Commas would break the CSV row, so they are encoded.
        args = '#'.join([item.strip() for item in values]).replace(",", "0x32")
        # Build the row once so the echoed line always matches the written
        # one; previously the echo swapped the status and filter columns.
        row = ','.join(
            map(str, [conf.name, conf.method, threds, conf.status, args]))
        with codecs.open(FEAS_RECONSTRUCT_FILE, 'a', 'utf8') as f:
            f.write(row + '\n')
        print(row)

    if conf.method == "none" or conf.name == cst.LABEL:
        print_conf(conf, [])
        return

    key, value, frac = cst.parse_method(conf.method)
    # Dispatch to the module-level process_<key> function.
    data_method = globals().get("process_{key}".format(key=key))
    rs = data_method(conf, value, frac)
    print_conf(conf, rs)
def process_number(conf, value, frac, v):
    """Compute sorted, de-duplicated split points for a numeric feature.

    Entries of ``v`` that fail ``cst.isfloat`` are dropped; the rest are
    stripped, sorted numerically and passed through ``wash_data``.  When
    ``conf.filter_threds`` is set, only values accepted by
    ``conf.filter_method`` are kept.

    :param conf: feature configuration (``filter_threds`` and
        ``filter_method`` are read)
    :param value: strategy suffix; ``"freq"`` or ``"dis"`` select the matching
        ``equal_*`` helper, anything else falls back to ``equal_freq``
    :param frac: number of partitions handed to the strategy
    :param v: raw values collected for the feature
    :return: list of split points as strings, rounded to three decimals,
        unique and in ascending numeric order
    """
    numeric = sorted([item.strip() for item in v if cst.isfloat(item)],
                     key=float)
    after_sorted = [item for item in numeric if wash_data(item)]
    if conf.filter_threds:
        sorted_v = [item for item in after_sorted if conf.filter_method(item)]
    else:
        sorted_v = after_sorted

    def equal_freq(v, frac):
        """Pick every ``len(v) // frac``-th value from index 1, plus the last."""
        try:
            # With fewer values than partitions the quotient is 0, which
            # range() rejects as a step; fall back to visiting every value.
            step = max(1, len(v) // frac)
            picked = [v[index] for index in range(1, len(v), step)]
            picked.append(v[-1])
            return sorted(set(picked), key=float)
        except Exception:
            # Log the offending input, then surface the real error instead
            # of failing later on an unbound result variable.
            print("%s %s %s" % (v, 'v', "frac is {0}".format(frac)))
            raise

    def equal_dis(v, frac):
        """Return ``frac + 1`` evenly spaced points from the low to the high end."""
        low, high = float(v[0]), float(v[-1])
        # A negative lower bound is replaced by 0.
        low = low if low >= 0 else 0
        points = [str(low + i * (high - low) / frac)
                  for i in range(int(frac) + 1)]
        return sorted(remove_dup(points), key=float)

    strategies = {"equal_freq": equal_freq, "equal_dis": equal_dis}
    data_huafen = strategies.get("equal_{key}".format(key=value), None)
    if not data_huafen:
        data_huafen = equal_freq

    rounded = [str(float('%0.3f' % float(point)))
               for point in data_huafen(sorted_v, frac)]
    return sorted(set(rounded), key=float)
def one_line(line):
    """Convert one tab-separated record into a ``label<TAB>features`` line.

    The record is split on tabs, every field is stripped, and the list is
    truncated to ``len(FEA.fea_number_dict)`` columns.  The label is read
    from the column registered for ``label_name``; every other column is
    handed to the transformer selected by its configured method.  Truthy
    results are sorted by the integer value of their first element and each
    one is rendered as its elements joined with ``":"``.

    NOTE(review): the original indentation was lost; the extent of the
    ``TimeRecord("initial")`` block is reconstructed -- confirm.

    :param line: raw tab-separated record
    :return: ``"<label>\\t<features>"``; ``"0\\t<max_feature_id_num>:0"`` when
        the label cannot be read; ``"<label>\\t"`` when feature building fails
    """
    with cst.TimeRecord("initial") as _:
        stripped = [field.strip() for field in line.split("\t")]
        fea = stripped[:len(FEA.fea_number_dict)]
        try:
            label_field = fea[FEA.fea_number_dict[label_name] - 1]
            one_lable = str(cst.safe_int(label_field))
        except Exception as e:
            # Dump everything needed to diagnose a malformed record.
            print(e)
            print("%s %s %s" % (len(FEA.fea_number_dict), len(fea), line))
            print(FEA.fea_number_dict)
            print(fea)
            return '\t'.join(["0", max_feature_id_num + ":0"])

    def one_fea(indexed_value):
        """Transform one ``(n, fea_value)`` pair.

        ``n`` is the 1-based column number used to look the feature up.
        Returns ``None`` for the label column.
        """
        n, fea_value = indexed_value
        fea_name = FEA.num_fea_dict[n]
        fc = FEA.fea_conf[fea_name]
        fun_key = {"cate": normal, "origin": normal, "number": normal,
                   "none": none, "pair": pair}
        if fc.name == cst.label_name:
            return None
        handler = fun_key[fc.method.split("#")[0]]
        return handler(fc, fea_value, fea)

    def render(items):
        """Sort by integer id and join each item's elements with ':'."""
        ordered = sorted(items, key=lambda item: int(item[0]))
        return [":".join(map(str, item)) for item in ordered]

    try:
        # Pad short records with zeros up to max_len columns.
        padded = fea + [0] * (max_len - len(fea))
        produced = [one_fea(pair_) for pair_ in enumerate(padded, start=1)]
        rs = [item for item in produced if item]
        # When the last result does not already carry the maximum feature
        # id, a trailing "<max id>:0" token is appended.
        ends_with_max = rs and max_feature_id_num == rs[-1][0]
        tokens = render(rs)
        if not ends_with_max:
            tokens = tokens + [max_feature_id_num + ":0"]
        data_line = " ".join(tokens)
    except Exception as e:
        print(e)
        return '\t'.join([one_lable, ""])
    return '\t'.join([one_lable, data_line])
def normal(fc, fea_value, fea):
    """Apply the ``FEA`` transformer named by the feature's method key.

    The first element returned by ``cst.parse_method(fc.method)`` is used as
    the name of the attribute to call on ``FEA``.

    :param fc: feature configuration (``method`` and ``name`` are read)
    :param fea_value: raw value of the feature
    :param fea: full list of field values of the record; unused here, kept so
        the signature matches the call ``handler(fc, fea_value, fea)``
    :return: whatever the selected ``FEA`` method returns
    """
    method_name = cst.parse_method(fc.method)[0]
    # Use the getattr builtin rather than calling the dunder directly.
    transform = getattr(FEA, method_name)
    return transform(fc.name, fea_value)
def pre_cal(self):
    """Precompute ``self.method_arr_list`` from the method key and ``self.args``."""
    method_key = cst.parse_method(self.method)[0]
    self.method_arr_list = gen_arr_list_by_method_name(method_key, self.args)
return fun_key[fc.method.split("#")[0]](fc, fea_value, fea) try: rs = filter( lambda x: x, map(one_fea, enumerate(fea + [0] * (max_len - len(fea)), start=1))) if rs and max_feature_id_num == rs[-1][0]: data_line = " ".join( map(lambda x: ":".join(map(str, x)), sorted(rs, key=lambda x: int(x[0])))) else: data_line = " ".join( map(lambda x: ":".join(map(str, x)), sorted(rs, key=lambda x: int(x[0]))) + [max_feature_id_num + ":0"]) except Exception as e: print e return '\t'.join([one_lable, ""]) else: return '\t'.join([one_lable, data_line]) import time t = time.time() with cst.TimeRecord("total") as _: pool = mp.Pool(32) rs = filter(lambda x: x, pool.map(one_line, data)) with codecs.open(feature_lines, 'w', 'utf8') as f: f.write('\n'.join(rs))
def pre_cal(self):
    """Precompute ``self.arrs_list`` from the method key and ``self.ars``."""
    method_key = cst.parse_method(self.method)[0]
    self.arrs_list = gen_cates(method_key, self.ars)