def load_data():
    """Load the prediction CSV and return model-ready features plus PO numbers.

    Reads ``predict_file``, drops the header row (row 0), the purchase-order
    number column (col 0) and feature column 6, then normalizes the rest.

    Returns:
        tuple: (normalized float32 feature matrix from ``s.normalize``,
                1-D array of purchase-order numbers).
    """
    # Context manager closes the handle deterministically (the original
    # left the file open); newline='' is the csv-module convention.
    with open(predict_file, 'r', encoding='utf-8', newline='') as fh:
        reader = csv.reader(fh, delimiter=',', quotechar='"')
        raw_data = np.array(list(reader))
    s.print_data("raw", raw_data)
    # raw_data[1:, 1:] skips the header row and the PO-number column;
    # column 6 of the remaining block is dropped as well.
    str_data = np.delete(raw_data[1:, 1:], [6], axis=1)
    po_nums = raw_data[1:, 0]
    return s.normalize(str_data.astype('float32')), po_nums
def predict(model_b, model_m, x_data, po_nums, top_n=50):
    """Rank records by fraud likelihood and classify the top suspects.

    Runs the binary model over all rows, keeps the ``top_n`` most
    suspicious *unique* purchase orders (several rows may share one PO),
    then runs the multi-class model on just those rows to get a
    violation-type id for each.

    Args:
        model_b: binary model; ``predict()`` returns per-row pairs of
            [fraud probability, non-fraud probability].
        model_m: multi-class model applied to the suspicious rows only.
        x_data: normalized feature matrix, one row per record.
        po_nums: purchase-order number for each row of ``x_data``.
        top_n: number of unique POs to report (default 50, the previously
            hard-coded limit — kept as default for backward compatibility).

    Returns:
        tuple: (suspicious PO numbers, their fraud likelihoods,
                argmax violation-type ids from ``model_m``).
    """
    pdata_b = model_b.predict(x_data)
    # Column 0 is the fraud probability.
    y_data = pdata_b[:, 0]
    # Triples of (likelihood, PO number, original row index), sorted by
    # likelihood, highest first.  spo = suspicious purchase order.
    tup = zip(y_data, po_nums, np.arange(y_data.shape[0]))
    res = sorted(tup, key=lambda spo: spo[0], reverse=True)
    s.print_data("res desc", res)

    sus_pos = []
    likelihoods = []
    rows = []
    seen = set()  # PO numbers already reported, to skip duplicates
    idx = 0
    while idx < len(res) and len(sus_pos) < top_n:
        likelihood, po, row = res[idx]
        if po not in seen:
            likelihoods.append(likelihood)
            sus_pos.append(po)
            rows.append(row)
            seen.add(po)
        idx += 1

    # Only the highly-ranked rows are fed to the multi-class model.
    pdata_m = model_m.predict(x_data[rows])
    violation_typeids = np.argmax(pdata_m, axis=1)
    return sus_pos, likelihoods, violation_typeids
def main():
    """Load data, run binary + multi-class fraud prediction, and write the
    suspicious POs with likelihood and violation type to ``file2write``."""
    x_data, po_nums = load_data()
    from keras.models import load_model

    mfile_b = 'model/binary.h5'
    try:
        model_b = load_model(mfile_b)
    # Narrowed from a bare except: so SystemExit/KeyboardInterrupt pass through.
    except Exception:
        print('model file for binary prediction is not available')
        exit(1)
    mfile_m = 'model/multi.h5'
    try:
        model_m = load_model(mfile_m)
    except Exception:
        print('model file for multi-cast prediction is not available')
        exit(2)
    # NOTE: a redundant second load_model('model/multi.h5') call was removed
    # here — the model is already loaded by the try block above.

    pos, likelihoods, vids = predict(model_b, model_m, x_data, po_nums)
    s.print_data("pos", pos)
    s.print_data("likelihoods", likelihoods)
    s.print_data("vids", vids)
    # 'with' guarantees the report file is closed even if the loop raises.
    with open(file2write, 'w', encoding='utf-8') as f:
        for po, likelihood, vid in zip(pos, likelihoods, vids):
            line = '%s, %f%%, %s\n' % (po, likelihood * 100, violations[vid])
            print(line)
            f.write(line)
    import gc
    gc.collect()
def process_m(data):
    """Prepare the multi-class training matrix from the raw CSV array.

    Drops the header row, the purchase-order-number column and the factory
    column, then replaces each violation description (column 5 of the
    remainder) with its numeric id from ``s.violations``.

    Args:
        data: 2-D string array; row 0 is the header, column 0 the PO number.

    Returns:
        float32 ndarray ready for training.
    """
    # data[1:, 1:] skips the header row (row 0) and the PO-number column
    # (col 0); column 7 of the remainder — the factory column — is removed
    # too (axis=1 deletes a column, axis=0 would delete a row).
    str_data = np.delete(data[1:, 1:], [7], axis=1)
    line_num = str_data.shape[0]
    for i in range(line_num):
        # A ';' separates multiple descriptions, e.g. "流程违规;成本偏高";
        # only the first one is used as the lookup key.
        varray = str_data[i, 5].split(';')
        vkey = varray[0]
        if vkey in s.violations:
            str_data[i, 5] = s.violations[vkey]
        else:
            print("csv file issue, it contains a violation bit that's not existent in violation dict [%s], line num is about %d" % (vkey, i))
            s.print_data("violations dictionary", s.violations)
            # Exit with a non-zero status: the previous bare exit() reported
            # success (status 0) on this error path.
            exit(1)
    return str_data.astype('float32')
def cross_validation(x_data, y_data):
    """Run const_folds-fold cross-validation over (x_data, y_data).

    Each fold is trained/evaluated via train_test(); per-fold test loss
    and accuracy are collected and printed as a table, followed by their
    averages.
    """
    n_samples = x_data.shape[0]
    print(n_samples // const_folds * (const_folds-1), 'train samples')
    print(n_samples // const_folds, 'test samples')
    debug = False
    s.print_data("const_folds", const_folds)

    loss = []
    accuracy = []
    splits = kfold(n_samples, const_folds)

    for fold_idx, (train_idx, test_idx) in enumerate(splits):
        x_train, x_test = x_data[train_idx], x_data[test_idx]
        y_train, y_test = y_data[train_idx], y_data[test_idx]
        if debug:
            print("===================== train_test iteration [%d]" % (fold_idx))
            s.print_data("train_idx", train_idx)
            s.print_data("x_train", x_train)
            s.print_data("y_train", y_train)
            s.print_data("test_idx", test_idx)
            s.print_data("x_test", x_test)
            s.print_data("y_test", y_test)

        score = train_test(x_train, y_train, x_test, y_test, fold_idx)
        if debug:
            s.print_data("score", score)
        loss.append(score[0])
        accuracy.append(score[1])
        if debug:
            # Debug mode inspects a single fold and stops.
            s.print_data("loss", loss)
            s.print_data("accuracy", accuracy)
            print("\n\n\n")
            exit("won't go to next iteration")
        s.print_data("score", score)
        print("\n----------------------------------------\n")

    # Summary table: one row per fold, then the averages.
    print("run #\ttest loss\ttest accuracy")
    for i, (fold_loss, fold_acc) in enumerate(zip(loss, accuracy)):
        print("%d\t%f\t%f" % (i, fold_loss, fold_acc))
    print("\naverage loss:", sum(loss) / const_folds)
    print("average accuracy:", sum(accuracy) / const_folds)