Example #1
def load_data():
    # predict_file and the helper module s are defined elsewhere in this project.
    with open(predict_file, 'r', encoding='utf-8') as fh:
        r = csv.reader(fh, delimiter=',', quotechar='"')
        raw_data = np.array(list(r))
    s.print_data("raw", raw_data)
    # Drop the header row and the PO-number column, then drop column 6 of what remains.
    str_data = np.delete(raw_data[1:, 1:], [6], axis=1)
    po_nums = raw_data[1:, 0]

    return s.normalize(str_data.astype('float32')), po_nums
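
The helper module s is not shown in these examples. As a rough, self-contained sketch only, assuming s.normalize performs per-column min-max scaling (an assumption, not confirmed by the source):

# Hypothetical stand-in for s.normalize; the real helper may scale differently.
import numpy as np

def normalize(x):
    col_min = x.min(axis=0)
    col_max = x.max(axis=0)
    span = np.where(col_max - col_min == 0, 1.0, col_max - col_min)  # avoid division by zero
    return (x - col_min) / span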
Example #2
def predict(model_b, model_m, x_data, po_nums):
    # 2-D array: each row holds the fraud probability and the non-fraud probability.
    pdata_b = model_b.predict(x_data)
    #s.print_data("predict Data", pdata_b)
    y_data = pdata_b[:, 0]
    #s.print_data("y_data", y_data)
    #s.print_data("y_data shape0", y_data.shape[0])  # i.e. the number of entries in y_data
    #s.print_data("po_nums", po_nums)
    # Zip into 3-tuples of (fraud probability, PO number, row index 0..n-1) before sorting.
    tup = zip(y_data, po_nums, np.arange(y_data.shape[0]))
    # spo stands for "suspicious purchase order"; sort by probability, descending.
    res = sorted(tup, key=lambda spo: spo[0], reverse=True)
    s.print_data("res desc", res)
    #ret=list()
    count = 0
    idx = 0
    sus_pos = []
    likelihoods = []
    rows = []
    duplicates = set()
    #duplicates.add('0')

    # Keep the top 50 distinct PO numbers, ranked by descending fraud probability.
    while idx < len(res) and count < 50:
        if res[idx][1] in duplicates:
            pass
            #print("%s is already in duplicates ..." %(res[idx][1]))
        else:
            #print("will add %s whose index is %d" %(res[idx][1],res[idx][2]))
            likelihoods.append(res[idx][0])
            sus_pos.append(res[idx][1])
            rows.append(res[idx][2])
            #ret.append(res[idx])
            count += 1
            duplicates.add(res[idx][1])
        idx += 1
    #s.print_data("sus_pos",sus_pos)
    #s.print_data("rows",rows)
    #s.print_data("x_data",x_data)
    #s.print_data("x_data[rows]",x_data[rows])
    # x_data[rows] are the top-ranked suspects; they become the x_data for the multi-class prediction.
    pdata_m = model_m.predict(x_data[rows])
    #s.print_data("pdata_m", pdata_m)
    # The predicted violation type is the class with the highest probability.
    violation_typeids = np.argmax(pdata_m, axis=1)
    #s.print_data("violation_typeids", violation_typeids)
    return sus_pos, likelihoods, violation_typeids
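
The rank-and-dedup selection above can be exercised on its own. A minimal sketch with synthetic values (illustrative only, not project data):

import numpy as np

y_data = np.array([0.9, 0.8, 0.8, 0.4], dtype='float32')  # fraud probabilities from the binary model
po_nums = np.array(['PO1', 'PO2', 'PO1', 'PO3'])           # note the duplicate PO1

res = sorted(zip(y_data, po_nums, np.arange(len(y_data))),
             key=lambda spo: spo[0], reverse=True)
seen, top = set(), []
for prob, po, row in res:
    if po not in seen and len(top) < 50:
        seen.add(po)
        top.append((po, prob, row))
print(top)  # PO1, PO2, PO3 with each PO's highest probability and original row index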
Example #3
def main():
    x_data, po_nums = load_data()
    from keras.models import load_model
    mfile_b = 'model/binary.h5'
    try:
        model_b = load_model(mfile_b)
    except:
        print('model file for binary prediction is not available')
        exit(1)

    mfile_m = 'model/multi.h5'
    try:
        model_m = load_model(mfile_m)
    except:
        print('model file for multi-class prediction is not available')
        exit(2)

    f = open(file2write, 'w', encoding='utf-8')
    pos, likelihoods, vids = predict(model_b, model_m, x_data, po_nums)
    s.print_data("pos", pos)
    s.print_data("likelihoods", likelihoods)
    s.print_data("vids", vids)
    for po, likelihood, vid in zip(pos, likelihoods, vids):
        line = '%s, %f%%, %s' % (po, likelihood * 100, violations[vid])
        print(line)
        f.write(line + '\n')
    f.close()
    import gc
    gc.collect()
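
Neither the violations lookup used here nor the s.violations dict used in process_m below is shown in these examples. A purely hypothetical sketch of the two directions of that mapping, assuming integer ids index into a list of descriptions:

# Hypothetical example values; the real mapping lives in the helper module s.
violations = ['none', 'process violation', 'cost too high']          # id -> description (used as violations[vid] above)
violations_by_name = {name: i for i, name in enumerate(violations)}  # description -> id (how s.violations is used in process_m)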
Example #4
def process_m(data):
    # data[1:, 1:] skips row 0 and column 0, i.e. it drops the header row (row 0)
    # and the purchase-order number column (column 0).
    #print(type(data))
    #shape = data.shape
    #s.print_data("the shape", shape)
    str_data = np.delete(data[1:, 1:], [7], axis=1)  # drop column 7 (the plant column); axis=0 would drop row 7 instead
    #s.print_data("str_data 1", str_data)
    #print("shape[0] is %d" % (str_data.shape[0]))
    line_num = str_data.shape[0]
    for i in range(line_num):
        # A cell may list several violations separated by ';', e.g. "流程违规;成本偏高"
        # (process violation; cost too high); only the first one is used.
        varray = str_data[i, 5].split(';')
        #print("line %d: violation description is %s" % (i, varray[0]))
        vkey = varray[0]
        if vkey in s.violations:
            str_data[i, 5] = s.violations[vkey]
        else:
            print("csv file issue: violation label [%s] near line %d is not in the violations dict" % (vkey, i))
            s.print_data("violations dictionary", s.violations)
            exit()
    #s.print_data("str_data 2", str_data)
    ret = str_data.astype('float32')
    #print("length is %d" % (ret.shape[0]))
    return ret
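
The s.violations dict itself is not shown. As a self-contained illustration only, with hypothetical keys and ids rather than the project's actual mapping, the encoding step amounts to:

# Hypothetical violations dict for illustration; the real one lives in the helper module s.
violations = {'流程违规': 1, '成本偏高': 2}

cell = '流程违规;成本偏高'      # one CSV cell listing two violations
vkey = cell.split(';')[0]       # keep only the first listed violation
if vkey in violations:
    code = violations[vkey]     # -> 1
else:
    raise ValueError('unknown violation label: %s' % vkey)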
Example #5
def cross_validation(x_data, y_data):
    #print(x_data.shape[0] // 10 * 9, 'train samples')
    #print(x_data.shape[0] // 10, 'test samples')
    print(x_data.shape[0] // const_folds * (const_folds-1), 'train samples')
    print(x_data.shape[0] // const_folds, 'test samples')
    debugflag = False
    s.print_data("const_folds", const_folds)
    loss = []
    accuracy = []
    kzipped = kfold(x_data.shape[0], const_folds)
    #s.print_data("kfold", kzipped)
    cur = 0
    for train_idx, test_idx in kzipped:
        x_train, x_test = x_data[train_idx], x_data[test_idx]
        y_train, y_test = y_data[train_idx], y_data[test_idx]
        
        if debugflag:
            print("===================== train_test iteration [%d]" %(cur))
            s.print_data("train_idx",train_idx)
            s.print_data("x_train", x_train)
            s.print_data("y_train", y_train)
            s.print_data("test_idx",test_idx)
            s.print_data("x_test", x_test)
            s.print_data("y_test", y_test)
        
        score = train_test(x_train, y_train, x_test, y_test, cur)
        if debugflag:
            s.print_data("score", score)
        loss.append(score[0])
        accuracy.append(score[1])
        if debugflag:
            s.print_data("loss", loss)
            s.print_data("accuracy", accuracy)
            print("\n\n\n")
            exit("won't go to next iteration")
        s.print_data("score", score)
        
        cur += 1

    print("\n----------------------------------------\n")
    print("run #\ttest loss\ttest accuracy") #画表头

    total_loss = total_accuracy = 0.0
    for i in range(const_folds):
        print("%d\t%f\t%f" % (i, loss[i], accuracy[i]))
        total_loss += loss[i]; total_accuracy += accuracy[i]

    print("\naverage loss:", total_loss/const_folds)
    print("average accuracy:", total_accuracy/const_folds)