Example #1
0
        def wrapper(db, feature_type, sample, data):
            """Predict per-class likelihoods for *sample* with a stored classifier.

            Loads the classifier (and an optional PCA model) identified by
            ``generate_clf_id(algorithm, feature_type, data)`` from the
            ``classifiers`` collection, computes likelihoods via the enclosing
            ``func``, writes them back to the sample's document in
            ``db[feature_type]``, and returns a JSON-style result dict.

            NOTE: ``algorithm`` and ``func`` are free variables captured from
            the enclosing scope.
            """
            clf_id = generate_clf_id(algorithm, feature_type, data)

            # Load the classifier record and its serialized model.
            collection = db["classifiers"]
            try:
                record = collection.find_one({'_id': clf_id})
                if record is None:
                    return error_json("No classifier was found.")

                print("load %s..." % record['clf'])
                clf = load_model(record['clf'])
                print("done")
            except Exception:
                # Report the load failure to the caller instead of crashing.
                print(sys.exc_info()[0])
                return error_json(sys.exc_info()[1])

            # Optionally project the feature vector with a stored PCA model;
            # the original feature vector is restored after prediction.
            if 'pca' in record:
                print("load %s..." % record['pca'])
                pca = load_model(record['pca'])
                print("done")
                feature = sample.ft
                sample.ft = pca.transform(sample.ft)

            # Even for a sparsely trained model a dense vector is a valid input
            # (conversely, a sparse input makes the output likelihood sparse).
            #if 'sparse' in record:
            #    sample.ft = lil_matrix(sample.ft).tocsr()

            # Run the algorithm-specific prediction (func is captured from the
            # enclosing scope).
            likelihood_list = func(clf, sample)

            if 'pca' in record:
                sample.ft = feature

            # Map class indices back to class names.
            likelihood_dict = {}
            for i, l in enumerate(likelihood_list):
                if str(i) not in record['class_id2name']:
                    # Should not normally happen: occasionally a class ID is
                    # returned that is not registered in class_id2name
                    # (possibly a scikit-learn quirk -- needs investigation).
                    continue
                key = record['class_id2name'][str(i)]
                likelihood_dict[key] = l

            # Persist the prediction result on the sample's document.
            sample.likelihood[clf_id] = likelihood_dict

            collections = db[feature_type]
            if collections.find_one(sample._id):
                collections.update_one({"_id": sample._id},
                                       {"$set": {'likelihood': sample.likelihood}})
                # NOTE(review): this builds a one-element *set*; presumably a
                # plain string was intended -- confirm against consumers.
                sub_result = {'update::%s' % sample._id}
            else:
                sub_result = mongointerface.add(db, feature_type, sample)

            result = success_json()
            result['event'] = {'_id': "predict::" + clf_id + "::" + str(sample._id),
                               'sub_event': sub_result}
            result['result'] = {'id': sample._id, 'likelihood': likelihood_dict}
            if sample.ground_truth:
                result['result']['ground_truth'] = sample.ground_truth

            return result
Example #2
0
def cross_validation(db, json_data_s, feature_type, algorithm, fold_num):
    """Run *fold_num*-fold cross validation over samples in ``db[feature_type]``.

    Samples matching ``data['selector']`` (restricted to those with a
    ``ground_truth``) are randomly split into *fold_num* groups.  For each
    fold, a classifier is trained on the remaining groups, used to predict
    the held-out group, and evaluated; the per-fold confusion matrices are
    merged into a single matrix returned in the result dict.

    Args:
        db: Mongo-like database handle.
        json_data_s: JSON string holding at least a ``selector`` key.
        feature_type: collection name of the feature samples.
        algorithm: package name providing a ``classifier`` module with
            ``train`` and ``predict``.
        fold_num: number of folds (N).

    Returns:
        A success JSON dict with ``confusion_matrix`` and ``event`` keys,
        or an error/failure result from any sub-step.
    """
    print("function: cross_validation")
    data = json.loads(json_data_s)
    init_data(data)

    cv_group_head = "__cross_validation"
    # Disband all previously tagged cross-validation groups.
    for i in range(fold_num):
        group_name = generate_group_name(cv_group_head, i)
        mongointerface.disband(db, feature_type, {'group': group_name})
    mongointerface.disband(db, feature_type, {'group': cv_group_head})

    collections = db[feature_type]
    selector = data['selector']
    # NOTE: selector aliases data['selector'], so this also narrows the
    # selector carried into the per-fold training data.
    selector['ground_truth'] = {"$exists": True}
    samples = collections.find(selector)

    # Randomly assign the samples to fold_num groups as evenly as possible.
    samples_count = samples.count()
    if samples_count == 0:
        return error_json("ERROR: no samples are hit.")

    group_assignment = []
    remainder = samples_count % fold_num
    quotient = samples_count // fold_num
    for i in range(fold_num):
        # The first `remainder` groups absorb one extra sample each.
        n = quotient + 1 if i < remainder else quotient
        print("group_count[%02d] = %d" % (i, n))
        group_assignment += [generate_group_name(cv_group_head, i)] * n
    random.shuffle(group_assignment)

    # Tag each sample with its assigned group (plus the common head tag).
    for i in range(samples_count):
        s = samples[i]
        group_name = group_assignment[i]
        groups = s['group']
        if group_name not in groups:
            groups = mongointerface.ensure_list(groups)
            groups.append(group_name)
            groups.append(cv_group_head)
            collections.update_one({"_id": s['_id']}, {"$set": {'group': groups}})

    mod = __import__(algorithm + '.classifier', fromlist=['.'])

    # Train on all-but-one group, then predict and evaluate the held-out one.
    confusion_matrices = []
    for i in range(fold_num):
        ## train ##
        exclude_group = generate_group_name(cv_group_head, i)
        _data = copy.deepcopy(data)
        _data['selector'] = {'group': {'$not': {'$all': [exclude_group]},
                                       '$all': [cv_group_head]},
                             'ground_truth': {"$exists": True}}
        _data['overwrite'] = True
        _data['name'] = exclude_group
        result = mod.train(db, feature_type, _data)
        if result['status'] != 'success':
            return result

        ## predict ##
        selector = {'group': {'$all': [exclude_group]}}
        group_samples = mongointerface.get_training_samples(db, feature_type,
                                                            False, selector)
        for s in group_samples:
            result = mod.predict(db, feature_type, Sample(s), _data)
            if result['status'] != 'success':
                return result
        _data['selector'] = selector

        ## evaluate ##
        result = mongointerface.evaluate(db, feature_type, _data, algorithm)
        if result['status'] != 'success':
            return result
        confusion_matrices.append(result['confusion_matrix'])

    # Merge the per-fold confusion matrices into a single matrix.
    cmat = None
    for m in confusion_matrices:
        cmat = merge_confusion_matrix(cmat, json.loads(m)) if cmat else json.loads(m)

    result = success_json()
    result['confusion_matrix'] = cmat
    clf_id = generate_clf_id(algorithm, feature_type, data)
    result['event'] = {'_id': "cross_validation::" + clf_id}
    return result