Example #1
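prepare_extract_and_evaluate opens the MongoDB database, removes any performance, split, and split-performance records left over from a previous run with the same extraction hash, loads the model and image certificates, and returns the model configurations, hashes, normalized task list, and collection/GridFS handles that a driver unpacks (see Example #4 below).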
def prepare_extract_and_evaluate(ext_hash,image_certificate_file,model_certificate_file,task):

    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]

    # Clear out any results left over from a previous run with this hash.
    perf_coll = db['performance']
    perf_coll.remove({'__hash__': ext_hash})
    split_coll = db['splits.files']
    split_fs = gridfs.GridFS(db, 'splits')
    remove_existing(split_coll, split_fs, ext_hash)
    splitperf_coll = db['split_performance.files']
    splitperf_fs = gridfs.GridFS(db, 'split_performance')
    remove_existing(splitperf_coll, splitperf_fs, ext_hash)

    # Load the certificates and look up the model configurations they point to.
    model_certdict = cPickle.load(open(model_certificate_file))
    model_hash = model_certdict['model_hash']
    model_coll = db['models.files']

    image_certdict = cPickle.load(open(image_certificate_file))
    image_hash = image_certdict['image_hash']
    image_config_gen = image_certdict['args']
    model_configs = get_most_recent_files(model_coll, {'__hash__': model_hash})
    
    # Accept either a single task or a list of tasks.
    if isinstance(task, list):
        task_list = task
    else:
        task_list = [task]
    
    return model_configs,image_config_gen,model_hash,image_hash, task_list, perf_coll, split_coll, split_fs, splitperf_coll, splitperf_fs
Example #2
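extract_features_inner_core pairs a slice of image records with a slice of model records (selected by hash plus optional query, skip, and limit arguments), computes features for each pair using either a numpy or a pyfft/GPU convolution backend, and stores the pickled feature arrays in the 'features' GridFS under the given feature hash.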
def extract_features_inner_core(image_certificate, model_certificate, feature_hash,
                                image_hash, model_hash, convolve_func_name, device_id,
                                im_query, m_query, im_skip, im_limit, m_skip, m_limit):

    if im_query is None:
        im_query = {}
    if m_query is None:
        m_query = {}

    # Restrict both queries to the given image and model hashes.
    im_query['__hash__'] = image_hash
    m_query['__hash__'] = model_hash

    conn = pm.Connection(document_class=SON)
    db = conn[DB_NAME]
    
    image_col = db['images.files']
    model_col = db['models.files']

    # GridFS stores for image data, model data, and the output features.
    image_fs = gridfs.GridFS(db, 'images')
    model_fs = gridfs.GridFS(db, 'models')

    feature_fs = gridfs.GridFS(db, 'features')
                       
    # Select the convolution backend: pyfft needs a CUDA context pushed for
    # the duration of the extraction loop; otherwise fall back to numpy.
    if convolve_func_name == 'pyfft':
        context = v1_pyfft.setup_pyfft(device_id)
        context.push()
        convolve_func = functools.partial(v1f.v1like_filter_pyfft, device_id=device_id)
    else:
        convolve_func = v1f.v1like_filter_numpy

    L1 = get_most_recent_files(image_col, im_query, skip=im_skip, limit=im_limit)
    L2 = get_most_recent_files(model_col, m_query, skip=m_skip, limit=m_limit)

    # Compute and store features for every (image, model) pair in this slice.
    for image_config in L1:
        for model_config in L2:
            features = compute_features(image_config['filename'], image_fs, model_config, model_fs, convolve_func)
            features_string = cPickle.dumps(features)
            y = SON([('config', SON([('model', model_config['config']['model']),
                                     ('image', image_config['config']['image'])]))])
            filename = get_filename(y['config'])
            y['filename'] = filename
            y['__hash__'] = feature_hash
            feature_fs.put(features_string, **y)

    if convolve_func_name == 'pyfft':
        context.pop()
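One caveat: if compute_features raises, the pushed CUDA context is never popped. A minimal hardening sketch, reusing the names from the example above (run_extraction_loop is a hypothetical stand-in for the nested image/model loop):

    if convolve_func_name == 'pyfft':
        context = v1_pyfft.setup_pyfft(device_id)
        context.push()
    try:
        run_extraction_loop()  # hypothetical stand-in for the loop above
    finally:
        # Release the GPU context even if extraction fails part-way through.
        if convolve_func_name == 'pyfft':
            context.pop()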
Example #3
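extract_and_evaluate_parallel_core is the per-split worker that Example #4 submits via qsub: it fetches one stored split for the given model and extraction hash from GridFS, runs extract_and_evaluate_core on it, and writes the result to the 'split_performance' GridFS.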
def extract_and_evaluate_parallel_core(image_config_gen,m,task,ext_hash,split_id,convolve_func_name,cache_port=None):

    if cache_port is None:
        cache_port = NETWORK_CACHE_PORT

    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    split_col = db['splits.files']
    split_fs = gridfs.GridFS(db, 'splits')

    # Fetch the stored split for this model and split_id, then unpickle it.
    splitconf = get_most_recent_files(split_col,
                                      {'__hash__': ext_hash,
                                       'split_id': split_id,
                                       'model': m['config']['model'],
                                       'images': son_escape(image_config_gen['images'])})[0]
    split = cPickle.loads(split_fs.get_version(splitconf['filename']).read())['split']
    res = extract_and_evaluate_core(split, m, convolve_func_name, task, cache_port)
    splitperf_fs = gridfs.GridFS(db, 'split_performance')
    put_in_split_result(res, image_config_gen, m, task, ext_hash, split_id, splitperf_fs)
Example #4
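extract_and_evaluate_parallel is the driver: it prepares the collections (Example #1), generates and stores the splits for every model/task combination, submits one qsub job per split (choosing a CPU or GPU queue by convolution backend), waits for the jobs to finish, aggregates the per-split results into performance records, and writes a certificate file.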
def extract_and_evaluate_parallel(outfile,image_certificate_file,model_certificate_file,cpath,convolve_func_name,task,ext_hash):
        
    (model_configs, image_config_gen, model_hash, image_hash, task_list,
     perf_col, split_coll, split_fs, splitperf_coll, splitperf_fs) = prepare_extract_and_evaluate(ext_hash,
                                                                                                  image_certificate_file,
                                                                                                  model_certificate_file,
                                                                                                  task)

    
    jobids = []
    if convolve_func_name == 'numpy':
        opstring = '-l qname=extraction_cpu.q'
    elif convolve_func_name == 'pyfft':
        opstring = '-l qname=extraction_gpu.q -o /home/render -e /home/render'
    else:
        # Without this guard, opstring would be unbound for any other backend.
        raise ValueError('unknown convolve_func_name: %s' % convolve_func_name)
        
    # One qsub job per (model, task, split): store the split, then submit the worker.
    for m in model_configs:
        print('Evaluating model', m)
        for task in task_list:
            classifier_kwargs = task.get('classifier_kwargs', {})
            print('task', task)
            splits = generate_splits(task, image_hash, 'images')
            for (ind, split) in enumerate(splits):
                put_in_split(split, image_config_gen, m, task, ext_hash, ind, split_fs)
                jobid = qsub(extract_and_evaluate_parallel_core,
                             (image_config_gen, m, task, ext_hash, ind, convolve_func_name),
                             opstring=opstring)
                jobids.append(jobid)

    print(jobids)
    statuses = wait_and_get_statuses(jobids)
    
    # Once all jobs have finished, aggregate the per-split results.
    for m in model_configs:
        print('Aggregating results for model', m)
        for task in task_list:
            split_results = get_most_recent_files(splitperf_coll,
                                                  {'__hash__': ext_hash,
                                                   'task': son_escape(task),
                                                   'model': m['config']['model'],
                                                   'images': son_escape(image_config_gen['images'])})
            put_in_performance(split_results, image_config_gen, m, model_hash, image_hash, perf_col, task, ext_hash)

    createCertificateDict(outfile,{'image_file':image_certificate_file,'models_file':model_certificate_file})
Example #5
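evaluate runs the same train/test protocol serially against precomputed features: for each model and task it generates splits, loads the train and test feature matrices from GridFS, trains and tests an SVM classifier per split, averages the per-split statistics, and stores the pickled results in the 'performance' GridFS.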
def evaluate(outfile,feature_certificate,cpath,task,ext_hash):

    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    
    perf_fs = gridfs.GridFS(db, 'performance')
    perf_coll = db['performance.files']

    # Clear results from any previous run with this extraction hash.
    remove_existing(perf_coll, perf_fs, ext_hash)

    feature_certdict = cPickle.load(open(feature_certificate))
    feature_hash = feature_certdict['feature_hash']
    image_hash = feature_certdict['image_hash']
    model_hash = feature_certdict['model_hash']
    image_config_gen = feature_certdict['args']['images']
    model_col = db['models.files']
    feature_fs = gridfs.GridFS(db,'features')
    feature_col = db['features.files']
    
    # Per-split statistics that get averaged into the final record.
    stats = ['test_accuracy', 'ap', 'auc', 'mean_ap', 'mean_auc', 'train_accuracy']
       
    # Accept either a single task or a list of tasks.
    if isinstance(task, list):
        task_list = task
    else:
        task_list = [task]
    
    model_configs = get_most_recent_files(model_col,{'__hash__':model_hash})
    
    for m in model_configs:
        print('Evaluating model',m) 
        for task in task_list:
            # Restrict the task's universe to features computed by this model.
            task['universe'] = task.get('universe', SON([]))
            task['universe']['model'] = m['config']['model']
            print('task', task)
            classifier_kwargs = task.get('classifier_kwargs', {})
            split_results = []
            splits = generate_splits(task,feature_hash,'features') 
            for (ind,split) in enumerate(splits):
                print('split', ind)
                train_data = split['train_data']
                test_data = split['test_data']
                
                train_filenames = [t['filename'] for t in train_data]
                test_filenames = [t['filename'] for t in test_data]
                # Sanity check: train and test sets must be disjoint.
                assert set(train_filenames).intersection(test_filenames) == set([])
                
                print('train feature extraction ...')
                train_features = sp.row_stack([load_features(f['filename'],feature_fs,m,task) for f in train_data])
                print('test feature extraction ...')
                test_features = sp.row_stack([load_features(f['filename'],feature_fs,m,task) for f in test_data])
                train_labels = split['train_labels']
                test_labels = split['test_labels']
    
                print('classifier ...')
                res = svm.classify(train_features,train_labels,test_features,test_labels,classifier_kwargs)
                print('Split test accuracy', res['test_accuracy'])
                split_results.append(res)
        
            # Average each statistic over the splits, skipping missing values.
            # (The original iterated over an undefined STATS; the list defined
            # above as `stats` is used instead.)
            model_results = SON([])
            for stat in stats:
                if stat in split_results[0] and split_results[0][stat] is not None:
                    model_results[stat] = sp.array([split_result[stat] for split_result in split_results]).mean()
    
            out_record = SON([('model', m['config']['model']),
                              ('model_hash', model_hash),
                              ('model_filename', m['filename']),
                              ('images', son_escape(image_config_gen)),
                              ('image_hash', image_hash),
                              ('task', son_escape(task)),
                              ])
                                             
            filename = get_filename(out_record)
            out_record['filename'] = filename
            out_record['config_path'] = cpath
            out_record['__hash__'] = ext_hash
            out_record.update(model_results)
            print('dump out ...')
            out_data = cPickle.dumps(SON([('split_results',split_results),('splits',splits)]))
            
            perf_fs.put(out_data,**out_record)

    createCertificateDict(outfile,{'feature_file':feature_certificate})
Example #6
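generate_splits builds N random train/test splits for a binary task: records matching task_query form the positive class, the remaining records in the universe form the negative class, and ntrain_pos/ntest_pos optionally fix how many positives land in each set. Every split is checked for train/test disjointness; a usage sketch follows the code.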
def generate_splits(dbname, collectionname, task_query, N, ntrain, ntest,
                    ntrain_pos=None, ntest_pos=None, universe=None,
                    use_negate=False, overlap=None):

    task_query = copy.deepcopy(task_query)
    print('Generating splits ...')
    if universe is None:
        universe = SON([])

    connection = pm.Connection(document_class=SON)
    db = connection[dbname]
    data = db[collectionname + '.files']

    fs = gridfs.GridFS(db,collection=collectionname)

    combine_things(task_query,universe)
    
    print('T',task_query)
    task_data = list(data.find(task_query,fields=["filename"]))
    task_fnames = [str(x['filename']) for x in task_data]
    N_task = len(task_data)
    
    if use_negate:
        # Negative examples: everything in the universe that is not a task file.
        task_fnames = np.array(task_fnames)
        all_data = list(data.find(universe, fields=["filename"]))
        all_fnames = np.array([str(x['filename']) for x in all_data])
        I = np.invert(tb.isin(all_fnames, task_fnames)).nonzero()[0]
        nontask_data = [all_data[ind] for ind in I]
        nontask_fnames = [str(x['filename']) for x in nontask_data]
        assert set(task_fnames).intersection(nontask_fnames) == set([]), set(task_fnames).intersection(nontask_fnames)
    else:
        nontask_query = {'filename': {'$nin': task_fnames}}
        nontask_query.update(universe)
        nontask_data = get_most_recent_files(data, nontask_query)
        
    N_nontask = len(nontask_data)

    assert ntrain + ntest <= N_task + N_nontask, "Not enough training and/or testing examples " + str([N_task, N_nontask])

    splits = []
    for ind in range(N):
        print('... split', ind)
        if ntrain_pos is not None:
            # A fixed number of positive training examples was requested.
            ntrain_neg = ntrain - ntrain_pos
            assert ntrain_pos <= N_task, "Not enough positive training examples, there are: " + str(N_task)
            assert ntrain_neg <= N_nontask, "Not enough negative training examples, there are: " + str(N_nontask)
            
            perm_pos = sp.random.permutation(len(task_data))
            perm_neg = sp.random.permutation(len(nontask_data))
            
            train_data = [task_data[i] for i in perm_pos[:ntrain_pos]] + [nontask_data[i] for i in perm_neg[:ntrain_neg]]    
            
            if ntest_pos is not None:
                ntest_neg = ntest - ntest_pos
                assert ntest_pos <= N_task - ntrain_pos, "Not enough positive test examples, there are: " + str(N_task - ntrain_pos)
                assert ntest_neg <= N_nontask - ntrain_neg, "Not enough negative test examples, there are: " + str(N_nontask - ntrain_neg)       
                test_data = [task_data[i] for i in perm_pos[ntrain_pos:ntrain_pos + ntest_pos]] + [nontask_data[i] for i in perm_neg[ntrain_neg:ntrain_neg + ntest_neg]]          
            else:     
                nontrain_data = [task_data[i] for i in perm_pos[ntrain_pos:]] + [nontask_data[i] for i in perm_neg[ntrain_neg:]]
                new_perm = sp.random.permutation(len(nontrain_data))
                test_data = [nontrain_data[i] for i in new_perm[:ntest]]
            
        
        else:
            if ntest_pos is not None:
                # A fixed number of positive test examples was requested.
                ntest_neg = ntest - ntest_pos
                assert ntest_pos <= N_task, "Not enough positive test examples, there are: " + str(N_task)
                assert ntest_neg <= N_nontask, "Not enough negative test examples, there are: " + str(N_nontask)                   
                perm_pos = sp.random.permutation(len(task_data))
                perm_neg = sp.random.permutation(len(nontask_data))
                test_data = [task_data[i] for i in perm_pos[:ntest_pos]] + [nontask_data[i] for i in perm_neg[:ntest_neg]]   
                nontest_data = [task_data[i] for i in perm_pos[ntest_pos:]] + [nontask_data[i] for i in perm_neg[ntest_neg:]]
                new_perm = sp.random.permutation(len(nontest_data))
                train_data = [nontest_data[i] for i in new_perm[:ntrain]]               
            else:
                all_data = task_data + nontask_data
                perm = sp.random.permutation(len(all_data))
                train_data = [all_data[i] for i in perm[:ntrain]]
                test_data = [all_data[i] for i in perm[ntrain:ntrain + ntest]]
            
        train_filenames = np.array([str(_t['filename']) for _t in train_data])
        test_filenames = np.array([str(_t['filename']) for _t in test_data])
        
        # A record is labeled positive iff its filename belongs to the task set
        # (equivalent to sp.array([x['filename'] in task_fnames for x in train_data])).
        train_labels = tb.isin(train_filenames, task_fnames)
        test_labels = tb.isin(test_filenames, task_fnames)

        assert set(train_filenames).intersection(test_filenames) == set([]), str(set(train_filenames).intersection(test_filenames))
        
        split = {'train_data': train_data, 'test_data' : test_data, 'train_labels':train_labels,'test_labels':test_labels}
        splits.append(split)
   
    return splits
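As a usage illustration, a minimal sketch follows. The database name 'mydb', the collection 'images', and the query field are hypothetical placeholders, not names taken from this codebase:

# Hypothetical usage: 10 splits with 50 train / 20 test examples, where the
# positive class is capped at 25 train and 10 test examples per split.
splits = generate_splits('mydb', 'images',
                         {'config.image.object': 'car'},
                         N=10, ntrain=50, ntest=20,
                         ntrain_pos=25, ntest_pos=10)
for s in splits:
    print('positives in train:', s['train_labels'].sum())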