Exemplo n.º 1
0
# Benchmark script: load a pre-computed clustering ensemble from HDF, report
# its dimensions and print an estimate of the memory needed for the sparse
# co-association matrix.
t = myProf.Timer()

# Alternative dataset, kept for reference:
# ensemble_file = '/home/diogoaos/QCThesis/datasets/gauss10e6_overlap/ensemble_500k_test2.h5'
ensemble_file = '/media/Data/diogoaos_tmp/gaussseparated_ensembles/ensemble_500000_2sqrt.hdf'

# Output locations; the names suggest an SSD target and a spinning-disk
# target (NOTE(review): confirm against the code that consumes them).
coassc_path_ssd = '/home/diogoaos/QCThesis/coassoc.h5'
index_path_ssd = '/home/diogoaos/QCThesis/'

coassc_path_spin = '/media/Data/diogoaos_tmp/coassoc.h5'
index_path_spin = '/media/Data/diogoaos_tmp/'

# Single-argument print(...) behaves the same under the Python 2 print
# statement and the Python 3 print function, so the script parses on both.
print("loading ensemble")
t.reset()
t.tic()
ensemble = part.loadEnsembleFromFileHDF(ensemble_file)
print('load ensemble time: {}'.format(t.tac()))

# The sample count is derived from the first partition of the ensemble.
n_samples = part.n_samples_from_partition(ensemble[0])
n_partitions = len(ensemble)
print("number of samples: {}".format(n_samples))
print("number of partitions: {}".format(n_partitions))

# Triple the value reported for the ensemble and truncate it to an int.
# NOTE(review): the factor 3 is presumably a safety margin - confirm.
ma = int(eacSp._compute_max_assocs_from_ensemble(ensemble) * 3)

# NOTE(review): the factor 5 is presumably the number of bytes stored per
# association - confirm against the EAC_CSR storage layout.
print("memory required: {} MB".format(ma * n_samples * 5 / (1024.0**2)))

mat = eacSp.EAC_CSR(n_samples=n_samples,
                    max_assocs=ma,
Exemplo n.º 2
0
def get_ensemble(data_sampled, rule):
    """Load (or generate and save) the ensemble for one cluster-count rule.

    The ensemble is looked up in ``folder`` under a file name built from the
    sample count ``n`` and ``rule.__doc__``.  When the file is missing the
    ensemble is generated with K-Means, timed and written to that file;
    otherwise it is loaded from disk.  Afterwards the biggest cluster size is
    logged and ``compute_mems`` is called to obtain memory figures for the
    different co-association matrix schemes.

    The function reads several names from the enclosing scope: ``n``,
    ``folder``, ``n_partitions``, ``n_iters``, ``t``, ``logger``,
    ``sparse_max_assocs_factor`` and ``compute_mems``.

    Args:
        data_sampled: data passed to ``part.generateEnsemble`` when a new
            ensemble has to be generated.
        rule: callable invoked as ``rule(n)``; its result is indexed with
            ``[0]`` and ``[1]`` (logged as Kmin and Kmax respectively) and
            its ``__doc__`` is used as the configuration label.

    Returns:
        None.  Configurations rejected by the Kmin/Kmax checks return early.
    """
    n_clusts = rule(n)

    logger.info("* * * * * * * * * * * * * * * * * *")
    logger.info("Num. samples: {}".format(n))
    logger.info("New config: {}".format(rule.__doc__))
    logger.info("* * * * * * * * * * * * * * * * * *")

    # Skip configurations whose cluster range does not fit the dataset.
    # These guards were `continue` statements, which are a SyntaxError
    # outside a loop; inside a function the equivalent "skip" is `return`.
    if n_clusts[1] >= n:
        logger.info("Kmax too large for dataset size. Skipping...")
        return
    if n_clusts[0] <= 1:
        logger.info("Kmin too little. Skipping...")
        return

    ## generate ensemble
    logger.info("Checking for ensemble in folder...")

    generator = myKM.K_Means(cuda_mem="manual")

    # if there is an ensemble file load it, otherwise generate and save
    ensemble_filename = os.path.join(folder,"ensemble_{}_{}.hdf".format(n, rule.__doc__))
    if not os.path.exists(ensemble_filename):
        logger.info("No ensemble detected. Generating ensemble...")
        t.reset()
        t.tic()
        ensemble = part.generateEnsemble(data_sampled, generator, n_clusts,
                                         n_partitions, n_iters)
        t.tac()
        part.saveEnsembleToFileHDF(ensemble_filename, ensemble)
        logger.info("Saved ensemble in file: {}".format(ensemble_filename))
        # Timer value read after saving; t.tac() was called before the save.
        t_ensemble = t.elapsed
    else:
        logger.info("Ensemble detected in file {}. Loading ensemble...".format(ensemble_filename))
        ensemble = part.loadEnsembleFromFileHDF(ensemble_filename)
        # -1 marks that the ensemble was loaded, so no generation time exists.
        t_ensemble = -1

    # ensemble_name = "ensemble_" + rule.__doc__ + ".hdf"
    # part.saveEnsembleToFileHDF(os.path.join(folder, ensemble_name), ensemble)

    max_cluster_size = myEAC.biggest_cluster_size(ensemble)

    logger.info("Maximum cluster size: {}".format(max_cluster_size))

    # # # # # # # # # # # # # #
    # check memory usage for different matrix schemes

    # compute memory usage for each type of matrix
    # linear properties for condensed sparse matrix
    n_s = 0.05
    n_e = 1.0
    val_s = 1.0
    val_e = 0.05

    # Max associations per sample: biggest cluster scaled by the global factor.
    ma = max_cluster_size * sparse_max_assocs_factor

    mems = compute_mems(n, ma, n_s, n_e, val_s, val_e)

    f_mat = mems[0] # full matrix
    fc_mat = mems[1] # full condensed matrix
    sp_const = mems[2] # sparse constant matrix
    sp_lin = mems[3] # sparse linear matrix

    # NOTE(review): presumably the MST variants of the two sparse schemes.
    sp_const_mst = mems[4]
    sp_lin_mst = mems[5]
Exemplo n.º 3
0
# Benchmark script: load a pre-computed clustering ensemble from HDF, report
# its dimensions, print an estimate of the memory needed for the sparse
# co-association matrix and then allocate that matrix.
t = myProf.Timer()

# Alternative dataset, kept for reference:
# ensemble_file = '/home/diogoaos/QCThesis/datasets/gauss10e6_overlap/ensemble_500k_test2.h5'
ensemble_file = "/media/Data/diogoaos_tmp/gaussseparated_ensembles/ensemble_500000_2sqrt.hdf"

# Output locations; the names suggest an SSD target and a spinning-disk
# target (NOTE(review): confirm against the code that consumes them).
coassc_path_ssd = "/home/diogoaos/QCThesis/coassoc.h5"
index_path_ssd = "/home/diogoaos/QCThesis/"

coassc_path_spin = "/media/Data/diogoaos_tmp/coassoc.h5"
index_path_spin = "/media/Data/diogoaos_tmp/"

# Single-argument print(...) behaves the same under the Python 2 print
# statement and the Python 3 print function, so the script parses on both.
print("loading ensemble")
t.reset()
t.tic()
ensemble = part.loadEnsembleFromFileHDF(ensemble_file)
print("load ensemble time: {}".format(t.tac()))

# The sample count is derived from the first partition of the ensemble.
n_samples = part.n_samples_from_partition(ensemble[0])
n_partitions = len(ensemble)
print("number of samples: {}".format(n_samples))
print("number of partitions: {}".format(n_partitions))

# Triple the value reported for the ensemble and truncate it to an int.
# NOTE(review): the factor 3 is presumably a safety margin - confirm.
ma = int(eacSp._compute_max_assocs_from_ensemble(ensemble) * 3)

# NOTE(review): the factor 5 is presumably the number of bytes stored per
# association - confirm against the EAC_CSR storage layout.
print("memory required: {} MB".format(ma * n_samples * 5 / (1024.0 ** 2)))

mat = eacSp.EAC_CSR(
    n_samples=n_samples,
    max_assocs=ma,
    condensed=True,
    sort_mode="surgical",
)
Exemplo n.º 4
0
        # Load the ensemble from ensemble_dir if it is already on disk;
        # otherwise generate it, time the generation and save it there.
        ensemble_path = os.path.join(ensemble_dir, ensemble_filename)
        if not os.path.exists(ensemble_path):
            logger.info("No ensemble detected. Generating ensemble...")
            # Time only the generation step; the save happens after t.tac().
            t.reset()
            t.tic()
            ensemble = part.generateEnsemble(data_sampled, generator, n_clusts,
                                             n_partitions, n_iters)
            t.tac()
            part.saveEnsembleToFileHDF(ensemble_path, ensemble)
            logger.info("Saved ensemble in file: {}".format(ensemble_path))
            t_ensemble = t.elapsed
        else:
            logger.info(
                "Ensemble detected in file {}. Loading ensemble...".format(
                    ensemble_path))
            ensemble = part.loadEnsembleFromFileHDF(ensemble_path)
            # -1 marks that the ensemble was loaded from disk, so there is
            # no generation time to report.
            t_ensemble = -1

        # ensemble_name = "ensemble_" + rule.__doc__ + ".hdf"
        # part.saveEnsembleToFileHDF(os.path.join(folder, ensemble_name), ensemble)

        # Size of the biggest cluster found in the ensemble, logged below.
        max_cluster_size = myEAC.biggest_cluster_size(ensemble)

        logger.info("Maximum cluster size: {}".format(max_cluster_size))

        # # # # # # # # # # # # # #
        # check memory usage for different matrix schemes

        # compute memory usage for each type of matrix
        # linear properties for condensed sparse matrix
        # NOTE(review): n_s is presumably the starting point of the linear
        # profile; its consumer lies beyond this excerpt - confirm.
        n_s = 0.05