Example #1
def test_create_dataset_with_format_change():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix(
        [[0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1]],
        dtype=np.float64)
    with h5sparse.File(h5_path, 'w') as h5f:
        h5f.create_dataset('sparse/matrix',
                           data=sparse_matrix,
                           sparse_format='csc')
    with h5sparse.File(h5_path) as h5f:
        assert 'sparse' in h5f
        assert 'matrix' in h5f['sparse']
        assert h5f['sparse']['matrix'].format_str == 'csc'
        result_matrix = h5f['sparse']['matrix'][()]
        assert isinstance(result_matrix, ss.csc_matrix)
        assert (result_matrix != sparse_matrix).size == 0
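        # The dataset is stored as CSC, so the integer slices below select
        # columns and are compared against column slices of the original
        # CSR matrix.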
        assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[:,
                                                              1:3]).size == 0
        assert (h5f['sparse']['matrix'][2:] != sparse_matrix[:, 2:]).size == 0
        assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:, :2]).size == 0
        assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[:,
                                                              -2:]).size == 0
        assert (h5f['sparse']['matrix'][:-2] !=
                sparse_matrix[:, :-2]).size == 0

    os.remove(h5_path)
Example #2
    def __init__(self,
                 gvm_part_fname,
                 gvm_path,
                 indices=None,
                 batch_size=128,
                 shuffle=True,
                 verbose=True):

        'Initialization'
        if gvm_path is None:
            with h5sparse.File(gvm_part_fname, 'r') as h:
                self.GVD = h[()]
        else:
            with h5sparse.File(gvm_part_fname, 'r') as h:
                self.GVD = h[gvm_path][()]
        # If no indices are specified, use all of them.
        if indices is None:
            indices = list(range(self.GVD.shape[0]))
        self.indices = list(map(int, indices))
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.verbose = verbose

        self.on_epoch_end()
Example #3
def processdata(id):
    datalabels = array([1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74], dtype=uint8)
    with h5py.File('/datadrive/musicnet.h5', 'r') as file:

        with h5sparse.File('/datadrive/labels.h5') as sparse:
            doit = join('sparse', 'matrix', id) not in sparse

        if doit:

            size = len(file[id]['data'])
            bar = Bar('Processing ' + str(id), max=len(file[id]['labels']))

            segments = lil_matrix((size, 11), dtype=uint8)
            for _, end_time, instrument, _, note, _, start_time in file[id][
                    'labels']:
                label_index = list(datalabels == instrument).index(True)

                for t in range(start_time, end_time):
                    segments[t, label_index] = note

                bar.next()

            with h5sparse.File('/datadrive/labels.h5') as sparse:
                sparse.create_dataset(join('sparse', 'matrix', id),
                                      data=segments.tocsr())

            bar.finish()
            print('Done!')

            del segments
            collect()
Example #4
    def test_marginal_likelihood_hdf5(self):
        """
        I don't think this gets us much because ´.value´ will probably load the complete sparse matrix
        :return:
        """
        transition_counts = csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                       dtype=np.float64)
        transition_probabilities = normalize(transition_counts, "l1", axis=1)
        pseudo_counts = transition_probabilities * 5

        if os.path.isfile("test.h5"):
            os.remove("test.h5")

        with h5sparse.File("test.h5") as h5f:
            h5f.create_dataset('transition/counts', data=transition_counts)
            h5f.create_dataset('pseudo/counts', data=pseudo_counts)

        h5f = h5sparse.File("test.h5")
        print(h5f['transition/counts'].h5py_group.attrs["h5sparse_shape"])
        print(h5f['transition/counts'][0:2])

        ml = MarkovChain.marginal_likelihood(h5f['transition/counts'].value,
                                             h5f['pseudo/counts'].value,
                                             smoothing=1.0)

        os.remove("test.h5")

        print(ml)
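As the docstring above notes, `.value` loads the complete sparse matrix, while row slicing reads only the requested rows. A minimal slice-based sketch (the same slicing pattern used in Example #25's row loop; the loop body here is illustrative only):

with h5sparse.File("test.h5") as h5f:
    counts = h5f['transition/counts']
    n_rows = counts.h5py_group.attrs["h5sparse_shape"][0]
    for row in range(n_rows):
        counts_row = counts[row:row + 1]  # one-row sparse matrix
        # ... process the row without materializing the full matrix ...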
Example #5
def test_create_empty_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('empty_data', shape=(100, 200))
    with h5sparse.File(h5_path) as h5f:
        assert h5f['empty_data'].shape == (100, 200)
    os.remove(h5_path)
Example #6
def merge_hdf5s(hdf5_paths, output_path):

    arrays_to_merge = []
    for hdf5_path in hdf5_paths:
        arrays_to_merge.append(h5sparse.File(hdf5_path)["data"].value)
    merged_array = scipy.sparse.hstack(arrays_to_merge, format="coo")
    output_file = h5sparse.File(output_path, "w", libver="latest")
    output_file.create_dataset("data", data=merged_array.toarray())
    output_file.h5f.close()
Example #7
def yPrepare(userNowDealing, user_id_dict, item_id_dict, filePlace, item_id,
             user_id, target_id, train):

    usrList = [user_id_dict[user] for user in user_id_dict]
    usrList.sort()
    usrList = np.array(usrList)

    # select the data from train that is related to userNowDealing
    with h5sparse.File(filePlace + "userdot_cosine.h5") as user_net:
        # get the relationship between user u and the other users:
        usrRelationship = user_net['dot_cosineData/data'][userNowDealing:(
            userNowDealing + 1)].toarray().ravel()
        usrHasRelation = usrList[usrRelationship > 0]  # users related to u
    trainHasRelation = train[train[user_id].isin(usrHasRelation)].sort_values(
        by=[item_id])

    relatedItem = trainHasRelation[item_id].values
    relatedUser = trainHasRelation[user_id].values
    relatedTarget = trainHasRelation[target_id]

    # get the item-train-sized net work
    with h5sparse.File(filePlace + "itemdot_cosine.h5") as item_net:
        itemRelationship = item_net['dot_cosineData/data']
        itemToitemNetRelated = [
            itemRelationship[itemRelated:(itemRelated + 1)]
            for itemRelated in relatedItem
        ]
        itemToitemNetRelated = vstack(itemToitemNetRelated).transpose()

    # get the item-train-sized kernel data

    # get the item-train-sized x data

    with h5sparse.File(filePlace + "itemdis.h5") as item_dis:
        itemDisRelationship = item_dis['disData/data']
        itemToitemDisRelated = [
            itemDisRelationship[itemDisRelated:(itemDisRelated + 1)]
            for itemDisRelated in relatedItem
        ]
        itemToitemDisRelated = vstack(itemToitemDisRelated).transpose()

    # broadcast with the user-train-sized x data
    with h5sparse.File(filePlace + "userdis.h5") as user_dis:
        userDisRelationship = user_dis['disData/data']\
            [userNowDealing:(userNowDealing+1)]
        userDisRelationship = userDisRelationship[:, relatedUser].todense()

    itemToitemDisRelated = itemToitemDisRelated + userDisRelationship
    itemToitemDisRelated = kernel(itemToitemDisRelated)

    weight = itemToitemNetRelated.multiply(itemToitemDisRelated)

    weight_sum = diags(1 / weight.sum(1).A.ravel())
    weight = weight_sum @ weight

    y = weight.dot(relatedTarget.transpose())
    return y
Example #8
def largeMatrixDis(largeDisMatrix,
                   num=2,
                   netFilePlace="C:\\Users\\22560\\Desktop\\",
                   prefix="item"):
    # load the social network
    with h5sparse.File(netFilePlace + prefix + "dot_cosine.h5") as h5f:

        (rowNum, colNum) = largeDisMatrix.shape
        sep = np.linspace(0, rowNum, endpoint=True, dtype=np.int64, num=num)
        yTy = (largeDisMatrix * largeDisMatrix).sum(1)
        print("#############  please  be patient ############## \n \n")
        for i, j in enumerate(sep):
            if i + 1 < len(sep):
                blockSlice = slice(j, sep[i + 1])
                blockData = largeDisMatrix[blockSlice, :]
                negative2xTy = -2 * blockData.dot(largeDisMatrix.transpose())
                xTx = yTy[blockSlice]
                xTx = xTx.reshape((len(xTx), 1))

                # Blockwise squared Euclidean distances via the identity
                # ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2: yTy broadcasts
                # across rows and xTx down columns.
                dis = yTy + negative2xTy + xTx
                dis = csr_matrix(dis)

                sparse = h5f['dot_cosineData/data'][blockSlice]

                dis = dis.multiply(sparse)

                if i == 0:
                    # On the first iteration, check whether dis.h5 already
                    # exists; if it does, clear its contents, then create
                    # the dataset in dis.h5.
                    with h5py.File(netFilePlace + prefix + "dis.h5",
                                   'a') as h5file:
                        for key in list(h5file.keys()):
                            del h5file[key]
                    with h5sparse.File(netFilePlace + prefix +
                                       "dis.h5") as h5file:
                        h5file.create_dataset("disData/data",
                                              data=dis,
                                              chunks=(10000, ),
                                              maxshape=(None, ))
                else:
                    with h5sparse.File(netFilePlace + prefix +
                                       "dis.h5") as h5file:
                        h5file['disData/data'].append(dis)

                print("Dis for " + prefix + " is now  preparing ")
                preparePercent = (1 + i) / (len(sep) - 1)
                preparePercent = round(preparePercent, 4)
                print(str(preparePercent),
                      " percent of Distance data is prepared ")
        print("############# dis data for " + prefix +
              " prepared successful!! ###########")
Example #9
def test_create_and_read_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]],
                                  dtype=np.float64)
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('sparse/matrix', data=sparse_matrix)
    with h5sparse.File(h5_path) as h5f:
        assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[1:3]).size == 0
        assert (h5f['sparse']['matrix'][2:] != sparse_matrix[2:]).size == 0
        assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:2]).size == 0
        assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[-2:]).size == 0
        assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:-2]).size == 0
        assert (h5f['sparse']['matrix'].value != sparse_matrix).size == 0

    os.remove(h5_path)
Example #10
def test_create_dataset_from_dataset():
    from_h5_path = mkstemp(suffix=".h5")[1]
    to_h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]],
                                  dtype=np.float64)
    with h5sparse.File(from_h5_path) as from_h5f:
        from_dset = from_h5f.create_dataset('sparse/matrix',
                                            data=sparse_matrix)

        with h5sparse.File(to_h5_path) as to_h5f:
            to_h5f.create_dataset('sparse/matrix', data=from_dset)
            assert (to_h5f['sparse/matrix'].value != sparse_matrix).size == 0

    os.remove(from_h5_path)
    os.remove(to_h5_path)
Example #11
def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]

    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]

        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        meta_val = meta.iloc[val_ind]
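        # Rows are read with contiguous min..max slices, which assumes the
        # train and val indices each form one contiguous block of rows.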
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        del mat
        gc.collect()

    with timer("fit model"):
        model_instance.fit(
            X_train, meta_train["was_clicked"].values, group=group_lengths(meta_train["clickout_id"].values)
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        if val:
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
Example #12
def open_gvm(fname):
    '''
    Returns a gvm obtained from the input file.

    fname : str
        The name of the file to be obtained as a gvm.
    '''

    with h5sparse.File(fname, 'r') as h:
        data = h['gvm'][()]
        if 'idx' in h:
            idx = h['idx'][()]
            try:
                idx = [i.decode('utf-8', 'ignore') for i in idx]
            except AttributeError:  # items are already str
                pass
        else:
            idx = None

        if 'col' in h:
            col = h['col'][()]
            try:
                col = [i.decode('utf-8', 'ignore') for i in col]
            except AttributeError:  # items are already str
                pass
        else:
            col = None
    return {'gvm': data, 'idx': np.array(idx), 'col': np.array(col)}
Example #13
def open_gvm(fname):
    '''
    Opens and returns the gvm at `fname`.

    Parameters:
    fname (str): The name of the file to be obtained as a gvm.

    Returns:
    dict: the gvm as a dict with keys 'gvm' (sparse matrix), 'idx', and 'col'.
    '''

    with h5sparse.File(fname, 'r') as h:
        data = h['gvm'][()]
        if 'idx' in h:
            idx = h['idx'][()]
            try:
                idx = [i.decode('utf-8', 'ignore') for i in idx]
            except AttributeError:  # items are already str
                pass
        else:
            idx = None

        if 'col' in h:
            col = h['col'][()]
            try:
                col = [i.decode('utf-8', 'ignore') for i in col]
            except AttributeError:  # items are already str
                pass
        else:
            col = None
    return {'gvm': data, 'idx': np.array(idx), 'col': np.array(col)}
Example #14
    def __init__(self, h5_fname):
        """Creates a :class:`PredictSession` from a given HDF5 file
 
        Parameters
        ----------
        h5_fname : string
           Name of the HDF5 file.
 
        """
        self.h5_file = h5sparse.File(h5_fname, 'r')
        self.options = self.h5_file["config/options"].attrs
        self.nmodes = int(self.options['num_priors'])
        self.num_latent = int(self.options["num_latent"])
        self.data_shape = self.h5_file["config/train/data"].shape
        assert self.nmodes == 2

        self.samples = []
        for name in self.h5_file.keys():
            if not name.startswith("sample_"):
                continue

            sample = Sample(self.h5_file, name, self.num_latent)
            self.samples.append(sample)
            self.beta_shape = sample.beta_shape()

        if len(self.samples) == 0:
            raise ValueError("No samples found in " + h5_fname)

        self.samples.sort(key=lambda x: x.no)
        self.num_samples = len(self.samples)
Example #15
def test_create_empty_sparse_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('sparse/matrix', sparse_format='csr')
    with h5sparse.File(h5_path) as h5f:
        assert 'sparse' in h5f
        assert 'matrix' in h5f['sparse']
        assert h5f['sparse']['matrix'].format_str == 'csr'
        result_matrix = h5f['sparse']['matrix'][()]
        assert isinstance(result_matrix, ss.csr_matrix)
        assert result_matrix.shape == (0, 0)
        assert result_matrix.dtype == np.float64
        assert h5f['sparse']['matrix'].shape == (0, 0)
        assert h5f['sparse']['matrix'].dtype == np.float64

    os.remove(h5_path)
Example #16
def test_numpy_array():
    h5_path = mkstemp(suffix=".h5")[1]
    matrix = np.random.rand(3, 5)
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('matrix', data=matrix)
        assert 'matrix' in h5f
        np.testing.assert_equal(h5f['matrix'][()], matrix)
    os.remove(h5_path)
Example #17
def test_bytestring():
    h5_path = mkstemp(suffix=".h5")[1]
    strings = [str(i) for i in range(100)]
    data = json.dumps(strings).encode('utf8')
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('strings', data=data)
        assert 'strings' in h5f
        assert strings == json.loads(h5f['strings'][()].decode('utf8'))
    os.remove(h5_path)
Example #18
def verify_hdf5(matrix_path, test_yaml_path):

    with open(test_yaml_path) as f:
        expected_values = yaml.safe_load(f)['expected_output']

    output_matrix = h5sparse.File(matrix_path)["data"].value

    assert numpy.count_nonzero(
        output_matrix) == expected_values["non_zero_count"]
    assert numpy.sum(output_matrix) == expected_values["sum"]
    assert tuple(output_matrix.shape) == tuple(expected_values["shape"])
Example #19
def relationToNetwork(objectRelationship, numOfUser, ifHasitsOwn, ifBIGDATA,
                      prefix, fileplace):
    # objectRelationship = userRelationship
    # the relation looks like:
    # userID  friendID value
    #  1890    1625      1
    #  1890    1807      1
    #  1890    1816      1
    #  1891     548      1
    #  1891     564      1
    if not ifBIGDATA:
        columns = objectRelationship.columns.tolist()
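        # Build the user-user adjacency matrix from (value, (row, col))
        # triplets taken from the edge list illustrated above.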
        objectNetwork = csr_matrix((objectRelationship.loc[:, columns[2]],
                                    (objectRelationship.loc[:, columns[0]],
                                     objectRelationship.loc[:, columns[1]])),
                                   shape=(numOfUser, numOfUser),
                                   dtype=np.float32)

        ######### code only used for lastFM ##########################

        # #objectNetwork = objectNetwork
        # objectNetwork[objectNetwork > 0] = 0.8
        # objectNetwork[objectNetwork==0] = 0.5

        ##############################################################

        with h5py.File(fileplace + prefix + 'dot_cosine.h5', 'a') as h5f:
            for key in list(h5f.keys()):
                del h5f[key]
        with h5sparse.File(fileplace + prefix + 'dot_cosine.h5') as h5f:

            if ifHasitsOwn:
                h5f.create_dataset("dot_cosineData/data",
                                   data=objectNetwork,
                                   chunks=(10000, ),
                                   maxshape=(None, ))
            else:
                objectNetwork = objectNetwork + diags(np.ones(numOfUser))

                ######### code only used for lastFM ##########################

                # objectNetwork[objectNetwork == 1.5] = 0.8

                ##############################################################

                h5f.create_dataset("dot_cosineData/data",
                                   data=objectNetwork,
                                   chunks=(10000, ),
                                   maxshape=(None, ))

    else:
        # the data rows are required to already be sorted in order
        pass
Example #20
def get_gvm_size(fname):
    '''
	Returns the gvm shape.

	Parameters:
	fname (str): filepath of gvm

	Returns:
	gvm_size (tuple): width and height of gvm array 
	'''
    with h5sparse.File(fname, 'r') as h:
        return h['gvm'].shape
Example #21
def write_gvm(gvm, output_fname, fmt='h5'):
    if fmt == 'pd':
        if type(gvm) != pd.DataFrame:
            temp = pd.DataFrame(gvm['gvm'].todense())
            if ('idx' not in gvm) or np.all(np.array(gvm['idx'] == None)):
                gvm['idx'] = np.array(range(gvm['gvm'].shape[0]))
            if ('col' not in gvm) or np.all(np.array(gvm['col'] == None)):
                gvm['col'] = np.array(range(gvm['gvm'].shape[1]))
            temp.index = gvm['idx']
            temp.columns = gvm['col']
            gvm = temp

        gvm = gvm.replace(to_replace=False, value='')
        gvm.to_csv(output_fname, sep='\t')

    elif fmt == 'h5':
        if type(gvm) == pd.DataFrame:
            gvm = {
                'gvm': csc_matrix(gvm.values),
                'idx': gvm.index.values,
                'col': gvm.columns.values
            }

        if ('idx' not in gvm) or np.all(np.array(gvm['idx'] == None)):
            gvm['idx'] = np.array(range(gvm['gvm'].shape[0]))
        if ('col' not in gvm) or np.all(np.array(gvm['col'] == None)):
            gvm['col'] = np.array(range(gvm['gvm'].shape[1]))

        # Non-numeric labels must be encoded as byte strings before
        # writing to HDF5.
        if not (np.all([isinstance(i, int) for i in gvm['idx']])
                or np.all([isinstance(i, float) for i in gvm['idx']])):
            try:
                gvm['idx'] = np.array(
                    [i.encode('utf-8', 'ignore') for i in gvm['idx']],
                    dtype=np.string_)
            except AttributeError:
                gvm['idx'] = np.array(gvm['idx'], dtype=np.string_)
        if not (np.all([isinstance(i, int) for i in gvm['col']])
                or np.all([isinstance(i, float) for i in gvm['col']])):
            try:
                gvm['col'] = np.array(
                    [i.encode('utf-8', 'ignore') for i in gvm['col']],
                    dtype=np.string_)
            except AttributeError:
                gvm['col'] = np.array(gvm['col'], dtype=np.string_)

        with h5sparse.File(output_fname, 'w') as h:
            h.create_dataset('gvm', data=gvm['gvm'])
            h.create_dataset('idx', data=np.array(gvm['idx']))
            h.create_dataset('col', data=np.array(gvm['col']))

    else:
        raise ValueError('unrecognized format %s' % fmt)
Example #22
    def csr_matrix_to_h5(matrix, file, name):
        """
        :type matrix: csr_matrix
        :param matrix: the matrix to serialize

        :type file: str
        :param file: HDF5 file to hold the matrix

        :type name: str
        :param name: name of the matrix in the HDF5 file
        """
        with h5sparse.File(file) as h5f:
            h5f.create_dataset(name, data=matrix)
Example #23
def to_sparse_hdf5(df, cell_name):
    """Convert a datafraom to a sparse represenation in an hdf5 file."""

    for major in ("csc", "csr"):
        path = _create_path("h5sparse", cell_name, major, "h5")
        matrix_class = getattr(scipy.sparse, major + "_matrix")
        sparse_matrix = matrix_class(df.to_numpy())
        f = h5sparse.File(path, 'w')
        f.create_dataset("data", data=sparse_matrix)

        f.h5f.attrs["hdf5_version"] = h5py.version.hdf5_version
        f.h5f.attrs["h5py_version"] = h5py.version.version

        f.h5f.close()
Example #24
def prepareY(doRegression, networkSavingplace, trainAfterTransform, userID,
             itemID, targetID, trainAfterDealingName):

    # read train data
    train = pd.read_csv(trainAfterTransform)
    train = train.sort_values(by=[userID, itemID])

    # do not do the regression on the first try

    if doRegression:
        weight = regression(train)
        train['weight'] = weight

    train.to_csv(trainAfterDealingName, index=False)
    gc.collect()

    print("train data has prepared !")

    # now start preparing y
    user_num = train[userID].max() + 1

    # calculate h below:
    # h is the sum of (the median of the useful user DIS values) and
    # (the median of the useful item DIS values)

    with h5sparse.File(networkSavingplace + "itemdis.h5") as item_dis,\
            h5sparse.File(networkSavingplace + "userdis.h5") as user_dis:
        idis = item_dis['disData/data'].value.data.copy()
        idis = idis[idis > 0]
        idis.sort()
        udis = user_dis['disData/data'].value.data.copy()
        udis = udis[udis > 0]
        udis.sort()
        h = np.median(udis) + np.median(idis)
        del udis, idis

    yPrepareForSmallData(user_num, networkSavingplace, itemID, userID,
                         targetID, train, h)
Example #25
    def marginal_likelihood(transition_counts_h5file,
                            transition_counts_h5name,
                            transition_probabilities_h5file,
                            transition_probabilities_h5name,
                            concentration_factors=None,
                            smoothing=1.0):
        # TODO: add chunking and parallelization?
        """

        :param transition_counts_h5file:
        :param transition_counts_h5name:
        :param transition_probabilities_h5file:
        :param transition_probabilities_h5name:
        :param concentration_factors:
        :param smoothing:
        :return:
        """

        transition_counts_h5f = h5sparse.File(
            transition_counts_h5file)[transition_counts_h5name]
        transition_probabilities_h5f = h5sparse.File(
            transition_probabilities_h5file)[transition_probabilities_h5name]

        shape = transition_counts_h5f.h5py_group.attrs["h5sparse_shape"]
        ml = np.zeros(len(concentration_factors))
        for row in range(shape[0]):
            transition_counts_row = transition_counts_h5f[row:row + 1]
            transition_probabilities_row = transition_probabilities_h5f[
                row:row + 1]
            ml_row = np.array([
                HypTrailsMarkovChain.marginal_likelihood(
                    transition_counts_row, transition_probabilities_row * cf,
                    smoothing) for cf in concentration_factors
            ])
            ml += ml_row

        return ml
Example #26
def convert_to_sparse_hdf5(df, major="csc"):
    """Convert a dataframe to a sparse represenation in an hdf5 file."""

    path = _get_temp_path(".h5")
    matrix_class = getattr(scipy.sparse, major + "_matrix")
    sparse_matrix = matrix_class(df.to_numpy())
    f = h5sparse.File(path, 'w', libver='latest')
    f.create_dataset("data", data=sparse_matrix)

    f.h5f.attrs["hdf5_version"] = h5py.version.hdf5_version
    f.h5f.attrs["h5py_version"] = h5py.version.version

    f.h5f.close()

    return path
Example #27
    def __init__(self, dataset, mat=None):

        if isinstance(dataset, str):
            self.dataset = h5sparse.File(dataset)
        else:
            self.dataset = dataset
        if mat is not None:
            self.data = mat
        else:
            self.data = self.dataset['sparse/matrix'][:]
        self.reference = self.dataset['reference/data'][:]
        self.x = self.dataset['reference/x'][:]
        self.y = self.dataset['reference/y'][:]
        self.z = self.dataset['reference/z'][:]
        self.voxel_num = self.dataset['reference/voxel_num'][:]
Example #28
    def update_context(self, context, data_definition, **kwargs):
        args = H5pyDataHandlerArgs(**kwargs)
        if args.create_dataset_context is None:
            return
        functions = context.setdefault(args.create_dataset_context, {})
        assert data_definition.key not in functions

        # open h5
        assert data_definition not in self.h5f_dict
        hdf_path = self._get_hdf_path(data_definition)
        assert not hdf_path.exists()
        h5f = h5sparse.File(hdf_path, 'w')
        self.h5f_dict[data_definition] = h5f

        # Register a partial so callers can later create the 'data' dataset
        # in this file without holding a reference to h5f themselves.
        functions[data_definition.key] = partial(h5f.create_dataset, 'data')
Example #29
    def __init__(self, window=None, epoch_size=100000):

        self.labels = array([1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74],
                            dtype=uint8)
        self.sample_frequency = 44100

        self.window = int(
            window * self.sample_frequency) if window is not None else None
        self.size = epoch_size

        # initialising dataset
        self.data_path = '/datadrive/musicnet.h5'
        self.labels_path = '/datadrive/labels.h5'

        with h5sparse.File(self.labels_path, 'r') as file:
            self.ids = list(file['sparse/matrix'].h5py_group)
Example #30
def test_dataset_append():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]],
                                  dtype=np.float64)
    to_append = ss.csr_matrix([[0, 1, 1], [1, 0, 0]], dtype=np.float64)
    appended_matrix = ss.vstack((sparse_matrix, to_append))

    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('matrix',
                           data=sparse_matrix,
                           chunks=(100000, ),
                           maxshape=(None, ))
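        # append() requires a chunked dataset with maxshape=(None,), so the
        # underlying HDF5 dataset can be resized in place.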
        h5f['matrix'].append(to_append)
        assert (h5f['matrix'].value != appended_matrix).size == 0

    os.remove(h5_path)