Example #1
    def __getitem__(self, index):
        if isinstance(index, slice):
            instances = self._instances[index]

            if self.sparse:
                instances = np.array(csr_matrix.todense(instances))

            labels = self._labels[index]

            length = len(instances)

            # Randomly pick one instance for each label in the batch.
            sampled_instances = [self._instances[random.choice(range(self._partition[label],
                                                                     self._partition[label + 1]))]
                                 for label in labels]
            if self.sparse:
                sampled_instances = [np.array(csr_matrix.todense(sampled_instance)) for sampled_instance in sampled_instances]

            return [(instances[i], labels[i], sampled_instances[i]) for i in range(length)]
        else:
            instance = self._instances[index]
            if self.sparse:
                instance = np.array(csr_matrix.todense(instance))

            label = self._labels[index]
            sampled_instance = self._instances[random.choice(range(self._partition[label], self._partition[label + 1]))]

            if self.sparse:
                sampled_instance = np.array(csr_matrix.todense(sampled_instance))

            return instance, label, sampled_instance
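A minimal, hypothetical sketch of the partition-based sampling used in the example above: _partition[k] marks where the instances of label k begin, so a random index drawn from range(partition[label], partition[label + 1]) picks another instance carrying the same label.

import random

# hypothetical layout: label 0 -> rows 0..3, label 1 -> rows 4..8, label 2 -> rows 9..11
partition = [0, 4, 9, 12]
label = 1
sampled_index = random.choice(range(partition[label], partition[label + 1]))
print(sampled_index)  # an index between 4 and 8, i.e. another label-1 instance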
Example #2
def select_dataset(name):
    x_size, y_size, train_data, train_label, test_data, test_label = 0, 0, [], [], [], []  # initialization
    if name == 'cifar':
        dataset = cifar.CIFAR()
        train_data, train_label, test_data, test_label = dataset.getdata()

        train_data = train_data.reshape(-1, 3072)
        test_data = test_data.reshape(-1, 3072)
        x_size = 3072
        y_size = 10

    elif name == 'svhn':
        dataset = svhn.SVHN()
        train_data, train_label = dataset.get_trainset()
        test_data, test_label = dataset.get_testset()

        train_data = train_data.reshape(-1, 3072)
        test_data = test_data.reshape(-1, 3072)
        x_size = 3072
        y_size = 10

    elif name == 'mnist':
        dataset = mnist.read_data_sets(flags.MNIST_DIR, one_hot=True)
        train_data, train_label, test_data, test_label = dataset.train.images, dataset.train.labels, \
                                                         dataset.test.images, dataset.test.labels
        x_size = 784
        y_size = 10

    elif name == 'news':
        trainset = fetch_20newsgroups(data_home=flags.NEWS_DIR, subset='train')
        testset = fetch_20newsgroups(data_home=flags.NEWS_DIR, subset='test')

        vectorizer = TfidfVectorizer(analyzer='word', max_features=3072)

        vectorizer.fit(trainset.data)
        train_data = vectorizer.transform(trainset.data)
        train_data = csr_matrix.todense(train_data)
        train_label = trainset.target
        train_label = NNutils.onehot(train_label, 20, list=True)
        # print(train_label.shape)

        test_data = vectorizer.transform(testset.data)
        test_data = csr_matrix.todense(test_data)
        test_label = testset.target
        test_label = NNutils.onehot(test_label, 20, list=True)

        x_size = 3072
        y_size = 20

    return Dataset(name, x_size, y_size, train_data, train_label, test_data,
                   test_label)
Example #3
    def test_sparseMatConn(self):
        conn_mat = np.random.randint(2, size=(5, 3), dtype=bp.math.bool_)
        sparse_mat = csr_matrix(conn_mat)
        conn = bp.conn.SparseMatConn(sparse_mat)(pre_size=sparse_mat.shape[0],
                                                 post_size=sparse_mat.shape[1])

        print(conn.requires('pre2post'))

        print(conn.requires('conn_mat'))
        print(csr_matrix.todense(sparse_mat))

        assert bp.math.array_equal(
            conn_mat,
            bp.math.asarray(csr_matrix.todense(sparse_mat),
                            dtype=bp.math.bool_))
Example #4
 def batch_generator(self, A_X, y, batch_size):
     number_of_batches = ceil(y.shape[0] / batch_size)
     counter = 0
     shuffle_index = np.arange(np.shape(y)[0])
     np.random.shuffle(shuffle_index)
     A_ = A_X[0]
     X = A_X[1]
     A_ = A_[shuffle_index]
     X = X[shuffle_index]
     y = y[shuffle_index]
     while 1:
         index_batch = shuffle_index[batch_size *
                                     counter:min(batch_size *
                                                 (counter + 1), y.shape[0])]
         if len(A_.shape) == 1:
             A_batch = np.array(
                 list(
                     map(lambda a: csr_matrix.todense(a),
                         A_[index_batch].tolist())))
         else:
             A_batch = A_[index_batch]
         X_batch = X[index_batch]
         y_batch = y[index_batch]
         counter += 1
         yield ([A_batch, X_batch], y_batch)
         if counter >= number_of_batches:  # epoch finished: reshuffle and start over
             np.random.shuffle(shuffle_index)
             counter = 0
Example #5
def update_iterative(self, inc_mat, inc_transpose, inv_len_mat, flux_mat):
    """One step update

    Parameters:
        inc_mat: sparse.matrix, oriented incidence matrix
        inc_transpose: sparse.matrix, oriented incidence matrix transposed
        inv_len_mat: sparse.matrix, diagonal matrix 1/l_e
        flux_mat: np.array, fluxes

    Returns:
        flux: np.array, updated fluxes
        """

    if self.coupling == "l2":
        flux_norm = np.linalg.norm(flux_mat, axis=1)**2
    if self.coupling == "l1":
        flux_norm = np.linalg.norm(flux_mat, axis=1, ord=1)**2

    # computing scaling and updating conductivities
    temp = (np.sum(self.length *
                   flux_norm**((2 - self.pflux) /
                               (3 - self.pflux))))**(1 / (2 - self.pflux))
    self.tdens = (1 / temp) * flux_norm**(1 / (3 - self.pflux))
    td_mat = diags(self.tdens, 0)

    # computing fluxes
    temp_pinv = np.linalg.pinv(
        csr_matrix.todense(inc_mat * td_mat * inv_len_mat * inc_transpose))
    lagrange_mult = temp_pinv * self.forcing
    flux = td_mat * inv_len_mat * inc_transpose * lagrange_mult

    return flux
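A minimal, self-contained sketch (toy 3-node path graph, hypothetical conductivities and forcing, unit edge lengths) of the densify-then-pinv step above: np.linalg.pinv only accepts dense input, so the sparse operator is converted with csr_matrix.todense first.

import numpy as np
from scipy.sparse import csr_matrix, diags

inc_mat = csr_matrix(np.array([[ 1.,  0.],
                               [-1.,  1.],
                               [ 0., -1.]]))   # oriented incidence matrix, nodes x edges
inc_transpose = csr_matrix(inc_mat.T)
td_mat = diags([1.0, 2.0], 0)                  # hypothetical conductivities
inv_len_mat = diags([1.0, 1.0], 0)             # 1/l_e with unit edge lengths

# densify the sparse product so the (dense-only) pseudo-inverse can be taken
dense_op = csr_matrix.todense(inc_mat * td_mat * inv_len_mat * inc_transpose)
temp_pinv = np.asarray(np.linalg.pinv(dense_op))
forcing = np.array([1.0, 0.0, -1.0])           # hypothetical source/sink vector
lagrange_mult = temp_pinv @ forcing
flux = td_mat * inv_len_mat * inc_transpose * lagrange_mult
print(flux)                                    # one flux value per edge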
Example #6
def frag_matrix_extract(hicfile, chrN1, chrN2, binsize, start1, start2,
                        lastend1, lastend2, shiftsize, Step):

    end1 = start1 + Step + shiftsize
    end2 = start2 + Step + shiftsize
    #if end1 > lastend1:
    #    end1 = lastend1
    #if end2 > lastend2:
    #    end2 = lastend2
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN2), 'BP',
                         binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]

    N = max(chrs_length[chrN2] // binsize + Step // binsize,
            chrs_length[chrN1] // binsize + Step // binsize) + 1
    #N = max(max(row)+1, max(col) + 1)
    #print(N)
    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    rowix = range(start1 // binsize, end1 // binsize + 1)
    colix = range(start2 // binsize, end2 // binsize + 1)
    #print(rowix,colix)
    M = M[np.ix_(rowix, colix)]
    N = M.shape[1]
    return (M, N)
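A minimal sketch (toy triplets standing in for the straw output) of the pattern above: binned (row, col, value) triplets are assembled into a CSR matrix, densified with csr_matrix.todense, and a rectangular window is cut out with np.ix_.

import numpy as np
from scipy.sparse import csr_matrix

row = [0, 1, 3, 4]
col = [1, 2, 3, 0]
value = [5.0, 2.0, 7.0, 1.0]
N = max(max(row), max(col)) + 1

M = csr_matrix((value, (row, col)), shape=(N, N))
M = csr_matrix.todense(M)
window = M[np.ix_(range(0, 3), range(1, 4))]   # rows 0..2, columns 1..3
print(window)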
Example #7
def general_transform(inputs, sparse):
    instance, label = inputs

    if sparse:
        instance = np.array(csr_matrix.todense(instance))

    return instance, label
Example #8
 def todense(self, samples):
     densed_samples = []
     for sample in samples:
         densed_samples.append(csr_matrix.todense(sample))
     densed_samples_np = np.asarray(densed_samples).reshape(
         len(samples), -1)
     return densed_samples_np
Example #9
def categorize_dates(df, date_enc=None):
    onehot_cols = 'month,dow,year'.split(',')
    if not date_enc:
        date_enc = OneHotEncoder()
        date_enc.fit(df[onehot_cols])
    X_date = date_enc.transform(df[onehot_cols])
    X_date = csr_matrix.todense(X_date)
    return X_date, date_enc
Example #10
 def index_sparse_arr(self):
     
     self.sparse_doc = csr_matrix(self.test_arr)
     print(self.sparse_doc)
     
     self.dense_arr = csr_matrix.todense(self.sparse_doc)        
     
      self.im = Image.fromarray(self.dense_arr * 255)  # scale entries into the 0-255 display range
     self.im.show()
Example #11
 def todense(self, samples):
     densed_samples = []
     for sample in samples:
         # print([csr_matrix.todense(sample).tolist()])
         densed_samples.append(csr_matrix.todense(sample))
     densed_samples_np = np.asarray(densed_samples).reshape(
         len(samples), -1)
     # print(densed_samples_np.shape)
     return densed_samples_np
Example #12
    def createTrainData_nparray(data, seqLength, predLength=1, stride=1):
        data = csr_matrix.todense(data)
        i = 0
        dataX = []
        dataY = []
        while (i < (len(data) - seqLength - predLength)):
            dataX.append(data[i:i + seqLength])
            dataY.append(data[i + seqLength:(i + seqLength + predLength)])
            i += stride

        return np.array(dataX), np.array(dataY)
Example #13
    def lgc_solver(self, mu):
        d_neg_half = np.diag(np.power(self.degrees, -0.5))
        d_neg_half = csr_matrix(d_neg_half)
        step = d_neg_half.dot(self.weight_matrix)
        S = (1 / (1 + mu)) * step.dot(d_neg_half)
        S = scipy.sparse.identity(self.node_number) - S
        S = csr_matrix.todense(S)
        S = np.linalg.inv(S)
        S = (mu / (1 + mu)) * S

        output_labels = np.linalg.multi_dot((S, self.Y))

        return output_labels
Example #14
def laplacian_graph(df_train,df_test):
    test=df_test.copy()
    test['rate']=0
    la_train=pd.concat([df_train,test])
    la_train=la_train.set_index(np.arange(len(la_train)))

    la_train = la_train.astype({'item': 'int'})
    la_train = la_train.astype({'user': 'int'})

    a=list(set(la_train.user.tolist()))
    b=list(set(la_train.item.tolist()))

    B = nx.Graph()
    # Add nodes with the node attribute "bipartite"
    B.add_nodes_from(a, bipartite=1)
    B.add_nodes_from(b, bipartite=0)
    # Add edges only between nodes of opposite node sets

    B.add_weighted_edges_from(get_edgelist())

    bottom_nodes, top_nodes = bipartite.sets(B)

    G1 = bipartite.weighted_projected_graph(B, top_nodes, ratio=False)  #movie
    G2 = bipartite.weighted_projected_graph(B, bottom_nodes, ratio=False) #user

    la_movie=nx.normalized_laplacian_matrix(G1, nodelist=None, weight='weight')
    la_user=nx.normalized_laplacian_matrix(G2, nodelist=None, weight='weight')
    from scipy.sparse import csr_matrix
    la_movie=csr_matrix.todense(la_movie)
    la_user=csr_matrix.todense(la_user)
    m=nx.to_numpy_array(G1)
    u=nx.to_numpy_array(G2)

    Max1=np.amax(m) 
    Max2=np.amax(u)
    norm_la_movie=np.true_divide(m, Max1)
    norm_la_user=np.true_divide(u, Max2)
    return la_movie,la_user,norm_la_movie,norm_la_user
Example #15
def train_matrix_extract(chrN1, binsize, hicfile):
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN1), 'BP',
                         binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]
    N = max(max(row) + 1, max(col) + 1)
    #print(N)
    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    M = np.array(M)
    x, y = np.where(M != 0)
    M[y, x] = M[x, y]
    return (M)
Example #16
def matrix_extract(chrN1, chrN2, binsize, hicfile):

    result = straw.straw('NONE', hicfile, str(chrN1),str(chrN2),'BP',binsize)

    row = [r//binsize for r in result[0]]
    col = [c//binsize for c in result[1]]
    value = result[2]
    Nrow = max(row) + 1
    Ncol = max(col) + 1
    N = max(Nrow, Ncol)

    M = csr_matrix((value, (row,col)), shape=(N,N))
    M = csr_matrix.todense(M)

    return(M)
Example #17
 def __getitem__(self, index):
     if isinstance(index, slice):
         raise NotImplementedError
         batches = [dataset[index] for dataset in self._datasets]
         instances = [tuple([self.transform(instance) for instance in batches[2]])]
         clusters = [tuple([cluster for cluster in batches[0]])]
         classes = [tuple([_class for _class in batches[1]])]
         return [instances, clusters, classes]
     else:
         batches = [dataset[index] for dataset in self._datasets]
         instance, cluster, _class = tuple(batches)
         if self.transform is not None:
             instance = self.transform(instance)
         if self.sparse:
             instance = np.array(csr_matrix.todense(instance))
         return instance, cluster, _class
Example #18
def check_cluster(model,
                  train,
                  num_classes,
                  num_cluster,
                  batchsize=128,
                  device=-1,
                  sparse=False):
    with chainer.using_config('train', False):
        i, N = 0, len(train)
        cc = None
        ss = None

        while i <= N:
            train_batch = train[i:i + batchsize]
            if sparse:
                train_batch = np.array(csr_matrix.todense(train_batch))
            # concat_examples returns (instances, labels).
            xx = F.softmax(
                model(
                    chainer.dataset.convert.concat_examples(
                        train_batch, device=device)[0])).data
            if device >= 0:
                xx = cuda.to_cpu(xx)

            if cc is None:
                cc = np.argmax(xx, axis=1)
            else:
                cc = np.append(cc, np.argmax(xx, axis=1))

            if ss is None:
                ss = np.sum(xx, axis=0)
            else:
                ss = ss + np.sum(xx, axis=0)
            i += batchsize

        ss /= N
        partition = train._partition
        cluster = [
            tuple(
                np.sum(cc[partition[k]:partition[k + 1]] == c)
                for c in range(num_cluster)) for k in range(num_classes)
        ]
    return cluster, ss
Example #19
def parse_dataset(dataset, hasher, kth, batch_size):

    a = hasher.transform(
        tokens(d[0])
        for d in dataset.train[kth * batch_size:(kth + 1) * batch_size])
    sample_size = a.shape[0]
    labels = list()
    class_dict = dict()
    last_class_index = 0

    for i in range(sample_size):
        i = kth * batch_size + i
        if dataset.train[i][1] in class_dict:
            labels.append(class_dict[dataset.train[i][1]])
        else:
            class_dict[dataset.train[i][1]] = last_class_index
            labels.append(last_class_index)
            last_class_index += 1
    return csr_matrix.todense(a).T, convert_to_one_hot(labels).T
Example #20
def matrix_extract(chrN1, binsize, hicfile):

    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN1), 'BP',
                         binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]
    N = max(max(row) + 1, max(col) + 1)
    #print(N)
    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    M = np.array(M)
    x, y = np.where(M != 0)
    M[y, x] = M[x, y]
    #rowix = range(start1//binsize, end1//binsize+1)
    #colix = range(start2//binsize, end2//binsize+1)
    #print(rowix,colix)
    #M = M[np.ix_(rowix, colix)]
    #N = M.shape[1]
    return (M)
Example #21
    def topic_proportions(self, bow_matrix, embeds):

        # Proportions are computed in batches of up to batch_size documents

        with tf.Session(graph=self.graph) as session:

            saver = tf.train.Saver()

            saver.restore(session, self.net_file)

            n_batches = int(np.floor(self.ntrain / self.batch_size))

            order = np.arange(bow_matrix.shape[0])

            topic_prop = np.zeros(
                [bow_matrix.shape[0], self.network_params['out_dim']])

            loglik = 0.0

            for i in range(n_batches):

                idx_batch = self.next_batch(order, i)

                bow_batch = np.zeros(
                    [self.batch_size, self.network_params['input_dim']])
                embed_batch = np.zeros(
                    [self.batch_size, self.network_params['embedding_dim']])
                # Pad with zeros (last batch only)

                bow_batch[:idx_batch.shape[0], :] = csr_matrix.todense(
                    bow_matrix[idx_batch, :])
                embed_batch[:idx_batch.shape[0], :] = embeds[idx_batch, :]

                topic_prop[idx_batch, :], ll = self.vae_graph.topic_prop(
                    bow_batch, embed_batch, session)

                loglik += ll / self.ntrain * self.batch_size

            session.close()

        return topic_prop, loglik
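A minimal sketch (hypothetical shapes, toy counts) of the zero-padding step in the loop above: the last mini-batch may contain fewer than batch_size documents, so its densified rows are copied into the top of a fixed-size zero buffer.

import numpy as np
from scipy.sparse import csr_matrix

batch_size, input_dim = 8, 5
bow_matrix = csr_matrix(np.random.RandomState(0).poisson(1.0, size=(11, input_dim)).astype(float))
idx_batch = np.arange(8, 11)        # last batch holds only 3 documents

bow_batch = np.zeros([batch_size, input_dim])
bow_batch[:idx_batch.shape[0], :] = csr_matrix.todense(bow_matrix[idx_batch, :])
print(bow_batch)                    # rows 3..7 remain zero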
Example #22
File: batch.py Project: ramellose/massoc
    def rarefy(self):
        """
        For each BIOM file, a rarefaction filter is applied.
        A minimum read depth can be specified;
        samples with reads lower than this read depth are removed,
        and then samples are rarefied to equal depth.

        :return:
        """
        all_bioms = {'otu': self.otu, 'genus': self.genus,
                     'family': self.family, 'order': self.order,
                     'class': self.class_, 'phylum': self.phylum}
        batchcopy = deepcopy(all_bioms)
        for level in all_bioms:
            for name in all_bioms[level]:
                try:
                    if self.inputs['rar'] == 'True':
                        lowest_count = int(min(all_bioms[level][name].sum(axis='sample')))
                    else:
                        lowest_count = int(self.inputs['rar'])
                    data = all_bioms[level][name].matrix_data
                    data = csr_matrix.todense(data)
                    keep_samples = list()
                    mincount = np.sum(data, axis=0)
                    for y in range(mincount.shape[1]):
                        if mincount.item(y) >= lowest_count:
                            keep_samples.append(all_bioms[level][name]._sample_ids[y])
                    keep = all_bioms[level][name].filter(keep_samples, axis="sample", inplace=False)
                    batchcopy[level][name] = keep.subsample(n=lowest_count, axis='sample')
                except Exception:
                    logger.error("Unable to rarefy file", exc_info=True)
                for name in list(all_bioms[level]):
                    all_bioms[level][name] = batchcopy[level][name]
        self.otu = all_bioms['otu']
        self.genus = all_bioms['genus']
        self.family = all_bioms['family']
        self.order = all_bioms['order']
        self.class_ = all_bioms['class']
        self.phylum = all_bioms['phylum']
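A minimal sketch (toy taxa-by-samples counts, hypothetical read-depth threshold) of the depth check above: the sparse table is densified and the per-sample column sums are compared against the minimum read depth before rarefying.

import numpy as np
from scipy.sparse import csr_matrix

counts = csr_matrix(np.array([[10, 0, 3],
                              [ 5, 2, 0],
                              [ 0, 8, 1]]))   # taxa x samples, toy data
data = csr_matrix.todense(counts)
lowest_count = 6                              # hypothetical minimum read depth
depths = np.sum(data, axis=0)                 # per-sample totals, shape (1, n_samples)
keep_samples = [y for y in range(depths.shape[1]) if depths.item(y) >= lowest_count]
print(keep_samples)                           # samples deep enough to rarefy: [0, 1]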
Example #23
 def generate_figures(self):
     """Generates figures for diagnostics canvas.
     Also sets the split file params. """
     file = self.file_list.GetSelection()
     if file != -1:
         file = self.file_list.GetString(file)
         biomfile = biom.load_table(file)
         if biomfile.metadata(axis='sample'):
             varlist = list(biomfile.metadata_to_dataframe(axis='sample').columns)
             varlist.sort()
             self.split_list.Set(varlist)
         else:
             if self.meta:
                 if file in self.meta:
                     varlist = self.meta[file]
                     varlist.sort()
                     self.split_list.Set(varlist)
         if self.split:
             split = self.split_list.FindString(self.split)
             self.split_list.SetSelection(split)
         data = biomfile.matrix_data
         data = csr_matrix.todense(data)
         fracs = np.count_nonzero(data, axis=1)
         nsamples = data.shape[1]
         fracs = fracs / nsamples
         self.prevfig.clear()
         self.prevfig.hist(fracs, bins=20)
         self.prevfig.set_xlabel('Prevalence')
         self.prevfig.set_title('Taxon prevalence')
         self.prevfig.set_ylabel('Number of taxa')
         sample_sums = np.transpose(np.count_nonzero(data, axis=0))
         self.rarfig.clear()
         self.rarfig.hist(sample_sums, bins=40)
         self.rarfig.set_xlabel('Count number')
         self.rarfig.set_title('Sample counts')
         self.rarfig.set_ylabel('Number of samples')
         self.canvas1.draw()
         self.canvas2.draw()
Example #24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 14:59:37 2020

Copyright 2020 by Hadrien Montanelli.
"""
# %% Imports.

# Standard library imports:
import numpy as np
from scipy.sparse import csr_matrix

# Chebpy imports:
from chebpy.nla import sptoeplitz

# %% Test 1.

col = np.array([1, 1, 2, 4, 5, 0, 0])
row = np.array([1, 3, 4, 0])
T = sptoeplitz(col, row)
print(csr_matrix.todense(T))

# %% Test 2.

n = 10
col = np.zeros(n)
row = np.zeros(2*n + 1)
row[int(n/2)] = 1
T = sptoeplitz(col, row)
print(csr_matrix.todense(T))
Example #25
    def __init__(self, init_type, **kwargs): #init_type: {Directly, Random, From File, Random Tree, Random Chimera}
        
        
        if (init_type == "Directly"):
            if 'Const' in kwargs.keys():
                Const = kwargs['Const']
            else:
                Const = 0.0
            
            if 'Pot' in kwargs.keys():
                Pot = kwargs['Pot']
            else:
                Pot = np.zeros((1, len(kwargs['Inter'])))
                
            (self.Inter, self.Pot, self.Const) = (kwargs['Inter'], Pot, Const)
        elif (init_type == "Random"):
            
            if 'n' in kwargs.keys():
                n = kwargs['n']
            else:
                print("Should specify number of vertices n=")
                
            if 'p' in kwargs.keys():
                p = kwargs['p']
            else:
                print("Should specify the probability of an edge p=")    
            
            if 'seed' in kwargs.keys():
                seed = kwargs['seed']
            else:
                seed = None

            G = nx.gnp_random_graph(n, p, seed)
            A = csr_matrix.todense(nx.adjacency_matrix(G))

            (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
            
        elif (init_type == "From File"):
            
            if 'filename' in kwargs.keys():
                filename = kwargs['filename']
            else:
                print("You should specify the filename!")
                
            import os
            name, extension = os.path.splitext(filename)

            if (extension == '.json'):
                (self.Inter, self.Pot, self.Const) = BQPJSON(filename)
                self.Inter = 1*self.Inter
                self.Pot = 1*self.Pot
                self.Const = 1*self.Const
            #elif (file_extension == '.mat'):
                #retrieve a dense graph from .mat file
            #elif (file_extension == '.sparse'):
                #retrieve a sparse graph from .sparse file
            else:
                print("Wrong File Extension")
                
        elif (init_type == "Random Chimera"):
            import dwave_networkx as dnx

            G = dnx.chimera_graph(kwargs['M'], kwargs['N'], kwargs['L'])
            A = csr_matrix.todense(nx.adjacency_matrix(G))
            n = G.number_of_nodes()  # n was undefined in this branch

            (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
        
        elif (init_type == "Random Tree"):
            
            if 'seed' in kwargs.keys():
                seed = kwargs['seed']
            else:
                seed = None
    
            if 'n' in kwargs.keys():
                n = kwargs['n']
            else:
                n = random.randint(10, 100)

            G = nx.random_tree(n, seed)
            A = csr_matrix.todense(nx.adjacency_matrix(G))
            
            (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
Example #26
def Laplacian(Adjacency): 
    
    G = nx.from_numpy_matrix(Adjacency)
    L = csr_matrix.todense(nx.laplacian_matrix(G))

    return L
Example #27
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 14:59:37 2020

Copyright 2020 by Hadrien Montanelli.
"""
# %% Imports.

# Standard library imports:
import numpy as np
from scipy.sparse import csr_matrix

# Chebpy imports:
from chebpy.nla import sphankel

# %% Test 1.

col = np.array([1, 2, 3, 4])
H = sphankel(col)
print(csr_matrix.todense(H))
Example #28
File: batch.py Project: ramellose/massoc
    def cluster_biom(self):
        """
        First normalizes the BIOM files so clustering is not affected
        by raw counts, performs a CLR transformation and then applies clustering.
        Note that the returned BIOM files are not normalized;
        the normalization is used only for the clustering step.
        Many network inference tools require absolute counts.
        Silhouette score is used to determine the optimal
        number of clusters.
        Clustering adds metadata info to the samples.
        Splitting according to cluster ID is done
        by wrapping the split_biom function.

        :return:
        """
        inputs = self.inputs
        if inputs['nclust'] is not None:
            nums = list(range(2, (int(inputs['nclust']) + 1)))
        else:
            nums = list(range(2,5))
        new_dict = {}
        if type(self.otu) is not dict:
            logger.warning('Cluster_biom requires a dictionary of biom files to be supplied. \n', exc_info=True)
            raise ValueError("Cluster_biom requires a dictionary of biom files to be supplied.")
        normbatch = self.normalize_transform(mode='clr')
        # CLR transform places data in Euclidean space
        for x in list(self.otu):
            try:
                # define topscore and bestcluster for no cluster
                norm_table = normbatch.otu[x]
                topscore = 0
                bestcluster = [1] * len(norm_table.ids())
                data = csr_matrix.todense(norm_table.matrix_data)
                data = np.matrix.transpose(data)
                data = PCA(n_components=2).fit_transform(data)
                randomclust = np.random.randint(2, size=len(data))
                sh_score = [silhouette_score(data, randomclust)]
                # K-means clustering, tests 2-4 clusters
                if inputs['cluster'] == 'K-means':
                    for i in nums:
                        clusters = KMeans(i).fit_predict(data)
                        silhouette_avg = silhouette_score(data, clusters)
                        sh_score.append(silhouette_avg)
                    topscore = int(np.argmax(sh_score) + 1)
                    bestcluster = KMeans(topscore).fit_predict(data)
                # DBSCAN clustering, automatically finds optimal cluster size
                if inputs['cluster'] == 'DBSCAN':
                    bestcluster = DBSCAN().fit_predict(data)
                    topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
                # Gaussian Mixture Model (gmm) probability distribution
                if inputs['cluster'] == 'Gaussian':
                    for i in nums:
                        fit = GaussianMixture(i).fit(data)
                        clusters = fit.predict(data)
                        silhouette_avg = silhouette_score(data, clusters)
                        sh_score.append(silhouette_avg)
                    topscore = int(np.argmax(sh_score) + 1)
                    bestfit = GaussianMixture(topscore).fit(data)
                    bestcluster = bestfit.predict(data)
                # Spectral Clustering
                if inputs['cluster'] == 'Spectral':
                    for i in nums:
                        clusters = SpectralClustering(i).fit_predict(data)
                        silhouette_avg = silhouette_score(data, clusters)
                        sh_score.append(silhouette_avg)
                    topscore = int(np.argmax(sh_score) + 1)
                    bestcluster = SpectralClustering(topscore).fit_predict(data)
                # Affinity Propagation clustering
                if inputs['cluster'] == 'Affinity':
                    bestcluster = AffinityPropagation().fit_predict(data)
                    topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
                if max(sh_score) < 0.25:
                    raise ValueError("Silhouette score too low: please try a different algorithm. "
                                     "Your data may not be suitable for clustering.")
                new_dict[x] = deepcopy(self.otu[x])
                for i in range(topscore):
                    mask, = np.where(bestcluster == i)
                    for j in mask:
                        new_dict[x]._sample_metadata[j]['cluster'] = inputs['cluster'] + '_' + str(i)
                self.otu = new_dict
                if inputs['split'] is not None:
                    if inputs['split'] == 'TRUE':
                        inputs['split'] = 'cluster'
                        self.split_biom()
            except Exception:
                logger.error("Error occurred when clustering samples", exc_info=True)
Example #29
File: batch.py Project: ramellose/massoc
    def prev_filter(self, mode='prev'):
        """
        Filters taxa in all OTU tables of a Batch object,
        either by prevalence across samples or by a
        minimum total abundance. Taxa that fail the filter
        are collapsed into a single 'Bin' taxon, so the
        total counts in each table are preserved.

        :param mode: prev or min, specifies whether taxa should be filtered
        based on prevalence or minimum abundance. The values are stored in the batch.inputs dictionary.
        :return:
        """
        for level in self.levels:
            for name in self.levels[level]:
                data = self.levels[level][name].matrix_data
                data = csr_matrix.todense(data)
                keep_otus = list()
                binotu = None
                try:
                    if mode == 'prev':  # calculates prevalence
                        fracs = np.count_nonzero(data, axis=1)
                        nsamples = data.shape[1]
                        fracs = fracs / nsamples
                        for y in range(0, len(fracs)):
                            if fracs[y] >= (float(self.inputs['prev'])/100):
                                keep_otus.append(self.levels[level][name]._observation_ids[y])
                            else:
                                binotu = self.levels[level][name]._observation_ids[y]
                        if binotu is not None and 'Bin' not in keep_otus:
                            keep_otus.append(binotu)
                except Exception:
                    logger.error("Could not set prevalence filter", exc_info=True)
                try:
                    if mode == 'min':
                        mincount = np.sum(data, axis=1)
                        for y in range(0, len(mincount)):
                            if mincount[y] >= (int(self.inputs['min'])):
                                keep_otus.append(self.levels[level][name]._observation_ids[y])
                            else:
                                binotu = self.levels[level][name]._observation_ids[y]
                        if binotu is not None:
                            keep_otus.append(binotu)
                except Exception:
                    logger.error("Could not set a minimum count filter", exc_info=True)
                keep = self.levels[level][name].filter(keep_otus, axis="observation", inplace=False)
                try:
                    if binotu is not None:
                        bin = self.levels[level][name].filter(keep_otus[:-1], axis="observation", inplace=False, invert=True)
                        binsums = np.sum(bin.matrix_data, axis=0) # sums all binned OTUs
                        # need to recreate keep._data as lil matrix, is more efficient
                        orig = keep._data.tolil(copy=True)
                        if 'Bin' not in keep_otus:
                            bin_id = keep._obs_index[binotu]
                            orig[bin_id] = binsums
                            keep._observation_ids[bin_id] = "Bin"
                            keep._obs_index["Bin"] = keep._obs_index.pop(binotu)
                        if 'Bin' in keep_otus:  # necessary to prevent duplicate Bin ID
                            old_bin_id = keep._obs_index["Bin"]
                            old_bin_sums = keep._data[old_bin_id]
                            new_bin_sums = binsums + old_bin_sums
                            orig[old_bin_id] = new_bin_sums
                        # update keep._data with orig
                        keep._data = orig.tocsr()
                except Exception:
                    logger.error("Could not preserve binned taxa", exc_info=True)
                self.levels[level][name] = keep
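A minimal sketch (toy counts, hypothetical prevalence threshold of 40%) of the prevalence computation above: each taxon's occurrence fraction across samples is computed from the densified table and compared against the cut-off.

import numpy as np
from scipy.sparse import csr_matrix

table = csr_matrix(np.array([[0, 3, 0, 1],
                             [2, 0, 0, 0],
                             [5, 4, 1, 2]]))             # taxa x samples, toy counts
data = np.asarray(csr_matrix.todense(table))
fracs = np.count_nonzero(data, axis=1) / data.shape[1]   # prevalence per taxon
keep = np.where(fracs >= 0.40)[0]                        # hypothetical 'prev' = 40
print(fracs, keep)                                       # [0.5 0.25 1.0], taxa 0 and 2 kept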
Example #30
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Remove last blank column
    # plt.xlim( (0, tfidf_matrix.shape[1]) )
    for i in range(tfidf_matrix.shape[0]):
        for j in range(tfidf_matrix.shape[1]):
            c = round(tfidf_matrix[i, j], 2)
            ax.text(j, i, str(c))

    plt.show()


if __name__ == "__main__":
    corpus = [
        'this is the one document.',
        'this is the second document.',
        'and this is the third one, which is very similar to first one.',
        'is this the first document relates to politics?',
    ]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(csr_matrix.todense(X))

    heatmap(np.array(csr_matrix.todense(X)), "", "", "", vectorizer.get_feature_names(), corpus)