Example No. 1
import random

import numpy as np
import cupy
import chainer
import chainer.functions as F
from chainer import Chain, optimizers

# Project-local pieces assumed to be defined elsewhere in this module:
# one_hot_encoding, edit_distance, SoftKMeansLayer, KMeansLayer.


class SoftSeqKmeans():
    def __init__(self,
                 n_centroid,
                 centroid_length,
                 alphabet,
                 use_gpu=True,
                 tau=2):
        self.model = None
        self.optimizer = None
        self.centroid_length = centroid_length
        self.n_centroid = n_centroid
        self.tau = tau
        self.use_gpu = use_gpu
        self.alphabet = alphabet
        self.dict_alphabet = {alphabet[i]: i for i in range(len(alphabet))}
        self.max_length = None

    def fit(self,
            X,
            batchsize=100,
            n_iter=100,
            init_smooth=0.8,
            init_scale=0.1,
            lr=0.01,
            optimizer='Momentum'):
        L = np.array([len(seq) for seq in X])
        self.max_length = np.max(L)

        # Initialize the centroids from unique input sequences whose length
        # matches the requested centroid length.
        init = X[np.where(L == self.centroid_length)[0]]
        init = np.unique(init)
        init = init[np.random.choice(len(init), self.n_centroid,
                                     replace=False)]
        print(init)
        # Turn the smoothed one-hot encodings into log-probabilities and add
        # Gumbel noise at the non-zero (non-padding) positions, so the
        # initial centroids are soft versions of real sequences.
        init_seq = one_hot_encoding(init, self.dict_alphabet, self.max_length,
                                    init_smooth)
        nonzero = np.where(init_seq != 0)
        init_seq[nonzero] = np.log(init_seq[nonzero])
        init_seq[nonzero] += np.random.gumbel(0, 1, init_seq.shape)[nonzero]
        init_seq *= init_scale
        # Center the logits along axis 1 of the encoding.
        init_seq = np.transpose(
            np.transpose(init_seq, (1, 0, 2)) - np.mean(init_seq, axis=1),
            (1, 0, 2))

        self.model = Chain(kmeans=SoftKMeansLayer(self.n_centroid,
                                                  self.centroid_length,
                                                  init_W=init_seq,
                                                  tau1=self.tau))

        # Select the optimizer by name; for Adam, `lr` is passed as `alpha`.
        self.optimizer = {
            'Adam': optimizers.Adam(lr),
            'Momentum': optimizers.MomentumSGD(lr),
            'SGD': optimizers.SGD(lr)
        }[optimizer]

        self.optimizer.setup(self.model)
        # Light L2 regularization on the centroid parameters.
        self.optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))
        if self.use_gpu:
            self.model.to_gpu()

        with chainer.using_config('train', True):
            lcurve = []
            for i in range(n_iter):
                self.model.cleargrads()
                # Sample a random mini-batch of sequences (with replacement).
                indexes = np.random.choice(len(X), batchsize)
                x = one_hot_encoding(X[indexes], self.dict_alphabet,
                                     self.max_length)
                if self.use_gpu:
                    x = cupy.array(x)
                # The batch was already gathered above; indexing `x` with
                # `indexes` a second time was a bug in the original listing.
                loss = self.model.kmeans(x)
                loss.backward()
                lcurve.append(float(loss.data))
                self.optimizer.update()
                # Report the running mean of the last 10 mini-batch losses.
                print(i, np.mean(lcurve[-10:]))

        return np.array(lcurve)

    def transform(self, X, batchsize=1000):
        labels = []
        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                for i in range(0, len(X), batchsize):
                    print(i)
                    x = one_hot_encoding(X[i:i + batchsize],
                                         self.dict_alphabet, self.max_length)
                    if self.use_gpu:
                        x = cupy.array(x)
                    # Only the assignments are needed here, not the loss.
                    _, indexes = self.model.kmeans(x, inference=True)
                    # Move the labels to host memory before concatenating.
                    labels.append(cupy.asnumpy(indexes))
        return np.concatenate(labels)

    def get_centroid(self):
        # Copy the learned centroids back to host (NumPy) memory.
        return cupy.asnumpy(self.model.kmeans.get_centroid())
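
Both examples call a one_hot_encoding helper that is not shown on this page. Below is a minimal sketch of what it is assumed to do, inferred only from the call sites above: strings in, a float array out, with an optional smooth weight that puts most of the probability mass on the observed symbol (matching the init_smooth argument in fit). The (batch, alphabet, length) axis order is an assumption, chosen to match the axis-1 centering of the initial centroids; padding positions stay all-zero, which is consistent with the non-zero masking in fit.

def one_hot_encoding(seqs, dict_alphabet, max_length, smooth=1.0):
    # Hypothetical sketch of the assumed helper, not the project's code.
    # With smooth=1.0 this is an exact one-hot encoding; smaller values
    # spread the remaining mass uniformly over the other symbols.
    n_sym = len(dict_alphabet)
    out = np.zeros((len(seqs), n_sym, max_length), dtype=np.float32)
    for i, seq in enumerate(seqs):
        for j, c in enumerate(seq):
            out[i, :, j] = (1.0 - smooth) / (n_sym - 1)
            out[i, dict_alphabet[c], j] = smooth
    return out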
Example No. 2
# Uses the same imports and project-local helpers as Example No. 1.
class SeqKmeans():
    def __init__(self,
                 n_centroid,
                 centroid_length,
                 alphabet,
                 use_gpu=True,
                 tau=2):
        self.model = None
        self.optimizer = None
        self.centroid_length = centroid_length
        self.n_centroid = n_centroid
        self.tau = tau
        self.use_gpu = use_gpu
        self.alphabet = alphabet
        self.dict_alphabet = {alphabet[i]: i for i in range(len(alphabet))}
        self.max_length = None

    def get_initialize_points(self, X, smooth, n_centroid):
        # k-means++-style seeding: pick the first centroid at random, then
        # sample each subsequent one with probability proportional to its
        # edit distance from the closest centroid chosen so far.
        X = cupy.array(one_hot_encoding(X, self.dict_alphabet, self.max_length,
                                        smooth),
                       dtype=np.float32)
        # Full pairwise edit-distance matrix between all candidate sequences.
        I = np.ravel(np.broadcast_to(np.arange(len(X)), (len(X), len(X))).T)
        J = np.ravel(np.broadcast_to(np.arange(len(X)), (len(X), len(X))))
        d = cupy.asnumpy(edit_distance(X[I], X[J]).reshape((len(X), len(X))))
        out = [random.randint(0, len(X) - 1)]
        for _ in range(n_centroid - 1):
            min_d = np.min(d[:, out], axis=1)
            # Draw a plain int (the original appended a size-1 array, which
            # breaks the d[:, out] indexing on the next iteration).
            new_point = np.random.choice(len(min_d), p=min_d / np.sum(min_d))
            out.append(int(new_point))
        return cupy.asnumpy(X)[out, :, :]

    def fit(self,
            X,
            mini_batch=1000,
            subsample_batch=100,
            n_iter=100,
            step_per_iter=10,
            init_smooth=0.8,
            init_scale=0.1,
            lr=0.1,
            optimizer='SGD'):
        L = np.array([len(seq) for seq in X])
        self.max_length = np.max(L)

        # Build the initialization pool from unique sequences of exactly the
        # centroid length; cap it at 100x the number of centroids to keep the
        # pairwise edit-distance computation tractable.
        init = X[np.where(L == self.centroid_length)[0]]
        init = np.unique(init)
        if len(init) > self.n_centroid * 100:
            init = init[np.random.choice(len(init),
                                         self.n_centroid * 100,
                                         replace=False)]

        init_seq = self.get_initialize_points(init, init_smooth,
                                              self.n_centroid)
        # Alternative Gumbel-noise initialization (as in Example No. 1),
        # kept here disabled:
        # init_seq = one_hot_encoding(init, self.dict_alphabet,
        #                             self.max_length, init_smooth)
        # init_seq[np.where(init_seq != 0)] = np.log(
        #     init_seq[np.where(init_seq != 0)])
        # noise = np.random.gumbel(0, 1, init_seq.shape)
        # init_seq[np.where(init_seq != 0)] += noise[np.where(init_seq != 0)]
        # init_seq *= init_scale
        # Center the logits along axis 1 of the encoding.
        init_seq = np.transpose(
            np.transpose(init_seq, (1, 0, 2)) - np.mean(init_seq, axis=1),
            (1, 0, 2))

        self.model = Chain(kmeans=KMeansLayer(self.n_centroid,
                                              self.centroid_length,
                                              init_W=init_seq,
                                              tau=self.tau))
        self.optimizer = {
            'Adam': optimizers.Adam(lr),
            'Momentum': optimizers.MomentumSGD(lr),
            'SGD': optimizers.SGD(lr)
        }[optimizer]
        self.optimizer.setup(self.model)
        self.optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))
        if self.use_gpu:
            self.model.to_gpu()

        with chainer.using_config('train', True):
            lcurve = []
            for i in range(n_iter):
                indexes = np.random.choice(len(X), mini_batch)
                x = one_hot_encoding(X[indexes], self.dict_alphabet,
                                     self.max_length)
                if self.use_gpu:
                    x = cupy.array(x)
                # Assign the whole mini-batch to its nearest centroids once,
                # without building a computation graph.
                with chainer.no_backprop_mode():
                    _, labels = self.model.kmeans(x, inference=True)
                labels = cupy.asnumpy(labels)
                labels_indexes = [
                    np.where(labels == u)[0] for u in np.unique(labels)
                ]
                # Take several gradient steps on cluster-balanced subsamples
                # of this mini-batch before drawing a fresh one.
                for j in range(step_per_iter):
                    indexes = []
                    for row in labels_indexes:
                        indexes += np.random.choice(
                            row,
                            subsample_batch // len(labels_indexes)).tolist()

                    # Clear stale gradients before each inner step (the
                    # original cleared them only once per outer iteration,
                    # so gradients accumulated across updates).
                    self.model.cleargrads()
                    loss = self.model.kmeans(x[indexes],
                                             indexes=labels[indexes])
                    loss = F.mean(loss)
                    loss.backward()
                    lcurve.append(float(loss.data))
                    self.optimizer.update()
                    print(i, j, np.mean(lcurve[-10:]))

        return np.array(lcurve)

    def transform(self, X, batchsize=1000):
        labels = []
        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                for i in range(0, len(X), batchsize):
                    print(i)
                    x = one_hot_encoding(X[i:i + batchsize],
                                         self.dict_alphabet, self.max_length)
                    if self.use_gpu:
                        x = cupy.array(x)
                    # Only the assignments are needed here, not the loss.
                    _, indexes = self.model.kmeans(x, inference=True)
                    # Move the labels to host memory before concatenating.
                    labels.append(cupy.asnumpy(indexes))
        return np.concatenate(labels)

    def get_centroid(self):
        # Copy the learned centroids back to host (NumPy) memory.
        return cupy.asnumpy(self.model.kmeans.get_centroid())
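
A minimal usage sketch, assuming the SoftKMeansLayer/KMeansLayer chains, the edit_distance helper, and a CUDA device for the cupy calls are all available; the toy data and hyper-parameters below are made up for illustration. Note that centroid_length must match the length of at least n_centroid unique input sequences, since the centroids are initialized from them.

# Hypothetical usage; data and settings are illustrative only.
rng = np.random.RandomState(0)
alphabet = 'ACGT'
X = np.array([''.join(rng.choice(list(alphabet), size=20))
              for _ in range(500)])

km = SeqKmeans(n_centroid=4, centroid_length=20, alphabet=alphabet)
lcurve = km.fit(X, mini_batch=256, n_iter=20)  # per-step loss values
labels = km.transform(X)                       # cluster index per sequence
centroids = km.get_centroid()                  # learned centroid encodings

# SoftSeqKmeans from Example No. 1 exposes the same public interface.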