Example #1
import numpy as np
from sklearn.cluster import Birch


class BirchAlgo(object):

    labels = None
    clusters = []
    centers = None
    labels_temporary = None
    
    def __init__(self, threshold = 0.2):
        self.birch = Birch(threshold=threshold, n_clusters=None, compute_labels=True)
        self.n_cluster = None
    

    def aplica_birch(self, dados):
        self.birch.partial_fit(dados)
        if (self.labels is None):
            self.labels = self.birch.labels_
            self.labels_temporary = self.birch.labels_
        else:
            self.labels = np.append(self.labels, self.birch.labels_)
            self.labels_temporary = self.birch.labels_
        self.centers = self.birch.subcluster_centers_
        return self.labels_temporary

    def atualiza_kmeans(self, dados):
        self.aplica_birch(dados)
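
A minimal driver for the wrapper above might look like the sketch below; it assumes the imports shown with the class, and the synthetic data, batch size, and threshold are illustrative choices, not part of the original code.

# Sketch: feeding BirchAlgo in batches (illustrative data and batch size).
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
algo = BirchAlgo(threshold=0.5)
for start in range(0, len(X), 50):
    batch_labels = algo.aplica_birch(X[start:start + 50])  # labels of the current batch
print(len(algo.labels), algo.centers.shape)  # accumulated labels and subcluster centers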
Example #2
class ClusteringObjectClassifierModel(object):
    def __init__(self):
        self.learned_classes = dict()
        self.max_classes = 10
        self.estimator = Birch(n_clusters=None, threshold=10.0)

    def online_fit(self, X, class_name):
        self.estimator.partial_fit(X)

        cluster_id = self.estimator.labels_.item()  # .item() replaces the removed np.asscalar; assumes a single-sample X
        if cluster_id not in self.learned_classes:
            print("Assigning cluster id %d to class %s" %
                  (cluster_id, class_name))
            self.learned_classes[cluster_id] = class_name

        return self.__pca_on_cluster_centers(
            self.estimator.subcluster_centers_)

    def __pca_on_cluster_centers(self, cluster_centers):
        # PCA with 2 components needs at least 2 subcluster centers
        if len(cluster_centers) < 2:
            return np.zeros(1), np.zeros(1)

        pca = PCA(n_components=2)
        coords = np.atleast_2d(pca.fit_transform(cluster_centers))
        return coords[:, 0], coords[:, 1]

    def predict_class(self, X):
        if not hasattr(self.estimator, "root_"):
            return False, False

        cluster_id = self.estimator.predict(X).item()
        if cluster_id not in self.learned_classes:
            return False, False

        return self.learned_classes[cluster_id], cluster_id
Example #3
def test_partial_fit_second_call_error_checks():
    # second partial fit calls will error when n_features is not consistent
    # with the first call
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.partial_fit(X, y)

    msg = "X has 1 features, but Birch is expecting 2 features"
    with pytest.raises(ValueError, match=msg):
        brc.partial_fit(X[:, [0]], y)
Example #4
def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
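
The second half of the test relies on the fact that partial_fit(None) performs only the global clustering step on the subclusters already in the tree. A stand-alone sketch of that usage, with synthetic data and arbitrary sizes:

# Sketch: stream batches with n_clusters=None, then relabel globally.
import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
brc = Birch(n_clusters=None)
for chunk in np.array_split(X, 4):       # four incremental calls
    brc.partial_fit(chunk)
brc.set_params(n_clusters=3)
brc.partial_fit(None)                    # no new data: only the global clustering step runs
print(np.unique(brc.subcluster_labels_))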
Example #5
def birtch_partial(matrix, n_cluster):
    brc = Birch(branching_factor=100, n_clusters=n_cluster, threshold=1.0, compute_labels=True)
    model = brc.partial_fit(matrix)

    res = model.predict(matrix)

    return res
Example #6
    def specBIRCH(self, n_clusters, spectralptsfile):
        """
        Use BIRCH clustering on spectral data only.
        """
        self.classifier = "Spectral-BIRCH"
        self.inptsfile = spectralptsfile
        points = self.loadPoints()
        points = points[self.validhit_bool, :]

        print "Running BIRCH clustering on spectral data only ..."
        points = StandardScaler(copy=False).fit_transform(points)
        brc = Birch(n_clusters=n_clusters)
        # Feed the points to the BIRCH gradually
        npts = len(points)
        niter = int(npts / self.birch.pf_npts) + 1
        for i in xrange(niter - 1):
            brc.partial_fit(points[i * self.birch.pf_npts:(i + 1) *
                                   self.birch.pf_npts, :])
        brc.partial_fit(points[(niter - 1) * self.birch.pf_npts:, :])
        self.labels[self.validhit_bool] = brc.predict(points)
Example #7
def birchCluster(zD, maxd, out='dict', N=None, start=0, stop=None):
    #The radius of the subcluster obtained by merging a new sample and the closest subcluster should be lesser than the threshold.
    #Otherwise a new subcluster is started. Setting this value to be very low promotes splitting and vice-versa.
    data = zD.dictPos
    stop = len(zD.pList) if not stop else stop
    X = [[data['x'][i], data['y'][i], data['z'][i]]
         for i in range(start, stop)]
    brc = Birch(branching_factor=50,
                n_clusters=None,
                threshold=maxd,
                compute_labels=True)
    brc.fit(X)
    if N:
        brc.set_params(n_clusters=N)
    brc.partial_fit(np.asarray(X))  # np.matrix is deprecated; a plain array works here
    groups = brc.predict(X)
    if out == 'dict':
        return list2dict(zD, groups)
    elif out == 'list':
        return groups
    else:
        raise Exception("Out argument must have valus 'dict' or 'list'")
Example #8
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters=None,
                             threshold=0.5,
                             branching_factor=50)

    def fit(self, data):
        return self.wrapped.fit(data)

    def fit_predict(self, data):
        self.wrapped = self.wrapped.partial_fit(data)
        return self.wrapped.predict(data)

    def predict(self, data):
        return self.wrapped.predict(data)
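
A usage sketch for the wrapper above; it assumes the example's own sklearn import for Birch, and the data and chunking are illustrative.

# Sketch: fit_predict on streamed chunks keeps updating the wrapped Birch model.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=400, centers=4, random_state=0)
wrapper = Birch_algo_wrapper()
for chunk in np.array_split(X, 8):
    labels = wrapper.fit_predict(chunk)          # partial_fit on the chunk, then predict it
print(len(wrapper.wrapped.subcluster_centers_))  # subclusters accumulated across chunks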
Example #9
def main():
    vec = HashingVectorizer(tokenizer=preprocess,
                            ngram_range=(3, 3),
                            analyzer='word')
    clu = Birch(n_clusters=3)
    #clu = MiniBatchKMeans(n_clusters=2)

    config = configparser.ConfigParser()
    config.read('cfg.ini')
    config = config['DEFAULT']
    api = twitter.Api(consumer_key=config['consumer_key'],
                      consumer_secret=config['consumer_secret'],
                      access_token_key=config['access_token_key'],
                      access_token_secret=config['access_token_secret'])
    queue = deque(maxlen=50)
    for n, line in enumerate(
            api.GetStreamFilter(track=[
                'pokemon', 'dark souls', 'darksouls', 'sonic', 'hedgehog'
            ],
                                languages=['en'])):
        if n > 1000000:
            break
        elif len(queue) != 50:
            try:
                queue.append(line['text'])
                logging.warning("%s", line['text'])
            except KeyError:
                pass
        else:
            try:
                v = vec.transform(queue)
                clu = clu.partial_fit(v)
                logging.warning('TESTING\n.\n.\n.\n.')
                logging.warning("%s, %s, %s", n, clu.predict(v[-1]), queue[-1])
            except KeyError:
                pass
            queue.clear()

    with open('cluster_model.pkl', 'wb') as f:  # pickle needs a binary file handle
        pickle.dump(clu, f)
Example #10
    affinity_propagation_valid_performance_metrics_for_plotting[item + 1] = affinity_propagation_valid_performance_metric_array[item]
    affinity_propagation_test_performance_metrics_for_plotting[item + 1] = affinity_propagation_test_performance_metric_array[item]
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(affinity_propagation_parameter_search_space_for_plotting,
                                                                        affinity_propagation_valid_performance_metrics_for_plotting,
                                                                        affinity_propagation_test_performance_metrics_for_plotting,
                                                                        'Adjusted Mutual Information Score',
                                                                        'AffinityPropagation Clustering damping parameter',
                                                                        'Affinity_Propagation_Performance',
                                                                        0,
                                                                        0.5,
                                                                        left_horizontal_limit=0.5)

# Do BIRCH, optimizing number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)

# Add one to the predictions to make them match up with range of labels, then apply Hungarian Fix
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(initial_birch_valid_predictions,
                                                           valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(initial_birch_test_predictions,
                                                          test_labels).astype('int')

# Set a starting point for optimality of the initial performance metric, to be possibly adjusted later
Example #11
centers_2 = [[10, 10], [0, 1]]
X2, _ = make_blobs(n_samples=n_samples_2, centers=centers_2, cluster_std=0.2)

# Plots the new points
plt.plot([X2[a][0] for a in range(n_samples_2)],
         [X2[b][1] for b in range(n_samples_2)], '.')
plt.axis([-4, 12, -4, 12])
plt.show()

labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Adds the new points to the old clustering with "partial_fit"
brc.partial_fit(X2)

labels = np.concatenate([labels, brc.labels_])
cluster_centers = brc.subcluster_centers_  # All the cluster centers existing (old and new)
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# All the points generated (old and new)
X_tot = np.concatenate([X, X2])

# Plots the different clusters computed
# We can see that some of the new points were added to an old cluster (around [0, 1]), while the others created a new cluster
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
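
The fragment above assumes a Birch model brc already fitted on an initial X earlier in its source file; the sketch below is a compact, self-contained version of the same flow, with all sizes and parameters chosen for illustration only.

# Sketch: initial fit, then partial_fit on a new blob around [10, 10].
import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=[[0, 1], [2, 3]], cluster_std=0.2, random_state=0)
brc = Birch(threshold=0.5, n_clusters=None).fit(X)
print(len(brc.subcluster_centers_))       # subclusters covering the initial data

X2, _ = make_blobs(n_samples=200, centers=[[10, 10], [0, 1]], cluster_std=0.2, random_state=1)
brc.partial_fit(X2)                       # new points extend the existing CF tree
print(len(brc.subcluster_centers_))       # old and new regions are now both covered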
Example #12
    def spaBIRCH(self, n_clusters, spectralptsfile, mscfile, use_scales=None):
        """
        Use BIRCH clustering on spatial data only.
        """
        self.classifier = "Spatial-BIRCH"
        self.inptsfile = spectralptsfile
        self.mscfile = mscfile

        self.loadPoints()

        print "Running BIRCH clustering on spatial data only ..."

        mscfobj = dpu.openMSC(mscfile)
        mscheader = mscfobj.header

        nscales = len(mscheader[1])
        if use_scales is None:
            use_scales = np.arange(nscales)
        else:
            if np.any(use_scales >= nscales):
                raise RuntimeError(
                    "Indices to scales out of bound, {0:d} scales in input MSC\n"
                    .format(nscales))
            if np.any(use_scales < 0):
                raise RuntimeError(
                    "Indices to scales out of bound, negative indices found")

        # Process the points in batches gradually
        npts = mscheader[0]
        niter = int(npts / self.birch.pf_npts) + 1

        rusage_denom = 1024.

        pca_flag = False

        if pca_flag:
            # Transform the data with PCA
            print "\tPCA of MSC spatial data ..."
            ipca = IncrementalPCA(n_components=len(use_scales))
            for i in xrange(niter):
                mscdata = mscfobj.read(npts=self.birch.pf_npts,
                                       use_scales=use_scales)
                mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
                if np.sum(mscbool) == 0:
                    if self.verbose:
                        # debug
                        print "\t\tno valid points, {0:d} / {1:d}".format(
                            i, niter)
                    continue
                ipca.partial_fit(mscdata[mscbool, 0:-1])
                sys.stdout.write("{0:d} / {1:d}  \n".format(i, niter))

            print np.cumsum(ipca.explained_variance_ratio_)

        # Train the standard scaler to scale the input data
        # incrementally
        print
        print "\tTraining preprocessing scaler for MSC spatial data ..."
        mscfobj.next_pt_idx = 0
        scaler = StandardScaler()
        for i in xrange(niter):
            mscdata = mscfobj.read(npts=self.birch.pf_npts,
                                   use_scales=use_scales)
            mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
            if np.sum(mscbool) == 0:
                if self.verbose:
                    # debug
                    print "\t\tno valid points, {0:d} / {1:d}".format(i, niter)
                continue
            if pca_flag:
                scaler.partial_fit(ipca.transform(mscdata[mscbool, 0:-1]))
            else:
                scaler.partial_fit(mscdata[mscbool, 0:-1])

            mem = resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / rusage_denom
            sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))

        # Train the BIRCH
        print
        print "\tTraining the BIRCH cluster ..."
        mscfobj.next_pt_idx = 0
        brc = Birch(n_clusters=n_clusters)
        for i in xrange(niter):
            mscdata = mscfobj.read(npts=self.birch.pf_npts,
                                   use_scales=use_scales)
            mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
            if np.sum(mscbool) == 0:
                if self.verbose:
                    # debug
                    print "\t\tno valid points, {0:d} / {1:d}".format(i, niter)
                continue
            if pca_flag:
                brc.partial_fit(
                    scaler.transform(ipca.transform(mscdata[mscbool, 0:-1])))
            else:
                brc.partial_fit(scaler.transform(mscdata[mscbool, 0:-1]))

            mem = resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / rusage_denom
            sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))

        # Predict the label of points after feeding all points to
        # BIRCH
        print
        print "\tPredicting BIRCH clustering labels ..."
        # Rewind the MSC file object to read points from the
        # beginning.
        mscfobj.next_pt_idx = 0
        for i in xrange(niter):
            mscdata = mscfobj.read(npts=self.birch.pf_npts,
                                   use_scales=use_scales)
            mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
            if np.sum(mscbool) == 0:
                if self.verbose:
                    # debug
                    print "\t\tno valid points, {0:d} / {1:d}".format(i, niter)
                continue
            if pca_flag:
                self.labels[mscdata[mscbool, -1].astype(int) -
                            1] = brc.predict(
                                scaler.transform(
                                    ipca.transform(mscdata[mscbool, 0:-1])))
            else:
                self.labels[mscdata[mscbool, -1].astype(int) -
                            1] = brc.predict(
                                scaler.transform(mscdata[mscbool, 0:-1]))

            mem = resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / rusage_denom
            sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))

        mscfobj.close()
Example #13
    # Perform the online clustering
    mbkm = MiniBatchKMeans(n_clusters=nb_clusters,
                           batch_size=batch_size,
                           reassignment_ratio=0.001,
                           random_state=1000)
    birch = Birch(n_clusters=nb_clusters, threshold=0.2, branching_factor=350)

    scores_mbkm = []
    scores_birch = []

    for i in range(0, nb_samples, batch_size):
        X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size]

        mbkm.partial_fit(X_batch)
        birch.partial_fit(X_batch)

        scores_mbkm.append(
            adjusted_rand_score(Y[:i + batch_size],
                                mbkm.predict(X[:i + batch_size])))
        scores_birch.append(
            adjusted_rand_score(Y[:i + batch_size],
                                birch.predict(X[:i + batch_size])))

    Y_pred_mbkm = mbkm.predict(X)
    Y_pred_birch = birch.predict(X)

    print('Adjusted Rand score Mini-Batch K-Means: {}'.format(
        adjusted_rand_score(Y, Y_pred_mbkm)))
    print('Adjusted Rand score BIRCH: {}'.format(
        adjusted_rand_score(Y, Y_pred_birch)))
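
The two score lists collected in the loop can be plotted directly as a follow-up to this example; the sketch below assumes matplotlib is available and that scores_mbkm and scores_birch are still in scope, with axis labels chosen for illustration.

# Sketch: visualise how the incremental ARI of both models evolves per batch.
import matplotlib.pyplot as plt

plt.plot(scores_mbkm, label='Mini-Batch K-Means')
plt.plot(scores_birch, label='BIRCH')
plt.xlabel('batch index')
plt.ylabel('adjusted Rand score')
plt.legend()
plt.show()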
Example #14
class BorderPointStreamClustering:
    def __init__(self, data_frame):

        self.data_frame = data_frame
        self.birch_tree = None
        self.clustering_name = None
        self.clustering_result = None
        self.cluster_count = None

    def cluster_Birch(self,
                      branch=10,
                      n_Clusters=2,
                      threshold=0.1,
                      k=10,
                      visualize=True):

        nn_indices = calculate_k_nearest_neighbours(
            self.data_frame.get_point_only_df(), k)[1]

        self.birch_tree = Birch(branching_factor=branch,
                                n_clusters=n_Clusters,
                                threshold=threshold,
                                compute_labels=True)
        point_finnished_name = "anytime calc finnished"
        border_point_name = "border point"
        self.clustering_name = "birch streaming clustering " + str(
            time.process_time())
        self.data_frame.add_result_name(self.clustering_name, -1,
                                        ColType.CLUSTER_LABEL)
        self.data_frame.add_result_name(border_point_name, 0,
                                        ColType.BORDER_POINT)
        self.data_frame.add_result_name(point_finnished_name, 0,
                                        ColType.UNKNOWN)  # TODO: add a proper ColType

        border_degree_res_name = "border degree"
        self.data_frame.add_result_name(border_degree_res_name, -1,
                                        ColType.BORDER_DEGREE)
        step_size = 50
        max_border_degree = 70
        stops = [0.1, 0.2, 0.5]
        stopped = False
        while not stopped:  # NOTE: 'stopped' is never updated in this excerpt

            new_data_points_batch = self.data_frame.get_data_batch(step_size)
            new_border_points_indices = []

            compute_approx_enclosing_degree_avg_for_batch(
                self.data_frame, new_data_points_batch, nn_indices, k,
                border_degree_res_name)

            for index, row in new_data_points_batch.iterrows():

                deg = self.data_frame.df.at[index, border_degree_res_name]
                if deg <= max_border_degree:
                    self.data_frame.add_result(border_point_name, index, 1)
                    new_border_points_indices.append(index)
                self.data_frame.add_result(point_finnished_name, index, 1)

            new_border_points = self.data_frame.df.iloc[
                new_border_points_indices, :]
            new_border_points_po = self.data_frame.get_point_only_df(
                new_border_points).to_numpy()

            print(new_border_points_po)

            self.birch_tree.partial_fit(new_border_points_po)
            print(self.birch_tree.labels_)

        print(self.birch_tree.labels_)

        labels = self.birch_tree.labels_

        bp_list = self.data_frame.get_border_points_point_only_df(
        ).index.tolist()
        bp_clus = list(zip(bp_list, labels))
        self.data_frame.add_result_name(self.clustering_name, -1,
                                        ColType.CLUSTER_LABEL)

        for ind, clus in bp_clus:
            self.data_frame.add_result(self.clustering_name, ind, clus)

        self._assign_inner_points(self.clustering_name)

        self.cluster_count = len(
            set(self.data_frame.df[self.clustering_name].tolist()))
        self.clustering_result = self.data_frame.df[
            self.clustering_name].tolist()

        return self.clustering_name

    @staticmethod
    def get_degree_and_euclidean_distance_metric(modifier=1):
        def degree_and_euclidean_distance_metric(point_a, point_b):

            distance_vector_a = point_a[:len(point_a) // 2]
            angular_vector_a = point_a[len(point_a) // 2:]

            distance_vector_b = point_b[:len(point_b) // 2]
            angular_vector_b = point_b[len(point_b) // 2:]

            angular_vector_a_norm = numpy.linalg.norm(
                angular_vector_a.tolist())
            angular_vector_b_norm = numpy.linalg.norm(
                angular_vector_b.tolist())
            dp = numpy.dot(angular_vector_a.tolist(),
                           angular_vector_b.tolist())
            upper_term = (dp / (angular_vector_a_norm * angular_vector_b_norm))
            if round(upper_term, 2) == 1:
                direction = 0
            else:
                direction = numpy.degrees(numpy.arccos(upper_term))

            if np.isnan(direction):
                direction = 0

            distance = numpy.linalg.norm(distance_vector_a - distance_vector_b)

            linear_multiplier_max = modifier
            per_degree_multiplicator = ((linear_multiplier_max - 1) / 180)
            angular_weighted_distance = distance * (
                1 + (per_degree_multiplicator * direction))
            return angular_weighted_distance

        return degree_and_euclidean_distance_metric

    def _similarity_measure_data_pre_processing(self):
        pre_processed_degree_euclidean_data = self.data_frame.get_border_points_point_only_df(
        ).values
        angular_measure_data = self.data_frame.get_border_points_direction_only_df(
        )
        angular_measure_data_values = angular_measure_data.values

        single_array_values = angular_measure_data_values
        new_list = []
        for j in range(len(pre_processed_degree_euclidean_data)):
            x = numpy.array(single_array_values[j])
            y = pre_processed_degree_euclidean_data[j]
            new_list.append(numpy.append(y, x))
        pre_processed_degree_euclidean_data = new_list

        return pre_processed_degree_euclidean_data

    def _visualize_DBSCAN(self, cluster_name):

        n_clusters_ = len(set(self.df[cluster_name].tolist()))

        if self.dimensions == 2:
            DataVisualization.visualize_plot_2d(
                self.df,
                hue=cluster_name,
                palette=DataVisualization.create_categorical_palette(
                    n_clusters_))

    def _visualize_DBSCAN_mask(self, labels, distance_measure_data_values,
                               core_sample_indices, n_clusters_):

        core_samples_mask = numpy.zeros_like(labels, dtype=bool)
        core_samples_mask[core_sample_indices] = True

        if self.dimensions == 2:
            # Black removed and is used for noise instead.
            unique_labels = set(labels)
            colors = [
                plt.cm.Spectral(each)
                for each in numpy.linspace(0, 1, len(unique_labels))
            ]
            for k, col in zip(unique_labels, colors):
                if k == -1 or k == 0:
                    # Black used for noise.
                    col = [0, 0, 0, 1]

                class_member_mask = (labels == k)

                X = numpy.asarray(distance_measure_data_values)
                xy = X[class_member_mask & core_samples_mask]
                plt.plot(xy[:, 0],
                         xy[:, 1],
                         'o',
                         markerfacecolor=tuple(col),
                         markeredgecolor='k',
                         markersize=8)

                xy = X[class_member_mask & ~core_samples_mask]
                plt.plot(xy[:, 0],
                         xy[:, 1],
                         'o',
                         markerfacecolor=tuple(col),
                         markeredgecolor='k',
                         markersize=8)

            plt.title('Estimated number of clusters: %d' % n_clusters_)
            plt.show()

    def _assign_inner_points(self, cluster_name):

        border_points = self.data_frame.get_border_points_point_only_df()
        border_points_values = border_points.values
        border_points_indices = border_points.index.values

        inner_points = self.data_frame.get_inner_points_point_only_df()
        tree = spatial.KDTree(border_points_values)

        for index, row in inner_points.iterrows():

            point_to_search = row.values
            distance, idx = tree.query(point_to_search)

            nearest_border_point = border_points_indices[idx]
            clusternr = self.data_frame.df.iloc[nearest_border_point][
                cluster_name]

            self.data_frame.add_result(cluster_name, index, clusternr)
Example #15
def birch_pipeline(args: argparse.Namespace) -> None:
    """
    The main function of this application when birch is requested.

    Parameters
    ----------
    args: `argparse.Namespace`, required
        The argument namespace passed from terminal
    """
    method_parameters = parse_parameters_from_special_string(args.method_params)
    input_dict = fetch_input_dict(input_files=args.input_files)
    X = numpy.array(input_dict['vector_representation'])
    method_searchspace = args.method_searchspace
    variable_name, variable_low, variable_high = method_searchspace.split(':')
    variable_low = int(variable_low)
    variable_high = int(variable_high)
    history = []

    output_filepath = os.path.abspath(args.output_bundle)

    if args.batch_size == 0:
        for i in tqdm(range(variable_low, variable_high + 1)):
            method_parameters[variable_name] = i

            if method_parameters['n_clusters'] == 0:
                method_parameters['n_clusters'] = None

            method_model = Birch(**method_parameters).fit(X)

            history.append(
                {'search_on': variable_name,
                 'method_name': 'birch',
                 'parameters': method_parameters.copy(),
                 'input_files': args.input_files,
                 'labels': method_model.labels_.copy(),
                 'loss': compute_point_loss(X=X, labels=method_model.labels_)
                 }
            )

            with open(output_filepath, 'wb') as handle:
                pickle.dump({'history': history}, handle)

            del method_model
    else:
        for i in tqdm(range(variable_low, variable_high + 1)):
            method_parameters[variable_name] = i
            method_model = Birch(**method_parameters)

            random_index_permutation = numpy.random.permutation(X.shape[0])
            labels = numpy.zeros(X.shape[0])

            for epoch in range(args.num_epochs):
                print('>> (status): epoch {}/{}\n'.format(epoch, args.num_epochs))
                cursor = 0
                while (cursor + args.batch_size) <= X.shape[0]:
                    method_model.partial_fit(
                        X[random_index_permutation[cursor:(cursor + args.batch_size)], :])
                    labels[random_index_permutation[cursor:(cursor + args.batch_size)]] = method_model.labels_
                    cursor += args.batch_size

            history.append(
                {'search_on': variable_name,
                 'method_name': 'birch',
                 'parameters': method_parameters.copy(),
                 'input_files': args.input_files,
                 'labels': labels.copy(),
                 'loss': compute_point_loss(X=X, labels=method_model.labels_)
                 }
            )

            with open(output_filepath, 'wb') as handle:
                pickle.dump({'history': history}, handle)

            del method_model

    print('\n>> status: all finished.\n')
Example #16
n_samples_2 = 200
centers_2 = [[10,10], [0,1]]
X2, _ = make_blobs(n_samples=n_samples_2, centers=centers_2, cluster_std=0.2)

# Plots the new points
plt.plot([X2[a][0] for a in range(n_samples_2)], [X2[b][1] for b in range(n_samples_2)], '.')
plt.axis([-4,12,-4,12])
plt.show()

labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Adds the new points to the old clustering with "partial_fit"
brc.partial_fit(X2)

labels = np.concatenate([labels,brc.labels_])
cluster_centers = brc.subcluster_centers_ # All the cluster centers existing (old and new)
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# All the points generated (old and new)
X_tot = np.concatenate([X,X2])

# Plots the different clusters computed
# We can see that some of the new points were added to an old cluster (around [0, 1]), while the others created a new cluster
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
Example #17
batch_size = 80


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5, cluster_std=1.5, random_state=1000)

    # Create an instance of BIRCH
    birch = Birch(n_clusters=5, threshold=0.15, branching_factor=100)

    # Train the model
    X_batch = []
    Y_preds = []

    for i in range(0, nb_samples, batch_size):
        birch.partial_fit(X[i:i + batch_size])
        X_batch.append(X[:i + batch_size])
        Y_preds.append(birch.predict(X[:i + batch_size]))

    print(adjusted_rand_score(birch.predict(X), Y))

    # Show the training steps
    fig, ax = plt.subplots(5, 5, figsize=(20, 12))

    for i in range(5):
        for j in range(5):
            idx = (i * 5) + j

            for k in range(5):
                ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0], X_batch[idx][Y_preds[idx] == k, 1], s=3)
Example #18
def mClassification(X, y, threshold, K=None):
    brc = Birch(n_clusters=K, threshold=threshold, compute_labels=True)
    return brc.partial_fit(X)
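
A brief usage sketch for mClassification above; it assumes the example's own sklearn import for Birch, the data, threshold, and K are illustrative, and note that y is accepted but never passed to Birch.

# Sketch: fit on one batch, then predict with the returned estimator.
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150, centers=3, random_state=0)
model = mClassification(X, y, threshold=1.0, K=3)
print(model.predict(X[:5]))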
Example #19
    
if index.vocab_size < args.bsize:
    logging.info("ERROR: Batch size [{}] must be greater than vocabulary [{}]".format(args.bsize, index.vocab_size))
    exit()

sparse_word_centroids = wordCentroids(db=index, vect=vectorizer)
# Maybe the sparse word_centroids matrix could be loaded into RAM and NMF applied.
# 
logging.info("Fitting Birch clustering for sparse coding ...")
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=args.dim) #MiniBatchKMeans(n_clusters=args.dim, init='k-means++', max_iter=4, batch_size=batch_size)
words = []
for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    #buffer.append(vstack(batch))
    logging.info("Fitted the %d th batch..." % i)
    words.append(batch[0])
    birch.partial_fit(batch[1])

words = list(chain(*words))

for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    if i == 0:
        #word_embeddings = batch[1].dot(csr_matrix(birch.subcluster_centers_).T)
        word_embeddings = birch.transform(batch[1])
    else:
        #word_embeddings = vstack([word_embeddings, batch[1].dot(csr_matrix(birch.subcluster_centers_).T)])
        
        word_embeddings = np.vstack([word_embeddings, birch.transform(batch[1])])

# word_embeddings.shape = (vocab_size, args.dim)
logging.info("DB Vocabulary size %d ..." % index.vocab_size)
logging.info("Vectorizer vocabulary size %d ..." % len(vectorizer.vocabulary_.keys()))