Example #1
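The test examples on this page assume shared module-level imports and fixtures that the listings do not show. A minimal sketch of what they might look like (an assumption, not the original test module; any dataset of 100 points in 4 well-separated blobs of 25 satisfies the assertions below):

import numpy as np
import pytest
from numpy.testing import assert_array_equal, assert_equal
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances

from beard.clustering import BlockClustering, ScipyHierarchicalClustering
from beard.metrics import paired_f_score

# 100 points in 4 well-separated blobs of 25 each: labels y run 0..3,
# so (y <= 1) splits the data into two blocks of 50 points.
X, y = make_blobs(n_samples=100, centers=4, cluster_std=0.5, random_state=42)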
def test_onthefly_labels():
    """Test that labels are recomputed on the fly after set_params."""
    clusterer = BlockClustering(base_estimator=ScipyHierarchicalClustering(
        n_clusters=1, method="complete"))
    clusterer.fit(X)
    assert_array_equal([100], np.bincount(clusterer.labels_))
    clusterer.clusterers_[0].set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
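The test passes because beard's ScipyHierarchicalClustering keeps the linkage matrix computed at fit time, so set_params(n_clusters=4) merely re-cuts the stored dendrogram and labels_ is recomputed on access. The same idea in plain SciPy (illustrative only, not the library's code):

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

Z = linkage(np.random.rand(10, 2), method="complete")  # fit once
one = fcluster(Z, t=1, criterion="maxclust")   # cut into a single cluster
four = fcluster(Z, t=4, criterion="maxclust")  # re-cut into 4, no refit needed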
Example #2
class Clusterer(object):
    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            # Parallelism is handled at the block level, so force the last
            # pipeline step down to a single job if it exposes n_jobs.
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass  # the final step may not expose n_jobs

        # threshold determines when to split blocks
        # into smaller ones adding first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm='nysiis')

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = 'average'

    def load_data(self, signatures_path, publications_path,
                  input_clusters_path):
        """Load signatures and input clusters into the X and y arrays."""
        signatures_by_uuid = load_signatures(signatures_path,
                                             publications_path)

        # Plain object/int dtypes: the np.object/np.int aliases were
        # removed in NumPy 1.24.
        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = -np.ones(len(self.X), dtype=int)

        i = 0
        with open(input_clusters_path, 'r') as fd:
            for line in fd:
                cluster = json.loads(line)
                for signature_uuid in cluster['signature_uuids']:
                    if signature_uuid not in signatures_by_uuid:
                        continue  # TODO figure out how this can happen
                    self.X[i, 0] = signatures_by_uuid[signature_uuid]
                    self.y[i] = cluster['cluster_id']
                    i += 1

    def load_model(self, input_filename):
        with open(input_filename, 'rb') as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score),
            n_jobs=n_jobs,
            verbose=True)
        self.clusterer.fit(self.X, self.y)
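fit relies on a module-level _affinity callable that the listing does not include. A plausible sketch, assuming beard's convention that a callable affinity receives a block's rows and returns a square distance matrix, and that distance_estimator is a classifier pipeline exposing predict_proba (both assumptions):

import numpy as np
from scipy.spatial.distance import squareform


def _affinity(X, step=10000):
    """Hypothetical pairwise affinity scored by the global estimator.

    BlockClustering pickles its arguments when dispatching blocks to
    joblib workers, which is why distance_estimator lives in a module
    global instead of on the Clusterer instance.
    """
    all_i, all_j = np.triu_indices(len(X), k=1)  # upper-triangular pairs
    n_pairs = len(all_i)
    distances = np.zeros(n_pairs, dtype=np.float64)
    for start in range(0, n_pairs, step):  # score the pairs in chunks
        end = min(n_pairs, start + step)
        pairs = np.empty((end - start, 2), dtype=object)
        for k, (i, j) in enumerate(zip(all_i[start:end], all_j[start:end])):
            pairs[k, 0], pairs[k, 1] = X[i, 0], X[j, 0]
        # Probability that the two signatures belong to different authors.
        distances[start:end] = distance_estimator.predict_proba(pairs)[:, 1]
    return squareform(distances)  # full symmetric distance matrix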
Example #3
def test_predict():
    """Test predict."""
    clusterer = BlockClustering(blocking="precomputed",
                                base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(X, blocks=(y <= 1))
    pred = clusterer.predict(X, blocks=(y <= 1))
    assert_array_equal([25, 25, 25, 25], np.bincount(pred))

    # Samples falling in blocks unseen during fit get the "unknown" label -1.
    pred = clusterer.predict(X, blocks=10 * np.ones(len(X)))
    assert_array_equal(-np.ones(len(X)), pred)
Example #4
def test_partial_fit():
    """Test partial_fit."""
    blocks = (y <= 1)

    clusterer1 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1])
    assert_equal(len(clusterer1.clusterers_), 1)  # only one block seen so far
    clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1])
    assert_equal(len(clusterer1.clusterers_), 2)  # second block added on the fly

    clusterer2 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer2.fit(X, blocks=blocks)

    c1 = clusterer1.predict(X, blocks=blocks)
    c2 = clusterer2.labels_

    # Incremental and one-shot fits agree up to a relabelling of the clusters.
    assert_equal(paired_f_score(c1, c2), 1.0)
Example #5
def test_fit(n_jobs):
    """Test fit."""
    # Single block
    clusterer = BlockClustering(
        blocking="single",
        base_estimator=AgglomerativeClustering(n_clusters=4,
                                               linkage="complete"),
        n_jobs=n_jobs)
    clusterer.fit(X)

    assert_equal(len(clusterer.clusterers_), 1)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed blocks
    clusterer = BlockClustering(
        blocking="precomputed",
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete"),
        n_jobs=n_jobs)
    clusterer.fit(X, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed affinity
    clusterer = BlockClustering(
        affinity="precomputed",
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(affinity="precomputed",
                                                   n_clusters=2,
                                                   method="complete"),
        n_jobs=n_jobs)
    X_affinity = euclidean_distances(X)
    clusterer.fit(X_affinity, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Custom blocking function
    X_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        return y[X_ids.ravel()] <= 1  # block labels into {0,1} and {2,3}

    clusterer = BlockClustering(
        blocking=_blocking,
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete",
                                               affinity=_distance))
    clusterer.fit(X_ids)

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
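The custom-blocking case hands row ids, not points, to the estimator, so the _distance metric it references (not shown in the listing) has to map ids back into X. A plausible sketch, assuming scikit-learn's callable-metric convention of being invoked on pairs of rows:

import numpy as np


def _distance(a, b):
    # Hypothetical metric for the custom-blocking test above: a and b are
    # single-element id rows, so dereference them into X before measuring
    # the euclidean distance.
    return np.linalg.norm(X[int(a[0])] - X[int(b[0])])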
Example #6
def test_validation():
    """Test the validation of hyper-parameters and input data."""
    with pytest.raises(ValueError):
        # Unknown blocking strategy.
        clusterer = BlockClustering(
            blocking="foobar", base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        # blocking="precomputed" requires blocks= to be passed to fit.
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        # ...and predict also needs blocks= when blocking is precomputed.
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X, blocks=(y <= 1))
        clusterer.predict(X)
Example #7
if __name__ == "__main__":
    # Load data
    data = np.load("data/author-disambiguation.npz")
    X = data["X"]
    truth = data["y"]

    # Block clustering with fixed threshold
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            threshold=0.5,
            affinity=affinity,
            method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    for cluster in np.unique(labels):
        entries = set()

        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
    print()

    # Statistics
    print("Number of blocks =", len(block_clusterer.clusterers_))
    print("True number of clusters", len(np.unique(truth)))
Example #8
def test_single_signature(n_jobs):
    """Test clustering of a  single signature."""
    import numbers
    clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(np.array([X[0]]))
    assert isinstance(clusterer.predict(X[0])[0], numbers.Integral)
Example #9
class Clusterer(object):
    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            # Parallelism is handled at the block level, so force the last
            # pipeline step down to a single job if it exposes n_jobs.
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass  # the final step may not expose n_jobs

        # threshold determines when to split blocks
        # into smaller ones adding first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm="nysiis")

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided signatures
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`

        """
        signatures_by_uuid = load_signatures(signatures)

        # Plain object/int dtypes: the np.object/np.int aliases were
        # removed in NumPy 1.24.
        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = -np.ones(len(self.X), dtype=int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Load a clusterer model dumped by pickle.

        Args:
            input_filename (str): path to the file with the dumped clusterer.
        """
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump the clusterer to a file.

        Args:
            output_filename (str): path where the model will be dumped.
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit data using the estimator"""
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score,
            ),
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)

    def prepare_test_data(self, test_uuids, labels):
        """Return the arrays used for scoring the training and test datasets.

        Args:
            test_uuids (set): signature uuids used for testing.
            labels (list): true labels (author ids) for the test dataset.
        """
        all_uuids = np.vectorize(lambda x: x.signature_uuid)(self.X).flatten()
        test_uuids_array = np.array(list(test_uuids))
        mask = np.isin(all_uuids, test_uuids_array)
        y_train = self.y[~mask]
        y_test = np.array(labels)
        labels_train = self.clusterer.labels_[~mask]
        labels_test = self.clusterer.labels_[mask]
        return labels_train, y_train, labels_test, y_test

    def nb_of_clusters_predicted_for_author(
            self, input_clusters_with_all_author_labels,
            test_signature_authors_ids):
        """Count how many predicted clusters each author's signatures span."""
        author_ids = np.array([sample[0]['author_id'] for sample in self.X])
        # Element-wise comparison on an object array, so "== None" (rather
        # than "is None") is intended: fill the signatures that have no
        # curated author id with the provided test labels.
        author_ids[author_ids == None] = test_signature_authors_ids
        signatures_per_author = {
            cluster['author_id']: set(cluster['signature_uuids'])
            for cluster in input_clusters_with_all_author_labels
        }
        nb_of_clusters_per_author = {}
        for author_id in signatures_per_author.keys():
            author_mask = author_ids == author_id
            signatures_predicted_in_one_cluster = self.clusterer.labels_[
                author_mask]
            nb_of_clusters_per_author[author_id] = np.unique(
                signatures_predicted_in_one_cluster).size
        return nb_of_clusters_per_author

    def score(self, labels_train, y_train, labels_test, y_test):
        """Return the B3 clustering statistics (precision, recall, F1 score)
        for the whole dataset, the training subset and the test subset.

        Args:
            labels_train (array): labels predicted for the training set.
            y_train (array): true labels for the training set.
            labels_test (array): labels predicted for the test set.
            y_test (array): true labels for the test set.
        """
        return (b3_precision_recall_fscore(self.y, self.clusterer.labels_),
                b3_precision_recall_fscore(y_train, labels_train)
                if labels_train.size != 0 else None,
                b3_precision_recall_fscore(y_test, labels_test)
                if labels_test.size != 0 else None)
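A minimal sketch of how this class might be driven end to end; estimator, signatures, input_clusters, test_uuids and test_labels are placeholders for objects supplied by the surrounding application:

# Hypothetical driver for the Clusterer above (all names illustrative).
clusterer = Clusterer(estimator)      # estimator carries distance_estimator
clusterer.load_data(signatures, input_clusters)
clusterer.fit(n_jobs=4)
clusterer.save_model("clusterer.pkl")

labels_train, y_train, labels_test, y_test = clusterer.prepare_test_data(
    test_uuids, test_labels)
overall_b3, train_b3, test_b3 = clusterer.score(
    labels_train, y_train, labels_test, y_test)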