Exemplo n.º 1
0
 def _scoring2(X_affinity, labels_true, labels_pred):
     """Return the B^3 F-score of ``labels_pred`` against ``labels_true``.

     The affinity matrix is only sanity-checked (square, and matching the
     shape of the module-level ``X2``); the score itself ignores it.
     """
     assert X_affinity.shape[0] == X_affinity.shape[1]
     assert X_affinity.shape == X2.shape
     return b3_f_score(labels_true, labels_pred)
Exemplo n.º 2
0
 def _scoring(X_raw, labels_true, labels_pred):
     """Return the B^3 F-score of ``labels_pred`` against ``labels_true``.

     ``X_raw`` is only sanity-checked against the module-level ``X``;
     the score itself is computed from the label vectors alone.
     """
     assert X_raw.shape == X.shape
     return b3_f_score(labels_true, labels_pred)
Exemplo n.º 3
0
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None,
               verbose=1, n_jobs=-1, clustering_method="average",
               clustering_random_state=42, clustering_test_size=None,
               clustering_threshold=None):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output cluster. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Parameter passed to ``ScipyHierarchicalClustering``. Used only if
        ``clustering_test_size`` is specified.

    :param clustering_random_state: int or RandomState
        Random state for splitting the data into training and test data.

    :param clustering_test_size: float
        Part of data used in the test set.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    """
    # Assumes that 'distance_estimator' lives in global, making things fast
    global distance_estimator

    # Context managers close the file handles even if loading raises.
    # NOTE(review): pickle.load executes arbitrary code on untrusted input --
    # only load models produced by the trusted ``distance.py`` script.
    with open(distance_model, "rb") as model_file:
        distance_estimator = pickle.load(model_file)

    signatures, records = load_signatures(input_signatures,
                                          input_records)

    # Build a column vector of signature dicts sorted by signature_id;
    # ``indices`` maps signature_id -> row index in X.
    indices = {}
    # ``object`` instead of the ``np.object`` alias, which was removed
    # in NumPy 1.24.
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(sorted(signatures.values(),
                                         key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    # Semi-supervised block clustering
    if input_clusters:
        with open(input_clusters, "r") as clusters_file:
            true_clusters = json.load(clusters_file)

        # -1 marks signatures with no known cluster (treated as unlabeled).
        # ``int`` instead of the removed ``np.int`` alias.
        y_true = -np.ones(len(X), dtype=int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        if clustering_test_size is not None:
            # Hold out a test split: its labels are hidden from the
            # clusterer (left at -1) so it can be scored afterwards.
            train, test = train_test_split(
                np.arange(len(X)),
                test_size=clustering_test_size,
                random_state=clustering_random_state)

            y = -np.ones(len(X), dtype=int)
            y[train] = y_true[train]

        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            # Cast to a plain int: json.dump rejects NumPy integer keys.
            clusters[int(label)] = [r[0]["signature_id"] for r in X[mask]]

        with open(output_clusters, "w") as output_file:
            json.dump(clusters, output_file)

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))
        print("B^3 F-score (overall) =", b3_f_score(y_true, labels))

        if clustering_test_size:
            print("B^3 F-score (train) =",
                  b3_f_score(y_true[train], labels[train]))
            print("B^3 F-score (test) =",
                  b3_f_score(y_true[test], labels[test]))
Exemplo n.º 4
0
 def _scoring(labels_true, labels_pred):
     """Return the B^3 F-score of ``labels_pred`` against ``labels_true``."""
     return b3_f_score(labels_true, labels_pred)
Exemplo n.º 5
0
                                        threshold=args.clustering_threshold,
                                        method=args.clustering_method,
                                        scoring=b3_f_score),
                                    verbose=args.verbose,
                                    n_jobs=args.n_jobs).fit(X, y)

        labels = clusterer.labels_

        # Save predicted clusters
        if args.output_clusters:
            clusters = {}

            for label in np.unique(labels):
                mask = (labels == label)
                clusters[label] = [r[0]["signature_id"] for r in X[mask]]

            json.dump(clusters, open(args.output_clusters, "w"))

        # Statistics
        if args.verbose and args.input_clusters:
            print("Number of blocks =", len(clusterer.clusterers_))
            print("True number of clusters", len(np.unique(y_true)))
            print("Number of computed clusters", len(np.unique(labels)))
            print("B^3 F-score (overall) =", b3_f_score(y_true, labels))

            if args.clustering_test_size:
                print("B^3 F-score (train) =",
                      b3_f_score(y_true[train], labels[train]))
                print("B^3 F-score (test) =",
                      b3_f_score(y_true[test], labels[test]))