def test_similarity_to_distance():
    """Check utils.similarity_to_distance against hand-computed values.

    The expected matrix is the one the original test declared for a cutoff
    of 20.0: off-diagonal entries equal ``20.0 / similarity`` and the
    diagonal is zero.
    """
    # ``numpy.float`` was only an alias for the builtin ``float`` and has been
    # removed from recent NumPy releases; the builtin gives the same dtype.
    matrix = numpy.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]], dtype=float)
    output = utils.similarity_to_distance(matrix, 20.0)
    expected = numpy.array([[0, 20.0, 10.0], [20.0, 0, 6.66666667],
                            [10.0, 6.666666667, 0]])
    assert output.shape == expected.shape
    # Compare absolute differences against a small tolerance.  The previous
    # check used a signed difference and a bound of 1e4, so it passed for
    # almost any output.
    assert numpy.allclose(output, expected, rtol=0.0, atol=1e-4)
示例#2
0
def hierarchical_dbscan(similarities=None, distance_cutoff=200):
    """Cluster with HDBSCAN on distances derived from ``similarities``.

    ``similarities`` is converted with ``utils.similarity_to_distance`` using
    ``distance_cutoff`` and the result is handed to HDBSCAN as a precomputed
    metric (``min_samples=3``).

    Returns the value of ``fit_predict`` or ``None`` when no similarities
    were supplied.
    """
    if similarities is not None:
        distances = utils.similarity_to_distance(similarities, distance_cutoff)
        model = hdbscan.HDBSCAN(metric="precomputed", min_samples=3)
        return model.fit_predict(distances)
    # Nothing to cluster.
    return None
示例#3
0
def dbscan(similarities=None, threshold=None, distance_cutoff=200):
    """Cluster with DBSCAN on distances derived from ``similarities``.

    ``similarities`` is converted with ``utils.similarity_to_distance`` using
    ``distance_cutoff``.  ``threshold`` becomes DBSCAN's ``eps``; a falsy
    value (``None`` or ``0``) falls back to 2.8.

    Returns the value of ``fit_predict`` or ``None`` when no similarities
    were supplied.
    """
    if similarities is None:
        return None
    distances = utils.similarity_to_distance(similarities, distance_cutoff)
    eps = threshold if threshold else 2.8
    settings = {
        "eps": eps,
        "min_samples": 3,
        "metric": "precomputed",
        "algorithm": "brute",
    }
    return sklearn.cluster.DBSCAN(**settings).fit_predict(distances)
示例#4
0
def hierarchical_clustering(similarities=None,
                            method='complete',
                            threshold=None,
                            distance_cutoff=200):
    """Create a flat clustering by cutting a hierarchical linkage at a distance.

    Args:
        similarities: Similarity values passed to
            ``utils.similarity_to_distance``; ``None`` means nothing to do.
        method: Linkage method forwarded to ``sch.linkage``.
        threshold: Cophenetic distance at which the tree is cut.  When
            ``None`` it defaults to 0.7 times the largest merge distance.
        distance_cutoff: Second argument of ``utils.similarity_to_distance``.

    Returns:
        The labels produced by ``sch.fcluster`` or ``None`` when
        ``similarities`` is ``None``.
    """
    if similarities is None:
        return None
    distance_matrix = utils.similarity_to_distance(similarities,
                                                   distance_cutoff)
    # sch.linkage() interprets a 2-D array as observation vectors, not as
    # pairwise distances.  A square distance matrix therefore has to be
    # condensed first; 1-D (already condensed) input is passed through as is.
    shape = getattr(distance_matrix, "shape", ())
    if len(shape) == 2 and shape[0] == shape[1]:
        from scipy.spatial.distance import squareform
        # checks=False: tolerate tiny floating-point asymmetries.
        distance_matrix = squareform(distance_matrix, checks=False)
    linkage = sch.linkage(distance_matrix, method=method)
    # Explicit None test so that a caller-supplied threshold of 0 is honoured
    # instead of being replaced by the default.
    if threshold is None:
        threshold = 0.7 * linkage[:, 2].max()
    labels = sch.fcluster(linkage, threshold, criterion='distance')
    return labels
    # NOTE(review): this looks like the body of a command-line entry point
    # whose ``def`` line is not part of the visible source -- confirm.
    args = parse_arguments()

    # Optional edge-list input: loaded only when a filename was given.
    edgelist = None
    if args.edgelist:
        print("edgelist filename=" + args.edgelist)
        edgelist = utils.read_edgelist_file(args.edgelist)

    # Optional matrix input: loaded only when a filename was given.
    matrix = None
    if args.matrix:
        print("matrix filename=" + args.matrix)
        matrix = utils.read_distance_matrix_file(args.matrix)

    # When a conversion value is given, the loaded data is passed through
    # utils.similarity_to_distance with that value cast to float.
    if args.convert:
        print("convert=" + args.convert)
        if args.edgelist:
            # Only the 'd' entry of the edge list is converted, in place.
            edgelist['d'] = utils.similarity_to_distance(edgelist['d'], float(args.convert))
        if args.matrix:
            matrix = utils.similarity_to_distance(matrix, float(args.convert))

    # The names filename is only echoed within this fragment.
    if args.names:
        print("names filenname=" + args.names)
    print("clustering_algorithm=" + args.clustering_algorithm)


    # Dispatch on the requested algorithm name.
    # NOTE(review): the ``edgelist=`` / ``distance_matrix=`` keywords do not
    # match the ``similarities=`` parameter of the hierarchical_clustering and
    # dbscan definitions visible in this file -- confirm which version of
    # those functions this caller targets.
    if args.clustering_algorithm == 'hierarchical':
        clustering = hierarchical_clustering(edgelist=edgelist, distance_matrix=matrix)
    elif args.clustering_algorithm == 'dbscan':
        clustering = dbscan(edgelist=edgelist, distance_matrix=matrix)
    elif args.clustering_algorithm == 'spectral':
        clustering = spectral(edgelist=edgelist, distance_matrix=matrix)
    # NOTE(review): the branch below has no body in the visible source.
    elif args.clustering_algorithm == 'agglomerative':
    # NOTE(review): this looks like the body of a command-line entry point
    # whose ``def`` line is not part of the visible source -- confirm.
    args = parse_arguments()

    # Optional edge-list input: loaded only when a filename was given.
    edgelist = None
    if args.edgelist:
        print("edgelist filename=" + args.edgelist)
        edgelist = utils.read_edgelist_file(args.edgelist)

    # Optional matrix input: loaded only when a filename was given.
    matrix = None
    if args.matrix:
        print("matrix filename=" + args.matrix)
        matrix = utils.read_distance_matrix_file(args.matrix)

    # When a conversion value is given, the loaded data is passed through
    # utils.similarity_to_distance with that value cast to float.
    if args.convert:
        print("convert=" + args.convert)
        if args.edgelist:
            # Only the 'd' entry of the edge list is converted, in place.
            edgelist['d'] = utils.similarity_to_distance(
                edgelist['d'], float(args.convert))
        if args.matrix:
            matrix = utils.similarity_to_distance(matrix, float(args.convert))

    # The names filename is only echoed within this fragment.
    if args.names:
        print("names filenname=" + args.names)
    print("clustering_algorithm=" + args.clustering_algorithm)

    # Dispatch on the requested algorithm name.
    # NOTE(review): the ``edgelist=`` / ``distance_matrix=`` keywords do not
    # match the ``similarities=`` parameter of the clustering functions
    # visible in this file -- confirm which version this caller targets.
    if args.clustering_algorithm == 'hierarchical':
        clustering = hierarchical_clustering(edgelist=edgelist,
                                             distance_matrix=matrix)
    elif args.clustering_algorithm == 'dbscan':
        clustering = dbscan(edgelist=edgelist, distance_matrix=matrix)
    elif args.clustering_algorithm == 'hdbscan':
        clustering = hierarchical_dbscan(edgelist=edgelist,
                                         distance_matrix=matrix)
def test_similarity_to_distance():
    """Check utils.similarity_to_distance against hand-computed values.

    The expected matrix is the one the original test declared for a cutoff
    of 20.0: off-diagonal entries equal ``20.0 / similarity`` and the
    diagonal is zero.
    """
    # ``numpy.float`` was only an alias for the builtin ``float`` and has been
    # removed from recent NumPy releases; the builtin gives the same dtype.
    matrix = numpy.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]], dtype=float)
    output = utils.similarity_to_distance(matrix, 20.0)
    expected = numpy.array([[0, 20.0, 10.0],
                            [20.0, 0, 6.66666667],
                            [10.0, 6.666666667, 0]])
    assert output.shape == expected.shape
    # Compare absolute differences against a small tolerance.  The previous
    # check used a signed difference and a bound of 1e4, so it passed for
    # almost any output.
    assert numpy.allclose(output, expected, rtol=0.0, atol=1e-4)