def agglomerate(corpus, threshold=1.4, ignoreOutliers=True):
    """
    Cluster a set of questions using the hierarchical (bottom-up)
    agglomerative clustering method with Ward linkage.

    Parameters:
        corpus (list): Tagged Questions Corpus collection of questions and their ids
        threshold (float): Distance threshold handed to AgglomerativeClustering as
            ``distance_threshold``; it sets the cut-off point of the dendrogram
            (default is 1.4)
        ignoreOutliers (bool): When True, getOutliers() is run on the representation
            matrix and the clustering is fitted on the matrix it returns; the
            reported outliers are labelled -1 afterwards (default is True)

    Returns:
        corpus: The object returned by nameClusters(), which carries a
            ``clusters`` attribute
    """
    repMatrix = makeRepresentationMatrix(corpus)

    outliers = []
    if ignoreOutliers:
        # NOTE(review): presumably `outliers` holds row indices into the original
        # matrix and the returned repMatrix has those rows removed -- confirm
        # against getOutliers().
        outliers, repMatrix = getOutliers(repMatrix, corpus=corpus)

    clustering = AgglomerativeClustering(
        linkage="ward", distance_threshold=threshold, n_clusters=None
    )
    clustering.fit(repMatrix)

    # np.insert interprets positions relative to the array *before* insertion,
    # so the k-th smallest outlier index must be shifted down by k. That
    # arithmetic is only valid for ascending indices, hence the explicit sort
    # (a no-op when getOutliers() already returns them in order).
    mapping = [index - shift for shift, index in enumerate(sorted(outliers))]
    clustering.labels_ = np.insert(clustering.labels_, mapping, -1)

    clusterMap = createClusterMap(corpus, clustering)
    print(clusterMap)  # JEFFLAG (debug output)

    corpus = nameClusters(clusterMap)
    print("The corpus's clusters after naming")
    print(corpus.clusters)  # JEFFLAG (debug output)
    return corpus
def update_clusters(clustering: AgglomerativeClustering, new_distance_threshold: float):
    """
    Recompute the cluster labels of an already-fitted clustering for a new
    distance threshold.

    Useful when the HAC algorithm has already been run to determine the points'
    hierarchy and only the threshold at which the clusters are cut should change.
    The ``clustering`` object is modified in place: ``distance_threshold``,
    ``labels_`` and ``n_clusters_`` are overwritten.

    :param AgglomerativeClustering clustering: the clustering algorithm with the distances
    :param float new_distance_threshold: the new distance threshold at which the
        number of clusters is to be determined.
    :return: None
    """
    clustering.distance_threshold = new_distance_threshold

    # Start from a clean slate: every point carries the placeholder label -1
    # until _update_clusters assigns it.
    blank_labels = np.full_like(clustering.labels_, -1, dtype=int)
    clustering.labels_ = blank_labels
    _update_clusters(clustering)

    # Invert the labels so they follow natural order.
    highest_label = np.max(clustering.labels_)
    clustering.labels_ = highest_label - clustering.labels_

    # Cluster count is derived from the largest label after inversion.
    cluster_count = np.max(clustering.labels_) + 1
    clustering.n_clusters_ = int(cluster_count)