def test_hdbscan_approximate_predict():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    cluster, prob = approximate_predict(clusterer, np.array([[-1.5, -1.0]]))
    assert cluster == 2
    cluster, prob = approximate_predict(clusterer, np.array([[1.5, -1.0]]))
    assert cluster == 1
    cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]]))
    assert cluster == -1
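# The test above relies on an `X` fixture defined elsewhere in the hdbscan test
# suite. For readers trying the API in isolation, a minimal self-contained
# sketch of the same fit-then-predict workflow follows; the blob centers and
# synthetic data are illustrative assumptions, not the suite's fixture.
import numpy as np
from sklearn.datasets import make_blobs
from hdbscan import HDBSCAN, approximate_predict

X, _ = make_blobs(n_samples=200, centers=[[-1.5, -1.0], [1.5, -1.0]],
                  cluster_std=0.2, random_state=0)
# prediction_data=True caches the extra structures approximate_predict needs
clusterer = HDBSCAN(prediction_data=True).fit(X)
labels, probs = approximate_predict(clusterer, np.array([[-1.5, -1.0],
                                                         [10.0, 10.0]]))
print(labels, probs)  # a cluster id per query point (-1 = noise) and a strength in [0, 1]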
def transform(self,
              documents: Union[str, List[str]]) -> Tuple[List[int], np.ndarray]:
    """ After having fit a model, use transform to predict new instances

    Arguments:
        documents: A single document or a list of documents to predict on

    Returns:
        predictions: Topic predictions for each document
        probabilities: The topic probability distribution
    """
    if isinstance(documents, str):
        documents = [documents]

    embeddings = self._extract_embeddings(documents)
    umap_embeddings = self.umap_model.transform(embeddings)
    probabilities = hdbscan.membership_vector(self.cluster_model, umap_embeddings)
    predictions, _ = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

    if self.mapped_topics:
        predictions = self._map_predictions(predictions)
        probabilities = self._map_probabilities(probabilities)

    if len(documents) == 1:
        probabilities = probabilities.flatten()

    return predictions, probabilities
def predictCellmlsCluster(self, cellmls=None):
    documentations = self.__getCellmlsDocumentation(cellmls)
    cellmlUrls, cellmlDocs = zip(*documentations.items())
    tfidf = self.__calcuateTfidf(cellmlDocs)
    test_labels, strengths = hdbscan.approximate_predict(self.clusterer, tfidf.A)
    return test_labels
def ClusterOneMonthData(month, name):
    ## Month should be an integer (0 = Apr_2016, 8 = dec_2016, 9 = jan_2017, 13 = May_2017)
    ## Name should be a string, with which we should save the dataframe
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/features/')  # Windows
    # Reading one month's data using the pattern list
    One_Month_Data = ReadMonthlyData(pattern_list[month])
    # Removing outliers from one month's data
    IL_One_Month_Data, monthly_data_mask = RemoveOutliers(One_Month_Data,
                                                          pca_final, th1, th2)
    # Predicting labels and strengths for one month
    labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data)

    # Writing the two PCs, labels and strengths into a dataframe and storing it
    # as an HDF5 file
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/Results/')
    pca_dataframe = pd.DataFrame(IL_One_Month_Data)
    pca_dataframe['labels'] = labels
    pca_dataframe['strengths'] = strengths
    pca_dataframe.columns = ['pca1', 'pca2', 'labels', 'strengths']
    pca_dataframe.to_hdf(path_or_buf='{}_clusters'.format(name), key='pca_dataframe')
    One_Month_Data.iloc[monthly_data_mask].to_csv(
        path_or_buf='{}_feature_data'.format(name), header=True, index=True)
    del (IL_One_Month_Data, One_Month_Data, labels, monthly_data_mask,
         pca_dataframe, strengths)
def hdbscan_see(X, test_bi_pca, labels, min_samples):
    result = []
    for mSample in min_samples:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=mSample,
                                    prediction_data=True,
                                    gen_min_span_tree=True).fit(X)
        test_labels, strengths = hdbscan.approximate_predict(clusterer, test_bi_pca)
        test_labels[test_labels > -1] = 0
        print(mSample)
        print(data_utils.show_performance(labels, test_labels))
        # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, clusterer.labels_))
        # print("Calinski-Harabaz Index: %0.3f" % metrics.calinski_harabaz_score(X, clusterer.labels_))
        print('--')
        # clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis',
        #                                       edge_alpha=0.6,
        #                                       node_size=80,
        #                                       edge_linewidth=2)
        # clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
        # clusterer.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())
        # NOTE: 'result' stays empty unless this append is re-enabled, so the
        # silhouette plot below will be blank as written.
        # result.append(metrics.silhouette_score(X, clusterer.labels_))
    plt.plot(min_samples, result)
    plt.xlabel('min_samples')
    plt.ylabel('Silhouette Coefficient')
def recommend_co_merchants_hdb(df, lat, long, city, merchant, cluster_object):
    # Predict the cluster for the latitude/longitude provided
    test_labels, strengths = hdbscan.approximate_predict(cluster_object,
                                                         [[lat, long]])
    predicted_cluster = test_labels[0]
    print('Predicted cluster for this lat/long combination is: ' + str(predicted_cluster))
    print("_______________________________________________________________________________")

    if predicted_cluster == -1:
        return 'No merchants close by'
    # Get the best merchants in this cluster
    else:
        pop_merch_recomm_df = (df[df['cluster'] == predicted_cluster]
                               .iloc[0:5][['merchant', 'city', 'latitude', 'longitude']])
        pop_merch_recomm_df = pop_merch_recomm_df.reset_index(drop=True)
        mask = (pop_merch_recomm_df.merchant == merchant) & \
               (pop_merch_recomm_df.latitude == lat) & \
               (pop_merch_recomm_df.longitude == long)
        print('Since you are currently in ' + city.capitalize() + ' at ' +
              merchant.capitalize() + ', how about you visit these merchants around this area?')
        return pop_merch_recomm_df[~mask]
def visualize(data, text, clusterer, cluster_by, whitespace_only=False):
    '''Visualize appearances of each cluster in the sample text.
    'data' should be output like that from get_comp_data'''
    # Get data for text to use for clustering:
    tokens = [unidecode.unidecode(char) for char in text]
    tokens = [ct.tweak_whitespace(w) for w in tokens[0:-1]]
    print("Starting labelling")
    labels, _ = hdbscan.approximate_predict(clusterer, data[cluster_by])
    clusters = sorted(list(set(labels)))
    print("Done labelling")
    content = dt.div(dt.h1("Cluster Visualization"))
    for i, c in enumerate(clusters):
        print("DEBUG: - Visualizing cluster %d" % i)
        details = dt.details(dt.summary("Cluster: %d" % c))
        colors = [1.0 if c == labels[j] else 0.0 for j in range(len(tokens))]
        if whitespace_only:
            for j in range(len(tokens)):
                colors[j] = colors[j] if text[j] in [" ", "\n"] else 0.0
        token_data = zip(tokens, colors)
        details.add(ct.colored_text(token_data))
        content.add(details)
    return content
def get_clustering(self, attributes_2D_mapping):
    """Returns HDBSCAN cluster labels"""
    assert attributes_2D_mapping.shape[1] == 2
    # approximate_predict returns a (labels, strengths) tuple; keep only the labels
    new_labels, _ = hdbscan.approximate_predict(self.clusterer,
                                                attributes_2D_mapping)
    return new_labels
def predict(self, embeddings: np.ndarray):
    if not self.is_fitted:
        return Clusterer._empty_assignment(len(embeddings))
    embeddings_umap = self.umap.transform(embeddings)
    labels, probabilities = hdbscan.approximate_predict(self.hdbscan,
                                                        embeddings_umap)
    return ClusterAssignment(labels=labels, probabilities=probabilities)
def assign_samples(chunk, X, y, model, scale, chunk_size, values=False):
    """Runs a model's assignment on a chunk of input

    Args:
        chunk (int)
            Index of chunk to process
        X (NumpyShared)
            n x 2 array of core and accessory distances for n samples
        y (NumpyShared)
            An n-vector to store results, with the most likely cluster memberships
            or an n by k matrix with the component responsibilities for each sample.
        model (BGMMFit or DBSCANFit)
            Fitted model used to assign the samples
        scale (numpy.array)
            Scaling of core and accessory distances from :class:`~PopPUNK.models.BGMMFit`
        chunk_size (int)
            Size of each chunk in X
        values (bool)
            Whether to return the responsibilities, rather than the most likely
            assignment (used for entropy calculation).

            Default is False
    """
    # Make sure this is run single threaded
    with set_env(MKL_NUM_THREADS='1', NUMEXPR_NUM_THREADS='1', OMP_NUM_THREADS='1'):
        if isinstance(X, NumpyShared):
            X_shm = shared_memory.SharedMemory(name=X.name)
            X = np.ndarray(X.shape, dtype=X.dtype, buffer=X_shm.buf)
        if isinstance(y, NumpyShared):
            y_shm = shared_memory.SharedMemory(name=y.name)
            y = np.ndarray(y.shape, dtype=y.dtype, buffer=y_shm.buf)

        start = chunk * chunk_size
        end = min((chunk + 1) * chunk_size, X.shape[0])
        if start >= end:
            raise RuntimeError("start >= end in BGMM assign")

        if isinstance(model, BGMMFit):
            logprob, lpr = log_likelihood(X[start:end, :], model.weights,
                                          model.means, model.covariances, scale)
            responsibilities = np.exp(lpr - logprob[:, np.newaxis])
            if not values:
                # Default: return the most likely cluster
                y[start:end] = responsibilities.argmax(axis=1)
            else:
                # Otherwise return the actual responsibilities
                y[start:end, :] = responsibilities
        elif isinstance(model, DBSCANFit):
            y[start:end] = hdbscan.approximate_predict(model.hdb,
                                                       X[start:end, :] / scale)[0]
def get_result(self):
    points = [[self.age, self.incomeneed, self.riskpropension,
               self.protectionneed, self.inheritanceindex]]
    print("My Points: ", points)
    labels, strengths = hdbscan.approximate_predict(hdb_cluster, points)
    print("Predictions: ", labels[0])
    global RS
    RS = labels[0]
def hdbscan_segmentation(embedding, n_img_dims=None, coord_scales=None,
                         metric='euclidean', min_cluster_size=50,
                         slice_for_fit=None, **hdbscan_kwargs):
    assert hdbscan is not None, 'need hdbscan for hdbscan_segmentation'
    assert metric in hdbscan.dist_metrics.METRIC_MAPPING
    if n_img_dims is None:
        # default: assume one embedding image is being passed
        n_img_dims = len(embedding.shape) - 1
    emb_shape = embedding.shape
    img_shape = emb_shape[-n_img_dims:]

    # append image coordinates as features if requested
    if coord_scales is not None:
        if not isinstance(coord_scales, collections.abc.Iterable):
            coord_scales = n_img_dims * (coord_scales,)
        assert len(coord_scales) == n_img_dims, f'{coord_scales}, {n_img_dims}'
        embedding = _append_coords(embedding, coord_scales)

    # compute #pixels per image
    n_pixels = 1
    for s in img_shape:
        n_pixels *= s

    # reshape embedding for clustering
    embedding = embedding.contiguous().view(-1, embedding.shape[-n_img_dims - 1],
                                            n_pixels).permute(0, 2, 1)

    # init HDBSCAN clusterer
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric=metric,
                                **hdbscan_kwargs)

    # iterate over images in batch
    result = []
    for emb in embedding:
        if slice_for_fit is not None:
            clusterer.fit(emb.view(*img_shape, -1)[slice_for_fit]
                          .contiguous().view(-1, emb.shape[-1]))
            clusterer.generate_prediction_data()
            # approximate_predict returns (labels, strengths); keep the labels only
            labels = hdbscan.approximate_predict(clusterer, emb)[0].reshape(img_shape)
        else:
            labels = clusterer.fit_predict(emb).reshape(img_shape)
        result.append(labels)
    result = np.stack(result, axis=0).reshape(emb_shape[:-n_img_dims - 1] +
                                              emb_shape[-n_img_dims:])
    return torch.from_numpy(result)
def HDBScan_clustering(data, test, columns, minimum_cluster_size=3000):
    """
    Clustering function using HDBSCAN. The function creates a new column in
    both the train and test dataframes.

    data - train dataframe on which clustering is performed; a new column with
           the cluster identification is created
    test - test dataframe where a new column with the cluster identification is
           created, based on the training set clustering
    columns - columns to perform clustering on
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=minimum_cluster_size,
                                prediction_data=True).fit(data[columns])
    train_labels, strengths = hdbscan.approximate_predict(clusterer, data[columns])
    test_labels, strengths = hdbscan.approximate_predict(clusterer, test[columns])
    print('Number of clusters in training set using HDBScan: {}'.format(len(np.unique(train_labels))))
    print('Number of clusters in test set using HDBScan: {}'.format(len(np.unique(test_labels))))
    data['HDBScan'] = train_labels
    test['HDBScan'] = test_labels
def predict(self, text):
    """
    Predict the cluster for the input text

    :return tuple: label and strength
    """
    labels, strengths = hdbscan.approximate_predict(
        self.model,
        [self.encoder.encode(configuration.DEFAULT_TOKENIZER.transform(text))])
    return int(labels[0]), float(strengths[0])
def _cluster_test(hdbscan, cluster_centers, df):
    coords = df[['latitude', 'longitude']] * np.pi / 180
    df = df.assign(cluster=approximate_predict(hdbscan, coords)[0])
    df = df.merge(cluster_centers, left_on='cluster', right_index=True,
                  how='left', suffixes=('', '_cluster'))
    return df
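# _cluster_test converts degrees to radians before predicting, which suggests
# the clusterer was fit on radian coordinates (typically with the haversine
# metric). A hedged sketch of that assumed companion setup follows; the data,
# parameters, and the way cluster_centers is built are all illustrative
# assumptions, not the original project's code.
import numpy as np
import pandas as pd
import hdbscan

rng = np.random.default_rng(0)
train = pd.DataFrame({'latitude': rng.uniform(40.0, 41.0, 300),
                      'longitude': rng.uniform(-74.0, -73.0, 300)})
coords = train[['latitude', 'longitude']] * np.pi / 180
model = hdbscan.HDBSCAN(metric='haversine', min_cluster_size=10,
                        prediction_data=True).fit(coords)
# per-cluster mean coordinates, indexed by cluster label (-1 = noise)
cluster_centers = coords.groupby(model.labels_).mean()
# usage: df_out = _cluster_test(model, cluster_centers, some_test_df)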
def learn(self, personas, keywordsId, answers, keywordsCond):
    i = 0
    for persona in personas:
        embed = concatEmbeddingEn(getContextualEmbedding(persona, verbose=True))
        df2 = pd.DataFrame(embed[0])
        df2 = tools.Compressor.compressVectorDfdim1Todim2(df2, self.compressor)
        df2 = df2.rename(columns=str)
        df2['word'] = [s.replace("</w>", "") for s in embed[1]]

        # Build a sliding-window "sentence" of context around each word
        sentences = []
        doc = ' '.join(embed[1])
        h = 0
        windows_size = 20
        print(doc, flush=True)
        print(embed[1], flush=True)
        for word in embed[1]:
            sentences.append(' '.join(
                embed[1][max(0, h - windows_size):min(len(embed[1]), h + windows_size)]))
            h += 1
        print("SENTENCES", flush=True)
        print(sentences, flush=True)
        df2['sentence'] = sentences
        df2 = df2[~df2.word.isin(stopwords.words('english'))]

        data_formatted = []
        for col in df2.columns:
            if col != "word" and col != "sentence":
                data_formatted.append(df2[col].tolist())
        data = np.array(data_formatted[0:32]).T
        print("TO LEARN", flush=True)
        print(data, flush=True)
        print(data.shape, flush=True)
        # self.hdbscan_model.fit(data)
        labels, _ = hdbscan.approximate_predict(self.hdbscan_model, data)
        df2['clusterid'] = labels
        df2['keywordsId'] = ([keywordsId[i].split('|') for l in range(len(df2))]
                             if len(keywordsId[i].split('|')) > 0 else [])
        df2['keywordsCond'] = ([keywordsCond[i].split('|') for l in range(len(df2))]
                               if len(keywordsCond[i].split('|')) > 0 else [])
        df2['answer'] = [answers[i] for l in range(len(df2))]
        print(df2.head(), flush=True)
        print(df2.columns, flush=True)
        print(self.dfWiki.columns, flush=True)
        print(self.dfWiki, flush=True)
        self.dfWiki = pd.concat([self.dfWiki, df2]).reset_index(drop=True)
        print(self.dfWiki)
        print("TAIL", flush=True)
        print(self.dfWiki.tail(10), flush=True)
        i += 1
def hdb_cluster(data, min_s_ratio, min_clust_ratio, f_save=None, num_cores=1,
                min_s_num=None, min_clust_num=None, clust_select='eom',
                max_samps=100000):
    n_obs = data.shape[0]
    print(n_obs)
    min_samples = int(n_obs / min_s_ratio)
    min_cluster_size = int(n_obs / min_clust_ratio)
    if min_s_num is not None:
        min_samples = min_s_num
    if min_clust_num is not None:
        min_cluster_size = min_clust_num
    print('cluster method:', clust_select)

    # if n_subsamp != None:
    #     data.sample

    # Subsample if the dataset is too large, fit on the subsample, then assign
    # the full dataset with approximate_predict
    if data.shape[0] > max_samps:
        print("Subsampling")
        full_data = data.copy()
        data = data[np.random.choice(data.shape[0], max_samps, replace=False), :]
        min_samples = int(data.shape[0] / min_s_ratio)
        min_cluster_size = int(data.shape[0] / min_clust_ratio)
        clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
                                    min_cluster_size=min_cluster_size,
                                    core_dist_n_jobs=num_cores,
                                    cluster_selection_method=clust_select,
                                    prediction_data=True).fit(data)
        labels, _ = hdbscan.approximate_predict(clusterer, full_data)
        print(labels)
        if f_save is not None:
            f_save = f_save.replace(".p", "") + ".p"
            pickle.dump(labels, open(f_save, "wb"))
        return labels

    print("Number of cores", num_cores)
    if num_cores is not None and num_cores != 1:
        labels = hdbscan.HDBSCAN(min_samples=min_samples,
                                 min_cluster_size=min_cluster_size,
                                 core_dist_n_jobs=num_cores,
                                 cluster_selection_method=clust_select).fit_predict(data)
    else:
        labels = hdbscan.HDBSCAN(min_samples=min_samples,
                                 min_cluster_size=min_cluster_size,
                                 cluster_selection_method=clust_select).fit_predict(data)
    if f_save is not None:
        f_save = f_save.replace(".p", "") + ".p"
        pickle.dump(labels, open(f_save, "wb"))
    return labels
def predict(self, attributes0, thresh=0.75):
    attributes = np.array(attributes0)
    pick_clusters = self.comparison_summary[
        self.comparison_summary['dif'] >= thresh].index.tolist()
    res, cluster_strengths = hdbscan.approximate_predict(self.clustering_model,
                                                         attributes)
    return (pd.Series(res).isin(pick_clusters).astype(int).values,
            cluster_strengths)
def __predict(self):
    #====================================================
    #==   CHECK DATA & MODEL
    #====================================================
    # - Check if data are set
    if self.data is None:
        logger.error("Input data array is None!")
        return -1

    # - Check if clustering model is set
    if self.clusterer is None:
        logger.error("Clusterer is not set!")
        return -1

    # - Retrieve prediction data from current model
    logger.info("Retrieving prediction data from current model (if any) ...")
    self.prediction_data = self.clusterer.prediction_data_

    #====================================================
    #==   CLUSTER DATA USING SAVED MODEL
    #====================================================
    logger.info("Encode input data using loaded model ...")
    self.labels, self.probs = hdbscan.approximate_predict(self.clusterer, self.data)

    #================================
    #==   SAVE CLUSTERED DATA
    #================================
    logger.info("Saving unsupervised encoded data to file ...")
    N = self.data.shape[0]
    print("Cluster data N=", N)
    snames = np.array(self.source_names).reshape(N, 1)
    objids = np.array(self.data_classids).reshape(N, 1)
    # labels/probs are 1-D, so reshape them to columns before concatenating
    clustered_data = np.concatenate((snames, objids,
                                     self.labels.reshape(N, 1),
                                     self.probs.reshape(N, 1)), axis=1)
    head = "# sname id clustid clustprob"
    Utils.write_ascii(clustered_data, self.outfile, head)

    #================================
    #==   PLOT
    #================================
    logger.info("Plotting results ...")
    self.__plot_predict(self.clusterer, self.data, self.labels,
                        self.source_names, self.data_labels,
                        self.prediction_data, self.prediction_extra_data,
                        self.outfile_plot)

    return 0
def transform(self, documents: Union[str, List[str]]) -> List[int]:
    """ After having fit a model, use transform to predict new instances """
    if isinstance(documents, str):
        documents = [documents]
    embeddings = self._extract_embeddings(documents)
    umap_embeddings = self.umap_model.transform(embeddings)
    predictions, strengths = hdbscan.approximate_predict(self.cluster_model,
                                                         umap_embeddings)
    if self.mapped_topics:
        predictions = self._map_predictions(predictions)
    return predictions
def predict_news_cluster_by_date(nrows, date):
    last_script_execution = db.get_last_script_execution(script_name)
    last_processed_date = (last_script_execution["last_processed_date"].values[0]
                           if last_script_execution is not None else None)

    # Load only news articles since the last batch
    news_articles = db.get_news_articles_from_startdate_to_enddate(
        str(last_processed_date), str(date), nrows)
    data_matrix = vectorizer.transform(news_articles["text_lemmatized_without_stopwords"])
    labels, probabilities = approximate_predict(model, data_matrix.toarray())

    # Clusters are identified by a sorted string of news ids
    return labels
def hdbscan(self, inclusion_threshold: float or None = None):
    """
    Perform gating with the HDBSCAN algorithm
    (https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)

    HDBSCAN clustering is performed on either the whole dataset or a sample of
    the dataset if specified. If clustering is performed on a sample, a call to
    'approximate_predict' is made for the remaining data.
    (https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan.prediction.approximate_predict)

    Parameters
    -----------
    inclusion_threshold: float, optional
        Minimum probability threshold for data inclusion; data below this
        threshold will be classed as noise

    Returns
    --------
    ChildPopulationCollection
        Updated child populations with events indexing complete
    """
    sample = None
    # If parent is empty just return the child populations with empty index array
    if self.empty_parent:
        return self.child_populations
    if self.frac is not None:
        sample = self.sampling(self.data, 40000)
    # Cluster!
    model = hdbscan.HDBSCAN(core_dist_n_jobs=-1,
                            min_cluster_size=self.min_pop_size,
                            prediction_data=True)
    if sample is not None:
        model.fit(sample[[self.x, self.y]])
        self.data['labels'], self.data['label_strength'] = hdbscan.approximate_predict(
            model, self.data[[self.x, self.y]])
    else:
        model.fit(self.data[[self.x, self.y]])
        self.data['labels'] = model.labels_
        self.data['label_strength'] = model.probabilities_
    # Post clustering checks
    if inclusion_threshold is not None:
        mask = self.data['label_strength'] < inclusion_threshold
        self.data.loc[mask, 'labels'] = -1
    # Predict clusters for child populations
    polygon_shapes = self.generate_polygons()
    population_predictions = self._predict_pop_clusters(polygon_shapes)
    return self._assign_clusters(population_predictions, polygon_shapes)
def assign_samples_dbscan(X, hdb, scale):
    """Use a fitted dbscan model to assign new samples to a cluster

    Args:
        X (numpy.array)
            N x 2 array of core and accessory distances
        hdb (hdbscan.HDBSCAN)
            Fitted DBSCAN from hdbscan package
        scale (numpy.array)
            Scale factor of model object

    Returns:
        y (numpy.array)
            Cluster assignments by sample
    """
    y = hdbscan.approximate_predict(hdb, X / scale)[0]
    return y
def classify_test(self):
    # the 'with' block closes the file automatically
    with open('../data/classical-artists.ids') as f:
        flat = f.read()
    aggrs = []
    names = []
    for _id in flat.split("\n"):
        doc = self.get_doc(_id)
        if doc is not None:
            if len(doc["recordings"].keys()) < doc["track_count"]:
                doc["track_count"] = len(doc["recordings"].keys())
            aggrs.append(self.aggregate_features(doc["recordings"])["median"])
            names.append(doc["name"])
    test_labels, strengths = hd.approximate_predict(self.clusterer, aggrs)
    for i, name in enumerate(names):
        print(name, test_labels[i], strengths[i])
def test_approx_predict_default():
    """Verify that approximate_predict_flat produces the same results as the default"""
    # Given the base HDBSCAN trained on some data,
    clusterer = HDBSCAN(cluster_selection_method='eom',
                        cluster_selection_epsilon=0,
                        prediction_data=True).fit(X)
    # When using approximate_predict_flat without specifying n_clusters,
    labels_flat, proba_flat = approximate_predict_flat(clusterer, X_test,
                                                       n_clusters=None)
    # Then the clustering should match that due to approximate_predict
    labels_base, proba_base = approximate_predict(clusterer, X_test)
    assert_array_equal(labels_flat, labels_base)
    assert_array_equal(proba_flat, proba_base)
    return
def predict_new_points(test_dataset, clusterer, mrs):
    test_bi_pca_all = data_utils.get_test_transformed(test_dataset, mrs)
    labels = test_bi_pca_all[['label']]
    test_bi_pca = test_bi_pca_all.drop(['label'], axis=1)

    # see what happened
    ot = test_bi_pca_all[test_bi_pca_all['label'] == -1]
    plt.scatter(ot[['pca_1']], ot[['pca_2']], s=50, linewidth=0, c='yellow',
                alpha=1, label='Test outliers')
    noot = test_bi_pca_all[test_bi_pca_all['label'] != -1]
    plt.scatter(noot[['pca_1']], noot[['pca_2']], s=50, linewidth=0, c='blue',
                alpha=1, label='Test data points')
    legend = plt.legend(loc='upper left')
    legend.legendHandles[2]._sizes = [30]
    legend.legendHandles[3]._sizes = [40]

    # this call must run (it was commented out) for test_labels to exist below
    test_labels, strengths = hdbscan.approximate_predict(clusterer, test_bi_pca)
    test_labels[test_labels > -1] = 0
    sensitivity, specificity, accuracy = data_utils.show_performance(labels, test_labels)
    return sensitivity, specificity, accuracy
def test_hdbscan(data):
    cluster_loaded = load_model()
    labels, strengths = hdbscan.approximate_predict(cluster_loaded, data)
    print(labels, strengths)
    # db = DBSCAN(eps=0.25, min_samples=30)
    # db.fit(data)
    # labels = db.labels_

    # get counts per cluster
    cluster_groups = {}
    for i in labels:
        cluster_groups[i] = cluster_groups.get(i, 0) + 1
    print("cluster_groups", cluster_groups)

    # Number of clusters in labels, ignoring noise if present
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return labels, cluster_groups
def predict(input_string):
    corpus = pickle.load(open('corpus.sav', 'rb'))
    # load() is a classmethod returning the model; calling it on a freshly
    # constructed instance would not rebind lda_model
    lda_model = gensim.models.LdaMulticore.load("lda.model")
    reducer = pickle.load(open('reducer.sav', 'rb'))
    cluster = pickle.load(open('cluster.sav', 'rb'))

    df2 = pd.DataFrame([input_string], columns=['message'])
    df2['message'] = df2['message'].apply(lambda x: list(x.split(' ')))
    df2['token'] = df2['message'].apply(sent_to_words)
    df2['token'] = df2['message'].apply(remove_stopwords)
    df2['token'] = df2['token'].apply(lemmatization)
    texts2 = df2.token.values
    bigram2 = make_bigrams(texts2)
    id2word2 = gensim.corpora.Dictionary(bigram2)
    id2word2.compactify()
    corpus2 = [id2word2.doc2bow(text) for text in bigram2]

    # Assign topics based on optimum number of topics
    df2.loc[0, 'topic'] = sorted(lda_model[corpus2[0]], reverse=True,
                                 key=lambda x: x[1])[0][0]
    df2.loc[0, 'topic_probability'] = sorted(lda_model[corpus2[0]], reverse=True,
                                             key=lambda x: x[1])[0][1]

    # Get the word embeddings
    sentence_embeddings = get_word_embedding(df2.loc[0, 'token'])

    # Split the tensor into 768 columns for clustering
    df_new = pd.DataFrame(columns=[i for i in range(768)], index=[0])
    for j in range(768):
        df_new.iloc[0, j] = sentence_embeddings[j]

    # Concat the tensors with the original dataframe
    df2 = pd.concat([df2, df_new], axis=1)

    # Filter out LDA and word embedding columns, keeping a 2-D row for the reducer
    embeddings = df2.iloc[:, 4:]
    chat_embeddings = embeddings.iloc[0, :].values[None, :]
    test_data = reducer.transform(chat_embeddings)
    test_labels, test_prob = hdbscan.approximate_predict(cluster, test_data)
    return test_labels, test_prob
def hdbscan_predict(embedding, df_scaled, clusterer, force_predict=True):
    if force_predict:
        # Soft assignment: take the highest-membership cluster for every point,
        # so nothing is left as noise
        mem_vec = pd.DataFrame(hdbscan.membership_vector(clusterer, embedding.values))
        test_labels = mem_vec.idxmax(axis=1).to_numpy()
        strengths = mem_vec.max(axis=1).to_numpy()
    else:
        test_labels, strengths = hdbscan.approximate_predict(clusterer, embedding)

    # Get probabilities
    scores = pd.DataFrame(strengths)
    scores.columns = ['score']
    # Get clusters
    labels = pd.DataFrame(test_labels)
    labels.columns = ['cluster']
    # Join
    scores = scores.join(labels).join(embedding).join(df_scaled)
    n_clusters = sum(scores['cluster'].unique() != -1)
    scores['cluster'].value_counts()
    return scores
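# The force_predict branch above trades approximate_predict's hard labels
# (which can be -1 for noise) for membership_vector's soft per-cluster scores,
# which always nominate some cluster. A small illustrative comparison of the
# two APIs on synthetic data (all names here are placeholders, not part of the
# original function):
import hdbscan
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=1)
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(X)
query = X[:5]

hard_labels, strengths = hdbscan.approximate_predict(clusterer, query)  # may emit -1 (noise)
soft = hdbscan.membership_vector(clusterer, query)  # shape (n_queries, n_clusters)
forced_labels = soft.argmax(axis=1)  # always picks the closest cluster, never -1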
def __init__(self, qf, df, labels, verbose=True):
    self.cluster = hdbscan.HDBSCAN(min_cluster_size=10,
                                   prediction_data=True).fit(qf)
    clusids, strengths = hdbscan.approximate_predict(self.cluster, qf)
    uniques = np.sort(np.unique(clusids))
    n_labels = len(uniques)
    if verbose:
        label_strengths = [np.median(strengths[clusids == l]) for l in uniques]
        label_counts = [np.sum(clusids == l) for l in uniques]
        print("# clusters found:", n_labels)
        print(f"cluster sizes: min:{np.min(label_counts)} "
              f"mean:{np.mean(label_counts)} max:{np.max(label_counts)}")
        print(f"median label strengths: min:{np.min(label_strengths)} "
              f"mean:{np.mean(label_strengths)} max:{np.max(label_strengths)}")
    self.svms = [sklearn.svm.SVR() for _ in range(n_labels)]
    for l, svm in zip(uniques, self.svms):
        indices = clusids == l
        print(l, indices.sum())
        svm.fit(df[indices], labels[indices])