# Shared imports for the clustering-based diarizers below. Dataset,
# Segmentation, generate_segmentation and find_cluster_for_noisy_samples are
# project-internal and come from the project's own modules (import paths not
# shown in this excerpt).
import time
from typing import List

import numpy as np
from scipy.sparse import issparse
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering,
                             DBSCAN, KMeans, MeanShift, estimate_bandwidth)
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, StandardScaler


def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    # Per-document standardized features, kept for experiments; the raw
    # features are what is actually clustered below.
    x_scaled = []
    for doc_features in documents_features:
        x_scaled.append(StandardScaler().fit_transform(doc_features))

    predicted_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        x = documents_features[i]  # alternatively: x_scaled[i]
        true_n_clusters = dataset.segmentations[i].author_count
        assert x.shape[0] == len(preprocessed_documents[i])

        diarizer = AffinityPropagation(damping=hyperparams['damping'],
                                       preference=hyperparams['preference'],
                                       copy=True,
                                       affinity='euclidean',
                                       max_iter=100,
                                       convergence_iter=5)
        labels = diarizer.fit_predict(x).tolist()
        predicted_label_lists.append(labels)

        estimated_n_clusters = len(set(labels))
        print('Document', i + 1, '/', len(documents_features), x.shape,
              'in', time.time() - start_time, 's')
        print('Real author count = {}, estimated = {}'.format(
            true_n_clusters, estimated_n_clusters))
        print()

    return generate_segmentation(preprocessed_documents, documents_features,
                                 predicted_label_lists, dataset.documents,
                                 task=task)
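# Hypothetical usage sketch (not from the original code): illustrates the shape
# of the hyperparams dict consumed by the AffinityPropagation fit_predict
# above. The keys match the hyperparams['damping'] and hyperparams['preference']
# lookups in the method; the values are placeholders, not tuned settings.
example_ap_hyperparams = {'damping': 0.9, 'preference': -50}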
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    document_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        preprocessed_doc_tokens = preprocessed_documents[i]
        doc_features = documents_features[i]
        num_authors = dataset.segmentations[i].author_count
        assert doc_features.shape[0] == len(preprocessed_doc_tokens)

        # Alternative scalings tried:
        #   lsa = make_pipeline(TruncatedSVD(n_components=50), Normalizer(copy=False))
        #   x_scaled = lsa.fit_transform(doc_features)
        #   x_scaled = preprocessing.scale(doc_features, axis=0)
        #   x_scaled = doc_features
        x_scaled = StandardScaler().fit_transform(doc_features)

        diarizer = AgglomerativeClustering(n_clusters=num_authors,
                                           affinity=hyperparams['affinity'],
                                           linkage=hyperparams['linkage'])
        labels = diarizer.fit_predict(x_scaled)
        document_label_lists.append(labels)

        print('Document', i + 1, '/', len(documents_features),
              'in', time.time() - start_time, 's')

    return generate_segmentation(preprocessed_documents, documents_features,
                                 document_label_lists, dataset.documents,
                                 task=task)
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    # Reduce each document's features to 2 LSA components and L2-normalize.
    x_scaled = []
    svd = TruncatedSVD(n_components=2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    for doc_features in documents_features:
        # x_scaled.append(StandardScaler().fit_transform(doc_features))
        x_scaled.append(lsa.fit_transform(doc_features))

    predicted_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        x = x_scaled[i]  # alternatively: documents_features[i]
        assert x.shape[0] == len(preprocessed_documents[i])

        diarizer = IsolationForest(n_estimators=100,
                                   max_samples=1.0,
                                   contamination=0.3,
                                   max_features=1.0,
                                   bootstrap=True,
                                   random_state=None)
        diarizer.fit(x)
        labels_array = diarizer.predict(x)
        # IsolationForest returns +1 for inliers and -1 for outliers;
        # map inliers to author 0 and outliers to author 1.
        labels_array[labels_array == 1] = 0
        labels_array[labels_array == -1] = 1
        predicted_label_lists.append(labels_array.tolist())

        print('Document', i + 1, '/', len(documents_features), x.shape,
              'in', time.time() - start_time, 's')
        print()

    return generate_segmentation(preprocessed_documents, documents_features,
                                 predicted_label_lists, dataset.documents,
                                 task=task)
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    # Scaling did not help here, so the raw features are clustered directly.
    predicted_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        x = documents_features[i]
        true_n_clusters = dataset.segmentations[i].author_count
        assert x.shape[0] == len(preprocessed_documents[i])

        # The bandwidth estimate is only printed for inspection; MeanShift is
        # left to estimate its own bandwidth internally.
        bandwidth = estimate_bandwidth(x, quantile=0.3)
        print('bandwidth:', bandwidth)

        diarizer = MeanShift()
        labels = diarizer.fit_predict(x)
        predicted_label_lists.append(labels)

        estimated_n_clusters = len(diarizer.cluster_centers_)
        print('Document', i + 1, '/', len(documents_features), x.shape,
              'in', time.time() - start_time, 's')
        print('Real author count = {}, estimated = {}'.format(
            true_n_clusters, estimated_n_clusters))
        print()

    return generate_segmentation(preprocessed_documents, documents_features,
                                 predicted_label_lists, dataset.documents,
                                 task=task)
def select_optimal_hyperparams(self,
                               preprocessed_documents,
                               documents_features,
                               documents,
                               true_segmentations,
                               author_labels=None,
                               author_counts=None,
                               task=None):
    x_scaled = []
    for doc_features in documents_features:
        x_scaled.append(self.scaler().fit_transform(doc_features))

    results = []
    total_hyperparams = (len(self.hyperparams['damping']) *
                         len(self.hyperparams['preference']))
    current_comb = 1
    start_time = time.time()

    for damping in self.hyperparams['damping']:
        for preference in self.hyperparams['preference']:
            print('Combination {}/{}'.format(current_comb, total_hyperparams))
            predicted_label_lists = []
            model = AffinityPropagation(damping=damping,
                                        preference=preference,
                                        copy=True,
                                        affinity='euclidean',
                                        max_iter=100,
                                        convergence_iter=5)
            for i in range(len(documents_features)):
                x = documents_features[i]  # alternatively: x_scaled[i]
                labels = model.fit_predict(x)
                predicted_label_lists.append(labels)
            current_comb += 1

            predicted_segmentations = generate_segmentation(
                preprocessed_documents, documents_features,
                predicted_label_lists, documents, task=task)
            score = self.get_bcubed_f1(true_segmentations,
                                       predicted_segmentations)
            results.append({'damping': damping,
                            'preference': preference,
                            'score': score})

    sorted_results = sorted(results, key=lambda r: r['score'], reverse=True)
    best_result = sorted_results[0]
    print('The best hyperparams found:', best_result,
          'in {} s.'.format(time.time() - start_time))
    return best_result
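# Hypothetical example of the self.hyperparams grid iterated by the
# AffinityPropagation select_optimal_hyperparams above (illustrative candidate
# values only; the grids used in the original experiments are not shown here).
# preference=None lets scikit-learn default to the median input similarity.
example_ap_hyperparams_grid = {
    'damping': [0.5, 0.7, 0.9],
    'preference': [None, -50, -10],
}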
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    document_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        preprocessed_doc_tokens = preprocessed_documents[i]
        doc_features = documents_features[i]
        num_authors = dataset.segmentations[i].author_count
        assert doc_features.shape[0] == len(preprocessed_doc_tokens)

        # Alternative scalings tried:
        #   lsa = make_pipeline(TruncatedSVD(n_components=50), Normalizer(copy=False))
        #   x_scaled = lsa.fit_transform(doc_features)
        #   x_scaled = preprocessing.scale(doc_features, axis=0)
        #   x_scaled = doc_features
        x_scaled = StandardScaler().fit_transform(doc_features)

        diarizer = GaussianMixture(n_components=num_authors,
                                   covariance_type='full',
                                   tol=0.001,
                                   reg_covar=1e-06,
                                   max_iter=100,
                                   n_init=10,
                                   init_params='kmeans',
                                   weights_init=None,
                                   means_init=None,
                                   precisions_init=None,
                                   random_state=None,
                                   warm_start=False,
                                   verbose=0,
                                   verbose_interval=10)
        diarizer.fit(x_scaled)
        labels = diarizer.predict(x_scaled)
        document_label_lists.append(labels)

        print('Document', i + 1, '/', len(documents_features),
              'in', time.time() - start_time, 's')

    return generate_segmentation(preprocessed_documents, documents_features,
                                 document_label_lists, dataset.documents,
                                 task=task)
def select_optimal_hyperparams(self,
                               preprocessed_documents,
                               documents_features,
                               documents,
                               true_segmentations,
                               author_labels=None,
                               author_counts=None,
                               task=None):
    x_scaled = []
    for doc_features in documents_features:
        x_scaled.append(self.scaler().fit_transform(doc_features))

    results = []
    total_hyperparams = (len(self.hyperparams['affinity']) *
                         len(self.hyperparams['linkage']))
    current_comb = 1
    start_time = time.time()

    for affinity in self.hyperparams['affinity']:
        for linkage in self.hyperparams['linkage']:
            print('Combination {}/{}'.format(current_comb, total_hyperparams))
            predicted_label_lists = []
            for i in range(len(documents_features)):
                model = AgglomerativeClustering(n_clusters=author_counts[i],
                                                affinity=affinity,
                                                linkage=linkage)
                x = x_scaled[i]  # alternatively: documents_features[i]
                labels = model.fit_predict(x)
                predicted_label_lists.append(labels)
            current_comb += 1

            predicted_segmentations = generate_segmentation(
                preprocessed_documents, documents_features,
                predicted_label_lists, documents, task=task)
            if task == 'a':
                score = self.get_macro_f1(true_segmentations,
                                          predicted_segmentations)
            else:
                score = self.get_bcubed_f1(true_segmentations,
                                           predicted_segmentations)
            results.append({'affinity': affinity,
                            'linkage': linkage,
                            'score': score})

    sorted_results = sorted(results, key=lambda r: r['score'], reverse=True)
    best_result = sorted_results[0]
    print('The best hyperparams found:', best_result,
          'in {} s.'.format(time.time() - start_time))
    return best_result
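# Hypothetical example of the self.hyperparams grid iterated by the
# AgglomerativeClustering select_optimal_hyperparams above (illustrative values
# only; not the grid from the original experiments). Note that 'ward' linkage
# only works with the 'euclidean' affinity, so a real grid would either exclude
# that combination or accept that it raises an error.
example_agglomerative_hyperparams_grid = {
    'affinity': ['euclidean', 'cosine', 'manhattan'],
    'linkage': ['average', 'complete'],
}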
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    predicted_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        preprocessed_doc_tokens = preprocessed_documents[i]
        doc_features = documents_features[i]
        true_n_clusters = dataset.segmentations[i].author_count
        assert doc_features.shape[0] == len(preprocessed_doc_tokens)

        # Alternative scalings tried:
        #   lsa = make_pipeline(TruncatedSVD(n_components=50), Normalizer(copy=False))
        #   x_scaled = lsa.fit_transform(doc_features)
        #   x_scaled = preprocessing.scale(doc_features, axis=0)
        #   x_scaled = doc_features
        # Sparse matrices cannot be mean-centered, so with_mean is disabled for them.
        x_scaled = StandardScaler(
            with_mean=not issparse(doc_features)).fit_transform(doc_features)

        # 'brute' is the algorithm used by the NearestNeighbors module to
        # compute pointwise distances and find nearest neighbors.
        diarizer = DBSCAN(eps=hyperparams['eps'],
                          min_samples=hyperparams['min_samples'],
                          metric=hyperparams['metric'],
                          algorithm='brute')
        labels = diarizer.fit_predict(x_scaled)

        # DBSCAN labels noise points with -1; exclude them from the cluster count.
        estimated_n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        noisy = find_cluster_for_noisy_samples(labels)
        predicted_label_lists.append(labels)

        print('Document', i + 1, '/', len(documents_features), x_scaled.shape,
              'in', time.time() - start_time, 's')
        print('Real author count = {}, estimated = {}, noisy ='.format(
            true_n_clusters, estimated_n_clusters), noisy)
        print()

    return generate_segmentation(preprocessed_documents, documents_features,
                                 predicted_label_lists, dataset.documents,
                                 task=task)
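# find_cluster_for_noisy_samples is a project helper that is not shown in this
# excerpt. Below is a minimal sketch of one plausible implementation, assuming
# it (a) reassigns DBSCAN noise points (label -1) in place to the most common
# real cluster (or to cluster 0 when everything is noise) and (b) returns the
# number of noise samples it touched. The actual project implementation may
# differ.
def find_cluster_for_noisy_samples_sketch(labels):
    noise_indices = [k for k, label in enumerate(labels) if label == -1]
    real_labels = [label for label in labels if label != -1]
    # Fall back to a single cluster 0 when DBSCAN marked every sample as noise.
    fallback = max(set(real_labels), key=real_labels.count) if real_labels else 0
    for k in noise_indices:
        labels[k] = fallback
    return len(noise_indices)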
def select_optimal_hyperparams(self,
                               preprocessed_documents,
                               documents_features,
                               documents,
                               true_segmentations,
                               author_labels=None,
                               author_counts=None,
                               task=None):
    x_scaled = []
    for doc_features in documents_features:
        x_scaled.append(
            self.scaler(with_mean=not issparse(doc_features))
            .fit_transform(doc_features))

    results = []
    total_hyperparams = (len(self.hyperparams['metric']) *
                         len(self.hyperparams['eps']) *
                         len(self.hyperparams['min_samples']))
    current_comb = 1
    start_time = time.time()

    for metric in self.hyperparams['metric']:
        for eps in self.hyperparams['eps']:
            for min_samples in self.hyperparams['min_samples']:
                print('Combination {}/{}'.format(current_comb,
                                                 total_hyperparams))
                predicted_label_lists = []
                model = DBSCAN(eps=eps,
                               min_samples=min_samples,
                               metric=metric,
                               algorithm='brute')
                for i in range(len(x_scaled)):
                    x = x_scaled[i]
                    labels = model.fit_predict(x)
                    find_cluster_for_noisy_samples(labels)
                    predicted_label_lists.append(labels)
                current_comb += 1

                predicted_segmentations = generate_segmentation(
                    preprocessed_documents, documents_features,
                    predicted_label_lists, documents, task=task)
                score = self.get_bcubed_f1(true_segmentations,
                                           predicted_segmentations)
                # Alternative scores tried:
                #   score = self.get_silhouette_coeff(x_scaled, predicted_label_lists, metric)
                #   score = self.get_calinski_harabaz_score(x_scaled, predicted_label_lists)
                #   score = (self.get_calinski_harabaz_score(x_scaled, predicted_label_lists) *
                #            self.get_silhouette_coeff(x_scaled, predicted_label_lists, metric)) / \
                #           self.get_esstimated_n_difference(predicted_label_lists, author_counts)
                #   score = self.get_esstimated_n_difference(predicted_label_lists, author_counts)
                results.append({'eps': eps,
                                'min_samples': min_samples,
                                'metric': metric,
                                'score': score})

    sorted_results = sorted(results, key=lambda r: r['score'], reverse=True)
    best_result = sorted_results[0]
    print('The best hyperparams found:', best_result,
          'in {} s.'.format(time.time() - start_time))
    return best_result
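# Hypothetical example of the self.hyperparams grid iterated by the DBSCAN
# select_optimal_hyperparams above (illustrative candidate values only; the
# grids used in the original experiments are not shown here).
example_dbscan_hyperparams_grid = {
    'metric': ['euclidean', 'cosine'],
    'eps': [0.5, 1.0, 2.0],
    'min_samples': [3, 5, 10],
}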
def fit_predict(self,
                preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray],
                dataset: Dataset,
                hyperparams=None,
                task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)

    document_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        preprocessed_doc_tokens = preprocessed_documents[i]
        doc_features = documents_features[i]
        num_authors = dataset.segmentations[i].author_count

        # Debugging aids (visualizing the ground truth in 2D LSA space):
        #   truth = dataset.segmentations[i].offsets_to_authors(
        #       [j for j in range(len(preprocessed_doc_tokens))])
        #   lsa = make_pipeline(TruncatedSVD(n_components=2), Normalizer(copy=False))
        #   x_scaled = lsa.fit_transform(doc_features)
        #   plt.scatter(x_scaled[:, 0], x_scaled[:, 1], s=100, c=truth)
        #   plt.show()

        assert doc_features.shape[0] == len(preprocessed_doc_tokens)

        # Alternative scalings tried:
        #   x_scaled = preprocessing.scale(doc_features, axis=0)
        #   x_scaled = doc_features
        x_scaled = StandardScaler().fit_transform(doc_features)

        use_one_cluster = (task == 'a')  # currently unused; see the commented block below

        # 'auto' chooses 'elkan' for dense data and 'full' (EM style) for sparse data.
        diarizer = KMeans(n_clusters=num_authors,  # 1 if use_one_cluster else num_authors
                          init='k-means++',
                          n_init=10,
                          max_iter=300,
                          algorithm='full')
        labels = diarizer.fit_predict(x_scaled)

        # One-cluster outlier experiment (task 'a'): fit a single cluster and
        # mark samples whose squared distance to the centroid exceeds a
        # multiple of the average inertia as belonging to a second author.
        # if use_one_cluster:
        #     diffs = []
        #     threshold = 1
        #     inertia = diarizer.inertia_
        #     avg_inertia = inertia / doc_features.shape[0]
        #     centroid = diarizer.cluster_centers_[0].reshape((1, x_scaled.shape[1]))
        #     for k in range(len(labels)):
        #         x_d = x_scaled[k].reshape((1, x_scaled[k].shape[0]))
        #         inertia_x = paired_distances(x_d, centroid, metric='euclidean') ** 2
        #         diffs.append(inertia_x)
        #         if inertia_x[0] > threshold * avg_inertia:
        #             labels[k] = 1
        #     print('min:', min(diffs))
        #     print('max:', max(diffs))
        #     print('avg:', avg_inertia)
        #     print()

        document_label_lists.append(labels)
        print('Document', i + 1, '/', len(documents_features),
              'in', time.time() - start_time, 's')

    return generate_segmentation(preprocessed_documents, documents_features,
                                 document_label_lists, dataset.documents,
                                 task=task)