def fit_predict(self, preprocessed_documents: List[List[tuple]], documents_features: List[np.ndarray],
                    dataset: Dataset, hyperparams=None, task=None) -> List[Segmentation]:
        assert len(documents_features) == len(preprocessed_documents)

        # Scaled copies are kept for experimentation; the raw features are used below.
        x_scaled = []
        for doc_features in documents_features:
            x_scaled.append(StandardScaler().fit_transform(doc_features))

        predicted_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            x = documents_features[i]  # raw features; x_scaled[i] was also tried
            true_n_clusters = dataset.segmentations[i].author_count

            assert x.shape[0] == len(preprocessed_documents[i])

            diarizer = AffinityPropagation(damping=hyperparams['damping'],
                                           preference=hyperparams['preference'],
                                           copy=True, affinity='euclidean',
                                           max_iter=100, convergence_iter=5)

            labels = diarizer.fit_predict(x).tolist()
            predicted_label_lists.append(labels)

            estimated_n_clusters = len(set(labels))

            print('Document', i + 1, '/', len(documents_features), x.shape, 'in', time.time() - start_time, 's')
            print('Real author count = {}, estimated = {}'.format(true_n_clusters, estimated_n_clusters))
            print()

        return generate_segmentation(preprocessed_documents, documents_features,
                                     predicted_label_lists, dataset.documents, task=task)
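
AffinityPropagation chooses the number of clusters itself; the preference value (and, to a lesser extent, damping) controls how many exemplars it ends up with, which is why those are the tuned hyperparameters here. A minimal standalone sketch on synthetic data (not part of the original class) showing how the preference shifts the exemplar count:

import numpy as np
from sklearn.cluster import AffinityPropagation

rng = np.random.RandomState(0)
# three well-separated blobs of 4-dimensional points
X = np.vstack([rng.normal(loc=c, scale=0.3, size=(30, 4)) for c in (0.0, 3.0, 6.0)])

for preference in (-50, -10, None):  # None -> median similarity (sklearn default)
    ap = AffinityPropagation(damping=0.9, preference=preference).fit(X)
    print(preference, '->', len(ap.cluster_centers_indices_), 'exemplars')
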
Example #2
    def fit_predict(self, preprocessed_documents: List[List[tuple]], documents_features: List[np.ndarray],
                    dataset: Dataset, hyperparams=None, task=None) -> List[Segmentation]:

        assert len(documents_features) == len(preprocessed_documents)

        document_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            preprocessed_doc_tokens = preprocessed_documents[i]
            doc_features = documents_features[i]
            num_authors = dataset.segmentations[i].author_count

            assert doc_features.shape[0] == len(preprocessed_doc_tokens)

            # svd = TruncatedSVD(n_components=50)
            # normalizer = Normalizer(copy=False)
            # lsa = make_pipeline(svd, normalizer)
            # x_scaled = lsa.fit_transform(doc_features)

            x_scaled = StandardScaler().fit_transform(doc_features)  #preprocessing.scale(doc_features, axis=0)
            # x_scaled = doc_features

            diarizer = AgglomerativeClustering(n_clusters=num_authors,
                                               affinity=hyperparams['affinity'],
                                               linkage=hyperparams['linkage'])

            labels = diarizer.fit_predict(x_scaled)
            document_label_lists.append(labels)

            print('Document', i+1, '/', len(documents_features), 'in', time.time()-start_time, 's')

        return generate_segmentation(preprocessed_documents, documents_features,
                                     document_label_lists, dataset.documents, task=task)
Example #3
    def fit_predict(self,
                    preprocessed_documents: List[List[tuple]],
                    documents_features: List[np.ndarray],
                    dataset: Dataset,
                    hyperparams=None,
                    task=None) -> List[Segmentation]:

        x_scaled = []

        svd = TruncatedSVD(n_components=2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        for doc_features in documents_features:
            #x_scaled.append(StandardScaler().fit_transform(doc_features))
            x_scaled.append(lsa.fit_transform(doc_features))

        predicted_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            x = x_scaled[i]  # documents_features[i]  x_scaled[i]

            assert x.shape[0] == len(preprocessed_documents[i])

            diarizer = IsolationForest(n_estimators=100,
                                       max_samples=1.0,
                                       contamination=0.3,
                                       max_features=1.0,
                                       bootstrap=True,
                                       random_state=None)

            diarizer.fit(x)
            labels_array = diarizer.predict(x)
            labels_array[labels_array == 1] = 0
            labels_array[labels_array == -1] = 1
            predicted_label_lists.append(labels_array.tolist())

            print(
                'Document',
                i + 1,
                '/',
                len(documents_features),
                x.shape,
                'in',
                time.time() - start_time,
                's',
            )
            print()

        return generate_segmentation(preprocessed_documents,
                                     documents_features,
                                     predicted_label_lists,
                                     dataset.documents,
                                     task=task)
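
IsolationForest.predict returns +1 for inliers and -1 for outliers; the two in-place assignments above remap that to author 0 / author 1 (the order matters: mapping +1 first avoids clobbering). An equivalent, vectorized remap, standalone and purely illustrative, with preds standing in for diarizer.predict(x):

import numpy as np

preds = np.array([1, -1, 1, 1, -1])          # illustrative output of predict()
author_labels = np.where(preds == 1, 0, 1)   # inlier -> author 0, outlier -> author 1
print(author_labels.tolist())                # [0, 1, 0, 0, 1]
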
Example #4
    def fit_predict(self,
                    preprocessed_documents: List[List[tuple]],
                    documents_features: List[np.ndarray],
                    dataset: Dataset,
                    hyperparams=None,
                    task=None) -> List[Segmentation]:

        assert len(documents_features) == len(preprocessed_documents)

        # Scaling did not noticeably help here, so the raw features are used.

        predicted_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            x = documents_features[i]
            true_n_clusters = dataset.segmentations[i].author_count

            assert x.shape[0] == len(preprocessed_documents[i])

            # Reported for reference only; MeanShift() with bandwidth=None estimates
            # its own bandwidth the same way internally.
            bandwidth = estimate_bandwidth(x, quantile=0.3)
            print('bandwidth:', bandwidth)

            diarizer = MeanShift()
            labels = diarizer.fit_predict(x)
            predicted_label_lists.append(labels)

            estimated_n_clusters = len(diarizer.cluster_centers_)

            print(
                'Document',
                i + 1,
                '/',
                len(documents_features),
                x.shape,
                'in',
                time.time() - start_time,
                's',
            )
            print('Real author count = {}, estimated = {}'.format(
                true_n_clusters, estimated_n_clusters))
            print()

        return generate_segmentation(preprocessed_documents,
                                     documents_features,
                                     predicted_label_lists,
                                     dataset.documents,
                                     task=task)
    def select_optimal_hyperparams(self, preprocessed_documents, documents_features, documents, true_segmentations,
                                   author_labels=None, author_counts=None, task=None):
        x_scaled = []
        for doc_features in documents_features:
            x_scaled.append(self.scaler().fit_transform(doc_features))

        results = []
        total_hyperparams = len(self.hyperparams['damping']) * len(self.hyperparams['preference'])

        current_comb = 1
        start_time = time.time()

        for damping in self.hyperparams['damping']:
            for preference in self.hyperparams['preference']:

                print('Combination {}/{}'.format(current_comb, total_hyperparams))
                predicted_label_lists = []

                model = AffinityPropagation(damping=damping, preference=preference,
                                           copy=True, affinity='euclidean',
                                           max_iter=100, convergence_iter=5)

                for i in range(len(documents_features)):
                    x = documents_features[i]  # raw features; x_scaled[i] was also tried
                    labels = model.fit_predict(x)
                    predicted_label_lists.append(labels)

                current_comb += 1

                predicted_segmentations = generate_segmentation(preprocessed_documents, documents_features,
                                                                predicted_label_lists, documents, task=task)
                score = self.get_bcubed_f1(true_segmentations, predicted_segmentations)

                results.append({'damping': damping, 'preference': preference, 'score': score})

        sorted_results = sorted(results, key=lambda r: r['score'], reverse=True)
        best_result = sorted_results[0]
        print('The best hyperparams found:', best_result, 'in {} s.'.format(time.time() - start_time))
        return best_result
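
The nested loops above walk the damping × preference grid by hand; the same enumeration can be written with itertools.product. Sketch only: evaluate is a hypothetical stand-in for the scoring done in the method (generate_segmentation followed by get_bcubed_f1).

from itertools import product

def grid_search(hyperparams, evaluate):
    # evaluate(damping, preference) -> score; higher is better
    results = [{'damping': d, 'preference': p, 'score': evaluate(d, p)}
               for d, p in product(hyperparams['damping'], hyperparams['preference'])]
    return max(results, key=lambda r: r['score'])

# usage with a dummy scoring function
best = grid_search({'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10]},
                   evaluate=lambda d, p: -abs(d - 0.7) - abs(p + 10) / 100)
print(best)  # damping=0.7, preference=-10 scores highest here
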
Example #6
    def fit_predict(self, preprocessed_documents: List[List[tuple]], documents_features: List[np.ndarray],
                    dataset: Dataset, hyperparams=None, task=None) -> List[Segmentation]:
        assert len(documents_features) == len(preprocessed_documents)

        document_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            preprocessed_doc_tokens = preprocessed_documents[i]
            doc_features = documents_features[i]
            num_authors = dataset.segmentations[i].author_count

            assert doc_features.shape[0] == len(preprocessed_doc_tokens)

            # svd = TruncatedSVD(n_components=50)
            # normalizer = Normalizer(copy=False)
            # lsa = make_pipeline(svd, normalizer)
            # x_scaled = lsa.fit_transform(doc_features)

            x_scaled = StandardScaler().fit_transform(doc_features)  # preprocessing.scale(doc_features, axis=0)
            # x_scaled = doc_features

            diarizer = GaussianMixture(n_components=num_authors, covariance_type='full',
                                       tol=0.001, reg_covar=1e-06, max_iter=100,
                                       n_init=10, init_params='kmeans', weights_init=None,
                                       means_init=None, precisions_init=None, random_state=None,
                                       warm_start=False, verbose=0, verbose_interval=10)

            diarizer.fit(x_scaled)
            labels = diarizer.predict(x_scaled)
            document_label_lists.append(labels)

            print('Document', i + 1, '/', len(documents_features), 'in', time.time() - start_time, 's')

        return generate_segmentation(preprocessed_documents, documents_features,
                                     document_label_lists, dataset.documents, task=task)
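
Beyond the hard labels from predict used above, GaussianMixture also exposes per-token posterior probabilities via predict_proba, which can help inspect ambiguous tokens. Standalone sketch on synthetic data, not part of the original pipeline:

import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = StandardScaler().fit_transform(
    np.vstack([rng.normal(0, 1, (40, 5)), rng.normal(4, 1, (40, 5))]))

gmm = GaussianMixture(n_components=2, n_init=10, random_state=0).fit(X)
hard = gmm.predict(X)         # hard author labels, as in the snippet above
soft = gmm.predict_proba(X)   # responsibilities, shape (n_tokens, n_components)
print(hard[:5], soft[:2].round(3))
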
Example #7
    def select_optimal_hyperparams(self,
                                   preprocessed_documents,
                                   documents_features,
                                   documents,
                                   true_segmentations,
                                   author_labels=None,
                                   author_counts=None,
                                   task=None):
        x_scaled = []
        for doc_features in documents_features:
            x_scaled.append(self.scaler().fit_transform(doc_features))

        results = []
        total_hyperparams = len(self.hyperparams['affinity']) * len(
            self.hyperparams['linkage'])

        current_comb = 1
        start_time = time.time()

        for affinity in self.hyperparams['affinity']:
            for linkage in self.hyperparams['linkage']:

                print('Combination {}/{}'.format(current_comb,
                                                 total_hyperparams))

                predicted_label_lists = []

                for i in range(len(documents_features)):
                    model = AgglomerativeClustering(
                        n_clusters=author_counts[i],
                        affinity=affinity,
                        linkage=linkage)

                    x = x_scaled[i]  # documents_features[i]  x_scaled[i]
                    labels = model.fit_predict(x)
                    predicted_label_lists.append(labels)

                current_comb += 1

                predicted_segmentations = generate_segmentation(
                    preprocessed_documents,
                    documents_features,
                    predicted_label_lists,
                    documents,
                    task=task)
                if task == 'a':
                    score = self.get_macro_f1(true_segmentations,
                                              predicted_segmentations)
                else:
                    score = self.get_bcubed_f1(true_segmentations,
                                               predicted_segmentations)

                results.append({
                    'affinity': affinity,
                    'linkage': linkage,
                    'score': score
                })

        sorted_results = sorted(results,
                                key=lambda r: r['score'],
                                reverse=True)
        best_result = sorted_results[0]
        print('The best hyperparams found:', best_result,
              'in {} s.'.format(time.time() - start_time))
        return best_result
Example #8
    def fit_predict(self,
                    preprocessed_documents: List[List[tuple]],
                    documents_features: List[np.ndarray],
                    dataset: Dataset,
                    hyperparams=None,
                    task=None) -> List[Segmentation]:

        assert len(documents_features) == len(preprocessed_documents)

        predicted_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            preprocessed_doc_tokens = preprocessed_documents[i]
            doc_features = documents_features[i]
            true_n_clusters = dataset.segmentations[i].author_count

            assert doc_features.shape[0] == len(preprocessed_doc_tokens)

            # svd = TruncatedSVD(n_components=50)
            # normalizer = Normalizer(copy=False)
            # lsa = make_pipeline(svd, normalizer)
            # x_scaled = lsa.fit_transform(doc_features)

            # Mean-centering is only possible for dense input, hence with_mean=not issparse(doc_features).
            scaler = StandardScaler(with_mean=not issparse(doc_features))
            x_scaled = scaler.fit_transform(doc_features)  # preprocessing.scale(doc_features, axis=0)
            # x_scaled = doc_features

            # algorithm='brute' controls how NearestNeighbors computes pointwise
            # distances and finds nearest neighbours.
            diarizer = DBSCAN(eps=hyperparams['eps'],
                              min_samples=hyperparams['min_samples'],
                              metric=hyperparams['metric'],
                              algorithm='brute')

            labels = diarizer.fit_predict(x_scaled)
            estimated_n_clusters = len(
                set(labels)) - (1 if -1 in labels else 0)
            noisy = find_cluster_for_noisy_samples(labels)
            predicted_label_lists.append(labels)

            print(
                'Document',
                i + 1,
                '/',
                len(documents_features),
                x_scaled.shape,
                'in',
                time.time() - start_time,
                's',
            )
            print('Real author count = {}, estimated = {}, noisy = {}'.format(
                true_n_clusters, estimated_n_clusters, noisy))
            print()

        return generate_segmentation(preprocessed_documents,
                                     documents_features,
                                     predicted_label_lists,
                                     dataset.documents,
                                     task=task)
    def select_optimal_hyperparams(self,
                                   preprocessed_documents,
                                   documents_features,
                                   documents,
                                   true_segmentations,
                                   author_labels=None,
                                   author_counts=None,
                                   task=None):

        x_scaled = []
        for doc_features in documents_features:
            x_scaled.append(
                self.scaler(with_mean=not issparse(doc_features)).
                fit_transform(doc_features))

        #true_labels = train_set['']

        results = []
        total_hyperparams = len(self.hyperparams['metric']) * len(
            self.hyperparams['eps']) * len(self.hyperparams['min_samples'])
        current_comb = 1
        start_time = time.time()
        for metric in self.hyperparams['metric']:
            for eps in self.hyperparams['eps']:
                for min_samples in self.hyperparams['min_samples']:

                    print('Combination {}/{}'.format(current_comb,
                                                     total_hyperparams))
                    predicted_label_lists = []
                    model = DBSCAN(eps=eps,
                                   min_samples=min_samples,
                                   metric=metric,
                                   algorithm='brute')

                    for i in range(len(x_scaled)):
                        x = x_scaled[i]
                        labels = model.fit_predict(x)

                        find_cluster_for_noisy_samples(labels)
                        predicted_label_lists.append(labels)

                    current_comb += 1

                    predicted_segmentations = generate_segmentation(
                        preprocessed_documents,
                        documents_features,
                        predicted_label_lists,
                        documents,
                        task=task)
                    score = self.get_bcubed_f1(true_segmentations,
                                               predicted_segmentations)

                    #score = self.get_silhouette_coeff(x_scaled, predicted_label_lists, metric)

                    #score = self.get_calinski_harabaz_score(x_scaled, predicted_label_lists)

                    #score = (self.get_calinski_harabaz_score(x_scaled, predicted_label_lists) *
                    #       self.get_silhouette_coeff(x_scaled, predicted_label_lists, metric)) / \
                    #      self.get_esstimated_n_difference(predicted_label_lists, author_counts)

                    #score = self.get_esstimated_n_difference(predicted_label_lists, author_counts)

                    results.append({
                        'eps': eps,
                        'min_samples': min_samples,
                        'metric': metric,
                        'score': score
                    })

        sorted_results = sorted(results,
                                key=lambda r: r['score'],
                                reverse=True)
        best_result = sorted_results[0]
        print('The best hyperparams found:', best_result,
              'in {} s.'.format(time.time() - start_time))
        return best_result
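
find_cluster_for_noisy_samples is a project helper whose implementation is not shown in these snippets; judging from how it is called (mutating the DBSCAN labels in place, with a "noisy" value reported in the fit_predict above), it presumably deals with the -1 noise labels. A purely hypothetical sketch of one such strategy, reassigning noise points to the most frequent cluster; the project's real helper may differ:

import numpy as np

def find_cluster_for_noisy_samples(labels):
    """Hypothetical: move DBSCAN noise points (label -1) into the most frequent
    non-noise cluster, in place, and return the number of noise points."""
    labels = np.asarray(labels)
    noise_mask = labels == -1
    n_noise = int(noise_mask.sum())
    if 0 < n_noise < labels.size:
        values, counts = np.unique(labels[~noise_mask], return_counts=True)
        labels[noise_mask] = values[counts.argmax()]
    return n_noise

labels = np.array([0, 0, -1, 1, -1, 1])
print(find_cluster_for_noisy_samples(labels), labels)   # 2 [0 0 0 1 0 1]
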
Example #10
    def fit_predict(self, preprocessed_documents: List[List[tuple]],
                    documents_features: List[np.ndarray],
                    dataset: Dataset, hyperparams=None, task=None) -> List[Segmentation]:

        assert len(documents_features) == len(preprocessed_documents)

        document_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            preprocessed_doc_tokens = preprocessed_documents[i]
            doc_features = documents_features[i]
            num_authors = dataset.segmentations[i].author_count

            # remove
            #truth = dataset.segmentations[i].offsets_to_authors([j for j in range(len(preprocessed_doc_tokens))])
            #print(truth)

            assert doc_features.shape[0] == len(preprocessed_doc_tokens)


            #svd = TruncatedSVD(n_components=2)
            #normalizer = Normalizer(copy=False)
            #lsa = make_pipeline(svd, normalizer)
            #x_scaled = lsa.fit_transform(doc_features)

            #plt.scatter(x_scaled[:,0], x_scaled[:,1], s=100,  c=truth)
            #plt.show()

            x_scaled = StandardScaler().fit_transform(doc_features)  # preprocessing.scale(doc_features, axis=0) was also tried
            #x_scaled = doc_features

            use_one_cluster = (task == 'a')  # only referenced by the commented-out single-cluster experiment below

            diarizer = KMeans(n_clusters=num_authors, #if use_one_cluster else num_authors,
                              init='k-means++',
                              n_init=10,
                              max_iter=300,
                              algorithm='full')  # “auto” chooses “elkan” for dense data and “full” (EM style) for sparse data.

            labels = diarizer.fit_predict(x_scaled)

            #if use_one_cluster:
            #    diffs = []
            #    threshold = 1
            #    inertia = diarizer.inertia_
            #    avg_inertia = inertia / doc_features.shape[0]
            #    centroid = diarizer.cluster_centers_[0].reshape((1, x_scaled.shape[1]))
            #    for k in range(len(labels)):
            #        x_d = x_scaled[k].reshape((1, x_scaled[k].shape[0]))
            #        inertia_x = paired_distances(x_d, centroid, metric='euclidean') ** 2 #np.sum(sq_diff)
            #        diffs.append(inertia_x)
            #        if inertia_x[0] > threshold * avg_inertia:
            #            labels[k] = 1
            #    print('min:', min(diffs))
            #    print('max:', max(diffs))
            #    print('avg:', avg_inertia)
            #    print()

            document_label_lists.append(labels)

            print('Document', i + 1, '/', len(documents_features), 'in', time.time() - start_time, 's')

        return generate_segmentation(preprocessed_documents, documents_features,
                                     document_label_lists, dataset.documents, task=task)
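
The commented-out block above sketches a single-cluster strategy for task 'a': fit one centroid and flag tokens whose squared distance to it exceeds a multiple of the average inertia. A cleaned-up standalone version of that idea on synthetic data; the threshold of 1 is taken from the comment and is purely illustrative:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import paired_distances

rng = np.random.RandomState(0)
# 95 "main author" tokens plus 5 intruders in an 8-dimensional feature space
x_scaled = np.vstack([rng.normal(0, 1, (95, 8)), rng.normal(6, 1, (5, 8))])

km = KMeans(n_clusters=1, n_init=10).fit(x_scaled)
centroid = np.repeat(km.cluster_centers_, len(x_scaled), axis=0)
sq_dist = paired_distances(x_scaled, centroid, metric='euclidean') ** 2
avg_inertia = km.inertia_ / len(x_scaled)

threshold = 1
labels = (sq_dist > threshold * avg_inertia).astype(int)  # 1 = flagged as "other author"
print('flagged tokens:', int(labels.sum()))
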