Example #1
0
def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
    log = CeleryTaskLogger(_task)
    for doc in Document.objects.filter(pk__in=doc_ids):
        # refresh the generic (system-level) field cache for the document
        field_value_cache.cache_generic_values(doc, log=log)
        # detect field values; the suggested values are cached explicitly below
        suggested_values = field_detection.detect_and_cache_field_values_for_document(
            log, doc, False, clear_old_values=False)
        field_value_cache.cache_field_values(doc, suggested_values, save=True, log=log)
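
A minimal usage sketch (an assumption, not part of the original module): splitting a large set of document ids into fixed-size chunks before calling the helper, so no single call loads an unbounded number of documents. The chunk size of 100 is arbitrary.

from typing import Iterator, Set

def iter_doc_id_chunks(doc_ids: Set, chunk_size: int = 100) -> Iterator[Set]:
    # yield fixed-size subsets of the id set; ordering is not significant here
    ids = list(doc_ids)
    for start in range(0, len(ids), chunk_size):
        yield set(ids[start:start + chunk_size])

# each chunk can then be passed to cache_document_fields_for_doc_ids(_task, chunk)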
Example #2
0
    def process(self, **kwargs):

        n_clusters = kwargs.get('n_clusters')
        method = kwargs.get('method')
        project_id = kwargs.get('project_id')

        project_clustering_id = kwargs.get('project_clustering_id')
        project_clustering = ProjectClustering.objects.get(
            pk=project_clustering_id) if project_clustering_id else None
        project_clustering.task = self.task
        project_clustering.save()

        project = project_clustering.project

        self.log_info(
            'Start clustering documents for project id={}'.format(project_id))
        self.log_info('Clustering method: "{}", n_clusters={}'.format(
            method, n_clusters))

        self.set_push_steps(4)

        # get documents data
        documents = Document.objects.filter(project_id=project_id)
        id_name_map = dict(documents.values_list('id', 'name'))
        docs_count = len(id_name_map)

        # cluster by full text
        if kwargs.get('cluster_by') == 'full_text':
            docs = np.array(documents.values_list('pk', 'full_text'))
            pks, data = docs[:, 0], docs[:, 1]

            # retry with a higher max_df if TfidfVectorizer.fit_transform raises a ValueError
            for max_df in range(50, 101, 5):
                max_df = max_df / 100
                try:
                    vectorizer = TfidfVectorizer(max_df=max_df,
                                                 max_features=100,
                                                 min_df=2,
                                                 stop_words='english',
                                                 use_idf=True)
                    X = vectorizer.fit_transform(data)
                except ValueError as e:
                    if 'Try a lower min_df or a higher max_df' in str(e):
                        continue
                    else:
                        raise
                break
            else:
                # every candidate max_df failed: fail loudly instead of continuing
                # with an undefined TF-IDF matrix
                raise RuntimeError('Could not build a TF-IDF matrix for the project documents.')

            terms = vectorizer.get_feature_names()

        # Cluster by terms
        else:
            id_field = 'id'
            prop_field = 'textunit__termusage__term__term'
            # filter non-null, null
            qs = documents.filter(textunit__termusage__isnull=False)
            if not qs.exists():
                raise RuntimeError(
                    'No terms detected in documents; try re-running the terms parser.'
                )
            # get values
            ann_cond = dict(prop_count=Count(prop_field))
            qs = qs.values(id_field,
                           prop_field).annotate(**ann_cond).distinct()
            # get data
            df = pd.DataFrame(list(qs)).dropna()
            null_qs = documents.exclude(textunit__termusage__isnull=False)
            if null_qs.exists():
                null_df = pd.DataFrame(list(
                    null_qs.values('id'))).set_index('id')
                df = df.join(null_df, how='outer', on='id')
            df = df.pivot(index=id_field,
                          columns=prop_field,
                          values='prop_count').fillna(0)

            X = df.values  # DataFrame.as_matrix() was removed in recent pandas; .values is equivalent here
            # convert CountVec into TFvec
            tf_transformer = TfidfTransformer(use_idf=False).fit(X)
            X = tf_transformer.transform(X)

            pks = df.index.tolist()
            terms = df.columns.tolist()

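        # choose the clustering model; unrecognized method names fall back to KMeans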
        if method == 'Birch':
            m = Birch(n_clusters=n_clusters,
                      threshold=0.5,
                      branching_factor=50)
        elif method == 'MiniBatchKMeans':
            m = MiniBatchKMeans(n_clusters=n_clusters,
                                init='k-means++',
                                n_init=1,
                                init_size=100,
                                batch_size=100,
                                verbose=False)
        else:
            method = 'KMeans'
            m = KMeans(n_clusters=n_clusters,
                       init='k-means++',
                       max_iter=100,
                       n_init=1,
                       verbose=False)

        m.fit(X)
        self.push()

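        # densify the feature matrix and project it to 2D with PCA for visualization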
        X = X.toarray()
        pca = PCA(n_components=2).fit(X)
        data2d = pca.transform(X)

        if method == 'DBSCAN':
            clusters = m.labels_
            cluster_labels = set(clusters)
            # reshape cluster labels
            if -1 in cluster_labels:
                cluster_labels = [i + 1 for i in cluster_labels]
            cluster_terms = cluster_labels
            centers2d = None
        else:
            if method == 'Birch':
                cluster_centers = m.subcluster_centers_
            else:
                cluster_centers = m.cluster_centers_

            order_centroids = cluster_centers.argsort()[:, ::-1]
            clusters = m.labels_.tolist()
            cluster_labels = set(clusters)
            _n_clusters = len(cluster_labels)
            cluster_terms = [[terms[ind] for ind in order_centroids[i, :10]]
                             for i in range(_n_clusters)]
            centers2d = pca.transform(cluster_centers)

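        # per-document plot data: 2D coordinates plus the assigned cluster id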
        points_data = [{
            'document_id': pks[i],
            'document_name': id_name_map[pks[i]],
            'coord': data2d[i].tolist(),
            'cluster_id': str(clusters[i])
        } for i in range(docs_count)]

        self.push()

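        # persist one DocumentCluster per label and attach its member documents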
        clusters_data = {}
        created_date = now()
        for cluster_id in cluster_labels:
            cluster_label = cluster_terms[cluster_id]
            if isinstance(cluster_label, list):
                cluster_label = '-'.join(cluster_label[:5])
            cluster = DocumentCluster.objects.create(
                cluster_id=cluster_id,
                name='Default({})'.format(project.pk if project else None),
                self_name=cluster_label,
                description='Cluster Project (id={}) with Multiple Contract Types'.format(
                    project_id),
                cluster_by='all',
                using=method,
                created_date=created_date)
            cluster_documents = [
                i['document_id'] for i in points_data
                if i['cluster_id'] == str(cluster_id)
            ]
            cluster.documents.set(cluster_documents)
            clusters_data[str(cluster_id)] = dict(
                cluster_obj_id=cluster.pk,
                cluster_terms=cluster_terms[cluster_id],
                centroid_coord=centers2d[cluster_id].tolist()
                if centers2d is not None else None)
            project_clustering.document_clusters.add(cluster)

        result = {
            'method': method,
            'n_clusters': n_clusters,
            'points_data': points_data,
            'clusters_data': clusters_data
        }
        project_clustering.metadata = result
        project_clustering.save()

        self.push()
        self.log_info('Clustering completed. Updating document cache.')

        for doc in Document.objects.filter(project__pk=project_id):
            field_value_cache.cache_generic_values(doc)

        self.push()
        self.log_info('Finished.')
        return result
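
A minimal, self-contained sketch of the core pipeline in the example above (TF-IDF vectorization, KMeans clustering, PCA projection to 2D, top-term labels), working on plain strings and using only public scikit-learn APIs. The function name and parameters are illustrative, and get_feature_names_out assumes scikit-learn 1.0 or newer (older releases expose get_feature_names instead).

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_texts(texts, n_clusters=3, top_terms=10):
    # vectorize raw strings into a sparse TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(texts)

    # assign each document to a cluster
    model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1, max_iter=100)
    labels = model.fit_predict(X)

    # project documents and centroids to 2D for plotting
    dense = X.toarray()
    pca = PCA(n_components=2).fit(dense)
    points_2d = pca.transform(dense)
    centers_2d = pca.transform(model.cluster_centers_)

    # label each cluster with its highest-weighted terms
    terms = vectorizer.get_feature_names_out()
    order = model.cluster_centers_.argsort()[:, ::-1]
    cluster_terms = [[terms[i] for i in order[c, :top_terms]] for c in range(n_clusters)]
    return labels, points_2d, centers_2d, cluster_terms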