def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
    for doc in Document.objects.filter(pk__in=doc_ids):
        log = CeleryTaskLogger(_task)
        field_value_cache.cache_generic_values(doc, log=log)
        suggested_values = field_detection.detect_and_cache_field_values_for_document(
            log, doc, False, clear_old_values=False)
        field_value_cache.cache_field_values(doc, suggested_values, save=True, log=log)
def process(self, **kwargs):
    n_clusters = kwargs.get('n_clusters')
    method = kwargs.get('method')
    project_id = kwargs.get('project_id')
    project_clustering_id = kwargs.get('project_clustering_id')

    project_clustering = ProjectClustering.objects.get(pk=project_clustering_id) \
        if project_clustering_id else None
    project_clustering.task = self.task
    project_clustering.save()
    project = project_clustering.project

    self.log_info('Start clustering documents for project id={}'.format(project_id))
    self.log_info('Clustering method: "{}", n_clusters={}'.format(method, n_clusters))

    self.set_push_steps(4)

    # get documents data
    documents = Document.objects.filter(project_id=project_id)
    id_name_map = {k: v for k, v in documents.values_list('id', 'name')}
    docs_count = len(id_name_map)

    # cluster by full text
    if kwargs.get('cluster_by') == 'full_text':
        docs = np.array(documents.values_list('pk', 'full_text'))
        pks, data = docs[:, 0], docs[:, 1]

        # retry with a higher max_df if fit_transform cannot build a vocabulary
        for max_df in range(50, 101, 5):
            max_df = max_df / 100.0
            try:
                vectorizer = TfidfVectorizer(max_df=max_df, max_features=100,
                                             min_df=2, stop_words='english',
                                             use_idf=True)
                X = vectorizer.fit_transform(data)
            except ValueError as e:
                if 'Try a lower min_df or a higher max_df' in str(e):
                    continue
                raise
            break

        terms = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.2

    # cluster by terms
    else:
        id_field = 'id'
        prop_field = 'textunit__termusage__term__term'

        # split documents into those with and without term usages
        qs = documents.filter(textunit__termusage__isnull=False)
        if not qs.exists():
            raise RuntimeError('No terms in documents detected, try to re-run terms parser.')

        # count term usages per document
        ann_cond = dict(prop_count=Count(prop_field))
        qs = qs.values(id_field, prop_field).annotate(**ann_cond).distinct()

        # build the document-term count matrix
        df = pd.DataFrame(list(qs)).dropna()
        null_qs = documents.exclude(textunit__termusage__isnull=False)
        if null_qs.exists():
            null_df = pd.DataFrame(list(null_qs.values('id'))).set_index('id')
            df = df.join(null_df, how='outer', on='id')
        df = df.pivot(index=id_field, columns=prop_field, values='prop_count').fillna(0)

        X = df.values  # .as_matrix() is deprecated in recent pandas

        # convert count vectors into TF vectors
        tf_transformer = TfidfTransformer(use_idf=False).fit(X)
        X = tf_transformer.transform(X)

        pks = df.index.tolist()
        terms = df.columns.tolist()

    if method == 'Birch':
        m = Birch(n_clusters=n_clusters, threshold=0.5, branching_factor=50)
    elif method == 'MiniBatchKMeans':
        m = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1,
                            init_size=100, batch_size=100, verbose=False)
    else:
        method = 'KMeans'
        m = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, verbose=False)

    m.fit(X)
    self.push()

    # project documents into 2D for visualization
    X = X.toarray()
    pca = PCA(n_components=2).fit(X)
    data2d = pca.transform(X)

    if method == 'DBSCAN':
        # labels-only path: no cluster centers available
        clusters = m.labels_
        cluster_labels = set(clusters)
        # reshape cluster labels: shift by one so the noise label -1 becomes 0
        if -1 in cluster_labels:
            cluster_labels = [i + 1 for i in cluster_labels]
        cluster_terms = cluster_labels
        centers2d = None
    else:
        if method == 'Birch':
            cluster_centers = m.subcluster_centers_
        else:
            cluster_centers = m.cluster_centers_
        order_centroids = cluster_centers.argsort()[:, ::-1]
        clusters = m.labels_.tolist()
        cluster_labels = set(clusters)
        _n_clusters = len(cluster_labels)
        # top-10 terms per cluster centroid
        cluster_terms = [[terms[ind] for ind in order_centroids[i, :10]]
                         for i in range(_n_clusters)]
        centers2d = pca.transform(cluster_centers)

    points_data = [{'document_id': pks[i],
                    'document_name': id_name_map[pks[i]],
                    'coord': data2d[i].tolist(),
                    'cluster_id': str(clusters[i])} for i in range(docs_count)]
    self.push()

    clusters_data = {}
    created_date = now()
    for cluster_id in cluster_labels:
        cluster_label = cluster_terms[cluster_id]
        if isinstance(cluster_label, list):
            cluster_label = '-'.join(cluster_label[:5])
        cluster = DocumentCluster.objects.create(
            cluster_id=cluster_id,
            name='Default({})'.format(project.pk if project else None),
            self_name=cluster_label,
            description='Cluster Project (id={}) with Multiple Contract Types'.format(project_id),
            cluster_by='all',
            using=method,
            created_date=created_date)
        cluster_documents = [i['document_id'] for i in points_data
                             if i['cluster_id'] == str(cluster_id)]
        cluster.documents.set(cluster_documents)
        clusters_data[str(cluster_id)] = dict(
            cluster_obj_id=cluster.pk,
            cluster_terms=cluster_terms[cluster_id],
            centroid_coord=centers2d[cluster_id].tolist() if centers2d is not None else None)
        project_clustering.document_clusters.add(cluster)

    result = {'method': method,
              'n_clusters': n_clusters,
              'points_data': points_data,
              'clusters_data': clusters_data}
    project_clustering.metadata = result
    project_clustering.save()
    self.push()

    self.log_info('Clustering completed. Updating document cache.')
    for doc in Document.objects.filter(project__pk=project_id):
        field_value_cache.cache_generic_values(doc)
    self.push()

    self.log_info('Finished.')
    return result
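
# Minimal standalone sketch (not part of the task above) of the clustering core the
# 'full_text' branch relies on: TF-IDF vectorization, KMeans clustering, then PCA to
# 2D coordinates for plotting. It avoids the Django models so it can be tried on plain
# strings; the function name and parameters below are illustrative, not an existing API.
def _cluster_texts_sketch(sample_texts, n_clusters=2):
    vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(sample_texts)
    model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1).fit(X)
    coords_2d = PCA(n_components=2).fit_transform(X.toarray())
    # one cluster label and one 2D coordinate per input text, in input order
    return model.labels_.tolist(), coords_2d.tolist()

# Example (hypothetical inputs):
#   labels, coords = _cluster_texts_sketch(['lease agreement ...', 'loan agreement ...',
#                                           'employment contract ...'], n_clusters=2)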