def do_search_similar_textunits(self) -> None:
    """
    Search for similar text units (sentences or paragraphs) by comparing
    the text units' feature matrices built by TfidfVectorizer.
    """
    self.push_time('', True)
    pks = list(self.text_unit_query.values_list('pk', flat=True))
    self.units_count = len(pks)
    # one pass over the chunks for building the vocabulary, one for building
    # the matrices, plus one step for storing similar entities
    self.task.set_push_steps(math.ceil(
        self.units_count / self.unit_vocabulary_chunk_size) * 2 + 1)
    if self.should_delete:
        TextUnitSimilarity.objects.all().delete()
        self.push_time('deleting')

    self.log_check_flood('vocabulary', 'building vocabulary')
    vocabulary = self.build_unitlevel_vocabulary()
    self.log_check_flood('vocabulary', 'completed building vocabulary')
    self.task.task.update_progress(33)
    self.push_time('build_vocabulary')
    if not vocabulary:
        return

    self.log_check_flood('matrices', 'building matrices')
    dtm_chunked = self.build_unitlevel_matrices(vocabulary)
    self.task.task.update_progress(60)
    self.log_check_flood('matrices', 'completed building matrices')
    self.push_time('build_matrices')

    self.log_check_flood('vstack', 'stacking matrices')
    X = sparse.vstack(dtm_chunked)
    self.task.task.update_progress(66)
    self.log_check_flood('vstack', 'completed stacking matrices')
    self.push_time('sparse.vstack(matrices)')

    for i in range(0, self.units_count, self.step):
        self.log_check_flood(
            'sim_matrix',
            f'building similarity matrix ({i} of {self.units_count} completed)')
        for j in range(i + 1, self.units_count, self.step):
            # compare one block of rows against another instead of
            # materializing the full N x N similarity matrix
            similarity_matrix = cosine_similarity(
                X[i:min([i + self.step, self.units_count])],
                X[j:min([j + self.step, self.units_count])]) * 100
            for g in range(similarity_matrix.shape[0]):
                similarities = []
                for h in range(g + 1, similarity_matrix.shape[1]):
                    if similarity_matrix[g, h] < self.similarity_threshold:
                        continue
                    similarities.append(TextUnitSimilarity(
                        text_unit_a_id=pks[i + g],
                        text_unit_b_id=pks[j + h],
                        similarity=similarity_matrix[g, h]))
                if similarities:
                    self.store_unit_similarity_issues(similarities)
    self.store_unit_similarity_issues([], True)
    self.push_time('searching by matrix')
    self.log_timing('DocumentChunkSimilarityProcessor(text unit level)')
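
# A minimal, self-contained sketch of the blockwise comparison used above,
# assuming scikit-learn and SciPy; the function name `iter_similar_pairs`,
# the sample texts and the parameter defaults are illustrative and not part
# of the original code.
from typing import Iterator, Tuple

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def iter_similar_pairs(X, threshold: float = 75.0,
                       step: int = 1000) -> Iterator[Tuple[int, int, float]]:
    """Yield (row_a, row_b, similarity%) for unique row pairs above the
    threshold, comparing the feature matrix block by block so the full
    N x N similarity matrix is never materialized."""
    n = X.shape[0]
    for i in range(0, n, step):
        block_a = X[i:min(i + step, n)]
        for j in range(i, n, step):
            block_b = X[j:min(j + step, n)]
            sim = cosine_similarity(block_a, block_b) * 100
            for g in range(sim.shape[0]):
                # when a block is compared with itself, start past the
                # diagonal so each pair is reported exactly once
                start = g + 1 if i == j else 0
                for h in range(start, sim.shape[1]):
                    if sim[g, h] >= threshold:
                        yield i + g, j + h, sim[g, h]


texts = ['the quick brown fox', 'the quick brown fox jumps over', 'lorem ipsum dolor']
X = TfidfVectorizer().fit_transform(texts)
for a, b, score in iter_similar_pairs(X, threshold=50.0, step=2):
    print(a, b, round(score, 1))
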
def store_unit_similarity_issues(self,
                                 un_sims: List[TextUnitSimilarity],
                                 flush: bool = False) -> None:
    """
    Buffer TextUnitSimilarity objects so they can later be saved in a single
    bulk insert operation.
    :param un_sims: items to store
    :param flush: force flushing the buffer
    """
    if un_sims:
        self.unsim_store_buffer += un_sims
    if len(self.unsim_store_buffer) < self.store_buf_flush_count and not flush:
        return
    # unit -> document -> project
    if self.unsim_store_buffer:
        TextUnitSimilarity.fill_joined_refs(self.unsim_store_buffer)
        TextUnitSimilarity.objects.bulk_create(self.unsim_store_buffer,
                                               ignore_conflicts=True)
    self.unsim_store_buffer = []
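
# A standalone sketch of the buffer-and-flush pattern above, without the
# Django model: items accumulate until the buffer reaches `flush_count`
# (or an explicit flush is requested), then are persisted in one bulk call.
# The `persist` callable stands in for
# TextUnitSimilarity.objects.bulk_create(..., ignore_conflicts=True); the
# fill_joined_refs() step is omitted. `BulkBuffer` is an illustrative name.
from typing import Callable, List


class BulkBuffer:
    def __init__(self, persist: Callable[[List], None], flush_count: int = 10000):
        self.persist = persist
        self.flush_count = flush_count
        self.buffer: List = []

    def store(self, items: List, flush: bool = False) -> None:
        if items:
            self.buffer += items
        if len(self.buffer) < self.flush_count and not flush:
            return
        if self.buffer:
            self.persist(self.buffer)
        self.buffer = []


buf = BulkBuffer(persist=lambda items: print(f'saving {len(items)} records'),
                 flush_count=3)
buf.store([1, 2])          # buffered: still below flush_count
buf.store([3])             # reaches flush_count -> persisted in bulk
buf.store([], flush=True)  # final flush of any remainder (empty here)
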
def do_search_similar_textunits(self) -> None:
    """
    Search for similar text units (sentences or paragraphs) by comparing
    the text units' feature matrices built by TfidfVectorizer.
    """
    self.push_time('', True)
    pks = list(self.text_unit_query.values_list('pk', flat=True))
    self.units_count = len(pks)
    # one pass over the chunks for building the vocabulary, one for building
    # the matrices, plus one step for storing similar entities
    self.task.set_push_steps(math.ceil(
        self.units_count / self.unit_vocabulary_chunk_size) * 2 + 1)
    if self.should_delete:
        TextUnitSimilarity.objects.all().delete()
        self.push_time('deleting')

    vocabulary = self.build_unitlevel_vocabulary()
    self.push_time('build_vocabulary')
    if not vocabulary:
        return

    dtm_chunked = self.build_unitlevel_matrices(vocabulary)
    self.push_time('build_matrices')
    X = sparse.vstack(dtm_chunked)
    self.push_time('sparse.vstack(matrices)')

    for i in range(0, self.units_count, self.step):
        for j in range(0, self.units_count, self.step):
            similarity_matrix = cosine_similarity(
                X[i:min([i + self.step, self.units_count])],
                X[j:min([j + self.step, self.units_count])]) * 100
            for g in range(similarity_matrix.shape[0]):
                tu_sim = [
                    TextUnitSimilarity(
                        text_unit_a_id=pks[i + g],
                        text_unit_b_id=pks[j + h],
                        similarity=similarity_matrix[g, h])
                    for h in range(similarity_matrix.shape[1])
                    if i + g != j + h
                    and similarity_matrix[g, h] >= self.similarity_threshold]
                self.store_unit_similarity_issues(tu_sim)
    self.store_unit_similarity_issues([], True)
    self.push_time('searching by matrix')
    self.log_timing('DocumentChunkSimilarityProcessor(text unit level)')
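
# A minimal sketch of the chunked matrix building that the code above
# delegates to build_unitlevel_vocabulary() / build_unitlevel_matrices():
# learn one shared vocabulary, vectorize the texts chunk by chunk against it,
# then stack the chunks with sparse.vstack. The texts and chunk size are
# illustrative, and use_idf is disabled here so the per-chunk weighting stays
# consistent (IDF fitted per chunk would differ between chunks); the real
# helpers may weight terms differently.
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ['alpha beta gamma', 'beta gamma delta',
         'gamma delta epsilon', 'zeta eta theta']
chunk_size = 2

# pass 1: build a shared vocabulary over all texts
vocabulary = TfidfVectorizer().fit(texts).vocabulary_

# pass 2: vectorize each chunk against the shared vocabulary, so every
# chunk matrix has the same number of columns
dtm_chunked = []
for start in range(0, len(texts), chunk_size):
    vectorizer = TfidfVectorizer(vocabulary=vocabulary, use_idf=False)
    dtm_chunked.append(vectorizer.fit_transform(texts[start:start + chunk_size]))

X = sparse.vstack(dtm_chunked)  # one matrix, rows aligned with `texts`
print(X.shape)                  # (4, number of vocabulary terms)
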
def process(self, **kwargs):
    """
    Search for similar documents and/or text units using TF-IDF feature
    vectors and cosine similarity.
    :param kwargs: task parameters
    :return: None
    """
    search_similar_documents = kwargs['search_similar_documents']
    search_similar_text_units = kwargs['search_similar_text_units']
    similarity_threshold = kwargs['similarity_threshold']
    project = kwargs.get('project')
    project_id = project['pk'] if project else 0
    self.log_info('Min similarity: %d' % similarity_threshold)

    # get text units with a minimum length of 100 characters
    filters = dict(unit_type='paragraph',
                   textunittext__text__regex=r'.{100}.*')
    if project_id:
        filters['project_id'] = project_id
    text_units = TextUnit.objects.filter(**filters)
    len_tu_set = text_units.count()

    push_steps = 0
    if search_similar_documents:
        push_steps += 4
    if search_similar_text_units:
        push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
    self.set_push_steps(push_steps)

    documents = Document.objects.filter(project_id=project_id) if project_id \
        else Document.objects.all()

    # similar Documents
    total_stored = 0
    if search_similar_documents:
        # step #1 - delete
        if kwargs['delete']:
            if project_id:
                DocumentSimilarity.objects.filter(
                    Q(document_a__project_id=project_id) |
                    Q(document_b__project_id=project_id)).delete()
            else:
                DocumentSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set = ['\n'.join(d.textunit_set.values_list('textunittext__text', flat=True))
                     for d in documents]
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4
        similarity_matrix = cosine_similarity(X) * 100
        pks = list(documents.values_list('pk', flat=True))
        for x in range(len(pks) - 1):
            document_a = pks[x]
            # iterate over y > x only to store unique a<>b relations
            for y in range(x + 1, len(pks)):
                document_b = pks[y]
                similarity = similarity_matrix[x, y]
                if similarity < similarity_threshold:
                    continue
                DocumentSimilarity.objects.create(
                    document_a_id=document_a,
                    document_b_id=document_b,
                    similarity=similarity)
                total_stored += 1
        self.push()

    # similar Text Units
    if search_similar_text_units:
        # step #1 - delete
        if kwargs['delete']:
            if project_id:
                TextUnitSimilarity.objects.filter(
                    Q(project_a__id=project_id) |
                    Q(project_b__id=project_id)).delete()
            else:
                TextUnitSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set, pks = zip(*text_units.values_list('textunittext__text', 'pk'))
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(tokenizer=normalize,
                                     max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4
        for i in range(0, len_tu_set, self.step):
            for j in range(i + 1, len_tu_set, self.step):
                similarity_matrix = cosine_similarity(
                    X[i:min([i + self.step, len_tu_set])],
                    X[j:min([j + self.step, len_tu_set])]) * 100
                for g in range(similarity_matrix.shape[0]):
                    tu_sim = [
                        TextUnitSimilarity(
                            text_unit_a_id=pks[i + g],
                            text_unit_b_id=pks[j + h],
                            similarity=similarity_matrix[g, h])
                        for h in range(similarity_matrix.shape[1])
                        if i + g != j + h
                        and similarity_matrix[g, h] >= similarity_threshold]
                    total_stored += self.save_similarity_records(tu_sim, project_id)
                self.push()

    self.log_info(f'{total_stored} records stored')
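
# A minimal end-to-end sketch of the document-level path in process() above:
# vectorize whole documents with TfidfVectorizer, compute the cosine
# similarity matrix scaled to 0-100, and keep unique a<b pairs above the
# threshold. The sample texts and threshold value are illustrative, and the
# max_df / min_df pruning of the real task is omitted for brevity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ['first contract describing delivery terms',
        'second contract describing delivery terms and payment',
        'unrelated meeting notes']
similarity_threshold = 50  # percent

X = TfidfVectorizer(stop_words='english').fit_transform(docs)
similarity_matrix = cosine_similarity(X) * 100

pairs = [(a, b, round(similarity_matrix[a, b], 1))
         for a in range(len(docs) - 1)
         for b in range(a + 1, len(docs))
         if similarity_matrix[a, b] >= similarity_threshold]
print(pairs)  # docs 0 and 1 share most terms; doc 2 matches nothing
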
def process(self, **kwargs):
    """
    Search for similar documents and/or text units using TF-IDF feature
    vectors and cosine similarity.
    :param kwargs: task parameters
    :return: None
    """
    search_similar_documents = kwargs['search_similar_documents']
    search_similar_text_units = kwargs['search_similar_text_units']
    similarity_threshold = kwargs['similarity_threshold']
    self.log_info('Min similarity: %d' % similarity_threshold)

    # get text units with a minimum length of 100 characters
    text_units = TextUnit.objects.filter(unit_type='paragraph',
                                         text__regex=r'.{100}.*')
    len_tu_set = text_units.count()

    push_steps = 0
    if search_similar_documents:
        push_steps += 4
    if search_similar_text_units:
        push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
    self.set_push_steps(push_steps)

    # similar Documents
    if search_similar_documents:
        # step #1 - delete
        if kwargs['delete']:
            DocumentSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set = ['\n'.join(d.textunit_set.values_list('text', flat=True))
                     for d in Document.objects.all()]
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4
        similarity_matrix = cosine_similarity(X) * 100
        pks = Document.objects.values_list('pk', flat=True)
        for x, document_a in enumerate(pks):
            # use this instead to store unique a<>b relations only:
            # for y, document_b in enumerate(Document.objects.all()[x + 1:], start=x + 1):
            for y, document_b in enumerate(pks):
                if document_a == document_b:
                    continue
                similarity = similarity_matrix[x, y]
                if similarity < similarity_threshold:
                    continue
                DocumentSimilarity.objects.create(
                    document_a_id=document_a,
                    document_b_id=document_b,
                    similarity=similarity)
        self.push()

    # similar Text Units
    if search_similar_text_units:
        # step #1 - delete
        if kwargs['delete']:
            TextUnitSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set, pks = zip(*text_units.values_list('text', 'pk'))
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(tokenizer=normalize,
                                     max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4
        for i in range(0, len_tu_set, self.step):
            for j in range(0, len_tu_set, self.step):
                similarity_matrix = cosine_similarity(
                    X[i:min([i + self.step, len_tu_set])],
                    X[j:min([j + self.step, len_tu_set])]) * 100
                for g in range(similarity_matrix.shape[0]):
                    tu_sim = [
                        TextUnitSimilarity(
                            text_unit_a_id=pks[i + g],
                            text_unit_b_id=pks[j + h],
                            similarity=similarity_matrix[g, h])
                        for h in range(similarity_matrix.shape[1])
                        if i + g != j + h
                        and similarity_matrix[g, h] >= similarity_threshold]
                    TextUnitSimilarity.objects.bulk_create(tu_sim)
                self.push()