Example #1
    def do_search_similar_textunits(self) -> None:
        """
        Text search for text units (sentences or paragraph) by comparing
        text units' feature matrices, built by TfidfVectorizer
        """
        self.push_time('', True)
        pks = list(self.text_unit_query.values_list('pk', flat=True))
        self.units_count = len(pks)
        # one chunked pass to build the vocabulary, one to build the matrices,
        # plus one final step for storing similar entities
        self.task.set_push_steps(math.ceil(
            self.units_count / self.unit_vocabulary_chunk_size) * 2 + 1)

        if self.should_delete:
            TextUnitSimilarity.objects.all().delete()
        self.push_time('deleting')

        self.log_check_flood('vocabulary', 'building vocabulary')
        vocabulary = self.build_unitlevel_vocabulary()
        self.log_check_flood('vocabulary', 'completed building vocabulary')
        self.task.task.update_progress(33)

        self.push_time('build_vocabulary')
        if not vocabulary:
            return

        self.log_check_flood('matrices', 'building matrices')
        dtm_chunked = self.build_unitlevel_matrices(vocabulary)
        self.task.task.update_progress(60)
        self.log_check_flood('matrices', 'completed building matrices')
        self.push_time('build_matrices')

        self.log_check_flood('vstack', 'stacking matrices')
        X = sparse.vstack(dtm_chunked)
        self.task.task.update_progress(66)
        self.log_check_flood('vstack', 'completed stacking matrices')
        self.push_time('sparse.vstack(matrices)')

        for i in range(0, self.units_count, self.step):
            self.log_check_flood('sim_matrix',
                                 f'building similarity matrix: ({i} of {self.units_count} are completed)')
            for j in range(i, self.units_count, self.step):
                similarity_matrix = cosine_similarity(
                    X[i:min(i + self.step, self.units_count)],
                    X[j:min(j + self.step, self.units_count)]) * 100
                for g in range(similarity_matrix.shape[0]):
                    similarities = []
                    for h in range(similarity_matrix.shape[1]):
                        # keep each unordered pair exactly once: the global
                        # index of unit "a" must precede that of unit "b"
                        if j + h <= i + g:
                            continue
                        if similarity_matrix[g, h] < self.similarity_threshold:
                            continue
                        similarities.append(TextUnitSimilarity(
                            text_unit_a_id=pks[i + g],
                            text_unit_b_id=pks[j + h],
                            similarity=similarity_matrix[g, h]))
                    if similarities:
                        self.store_unit_similarity_issues(similarities)

        self.store_unit_similarity_issues([], True)
        self.push_time('searching by matrix')
        self.log_timing('DocumentChunkSimilarityProcessor(text unit level)')
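
For reference, a minimal self-contained sketch of the chunked pairwise-similarity pattern the method above relies on. It assumes only scikit-learn is installed; the sample texts, block size, and threshold are illustrative, not taken from the task:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ['first sample paragraph about contract terms',
         'second sample paragraph about contract terms',
         'a completely unrelated sentence']
step = 2           # block size, standing in for self.step
threshold = 50.0   # percent, standing in for self.similarity_threshold

X = TfidfVectorizer().fit_transform(texts)
n = X.shape[0]
pairs = []
for i in range(0, n, step):
    for j in range(i, n, step):
        # compare one block of rows against another instead of
        # materializing the full n x n similarity matrix at once
        sim = cosine_similarity(X[i:i + step], X[j:j + step]) * 100
        for g in range(sim.shape[0]):
            for h in range(sim.shape[1]):
                if j + h <= i + g:  # keep each unordered pair once
                    continue
                if sim[g, h] >= threshold:
                    pairs.append((i + g, j + h, sim[g, h]))
print(pairs)
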
Example #2
    def store_unit_similarity_issues(self,
                                     un_sims: List[TextUnitSimilarity],
                                     flush: bool = False) -> None:
        """
        Buffer TextUnitSimilarity objects so they can later be saved
        in a single bulk insert operation.
        :param un_sims: items to store
        :param flush: force flushing the buffer
        """
        if un_sims:
            self.unsim_store_buffer += un_sims
        if len(self.unsim_store_buffer) < self.store_buf_flush_count and not flush:
            return
        # unit -> document -> project
        if self.unsim_store_buffer:
            TextUnitSimilarity.fill_joined_refs(self.unsim_store_buffer)
            TextUnitSimilarity.objects.bulk_create(self.unsim_store_buffer,
                                                   ignore_conflicts=True)
        self.unsim_store_buffer = []
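
The buffering above exists to cut database round trips: rows are accumulated and written in batches rather than one at a time. A stripped-down sketch of the same flush pattern, assuming a hypothetical Django model Record and an illustrative batch size:

from typing import List

FLUSH_COUNT = 1000  # illustrative, standing in for self.store_buf_flush_count
buffer: List['Record'] = []

def store_records(records: List['Record'], flush: bool = False) -> None:
    buffer.extend(records)
    if len(buffer) < FLUSH_COUNT and not flush:
        return  # keep accumulating until the batch is large enough
    if buffer:
        # one INSERT per batch instead of one per row; rows that would
        # violate a unique constraint are silently skipped
        Record.objects.bulk_create(buffer, ignore_conflicts=True)
    buffer.clear()
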
Example #3
    def do_search_similar_textunits(self) -> None:
        """
        Search for similar text units (sentences or paragraphs) by comparing
        the text units' feature matrices built with TfidfVectorizer.
        """
        self.push_time('', True)
        pks = list(self.text_unit_query.values_list('pk', flat=True))
        self.units_count = len(pks)
        # one chunked pass to build the vocabulary, one to build the matrices,
        # plus one final step for storing similar entities
        self.task.set_push_steps(math.ceil(
            self.units_count / self.unit_vocabulary_chunk_size) * 2 + 1)

        if self.should_delete:
            TextUnitSimilarity.objects.all().delete()
        self.push_time('deleting')

        vocabulary = self.build_unitlevel_vocabulary()
        self.push_time('build_vocabulary')
        if not vocabulary:
            return

        dtm_chunked = self.build_unitlevel_matrices(vocabulary)
        self.push_time('build_matrices')

        X = sparse.vstack(dtm_chunked)
        self.push_time('sparse.vstack(matrices)')

        for i in range(0, self.units_count, self.step):
            for j in range(0, self.units_count, self.step):
                similarity_matrix = cosine_similarity(
                    X[i:min([i + self.step, self.units_count])],
                    X[j:min([j + self.step, self.units_count])]) * 100
                for g in range(similarity_matrix.shape[0]):
                    tu_sim = [
                        TextUnitSimilarity(
                            text_unit_a_id=pks[i + g],
                            text_unit_b_id=pks[j + h],
                            similarity=similarity_matrix[g, h])
                        for h in range(similarity_matrix.shape[1])
                        if i + g != j + h and
                            similarity_matrix[g, h] >= self.similarity_threshold]
                    self.store_unit_similarity_issues(tu_sim)

        self.store_unit_similarity_issues([], True)
        self.push_time('searching by matrix')
        self.log_timing('DocumentChunkSimilarityProcessor(text unit level)')
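
Note how this version differs from the one above: j starts at 0 and only the exact diagonal is excluded, so every qualifying pair ends up stored in both (a, b) and (b, a) order. A tiny comparison of the two filters, with illustrative indices:

n = 4
both_directions = [(a, b) for a in range(n) for b in range(n) if a != b]
unique_pairs = [(a, b) for a in range(n) for b in range(n) if a < b]
assert len(both_directions) == 2 * len(unique_pairs)
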
Example #4
    def process(self, **kwargs):
        """

        :param kwargs:
        :return:
        """

        search_similar_documents = kwargs['search_similar_documents']
        search_similar_text_units = kwargs['search_similar_text_units']
        similarity_threshold = kwargs['similarity_threshold']
        project = kwargs.get('project')
        project_id = project['pk'] if project else 0
        self.log_info('Min similarity: %d' % similarity_threshold)

        # get text units that are at least 100 characters long
        filters = dict(unit_type='paragraph', textunittext__text__regex=r'.{100}.*')
        if project_id:
            filters['project_id'] = project_id
        text_units = TextUnit.objects.filter(**filters)
        len_tu_set = text_units.count()

        push_steps = 0
        if search_similar_documents:
            push_steps += 4
        if search_similar_text_units:
            push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
        self.set_push_steps(push_steps)

        documents = Document.objects.filter(project_id=project_id) if project_id \
            else Document.objects.all()

        # similar Documents
        total_stored = 0
        if search_similar_documents:
            # step #1 - delete
            if kwargs['delete']:
                if project_id:
                    DocumentSimilarity.objects.filter(
                        Q(document_a__project_id=project_id) |
                        Q(document_b__project_id=project_id)).delete()
                else:
                    DocumentSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set = ['\n'.join(d.textunit_set.values_list('textunittext__text', flat=True))
                         for d in documents]
            self.push()

            # step #3
            vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4
            similarity_matrix = cosine_similarity(X) * 100
            pks = list(documents.values_list('pk', flat=True))
            for x in range(len(pks) - 1):
                document_a = pks[x]
                # use it to search for unique a<>b relations
                # for y, document_b in enumerate(Document.objects.all()[x + 1:], start=x + 1):
                for y in range(x + 1, len(pks)):
                    document_b = pks[y]
                    similarity = similarity_matrix[x, y]
                    if similarity < similarity_threshold:
                        continue
                    DocumentSimilarity.objects.create(
                        document_a_id=document_a,
                        document_b_id=document_b,
                        similarity=similarity)
                    total_stored += 1
            self.push()

        # similar Text Units
        if search_similar_text_units:

            # step #1 - delete
            if kwargs['delete']:
                if project_id:
                    TextUnitSimilarity.objects.filter(
                        Q(project_a__id=project_id) |
                        Q(project_b__id=project_id)).delete()
                else:
                    TextUnitSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set, pks = zip(*text_units.values_list('textunittext__text', 'pk'))
            self.push()

            # step #3
            vectorizer = TfidfVectorizer(tokenizer=normalize,
                                         max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4
            for i in range(0, len_tu_set, self.step):
                # j starts at i so each block pair is visited exactly once
                for j in range(i, len_tu_set, self.step):
                    similarity_matrix = cosine_similarity(
                        X[i:min([i + self.step, len_tu_set])],
                        X[j:min([j + self.step, len_tu_set])]) * 100
                    for g in range(similarity_matrix.shape[0]):
                        tu_sim = [
                            TextUnitSimilarity(
                                text_unit_a_id=pks[i + g],
                                text_unit_b_id=pks[j + h],
                                similarity=similarity_matrix[g, h])
                            for h in range(similarity_matrix.shape[1])
                            # keep each unordered pair once (a's index < b's)
                            if i + g < j + h and similarity_matrix[g, h] >= similarity_threshold]
                        total_stored += self.save_similarity_records(tu_sim, project_id)
                    self.push()

        self.log_info(f'{total_stored} records stored')
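
The tokenizer=normalize argument passed to TfidfVectorizer above refers to a helper defined elsewhere in the codebase; its exact behavior is not shown here. A hypothetical stand-in, only to make the call above concrete:

import re
from sklearn.feature_extraction.text import TfidfVectorizer

def normalize(text: str):
    # hypothetical stand-in: lowercase and split on non-alphanumerics;
    # the real normalize may stem, lemmatize or filter tokens differently
    return re.findall(r'[a-z0-9]+', text.lower())

vectorizer = TfidfVectorizer(tokenizer=normalize, max_df=0.5,
                             min_df=2, stop_words='english')
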
Example #5
    def process(self, **kwargs):
        """

        :param kwargs:
        :return:
        """

        search_similar_documents = kwargs['search_similar_documents']
        search_similar_text_units = kwargs['search_similar_text_units']
        similarity_threshold = kwargs['similarity_threshold']
        self.log_info('Min similarity: %d' % similarity_threshold)

        # get text units that are at least 100 characters long
        text_units = TextUnit.objects.filter(unit_type='paragraph',
                                             text__regex=r'.{100}.*')
        len_tu_set = text_units.count()

        push_steps = 0
        if search_similar_documents:
            push_steps += 4
        if search_similar_text_units:
            push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
        self.set_push_steps(push_steps)

        # similar Documents
        if search_similar_documents:

            # step #1 - delete
            if kwargs['delete']:
                DocumentSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set = ['\n'.join(d.textunit_set.values_list('text', flat=True))
                         for d in Document.objects.all()]
            self.push()

            # step #3
            vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4
            similarity_matrix = cosine_similarity(X) * 100
            pks = Document.objects.values_list('pk', flat=True)
            for x, document_a in enumerate(pks):
                # use it to search for unique a<>b relations
                # for y, document_b in enumerate(Document.objects.all()[x + 1:], start=x + 1):
                for y, document_b in enumerate(pks):
                    if document_a == document_b:
                        continue
                    similarity = similarity_matrix[x, y]
                    if similarity < similarity_threshold:
                        continue
                    DocumentSimilarity.objects.create(
                        document_a_id=document_a,
                        document_b_id=document_b,
                        similarity=similarity)
            self.push()

        # similar Text Units
        if search_similar_text_units:

            # step #1 - delete
            if kwargs['delete']:
                TextUnitSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set, pks = zip(*text_units.values_list('text', 'pk'))
            self.push()

            # step #3
            vectorizer = TfidfVectorizer(tokenizer=normalize,
                                         max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4
            for i in range(0, len_tu_set, self.step):
                # j starts from 0 and only the diagonal is excluded below,
                # so each pair is stored in both (a, b) and (b, a) order
                for j in range(0, len_tu_set, self.step):
                    similarity_matrix = cosine_similarity(
                        X[i:min([i + self.step, len_tu_set])],
                        X[j:min([j + self.step, len_tu_set])]) * 100
                    for g in range(similarity_matrix.shape[0]):
                        tu_sim = [
                            TextUnitSimilarity(
                                text_unit_a_id=pks[i + g],
                                text_unit_b_id=pks[j + h],
                                similarity=similarity_matrix[g, h])
                            for h in range(similarity_matrix.shape[1])
                            if i + g != j + h and similarity_matrix[g, h] >= similarity_threshold]
                        TextUnitSimilarity.objects.bulk_create(tu_sim)
                    self.push()
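
For orientation, an illustrative invocation matching the kwargs read at the top of process; the task instance name is assumed:

task.process(
    search_similar_documents=True,
    search_similar_text_units=True,
    similarity_threshold=75,  # percent
    use_idf=True,
    delete=True,              # wipe previously stored similarity records first
)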