Exemplo n.º 1
0
    def test_part_doc_vec_search(self, processed_files_path, percent):
        '''
        Score the vectorized search when each query is only part of a document.

        Every document returned by ``vec_search.create_vectorizer`` is thinned
        to roughly ``percent`` percent of its space-separated words and used
        as the query. A search counts as correct when ``distances[0][1]``
        equals the document's own position in ``processed_files``
        (presumably that value is the corpus index of the best match --
        confirm against ``vec_search.vec_search``).

        The thinning uses a fixed stride, so the kept share is approximate:
        for ``percent >= 50`` every ``100 // (100 - percent)``-th word is
        dropped, otherwise only every ``100 // percent``-th word is kept.

        :param processed_files_path: forwarded to
            ``vec_search.create_vectorizer``.
        :param percent: share of each document to keep; must be positive.
            Values of 100 or more keep the whole document.
        :return: fraction of documents whose search was correct.
        :raises ValueError: if ``percent`` is not positive.
        '''
        # Reject values that used to end in ZeroDivisionError (0) or in a
        # negative stride that silently produced meaningless queries.
        if percent <= 0:
            raise ValueError(
                "percent must be positive, got %r" % (percent,))

        vectorizer, corpus, processed_files = vec_search.create_vectorizer(
            processed_files_path)
        num_samples = len(processed_files)
        expected = np.array(range(num_samples))
        result = np.array([0] * num_samples)

        # The stride depends only on `percent`, so work it out once.
        drop_mode = percent >= 50
        if percent >= 100:
            # Nothing to drop; 100 - percent would be zero or negative.
            miss = None
        elif drop_mode:
            miss = 100 // (100 - percent)
        else:
            miss = 100 // percent

        for doc_idx, file in enumerate(processed_files):
            words = file.split(' ')
            if miss is None:
                kept = words
            elif drop_mode:
                # Drop every `miss`-th word (positions 0, miss, 2*miss, ...).
                kept = [
                    word for pos, word in enumerate(words)
                    if pos % miss > 0
                ]
            else:
                # Keep only every `miss`-th word.
                kept = [
                    word for pos, word in enumerate(words)
                    if pos % miss == 0
                ]

            distances, _ = vec_search.vec_search(
                vectorizer, corpus, ' '.join(kept))
            result[doc_idx] = distances[0][1]
        score = np.sum(expected == result) / num_samples
        print("Testing vec model, score: ", score)
        return score
Exemplo n.º 2
0
    def test_vec_search(self, processed_files_path):
        '''
        Score how often a corpus document retrieves itself as the top hit.

        Each document returned by ``vec_search.create_vectorizer`` is used,
        unchanged, as the query. The search is counted as correct when
        ``distances[0][1]`` equals the document's own position in the list;
        the expectation is that a document is most similar to itself because
        its vector is collinear with the query vector.

        :param processed_files_path: forwarded to ``vec_search.create_vectorizer``.
        :return: fraction of documents whose search was correct.
        '''
        vectorizer, corpus, processed_files = vec_search.create_vectorizer(processed_files_path)
        num_samples = len(processed_files)

        # Integer buffer that receives the top hit reported for each query.
        result = np.zeros(num_samples, dtype=int)
        for doc_idx, document in enumerate(processed_files):
            search_output = vec_search.vec_search(vectorizer, corpus, document)
            distances = search_output[0]
            result[doc_idx] = distances[0][1]

        expected = np.arange(num_samples)
        correct = np.sum(expected == result)
        score = correct / num_samples
        print("Testing vec model, score: ", score)
        return score
Exemplo n.º 3
0
    def test_vec_search(self, processed_files_path):
        '''
        Check that searching with a corpus document finds that same document.

        Every document from ``vec_search.create_vectorizer`` is submitted as
        a query exactly as stored. A query is a hit when ``distances[0][1]``
        matches the document's index, on the reasoning that a document's
        vector is collinear with itself and therefore the closest one.

        :param processed_files_path: forwarded to
            ``vec_search.create_vectorizer``.
        :return: share of queries that were hits.
        '''
        created = vec_search.create_vectorizer(processed_files_path)
        vectorizer, corpus, processed_files = created
        num_samples = len(processed_files)

        expected = np.arange(num_samples)
        result = np.zeros(num_samples, dtype=int)
        for position, text in enumerate(processed_files):
            distances, _ = vec_search.vec_search(vectorizer, corpus, text)
            top_hit = distances[0]
            result[position] = top_hit[1]

        matches = expected == result
        score = np.sum(matches) / num_samples
        print("Testing vec model, score: ", score)
        return score
Exemplo n.º 4
0
    def test_part_doc_vec_search(self, processed_files_path, percent):
        '''
        Measure search accuracy when only part of each document is the query.

        Each document from ``vec_search.create_vectorizer`` is reduced to
        about ``percent`` percent of its space-separated words before being
        searched for. A search is correct when ``distances[0][1]`` equals the
        document's own index in ``processed_files`` (presumably the corpus
        index of the best match -- confirm against ``vec_search.vec_search``).

        Words are removed at a fixed stride, so the kept share is approximate:
        with ``percent >= 50`` every ``100 // (100 - percent)``-th word is
        dropped; below 50 only every ``100 // percent``-th word is kept.

        :param processed_files_path: forwarded to ``vec_search.create_vectorizer``.
        :param percent: share of each document to keep; must be positive.
            Values of 100 or more leave the document untouched.
        :return: fraction of documents whose search was correct.
        :raises ValueError: if ``percent`` is not positive.
        '''
        # 0 used to raise ZeroDivisionError and negative values produced a
        # negative stride with meaningless queries; fail early and clearly.
        if percent <= 0:
            raise ValueError("percent must be positive, got %r" % (percent,))

        vectorizer, corpus, processed_files = vec_search.create_vectorizer(processed_files_path)
        num_samples = len(processed_files)
        expected = np.array(range(num_samples))
        result = np.array([0] * num_samples)

        # The stride is the same for every document, so compute it up front.
        drop_mode = percent >= 50
        if percent >= 100:
            # Keep everything; 100 - percent would be zero or negative.
            miss = None
        elif drop_mode:
            miss = 100 // (100 - percent)
        else:
            miss = 100 // percent

        for doc_idx, file in enumerate(processed_files):
            words = file.split(' ')
            if miss is None:
                kept = words
            elif drop_mode:
                # Drop every `miss`-th word (positions 0, miss, 2*miss, ...).
                kept = [word for pos, word in enumerate(words) if pos % miss > 0]
            else:
                # Keep only every `miss`-th word.
                kept = [word for pos, word in enumerate(words) if pos % miss == 0]

            distances, _ = vec_search.vec_search(vectorizer, corpus, ' '.join(kept))
            result[doc_idx] = distances[0][1]
        score = np.sum(expected == result) / num_samples
        print("Testing vec model, score: ", score)
        return score