Пример #1
0
    def test_extraction(self):
        """Extract the test lead and compare its preview text with a stored copy.

        Every exception raised inside the ``try`` body -- assertion failures
        included -- is logged as a warning and swallowed, so this test reports
        problems through the log instead of failing.
        """
        try:
            # The extraction call itself has to return a truthy result
            self.assertTrue(extract_from_lead(self.lead.id))

            # A lead preview must be attached to the lead afterwards
            preview = self.lead.leadpreview
            self.assertIsNotNone(preview)

            # Same comparison as in test_web_document: the stored file is
            # named after the last path segment of the url, plus '.txt'.
            # NOTE(review): get_or_write_file presumably returns the stored
            # copy, creating it from the given text when missing -- confirm.
            stored_path = join(self.path, HTML_URL.split('/')[-1]) + '.txt'
            stored = get_or_write_file(stored_path, preview.text_extract)

            # Whitespace is collapsed on both sides before comparing
            actual = ' '.join(preview.text_extract.split())
            expected = _preprocess(' '.join(stored.read().split()))
            self.assertEqual(actual, expected)
        except Exception:
            import traceback
            banner = '\n' + ('*' * 30)
            logger.warning(banner)
            logger.warning('LEAD EXTRACTION ERROR:')
            details = traceback.format_exc()
            logger.warning(details)
            return
Пример #2
0
    def test_extraction_(self, _, url):
        """Extract a lead created from ``url`` and compare its preview text.

        ``_`` is unused here (presumably a label supplied by a parameterized
        test decorator -- confirm against the decorator outside this block).
        Any exception raised during the check, including assertion failures,
        is logged as a warning with its traceback and then swallowed.
        """
        # Build the lead under test from the given url
        lead = self.create_lead_with_url(url)
        try:
            # The extraction call itself has to return a truthy result
            self.assertTrue(extract_from_lead(lead.id))

            # A lead preview must be attached to the lead afterwards
            preview = lead.leadpreview
            self.assertIsNotNone(preview)

            # Same comparison as in test_web_document: the stored file is
            # named after the last path segment of the url, plus '.txt'
            stored_path = os.path.join(self.path, url.split('/')[-1]) + '.txt'
            stored = get_or_write_file(stored_path, preview.text_extract)

            # Whitespace is collapsed on both sides before comparing
            actual = ' '.join(preview.text_extract.split())
            expected = _preprocess(' '.join(stored.read().split()))
            self.assertEqual(actual, expected)
        except Exception:
            logger.warning('LEAD EXTRACTION ERROR:', exc_info=True)
            return
Пример #3
0
    def extract(self, path):
        """Run FileDocument extraction on a document and check the text.

        ``path`` is resolved relative to ``self.documents`` and opened in
        binary mode. The extracted text is compared against the content read
        back from ``get_or_write_file`` for ``<self.path>/<filename>.txt``.

        Raises:
            AssertionError: if the extracted text differs from the stored text.
        """
        # Context manager guarantees the handle is closed even if the
        # extractor raises; previously the file was left open.
        with open(join(self.documents, path), 'rb') as file:
            filename = file.name.split('/')[-1]
            text, images = FileDocument(file, filename).extract()
        path = join(self.path, filename)

        # NOTE(review): get_or_write_file presumably returns the stored copy,
        # creating it from ``text`` when it does not exist yet -- confirm.
        extracted = get_or_write_file(path + '.txt', text)

        self.assertEqual(text, extracted.read())
Пример #4
0
    def extract(self, path):
        """Run FileDocument extraction and check both text and page count.

        ``path`` is resolved relative to ``self.documents`` and opened in
        binary mode. The extracted text is compared against the content read
        back from ``get_or_write_file`` for ``<self.path>/<filename>.txt``,
        and the page count against ``self.pages`` keyed by the file extension.

        Raises:
            AssertionError: if the text or the page count does not match.
        """
        # Context manager guarantees the handle is closed even if the
        # extractor raises; previously the file was left open.
        with open(join(self.documents, path), 'rb') as file:
            filename = file.name.split('/')[-1]
            text, images, page_count = FileDocument(file, filename).extract()
        path = join(self.path, filename)

        # NOTE(review): get_or_write_file presumably returns the stored copy,
        # creating it from ``text`` when it does not exist yet -- confirm.
        extracted = get_or_write_file(path + '.txt', text)

        self.assertEqual(text, extracted.read())
        # Expected page counts are looked up by extension (text after last '.')
        self.assertEqual(page_count, self.pages[filename.split('.')[-1]])
Пример #5
0
    def extract(self, url, type):
        """Extract ``url`` with WebDocument and compare against the stored text.

        A mismatch does not fail the test: the AssertionError is caught and
        reported as a warning, tagged with the upper-cased ``type`` label.
        """
        text, _images = WebDocument(url).extract()

        # The stored file is named after the last path segment of the url
        stored_path = join(self.path, url.split('/')[-1]) + '.txt'
        stored = get_or_write_file(stored_path, text)

        try:
            # TODO: Better way to handle the errors
            actual = text.strip()
            expected = stored.read().strip()
            self.assertEqual(actual, expected)
        except AssertionError:
            import traceback
            banner = '\n' + ('*' * 30)
            logger.warning(banner)
            headline = 'EXTRACTOR ERROR: WEBDOCUMENT: ' + type.upper()
            logger.warning(headline)
            details = traceback.format_exc()
            logger.warning(details)
Пример #6
0
    def test_extraction(self):
        """Extract the file preview and compare its text with the stored copy."""
        preview_id = self.file_preview.id

        # The extraction call itself has to return a truthy result
        self.assertTrue(extract_from_file(preview_id))

        # Re-fetch the preview and make sure it is flagged as extracted
        self.file_preview = FilePreview.objects.get(id=preview_id)
        self.assertTrue(self.file_preview.extracted)

        # Same comparison as in test_file_document
        stored_path = join(self.path, DOCX_FILE) + '.txt'
        stored = get_or_write_file(stored_path, self.file_preview.text)

        # Whitespace is collapsed on both sides before comparing
        actual = ' '.join(self.file_preview.text.split())
        expected = ' '.join(stored.read().split())
        self.assertEqual(actual, expected)
Пример #7
0
    def test_extraction(self):
        """Extract the file preview and compare its text with the stored copy.

        A preview that is not flagged as extracted only produces a warning
        banner in the log; the text comparison still runs afterwards.
        """
        preview_id = self.file_preview.id

        # The extraction call itself has to return a truthy result
        self.assertTrue(extract_from_file(preview_id))

        # Re-fetch the preview; a missing "extracted" flag is logged, not failed
        self.file_preview = FilePreview.objects.get(id=preview_id)
        if not self.file_preview.extracted:
            border = '*' * 50
            logger.warning(border)
            logger.warning('---- File extraction is not working ----')
            logger.warning('Probably an issue with DEEPL integration')
            logger.warning(border)

        # Same comparison as in test_file_document
        stored_path = join(self.path, DOCX_FILE) + '.txt'
        stored = get_or_write_file(stored_path, self.file_preview.text)

        # Whitespace is collapsed on both sides before comparing
        actual = ' '.join(self.file_preview.text.split())
        expected = ' '.join(stored.read().split())
        self.assertEqual(actual, expected)
Пример #8
0
    def extract(self, extractor, path):
        """Run ``extractor`` and compare its text with ``<path>.txt``.

        The extractor must return a ``(text, images)`` pair; only the text is
        checked here.
        """
        text, _images = extractor.extract()

        # NOTE(review): get_or_write_file presumably returns the stored copy,
        # creating it from ``text`` when it does not exist yet -- confirm.
        stored_path = path + '.txt'
        stored = get_or_write_file(stored_path, text)

        self.assertEqual(text, stored.read())