def test_extraction(self):
    """Run lead extraction and compare the preview text with the reference file.

    Every exception raised inside the check, assertion failures included,
    is caught and reported through ``logger.warning`` instead of being
    propagated, so a failure here shows up in the logs only.
    """
    try:
        # The extraction call has to report a truthy result.
        self.assertTrue(extract_from_lead(self.lead.id))

        # After extraction the lead must expose a preview object.
        preview = self.lead.leadpreview
        self.assertIsNotNone(preview)

        # Same comparison as test_web_document: the reference file is named
        # after the last '/'-separated segment of the URL.
        basename = HTML_URL.rsplit('/', 1)[-1]
        reference = get_or_write_file(
            join(self.path, basename) + '.txt',
            preview.text_extract,
        )

        # Collapse all whitespace runs on both sides before comparing.
        actual = ' '.join(preview.text_extract.split())
        expected = _preprocess(' '.join(reference.read().split()))
        self.assertEqual(actual, expected)
    except Exception:
        import traceback

        banner = '\n' + ('*' * 30)
        logger.warning(banner)
        logger.warning('LEAD EXTRACTION ERROR:')
        logger.warning(traceback.format_exc())
def test_extraction_(self, _, url):
    """Extract a lead created from ``url`` and check its preview text.

    Args:
        _: Unused positional argument.
        url: URL used to create the sample lead and to name the reference
            file (its last '/'-separated segment is used).

    Any exception raised during the check, assertion failures included, is
    logged as a warning with traceback and not propagated.
    """
    # Build the sample lead outside the guarded section so that setup
    # errors still surface.
    sample_lead = self.create_lead_with_url(url)

    try:
        # The extraction call has to report a truthy result.
        self.assertTrue(extract_from_lead(sample_lead.id))

        # After extraction the lead must expose a preview object.
        preview = sample_lead.leadpreview
        self.assertIsNotNone(preview)

        # Same comparison as test_web_document.
        basename = url.rsplit('/', 1)[-1]
        reference_path = os.path.join(self.path, basename) + '.txt'
        reference = get_or_write_file(reference_path, preview.text_extract)

        # Collapse all whitespace runs on both sides before comparing.
        actual = ' '.join(preview.text_extract.split())
        expected = _preprocess(' '.join(reference.read().split()))
        self.assertEqual(actual, expected)
    except Exception:
        logger.warning('LEAD EXTRACTION ERROR:', exc_info=True)
def extract(self, path):
    """Extract a fixture document and compare its text with the reference.

    Args:
        path: Location of the document relative to ``self.documents``.

    The check fails through ``self.assertEqual`` when the extracted text
    differs from what the reference file contains.
    """
    # The context manager guarantees the document handle is closed, even
    # when extraction or the assertion raises.
    with open(join(self.documents, path), 'rb') as file:
        # Last '/'-separated segment of the opened file's name.
        filename = file.name.split('/')[-1]
        text, _images = FileDocument(file, filename).extract()

        # NOTE(review): presumably get_or_write_file returns the stored
        # reference, creating it from `text` when absent - confirm against
        # the helper. Its return value is not closed here.
        extracted = get_or_write_file(join(self.path, filename) + '.txt', text)
        self.assertEqual(text, extracted.read())
def extract(self, path):
    """Extract a fixture document and verify both its text and page count.

    Args:
        path: Location of the document relative to ``self.documents``.

    The extracted text is compared with the reference file, and the
    reported page count with ``self.pages`` keyed by the file extension.
    """
    # The context manager guarantees the document handle is closed, even
    # when extraction or one of the assertions raises.
    with open(join(self.documents, path), 'rb') as file:
        # Last '/'-separated segment of the opened file's name.
        filename = file.name.split('/')[-1]
        text, _images, page_count = FileDocument(file, filename).extract()

        # NOTE(review): presumably get_or_write_file returns the stored
        # reference, creating it from `text` when absent - confirm against
        # the helper. Its return value is not closed here.
        extracted = get_or_write_file(join(self.path, filename) + '.txt', text)
        self.assertEqual(text, extracted.read())

        # Expected page counts are looked up by the text after the last '.'.
        extension = filename.split('.')[-1]
        self.assertEqual(page_count, self.pages[extension])
def extract(self, url, type):
    """Extract a web document and compare its text with the reference file.

    Args:
        url: Address handed to ``WebDocument``; its last '/'-separated
            segment names the reference file.
        type: Label whose upper-cased form is included in the warning
            logged on a mismatch.

    A mismatch is logged as a warning instead of failing the test.
    """
    text, _images = WebDocument(url).extract()

    basename = url.rsplit('/', 1)[-1]
    reference = get_or_write_file(join(self.path, basename) + '.txt', text)

    try:
        # TODO: Better way to handle the errors
        actual = text.strip()
        expected = reference.read().strip()
        self.assertEqual(actual, expected)
    except AssertionError:
        import traceback

        logger.warning('\n' + ('*' * 30))
        logger.warning('EXTRACTOR ERROR: WEBDOCUMENT: ' + type.upper())
        logger.warning(traceback.format_exc())
def test_extraction(self):
    """Run file extraction and compare the preview text with the reference.

    Asserts that the extraction call reports success, that the re-fetched
    preview is flagged as extracted, and that its whitespace-normalised
    text matches the reference file's whitespace-normalised content.
    """
    # The extraction call has to report a truthy result.
    self.assertTrue(extract_from_file(self.file_preview.id))

    # Re-fetch the preview by id and confirm the extraction took place.
    self.file_preview = FilePreview.objects.get(id=self.file_preview.id)
    preview = self.file_preview
    self.assertTrue(preview.extracted)

    # Same comparison as test_file_document.
    reference = get_or_write_file(
        join(self.path, DOCX_FILE) + '.txt',
        preview.text,
    )

    # Collapse all whitespace runs on both sides before comparing.
    actual = ' '.join(preview.text.split())
    expected = ' '.join(reference.read().split())
    self.assertEqual(actual, expected)
def test_extraction(self):
    """Run file extraction and compare the preview text with the reference.

    A preview that is not flagged as extracted only produces a bordered
    warning in the log; the text comparison is performed regardless.
    """
    # The extraction call has to report a truthy result.
    self.assertTrue(extract_from_file(self.file_preview.id))

    # Re-fetch the preview by id to see whether the extraction took place.
    self.file_preview = FilePreview.objects.get(id=self.file_preview.id)
    preview = self.file_preview

    if not preview.extracted:
        # Warn instead of failing; the messages are framed by a 50-star border.
        border = '*' * 50
        logger.warning(border)
        logger.warning('---- File extraction is not working ----')
        logger.warning('Probably an issue with DEEPL integration')
        logger.warning(border)

    # Same comparison as test_file_document.
    reference = get_or_write_file(
        join(self.path, DOCX_FILE) + '.txt',
        preview.text,
    )

    # Collapse all whitespace runs on both sides before comparing.
    actual = ' '.join(preview.text.split())
    expected = ' '.join(reference.read().split())
    self.assertEqual(actual, expected)
def extract(self, extractor, path):
    """Run ``extractor`` and compare its text with the reference file.

    Args:
        extractor: Object whose ``extract()`` returns a ``(text, images)``
            pair; only the text is checked.
        path: Reference file path without the ``.txt`` suffix.
    """
    text, _images = extractor.extract()

    reference_path = path + '.txt'
    reference = get_or_write_file(reference_path, text)

    expected = reference.read()
    self.assertEqual(text, expected)