def save_website_content(tasks: List[QueueTask]):
    """Fetch and store the content requested by each task in *tasks*.

    For every task:

    * when ``task.get_image`` is truthy, image URLs are extracted from
      ``task.url`` and handed to ``save_images``;
    * when ``task.get_text`` is truthy, texts are extracted from ``task.url``
      and handed to ``WebsiteText.save_texts``;
    * finally ``task.set_success_state()`` is called.

    The whole batch runs inside a single ``transaction.atomic()`` block.
    A ``DatabaseError`` raised while processing is logged and not re-raised.
    """
    try:
        with transaction.atomic():
            for task in tasks:
                if task.get_image:
                    # A falsy extractor result (e.g. None) is treated as
                    # "nothing found" so the saver always receives a list.
                    urls = get_url_images_from_html(task.url) or []
                    save_images(urls, task)
                    # Pass arguments to the logger instead of pre-formatting,
                    # so interpolation only happens if the record is emitted.
                    logger.info("Saved all images from task id: %s", task.pk)
                if task.get_text:
                    texts = get_text_from_html(task.url) or []
                    WebsiteText.save_texts(texts, task)
                    logger.info("Saved all texts from task id: %s", task.pk)
                task.set_success_state()
    except DatabaseError as e:
        logger.error("Database error: %s", e)
def test__text_extractor__should__return__None_input_is_not_correct_url(self):
    """The extractor returns None for the URL 'http://www.onetwothreepl'."""
    result = get_text_from_html('http://www.onetwothreepl')
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)
def test__text_extractor__should__return__not_empty_list(self):
    """The extractor returns a non-empty result for 'http://www.google.pl'.

    NOTE(review): presumably performs a live HTTP request, so the outcome
    depends on network availability; confirm against get_text_from_html.
    """
    texts = get_text_from_html('http://www.google.pl')
    # The extractor can return None (other tests in this file expect that
    # for bad URLs); assert explicitly so a None result is reported as a
    # test failure rather than a TypeError raised by len(None).
    self.assertIsNotNone(texts)
    self.assertGreater(len(texts), 0)
def test__text_extractor__should__return__None_if_domain_does_not_exist(self):
    """The extractor returns None for 'http://www.onetwothree.pl'.

    NOTE(review): the test name implies this domain does not resolve;
    that depends on live DNS and should be confirmed.
    """
    result = get_text_from_html('http://www.onetwothree.pl')
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)
def test__text_extractor__should__return__None__if_if_status_code_is_404(self):
    """The extractor returns None for a GitHub URL expected to answer 404.

    NOTE(review): the 404 response is implied by the test name only; it
    depends on the live state of the remote site and should be confirmed.
    """
    url = "https://github.com/Albert-91/semantive_scrapping_text_and_images_from_url"
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(get_text_from_html(url))
def test__text_extractor__should__return__None_if_scheme_is_invalid(self):
    """The extractor returns None when the URL scheme is 'htts'."""
    result = get_text_from_html('htts://www.google.pl')
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)
def test__text_extractor__should__return__None_if_url_has_no_scheme(self):
    """The extractor returns None for 'www.google.pl', which has no scheme."""
    result = get_text_from_html('www.google.pl')
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)
def test__text_extractor__should__return__None_if_url_is_not_string(self):
    """The extractor returns None when given the integer 2 instead of a string."""
    result = get_text_from_html(2)
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)