Пример #1
0
def save_website_content(tasks: List[QueueTask]):
    """Scrape and persist the requested content for each queued task.

    For every task, images are collected and saved when ``task.get_image``
    is truthy, and texts are collected and saved when ``task.get_text`` is
    truthy. Afterwards the task is marked via ``task.set_success_state()``.

    All tasks are processed inside one ``transaction.atomic()`` block. A
    ``DatabaseError`` raised inside it is logged and not re-raised.

    NOTE(review): ``transaction`` is presumably ``django.db.transaction``;
    if so, a database error on any task rolls back the writes of every task
    in the batch — confirm this all-or-nothing behaviour is intended.

    Args:
        tasks: Queue tasks to process.
    """
    try:
        with transaction.atomic():
            for task in tasks:
                if task.get_image:
                    # A falsy result (e.g. None) is normalised to an empty list.
                    urls = get_url_images_from_html(task.url) or []
                    save_images(urls, task)
                    # Pass arguments lazily so formatting only happens when
                    # the record is actually emitted.
                    logger.info("Saved all images from task id: %s", task.pk)
                if task.get_text:
                    texts = get_text_from_html(task.url) or []
                    WebsiteText.save_texts(texts, task)
                    logger.info("Saved all texts from task id: %s", task.pk)
                task.set_success_state()
    except DatabaseError as e:
        logger.error("Database error: %s", e)
Пример #2
0
 def test__text_extractor__should__return__None_input_is_not_correct_url(
         self):
     """get_text_from_html returns None for the incorrect URL used below."""
     # assertIsNone checks identity and reports a clearer failure message
     # than comparing against None with assertEqual.
     self.assertIsNone(get_text_from_html('http://www.onetwothreepl'))
Пример #3
0
 def test__text_extractor__should__return__not_empty_list(self):
     """get_text_from_html returns a non-empty result for google.pl.

     NOTE(review): presumably performs a live HTTP request — confirm
     whether the network call should be mocked.
     """
     texts = get_text_from_html('http://www.google.pl')
     # Fail with an assertion instead of a TypeError from len(None) when
     # nothing is returned.
     self.assertIsNotNone(texts)
     self.assertGreater(len(texts), 0)
Пример #4
0
 def test__text_extractor__should__return__None_if_domain_does_not_exist(
         self):
     """get_text_from_html returns None for a domain expected not to exist.

     NOTE(review): relies on 'www.onetwothree.pl' remaining unregistered —
     confirm, or mock the lookup.
     """
     self.assertIsNone(get_text_from_html('http://www.onetwothree.pl'))
Пример #5
0
 def test__text_extractor__should__return__None__if_if_status_code_is_404(
         self):
     """get_text_from_html returns None when the page responds with 404.

     NOTE(review): assumes the GitHub URL below answers 404; this depends
     on live remote state — confirm, or mock the response.
     """
     url = "https://github.com/Albert-91/semantive_scrapping_text_and_images_from_url"
     self.assertIsNone(get_text_from_html(url))
Пример #6
0
 def test__text_extractor__should__return__None_if_scheme_is_invalid(self):
     """get_text_from_html returns None for a URL with the scheme 'htts'."""
     self.assertIsNone(get_text_from_html('htts://www.google.pl'))
Пример #7
0
 def test__text_extractor__should__return__None_if_url_has_no_scheme(self):
     """get_text_from_html returns None for a URL given without a scheme."""
     self.assertIsNone(get_text_from_html('www.google.pl'))
Пример #8
0
 def test__text_extractor__should__return__None_if_url_is_not_string(self):
     """get_text_from_html returns None when given a non-string (int) URL."""
     self.assertIsNone(get_text_from_html(2))