import requests
from bs4 import BeautifulSoup


def web_scraper(page_id):
    """Accept a page id, check that it exists in the pages table, and
    scrape only the first 10 https links found on that page."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]
    if page_id not in new_all_id:
        raise ValueError('Id does not exist.')
    url = Pages(DB.connect()).select_url(page_id)
    DB.pages().update(True, page_id)  # flag the page as currently being scraped
    value = requests.get(url)
    soup = BeautifulSoup(value.text, 'html.parser')
    list_urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('https'):
            list_urls.append(link['href'])
    new_list_urls = list_urls[:10]
    DB.links().delete_by_page_id(page_id)  # drop stale links before re-inserting
    for item in new_list_urls:
        Links(DB.connect()).insert(page_id, item)
    DB.pages().update(False, page_id)  # clear the scraping flag when done
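# A minimal usage sketch for web_scraper, assuming the pages table has been
# seeded so that page id 1 exists; the id value and the select(1) call mirror
# method names used elsewhere in this file and are illustrative only:
try:
    web_scraper(1)
    print(Links(DB.connect()).select(1))  # inspect the links just stored
except ValueError as err:
    print(err)  # raised when the id is not in the pages table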
class TestDb(unittest.TestCase):
    '''Tests for the DB class in __init__.py.'''

    def setUp(self):
        '''Create a fresh DB instance before each test.'''
        self.db = DB()

    def test_connect(self):
        '''The connect function should return a connection object.'''
        connection_object = self.db.connect()
        self.assertIsNotNone(connection_object)

    def test_new_connect(self):
        '''The new_connect function should return a connection object.'''
        connection_object = self.db.new_connect()
        self.assertIsNotNone(connection_object)

    def test_setup(self):
        '''The setup function should create the schema and return None.'''
        self.assertEqual(self.db.setup(), None)
        cursor = self.db.new_connect().cursor()
        query = cursor.execute('SELECT url FROM pages WHERE id=1')
        self.assertEqual(query, None)

    def test_seed(self):
        '''The seed function should populate the tables and return None.'''
        self.db.setup()
        seed = self.db.seed()
        self.assertIsNone(seed)

    def tearDown(self):
        self.db = None
class TestDatabase(TestCase):
    '''Class to test the database (DB) functions.'''

    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''Tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''Tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''Tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''Tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''Tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''The teardown hook run after each test (must be spelled tearDown,
        lowercase t, or unittest will never call it).'''
        self.db.connect().close()
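# The two test classes above assume a DB facade exposing connect, new_connect,
# setup, seed, pages, and links. A minimal sqlite3-based sketch of that
# interface; the file name, schema, seed row, and model stubs are assumptions,
# not the project's actual implementation:
import sqlite3


class Pages:
    def __init__(self, conn):
        self.conn = conn

    def select(self):
        return self.conn.execute('SELECT * FROM pages').fetchall()


class Links:
    def __init__(self, conn):
        self.conn = conn

    def select(self, page_id=None):
        if page_id is None:
            return self.conn.execute('SELECT * FROM links').fetchall()
        return self.conn.execute(
            'SELECT * FROM links WHERE page_id=?', (page_id,)).fetchall()


class DB:
    def __init__(self):
        self._conn = None

    def connect(self):
        # Reuse one connection per instance
        if self._conn is None:
            self._conn = sqlite3.connect('scraper.db')
        return self._conn

    def new_connect(self):
        # Always hand back a fresh connection
        return sqlite3.connect('scraper.db')

    def setup(self):
        cur = self.connect().cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS pages '
                    '(id INTEGER PRIMARY KEY, url TEXT, is_scraping INTEGER)')
        cur.execute('CREATE TABLE IF NOT EXISTS links '
                    '(id INTEGER PRIMARY KEY, page_id INTEGER, url TEXT)')
        self.connect().commit()

    def seed(self):
        cur = self.connect().cursor()
        cur.execute("INSERT INTO pages (url, is_scraping) "
                    "VALUES ('https://example.com', 0)")
        self.connect().commit()

    def pages(self):
        return Pages(self.connect())

    def links(self):
        return Links(self.connect())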
def setUp(self) -> None:
    # Set up the Pages model with a live connection
    self.exec = Pages(DB.connect())
def test_connect(self):
    connection_object = DB.connect()
    self.assertIsNotNone(connection_object)
def setUp(self) -> None:
    self.exec = Pages(DB.connect())
# Show examples of how you would use ALL your implementations here
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

db = DB()  # must be live, since the calls below use it
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()

pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)

# spider_scrap(1)
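# A minimal sketch of actually wiring spider_scrap into Celery, based on the
# commented-out block above. It assumes CELERY_BROKER and CELERY_BACKEND are
# set in the environment (read via python-decouple) and that a worker is
# running, e.g. `celery -A main worker --loglevel=info`; the page_id parameter
# generalizes the hard-coded 1 and is an assumption:
app = Celery('main', broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def scrap_url(page_id):
    return spider_scrap(page_id)


# Queue page 1 for scraping asynchronously and wait for the result:
# result = scrap_url.delay(1)
# print(result.get(timeout=30))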
def setUp(self) -> None:
    self.exec = Links(DB.connect())
def task():
    # web_scraper expects a page id and looks up the url itself,
    # so pass the id rather than the stored url
    return web_scraper(1)
def setUp(self) -> None:
    # Set up the Links class
    self.exec = Links(DB.connect())
def test_db_connect(self):
    '''Test the connection to the database.'''
    self.assertIsNotNone(DB.connect())
def test_connect(self):
    conn = DB.connect()
    self.assertIsNotNone(conn)
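# All of the test fragments above assume the standard library's unittest
# runner. A minimal entry point for any of the test modules (alternatively,
# run `python -m unittest discover` from the project root):
import unittest

if __name__ == '__main__':
    unittest.main()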