def clear_database():
    """Delete every row through the ``db`` helpers and print what is left.

    Each step prints a label followed by whatever the corresponding ``db``
    call returns. Deletions run in the order links, page data, images,
    pages, sites (presumably so dependent rows are removed before the
    pages/sites they reference -- confirm against the schema). Afterwards
    every table is read back and printed so the caller can see the
    resulting state.
    """
    print("deleting all links:", db.delete_all_links())
    print("deleting all page data:", db.delete_all_page_data())
    # This label previously read "getting all images:" although the call
    # deletes images; the label now matches the operation.
    print("deleting all images:", db.delete_all_images())
    print("deleting all pages:", db.delete_all_pages())
    print("deleting all sites:", db.delete_all_sites())

    # Read every table back to show the state after the deletions.
    print("getting all sites:", db.get_all_sites())
    print("getting all pages:", db.get_all_pages())
    print("getting all images:", db.get_all_images())
    print("getting all page data:", db.get_all_page_data())
    print("getting_all_links:", db.get_all_links())
def get_page_to_crawl(self):
    """Claim the next FRONTIER page whose site may be accessed right now.

    Loops until either a page is claimed or no FRONTIER page exists:

    1. Read all pages and take the first one whose tag (``page[2]``) is
       ``"FRONTIER"``.
    2. Look up its site via ``db.get_site_by_id(page[1])`` and ask
       ``hf.how_long_to_wait`` whether ``site[1]`` may be accessed now.
    3. If the wait is ``0``, re-tag the page with ``PAGE_TYPE_CODES[0]``
       (HTML, per the original comment) so it is no longer picked up as
       FRONTIER, and return it. Otherwise sleep for the reported time
       and start over.

    Selection and claim run under ``self.lock`` so two callers cannot
    claim the same page. The lock is taken with ``with`` so it is also
    released if a ``db``/``hf`` call raises. Assumes ``self.lock``
    supports the context-manager protocol, as ``threading.Lock`` and
    ``threading.RLock`` do.

    Returns:
        ``(page, site)`` where ``page`` is the value returned by
        ``db.update_page_by_id`` and ``site`` is the row returned by
        ``db.get_site_by_id``; ``(None, None)`` when no page is tagged
        FRONTIER.
    """
    while True:
        with self.lock:
            all_pages = db.get_all_pages()

            # First page still tagged as frontier, or None if there is none.
            page_to_crawl = next(
                (page for page in all_pages if page[2] == "FRONTIER"),
                None,
            )
            if page_to_crawl is None:
                return None, None

            page_to_crawl_site = db.get_site_by_id(page_to_crawl[1])

            # Check whether the site may be accessed at the current time.
            how_long_to_wait = hf.how_long_to_wait(
                page_to_crawl_site[1],
                self.time_accessed,
                self.time_between_calls,
            )

            if how_long_to_wait == 0:
                # Claim the page by replacing its FRONTIER tag; all other
                # fields are written back unchanged.
                updated_page = db.update_page_by_id(
                    page_to_crawl[0],
                    page_to_crawl[1],
                    PAGE_TYPE_CODES[0],
                    page_to_crawl[3],
                    page_to_crawl[4],
                    page_to_crawl[5],
                    page_to_crawl[6],
                    page_to_crawl[7],
                )
                return updated_page, page_to_crawl_site

        # Sleep only after the lock is released so other callers are not
        # blocked while this one waits, then retry from the top.
        time.sleep(how_long_to_wait)
# Smoke test for the db helpers: insert two pages under an existing site,
# attach an image, a page_data record and a link, print every table, then
# delete everything again.
# NOTE(review): `site` is not defined in this part of the file; it is
# presumably a site row created earlier -- confirm.

# Insert pages
page1 = db.insert_page(
    site[0], PAGE_TYPE_CODES[0], "test.com/index.html",
    "html_content", "300", "040521",
)
print("inserted page:", page1)

page2 = db.insert_page(
    site[0], PAGE_TYPE_CODES[0], "test1.com/index.html",
    "html_content2", "303", "040522",
)
print("inserted page:", page2)

# Insert image (attached to the first page)
image = db.insert_image(page1[0], "slika.jpg", "image/jpg", "asd", "040521")
print("inserted image:", image)

# Insert page_data (attached to the second page)
page_data = db.insert_page_data(page2[0], DATA_TYPES[0], "asd")
print("page_data_id:", page_data)

# Insert link from the first page to the second
link = db.insert_link(page1[0], page2[0])
print("inserted link:", link)

# Print the contents of every table after the inserts.
print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())
print("getting all images:", db.get_all_images())
print("getting all page data:", db.get_all_page_data())
print("getting_all_links:", db.get_all_links())

# Clean up: links, page data, images, pages, then sites.
print("deleting all links:", db.delete_all_links())
print("deleting all page data:", db.delete_all_page_data())
# This label previously read "getting all images:" although the call
# deletes images; the label now matches the operation.
print("deleting all images:", db.delete_all_images())
print("deleting all pages:", db.delete_all_pages())
print("deleting all sites:", db.delete_all_sites())