def insert_page_as_binary(self, data_type):
    #self.lock.acquire()
    # Mark the current page as BINARY and record its data type; the data payload is left as None
    db.update_page_by_id(
        self.page_currently_crawling[0],
        self.page_currently_crawling[1],
        "BINARY",
        self.page_currently_crawling[3],
        self.page_currently_crawling[4],
        self.page_currently_crawling[5],
        self.page_currently_crawling[6],
        self.page_currently_crawling[7])
    db.insert_page_data(self.page_currently_crawling[0], data_type, None)
def handle_duplicate_page(self):
    # acquire lock
    #self.lock.acquire()
    # Hash of the passed html_content
    h = hash_tool.create_content_hash(self.current_page_html)
    # Check if the page is an exact copy of an already parsed document in the database
    returned_duplicate = db.find_page_duplicate(h)
    if returned_duplicate and returned_duplicate[3] != self.page_currently_crawling[3]:
        # Update page as 'DUPLICATE'
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0],
            self.page_currently_crawling[1],
            PAGE_TYPE_CODES[1],
            self.page_currently_crawling[3],
            self.page_currently_crawling[4],
            self.page_currently_crawling[5],
            self.page_currently_crawling[6],
            self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
        print("Page", self.page_currently_crawling[3], "is a DUPLICATE of", returned_duplicate[3])
        # Save a new link: to_page is set to duplicate version
        db.insert_link(self.page_currently_crawling[0], returned_duplicate[0])
        #self.lock.release()
        return True
    return False
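# Typical use in the crawl loop (a sketch; the calling code is not shown in this
# file): after the HTML for a page has been fetched, skip further parsing when
# the page turns out to be an exact copy of something already stored, e.g.:
#
#     if self.handle_duplicate_page():
#         return  # duplicate handled, nothing more to extract from this page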
def insert_accessed_time(self):
    #self.lock.acquire()
    # Store the time at which the page was accessed
    updated_page = db.update_page_by_id(
        self.page_currently_crawling[0],
        self.page_currently_crawling[1],
        self.page_currently_crawling[2],
        self.page_currently_crawling[3],
        self.page_currently_crawling[4],
        self.page_currently_crawling[5],
        self.page_currently_crawling[6],
        self.accessed_time)
    self.page_currently_crawling = updated_page
def insert_status_code(self):
    #self.lock.acquire()
    # Store the HTTP status code returned for the page
    updated_page = db.update_page_by_id(
        self.page_currently_crawling[0],
        self.page_currently_crawling[1],
        self.page_currently_crawling[2],
        self.page_currently_crawling[3],
        self.page_currently_crawling[4],
        self.page_currently_crawling[5],
        self.status_code,
        self.page_currently_crawling[7])
    self.page_currently_crawling = updated_page
def insert_html_content(self):
    #self.lock.acquire()
    # Store the downloaded HTML content of the page
    updated_page = db.update_page_by_id(
        self.page_currently_crawling[0],
        self.page_currently_crawling[1],
        self.page_currently_crawling[2],
        self.page_currently_crawling[3],
        self.current_page_html,
        self.page_currently_crawling[5],
        self.page_currently_crawling[6],
        self.page_currently_crawling[7])
    self.page_currently_crawling = updated_page
def insert_page_hash(self):
    # acquire lock
    #self.lock.acquire()
    # Calculate hash from html
    content_hash = hash_tool.create_content_hash(self.current_page_html)
    # Update hash of the page in db
    updated_page = db.update_page_by_id(
        self.page_currently_crawling[0],
        self.page_currently_crawling[1],
        self.page_currently_crawling[2],
        self.page_currently_crawling[3],
        self.page_currently_crawling[4],
        content_hash,
        self.page_currently_crawling[6],
        self.page_currently_crawling[7])
    self.page_currently_crawling = updated_page
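# hash_tool.create_content_hash is used above but defined elsewhere. A minimal
# sketch of such a helper, assuming a SHA-256 digest over the UTF-8 encoded
# HTML; the actual hash_tool implementation may differ.
#
# import hashlib
#
# def create_content_hash(html_content):
#     # Hex digest of the page body, used for exact-duplicate detection
#     return hashlib.sha256(html_content.encode("utf-8")).hexdigest()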
def get_page_to_crawl(self):
    while True:
        # acquire lock
        all_pages = db.get_all_pages()
        # Find the first page tagged FRONTIER
        page_to_crawl = None
        for page in all_pages:
            if page[2] == "FRONTIER":
                page_to_crawl = page
                break
        if page_to_crawl is None:
            #print("---------------------->", threading.get_ident(), "There are no pages available to crawl!")
            return None, None
        # Get the site the frontier page belongs to
        page_to_crawl_site = db.get_site_by_id(page_to_crawl[1])
        # Check if the domain can be accessed at the current time
        how_long_to_wait = hf.how_long_to_wait(
            page_to_crawl_site[1], self.time_accessed, self.time_between_calls)
        if how_long_to_wait == 0:
            # If yes, mark the page as visited (change its tag to HTML) and return it with its site
            self.lock.acquire()
            updated_page = db.update_page_by_id(
                page_to_crawl[0],
                page_to_crawl[1],
                PAGE_TYPE_CODES[0],
                page_to_crawl[3],
                page_to_crawl[4],
                page_to_crawl[5],
                page_to_crawl[6],
                page_to_crawl[7])
            self.lock.release()
            page_to_crawl = updated_page
            return page_to_crawl, page_to_crawl_site
        else:
            time.sleep(how_long_to_wait)
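# hf.how_long_to_wait comes from a helper module that is not part of this file.
# A minimal sketch, assuming self.time_accessed maps a domain to the timestamp
# of its last request and self.time_between_calls is the politeness delay in
# seconds; the real helper may differ.
#
# import time
#
# def how_long_to_wait(domain, time_accessed, time_between_calls):
#     # Seconds left before the domain may be requested again; 0 means it is free
#     last_access = time_accessed.get(domain)
#     if last_access is None:
#         return 0
#     elapsed = time.time() - last_access
#     return max(0, time_between_calls - elapsed)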
import db_methods as db

PAGE_TYPE_CODES = ["HTML", "DUPLICATE", "FRONTIER", "BINARY"]
DATA_TYPES = ["DOC", "DOCX", "PDF", "PPT", "PPTX"]

'''Insert site'''
site = db.insert_site("test.com123", "robotstext", "sitemaptext")
print("inserted site:", site)

'''Insert pages'''
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test.com/index.html", "html_content", "300", "040521")
print("inserted page:", page1)
page2 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test1.com/index.html", "html_content2", "303", "040522")
print("inserted page:", page2)

'''Update pages: change their type from FRONTIER to HTML'''
# update_page_by_id takes eight values elsewhere in the project, so the last
# column (accessed time) is passed through unchanged here as well
updated_page1 = db.update_page_by_id(page1[0], page1[1], PAGE_TYPE_CODES[0], page1[3], page1[4], page1[5], page1[6], page1[7])
updated_page2 = db.update_page_by_id(page2[0], page2[1], PAGE_TYPE_CODES[0], page2[3], page2[4], page2[5], page2[6], page2[7])

print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())
print("deleting all pages:", db.delete_all_pages())
print("deleting all sites:", db.delete_all_sites())