def work(k): """Here the threads do the main job, threads gets on url page from queue, after crawl the page and remove it from queue""" for i in range(k): url = fifo_queue.get() Spider.crawl_page(threading.Thread().name, url) fifo_queue.task_done()
def go(): """Get link from queue and put it to Spider""" while True: url = queue.get() Spider.crawl_page(threading.current_thread().name, url) queue.task_done()
def work():
    while True:
        item = thread_queue.get()
        url = item['url']
        distance = item['distance']
        Spider.crawl_page(threading.current_thread().name, url, distance)
        thread_queue.task_done()
def doCrawling():
    while True:
        # Remove the next URL from the queue
        url = queue.get()
        # Use the name of the current thread to see what is going on
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def work(self):
    while True:
        url = self.queue.get()
        # self.output.append(threading.current_thread().name + ' now crawling ' + url)
        # self.output.append('Queue: ' + str(len(Spider.queue)) + ' | Crawled: ' + str(len(Spider.crawled)))
        Spider.crawl_page(threading.current_thread().name, url)
        self.queue.task_done()
def work(Num=30):
    print("Work")
    while Num > 0:
        link = QUEUE.get()
        Spider.crawl_page(threading.current_thread().name, link)
        QUEUE.task_done()
        Num -= 1
def work():
    while True:
        url = queue.get()
        if url is None:
            # None is a sentinel value telling the worker to shut down
            break
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
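# A minimal usage sketch for the sentinel-based worker above, assuming the
# same module-level `queue` and worker threads created elsewhere; the
# `workers` list and the `shutdown_workers` helper are hypothetical, not part
# of the original code. One None is enqueued per worker so every loop sees a
# sentinel, breaks, and can then be joined cleanly.
def shutdown_workers(workers):
    for _ in workers:
        queue.put(None)      # one sentinel per worker thread
    for worker in workers:
        worker.join()        # wait for each worker to exit its loop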
def work():
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        print('main.py/work()/end')
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        if queue.empty():
            print("end of process......")
            break
def work():
    max_jobs = 10
    i = 0
    while i < max_jobs:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        i = i + 1
def work(): """ Do next job in queue """ while True: url = queue.get() Spider.crawl_page(threading.current_thread().name, url) queue.task_done()
def work():
    while True:
        url = queue.get()
        try:
            Spider.crawl_page(threading.current_thread().name, url)
        except Exception as e:
            print('Error in', threading.current_thread().name, 'crawling', url, '\n\t', e)
        finally:
            # Mark the item done even on failure so queue.join() cannot hang
            queue.task_done()
def work(): """ do the next job in the queue """ while True: url = queue.get() #spider to crawl the url using the current thread created. Spider.crawl_page(threading.current_thread().name, url) queue.task_done()
def work():
    count = 0
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        count += 1
        if count % 100 == 0:
            Spider.update_files()
        queue.task_done()
def work():
    # As long as work is not done
    while True:
        # Get the next URL to be parsed
        url = queue.get()
        # Crawl the webpage at that URL
        Spider.crawl_page(threading.current_thread().name, url)
        # Mark the task as done
        queue.task_done()
def work():
    while True:
        url = queue.get()
        if url == '':
            print('URL not found')
        else:
            spider = Spider(url)
            spider.crawl_page(threading.current_thread().name)
            print(spider.socialLinks())
        queue.task_done()
def work():
    while True:
        time.sleep(SLEEP_TIME)
        url = queue.get()
        print(threading.current_thread().name + ' now crawling ' + url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.crawl_page(threading.current_thread().name, url)
        print('Crawled web' + str(Spider.file_order))
        Spider.file_order += 1
        queue.task_done()
def work():
    while True:
        url = queue.get()
        # tup = (url, )
        # print(tup)
        # this prints every website visited
        print(url, "main.work()")
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def create_jobs():
    logging.error("In Create Jobs")
    for link in file_to_set(QUEUE_FILE):
        logging.error("Link {}".format(link))
        Spider.crawl_page(link, link)
        queue.put(link)
    queue.join()
    logging.error("After Queue Join")
    logging.error(queue)
    crawl()
def work():
    global total_retrieved_pages
    while True:
        print("retrieved pages: " + str(total_retrieved_pages))
        url = queue.get()
        if total_retrieved_pages < max_retrieved_pages:
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            Spider.crawl_page_graph(threading.current_thread().name, url)
        print("Crawling task by thread is done")
        queue.task_done()
def work():
    global working
    while working:
        url = queue.get()
        # if queue.empty():
        #     working = False
        #     sys.exit()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def do_task(url):
    content_type = ''
    try:
        # HEAD request to check whether the link points to a webpage or a file
        resp = requests.head(url)
        content_type = resp.headers['Content-Type']
        if 'text/html' in content_type:
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            # the link points to a file
            Spider.crawl_file(threading.current_thread().name, url, content_type)
    except Exception as e:
        # ignore unreachable links and malformed responses
        pass
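# A minimal sketch of how do_task() could be driven from worker threads,
# assuming a module-level `queue` (queue.Queue) like the other snippets;
# this work() loop is illustrative and not part of the original code.
def work():
    while True:
        url = queue.get()
        do_task(url)         # HEAD the URL, then crawl it as a page or a file
        queue.task_done()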
def work():
    while True:
        url = queue.get()
        table_name = 'url_title_rel'
        title = Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        # print(title)
        queue.task_done()
class Detected:
    PROJECT_NAME = ''
    HOMEPAGE = ''
    DOMAIN_NAME = ''
    QUEUE_FILE = ''
    CRAWLED_FILE = ''
    NUMBER_OF_THREADS = 8
    queue = Queue()
    spi = None

    def __init__(self, project_name, homepage):
        self.PROJECT_NAME = project_name
        self.HOMEPAGE = homepage
        self.DOMAIN_NAME = get_domain_name(self.HOMEPAGE)
        self.QUEUE_FILE = self.PROJECT_NAME + '/queue.txt'
        self.CRAWLED_FILE = self.PROJECT_NAME + '/crawled.txt'
        self.spi = Spider(self.PROJECT_NAME, self.HOMEPAGE, self.DOMAIN_NAME)

    # Create worker threads (will die when main exits)
    def create_workers(self):
        for _ in range(self.NUMBER_OF_THREADS):
            t = threading.Thread(target=self.work)
            t.daemon = True
            t.start()

    # Do the next job in the queue
    def work(self):
        while True:
            url = self.queue.get()
            self.spi.crawl_page(threading.current_thread().name, url)
            self.queue.task_done()

    # Each queued link is a new job
    def create_jobs(self):
        for link in file_to_set(self.QUEUE_FILE):
            self.queue.put(link)
        self.queue.join()
        self.crawl()

    # Check if there are items in the queue, if so crawl them
    def crawl(self):
        queued_links = file_to_set(self.QUEUE_FILE)
        if len(queued_links) > 0:
            print(str(len(queued_links)) + ' links in the queue')
            self.create_jobs()
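# A hedged usage sketch for the Detected class above; the project name and
# homepage are made-up placeholders, and running it assumes the same Spider,
# get_domain_name, and file_to_set helpers the class already references.
if __name__ == '__main__':
    crawler = Detected('example_project', 'https://example.com/')
    crawler.create_workers()   # start the daemon worker threads
    crawler.crawl()            # seed jobs from queue.txt and wait for them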
def work():
    global ALLNUM
    while True:
        url = queue.get()
        try:
            if lock.acquire():
                response = urlopen(url, timeout=3)
                childrenlink = Spider.gather_links(url)
                childrenfile = open(PROJECT_NAME + '/childrenlink/' + str(ALLNUM) + '.txt', 'w')
                childrenfile.write(url + '\n')
                for each_child in childrenlink:
                    if 'javascript' not in each_child:
                        childrenfile.write(each_child + '\n')
                childrenfile.close()
                # write the html file with utf-8 encoding
                html_byte = response.read()
                chardit1 = chardet.detect(html_byte)
                file1 = open(PROJECT_NAME + '/html/utf8/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(chardit1['encoding']).encode('utf-8')
                file1.write(html_string)
                file1.close()
                # also save a copy encoded as GBK (for smj)
                file2 = open(PROJECT_NAME + '/html/gbk/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(chardit1['encoding'], 'ignore').encode('gbk', 'ignore')
                file2.write(html_string)
                file2.close()
        except Exception as e:
            print(str(e))
            queue.task_done()
            lock.release()
        else:
            append_link(url)
            ALLNUM = ALLNUM + 1
            Spider.crawl_page(threading.current_thread().name, url)
            queue.task_done()
            lock.release()
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def work():
    global links_found
    # While not reaching the link limit
    while queue.qsize() < params.max_links:
        # Grab the links from file
        queued_links = file_to_set(QUEUE_FILE)
        links_found = len(queued_links)
        # Take a link and crawl it
        url = queue.get()
        start_crawl_time = time.time()
        Spider.crawl_page(threading.current_thread().name, url)
        end_crawl_time = time.time()
        # Log the task time
        with open(params.projectName + '/timeLog.txt', 'a') as file:
            file.write(threading.current_thread().name + ": " +
                       str(round(end_crawl_time - start_crawl_time, 2)) + ' seconds\n')
        # Flag the queue that a task is done
        queue.task_done()
def work(self):
    while True:
        url = self.queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        self.queue.task_done()
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        print('calling work function')
def work():
    while True:
        link = queue.get()
        Spider.crawl_page(threading.current_thread().name, link)
        queue.task_done()
def work(): while True: url = queue.get() print("work-------url:", url) Spider.crawl_page(threading.current_thread().name, url) queue.task_done()
def spider(domain, url, depth):
    """Crawler test."""
    spider_engine = Spider(domain)
    spider_engine.crawl_page([url], depth)
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()
def work():
    while True:
        url = queue.get()  # get url from queue.txt
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()  # end task for worker; it can now pick up more jobs, if available
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def get_next(url_set):
    for url in url_set:
        waiting_list.append(url)
    current_url = waiting_list.pop()
    Spider.crawl_page(current_url)