import multiprocessing as mp
import queue


def scrape_links_from_position(self, pos):
    """Scrape queued URLs one by one, starting at the given queue position."""
    doc_object = MongoDB.get(URLQueue, 'document', number=pos)
    while doc_object is not None:
        # Scrape this URL (and any child links it yields), then advance
        # to the next document in the queue.
        self.scrape_link_and_child(doc_object['url'])
        pos = MongoDB.increment_url_counter()
        doc_object = MongoDB.get(URLQueue, 'document', number=pos)
@classmethod
def scrape_urls_multiproc(cls):
    # Current position in the URL queue; cap marks the start offset handed
    # to the most recently created process.
    pos = MongoDB.get(MetaData, 'position', type='queue')
    cap = pos
    process_queue = queue.Queue(maxsize=settings.NUM_PROCESSES)
    # Create the initial pool of worker processes, each starting at a
    # different offset in the queue.
    for _ in range(settings.NUM_PROCESSES):
        p = mp.Process(target=WebScraper().scrape_links_from_position, args=(cap,))
        process_queue.put(p)
        cap += settings.NUM_URLS_PER_PROCESS
        p.start()
    head = process_queue.get()
    # Wait for the oldest process to finish, then spawn a replacement,
    # until the queue is exhausted.
    while pos < MongoDB.count(URLQueue):
        head.join()
        if head.exitcode != 0:
            Logger.error('Error with process, terminating')
            return
        # Advance the shared counter past the batch the finished process
        # was responsible for.
        MongoDB.increment_url_counter(settings.NUM_URLS_PER_PROCESS)
        p = mp.Process(target=WebScraper().scrape_links_from_position, args=(cap,))
        process_queue.put(p)
        p.start()
        # Advance both the cap and the current position.
        cap += settings.NUM_URLS_PER_PROCESS
        pos += settings.NUM_URLS_PER_PROCESS
        head = process_queue.get()
    print(p.exitcode)
    return cls
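
# A minimal usage sketch, assuming these methods live on a class named
# WebScraper (as the mp.Process target suggests) and that MongoDB and
# settings are configured elsewhere in the project; the original source
# does not show the entry point, so this wiring is hypothetical. The
# __main__ guard matters: on spawn-based platforms (Windows, macOS),
# multiprocessing re-imports the module in every child process, and
# without the guard each child would try to launch its own pool.
if __name__ == '__main__':
    # Kick off the multi-process scrape from the persisted queue position.
    WebScraper.scrape_urls_multiproc()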