class ResultFilter(object):
    """Facade that pushes ranker output through a background ``Worker``.

    Each submitted ranker result becomes a ``WorkerTask`` running
    ``result_filter_routine``; finished results are fetched with
    ``get_result``.
    """

    def __init__(self, results_class):
        """Create the worker and remember the filtering parameters.

        results_class -- passed through untouched as the last argument of
                         every task handed to the worker.
        """
        self.thresholds = [20, 1, 1]
        self._results_class = results_class
        self._rf_worker = Worker()

    def ready_to_shutdown(self):
        """Return whatever the underlying worker reports for shutdown."""
        worker = self._rf_worker
        return worker.ready_to_shutdown()

    def submit_ranker_result(self, rank_result):
        """
        Entry point used by Ranker.

        Accepts [page_url, page_weight, [(link_soup, weight)]] and queues it
        for the worker as
        [page_url, page_weight, [(link_soup, weight)],
         self.thresholds, self._results_class].
        The first element of that list is attached to the task as user data.
        """
        payload = list(rank_result)
        payload.extend([self.thresholds, self._results_class])
        task = WorkerTask(payload, result_filter_routine, payload[0])
        self._rf_worker.add_task(task)

    def get_result(self, filter_match = (lambda x: True)):
        """
        Return the result of one completed task accepted by <filter_match>,
        i.e. [page_url, page_weight, [(link, weight)]], or None when the
        worker has no such task.
        """
        finished = self._rf_worker.get_completed_task(filter_match)
        return None if finished is None else finished.result

    def purge_tasks(self, filter_not_match):
        """
        Hand <filter_not_match> to the worker's purge_tasks unchanged.
        Tasks for which the predicate returns False are expected to be
        removed; the actual rule is implemented by the worker.
        """
        worker = self._rf_worker
        worker.purge_tasks(filter_not_match)
class DownloadManager(object):
    """Queue download tasks and keep the pages they fetch.

    The behaviour is selected by ``mode``:

    * ``mode == 0`` -- tasks are handed to a ``Worker`` (at most
      ``DOWNLOAD_COUNT`` in flight) and polled for completion.  A response
      whose ``getcode()`` is 503 puts the manager to sleep for
      ``self.waiting`` seconds and requeues everything that was in flight.
    * any other value -- every ``run()`` call downloads a single task
      synchronously by calling the download routine directly.

    Public attributes:
        tasks            -- pending tasks; sorted by ``task.weight``
                            (ascending) before tasks are taken from the front.
        tasks_completed  -- finished tasks not yet returned by
                            ``get_completed_tasks``.
        mode             -- the mode given to the constructor.
        waiting          -- current sleep duration, in seconds, after a 503.
    """

    def __init__(self, mode, other_routine=None):
        """
        mode          -- 0 to download through a Worker, anything else to
                         download synchronously inside run().
        other_routine -- optional replacement for the module-level
                         ``routine``; any falsy value selects the default.
        """
        self.tasks = []
        self.tasks_completed = []
        self.mode = mode
        if mode == 0:
            # Only the asynchronous mode needs a worker.
            self._worker = Worker()
        # The state below is read in every mode (is_finished, get_html,
        # _usual_downloading), so it is always initialised.
        self._tasks_downloading = []
        self._data = {}              # link -> (data, page)
        self._sleeping_time = None   # time.time() when sleeping began
        self.waiting = TM_WAITING_TIME
        self._right_time = True      # False until a download succeeds again
        self._routine = other_routine if other_routine else routine

    def run(self):
        """Perform one scheduling step appropriate for the current mode."""
        if self.mode == 0:
            self._set_to_upload()
            self._asking_worker()
        else:
            self._usual_downloading()

    def get_completed_tasks(self):
        """Return the tasks completed since the last call and forget them."""
        completed, self.tasks_completed = self.tasks_completed, []
        return completed

    def _usual_downloading(self):
        """Synchronously download the pending task with the lowest weight."""
        if not self.tasks:
            return
        self.tasks.sort(key=lambda t: t.weight)
        task = self.tasks.pop(0)
        print('adding download task %s' % (task,))
        dtask = DownloadTask([], task)
        # The routine stores its outcome on the DownloadTask it is given.
        self._routine(dtask, [])
        result = dtask.result
        self._data[task.link] = (result['data'], result['page'])
        self.tasks_completed.append(task)

    def get_html(self, link):
        """
        Return the stored (data, page) pair for <link> and drop it from the
        cache, or None when nothing is stored for that link.
        """
        return self._data.pop(link, None)

    def is_finished(self):
        """
        True when the manager is not sleeping and has neither pending nor
        in-flight tasks.
        """
        busy = self._sleeping_time or self._tasks_downloading or self.tasks
        return not busy

    def _set_to_upload(self):
        """Top the worker up to DOWNLOAD_COUNT in-flight downloads."""
        if self._is_sleep():
            return
        need_download = DOWNLOAD_COUNT - len(self._tasks_downloading)
        if need_download < 1:
            return
        self.tasks.sort(key=lambda t: t.weight)
        # Iterate over a slice copy: self.tasks is modified inside the loop.
        for task in self.tasks[:need_download]:
            print('adding download task %s' % (task,))
            self._worker.add_task(WorkerTask([], self._routine, task))
            self.tasks.remove(task)
            self._tasks_downloading.append(task)

    def _get_completed_tasks(self):
        """Yield (task, result) for every task the worker has finished."""
        while True:
            worker_task = self._worker.get_completed_task()
            if not worker_task:
                break
            yield worker_task.user_data, worker_task.result

    def _asking_worker(self):
        """
        Collect finished downloads from the worker.

        A 503 response switches the manager to sleep mode and stops the
        collection immediately; the affected task is still listed as
        in-flight at that point, so _set_sleep_mode requeues it.
        """
        if self._is_sleep():
            return
        for task, result in self._get_completed_tasks():
            if result['page'].getcode() == 503:
                self._set_sleep_mode()
                return
            self._data[task.link] = (result['data'], result['page'])
            self._tasks_downloading.remove(task)
            self.tasks_completed.append(task)
            # Only a real success proves the last pause was long enough.
            # Resetting the flag on an idle poll would cancel the back-off
            # before the retried downloads had a chance to report.
            self._right_time = True

    def _set_sleep_mode(self):
        """Back off after a 503: requeue in-flight tasks and start sleeping."""
        self._rise_up_waiting()
        print('set sleeping mode at %s seconds...' % (self.waiting,))
        # Predicate is true for every task -- presumably this clears the
        # worker's queue; confirm against Worker.purge_tasks.
        self._worker.purge_tasks(lambda task: True)
        self.tasks.extend(self._tasks_downloading)
        self._tasks_downloading = []
        self._sleeping_time = time.time()
        self._right_time = False

    def _rise_up_waiting(self):
        """Grow the pause by 50% when no download succeeded since the last 503."""
        if not self._right_time:
            self.waiting *= 1.5

    def _is_sleep(self):
        """
        Return True while the pause is still running.  Once ``self.waiting``
        seconds have elapsed the sleep marker is cleared and False is
        returned.
        """
        if not self._sleeping_time:
            return False
        past_time = time.time() - self._sleeping_time
        if past_time >= self.waiting:
            self._sleeping_time = None
            return False
        return True