def __init__(self, source_patterns=None):
    """
    Build the controller process tree: resolve the source list, then create
    the Loader, the Downloader pool, and (if enabled in settings) the Deduplicator.

    :param source_patterns: Optional list of source patterns; consumed by load_sources().
    """
    super().__init__()
    self.daemon = False
    # Seed with the raw patterns, then replace with the fully-resolved source list.
    self.sources = source_patterns
    self.sources = self.load_sources()
    # initialize Loader
    self.loader = RedditLoader(sources=self.sources, settings_json=settings.to_json())
    self.deduplicator = Deduplicator(
        settings_json=settings.to_json(),
        stop_event=self.loader.get_stop_event())
    self._downloaders = self._create_downloaders()
    # The loader and every downloader always run; the deduplicator is opt-in.
    self._all_processes = [self.loader] + self._downloaders
    if settings.get('processing.deduplicate_files'):
        self._all_processes.append(self.deduplicator)
def _create_downloaders(self):
    """
    Create one Downloader per configured concurrent-download slot,
    all sharing the loader's reader and ack queue.

    :return: A list of (not yet started) Downloader instances.
    """
    slot_count = settings.get('threading.concurrent_downloads')
    return [
        Downloader(
            reader=self.loader.get_reader(),
            ack_queue=self.loader.get_ack_queue(),
            settings_json=settings.to_json())
        for _ in range(slot_count)
    ]
def __init__(self, source_patterns=None):
    """
    Build the controller process tree: migrate the database up front, resolve
    the source list, then create the Loader, the Downloader pool, and
    (if enabled in settings) the Deduplicator — all sharing one DB lock.

    :param source_patterns: Optional list of source patterns; consumed by load_sources().
    """
    super().__init__()
    sql.init_from_settings()  # Make sure the database is built & migrated before starting threads.
    sql.close()
    self.daemon = False
    # Seed with the raw patterns, then replace with the fully-resolved source list.
    self.sources = source_patterns
    self.sources = self.load_sources()
    # Single lock shared by every process that touches the database.
    self.db_lock = RLock()
    # initialize Loader
    self.loader = RedditLoader(
        sources=self.sources,
        settings_json=settings.to_json(),
        db_lock=self.db_lock)
    self.deduplicator = Deduplicator(
        settings_json=settings.to_json(),
        stop_event=self.loader.get_stop_event(),
        db_lock=self.db_lock
    )
    self._downloaders = self._create_downloaders()
    # The loader and every downloader always run; the deduplicator is opt-in.
    self._all_processes = [self.loader] + self._downloaders
    if settings.get('processing.deduplicate_files'):
        self._all_processes.append(self.deduplicator)
def test_download(self):
    """ Downloader should work """
    stop_event = multiprocessing.Event()
    in_queue = multiprocessing.Queue()
    ack_queue = multiprocessing.Queue()
    reader = QueueReader(in_queue, stop_event)
    dl = downloader.Downloader(reader, ack_queue, settings.to_json(), multiprocessing.RLock())
    stats = {'ack': 0, 'sent': 0}

    def add_test(inf):
        # Feed every known URL id into the download queue, then wait
        # (up to 30 seconds) for each one to be acknowledged back.
        sess = sql.session()
        urls = sess.query(sql.URL).all()
        started = time.time()
        pending = []
        for url in urls:
            in_queue.put_nowait(url.id)
            inf['sent'] += 1
            pending.append(url.id)
        while time.time() - started < 30 and pending:
            try:
                acked = ack_queue.get(block=True, timeout=.5)
            except queue.Empty:
                continue
            inf['ack'] += 1
            pending.remove(acked.url_id)
        sess.close()
        # Signal the downloader that no more work is coming.
        stop_event.set()

    thread = Thread(target=add_test, args=(stats, ))
    thread.start()
    dl.run()  # Blocks until stop_event is set and the queue drains.
    thread.join()
    self.assertGreater(stats['sent'], 0, msg='Failed to send any test URLS for download!')
    self.assertEqual(stats['sent'], stats['ack'], msg='Not all sent URLs were Acked!')
    self.assertFalse(dl.progress.get_running(), msg='Failed to clear running status!')