class Deduplicator(multiprocessing.Process):
	def __init__(self, settings_json, stop_event, db_lock):
		"""
		Create a Hasher Process, which will be bound to the stop_event,
		performing post-processing on downloaded Files.
		"""
		super().__init__()
		self._settings = settings_json
		self._stop_event = stop_event
		self._lock = db_lock
		self.progress = DownloaderProgress()
		self.progress.clear(status="Starting up...")
		self._session = None
		self.daemon = True

	def run(self):
		""" Threaded loading of elements. """
		settings.from_json(self._settings)
		sql.init_from_settings()
		try:
			self._session = sql.session()
			self.progress.clear(status="Starting up...")
			self.progress.set_running(True)
			while not self._stop_event.is_set():
				self._dedupe()
				self.progress.set_status("Ready for new files...")
				self._stop_event.wait(2)
			self._dedupe()  # Run one final pass after downloading stops.
			self.progress.clear(status="Finished.", running=False)
		except Exception as ex:
			print('Deduplication Process Error:', ex)
			self.progress.set_error(ex)
			self.progress.set_running(False)
			traceback.print_exc()
		finally:
			sql.close()

	def _dedupe(self):
		unfinished = self._session \
			.query(File) \
			.options(joinedload(File.urls)) \
			.filter(File.hash == None) \
			.filter(File.downloaded == True) \
			.all()
		unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
		if not unfinished:
			return
		for idx, f in enumerate(unfinished):
			self.progress.set_status("Deduplicating (%s) files..." % (len(unfinished) - idx))
			path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
			is_album = any(u.album_id for u in f.urls)
			if not path.is_file() or is_album:
				continue
			new_hash = FileHasher.get_best_hash(path.absolute())
			# print('New hash for File:', f.id, '::', new_hash)
			matches = self._find_matching_files(new_hash, ignore_id=f.id)
			# print('\tActual matches:', matches)
			with self._lock:
				f.hash = Hash.make_hash(f, new_hash)
				if len(matches):
					# print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
					best, others = self._choose_best_file(matches + [f])
					# print('Chose best File:', best.id)
					for o in others:
						self._upgrade_file(new_file=best, old_file=o)
				self._session.commit()
		self._prune()

	def _find_matching_files(self, search_hash, ignore_id):
		sp = Hash.split_hash(search_hash)
		all_hashes = self._session \
			.query(File) \
			.join(Hash, File.hash) \
			.filter(
				(Hash.full_hash == search_hash) |
				(Hash.p1 == sp[0]) |
				(Hash.p2 == sp[1]) |
				(Hash.p3 == sp[2]) |
				(Hash.p4 == sp[3])
			).all()
		# print(sp)
		# print('Potential matches:', len(all_hashes), all_hashes)
		return list(filter(lambda f: self._check_hash_match(f, search_hash), all_hashes))

	def _check_hash_match(self, file, search_hash):
		"""
		Compare the given hash against the given SQL File.
		Returns invalid if the target File has albums, or is not fully processed.
		"""
		if not file.hash or any(u.album_id or not u.processed for u in file.urls):
			return False
		if FileHasher.hamming_distance(search_hash, file.hash.full_hash) >= 4:
			return False
		return True

	def _choose_best_file(self, files):
		files = sorted(
			files,
			key=lambda f: SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path).size(),
			reverse=True)
		return files[0], files[1:]

	def _upgrade_file(self, new_file, old_file):
		# print('Upgrading old file:', old_file.id, old_file.path, ' -> ', new_file.id, new_file.path)
		self._session.query(URL) \
			.filter(URL.file_id == old_file.id) \
			.update({URL.file_id: new_file.id})
		file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=old_file.path)
		if file.is_file():
			file.delete_file()

	def _prune(self):
		with self._lock:
			orphans = self._session.query(File).filter(~File.urls.any()).delete(synchronize_session='fetch')
			self._session.commit()
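
# --- Usage sketch (illustrative, not part of the original module) ---
# Shows the contract implied by Deduplicator.__init__()/run(): the process is handed
# the serialized settings, a multiprocessing Event used to signal shutdown, and the
# DB lock shared with the writer processes. Names prefixed with "_example" are
# assumptions for illustration only.
def _example_run_deduplicator(settings_json):
	from multiprocessing import Event, Lock
	stop_event = Event()
	db_lock = Lock()
	dedup = Deduplicator(settings_json=settings_json, stop_event=stop_event, db_lock=db_lock)
	dedup.start()     # run() executes in the child process
	# ... downloader processes write Files elsewhere ...
	stop_event.set()  # let the loop exit; run() performs one final pass first
	dedup.join()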
class Downloader(multiprocessing.Process):
	def __init__(self, reader, ack_queue, settings_json, db_lock):
		"""
		Create a Downloader Process, which will be bound to the queue given,
		listening for URLs to download.
		"""
		super().__init__()
		self._reader = reader
		self._settings = settings_json
		self.progress = DownloaderProgress()
		self._session = None
		self._db_lock = db_lock
		self._ack_queue = ack_queue
		self.daemon = True

	def run(self):
		""" Threaded loading of elements. """
		settings.from_json(self._settings)
		sql.init_from_settings()
		self._session = sql.session()
		self.progress.clear(status="Starting up...", running=True)
		failed = False
		for nxt_id in self._reader:
			try:
				url = self._session.query(sql.URL).filter(sql.URL.id == nxt_id).first()
				if not url:
					raise Exception("Unknown URL ID provided: (%s)" % nxt_id)
				file = url.file
				path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=str(file.path))
				self.progress.set_file(path.relative())
				self.progress.set_status("Attempting to Handle URL...")
				self.progress.set_running(True)
				task = handlers.HandlerTask(url=url.address, file_obj=path)
				resp = handlers.handle(task, self.progress)
				is_album_parent = False
				with self._db_lock:
					if resp.album_urls:
						if url.album_id:
							resp.album_urls = []  # Ignore nested Albums to avoid recursion.
						else:
							url.album_id = str(uuid.uuid4())
							is_album_parent = True
					else:
						resp.album_urls = []
					url.failed = not resp.success
					url.failure_reason = resp.failure_reason
					url.last_handler = resp.handler
					url.album_is_parent = is_album_parent
					if resp.rel_file:
						file.downloaded = True
						file.path = resp.rel_file.relative()
						file.hash = None
						utime(resp.rel_file.absolute(), times=(time(), time()))
					self._session.commit()
				# Once *all* processing is completed on this URL, the Downloader needs to ACK it.
				# If any additional Album URLS were located, they should be sent before the ACK.
				self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=resp.album_urls))
				self.progress.clear(status="Waiting for URL...")
			except Exception as ex:
				failed = str(ex)
				self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=[]))
				print(ex)
				traceback.print_exc()
				self.progress.set_error("Exited with error: {%s}" % failed)
				break
		sql.close()
		self.progress.clear("Finished." if not failed else "Exited with error: %s" % failed, running=False)
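
# --- Wiring sketch (illustrative, not part of the original module) ---
# The Downloader iterates over `reader` (any iterable yielding URL ids) and reports
# completion by putting an AckPacket on `ack_queue`. A coordinator could drain those
# ACKs roughly like this; the concrete reader/queue types used by the project are an
# assumption here, as is the "_example" helper itself.
def _example_drain_acks(ack_queue, expected_count):
	acked = 0
	extra_urls = []
	while acked < expected_count:
		packet = ack_queue.get()              # AckPacket(url_id=..., extra_urls=[...])
		acked += 1
		extra_urls.extend(packet.extra_urls)  # album URLs discovered while handling
	return extra_urls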
class Deduplicator(multiprocessing.Process):
	def __init__(self, settings_json, stop_event):
		"""
		Create a Hasher Process, which will be bound to the stop_event,
		performing post-processing on downloaded Files.
		"""
		super().__init__()
		self._settings = settings_json
		self._stop_event = stop_event
		self.progress = DownloaderProgress()
		self.progress.clear(status="Starting up...")
		self._session = None
		self.daemon = True

	def run(self):
		""" Threaded loading of elements. """
		settings.from_json(self._settings)
		sql.init_from_settings()
		self._session = sql.session()
		self.progress.clear(status="Starting up...")
		self.progress.set_running(True)
		while not self._stop_event.is_set():
			self._dedupe()
			self.progress.set_status("Waiting for new files...")
			self._stop_event.wait(2)
		self._dedupe()  # Run one final pass after downloading stops.
		self.progress.set_running(False)
		sql.close()
		self.progress.clear("Finished.")

	def _dedupe(self):
		unfinished = self._session \
			.query(File) \
			.options(joinedload(File.urls)) \
			.filter(File.hash == None) \
			.filter(File.downloaded == True) \
			.all()
		unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
		if not unfinished:
			return
		for idx, f in enumerate(unfinished):
			self.progress.set_status("Deduplicating (%s) files..." % (len(unfinished) - idx))
			path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
			is_album = any(u.album_id for u in f.urls)
			if not path.is_file() or is_album:
				continue
			new_hash = FileHasher.get_best_hash(path.absolute())
			# print('New hash for File:', f.id, '::', new_hash)
			matches = [] if is_album else self._find_matching_files(new_hash, ignore_id=f.id)
			f.hash = new_hash
			if len(matches):
				print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
				best, others = self._choose_best_file(matches + [f])
				print('Chose best File:', best.id)
				for o in others:
					self._upgrade_file(new_file=best, old_file=o)
			self._session.commit()
		self._prune()

	def _find_matching_files(self, search_hash, ignore_id):
		all_hashes = self._session \
			.query(File) \
			.options(joinedload(File.urls)) \
			.filter(File.hash != None) \
			.filter(File.downloaded == True) \
			.filter(File.id != ignore_id) \
			.all()
		matches = []
		for pm in all_hashes:
			if any(u.album_id for u in pm.urls) or any(not u.processed for u in pm.urls):
				continue
			if FileHasher.hamming_distance(search_hash, pm.hash) < 4:
				matches.append(pm)
		return matches

	def _choose_best_file(self, files):
		files = sorted(
			files,
			key=lambda f: SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path).size(),
			reverse=True)
		return files[0], files[1:]

	def _upgrade_file(self, new_file, old_file):
		print('Upgrading old file:', old_file.id, old_file.path, ' -> ', new_file.id, new_file.path)
		self._session.query(URL) \
			.filter(URL.file_id == old_file.id) \
			.update({URL.file_id: new_file.id})
		file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=old_file.path)
		if file.is_file():
			file.delete_file()

	def _prune(self):
		orphans = self._session.query(File).filter(~File.urls.any()).delete(synchronize_session='fetch')
		self._session.commit()
		if orphans:
			print("Deleted orphan Files:", orphans)
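
# --- Concept sketch (illustrative, not the project's FileHasher) ---
# The "< 4" check in _find_matching_files() above treats two files as duplicates when
# their perceptual hashes differ in fewer than 4 bits. Assuming the hashes are
# equal-length hex strings, a bit-level hamming distance can be computed like this:
def _example_hamming_distance(hash_a, hash_b):
	diff = int(hash_a, 16) ^ int(hash_b, 16)  # XOR leaves only the differing bits set
	return bin(diff).count("1")

# _example_hamming_distance("ffd8a1", "ffd8a0") == 1   -> treated as a duplicate
# _example_hamming_distance("ffd8a1", "00270e") == 22  -> clearly distinct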
class Deduplicator(multiprocessing.Process):
	def __init__(self, settings_json, stop_event, db_lock):
		"""
		Create a Hasher Process, which will be bound to the stop_event,
		performing post-processing on downloaded Files.
		"""
		super().__init__()
		self._settings = settings_json
		self._stop_event = stop_event
		self._lock = db_lock
		self.progress = DownloaderProgress()
		self.progress.clear(status="Starting up...")
		self._session = None
		self.daemon = True

	def run(self):
		""" Threaded loading of elements. """
		settings.from_json(self._settings)
		sql.init_from_settings()
		print("Starting up...", debug=True)
		try:
			self._session = sql.session()
			self.progress.clear(status="Starting up...")
			self.progress.set_running(True)
			self.dedup_ignore_ids = set()
			self.prune_counter = 0
			self.special_hashes = self._session.query(Hash).filter(Hash.id < 0).all()
			while not self._stop_event.is_set():
				# print("_stop_event is %s" % self._stop_event.is_set(), debug=True)
				completed = self._dedupe()
				if completed:
					self.progress.set_status("Completed %s files. Ready for new files..." % completed)
					self._stop_event.wait(1)
				else:
					self._stop_event.wait(10)
			print("_stop_event is %s" % self._stop_event.is_set(), debug=True)
			self._dedupe()  # Run one final pass after downloading stops.
			self.progress.clear(status="Finished.", running=False)
		except Exception as ex:
			print('Deduplication Process Error:', ex)
			self.progress.set_error(ex)
			self.progress.set_running(False)
			traceback.print_exc()
		finally:
			print("Finished process, _stop_event is %s" % self._stop_event.is_set(), debug=True)
			sql.close()

	def _dedupe(self):
		# unfinished = self._session \
		# 	.query(File) \
		# 	.options(joinedload(File.urls)) \
		# 	.filter(File.hash == None) \
		# 	.filter(File.downloaded == True) \
		# 	.all()
		start_time = datetime.now()
		hashed = set(
			int(r.file_id) for r in self._session.query(Hash.file_id)
			.filter(Hash.full_hash != None, Hash.file_id != None))
		downloaded = set(r.id for r in self._session.query(File).filter(File.downloaded == True))
		# Get downloaded files without a hash.
		search_ids = downloaded.difference(hashed).difference(self.dedup_ignore_ids)
		unfinished = self._session.query(File).filter(File.id.in_(search_ids)).all()
		unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
		# print("Working on %s files total" % len(unfinished), debug=True)
		if not unfinished:
			return 0
		stats = {'unique': 0, 'has_dup': 0, 'special_hash': 0, 'not_is_file': 0, 'is_album': 0}
		matches = []
		last_printed = ''
		for idx, f in enumerate(unfinished):
			self.progress.set_status("Deduplicating %s of %s files..." % (idx + 1, len(unfinished)))
			# print("Working on %s/%s files" % (idx, len(unfinished)), debug=True)
			path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
			is_album = any(u.album_id for u in f.urls)
			if not path.is_file():
				stats['not_is_file'] += 1
				self.dedup_ignore_ids.add(f.id)
				continue
			if is_album:
				stats['is_album'] += 1
				self.dedup_ignore_ids.add(f.id)
				continue
			if self._stop_event.is_set():
				break
			new_hash = FileHasher.get_best_hash(path.absolute())
			# print('New hash for File:', f.id, '::', new_hash)
			for h in self.special_hashes:
				if new_hash == h.full_hash:
					print("Found special hash:", h, "::\n", f, debug=True)
					stats['special_hash'] += 1
					with self._lock:
						f.hash = Hash.make_hash(f, new_hash)
						self._session.query(URL).filter(URL.file_id == f.id).update({URL.file_id: h.file_id})
						file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
						if file.is_file():
							file.delete_file()
						self._session.commit()
					break
			else:  # Not a special hash.
				matches = self._find_matching_files(new_hash, ignore_id=f.id)
				if matches:
					if new_hash == last_printed:
						print("Found another duplicate:", new_hash, "::\n", f, debug=True)
					elif len(matches) > 6:
						printed = matches[:3] + ["... %s total matches ..." % len(matches)] + matches[-3:]
						print("Found duplicate files: ", new_hash, "::\n", '\n'.join(str(m) for m in [f] + printed), debug=True)
					else:
						print("Found duplicate files: ", new_hash, "::\n", '\n'.join(str(m) for m in [f] + matches), debug=True)
					stats['has_dup'] += 1
					last_printed = new_hash
				else:
					stats['unique'] += 1
				# print('\tActual matches:', matches)
				with self._lock:
					f.hash = Hash.make_hash(f, new_hash)
					# print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
					if len(matches):
						# print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
						best, others = self._choose_best_file(matches + [f])
						# print('Chose best File:', best.id)
						for o in others:
							self._upgrade_file(new_file=best, old_file=o)
					self._session.commit()
				if matches:
					print("Completed %s of %s files..." % (idx + 1, len(unfinished)), debug=True)
		dt = datetime.now() - start_time
		print("Completed all %s files in %s sec. Counts = %s" % (
			len(unfinished), str(dt),
			', '.join('%s: %s' % (k, v) for k, v in stats.items() if v)), debug=True)
		# self.prune_counter += len(matches)
		# if self.prune_counter >= 100:
		# 	self.prune_counter = 0
		# 	self.progress.set_status("Pruning orphaned files...")
		# 	self._prune()
		# 	print("Finished pruning.", debug=True)
		return len(unfinished)

	def _find_matching_files(self, search_hash, ignore_id):
		sp = Hash.split_hash(search_hash)
		all_hashes = self._session \
			.query(File) \
			.join(Hash, File.hash) \
			.filter(
				(Hash.full_hash == search_hash) |
				(Hash.p1 == sp[0]) |
				(Hash.p2 == sp[1]) |
				(Hash.p3 == sp[2]) |
				(Hash.p4 == sp[3])
			).all()
		# print(sp)
		# print('Potential matches:', len(all_hashes), all_hashes)
		return list(filter(lambda f: self._check_hash_match(f, search_hash), all_hashes))

	def _check_hash_match(self, file, search_hash):
		"""
		Compare the given hash against the given SQL File.
		Returns invalid if the target File has albums, or is not fully processed.
		"""
		if not file.hash or any(u.album_id or not u.processed for u in file.urls):
			return False
		# if FileHasher.hamming_distance(search_hash, file.hash.full_hash) >= 4:
		if search_hash != file.hash.full_hash:
			return False
		return True

	def _choose_best_file(self, files):
		files = sorted(
			files,
			key=lambda f: SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path).size(),
			reverse=True)
		return files[0], files[1:]

	def _upgrade_file(self, new_file, old_file):
		# print('Upgrading old file:', old_file.id, old_file.path, ' -> ', new_file.id, new_file.path)
		self._session.query(URL) \
			.filter(URL.file_id == old_file.id) \
			.update({URL.file_id: new_file.id})
		file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=old_file.path)
		if file.is_file():
			file.delete_file()

	def _prune(self):
		with self._lock:
			files_id = set(r.id for r in self._session.query(File))
			url_files_id = set(int(r.file_id) for r in self._session.query(URL))
			orphans = self._session.query(File).filter(
				File.id.in_(files_id.difference(url_files_id))).delete(synchronize_session='fetch')
			# orphans = self._session.query(File).filter(~File.urls.any()).delete(synchronize_session='fetch')
			self._session.commit()
			if orphans:
				print("Deleted orphan Files:", orphans, debug=True)