import os
import sys
from collections import namedtuple
from hashlib import md5
from os.path import expanduser, join as joinpath

# Import paths for settings are assumed from the package layout
# (imagebot.dbmanager is imported elsewhere in this repo).
from imagebot import settings
from imagebot.dbmanager import DBManager

# Inferred from usage below: the file's path, its md5 digest, and a flag
# marking it as an already-handled duplicate.
FileItem = namedtuple('FileItem', ['path', 'hash', 'dup'])


def clear_duplicate_images(arg):
	dbm = DBManager(expanduser(settings.IMAGES_DB))
	dbm.connect()

	jobname = arg
	result = dbm.query("SELECT path FROM images WHERE job = '%s' LIMIT 1" % jobname)
	if len(result) == 0:
		print('no such job')
		return

	# Derive the job's root directory from the path of any one of its images.
	imagepath = result[0]['path']
	jobpath = imagepath[0:imagepath.rfind(jobname) + len(jobname)]
	print(jobpath)

	# Hash every file under the job directory.
	filelist = []
	for root, dirs, files in os.walk(jobpath):
		for filename in files:
			md5hash = md5()
			filepath = joinpath(root, filename)
			with open(filepath, 'rb') as f:
				md5hash.update(f.read())
			filelist.append(FileItem(filepath, md5hash.hexdigest(), False))

	dups_total = 0
	for i in range(len(filelist)):
		if filelist[i].dup:
			continue

		# Collect every later file sharing this file's hash, with its mtime,
		# and mark it so the outer loop skips it.
		filehash = filelist[i].hash
		same_files = [(filelist[i].path, os.stat(filelist[i].path).st_mtime)]
		for j in range(i + 1, len(filelist)):
			if filelist[j].hash == filehash:
				same_files.append((filelist[j].path, os.stat(filelist[j].path).st_mtime))
				filelist[j] = FileItem(None, None, True)

		if len(same_files) > 1:
			# Keep the oldest copy (smallest mtime), delete the rest.
			min_mtime = sys.float_info.max
			keep = -1
			for k in range(len(same_files)):
				if same_files[k][1] < min_mtime:
					min_mtime = same_files[k][1]
					keep = k

			for k in range(len(same_files)):
				if k != keep:
					dups_total += 1
					print('deleting %s' % same_files[k][0])
					try:
						os.remove(same_files[k][0])
					except OSError as e:
						print(str(e))
					dbm.query("UPDATE images SET path = '#duplicate' WHERE path = '%s'" % same_files[k][0])

	dbm.commit()
	dbm.disconnect()
	print('%d duplicate images deleted.' % dups_total)
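# A minimal invocation sketch: 'wallpapers' is a hypothetical placeholder job
# name and must already exist in the images table for the query above to
# return a path to walk.
if __name__ == '__main__':
	clear_duplicate_images('wallpapers')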
from os.path import exists

from scrapy.exceptions import IgnoreRequest

from imagebot import settings
from imagebot.dbmanager import DBManager


class ImageStoreMiddleware(object):
	"""Downloader middleware that drops requests for URLs already in the images db."""

	def __init__(self):
		if exists(settings.IMAGES_DB):
			self._dbm = DBManager(settings.IMAGES_DB)
			self._dbm.connect()
			self._nodb = False
		else:
			# No database yet; let every request through.
			self._nodb = True

	def find_url(self, url):
		result = self._dbm.query("SELECT * FROM images WHERE url = '%s'" % url)
		return len(result) > 0

	def process_request(self, request, spider):
		if self._nodb:
			return None
		if self.find_url(request.url):
			raise IgnoreRequest
		return None

	def __del__(self):
		#self._dbm.disconnect()
		pass
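# The middleware only takes effect once it is registered in the Scrapy
# project's settings.py. A sketch, assuming the class lives in a module named
# imagebot.middleware (the actual module path in this repo may differ):
DOWNLOADER_MIDDLEWARES = {
	'imagebot.middleware.ImageStoreMiddleware': 543,
}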
import sys
from os.path import join as joinpath

from six.moves.urllib.parse import unquote
from imagebot.dbmanager import DBManager

# Indices of the URL path components to use as directory levels.
sort_parts = [1, 2, 3]
job = 'default'

if len(sys.argv) >= 3:
	job = sys.argv[1]
	sort_parts = [int(i) for i in sys.argv[2].split(',')]
#print(sort_parts)

db = DBManager('../images.db')
db.connect()

result = db.query("SELECT url FROM images WHERE job = '%s' LIMIT 100" % job)
for r in result:
	url = r['url']
	#jobname = r['jobname']
	path = job
	# Split the URL into non-empty components, then drop the scheme and the filename.
	url_parts = [p for p in url.split('/') if p != '']
	del url_parts[0]
	del url_parts[-1]
	#print(url_parts)
	for i in sort_parts:
		if i < len(url_parts):
			path = joinpath(path, url_parts[i])
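# Worked example of the path construction above, using a made-up URL and the
# default sort_parts of [1, 2, 3]:
example_url = 'http://example.com/gallery/2016/03/photos/image01.jpg'
example_parts = [p for p in example_url.split('/') if p != '']
# -> ['http:', 'example.com', 'gallery', '2016', '03', 'photos', 'image01.jpg']
del example_parts[0]   # drop the scheme
del example_parts[-1]  # drop the filename
# -> ['example.com', 'gallery', '2016', '03', 'photos']
# parts 1, 2 and 3 are 'gallery', '2016', '03', so the image would be
# filed under 'default/gallery/2016/03'.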