Пример #1
0
def clear_duplicate_images(arg):
	"""Delete duplicate image files downloaded for a scraping job.

	Walks the job's download directory, groups files by MD5 content hash,
	keeps the oldest copy (smallest mtime) of each group, removes the rest
	from disk and marks their DB rows with the sentinel path '#duplicate'.

	arg: name of the job whose images should be de-duplicated.
	"""
	dbm = DBManager(expanduser(settings.IMAGES_DB))
	dbm.connect()

	jobname = arg
	# NOTE(review): the job name is interpolated straight into SQL; if
	# DBManager.query supports bound parameters, prefer them here.
	result = dbm.query('SELECT path FROM images WHERE job = \'%s\' LIMIT 1'%jobname)
	if len(result) == 0:
		print('no such job')
		dbm.disconnect()
		return  # bug fix: original fell through and crashed on result[0]

	imagepath = result[0]['path']
	# Job root = the stored image path truncated just after the job name.
	jobpath = imagepath[0:imagepath.rfind(jobname)+len(jobname)]

	print(jobpath)  # bug fix: was a Python 2 print statement

	# Hash every file under the job directory by full content.
	filelist = []
	for root, dirs, files in os.walk(jobpath):
		for filename in files:
			md5hash = md5()
			filepath = joinpath(root, filename)
			with open(filepath, 'rb') as f:
				md5hash.update(f.read())
			filelist.append(FileItem(filepath, md5hash.hexdigest(), False))

	dups_total = 0

	for i in range(0, len(filelist)):
		if filelist[i].dup:
			continue
		filehash = filelist[i].hash  # renamed: 'hash' shadowed the builtin
		same_files = [(filelist[i].path, os.stat(filelist[i].path).st_mtime)]
		for j in range(i + 1, len(filelist)):
			if filelist[j].hash == filehash:
				same_files.append((filelist[j].path, os.stat(filelist[j].path).st_mtime))
				filelist[j] = FileItem(None, None, True)

		if len(same_files) > 1:
			# Keep the copy with the earliest modification time.
			keep = min(range(len(same_files)), key=lambda k: same_files[k][1])

			# 'k' instead of 'i': the original reused the outer loop variable.
			for k in range(0, len(same_files)):
				if k != keep:
					dups_total += 1
					print('deleting %s'%same_files[k][0])
					try:
						os.remove(same_files[k][0])
					except OSError as e:
						print(str(e))  # bug fix: e.message does not exist in Python 3

					dbm.query('UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''%same_files[k][0])

	dbm.commit()
	dbm.disconnect()

	print('%d duplicate images deleted.'%dups_total)
Пример #2
0
class ImageStoreMiddleware(object):
    """Downloader middleware that drops requests whose URL is already
    recorded in the images database."""

    def __init__(self):
        # Without a database file the middleware degrades to a no-op.
        if not exists(settings.IMAGES_DB):
            self._nodb = True
        else:
            self._dbm = DBManager(settings.IMAGES_DB)
            self._dbm.connect()
            self._nodb = False

    def find_url(self, url):
        """Return True when *url* already has a row in the images table."""
        rows = self._dbm.query("SELECT * FROM images WHERE url = '%s'" % url)
        return len(rows) > 0

    def process_request(self, request, spider):
        """Ignore a previously-stored URL; let everything else through."""
        if not self._nodb and self.find_url(request.url):
            raise IgnoreRequest
        return None

    def __del__(self):
        # Intentionally no cleanup; the DB handle lives for the process.
        pass
Пример #3
0
class ImageStoreMiddleware(object):
	"""Downloader middleware that skips requests for URLs already stored
	in the images database."""

	def __init__(self):
		if exists(settings.IMAGES_DB):
			self._dbm = DBManager(settings.IMAGES_DB)
			self._dbm.connect()
			self._nodb = False
		else:
			# No database file: middleware becomes a no-op.
			self._nodb = True


	def find_url(self, url):
		"""Return True when *url* already has a row in the images table."""
		result = self._dbm.query("SELECT * FROM images WHERE url = '%s'"%url)
		return len(result) > 0


	def process_request(self, request, spider):
		"""Raise IgnoreRequest for URLs seen before; return None otherwise."""
		if self._nodb:
			return None

		if self.find_url(request.url):
			raise IgnoreRequest

		return None


	def __del__(self):
		# Bug fix: the attribute is '_dbm' (not '_db'), and it only exists
		# when the DB file was present at construction time.
		if not getattr(self, '_nodb', True):
			self._dbm.disconnect()
Пример #4
0
from six.moves.urllib.parse import unquote

from imagebot.dbmanager import DBManager

# Indices of URL path components used to build each image's sort path.
sort_parts = [1, 2, 3]
job = 'default'

# Optional CLI override: <job> <comma-separated component indices>
if len(sys.argv) >= 3:
    job = sys.argv[1]
    sort_parts = [int(token) for token in sys.argv[2].split(',')]

db = DBManager('../images.db')
db.connect()

result = db.query("SELECT url FROM images WHERE job = '%s' LIMIT 100" % job)

for row in result:
    # Non-empty URL segments; then drop the scheme and the file name.
    segments = [part for part in row['url'].split('/') if part != '']
    del segments[0]
    del segments[-1]

    # Build a destination path under the job directory from the chosen parts.
    path = job
    for index in sort_parts:
        if index < len(segments):
            path = joinpath(path, segments[index])
Пример #5
0
def clear_duplicate_images(arg):
    """Delete duplicate image files downloaded for a scraping job.

    Walks the job's download directory, groups files by MD5 content hash,
    keeps the oldest copy (smallest mtime) of each group, removes the rest
    from disk and marks their DB rows with the sentinel path '#duplicate'.

    arg: name of the job whose images should be de-duplicated.
    """
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()

    jobname = arg
    # NOTE(review): the job name is interpolated straight into SQL; if
    # DBManager.query supports bound parameters, prefer them here.
    result = dbm.query('SELECT path FROM images WHERE job = \'%s\' LIMIT 1' %
                       jobname)
    if len(result) == 0:
        print('no such job')
        dbm.disconnect()
        return  # bug fix: original fell through and crashed on result[0]

    imagepath = result[0]['path']
    # Job root = the stored image path truncated just after the job name.
    jobpath = imagepath[0:imagepath.rfind(jobname) + len(jobname)]

    print(jobpath)  # bug fix: was a Python 2 print statement

    # Hash every file under the job directory by full content.
    filelist = []
    for root, dirs, files in os.walk(jobpath):
        for filename in files:
            md5hash = md5()
            filepath = joinpath(root, filename)
            with open(filepath, 'rb') as f:
                md5hash.update(f.read())
            filelist.append(FileItem(filepath, md5hash.hexdigest(), False))

    dups_total = 0

    for i in range(0, len(filelist)):
        if filelist[i].dup:
            continue
        filehash = filelist[i].hash  # renamed: 'hash' shadowed the builtin
        same_files = [(filelist[i].path, os.stat(filelist[i].path).st_mtime)]
        for j in range(i + 1, len(filelist)):
            if filelist[j].hash == filehash:
                same_files.append(
                    (filelist[j].path, os.stat(filelist[j].path).st_mtime))
                filelist[j] = FileItem(None, None, True)

        if len(same_files) > 1:
            # Keep the copy with the earliest modification time.
            keep = min(range(len(same_files)), key=lambda k: same_files[k][1])

            # 'k' instead of 'i': the original reused the outer loop variable.
            for k in range(0, len(same_files)):
                if k != keep:
                    dups_total += 1
                    print('deleting %s' % same_files[k][0])
                    try:
                        os.remove(same_files[k][0])
                    except OSError as e:
                        print(str(e))  # bug fix: e.message gone in Python 3

                    dbm.query(
                        'UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''
                        % same_files[k][0])

    dbm.commit()
    dbm.disconnect()

    print('%d duplicate images deleted.' % dups_total)
Пример #6
0
from imagebot.dbmanager import DBManager


# Indices of URL path components used to build each image's sort path.
sort_parts = [1, 2, 3]
job = 'default'

# Optional CLI override: <job> <comma-separated component indices>
if len(sys.argv) >= 3:
	job = sys.argv[1]
	sort_parts = [int(token) for token in sys.argv[2].split(',')]


db = DBManager('../images.db')
db.connect()

result = db.query("SELECT url FROM images WHERE job = '%s' LIMIT 100"%job)

for record in result:
	# Non-empty URL segments; then drop the scheme and the file name.
	segments = [part for part in record['url'].split('/') if part != '']
	del segments[0]
	del segments[-1]

	# Build a destination path under the job directory from the chosen parts.
	path = job
	for index in sort_parts:
		if index < len(segments):
			path = joinpath(path, segments[index])