Example #1
0
 def delete_from_db(condition=None):
     """Delete rows from the 'images' table.

     Args:
         condition: optional condition passed through to DBManager.delete;
             None presumably deletes all rows — TODO confirm against DBManager.

     Returns:
         The count reported by DBManager.delete.
     """
     dbm = DBManager(expanduser(settings.IMAGES_DB))
     dbm.connect()
     # Bug fix: disconnect even if delete/commit raises, so the db
     # connection is never leaked.
     try:
         count = dbm.delete('images', condition)
         dbm.commit()
     finally:
         dbm.disconnect()
     return count
Example #2
0
class ImageStoreMiddleware(object):
    """Downloader middleware that drops requests for image urls already
    recorded in the images db."""

    def __init__(self):
        # Without the db file there is nothing to dedupe against; remember
        # that and let every request through.
        if exists(settings.IMAGES_DB):
            self._dbm = DBManager(settings.IMAGES_DB)
            self._dbm.connect()
            self._nodb = False
        else:
            self._nodb = True

    def find_url(self, url):
        """Return True if `url` already has a row in the images table."""
        # NOTE(review): string-built SQL — injection-prone if the url
        # contains a quote; parameterize if DBManager.query supports it.
        result = self._dbm.query("SELECT * FROM images WHERE url = '%s'" % url)
        return len(result) > 0

    def process_request(self, request, spider):
        """Ignore requests whose url was already downloaded; pass the rest."""
        if self._nodb:
            return None

        if self.find_url(request.url):
            raise IgnoreRequest

        return None

    def __del__(self):
        # Intentionally no disconnect here (removed dead commented-out code
        # that referenced a nonexistent `_db` attribute).
        pass
Example #3
0
File: clear.py  Project: donilan/imagebot
	def delete_from_db(condition=None):
		"""Delete rows from the 'images' table.

		Args:
			condition: optional condition passed through to DBManager.delete;
				None presumably deletes all rows — TODO confirm against DBManager.

		Returns:
			The count reported by DBManager.delete.
		"""
		dbm = DBManager(expanduser(settings.IMAGES_DB))
		dbm.connect()
		# Bug fix: disconnect even if delete/commit raises, so the db
		# connection is never leaked.
		try:
			count = dbm.delete('images', condition)
			dbm.commit()
		finally:
			dbm.disconnect()
		return count
Example #4
0
class ImageStoreMiddleware(object):
	"""Downloader middleware that drops requests for image urls already
	recorded in the images db."""

	def __init__(self):
		# Without the db file there is nothing to dedupe against; remember
		# that and let every request through.
		if exists(settings.IMAGES_DB):
			self._dbm = DBManager(settings.IMAGES_DB)
			self._dbm.connect()
			self._nodb = False
		else:
			self._nodb = True


	def find_url(self, url):
		"""Return True if `url` already has a row in the images table."""
		# NOTE(review): string-built SQL — injection-prone if the url
		# contains a quote; parameterize if DBManager.query supports it.
		result = self._dbm.query("SELECT * FROM images WHERE url = '%s'"%url)
		return len(result) > 0


	def process_request(self, request, spider):
		"""Ignore requests whose url was already downloaded; pass the rest."""
		if self._nodb:
			return None

		if self.find_url(request.url):
			raise IgnoreRequest

		return None


	def __del__(self):
		# Bug fix: the attribute is `_dbm`, not `_db`, and it only exists
		# when the db file was found in __init__ (getattr also covers the
		# case where __init__ never ran).
		if not getattr(self, '_nodb', True):
			self._dbm.disconnect()
Example #5
0
File: clear.py  Project: donilan/imagebot
def clear_duplicate_images(arg):
	"""Delete duplicate image files (same MD5 hash) under a job's directory.

	The oldest copy (smallest mtime) of each duplicate set is kept; the rest
	are removed from disk and their db rows are marked '#duplicate'.

	Args:
		arg: the job name whose images should be deduplicated.
	"""
	dbm = DBManager(expanduser(settings.IMAGES_DB))
	dbm.connect()

	jobname = arg
	# NOTE(review): string-built SQL — parameterize if DBManager allows.
	result = dbm.query('SELECT path FROM images WHERE job = \'%s\' LIMIT 1'%jobname)
	if len(result) == 0:
		# Bug fix: bail out instead of indexing the empty result below.
		print('no such job')
		dbm.disconnect()
		return

	imagepath = result[0]['path']
	# Job root: everything up to and including the job name in the path.
	jobpath = imagepath[0:imagepath.rfind(jobname)+len(jobname)]

	print(jobpath)  # bug fix: py2 print statement -> function call

	# Hash every file under the job directory.
	filelist = []
	for root, dirs, files in os.walk(jobpath):
		for filename in files:
			md5hash = md5()
			filepath = joinpath(root, filename)
			with open(filepath, 'rb') as f:
				md5hash.update(f.read())
			filehash = md5hash.hexdigest()
			filelist.append(FileItem(filepath, filehash, False))

	dups_total = 0

	for i in range(0, len(filelist)):
		if filelist[i].dup:
			continue
		filehash = filelist[i].hash  # renamed: `hash` shadowed the builtin
		same_files = [(filelist[i].path, os.stat(filelist[i].path).st_mtime)]
		for j in range(i + 1, len(filelist)):
			if filelist[j].hash == filehash:
				same_files.append((filelist[j].path, os.stat(filelist[j].path).st_mtime))
				# Mark as handled so the outer loop skips it.
				filelist[j] = FileItem(None, None, True)

		if len(same_files) > 1:
			# Keep the entry with the smallest mtime (the oldest copy).
			min_mtime = sys.float_info.max
			keep = -1
			for k in range(0, len(same_files)):  # renamed: no longer shadows outer i
				if same_files[k][1] < min_mtime:
					min_mtime = same_files[k][1]
					keep = k

			for k in range(0, len(same_files)):
				if k != keep:
					dups_total += 1
					print('deleting %s'%same_files[k][0])
					try:
						os.remove(same_files[k][0])
					except OSError as e:
						# Bug fix: e.message is py2-only; print the error itself.
						print(e)

					dbm.query('UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''%same_files[k][0])

	dbm.commit()
	dbm.disconnect()

	print('%d duplicate images deleted.'%dups_total)
Example #6
0
def setup_db():
	"""Create the images db schema by running the bundled tables.sql script."""
	# Normalize '~' once so every later open uses the absolute path.
	settings.IMAGES_DB = expanduser(settings.IMAGES_DB)

	db = DBManager(settings.IMAGES_DB)
	db.connect()
	# Fix: disconnect even on error; also dropped the dead
	# `schema_script = None` pre-assignment.
	try:
		# tables.sql lives one directory level above this module.
		with open(joinpath(dirname(realpath(__file__)).rsplit(sep, 1)[0], 'tables.sql'), 'r') as f:
			schema_script = f.read()
		db.executescript(schema_script)
	finally:
		db.disconnect()
Example #7
0
File: init.py  Project: donilan/imagebot
def setup_db():
	"""Create the images db schema by running the bundled tables.sql script."""
	# Normalize '~' once so every later open uses the absolute path.
	settings.IMAGES_DB = expanduser(settings.IMAGES_DB)

	db = DBManager(settings.IMAGES_DB)
	db.connect()
	# Fix: disconnect even on error; also dropped the dead
	# `schema_script = None` pre-assignment.
	try:
		# tables.sql lives one directory level above this module.
		with open(joinpath(dirname(realpath(__file__)).rsplit(sep, 1)[0], 'tables.sql'), 'r') as f:
			schema_script = f.read()
		db.executescript(schema_script)
	finally:
		db.disconnect()
Example #8
0
import sys
from os.path import join as joinpath
from six.moves.urllib.parse import unquote

from imagebot.dbmanager import DBManager

# Which '/'-separated URL path segment indices to sort/group images by;
# defaults below, overridable from the command line.
sort_parts = [1, 2, 3]
job = 'default'

# Usage (presumably): sort.py <job> <comma-separated segment indices>
if len(sys.argv) >= 3:
    job = sys.argv[1]
    sort_parts = [int(i) for i in sys.argv[2].split(',')]
    #print sort_parts

# NOTE(review): relative db path — assumes the script is run from inside
# the package directory; verify against how this script is launched.
db = DBManager('../images.db')
db.connect()

result = db.query("SELECT url FROM images WHERE job = '%s' LIMIT 100" % job)

for r in result:
    url = r['url']
    #jobname = r['jobname']
    path = job

    # Split the URL into non-empty segments, then drop the first segment
    # (the scheme, e.g. 'http:') and the last (the filename).
    url_parts = [p for p in url.split('/') if p != '']
    del url_parts[0]
    del url_parts[-1]
    #print url_parts

    for i in sort_parts:
        # NOTE(review): this example is truncated here — the if-body is
        # missing from the scraped source.
        if i < len(url_parts):
Example #9
0
def clear_duplicate_images(arg):
    """Delete duplicate image files (same MD5 hash) under a job's directory.

    The oldest copy (smallest mtime) of each duplicate set is kept; the
    rest are removed from disk and their db rows are marked '#duplicate'.

    Args:
        arg: the job name whose images should be deduplicated.
    """
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()

    jobname = arg
    # NOTE(review): string-built SQL — parameterize if DBManager allows.
    result = dbm.query('SELECT path FROM images WHERE job = \'%s\' LIMIT 1' %
                       jobname)
    if len(result) == 0:
        # Bug fix: bail out instead of indexing the empty result below.
        print('no such job')
        dbm.disconnect()
        return

    imagepath = result[0]['path']
    # Job root: everything up to and including the job name in the path.
    jobpath = imagepath[0:imagepath.rfind(jobname) + len(jobname)]

    print(jobpath)  # bug fix: py2 print statement -> function call

    # Hash every file under the job directory.
    filelist = []
    for root, dirs, files in os.walk(jobpath):
        for filename in files:
            md5hash = md5()
            filepath = joinpath(root, filename)
            with open(filepath, 'rb') as f:
                md5hash.update(f.read())
            filehash = md5hash.hexdigest()
            filelist.append(FileItem(filepath, filehash, False))

    dups_total = 0

    for i in range(0, len(filelist)):
        if filelist[i].dup:
            continue
        filehash = filelist[i].hash  # renamed: `hash` shadowed the builtin
        same_files = [(filelist[i].path, os.stat(filelist[i].path).st_mtime)]
        for j in range(i + 1, len(filelist)):
            if filelist[j].hash == filehash:
                same_files.append(
                    (filelist[j].path, os.stat(filelist[j].path).st_mtime))
                # Mark as handled so the outer loop skips it.
                filelist[j] = FileItem(None, None, True)

        if len(same_files) > 1:
            # Keep the entry with the smallest mtime (the oldest copy).
            min_mtime = sys.float_info.max
            keep = -1
            for k in range(0, len(same_files)):  # renamed: no longer shadows i
                if same_files[k][1] < min_mtime:
                    min_mtime = same_files[k][1]
                    keep = k

            for k in range(0, len(same_files)):
                if k != keep:
                    dups_total += 1
                    print('deleting %s' % same_files[k][0])
                    try:
                        os.remove(same_files[k][0])
                    except OSError as e:
                        # Bug fix: e.message is py2-only; print the error.
                        print(e)

                    dbm.query(
                        'UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''
                        % same_files[k][0])

    dbm.commit()
    dbm.disconnect()

    print('%d duplicate images deleted.' % dups_total)
Example #10
0
class ImageStorePipeline(object):
    """Item pipeline that moves downloaded images into the final store
    path under the job name and records each image in the images db."""

    def __init__(self):
        if exists(settings.IMAGES_DB):
            self._dbm = DBManager(settings.IMAGES_DB)
            self._dbm.connect()
            self._nodb = False
            log.debug('opened db: %s' % settings.IMAGES_DB)
        else:
            # No db file: still move files, just skip the bookkeeping.
            self._nodb = True
            log.debug('could not open db: %s' % settings.IMAGES_DB)

    def process_item(self, item, spider):
        """Move each downloaded image to IMAGES_STORE_FINAL/<jobname> under
        a collision-free derived name and insert a db row per image."""
        if isinstance(item, ImageItem):
            images = item.get('images', None)
            final_storepath = joinpath(settings.IMAGES_STORE_FINAL,
                                       spider.jobname)

            if images:
                for d in item['images']:
                    # Fix: removed a dead `ext = ...` assignment here; it was
                    # always overwritten by get_filename's return value.
                    filebasename, ext = self.get_filename(d['url'])
                    final_path = joinpath(final_storepath,
                                          filebasename + '.' + ext)
                    # Append _00, _01, ... until the name is unused.
                    i = 0
                    while exists(final_path):
                        log.debug(final_path + ' exists')
                        final_path = joinpath(
                            final_storepath,
                            filebasename + '_%02d' % i + '.' + ext)
                        i += 1

                    try:
                        os.rename(joinpath(settings.IMAGES_STORE, d['path']),
                                  final_path)
                        log.debug('moved to: ' + final_path)
                        spider.update_monitor(final_path)
                        if not self._nodb:
                            self._dbm.insert('images',
                                             (d['url'], final_path,
                                              spider.jobname, int(time())))
                    except OSError as e:
                        log.error(e)

        if not self._nodb:
            self._dbm.commit()
        return item

    def get_filename(self, url):
        """Derive a (basename, extension) pair for `url`.

        Walks the url path backwards from the filename, splitting segments
        on '-', '.', '+' and '_', collecting words until at least one word
        longer than two characters is found; long mostly-numeric words are
        dropped.
        """
        url_parts = url.split('/')
        del url_parts[0:2]  # drop the scheme segment and the '' after '//'

        filename = url_parts.pop()
        ext = filename[filename.rfind('.') + 1:]
        filename = filename[0:filename.rfind('.')]
        url_parts.append(filename)

        part = filename
        words = []
        while not any([len(w) > 2 for w in words]) and len(url_parts) > 0:
            part = url_parts.pop()
            part = part.replace('-', '_').replace('.', '_').replace('+', '_')
            pwords = part.split('_')
            # Keep 1-2 char words as-is; keep longer words only when fewer
            # than half of their characters are ASCII digits.
            pwords = [
                w for w in pwords
                if (len(w) > 0 and len(w) <= 2) or (len(w) > 2 and (
                    len([c for c in w if
                         (ord(c) >= 48 and ord(c) <= 57)]) < len(w) / 2))
            ]
            words = pwords + words

        if len(words) > 0:
            final = '_'.join(words)
        else:
            # NOTE(review): trailing '.' makes callers build 'image..ext' —
            # looks unintended, but preserved for compatibility.
            final = 'image.'

        return final, ext

    def __del__(self):
        # Bug fix: guard the disconnect — _dbm does not exist when the db
        # file was missing in __init__ (and __init__ may not have run).
        if not getattr(self, '_nodb', True):
            self._dbm.disconnect()
Example #11
0
File: sort.py  Project: donilan/imagebot
from os.path import join as joinpath

from imagebot.dbmanager import DBManager


sort_parts = [1, 2, 3]
job = 'default'

if len(sys.argv) >= 3:
	job = sys.argv[1]
	sort_parts = [int(i) for i in sys.argv[2].split(',')]
	#print sort_parts


db = DBManager('../images.db')
db.connect()

result = db.query("SELECT url FROM images WHERE job = '%s' LIMIT 100"%job)

for r in result:
	url = r['url']
	#jobname = r['jobname']
	path = job

	url_parts = [p for p in url.split('/') if p != '']
	del url_parts[0]
	del url_parts[-1]
	#print url_parts

	for i in sort_parts:
		if i < len(url_parts):