def __init__(self, path):
    """Open an ORM connection and a session bound to it for *path*.

    :param path: database location; it is interpolated after ``DB_SCHEME``
        to build the connection URL ``<DB_SCHEME>:///<path>``.
    """
    self.__path = path
    """:type: str"""

    # Query echoing is controlled by the LOG_DATABASE_QUERIES config flag.
    connection_url = "%s:///%s" % (DB_SCHEME, path)
    self.__connection = orm.Connection(
        connection_url,
        echo=config.LOG_DATABASE_QUERIES,
    )
    self.__session = orm.Session(bind=self.__connection)

    self.__is_closed = False

    # No parser is attached at construction time.
    self.__parser = None
    """:type: database.dbparser.GenericParser or None"""
def process_source(source_name, posts_url=None, params=None, url_format=None):
    """Page through a Danbooru-style posts listing and store unseen posts.

    Pages are requested from ``posts_url`` until one of the following
    happens: ``params['page']`` reaches 5000, the server answers with a
    status other than 200 or 421, an empty page is returned, or a post that
    is already stored for ``source_name`` is encountered.  A 421 response
    makes the function wait 60 seconds and retry the same page.

    :param source_name: name recorded on every stored image.  The value
        ``u'danbooru'`` is parsed as a native response; any other value is
        parsed as a fork using ``url_format``.
    :param posts_url: URL of the posts listing endpoint.
    :param params: query-string parameters for the listing request.  The
        ``'page'`` entry is advanced in place, so the caller can see how far
        the crawl got.  ``None`` is treated as an empty dict.
    :param url_format: forwarded as ``fork_url_format`` for fork sources.
    """
    if params is None:
        params = {}
    # NOTE(review): the first page is assumed to be 1 when the caller does
    # not supply one -- confirm against the API being crawled.
    params.setdefault('page', 1)

    session = orm.Session()
    try:
        found_existing = False
        while params['page'] < 5000 and not found_existing:
            response = requests.get(posts_url, params=params)
            if response.status_code == 421:
                time.sleep(60)
                continue
            if response.status_code != 200:
                print("FACK GOT A %d CODE" % response.status_code)
                break

            images = response.json()
            if not images:
                print("No images, done.")
                break

            for image in images:
                if 'file_url' not in image:
                    continue

                exists_query = session.query(orm.Image).filter(
                    orm.Image.remote_id == image['id'],
                    orm.Image.source_name == source_name,
                ).exists()
                if session.query(exists_query).scalar():
                    # Everything after this post is taken to be stored
                    # already, so the crawl stops after this page.
                    print("FOUND EXISTING POST")
                    found_existing = True
                    break

                try:
                    if source_name == u'danbooru':
                        new_img, new_tags = orm.Image.from_danbooru_response(
                            image, fork=False)
                    else:
                        new_img, new_tags = orm.Image.from_danbooru_response(
                            image,
                            fork=True,
                            fork_url_format=url_format,
                            fork_name=source_name,
                        )
                    if not new_img:
                        continue
                    new_img.source_name = source_name

                    session.merge(new_img)
                    for tag in new_tags:
                        session.merge(tag)
                    session.commit()
                except Exception as exc:
                    # Best effort per post: discard the failed transaction so
                    # the session stays usable, report it, and move on.
                    session.rollback()
                    print("Failed to store post %r from %s: %r"
                          % (image['id'], source_name, exc))

            params['page'] += 1
    finally:
        session.close()
def process_images():
    """Run ``process_wrap`` over every unfetched awwnime/danbooru image.

    Selects ``(id, full_url, thumb_url, remote_url)`` for each image whose
    ``fetched`` column is 0 and whose source is ``awwnime`` or ``danbooru``,
    then hands the rows to a pool of 10 worker processes.
    """
    import multiprocessing
    import orm

    session = orm.Session()
    pending = session.query(
        orm.Image.id,
        orm.Image.full_url,
        orm.Image.thumb_url,
        orm.Image.remote_url,
    ).filter(
        orm.Image.fetched == 0,
        orm.Image.source_name.in_((u'awwnime', u'danbooru')),
    )

    pool = multiprocessing.Pool(10)
    try:
        pool.map(process_wrap, list(pending))
    finally:
        # Always shut the workers down, even when a task raised.
        pool.close()
        pool.join()
from flask import Flask, send_from_directory, request, jsonify
import orm
from datetime import datetime, timedelta
from time import time
import conf

app = Flask(__name__)
# One ORM session created at import time and used by the handlers below.
session = orm.Session()


@app.route('/')
def index():
    """Serve the static ``index.html`` file."""
    return app.send_static_file('index.html')


@app.route('/items')
def page():
    """Start building the image listing query for ``/items``.

    Query-string arguments read here:

    * ``before`` -- integer timestamp, converted to a ``datetime``; defaults
      to the current time.
    * ``limit`` -- integer; defaults to 40.
    * ``keyword`` -- comma-separated string, split into a list.  When the
      argument is absent the list is ``['']``, not an empty list.

    NOTE(review): no ``return`` is visible in this excerpt; the handler
    presumably continues (filtering and serialising the query) beyond it.
    """
    before = datetime.fromtimestamp(int(request.args.get('before', time())))
    limit = int(request.args.get('limit', 40))
    keywords = request.args.get('keyword', '').split(',')
    # Select individual columns rather than whole Image entities.
    query = session.query(
        orm.Image.id,
        orm.Image.thumb_url,
        orm.Image.title,
        orm.Image.date,
        orm.Image.source_name,
        orm.Image.url,
        orm.Image.atags,
    )
def _to_signed64(value):
    """Map an unsigned 64-bit integer onto the signed 64-bit range."""
    if value >= 2 ** 63:
        return value - 2 ** 64
    return value


def process_image(image_id, full_url, thumb_url, source_url):
    """Download one image, upload it and a WebP thumbnail, and record hashes.

    Steps:

    1. Fetch ``source_url``.  On a non-200 response the image row is marked
       ``fetched = -1`` and the function returns.
    2. Compute the average, perceptual and difference hashes of the image
       and build a thumbnail bounded by 640x640, saved as WebP.
    3. Upload the original and the thumbnail to the ``gs`` buckets named by
       ``conf.fullbucket`` and ``conf.thumbbucket``.
    4. Store the hashes (plus ``bmbhash`` when it can be computed), mark the
       row ``fetched = 1`` with its size, and commit.

    :param image_id: primary key of the ``orm.Image`` row being processed.
    :param full_url: only its last path segment is used, as the key name in
        the full-size bucket.
    :param thumb_url: only its last path segment is used, as the key name in
        the thumbnail bucket.
    :param source_url: URL the image is downloaded from.
    """
    # Imports are kept local to the function, in their original order.
    import requests
    import orm
    import boto
    import gcs_oauth2_boto_plugin
    import tempfile
    import mimetypes
    import conf
    from PIL import Image as pimage
    from PIL import ImageFile
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    import imagehash
    from hashtest import hash_image

    session = orm.Session()
    gcs_oauth2_boto_plugin.SetFallbackClientIdAndSecret(
        conf.client_id, conf.client_secret)
    fullbucket = boto.storage_uri(conf.fullbucket, 'gs').get_bucket()
    thumbbucket = boto.storage_uri(conf.thumbbucket, 'gs').get_bucket()

    # Fetch images
    print("%d: Starting" % image_id)
    response = requests.get(source_url, stream=True)
    if response.status_code != 200:
        session.query(orm.Image).filter(orm.Image.id == image_id).update(
            {'fetched': -1})
        session.commit()
        return

    # The context managers guarantee both temp files are removed even when
    # decoding, uploading or the database work raises.
    with tempfile.NamedTemporaryFile() as fulltemp:
        with tempfile.NamedTemporaryFile() as thumbtemp:
            for block in response.iter_content(4096):
                fulltemp.write(block)
            downloaded_size = fulltemp.tell()
            fulltemp.seek(0)

            himg = pimage.open(fulltemp)
            raw_hashes = [
                (u'ahash', imagehash.average_hash(himg)),
                (u'phash', imagehash.phash(himg)),
                (u'dhash', imagehash.dhash(himg)),
            ]
            # The hashes print as hex strings; they are stored as signed
            # 64-bit integers (presumably to fit the column type -- confirm).
            hashes = [
                (name, _to_signed64(int(str(value), base=16)))
                for name, value in raw_hashes
            ]

            # Save images, make thumb
            himg.thumbnail((640, 640))
            himg.convert("RGB").save(thumbtemp, format='WebP')
            del himg

            # Upload
            fulltemp.seek(0)
            thumbtemp.seek(0)
            fullkey = fullbucket.new_key(full_url.split('/')[-1])
            thumbkey = thumbbucket.new_key(thumb_url.split('/')[-1])

            # Fall back to a guess from the file name when the server sent
            # no Content-Type header instead of failing with KeyError.
            content_type = response.headers.get('content-type')
            if not content_type:
                content_type = (mimetypes.guess_type(full_url)[0]
                                or 'application/octet-stream')
            meta = {
                'Cache-Control': 'public, max-age=3600',
                'Content-Type': content_type,
            }
            fullkey.set_contents_from_file(fulltemp, headers=meta)
            print("%d: Uploaded full" % image_id)
            meta['Content-Type'] = 'image/webp'
            thumbkey.set_contents_from_file(thumbtemp, headers=meta)
            print("%d: Uploaded thumb" % image_id)

            # bmbhash is best effort: a failure here must not stop the other
            # hashes from being stored, but interpreter-exit signals such as
            # KeyboardInterrupt are no longer swallowed.
            try:
                bmbhash = hash_image(fulltemp.name)
                session.add(orm.Hash(name=u'bmbhash', value=bmbhash,
                                     image_id=image_id))
            except Exception as exc:
                print("%d: bmbhash skipped: %r" % (image_id, exc))

            for name, value in hashes:
                session.add(orm.Hash(name=name, value=value,
                                     image_id=image_id))

            # Use the number of bytes actually written when the server did
            # not send Content-Length (e.g. chunked transfer).
            size = int(response.headers.get('content-length',
                                            downloaded_size))
            session.query(orm.Image).filter(orm.Image.id == image_id).update({
                'fetched': 1,
                'size': size,
            })
            session.commit()
def item_passed(self, item, spider, output):
    """Persist a scraped item through the ORM.

    The ORM class is chosen from the type of ``output``:

    * ``RentalItem`` -> ``orm.Rental``
    * ``SaleItem`` / ``ListingItem`` -> ``orm.Listing``
    * ``PersonItem`` -> ``orm.Person``, linked to the ``orm.TCAD_2010`` row
      named by ``prop_ref``; an existing person with the same name, city,
      state and zipcode is reused by id.
    * ``TCADParcelItem`` -> ``orm.TCAD_2010`` together with its
      improvements, segments and value history, but only when exactly one
      stored parcel matches ``prop_id``.

    Any other type raises ``orm.Fail`` internally, which is logged.  When an
    object was built, its ``last_crawl`` is stamped and it is merged and
    committed.

    :param item: not used by this method.
    :param spider: not used by this method.
    :param output: the scraped item; its fields are passed as keyword
        arguments to the ORM constructors.
    """
    session = orm.Session()
    obj = None
    try:
        if isinstance(output, items.RentalItem):
            obj = orm.Rental(**output)
        elif isinstance(output, (items.SaleItem, items.ListingItem)):
            obj = orm.Listing(**output)
        elif isinstance(output, items.PersonItem):
            # Extra constructor arguments that are only sometimes known.
            # birth_year used to be unbound when 'age' was missing or not a
            # single value, which raised UnboundLocalError below.
            person_extra = {}
            if 'age' in output.keys() and len(output['age']) == 1:
                person_extra['birth_year'] = (
                    datetime.now()
                    - timedelta(days=365 * int(output['age'][0]))).year
                del output['age']

            # NOTE(review): Query.get() returns None for an unknown key, in
            # which case `prop.person = obj` below raises AttributeError --
            # confirm whether that case can occur.
            prop = session.query(orm.TCAD_2010).get(
                int(output['prop_ref'][0]))
            del output['prop_ref']

            ref = session.query(orm.Person).filter(
                orm.Person.first_name == output['first_name'][0]
            ).filter(
                orm.Person.last_name == output['last_name'][0]
            ).filter(
                orm.Person.city == output['city'][0]
            ).filter(
                orm.Person.state == output['state'][0]
            ).filter(
                orm.Person.zipcode == output['zipcode'][0]
            ).first()
            if ref:
                # Reuse the stored person's id so merge() updates that row.
                person_extra['id'] = ref.id

            person_fields = dict(output)
            person_fields.update(person_extra)
            obj = orm.Person(**person_fields)
            prop.person = obj
            session.merge(prop)
        elif isinstance(output, items.TCADParcelItem):
            if 'prop_id' in output.keys():
                parcel = session.query(orm.TCAD_2010).filter(
                    orm.TCAD_2010.prop_id == int(output['prop_id'][0]))
                if parcel.count() == 1:
                    try:
                        improvements = output['improvements']
                        del (output['improvements'])
                        for i in improvements:
                            imp = orm.TCADImprovement(
                                parcel=parcel.first(), **i)
                            session.merge(imp)
                    except KeyError:
                        print("No Improvements Found")
                    except sqlalchemy.exc.IntegrityError:
                        print("Improvements already Processed")
                        session.rollback()

                    try:
                        segments = output['segments']
                        del (output['segments'])
                        for i in segments:
                            seg = orm.TCADSegment(**i)
                            session.merge(seg)
                    except KeyError:
                        print("No Segments Found")

                    historical_values = output['historical_values']
                    del (output['historical_values'])
                    obj = orm.TCAD_2010(objectid=parcel.first().objectid,
                                        **output)
                    # The history rows are only constructed here, each with
                    # parcel=obj; they are not merged individually.
                    for i in historical_values:
                        orm.TCADValueHistory(parcel=obj, **i)
                else:
                    print("duplicate / missing prop_id - not inserted")
            else:
                print("prop_id not found")
        else:
            raise orm.Fail('unknown data type')
    except orm.Fail:
        log.msg("SQL handling failed")
    else:
        if obj:
            obj.last_crawl = datetime.now()
            session.merge(obj)
            session.commit()
        else:
            print("Duplicate handled")