def search_naa():
    """Run a name search through RSSearchClient and return it as JSON.

    Every query-string parameter of the current request is forwarded to
    ``RSSearchClient.search_names()`` as a keyword argument. ``page``
    falls back to 1 when the request does not supply it, and ``sort`` is
    always overridden with 3. The search results are handed to
    ``jsonify`` and that response is returned.
    """
    # The client is created before the request is read, as in the original.
    client = RSSearchClient()
    params = request.args.to_dict()
    params.setdefault('page', 1)
    params['sort'] = 3
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(params)
    return jsonify(client.search_names(**params))
def __init__(self, series, control=None):
    """Set up harvest state for ``series`` and run the initial search.

    Args:
        series: Series identifier stored on the instance and used by the
            harvest methods.
        control: Optional control value stored on the instance;
            defaults to None.

    The instance gets a fresh ``RSSearchClient``, then
    ``prepare_harvest()`` is called, and finally the ``items``
    collection of the database returned by ``get_db()`` is kept as
    ``self.items``.
    """
    self.series, self.control = series, control
    self.total_pages, self.pages_complete = None, 0
    self.client = RSSearchClient()
    # prepare_harvest() runs before the database handle is obtained,
    # matching the original statement order.
    self.prepare_harvest()
    self.items = self.get_db().items
class SeriesHarvester():
    """Harvest item records, and their page images, for one series.

    Search results come from an ``RSSearchClient`` and are stored in the
    ``items`` collection of the default database at ``MONGOLAB_URL``.
    ``RSSearchClient``, ``MongoClient``, ``MONGOLAB_URL``, ``IMAGES_DIR``
    and ``IMAGE_SIZES`` are expected to be provided by the enclosing module.

    Attributes set by ``__init__``:
        series: the series identifier passed in.
        control: optional control value that narrows the search.
        total_pages: number of result pages, set by prepare_harvest().
        pages_complete: count of pages processed by start_harvest().
        client: the RSSearchClient used for every search.
        items: the ``items`` collection results are inserted into.
    """

    def __init__(self, series, control=None):
        """Store the search parameters and run the initial search.

        Args:
            series: Series identifier to harvest.
            control: Optional control value; when truthy it is passed to
                every search.
        """
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        db = self.get_db()
        self.items = db.items

    def get_db(self):
        """Open a new MongoClient and return its default database."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        # items = db.items
        # items.remove()
        return db

    def get_total(self):
        """Return the result total reported by the search client."""
        return self.client.total_results

    def get_db_total(self):
        """Return how many stored items belong to this series."""
        return self.items.find({'series': self.series}).count()

    def prepare_harvest(self):
        """Run an initial search and work out how many pages to harvest.

        Sets ``self.total_pages`` to the number of pages needed to hold
        the client's ``total_results`` at ``results_per_page`` per page.
        """
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: the previous "total / per_page + 1" counted one
        # page too many on exact multiples (and one page for zero results),
        # and yields a float on Python 3.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print(self.total_pages)

    def start_harvest(self, page=None):
        """Fetch result pages and insert their records into ``self.items``.

        Args:
            page: 1-based page to start from. When falsy, harvesting
                continues from the page after ``self.pages_complete``.

        Pauses one second after each page.
        """
        if not page:
            page = self.pages_complete + 1
        # Loop on the page number so that an explicit starting page still
        # stops at the last page rather than running total_pages requests.
        while page <= self.total_pages:
            if self.control:
                response = self.client.search(
                    series=self.series, page=page, control=self.control)
            else:
                # NOTE(review): sort='9' is only sent when no control is
                # given; the meaning of the code is not visible here.
                response = self.client.search(
                    series=self.series, page=page, sort='9')
            results = response['results']
            # insert_many() rejects an empty document list.
            if results:
                self.items.insert_many(results)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            time.sleep(1)

    def harvest_images(self):
        """Download page images for every digitised item in the series.

        For each stored item with ``digitised_status`` True, each page
        from 1 to ``digitised_pages`` is fetched unless its file already
        exists under ``IMAGES_DIR``. Each saved image gets a metadata
        document in the ``images`` collection, one square thumbnail per
        entry of ``IMAGE_SIZES``, and a thumbnail bounded to 200x200.

        Raises:
            requests.HTTPError: if an image request returns an error status.
        """
        db = self.get_db()
        items = db.items.find({
            'series': self.series,
            'digitised_status': True
        })
        images = db.images
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join(
                IMAGES_DIR,
                '{}/{}-[{}]'.format(
                    self.series.replace('/', '-'),
                    item['control_symbol'].replace('/', '-'),
                    item['identifier']))
            thumbs_dir = os.path.join(directory, 'thumbs')
            # Checked on its own so an item directory left without its
            # thumbs folder is repaired; makedirs creates the parent too.
            if not os.path.exists(thumbs_dir):
                os.makedirs(thumbs_dir)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(
                    directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if os.path.exists(filename):
                    continue
                img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(
                    item['identifier'], page)
                response = requests.get(img_url, headers=headers, stream=True)
                response.raise_for_status()
                try:
                    image = Image.open(StringIO(response.content))
                except IOError:
                    # Narrowed from a bare except so Ctrl-C and programming
                    # errors are no longer reported as "Not an image".
                    print('Not an image')
                else:
                    width, height = image.size
                    image.save(filename)
                    del response
                    image_meta = {
                        'item_id': item['_id'],
                        'identifier': item['identifier'],
                        'page': page,
                        'width': width,
                        'height': height
                    }
                    # insert_one replaces the deprecated Collection.save;
                    # the document has no _id, so both perform an insert.
                    images.insert_one(image_meta)
                    print('Image saved')
                    for size in IMAGE_SIZES:
                        thumb_file = '{}/thumbs/{}-p{}-{}-sq.jpg'.format(
                            directory, item['identifier'], page, size[0])
                        thumb_image = ImageOps.fit(image, size, Image.ANTIALIAS)
                        thumb_image.save(thumb_file)
                    # NOTE(review): original indentation was lost; the 200px
                    # thumbnail is assumed to be made once per image, after
                    # the square thumbnails.
                    thumb_file = '{}/thumbs/{}-p{}-200.jpg'.format(
                        directory, item['identifier'], page)
                    thumb_image = image.copy()
                    thumb_image.thumbnail((200, 200))
                    thumb_image.save(thumb_file)
                    image.close()
                    thumb_image.close()
                # NOTE(review): original indentation was lost; the pause is
                # assumed to follow every download attempt.
                time.sleep(5)
class SeriesHarvester():
    """Harvest item records, and their page images, for one series.

    Search results come from an ``RSSearchClient`` and are stored in the
    ``items`` collection of the default database at ``MONGOLAB_URL``.
    ``RSSearchClient``, ``MongoClient``, ``MONGOLAB_URL``, ``IMAGES_DIR``
    and ``IMAGE_SIZES`` are expected to be provided by the enclosing module.

    Attributes set by ``__init__``:
        series: the series identifier passed in.
        control: optional control value that narrows the search.
        total_pages: number of result pages, set by prepare_harvest().
        pages_complete: count of pages processed by start_harvest().
        client: the RSSearchClient used for every search.
        items: the ``items`` collection results are inserted into.
    """

    def __init__(self, series, control=None):
        """Store the search parameters and run the initial search.

        Args:
            series: Series identifier to harvest.
            control: Optional control value; when truthy it is passed to
                every search.
        """
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        db = self.get_db()
        self.items = db.items

    def get_db(self):
        """Open a new MongoClient and return its default database."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        # items = db.items
        # items.remove()
        return db

    def get_total(self):
        """Return the result total reported by the search client."""
        return self.client.total_results

    def get_db_total(self):
        """Return how many stored items belong to this series."""
        return self.items.find({'series': self.series}).count()

    def prepare_harvest(self):
        """Run an initial search and work out how many pages to harvest.

        Sets ``self.total_pages`` to the number of pages needed to hold
        the client's ``total_results`` at ``results_per_page`` per page.
        """
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: the previous "total / per_page + 1" counted one
        # page too many on exact multiples (and one page for zero results),
        # and yields a float on Python 3.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print(self.total_pages)

    def start_harvest(self, page=None):
        """Fetch result pages and insert their records into ``self.items``.

        Args:
            page: 1-based page to start from. When falsy, harvesting
                continues from the page after ``self.pages_complete``.

        Pauses one second after each page.
        """
        if not page:
            page = self.pages_complete + 1
        # Loop on the page number so that an explicit starting page still
        # stops at the last page rather than running total_pages requests.
        while page <= self.total_pages:
            if self.control:
                response = self.client.search(
                    series=self.series, page=page, control=self.control)
            else:
                # NOTE(review): sort='9' is only sent when no control is
                # given; the meaning of the code is not visible here.
                response = self.client.search(
                    series=self.series, page=page, sort='9')
            results = response['results']
            # insert_many() rejects an empty document list.
            if results:
                self.items.insert_many(results)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            time.sleep(1)

    def harvest_images(self):
        """Download page images for every digitised item in the series.

        For each stored item with ``digitised_status`` True, each page
        from 1 to ``digitised_pages`` is fetched unless its file already
        exists under ``IMAGES_DIR``. Each saved image gets a metadata
        document in the ``images`` collection, one square thumbnail per
        entry of ``IMAGE_SIZES``, and a thumbnail bounded to 200x200.

        Raises:
            requests.HTTPError: if an image request returns an error status.
        """
        db = self.get_db()
        items = db.items.find({'series': self.series, 'digitised_status': True})
        images = db.images
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join(
                IMAGES_DIR,
                '{}/{}-[{}]'.format(
                    self.series.replace('/', '-'),
                    item['control_symbol'].replace('/', '-'),
                    item['identifier']))
            thumbs_dir = os.path.join(directory, 'thumbs')
            # Checked on its own so an item directory left without its
            # thumbs folder is repaired; makedirs creates the parent too.
            if not os.path.exists(thumbs_dir):
                os.makedirs(thumbs_dir)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(
                    directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if os.path.exists(filename):
                    continue
                img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(
                    item['identifier'], page)
                response = requests.get(img_url, headers=headers, stream=True)
                response.raise_for_status()
                try:
                    image = Image.open(StringIO(response.content))
                except IOError:
                    # Narrowed from a bare except so Ctrl-C and programming
                    # errors are no longer reported as "Not an image".
                    print('Not an image')
                else:
                    width, height = image.size
                    image.save(filename)
                    del response
                    image_meta = {
                        'item_id': item['_id'],
                        'identifier': item['identifier'],
                        'page': page,
                        'width': width,
                        'height': height
                    }
                    # insert_one replaces the deprecated Collection.save;
                    # the document has no _id, so both perform an insert.
                    images.insert_one(image_meta)
                    print('Image saved')
                    for size in IMAGE_SIZES:
                        thumb_file = '{}/thumbs/{}-p{}-{}-sq.jpg'.format(
                            directory, item['identifier'], page, size[0])
                        thumb_image = ImageOps.fit(image, size, Image.ANTIALIAS)
                        thumb_image.save(thumb_file)
                    # NOTE(review): original indentation was lost; the 200px
                    # thumbnail is assumed to be made once per image, after
                    # the square thumbnails.
                    thumb_file = '{}/thumbs/{}-p{}-200.jpg'.format(
                        directory, item['identifier'], page)
                    thumb_image = image.copy()
                    thumb_image.thumbnail((200, 200))
                    thumb_image.save(thumb_file)
                    image.close()
                    thumb_image.close()
                # NOTE(review): original indentation was lost; the pause is
                # assumed to follow every download attempt.
                time.sleep(5)