def post(self):
    """Create a FeaturedArtwork from the posted JSON blob and crop rectangle."""
    artwork = json.loads(self.request.get('json'))
    crop = tuple(float(v) for v in json.loads(self.request.get('crop')))
    # publishDate arrives as epoch milliseconds; only the calendar date is stored.
    publish_date = datetime.datetime.utcfromtimestamp(
        artwork['publishDate'] / 1000).date()
    # Descriptive name handed to the image processor.
    asset_name = publish_date.strftime('%Y%m%d') + ' ' + artwork['title'] \
        + ' ' + artwork['byline']
    image_url, thumb_url = maybe_process_image(
        artwork['imageUri'], crop, asset_name)
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not thumb_url and 'thumbUri' in artwork:
        thumb_url = artwork['thumbUri']
    FeaturedArtwork(
        title=artwork['title'],
        byline=artwork['byline'],
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=artwork['detailsUri'],
        publish_date=publish_date).save()
    self.response.set_status(200)
def post(self):
    """Create a FeaturedArtwork for the posted JSON; reject duplicate dates.

    Aborts with 409 when an artwork already exists for the publish date.
    """
    artwork_json = json.loads(self.request.get('json'))
    publish_date = (datetime.datetime.utcfromtimestamp(
        artwork_json['publishDate'] / 1000).date())
    # FIX: GAE db filter strings are '<property> <operator>'; the original
    # 'publish_date=' (no space) does not match the documented format.
    # Also prefer `is not None` over `!= None`.
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
        webapp2.abort(409, message='Artwork already exists for this date.')
    crop_tuple = tuple(
        float(x) for x in json.loads(self.request.get('crop')))
    new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        publish_date.strftime('%Y%m%d') + ' ' + artwork_json['title']
        + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    new_artwork = FeaturedArtwork(
        title=artwork_json['title'],
        byline=artwork_json['byline'],
        # Optional field; dict.get returns None when absent.
        attribution=artwork_json.get('attribution'),
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def process_html(self, url, html):
    """Parse a scraped artwork page and store it as a FeaturedArtwork.

    Writes HTTP 500 when required fields cannot be parsed; 200 on success.
    """
    soup = BeautifulSoup(html)
    # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
    # the original passed re.I | re.S (== 18) as a replacement count.
    details_url = re.sub(r"#.+", "", url, flags=re.I | re.S) \
        + "?utm_source=Muzei&utm_campaign=Muzei"
    title = soup.select("h1 span")[0].get_text()
    author = soup.find(itemprop="author").get_text()
    completion_year_el = soup.find(itemprop="dateCreated")
    byline = author + ((", " + completion_year_el.get_text())
                       if completion_year_el else "")
    image_url = soup.find(id="paintingImage")["href"]
    if not title or not author or not image_url:
        self.response.out.write("Could not parse HTML")
        self.response.set_status(500)
        return
    # publishDate arrives as epoch milliseconds.
    publish_date = datetime.datetime.utcfromtimestamp(
        int(self.request.get("publishDate")) / 1000).date()
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime("%Y%m%d") + " " + title + " " + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def process_html(self, url, html):
    """Parse a scraped artwork page and store it as a FeaturedArtwork.

    Writes HTTP 500 when required fields cannot be parsed; 200 on success.
    """
    soup = BeautifulSoup(html)
    # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
    # the original passed re.I | re.S (== 18) as a replacement count.
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) \
        + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text())
                       if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime.utcfromtimestamp(
        int(self.request.get('publishDate')) / 1000).date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def post(self):
    """Create a FeaturedArtwork for the posted JSON; reject duplicate dates.

    Aborts with 409 when an artwork already exists for the publish date.
    """
    artwork_json = json.loads(self.request.get("json"))
    publish_date = datetime.datetime.utcfromtimestamp(
        artwork_json["publishDate"] / 1000).date()
    # FIX: GAE db filter strings are '<property> <operator>'; the original
    # "publish_date=" (no space) does not match the documented format.
    # Also prefer `is not None` over `!= None`.
    if FeaturedArtwork.all().filter("publish_date =", publish_date).get() is not None:
        webapp2.abort(409, message="Artwork already exists for this date.")
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
    new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
        artwork_json["imageUri"],
        crop_tuple,
        publish_date.strftime("%Y%m%d") + " " + artwork_json["title"]
        + " " + artwork_json["byline"])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and "thumbUri" in artwork_json:
        new_thumb_url = artwork_json["thumbUri"]
    new_artwork = FeaturedArtwork(
        title=artwork_json["title"],
        byline=artwork_json["byline"],
        # Optional field; dict.get returns None when absent.
        attribution=artwork_json.get("attribution"),
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json["detailsUri"],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def post(self):
    """Create a FeaturedArtwork for the posted JSON; reject duplicate dates.

    Aborts with 409 when an artwork already exists for the publish date.
    """
    artwork_json = json.loads(self.request.get('json'))
    publish_date = (datetime.datetime
        .utcfromtimestamp(artwork_json['publishDate'] / 1000)
        .date())
    # FIX: GAE db filter strings are '<property> <operator>'; the original
    # 'publish_date=' (no space) does not match the documented format.
    # Also prefer `is not None` over `!= None`.
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
        webapp2.abort(409, message='Artwork already exists for this date.')
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        publish_date.strftime('%Y%m%d') + ' ' + artwork_json['title']
        + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    new_artwork = FeaturedArtwork(
        title=artwork_json['title'],
        byline=artwork_json['byline'],
        # Optional field; dict.get returns None when absent.
        attribution=artwork_json.get('attribution'),
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def get(self):
    """Schedule random artworks from lt-artworks.json onto upcoming empty dates.

    Supports ?dry-run=true (no datastore writes) and ?output=html
    (renders the chosen artworks via get_html).
    """
    ARTWORKS = json.loads(open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')).read())
    # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS)
    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))
    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)
    # List target dates that we want artwork for, but for which no artwork exists
    # (range starts at -1 so yesterday's slot is backfilled too)
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]
    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)
    logging.debug('starting blacklist size: %d' % len(blacklist))
    chosen_artworks = []
    for target_date in target_dates:
        # Pick from available artworks, excluding artwork in the blacklist.
        # Rejection-sample; the guard below prevents an infinite loop once
        # the candidate pool is exhausted.
        random_artwork = None
        while True:
            if len(ARTWORKS) == 0:
                logging.error('Ran out of artworks to choose from, cannot continue')
                return
            random_artwork = random.choice(ARTWORKS)
            key = artwork_key(random_artwork['detailsUri'])
            if key not in blacklist:
                # Once chosen, remove it from the list of artworks to choose next
                ARTWORKS.remove(random_artwork)
                chosen_artworks.append(random_artwork)
                break
        target_details_url = str(random_artwork['detailsUri'])
        logging.debug('%(date)s: setting to %(url)s' % dict(url=target_details_url, date=target_date))
        # Store the new artwork
        if self.request.get('dry-run', '') != 'true':
            new_artwork = FeaturedArtwork(
                title=random_artwork['title'],
                byline=random_artwork['byline'],
                attribution=random_artwork['attribution'],
                image_url=random_artwork['imageUri'],
                thumb_url=random_artwork['thumbUri'],
                details_url=random_artwork['detailsUri'],
                publish_date=target_date)
            new_artwork.save()
    if self.request.get('output', '') == 'html':
        self.response.out.write(get_html(artworks_json=json.dumps(chosen_artworks)))
    # Finish up
    logging.debug('done')
def process_html(self, url, html):
    """Parse a scraped artwork page and store it as a FeaturedArtwork.

    Writes HTTP 500 when required fields cannot be parsed; 200 on success.
    """
    soup = BeautifulSoup(html)
    # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
    # the original passed re.I | re.S (== 18) as a replacement count.
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) \
        + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text())
                       if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def add_art_from_external_details_url(publish_date, url):
    """Scrape a wikiart/metmuseum details page and store a FeaturedArtwork.

    Aborts 409 for duplicate dates, 400 for fetch/URL problems, 500 when
    required fields are missing. Returns the saved FeaturedArtwork.
    """
    # Prefer `is not None` over `!= None`.
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
        webapp2.abort(409, message='Artwork already exists for this date.')
    result = urlfetch.fetch(url)
    if result.status_code < 200 or result.status_code >= 300:
        webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
                      % (result.status_code, result.content))
    soup = BeautifulSoup(result.content)
    attribution = None
    if re.search(r'wikiart.org', url, re.I) or re.search(r'wikipaintings.org', url, re.I):
        attribution = 'wikiart.org'
        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the original passed re.I | re.S (== 18) as a replacement count.
        details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.select('h1 span')[0].get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
        attribution = 'metmuseum.org'
        details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.find('h2').get_text()
        author = ''
        # Best-effort extraction: the labels may be absent from the page.
        # FIX: bare `except:` replaced; it also swallowed SystemExit and
        # KeyboardInterrupt.
        try:
            author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
        except Exception:
            pass
        author = re.sub(r'\s*\(.*', '', author)
        completion_year_el = None
        try:
            completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
        except Exception:
            pass
        byline = author + ((', ' + completion_year_el) if completion_year_el else '')
        image_url = soup.find('a', class_='download').attrs['href']
    else:
        webapp2.abort(400, message='Unrecognized URL')
    if not title or not author or not image_url:
        webapp2.abort(500, message='Could not parse HTML')
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        attribution=attribution,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    return new_artwork
def get(self):
    """Fill upcoming empty dates with random artworks from lt-artworks.json.

    Writes an HTML progress log to the response.
    """
    ARTWORKS = json.loads(
        open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')).read())
    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (
        FeaturedArtwork.all().order('-publish_date').fetch(300))
    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)
    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [
        date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)
    ]
    target_dates = [
        d for d in target_dates if d not in dates_with_existing_art
    ]
    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)
    self.response.out.write('starting blacklist size: %d<br>' % len(blacklist))
    for target_date in target_dates:
        # ROBUSTNESS FIX: the original `while True` rejection-sampling loop
        # never terminates once every artwork is blacklisted. Choose from the
        # explicit candidate pool instead (same uniform distribution over
        # eligible entries) and bail out when it is empty.
        candidates = [a for a in ARTWORKS
                      if artwork_key(a['detailsUri']) not in blacklist]
        if not candidates:
            self.response.out.write('ran out of artworks to choose from<br>')
            return
        random_artwork = random.choice(candidates)
        # Once chosen, add to the blacklist to avoid repeats within the lookahead
        blacklist.add(artwork_key(random_artwork['detailsUri']))
        target_details_url = str(random_artwork['detailsUri'])
        self.response.out.write(
            '%(date)s: setting to <b>%(url)s</b><br>'
            % dict(url=target_details_url, date=target_date))
        # Store the new artwork
        new_artwork = FeaturedArtwork(
            title=random_artwork['title'],
            byline=random_artwork['byline'],
            attribution=random_artwork['attribution'],
            image_url=random_artwork['imageUri'],
            thumb_url=random_artwork['thumbUri'],
            details_url=random_artwork['detailsUri'],
            publish_date=target_date)
        new_artwork.save()
    # Finish up
    self.response.out.write('done<br>')
def post(self):
    """Persist the posted artwork JSON verbatim (no image processing)."""
    art = json.loads(self.request.get('json'))
    FeaturedArtwork(
        title=art['title'],
        byline=art['byline'],
        image_url=art['imageUri'],
        # Thumbnail is optional; store None when absent.
        thumb_url=art.get('thumbUri'),
        details_url=art['detailsUri'],
        # publishDate arrives as epoch milliseconds.
        publish_date=datetime.datetime.utcfromtimestamp(
            art['publishDate'] / 1000).date()).save()
    self.response.set_status(200)
def post(self):
    """Persist the posted artwork; derive a '!BlogSmall.jpg' thumb when absent."""
    art = json.loads(self.request.get('json'))
    # Default the thumbnail by appending the small-rendition suffix to the image URI.
    thumb = art.get('thumbUri', art['imageUri'] + '!BlogSmall.jpg')
    # publishDate arrives as epoch milliseconds.
    when = datetime.datetime.utcfromtimestamp(art['publishDate'] / 1000).date()
    FeaturedArtwork(
        title=art['title'],
        byline=art['byline'],
        image_url=art['imageUri'],
        thumb_url=thumb,
        details_url=art['detailsUri'],
        publish_date=when).save()
    self.response.set_status(200)
def process_html(self, url, html):
    """Parse a wikiart or metmuseum page and store a FeaturedArtwork.

    Writes HTTP 500 for unrecognized/unparseable pages; on success writes
    the saved artwork as JSON with status 200.
    """
    soup = BeautifulSoup(html)
    if re.search(r'wikiart.org', url, re.I):
        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the original passed re.I | re.S (== 18) as a replacement count.
        details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.select('h1 span')[0].get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
        details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.find('h2').get_text()
        author = unicode(
            soup.find(text='Artist:').parent.next_sibling).strip()
        # Drop any trailing parenthetical (e.g. life dates).
        author = re.sub(r'\s*\(.*', '', author)
        completion_year_el = unicode(
            soup.find(text='Date:').parent.next_sibling).strip()
        byline = author + (
            (', ' + completion_year_el) if completion_year_el else '')
        image_url = soup.find('a', class_='download').attrs['href']
    else:
        self.response.out.write('Unrecognized URL')
        self.response.set_status(500)
        return
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime.utcfromtimestamp(
        int(self.request.get('publishDate')) / 1000).date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
def post(self):
    """Update an existing FeaturedArtwork from posted JSON; abort 404 if unknown.

    Writes the updated artwork as JSON with status 200.
    """
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get("id"))
    artwork_json = json.loads(self.request.get("json"))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        webapp2.abort(404)
    target_artwork.title = artwork_json["title"]
    target_artwork.byline = artwork_json["byline"]
    # Optional field; dict.get returns None when absent.
    target_artwork.attribution = artwork_json.get("attribution")
    new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
        artwork_json["imageUri"],
        crop_tuple,
        target_artwork.publish_date.strftime("%Y%m%d") + " "
        + artwork_json["title"] + " " + artwork_json["byline"])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and "thumbUri" in artwork_json:
        new_thumb_url = artwork_json["thumbUri"]
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json["detailsUri"]
    target_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(target_artwork)))
def post(self):
    """Update title/byline/image of an existing artwork; 404 when id unknown."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(
        float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        self.response.set_status(404)
        return
    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'], crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
        + artwork_json['title'] + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
def post(self):
    """Update title/byline/image of an existing artwork; 404 when id unknown."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get("id"))
    artwork_json = json.loads(self.request.get("json"))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        self.response.set_status(404)
        return
    target_artwork.title = artwork_json["title"]
    target_artwork.byline = artwork_json["byline"]
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json["imageUri"],
        crop_tuple,
        target_artwork.publish_date.strftime("%Y%m%d") + " "
        + artwork_json["title"] + " " + artwork_json["byline"])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and "thumbUri" in artwork_json:
        new_thumb_url = artwork_json["thumbUri"]
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json["detailsUri"]
    target_artwork.save()
    self.response.set_status(200)
def post(self):
    """Update title/byline/image of an existing artwork; 404 when id unknown."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        self.response.set_status(404)
        return
    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
        + artwork_json['title'] + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
def post(self):
    """Update an existing FeaturedArtwork from posted JSON; abort 404 if unknown.

    Writes the updated artwork as JSON with status 200.
    """
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        webapp2.abort(404)
    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    # Optional field; dict.get returns None when absent.
    target_artwork.attribution = artwork_json.get('attribution')
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
        + artwork_json['title'] + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(target_artwork)))
def move_artwork(self, artwork, publish_date, initial_artwork_id):
    """Place `artwork` on `publish_date`, bumping any occupant one day forward.

    The cascade recurses until a free day is found, or stops when the
    occupant is the artwork that initiated the move (prevents cycles).
    """
    occupant = FeaturedArtwork.all().filter("publish_date =", publish_date).get()
    if occupant and occupant.key().id() != initial_artwork_id:
        self.move_artwork(occupant,
                          publish_date + datetime.timedelta(hours=24),
                          initial_artwork_id)
    artwork.publish_date = publish_date
    artwork.save()
def post(self):
    """Update an existing FeaturedArtwork from posted JSON; abort 404 if unknown.

    Writes the updated artwork as JSON with status 200.
    """
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(
        float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        webapp2.abort(404)
    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    # Optional field; dict.get returns None when absent.
    target_artwork.attribution = artwork_json.get('attribution')
    new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
        + artwork_json['title'] + ' ' + artwork_json['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not new_thumb_url and 'thumbUri' in artwork_json:
        new_thumb_url = artwork_json['thumbUri']
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(target_artwork)))
def process_html(self, url, html):
    """Parse a wikiart or metmuseum page and store a FeaturedArtwork.

    Writes HTTP 500 for unrecognized/unparseable pages; on success writes
    the saved artwork as JSON with status 200.
    """
    soup = BeautifulSoup(html)
    if re.search(r'wikiart.org', url, re.I):
        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the original passed re.I | re.S (== 18) as a replacement count.
        details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.select('h1 span')[0].get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
        details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) \
            + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.find('h2').get_text()
        author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
        # Drop any trailing parenthetical (e.g. life dates).
        author = re.sub(r'\s*\(.*', '', author)
        completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
        byline = author + ((', ' + completion_year_el) if completion_year_el else '')
        image_url = soup.find('a', class_='download').attrs['href']
    else:
        self.response.out.write('Unrecognized URL')
        self.response.set_status(500)
        return
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
def get(self):
    """Fill upcoming empty dates with random artworks from lt-artworks.json.

    Writes an HTML progress log to the response.
    """
    ARTWORKS = json.loads(
        open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')).read())
    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))
    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)
    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n)
                    for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]
    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)
    self.response.out.write('starting blacklist size: %d<br>' % len(blacklist))
    for target_date in target_dates:
        # ROBUSTNESS FIX: the original `while True` rejection-sampling loop
        # never terminates once every artwork is blacklisted. Choose from the
        # explicit candidate pool instead (same uniform distribution over
        # eligible entries) and bail out when it is empty.
        candidates = [a for a in ARTWORKS
                      if artwork_key(a['detailsUri']) not in blacklist]
        if not candidates:
            self.response.out.write('ran out of artworks to choose from<br>')
            return
        random_artwork = random.choice(candidates)
        # Once chosen, add to the blacklist to avoid repeats within the lookahead
        blacklist.add(artwork_key(random_artwork['detailsUri']))
        target_details_url = str(random_artwork['detailsUri'])
        self.response.out.write('%(date)s: setting to <b>%(url)s</b><br>'
            % dict(url=target_details_url, date=target_date))
        # Store the new artwork
        new_artwork = FeaturedArtwork(
            title=random_artwork['title'],
            byline=random_artwork['byline'],
            attribution=random_artwork['attribution'],
            image_url=random_artwork['imageUri'],
            thumb_url=random_artwork['thumbUri'],
            details_url=random_artwork['detailsUri'],
            publish_date=target_date)
        new_artwork.save()
    # Finish up
    self.response.out.write('done<br>')
def render_with_headers(self, callback):
    """Compute the featured-artwork response body plus HTTP cache headers.

    Returns a (body, headers) tuple; the body is JSONP-wrapped when
    `callback` is non-empty.
    """
    now = datetime.utcnow()
    headers = {}
    current = None
    # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
    latest_artworks = (FeaturedArtwork.all()
        .filter('publish_date <=', date.today() + timedelta(days=2))
        .order('-publish_date')
        .fetch(5))
    # Pick out the first artwork in that set that has actually been published
    for artwork in latest_artworks:
        if now >= datetime.combine(artwork.publish_date, START_TIME):
            current = artwork
            break
    ret_obj = dict()
    if current is not None:
        # Found the next featured artwork
        ret_obj = dict(
            title=current.title.strip(),
            byline=current.byline.strip(),
            imageUri=current.image_url,
            detailsUri=current.details_url)
        if current.thumb_url:
            ret_obj['thumbUri'] = current.thumb_url
        if current.attribution:
            ret_obj['attribution'] = current.attribution
        # The next update time is the next START_TIME
        next_start_time = datetime.combine(date.today(), START_TIME)
        while next_start_time < now:
            next_start_time += timedelta(hours=24)
        ret_obj['nextTime'] = _serialize_datetime(next_start_time + NEXT_PADDING)
        # Caches expire in an hour, but no later than the next start time.
        # NOTE(review): an earlier comment said "minus padding", but the code
        # caps at next_start_time itself — confirm which is intended.
        cache_expire_time = min(
            now + MAX_HTTP_CACHE_AGE, next_start_time)
        expire_seconds = max(0, (cache_expire_time - now).total_seconds())
        # Note that this max-age header will be cached, so max-age may be off by the memcache
        # cache time which is set above to 60 seconds
        headers['Cache-Control'] = 'max-age=%d, must-revalidate, public' % expire_seconds
        headers['Expires'] = cache_expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
        headers['Pragma'] = 'public'
    else:
        # Found no featured artwork; hopefully this is temporary; don't cache this response
        headers['Cache-Control'] = 'max-age=0, no-cache, no-store'
        headers['Pragma'] = 'no-cache'
    body = json.dumps(ret_obj, sort_keys=True)
    # JSONP wrapping for cross-origin callers.
    if callback:
        body = '%s(%s)' % (callback, body)
    return (body, headers)
def post(self):
    """Store posted artwork, first running the image through maybe_process_image."""
    art = json.loads(self.request.get('json'))
    image_url, thumb_url = maybe_process_image(
        art['imageUri'],
        art['title'] + ' ' + art['byline'])
    # Fall back to the client-supplied thumbnail when processing produced none.
    if not thumb_url and 'thumbUri' in art:
        thumb_url = art['thumbUri']
    FeaturedArtwork(
        title=art['title'],
        byline=art['byline'],
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=art['detailsUri'],
        # publishDate arrives as epoch milliseconds.
        publish_date=datetime.datetime.utcfromtimestamp(
            art['publishDate'] / 1000).date()).save()
    self.response.set_status(200)
def post(self):
    """Delete the artwork identified by the posted id; abort 404 if missing."""
    target = FeaturedArtwork.get_by_id(long(self.request.get("id")))
    if not target:
        webapp2.abort(404)
    target.delete()
    self.response.set_status(200)
def post(self):
    """Delete the artwork identified by the posted id; respond 404 if missing."""
    target = FeaturedArtwork.get_by_id(long(self.request.get('id')))
    if not target:
        self.response.set_status(404)
        return
    target.delete()
    self.response.set_status(200)
def render(self):
    """JSON queue starting two weeks before the 1st of the requested month.

    NOTE(review): `month` appears to be 0-based (JS Date convention), hence
    the +1 — confirm against the admin frontend.
    """
    month_start = datetime.date(
        year=int(self.request.get('year')),
        month=int(self.request.get('month')) + 1,
        day=1)
    window_start = month_start - datetime.timedelta(weeks=2)
    entries = (FeaturedArtwork.all()
        .filter('publish_date >=', window_start)
        .order('publish_date')
        .fetch(1000))
    return json.dumps([artwork_dict(a) for a in entries])
def render_with_headers(self, callback):
    """Compute the featured-artwork response body plus HTTP cache headers.

    Returns a (body, headers) tuple; the body is JSONP-wrapped when
    `callback` is non-empty.
    """
    now = datetime.utcnow()
    headers = {}
    current = None
    # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
    latest_artworks = (FeaturedArtwork.all().filter(
        'publish_date <=',
        date.today() + timedelta(days=2)).order('-publish_date').fetch(5))
    # Pick out the first artwork in that set that has actually been published
    for artwork in latest_artworks:
        if now >= datetime.combine(artwork.publish_date, START_TIME):
            current = artwork
            break
    ret_obj = dict()
    if current is not None:
        # Found the next featured artwork
        ret_obj = dict(title=current.title.strip(),
                       byline=current.byline.strip(),
                       imageUri=current.image_url,
                       detailsUri=current.details_url)
        if current.thumb_url:
            ret_obj['thumbUri'] = current.thumb_url
        if current.attribution:
            ret_obj['attribution'] = current.attribution
        # The next update time is the next START_TIME
        next_start_time = datetime.combine(date.today(), START_TIME)
        while next_start_time < now:
            next_start_time += timedelta(hours=24)
        ret_obj['nextTime'] = _serialize_datetime(next_start_time + NEXT_PADDING)
        # Caches expire in an hour, but no later than the next start time.
        # NOTE(review): an earlier comment said "minus padding", but the code
        # caps at next_start_time itself — confirm which is intended.
        cache_expire_time = min(now + MAX_HTTP_CACHE_AGE, next_start_time)
        expire_seconds = max(0, (cache_expire_time - now).total_seconds())
        # Note that this max-age header will be cached, so max-age may be off by the memcache
        # cache time which is set above to 60 seconds
        headers[
            'Cache-Control'] = 'max-age=%d, must-revalidate, public' % expire_seconds
        headers['Expires'] = cache_expire_time.strftime(
            '%a, %d %b %Y %H:%M:%S GMT')
        headers['Pragma'] = 'public'
    else:
        # Found no featured artwork; hopefully this is temporary; don't cache this response
        headers['Cache-Control'] = 'max-age=0, no-cache, no-store'
        headers['Pragma'] = 'no-cache'
    body = json.dumps(ret_obj, sort_keys=True)
    # JSONP wrapping for cross-origin callers.
    if callback:
        body = '%s(%s)' % (callback, body)
    return (body, headers)
def post(self):
    """Move an artwork to a new publish date, shifting occupants as needed."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get("id"))
    # publishDate arrives as epoch milliseconds.
    publish_date = datetime.datetime.utcfromtimestamp(
        long(self.request.get("publishDate")) / 1000).date()
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        webapp2.abort(404)
    # shift other artworks over
    self.move_artwork(target_artwork, publish_date, target_artwork.key().id())
    self.response.set_status(200)
def render(self):
    """JSON queue starting two weeks before the 1st of the requested month.

    NOTE(review): `month` appears to be 0-based (JS Date convention), hence
    the +1 — confirm against the admin frontend.
    """
    month_start = datetime.date(
        day=1,
        month=int(self.request.get('month')) + 1,
        year=int(self.request.get('year')))
    window_start = month_start - datetime.timedelta(weeks=2)
    queue = (FeaturedArtwork.all()
        .filter('publish_date >=', window_start)
        .order('publish_date')
        .fetch(1000))
    return json.dumps([artwork_dict(entry) for entry in queue])
def move_artwork(self, artwork, publish_date, initial_artwork_id):
    """Place `artwork` on `publish_date`, bumping any occupant one day forward.

    The cascade recurses until a free day is found, or stops when the
    occupant is the artwork that initiated the move (prevents cycles).
    """
    occupant = FeaturedArtwork.all().filter(
        'publish_date =', publish_date).get()
    if occupant and occupant.key().id() != initial_artwork_id:
        self.move_artwork(occupant,
                          publish_date + datetime.timedelta(hours=24),
                          initial_artwork_id)
    artwork.publish_date = publish_date
    artwork.save()
def post(self):
    """Move an artwork to a new publish date, shifting occupants as needed."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime.utcfromtimestamp(
        long(self.request.get('publishDate')) / 1000).date())
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        webapp2.abort(404)
    # shift other artworks over
    self.move_artwork(target_artwork, publish_date, target_artwork.key().id())
    self.response.set_status(200)
def post(self):
    """Move an artwork to a new publish date, shifting occupants as needed."""
    # FIX: renamed local — `id` shadows the builtin.
    artwork_id = long(self.request.get('id'))
    # publishDate arrives as epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(long(self.request.get('publishDate')) / 1000)
        .date())
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        self.response.set_status(404)
        return
    # shift other artworks over
    self.move_artwork(target_artwork, publish_date, target_artwork.key().id())
    self.response.set_status(200)
def render(self):
    """Return a JSON list of queue entries from 30 days ago onward."""
    cutoff = datetime.date.today() - datetime.timedelta(days=30)
    queue = (FeaturedArtwork.all()
        .filter('publish_date >=', cutoff)
        .order('publish_date')
        .fetch(1000))
    # Serialize with the wire-format (camelCase) field names.
    return json.dumps([dict(
        id=a.key().id(),
        title=a.title,
        byline=a.byline,
        imageUri=a.image_url,
        thumbUri=a.thumb_url,
        detailsUri=a.details_url,
        publishDate=date_to_timestamp(a.publish_date)) for a in queue])
def post(self): artwork_json = json.loads(self.request.get("json")) crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop"))) publish_date = datetime.datetime.utcfromtimestamp(artwork_json["publishDate"] / 1000).date() new_image_url, new_thumb_url = maybe_process_image( artwork_json["imageUri"], crop_tuple, publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"], ) if not new_thumb_url and "thumbUri" in artwork_json: new_thumb_url = artwork_json["thumbUri"] new_artwork = FeaturedArtwork( title=artwork_json["title"], byline=artwork_json["byline"], image_url=new_image_url, thumb_url=new_thumb_url, details_url=artwork_json["detailsUri"], publish_date=publish_date, ) new_artwork.save() self.response.set_status(200)
def render(self):
    """Render artworks published within the last 30 days (and later) as JSON."""
    window_start = datetime.date.today() - datetime.timedelta(days=30)
    artworks = (FeaturedArtwork.all()
                .filter('publish_date >=', window_start)
                .order('publish_date')
                .fetch(1000))
    entries = []
    for a in artworks:
        entries.append(dict(
            id=a.key().id(),
            title=a.title,
            byline=a.byline,
            imageUri=a.image_url,
            thumbUri=a.thumb_url,
            detailsUri=a.details_url,
            publishDate=date_to_timestamp(a.publish_date)))
    return json.dumps(entries)
def render(self):
    """JSON list of artworks published on or after the 1st of the given month."""
    first_of_month = datetime.date(
        year=int(self.request.get('year')),
        month=int(self.request.get('month')),
        day=1)
    rows = (FeaturedArtwork.all()
            .filter('publish_date >=', first_of_month)
            .order('publish_date')
            .fetch(1000))
    payload = [dict(
        id=a.key().id(),
        title=a.title,
        byline=a.byline,
        imageUri=a.image_url,
        thumbUri=a.thumb_url,
        detailsUri=a.details_url,
        publishDate=date_to_timestamp(a.publish_date)) for a in rows]
    return json.dumps(payload)
def render(self, callback):
    """Render the currently-featured artwork as JSON (JSONP if `callback`).

    Also sets Cache-Control/Expires headers so edge caches revalidate
    before the next artwork goes live.  Returns '{}' when nothing has
    been published yet.
    """
    now = datetime.datetime.utcnow()
    current = None
    # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
    latest_artworks = (FeaturedArtwork.all()
        .filter('publish_date <=', datetime.date.today() + datetime.timedelta(days=2))
        .order('-publish_date')
        .fetch(5))
    # Pick out the first artwork in that set that has actually been published
    # (i.e. its publish_date's START_TIME has already passed)
    for artwork in latest_artworks:
        if now >= datetime.datetime.combine(artwork.publish_date, START_TIME):
            current = artwork
            break
    ret_obj = dict()
    if current is not None:
        featured = dict(
            title=current.title,
            byline=current.byline,
            imageUri=current.image_url,
            detailsUri=current.details_url)
        if current.thumb_url:
            featured['thumbUri'] = current.thumb_url
        # The next update time is at START_TIME tomorrow
        next_time = datetime.datetime.combine(datetime.date.today() \
            + datetime.timedelta(days=1), START_TIME) + NEXT_PADDING
        featured['nextTime'] = _serialize_datetime(next_time)
        # Caches expire in an hour, but no later than the next start time minus 5 minutes
        # NOTE(review): naive datetime.now() is mixed with utcnow() above --
        # equivalent only if the server clock is UTC (true on App Engine);
        # confirm before running elsewhere.
        cache_expire_time = min(
            datetime.datetime.now() + datetime.timedelta(hours=1),
            next_time - datetime.timedelta(minutes=5))
        expire_seconds = max(0, (cache_expire_time - now).total_seconds())
        self.response.headers['Cache-Control'] = 'max-age=%d, must-revalidate, public' % expire_seconds
        self.response.headers['Expires'] = cache_expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
        ret_obj = featured
    s = json.dumps(ret_obj, sort_keys=True)
    # JSONP: wrap the payload in the caller-supplied callback name
    if callback:
        return '%s(%s)' % (callback, s)
    else:
        return s
def render(self):
    """Serialize all artworks from the start of the requested month onward."""
    first_day = datetime.date(year=int(self.request.get('year')),
                              month=int(self.request.get('month')),
                              day=1)
    rows = (FeaturedArtwork.all()
            .filter('publish_date >=', first_day)
            .order('publish_date')
            .fetch(1000))
    serialized = []
    for a in rows:
        serialized.append(dict(
            id=a.key().id(),
            title=a.title,
            byline=a.byline,
            imageUri=a.image_url,
            thumbUri=a.thumb_url,
            detailsUri=a.details_url,
            publishDate=date_to_timestamp(a.publish_date)))
    return json.dumps(serialized)
def post(self):
    """Update mutable fields of an existing artwork from the posted JSON."""
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not artwork:
        self.response.set_status(404)
        return
    artwork.title = artwork_json['title']
    artwork.byline = artwork_json['byline']
    artwork.image_url = artwork_json['imageUri']
    if 'thumbUri' in artwork_json:
        artwork.thumb_url = artwork_json['thumbUri']
    else:
        # No explicit thumb: derive one by appending '!BlogSmall.jpg' --
        # presumably the hosting site's thumbnail URL convention; verify.
        artwork.thumb_url = artwork_json['imageUri'] + '!BlogSmall.jpg'
    artwork.details_url = artwork_json['detailsUri']
    artwork.save()
    self.response.set_status(200)
def render(self): start = datetime.date(day=1, month=int(self.request.get("month")) + 1, year=int(self.request.get("year"))) start -= datetime.timedelta(weeks=2) queue = FeaturedArtwork.all().filter("publish_date >=", start).order("publish_date").fetch(1000) return json.dumps( [ dict( id=a.key().id(), title=a.title, byline=a.byline, imageUri=a.image_url, thumbUri=a.thumb_url, detailsUri=a.details_url, publishDate=date_to_timestamp(a.publish_date), ) for a in queue ] )
def get(self):
    """Backfill upcoming empty dates by recycling old artwork.

    Only recycles artwork hosted on wikiart.org / wikipaintings.org /
    metmuseum.org, and avoids anything featured in the most recent 200.
    Writes progress as HTML to the response.
    """
    # Fetch latest 1000 artworks
    latest_artworks = (
        FeaturedArtwork.all().order('-publish_date').fetch(1000))
    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)
    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, 9)]
    target_dates = [
        d for d in target_dates if d not in dates_with_existing_art
    ]
    for target_date in target_dates:
        self.response.out.write('looking for artwork for date '
                                + str(target_date) + '<br>')
        # Create a blacklist of the most recent 200 artwork
        # (don't want to repeat one of the last 200!)
        blacklist_artwork_keys = set(
            sanitized_artwork_key(a) for a in latest_artworks[:200])
        if len(blacklist_artwork_keys) < 5:
            blacklist_artwork_keys = set(
            )  # should never happen, but just in case of a reset
        # Pick from one of the oldest 500, excluding artwork in the blacklist
        # NOTE(review): this loops forever if the older half contains no
        # non-blacklisted artwork from an allowed host -- confirm acceptable.
        random_artwork = None
        while True:
            random_artwork = random.choice(latest_artworks[500:])
            key = sanitized_artwork_key(random_artwork)
            if 'wikiart.org' in key or 'wikipaintings.org' in key or 'metmuseum.org' in key:
                if key not in blacklist_artwork_keys:
                    break
        target_details_url = str(random_artwork.details_url)
        self.response.out.write('recycling ' + target_details_url
                                + ' for date ' + str(target_date) + '<br>')
        backroomarthelper.add_art_from_external_details_url(
            target_date, target_details_url)
    self.response.out.write('done<br>')
def post(self):
    """Update an existing artwork's fields, (re)processing its image.

    Responds 404 if the id is unknown, 200 on success.

    BUG FIX: maybe_process_image was called with only two arguments,
    whereas every other call site passes (image_url, crop_tuple, name) --
    so the name string was being consumed as the crop tuple.  Pass
    NO_CROP_TUPLE to keep the image uncropped.
    """
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
        self.response.set_status(404)
        return
    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        NO_CROP_TUPLE,
        artwork_json['title'] + ' ' + artwork_json['byline'])
    if not new_thumb_url and 'thumbUri' in artwork_json:
        # processing yielded no thumb; keep the client-provided one
        new_thumb_url = artwork_json['thumbUri']
    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
def get(self):
    """Fill upcoming empty dates by recycling older artwork from trusted hosts."""
    # Newest 1000 artworks, newest first
    latest_artworks = (FeaturedArtwork.all()
                       .order('-publish_date')
                       .fetch(1000))
    # Dates already covered by an artwork
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)
    # Candidate dates: yesterday through 8 days out, minus covered ones
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, 9)]
    target_dates = [d for d in target_dates
                    if d not in dates_with_existing_art]
    for target_date in target_dates:
        self.response.out.write('looking for artwork for date '
                                + str(target_date) + '<br>')
        # Blacklist the 200 most recent artworks to avoid near-term repeats
        blacklist_artwork_keys = set(sanitized_artwork_key(a)
                                     for a in latest_artworks[:200])
        if len(blacklist_artwork_keys) < 5:
            # should never happen, but just in case of a reset
            blacklist_artwork_keys = set()
        # Draw from the older half until we hit an allowed, non-blacklisted host
        random_artwork = None
        while True:
            random_artwork = random.choice(latest_artworks[500:])
            key = sanitized_artwork_key(random_artwork)
            if 'wikiart.org' in key or 'wikipaintings.org' in key or 'metmuseum.org' in key:
                if key not in blacklist_artwork_keys:
                    break
        target_details_url = str(random_artwork.details_url)
        self.response.out.write('recycling ' + target_details_url
                                + ' for date ' + str(target_date) + '<br>')
        backroomarthelper.add_art_from_external_details_url(
            target_date, target_details_url)
    self.response.out.write('done<br>')
def post(self):
    """Create a FeaturedArtwork by scraping an external details page.

    Request parameters: 'publishDate' (epoch millis) and
    'externalArtworkUrl' (a wikiart.org or metmuseum.org page).
    Aborts 409 if the date is taken, 400 on fetch/URL problems, 500 if
    the page can't be parsed.  Responds with the created artwork JSON.

    BUG FIX: both re.sub calls passed `re.I | re.S` as the 4th positional
    argument, which is `count`, not `flags` -- the flags were silently
    interpreted as a substitution limit.  Pass them via the `flags`
    keyword instead.
    """
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
        webapp2.abort(409, message='Artwork already exists for this date.')
    url = self.request.get('externalArtworkUrl')
    result = urlfetch.fetch(url)
    if result.status_code < 200 or result.status_code >= 300:
        webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
                      % (result.status_code, result.content))
    soup = BeautifulSoup(result.content)
    attribution = None
    if re.search(r'wikiart.org', url, re.I):
        attribution = 'wikiart.org'
        # strip the fragment, then tag the link for analytics
        details_url = (re.sub(r'#.+', '', url, flags=re.I | re.S)
                       + '?utm_source=Muzei&utm_campaign=Muzei')
        title = soup.select('h1 span')[0].get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
        attribution = 'metmuseum.org'
        # strip fragment and query string before tagging
        details_url = (re.sub(r'[#?].+', '', url, flags=re.I | re.S)
                       + '?utm_source=Muzei&utm_campaign=Muzei')
        title = soup.find('h2').get_text()
        author = ''
        try:
            author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
        except Exception:
            # best-effort: leave author blank if the page layout changed
            pass
        author = re.sub(r'\s*\(.*', '', author)
        completion_year_el = None
        try:
            completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
        except Exception:
            pass
        byline = author + ((', ' + completion_year_el) if completion_year_el else '')
        image_url = soup.find('a', class_='download').attrs['href']
    else:
        webapp2.abort(400, message='Unrecognized URL')
    if not title or not author or not image_url:
        webapp2.abort(500, message='Could not parse HTML')
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        attribution=attribution,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
def get(self):
    """Build any missing monthly (or partial-month) artwork archives in GCS.

    An archive is a gzip'd text file named YYYYMM.txt (full month) or
    YYYYMMDD.txt (partial month): line 0 is a JSON metadata list, each
    following line is one thumbnail as a base64 JPEG data URI.  Supports
    a ?datetime=YYYY-mm-ddTHH:MM:SS override for testing.

    BUG FIX: the December rollover assigned `next_month.month = 1`, but
    datetime.date is immutable (attributes are read-only), so building a
    December archive raised AttributeError.  Use replace(year=..., month=1).
    """
    now = datetime.datetime.utcnow()
    if self.request.get('datetime'):
        now = datetime.datetime.strptime(self.request.get('datetime'),
                                         '%Y-%m-%dT%H:%M:%S')
    # the archive "day" only advances once START_TIME has passed
    current_date = (now.date() if now.time() > START_TIME
                    else now.date() - datetime.timedelta(days=1))
    current_month = current_date.month
    # list the expected archives up until this point, starting with
    # current month's archive
    expected_archives = []
    if current_date > ARCHIVE_START_DATE:
        if (current_date + datetime.timedelta(days=1)).month != current_month:
            # end of the month
            expected_archives.append((current_date.year, current_date.month))
        else:
            # partial month for this month
            expected_archives.append(
                (current_date.year, current_date.month, current_date.day))
        # list all other months back to the archive epoch
        if (current_date.month != ARCHIVE_START_DATE.month
                or current_date.year != ARCHIVE_START_DATE.year):
            current_date = current_date.replace(day=1)
            while True:
                current_date -= datetime.timedelta(days=1)  # previous month
                current_date = current_date.replace(day=1)
                expected_archives.append(
                    (current_date.year, current_date.month))
                if current_date <= ARCHIVE_START_DATE:
                    break
    # at this point expected_archives has a list of all archives that
    # should exist; scan the bucket to see which ones actually do
    current_archives = []
    current_archive_files = gcs.listbucket(CLOUD_STORAGE_ARCHIVE_PATH)
    self.response.out.write('<h1>current archives</h1>')
    for archive_file in current_archive_files:
        m = re.search(r'((?:\d){4})((?:\d){2})((?:\d){2})?\.txt',
                      archive_file.filename)
        if m:
            if m.group(3):
                archive = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
            else:
                archive = (int(m.group(1)), int(m.group(2)))
            current_archives.append(archive)
            self.response.out.write(repr(archive) + '<br>')
    current_archives = set(current_archives)
    expected_archives = set(expected_archives)
    missing_archives = expected_archives.difference(current_archives)
    # generate the missing archives
    self.response.out.write('<h1>building missing archives</h1>')
    for archive in missing_archives:
        self.response.out.write('<h2>' + repr(archive) + '</h2>')
        # when building an archive, try to start from an existing archive:
        # find the latest partial archive from this month as a starting point
        other_archives_from_month = filter(
            lambda x: len(x) == 3 and x[0] == archive[0] and x[1] == archive[1],
            current_archives)
        latest_current_archive_from_month = None
        latest_archive_gcs_path = None
        archive_metadata = []
        archive_image_blobs = []
        if other_archives_from_month:
            latest_current_archive_from_month = reduce(
                lambda x, y: (x[0], x[1], max(x[2], y[2])),
                other_archives_from_month)
            self.response.out.write(
                'starting from archive '
                + repr(latest_current_archive_from_month) + '<br>')
            existing_archive_name = ('%04d%02d%02d'
                                     % latest_current_archive_from_month)
            try:
                latest_archive_gcs_path = (CLOUD_STORAGE_ARCHIVE_PATH + '/'
                                           + existing_archive_name + '.txt')
                existing_archive = gcs.open(latest_archive_gcs_path)
                content = gzip_decompress(existing_archive.read())
                existing_archive_lines = content.split('\n')
                existing_archive.close()
                # line 0 is JSON metadata, the rest are image data URIs
                archive_metadata = json.loads(existing_archive_lines[0])
                archive_image_blobs = filter(lambda x: len(x) > 0,
                                             existing_archive_lines[1:])
            except Exception:
                # best-effort: rebuild the whole month from scratch
                self.response.out.write(
                    'error reading from existing archive, starting from scratch<br>')
                latest_current_archive_from_month = None
                latest_archive_gcs_path = None
        # construct the query range
        if latest_current_archive_from_month:
            # get everything after the latest archive this month
            query_from = (datetime.date(*latest_current_archive_from_month)
                          + datetime.timedelta(days=1))
        else:
            # get everything from this month
            query_from = datetime.date(
                archive[0], archive[1],
                archive[2] if len(archive) == 3 else 1).replace(day=1)
        query_from = max(ARCHIVE_START_DATE, query_from)
        if len(archive) == 3:
            # partial month archive
            archive_name = '%04d%02d%02d' % archive
            query_to = datetime.date(*archive)
        else:
            # full month archive: query up to the last day of the month
            archive_name = '%04d%02d' % archive
            next_month = datetime.date(archive[0], archive[1], 1)
            if next_month.month == 12:
                next_month = next_month.replace(year=next_month.year + 1,
                                                month=1)
            else:
                next_month = next_month.replace(month=next_month.month + 1)
            query_to = next_month - datetime.timedelta(days=1)
        # fetch artworks that match this query
        artwork_objs = (FeaturedArtwork.all()
                        .order('publish_date')
                        .filter('publish_date >=', query_from)
                        .filter('publish_date <=', query_to)
                        .fetch(1000))
        for artwork_obj in artwork_objs:
            metadata_item = dict(
                publish_date=artwork_obj.publish_date.isoformat(),
                title=artwork_obj.title,
                byline=artwork_obj.byline,
                thumb_url=artwork_obj.thumb_url,
                details_url=artwork_obj.details_url,)
            # fetch the image
            image_result = urlfetch.fetch(artwork_obj.thumb_url)
            if image_result.status_code < 200 or image_result.status_code >= 300:
                raise IOError('Error downloading image: HTTP %d.'
                              % image_result.status_code)
            # resize and crop thumb to a centered square
            thumb = images.Image(image_result.content)
            if thumb.width > thumb.height:
                thumb.resize(width=4000, height=ARCHIVE_IMAGE_SIZE)
                thumb.crop(
                    (float(thumb.width - thumb.height) / thumb.width) / 2, 0.,
                    1 - (float(thumb.width - thumb.height) / thumb.width) / 2, 1.)
            else:
                thumb.resize(width=ARCHIVE_IMAGE_SIZE, height=4000)
                thumb.crop(
                    0., (float(thumb.height - thumb.width) / thumb.height) / 2,
                    1., 1 - (float(thumb.height - thumb.width) / thumb.height) / 2)
            # compute average color
            histogram = thumb.histogram()
            avg_color = tuple([int(x) for x in img_weighed_average(histogram)])
            avg_color_hex = "#%0.2X%0.2X%0.2X" % avg_color
            metadata_item['color'] = avg_color_hex
            # export thumb as an inline JPEG data URI
            thumb_data_uri = 'data:image/jpeg;base64,' + base64.b64encode(
                thumb.execute_transforms(output_encoding=images.JPEG,
                                         quality=40))
            # append the metadata
            archive_metadata.append(metadata_item)
            archive_image_blobs.append(thumb_data_uri)
        self.response.out.write('query: from ' + repr(query_from) + ' to '
                                + repr(query_to) + '<br>')
        self.response.out.write('artworks: ' + str(len(artwork_objs)) + '<br>')
        # create the archive contents: metadata line, then one blob per line
        s = json.dumps(archive_metadata) + '\n'
        for blob in archive_image_blobs:
            s += blob + '\n'
        # gzip and write the archive
        gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + archive_name + '.txt'
        self.response.out.write('writing to: ' + gcs_path + '<br>')
        gcsf = gcs.open(gcs_path, 'w',
                        content_type='text/plain',
                        options={'content-encoding': 'gzip'})
        gcsf.write(gzip_compress(s))
        gcsf.close()
        # delete the previous (now superseded) partial archive
        if latest_archive_gcs_path:
            gcs.delete(latest_archive_gcs_path)
def get(self): ARTWORKS = json.loads( open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')).read()) # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS) # Fetch latest 300 artworks (for blacklisting) latest_artworks = ( FeaturedArtwork.all().order('-publish_date').fetch(300)) # List dates for which artwork exists dates_with_existing_art = set(a.publish_date for a in latest_artworks) # List target dates that we want artwork for, but for which no artwork exists target_dates = [ date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS) ] target_dates = [ d for d in target_dates if d not in dates_with_existing_art ] # Create a blacklist of keys to avoid repeats blacklist = set(artwork_key(a.details_url) for a in latest_artworks) logging.debug('starting blacklist size: %d' % len(blacklist)) chosen_artworks = [] for target_date in target_dates: # Pick from available artworks, excluding artwork in the blacklist random_artwork = None while True: if len(ARTWORKS) == 0: logging.error( 'Ran out of artworks to choose from, cannot continue') return random_artwork = random.choice(ARTWORKS) key = artwork_key(random_artwork['detailsUri']) if key not in blacklist: # Once chosen, remove it from the list of artworks to choose next ARTWORKS.remove(random_artwork) chosen_artworks.append(random_artwork) break target_details_url = str(random_artwork['detailsUri']) logging.debug('%(date)s: setting to %(url)s' % dict(url=target_details_url, date=target_date)) # Store the new artwork if self.request.get('dry-run', '') != 'true': new_artwork = FeaturedArtwork( title=random_artwork['title'], byline=random_artwork['byline'], attribution=random_artwork['attribution'], image_url=random_artwork['imageUri'], thumb_url=random_artwork['thumbUri'], details_url=random_artwork['detailsUri'], publish_date=target_date) new_artwork.save() if self.request.get('output', '') == 'html': self.response.out.write( get_html(artworks_json=json.dumps(chosen_artworks))) 
# Finish up logging.debug('done')
def get(self):
    """Build any missing monthly / partial-month archive files in GCS.

    An archive is a gzip'd text file named YYYYMM.txt (full month) or
    YYYYMMDD.txt (partial month): line 0 is a JSON metadata list and each
    following line is one thumbnail as a base64 JPEG data URI.  Supports
    a ?datetime=YYYY-mm-ddTHH:MM:SS override for testing.
    """
    now = datetime.datetime.utcnow()
    if self.request.get('datetime'):
        now = datetime.datetime.strptime(self.request.get('datetime'),
                                         '%Y-%m-%dT%H:%M:%S')
    # the archive "day" only advances once START_TIME has passed
    current_date = (now.date() if now.time() > START_TIME
                    else now.date() - datetime.timedelta(days=1))
    current_month = current_date.month
    # list the expected archives up until this point, starting with current month's archive
    expected_archives = []
    if current_date > ARCHIVE_START_DATE:
        if (current_date + datetime.timedelta(days=1)).month != current_month:
            # end of the month
            expected_archives.append((current_date.year, current_date.month))
        else:
            # partial month for this month
            expected_archives.append((current_date.year, current_date.month,
                                      current_date.day))
        # list all other months
        if current_date.month != ARCHIVE_START_DATE.month or current_date.year != ARCHIVE_START_DATE.year:
            current_date = current_date.replace(day=1)
            while True:
                current_date -= datetime.timedelta(days=1)  # previous month
                current_date = current_date.replace(day=1)
                expected_archives.append((current_date.year, current_date.month))
                if current_date <= ARCHIVE_START_DATE:
                    break
    # at this point expected_archives has a list of all archives that should be built
    # list current archive items to determine which archives are missing
    current_archives = []
    current_archive_files = gcs.listbucket(CLOUD_STORAGE_ARCHIVE_PATH)
    self.response.out.write('<h1>current archives</h1>')
    for archive_file in current_archive_files:
        # filenames are YYYYMM.txt (full month) or YYYYMMDD.txt (partial)
        m = re.search(r'((?:\d){4})((?:\d){2})((?:\d){2})?\.txt',
                      archive_file.filename)
        if m:
            if m.group(3):
                archive = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
            else:
                archive = (int(m.group(1)), int(m.group(2)))
            current_archives.append(archive)
            self.response.out.write(repr(archive) + '<br>')
        #self.response.out.write(archivemeta.filename + '\n')
    current_archives = set(current_archives)
    expected_archives = set(expected_archives)
    missing_archives = expected_archives.difference(current_archives)
    # generate the missing archives
    self.response.out.write('<h1>building missing archives</h1>')
    for archive in missing_archives:
        self.response.out.write('<h2>' + repr(archive) + '</h2>')
        # when building an archive, try to start from an existing archive
        # find the latest archive from this month as a starting point
        other_archives_from_month = filter(
            lambda x: len(x) == 3 and x[0] == archive[0] and x[1] == archive[1],
            current_archives)
        latest_current_archive_from_month = None
        latest_archive_gcs_path = None
        archive_metadata = []
        archive_image_blobs = []
        if other_archives_from_month:
            # pick the partial archive with the greatest day number
            latest_current_archive_from_month = reduce(
                lambda x, y: (x[0], x[1], max(x[2], y[2])),
                other_archives_from_month)
            self.response.out.write('starting from archive '
                                    + repr(latest_current_archive_from_month)
                                    + '<br>')
            existing_archive_name = '%04d%02d%02d' % latest_current_archive_from_month
            try:
                latest_archive_gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + existing_archive_name + '.txt'
                existing_archive = gcs.open(latest_archive_gcs_path)
                content = gzip_decompress(existing_archive.read())
                existing_archive_lines = content.split('\n')
                existing_archive.close()
                # line 0 is JSON metadata; remaining lines are image data URIs
                archive_metadata = json.loads(existing_archive_lines[0])
                archive_image_blobs = filter(lambda x: len(x) > 0,
                                             existing_archive_lines[1:])
            except:
                # best-effort: fall back to rebuilding the whole month
                self.response.out.write('error reading from existing archive, starting from scratch<br>')
                latest_current_archive_from_month = None
                latest_archive_gcs_path = None
        # construct the query
        query_from = None
        if latest_current_archive_from_month:
            # get everything after the latest archive this month
            query_from = datetime.date(*latest_current_archive_from_month) + datetime.timedelta(days=1)
        else:
            # get everything from this month
            query_from = datetime.date(
                archive[0], archive[1],
                archive[2] if len(archive) == 3 else 1).replace(day=1)
        query_from = max(ARCHIVE_START_DATE, query_from)
        query_to = None
        archive_name = None
        if len(archive) == 3:
            # partial month archive
            archive_name = '%04d%02d%02d' % archive
            query_to = datetime.date(*archive)
        else:
            # full month archive: query through the last day of the month
            archive_name = '%04d%02d' % archive
            next_month = datetime.date(archive[0], archive[1], 1)
            if next_month.month == 12:
                next_month = next_month.replace(year=next_month.year + 1, month=1)
            else:
                next_month = next_month.replace(month=next_month.month + 1)
            query_to = next_month - datetime.timedelta(days=1)
        # fetch artworks that match this query
        artwork_objs = (FeaturedArtwork.all()
                        .order('publish_date')
                        .filter('publish_date >=', query_from)
                        .filter('publish_date <=', query_to)
                        .fetch(1000))
        for artwork_obj in artwork_objs:
            metadata_item = dict(
                publish_date=artwork_obj.publish_date.isoformat(),
                title=artwork_obj.title,
                byline=artwork_obj.byline,
                thumb_url=artwork_obj.thumb_url,
                details_url=artwork_obj.details_url,)
            # fetch the image
            image_result = urlfetch.fetch(artwork_obj.thumb_url)
            if image_result.status_code < 200 or image_result.status_code >= 300:
                raise IOError('Error downloading image: HTTP %d.'
                              % image_result.status_code)
            # resize and crop thumb to a centered square
            thumb = images.Image(image_result.content)
            if thumb.width > thumb.height:
                thumb.resize(width=4000, height=ARCHIVE_IMAGE_SIZE)
                thumb.crop(
                    (float(thumb.width - thumb.height) / thumb.width) / 2, 0.,
                    1 - (float(thumb.width - thumb.height) / thumb.width) / 2, 1.)
            else:
                thumb.resize(width=ARCHIVE_IMAGE_SIZE, height=4000)
                thumb.crop(
                    0., (float(thumb.height - thumb.width) / thumb.height) / 2,
                    1., 1 - (float(thumb.height - thumb.width) / thumb.height) / 2)
            # compute average color
            histogram = thumb.histogram()
            avg_color = tuple([int(x) for x in img_weighed_average(histogram)])
            avg_color_hex = "#%0.2X%0.2X%0.2X" % avg_color
            metadata_item['color'] = avg_color_hex
            # export thumb as an inline JPEG data URI
            thumb_data_uri = 'data:image/jpeg;base64,' + base64.b64encode(
                thumb.execute_transforms(output_encoding=images.JPEG, quality=40))
            # append the metadata
            archive_metadata.append(metadata_item)
            archive_image_blobs.append(thumb_data_uri)
        self.response.out.write('query: from ' + repr(query_from) + ' to '
                                + repr(query_to) + '<br>')
        self.response.out.write('artworks: ' + str(len(artwork_objs)) + '<br>')
        #self.response.out.write('<pre>' + json.dumps(archive_metadata, indent=2) + '</pre>')
        # create the archive contents: metadata line, then one blob per line
        s = json.dumps(archive_metadata) + '\n'
        for blob in archive_image_blobs:
            s += blob + '\n'
        # gzip and write the archive
        gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + archive_name + '.txt'
        self.response.out.write('writing to: ' + gcs_path + '<br>')
        gcsf = gcs.open(gcs_path, 'w',
                        content_type='text/plain',
                        options={'content-encoding':'gzip'})
        gcsf.write(gzip_compress(s))
        gcsf.close()
        # delete the previous (superseded) partial archive
        if latest_archive_gcs_path:
            gcs.delete(latest_archive_gcs_path)