Пример #1
0
  def post(self):
    """Create a FeaturedArtwork from the posted JSON payload and crop spec."""
    artwork = json.loads(self.request.get('json'))
    crop = tuple(float(v) for v in json.loads(self.request.get('crop')))
    # publishDate arrives as epoch milliseconds; keep only the date part.
    publish_date = (datetime.datetime
        .utcfromtimestamp(artwork['publishDate'] / 1000)
        .date())

    new_image_url, new_thumb_url = maybe_process_image(
        artwork['imageUri'], crop,
        ' '.join([publish_date.strftime('%Y%m%d'),
                  artwork['title'], artwork['byline']]))

    # Fall back to the client-supplied thumbnail when processing made none.
    if not new_thumb_url and 'thumbUri' in artwork:
      new_thumb_url = artwork['thumbUri']

    new_artwork = FeaturedArtwork(
        title=artwork['title'],
        byline=artwork['byline'],
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
Пример #2
0
    def post(self):
        """Create a FeaturedArtwork from the posted JSON, rejecting duplicates.

        Reads 'json' (artwork fields) and 'crop' (crop rectangle of floats)
        request parameters. Responds 409 if artwork already exists for the
        publish date, 200 on success.
        """
        artwork_json = json.loads(self.request.get('json'))

        # publishDate is epoch milliseconds.
        publish_date = (datetime.datetime.utcfromtimestamp(
            artwork_json['publishDate'] / 1000).date())
        # BUG FIX: GAE db filter strings are '<property> <operator>';
        # 'publish_date=' (no space) does not name the property correctly.
        if FeaturedArtwork.all().filter('publish_date =',
                                        publish_date).get() is not None:
            webapp2.abort(409, message='Artwork already exists for this date.')

        crop_tuple = tuple(
            float(x) for x in json.loads(self.request.get('crop')))

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json['imageUri'], crop_tuple,
            publish_date.strftime('%Y%m%d') + ' ' + artwork_json['title'] +
            ' ' + artwork_json['byline'])

        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and 'thumbUri' in artwork_json:
            new_thumb_url = artwork_json['thumbUri']
        new_artwork = FeaturedArtwork(
            title=artwork_json['title'],
            byline=artwork_json['byline'],
            attribution=artwork_json.get('attribution'),
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork_json['detailsUri'],
            publish_date=publish_date)
        new_artwork.save()
        self.response.set_status(200)
Пример #3
0
    def process_html(self, url, html):
        """Parse a wikiart-style painting page and store a FeaturedArtwork.

        Responds 500 if required fields cannot be parsed, 200 on success.
        """
        soup = BeautifulSoup(html)

        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the regex flags must be passed by keyword.
        details_url = re.sub(r"#.+", "", url, flags=re.I | re.S) + "?utm_source=Muzei&utm_campaign=Muzei"
        title = soup.select("h1 span")[0].get_text()
        author = soup.find(itemprop="author").get_text()
        completion_year_el = soup.find(itemprop="dateCreated")
        byline = author + ((", " + completion_year_el.get_text()) if completion_year_el else "")
        image_url = soup.find(id="paintingImage")["href"]

        if not title or not author or not image_url:
            self.response.out.write("Could not parse HTML")
            self.response.set_status(500)
            return

        # publishDate request parameter is epoch milliseconds.
        publish_date = datetime.datetime.utcfromtimestamp(int(self.request.get("publishDate")) / 1000).date()
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE, publish_date.strftime("%Y%m%d") + " " + title + " " + byline
        )

        # create the artwork entry
        new_artwork = FeaturedArtwork(
            title=title,
            byline=byline,
            image_url=image_url,
            thumb_url=thumb_url,
            details_url=details_url,
            publish_date=publish_date,
        )
        new_artwork.save()
        self.response.set_status(200)
Пример #4
0
    def process_html(self, url, html):
        """Parse a wikiart-style painting page and store a FeaturedArtwork.

        Responds 500 if required fields cannot be parsed, 200 on success.
        """
        soup = BeautifulSoup(html)

        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the regex flags must be passed by keyword.
        details_url = re.sub(r'#.+', '', url,
                             flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.find(itemprop='name').get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']

        if not title or not author or not image_url:
            self.response.out.write('Could not parse HTML')
            self.response.set_status(500)
            return

        # publishDate request parameter is epoch milliseconds.
        publish_date = (datetime.datetime.utcfromtimestamp(
            int(self.request.get('publishDate')) / 1000).date())
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE,
            publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

        # create the artwork entry
        new_artwork = FeaturedArtwork(title=title,
                                      byline=byline,
                                      image_url=image_url,
                                      thumb_url=thumb_url,
                                      details_url=details_url,
                                      publish_date=publish_date)
        new_artwork.save()
        self.response.set_status(200)
Пример #5
0
    def post(self):
        """Create a FeaturedArtwork from the posted JSON, rejecting duplicates.

        Responds 409 if artwork already exists for the date, 200 on success.
        """
        artwork_json = json.loads(self.request.get("json"))

        # publishDate is epoch milliseconds.
        publish_date = datetime.datetime.utcfromtimestamp(artwork_json["publishDate"] / 1000).date()
        # BUG FIX: GAE db filter strings are '<property> <operator>';
        # "publish_date=" (no space) does not name the property correctly.
        if FeaturedArtwork.all().filter("publish_date =", publish_date).get() is not None:
            webapp2.abort(409, message="Artwork already exists for this date.")

        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json["imageUri"],
            crop_tuple,
            publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"],
        )

        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and "thumbUri" in artwork_json:
            new_thumb_url = artwork_json["thumbUri"]
        new_artwork = FeaturedArtwork(
            title=artwork_json["title"],
            byline=artwork_json["byline"],
            attribution=artwork_json.get("attribution"),
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork_json["detailsUri"],
            publish_date=publish_date,
        )
        new_artwork.save()
        self.response.set_status(200)
Пример #6
0
  def post(self):
    """Create a FeaturedArtwork from the posted JSON, rejecting duplicates.

    Responds 409 if artwork already exists for the date, 200 on success.
    """
    artwork_json = json.loads(self.request.get('json'))

    # publishDate is epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(artwork_json['publishDate'] / 1000)
        .date())
    # BUG FIX: GAE db filter strings are '<property> <operator>';
    # 'publish_date=' (no space) does not name the property correctly.
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
      webapp2.abort(409, message='Artwork already exists for this date.')

    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))

    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        publish_date.strftime('%Y%m%d') + ' '
            + artwork_json['title'] + ' '
            + artwork_json['byline'])

    # Keep the submitted thumbnail if processing did not produce one.
    if not new_thumb_url and 'thumbUri' in artwork_json:
      new_thumb_url = artwork_json['thumbUri']
    new_artwork = FeaturedArtwork(
        title=artwork_json['title'],
        byline=artwork_json['byline'],
        attribution=artwork_json.get('attribution'),
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
Пример #7
0
  def get(self):
    """Schedule random artworks from lt-artworks.json for upcoming dates.

    Skips dates that already have artwork and avoids repeating any of the
    latest 300 stored artworks. Honors 'dry-run=true' and 'output=html'
    request parameters.
    """
    # BUG FIX: use a context manager so the file handle is closed.
    with open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')) as artworks_file:
      ARTWORKS = json.loads(artworks_file.read())

    # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS)

    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))

    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)

    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]

    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

    logging.debug('starting blacklist size: %d' % len(blacklist))

    chosen_artworks = []

    for target_date in target_dates:
      # Pick from available artworks, excluding artwork in the blacklist
      random_artwork = None
      while True:
        if len(ARTWORKS) == 0:
          logging.error('Ran out of artworks to choose from, cannot continue')
          return

        random_artwork = random.choice(ARTWORKS)
        key = artwork_key(random_artwork['detailsUri'])
        if key not in blacklist:
          # Once chosen, remove it from the list of artworks to choose next
          ARTWORKS.remove(random_artwork)
          chosen_artworks.append(random_artwork)
          break

      target_details_url = str(random_artwork['detailsUri'])
      logging.debug('%(date)s: setting to %(url)s' % dict(url=target_details_url, date=target_date))

      # Store the new artwork
      if self.request.get('dry-run', '') != 'true':
        new_artwork = FeaturedArtwork(
            title=random_artwork['title'],
            byline=random_artwork['byline'],
            attribution=random_artwork['attribution'],
            image_url=random_artwork['imageUri'],
            thumb_url=random_artwork['thumbUri'],
            details_url=random_artwork['detailsUri'],
            publish_date=target_date)
        new_artwork.save()

    if self.request.get('output', '') == 'html':
      self.response.out.write(get_html(artworks_json=json.dumps(chosen_artworks)))

    # Finish up
    logging.debug('done')
Пример #8
0
  def process_html(self, url, html):
    """Parse a wikiart-style painting page and store a FeaturedArtwork.

    Responds 500 if required fields cannot be parsed, 200 on success.
    """
    soup = BeautifulSoup(html)

    # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
    # the regex flags must be passed by keyword.
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']

    if not title or not author or not image_url:
      self.response.out.write('Could not parse HTML')
      self.response.set_status(500)
      return

    # publishDate request parameter is epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
Пример #9
0
def add_art_from_external_details_url(publish_date, url):
  """Scrape an artwork details page (wikiart.org or metmuseum.org) and
  store it as the FeaturedArtwork for publish_date.

  Aborts with 409 if the date already has artwork, 400 on fetch errors or
  unrecognized URLs, 500 if required fields cannot be parsed.

  Returns:
    The saved FeaturedArtwork entity.
  """
  if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
    webapp2.abort(409, message='Artwork already exists for this date.')

  result = urlfetch.fetch(url)
  if result.status_code < 200 or result.status_code >= 300:
    webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
        % (result.status_code, result.content))

  soup = BeautifulSoup(result.content)
  attribution = None

  if re.search(r'wikiart.org', url, re.I) or re.search(r'wikipaintings.org', url, re.I):
    attribution = 'wikiart.org'
    # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
    # the regex flags must be passed by keyword.
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.select('h1 span')[0].get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
  elif re.search(r'metmuseum.org', url, re.I):
    attribution = 'metmuseum.org'
    details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find('h2').get_text()
    author = ''
    try:
      author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
    except Exception:
      # Best effort: some pages omit the artist row; keep the empty default.
      pass
    author = re.sub(r'\s*\(.*', '', author)
    completion_year_el = None
    try:
      completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
    except Exception:
      # Best effort: the completion date is optional.
      pass
    byline = author + ((', ' + completion_year_el) if completion_year_el else '')
    image_url = soup.find('a', class_='download').attrs['href']
  else:
    webapp2.abort(400, message='Unrecognized URL')

  if not title or not author or not image_url:
    webapp2.abort(500, message='Could not parse HTML')

  image_url, thumb_url = maybe_process_image(image_url,
      NO_CROP_TUPLE,
      publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

  # create the artwork entry
  new_artwork = FeaturedArtwork(
      title=title,
      byline=byline,
      attribution=attribution,
      image_url=image_url,
      thumb_url=thumb_url,
      details_url=details_url,
      publish_date=publish_date)
  new_artwork.save()

  return new_artwork
Пример #10
0
    def get(self):
        """Schedule random artworks for upcoming dates, writing progress
        as HTML to the response."""
        # BUG FIX: use a context manager so the file handle is closed.
        with open(os.path.join(os.path.split(__file__)[0],
                               'lt-artworks.json')) as artworks_file:
            ARTWORKS = json.loads(artworks_file.read())

        # Fetch latest 300 artworks (for blacklisting)
        latest_artworks = (
            FeaturedArtwork.all().order('-publish_date').fetch(300))

        # List dates for which artwork exists
        dates_with_existing_art = set(a.publish_date for a in latest_artworks)

        # List target dates that we want artwork for, but for which no artwork exists
        target_dates = [
            date.today() + timedelta(days=n)
            for n in range(-1, LOOKAHEAD_DAYS)
        ]
        target_dates = [
            d for d in target_dates if d not in dates_with_existing_art
        ]

        # Create a blacklist of keys to avoid repeats
        blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

        self.response.out.write('starting blacklist size: %d<br>' %
                                len(blacklist))

        for target_date in target_dates:
            # Pick from available artworks, excluding artwork in the blacklist
            # NOTE(review): loops forever if every candidate is blacklisted —
            # confirm the candidate pool always exceeds the blacklist.
            random_artwork = None
            while True:
                random_artwork = random.choice(ARTWORKS)
                key = artwork_key(random_artwork['detailsUri'])
                if key not in blacklist:
                    # Once chosen, add to the blacklist to avoid repeats within the lookahead
                    blacklist.add(key)
                    break

            target_details_url = str(random_artwork['detailsUri'])
            self.response.out.write(
                '%(date)s: setting to <b>%(url)s</b><br>' %
                dict(url=target_details_url, date=target_date))

            # Store the new artwork
            new_artwork = FeaturedArtwork(
                title=random_artwork['title'],
                byline=random_artwork['byline'],
                attribution=random_artwork['attribution'],
                image_url=random_artwork['imageUri'],
                thumb_url=random_artwork['thumbUri'],
                details_url=random_artwork['detailsUri'],
                publish_date=target_date)
            new_artwork.save()

        # Finish up
        self.response.out.write('done<br>')
Пример #11
0
 def post(self):
   """Store a FeaturedArtwork built directly from the posted JSON payload."""
   data = json.loads(self.request.get('json'))
   # publishDate arrives as epoch milliseconds; keep only the date part.
   when = (datetime.datetime
       .utcfromtimestamp(data['publishDate'] / 1000)
       .date())
   # thumbUri is optional in the payload.
   thumb = data['thumbUri'] if 'thumbUri' in data else None
   new_artwork = FeaturedArtwork(
       title=data['title'],
       byline=data['byline'],
       image_url=data['imageUri'],
       thumb_url=thumb,
       details_url=data['detailsUri'],
       publish_date=when)
   new_artwork.save()
   self.response.set_status(200)
 def post(self):
     """Store a FeaturedArtwork, deriving a thumbnail URL when none given."""
     data = json.loads(self.request.get('json'))
     if 'thumbUri' in data:
         thumb = data['thumbUri']
     else:
         # Derive a small thumbnail from the full image URL.
         thumb = data['imageUri'] + '!BlogSmall.jpg'
     new_artwork = FeaturedArtwork(
         title=data['title'],
         byline=data['byline'],
         image_url=data['imageUri'],
         thumb_url=thumb,
         details_url=data['detailsUri'],
         publish_date=datetime.datetime.utcfromtimestamp(
             data['publishDate'] / 1000).date())
     new_artwork.save()
     self.response.set_status(200)
Пример #13
0
    def process_html(self, url, html):
        """Parse a wikiart or metmuseum artwork page and store it as a
        FeaturedArtwork, echoing the created record as JSON.

        Responds 500 on unrecognized URLs or unparseable pages.
        """
        soup = BeautifulSoup(html)

        if re.search(r'wikiart.org', url, re.I):
            # BUG FIX: re.sub's 4th positional argument is `count`, not
            # `flags`; the regex flags must be passed by keyword.
            details_url = re.sub(r'#.+', '', url,
                                 flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
            title = soup.select('h1 span')[0].get_text()
            author = soup.find(itemprop='author').get_text()
            completion_year_el = soup.find(itemprop='dateCreated')
            byline = author + ((', ' + completion_year_el.get_text())
                               if completion_year_el else '')
            image_url = soup.find(id='paintingImage')['href']
        elif re.search(r'metmuseum.org', url, re.I):
            details_url = re.sub(r'[#?].+', '', url,
                                 flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
            title = soup.find('h2').get_text()
            author = unicode(
                soup.find(text='Artist:').parent.next_sibling).strip()
            author = re.sub(r'\s*\(.*', '', author)
            completion_year_el = unicode(
                soup.find(text='Date:').parent.next_sibling).strip()
            byline = author + (
                (', ' + completion_year_el) if completion_year_el else '')
            image_url = soup.find('a', class_='download').attrs['href']
        else:
            self.response.out.write('Unrecognized URL')
            self.response.set_status(500)
            return

        if not title or not author or not image_url:
            self.response.out.write('Could not parse HTML')
            self.response.set_status(500)
            return

        # publishDate request parameter is epoch milliseconds.
        publish_date = (datetime.datetime.utcfromtimestamp(
            int(self.request.get('publishDate')) / 1000).date())
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE,
            publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

        # create the artwork entry
        new_artwork = FeaturedArtwork(title=title,
                                      byline=byline,
                                      image_url=image_url,
                                      thumb_url=thumb_url,
                                      details_url=details_url,
                                      publish_date=publish_date)
        new_artwork.save()

        self.response.set_status(200)
        self.response.out.write(json.dumps(artwork_dict(new_artwork)))
Пример #14
0
    def post(self):
        """Update an existing FeaturedArtwork from the posted JSON and crop.

        Aborts with 404 if the id is unknown; echoes the updated record
        as JSON on success.
        """
        # Renamed from `id` to avoid shadowing the builtin.
        artwork_id = long(self.request.get("id"))
        artwork_json = json.loads(self.request.get("json"))
        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
        target_artwork = FeaturedArtwork.get_by_id(artwork_id)
        if not target_artwork:
            webapp2.abort(404)

        target_artwork.title = artwork_json["title"]
        target_artwork.byline = artwork_json["byline"]
        target_artwork.attribution = artwork_json.get("attribution")

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json["imageUri"],
            crop_tuple,
            target_artwork.publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"],
        )
        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and "thumbUri" in artwork_json:
            new_thumb_url = artwork_json["thumbUri"]

        target_artwork.image_url = new_image_url
        target_artwork.thumb_url = new_thumb_url
        target_artwork.details_url = artwork_json["detailsUri"]
        target_artwork.save()

        self.response.set_status(200)
        self.response.out.write(json.dumps(artwork_dict(target_artwork)))
Пример #15
0
    def post(self):
        """Update an existing FeaturedArtwork from the posted JSON and crop.

        Responds 404 if the id is unknown, 200 on success.
        """
        # Renamed from `id` to avoid shadowing the builtin.
        artwork_id = long(self.request.get('id'))
        artwork_json = json.loads(self.request.get('json'))
        crop_tuple = tuple(
            float(x) for x in json.loads(self.request.get('crop')))
        target_artwork = FeaturedArtwork.get_by_id(artwork_id)
        if not target_artwork:
            self.response.set_status(404)
            return

        target_artwork.title = artwork_json['title']
        target_artwork.byline = artwork_json['byline']

        new_image_url, new_thumb_url = maybe_process_image(
            artwork_json['imageUri'], crop_tuple,
            target_artwork.publish_date.strftime('%Y%m%d') + ' ' +
            artwork_json['title'] + ' ' + artwork_json['byline'])
        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and 'thumbUri' in artwork_json:
            new_thumb_url = artwork_json['thumbUri']

        target_artwork.image_url = new_image_url
        target_artwork.thumb_url = new_thumb_url
        target_artwork.details_url = artwork_json['detailsUri']
        target_artwork.save()
        self.response.set_status(200)
Пример #16
0
    def post(self):
        """Update an existing FeaturedArtwork from the posted JSON and crop.

        Responds 404 if the id is unknown, 200 on success.
        """
        # Renamed from `id` to avoid shadowing the builtin.
        artwork_id = long(self.request.get("id"))
        artwork_json = json.loads(self.request.get("json"))
        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
        target_artwork = FeaturedArtwork.get_by_id(artwork_id)
        if not target_artwork:
            self.response.set_status(404)
            return

        target_artwork.title = artwork_json["title"]
        target_artwork.byline = artwork_json["byline"]

        new_image_url, new_thumb_url = maybe_process_image(
            artwork_json["imageUri"],
            crop_tuple,
            target_artwork.publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"],
        )
        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and "thumbUri" in artwork_json:
            new_thumb_url = artwork_json["thumbUri"]

        target_artwork.image_url = new_image_url
        target_artwork.thumb_url = new_thumb_url
        target_artwork.details_url = artwork_json["detailsUri"]
        target_artwork.save()
        self.response.set_status(200)
Пример #17
0
  def post(self):
    """Update an existing FeaturedArtwork from the posted JSON and crop.

    Responds 404 if the id is unknown, 200 on success.
    """
    # Renamed from `id` to avoid shadowing the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
      self.response.set_status(404)
      return

    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']

    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
            + artwork_json['title'] + ' '
            + artwork_json['byline'])
    # Keep the submitted thumbnail if processing did not produce one.
    if not new_thumb_url and 'thumbUri' in artwork_json:
      new_thumb_url = artwork_json['thumbUri']

    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
Пример #18
0
  def post(self):
    """Update an existing FeaturedArtwork from the posted JSON and crop.

    Aborts with 404 if the id is unknown; echoes the updated record as
    JSON on success.
    """
    # Renamed from `id` to avoid shadowing the builtin.
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if not target_artwork:
      webapp2.abort(404)

    target_artwork.title = artwork_json['title']
    target_artwork.byline = artwork_json['byline']
    target_artwork.attribution = artwork_json.get('attribution')

    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        target_artwork.publish_date.strftime('%Y%m%d') + ' '
            + artwork_json['title'] + ' '
            + artwork_json['byline'])
    # Keep the submitted thumbnail if processing did not produce one.
    if not new_thumb_url and 'thumbUri' in artwork_json:
      new_thumb_url = artwork_json['thumbUri']

    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = artwork_json['detailsUri']
    target_artwork.save()

    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(target_artwork)))
Пример #19
0
 def move_artwork(self, artwork, publish_date, initial_artwork_id):
     """Move artwork to publish_date, cascading any occupant one day later.

     Recursion stops when a slot is free or when the occupant is the
     artwork that started the move (a cycle back to the origin).
     """
     occupant = FeaturedArtwork.all().filter("publish_date =", publish_date).get()
     if occupant and occupant.key().id() != initial_artwork_id:
         # Push the current occupant forward 24h before taking its slot.
         self.move_artwork(occupant, publish_date + datetime.timedelta(hours=24), initial_artwork_id)
     artwork.publish_date = publish_date
     artwork.save()
Пример #20
0
    def post(self):
        """Update an existing FeaturedArtwork from the posted JSON and crop.

        Aborts with 404 if the id is unknown; echoes the updated record
        as JSON on success.
        """
        # Renamed from `id` to avoid shadowing the builtin.
        artwork_id = long(self.request.get('id'))
        artwork_json = json.loads(self.request.get('json'))
        crop_tuple = tuple(
            float(x) for x in json.loads(self.request.get('crop')))
        target_artwork = FeaturedArtwork.get_by_id(artwork_id)
        if not target_artwork:
            webapp2.abort(404)

        target_artwork.title = artwork_json['title']
        target_artwork.byline = artwork_json['byline']
        target_artwork.attribution = artwork_json.get('attribution')

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json['imageUri'], crop_tuple,
            target_artwork.publish_date.strftime('%Y%m%d') + ' ' +
            artwork_json['title'] + ' ' + artwork_json['byline'])
        # Keep the submitted thumbnail if processing did not produce one.
        if not new_thumb_url and 'thumbUri' in artwork_json:
            new_thumb_url = artwork_json['thumbUri']

        target_artwork.image_url = new_image_url
        target_artwork.thumb_url = new_thumb_url
        target_artwork.details_url = artwork_json['detailsUri']
        target_artwork.save()

        self.response.set_status(200)
        self.response.out.write(json.dumps(artwork_dict(target_artwork)))
Пример #21
0
  def process_html(self, url, html):
    """Parse a wikiart or metmuseum artwork page and store it as a
    FeaturedArtwork, echoing the created record as JSON.

    Responds 500 on unrecognized URLs or unparseable pages.
    """
    soup = BeautifulSoup(html)

    if re.search(r'wikiart.org', url, re.I):
      # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
      # the regex flags must be passed by keyword.
      details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.select('h1 span')[0].get_text()
      author = soup.find(itemprop='author').get_text()
      completion_year_el = soup.find(itemprop='dateCreated')
      byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
      image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
      details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.find('h2').get_text()
      author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
      author = re.sub(r'\s*\(.*', '', author)
      completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
      byline = author + ((', ' + completion_year_el) if completion_year_el else '')
      image_url = soup.find('a', class_='download').attrs['href']
    else:
      self.response.out.write('Unrecognized URL')
      self.response.set_status(500)
      return

    if not title or not author or not image_url:
      self.response.out.write('Could not parse HTML')
      self.response.set_status(500)
      return

    # publishDate request parameter is epoch milliseconds.
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()

    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
Пример #22
0
  def get(self):
    """Schedule random artworks for upcoming dates, reporting progress as HTML."""
    # BUG FIX: use a context manager so the file handle is closed.
    with open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')) as artworks_file:
      ARTWORKS = json.loads(artworks_file.read())

    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))

    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)

    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]

    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

    self.response.out.write('starting blacklist size: %d<br>' % len(blacklist))

    for target_date in target_dates:
      # Pick from available artworks, excluding artwork in the blacklist
      # NOTE(review): loops forever if every candidate is blacklisted —
      # confirm the candidate pool always exceeds the blacklist.
      random_artwork = None
      while True:
        random_artwork = random.choice(ARTWORKS)
        key = artwork_key(random_artwork['detailsUri'])
        if key not in blacklist:
          # Once chosen, add to the blacklist to avoid repeats within the lookahead
          blacklist.add(key)
          break

      target_details_url = str(random_artwork['detailsUri'])
      self.response.out.write('%(date)s: setting to <b>%(url)s</b><br>' % dict(url=target_details_url, date=target_date))

      # Store the new artwork
      new_artwork = FeaturedArtwork(
          title=random_artwork['title'],
          byline=random_artwork['byline'],
          attribution=random_artwork['attribution'],
          image_url=random_artwork['imageUri'],
          thumb_url=random_artwork['thumbUri'],
          details_url=random_artwork['detailsUri'],
          publish_date=target_date)
      new_artwork.save()

    # Finish up
    self.response.out.write('done<br>')
Пример #23
0
  def render_with_headers(self, callback):
    """Build the featured-artwork JSON response body and HTTP cache headers.

    Args:
      callback: optional JSONP callback name; when truthy, the JSON body
        is wrapped as 'callback(body)'.

    Returns:
      A (body, headers) tuple: the JSON (or JSONP) string and a dict of
      HTTP headers controlling client/proxy caching.
    """
    now = datetime.utcnow()
    headers = {}
    current = None

    # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
    latest_artworks = (FeaturedArtwork.all()
        .filter('publish_date <=', date.today() + timedelta(days=2))
        .order('-publish_date')
        .fetch(5))

    # Pick out the first artwork in that set that has actually been published
    for artwork in latest_artworks:
      if now >= datetime.combine(artwork.publish_date, START_TIME):
        current = artwork
        break

    ret_obj = dict()
    if current is not None:
      # Found the next featured artwork
      ret_obj = dict(
          title=current.title.strip(),
          byline=current.byline.strip(),
          imageUri=current.image_url,
          detailsUri=current.details_url)
      # Optional fields are omitted from the payload rather than sent null.
      if current.thumb_url:
        ret_obj['thumbUri'] = current.thumb_url
      if current.attribution:
        ret_obj['attribution'] = current.attribution

      # The next update time is the next START_TIME
      next_start_time = datetime.combine(date.today(), START_TIME)
      while next_start_time < now:
        next_start_time += timedelta(hours=24)

      ret_obj['nextTime'] = _serialize_datetime(next_start_time + NEXT_PADDING)

      # Caches expire in an hour, but no later than the next start time minus padding
      cache_expire_time = min(
          now + MAX_HTTP_CACHE_AGE,
          next_start_time)
      expire_seconds = max(0, (cache_expire_time - now).total_seconds())

      # Note that this max-age header will be cached, so max-age may be off by the memcache
      # cache time which is set above to 60 seconds
      headers['Cache-Control'] = 'max-age=%d, must-revalidate, public' % expire_seconds
      headers['Expires'] = cache_expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
      headers['Pragma'] = 'public'

    else:
      # Found no featured artwork; hopefully this is temporary; don't cache this response
      headers['Cache-Control'] = 'max-age=0, no-cache, no-store'
      headers['Pragma'] = 'no-cache'

    # sort_keys keeps the body byte-stable for identical inputs.
    body = json.dumps(ret_obj, sort_keys=True)
    if callback:
      body = '%s(%s)' % (callback, body)

    return (body, headers)
Пример #24
0
 def post(self):
     """Store a new FeaturedArtwork described by the posted JSON blob."""
     artwork = json.loads(self.request.get('json'))

     # Process/mirror the image, deriving a thumbnail where possible.
     new_image_url, new_thumb_url = maybe_process_image(
         artwork['imageUri'],
         artwork['title'] + ' ' + artwork['byline'])
     if not new_thumb_url:
         # Fall back on the thumb the client provided, when there is one.
         new_thumb_url = artwork.get('thumbUri', new_thumb_url)

     publish_date = (datetime.datetime
         .utcfromtimestamp(artwork['publishDate'] / 1000)
         .date())
     FeaturedArtwork(
         title=artwork['title'],
         byline=artwork['byline'],
         image_url=new_image_url,
         thumb_url=new_thumb_url,
         details_url=artwork['detailsUri'],
         publish_date=publish_date).save()
     self.response.set_status(200)
Пример #25
0
    def post(self):
        """Delete the FeaturedArtwork whose id is posted; abort 404 when absent."""
        artwork = FeaturedArtwork.get_by_id(long(self.request.get("id")))
        if artwork is None:
            webapp2.abort(404)
        artwork.delete()
        self.response.set_status(200)
Пример #26
0
 def post(self):
     """Delete the artwork with the given id, replying 404 if it is unknown."""
     target = FeaturedArtwork.get_by_id(long(self.request.get('id')))
     if target is None:
         self.response.set_status(404)
         return
     target.delete()
     self.response.set_status(200)
Пример #27
0
 def post(self):
   """Remove a FeaturedArtwork by id; responds 404 when no such entity exists."""
   artwork_id = long(self.request.get('id'))
   artwork = FeaturedArtwork.get_by_id(artwork_id)
   if artwork is None:
     self.response.set_status(404)
     return
   artwork.delete()
   self.response.set_status(200)
Пример #28
0
 def render(self):
     """Return JSON for artworks from two weeks before the requested month on."""
     # month is presumably zero-based from the client, hence the +1 — verify
     # against the caller.
     month = int(self.request.get('month')) + 1
     year = int(self.request.get('year'))
     window_start = datetime.date(year, month, 1) - datetime.timedelta(weeks=2)
     artworks = (FeaturedArtwork.all()
         .filter('publish_date >=', window_start)
         .order('publish_date')
         .fetch(1000))
     return json.dumps([artwork_dict(a) for a in artworks])
Пример #29
0
    def render_with_headers(self, callback):
        """Produce the featured-artwork JSON body and its cache headers.

        Returns a (body, headers) pair; the body is JSONP-wrapped when a
        callback name is given.
        """
        now = datetime.utcnow()

        # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
        horizon = date.today() + timedelta(days=2)
        recent = (FeaturedArtwork.all()
                  .filter('publish_date <=', horizon)
                  .order('-publish_date')
                  .fetch(5))

        # Pick out the first artwork in that set that has actually been published
        current = None
        for art in recent:
            if datetime.combine(art.publish_date, START_TIME) <= now:
                current = art
                break

        if current is None:
            # Found no featured artwork; hopefully this is temporary; don't cache this response
            headers = {
                'Cache-Control': 'max-age=0, no-cache, no-store',
                'Pragma': 'no-cache',
            }
            body = json.dumps({}, sort_keys=True)
            if callback:
                body = '%s(%s)' % (callback, body)
            return (body, headers)

        # Found the next featured artwork
        ret_obj = dict(title=current.title.strip(),
                       byline=current.byline.strip(),
                       imageUri=current.image_url,
                       detailsUri=current.details_url)
        if current.thumb_url:
            ret_obj['thumbUri'] = current.thumb_url
        if current.attribution:
            ret_obj['attribution'] = current.attribution

        # The next update time is the next occurrence of START_TIME
        next_start_time = datetime.combine(date.today(), START_TIME)
        while next_start_time < now:
            next_start_time += timedelta(hours=24)
        ret_obj['nextTime'] = _serialize_datetime(next_start_time + NEXT_PADDING)

        # Caches expire in an hour, but no later than the next start time
        cache_expire_time = min(now + MAX_HTTP_CACHE_AGE, next_start_time)
        expire_seconds = max(0, (cache_expire_time - now).total_seconds())

        # Note that this max-age header will be cached, so max-age may be off by the memcache
        # cache time which is set above to 60 seconds
        headers = {
            'Cache-Control':
                'max-age=%d, must-revalidate, public' % expire_seconds,
            'Expires': cache_expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT'),
            'Pragma': 'public',
        }

        body = json.dumps(ret_obj, sort_keys=True)
        if callback:
            body = '%s(%s)' % (callback, body)
        return (body, headers)
Пример #30
0
    def post(self):
        """Reschedule an artwork to a new publish date, shifting any conflicts."""
        artwork_id = long(self.request.get("id"))
        timestamp_ms = long(self.request.get("publishDate"))
        publish_date = datetime.datetime.utcfromtimestamp(timestamp_ms / 1000).date()

        target_artwork = FeaturedArtwork.get_by_id(artwork_id)
        if target_artwork is None:
            webapp2.abort(404)

        # shift other artworks over
        self.move_artwork(target_artwork, publish_date, target_artwork.key().id())
        self.response.set_status(200)
Пример #31
0
 def render(self):
   """JSON-render artworks starting two weeks before the requested month."""
   # month is presumably zero-based from the client, hence the +1.
   first_of_month = datetime.date(
       day=1,
       month=int(self.request.get('month')) + 1,
       year=int(self.request.get('year')))
   start = first_of_month - datetime.timedelta(weeks=2)
   artworks = (FeaturedArtwork.all()
       .filter('publish_date >=', start)
       .order('publish_date')
       .fetch(1000))
   return json.dumps([artwork_dict(a) for a in artworks])
Пример #32
0
 def move_artwork(self, artwork, publish_date, initial_artwork_id):
     """Place *artwork* on *publish_date*, cascading any occupant forward.

     An artwork already sitting on the target date (other than the one that
     started the move) is itself pushed one day later, recursively.
     """
     occupant = (FeaturedArtwork.all()
                 .filter('publish_date =', publish_date)
                 .get())
     if occupant is not None and occupant.key().id() != initial_artwork_id:
         next_day = publish_date + datetime.timedelta(hours=24)
         self.move_artwork(occupant, next_day, initial_artwork_id)
     artwork.publish_date = publish_date
     artwork.save()
Пример #33
0
    def post(self):
        """Move the identified artwork to the posted publish date."""
        artwork_id = long(self.request.get('id'))
        publish_date = (datetime.datetime.utcfromtimestamp(
            long(self.request.get('publishDate')) / 1000).date())

        target = FeaturedArtwork.get_by_id(artwork_id)
        if target is None:
            webapp2.abort(404)

        # shift other artworks over
        self.move_artwork(target, publish_date, target.key().id())
        self.response.set_status(200)
Пример #34
0
  def post(self):
    """Reschedule an artwork, shifting any artwork already on that date."""
    artwork_id = long(self.request.get('id'))
    millis = long(self.request.get('publishDate'))
    publish_date = datetime.datetime.utcfromtimestamp(millis / 1000).date()

    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if target_artwork is None:
      self.response.set_status(404)
      return

    # shift other artworks over
    self.move_artwork(target_artwork, publish_date, target_artwork.key().id())
    self.response.set_status(200)
Пример #35
0
 def render(self):
   """Serialize the last ~30 days of scheduled artworks as JSON."""
   cutoff = datetime.date.today() - datetime.timedelta(days=30)
   artworks = (FeaturedArtwork.all()
       .filter('publish_date >=', cutoff)
       .order('publish_date')
       .fetch(1000))
   def to_dict(a):
     # Flatten an entity into the wire format the dashboard expects.
     return dict(
         id=a.key().id(),
         title=a.title,
         byline=a.byline,
         imageUri=a.image_url,
         thumbUri=a.thumb_url,
         detailsUri=a.details_url,
         publishDate=date_to_timestamp(a.publish_date))
   return json.dumps([to_dict(a) for a in artworks])
Пример #36
0
    def post(self):
        """Create a FeaturedArtwork from posted JSON plus a crop rectangle."""
        artwork = json.loads(self.request.get("json"))
        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
        publish_date = datetime.datetime.utcfromtimestamp(
            artwork["publishDate"] / 1000).date()

        # Name the stored image "YYYYMMDD title byline".
        image_name = (publish_date.strftime("%Y%m%d") + " " +
                      artwork["title"] + " " + artwork["byline"])
        new_image_url, new_thumb_url = maybe_process_image(
            artwork["imageUri"], crop_tuple, image_name)
        if not new_thumb_url:
            # Fall back on a client-provided thumbnail when available.
            new_thumb_url = artwork.get("thumbUri", new_thumb_url)

        FeaturedArtwork(
            title=artwork["title"],
            byline=artwork["byline"],
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork["detailsUri"],
            publish_date=publish_date,
        ).save()
        self.response.set_status(200)
Пример #37
0
 def render(self):
     """Dump the artworks scheduled within the last 30 days to JSON."""
     window_start = datetime.date.today() - datetime.timedelta(days=30)
     recent = (FeaturedArtwork.all()
               .filter('publish_date >=', window_start)
               .order('publish_date')
               .fetch(1000))
     payload = []
     for a in recent:
         payload.append(dict(
             id=a.key().id(),
             title=a.title,
             byline=a.byline,
             imageUri=a.image_url,
             thumbUri=a.thumb_url,
             detailsUri=a.details_url,
             publishDate=date_to_timestamp(a.publish_date)))
     return json.dumps(payload)
Пример #38
0
 def render(self):
   """Return JSON for all artworks published on/after the requested month."""
   month_start = datetime.date(
       day=1,
       month=int(self.request.get('month')),
       year=int(self.request.get('year')))
   artworks = (FeaturedArtwork.all()
       .filter('publish_date >=', month_start)
       .order('publish_date')
       .fetch(1000))
   return json.dumps([dict(
       id=a.key().id(),
       title=a.title,
       byline=a.byline,
       imageUri=a.image_url,
       thumbUri=a.thumb_url,
       detailsUri=a.details_url,
       publishDate=date_to_timestamp(a.publish_date))
       for a in artworks])
Пример #39
0
  def render(self, callback):
    """Render the featured-artwork JSON (optionally JSONP-wrapped).

    Also sets Cache-Control/Expires headers on the response when an
    artwork is found.
    """
    now = datetime.datetime.utcnow()
    current = None

    # Get up to 5 artworks published earlier than 2 days from now, ordered by latest first
    latest_artworks = (FeaturedArtwork.all()
        .filter('publish_date <=', datetime.date.today() + datetime.timedelta(days=2))
        .order('-publish_date')
        .fetch(5))

    # Pick out the first artwork in that set that has actually been published
    for artwork in latest_artworks:
      if now >= datetime.datetime.combine(artwork.publish_date, START_TIME):
        current = artwork
        break

    ret_obj = dict()
    if current is not None:
      featured = dict(
          title=current.title,
          byline=current.byline,
          imageUri=current.image_url,
          detailsUri=current.details_url)
      if current.thumb_url:
        featured['thumbUri'] = current.thumb_url

      # BUG FIX: the next update is the *next* occurrence of START_TIME,
      # which may still be later today. The old code unconditionally used
      # tomorrow's START_TIME, over-reporting nextTime by a day whenever
      # `now` was before today's START_TIME (the sibling render_with_headers
      # implementations use this same roll-forward loop).
      next_start = datetime.datetime.combine(datetime.date.today(), START_TIME)
      while next_start < now:
        next_start += datetime.timedelta(days=1)
      next_time = next_start + NEXT_PADDING
      featured['nextTime'] = _serialize_datetime(next_time)

      # Caches expire in an hour, but no later than the next start time minus 5 minutes
      cache_expire_time = min(
          datetime.datetime.now() + datetime.timedelta(hours=1),
          next_time - datetime.timedelta(minutes=5))
      expire_seconds = max(0, (cache_expire_time - now).total_seconds())
      self.response.headers['Cache-Control'] = 'max-age=%d, must-revalidate, public' % expire_seconds
      self.response.headers['Expires'] = cache_expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')

      ret_obj = featured

    s = json.dumps(ret_obj, sort_keys=True)
    if callback:
      return '%s(%s)' % (callback, s)
    else:
      return s
Пример #40
0
 def render(self):
     """JSON-serialize all artworks from the first of the given month onward."""
     start = datetime.date(day=1,
                           month=int(self.request.get('month')),
                           year=int(self.request.get('year')))
     rows = (FeaturedArtwork.all()
             .filter('publish_date >=', start)
             .order('publish_date')
             .fetch(1000))
     entries = [dict(id=a.key().id(),
                     title=a.title,
                     byline=a.byline,
                     imageUri=a.image_url,
                     thumbUri=a.thumb_url,
                     detailsUri=a.details_url,
                     publishDate=date_to_timestamp(a.publish_date))
                for a in rows]
     return json.dumps(entries)
Пример #41
0
  def post(self):
    """Update an existing FeaturedArtwork in place from posted JSON."""
    artwork_id = long(self.request.get('id'))
    artwork_json = json.loads(self.request.get('json'))
    target = FeaturedArtwork.get_by_id(artwork_id)
    if target is None:
      self.response.set_status(404)
      return

    target.title = artwork_json['title']
    target.byline = artwork_json['byline']
    target.image_url = artwork_json['imageUri']
    if 'thumbUri' in artwork_json:
      target.thumb_url = artwork_json['thumbUri']
    else:
      # Derive a thumbnail URL via the image host's size suffix.
      target.thumb_url = artwork_json['imageUri'] + '!BlogSmall.jpg'
    target.details_url = artwork_json['detailsUri']
    target.save()
    self.response.set_status(200)
    def post(self):
        """Overwrite a FeaturedArtwork's fields with the posted JSON values."""
        artwork_id = long(self.request.get('id'))
        artwork_json = json.loads(self.request.get('json'))
        target = FeaturedArtwork.get_by_id(artwork_id)
        if target is None:
            self.response.set_status(404)
            return

        # Thumbnail falls back to the image host's size-suffix convention.
        fallback_thumb = artwork_json['imageUri'] + '!BlogSmall.jpg'
        target.title = artwork_json['title']
        target.byline = artwork_json['byline']
        target.image_url = artwork_json['imageUri']
        target.thumb_url = artwork_json.get('thumbUri', fallback_thumb)
        target.details_url = artwork_json['detailsUri']
        target.save()
        self.response.set_status(200)
Пример #43
0
 def render(self):
     """JSON for artworks from two weeks before the requested month on."""
     year = int(self.request.get("year"))
     # month is presumably 0-based from the client, hence the +1.
     month = int(self.request.get("month")) + 1
     start = datetime.date(day=1, month=month, year=year) - datetime.timedelta(weeks=2)
     artworks = (FeaturedArtwork.all()
                 .filter("publish_date >=", start)
                 .order("publish_date")
                 .fetch(1000))
     return json.dumps([
         dict(
             id=a.key().id(),
             title=a.title,
             byline=a.byline,
             imageUri=a.image_url,
             thumbUri=a.thumb_url,
             detailsUri=a.details_url,
             publishDate=date_to_timestamp(a.publish_date),
         )
         for a in artworks
     ])
Пример #44
0
    def get(self):
        """Cron endpoint: fill upcoming empty dates by recycling old artworks.

        For each date in a [-1, +9) day window that has no artwork yet, picks
        a random previously-featured artwork (skipping recent repeats) and
        re-adds it for that date via backroomarthelper.
        """
        # Fetch latest 1000 artworks
        latest_artworks = (
            FeaturedArtwork.all().order('-publish_date').fetch(1000))

        # List dates for which artwork exists
        dates_with_existing_art = set(a.publish_date for a in latest_artworks)

        # List target dates that we want artwork for, but for which no artwork exists
        target_dates = [date.today() + timedelta(days=n) for n in range(-1, 9)]
        target_dates = [
            d for d in target_dates if d not in dates_with_existing_art
        ]

        for target_date in target_dates:
            self.response.out.write('looking for artwork for date ' +
                                    str(target_date) + '<br>')

            # Create a blacklist of the most recent 200 artwork
            # (don't want to repeat one of the last 200!)
            blacklist_artwork_keys = set(
                sanitized_artwork_key(a) for a in latest_artworks[:200])
            if len(blacklist_artwork_keys) < 5:
                blacklist_artwork_keys = set(
                )  # should never happen, but just in case of a reset

            # Pick from one of the oldest 500, excluding artwork in the blacklist
            # NOTE(review): latest_artworks[500:] is everything *older than the
            # newest 500*, not "the oldest 500"; with fewer than ~501 artworks
            # random.choice gets an empty list and raises IndexError. And if no
            # candidate from the whitelisted domains falls outside the
            # blacklist, this loop never terminates — confirm the datastore
            # always satisfies both assumptions.
            random_artwork = None
            while True:
                random_artwork = random.choice(latest_artworks[500:])
                key = sanitized_artwork_key(random_artwork)
                if 'wikiart.org' in key or 'wikipaintings.org' in key or 'metmuseum.org' in key:
                    if key not in blacklist_artwork_keys:
                        break

            target_details_url = str(random_artwork.details_url)
            self.response.out.write('recycling ' + target_details_url +
                                    ' for date ' + str(target_date) + '<br>')

            backroomarthelper.add_art_from_external_details_url(
                target_date, target_details_url)

        self.response.out.write('done<br>')
Пример #45
0
    def post(self):
        """Update an artwork's fields, re-processing its image/thumbnail."""
        artwork_id = long(self.request.get('id'))
        artwork_json = json.loads(self.request.get('json'))
        target = FeaturedArtwork.get_by_id(artwork_id)
        if target is None:
            self.response.set_status(404)
            return

        target.title = artwork_json['title']
        target.byline = artwork_json['byline']

        new_image_url, new_thumb_url = maybe_process_image(
            artwork_json['imageUri'],
            artwork_json['title'] + ' ' + artwork_json['byline'])
        if not new_thumb_url:
            # Fall back on the client-provided thumbnail when processing
            # produced none.
            new_thumb_url = artwork_json.get('thumbUri', new_thumb_url)

        target.image_url = new_image_url
        target.thumb_url = new_thumb_url
        target.details_url = artwork_json['detailsUri']
        target.save()
        self.response.set_status(200)
Пример #46
0
  def get(self):
    """Cron endpoint: recycle older artworks onto upcoming empty dates.

    For each date in a [-1, +9) day window with no artwork, picks a random
    previously-featured artwork (skipping recent repeats) and re-adds it.
    """
    # Fetch latest 1000 artworks
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(1000))

    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)

    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, 9)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]

    for target_date in target_dates:
      self.response.out.write('looking for artwork for date ' + str(target_date) + '<br>')

      # Create a blacklist of the most recent 200 artwork
      # (don't want to repeat one of the last 200!)
      blacklist_artwork_keys = set(sanitized_artwork_key(a) for a in latest_artworks[:200])
      if len(blacklist_artwork_keys) < 5:
        blacklist_artwork_keys = set() # should never happen, but just in case of a reset

      # Pick from one of the oldest 500, excluding artwork in the blacklist
      # NOTE(review): latest_artworks[500:] is everything older than the newest
      # 500; with fewer than ~501 artworks random.choice raises IndexError, and
      # if no non-blacklisted candidate matches the domain filter this loop
      # never terminates — confirm both assumptions hold for the datastore.
      random_artwork = None
      while True:
        random_artwork = random.choice(latest_artworks[500:])
        key = sanitized_artwork_key(random_artwork)
        if 'wikiart.org' in key or 'wikipaintings.org' in key or 'metmuseum.org' in key:
          if key not in blacklist_artwork_keys:
            break

      target_details_url = str(random_artwork.details_url)
      self.response.out.write('recycling ' + target_details_url + ' for date ' + str(target_date) + '<br>')

      backroomarthelper.add_art_from_external_details_url(
          target_date,
          target_details_url)

    self.response.out.write('done<br>')
Пример #47
0
  def post(self):
    """Apply posted JSON edits to an artwork, re-running image processing."""
    artwork_id = long(self.request.get('id'))
    payload = json.loads(self.request.get('json'))
    target_artwork = FeaturedArtwork.get_by_id(artwork_id)
    if target_artwork is None:
      self.response.set_status(404)
      return

    target_artwork.title = payload['title']
    target_artwork.byline = payload['byline']

    # Reprocess the image; keep the client's thumbnail when none is derived.
    new_image_url, new_thumb_url = maybe_process_image(
        payload['imageUri'],
        payload['title'] + ' ' + payload['byline'])
    if not new_thumb_url and 'thumbUri' in payload:
      new_thumb_url = payload['thumbUri']

    target_artwork.image_url = new_image_url
    target_artwork.thumb_url = new_thumb_url
    target_artwork.details_url = payload['detailsUri']
    target_artwork.save()
    self.response.set_status(200)
Пример #48
0
  def post(self):
    """Scrape an external artwork page (wikiart/metmuseum) and schedule it.

    Reads 'publishDate' (ms since epoch) and 'externalArtworkUrl' from the
    request, parses title/byline/image from the page, processes the image,
    and stores a new FeaturedArtwork. Aborts 409 when the date is taken,
    400 on fetch/parse problems, 500 when required fields can't be parsed.
    """
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
      webapp2.abort(409, message='Artwork already exists for this date.')

    url = self.request.get('externalArtworkUrl')
    result = urlfetch.fetch(url)
    if result.status_code < 200 or result.status_code >= 300:
      webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
          % (result.status_code, result.content))

    soup = BeautifulSoup(result.content)
    attribution = None

    if re.search(r'wikiart.org', url, re.I):
      attribution = 'wikiart.org'
      # BUG FIX: re.sub's 4th positional argument is *count*, not flags —
      # the original passed re.I | re.S there, silently treating the flag
      # bits as a replacement count. Pass flags by keyword instead.
      details_url = (re.sub(r'#.+', '', url, flags=re.I | re.S)
          + '?utm_source=Muzei&utm_campaign=Muzei')
      title = soup.select('h1 span')[0].get_text()
      author = soup.find(itemprop='author').get_text()
      completion_year_el = soup.find(itemprop='dateCreated')
      byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
      image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
      attribution = 'metmuseum.org'
      # Same flags-as-count fix as above.
      details_url = (re.sub(r'[#?].+', '', url, flags=re.I | re.S)
          + '?utm_source=Muzei&utm_campaign=Muzei')
      title = soup.find('h2').get_text()
      author = ''
      try:
        author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
      except Exception:
        pass  # best-effort: the page may not list an artist
      author = re.sub(r'\s*\(.*', '', author)
      completion_year_el = None
      try:
        completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
      except Exception:
        pass  # best-effort: the page may not list a date
      byline = author + ((', ' + completion_year_el) if completion_year_el else '')
      image_url = soup.find('a', class_='download').attrs['href']
    else:
      webapp2.abort(400, message='Unrecognized URL')

    if not title or not author or not image_url:
      webapp2.abort(500, message='Could not parse HTML')

    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        attribution=attribution,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()

    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
Пример #49
0
    def get(self):
        """Cron endpoint: build any missing monthly artwork archives in GCS.

        Works out which monthly (or partial-month) archive files should exist
        given the current date, compares against the files already in Cloud
        Storage, and generates the missing ones: a gzipped text file holding
        one JSON metadata line followed by one base64 JPEG thumbnail data-URI
        per artwork.
        """
        now = datetime.datetime.utcnow()
        # Allow overriding "now" (e.g. for backfills) via ?datetime=...
        if self.request.get('datetime'):
            now = datetime.datetime.strptime(self.request.get('datetime'),
                                             '%Y-%m-%dT%H:%M:%S')

        # Before START_TIME, the most recently published day is yesterday.
        current_date = (now.date() if now.time() > START_TIME else now.date() -
                        datetime.timedelta(days=1))
        current_month = current_date.month

        # list the expected archives up until this point, starting with current month's archive
        expected_archives = []

        if current_date > ARCHIVE_START_DATE:
            if (current_date +
                    datetime.timedelta(days=1)).month != current_month:
                # end of the month
                expected_archives.append(
                    (current_date.year, current_date.month))
            else:
                # partial month for this month
                expected_archives.append(
                    (current_date.year, current_date.month, current_date.day))

        # list all other months
        if current_date.month != ARCHIVE_START_DATE.month or current_date.year != ARCHIVE_START_DATE.year:
            current_date = current_date.replace(day=1)
            while True:
                current_date -= datetime.timedelta(days=1)  # previous month
                current_date = current_date.replace(day=1)
                expected_archives.append(
                    (current_date.year, current_date.month))
                if current_date <= ARCHIVE_START_DATE:
                    break

        # at this point expected_archives has a list of all archives that should be built

        # list current archive items to determine which archives are missing
        current_archives = []
        current_archive_files = gcs.listbucket(CLOUD_STORAGE_ARCHIVE_PATH)
        self.response.out.write('<h1>current archives</h1>')
        for archive_file in current_archive_files:
            # Archive filenames are YYYYMM.txt (full month) or
            # YYYYMMDD.txt (partial month).
            m = re.search(r'((?:\d){4})((?:\d){2})((?:\d){2})?\.txt',
                          archive_file.filename)
            if m:
                if m.group(3):
                    archive = (int(m.group(1)), int(m.group(2)),
                               int(m.group(3)))
                else:
                    archive = (int(m.group(1)), int(m.group(2)))
                current_archives.append(archive)
                self.response.out.write(repr(archive) + '<br>')
        current_archives = set(current_archives)
        expected_archives = set(expected_archives)
        missing_archives = expected_archives.difference(current_archives)

        # generate the missing archives
        self.response.out.write('<h1>building missing archives</h1>')
        for archive in missing_archives:
            self.response.out.write('<h2>' + repr(archive) + '</h2>')
            # when building an archive, try to start from an existing archive
            # find the latest archive from this month as a starting point
            other_archives_from_month = filter(
                lambda x: len(x) == 3 and x[0] == archive[0] and x[1] ==
                archive[1], current_archives)
            latest_current_archive_from_month = None
            latest_archive_gcs_path = None

            archive_metadata = []
            archive_image_blobs = []

            if other_archives_from_month:
                latest_current_archive_from_month = reduce(
                    lambda x, y: (x[0], x[1], max(x[2], y[2])),
                    other_archives_from_month)
                self.response.out.write(
                    'starting from archive ' +
                    repr(latest_current_archive_from_month) + '<br>')

                existing_archive_name = '%04d%02d%02d' % latest_current_archive_from_month
                try:
                    latest_archive_gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + existing_archive_name + '.txt'
                    existing_archive = gcs.open(latest_archive_gcs_path)
                    content = gzip_decompress(existing_archive.read())
                    existing_archive_lines = content.split('\n')
                    existing_archive.close()
                    archive_metadata = json.loads(existing_archive_lines[0])
                    archive_image_blobs = filter(lambda x: len(x) > 0,
                                                 existing_archive_lines[1:])
                except Exception:
                    # Narrowed from a bare except: a corrupt/unreadable
                    # archive just means the month is rebuilt from scratch.
                    self.response.out.write(
                        'error reading from existing archive, starting from scratch<br>'
                    )
                    latest_current_archive_from_month = None
                    latest_archive_gcs_path = None

            # construct the query
            query_from = None
            if latest_current_archive_from_month:
                # get everything after the latest archive this month
                query_from = datetime.date(
                    *latest_current_archive_from_month) + datetime.timedelta(
                        days=1)
            else:
                # get everything from this month
                query_from = datetime.date(
                    archive[0], archive[1],
                    archive[2] if len(archive) == 3 else 1).replace(day=1)
            query_from = max(ARCHIVE_START_DATE, query_from)

            query_to = None
            archive_name = None
            if len(archive) == 3:
                # partial month archive
                archive_name = '%04d%02d%02d' % archive
                query_to = datetime.date(*archive)
            else:
                # full month archive
                archive_name = '%04d%02d' % archive
                next_month = datetime.date(archive[0], archive[1], 1)
                if next_month.month == 12:
                    # BUG FIX: datetime.date is immutable, so the original
                    # `next_month.month = 1` raised AttributeError for every
                    # December archive. Roll both fields via replace().
                    next_month = next_month.replace(year=next_month.year + 1,
                                                    month=1)
                else:
                    next_month = next_month.replace(month=next_month.month + 1)
                query_to = next_month - datetime.timedelta(days=1)

            # fetch artworks that match this query
            artwork_objs = (FeaturedArtwork.all().order('publish_date').filter(
                'publish_date >=', query_from).filter('publish_date <=',
                                                      query_to).fetch(1000))
            for artwork_obj in artwork_objs:
                metadata_item = dict(
                    publish_date=artwork_obj.publish_date.isoformat(),
                    title=artwork_obj.title,
                    byline=artwork_obj.byline,
                    thumb_url=artwork_obj.thumb_url,
                    details_url=artwork_obj.details_url,
                )

                # fetch the image
                image_result = urlfetch.fetch(artwork_obj.thumb_url)
                if image_result.status_code < 200 or image_result.status_code >= 300:
                    raise IOError('Error downloading image: HTTP %d.' %
                                  image_result.status_code)

                # resize and crop thumb to a centered square of ARCHIVE_IMAGE_SIZE
                thumb = images.Image(image_result.content)
                if thumb.width > thumb.height:
                    thumb.resize(width=4000, height=ARCHIVE_IMAGE_SIZE)
                    thumb.crop(
                        (float(thumb.width - thumb.height) / thumb.width) / 2,
                        0., 1 -
                        (float(thumb.width - thumb.height) / thumb.width) / 2,
                        1.)
                else:
                    thumb.resize(width=ARCHIVE_IMAGE_SIZE, height=4000)
                    thumb.crop(
                        0.,
                        (float(thumb.height - thumb.width) / thumb.height) / 2,
                        1., 1 -
                        (float(thumb.height - thumb.width) / thumb.height) / 2)

                # compute average color
                histogram = thumb.histogram()
                avg_color = tuple(
                    [int(x) for x in img_weighed_average(histogram)])
                avg_color_hex = "#%0.2X%0.2X%0.2X" % avg_color
                metadata_item['color'] = avg_color_hex

                # export thumb
                thumb_data_uri = 'data:image/jpeg;base64,' + base64.b64encode(
                    thumb.execute_transforms(output_encoding=images.JPEG,
                                             quality=40))

                # append the metadata
                archive_metadata.append(metadata_item)
                archive_image_blobs.append(thumb_data_uri)

            self.response.out.write('query: from ' + repr(query_from) +
                                    ' to ' + repr(query_to) + '<br>')
            self.response.out.write('artworks: ' + str(len(artwork_objs)) +
                                    '<br>')

            # create the archive contents: one JSON metadata line, then blobs
            s = json.dumps(archive_metadata) + '\n'
            for blob in archive_image_blobs:
                s += blob + '\n'

            # gzip and write the archive
            gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + archive_name + '.txt'
            self.response.out.write('writing to: ' + gcs_path + '<br>')
            gcsf = gcs.open(gcs_path,
                            'w',
                            content_type='text/plain',
                            options={'content-encoding': 'gzip'})
            gcsf.write(gzip_compress(s))
            gcsf.close()

            # delete the previous archive
            if latest_archive_gcs_path:
                gcs.delete(latest_archive_gcs_path)
Пример #50
0
    def get(self):
        """Assigns a random, not-recently-used artwork to each upcoming date
        that does not yet have a FeaturedArtwork.

        Reads the candidate pool from lt-artworks.json (next to this module),
        excludes artworks whose key appears among the latest 300 published
        entries, and picks one candidate per missing target date. Supports
        ?dry-run=true (no datastore writes) and ?output=html (render the
        chosen artworks).
        """
        # Load the candidate pool; use a context manager so the file handle
        # is always closed (the original leaked it).
        artworks_path = os.path.join(os.path.split(__file__)[0],
                                     'lt-artworks.json')
        with open(artworks_path) as artworks_file:
            ARTWORKS = json.loads(artworks_file.read())

        # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS)

        # Fetch latest 300 artworks (for blacklisting)
        latest_artworks = (
            FeaturedArtwork.all().order('-publish_date').fetch(300))

        # List dates for which artwork exists
        dates_with_existing_art = set(a.publish_date for a in latest_artworks)

        # List target dates that we want artwork for, but for which no artwork exists
        target_dates = [
            date.today() + timedelta(days=n)
            for n in range(-1, LOOKAHEAD_DAYS)
        ]
        target_dates = [
            d for d in target_dates if d not in dates_with_existing_art
        ]

        # Create a blacklist of keys to avoid repeats
        blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

        logging.debug('starting blacklist size: %d' % len(blacklist))

        # Drop blacklisted candidates up front. The original retried
        # random.choice() in a loop and only shrank ARTWORKS on a successful
        # pick, so it would spin forever if every remaining candidate was
        # blacklisted. Pre-filtering keeps the same uniform choice over
        # eligible artworks while making exhaustion detectable.
        ARTWORKS = [
            a for a in ARTWORKS
            if artwork_key(a['detailsUri']) not in blacklist
        ]

        chosen_artworks = []

        for target_date in target_dates:
            if not ARTWORKS:
                logging.error(
                    'Ran out of artworks to choose from, cannot continue')
                return

            random_artwork = random.choice(ARTWORKS)
            # Once chosen, remove it from the list of artworks to choose next
            ARTWORKS.remove(random_artwork)
            chosen_artworks.append(random_artwork)

            target_details_url = str(random_artwork['detailsUri'])
            logging.debug('%(date)s: setting to %(url)s' %
                          dict(url=target_details_url, date=target_date))

            # Store the new artwork
            if self.request.get('dry-run', '') != 'true':
                new_artwork = FeaturedArtwork(
                    title=random_artwork['title'],
                    byline=random_artwork['byline'],
                    attribution=random_artwork['attribution'],
                    image_url=random_artwork['imageUri'],
                    thumb_url=random_artwork['thumbUri'],
                    details_url=random_artwork['detailsUri'],
                    publish_date=target_date)
                new_artwork.save()

        if self.request.get('output', '') == 'html':
            self.response.out.write(
                get_html(artworks_json=json.dumps(chosen_artworks)))

        # Finish up
        logging.debug('done')
Пример #51
0
  def get(self):
    """Builds any missing monthly artwork archives in Cloud Storage.

    Computes the set of archives expected between ARCHIVE_START_DATE and the
    current archive date — one (year, month) entry per completed month plus a
    (year, month, day) partial archive for the in-progress month — diffs that
    against the files already under CLOUD_STORAGE_ARCHIVE_PATH, and rebuilds
    whatever is missing. Each archive is a gzipped text file whose first line
    is JSON metadata and whose remaining lines are base64 JPEG data URIs, one
    per artwork. Progress is written to the response as HTML.

    Supports ?datetime=YYYY-MM-DDTHH:MM:SS to override "now" (for testing or
    backfills).
    """
    now = datetime.datetime.utcnow()
    if self.request.get('datetime'):
      now = datetime.datetime.strptime(self.request.get('datetime'), '%Y-%m-%dT%H:%M:%S')

    # The archive day rolls over at START_TIME (UTC) rather than at midnight.
    current_date = (now.date() if now.time() > START_TIME
                    else now.date() - datetime.timedelta(days=1))
    current_month = current_date.month

    # list the expected archives up until this point, starting with current month's archive
    # 2-tuples are full-month archives; 3-tuples are partial (month-to-date).
    expected_archives = []

    if current_date > ARCHIVE_START_DATE:
      if (current_date + datetime.timedelta(days=1)).month != current_month:
        # end of the month
        expected_archives.append((current_date.year, current_date.month))
      else:
        # partial month for this month
        expected_archives.append((current_date.year, current_date.month, current_date.day))

    # list all other months
    if current_date.month != ARCHIVE_START_DATE.month or current_date.year != ARCHIVE_START_DATE.year:
      current_date = current_date.replace(day=1)
      while True:
        current_date -= datetime.timedelta(days=1) # previous month
        current_date = current_date.replace(day=1)
        expected_archives.append((current_date.year, current_date.month))
        if current_date <= ARCHIVE_START_DATE:
          break

    # at this point expected_archives has a list of all archives that should be built

    # list current archive items to determine which archives are missing
    current_archives = []
    current_archive_files = gcs.listbucket(CLOUD_STORAGE_ARCHIVE_PATH)
    self.response.out.write('<h1>current archives</h1>')
    for archive_file in current_archive_files:
      # Archive filenames are YYYYMM.txt (full month) or YYYYMMDD.txt (partial).
      m = re.search(r'((?:\d){4})((?:\d){2})((?:\d){2})?\.txt', archive_file.filename)
      if m:
        if m.group(3):
          archive = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
        else:
          archive = (int(m.group(1)), int(m.group(2)))
        current_archives.append(archive)
        self.response.out.write(repr(archive) + '<br>')
      #self.response.out.write(archivemeta.filename + '\n')
    current_archives = set(current_archives)
    expected_archives = set(expected_archives)
    missing_archives = expected_archives.difference(current_archives)

    # generate the missing archives
    self.response.out.write('<h1>building missing archives</h1>')
    for archive in missing_archives:
      self.response.out.write('<h2>' + repr(archive) + '</h2>')
      # when building an archive, try to start from an existing archive
      # find the latest archive from this month as a starting point
      other_archives_from_month = filter(
          lambda x: len(x) == 3 and x[0] == archive[0] and x[1] == archive[1],
          current_archives)
      latest_current_archive_from_month = None
      latest_archive_gcs_path = None

      archive_metadata = []
      archive_image_blobs = []

      if other_archives_from_month:
        # Keep the partial archive with the latest day number.
        latest_current_archive_from_month = reduce(
            lambda x, y: (x[0], x[1], max(x[2], y[2])), other_archives_from_month)
        self.response.out.write('starting from archive ' + repr(latest_current_archive_from_month) + '<br>')

        existing_archive_name = '%04d%02d%02d' % latest_current_archive_from_month
        try:
          latest_archive_gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + existing_archive_name + '.txt'
          existing_archive = gcs.open(latest_archive_gcs_path)
          content = gzip_decompress(existing_archive.read())
          existing_archive_lines = content.split('\n')
          existing_archive.close()
          # First line is the JSON metadata; the rest are image data URIs.
          archive_metadata = json.loads(existing_archive_lines[0])
          archive_image_blobs = filter(lambda x: len(x) > 0, existing_archive_lines[1:])
        except Exception:
          # Reuse is best-effort: an unreadable or corrupt archive just means
          # rebuilding the whole month from scratch. (Narrowed from a bare
          # `except:`, which also swallowed SystemExit/KeyboardInterrupt.)
          self.response.out.write('error reading from existing archive, starting from scratch<br>')
          latest_current_archive_from_month = None
          latest_archive_gcs_path = None

      # construct the query
      query_from = None
      if latest_current_archive_from_month:
        # get everything after the latest archive this month
        query_from = datetime.date(*latest_current_archive_from_month) + datetime.timedelta(days=1)
      else:
        # get everything from this month
        query_from = datetime.date(
            archive[0], archive[1], archive[2] if len(archive) == 3 else 1).replace(day=1)
      query_from = max(ARCHIVE_START_DATE, query_from)

      query_to = None
      archive_name = None
      if len(archive) == 3:
        # partial month archive
        archive_name = '%04d%02d%02d' % archive
        query_to = datetime.date(*archive)
      else:
        # full month archive
        archive_name = '%04d%02d' % archive
        next_month = datetime.date(archive[0], archive[1], 1)
        if next_month.month == 12:
          next_month = next_month.replace(year=next_month.year + 1, month=1)
        else:
          next_month = next_month.replace(month=next_month.month + 1)
        query_to = next_month - datetime.timedelta(days=1)

      # fetch artworks that match this query
      artwork_objs = (FeaturedArtwork.all()
          .order('publish_date')
          .filter('publish_date >=', query_from)
          .filter('publish_date <=', query_to)
          .fetch(1000))
      for artwork_obj in artwork_objs:
        metadata_item = dict(
            publish_date=artwork_obj.publish_date.isoformat(),
            title=artwork_obj.title,
            byline=artwork_obj.byline,
            thumb_url=artwork_obj.thumb_url,
            details_url=artwork_obj.details_url,)

        # fetch the image
        image_result = urlfetch.fetch(artwork_obj.thumb_url)
        if image_result.status_code < 200 or image_result.status_code >= 300:
          raise IOError('Error downloading image: HTTP %d.' % image_result.status_code)

        # resize and crop thumb: scale the shorter side to ARCHIVE_IMAGE_SIZE
        # (4000 acts as a "don't constrain" bound on the longer side), then
        # crop the longer dimension symmetrically to a centered square
        thumb = images.Image(image_result.content)
        if thumb.width > thumb.height:
          thumb.resize(width=4000, height=ARCHIVE_IMAGE_SIZE)
          thumb.crop(
              (float(thumb.width - thumb.height) / thumb.width) / 2, 0.,
              1 - (float(thumb.width - thumb.height) / thumb.width) / 2, 1.)
        else:
          thumb.resize(width=ARCHIVE_IMAGE_SIZE, height=4000)
          thumb.crop(
              0., (float(thumb.height - thumb.width) / thumb.height) / 2,
              1., 1 - (float(thumb.height - thumb.width) / thumb.height) / 2)

        # compute average color
        histogram = thumb.histogram()
        avg_color = tuple([int(x) for x in img_weighed_average(histogram)])
        avg_color_hex = "#%0.2X%0.2X%0.2X" % avg_color
        metadata_item['color'] = avg_color_hex

        # export thumb
        thumb_data_uri = 'data:image/jpeg;base64,' + base64.b64encode(
            thumb.execute_transforms(output_encoding=images.JPEG, quality=40))

        # append the metadata
        archive_metadata.append(metadata_item)
        archive_image_blobs.append(thumb_data_uri)

      self.response.out.write('query: from ' + repr(query_from) + ' to ' + repr(query_to) + '<br>')
      self.response.out.write('artworks: ' + str(len(artwork_objs)) + '<br>')
      #self.response.out.write('<pre>' + json.dumps(archive_metadata, indent=2) + '</pre>')

      # create the archive contents
      s = json.dumps(archive_metadata) + '\n'
      for blob in archive_image_blobs:
        s += blob + '\n'

      # gzip and write the archive
      gcs_path = CLOUD_STORAGE_ARCHIVE_PATH + '/' + archive_name + '.txt'
      self.response.out.write('writing to: ' + gcs_path + '<br>')
      gcsf = gcs.open(gcs_path, 'w',
          content_type='text/plain', options={'content-encoding':'gzip'})
      gcsf.write(gzip_compress(s))
      gcsf.close()

      # delete the previous archive
      if latest_archive_gcs_path:
        gcs.delete(latest_archive_gcs_path)