def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
    >>> get_data('1333').get('imdbId')
    u'0060304'
    >>> get_data('236')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
    >>> get_data('786')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": get_url(id)
    }
    try:
        html = read_url(data["url"], timeout=timeout, unicode=True)
    except:
        html = ox.cache.read_url(data["url"], timeout=timeout)
    data["number"] = find_re(html, r"<li>Spine #(\d+)")
    data["title"] = find_re(html, '<h1 class="movietitle">(.*?)</h1>')
    data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
    data["director"] = strip_tags(find_re(html, '<h2 class="director">(.*?)</h2>'))
    results = find_re(html, '<div class="left_column">(.*?)</div>')
    results = re.compile("<li>(.*?)</li>").findall(results)
    data["country"] = results[0]
    data["year"] = results[1]
    data["synopsis"] = strip_tags(find_re(html, '<div class="content_block last">.*?<p>(.*?)</p>'))
    result = find_re(html, '<div class="purchase">(.*?)</div>')
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
        if r:
            result = r[0]
    result = find_re(result, '<a href="(.*?)"')
    if "/boxsets/" not in result:
        data["posters"] = [result]
    else:
        html_ = read_url(result, unicode=True)
        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, 'src="(.*?)"')
        if result:
            data["posters"] = [result.replace("_w100", "")]
        else:
            data["posters"] = []
    data['posters'] = [re.sub(r'(\?\d+)$', '', p) for p in data['posters']]
    result = find_re(html, '<img alt="Film Still" height="252" src="(.*?)"')
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        data["stills"] = filter(lambda x: x, [find_re(html, '"thumbnailURL", "(.*?)"')])
        data["trailers"] = filter(lambda x: x, [find_re(html, '"videoURL", "(.*?)"')])
    if timeout == ox.cache.cache_timeout:
        timeout = -1
    if get_imdb:
        # removed year, as "title (year)" may fail to match
        data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
    return data
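# Usage sketch (hedged): the doctests above show plain lookups; passing
# get_imdb=True additionally resolves an 'imdbId' through the imdb module
# referenced above. get_url(), read_url(), find_re() and strip_tags() are
# assumed to be the usual ox helpers imported at module level.
#
#     info = get_data('1333', get_imdb=True)
#     print info['title'], info['director'], info.get('imdbId')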
def get_news(year, month, day):
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    day = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            # today's archive page keeps changing, so read it live
            html = ox.net.read_url(url)
        else:
            html = ox.cache.read_url(url)
        for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
            dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
            try:
                description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
            except:
                description = ''
            try:
                imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
            except:
                imageUrl = ''
            try:
                title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
            except:
                title = ''
            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
                new = {}
                if len(dateString) == 10:
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                # fix decode_html
                # new['description'] = format_string(decode_html(description))
                new['description'] = format_string(description)
                new['imageUrl'] = imageUrl
                new['section'] = format_section(section)
                new['title'] = format_string(title)
                # cut the title down to the length of the <h4> teaser title;
                # '\xdf' is doubled first so the length comparison lines up,
                # then the doubling is undone
                new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
                if new['title1'][-1:] == ':':
                    new['title1'] = new['title1'][0:-1]
                new['title2'] = new['title'][len(new['title1']) + 2:]
                new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
                if new['url'][:1] == '/':
                    new['url'] = 'http://www.spiegel.de' + new['url']
                news.append(new)
                # print '%s, %s' % (new['section'], dateString)
            '''
            elif dateString[:10] == date and not description:
                print dateString + ' - no description'
            elif dateString[:10] == date and not imageUrl:
                print dateString + ' - no image'
            '''
    return news
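# Usage sketch (hedged): fetch one day's archive; today's pages are read
# live, older days come from ox.cache. format_string() and format_section()
# are module-local helpers defined elsewhere in this module.
#
#     for n in get_news(2007, 7, 6)[:3]:
#         print n['date'], n['section'], n['title']
#
# Each entry is a dict with 'date', 'description', 'imageUrl', 'section',
# 'title', 'title1', 'title2' and 'url' keys.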
def get_data(self):
    data = {"id": self.id}
    url = compose_url("viewAlbum", {"id": self.id})
    xml = read_url(url, None, ITUNES_HEADERS)
    data["albumName"] = find_re(xml, "<B>(.*?)</B>")
    data["artistName"] = find_re(xml, "<b>(.*?)</b>")
    data["coverUrl"] = find_re(xml, 'reflection="." url="(.*?)"')
    data["genre"] = find_re(xml, "Genre:(.*?)<")
    data["releaseDate"] = find_re(xml, "Released(.*?)<")
    data["review"] = strip_tags(
        find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
    )
    data["tracks"] = []
    strings = find_re(xml, "<key>items</key>.*?<dict>(.*?)$").split("<dict>")
    for string in strings:
        data["tracks"].append(parse_xml_dict(string))
    data["type"] = find_re(xml, "<key>listType</key><string>(.*?)<")
    return data
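# Usage sketch (hedged): get_data() is a method of the album wrapper class
# defined elsewhere in this module (it relies on self.id, compose_url(),
# parse_xml_dict() and ITUNES_HEADERS). Assuming that class is named
# ItunesAlbum:
#
#     album = ItunesAlbum(id='...')
#     info = album.get_data()
#     print info['albumName'], info['artistName'], len(info['tracks'])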
def get_data(id):
    '''
    >>> get_data('1991/silence_of_the_lambs')['imdbId']
    u'0102926'
    >>> get_data('1991/silence_of_the_lambs')['posters'][0]
    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
    >>> get_data('1991/silence_of_the_lambs')['url']
    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    data = {
        'url': get_url(id)
    }
    html = read_url(data['url'], unicode=True)
    data['imdbId'] = find_re(html, r'imdb.com/title/tt(\d{7})')
    if not data['imdbId']:
        data['imdbId'] = _id_map.get(id, '')
    data['title'] = strip_tags(find_re(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = find_re(html, r'\(<a href="alpha1.html">(.*?)</a>\)')
    data['posters'] = []
    poster = find_re(html, '<img src="(posters.*?)"')
    if poster:
        poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
        data['posters'].append(poster)
    # id is of the form 'year/title_slug'; id[5:] drops the 'year/' prefix
    results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
    for result in results:
        result = result.replace('_xlg.html', '.html')
        url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
        html = read_url(url, unicode=True)
        result = find_re(html, r'<a href = (\w*?_xlg.html)')
        if result:
            # an extra-large version exists, prefer it
            url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
            html = read_url(url, unicode=True)
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
        else:
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
        data['posters'].append(poster)
    return data
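# Usage sketch (hedged): ids are 'year/title_slug' paths as used on
# impawards.com; the function walks the per-poster pages and prefers the
# _xlg variants when they exist.
#
#     for poster in get_data('1991/silence_of_the_lambs')['posters']:
#         print poster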
def get_issue(year, week):
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not ox.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    contents = []
    data = ox.cache.read_url(url)
    # capture each table-of-contents link whole, so both the href (which
    # carries the page number) and the link text (the title) stay available
    items = re.compile(r'(<a[^>]*?href="http://service\.spiegel\.de/digas/servlet/epaper\?Q=SP&JG=.*?</a>)', re.DOTALL).findall(data)
    for item in items:
        page = int(re.compile(r'&SE=(\d+)').findall(item)[0])
        title = strip_tags(item).strip()
        contents.append({'title': title, 'page': page})
    pageUrl = {}
    # 'page' is still bound to the last table-of-contents entry here
    pages = page + 2
    # probe a few pages past the expected count; pages without a scan map to ''
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if ox.cache.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
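# Usage sketch (hedged): returns None when no cover scan exists for the
# issue, otherwise a dict with the page count, the table of contents and
# per-page scan URLs ('' where no scan is available).
#
#     issue = get_issue(2007, 27)
#     if issue:
#         print issue['coverUrl']
#         for entry in issue['contents']:
#             print entry['page'], entry['title']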