def get_data(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent

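# A minimal usage sketch for get_data() above (assumptions: mininova.org were
# still reachable, which it no longer is, and '1234567' is a hypothetical
# torrent id; get_data() returns None for unknown torrents):
def _example_mininova_get_data():
    torrent = get_data('1234567')  # hypothetical id
    if torrent:
        for key in ('title', 'imdbId', 'torrent_link'):
            print(torrent.get(key))
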
def torbrowser_url():
    import re
    import sys
    from ox.cache import read_url
    base_url = 'https://dist.torproject.org/torbrowser/'
    r = re.compile('href="(\d\.\d\.\d/)"')
    current = sorted(r.findall(read_url(base_url).decode()))[-1]
    url = base_url + current
    if sys.platform.startswith('linux'):
        osname = 'linux64'
        ext = 'xz'
    elif sys.platform == 'darwin':
        osname = 'osx64'
        ext = 'dmg'
    elif sys.platform == 'win32':
        osname = 'install'
        ext = 'exe'
    else:
        logger.debug('no way to get torbrowser url for %s', sys.platform)
        return None
    r = re.compile('href="(.*?{osname}.*?en.*?{ext})"'.format(osname=osname, ext=ext))
    torbrowser = sorted(r.findall(read_url(url).decode()))[-1]
    url += torbrowser
    return url

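# Usage sketch: torbrowser_url() returns a platform-specific bundle URL, or
# None on unsupported platforms. The URL shape in the comment is illustrative,
# not a verified download link:
def _example_torbrowser_url():
    url = torbrowser_url()
    if url is None:
        print('unsupported platform')
    else:
        print(url)  # e.g. https://dist.torproject.org/torbrowser/<version>/<bundle>
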
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
    >>> get_data('1333').get('imdbId')
    u'0060304'

    >>> get_data('236')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'

    >>> get_data('786')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": get_url(id)
    }
    try:
        html = read_url(data["url"], timeout=timeout, unicode=True)
    except:
        html = ox.cache.read_url(data["url"], timeout=timeout)
    data["number"] = find_re(html, "<li>Spine #(\d+)")
    data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
    data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
    results = find_re(html, '<div class="left_column">(.*?)</div>')
    results = re.compile("<li>(.*?)</li>").findall(results)
    data["country"] = results[0]
    data["year"] = results[1]
    data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))

    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
        if r:
            result = r[0]
    result = find_re(result, "<a href=\"(.*?)\"")
    if not "/boxsets/" in result:
        data["posters"] = [result]
    else:
        html_ = read_url(result, unicode=True)
        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, "src=\"(.*?)\"")
        if result:
            data["posters"] = [result.replace("_w100", "")]
        else:
            data["posters"] = []
    data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]

    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])

    if timeout == ox.cache.cache_timeout:
        timeout = -1
    if get_imdb:
        # removed year, as "title (year)" may fail to match
        data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
    return data

def get_data(isbn):
    r = {}
    url = '%s/Search/Book/%s/1' % (base, isbn)

    data = read_url(url).decode('utf-8')
    m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
    if m:
        ids = m[0].split('/')
        r['isbn'] = ids[-2]
        r['asin'] = ids[-3]
        url = '%s%s' % (base, m[0])
        data = read_url(url).decode('utf-8')
        r["title"] = find_re(data, "<h2>(.*?)</h2>")
        keys = {
            'author': 'Author(s)',
            'publisher': 'Publisher',
            'date': 'Publication date',
            'edition': 'Edition',
            'binding': 'Binding',
            'volume': 'Volume(s)',
            'pages': 'Pages',
        }
        for key in keys:
            r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
            if r[key] == '--':
                r[key] = ''
            if key == 'pages' and r[key]:
                r[key] = int(r[key])
        desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
        desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
        r['description'] = strip_tags(desc).strip()
        if r['description'] == u'Description of this item is not available at this time.':
            r['description'] = ''
        r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    return r

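# A usage sketch for get_data() above (assumptions: the module-level `base`
# URL is set as elsewhere in this file; '0316769487' is just a well-formed
# ISBN-10 used for illustration):
def _example_isbn_get_data():
    book = get_data('0316769487')  # illustrative ISBN-10
    print(book.get('title'))
    print(book.get('author'))
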
def download_subtitle(opensubtitle_id):
    srts = {}
    data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = strip_tags(f[1]).split('\n')[0]
        # the scraped links are relative to opensubtitles.org
        # (the original code pointed at opensubtitles.com, which looks like a typo)
        url = "http://www.opensubtitles.org%s" % f[0]
        srts[name] = read_url(url, unicode=True)
    return srts

def get_lyrics(title, artist):
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    # assign the result (the original discarded this replace)
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    # '&amp;' was apparently entity-decoded to '&' in transit; restore the escape
    lyrics = decode_html(lyrics.replace('&amp;', '&'))
    return lyrics

def info(id, timeout=cache_timeout):
    info = {}
    if id.startswith('http'):
        id = get_id(id)
    if not id:
        return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s" % xml.getElementsByTagName('name')[0].firstChild.data
    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)
    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout)
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    if languages:
        info['subtitles'] = {}
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                text = t.firstChild.data
                subs.append({
                    'in': start,
                    'out': end,
                    'value': ox.decode_html(text),
                })
            info['subtitles'][language] = subs
    return info

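# Usage sketch for info() (assumption: the long-retired gdata.youtube.com v2
# feed were still answering; the video id below is purely illustrative):
def _example_youtube_info():
    i = info('dQw4w9WgXcQ')  # illustrative video id
    print(i.get('title'))
    print(sorted(i.get('subtitles', {}).keys()))
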
def get_posters(url, group=True, timeout=-1):
    posters = []
    html = read_url(url, timeout=timeout, unicode=True)
    if url in html:
        if group:
            results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
            for result in results:
                posters += get_posters(result, False)
        results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
        for result in results:
            html = read_url(result, timeout=timeout, unicode=True)
            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
    return posters

def get_data(url):
    data = read_url(url)
    r = {}
    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in r['title']:
        r['year'] = find_re(r['title'], '\((\d*?)\)')
        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
    # collapse whitespace; the double-space arguments were flattened to single
    # spaces in transit, making these replaces no-ops
    r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
    if not r['summary']:
        r['summary'] = get_og(data, 'description')
    meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = filter(lambda m: m[1].isdigit(), meter)
    if meter:
        r['tomatometer'] = meter[0][1]
    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
    poster = get_og(data, 'image')
    if poster and not 'poster_default.gif' in poster:
        r['posters'] = [poster]
    for key in r.keys():
        if not r[key]:
            del r[key]
    return r

def get_show_data(url):
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title': episode[5],
            }
        except:
            print "oxweb.epguides failed,", url
    return r

def get_reviews(url):
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = doc.xpath('//span[@itemprop="ratingValue"]')
    if score:
        score = int(score[0].text)
    else:
        score = -1
    # NOTE: some reviews may not have authors
    # one solution is to track by source instead
    sources = [a.text for a in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="source"]//a|//span[@class="no_link"]')]
    reviews = [d.text for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="review_body"]')]
    scores = [score_to_int(d.text.strip()) for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[contains(@class, "metascore_w")]')]
    metacritics = []
    for i in range(len(reviews)):
        if scores[i] != -1:
            # Don't include TBD scores
            metacritics.append({
                'source': sources[i],
                'quote': strip_tags(reviews[i]).strip(),
                'score': scores[i],
            })
    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }

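# Usage sketch for get_reviews() (the metacritic URL below is illustrative;
# the parser depends on metacritic.com markup as it was when this was written):
def _example_metacritic_reviews():
    reviews = get_reviews('http://www.metacritic.com/movie/alphaville')
    print(reviews['score'])
    for critic in reviews['critics']:
        print('%(score)s %(source)s: %(quote)s' % critic)
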
def get_url(id=None, imdb=None):
    if imdb:
        url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
        data = read_url(url)
        metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
        return metacritic_url or None
    return 'http://www.metacritic.com/movie/%s' % id

def get_data(id):
    info = {}
    base = 'http://www.abebooks.com'
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    data = read_url(url)
    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
    if urls:
        details = '%s%s' % (base, urls[0])
        data = read_url(details)
        doc = lxml.html.document_fromstring(data)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content()
            if value and key not in ('bookcondition', 'binding'):
                info[key] = value
    return info

def authors_ol(authors):
    r = []
    for a in authors:
        url = 'http://openlibrary.org%s.json' % a
        data = json.loads(read_url(url))
        r.append(data['name'])
    return r

def __init__(self, id, timeout=-1):
    url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
    '''
    "http://graph.freebase.com/imdb.title.tt%s" % id
    might also be of interest at some point, right now not much info
    '''
    data = read_url(url, unicode=True)
    try:
        data = json.loads(data)
    except ValueError:
        return
    '''
    for key in data:
        self[key] = data[key]
    '''
    for key in ('id', 'guid', 'name'):
        self[key] = data[key]
    keys = {
        'wikipedia': '/wikipedia/en',
        'netflix': '/authority/netflix/movie',
        'nytimes': '/source/nytimes/movie',
        'metacritic': '/source/metacritic/movie',
    }
    for key in keys:
        links = filter(lambda x: x['namespace'] == keys[key], data['ids'])
        if links:
            self[key] = links[0]['uri']
    if 'nytimes' in self:
        self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
        self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')

def get_ids(key, value):
    ids = []

    def add_other_isbn(v):
        if len(v) == 10:
            ids.append(('isbn', stdnum.isbn.to_isbn13(v)))
        if len(v) == 13 and v.startswith('978'):
            ids.append(('isbn', stdnum.isbn.to_isbn10(v)))

    if key in ('isbn', 'asin'):
        url = '%s/Search/Book/%s/1' % (base, value)
        data = read_url(url).decode('utf-8')
        m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
        if m:
            asin = m[0].split('/')[-3]
            if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
                ids.append(('asin', asin))
    if key == 'isbn':
        add_other_isbn(value)
    if key == 'asin':
        if stdnum.isbn.is_valid(value):
            ids.append(('isbn', value))
            add_other_isbn(value)
        else:
            for isbn in amazon_lookup(value):
                if stdnum.isbn.is_valid(isbn):
                    ids.append(('isbn', isbn))
                    add_other_isbn(isbn)
    if ids:
        logger.debug('get_ids %s, %s => %s', key, value, ids)
    return ids

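# Usage sketch: get_ids() yields (key, value) pairs, with add_other_isbn()
# contributing the alternate ISBN-10/ISBN-13 form. The ISBN below is a
# well-formed example; the exact pairs returned depend on the live lookup:
def _example_get_ids():
    for k, v in get_ids('isbn', '0316769487'):
        print('%s: %s' % (k, v))
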
def get_data(self):
    data = {"id": self.id}
    url = compose_url("viewMovie", {"id": self.id})
    xml = read_url(url, None, ITUNES_HEADERS)
    # hard-coded debug dump of the raw response, left in from development
    f = open("/Users/rolux/Desktop/iTunesData.xml", "w")
    f.write(xml)
    f.close()
    data["actors"] = parse_cast(xml, "actors")
    string = find_re(xml, "Average Rating:(.*?)</HBoxView>")
    data["averageRating"] = string.count("rating_star_000033.png") + string.count("½") * 0.5
    data["directors"] = parse_cast(xml, "directors")
    data["format"] = find_re(xml, "Format:(.*?)<")
    data["genre"] = decode_html(find_re(xml, "Genre:(.*?)<"))
    data["plotSummary"] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
    data["posterUrl"] = find_re(xml, 'reflection="." url="(.*?)"')
    data["producers"] = parse_cast(xml, "producers")
    data["rated"] = find_re(xml, "Rated(.*?)<")
    data["relatedMovies"] = parse_movies(xml, "related movies")
    data["releaseDate"] = find_re(xml, "Released(.*?)<")
    data["runTime"] = find_re(xml, "Run Time:(.*?)<")
    data["screenwriters"] = parse_cast(xml, "screenwriters")
    data["soundtrackId"] = find_re(xml, "viewAlbum\?id=(.*?)&")
    data["trailerUrl"] = find_re(xml, 'autoplay="." url="(.*?)"')
    return data

def get_ids(page=None):
    ids = []
    if page:
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        for result in results:
            url = 'http://impawards.com/%s' % result
            ids.append(get_id(url))
        return set(ids)
    #get all
    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    for page in range(pages, 0, -1):
        for id in get_ids(page):
            if not id in ids:
                ids.append(id)
    return ids

def find_movie(query=None, imdb=None, max_results=10):
    '''search for torrents on mininova
    '''
    if imdb:
        url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
    else:
        url = "http://www.mininova.org/search/%s/seeds" % quote(query)
    data = read_url(url, unicode=True)
    return _parse_results_page(data, max_results)

def playlist(url):
    data = read_url(url)
    items = []
    for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
        items.append({
            'title': i[1],
            'url': 'http://www.youtube.com' + i[0].split('&')[0]
        })
    return items

def get_lymbix_tonalize(text_inp, timeout=cache_timeout):
    """Send text_inp to the Lymbix endpoint req_type[0] and return the raw response.
    """
    data = {'article': text_inp}
    print data, base_url + req_type[0], headers
    content = read_url(base_url + req_type[0], urlencode(data), headers, timeout, unicode=True)
    return content

def request(action, data):
    data = urlencode({'action': action, 'data': json.dumps(data)})
    url = 'http://meta.openmedialibrary.com/api/'
    try:
        return json.loads(read_url(url, data, timeout=60).decode('utf-8'))['data']
    except:
        logger.debug('metadata request failed', exc_info=1)
        return {}

def get_movie_data(title, director):
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    if isinstance(director, unicode):
        director = director.encode('utf-8')
    data = {}
    # itunes section (preferred source for link)
    url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
    url += '?media=movie&movieTerm=' + title
    url += '&actorNames=&directorProducerName=' + director
    url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
    HEADERS['Referer'] = url
    html = read_url(url, headers=HEADERS, unicode=True)
    regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
    regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
    results = re.compile(regexp).findall(html)
    if results:
        data['link'] = results[0][0]
        data['poster'] = results[0][1].replace('140x140', '600x600')
        html = read_url(data['link'], headers=HEADERS, unicode=True)
        results = re.compile('video-preview-url="(.*?)"').findall(html)
        if results:
            data['trailer'] = results[0]
    # trailers section (preferred source for poster and trailer)
    host = 'http://trailers.apple.com'
    url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
    js = json.loads(read_url(url, unicode=True)[16:-4])
    results = js['results']
    if results:
        url = host + results[0]['location']
        if not 'link' in data:
            data['link'] = url
        headers = {
            'User-Agent': USER_AGENT
        }
        html = read_url(url, headers=headers, unicode=True)
        results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
        if results:
            data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
        html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
        results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
        if results:
            data['trailer'] = results[-1]
    return data

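# Usage sketch (assumption: the iTunes search and trailers.apple.com endpoints
# still respond as they did when this was written; title and director are
# illustrative values):
def _example_get_movie_data():
    data = get_movie_data('Alphaville', 'Jean-Luc Godard')
    for key in ('link', 'poster', 'trailer'):
        if key in data:
            print('%s: %s' % (key, data[key]))
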
def get_lymbix_tonalize_multiple(text_inp, timeout=cache_timeout):
    """Like get_lymbix_tonalize, for multiple articles.
    NOTE: this also posts to req_type[0], the same endpoint as the
    single-article variant; that may or may not be intentional.
    """
    data = {'article': text_inp}
    content = read_url(base_url + req_type[0], urlencode(data), headers, timeout)
    return content

def get_ids(page=None): ids = [] if page: url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page html = read_url(url) results = re.compile("films/(\d+)").findall(html) ids += results results = re.compile("boxsets/(.*?)\"").findall(html) for result in results: html = read_url("http://www.criterion.com/boxsets/" + result) results = re.compile("films/(\d+)").findall(html) ids += results return set(ids) html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True) results = re.compile("\&p=(\d+)\&").findall(html) pages = max(map(int, results)) for page in range(1, pages): ids += get_ids(page) return sorted(set(ids), key=int)
def get_data(id):
    '''
    >>> get_data('129689')['cast'][1][1]
    u'Marianne'
    >>> get_data('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> get_data('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> get_data('129689')['rating']
    u'4.5'
    '''
    if id.startswith('http'):
        id = get_id(id)
    data = {
        "url": get_url(id)
    }
    html = read_url(data["url"], unicode=True)
    data['aka'] = parse_list(html, 'AKA')
    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
    data['countries'] = parse_list(html, 'countries')
    data['director'] = parse_entry(html, 'directed by')
    data['genres'] = parse_list(html, 'genres')
    data['keywords'] = parse_list(html, 'keywords')
    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
    data['produced'] = parse_list(html, 'produced by')
    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
    data['released'] = parse_entry(html, 'released by')
    data['releasedate'] = parse_list(html, 'release date')
    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
    data['set'] = parse_entry(html, 'set in')
    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    data['themes'] = parse_list(html, 'themes')
    data['types'] = parse_list(html, 'types')
    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
    #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
    data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
    #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
    #data['cast'] = parse_table(html)
    #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
    #data['credits'] = parse_table(html)
    html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    return data

def _request(self, action, data, timeout=None):
    for key in data:
        if not isinstance(data[key], str):
            data[key] = json.dumps(data[key])
    url = self.base + '/' + action + '?' + urlencode(data)
    if timeout is None:
        r = read_url(url).decode('utf-8')
        if '504 Gateway Time-out' in r:
            r = read_url(url, timeout=-1).decode('utf-8')
        result = json.loads(r)
    else:
        r = read_url(url, timeout).decode('utf-8')
        if '504 Gateway Time-out' in r:
            r = read_url(url, timeout=-1).decode('utf-8')
        result = json.loads(r)
    if 'status' in result and result['status'] == 'error' or 'error' in result:
        logger.info('FAILED %s %s', action, data)
        logger.info('URL %s', url)
    return result

def get_lymbix_flag_response(text_inp, timeout=cache_timeout):
    """Request a flag response for text_inp.
    NOTE: this posts to req_type[0], the same endpoint as get_lymbix_tonalize;
    that may or may not be intentional.
    """
    data = {'article': text_inp}
    content = read_url(base_url + req_type[0], urlencode(data), headers, timeout)
    return content

def get_config(id):
    if id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    data = read_url(url)
    match = re.compile('ytplayer.config = (.*?);<').findall(data)
    if match:
        # json.loads parses a string; json.load expects a file object
        config = json.loads(match[0])
        return config

def findISBN(title, author):
    q = '%s %s' % (title, author)
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = read_url(url, unicode=True)
    links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    id = find_re(links[0], '/dp/(.*?)/')
    data = get_data(id)
    if author in data['authors']:
        return data
    return {}

def get_data(url):
    if not url.startswith('http:'):
        url = get_url(url)
    data = read_url(url, unicode=True)
    m = {
        'id': get_id(url),
        'url': url,
        'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
    }
    for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
        if videourl.endswith('.srt'):
            m['srt'] = videourl
        elif not 'video' in m:
            m['video'] = videourl
            m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
            if m['video'] == 'http://ubumexico.centro.org.mx/video/':
                del m['video']
            m['title'] = strip_tags(decode_html(title)).strip()
    if not 'url' in m:
        print url, 'missing'
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
        m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])
    d = re.compile('Director: (.+)').findall(data)
    if d:
        m['director'] = strip_tags(decode_html(d[0])).strip()
    a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
    if a:
        m['artist'] = strip_tags(decode_html(a[0][1])).strip()
    else:
        a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
        if a:
            m['artist'] = strip_tags(decode_html(a[0][1])).strip()
        else:
            a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
            if a:
                m['artist'] = strip_tags(decode_html(a[0])).strip()
            elif m['id'] == 'film/lawder_color':
                m['artist'] = 'Standish Lawder'
    if 'artist' in m:
        m['artist'] = m['artist'].replace('in UbuWeb Film', '')
        m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
    if m['id'] == 'film/coulibeuf':
        m['title'] = 'Balkan Baroque'
        m['year'] = 1999
    return m

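# Usage sketch for get_data() above (assumption: get_url()/get_id() resolve
# short ids like 'film/lawder_color', one of the ids special-cased in the
# parser, as elsewhere in this module):
def _example_ubu_get_data():
    m = get_data('film/lawder_color')
    for key in ('title', 'artist', 'year', 'video'):
        if key in m:
            print('%s: %s' % (key, m[key]))
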
def get_ids(key, value):
    ids = []
    if key == 'isbn':
        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
        data = read_url(url, unicode=True)
        urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
        if urls:
            ids.append((key, value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids

def lookup(id):
    logger.debug('lookup %s', id)
    data = {}
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    html = read_url(url, unicode=True)
    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(html)
    keys = {
        'pubdate': 'date'
    }
    if urls:
        details = '%s%s' % (base, urls[0])
        html = read_url(details, unicode=True)
        doc = lxml.html.document_fromstring(html)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content().strip()
            k = keys.get(key, key)
            if k == 'date' and value == 'Publication Date:':
                value = ''
            elif k == 'publisher' and value == 'Publisher:':
                value = ''
            if value and key not in ('bookcondition', 'binding', 'edition-amz'):
                data[k] = value
    return data

def lookup(id):
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r

def get_ids(key, value):
    ids = []
    if key == 'isbn':
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url).decode('utf-8')
        matches = re.compile('/title.*?oclc/(\d+).*?"').findall(html)
        if matches:
            info = lookup(matches[0])
            ids.append(('oclc', matches[0]))
            for v in info.get('isbn', []):
                if v != value:
                    ids.append(('isbn', v))
    elif key == 'oclc':
        info = lookup(value)
        if 'isbn' in info:
            for value in info['isbn']:
                ids.append(('isbn', value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids

def amazon_lookup(asin):
    url = 'http://www.amazon.com/dp/%s' % asin
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))

def lookup(id):
    data = {
        'oclc': [id]
    }
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)
    for e in doc.xpath("//*[contains(@id, 'bibtip')]"):
        key = e.attrib['id'].replace('bibtip_', '')
        value = e.text_content().strip()
        if value:
            data[key] = value
    info = doc.xpath('//textarea[@id="util-em-note"]')
    if info:
        info = info[0].text
        info = dict([i.split(':', 1) for i in info.split('\n\n')[1].split('\n')])
        for key in info:
            k = key.lower()
            value = info[key].strip()
            if value:
                data[k] = value
    for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        if key in data:
            del data[key]
    if 'isxn' in data:
        for isbn in data.pop('isxn').split(' '):
            isbn = normalize_isbn(isbn)
            if stdnum.isbn.is_valid(isbn):
                if not 'isbn' in data:
                    data['isbn'] = []
                if isbn not in data['isbn']:
                    data['isbn'].append(isbn)
    cover = doc.xpath('//img[@class="cover"]')
    if cover:
        data['cover'] = cover[0].attrib['src']
        if data['cover'].startswith('//'):
            data['cover'] = 'http:' + data['cover']
        cdata = read_url(data['cover'])
        if hashlib.sha1(cdata).hexdigest() in (
            'd2e9ab0c87193d69a7d3a3c21ae4aa550f7dcf00',
            '70f16d3e077cdd47ef6b331001dbb1963677fa04'
        ):
            del data['cover']
    if 'author' in data:
        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')
    if 'publisher' in data:
        m = re.compile('(.+) : (.+), (\d{4})').findall(data['publisher'])
        if m:
            place, publisher, date = m[0]
            data['publisher'] = publisher
            data['date'] = date
            data['place'] = [place]
        elif ':' in data['publisher']:
            place, publisher = data['publisher'].split(':', 1)
            data['place'] = [place.strip()]
            data['publisher'] = publisher.split(',')[0].strip()
            m = re.compile('\d{4}').findall(publisher)
            if m:
                data['date'] = m[0]
    if 'place' in data:
        if data['place'][0].startswith('['):
            data['place'] = [data['place'][0][1:]]
        if data['place'][0].endswith(']'):
            data['place'] = [data['place'][0][:-1]]
    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data

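# Usage sketch for lookup() above (the OCLC number is illustrative, and real
# responses depend on worldcat.org's current markup):
def _example_worldcat_lookup():
    data = lookup('502419539')  # illustrative OCLC number
    print(data.get('title'))
    print(data.get('isbn', []))
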
def lookup(id):
    logger.debug('lookup %s', id)
    ns = '{http://www.loc.gov/mods/v3}'
    url = 'http://lccn.loc.gov/%s/mods' % id
    info = {
        'lccn': [id]
    }
    try:
        data = read_url(url).decode('utf-8')
        mods = ET.fromstring(data)
    except:
        try:
            data = read_url(url, timeout=0).decode('utf-8')
            mods = ET.fromstring(data)
        except:
            logger.debug('lookup for %s url: %s failed', id, url, exc_info=1)
            return info

    title = mods.findall(ns + 'titleInfo')
    if not title:
        return {}
    info['title'] = ''.join([
        ': ' + e.text.strip() if e.tag == ns + 'subTitle' else ' ' + e.text.strip()
        for e in title[0]
    ]).strip()
    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
        for place in origin[0].findall(ns + 'place'):
            terms = place.findall(ns + 'placeTerm')
            if terms and terms[0].attrib['type'] == 'text':
                e = terms[0]
                info['place'].append(e.text)
            elif terms and terms[0].attrib['type'] == 'code':
                e = terms[0]
                info['country'] = COUNTRIES.get(e.text, e.text)
        publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
        if publisher:
            info['publisher'] = publisher[0]
        info['date'] = ''.join([
            e.text for e in origin[0].findall(ns + 'dateIssued')
            if e.attrib.get('encoding') == 'marc'
        ])
    for i in mods.findall(ns + 'identifier'):
        key = i.attrib['type']
        value = i.text
        if key in ('oclc', 'lccn', 'isbn'):
            if i.attrib['type'] == 'oclc':
                value = value.replace('ocn', '').replace('ocm', '')
            if i.attrib['type'] == 'isbn':
                value = normalize_isbn(i.text)
            if not key in info:
                info[key] = []
            if value not in info[key]:
                info[key].append(value)
    for i in mods.findall(ns + 'classification'):
        if i.attrib['authority'] == 'ddc':
            info['classification'] = get_classification(i.text.split('/')[0])
    info['author'] = []
    for a in mods.findall(ns + 'name'):
        if a.attrib.get('usage') == 'primary':
            info['author'].append(' '.join([
                e.text for e in a.findall(ns + 'namePart')
                if not e.attrib.get('type') in ('date', )
            ]))
    info['author'] = [ox.normalize_name(a) for a in info['author']]
    toc = mods.findall(ns + 'tableOfContents')
    if toc:
        info['description'] = toc[0].text.strip()
    for key in list(info.keys()):
        if not info[key]:
            del info[key]
    return info

"996": "Other parts of Pacific Polynesia", "997": "Atlantic Ocean islands", "998": "Arctic islands and Antarctica", "999": "Extraterrestrial worlds" } if __name__ == '__main__': import json import re from ox.cache import read_url dewey = {} for i in range(0, 1000): url = 'http://dewey.info/class/%s/about.en.json' % i print(url) data = json.loads(read_url(url).decode('utf-8')) for d in list(data.values()): if 'http://www.w3.org/2004/02/skos/core#prefLabel' in d: value = d['http://www.w3.org/2004/02/skos/core#prefLabel'][0][ 'value'] dewey[str(i)] = value break data = json.dumps(dewey, indent=4, ensure_ascii=False, sort_keys=True).encode('utf-8') with open(__file__) as f: pydata = f.read() pydata = re.sub(re.compile('\nDEWEY = {.*?}\n\n', re.DOTALL), '\nDEWEY = %s\n\n' % data, pydata) with open(__file__, 'w') as f:
"nw": "Northern Mariana Islands", "wvu": "West Virginia", "-xxr": "Soviet Union", "-tar": "Tajik S.S.R.", "bcc": "British Columbia" } if __name__ == '__main__': import json import re import ox from ox.cache import read_url url = "http://www.loc.gov/marc/countries/countries_code.html" data = read_url(url).decode('utf-8') countries = dict([ [ox.strip_tags(c) for c in r] for r in re.compile('<tr>.*?class="code">(.*?)</td>.*?<td>(.*?)</td>', re.DOTALL).findall(data) ]) data = json.dumps(countries, indent=4, ensure_ascii=False).encode('utf-8') with open(__file__) as f: pydata = f.read() pydata = re.sub( re.compile('\nCOUNTRIES = {.*?}\n\n', re.DOTALL), '\nCOUNTRIES = %s\n\n' % data, pydata) with open(__file__, 'w') as f: f.write(pydata)
def info(key, value):
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(decode_html(unquote(
        re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
    if 'ISBN-13' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])
    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
            if not role in info:
                info[role] = []
            info[role].append(author[0])
    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info

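# Usage sketch for info() above (assumptions: amazon.com serves the legacy
# markup this parser expects and does not return its bot-detection page;
# '9780316769488' is just a well-formed ISBN-13 used for illustration):
def _example_amazon_info():
    book = info('isbn', '9780316769488')
    print(book.get('title'))
    print(book.get('cover'))
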