Example #1
def get_data(mininovaId):
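    # Scrape torrent metadata from mininova.org: build the comment/details/torrent
    # URLs, parse the key/value rows, title, imdbId and description, and read the
    # torrent file itself for get_torrent_info().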
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
Example #2
def torbrowser_url():
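    # Find the newest Tor Browser release on dist.torproject.org and return the
    # download URL for the current platform, or None if the platform is unknown.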
    import re
    import sys
    from ox.cache import read_url

    base_url = 'https://dist.torproject.org/torbrowser/'
    r = re.compile('href="(\d\.\d\.\d/)"')
    current = sorted(r.findall(read_url(base_url).decode()))[-1]
    url = base_url + current
    if sys.platform.startswith('linux'):
        osname = 'linux64'
        ext = 'xz'
    elif sys.platform == 'darwin':
        osname = 'osx64'
        ext = 'dmg'
    elif sys.platform == 'win32':
        osname = 'install'
        ext = 'exe'
    else:
        logger.debug('no way to get torbrowser url for %s', sys.platform)
        return None
    r = re.compile('href="(.*?{osname}.*?en.*?{ext})"'.format(osname=osname,
                                                              ext=ext))
    torbrowser = sorted(r.findall(read_url(url).decode()))[-1]
    url += torbrowser
    return url
Example #3
def torbrowser_url():
    import re
    import sys
    from ox.cache import read_url

    base_url = 'https://dist.torproject.org/torbrowser/'
    r = re.compile('href="(\d\.\d\.\d/)"')
    current = sorted(r.findall(read_url(base_url).decode()))[-1]
    url = base_url + current
    if sys.platform.startswith('linux'):
        osname = 'linux64'
        ext = 'xz'
    elif sys.platform == 'darwin':
        osname = 'osx64'
        ext = 'dmg'
    elif sys.platform == 'win32':
        osname = 'install'
        ext = 'exe'
    else:
        logger.debug('no way to get torbrowser url for %s', sys.platform)
        return None
    r = re.compile('href="(.*?{osname}.*?en.*?{ext})"'.format(osname=osname,ext=ext))
    torbrowser = sorted(r.findall(read_url(url).decode()))[-1]
    url += torbrowser
    return url
Example #4
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
    >>> get_data('1333').get('imdbId')
    u'0060304'

    >>> get_data('236')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'

    >>> get_data('786')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": get_url(id)
    }
    try:
        html = read_url(data["url"], timeout=timeout, unicode=True)
    except:
        html = ox.cache.read_url(data["url"], timeout=timeout)
    data["number"] = find_re(html, "<li>Spine #(\d+)")

    data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
    data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
    results = find_re(html, '<div class="left_column">(.*?)</div>')
    results = re.compile("<li>(.*?)</li>").findall(results)
    data["country"] = results[0]
    data["year"] = results[1]
    data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))

    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
        if r:
            result = r[0]
    result = find_re(result, "<a href=\"(.*?)\"")
    if not "/boxsets/" in result:
        data["posters"] = [result]
    else:
        html_ = read_url(result, unicode=True)
        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, "src=\"(.*?)\"")
        if result:
            data["posters"] = [result.replace("_w100", "")]
        else:
            data["posters"] = []
    data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])

    if timeout == ox.cache.cache_timeout:
        timeout = -1
    if get_imdb:
        # removed year, as "title (year)" may fail to match
        data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
    return data
Example #5
def get_data(isbn):
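    # Search the book lookup service (base) for an ISBN, follow the first
    # /Lookup/Book/ result and parse isbn/asin, title, author, publisher,
    # pages, description and cover URL.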
    r = {}
    url = '%s/Search/Book/%s/1' % (base, isbn)

    data = read_url(url).decode('utf-8')
    m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
    if m:
        ids = m[0].split('/')
        r['isbn'] = ids[-2]
        r['asin'] = ids[-3]
        url = '%s%s' % (base, m[0])
        data = read_url(url).decode('utf-8')
        r["title"] = find_re(data, "<h2>(.*?)</h2>")
        keys = {
            'author': 'Author(s)',
            'publisher': 'Publisher',
            'date': 'Publication date',
            'edition': 'Edition',
            'binding': 'Binding',
            'volume': 'Volume(s)',
            'pages': 'Pages',
        }
        for key in keys:
            r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
            if r[key] == '--':
                r[key] = ''
            if key == 'pages' and r[key]:
                r[key] = int(r[key])
        desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
        desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
        r['description'] = strip_tags(desc).strip()
        if r['description'] == u'Description of this item is not available at this time.':
            r['description'] = ''
        r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    return r
Example #6
def download_subtitle(opensubtitle_id):
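    # Fetch every subtitle file linked from an opensubtitles.org subtitle page,
    # returned as a dict mapping filename to subtitle text.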
    srts = {}
    data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = strip_tags(f[1]).split('\n')[0]
        url = "http://www.opensubtitles.com%s" % f[0]
        srts[name] = read_url(url, unicode=True)
    return srts
Example #7
def get_lyrics(title, artist):
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    lyrics = decode_html(lyrics.replace('&amp;', '&'))
    return lyrics
Example #8
def info(id, timeout=cache_timeout):
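    # Collect YouTube video metadata from the GData v2 feed, the watch page
    # (license) and the timedtext API (available subtitle tracks).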
    info = {}
    if id.startswith('http'):
        id = get_id(id)
        if not id:
            return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s"%xml.getElementsByTagName('name')[0].firstChild.data

    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)

    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout)
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    if languages:
        info['subtitles'] = {}
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                text = t.firstChild.data
                subs.append({
                    'in': start,
                    'out': end,
                    'value': ox.decode_html(text),
                })
            info['subtitles'][language] = subs
    return info
Example #9
def get_posters(url, group=True, timeout=-1):
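    # Collect poster URLs from movieposterdb.com; with group=True, recurse into
    # linked group pages as well.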
    posters = []
    html = read_url(url, timeout=timeout, unicode=True)
    if url in html:
        if group:
            results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
            for result in results:
                posters += get_posters(result, False)
        results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
        for result in results:
            html = read_url(result, timeout=timeout, unicode=True)
            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
    return posters
Example #10
def get_data(url):
    data = read_url(url)
    r = {}
    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in r['title']:
        r['year'] = find_re(r['title'], '\((\d*?)\)')
        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
    r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
    if not r['summary']:
        r['summary'] = get_og(data, 'description')

    meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = [m for m in meter if m[1].isdigit()]
    if meter:
        r['tomatometer'] = meter[0][1]
    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
    poster = get_og(data, 'image')
    if poster and not 'poster_default.gif' in poster:
        r['posters'] = [poster]
    for key in list(r.keys()):
        if not r[key]:
            del r[key]
    return r
Example #11
def get_show_data(url):
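    # Parse an epguides.com show page into title, IMDb id and an episodes dict
    # keyed by 'SxxEyy' with prod code, air date, url and title.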
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1.   1- 1       1001      7 Aug 05   You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22 
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title':episode[5],
            }
        except:
            print "oxweb.epguides failed,", url
    return r
Example #12
def get_reviews(url):
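    # Parse a metacritic.com critic reviews page: overall metascore plus one
    # entry per review with source, quote and numeric score.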
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = doc.xpath('//span[@itemprop="ratingValue"]')
    if score:
        score = int(score[0].text)
    else:
        score = -1
    # NOTE: some reviews may not have authors
    #       one solution is to track by source instead
    sources = [a.text
        for a in doc.xpath('//div[contains(@class, "critic_reviews")]'\
                           '//div[@class="review_content"]'\
                           '//div[@class="source"]//a|//span[@class="no_link"]')]
    reviews = [d.text
        for d in doc.xpath('//div[contains(@class, "critic_reviews")]//div[@class="review_content"]//div[@class="review_body"]')]
    scores = [score_to_int(d.text.strip())
        for d in doc.xpath('//div[contains(@class, "critic_reviews")]//div[@class="review_content"]//div[contains(@class, "metascore_w")]')]
    
    metacritics = []
    for i in range(len(reviews)):
        if scores[i] != -1:  # Don't include TBD scores
            metacritics.append({
                'source': sources[i],
                'quote': strip_tags(reviews[i]).strip(),
                'score': scores[i],
            })

    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }
Example #13
def get_url(id=None, imdb=None):
    if imdb:
        url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
        data = read_url(url)
        metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
        return metacritic_url or None
    return 'http://www.metacritic.com/movie/%s' % id
Example #14
def get_data(id):
    info = {}
    base = 'http://www.abebooks.com'
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    data = read_url(url)
    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
    if urls:
        details = '%s%s' % (base, urls[0])
        data = read_url(details)
        doc = lxml.html.document_fromstring(data)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content()
            if value and key not in ('bookcondition', 'binding'):
                info[key] = value
    return info
Example #15
def authors_ol(authors):
    r = []
    for a in authors:
        url = 'http://openlibrary.org%s.json' % a
        data = json.loads(read_url(url))
        r.append(data['name'])
    return r
Example #16
    def __init__(self, id, timeout=-1):
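        # Resolve an IMDb title id to external ids (wikipedia, netflix, nytimes,
        # metacritic) via the freebaseapps id service.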
        url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
        '''
            "http://graph.freebase.com/imdb.title.tt%s" % id
            might also be of interest at some point, right now not much info
        '''
        data = read_url(url, unicode=True)
        try:
            data = json.loads(data)
        except ValueError:
            return
        '''
        for key in data:
            self[key] = data[key]
        '''
        for key in ('id', 'guid', 'name'):
            self[key] = data[key]
        keys = {
            'wikipedia': '/wikipedia/en',
            'netflix': '/authority/netflix/movie',
            'nytimes': '/source/nytimes/movie',
            'metacritic': '/source/metacritic/movie',
        }
        for key in keys:
            links = [x for x in data['ids'] if x['namespace'] == keys[key]]
            if links:
                self[key] = links[0]['uri']

        if 'nytimes' in self:
            self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
Example #17
def get_ids(key, value):
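    # Map an isbn or asin to related identifiers: the matching ASIN from the
    # search service, the alternate ISBN-10/13 form, and ISBNs found via
    # amazon_lookup() for non-ISBN ASINs.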
    ids = []

    def add_other_isbn(v):
        if len(v) == 10:
            ids.append(('isbn', stdnum.isbn.to_isbn13(v)))
        if len(v) == 13 and v.startswith('978'):
            ids.append(('isbn', stdnum.isbn.to_isbn10(v)))

    if key in ('isbn', 'asin'):
        url = '%s/Search/Book/%s/1' % (base, value)
        data = read_url(url).decode('utf-8')
        m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
        if m:
            asin = m[0].split('/')[-3]
            if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
                ids.append(('asin', asin))
    if key == 'isbn':
        add_other_isbn(value)
    if key == 'asin':
        if stdnum.isbn.is_valid(value):
            ids.append(('isbn', value))
            add_other_isbn(value)
        else:
            for isbn in amazon_lookup(value):
                if stdnum.isbn.is_valid(isbn):
                    ids.append(('isbn', isbn))
                    add_other_isbn(isbn)
    if ids:
        logger.debug('get_ids %s, %s => %s', key, value, ids)
    return ids
Example #18
 def get_data(self):
     data = {"id": self.id}
     url = compose_url("viewMovie", {"id": self.id})
     xml = read_url(url, None, ITUNES_HEADERS)
     f = open("/Users/rolux/Desktop/iTunesData.xml", "w")
     f.write(xml)
     f.close()
     data["actors"] = parse_cast(xml, "actors")
     string = find_re(xml, "Average Rating:(.*?)</HBoxView>")
     data["averageRating"] = string.count("rating_star_000033.png") + string.count("&#189;") * 0.5
     data["directors"] = parse_cast(xml, "directors")
     data["format"] = find_re(xml, "Format:(.*?)<")
     data["genre"] = decode_html(find_re(xml, "Genre:(.*?)<"))
     data["plotSummary"] = decode_html(
         find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
     )
     data["posterUrl"] = find_re(xml, 'reflection="." url="(.*?)"')
     data["producers"] = parse_cast(xml, "producers")
     data["rated"] = find_re(xml, "Rated(.*?)<")
     data["relatedMovies"] = parse_movies(xml, "related movies")
     data["releaseDate"] = find_re(xml, "Released(.*?)<")
     data["runTime"] = find_re(xml, "Run Time:(.*?)<")
     data["screenwriters"] = parse_cast(xml, "screenwriters")
     data["soundtrackId"] = find_re(xml, "viewAlbum\?id=(.*?)&")
     data["trailerUrl"] = find_re(xml, 'autoplay="." url="(.*?)"')
     return data
Example #19
def get_ids(key, value):
    ids = []

    def add_other_isbn(v):
        if len(v) == 10:
            ids.append(('isbn', stdnum.isbn.to_isbn13(v)))
        if len(v) == 13 and v.startswith('978'):
            ids.append(('isbn', stdnum.isbn.to_isbn10(v)))

    if key in ('isbn', 'asin'):
        url = '%s/Search/Book/%s/1' % (base, value)
        data = read_url(url).decode('utf-8')
        m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
        if m:
            asin = m[0].split('/')[-3]
            if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
                ids.append(('asin', asin))
    if key == 'isbn':
        add_other_isbn(value)
    if key == 'asin':
        if stdnum.isbn.is_valid(value):
            ids.append(('isbn', value))
            add_other_isbn(value)
        else:
            for isbn in amazon_lookup(value):
                if stdnum.isbn.is_valid(isbn):
                    ids.append(('isbn', isbn))
                    add_other_isbn(isbn)
    if ids:
        logger.debug('get_ids %s, %s => %s', key, value, ids)
    return ids
Example #20
def get_ids(page=None):
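    # Return poster ids scraped from impawards.com archive pages; without a page
    # argument, determine the page count from latest.html and walk every page.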
    ids = []
    if page:
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        for result in results:
            url = 'http://impawards.com/%s' % result
            ids.append(get_id(url))
        return set(ids)
    #get all
    html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    for page in range(pages, 0, -1):
        for id in get_ids(page):
            if not id in ids:
                ids.append(id)
    return ids
Example #21
def find_movie(query=None, imdb=None, max_results=10):
    '''search for torrents on mininova
    '''
    if imdb:
        url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
    else:
        url = "http://www.mininova.org/search/%s/seeds" % quote(query)
    data = read_url(url, unicode=True)
    return _parse_results_page(data, max_results)
Example #22
def playlist(url):
    data = read_url(url)
    items = []
    for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
        items.append({
            'title': i[1],
            'url': 'http://www.youtube.com' + i[0].split('&amp;')[0]
        })
    return items
Example #23
def get_lymbix_tonalize(text_inp,timeout = cache_timeout):
    """
  
    """

    data = {'article':text_inp}
    print(data, base_url + req_type[0], headers)
    content = read_url(base_url + req_type[0], urlencode(data), headers, timeout, unicode=True)
    return content
Example #24
def request(action, data):
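    # Send an action plus JSON-encoded payload to the meta.openmedialibrary.com
    # API and return the 'data' field of the response, or {} on failure.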
    data = urlencode({'action': action, 'data': json.dumps(data)})
    url = 'http://meta.openmedialibrary.com/api/'
    try:
        return json.loads(read_url(url, data,
                                   timeout=60).decode('utf-8'))['data']
    except:
        logger.debug('metadata request failed', exc_info=1)
        return {}
Example #25
def get_movie_data(title, director):
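    # Look up a movie on the iTunes Store (link, poster, trailer) and on
    # trailers.apple.com (preferred source for poster and trailer).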
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    if isinstance(director, unicode):
        director = director.encode('utf-8')
    data = {}
    # itunes section (preferred source for link)
    url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
    url += '?media=movie&movieTerm=' + title
    url += '&actorNames=&directorProducerName=' + director
    url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
    HEADERS['Referer'] = url
    html = read_url(url, headers=HEADERS, unicode=True)
    regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
    regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
    results = re.compile(regexp).findall(html)
    if results:
        data['link'] = results[0][0]
        data['poster'] = results[0][1].replace('140x140', '600x600')
        html = read_url(data['link'], headers=HEADERS, unicode=True)
        results = re.compile('video-preview-url="(.*?)"').findall(html)
        if results:
            data['trailer'] = results[0]
    # trailers section (preferred source for poster and trailer)
    host = 'http://trailers.apple.com'
    url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
    js = json.loads(read_url(url, unicode=True)[16:-4])
    results = js['results']
    if results:
        url = host + results[0]['location']
        if not 'link' in data:
            data['link'] = url
        headers = {
            'User-Agent': USER_AGENT
        }
        html = read_url(url, headers=headers, unicode=True)
        results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
        if results:
            data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
        html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
        results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
        if results:
            data['trailer'] = results[-1]
    return data
Example #26
def get_lymbix_tonalize_multiple(text_inp,timeout = cache_timeout):
    """
       
    """

    data = {'article':text_inp}

    
    content = read_url(base_url+req_type[0],urlencode(data),headers,timeout)
    return content
Example #27
def get_ids(page=None):
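    # Collect Criterion spine ids from the expanded library view, including
    # films inside box sets; without a page argument, walk all pages.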
    ids = []
    if page:
        url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
        html = read_url(url)
        results = re.compile("films/(\d+)").findall(html)
        ids += results
        results = re.compile("boxsets/(.*?)\"").findall(html)
        for result in results:
            html = read_url("http://www.criterion.com/boxsets/" + result)
            results = re.compile("films/(\d+)").findall(html)
            ids += results
        return set(ids)
    html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
    results = re.compile("\&amp;p=(\d+)\&").findall(html)
    pages = max(map(int, results))
    for page in range(1, pages):
        ids += get_ids(page)
    return sorted(set(ids), key=int)
Example #28
def get_data(id):
    '''
    >>> get_data('129689')['cast'][1][1]
    u'Marianne'
    >>> get_data('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> get_data('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> get_data('129689')['rating']
    u'4.5'
    '''
    if id.startswith('http'):
        id = get_id(id)
    data = {
        "url": get_url(id)
    }
    html = read_url(data["url"], unicode=True)
    data['aka'] = parse_list(html, 'AKA')
    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
    data['countries'] = parse_list(html, 'countries')
    data['director'] = parse_entry(html, 'directed by')
    data['genres'] = parse_list(html, 'genres')
    data['keywords'] = parse_list(html, 'keywords')
    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
    data['produced'] = parse_list(html, 'produced by')
    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
    data['released'] = parse_entry(html, 'released by')
    data['releasedate'] = parse_list(html, 'release date')
    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
    data['set'] = parse_entry(html, 'set in')
    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    data['themes'] = parse_list(html, 'themes')
    data['types'] = parse_list(html, 'types')
    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
    #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
    data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
    #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
    #data['cast'] = parse_table(html)
    #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
    #data['credits'] = parse_table(html)
    html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    return data
Example #29
 def _request(self, action, data, timeout=None):
     for key in data:
         if not isinstance(data[key], str):
             data[key] = json.dumps(data[key])
     url = self.base + '/' + action + '?' + urlencode(data)
     if timeout is None:
         r = read_url(url).decode('utf-8')
         if '504 Gateway Time-out' in r:
             r = read_url(url, timeout=-1).decode('utf-8')
         result = json.loads(r)
     else:
         r = read_url(url, timeout).decode('utf-8')
         if '504 Gateway Time-out' in r:
             r = read_url(url, timeout=-1).decode('utf-8')
         result = json.loads(r)
     if 'status' in result and result['status'] == 'error' or 'error' in result:
         logger.info('FAILED %s %s', action, data)
         logger.info('URL %s', url)
     return result
Example #30
def get_lymbix_flag_response(text_inp,timeout = cache_timeout):
    """
       
    """

    data = {'article':text_inp}

    
    content = read_url(base_url+req_type[0],urlencode(data),headers,timeout)
    return content
Example #31
def get_config(id):
    if id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    data = read_url(url)
    config = None
    match = re.compile('ytplayer.config = (.*?);<').findall(data)
    if match:
        config = json.loads(match[0])
    return config
Example #32
def findISBN(title, author):
    q = '%s %s' % (title, author)
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = read_url(url, unicode=True)
    links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    id = find_re(links[0], '/dp/(.*?)/')
    data = get_data(id)
    if author in data['authors']:
        return data
    return {}
Example #33
def get_data(url):
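    # Parse a ubu.com film page into id, video/flv/srt URLs, title, year,
    # director and artist, with a few special cases fixed up by hand.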
    if not url.startswith('http:'):
        url = get_url(url)
    data = read_url(url, unicode=True)
    m = {
        'id': get_id(url),
        'url': url,
        'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
    }
    for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
        if videourl.endswith('.srt'):
            m['srt'] = videourl
        elif not 'video' in m:
            m['video'] = videourl
            m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
            if m['video'] == 'http://ubumexico.centro.org.mx/video/':
                del m['video']
            m['title'] = strip_tags(decode_html(title)).strip()
    if not 'url' in m:
        print(url, 'missing')
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])

    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
        m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')

    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])
    d = re.compile('Director: (.+)').findall(data)
    if d:
        m['director'] = strip_tags(decode_html(d[0])).strip()

    a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
    if a:
        m['artist'] = strip_tags(decode_html(a[0][1])).strip()
    else:
        a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
        if a:
            m['artist'] = strip_tags(decode_html(a[0][1])).strip()
        else:
            a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
            if a:
                m['artist'] = strip_tags(decode_html(a[0])).strip()
            elif m['id'] == 'film/lawder_color':
                m['artist'] = 'Standish Lawder'
    if 'artist' in m:
        m['artist'] = m['artist'].replace('in UbuWeb Film', '')
        m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
    if m['id'] == 'film/coulibeuf':
        m['title'] = 'Balkan Baroque'
        m['year'] = 1999
    return m
Example #34
def get_ids(key, value):
    ids = []
    if key == 'isbn':
        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
        data = read_url(url, unicode=True)
        urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
        if urls:
            ids.append((key, value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
Example #35
 def _request(self, action, data, timeout=None):
     for key in data:
         if not isinstance(data[key], str):
             data[key] = json.dumps(data[key])
     url = self.base + '/' + action + '?' + urlencode(data)
     if timeout is None:
         r = read_url(url).decode('utf-8')
         if '504 Gateway Time-out' in r:
             r = read_url(url, timeout=-1).decode('utf-8')
         result = json.loads(r)
     else:
         r = read_url(url, timeout).decode('utf-8')
         if '504 Gateway Time-out' in r:
             r = read_url(url, timeout=-1).decode('utf-8')
         result = json.loads(r)
     if 'status' in result and result['status'] == 'error' or 'error' in result:
         logger.info('FAILED %s %s', action, data)
         logger.info('URL %s', url)
     return result
Example #36
def lookup(id):
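    # Look up an ISBN on abebooks.com and return the bibliographic fields parsed
    # from the first book detail page.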
    logger.debug('lookup %s', id)
    data = {}
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    html = read_url(url, unicode=True)
    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(html)
    keys = {'pubdate': 'date'}
    if urls:
        details = '%s%s' % (base, urls[0])
        html = read_url(details, unicode=True)
        doc = lxml.html.document_fromstring(html)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content().strip()
            k = keys.get(key, key)
            if k == 'date' and value == 'Publication Date:':
                value = ''
            elif k == 'publisher' and value == 'Publisher:':
                value = ''
            if value and key not in ('bookcondition', 'binding',
                                     'edition-amz'):
                data[k] = value
    return data
Example #37
def lookup(id):
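    # Look up a book by id (ASIN/ISBN) on the lookup service and normalize
    # title, author, publisher, date, pages, description and cover.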
    logger.debug('lookup %s', id)
    r = {'asin': [id]}
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(
            data,
            '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />',
                        ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace(
        '._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
Example #38
def get_ids(key, value):
    ids = []
    if key == 'isbn':
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url).decode('utf-8')
        matches = re.compile('/title.*?oclc/(\d+).*?"').findall(html)
        if matches:
            info = lookup(matches[0])
            ids.append(('oclc', matches[0]))
            for v in info.get('isbn', []):
                if v != value:
                    ids.append(('isbn', v))
    elif key == 'oclc':
        info = lookup(value)
        if 'isbn' in info:
            for value in info['isbn']:
                ids.append(('isbn', value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
Example #39
def amazon_lookup(asin):
    url = 'http://www.amazon.com/dp/%s' % asin
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
Example #40
def lookup(id):
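    # Look up an OCLC number on worldcat.org and normalize the result: isbn
    # list, author, title, publisher, date, place and cover URL.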
    data = {
        'oclc': [id]
    }
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)
    for e in doc.xpath("//*[contains(@id, 'bibtip')]"):
        key = e.attrib['id'].replace('bibtip_', '')
        value = e.text_content().strip()
        if value:
            data[key] = value
    info = doc.xpath('//textarea[@id="util-em-note"]')
    if info:
        info = info[0].text
        info = dict([i.split(':', 1) for i in info.split('\n\n')[1].split('\n')])
        for key in info:
            k = key.lower()
            value = info[key].strip()
            if value:
                data[k] = value
    for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        if key in data:
            del data[key]
    if 'isxn' in data:
        for isbn in data.pop('isxn').split(' '):
            isbn = normalize_isbn(isbn)
            if stdnum.isbn.is_valid(isbn):
                if not 'isbn' in data:
                    data['isbn'] = []
                if isbn not in data['isbn']:
                    data['isbn'].append(isbn)
    cover = doc.xpath('//img[@class="cover"]')
    if cover:
        data['cover'] = cover[0].attrib['src']
        if data['cover'].startswith('//'):
            data['cover'] = 'http:' + data['cover']
        cdata = read_url(data['cover'])
        if  hashlib.sha1(cdata).hexdigest() in (
            'd2e9ab0c87193d69a7d3a3c21ae4aa550f7dcf00',
            '70f16d3e077cdd47ef6b331001dbb1963677fa04'
        ):
            del data['cover']

    if 'author' in data:
        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')
    if 'publisher' in data:
        m = re.compile('(.+) : (.+), (\d{4})').findall(data['publisher'])
        if m:
            place, publisher, date = m[0]
            data['publisher'] = publisher
            data['date'] = date
            data['place'] = [place]
        elif ':' in data['publisher']:
            place, publisher = data['publisher'].split(':', 1)
            data['place'] = [place.strip()]
            data['publisher'] = publisher.split(',')[0].strip()
            m = re.compile('\d{4}').findall(publisher)
            if m:
                data['date'] = m[0]

    if 'place' in data:
        if data['place'][0].startswith('['):
            data['place'] = [data['place'][0][1:]]
        if data['place'][0].endswith(']'):
            data['place'] = [data['place'][0][:-1]]
    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data
Example #41
def lookup(id):
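    # Look up an LCCN via the loc.gov MODS record and extract title, place,
    # publisher, date, identifiers (oclc/lccn/isbn), classification and authors.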
    logger.debug('lookup %s', id)
    ns = '{http://www.loc.gov/mods/v3}'
    url = 'http://lccn.loc.gov/%s/mods' % id
    info = {
        'lccn': [id]
    }
    try:
        data = read_url(url).decode('utf-8')
        mods = ET.fromstring(data)
    except:
        try:
            data = read_url(url, timeout=0).decode('utf-8')
            mods = ET.fromstring(data)
        except:
            logger.debug('lookup for %s url: %s failed', id, url, exc_info=1)
            return info

    title = mods.findall(ns + 'titleInfo')
    if not title:
        return {}
    info['title'] = ''.join([': ' + e.text.strip() if e.tag == ns + 'subTitle' else ' ' + e.text.strip() for e in title[0]]).strip()
    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
        for place in origin[0].findall(ns + 'place'):
            terms = place.findall(ns + 'placeTerm')
            if terms and terms[0].attrib['type'] == 'text':
                e = terms[0]
                info['place'].append(e.text)
            elif terms and terms[0].attrib['type'] == 'code':
                e = terms[0]
                info['country'] = COUNTRIES.get(e.text, e.text)
        publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
        if publisher:
            info['publisher'] = publisher[0]
        info['date'] = ''.join([e.text
            for e in origin[0].findall(ns + 'dateIssued') if e.attrib.get('encoding') == 'marc'])
        for i in mods.findall(ns + 'identifier'):
            key = i.attrib['type']
            value = i.text
            if key in ('oclc', 'lccn', 'isbn'):
                if i.attrib['type'] == 'oclc':
                    value = value.replace('ocn', '').replace('ocm', '')
                if i.attrib['type'] == 'isbn':
                    value = normalize_isbn(i.text)
                if not key in info:
                    info[key] = []
                if value not in info[key]:
                    info[key].append(value)
        for i in mods.findall(ns + 'classification'):
            if i.attrib['authority'] == 'ddc':
                info['classification'] = get_classification(i.text.split('/')[0])
        info['author'] = []
        for a in mods.findall(ns + 'name'):
            if a.attrib.get('usage') == 'primary':
                info['author'].append(' '.join([e.text for e in a.findall(ns + 'namePart') if not e.attrib.get('type') in ('date', )]))
        info['author'] = [ox.normalize_name(a) for a in info['author']]
    toc = mods.findall(ns + 'tableOfContents')
    if toc:
        info['description'] = toc[0].text.strip()
    for key in list(info.keys()):
        if not info[key]:
            del info[key]
    return info
Example #42
    "996": "Other parts of Pacific    Polynesia",
    "997": "Atlantic Ocean islands",
    "998": "Arctic islands and Antarctica",
    "999": "Extraterrestrial worlds"
}

if __name__ == '__main__':
    import json
    import re
    from ox.cache import read_url

    dewey = {}
    for i in range(0, 1000):
        url = 'http://dewey.info/class/%s/about.en.json' % i
        print(url)
        data = json.loads(read_url(url).decode('utf-8'))
        for d in list(data.values()):
            if 'http://www.w3.org/2004/02/skos/core#prefLabel' in d:
                value = d['http://www.w3.org/2004/02/skos/core#prefLabel'][0][
                    'value']
                dewey[str(i)] = value
                break

    data = json.dumps(dewey, indent=4, ensure_ascii=False,
                      sort_keys=True).encode('utf-8')
    with open(__file__) as f:
        pydata = f.read()
    pydata = re.sub(re.compile('\nDEWEY = {.*?}\n\n', re.DOTALL),
                    '\nDEWEY = %s\n\n' % data, pydata)

    with open(__file__, 'w') as f:
        f.write(pydata)
Example #43
    "nw": "Northern Mariana Islands", 
    "wvu": "West Virginia", 
    "-xxr": "Soviet Union", 
    "-tar": "Tajik S.S.R.", 
    "bcc": "British Columbia"
}


if __name__ == '__main__':
    import json
    import re
    import ox
    from ox.cache import read_url

    url = "http://www.loc.gov/marc/countries/countries_code.html"
    data = read_url(url).decode('utf-8')
    countries = dict([
        [ox.strip_tags(c) for c in r]
        for r in re.compile('<tr>.*?class="code">(.*?)</td>.*?<td>(.*?)</td>', re.DOTALL).findall(data)
    ])

    data = json.dumps(countries, indent=4, ensure_ascii=False).encode('utf-8')
    with open(__file__) as f:
        pydata = f.read()
    pydata = re.sub(
        re.compile('\nCOUNTRIES = {.*?}\n\n', re.DOTALL),
        '\nCOUNTRIES = %s\n\n' % data, pydata)

    with open(__file__, 'w') as f:
        f.write(pydata)
Example #44
def info(key, value):
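    # Scrape the amazon.com product page for an ISBN and return asin, title,
    # description, authors/roles, publisher, date, isbn list and cover URL.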
    if key not in ('isbn', ):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(
        decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(
        decode_html(
            unquote(
                re.compile('encodedDescription\' : "(.*?)",').findall(data)
                [0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1],
                                   '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split(
                '; ', 1)

    if 'ISBN-13' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info: info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [
                x.strip() for x in span.text_content().strip().split('\n')
                if x.strip()
            ]
            role = get_role(author[-1])
            if not role in info: info[role] = []
            info[role].append(author[0])

    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info