Python find_re 예제들, ox.text.find_re Python 예제들

예제 #1

0

파일 보기

 def get_data(self):
     """Fetch the iTunes "viewMovie" page for ``self.id`` and scrape its metadata.

     Returns a dict with the keys ``id``, ``actors``, ``averageRating``,
     ``directors``, ``format``, ``genre``, ``plotSummary``, ``posterUrl``,
     ``producers``, ``rated``, ``relatedMovies``, ``releaseDate``,
     ``runTime``, ``screenwriters``, ``soundtrackId`` and ``trailerUrl``.
     """
     data = {"id": self.id}
     url = compose_url("viewMovie", {"id": self.id})
     xml = read_url(url, None, ITUNES_HEADERS)
     # NOTE(review): this dumps the raw response to a hard-coded, user-specific
     # path and looks like leftover debugging -- confirm whether it is still
     # wanted. The context manager guarantees the handle is closed even if
     # the write fails.
     with open("/Users/rolux/Desktop/iTunesData.xml", "w") as f:
         f.write(xml)
     data["actors"] = parse_cast(xml, "actors")
     string = find_re(xml, "Average Rating:(.*?)</HBoxView>")
     # One point per full-star image, half a point per "&#189;" entity.
     data["averageRating"] = string.count("rating_star_000033.png") + string.count("&#189;") * 0.5
     data["directors"] = parse_cast(xml, "directors")
     data["format"] = find_re(xml, "Format:(.*?)<")
     data["genre"] = decode_html(find_re(xml, "Genre:(.*?)<"))
     data["plotSummary"] = decode_html(
         find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
     )
     data["posterUrl"] = find_re(xml, 'reflection="." url="(.*?)"')
     data["producers"] = parse_cast(xml, "producers")
     data["rated"] = find_re(xml, "Rated(.*?)<")
     data["relatedMovies"] = parse_movies(xml, "related movies")
     data["releaseDate"] = find_re(xml, "Released(.*?)<")
     data["runTime"] = find_re(xml, "Run Time:(.*?)<")
     data["screenwriters"] = parse_cast(xml, "screenwriters")
     # Raw string so that "\?" reaches the regex engine as an escaped "?".
     data["soundtrackId"] = find_re(xml, r"viewAlbum\?id=(.*?)&")
     data["trailerUrl"] = find_re(xml, 'autoplay="." url="(.*?)"')
     return data

예제 #2

0

파일 보기

파일: lyricsfly.py 프로젝트: adityamangla/metaStudio

def get_lyrics(title, artist):
    """Look up the lyrics of a song through the lyricsfly.com API.

    The API key is scraped from the lyricsfly API page and then used to
    query for ``artist`` and ``title``.

    Returns the lyrics text with raw line breaks removed, ``[br]`` markers
    turned into newlines, triple newlines collapsed to double newlines and
    HTML entities decoded.
    """
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    # NOTE(review): artist and title are interpolated without URL quoting --
    # confirm callers pass values that are already safe for a query string.
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, r'<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    # Line breaks in the payload are encoded as "[br]"; drop the literal ones first.
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    # str.replace returns a new string, so the result has to be rebound.
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    lyrics = decode_html(lyrics.replace('&amp;', '&'))
    return lyrics

예제 #3

0

파일 보기

def parse_cast(xml, title):
    """Extract the names listed under one cast section of an iTunes page.

    ``title`` is the plural section name; its last character is dropped and
    the rest upper-cased to form the heading searched for (so "actors"
    looks for "ACTOR").

    Returns a list of names. Parsing is best effort: if anything fails, the
    names collected so far (possibly none) are returned instead of raising.
    """
    names = []
    try:
        pattern = '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()
        strings = find_re(xml, pattern).split("</GotoURL>")
        # The piece after the last "</GotoURL>" is not an entry.
        strings.pop()
        for string in strings:
            names.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        return names
    except Exception:
        # Still best effort, but KeyboardInterrupt/SystemExit now propagate.
        return names

예제 #4

0

파일 보기

def parse_xml_dict(xml):
    """Turn a flat ``<key>name</key><type>value</type>`` fragment into a dict.

    A ``<true/>`` element yields ``True``, ``integer`` values are converted
    with ``int`` and ``string`` values are passed through ``decode_html``;
    values of any other element type are kept as the raw matched text.
    """
    parsed = {}
    for chunk in xml.split("<key>"):
        # Pieces without a closing tag carry no key (e.g. text before the first <key>).
        if "</key>" not in chunk:
            continue
        name = find_re(chunk, "(.*?)</key>")
        tag = find_re(chunk, "</key><(.*?)>")
        if tag == "true/":
            parsed[name] = True
            continue
        content = find_re(chunk, "<%s>(.*?)</%s>" % (tag, tag))
        if tag == "integer":
            content = int(content)
        elif tag == "string":
            content = decode_html(content)
        parsed[name] = content
    return parsed

예제 #5

0

파일 보기

def parse_movies(xml, title):
    """Extract the movies listed under one section of an iTunes page.

    ``title`` is the section name; its last character is dropped and the
    rest upper-cased to form the heading searched for (so "related movies"
    looks for "RELATED MOVIE").

    Returns a list of ``{"id": ..., "title": ...}`` dicts. Parsing is best
    effort: if anything fails, the entries collected so far (possibly none)
    are returned instead of raising.
    """
    movies = []
    try:
        pattern = '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()
        strings = find_re(xml, pattern).split("</GotoURL>")
        # The piece after the last "</GotoURL>" is not an entry.
        strings.pop()
        for string in strings:
            movies.append(
                {
                    # Raw string so that "\?" reaches the regex engine as an escaped "?".
                    "id": find_re(string, r"viewMovie\?id=(.*?)&"),
                    "title": find_re(
                        string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>'
                    ),
                }
            )
        return movies
    except Exception:
        # Still best effort, but KeyboardInterrupt/SystemExit now propagate.
        return movies

예제 #6

0

파일 보기

파일: impawards.py 프로젝트: adityamangla/metaStudio

def get_id(url):
    """Derive the "<year>/<name>" id from an impawards page URL.

    The year is the fourth "/"-separated component of ``url`` and the name
    comes from the fifth, with its last five characters removed (presumably
    the ".html" suffix -- TODO confirm). A trailing "xlg" part and a
    trailing "ver<digits>" part are stripped from the name.
    """
    parts = url.split('/')
    year = parts[3]
    name_parts = parts[4][:-5].split('_')
    if name_parts[-1] == 'xlg':
        name_parts.pop()
    # Raw string so that "\d" is a regex escape, not an invalid string escape.
    if find_re(name_parts[-1], r'ver\d+$'):
        name_parts.pop()
    return '%s/%s' % (year, '_'.join(name_parts))

예제 #7

0

파일 보기

 def get_data(self):
     """Fetch the iTunes "viewAlbum" page for ``self.id`` and scrape its metadata.

     Returns a dict with the keys ``id``, ``albumName``, ``artistName``,
     ``coverUrl``, ``genre``, ``releaseDate``, ``review``, ``tracks`` (one
     parsed dict per ``<dict>`` chunk after the "items" key) and ``type``.
     """
     xml = read_url(compose_url("viewAlbum", {"id": self.id}), None, ITUNES_HEADERS)
     data = {
         "id": self.id,
         "albumName": find_re(xml, "<B>(.*?)</B>"),
         "artistName": find_re(xml, "<b>(.*?)</b>"),
         "coverUrl": find_re(xml, 'reflection="." url="(.*?)"'),
         "genre": find_re(xml, "Genre:(.*?)<"),
         "releaseDate": find_re(xml, "Released(.*?)<"),
         "review": strip_tags(
             find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
         ),
     }
     # Everything after the first <dict> following the "items" key, one chunk per track.
     track_chunks = find_re(xml, "<key>items</key>.*?<dict>(.*?)$").split("<dict>")
     data["tracks"] = [parse_xml_dict(chunk) for chunk in track_chunks]
     data["type"] = find_re(xml, "<key>listType</key><string>(.*?)<")
     return data

예제 #8

0

파일 보기

파일: impawards.py 프로젝트: adityamangla/metaStudio

def get_ids(page=None):
    """Collect impawards ids from the site's archive pages.

    With a truthy ``page``, returns the ``set`` of ids linked from that one
    archive page. Without it, reads the page count from the "latest"
    archive page and returns a ``list`` of the unique ids of every archive
    page, walking from the highest page number down to 1.
    """
    if page:
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
        results = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        return set(get_id('http://impawards.com/%s' % result) for result in results)
    # get all
    ids = []
    seen = set()  # O(1) membership test while `ids` keeps first-seen order
    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    for page_number in range(pages, 0, -1):
        for poster_id in get_ids(page_number):
            if poster_id not in seen:
                seen.add(poster_id)
                ids.append(poster_id)
    return ids

예제 #9

0

파일 보기

파일: impawards.py 프로젝트: adityamangla/metaStudio

def get_data(id):
    '''
    Scrape the impawards.com page for ``id`` and the poster pages it links to.

    Returns a dict with the keys ``url``, ``imdbId`` (taken from
    ``_id_map`` when the page has no IMDb link, else ''), ``title``,
    ``year`` and ``posters`` (a list of poster image URLs).

    >>> get_data('1991/silence_of_the_lambs')['imdbId']
    u'0102926'

    >>> get_data('1991/silence_of_the_lambs')['posters'][0]
    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'

    >>> get_data('1991/silence_of_the_lambs')['url']
    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    data = {
        'url': get_url(id)
    }
    html = read_url(data['url'], unicode=True)
    # Regex literals are raw strings so that "\d", "\(" and "\w" are regex
    # escapes rather than invalid string escapes.
    data['imdbId'] = find_re(html, r'imdb.com/title/tt(\d{7})')
    if not data['imdbId']:
        data['imdbId'] = _id_map.get(id, '')
    data['title'] = strip_tags(find_re(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = find_re(html, r'\(<a href="alpha1.html">(.*?)</a>\)')
    data['posters'] = []
    # Every poster and page URL is "<site>/<year>/<path>".
    url_template = 'http://www.impawards.com/%s/%s'
    poster = find_re(html, '<img src="(posters.*?)"')
    if poster:
        data['posters'].append(url_template % (data['year'], poster))
    # id[5:] skips the first five characters of the id (presumably the
    # "YYYY/" prefix -- TODO confirm).
    # NOTE(review): the name is interpolated into the pattern unescaped.
    results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
    for result in results:
        result = result.replace('_xlg.html', '.html')
        html = read_url(url_template % (data['year'], result), unicode=True)
        # Use the image from the linked "_xlg" page when the variant page has one.
        result = find_re(html, r'<a href = (\w*?_xlg.html)')
        if result:
            html = read_url(url_template % (data['year'], result), unicode=True)
            poster = url_template % (data['year'], find_re(html, '<img SRC="(.*?)"'))
        else:
            poster = url_template % (data['year'], find_re(html, '<img src="(posters.*?)"'))
        data['posters'].append(poster)

    return data

예제 #10

0

파일 보기

 def get_id(self):
     """Search iTunes for a movie matching ``self.title`` and ``self.director``.

     Returns the id captured from the first "viewMovie?id=" link in the
     search response (whatever ``find_re`` yields when there is no match).
     """
     url = compose_url("advancedSearch", {"media": "movie", "title": self.title, "director": self.director})
     xml = read_url(url, headers=ITUNES_HEADERS)
     # Raw string so that "\?" reaches the regex engine as an escaped "?".
     return find_re(xml, r"viewMovie\?id=(.*?)&")

예제 #11

0

파일 보기

 def get_id(self):
     """Search iTunes for an album matching ``self.title`` and ``self.artist``.

     Returns the id captured from the first "viewAlbum?id=" link in the
     search response (whatever ``find_re`` yields when there is no match).
     """
     url = compose_url("advancedSearch", {"media": "music", "title": self.title, "artist": self.artist})
     xml = read_url(url, headers=ITUNES_HEADERS)
     # Raw string so that "\?" reaches the regex engine as an escaped "?".
     return find_re(xml, r"viewAlbum\?id=(.*?)&")

예제 #12

0

파일 보기

파일: criterion.py 프로젝트: adityamangla/metaStudio

def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
    Scrape the criterion.com page for film ``id``.

    Returns a dict with the keys ``url``, ``number``, ``title``,
    ``director``, ``country``, ``year``, ``synopsis``, ``posters``,
    ``stills`` and ``trailers``, plus ``imdbId`` when ``get_imdb`` is true.
    Raises IndexError when the page's left column has fewer than two
    ``<li>`` entries (country and year).

    >>> get_data('1333').get('imdbId')
    u'0060304'

    >>> get_data('236')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'

    >>> get_data('786')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": get_url(id)
    }
    try:
        html = read_url(data["url"], timeout=timeout, unicode=True)
    except Exception:
        # Retry through ox.cache.read_url without the unicode flag;
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        html = ox.cache.read_url(data["url"], timeout=timeout)
    data["number"] = find_re(html, r"<li>Spine #(\d+)")

    data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
    data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
    results = find_re(html, '<div class="left_column">(.*?)</div>')
    results = re.compile("<li>(.*?)</li>").findall(results)
    data["country"] = results[0]
    data["year"] = results[1]
    data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))

    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        # Prefer the link from the "Other Editions" section when there is one.
        r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
        if r:
            result = r[0]
    result = find_re(result, "<a href=\"(.*?)\"")
    if "/boxsets/" not in result:
        data["posters"] = [result]
    else:
        # The link goes to a box-set page: find this film's entry there and
        # take its image.
        html_ = read_url(result, unicode=True)
        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, "src=\"(.*?)\"")
        if result:
            data["posters"] = [result.replace("_w100", "")]
        else:
            data["posters"] = []
    # Strip a trailing "?<digits>" suffix from each poster URL.
    data['posters'] = [re.sub(r'(\?\d+)$', '', p) for p in data['posters']]
    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        # Build real lists: filter() would give a lazy iterator on Python 3.
        still = find_re(html, "\"thumbnailURL\", \"(.*?)\"")
        trailer = find_re(html, "\"videoURL\", \"(.*?)\"")
        data["stills"] = [still] if still else []
        data["trailers"] = [trailer] if trailer else []

    if timeout == ox.cache.cache_timeout:
        timeout = -1
    if get_imdb:
        # removed year, as "title (year)" may fail to match
        data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
    return data

예제 #13

0

파일 보기

파일: impawards.py 프로젝트: adityamangla/metaStudio

def get_url(id):
    """Return the impawards page URL for ``id``.

    The plain "<id>.html" page is fetched first; when it contains the text
    "No Movie Posters on This Page", the "<id>_ver1.html" URL is returned
    instead.
    """
    plain_url = u"http://www.impawards.com/%s.html" % id
    page = read_url(plain_url, unicode=True)
    if not find_re(page, "No Movie Posters on This Page"):
        return plain_url
    return u"http://www.impawards.com/%s_ver1.html" % id