def GetItems(type, genre=None, sort=None, alpha=None, pages=5, start_page=0):
    """
    Scrape up to `pages` listing pages and return the media items found.

    type: site section to list (passed to GetURL and stored on each item).
    genre, sort, alpha: listing filters, passed straight through to GetURL.
    pages: maximum number of pages to fetch.
    start_page: page offset added to the running page number.

    Returns a list of MediaInfo objects.
    """
    page_num = 0
    items = []

    while page_num < pages:
        page_num = page_num + 1

        url = GetURL(type=type, genre=genre, sort=sort, alpha=alpha,
                     page_num=page_num + start_page)
        # Keep the parsed URL around so relative poster paths can be resolved.
        url_parts = urlparse.urlparse(url)

        soup = BeautifulSoup(HTTP.Request(url).content)

        for item in soup.findAll("div", {'class': 'index_item index_item_ie'}):
            res = MediaInfo()
            res.type = type

            # Extract out title ("Watch <title>"), then split off a trailing
            # "(year)" if present.
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb. Note that extracted path may be relative.
            poster_url = item.find('img')['src']
            if poster_url.startswith("//"):
                # Protocol-relative URL: prepend the scheme only.
                poster_url = url_parts.scheme + ":" + poster_url
            elif poster_url.startswith("/"):
                # Deal with fully-relative paths. Doesn't deal with partial
                # relative paths.
                poster_url = url_parts.scheme + "://" + url_parts.netloc + poster_url
            res.poster = poster_url

            # Extract out rating from the width of the star bar.
            # BUG FIX: the old code called .group(1) before checking whether
            # the search matched at all, raising AttributeError on styles
            # without a width; also "<>" replaced by "!=".
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s([\d\.]*)px;", rating_style)
            if rating_match is not None and rating_match.group(1) != "":
                res.rating = int(int(rating_match.group(1)) / 10)

            # Add to item list.
            items.append(res)

    return items
def GetItems(type, genre=None, sort=None, alpha=None, pages=5, start_page=0):
    """
    List items from an IceFilms index page.

    type: site section ("movies", "tv", ...) appended to ICEFILMS_URL.
    genre: genre filter; lower-cased before use.
    alpha: alphabetical index ("123" or a letter); forces the "a-z" sort
        and overrides genre with the letter bucket.
    sort: sort-order path component.
    pages, start_page: accepted for interface compatibility with the other
        GetItems providers but unused here (a single page is fetched).

    Returns a list of MediaInfo objects.
    """
    items = []

    url = ICEFILMS_URL + "/" + type
    if genre:
        genre = genre.lower()
    if alpha:
        # Alphabetical browsing: the site uses "1" for the "123" bucket.
        sort = "a-z"
        genre = "1" if alpha == "123" else alpha.upper()

    # NOTE(review): the flattened original makes the nesting ambiguous;
    # sequential ifs chosen (sort appended first, then genre or "1") as the
    # most plausible reading — confirm against the site's URL scheme.
    if sort is not None:
        url = url + "/" + sort
    if genre is not None:
        url = url + "/" + genre
    else:
        url = url + "/1"

    soup = BeautifulSoup(HTTP.Request(url).content)

    # RegEx to extract out item id.
    # FIX: raw string and escaped "." so "ip.php" only matches a literal dot
    # (the old pattern's bare "." matched any character).
    id_reg_ex = re.compile(r"/ip\.php\?v=(\d+)")

    for item in soup.findAll("a", {'name': 'i'}):
        res = MediaInfo()
        res.type = type

        title_elem = item.nextSibling.nextSibling  # Pick out next element

        # Extract out title, then split off a trailing "(year)" if present.
        res.title = Utils.decode_htmlentities(str(title_elem.string))
        match = re.search(r"(.*)\((\d*)\)", res.title)
        if match:
            res.title = match.group(1).strip()
            res.year = int(match.group(2).strip())

        # Extract out id if available, otherwise, just store the item's URL.
        match = id_reg_ex.search(title_elem['href'])
        if match:
            # NOTE(review): group(0) stores the whole "/ip.php?v=NNN" path,
            # not just the numeric id — kept as-is since the else branch
            # stores a path too and callers appear to expect one.
            res.id = match.group(0)
        else:
            res.id = title_elem['href']

        # Add to item list.
        items.append(res)

    return items
def GetItems(type, genre=None, sort=None, alpha=None, pages=5, start_page=0):
    """
    Scrape up to `pages` listing pages and return the media items found.

    type: site section to list (passed to GetURL and stored on each item).
    genre, sort, alpha: listing filters, passed straight through to GetURL.
    pages: maximum number of pages to fetch.
    start_page: page offset added to the running page number.

    Returns a list of MediaInfo objects.
    """
    page_num = 0
    items = []

    while page_num < pages:
        page_num = page_num + 1

        url = GetURL(type=type, genre=genre, sort=sort, alpha=alpha,
                     page_num=page_num + start_page)
        soup = BeautifulSoup(HTTP.Request(url).content)

        for item in soup.findAll("div", {'class': 'index_item index_item_ie'}):
            res = MediaInfo()
            res.type = type

            # Extract out title ("Watch <title>"), then split off a trailing
            # "(year)" if present.
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb (may be a relative path; stored verbatim here).
            res.poster = item.find('img')['src']

            # Extract out rating from the width of the star bar.
            # BUG FIX: the old code called .group(1) before checking whether
            # the search matched, raising AttributeError on styles without a
            # width; also "<>" replaced by "!=".
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s([\d\.]*)px;", rating_style)
            if rating_match is not None and rating_match.group(1) != "":
                res.rating = int(int(rating_match.group(1)) / 10)

            # Add to item list.
            items.append(res)

    return items
def GetItems(type, genre = None, sort = None, alpha = None, pages = 5, start_page = 0):
    """Fetch a single IceFilms index listing and return its MediaInfo items.

    pages/start_page are accepted only so the signature matches the other
    providers; this listing is not paged.
    """
    results = []

    # Build the listing URL from the requested section and filters.
    listing_url = ICEFILMS_URL + "/" + type
    if genre:
        genre = genre.lower()
    if alpha:
        # Alphabetical browse mode replaces both sort and genre.
        sort = "a-z"
        if alpha == "123":
            genre = "1"
        else:
            genre = alpha.upper()
    if sort is not None:
        listing_url = listing_url + "/" + sort
    if genre is not None:
        listing_url = listing_url + "/" + genre
    else:
        listing_url = listing_url + "/1"

    page = BeautifulSoup(HTTP.Request(listing_url).content)

    # Pattern used to pull an item id path out of an href.
    id_pattern = re.compile("/ip.php\?v=(\d+)")

    for anchor in page.findAll("a", { 'name': 'i' }):
        info = MediaInfo()
        info.type = type

        # The title link sits two siblings after the anchor.
        link = anchor.nextSibling.nextSibling

        info.title = Utils.decode_htmlentities(str(link.string))
        year_match = re.search("(.*)\((\d*)\)", info.title)
        if year_match:
            info.title = year_match.group(1).strip()
            info.year = int(year_match.group(2).strip())

        # Prefer the matched id path; fall back to the raw href.
        id_match = id_pattern.search(link['href'])
        if id_match:
            info.id = id_match.group(0)
        else:
            info.id = link['href']

        results.append(info)

    return results
def GetItems(type, genre = None, sort = None, alpha = None, pages = 5, start_page = 0):
    """
    Scrape up to `pages` listing pages and return the media items found,
    resolving relative poster URLs against the listing page's URL.

    type: site section to list (passed to GetURL and stored on each item).
    genre, sort, alpha: listing filters, passed straight through to GetURL.
    pages: maximum number of pages to fetch.
    start_page: page offset added to the running page number.

    Returns a list of MediaInfo objects.
    """
    page_num = 0
    items = []

    while page_num < pages:
        page_num = page_num + 1

        url = GetURL(type = type, genre = genre, sort = sort, alpha = alpha,
                     page_num = page_num + start_page)
        # Parsed once per page so relative poster paths can be resolved.
        url_parts = urlparse.urlparse(url)

        soup = BeautifulSoup(HTTP.Request(url).content)

        for item in soup.findAll("div", { 'class': 'index_item index_item_ie' }):
            res = MediaInfo()
            res.type = type

            # Extract out title, then split off a trailing "(year)".
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb. Note that extracted path may be relative.
            poster_url = item.find('img')['src']
            if poster_url.startswith("//"):
                # Protocol-relative URL: prepend the scheme only.
                poster_url = url_parts.scheme + ":" + poster_url
            elif poster_url.startswith("/"):
                # Deal with fully-relative paths. Doesn't deal with partial
                # relative paths.
                poster_url = url_parts.scheme + "://" + url_parts.netloc + poster_url
            res.poster = poster_url

            # Extract out rating from the width of the star bar.
            # BUG FIX: previously .group(1) was taken before verifying the
            # search matched (AttributeError on non-matching styles); "<>"
            # replaced by "!=" as well.
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s([\d\.]*)px;", rating_style)
            if rating_match is not None and rating_match.group(1) != "":
                res.rating = int(int(rating_match.group(1)) / 10)

            # Add to item list.
            items.append(res)

    return items
def GetItems(type, genre = None, sort = None, alpha = None, pages = 5, start_page = 0):
    """
    Scrape up to `pages` listing pages and return the media items found.

    type: site section to list (passed to GetURL and stored on each item).
    genre, sort, alpha: listing filters, passed straight through to GetURL.
    pages: maximum number of pages to fetch.
    start_page: page offset added to the running page number.

    Returns a list of MediaInfo objects.
    """
    page_num = 0
    items = []

    while page_num < pages:
        page_num = page_num + 1

        url = GetURL(type = type, genre = genre, sort = sort, alpha = alpha,
                     page_num = page_num + start_page)
        soup = BeautifulSoup(HTTP.Request(url).content)

        for item in soup.findAll("div", { 'class': 'index_item index_item_ie' }):
            res = MediaInfo()
            res.type = type

            # Extract out title, then split off a trailing "(year)".
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb (stored verbatim; may be relative).
            res.poster = item.find('img')['src']

            # Extract out rating from the width of the star bar.
            # BUG FIX: previously .group(1) was taken before verifying the
            # search matched (AttributeError on non-matching styles); "<>"
            # replaced by "!=" as well.
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s([\d\.]*)px;", rating_style)
            if rating_match is not None and rating_match.group(1) != "":
                res.rating = int(int(rating_match.group(1)) / 10)

            # Add to item list.
            items.append(res)

    return items
def GetSearchResults(query=None, type=None, imdb_id=None, exact=False):
    """
    Search IceFilms via a Google site search.

    query: title to search for.
    type: "movies" restricts the search to movie pages; anything else
        searches for TV "Episode List" pages.
    imdb_id: accepted for interface compatibility; unused by this provider.
    exact: when True, only return results whose cleaned title exactly
        matches the query (case-insensitive).

    Returns a list of MediaInfo objects.
    """
    if type == "movies":
        # This a google search. Excluding "Episode List" pages omits all TV shows.
        search = 'intitle:%s -"Episode List" -"Series Rating" site:%s' % (query, ICEFILMS_URL)
    else:
        search = 'allintitle:%s "Episode List" site:%s' % (query, ICEFILMS_URL)

    gs = GoogleSearch(search)
    gs.results_per_page = 25
    gs.page = 0
    results = gs.get_results()

    items = []
    # FIX: loop variable renamed - the original reused "res" for both the
    # Google result and the MediaInfo object, shadowing the former.
    for result in results:
        # Strip Google markup and site boilerplate out of the result title.
        name = re.sub(
            '(<em>|</em>|<a>|</a>|DivX|-|icefilms(\.info)?|<b>\.\.\.</b>|Episode List|links)',
            '',
            result.title.encode('utf8')
        ).strip()

        # FIX: skip results whose URL is not on icefilms.info instead of
        # crashing with AttributeError on a failed match.
        url_match = re.search("icefilms\.info(/.*)", result.url)
        if url_match is None:
            continue
        video_url = url_match.group(1)

        res = MediaInfo()
        res.type = type
        res.title = name
        match = re.search(r"(.*)\((\d*)\)", res.title)
        if match:
            res.title = match.group(1).strip()
            res.year = int(match.group(2).strip())
        res.id = video_url

        # FIX: honour the previously-ignored `exact` flag (mirrors the LMWT
        # GetSearchResults implementation elsewhere in this file).
        if not exact or res.title.lower() == query.lower():
            items.append(res)

    return items
def GetSearchResults(query=None, type=None, imdb_id=None):
    """Search IceFilms through a Google site search and return MediaInfo items.

    Movies use a plain title search that excludes TV "Episode List" pages;
    everything else requires all title words plus an "Episode List" page.
    imdb_id is accepted for interface compatibility but not used.
    """
    if type == "movies":
        search = 'intitle:%s -"Episode List" -"Series Rating" site:%s' % (query,ICEFILMS_URL)
    else:
        search = 'allintitle:%s "Episode List" site:%s' % (query, ICEFILMS_URL)

    engine = GoogleSearch(search)
    engine.results_per_page = 25
    engine.page = 0

    items = []
    for hit in engine.get_results():
        # Strip Google markup and site boilerplate from the result title.
        cleaned = re.sub(
            '(<em>|</em>|<a>|</a>|DivX|-|icefilms(\.info)?|<b>\.\.\.</b>|Episode List|links)',
            '',
            hit.title.encode('utf8')
        ).strip()

        # Path component of the result URL on icefilms.info.
        path = re.search("icefilms\.info(/.*)", hit.url).group(1)

        entry = MediaInfo()
        entry.type = type
        entry.title = cleaned
        year_match = re.search("(.*)\((\d*)\)", entry.title)
        if year_match:
            entry.title = year_match.group(1).strip()
            entry.year = int(year_match.group(2).strip())
        entry.id = path

        items.append(entry)

    return items
def GetMediaInfo(url, mediainfo, query_external=False):
    """
    Retrieve meta data about the passed in LMWT item from a meta provider.
    Additionally, for any info not returned by the meta provider, try to
    collect the info directly from the LMWT item page.

    url: site-relative path of the LMWT item page.
    mediainfo: partially-populated MediaInfo; show_name, season and
        (optionally) ep_num are read from it for the provider lookup.
    query_external: when True, ask the external meta provider for the item's
        details; otherwise only the LMWT page itself is used.

    Returns a MediaInfo object, or None if the page could not be parsed.
    """
    # The description meta header for some shows inserts random double quotes
    # in the content which breaks the parsing of the page. Work around that by
    # simply removing the head section in which the meta elements are contained.
    headMassage = [(re.compile('<head>(.*)</head>', re.S), lambda match: '')]
    soupMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    soupMassage.extend(headMassage)

    soup = BeautifulSoup(
        HTTP.Request(Dict['LMWT_URL'] + url).content,
        markupMassage=soupMassage)

    # NOTE(review): initialised up front so the final return is safe even if
    # parsing fails part-way through — the flattened original opened a try
    # here whose handler was not visible.
    mediainfo_ret = None

    try:
        # Try to pick the IMDB id off the page; not all pages carry the link.
        imdb_id = None
        try:
            imdb_link = soup.find('div', 'mlink_imdb').a['href']
            imdb_id = re.search("(tt\d+)", str(imdb_link)).group()
        except Exception:
            # FIX: narrowed from a bare "except:". Still best-effort.
            pass

        # Construct kwargs for the external provider lookup.
        kwargs = {}
        kwargs['imdb_id'] = imdb_id
        kwargs['show_name'] = mediainfo.show_name
        kwargs['season'] = mediainfo.season
        if hasattr(mediainfo, 'ep_num'):
            kwargs['ep_num'] = mediainfo.ep_num

        if query_external:
            mediainfo_ret = DBProvider().GetProvider(
                mediainfo.type).RetrieveItemFromProvider(**kwargs)
        else:
            mediainfo_ret = MediaInfo()
            mediainfo_ret.id = imdb_id

        # Also parse the LMWT page and extract out any info not set by the
        # meta provider.
        info_div = soup.find('div', 'movie_info')

        # First, extract out description...
        info = {}
        info['Description:'] = info_div.find('td', {'colspan': '2'}).text

        # Then, ratings....
        info['Rating:'] = info_div.find('li', 'current-rating').text

        # Extract out any other info from the two-column label/value rows.
        for row in info_div.findAll('tr'):
            row_items = row.findAll('td')
            # FIX: "<>" replaced with "!=" (same behaviour).
            if len(row_items) != 2 or "colspan" in str(row_items[0]):
                continue
            info[row_items[0].text] = row_items[1].text

        # Map available extracted info back to the media info object.
        # Each entry maps an LMWT label to [mediainfo attribute, converter].
        item_map = {
            'Description:': ['summary', lambda x: Utils.decode_htmlentities(x)],
            'Air Date:': ['releasedate', lambda x: datetime.strptime(x, '%B %d, %Y')],
            'Runtime:': [
                'duration',
                # Minutes -> milliseconds, clamped to 0 on overflow.
                lambda x: int(re.search("(\d*)", x).group(0)) * 60 * 1000
                    if int(re.search("(\d*)", x).group(0)) * 60 * 1000 < sys.maxint
                    else 0
            ],
            'Rating:': [
                'rating',
                # Site uses a 5-point scale; double it to a 10-point scale.
                lambda x: float(re.search("([\d\.]+)", x).group(0)) * 2
            ],
            # FIX: was a bare "decode_htmlentities" (NameError at call time);
            # qualified with Utils like the Description entry above.
            'Title:': ['title', lambda x: Utils.decode_htmlentities(x)],
        }

        # For each extracted item from LMWT...
        for lmwt_item in info.keys():
            # Look for matching entry in map...
            if lmwt_item not in item_map:
                continue

            mi_item = item_map[lmwt_item]
            if mi_item is None:
                continue

            try:
                # Only fill in attributes the meta provider left empty.
                mi_val = getattr(mediainfo_ret, mi_item[0], None)
                if not mi_val:
                    setattr(mediainfo_ret, mi_item[0], mi_item[1](info[lmwt_item]))
            except Exception:
                # Best-effort per field: a malformed value (e.g. an
                # unparseable date) shouldn't abort the whole item.
                pass
    except Exception:
        # NOTE(review): handler for the outer try (missing from the flattened
        # original) — keep the whole call best-effort and fall through to
        # return whatever was collected.
        pass

    return mediainfo_ret
def GetSearchResults(query=None, type=None, imdb_id=None, exact=False):
    """
    Search the LMWT site and return matching items.

    query: search terms (also used as the title when imdb_id is given).
    type: "tv" searches the TV section; anything else searches movies.
    imdb_id: when set, skip searching and return a single item pointing at
        the site's IMDB lookup page.
    exact: when True, only return results whose title exactly matches the
        query (case-insensitive).

    Returns a list of MediaInfo objects.
    """
    items = []

    if imdb_id:
        # Direct IMDB lookup - no search needed.
        res = MediaInfo()
        res.type = type
        res.id = "/item.php?imdb=" + imdb_id
        res.title = query
        items.append(res)
    else:
        # Fetch the search page uncached to pick up the hidden form key.
        soup = BeautifulSoup(
            HTTP.Request(Dict['LMWT_SEARCH_URL'] + "?search", cacheTime=0).content)
        key = soup.find('input', {'type': 'hidden', 'name': 'key'})['value']

        section = "1"
        if type == "tv":
            section = "2"

        url = Dict[
            'LMWT_SEARCH_URL'] + "?search_section=" + section + "&search_keywords=" + urllib.quote_plus(
                query) + "&key=" + key + "&sort=views"

        soup = BeautifulSoup(HTTP.Request(url, cacheTime=0).content)

        for item in soup.findAll("div", {'class': 'index_item index_item_ie'}):
            res = MediaInfo()
            res.type = type

            # Extract out title, then split off a trailing "(year)".
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb
            res.poster = item.find('img')['src']

            # Extract out rating from the star-bar width.
            # BUG FIX: the old pattern "width:\s(\d)*px;" repeated a one-digit
            # group, so group(1) captured only the LAST digit of the width;
            # (\d*) captures the whole number. Also guard against styles that
            # don't match instead of raising AttributeError.
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s(\d*)px;", rating_style)
            if rating_match:
                res.rating = rating_match.group(1)

            # Add to item list (honouring the exact-match filter).
            if not exact or res.title.lower() == query.lower():
                items.append(res)

    return items
def GetSearchResults(query=None, type=None, imdb_id=None, exact=False):
    """
    Search the LMWT site and return matching items.

    query: search terms (also used as the title when imdb_id is given).
    type: "tv" searches the TV section; anything else searches movies.
    imdb_id: when set, skip searching and return a single item pointing at
        the site's IMDB lookup page.
    exact: when True, only return results whose title exactly matches the
        query (case-insensitive).

    Returns a list of MediaInfo objects.
    """
    items = []

    if imdb_id:
        # Direct IMDB lookup - no search needed.
        res = MediaInfo()
        res.type = type
        res.id = "/item.php?imdb=" + imdb_id
        res.title = query
        items.append(res)
    else:
        # Fetch the search page uncached to pick up the hidden form key.
        soup = BeautifulSoup(HTTP.Request(Dict['LMWT_SEARCH_URL'] + "?search", cacheTime=0).content)
        key = soup.find('input', { 'type': 'hidden', 'name': 'key' })['value']

        section = "1"
        if type == "tv":
            section = "2"

        url = Dict['LMWT_SEARCH_URL'] + "?search_section=" + section + "&search_keywords=" + urllib.quote_plus(query) + "&key=" + key + "&sort=views"
        soup = BeautifulSoup(HTTP.Request(url, cacheTime=0).content)

        for item in soup.findAll("div", { 'class': 'index_item index_item_ie' }):
            res = MediaInfo()
            res.type = type

            # Extract out title, then split off a trailing "(year)".
            res.title = re.search("Watch (.*)", item.find('a')['title']).group(1).strip()
            match = re.search(r"(.*)\((\d*)\)", res.title)
            if match:
                res.title = match.group(1).strip()
                res.year = int(match.group(2).strip())

            # Extract out URL (drop the leading "/").
            res.id = item.a['href'][1:]

            # Extract out thumb
            res.poster = item.find('img')['src']

            # Extract out rating from the star-bar width.
            # BUG FIX: the old pattern "width:\s(\d)*px;" repeated a one-digit
            # group, so group(1) captured only the LAST digit of the width;
            # (\d*) captures the whole number. Also guard against styles that
            # don't match instead of raising AttributeError.
            rating_style = item.find('li')['style']
            rating_match = re.search(r"width:\s(\d*)px;", rating_style)
            if rating_match:
                res.rating = rating_match.group(1)

            # Add to item list (honouring the exact-match filter).
            if not exact or res.title.lower() == query.lower():
                items.append(res)

    return items