def __init__(self):
    # Disk-backed cache for scraped kinopoisk.ru data.
    self.cache = Cache('kinopoisk.db')
    # HTML cleanup helper and HTTP client.
    self.html = Clear()
    self.http = HTTP()
    # Browser-like request headers sent with every fetch.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3'
    headers['Cache-Control'] = 'no-cache'
    headers['Referer'] = 'http://www.kinopoisk.ru/level/7/'
    self.headers = headers
def __init__(self):
    # TheTVDB API key used by this scraper's requests.
    self.api_key = '1D62F2F90030C444'
    # Disk-backed cache and HTTP client.
    self.cache = Cache('tvdb.db')
    self.http = HTTP()
    # Browser-like request headers sent with every fetch.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3'
    headers['Cache-Control'] = 'no-cache'
    headers['Referer'] = 'http://www.thetvdb.com/'
    self.headers = headers
def __init__(self):
    self.setting = Setting()
    # Matches the "send password" link, which the forum shows only to
    # logged-out sessions — used to detect that authentication is required.
    self.re_auth = re.compile(r'profile\.php\?mode=sendpassword"')
    # Captcha form pieces: image URL, hidden session id, and the
    # per-session name of the text input holding the code.
    self.re_captcha = re.compile(r'<img src="(\/\/[^\/]+/captcha/[^"]+)"')
    self.re_captcha_sid = re.compile(r'<input type="hidden" name="cap_sid" value="([^"]+)">')
    self.re_captcha_code = re.compile(r'<input type="text" name="(cap_code_[^"]+)"')
    # Captcha state, filled in when a captcha challenge is encountered.
    self.captcha_sid = self.captcha_code = self.captcha_code_value = None
    self.http = HTTP()
    # Browser-like request headers sent with every fetch.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3'
    headers['Cache-Control'] = 'no-cache'
    headers['Referer'] = 'http://rutracker.lib/forum/index.php'
    self.headers = headers
def __init__(self):
    self.setting = Setting()
    # The quoted "send password" link appears only for logged-out sessions;
    # its presence signals that we must (re-)authenticate.
    self.re_auth = re.compile(r'"profile\.php\?mode=sendpassword"')
    self.http = HTTP()
    # Browser-like request headers sent with every fetch.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3'
    headers['Cache-Control'] = 'no-cache'
    headers['Referer'] = 'http://rutracker.org/forum/index.php'
    self.headers = headers
def __init__(self):
    self.setting = Setting()
    # Matches the "send password" link, shown only to logged-out sessions.
    self.re_auth = re.compile(r'profile\.php\?mode=sendpassword"')
    # Captcha form pieces: image URL, hidden session id, and the
    # per-session name of the text input holding the code.
    self.re_captcha = re.compile(r'<img src="(\/\/[^\/]+/captcha/[^"]+)"')
    self.re_captcha_sid = re.compile(r'<input type="hidden" name="cap_sid" value="([^"]+)">')
    self.re_captcha_code = re.compile(r'<input type="text" name="(cap_code_[^"]+)"')
    # Captcha state, filled in when a captcha challenge is encountered.
    self.captcha_sid = self.captcha_code = self.captcha_code_value = None
    self.http = HTTP()
    # Browser-like request headers sent with every fetch.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3'
    headers['Cache-Control'] = 'no-cache'
    headers['Referer'] = 'http://rutracker.nl/forum/index.php'
    self.headers = headers
class KinoPoisk:
    """
    Scraper for kinopoisk.ru (HTML pages, windows-1251 encoded).

    API:
        scraper - cached name/year lookup returning a full movie profile
        movie   - movie profile by kinopoisk id
        search  - movie search by name
        best    - query of the "best movies" top-list navigator
        person  - person search
        work    - filmography of a person, grouped by role
    """

    def __init__(self):
        # Disk-backed cache, HTML cleanup helper and HTTP client.
        self.cache = Cache('kinopoisk.db')
        self.html = Clear()
        self.http = HTTP()
        # Browser-like request headers sent with every fetch.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.kinopoisk.ru/level/7/'
        }

    # API

    def scraper(self, name, year=None, trailer_quality=None):
        """Resolve a movie by name (and optional year), then return its profile.

        The name->id resolution is cached under a 'scraper:' key; returns
        None when the name cannot be encoded or nothing is found.
        """
        try:
            # The name must encode to windows-1251 to build the cache key/URL.
            tag = 'scraper:' + urllib.quote_plus(name.encode('windows-1251'))
        except:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, name, year)
            if not id:
                return None
            return self.movie(id, trailer_quality)

    def movie(self, id, trailer_quality=None):
        """Return the (cached) movie profile dict for a kinopoisk id.

        Trailer variants above trailer_quality (default 6, the highest in
        _movie's 1-3 / +3-for-HD scheme) are dropped; 'info.trailer' is set
        to the best available clip URL.
        """
        id = str(id)
        if trailer_quality is None:
            trailer_quality = 6
        movie = self.cache.get('movie:' + id, self._movie, id)
        if not movie:
            return None
        if movie['trailers']:
            # Keep, per clip, the best variant not exceeding the wanted quality.
            video = []
            for m in movie['trailers']:
                url = [x for x in m['video'] if x[0] <= trailer_quality]
                if url:
                    m['video'] = url[-1]
                    video.append(m)
            movie['trailers'] = video
        if movie['trailers']:
            # Pick the main trailer: prefer an actual trailer/teaser clip.
            r = [x for x in movie['trailers'] if x['trailer']]
            if r:
                movie['info']['trailer'] = r[0]['video'][1]
            else:
                # No trailer found — fall back to whatever clip comes first.
                movie['info']['trailer'] = movie['trailers'][0]['video'][1]
        return movie

    def search(self, name, trailer_quality=None):
        """Search movies by name; returns {'pages': ..., 'data': [ids]}."""
        return self._search_movie(name)

    def best(self, **kwarg):
        """Query the top-list navigator.

        Keyword filters: page, limit, votes, dvd, decade, genre (key into
        GENRE), country, rate, mpaa. Returns {'pages': (pages, prev, page,
        next), 'data': [movie ids]} or None on HTTP error.
        """
        page = kwarg.get('page', 1)
        limit = kwarg.get('limit', 50)
        url = 'http://www.kinopoisk.ru/top/navigator/m_act%5Bis_film%5D/on/m_act%5Bnum_vote%5D/' + str(
            kwarg.get('votes', 100)) + '/'
        if kwarg.get('dvd'):
            url += 'm_act%5Bis_dvd%5D/on/'
        if kwarg.get('decade'):
            url += 'm_act%5Bdecade%5D/' + str(kwarg['decade']) + '/'
        if kwarg.get('genre'):
            url += 'm_act%5Bgenre%5D/' + str(GENRE[kwarg['genre']]) + '/'
        if kwarg.get('country'):
            url += 'm_act%5Bcountry%5D/' + str(kwarg['country']) + '/'
        if kwarg.get('rate'):
            url += 'm_act%5Brating%5D/' + str(kwarg['rate']) + ':/'
        if kwarg.get('mpaa'):
            url += 'm_act%5Bmpaa%5D/' + str(kwarg['mpaa']) + '/'
        url += 'perpage/' + str(limit) + '/order/ex_rating/'
        if page > 1:
            url += 'page/' + str(page) + '/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        res = {'pages': (1, 0, 1, 0), 'data': []}
        r = re.compile('<div class="pagesFromTo(.+?)<div class="pagesFromTo', re.U | re.S).search(
            response.body.decode('windows-1251'))
        if r:
            body = r.group(1)
            # compile pagelist: derive current page and page count from the
            # "N—M of T" counter shown on the page.
            p = re.compile('>([0-9]+)—[0-9]+[^0-9]+?([0-9]+)', re.U).search(body)
            if p:
                page = (int(p.group(1)) - 1) / limit + 1
                total = int(p.group(2))
                pages = total / limit
                if limit * pages != total:
                    pages += 1
                res['pages'] = (pages, 0 if page == 1 else page - 1, page,
                                0 if page == pages else page + 1)
            # end compile
            for id in re.compile('<div id="tr_([0-9]+)"', re.U | re.S).findall(body):
                res['data'].append(int(id))
        return res

    def person(self, name):
        """Search people by name; returns {'pages': ..., 'data': [person dicts]}.

        Each dict carries id, name, originalname, year and poster (None when
        the list shows the no-poster placeholder).
        """
        #response = self.http.fetch('https://www.kinopoisk.ru/index.php?level=7&from=forma&result=adv&m_act%5Bfrom%5D=forma&m_act%5Bwhat%5D=actor&m_act%5Bfind%5D=' + urllib.quote_plus(name.encode('windows-1251')), headers=self.headers)
        response = self.http.fetch(
            'http://www.kinopoisk.ru/s/type/people/list/1/find/' +
            urllib.quote_plus(name.encode('windows-1251')) + '/order/relevant/',
            headers=self.headers)
        if response.error:
            return None
        res = []
        body = re.compile(
            '<div class="navigator">(.+?)<div class="navigator">',
            re.U | re.S).search(response.body.decode('windows-1251'))
        if body:
            for block in re.compile('<p class="pic">(.+?)<div class="clear">', re.U | re.S).findall(body.group(1)):
                id, name, original, year, poster = None, None, None, None, None
                r = re.compile(
                    '<p class="name"><a href="/name/([0-9]+)[^>]+>([^<]+)</a>',
                    re.U | re.S).search(block)
                if r:
                    id = r.group(1)
                    name = r.group(2).strip()
                if id and name:
                    r = re.compile('<span class="gray">([^<]+)</span>', re.U | re.S).search(block)
                    if r:
                        original = r.group(1).strip()
                        if not original:
                            original = None
                    r = re.compile('<span class="year">([0-9]{4})</span>', re.U | re.S).search(block)
                    if r:
                        year = int(r.group(1))
                    # Placeholder image means the person has no photo.
                    if block.find('no-poster.gif') == -1:
                        poster = 'http://st.kinopoisk.ru/images/actor/' + id + '.jpg'
                    res.append({
                        'id': int(id),
                        'name': name,
                        'originalname': original,
                        'year': year,
                        'poster': poster
                    })
        return {'pages': (1, 0, 1, 0), 'data': res}

    def work(self, id):
        """Return a person's filmography as {role: [movie ids]}.

        Serials and mini-serials are skipped; 'producer_ussr' is folded into
        'producer'. Returns None on HTTP error.
        """
        response = self.http.fetch('http://www.kinopoisk.ru/name/' + str(id) + '/', headers=self.headers)
        if response.error:
            return None
        res = {}
        r = re.compile('id="sort_block">(.+?)<div id="block_right"',
                       re.U | re.S).search(response.body.decode('windows-1251'))
        if r:
            # [1:] drops the leading chunk before the first role section.
            for block in r.group(1).split(
                    u'<tr><td colspan="3" class="specializationBox')[1:]:
                work = None
                for w in ('actor', 'director', 'writer', 'producer',
                          'producer_ussr', 'composer', 'operator', 'editor',
                          'design', 'voice', 'voice_director'):
                    if block.find(u'id="' + w + u'"') != -1:
                        work = 'producer' if w == 'producer_ussr' else w
                        break
                if work:
                    movies = []
                    for id, name in re.compile(
                            '<span class="name"><a href="/film/([0-9]+)/[^>]+>([^<]+?)</a>',
                            re.U).findall(block):
                        # for/else: only keep entries that are not serials.
                        for tag in (u'(мини-сериал)', u'(сериал)'):
                            if name.find(tag) != -1:
                                break
                        else:
                            movies.append(int(id))
                    if movies:
                        res.setdefault(work, []).extend(movies)
        return res

    def review(self, id, query):
        """Return reviews for a movie.

        query is 'stat', 'all', 'good', 'bad' or 'neutral'; 'stat' returns the
        per-category counters gathered alongside the 'all' listing.
        """
        query_s = 'all' if query == 'stat' else query
        data = self.cache.get('review:' + str(id) + ':' + query_s, self._review, id, query_s)
        if not data:
            return data
        return data[query]

    def countries(self):
        """Return the module-level COUNTRIES table (id, name pairs)."""
        return COUNTRIES

    def country(self, id, default=None):
        """Map a country id to its name, or default when unknown."""
        country = [x[1] for x in COUNTRIES if x[0] == id]
        return country[0] if country else default

    # PRIVATE

    def _search_movie(self, name, year=None):
        """Search movies; returns {'pages': ..., 'data': [ids]} or None on error."""
        url = 'http://www.kinopoisk.ru/s/type/film/list/1/find/' + urllib.quote_plus(
            name.encode('windows-1251')) + '/order/relevant'
        if year:
            url += '/m_act%5Byear%5D/' + str(year)
        url += '/m_act%5Btype%5D/film/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        res = []
        r = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U | re.S).search(
            response.body.decode('windows-1251'))
        if r:
            for id in re.compile(
                    '<p class="name"><a href="/level/1/film/([0-9]+)',
                    re.U | re.S).findall(r.group(1)):
                res.append(int(id))
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        """Cache worker: resolve (name, year) to the first matching movie id.

        Returns (timeout, id) for the cache layer: a short timeout for fresh
        films (site data may still change), 3 days for an empty result, and
        False on fetch failure (do not cache).
        """
        timeout = True
        # Fresh film — cache for a limited time only, updates are likely.
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        ids = self._search_movie(name, year)
        if ids is None:
            return False, None
        elif not ids['data']:
            # Cache the empty result for 3 days.
            return 259200, None
        else:
            return timeout, ids['data'][0]

    def _review(self, id, query):
        """Cache worker: fetch review stats and review bodies for a movie.

        Returns (3600, res) where res holds 'stat' counters plus the list of
        reviews under the query key; (False, None) on fetch failure.
        """
        url = 'http://www.kinopoisk.ru/film/' + str(id) + '/ord/rating/'
        if query in ('good', 'bad', 'neutral'):
            url += 'status/' + query + '/'
        url += 'perpage/200/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return False, None
        html = response.body.decode('windows-1251')
        res = {
            'stat': {
                'all': 0,
                'good': 0,
                'bad': 0,
                'neutral': 0
            },
            query: []
        }
        # Per-category counters from the response-type list.
        r = re.compile('<ul class="resp_type">(.+?)</ul>', re.U | re.S).search(html)
        if r:
            ul = r.group(1)
            for q, t in (('pos', 'good'), ('neg', 'bad'), ('neut', 'neutral')):
                r = re.compile(
                    '<li class="' + q + '"><a href="[^>]+>[^<]+</a><b>([0-9]+)</b></li>',
                    re.U).search(ul)
                if r:
                    res['stat'][t] = int(r.group(1))
            res['stat']['all'] = res['stat']['good'] + res['stat']['bad'] + res['stat']['neutral']
        r = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U | re.S).search(html)
        if r:
            for block in r.group(1).split('itemprop="reviews"'):
                review = {
                    'nick': None,
                    'count': None,
                    'title': None,
                    'review': None,
                    'time': None
                }
                r = re.compile('itemprop="reviewBody">(.+?)</div>', re.U | re.S).search(block)
                if r:
                    text = r.group(1)
                    # Convert the site's markup to Kodi-style [B]/[I]/[U] tags.
                    for tag1, tag2 in ((u'<=end=>', u'\n'), (u'<b>', u'[B]'),
                                       (u'</b>', u'[/B]'), (u'<i>', u'[I]'),
                                       (u'</i>', u'[/I]'), (u'<u>', u'[U]'),
                                       (u'</u>', u'[/U]')):
                        text = text.replace(tag1, tag2)
                    r = self.html.text(text)
                    if r:
                        review['review'] = r
                # Author nick: linked profile first, plain-text fallback.
                user = None
                r = re.compile(
                    '<p class="profile_name"><s></s><a href="[^>]+>([^<]+)</a></p>'
                ).search(block)
                if r:
                    user = self.html.string(r.group(1))
                else:
                    r = re.compile('<p class="profile_name"><s></s>([^<]+)</p>'
                                   ).search(block)
                    if r:
                        user = self.html.string(r.group(1))
                if user:
                    review['nick'] = user
                r = re.compile('<p class="sub_title"[^>]+>([^<]+)</p>').search(block)
                if r:
                    title = self.html.string(r.group(1))
                    if title:
                        review['title'] = title
                r = re.compile('<span class="date">([^<]+)</span>', re.U | re.S).search(block)
                if r:
                    review['time'] = r.group(1).replace(u' |', u',')
                r = re.compile(u'<a href="[^>]+>рецензии \(([0-9]+)\)</a>', re.U | re.S).search(block)
                if r:
                    review['count'] = int(r.group(1))
                # Keep only blocks that yielded both an author and a body.
                if review['nick'] and review['review']:
                    res[query].append(review)
        return 3600, res  # one hour

    def _movie(self, id):
        """Cache worker: scrape the full profile page of one movie.

        Returns (timeout, res) where res holds thumb/fanart/trailers/info;
        (False, None) on fetch failure. Fresh films get a one-week timeout.
        """
        response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/', headers=self.headers)
        if response.error:
            return False, None
        html = response.body.decode('windows-1251')
        res = {
            'id': int(id),
            'thumb': None,
            'fanart': None,
            'trailers': [],
            'info': {}
        }
        # title, original title, tagline, mpaa, year, top250.
        # runtime is kept as its own field (otherwise the file size would not
        # be visible in the UI).
        for tag, reg, cb in (
                ('title', '<h1 class="moviename-big" itemprop="name">(.+?)</h1>', self.html.string),
                ('originaltitle', 'itemprop="alternativeHeadline">([^<]*)</span>', self.html.string),
                ('tagline', '<td style="color\: #555">«(.+?)»</td></tr>', self.html.string),
                ('mpaa', 'images/mpaa/([^\.]+).gif', self.html.string),
                ('runtime', '<td class="time" id="runtime">[^<]+<span style="color\: #999">/</span>([^<]+)</td>', self.html.string),
                ('year', '<a href="/lists/m_act%5Byear%5D/([0-9]+)/"', int),
                ('top250', '<a href="/level/20/#([0-9]+)', int)):
            r = re.compile(reg, re.U).search(html)
            if r:
                value = r.group(1).strip()
                if value:
                    res['info'][tag] = cb(value)
        # directors, writers, genres — joined into comma-separated strings.
        for tag, reg in (('director', u'<td itemprop="director">(.+?)</td>'),
                         ('writer', u'<td class="type">сценарий</td><td[^>]*>(.+?)</td>'),
                         ('genre', u'<span itemprop="genre">(.+?)</span>')):
            r = re.compile(reg, re.U | re.S).search(html)
            if r:
                r2 = []
                for r in re.compile('<a href="[^"]+">([^<]+)</a>', re.U).findall(r.group(1)):
                    r = self.html.string(r)
                    if r and r != '...':
                        r2.append(r)
                if r2:
                    res['info'][tag] = u', '.join(r2)
        # plot
        r = re.compile(
            '<span class="_reachbanner_"><div class="brand_words film-synopsys" itemprop="description">(.+?)</div></span>',
            re.U).search(html)
        if r:
            plot = self.html.text(r.group(1).replace('<=end=>', '\n'))
            if plot:
                res['info']['plot'] = plot
        # IMDB rating and vote count.
        r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html)
        if r:
            res['info']['rating'] = float(r.group(1).strip())
            res['info']['votes'] = r.group(2).strip()
        # world premiere date — convert "D month YYYY" to ISO YYYY-MM-DD.
        r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U | re.S).search(html)
        if r:
            r = re.compile(u'data\-ical\-date="([^"]+)"', re.U | re.S).search(r.group(1))
            if r:
                data = r.group(1).split(' ')
                if len(data) == 3:
                    i = 0
                    for mon in (u'января', u'февраля', u'марта', u'апреля',
                                u'мая', u'июня', u'июля', u'августа',
                                u'сентября', u'октября', u'ноября', u'декабря'):
                        i += 1
                        if mon == data[1]:
                            mon = str(i)
                            if len(mon) == 1:
                                mon = '0' + mon
                            day = data[0]
                            if len(day) == 1:
                                day = '0' + day
                            res['info']['premiered'] = '-'.join(
                                [data[2], mon, day])
                            break
        # poster
        r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U | re.S).search(html)
        if r:
            poster = r.group(1).replace("'", '').strip()
            if poster:
                res['thumb'] = 'http://kinopoisk.ru' + poster
        # cast
        r = re.compile(u'<h4>В главных ролях:</h4>(.+?)</ul>', re.U | re.S).search(html)
        if r:
            actors = []
            for r in re.compile(
                    '<li itemprop="actors"><a [^>]+>([^<]+)</a></li>',
                    re.U).findall(r.group(1)):
                r = self.html.string(r)
                if r and r != '...':
                    actors.append(r)
            if actors:
                res['info']['cast'] = actors[:]
        # The submenu tells us which extra pages (wall/stills/studio/video)
        # exist for this movie, so we only fetch what is there.
        menu = re.compile(
            '<ul id="newMenuSub" class="clearfix(.+?)<!\-\- /menu \-\->',
            re.U | re.S).search(html)
        if menu:
            menu = menu.group(1)
            # fanart from the wallpapers page
            if menu.find('/film/' + id + '/wall/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile(
                        '<a href="/picture/([0-9]+)/w_size/([0-9]+)/">',
                        re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2)))
                        # prefer the largest wallpaper not exceeding 1280px
                        fanart_best = [x for x in fanart if int(x[1]) <= 1280]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch(
                            'http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/w_size/' + fanart[-1][1] + '/',
                            headers=self.headers)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['fanart'] = r.group(1).strip()
                                # protocol-relative URL -> absolute
                                if res['fanart'].startswith('//'):
                                    res['fanart'] = 'http:' + res['fanart']
            # no wallpapers found — fall back to stills (landscape only)
            if not res['fanart'] and menu.find('/film/' + id + '/stills/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id +
                                           '/stills/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile(
                        '<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>',
                        re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1, t1), (
                            id2, size2, t2): cmp(int(size1), int(size2)))
                        # prefer the largest landscape still up to 1280px wide
                        fanart_best = [
                            x for x in fanart
                            if int(x[1]) <= 1280 and int(x[1]) > int(x[2])
                        ]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch(
                            'http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/',
                            headers=self.headers)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['fanart'] = r.group(1).strip()
                                # protocol-relative URL -> absolute
                                if res['fanart'].startswith('//'):
                                    res['fanart'] = 'http:' + res['fanart']
            # studios
            if menu.find('/film/' + id + '/studio/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U | re.S).search(html)
                    if r:
                        studio = []
                        for r in re.compile(
                                '<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>',
                                re.U).findall(r.group(1)):
                            r = self.html.string(r)
                            if r:
                                studio.append(r)
                        if studio:
                            res['info']['studio'] = u', '.join(studio)
            # trailers, bucketed so Russian trailers sort first
            trailers1 = []  # Russian trailers
            trailers2 = []  # other Russian clips
            trailers3 = []  # trailers
            trailers4 = []  # other clips
            if menu.find('/film/' + id + '/video/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/video/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    for row in re.compile(
                            u'<!-- ролик -->(.+?)<!-- /ролик -->',
                            re.U | re.S).findall(html):
                        # skip blocks without a playable link
                        if row.find(u'>СМОТРЕТЬ</a>') != -1:
                            # Russian clip? (flag icon present)
                            if row.find('class="flag flag2"') == -1:
                                is_ru = False
                            else:
                                is_ru = True
                            # clip name
                            r = re.compile(
                                '<a href="/film/' + id + '/video/[0-9]+/[^>]+ class="all">(.+?)</a>',
                                re.U).search(row)
                            if r:
                                name = self.html.string(r.group(1))
                                if name:
                                    trailer = {
                                        'name': name,
                                        'time': None,
                                        'trailer': False,
                                        'ru': is_ru,
                                        'video': []
                                    }
                                    # trailer or teaser (vs. other clip)?
                                    for token in (u'Трейлер', u'трейлер', u'Тизер', u'тизер'):
                                        if name.find(token) != -1:
                                            trailer['trailer'] = True
                                            break
                                    # clip duration
                                    r = re.compile(
                                        u'clock.gif"[^>]+></td>\s*<td style="color\: #777">[^0-9]*([0-9\:]+)</td>',
                                        re.U | re.S).search(row)
                                    if r:
                                        trailer['time'] = r.group(1).strip()
                                    # split variants by quality: 1-3 from the
                                    # icon, +3 when flagged HD
                                    for r in re.compile(
                                            'trailer/([1-3])a.gif"(.+?)link=([^"]+)" class="continue">.+?<td style="color\:#777">([^<]+)</td>\s*</tr>',
                                            re.U | re.S).findall(row):
                                        quality = int(r[0])
                                        if r[1].find('icon-hd') != -1:
                                            quality += 3
                                        trailer['video'].append(
                                            (quality, r[2].strip(), r[3]))
                                    if id == '462754':
                                        # leftover debug hook (originally raised); no effect
                                        pass
                                    if trailer['video']:
                                        if trailer['ru']:
                                            if trailer['trailer']:
                                                trailers1.append(trailer)
                                            else:
                                                trailers2.append(trailer)
                                        else:
                                            if trailer['trailer']:
                                                trailers3.append(trailer)
                                            else:
                                                trailers4.append(trailer)
            # concatenate the buckets in priority order
            res['trailers'].extend(trailers1)
            res['trailers'].extend(trailers2)
            res['trailers'].extend(trailers3)
            res['trailers'].extend(trailers4)
        timeout = True
        # Fresh film — cache for a limited time only, updates are likely.
        if 'year' not in res['info'] or int(
                res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        return timeout, res
class KinoPoisk: """ API: scraper - скрапер movie - профайл фильма search - поиск фильма best - поиск лучших фильмов person - поиск персон work - информация о работах персоны """ def __init__(self): self.cache = Cache('kinopoisk.db') self.html = Clear() self.http = HTTP() self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3', 'Cache-Control': 'no-cache', 'Referer': 'http://www.kinopoisk.ru/level/7/' } # API def scraper(self, name, year=None, trailer_quality=None): try: tag = 'scraper:' + urllib.quote_plus(name.encode('windows-1251')) except: return None else: if year: tag += ':' + str(year) id = self.cache.get(tag, self._scraper, name, year) if not id: return None return self.movie(id, trailer_quality) def movie(self, id, trailer_quality=None): id = str(id) if trailer_quality is None: trailer_quality = 6 movie = self.cache.get('movie:' + id, self._movie, id) if not movie: return None if movie['trailers']: # компилируем список с нужным нам качеством video = [] for m in movie['trailers']: url = [x for x in m['video'] if x[0] <= trailer_quality] if url: m['video'] = url[-1] video.append(m) movie['trailers'] = video if movie['trailers']: # готовим главный трейлер r = [x for x in movie['trailers'] if x['trailer']] if r: movie['info']['trailer'] = r[0]['video'][1] else: # если трейлер не найден, то отдаем что попало... 
movie['info']['trailer'] = movie['trailers'][0]['video'][1] return movie def search(self, name, trailer_quality=None): return self._search_movie(name) def best(self, **kwarg): page = kwarg.get('page', 1) limit = kwarg.get('limit', 50) url = 'http://www.kinopoisk.ru/top/navigator/m_act%5Bis_film%5D/on/m_act%5Bnum_vote%5D/' + str(kwarg.get('votes', 100)) + '/' if kwarg.get('dvd'): url += 'm_act%5Bis_dvd%5D/on/' if kwarg.get('decade'): url += 'm_act%5Bdecade%5D/' + str(kwarg['decade']) + '/' if kwarg.get('genre'): url += 'm_act%5Bgenre%5D/' + str(GENRE[kwarg['genre']]) + '/' if kwarg.get('country'): url += 'm_act%5Bcountry%5D/' + str(kwarg['country']) + '/' if kwarg.get('rate'): url += 'm_act%5Brating%5D/' + str(kwarg['rate']) + ':/' if kwarg.get('mpaa'): url += 'm_act%5Bmpaa%5D/' + str(kwarg['mpaa']) + '/' url += 'perpage/' + str(limit) + '/order/ex_rating/' if page > 1: url += 'page/' + str(page) + '/' response = self.http.fetch(url, headers=self.headers) if response.error: return None res = {'pages': (1, 0, 1, 0), 'data': []} r = re.compile('<div class="pagesFromTo(.+?)<div class="pagesFromTo', re.U|re.S).search(response.body.decode('windows-1251')) if r: body = r.group(1) # compile pagelist p = re.compile('>([0-9]+)—[0-9]+[^0-9]+?([0-9]+)', re.U).search(body) if p: page = (int(p.group(1))-1)/limit + 1 total = int(p.group(2)) pages = total/limit if limit*pages != total: pages += 1 res['pages'] = (pages, 0 if page == 1 else page-1, page, 0 if page==pages else page+1) # end compile for id in re.compile('<div id="tr_([0-9]+)"', re.U|re.S).findall(body): res['data'].append(int(id)) return res def person(self, name): response = self.http.fetch('http://www.kinopoisk.ru/s/type/people/list/1/find/' + urllib.quote_plus(name.encode('windows-1251')) + '/order/relevant/', headers=self.headers) if response.error: return None res = [] body = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U|re.S).search(response.body.decode('windows-1251')) if body: for 
block in re.compile('<p class="pic">(.+?)<div class="clear">', re.U|re.S).findall(body.group(1)): id, name, original, year, poster = None, None, None, None, None r = re.compile('<p class="name"><a href="http://www\.kinopoisk\.ru/level/4/people/([0-9]+)[^>]+>([^<]+)</a>', re.U|re.S).search(block) if r: id = r.group(1) name = r.group(2).strip() if id and name: r = re.compile('<span class="gray">([^<]+)</span>', re.U|re.S).search(block) if r: original = r.group(1).strip() if not original: original = None r = re.compile('<span class="year">([0-9]{4})</span>', re.U|re.S).search(block) if r: year = int(r.group(1)) if block.find('no-poster.gif') == -1: poster = 'http://st.kinopoisk.ru/images/actor/' + id + '.jpg' res.append({'id': int(id), 'name': name, 'originalname': original, 'year': year, 'poster': poster}) return {'pages': (1, 0, 1, 0), 'data': res} def work(self, id): response = self.http.fetch('http://www.kinopoisk.ru/name/' + str(id) + '/', headers=self.headers) if response.error: return None res = {} r = re.compile('id="sort_block">(.+?)<style>', re.U|re.S).search(response.body.decode('windows-1251')) if r: for block in r.group(1).split(u'<table cellspacing="0" cellpadding="0" border="0" width="100%">'): work = None for w in ('actor', 'director', 'writer', 'producer', 'producer_ussr', 'composer', 'operator', 'editor', 'design', 'voice', 'voice_director'): if block.find(u'id="' + w + u'"') != -1: work = 'producer' if w == 'producer_ussr' else w break if work: movies = [] for id, name in re.compile('<span class="name"><a href="/film/([0-9]+)/" >([^<]+?)</a>', re.U).findall(block): for tag in (u'(мини-сериал)', u'(сериал)'): if name.find(tag) != -1: break else: movies.append(int(id)) if movies: res.setdefault(work, []).extend(movies) return res def review(self, id, query): query_s = 'all' if query == 'stat' else query data = self.cache.get('review:' + str(id) + ':' + query_s, self._review, id, query_s) if not data: return data return data[query] def countries(self): 
return COUNTRIES def country(self, id, default=None): country = [x[1] for x in COUNTRIES if x[0] == id] return country[0] if country else default # PRIVATE def _search_movie(self, name, year=None): url = 'http://www.kinopoisk.ru/s/type/film/list/1/find/' + urllib.quote_plus(name.encode('windows-1251')) + '/order/relevant' if year: url += '/m_act%5Byear%5D/' + str(year) url += '/m_act%5Btype%5D/film/' response = self.http.fetch(url, headers=self.headers) if response.error: return None res = [] r = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U|re.S).search(response.body.decode('windows-1251')) if r: for id in re.compile('<p class="name"><a href="/level/1/film/([0-9]+)', re.U|re.S).findall(r.group(1)): res.append(int(id)) return {'pages': (1, 0, 1, 0), 'data': res} def _scraper(self, name, year): timeout = True # если фильм свежий, то кладем в кэш НЕ на долго (могут быть обновления на сайте) if year and year >= time.gmtime(time.time()).tm_year: timeout = 7*24*60*60 #week ids = self._search_movie(name, year) if ids is None: return False, None elif not ids['data']: # сохраняем пустой результат на 3-е суток return 259200, None else: return timeout, ids['data'][0] def _review(self, id, query): url = 'http://www.kinopoisk.ru/film/' + str(id) + '/ord/rating/' if query in ('good', 'bad', 'neutral'): url += 'status/' + query + '/' url += 'perpage/200/' response = self.http.fetch(url, headers=self.headers) if response.error: return False, None html = response.body.decode('windows-1251') res = { 'stat': {'all': 0, 'good': 0, 'bad': 0, 'neutral': 0}, query: [] } r = re.compile('<ul class="resp_type">(.+?)</ul>', re.U|re.S).search(html) if r: ul = r.group(1) for q, t in (('pos', 'good'), ('neg', 'bad'), ('neut', 'neutral')): r = re.compile('<li class="' + q + '"><a href="[^>]+>[^<]+</a><b>([0-9]+)</b></li>', re.U).search(ul) if r: res['stat'][t] = int(r.group(1)) res['stat']['all'] = res['stat']['good'] + res['stat']['bad'] + res['stat']['neutral'] r = 
re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U|re.S).search(html) if r: for block in r.group(1).split('itemprop="reviews"'): review = { 'nick': None, 'count': None, 'title': None, 'review': None, 'time': None } r = re.compile('itemprop="reviewBody">(.+?)</div>', re.U|re.S).search(block) if r: text = r.group(1) for tag1, tag2 in ((u'<=end=>', u'\n'), (u'<b>', u'[B]'), (u'</b>', u'[/B]'), (u'<i>', u'[I]'), (u'</i>', u'[/I]'), (u'<u>', u'[U]'), (u'</u>', u'[/U]')): text = text.replace(tag1, tag2) r = self.html.text(text) if r: review['review'] = r user = None r = re.compile('<p class="profile_name"><s></s><a href="[^>]+>([^<]+)</a></p>').search(block) if r: user = self.html.string(r.group(1)) else: r = re.compile('<p class="profile_name"><s></s>([^<]+)</p>').search(block) if r: user = self.html.string(r.group(1)) if user: review['nick'] = user r = re.compile('<p class="sub_title"[^>]+>([^<]+)</p>').search(block) if r: title = self.html.string(r.group(1)) if title: review['title'] = title r = re.compile('<span class="date">([^<]+)</span>', re.U|re.S).search(block) if r: review['time'] = r.group(1).replace(u' |', u',') r = re.compile(u'<a href="[^>]+>рецензии \(([0-9]+)\)</a>', re.U|re.S).search(block) if r: review['count'] = int(r.group(1)) if review['nick'] and review['review']: res[query].append(review) return 3600, res # one hour def _movie(self, id): response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/', headers=self.headers) if response.error: return False, None html = response.body.decode('windows-1251') res = { 'id': int(id), 'thumb': None, 'fanart': None, 'trailers': [], 'info': {} } # имя, оригинальное имя, девиз, цензура, год, top250 # runtime - длительность фильма (в отдельную переменную, иначе не видно размер файла) for tag, reg, t in ( ('title', '<title>(.+?)</title>', 'str'), ('originaltitle', 'itemprop="alternativeHeadline">([^<]*)</span>', 'str'), ('tagline', '<td style="color\: #555">«(.+?)»</td></tr>', 'str'), 
('mpaa', 'images/mpaa/([^\.]+).gif', 'str'), ('runtime', '<td class="time" id="runtime">[^<]+<span style="color\: #999">/</span>([^<]+)</td>', 'str'), ('year', '<a href="/lists/m_act%5Byear%5D/([0-9]+)/"', 'int'), ('top250', 'Топ250\: <a\shref="/level/20/#([0-9]+)', 'int') ): r = re.compile(reg, re.U).search(html) if r: value = r.group(1).strip() if value: res['info'][tag] = value if t == 'int': res['info'][tag] = int(res['info'][tag]) else: res['info'][tag] = self.html.string(res['info'][tag]) # режисеры, сценаристы, жанры for tag, reg in ( ('director', u'<td itemprop="director">(.+?)</td>'), ('writer', u'<td class="type">сценарий</td><td[^>]*>(.+?)</td>'), ('genre', u'<span itemprop="genre">(.+?)</span>') ): r = re.compile(reg, re.U|re.S).search(html) if r: r2 = [] for r in re.compile('<a href="[^"]+">([^<]+)</a>', re.U).findall(r.group(1)): r = self.html.string(r) if r and r != '...': r2.append(r) if r2: res['info'][tag] = u', '.join(r2) # актеры r = re.compile(u'<h4>В главных ролях:</h4>(.+?)</ul>', re.U|re.S).search(html) if r: actors = [] for r in re.compile('<li itemprop="actors"><a [^>]+>([^<]+)</a></li>', re.U).findall(r.group(1)): r = self.html.string(r) if r and r != '...': actors.append(r) if actors: res['info']['cast'] = actors[:] #res['info']['castandrole'] = actors[:] # описание фильма r = re.compile('<span class="_reachbanner_"><div class="brand_words" itemprop="description">(.+?)</div></span>', re.U).search(html) if r: plot = self.html.text(r.group(1).replace('<=end=>', '\n')) if plot: res['info']['plot'] = plot # IMDB r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html) if r: res['info']['rating'] = float(r.group(1).strip()) res['info']['votes'] = r.group(2).strip() # премьера r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U|re.S).search(html) if r: r = re.compile(u'data\-ical\-date="([^"]+)"', re.U|re.S).search(r.group(1)) if r: data = r.group(1).split(' ') if len(data) == 3: i = 0 for mon in (u'января', u'февраля', 
u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября', u'ноября', u'декабря'): i += 1 if mon == data[1]: mon = str(i) if len(mon) == 1: mon = '0' + mon day = data[0] if len(day) == 1: day = '0' + day res['info']['premiered'] = '-'.join([data[2], mon, day]) break # постер r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U|re.S).search(html) if r: poster = r.group(1).replace("'", '').strip() if poster: res['thumb'] = 'http://kinopoisk.ru' + poster menu = re.compile('<ul id="newMenuSub" class="clearfix(.+?)<!\-\- /menu \-\->', re.U|re.S).search(html) if menu: menu = menu.group(1) # фанарт if menu.find('/film/' + id + '/wall/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile('<a href="/picture/([0-9]+)/w_size/([0-9]+)/">', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2))) # пробуем взять максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280] if fanart_best: fanart = fanart_best response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/w_size/' + fanart[-1][1] + '/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') r = re.compile('id="image" src="([^"]+)"', re.U|re.S).search(html) if r: res['fanart'] = r.group(1).strip() # если нет фанарта (обоев), то пробуем получить кадры if not res['fanart'] and menu.find('/film/' + id + '/stills/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/stills/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile('<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1, t1), (id2, size2, t2): cmp(int(size1), int(size2))) # пробуем взять 
максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280 and int(x[1]) > int(x[2])] if fanart_best: fanart = fanart_best response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') r = re.compile('id="image" src="([^"]+)"', re.U|re.S).search(html) if r: res['fanart'] = r.group(1).strip() # студии if menu.find('/film/' + id + '/studio/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U|re.S).search(html) if r: studio = [] for r in re.compile('<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>', re.U).findall(r.group(1)): r = self.html.string(r) if r: studio.append(r) if studio: res['info']['studio'] = u', '.join(studio) # трэйлеры trailers1 = [] # русские трейлеры trailers2 = [] # другие русские видео trailers3 = [] # трейлеры trailers4 = [] # другие видео if menu.find('/film/' + id + '/video/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/video/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') for row in re.compile(u'<!-- ролик -->(.+?)<!-- /ролик -->', re.U|re.S).findall(html): # отсекаем лишние блоки if row.find(u'>СМОТРЕТЬ</a>') != -1: # русский ролик? if row.find('class="flag flag2"') == -1: is_ru = False else: is_ru = True # получаем имя трейлера r = re.compile('<a href="/film/' + id + '/video/[0-9]+/[^>]+ class="all">(.+?)</a>', re.U).search(row) if r: name = self.html.string(r.group(1)) if name: trailer = { 'name': name, 'time': None, 'trailer': False, 'ru': is_ru, 'video': [] } # трейлер или тизер? 
for token in (u'Трейлер', u'трейлер', u'Тизер', u'тизер'): if name.find(token) != -1: trailer['trailer'] = True break # получаем время трейлера r = re.compile(u'clock.gif"[^>]+></td>\s*<td style="color\: #777">[^0-9]*([0-9\:]+)</td>', re.U|re.S).search(row) if r: trailer['time'] = r.group(1).strip() print 'F**K' # делим ролики по качеству for r in re.compile('trailer/([1-3])a.gif"(.+?)link=([^"]+)" class="continue">.+?<td style="color\:#777">([^<]+)</td>\s*</tr>', re.U|re.S).findall(row): print str(r) quality = int(r[0]) if r[1].find('icon-hd') != -1: quality += 3 trailer['video'].append((quality, r[2].strip(), r[3])) print str(trailer) if id == '462754': #raise pass if trailer['video']: if trailer['ru']: if trailer['trailer']: trailers1.append(trailer) else: trailers2.append(trailer) else: if trailer['trailer']: trailers3.append(trailer) else: trailers4.append(trailer) # склеиваем трейлеры res['trailers'].extend(trailers1) res['trailers'].extend(trailers2) res['trailers'].extend(trailers3) res['trailers'].extend(trailers4) timeout = True # если фильм свежий, то кладем в кэш НЕ на долго (могут быть обновления на сайте) if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year: timeout = 7*24*60*60 #week return timeout, res
class TvDb:
    """Client for the (legacy XML) TheTVDB.com API.

    API:
        scraper - cached lookup: series name (+year) -> full profile
        search  - search series ids by name
        movie   - series profile by id (cached)
    """

    def __init__(self):
        self.api_key = '1D62F2F90030C444'
        self.cache = Cache('tvdb.db')
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.thetvdb.com/'
        }

    # API
    def scraper(self, name, year=None):
        """Resolve name (and optional year) to a profile dict; None on failure."""
        try:
            tag = 'scraper:' + urllib.quote_plus(name.encode('utf8'))
        except:
            # name not encodable/quotable - nothing we can cache or query
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, name, year)
            if not id:
                return None
            return self.movie(id)

    def search(self, name):
        """Search series by name; see _search for the return shape."""
        return self._search(name)

    def movie(self, id):
        """Return the cached profile for a series id."""
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def _movie(self, id):
        """Download and parse the Russian-language zip profile for a series.

        Returns (cache_timeout, profile_dict), or (False, None) on any
        download/unpack/parse failure (False tells the cache not to store).
        """
        dirname = tempfile.mkdtemp()
        response = self.http.fetch(
            'http://www.thetvdb.com/api/' + self.api_key + '/series/' + id + '/all/ru.zip',
            headers=self.headers, download=os.path.join(dirname, 'movie.zip'))
        if response.error:
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            try:
                filezip.extractall(dirname)
            finally:
                # close the archive even if extraction fails (was leaked before)
                filezip.close()
            # open() instead of the Python-2-only file(); close explicitly (was leaked)
            fd = open(os.path.join(dirname, 'ru.xml'), 'rb')
            try:
                movie = fd.read().decode('utf8')
            finally:
                fd.close()
        except Exception:
            # best-effort: a bad/partial zip is reported as "no data", not a crash
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        body = re.compile(r'<Series>(.+?)</Series>', re.U | re.S).search(movie)
        if not body:
            return False, None
        body = body.group(1)
        res = {'id': int(id), 'thumb': None, 'fanart': None, 'trailers': [], 'info': {}}
        # directors and writers: '|'-separated inside each tag, de-duplicated
        for tag in ('Director', 'Writer'):
            people = {}
            for raw in re.compile(r'<' + tag + r'>([^<]+)</' + tag + r'>', re.U | re.S).findall(movie):
                for person in raw.split('|'):
                    person = person.strip()
                    if person:
                        people[person] = 1
            if people:
                res['info'][tag.lower()] = u', '.join([x for x in people.keys() if x])
        # scalar/list fields mapped from TVDB tags to Kodi info labels
        for tag, retag, typeof in (
                ('plot', 'Overview', None),
                ('mpaa', 'ContentRating', None),
                ('premiered', 'FirstAired', None),
                ('studio', 'Network', None),
                ('title', 'SeriesName', None),
                ('runtime', 'Runtime', None),
                ('votes', 'RatingCount', None),
                ('rating', 'Rating', float),
                ('genre', 'Genre', list),
                ('cast', 'Actors', list)
        ):
            r = re.compile(r'<' + retag + r'>([^<]+)</' + retag + r'>', re.U | re.S).search(body)
            if r:
                r = r.group(1).strip()
                if typeof == float:
                    res['info'][tag] = float(r)
                elif typeof == list:
                    res['info'][tag] = [x for x in [x.strip() for x in r.split(u'|')] if x]
                    if tag == 'genre':
                        res['info'][tag] = u', '.join(res['info'][tag])
                else:
                    res['info'][tag] = r
        # year is derived from the premiere date (YYYY-MM-DD)
        if 'premiered' in res['info']:
            res['info']['year'] = int(res['info']['premiered'].split('-')[0])
        # poster
        r = re.compile(r'<poster>([^<]+)</poster>', re.U | re.S).search(body)
        if r:
            res['thumb'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        # fanart
        r = re.compile(r'<fanart>([^<]+)</fanart>', re.U | re.S).search(body)
        if r:
            res['fanart'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        timeout = True
        # fresh shows are cached only for a week (site data may still change)
        if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        return timeout, res

    def _movie_clear(self, dirname):
        """Remove the temp download dir and its contents; errors propagate.

        (The previous `try: ... except: raise` wrappers were no-ops and
        are removed - behavior is identical.)
        """
        for filename in os.listdir(dirname):
            os.unlink(os.path.join(dirname, filename))
        os.rmdir(dirname)

    def _search(self, name):
        """Search series ids by name (Russian-language entries only).

        Returns {'pages': (1, 0, 1, 0), 'data': [int ids...]} or None on
        HTTP error. The 'pages' tuple mimics the project's paging contract.
        """
        response = self.http.fetch(
            'http://www.thetvdb.com/api/GetSeries.php?language=ru&seriesname='
            + urllib.quote_plus(name.encode('utf8')),
            headers=self.headers)
        if response.error:
            return None
        res = []
        rows = re.compile('<Series>(.+?)</Series>', re.U | re.S).findall(response.body.decode('utf8'))
        if rows:
            recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
            for row in [x for x in rows if x.find(u'<language>ru</language>') != -1]:
                r = recmd.search(row)
                if r:
                    res.append(int(r.group(1)))
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        """Cache callback: map name/year to the first matching series id.

        Returns (cache_timeout, id) or (False, None) on search failure.
        """
        timeout = True
        # recent shows get a short cache timeout (site data may change)
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        ids = self._search(name)
        if ids is None:
            return False, None
        elif not ids['data']:
            # cache the empty result for three days
            return 259200, None
        else:
            return timeout, ids['data'][0]
class RuTrackerHTTP:
    """HTTP client for rutracker.org with transparent forum authorization.

    Return convention used throughout: page body (unicode) on success,
    0 when the forum reports itself temporarily disabled, None on error.
    """

    def __init__(self):
        self.setting = Setting()
        # the "send password" link appears only on pages served to
        # unauthorized visitors, so matching it means "we are logged out"
        self.re_auth = re.compile(r'"profile\.php\?mode=sendpassword"')
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://rutracker.org/forum/index.php'
        }

    def guest(self, url):
        """Fetch url without the auth cookie jar.

        Returns decoded body, 0 if the forum is disabled, None on HTTP error.
        """
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        else:
            body = response.body.decode('windows-1251')
            if body.find(u'>форум временно отключен</p>') != -1:
                return 0
            return body

    def get(self, url):
        """Authorized GET; see _fetch for return values."""
        return self._fetch('GET', url)

    def post(self, url, params):
        """Authorized POST; see _fetch for return values."""
        return self._fetch('POST', url, params)

    def download(self, id):
        """Download the .torrent file for topic id; bytes, 0 or None."""
        id = str(id)
        # verify authorization (and warm up the session) by loading the topic
        html = self.get('http://rutracker.org/forum/viewtopic.php?t=' + id)
        if not html:
            return html
        # cookie hack: inject a bb_dl cookie carrying the topic id -
        # presumably required by dl.php to serve the torrent (site quirk)
        cookies = cookielib.MozillaCookieJar()
        cookies.load(self.http.request.cookies)
        cookies.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
        cookies.save(self.http.request.cookies, ignore_discard=True, ignore_expires=True)
        # pull the torrent itself
        response = self.http.fetch('http://dl.rutracker.org/forum/dl.php?t=' + id, cookies='rutracker.moz', headers=self.headers, method='POST')
        if response.error:
            return None
        else:
            return response.body

    def _fetch(self, method, url, params=None):
        """Fetch with the auth cookie jar, re-logging-in as needed.

        Loops: if the response looks unauthorized, runs _auth() and retries;
        stops (propagating _auth's falsy result) when login fails.
        """
        while True:
            response = self.http.fetch(url, cookies='rutracker.moz', headers=self.headers, method=method, params=params)
            if response.error:
                return None
            else:
                body = response.body.decode('windows-1251')
                if body.find(u'>форум временно отключен</p>') != -1:
                    return 0
                if not self.re_auth.search(body):
                    return body
                else:
                    xbmc.log('RUTRACKER: Request auth', xbmc.LOGDEBUG)
                    auth = self._auth()
                    if not auth:
                        return auth

    def _auth(self):
        """Log in with stored credentials, prompting via settings on failure.

        Returns True on success, False when the user gives up / HTTP error,
        0 when the forum is disabled.
        """
        while True:
            login = self.setting['rutracker_login']
            password = self.setting['rutracker_password']
            if not login or not password:
                # no stored credentials - ask the user via the settings dialog
                login, password = self._setting(login, password)
                if not login:
                    return False
            response = self.http.fetch('http://login.rutracker.org/forum/login.php', cookies='rutracker.moz', headers=self.headers, method='POST', params={'login_username': login, 'login_password': password, 'login': r'Вход'})
            if response.error:
                return False
            else:
                body = response.body.decode('windows-1251')
                if body.find(u'>форум временно отключен</p>') != -1:
                    return 0
                if not self.re_auth.search(body):
                    return True
                else:
                    # login rejected - re-prompt; unchanged credentials mean "give up"
                    login, password = self._setting(login, password)
                    if not login:
                        return False

    def _setting(self, login, password):
        """Open the settings dialog and return the (possibly new) credentials.

        Returns (None, None) when the user left them unchanged - the caller
        treats that as a refusal to retry.
        """
        self.setting.dialog()
        login2 = self.setting['rutracker_login']
        password2 = self.setting['rutracker_password']
        if login == login2 and password == password2:
            return None, None
        else:
            return login2, password2
class TvDb:
    """Client for the (legacy XML) TheTVDB.com API.

    API:
        scraper - cached lookup: series name (+year) -> full profile
        search  - search series ids by name
        movie   - series profile by id (cached)
    """

    def __init__(self):
        self.api_key = '1D62F2F90030C444'
        self.cache = Cache('tvdb.db')
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.thetvdb.com/'
        }

    # API
    def scraper(self, name, year=None):
        """Resolve name (and optional year) to a profile dict; None on failure."""
        try:
            tag = 'scraper:' + urllib.quote_plus(name.encode('utf8'))
        except:
            # name not encodable/quotable - nothing we can cache or query
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, name, year)
            if not id:
                return None
            return self.movie(id)

    def search(self, name):
        """Search series by name; see _search for the return shape."""
        return self._search(name)

    def movie(self, id):
        """Return the cached profile for a series id."""
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def _movie(self, id):
        """Download and parse the Russian-language zip profile for a series.

        Returns (cache_timeout, profile_dict), or (False, None) on any
        download/unpack/parse failure (False tells the cache not to store).
        """
        dirname = tempfile.mkdtemp()
        response = self.http.fetch(
            'http://www.thetvdb.com/api/' + self.api_key + '/series/' + id + '/all/ru.zip',
            headers=self.headers, download=os.path.join(dirname, 'movie.zip'))
        if response.error:
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            try:
                filezip.extractall(dirname)
            finally:
                # close the archive even if extraction fails (was leaked before)
                filezip.close()
            # open() instead of the Python-2-only file(); close explicitly (was leaked)
            fd = open(os.path.join(dirname, 'ru.xml'), 'rb')
            try:
                movie = fd.read().decode('utf8')
            finally:
                fd.close()
        except Exception:
            # best-effort: a bad/partial zip is reported as "no data", not a crash
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        body = re.compile(r'<Series>(.+?)</Series>', re.U | re.S).search(movie)
        if not body:
            return False, None
        body = body.group(1)
        res = {'id': int(id), 'thumb': None, 'fanart': None, 'trailers': [], 'info': {}}
        # directors and writers: '|'-separated inside each tag, de-duplicated
        for tag in ('Director', 'Writer'):
            people = {}
            for raw in re.compile(r'<' + tag + r'>([^<]+)</' + tag + r'>', re.U | re.S).findall(movie):
                for person in raw.split('|'):
                    person = person.strip()
                    if person:
                        people[person] = 1
            if people:
                res['info'][tag.lower()] = u', '.join([x for x in people.keys() if x])
        # scalar/list fields mapped from TVDB tags to Kodi info labels
        for tag, retag, typeof in (
                ('plot', 'Overview', None),
                ('mpaa', 'ContentRating', None),
                ('premiered', 'FirstAired', None),
                ('studio', 'Network', None),
                ('title', 'SeriesName', None),
                ('runtime', 'Runtime', None),
                ('votes', 'RatingCount', None),
                ('rating', 'Rating', float),
                ('genre', 'Genre', list),
                ('cast', 'Actors', list)
        ):
            r = re.compile(r'<' + retag + r'>([^<]+)</' + retag + r'>', re.U | re.S).search(body)
            if r:
                r = r.group(1).strip()
                if typeof == float:
                    res['info'][tag] = float(r)
                elif typeof == list:
                    res['info'][tag] = [x for x in [x.strip() for x in r.split(u'|')] if x]
                    if tag == 'genre':
                        res['info'][tag] = u', '.join(res['info'][tag])
                else:
                    res['info'][tag] = r
        # year is derived from the premiere date (YYYY-MM-DD)
        if 'premiered' in res['info']:
            res['info']['year'] = int(res['info']['premiered'].split('-')[0])
        # poster
        r = re.compile(r'<poster>([^<]+)</poster>', re.U | re.S).search(body)
        if r:
            res['thumb'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        # fanart
        r = re.compile(r'<fanart>([^<]+)</fanart>', re.U | re.S).search(body)
        if r:
            res['fanart'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        timeout = True
        # fresh shows are cached only for a week (site data may still change)
        if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        return timeout, res

    def _movie_clear(self, dirname):
        """Remove the temp download dir and its contents; errors propagate.

        (The previous `try: ... except: raise` wrappers were no-ops and
        are removed - behavior is identical.)
        """
        for filename in os.listdir(dirname):
            os.unlink(os.path.join(dirname, filename))
        os.rmdir(dirname)

    def _search(self, name):
        """Search series ids by name (Russian-language entries only).

        Returns {'pages': (1, 0, 1, 0), 'data': [int ids...]} or None on
        HTTP error. The 'pages' tuple mimics the project's paging contract.
        """
        response = self.http.fetch(
            'http://www.thetvdb.com/api/GetSeries.php?language=ru&seriesname='
            + urllib.quote_plus(name.encode('utf8')),
            headers=self.headers)
        if response.error:
            return None
        res = []
        rows = re.compile('<Series>(.+?)</Series>', re.U | re.S).findall(response.body.decode('utf8'))
        if rows:
            recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
            for row in [x for x in rows if x.find(u'<language>ru</language>') != -1]:
                r = recmd.search(row)
                if r:
                    res.append(int(r.group(1)))
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        """Cache callback: map name/year to the first matching series id.

        Returns (cache_timeout, id) or (False, None) on search failure.
        """
        timeout = True
        # recent shows get a short cache timeout (site data may change)
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        ids = self._search(name)
        if ids is None:
            return False, None
        elif not ids['data']:
            # cache the empty result for three days
            return 259200, None
        else:
            return timeout, ids['data'][0]
class RuTrackerHTTP:
    """HTTP client for the rutracker.lib mirror with forum auth + captcha.

    Return convention used throughout: page body (unicode) on success,
    0 when the forum reports itself temporarily disabled, None on error.
    """

    def __init__(self):
        self.setting = Setting()
        # the "send password" link appears only on pages served to
        # unauthorized visitors, so matching it means "we are logged out"
        self.re_auth = re.compile(r'profile\.php\?mode=sendpassword"')
        # captcha image, hidden session id and dynamic code-field name
        self.re_captcha = re.compile(r'<img src="(\/\/[^\/]+/captcha/[^"]+)"')
        self.re_captcha_sid = re.compile(
            r'<input type="hidden" name="cap_sid" value="([^"]+)">')
        self.re_captcha_code = re.compile(
            r'<input type="text" name="(cap_code_[^"]+)"')
        # captcha state carried between login attempts inside _auth()
        self.captcha_sid = None
        self.captcha_code = None
        self.captcha_code_value = None
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://rutracker.lib/forum/index.php'
        }

    def guest(self, url):
        """Fetch url without the auth cookie jar.

        Returns decoded body, 0 if the forum is disabled, None on HTTP error.
        """
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        else:
            body = response.body.decode('windows-1251')
            if body.find(u'>форум временно отключен</p>') != -1:
                return 0
            return body

    def get(self, url):
        """Authorized GET; see _fetch for return values."""
        return self._fetch('GET', url)

    def post(self, url, params):
        """Authorized POST; see _fetch for return values."""
        return self._fetch('POST', url, params)

    def download(self, id):
        """Download the .torrent file for topic id; bytes, 0 or None."""
        id = str(id)
        # verify authorization (and warm up the session) by loading the topic
        html = self.get('http://rutracker.lib/forum/viewtopic.php?t=' + id)
        if not html:
            return html
        # cookie hack: inject a bb_dl cookie carrying the topic id -
        # presumably required by dl.php to serve the torrent (site quirk)
        cookies = cookielib.MozillaCookieJar()
        cookies.load(self.http.request.cookies)
        cookies.set_cookie(
            cookielib.Cookie(version=0, name='bb_dl', value=id, port=None, port_specified=False, domain='.rutracker.lib', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
        cookies.save(self.http.request.cookies, ignore_discard=True, ignore_expires=True)
        # pull the torrent itself
        response = self.http.fetch('http://rutracker.lib/forum/dl.php?t=' + id, cookies='rutracker.moz', headers=self.headers, method='POST')
        if response.error:
            return None
        else:
            return response.body

    def _fetch(self, method, url, params=None):
        """Fetch with the auth cookie jar, re-logging-in as needed.

        Loops: if the response looks unauthorized, runs _auth() and retries;
        stops (propagating _auth's falsy result) when login fails.
        """
        while True:
            response = self.http.fetch(url, cookies='rutracker.moz', headers=self.headers, method=method, params=params)
            if response.error:
                return None
            else:
                body = response.body.decode('windows-1251')
                if body.find(u'>форум временно отключен</p>') != -1:
                    return 0
                if not self.re_auth.search(body):
                    return body
            xbmc.log('RUTRACKER: Request auth', xbmc.LOGDEBUG)
            auth = self._auth()
            if not auth:
                return auth

    def _auth(self):
        """Log in, handling captcha challenges and credential prompts.

        Returns True on success, 0 when the forum is disabled, None when the
        user cancels / captcha or HTTP fails. Mutates the captcha_* fields
        between loop iterations so the retry POST includes the solved code.
        """
        # start every auth session with a clean captcha state
        self.captcha_sid, self.captcha_code, self.captcha_code_value = None, None, None
        while True:
            login = self.setting['rutracker_login']
            password = self.setting['rutracker_password']
            if not login or not password:
                # no stored credentials - ask the user via the settings dialog
                self.setting.dialog()
                login = self.setting['rutracker_login']
                password = self.setting['rutracker_password']
                if not login or not password:
                    return None
            params = {
                'login_username': login,
                'login_password': password,
                'login': r'вход'
            }
            if self.captcha_sid:
                # retry attempt: attach the solved captcha from the last pass
                params['login'] = r'Вход'
                params['cap_sid'] = self.captcha_sid
                params[self.captcha_code] = self.captcha_code_value
            response = self.http.fetch('http://rutracker.lib/forum/login.php', cookies='rutracker.moz', headers=self.headers, method='POST', params=params)
            # captcha tokens are single-use; clear them after every POST
            self.captcha_sid, self.captcha_code, self.captcha_code_value = None, None, None
            if response.error:
                return None
            body = response.body.decode('windows-1251')
            if body.find(u'>форум временно отключен</p>') != -1:
                return 0
            if not self.re_auth.search(body):
                return True
            # login rejected - check whether the page demands a captcha
            r = self.re_captcha.search(body)
            if r:
                r_sid = self.re_captcha_sid.search(body)
                if not r_sid:
                    return None
                self.captcha_sid = r_sid.group(1)
                r_code = self.re_captcha_code.search(body)
                if not r_code:
                    return None
                self.captcha_code = r_code.group(1)
                # show the image and ask the user to type the code
                self.captcha_code_value = self._captcha('http:' + r.group(1))
                if not self.captcha_code_value:
                    return None
            # re-prompt for login
            k = xbmc.Keyboard('', 'Enter login')
            k.doModal()
            if k.isConfirmed():
                login = k.getText()
            else:
                return None
            # re-prompt for password (hidden input)
            k = xbmc.Keyboard('', 'Enter password', True)
            k.doModal()
            if k.isConfirmed():
                password = k.getText()
            else:
                return None
            if not login or not password:
                return None
            # persist the new credentials, then loop and retry the login
            self.setting['rutracker_login'] = login
            self.setting['rutracker_password'] = password

    def _captcha(self, captcha):
        """Download the captcha image, display it and return the typed code.

        Returns the code string or None on error/cancel/empty input.
        """
        response = self.http.fetch(captcha, headers=self.headers, method='GET')
        if response.error:
            return
        import tempfile
        filename = tempfile.gettempdir() + '/captcha'
        file(filename, 'wb').write(response.body)
        win = xbmcgui.Window(xbmcgui.getCurrentWindowId())
        # width = 120px, height = 72px
        image = xbmcgui.ControlImage(win.getWidth() / 2 - int(120 / 2), 20, 120, 72, filename)
        win.addControl(image)
        k = xbmc.Keyboard('', 'Enter captcha code')
        k.doModal()
        code = k.getText() if k.isConfirmed() else None
        win.removeControl(image)
        return code if code else None
class RuTrackerHTTP:
    """HTTP client for the rutracker.nl mirror with forum auth + captcha.

    Return convention used throughout: page body (unicode) on success,
    0 when the forum reports itself temporarily disabled, None on error.
    """

    def __init__(self):
        self.setting = Setting()
        # the "send password" link appears only on pages served to
        # unauthorized visitors, so matching it means "we are logged out"
        self.re_auth = re.compile(r'profile\.php\?mode=sendpassword"')
        # captcha image, hidden session id and dynamic code-field name
        self.re_captcha = re.compile(r'<img src="(\/\/[^\/]+/captcha/[^"]+)"')
        self.re_captcha_sid = re.compile(r'<input type="hidden" name="cap_sid" value="([^"]+)">')
        self.re_captcha_code = re.compile(r'<input type="text" name="(cap_code_[^"]+)"')
        # captcha state carried between login attempts inside _auth()
        self.captcha_sid = None
        self.captcha_code = None
        self.captcha_code_value = None
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://rutracker.nl/forum/index.php'
        }

    def guest(self, url):
        """Fetch url without the auth cookie jar.

        Returns decoded body, 0 if the forum is disabled, None on HTTP error.
        """
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        else:
            body = response.body.decode('windows-1251')
            if body.find(u'>форум временно отключен</p>') != -1:
                return 0
            return body

    def get(self, url):
        """Authorized GET; see _fetch for return values."""
        return self._fetch('GET', url)

    def post(self, url, params):
        """Authorized POST; see _fetch for return values."""
        return self._fetch('POST', url, params)

    def download(self, id):
        """Download the .torrent file for topic id; bytes, 0 or None."""
        id = str(id)
        # verify authorization (and warm up the session) by loading the topic
        html = self.get('http://rutracker.nl/forum/viewtopic.php?t=' + id)
        if not html:
            return html
        # cookie hack: inject a bb_dl cookie carrying the topic id -
        # presumably required by dl.php to serve the torrent (site quirk)
        cookies = cookielib.MozillaCookieJar()
        cookies.load(self.http.request.cookies)
        cookies.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=id, port=None, port_specified=False, domain='.rutracker.nl', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
        cookies.save(self.http.request.cookies, ignore_discard=True, ignore_expires=True)
        # pull the torrent itself
        response = self.http.fetch('http://rutracker.nl/forum/dl.php?t=' + id, cookies='rutracker.moz', headers=self.headers, method='POST')
        if response.error:
            return None
        else:
            return response.body

    def _fetch(self, method, url, params=None):
        """Fetch with the auth cookie jar, re-logging-in as needed.

        Loops: if the response looks unauthorized, runs _auth() and retries;
        stops (propagating _auth's falsy result) when login fails.
        """
        while True:
            response = self.http.fetch(url, cookies='rutracker.moz', headers=self.headers, method=method, params=params)
            if response.error:
                return None
            else:
                body = response.body.decode('windows-1251')
                if body.find(u'>форум временно отключен</p>') != -1:
                    return 0
                if not self.re_auth.search(body):
                    return body
            xbmc.log('RUTRACKER: Request auth', xbmc.LOGDEBUG)
            auth = self._auth()
            if not auth:
                return auth

    def _auth(self):
        """Log in, handling captcha challenges and credential prompts.

        Returns True on success, 0 when the forum is disabled, None when the
        user cancels / captcha or HTTP fails. Mutates the captcha_* fields
        between loop iterations so the retry POST includes the solved code.
        """
        # start every auth session with a clean captcha state
        self.captcha_sid, self.captcha_code, self.captcha_code_value = None, None, None
        while True:
            login = self.setting['rutracker_login']
            password = self.setting['rutracker_password']
            if not login or not password:
                # no stored credentials - ask the user via the settings dialog
                self.setting.dialog()
                login = self.setting['rutracker_login']
                password = self.setting['rutracker_password']
                if not login or not password:
                    return None
            params = {'login_username': login, 'login_password': password, 'login': r'вход'}
            if self.captcha_sid:
                # retry attempt: attach the solved captcha from the last pass
                params['login'] = r'Вход'
                params['cap_sid'] = self.captcha_sid
                params[self.captcha_code] = self.captcha_code_value
            response = self.http.fetch('http://rutracker.nl/forum/login.php', cookies='rutracker.moz', headers=self.headers, method='POST', params=params)
            # captcha tokens are single-use; clear them after every POST
            self.captcha_sid, self.captcha_code, self.captcha_code_value = None, None, None
            if response.error:
                return None
            body = response.body.decode('windows-1251')
            if body.find(u'>форум временно отключен</p>') != -1:
                return 0
            if not self.re_auth.search(body):
                return True
            # login rejected - check whether the page demands a captcha
            r = self.re_captcha.search(body)
            if r:
                r_sid = self.re_captcha_sid.search(body)
                if not r_sid:
                    return None
                self.captcha_sid = r_sid.group(1)
                r_code = self.re_captcha_code.search(body)
                if not r_code:
                    return None
                self.captcha_code = r_code.group(1)
                # show the image and ask the user to type the code
                self.captcha_code_value = self._captcha('http:' + r.group(1))
                if not self.captcha_code_value:
                    return None
            # re-prompt for login
            k = xbmc.Keyboard('', 'Enter login')
            k.doModal()
            if k.isConfirmed():
                login = k.getText()
            else:
                return None
            # re-prompt for password (hidden input)
            k = xbmc.Keyboard('', 'Enter password', True)
            k.doModal()
            if k.isConfirmed():
                password = k.getText()
            else:
                return None
            if not login or not password:
                return None
            # persist the new credentials, then loop and retry the login
            self.setting['rutracker_login'] = login
            self.setting['rutracker_password'] = password

    def _captcha(self, captcha):
        """Download the captcha image, display it and return the typed code.

        Returns the code string or None on error/cancel/empty input.
        """
        response = self.http.fetch(captcha, headers=self.headers, method='GET')
        if response.error:
            return
        import tempfile
        filename = tempfile.gettempdir() + '/captcha'
        file(filename, 'wb').write(response.body)
        win = xbmcgui.Window(xbmcgui.getCurrentWindowId())
        # width = 120px, height = 72px
        image = xbmcgui.ControlImage(win.getWidth()/2 - int(120/2), 20, 120, 72, filename)
        win.addControl(image)
        k = xbmc.Keyboard('', 'Enter captcha code')
        k.doModal()
        code = k.getText() if k.isConfirmed() else None
        win.removeControl(image)
        return code if code else None