class ResultTest(unittest.TestCase):
    """Exercise Result._validate_title() with include/exclude filters,
    passed either as compiled patterns or as plain regex strings.
    """

    def setUp(self):
        self.result = Result()
        self.result.title = 'test title'

    # --- include filter ---

    def test_not_validate_result_include_regex(self):
        pattern = re.compile(r'\bother\b', re.I)
        self.assertFalse(self.result._validate_title(include=pattern))

    def test_not_validate_result_include_string(self):
        pattern = r'\bother\b'
        self.assertFalse(self.result._validate_title(include=pattern))

    def test_validate_result_include_regex(self):
        pattern = re.compile(r'\btest\b', re.I)
        self.assertTrue(self.result._validate_title(include=pattern))

    def test_validate_result_include_string(self):
        pattern = r'\btest\b'
        self.assertTrue(self.result._validate_title(include=pattern))

    # --- exclude filter ---

    def test_not_validate_result_exclude_regex(self):
        pattern = re.compile(r'\btest\b', re.I)
        self.assertFalse(self.result._validate_title(exclude=pattern))

    def test_not_validate_result_exclude_string(self):
        pattern = r'\btest\b'
        self.assertFalse(self.result._validate_title(exclude=pattern))

    def test_validate_result_exclude_regex(self):
        pattern = re.compile(r'\bother\b', re.I)
        self.assertTrue(self.result._validate_title(exclude=pattern))

    def test_validate_result_exclude_string(self):
        pattern = r'\bother\b'
        self.assertTrue(self.result._validate_title(exclude=pattern))
def results(self, query, sort='date', pages_max=1, **kwargs):
    """Yield filestube search results for `query` as Result objects.

    :param query: search string sent to the remote API
    :param sort: sort key, translated through SORT_DEF before sending
    :param pages_max: maximum number of result pages to fetch
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: when the results count cannot be parsed
    """
    sort = SORT_DEF[sort]
    for page in range(1, pages_max + 1):
        # Fetch one page of XML and parse it.
        data = self._send(query, page, sort)
        tree = etree.fromstring(data)
        try:
            results = int(tree.xpath('hasResults')[0].text)
        except (ValueError, IndexError):
            raise SearchError('failed to get results count from "%s"' % data)
        if not results:
            # No results at all: stop the whole generator.
            return
        hits = int(tree.xpath('results/hitsForThisPage')[0].text)
        if not hits:
            # Empty page: nothing more to paginate through.
            return
        for res in tree.xpath('results/hits'):
            # Each hit must provide url, size and date; skip (and log)
            # any entry missing one of them.
            url = res.xpath('link')[0].text
            if not url:
                logger.error('failed to get url from %s', data)
                continue
            size = res.xpath('size')[0].text
            if not size:
                logger.error('failed to get size from %s', data)
                continue
            date = res.xpath('added')[0].text
            if not date:
                logger.error('failed to get date from %s', data)
                continue
            result = Result()
            result.auto = False
            result.type = 'filestube'
            result.title = clean(res.xpath('name')[0].text)
            result.url = url
            result.size = get_size(size)
            result.date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            if not result.validate(**kwargs):
                continue
            yield result
def results(self, query, category=None, sort='date', pages_max=1, **kwargs):
    """Yield torrent Results scraped from the '#torrents li' listing.

    :param query: search string, or a full URL to open directly
    :param category: optional category name looked up in CAT_DEF
    :param sort: sort key passed to self._sort()
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure or site overload
    """
    if not self.url:
        raise SearchError('no data')
    for page in range(1, pages_max + 1):
        if page > 1:
            # Follow pagination; stop when there is no next page.
            if not self._next(page):
                break
        else:
            if is_url(query):
                if not self.browser.open(query):
                    raise SearchError('no data')
            else:
                fields = {'q': query}
                if category:
                    val = CAT_DEF.get(category.lower())
                    if val:
                        fields['t'] = [val]
                if not self.browser.submit_form(self.url, fields=fields):
                    raise SearchError('no data')
            self._sort(sort)
        lis = self.browser.cssselect('#torrents li')
        if not lis:
            # None means the page itself failed; an overload banner is
            # reported separately so callers can back off.
            if lis is None:
                raise SearchError('no data')
            elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                raise SearchError('overload')
        for el in lis:
            # Truncated HTML snippet used only for log messages.
            log = html.tostring(el, pretty_print=True)[:1000]
            result = Result()
            result.type = 'torrent'
            result.safe = False
            links = el.cssselect('a')
            if not links:
                logger.error('failed to get title from %s', log)
                continue
            result.title = clean(html.tostring(links[0]))
            details = el.cssselect('.torInfo')
            if not details:
                logger.error('failed to get details from %s', log)
                continue
            # RE_DETAILS groups: 1 = category, 3 = date string
            # (group 2 unused here).
            res = RE_DETAILS.search(html.tostring(details[0]))
            if not res:
                continue
            result.category = res.group(1).strip(' ').lower()
            date = res.group(3)
            result.date = self._get_date(date)
            if not result.date:
                logger.error('failed to get date from "%s"', date)
                continue
            seeds = details[0].cssselect('span.seeders')
            if seeds:
                # Seeds are optional; ignore unparsable values.
                try:
                    result.seeds = int(seeds[0].text.replace(',', ''))
                except ValueError:
                    pass
            tds = el.cssselect('tr td')
            if not tds:
                logger.error('failed to get size from %s', log)
                continue
            if not result.get_size(tds[0].text):
                continue
            url_info = urljoin(self.url, links[0].get('href')).encode('utf-8')
            result.url = self._get_torrent_url(url_info)
            if not result.url:
                logger.error('failed to get magnet url from %s', url_info)
                continue
            if not result.get_hash():
                continue
            if not result.validate(**kwargs):
                continue
            yield result
def setUp(self):
    # Fresh Result fixture with a known title for each test method.
    self.result = Result()
    self.result.title = 'test title'
def results(self, query, category=None, sort='date', pages_max=1, **kwargs):
    """Yield torrent Results scraped from the '#searchResult' table.

    :param query: search string, or a full URL to open directly
    :param category: optional category name looked up in CAT_DEF
    :param sort: sort key passed to self._sort()
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure or site overload
    """
    if not self.url:
        raise SearchError('no data')
    for page in range(1, pages_max + 1):
        if page > 1:
            # Follow pagination; stop when there is no next page.
            if not self._next(page):
                break
        else:
            if is_url(query):
                if not self.browser.open(query):
                    raise SearchError('no data')
            else:
                fields = {'q': query}
                if category:
                    val = CAT_DEF.get(category.lower())
                    if val:
                        # Category is selected via a checkbox field.
                        fields[val] = ['on']
                if not self.browser.submit_form(self.url, fields=fields):
                    raise SearchError('no data')
            self._sort(sort)
        trs = self.browser.cssselect('#searchResult tr:not([class="header"])')
        if not trs:
            # None means the page itself failed; an overload banner is
            # reported separately so callers can back off.
            if trs is None:
                raise SearchError('no data')
            elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                raise SearchError('overload')
        for tr in trs:
            # Rows with fewer than 4 cells are not result rows.
            if len(tr) < 4:
                continue
            # Truncated HTML snippet used only for log messages.
            log = html.tostring(tr, pretty_print=True)[:1000]
            result = Result()
            result.type = 'torrent'
            result.safe = False
            try:
                result.category = tr[0].cssselect('a')[0].text.lower()
            except Exception:
                # Category is optional; keep going without it.
                logger.error('failed to get category from %s', log)
            res = tr.cssselect('div.detName a')
            if not res:
                logger.error('failed to get title from %s', log)
                continue
            result.title = res[0].text
            result.url = self._get_torrent_url(tr)
            if not result.url:
                logger.error('failed to get magnet url from %s', log)
                continue
            if not result.get_hash():
                continue
            res = tr.cssselect('.detDesc')
            if not res:
                logger.error('failed to get details from %s', log)
                continue
            details = clean(html.tostring(res[0]))
            res_ = RE_DETAILS.search(details)
            if not res_:
                logger.error('failed to parse details: %s', details)
                continue
            date, size = res_.groups()
            if not result.get_size(size):
                continue
            if not result.validate(**kwargs):
                continue
            try:
                result.date = self._get_date(date)
            # Fixed: was the Python-2-only "except Exception, e:" form;
            # "as" works on Python 2.6+ and Python 3.
            except Exception as e:
                logger.error('failed to get date from "%s": %s', date, str(e))
                continue
            try:
                result.seeds = int(tr[2].text)
            except Exception:
                # Seeds are optional.
                pass
            yield result
def results(self, query, category=None, sort='date', pages_max=1, **kwargs):
    """Yield torrent Results scraped from 'div.results' listing blocks.

    :param query: search string, or a full URL to open directly
    :param category: optional category name; results whose parsed
        category differs are skipped
    :param sort: sort key; 'popularity' is the site default and is not
        re-applied
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure
    """
    if not self.url:
        raise SearchError('no data')
    for page in range(1, pages_max + 1):
        if page > 1:
            # Follow pagination; stop when there is no next page.
            if not self._next(page):
                break
        else:
            if is_url(query):
                if not self.browser.open(query):
                    raise SearchError('no data')
            else:
                if not self.browser.submit_form(self.url, index=0,
                        fields={'q': query}):
                    raise SearchError('no data')
            if sort != 'popularity':
                # default sort is peers ('popularity')
                self._sort(sort)
        divs = self.browser.cssselect('div.results')
        if divs is None:
            raise SearchError('no data')
        # Skip approximate matches
        res = self.browser.cssselect('div.results h3')
        if res and RE_APPROXIMATE_MATCH.search(html.tostring(res[0])):
            break
        for div in divs:
            # Skip sponsored links
            res = div.cssselect('h2')
            if res and RE_SPONSORED_LINK.search(html.tostring(res[0])):
                continue
            for dl in div.cssselect('dl'):
                links = dl.cssselect('a')
                if not links:
                    continue
                # Truncated HTML snippet used only for log messages.
                log = html.tostring(dl, pretty_print=True)[:1000]
                result = Result()
                result.type = 'torrent'
                result.safe = False
                title = self.get_link_text(html.tostring(links[0]))
                if not title:
                    continue
                result.title = clean(title)
                try:
                    res = RE_CATEGORIES.search(html.tostring(links[0]))
                    result.category = self._get_category(res.group(1))
                except Exception:
                    # Category is optional; keep going without it.
                    logger.error('failed to get category info from %s', log)
                if category and category != result.category:
                    continue
                if dl.cssselect('span.pe'):
                    # skip 'pending' results (missing date and size)
                    continue
                try:
                    date = dl.cssselect('.a')[0][0].get('title')
                    result.date = self._get_date(date)
                except Exception:
                    logger.debug('failed to get date from %s', log)
                    continue
                try:
                    size = dl.cssselect('.s')[0].text
                except Exception:
                    logger.debug('failed to get size from %s', log)
                    continue
                if not result.get_size(size):
                    continue
                if not result.validate(**kwargs):
                    continue
                try:
                    seeds = dl.cssselect('.d')[0].text
                    result.seeds = int(seeds.replace(',', ''))
                except Exception:
                    # Seeds are optional.
                    logger.debug('failed to get seeds from %s', log)
                # Find torrent url
                url_info = urljoin(self.url, links[0].get('href'))
                result.url = self._get_torrent_url(query, url_info)
                if not result.url:
                    continue
                if not result.get_hash():
                    continue
                yield result
def results(self, query, sort='date', pages_max=1, **kwargs):
    """Yield binsearch (usenet) Results from the 'table#r2' listing.

    :param query: search string submitted through the search form
    :param sort: accepted for interface parity but not used here
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure
    """
    if not self.url:
        raise SearchError('no data')
    url = None
    for i in range(pages_max):
        if i == 0:
            # First page: submit the search form (url is still None,
            # so the form is located from the browser's current page).
            if not self.browser.submit_form(url, fields={'q': query}):
                raise SearchError('no data')
        else:
            # Pagination: the "next" link lives in the last table, and
            # is only followed when its text is exactly '>'.
            tables = self.browser.cssselect('table')
            if not tables:
                continue
            links = tables[-1].cssselect('a')
            if not links:
                break
            next_text = self.get_link_text(html.tostring(links[-1]))
            if next_text != '>':
                break
            url = urljoin(self.url, links[-1].get('href'))
            if not self.browser.open(url):
                raise SearchError('no data')
        for tr in self.browser.cssselect('table#r2 tr', []):
            # Skip the header row.
            if tr.cssselect('th'):
                continue
            # Truncated HTML snippet used only for log messages.
            log = html.tostring(tr, pretty_print=True)[:1000]
            result = Result()
            result.type = 'binsearch'
            titles = tr.cssselect('span.s')
            if not titles:
                continue
            title = titles[0].text
            res = RE_TITLE.findall(title)
            if res:
                title = res[0]
            result.title = clean(title)
            age = tr[-1].text
            if not age:
                # NOTE(review): only logs and still calls _get_date(age)
                # with a falsy value — possibly a missing `continue`;
                # confirm intended best-effort behavior.
                logger.error('failed to get age from %s', log)
            result.date = self._get_date(age)
            refs = tr.cssselect('input[type="checkbox"]')
            if not refs:
                logger.error('failed to get references list from %s', log)
                continue
            ref = refs[0].get('name')
            if not ref:
                logger.error('failed to get reference from %s', log)
                continue
            result.ref = ref
            info = tr.cssselect('span.d')
            if not info:
                continue
            links = info[0].cssselect('a')
            # Only collection links are usable results.
            if not links or not RE_COLLECTION.search(links[0].text):
                continue
            result.url = urljoin(self.url, links[0].get('href'))
            info = clean(html.tostring(info[0]))
            # Skip password-protected posts.
            if RE_PASSWORD.search(info):
                continue
            res = RE_SIZE.search(info)
            if not res:
                continue
            result.size = get_size(res.group(1))
            # Require a complete post: available parts == total parts.
            res = RE_PARTS.search(info)
            if not res or res.group(1) != res.group(2):
                continue
            if not result.validate(**kwargs):
                continue
            yield result
def results(self, query, category=None, sort='date', pages_max=1, **kwargs):
    """Yield torrent Results scraped from the '.table-torrents' listing.

    :param query: search string, or a full URL to open directly
    :param category: accepted for interface parity; not used to filter
        in this scraper (the parsed row category is stored on the
        Result instead)
    :param sort: sort key passed to self._sort()
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure or site overload
    """
    if not self.url:
        raise SearchError('no data')
    for page in range(1, pages_max + 1):
        if page > 1:
            # Follow pagination; stop when there is no next page.
            if not self._next(page):
                break
        else:
            if is_url(query):
                if not self.browser.open(query):
                    raise SearchError('no data')
            else:
                fields = {'ihq': query}
                if not self.browser.submit_form(self.url, fields=fields):
                    raise SearchError('no data')
            self._sort(sort)
        trs = self.browser.cssselect('.table-torrents tr[data-key]')
        if not trs:
            # None means the page itself failed; an overload banner is
            # reported separately so callers can back off.
            if trs is None:
                raise SearchError('no data')
            elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                raise SearchError('overload')
        for tr in trs:
            # Truncated HTML snippet used only for log messages.
            log = html.tostring(tr, pretty_print=True)[:1000]
            result = Result()
            result.type = 'torrent'
            result.safe = False
            # Fixed: the local previously reassigned the `category`
            # parameter (shadowing); use a distinct name.
            category_ = tr.cssselect('.category-row span')
            if not category_:
                row_category = None
            else:
                try:
                    row_category = category_[0].get('title').lower()
                except Exception:
                    row_category = None
            if not row_category:
                # Category is optional; keep going without it.
                logger.error('failed to get category from %s', log)
            else:
                result.category = row_category
            links_ = tr.cssselect('.title-row a')
            if not links_:
                logger.error('failed to get title link from %s', log)
                continue
            try:
                result.title = links_[0].cssselect('span')[0].text
            except Exception:
                logger.error('failed to get title from %s', log)
                continue
            url_info = urljoin(self.url, links_[0].get('href'))
            size_ = tr.cssselect('.size-row')
            if not size_:
                logger.error('failed to get size from %s', log)
                continue
            size = size_[0].text
            if not result.get_size(size):
                logger.error('failed to get size from "%s"', size)
                continue
            date_ = tr.cssselect('.date-row')
            if not date_:
                # Fixed: message previously said "size" for the date row.
                logger.error('failed to get date from %s', log)
                continue
            date = date_[0].text
            try:
                result.date = self._get_date(date)
            # Fixed: was the Python-2-only "except Exception, e:" form;
            # "as" works on Python 2.6+ and Python 3.
            except Exception as e:
                logger.error('failed to get date from "%s": %s', date, str(e))
                continue
            if not result.validate(**kwargs):
                continue
            result.url = self._get_torrent_url(url_info)
            if not result.url:
                logger.error('failed to get magnet url from %s', url_info)
                continue
            if not result.get_hash():
                continue
            try:
                result.seeds = int(tr[-2].text)
            except Exception:
                # Seeds are optional.
                logger.error('failed to get seeds from %s', log)
            yield result
def results(self, query, category=None, pages_max=1, **kwargs):
    """Yield rutracker torrent Results from the '#tor-tbl' table.

    :param query: search string, URL-encoded into the 'nm' parameter
    :param category: accepted for interface parity; not used here
        (Result.category is always set to None)
    :param pages_max: maximum number of result pages to walk
    :param kwargs: filter arguments forwarded to Result.validate()
    :raises SearchError: on fetch failure or site overload
    """
    if not self.url:
        raise SearchError('no data')
    url = '%s?%s' % (QUERY_URL, urlencode({'nm': query}))
    for page in range(1, pages_max + 1):
        if page > 1:
            # Follow pagination; stop when there is no next page.
            if not self._next(page):
                break
        else:
            if not self.browser.open(url):
                raise SearchError('no data')
        trs = self.browser.cssselect('#tor-tbl tbody tr')
        if not trs:
            # None means the page itself failed; an overload banner is
            # reported separately so callers can back off.
            if trs is None:
                raise SearchError('no data')
            elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                raise SearchError('overload')
        for el in trs:
            # Single-cell rows are placeholders ("no results"), skip.
            if len(el) == 1:
                continue
            # Truncated HTML snippet used only for log messages.
            log = html.tostring(el, pretty_print=True)[:1000]
            result = Result()
            result.type = 'rutracker'
            result.safe = False
            result.category = None
            # Column 3: title link.
            links = el[3].cssselect('a')
            if not links:
                logger.error('failed to get title from %s', log)
                continue
            result.title = clean(html.tostring(links[0]))
            # Column 5: download link, whose text also carries the size.
            links = el[5].cssselect('a')
            if not links:
                logger.debug('failed to get torrent url from %s',
                        html.tostring(el[5]))
                continue
            result.url = links[0].get('href')
            size = clean(links[0].text or '').replace('_', ' ').strip()
            if not result.get_size(size):
                continue
            # Column 6: seeders (optional).
            seeds = el[6].cssselect('.seedmed')
            if seeds:
                try:
                    result.seeds = int(seeds[0].text)
                except ValueError:
                    pass
            # Column 9: date as a unix timestamp inside a <u> element.
            els = el[9].cssselect('u')
            if not els:
                logger.error('failed to get date from %s', log)
                continue
            try:
                result.date = datetime.utcfromtimestamp(int(els[0].text))
            except ValueError:
                logger.error('failed to get date from %s', els[0].text)
                continue
            if not result.validate(**kwargs):
                continue
            yield result