def search(self, what, cat='all'): """ Method called by nova2. `what` is the already scaped search string, while `cat` restricts in which category the search should be performed. For each parsed line of the result, we put it in a dictionary and pass it to the prettyPrint function. """ data = retrieve_url('http://psychocydd.co.uk/torrents.php?search=%s' % what) soup = BeautifulSoup(data) res = soup.find_all('table', attrs={'width': '100%', 'class': 'lista'}) rows = res[5].find_all('tr') # by inspection, we want res[5] for row in rows[2:]: # by inspection, we want rows[2:] cells = row.find_all('td') # Columns of interest, all determined by inspection info = { 'name': cells[1].a.text, 'link': self.url + '/' + cells[4].a['href'], 'size': cells[6].text, 'seeds': cells[7].text, 'leech': cells[8].text, 'engine_url': self.url, } prettyPrinter(info)
def search(self, what):
    i = 1
    while True:
        res = 0
        dat = urllib.urlopen(self.url + '/search/%s/pg-%i' % (what, i)).read().decode('utf8', 'replace')
        # Not very readable, but the SGML parser chokes on this markup
        section_re = re.compile("(?s)href='/torrent.*?<tr>")
        torrent_re = re.compile("(?s)href='/torrent.*?>(?P<name>.*?)</a>.*?"
                                "title='(?P<seeds>\d+)\sseeders.*?"
                                ",\s(?P<leech>\d+)\sdownloaders.*?"
                                "href='(?P<link>.*?[^']+)'><img.*?src='/images/download.*?")
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['engine_url'] = self.url
                torrent_infos['link'] = self.url + torrent_infos['link']
                # Size is not provided by shareTV, so return -1 as a placeholder
                torrent_infos['size'] = -1
                prettyPrinter(torrent_infos)
                res += 1
        if res == 0:
            break
        i += 1
def search(self, what, cat='all'):
    # Remove {} since isohunt does not seem to handle those very well
    what = what.replace('{', '').replace('}', '')
    i = 1
    while i < 11:
        res = 0
        dat = retrieve_url(self.url + '/torrents.php?ihq=%s&iht=%s&ihp=%s&ihs1=2&iho1=d'
                           % (what, self.supported_categories[cat], i))
        # Not very readable, but the SGML parser chokes on this markup
        section_re = re.compile('(?s)id=link.*?</tr><tr')
        torrent_re = re.compile('(?s)torrent_details/(?P<link>.*?[^/]+).*?'
                                '>(?P<name>.*?)</a>.*?'
                                '>(?P<size>[\d,\.]+\s+MB)</td>.*?'
                                '>(?P<seeds>\d+)</td>.*?'
                                '>(?P<leech>\d+)</td>')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['name'] = re.sub('<.*?>', '', torrent_infos['name'])
                torrent_infos['engine_url'] = self.url
                torrent_code = torrent_infos['link']
                torrent_infos['link'] = self.url + '/download/' + torrent_code
                torrent_infos['desc_link'] = self.url + '/torrent_details/' + torrent_code + '/dvdrip?tab=summary'
                prettyPrinter(torrent_infos)
                res += 1
        if res == 0:
            break
        i += 1
def handle_data(self, data):
    if self.td_counter == 0:
        if "name" not in self.current_item:
            self.current_item["name"] = ""
        self.current_item["name"] += data
    elif self.td_counter == 4:
        if "size" not in self.current_item:
            self.current_item["size"] = data.strip()
            if self.current_item["size"] == "Pending":
                self.current_item["size"] = ""
    elif self.td_counter == 5:
        if "seeds" not in self.current_item:
            self.current_item["seeds"] = data.strip().replace(",", "")
            if not self.current_item["seeds"].isdigit():
                self.current_item["seeds"] = 0
    elif self.td_counter == 6:
        if "leech" not in self.current_item:
            self.current_item["leech"] = data.strip().replace(",", "")
            if not self.current_item["leech"].isdigit():
                self.current_item["leech"] = 0
        # display item
        self.td_counter = None
        self.current_item["engine_url"] = self.url
        if self.current_item["name"].find(u" \xbb") != -1:
            self.current_item["name"] = self.current_item["name"].split(u" \xbb")[0]
        self.current_item["link"] += "&" + urlencode({"dn": self.current_item["name"].encode("utf-8")})
        prettyPrinter(self.current_item)
        self.results.append("a")
def search(self, what, cat='all'):
    req = urllib.unquote(what)
    i = 0
    results = 0
    while i < 3:
        data = retrieve_url('https://api.btdigg.org/api/public-8e9a50f8335b964f/s01?%s'
                            % urllib.urlencode(dict(q=req, p=i)))
        for line in data.splitlines():
            if line.startswith('#'):
                continue
            info_hash, name, files, size, dl, seen = line.strip().split('\t')[:6]
            name = name.replace('|', '')
            res = dict(link='magnet:?xt=urn:btih:%s&dn=%s' % (info_hash, urllib.quote(name.encode('utf8'))),
                       name=name,
                       size=size,
                       seeds=int(dl),
                       leech=int(dl),
                       engine_url=self.url,
                       desc_link='%s/search?%s' % (self.url, urllib.urlencode(dict(info_hash=info_hash, q=req))))
            prettyPrinter(res)
            results += 1
        if results == 0:
            break
        i += 1
def parse_search(self, what, start=0, first_page=True):
    """Search for what starting on specified page. Defaults to first page of results."""
    logging.debug("parse_search({}, {}, {})".format(what, start, first_page))

    # Search.
    parser = self.Parser(self.download_url, first_page)
    try:
        response = self.opener.open('{}?nm={}&start={}'.format(self.search_url, quote(what), start))
        # Only continue if response status is OK.
        if response.getcode() != 200:
            raise HTTPError(response.geturl(), response.getcode(),
                            "HTTP request to {} failed with status: {}".format(
                                self.login_url, response.getcode()),
                            response.info(), None)
    except (URLError, HTTPError) as e:
        logging.error(e)
        return

    data = response.read().decode('cp1251')
    parser.feed(data)
    parser.close()

    # PrettyPrint each torrent found.
    for torrent in parser.results:
        torrent['engine_url'] = self.url
        if __name__ != "__main__":  # This is just to avoid printing when I debug.
            prettyPrinter(torrent)

    # If no torrents were found, stop immediately
    if parser.tr_counter == 0:
        return

    # Else return the number of torrents found
    return (parser.tr_counter, parser.other_pages)
def search(self, what, cat='all'):
    i = 1
    while i < 11:
        json_data = retrieve_url(self.url + '/json.php?q=%s&page=%d' % (what, i))
        try:
            json_dict = json.loads(json_data)
        except:
            i += 1
            continue
        if int(json_dict['total_results']) <= 0:
            return
        results = json_dict['list']
        for r in results:
            try:
                if cat != 'all' and self.supported_categories[cat] != r['category']:
                    continue
                res_dict = dict()
                res_dict['name'] = r['title']
                res_dict['size'] = str(r['size'])
                res_dict['seeds'] = r['seeds']
                res_dict['leech'] = r['leechs']
                res_dict['link'] = r['torrentLink']
                res_dict['desc_link'] = r['link']
                res_dict['engine_url'] = self.url
                prettyPrinter(res_dict)
            except:
                pass
        i += 1
def search(self, what, cat='all'):
    req = urllib.parse.unquote(what)
    what_list = req.split()
    i = 0
    results = 0
    while i < 3:
        u = urllib.request.urlopen('https://api.btdigg.org/api/public-8e9a50f8335b964f/s01?%s'
                                   % urllib.parse.urlencode(dict(q=req, p=i)))
        for line in u:
            try:
                line = line.decode('utf8')
                if line.startswith('#'):
                    continue
                info_hash, name, files, size, dl, seen = line.strip().split('\t')[:6]
                name = name.replace('|', '')
                # BTDigg returns unrelated results, we need to filter
                if not all(word in name.lower() for word in what_list):
                    continue
                res = dict(link='magnet:?xt=urn:btih:%s&dn=%s' % (info_hash, urllib.parse.quote(name)),
                           name=name,
                           size=size,
                           seeds=int(dl),
                           leech=int(dl),
                           engine_url=self.url,
                           desc_link='%s/search?%s' % (self.url, urllib.parse.urlencode(dict(info_hash=info_hash, q=req))))
                prettyPrinter(res)
                results += 1
            except:
                pass
        if results == 0:
            break
        i += 1
def search(self, what, cat="all"): json_data = retrieve_url( "".join( ( self.url, "api/v2/torrents/search/?phrase=", what, "&category=", self.supported_categories.get(cat, ""), ) ) ) json_dict = json.loads(json_data) if json_dict["results"] < 1: return for r in json_dict["torrents"]: r_dict = { "link": r["magnet_uri"], "name": r["torrent_title"], "size": str(r["size"]) + "B", "seeds": r["seeds"], "leech": r["leeches"], "desc_link": r["page"], "engine_url": self.url, } prettyPrinter(r_dict)
def search(self, what, cat='all'):
    try:
        self._sign_in()
        opener = self._opener
        data = opener.open(self.url + '/forum/tracker.php?nm=%s'
                           % (urllib.quote(what.decode('utf8').encode('cp1251')))).read()
        document = lxml.html.document_fromstring(data)
        info = {'engine_url': self.url}
        for t in document.cssselect('tr.tCenter'):
            try:
                a = t.xpath('.//a[contains(@href,"dl.php?t=")]')[0]
                info.update(
                    name=(self.prefix
                          + t.xpath('.//a[contains(@href,"tracker.php?f=")]')[0].text_content()
                          + ' - '
                          + t.xpath('.//a[contains(@href,"viewtopic.php?t=")]')[0].text_content()),
                    link=self.download_url + a.attrib['href'],
                    size=a.text_content().replace(u'\xa0', ' ').replace(u' \u2193', ''),
                    seeds=t.xpath('.//td[contains(@class,"seed")]')[0].text_content(),
                    leech=t.xpath('.//td[contains(@class,"leech")]')[0].text_content()
                )
                prettyPrinter(info)
            except IndexError:
                pass
    except Exception:
        try:
            with open(self.exc_log, 'a') as fo:
                fo.write(traceback.format_exc())
        except Exception:
            pass
def handle_data(self, data):
    if self.current_item is not None:
        if self.size_found:
            # with utf-8 you get something like:
            # ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by']
            temp = data.split()
            if 'Size' in temp:
                sizeIn = temp.index('Size')
                self.current_item['size'] = temp[sizeIn + 1]
                self.size_found = False
                self.unit_found = True
        elif self.unit_found:
            temp = data.split()
            self.current_item['size'] = ' '.join((self.current_item['size'], temp[0]))
            self.unit_found = False
        elif self.seed_found:
            self.current_item['seeds'] += data.rstrip()
        elif self.leech_found:
            self.current_item['leech'] += data.rstrip()
            self.current_item['engine_url'] = self.url
            prettyPrinter(self.current_item)
            PREVIOUS_IDS.add(self.current_item['id'])
            self.results.append('a')
            self.current_item = None
            self.size_found = False
            self.unit_found = False
            self.seed_found = False
            self.leech_found = False
def search(self, what, cat='all'):
    i = 1
    while i < 11:
        url = self.url + '/api/list.json?sort=seeds&limit=50&keywords=%s&set=%s&genre=%s' \
              % (what, i, self.supported_categories[cat])
        json_data = retrieve_url(url)
        try:
            json_dict = json.loads(json_data)
        except:
            i += 1
            continue
        try:
            results = json_dict['MovieList']
        except KeyError:
            return
        else:
            for r in results:
                res_dict = dict()
                res_dict['name'] = r['MovieTitle']
                res_dict['size'] = r['Size']
                res_dict['seeds'] = r['TorrentSeeds']
                res_dict['leech'] = r['TorrentPeers']
                res_dict['link'] = r['TorrentUrl']
                res_dict['desc_link'] = r['MovieUrl']
                res_dict['engine_url'] = self.url
                prettyPrinter(res_dict)
        i += 1
def handle_starttag(self, tag, attrs):
    params = dict(attrs)
    if tag == 'a' and 'href' in params:
        if 'en/details/' in params['href'] and (self.td_counter is None or self.td_counter > 5):
            self.current_item = {}
            self.td_counter = 0
            self.current_item['desc_link'] = params['href']
        elif params['href'].startswith('http://torrents.sumotorrent.sx/download/'):
            parts = params['href'].strip().split('/')
            self.current_item['link'] = (self.url + '/torrent_download/' + parts[-3] + '/'
                                         + parts[-2] + '/' + quote(parts[-1]).replace('%20', '+'))
    elif tag == 'td' and isinstance(self.td_counter, int):
        self.td_counter += 1
        if self.td_counter > 6:
            # Display item
            self.td_counter = None
            self.current_item['engine_url'] = self.url
            if not self.current_item['seeds'].isdigit():
                self.current_item['seeds'] = 0
            if not self.current_item['leech'].isdigit():
                self.current_item['leech'] = 0
            self.current_item['name'] = self.current_item['name'].strip()
            try:
                # python2
                self.current_item['name'] = self.current_item['name'].decode('utf8')
            except:
                pass
            prettyPrinter(self.current_item)
            self.results.append('a')
def handle_endtag(self, tag): if tag == "script": return if tag == "div": if self.meta_data_grabbing > 0: self.torrent_no_files = self.meta_data_array[2] # Not used self.torrent_date_added = self.meta_data_array[4] # Not used self.torrent_popularity = self.meta_data_array[6] # Not used self.current_item["size"] = self.meta_data_array[0] self.current_item["name"] = self.torrent_name self.current_item["engine_url"] = self.url self.current_item["link"] = self.mangnet_link self.current_item["desc_link"] = self.desc_link self.current_item["seeds"] = -1 self.current_item["leech"] = -1 prettyPrinter(self.current_item) self.results.append('a') self.current_item = {} self.meta_data_grabbing = 0 self.meta_data_array = [] self.mangnet_link = "" self.desc_link = "" self.torrent_name = ""
def search(self, what):
    i = 1
    while True:
        res = 0
        dat = urllib.urlopen(self.url + '/index.php?q=%s&p=%d' % (what, i)).read().decode('utf8', 'replace')
        # Not very readable, but the SGML parser chokes on this markup
        section_re = re.compile('(?s)<a class="search_a_news".*?</li>')
        torrent_re = re.compile('(?s)<a class="search_a_news" href="(?P<link>.*?[^"]+).*?'
                                'Titre : (?P<name>.*?)- Comm.*?'
                                'Taille : (?P<size>.*?)</p></li>')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['name'] = re.sub('</?span.*?>', '', torrent_infos['name'])
                torrent_infos['engine_url'] = self.url
                torrent_infos['seeds'] = -1
                torrent_infos['leech'] = -1
                prettyPrinter(torrent_infos)
                res += 1
        if res == 0:
            break
        i += 1
def handle_data(self, data):
    if self.td_counter == 0:
        if 'name' not in self.current_item:
            self.current_item['name'] = ''
        self.current_item['name'] += data
    elif self.td_counter == 3:
        if 'size' not in self.current_item:
            self.current_item['size'] = data.strip()
            if self.current_item['size'] == 'Pending':
                self.current_item['size'] = ''
    elif self.td_counter == 4:
        if 'seeds' not in self.current_item:
            self.current_item['seeds'] = data.strip().replace(',', '')
            if not self.current_item['seeds'].isdigit():
                self.current_item['seeds'] = 0
    elif self.td_counter == 5:
        if 'leech' not in self.current_item:
            self.current_item['leech'] = data.strip().replace(',', '')
            if not self.current_item['leech'].isdigit():
                self.current_item['leech'] = 0
        # display item
        self.td_counter = None
        self.current_item['engine_url'] = self.url
        if self.current_item['name'].find(' »') != -1:
            self.current_item['name'] = self.current_item['name'].split(' »')[0]
        self.current_item['link'] += '&' + urlencode({'dn': self.current_item['name']})
        prettyPrinter(self.current_item)
        self.results.append('a')
def search(self, what, cat='all'): """Search for what on the search engine.""" # Instantiate parser self.parser = self.Parser(self) # Decode search string what = unquote(what) logging.info("Searching for {}...".format(what)) # Search on first page. logging.info("Parsing page 1.") self.parser.search(what) # If multiple pages of results have been found, repeat search for each page. logging.info("{} pages of results found.".format(len(self.parser.other_pages)+1)) for start in self.parser.other_pages: logging.info("Parsing page {}.".format(int(start)//50+1)) self.parser.search(what, start) # PrettyPrint each torrent found, ordered by most seeds self.parser.results.sort(key=lambda torrent:torrent['seeds'], reverse=True) for torrent in self.parser.results: torrent['engine_url'] = 'https://rutracker.org' # Kludge, see #15 if __name__ != "__main__": # This is just to avoid printing when I debug. prettyPrinter(torrent) self.parser.close() logging.info("{} torrents found.".format(len(self.parser.results)))
def start_td(self,attr): if isinstance(self.td_counter,int): self.td_counter += 1 if self.td_counter > 3: self.td_counter = None self.current_item["engine_url"] = self.url prettyPrinter(self.current_item) self.results.append("a")
def handle_endtag(self, tag):
    if tag == 'tr' and 'link' in self.current_item:
        # display item
        self.td_counter = None
        self.current_item['engine_url'] = self.url
        self.current_item['size'] = ''
        self.current_item['name'] = self.current_item['name'].strip()
        prettyPrinter(self.current_item)
        self.results.append('a')
def search(self, what, cat='all'):
    start = 0
    while True:
        ds = list(self.search_page(what, cat, start))
        if not ds:
            break
        for d in ds:
            prettyPrinter(d)
        start += 25
def handle_endtag(self, tag): """ Parser's end tag handler """ if tag == "tr" and self.current_item: self.current_item["engine_url"] = self.url prettyPrinter(self.current_item) self.current_item = None elif self.cur_item_name: if tag == "a" or tag == "td": self.cur_item_name = None
def search(self, what, cat='all'):
    start = 0
    f = True
    while f and start < 51:
        f = False
        for d in self.search_page(what, cat, start):
            prettyPrinter(d)
            f = True
        start += 1
def handle_endtag(self, tag): if tag == "article": self.article_found = False elif self.item_name and (tag == "a" or tag == "td"): self.item_name = None elif self.item_found and tag == "tr": self.item_found = False if not self.item_bad: prettyPrinter(self.current_item) self.current_item = {}
def search(self, what, cat='all'):
    json_data = retrieve_url(self.url + 'api/v2/torrents/search/?phrase=' + what
                             + '&category=' + self.supported_categories.get(cat, ''))
    json_dict = json.loads(json_data)
    if json_dict['results'] < 1:
        return
    for r in json_dict['torrents']:
        r_dict = {'link': r['magnet_uri'],
                  'name': r['torrent_title'],
                  'size': str(r['size']) + 'B',
                  'seeds': r['seeds'],
                  'leech': r['leeches'],
                  'desc_link': r['page'],
                  'engine_url': self.url}
        prettyPrinter(r_dict)
def search(self, what, cat='all'):
    start = 1
    f = True
    while f and start < 21:
        page_results = self.search_page(what, cat, start)
        for d in page_results:
            prettyPrinter(d)
        if len(page_results) < 24:
            f = False
        start += 1
def search(self, what, cat='all'):
    start = 0
    f = True
    while f:
        f = False
        for d in self.search_page(what, cat, start):
            if __name__ != "__main__":
                prettyPrinter(d)
            f = True
        start += 1
def search(self, what, cat='all'):
    # Get token
    baseURL = "https://torrentapi.org/pubapi_v2.php?%s"
    params = urlencode({'get_token': 'get_token', 'app_id': 'qbittorrent'})
    response = retrieve_url(baseURL % params)
    j = json.loads(response)
    token = j['token']
    sleep(2.1)

    # get JSON
    what = unquote(what)
    category = self.supported_categories[cat]
    params = urlencode({
        'mode': 'search',
        'search_string': what,
        'ranked': 0,
        'category': category,
        'limit': 100,
        'sort': 'seeders',
        'format': 'json_extended',
        'token': token,
        'app_id': 'qbittorrent'
    })
    response = retrieve_url(baseURL % params)
    j = json.loads(response)

    for i in j['torrent_results']:
        tbytes = float(i['size'])
        size = "-1"
        if tbytes > 1024 * 1024 * 1024:
            size = "%.1f GB" % (tbytes / (1024 * 1024 * 1024))
        elif tbytes > 1024 * 1024:
            size = "%.1f MB" % (tbytes / (1024 * 1024))
        elif tbytes > 1024:
            size = "%.1f KB" % (tbytes / 1024)
        else:
            size = "%.1f B" % tbytes
        res = dict(link=i['download'],
                   name=i['title'],
                   size=size,
                   seeds=i['seeders'],
                   leech=i['leechers'],
                   engine_url=self.url,
                   desc_link=i['info_page'])
        prettyPrinter(res)
def end_tr(self):
    if self.td_counter == 5:
        self.td_counter = None
        # Display item
        if self.current_item and 'link' in self.current_item:
            self.current_item['engine_url'] = self.url
            if not self.current_item['seeds'].isdigit():
                self.current_item['seeds'] = 0
            if not self.current_item['leech'].isdigit():
                self.current_item['leech'] = 0
            prettyPrinter(self.current_item)
            self.results.append('a')
def start_td(self, attr):
    if isinstance(self.td_counter, int):
        self.td_counter += 1
        if self.td_counter > 7:
            self.td_counter = None
            if self.current_item:
                self.current_item['engine_url'] = self.url
                if not self.current_item['seeds'].isdigit():
                    self.current_item['seeds'] = 0
                if not self.current_item['leech'].isdigit():
                    self.current_item['leech'] = 0
                prettyPrinter(self.current_item)
                self.results.append('a')
def handle_endtag(self, tag): if self.insideRow: if tag == "div": self.torrentrowDepth -= 1 if self.torrentrowDepth < 0: self.insideRow = False self.crtTorrent["name"] = ("__FREELEECH__" if self.isFree else "") + self.torrentRow["c2"] self.crtTorrent["size"] = str(int(round (float(self.torrentRow["c7"]) * 1024 * 1024))) self.crtTorrent["seeds"] = self.torrentRow["c9"] self.crtTorrent["leech"] = self.torrentRow["c10"] self.crtTorrent["engine_url"] = self.url prettyPrinter(self.crtTorrent) self.results.append('a')
def handle_endtag(self, tag):
    if self.item_name == 'name' and tag == self.SPAN:
        self.find_data = True
        self.end_name = True
    if self.inside_tr and tag == self.TR:
        self.inside_tr = False
        self.item_name = None
        self.find_data = False
        self.seed_found = False
        self.leech_found = False
        array_length = len(self.current_item)
        if array_length < 1:
            return
        prettyPrinter(self.current_item)
        self.current_item = {}
def handle_data(self, data): """ Parser's data handler """ if self.save_data: if self.save_data == "name": # names with special characters like '&' are splitted in several pieces if 'name' not in self.current_item: self.current_item['name'] = '' self.current_item['name'] += data else: self.current_item[self.save_data] = data self.save_data = None if self.current_item.__len__() == 7: self.current_item["size"] = self.size_repl.sub( "", self.current_item["size"]) prettyPrinter(self.current_item) self.current_item = None
def handle_endtag(self, tag):
    if tag == 'td':
        self.insideTd = False
        self.insideDataTd = False
    if tag == 'tr':
        self.tdCount = -1
        if len(self.singleResData) > 0:
            # ignore trash stuff
            if self.singleResData['name'] != '-1':
                # ignore those with both link and desc_link equal to -1
                if (self.singleResData['desc_link'] != '-1'
                        or self.singleResData['link'] != '-1'):
                    prettyPrinter(self.singleResData)
                    self.pageRes.append(self.singleResData)
                    self.fullResData.append(self.singleResData)
            self.singleResData = self.getSingleData()
def pretty_print_results(self, results):
    for result in results:
        temp_result = {
            'name': result['title'],
            'size': result['size'],
            'seeds': result['seed'],
            'leech': result['leech'],
            'engine_url': self.url,
            'desc_link': result['pageLink']
        }
        try:
            temp_result['link'] = result['magnetLink']
        except KeyError:
            temp_result['link'] = str(-1)
        prettyPrinter(temp_result)
def handle_data(self, data):
    if self.handle_that_data:
        if self.save_data == "name":
            if 'name' not in self.current_item:
                self.current_item["name"] = ""
            self.current_item["name"] += data
        else:
            self.current_item[self.save_data] = data
        # all data collected
        if len(self.current_item) == 7:
            # remove preceding whitespace
            self.current_item["name"] = self.name_repl.sub("", self.current_item["name"])
            prettyPrinter(self.current_item)
            self.current_item = None
        self.save_data = None
def __get_dic_lis(self, desc_link):
    url = desc_link[0]
    response = self.__urlGet(url)
    msgs = findall(
        r'<td colspan="6">.*?<a href="([^"]+)".*?/>([^>]+)</a>.*?<td> *(\d+) *次</td>\s*<td class="grey">(.*?)</td>',
        response, S)
    for i in msgs:
        link, lench, name = i[0], i[2], f'[更新:{i[-1]}]{i[1]}'
        try:
            size = search(r'\d+\.?\d* ?(?:G|M|K)(?=B?]?)', desc_link[1])[0] + 'B'
        except:
            size = '-1'
        link = quote(link.replace('dialog', 'download').replace('-ajax-1', ''), safe='/:')
        dic = {'name': name,
               'seeds': '-1',
               'leech': lench,
               'size': size,
               'link': link,
               'desc_link': url,
               'engine_url': self.url}
        prettyPrinter(dic)
def handle_endtag(self, tag):
    if tag == 'li':
        self.titleFound = False
        self.tagCount = -1
        if len(self.singleResData) > 0:
            # ignore trash stuff
            if self.singleResData['name'] != '':
                # ignore those with desc_link equal to -1
                if self.singleResData['desc_link'] != '-1':
                    # remove trash from the name
                    self.singleResData['name'] = self.clearName(self.singleResData['name'])
                    prettyPrinter(self.singleResData)
                    self.pageRes.append(self.singleResData)
                    self.fullResData.append(self.singleResData)
            self.singleResData = self.getSingleData()
def draw(self, html: str):
    torrents = RE_TORRENTS.findall(html)
    for tor in torrents:
        local = time.strftime("%y.%m.%d", time.localtime(int(tor[5])))
        torrent_date = f"[{local}] " if config['torrentDate'] else ""
        prettyPrinter({
            "engine_url": self.url,
            "desc_link": self.url + "viewtopic.php?t=" + tor[0],
            "name": torrent_date + unescape(tor[1]),
            "link": self.url_dl + tor[0],
            "size": tor[2],
            "seeds": max(0, int(tor[3])),
            "leech": tor[4]
        })
    del torrents
def draw(self, html: str):
    torrents = re.findall(PATTERNS[1], html, re.S)
    for tor in torrents:
        local = time.strftime("%y.%m.%d", time.localtime(int(tor[6])))
        torrent_date = f"[{local}] " if config['torrentDate'] else ""
        prettyPrinter({
            "engine_url": self.url,
            "desc_link": self.url + tor[0],
            "name": torrent_date + unescape(tor[1]),
            "link": self.url + tor[2],
            "size": unescape(tor[3]),
            "seeds": tor[4] if tor[4].isdigit() else '0',
            "leech": tor[5]
        })
    del torrents
def search(self, what, cat='all'): base_url = "https://torrentapi.org/pubapi_v2.php?%s" # get token params = urlencode({'get_token': 'get_token', 'app_id': 'qbittorrent'}) response = retrieve_url(base_url % params) j = json.loads(response) token = j['token'] time.sleep(2.1) # get response json what = unquote(what) category = self.supported_categories[cat] params = urlencode({ 'mode': 'search', 'search_string': what, 'ranked': 0, 'category': category, 'limit': 100, 'sort': 'seeders', 'format': 'json_extended', 'token': token, 'app_id': 'qbittorrent' }) response = retrieve_url(base_url % params) j = json.loads(response) # parse results for result in j['torrent_results']: res = { 'link': result['download'], 'name': result['title'], 'size': str(result['size']) + " B", 'seeds': result['seeders'], 'leech': result['leechers'], 'engine_url': self.url, 'desc_link': "%s&%s" % (result['info_page'], urlencode({'app_id': 'qbittorrent'})) } prettyPrinter(res)
def search(self, what, cat='all'): """ Performs search """ if cat != 'all': return search_url = "http://academictorrents.com/browse.php?search={what}" url = search_url.format(what=what) while url: response = retrieve_url(url) parser = MyHTMLParser() parser.feed(response) results = parser.get_results() for match in results["torrents"]: prettyPrinter(match) url = results["next_page"]
def search(self, what, cat="all"): """ Performs search """ page = 1 while page < 11: query = "".join((self.url, "/search?q=", what, "+category%3A", self.supported_categories[cat], "&fmt=rss")) if page > 1: query = query + "&pg=" + str(page) response = retrieve_url_nodecode(query) xmldoc = minidom.parseString(response) itemlist = xmldoc.getElementsByTagName('item') if len(itemlist) == 0: return for item in itemlist: zooqle_dict = zooqle_dict = {"engine_url": self.url} zooqle_dict['name'] = ( item.getElementsByTagName('title')[0].childNodes[0].data) zooqle_dict["size"] = (item.getElementsByTagName( 'enclosure')[0].attributes['length'].childNodes[0].data) if zooqle_dict["size"] == '0': zooqle_dict["link"] = (item.getElementsByTagName( 'torrent:magnetURI')[0].childNodes[0].data) else: zooqle_dict["link"] = (item.getElementsByTagName( 'enclosure')[0].attributes['url'].value) zooqle_dict["desc_link"] = ( item.getElementsByTagName('link')[0].childNodes[0].data) zooqle_dict["leech"] = (item.getElementsByTagName( 'torrent:peers')[0].childNodes[0].data) if not zooqle_dict["leech"].isdigit(): zooqle_dict["leech"] = '' zooqle_dict["seeds"] = (item.getElementsByTagName( 'torrent:seeds')[0].childNodes[0].data) if not zooqle_dict["seeds"].isdigit(): zooqle_dict["seeds"] = '' prettyPrinter(zooqle_dict) totalResultVal = (xmldoc.getElementsByTagName( 'opensearch:totalResults')[0].childNodes[0].data) startIndex = (xmldoc.getElementsByTagName('opensearch:startIndex') [0].childNodes[0].data) itemsPerPage = (xmldoc.getElementsByTagName( 'opensearch:itemsPerPage')[0].childNodes[0].data) if (int(startIndex) + int(itemsPerPage)) > int(totalResultVal): return page += 1 return
def search(self, what, cat='all'): """ Performs search """ # prepare query cat = self.supported_categories[cat.lower()] query = "".join((self.url, "/files/?category=", cat, "&subcategory=All&quality=All&seeded=2&external=2&query=", what, "&to=1&uid=0&sort=S")) data = retrieve_url(query) add_res_list = re_compile("/files.*page=[0-9]+") try: data = self.torrent_list.search(data).group(0) except AttributeError: if 'You must be logged in to to that!' in data: prettyPrinter({ 'seeds': -1, 'size': -1, 'leech': -1, 'engine_url': self.url, 'link': self.url, 'desc_link': query, 'name': 'It seems demonoid.pw is private at the moment. / ' + what }) return list_results = add_res_list.findall(data) parser = self.MyHtmlParseWithBlackJack(self.url) parser.feed(data) del data if list_results: # handling each gamepage in parallel, to not waste time on waiting for requests # for 10 pages this speeds up from 6.5s to 1.9s run time threads = [] search_queries = islice( (add_res_list.search(result).group(0) for result in list_results[1].split(" | ")), 0, 10) for search_query in search_queries: t = threading.Thread(target=self.handle_page, args=(search_query,)) threads.append(t) t.start() # search method needs to stay alive until all threads are done for t in threads: t.join() return
def search(self, what, cat='all'):
    # what is a string with the search tokens, already escaped (e.g. "Ubuntu+Linux")
    # cat is the name of a search category in
    # ('all', 'movies', 'tv', 'music', 'games', 'anime', 'software', 'pictures', 'books')
    # q - query, f - filter, c - category
    base_url = 'https://sukebei.nyaa.si/?q=%s&f=0&c=0_0'
    base_url_with_query = base_url % what
    response = retrieve_url(base_url_with_query)
    soup = BeautifulSoup(response, 'html.parser')

    # Read "Displaying results 1-X out of Y results" to compute the page count
    pagination_info = soup.find('div', {'class': 'pagination-page-info'})
    PATTERN = r'Displaying results 1-(\d+) out of (\d+) results'
    parsed_pattern = re.search(PATTERN, pagination_info.text)
    items_per_page = parsed_pattern.group(1)
    total_results = parsed_pattern.group(2)
    number_of_pages = math.ceil(float(total_results) / float(items_per_page))

    for i in range(0, int(number_of_pages)):
        base_url_with_query_and_page = base_url_with_query + '&p=%s' % str(i + 1)
        response = retrieve_url(base_url_with_query_and_page)
        soup = BeautifulSoup(response, 'html.parser')
        table = soup.find('table')
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            ref = tds[1].find('a').get('href')
            title = tds[1].find('a').text
            link = tds[2].find_all('a')[-1].get('href')
            _size = tds[3].text
            size = _size[:-3]
            unit = _size[-3:]
            sizeInBytes = 0
            if unit == "GiB":
                sizeInBytes = float(size) * 1024 ** 3
            elif unit == "MiB":
                sizeInBytes = float(size) * 1024 ** 2
            seeders = tds[5].text
            leechers = tds[6].text
            res = dict(link=link,
                       name=title,
                       size=str(sizeInBytes),
                       seeds=seeders,
                       leech=leechers,
                       engine_url=self.url,
                       desc_link=self.url + ref)
            prettyPrinter(res)
def search(self, what, cat='all'): search_url = "{}/service/search?size=300&q={}".format(self.url, what) desc_url = "{}/#/search/torrent/{}/1".format(self.url, what) # get response json response = retrieve_url(search_url) response_json = json.loads(response) # parse results for result in response_json: res = {'link': self.download_link(result), 'name': result['name'], 'size': str(result['size_bytes']) + " B", 'seeds': result['seeders'], 'leech': result['leechers'], 'engine_url': self.url, 'desc_link': desc_url} prettyPrinter(res)
def search(self, what, cat='all'): query = "https://small-games.info/?go=search&go=search&search_text=" + what data = self.get_url(query).decode('utf-8', 'replace') match = re.compile( '<a title=\"(.*?)\"\shref=\"/.*?i=(\d*).*?Скачать\sигру\s\((.{2,11})\)' ) results = match.findall(data) name_clean = re.compile('[A-Za-z0-9].*') for res in results: self.result['name'] = name_clean.findall(res[0])[0] self.result[ 'link'] = self.url + "getTorrent.php?direct=1&gid=" + res[1] self.result['desc_link'] = self.url + "?go=game&c=61&i=" + res[1] # it always MB, and the M from the string is a weird russian one # so pretty printer will not recognize it self.result['size'] = res[2][:-3] + 'MB' prettyPrinter(self.result)
def start_td(self, attr):
    if isinstance(self.td_counter, int):
        self.td_counter += 1
        if self.td_counter > 3:
            self.td_counter = None
            # add item to results
            if self.current_item:
                # TorrentReactor returns unrelated results, we need to filter
                if not all(word in self.current_item['name'].lower() for word in self.what_list):
                    return
                self.current_item['engine_url'] = self.url
                if not self.current_item['seeds'].isdigit():
                    self.current_item['seeds'] = 0
                if not self.current_item['leech'].isdigit():
                    self.current_item['leech'] = 0
                prettyPrinter(self.current_item)
                self.has_results = True
                self.results.append('a')
def draw(self, html: str):
    torrents = re.findall(
        r'd\stopic.+?href="(.+?)".+?<b>(.+?)</b>.+?href="(d.+?)"'
        r'.+?/u>\s(.+?)<.+?b>(\d+)</.+?b>(\d+)<', html, re.S)
    for tor in torrents:
        torrent = {
            "engine_url": self.url,
            "desc_link": self.url + tor[0],
            "name": tor[1],
            "link": self.url + tor[2],
            "size": tor[3].replace(',', '.'),
            "seeds": tor[4],
            "leech": tor[5]
        }
        prettyPrinter(torrent)
    del torrents
def handle_endtag(self, tag):
    # the torrent row has been closed: print all collected data
    if self.torrent_row and tag == 'tr':
        self.torrent["engine_url"] = self.url
        logging.debug('torrent row: ' + str(self.torrent))
        prettyPrinter(self.torrent)
        self.torrent = {key: '' for key in self.torrent}
        self.index_td = 0
        self.torrent_row = False
        self.found_torrents += 1
    # the results table has been closed
    if self.result_table and tag == 'table':
        self.result_table = False
    # all pagination has been found
    if self.paginator and tag == 'span':
        self.paginator = False
def handle_endtag(self, tag):
    if not self.pageComplete:
        if tag == 'div':
            self.insideDataDiv = False
            self.spanCount = -1
            if len(self.singleResData) > 0:
                # ignore trash stuff
                if self.singleResData['name'] != '-1' and self.singleResData['size'] != '-1' \
                        and self.singleResData['name'].lower() != 'nome':
                    # ignore those with both link and desc_link equal to -1
                    if self.singleResData['desc_link'] != '-1' or self.singleResData['link'] != '-1':
                        try:
                            prettyPrinter(self.singleResData)
                        except:
                            print(self.singleResData)
                        self.pageRes.append(self.singleResData)
                        self.fullResData.append(self.singleResData)
                self.singleResData = self.getSingleData()
def start_td(self, attr):
    if isinstance(self.td_counter, int):
        self.td_counter += 1
        if self.td_counter > 3:
            self.td_counter = None
            # Display item
            if self.current_item:
                if self.current_item['id'] in PREVIOUS_IDS:
                    self.results = []
                    self.reset()
                    return
                self.current_item['engine_url'] = self.url
                if not self.current_item['seeds'].isdigit():
                    self.current_item['seeds'] = 0
                if not self.current_item['leech'].isdigit():
                    self.current_item['leech'] = 0
                prettyPrinter(self.current_item)
                PREVIOUS_IDS.add(self.current_item['id'])
                self.results.append('a')
def parse_search(self, what, start=0, first_page=True): """Search for what starting on specified page. Defaults to first page of results.""" logging.debug("parse_search({}, {}, {})".format( what, start, first_page)) # Search. parser = self.SimpleSGMLParser(self.download_url, first_page) page = self.opener.open('{}?nm={}&start={}'.format( self.search_url, urllib.parse.quote(what), start)) data = page.read().decode('cp1251') parser.feed(data) parser.close() # PrettyPrint each torrent found. for torrent in parser.results: torrent['engine_url'] = self.url if __name__ != "__main__": # This is just to avoid printing when I debug. prettyPrinter(torrent) return (parser.tr_counter, parser.other_pages)
def handle_endtag(self, tag):
    # we are exiting the table body;
    # no data will be processed after this
    if tag == self.TBODY:
        self.inside_tbody = False
    # exiting the table data, possibly moving to the next td or tr element
    elif self.inside_tbody and self.inside_row and tag == self.TD:
        self.inside_row = False
        self.current_item = None
    # exiting the tr element, which means all necessary data
    # for a torrent has been extracted; save it
    # and clean the object's state
    elif self.inside_tbody and tag == self.TR:
        self.current_result['leech'] = self.current_result['leeches']
        prettyPrinter(self.current_result)
        self.current_result = {}
        self.current_item = None
def feed(self, html):
    self.pageResSize = 0
    torrents = self.__findTorrents(html)
    resultSize = len(torrents)
    if resultSize == 0:
        return
    self.pageResSize = resultSize
    for torrent in torrents:
        data = {
            'link': torrent[0],
            'name': torrent[1],
            'size': torrent[2],
            'seeds': torrent[3],
            'leech': torrent[4],
            'engine_url': self.url,
            'desc_link': urllib.parse.unquote(torrent[0])
        }
        prettyPrinter(data)
def handle_endtag(self, tag): if self.insideRow: if tag == "div": self.torrentrowDepth -= 1 if self.torrentrowDepth < 0: self.insideRow = False self.crtTorrent["name"] = ("__FREELEECH__" if self.isFree else "") + self.torrentRow["c2"] self.crtTorrent["size"] = str( int( round( float(self.torrentRow["c7"]) * 1024 * 1024))) self.crtTorrent["seeds"] = self.torrentRow["c9"] self.crtTorrent["leech"] = self.torrentRow["c10"] self.crtTorrent["engine_url"] = self.url prettyPrinter(self.crtTorrent) self.results.append('a')
def draw(self, html: str): torrents = RE_TORRENTS.findall(html) for tor in torrents: torrent_date = "" if config['torrentDate']: _loc = time.localtime(int(tor[6])) torrent_date = f'[{time.strftime("%y.%m.%d", _loc)}] ' prettyPrinter({ "engine_url": self.url, "desc_link": self.url + tor[0], "name": torrent_date + unescape(tor[1]), "link": self.url + tor[2], "size": tor[3], "seeds": tor[4], "leech": tor[5] }) del torrents
def _parse_document(self, data):
    document = lxml.html.document_fromstring(data)
    info = {'engine_url': self.url}
    for t in document.cssselect('tr.tCenter'):
        try:
            a = t.xpath('.//a[contains(@href,"dl.php?t=")]')[0]
            info.update(
                name=(self.prefix
                      + t.xpath('.//a[contains(@href,"tracker.php?f=")]')[0].text_content()
                      + ' - '
                      + t.xpath('.//a[contains(@href,"viewtopic.php?t=")]')[0].text_content()),
                link=self.download_url + a.attrib['href'],
                size=a.text_content().replace(u'\xa0', ' ').replace(u' \u2193', ''),
                seeds=t.xpath('.//td[contains(@class,"seed")]')[0].text_content(),
                leech=t.xpath('.//td[contains(@class,"leech")]')[0].text_content()
            )
            prettyPrinter(info)
        except IndexError:
            pass
def start_li(self, attr):
    if isinstance(self.li_counter, int):
        self.li_counter += 1
        if self.li_counter > 3:
            self.li_counter = None
            # Display item
            if self.current_item:
                self.current_item['engine_url'] = self.url
                if not self.current_item['seeds'].isdigit():
                    self.current_item['seeds'] = 0
                if not self.current_item['leech'].isdigit():
                    self.current_item['leech'] = 0
                # Search should use AND operator as a default
                tmp = self.current_item['name'].upper()
                if self.what is not None:
                    for w in self.what:
                        if tmp.find(w) < 0:
                            return
                prettyPrinter(self.current_item)
                self.results.append('a')
def draw(self, html: str): torrents = re.findall(PATTERNS[1], html, re.S) for tor in torrents: torrent_date = "" if config['torrentDate']: _loc = time.localtime(int(tor[6])) torrent_date = f'[{time.strftime("%y.%m.%d", _loc)}] ' prettyPrinter({ "engine_url": self.url, "desc_link": self.url + tor[0], "name": torrent_date + tor[1], "link": self.url + tor[2], "size": tor[3].replace(',', '.'), "seeds": tor[4], "leech": tor[5] }) del torrents