Example #1
def last_entry(self):
    # Fetch the listing page and pull the torrent id out of the first row.
    r = retry_on_fail(requests.get, self.url)
    soup = BeautifulSoup(r.text, 'lxml')
    row = soup.find('tr', class_='tlistrow')
    link = row.find('td', class_='tlistname').a['href']
    return int(re.search('tid=([0-9]*)', link).group(1))
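Every example on this page calls a retry_on_fail helper that none of the snippets define. A minimal sketch of what such a wrapper might look like, assuming it simply retries the given requests call on network errors and returns None once the attempts run out; the attempts and delay parameters are illustrative, not from the original code:

import time

import requests


def retry_on_fail(func, *args, attempts=3, delay=5, **kwargs):
    # Call func(*args, **kwargs), retrying on network errors.
    # Returns the response, or None if every attempt failed
    # (Example #9 below checks for exactly that falsy result).
    for attempt in range(attempts):
        try:
            return func(*args, **kwargs)
        except requests.RequestException:
            if attempt < attempts - 1:
                time.sleep(delay)  # back off before the next try
    return None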
Example #2
def baka_list(self):
    page_index = {}
    page = self.url + 'browse.php?page=' + self.pagen
    self.last_category = ''
    print('> Index page', self.pagen, 'of', self.pagentotal)
    r = retry_on_fail(requests.get, page)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
    table = (self.page.find('table', class_='torrents')
             .find('tbody').find_all('tr'))
    for tds in table:
        tdx = 0
        for td in tds.find_all('td'):
            append = False
            tdx += 1
            if len(tds) == 5:
                # Full row: a category cell followed by four data cells.
                append = True
                if tdx == 1:
                    self.td1 = td
                elif tdx == 2:
                    self.td2 = td
                elif tdx == 3:
                    self.td3 = td
                elif tdx == 4:
                    self.td4 = td
                elif tdx == 5:
                    self.td5 = td
            elif len(tds) == 4 and 'Alternative versions' not in tds.text:
                # Short row: the category cell is omitted, so every cell
                # shifts one position to the left.
                append = True
                if tdx == 1:
                    self.td2 = td
                elif tdx == 2:
                    self.td3 = td
                elif tdx == 3:
                    self.td4 = td
                elif tdx == 4:
                    self.td5 = td
            if append and ((len(tds) == 5 and tdx == 5)
                           or (len(tds) == 4 and tdx == 4)):
                # Last cell of the row: collect the fields parsed so far.
                baka_id = self.baka_url_id
                page_index[baka_id] = {
                    'baka_url_id': self.baka_url_id,
                    'baka_url': self.baka_url,
                    'category': (self.category if len(tds) == 5
                                 else self.last_category),
                    'title_orig': self.title_orig,
                    'title': self.title,
                    'resolution': self.resolution,
                    'sb': str(self.sb),
                    'cb': str(self.cb),
                    'tags': str(self.tags),
                    'added': self.added,
                    'size': self.size,
                    'sld': str(self.sld),
                }
    return page_index
Example #3
def __init__(self, baka_url, baka_title):
    self.baka_url = baka_url
    self.title = baka_title
    self.exists = True

    r = retry_on_fail(requests.get, self.baka_url)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
Example #4
def magnet(self):
    try:
        # HEAD requests are not redirected by default, so the magnet URI
        # (if any) shows up as the redirect target in the Location header.
        r = retry_on_fail(requests.head, self.download_url)
        if 'magnet' not in r.headers.get('location', ''):
            print('Aliased torrent, skipping...')
            return None
        return r
    except Exception:
        return None
Example #5
def last_entry(url):
    baka_url = url + 'browse.php'
    r = retry_on_fail(requests.get, baka_url)
    r.encoding = 'utf-8'
    page = BeautifulSoup(r.text, 'lxml')
    pager = page.find('div', class_='pager')
    pages = []
    for link in pager.find_all('a', href=True):
        # Take the trailing page number from each pager link, if any.
        match = re.search(r'([0-9]+)$', link['href'])
        if match:
            pages.append(int(match.group(1)))
    # Compare numerically: max() over strings would rank '9' above '10'.
    return max(pages)
Example #6
def last_entry(self):
    try:
        r = retry_on_fail(requests.get, self.url + 'anime.php?o=9')
        soup = BeautifulSoup(r.text, 'lxml')
        seasonal = soup.find('div', class_='js-categories-seasonal')
        link = seasonal.find_all('tr')[1].find('td').a['href']
        if 'myanimelist.net/anime' in link:
            # The URL looks like .../anime/<id>/<slug>; the id is segment 4.
            return int(link.split('/')[4])
        sys.exit('Failed to retrieve last_entry')
    except Exception:
        # A bare except would also swallow the SystemExit raised above.
        sys.exit('Failed to retrieve last_entry')
Example #7
def torrent(self):
    for link in self.page.find_all('a', class_='download_link', href=True):
        if '.torrent' in link['href']:
            # Rebuild an absolute URL from the page's scheme and host.
            parsed_uri = urlparse(self.baka_url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            self.full_baka = domain + link['href']
            r = retry_on_fail(requests.get, self.full_baka)
            if r.content:
                return r.content
        elif link['href'] == '#':
            return None
        else:
            print('no link')
    return None
Example #8
def __init__(self, nyaa, nyaa_id):
    self.info_url = '{}{}'.format(nyaa.info_url, nyaa_id)
    self.download_url = '{}{}&magnet=1'.format(nyaa.dl_url, nyaa_id)
    self.nyaa_id = '{}'.format(nyaa_id)

    r = retry_on_fail(requests.get, self.info_url)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
    content = self.page.find('div', class_='content').text
    if 'The torrent you are looking for does not appear to be in the database' in content:
        self.exists = False
    elif 'The torrent you are looking for has been deleted' in content:
        print('{} has been deleted...'.format(self.nyaa_id))
        self.exists = False
    else:
        self.exists = True
Example #9
def retrieve_anime(id_ref=1, requester=request_passthrough):
    """Return the metadata for a particular show.

    Args:
        id_ref (Optional(int)): Internal show identifier
        requester (Optional(requests-like)): HTTP request maker
            This allows us to control/limit/mock requests.

    Returns:
        None if we failed to download the page, otherwise a tuple of two dicts
        (retrieval information, anime information).

        The retrieval information will include the keys:
            success (bool): Was *all* the information retrieved?
                (Some keys from anime information may be missing otherwise.)
            scraper_retrieved_at (datetime): When the request was completed.
            id_ref (int): id_ref of this anime.
        The anime information will include the keys:
            See tests/mal_scraper/test_anime.py::test_download_first
    """
    url = get_url_from_id_ref(id_ref)
    #response = requester.get(url, headers = {'User-agent': 'test'}) # custom user agent to avoid 429 (too many requests) error
    response = retry_on_fail(requests.get, url)
    if not response:
        return 404
    if not response.ok:
        return response.status_code

    soup = BeautifulSoup(response.content, 'html.parser')
    success, info = _process_soup(soup)

    if not success:
        logger.warning('Failed to properly process the page "%s".', url)

    retrieval_info = {
        'success': success,
        'scraper_retrieved_at': datetime.utcnow(),
        'id_ref': id_ref,
    }

    return (retrieval_info, info)
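Given the contract above, a caller has to distinguish the tuple result from the bare status-code returns. A short usage sketch under that assumption (the id_ref value is arbitrary):

result = retrieve_anime(id_ref=1)
if isinstance(result, tuple):
    retrieval_info, anime_info = result
    if retrieval_info['success']:
        print('Retrieved anime', retrieval_info['id_ref'],
              'at', retrieval_info['scraper_retrieved_at'])
else:
    # An HTTP status code (or 404 when the download itself failed).
    print('Download failed with status', result)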