Пример #1
0
 def iter_subtitles(self, language):
     """Yield a ``Subtitle`` for each ``a.subtitle_page_link`` anchor of the page.

     :param language: value stored unchanged in ``subtitle.language``.

     The id is the part of the anchor's ``href`` after the last ``-p``, the
     name is the anchor's text content and the CD count is parsed from the
     fifth ``td`` found under the anchor's great-grandparent element.  The
     description is left as ``NotLoaded``.
     """
     root = self.document.getroot()
     for anchor in self.parser.select(root, "a.subtitle_page_link"):
         target = anchor.attrib.get("href", "")
         sub_id = unicode(target.split("-p")[-1])
         title = unicode(anchor.text_content())

         # Three levels up; presumably the enclosing table row, since it is
         # then searched for "td" cells.
         row = anchor.getparent().getparent().getparent()
         cd_count = int(self.parser.select(row, "td")[4].text)

         entry = Subtitle(sub_id, title)
         entry.nb_cd = cd_count
         entry.language = language
         entry.description = NotLoaded
         yield entry
Пример #2
0
 def iter_subtitles(self, language):
     """Iterate over the subtitles listed on the page.

     One ``Subtitle`` is produced per ``a.subtitle_page_link`` element:

     * id: text following the last ``-p`` in the link's ``href``;
     * name: text content of the link;
     * nb_cd: integer read from the fifth ``td`` under the element three
       levels above the link;
     * language: the ``language`` argument, stored as given;
     * description: ``NotLoaded``.
     """
     page_root = self.document.getroot()
     for a_tag in self.parser.select(page_root, 'a.subtitle_page_link'):
         link_target = a_tag.attrib.get('href', '')
         identifier = unicode(link_target.split('-p')[-1])
         label = unicode(a_tag.text_content())

         # Walk up three ancestors to reach the container holding the cells.
         container = a_tag
         for _ in range(3):
             container = container.getparent()
         cells = self.parser.select(container, 'td')
         discs = int(cells[4].text)

         found = Subtitle(identifier, label)
         found.nb_cd = discs
         found.language = language
         found.description = NotLoaded
         yield found
Пример #3
0
 def iter_subtitles(self):
     """Yield a ``Subtitle`` for every episode row of the search results table.

     The rows of ``table#search_results`` whose class does not contain
     "head" are scanned in order.  A row for which the ``class`` attribute
     filter yields a falsy value (presumably a row without a class) only
     updates the current season label, taken from its first cell.  Every
     other row produces one subtitle named
     ``'<series> - <season> - Episode <episode>'``.

     The subtitle id is the digit run following ``/imdbid-`` at the end of
     the row's link.  A link without that suffix makes the match return
     ``None`` and the id lookup raise ``AttributeError``.
     """
     season = ''
     series_name = CleanText('//div[has-class("msg")]//h1//span[@itemprop="name"]')(self.doc)
     # A regexp to recover the sub id from url.  It must be a raw string:
     # "\d" is an invalid escape sequence in a normal string literal.
     regexp = re.compile(r'.*/imdbid-(?P<episode_id>\d+)$')
     for sub in self.doc.xpath('//table[@id="search_results"]//tbody//tr[not(contains(@class,"head"))]'):
         if not Attr('.', 'class', default=None)(sub):
             # Season header row: remember the label for the episodes below.
             season = CleanText('.//td[1]')(sub)
             continue

         subtitle = Subtitle()
         episode = CleanText('.//td[1]')(sub)
         subtitle.name = '%s - %s - Episode %s' % (series_name, season, episode)
         url = Link('.//td[3]//a')(sub)
         subtitle.url = self.browser.absurl(url)
         result = regexp.match(url)
         subtitle.id = result.groupdict()['episode_id']
         yield subtitle
Пример #4
0
    def get_subtitle(self, id):
        """Return the ``Subtitle`` whose download link matches ``id``.

        :param id: string of the form ``<something>|<href>``; the part after
            the first ``|`` is the ``href`` of the download link to look up.

        The element five levels above the matching anchor is used as the
        result line; its first three ``td`` cells give the translated title,
        the original title and the CD count.  The language is always ``fr``
        and the description is ``NotAvailable``.
        """
        href = id.split('|')[1]

        # Locate the anchor pointing at this address, then climb to the
        # element that holds the result cells.
        anchor = self.parser.select(self.document.getroot(),
                                    'a[href="%s"]' % href, 1)
        row = anchor
        for _ in range(5):
            row = row.getparent()
        cells = self.parser.select(row, 'td')

        translated = self.parser.select(cells[0], 'font', 1).text.lower()
        original = self.parser.select(cells[1], 'font', 1).text.lower()
        cd_text = self.parser.select(cells[2], 'font', 1).text.strip()
        cd_count = int(cd_text.split()[0])

        # Splitting and re-joining collapses special spacing characters.
        translated = " ".join(translated.split())
        original = " ".join(original.split())

        download_url = unicode('http://davidbillemont3.free.fr/%s' % href)
        result = Subtitle(id, unicode('%s (%s)' % (original, translated)))
        result.url = download_url
        result.ext = download_url.split('.')[-1]
        result.language = unicode('fr')
        result.nb_cd = cd_count
        result.description = NotAvailable
        return result
Пример #5
0
    def get_subtitle(self):
        """Build the ``Subtitle`` described by the current page.

        * name: text of the third ``td`` under the grandparent of the first
          ``img[alt="filename"]``; a trailing ``.<ext>`` is moved to ``ext``
          (``'zip'`` is used when the name has no such suffix);
        * id: last path segment of ``self.url`` without ``.html`` and
          ``subtitle-``;
        * url: ``<BASEURL>/download-<id>.html``;
        * language: whatever follows the last ``-`` of that url, without
          ``.html``;
        * nb_cd: integer in the third cell of the row whose title contains
          "amount";
        * description: ``"files :"`` followed by the content of the third
          cell of the row whose title contains "list", with a newline
          inserted after each ``.srt``.

        Raises ``ValueError`` when the "amount" or "list" row is not unique
        (single-element unpacking).
        """
        filename_line = self.doc.xpath('//img[@alt="filename"]')[0].getparent().getparent()
        name = to_unicode(filename_line.xpath('.//td')[2].text)
        sub_id = self.url.split('/')[-1].replace('.html', '').replace('subtitle-', '')
        url = '%s/download-%s.html' % (self.browser.BASEURL, sub_id)
        amount_line, = self.doc.xpath('//tr[contains(@title, "amount")]')
        nb_cd = int(amount_line.xpath('.//td')[2].text)
        lang = url.split('-')[-1].split('.html')[0]
        filenames_line, = self.doc.xpath('//tr[contains(@title,"list")]')
        file_names = filenames_line.xpath('.//td')[2].text_content().strip().replace('.srt', '.srt\n')
        desc = u"files :\n" + file_names

        # Raw string: "\." and "\w" are invalid escape sequences in a normal
        # string literal.
        m = re.match(r'(.*?)\.(\w+)$', name)
        if m:
            name = m.group(1)
            ext = m.group(2)
        else:
            ext = 'zip'

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = ext
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = desc
        return subtitle
Пример #6
0
    def get_subtitle(self):
        """Build the ``Subtitle`` described by the current page.

        * name: text of the third ``td`` under the grandparent of the
          ``img[alt=filename]`` element; a trailing ``.<ext>`` is moved to
          ``ext`` (``'zip'`` is used when the name has no such suffix);
        * id: last path segment of the browser URL without ``.html`` and
          ``subtitle-``;
        * url: ``http://<DOMAIN>/download-<id>.html``;
        * language: whatever follows the last ``-`` of that url, without
          ``.html``;
        * nb_cd: integer in the third cell of the ``tr[title~=amount]`` row;
        * description: ``"files :"`` followed by the content of the third
          cell of the ``tr[title~=list]`` row, with a newline inserted after
          each ``.srt``.
        """
        root = self.document.getroot()
        filename_line = self.parser.select(
            root, 'img[alt=filename]', 1).getparent().getparent()
        name = unicode(self.parser.select(filename_line, 'td')[2].text)
        sub_id = self.browser.geturl().split('/')[-1].replace(
            '.html', '').replace('subtitle-', '')
        url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN,
                                                      sub_id))
        amount_line = self.parser.select(root, 'tr[title~=amount]', 1)
        nb_cd = int(self.parser.select(amount_line, 'td')[2].text)
        lang = unicode(url.split('-')[-1].split('.html')[0])
        filenames_line = self.parser.select(root, 'tr[title~=list]', 1)
        file_names = self.parser.select(
            filenames_line,
            'td')[2].text_content().strip().replace('.srt', '.srt\n')
        desc = u"files :\n" + file_names

        # Raw string: "\." and "\w" are invalid escape sequences in a normal
        # string literal.
        m = re.match(r'(.*?)\.(\w+)$', name)
        if m:
            name = m.group(1)
            ext = m.group(2)
        else:
            ext = 'zip'

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = ext
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = desc
        return subtitle
Пример #7
0
 def iter_subtitles(self):
     """Yield one ``Subtitle`` per episode row of ``table#search_results``.

     Rows whose class contains "head" are excluded by the XPath.  Among the
     remaining rows, one for which the ``class`` attribute filter yields a
     falsy value (presumably a row without a class) is treated as a season
     header and only updates ``season``; any other row becomes a subtitle
     named ``'<series> - <season> - Episode <episode>'``.

     The id is the digit run after ``/imdbid-`` at the end of the row's
     link; if the link does not end that way the match is ``None`` and the
     id lookup raises ``AttributeError``.
     """
     season = ''
     series_name = CleanText(
         '//div[has-class("msg")]//h1//span[@itemprop="name"]')(self.doc)
     # A regexp to recover the sub id from url.  Written as a raw string
     # because "\d" is an invalid escape sequence in a normal literal.
     regexp = re.compile(r'.*/imdbid-(?P<episode_id>\d+)$')
     for sub in self.doc.xpath(
             '//table[@id="search_results"]//tbody//tr[not(contains(@class,"head"))]'
     ):
         if not Attr('.', 'class', default=None)(sub):
             # Season header: keep its label for the following episodes.
             season = CleanText('.//td[1]')(sub)
             continue

         subtitle = Subtitle()
         episode = CleanText('.//td[1]')(sub)
         subtitle.name = '%s - %s - Episode %s' % (series_name, season,
                                                   episode)
         url = Link('.//td[3]//a')(sub)
         subtitle.url = self.browser.absurl(url)
         result = regexp.match(url)
         subtitle.id = result.groupdict()['episode_id']
         yield subtitle
Пример #8
0
    def get_subtitle(self, id):
        """Look up a single subtitle on the page from its composite id.

        :param id: ``<prefix>|<href>``; only the ``href`` part (second
            ``|``-separated field) is used to find the download anchor.
        :returns: a ``Subtitle`` named ``'<original> (<translated>)'`` with
            its url, extension (text after the last ``.`` of the url),
            language ``fr``, CD count and a ``NotAvailable`` description.
        """
        href = id.split('|')[1]
        root = self.document.getroot()

        # The cells of interest hang off the fifth ancestor of the anchor
        # that links to this address.
        link = self.parser.select(root, 'a[href="%s"]' % href, 1)
        holder = link.getparent().getparent().getparent()
        holder = holder.getparent().getparent()
        tds = self.parser.select(holder, 'td')

        title_fr = self.parser.select(tds[0], 'font', 1).text.lower()
        title_orig = self.parser.select(tds[1], 'font', 1).text.lower()
        cd_label = self.parser.select(tds[2], 'font', 1).text.strip()
        discs = int(cd_label.split()[0])

        # split() + join() gets rid of special spacing characters
        title_fr = " ".join(title_fr.split())
        title_orig = " ".join(title_orig.split())

        full_name = unicode('%s (%s)' % (title_orig, title_fr))
        address = unicode('http://davidbillemont3.free.fr/%s' % href)

        sub = Subtitle(id, full_name)
        sub.url = address
        sub.ext = address.split('.')[-1]
        sub.language = unicode('fr')
        sub.nb_cd = discs
        sub.description = NotAvailable
        return sub
Пример #9
0
    def get_subtitle(self):
        """Build the ``Subtitle`` described by the current page.

        * id: last ``/``-separated part of the ``rel`` attribute of the
          ``a#bt-dwl`` download link;
        * ext: the word in parentheses of the link text
          ``Download (<ext>)``, or ``zip`` when the text has another shape;
        * title, CD count and language are all parsed from the ``title``
          attribute of ``link[rel=bookmark]``; the language is converted to
          its short form when it is a value of ``LANGUAGE_CONV``;
        * description: ``"files :"`` followed by one line per
          ``img[title~=filename]`` element (whitespace-normalised text of
          its parent), or ``NotAvailable`` when there is none.

        The name is ``'<title> (<first file name>)'``.  When the page lists
        no file name the name is the title alone; the previous code
        referenced an unbound ``file_name`` in that case and crashed.
        """
        desc = NotAvailable
        root = self.document.getroot()
        a = self.parser.select(root, 'a#bt-dwl', 1)
        sub_id = a.attrib.get('rel', '').split('/')[-1]
        # Raw string: "\(" and "\w" are invalid escape sequences in a normal
        # string literal.
        m = re.match(r'Download \((\w+)\)', self.parser.tocleanstring(a))
        if m:
            ext = m.group(1)
        else:
            ext = u'zip'
        url = unicode('http://www.opensubtitles.org/subtitleserve/sub/%s' % sub_id)
        link = self.parser.select(root, 'link[rel=bookmark]', 1)
        title = unicode(link.attrib.get('title', ''))
        # The CD count is the last word before the first "cd" of the title.
        nb_cd = int(title.lower().split('cd')[0].split()[-1])
        lang = unicode(title.split('(')[1].split(')')[0])

        file_names = self.parser.select(root, "img[title~=filename]")
        if len(file_names) > 0:
            file_lines = [' '.join(f.getparent().text_content().split())
                          for f in file_names]
            desc = u'\n'.join([u'files :'] + file_lines)
            name = unicode('%s (%s)' % (title, file_lines[0]))
        else:
            # No file listed: fall back to the bare title.
            name = title

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = ext
        for lshort, llong in LANGUAGE_CONV.items():
            if lang == llong:
                lang = unicode(lshort)
                break
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = desc
        return subtitle
Пример #10
0
    def get_subtitle(self):
        """Build the ``Subtitle`` described by the current page.

        The name is the text of the third ``td`` under the grandparent of
        the ``img[alt=filename]`` element; when it ends in ``.<ext>`` that
        suffix becomes ``ext`` and is removed from the name, otherwise
        ``ext`` is ``'zip'``.  The id is the last path segment of the
        browser URL with ``.html`` and ``subtitle-`` removed, the url is
        ``http://<DOMAIN>/download-<id>.html`` and the language is the text
        after the last ``-`` of that url (without ``.html``).  The CD count
        comes from the ``tr[title~=amount]`` row and the description lists
        the content of the ``tr[title~=list]`` row, one ``.srt`` per line.
        """
        root = self.document.getroot()
        filename_line = self.parser.select(root, 'img[alt=filename]', 1).getparent().getparent()
        name = unicode(self.parser.select(filename_line, 'td')[2].text)
        sub_id = self.browser.geturl().split('/')[-1].replace('.html', '').replace('subtitle-', '')
        url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN, sub_id))
        amount_line = self.parser.select(root, 'tr[title~=amount]', 1)
        nb_cd = int(self.parser.select(amount_line, 'td')[2].text)
        lang = unicode(url.split('-')[-1].split('.html')[0])
        filenames_line = self.parser.select(root, 'tr[title~=list]', 1)
        file_names = self.parser.select(filenames_line, 'td')[2].text_content().strip().replace('.srt', '.srt\n')
        desc = u"files :\n" + file_names

        # Raw string: "\." and "\w" are invalid escape sequences in a normal
        # string literal.
        m = re.match(r'(.*?)\.(\w+)$', name)
        if m:
            name = m.group(1)
            ext = m.group(2)
        else:
            ext = 'zip'

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = ext
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = desc
        return subtitle
Пример #11
0
    def get_subtitle(self, id):
        """Assemble the ``Subtitle`` shown on the current page.

        :param id: subtitle identifier; also used to pick the download link
            (the last ``div.footer > a.download`` whose ``href`` contains it).

        Language, url and CD count default to ``NotAvailable`` and are only
        replaced when the matching piece of markup is found.  The name is the
        page title up to the first ``" - "`` and the description is made of
        the whitespace-normalised paragraphs of ``fieldset.information``.
        """
        root = self.document.getroot()

        # Language: look for a link to a language category and translate its
        # trailing number through LANGUAGE_NUMBERS.
        language = NotAvailable
        for anchor in self.parser.select(root, "fieldset.information a"):
            target = anchor.attrib.get("href", "")
            if "/fr/ppodnapisi/kategorija/jezik/" not in target:
                continue
            code = target.split("/")[-1]
            for candidate, number in LANGUAGE_NUMBERS.items():
                if str(number) == str(code):
                    language = unicode(candidate)
                    break

        # Description paragraphs and CD count both live in the info fieldsets.
        nb_cd = NotAvailable
        paragraphs = []
        for fieldset in self.parser.select(root, "fieldset.information"):
            for para in self.parser.select(fieldset, "p"):
                paragraphs.append("%s\n" % (u" ".join(para.text_content().strip().split())))
            for label in self.parser.select(fieldset, "span"):
                if label.text is None or "CD" not in label.text:
                    continue
                # The value is the second span of the label's parent.
                siblings = self.parser.select(label.getparent(), "span")
                nb_cd = int(siblings[1].text)
        desc = u"".join(paragraphs)

        page_title = unicode(self.parser.select(root, "head title", 1).text)
        name = page_title.split(" - ")[0]

        url = NotAvailable
        for anchor in self.parser.select(root, "div.footer > a.download"):
            target = anchor.attrib.get("href", "")
            if id in target:
                url = u"http://www.podnapisi.net%s" % target

        result = Subtitle(id, name)
        result.url = url
        result.language = language
        result.nb_cd = nb_cd
        result.description = desc
        return result
Пример #12
0
    def get_subtitle(self, id):
        """Return the ``Subtitle`` described by the current page.

        :param id: identifier given to the ``Subtitle``; a
            ``div.footer > a.download`` link whose ``href`` contains it
            provides the url (the last such link wins).

        ``language``, ``url`` and ``nb_cd`` stay ``NotAvailable`` unless the
        page provides them.  The name is the part of the page title before
        the first ``' - '``; the description concatenates the
        whitespace-normalised ``p`` elements of ``fieldset.information``.
        """
        page = self.document.getroot()
        language = NotAvailable
        url = NotAvailable
        nb_cd = NotAvailable

        # A language link ends with a number that LANGUAGE_NUMBERS maps back
        # to a language name.
        for info_link in self.parser.select(page, 'fieldset.information a'):
            link_href = info_link.attrib.get('href', '')
            if '/fr/ppodnapisi/kategorija/jezik/' in link_href:
                wanted_number = link_href.split('/')[-1]
                for lang_name, lang_number in LANGUAGE_NUMBERS.items():
                    if str(lang_number) == str(wanted_number):
                        language = unicode(lang_name)
                        break

        lines = []
        for block in self.parser.select(page, 'fieldset.information'):
            for paragraph in self.parser.select(block, 'p'):
                words = paragraph.text_content().strip().split()
                lines.append('%s\n' % (u' '.join(words)))
            for span in self.parser.select(block, 'span'):
                if span.text is not None and 'CD' in span.text:
                    # the count sits in the second span next to the label
                    nb_cd = int(self.parser.select(span.getparent(), 'span')[1].text)
        desc = u''.join(lines)

        title_text = unicode(self.parser.select(page, 'head title', 1).text)
        name = title_text.split(' - ')[0]

        for dl_link in self.parser.select(page, 'div.footer > a.download'):
            dl_href = dl_link.attrib.get('href', '')
            if id in dl_href:
                url = u'http://www.podnapisi.net%s' % dl_href

        sub = Subtitle(id, name)
        sub.url = url
        sub.language = language
        sub.nb_cd = nb_cd
        sub.description = desc
        return sub
Пример #13
0
    def get_subtitle_from_line(self, line):
        """Build a ``Subtitle`` from one result line, or return ``None``.

        :param line: element whose ``td`` cells describe one subtitle.
        :returns: ``None`` when the line has no ``td`` cell, otherwise a
            ``Subtitle`` with url, language, CD count and a ``NotLoaded``
            description.

        The name is ``'<link text> (<long name>)'``.  The long name is the
        ``title`` of the first ``span`` of the first cell when there is one,
        otherwise the second text node of that cell (replaced by ``---``
        when it contains "Download at 25").  The language is the last
        ``-``-separated part of the second cell's link, converted to its
        short form when it is a value of ``LANGUAGE_CONV``.
        """
        cells = self.parser.select(line, 'td')
        if len(cells) == 0:
            return None

        links = self.parser.select(line, 'a')
        a = links[0]
        name = u" ".join(a.text.strip().split())
        first_cell = cells[0]
        spanlist = self.parser.select(first_cell, 'span')
        if len(spanlist) > 0:
            long_name = spanlist[0].attrib.get('title', '')
        else:
            texts = first_cell.itertext()
            # Skip the first text node and keep the second one.  The
            # builtin next() is used because the iterator method .next()
            # only exists on Python 2.
            next(texts)
            long_name = next(texts)
            if "Download at 25" in long_name:
                long_name = "---"
        name = "%s (%s)" % (name, long_name)

        second_cell = cells[1]
        link = self.parser.select(second_cell, 'a', 1)
        lang = link.attrib.get('href', '').split('/')[-1].split('-')[-1]
        for lshort, llong in LANGUAGE_CONV.items():
            if lang == llong:
                lang = unicode(lshort)
                break
        nb_cd = int(cells[2].text.strip().lower().replace('cd', ''))
        cell_dl = cells[4]
        href = self.parser.select(cell_dl, 'a', 1).attrib.get('href', '')
        url = unicode('http://www.opensubtitles.org%s' % href)
        sub_id = href.split('/')[-1]

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = NotLoaded
        return subtitle
Пример #14
0
    def iter_subtitles(self, language, pattern):
        """Yield the subtitles of the result table that match ``pattern``.

        :param language: not used by this method; every yielded subtitle is
            tagged ``fr``.
        :param pattern: search string; ``+`` is treated as a space and the
            comparison is done on lower-cased words.

        The table is looked up by ``bordercolor="#B8C0B2"`` and, failing
        that, by ``bordercolordark="#B8C0B2"``.  Nothing is yielded when no
        table is found or when the first one is not ``width="100%"``.

        A row matches when the pattern is a single word present in one of
        its two titles, or when the pattern shares at least two distinct
        words with one of them.
        """
        pattern = pattern.strip().replace('+', ' ').lower()
        pattern_words = pattern.split()
        root = self.document.getroot()
        tab = self.parser.select(root, 'table[bordercolor="#B8C0B2"]')
        if len(tab) == 0:
            tab = self.parser.select(root,
                                     'table[bordercolordark="#B8C0B2"]')
        if len(tab) == 0:
            return
        # some results of freefind point on useless pages
        if tab[0].attrib.get('width', '') != '100%':
            return

        # Loop invariants, computed once.
        single_word = len(pattern_words) == 1
        wanted = set(pattern_words)

        # iter() replaces the deprecated getiterator().
        for line in tab[0].iter('tr'):
            cols = self.parser.select(line, 'td')
            traduced_title = self.parser.select(cols[0], 'font',
                                                1).text.lower()
            original_title = self.parser.select(cols[1], 'font',
                                                1).text.lower()

            traduced_title_words = traduced_title.split()
            original_title_words = original_title.split()

            # if the pattern is one word and in the title OR if the
            # intersection between pattern and the title is at least 2 words
            if not ((single_word and pattern in traduced_title_words) or
                    (single_word and pattern in original_title_words) or
                    len(wanted & set(traduced_title_words)) > 1 or
                    len(wanted & set(original_title_words)) > 1):
                continue

            # this is to trash special spacing chars
            traduced_title = " ".join(traduced_title_words)
            original_title = " ".join(original_title_words)

            nb_cd = self.parser.select(cols[2], 'font', 1).text.strip()
            # strip(' CD') removes any leading/trailing ' ', 'C' or 'D'.
            nb_cd = int(nb_cd.strip(' CD'))
            name = unicode('%s (%s)' % (original_title, traduced_title))
            href = self.parser.select(cols[3], 'a',
                                      1).attrib.get('href', '')
            url = unicode('http://davidbillemont3.free.fr/%s' % href)
            # The id combines the page name with the link so that the
            # subtitle can be found again from its id.
            sub_id = unicode('%s|%s' %
                             (self.browser.geturl().split('/')[-1], href))
            subtitle = Subtitle(sub_id, name)
            subtitle.url = url
            subtitle.ext = url.split('.')[-1]
            subtitle.language = unicode('fr')
            subtitle.nb_cd = nb_cd
            subtitle.description = NotAvailable
            yield subtitle
Пример #15
0
    def get_subtitle(self, id=None):
        """Build a ``Subtitle`` from the current page.

        :param id: identifier to assign.  When it is falsy the id is taken
            from ``self.url``, which must then look like
            ``https://www.opensubtitles.org/en/subtitles/<digits>/...``;
            otherwise the failed match raises ``AttributeError``.
        :returns: a ``Subtitle`` whose description, name and url come from
            the page (``span[itemprop="description"]``, the ``h2`` heading
            and ``self.url`` respectively).
        """
        subtitle = Subtitle()
        subtitle.description = CleanText(
            './/fieldset/span[@itemprop="description"]')(self.doc)
        if id:
            subtitle.id = id
        else:
            # Raw string: "\d" is an invalid escape sequence in a normal
            # string literal.
            regexp = re.compile(
                r'https://www.opensubtitles.org/en/subtitles/(?P<id>\d+)/.*$')
            result = regexp.match(self.url)
            subtitle.id = result.groupdict()['id']

        subtitle.name = CleanText('.//div//div//h2')(self.doc)
        subtitle.url = self.url
        return subtitle
Пример #16
0
    def get_subtitle(self, id=None):
        """Return the ``Subtitle`` described by the current page.

        :param id: identifier to use as is when truthy.  Otherwise the id is
            the digit run found in ``self.url`` after
            ``https://www.opensubtitles.org/en/subtitles/``; a url of another
            shape makes the match ``None`` and raises ``AttributeError``.

        Description and name are read from the document with ``CleanText``;
        the subtitle url is ``self.url``.
        """
        subtitle = Subtitle()
        subtitle.description = CleanText('.//fieldset/span[@itemprop="description"]')(self.doc)
        if id:
            subtitle.id = id
        else:
            # Raw string: "\d" is an invalid escape sequence in a normal
            # string literal.
            regexp = re.compile(r'https://www.opensubtitles.org/en/subtitles/(?P<id>\d+)/.*$')
            result = regexp.match(self.url)
            subtitle.id = result.groupdict()['id']

        subtitle.name = CleanText('.//div//div//h2')(self.doc)
        subtitle.url = self.url
        return subtitle
Пример #17
0
    def get_subtitle(self, id):
        """Collect the details of the subtitle shown on the current page.

        :param id: identifier stored on the result and searched for in the
            ``href`` of the ``div.footer > a.download`` links to find the
            download url (when several links match, the last one is kept).
        :returns: a ``Subtitle`` whose url, language and CD count are
            ``NotAvailable`` when the page does not provide them.
        """
        doc_root = self.document.getroot()

        # -- language ------------------------------------------------------
        language = NotAvailable
        for a_tag in self.parser.select(doc_root, 'fieldset.information a'):
            a_href = a_tag.attrib.get('href', '')
            if '/fr/ppodnapisi/kategorija/jezik/' not in a_href:
                continue
            # The last path segment is compared, as a string, with the
            # values of LANGUAGE_NUMBERS.
            tail = a_href.split('/')[-1]
            for key, value in LANGUAGE_NUMBERS.items():
                if str(value) == str(tail):
                    language = unicode(key)
                    break

        # -- description and number of CDs -----------------------------------
        nb_cd = NotAvailable
        chunks = []
        for section in self.parser.select(doc_root, 'fieldset.information'):
            for p_tag in self.parser.select(section, 'p'):
                flattened = u' '.join(p_tag.text_content().strip().split())
                chunks.append('%s\n' % (flattened))
            for span_tag in self.parser.select(section, 'span'):
                text = span_tag.text
                if text is not None and 'CD' in text:
                    pair = self.parser.select(span_tag.getparent(), 'span')
                    nb_cd = int(pair[1].text)
        desc = u''.join(chunks)

        # -- name ------------------------------------------------------------
        head_title = self.parser.select(doc_root, 'head title', 1)
        name = unicode(head_title.text).split(' - ')[0]

        # -- download url ----------------------------------------------------
        url = NotAvailable
        for a_tag in self.parser.select(doc_root, 'div.footer > a.download'):
            a_href = a_tag.attrib.get('href', '')
            if id in a_href:
                url = u'http://www.podnapisi.net%s' % a_href

        found = Subtitle(id, name)
        found.url = url
        found.language = language
        found.nb_cd = nb_cd
        found.description = desc
        return found
Пример #18
0
    def get_subtitle(self):
        """Build the ``Subtitle`` described by the current page.

        * id: ``data-product-id`` attribute of the ``a#bt-dwl-bt`` link;
        * ext: the word in parentheses of the link text
          ``Download (<ext>)``, or ``zip`` when the text has another shape;
        * title, CD count and language are parsed from the ``title``
          attribute of ``link[rel=bookmark]``; the language is converted to
          its short form when it is a value of ``LANGUAGE_CONV``;
        * description: ``"files :"`` followed by one line per
          ``img[title~=filename]`` element (whitespace-normalised text of
          its parent), or ``NotAvailable`` when there is none.

        The name is ``'<title> (<first file name>)'``.  When the page lists
        no file name the name is the title alone; the previous code
        referenced an unbound ``file_name`` in that case and crashed.
        """
        desc = NotAvailable
        root = self.document.getroot()
        a = self.parser.select(root, 'a#bt-dwl-bt', 1)
        sub_id = a.attrib.get('data-product-id', '')
        # Raw string: "\(" and "\w" are invalid escape sequences in a normal
        # string literal.
        m = re.match(r'Download \((\w+)\)', self.parser.tocleanstring(a))
        if m:
            ext = m.group(1)
        else:
            ext = u'zip'
        url = unicode('http://www.opensubtitles.org/en/subtitleserve/sub/%s' %
                      sub_id)
        link = self.parser.select(root, 'link[rel=bookmark]', 1)
        title = unicode(link.attrib.get('title', ''))
        # The CD count is the last word before the first "cd" of the title.
        nb_cd = int(title.lower().split('cd')[0].split()[-1])
        lang = unicode(title.split('(')[1].split(')')[0])

        file_names = self.parser.select(root, "img[title~=filename]")
        if len(file_names) > 0:
            file_lines = [' '.join(f.getparent().text_content().split())
                          for f in file_names]
            desc = u'\n'.join([u'files :'] + file_lines)
            name = unicode('%s (%s)' % (title, file_lines[0]))
        else:
            # No file listed: fall back to the bare title.
            name = title

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = ext
        for lshort, llong in LANGUAGE_CONV.items():
            if lang == llong:
                lang = unicode(lshort)
                break
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = desc
        return subtitle
Пример #19
0
    def iter_subtitles(self, language, pattern):
        """Yield the subtitles of the result table that match ``pattern``.

        :param language: not used by this method; every yielded subtitle is
            tagged ``fr``.
        :param pattern: search string; ``+`` is treated as a space and the
            comparison is done on lower-cased words.

        The table is looked up by ``bordercolor="#B8C0B2"`` and, failing
        that, by ``bordercolordark="#B8C0B2"``.  Nothing is yielded when no
        table is found or when the first one is not ``width="100%"``.

        A row matches when the pattern is a single word present in one of
        its two titles, or when the pattern shares at least two distinct
        words with one of them.
        """
        pattern = pattern.strip().replace('+', ' ').lower()
        pattern_words = pattern.split()
        root = self.document.getroot()
        tab = self.parser.select(root, 'table[bordercolor="#B8C0B2"]')
        if len(tab) == 0:
            tab = self.parser.select(root, 'table[bordercolordark="#B8C0B2"]')
            if len(tab) == 0:
                return
        # some results of freefind point on useless pages
        if tab[0].attrib.get('width', '') != '100%':
            return

        # Loop invariants, computed once.
        single_word = len(pattern_words) == 1
        wanted = set(pattern_words)

        # iter() replaces the deprecated getiterator().
        for line in tab[0].iter('tr'):
            cols = self.parser.select(line, 'td')
            traduced_title = self.parser.select(cols[0], 'font', 1).text.lower()
            original_title = self.parser.select(cols[1], 'font', 1).text.lower()

            traduced_title_words = traduced_title.split()
            original_title_words = original_title.split()

            # if the pattern is one word and in the title OR if the
            # intersection between pattern and the title is at least 2 words
            if not ((single_word and pattern in traduced_title_words) or
                    (single_word and pattern in original_title_words) or
                    len(wanted & set(traduced_title_words)) > 1 or
                    len(wanted & set(original_title_words)) > 1):
                continue

            # this is to trash special spacing chars
            traduced_title = " ".join(traduced_title_words)
            original_title = " ".join(original_title_words)

            nb_cd = self.parser.select(cols[2], 'font', 1).text.strip()
            # strip(' CD') removes any leading/trailing ' ', 'C' or 'D'.
            nb_cd = int(nb_cd.strip(' CD'))
            name = unicode('%s (%s)' % (original_title, traduced_title))
            href = self.parser.select(cols[3], 'a', 1).attrib.get('href', '')
            url = unicode('http://davidbillemont3.free.fr/%s' % href)
            # The id combines the page name with the link so that the
            # subtitle can be found again from its id.
            sub_id = unicode('%s|%s' % (self.browser.geturl().split('/')[-1], href))
            subtitle = Subtitle(sub_id, name)
            subtitle.url = url
            subtitle.ext = url.split('.')[-1]
            subtitle.language = unicode('fr')
            subtitle.nb_cd = nb_cd
            subtitle.description = NotAvailable
            yield subtitle
Пример #20
0
    def get_subtitle_from_line(self, line):
        """Turn one line of a results listing into a ``Subtitle``.

        :param line: element containing the ``td`` cells of one subtitle.
        :returns: the ``Subtitle``, or ``None`` when ``line`` has no ``td``.

        Cells are read by position: the first gives the long name (``title``
        of its first ``span``, else its second text node, replaced by
        ``---`` when that text contains "Download at 25"), the second the
        language link, the third the CD count and the fifth the download
        link, whose last path segment is the id.  The language is converted
        to its short form when it is a value of ``LANGUAGE_CONV``.
        """
        cells = self.parser.select(line, 'td')
        if len(cells) == 0:
            return None

        links = self.parser.select(line, 'a')
        a = links[0]
        name = u" ".join(a.text.strip().split())
        first_cell = cells[0]
        spanlist = self.parser.select(first_cell, 'span')
        if len(spanlist) > 0:
            long_name = spanlist[0].attrib.get('title', '')
        else:
            texts = first_cell.itertext()
            # Skip the first text node and keep the second one.  The
            # builtin next() is used because the iterator method .next()
            # only exists on Python 2.
            next(texts)
            long_name = next(texts)
            if "Download at 25" in long_name:
                long_name = "---"
        name = "%s (%s)" % (name, long_name)

        second_cell = cells[1]
        link = self.parser.select(second_cell, 'a', 1)
        lang = link.attrib.get('href', '').split('/')[-1].split('-')[-1]
        for lshort, llong in LANGUAGE_CONV.items():
            if lang == llong:
                lang = unicode(lshort)
                break
        nb_cd = int(cells[2].text.strip().lower().replace('cd', ''))
        cell_dl = cells[4]
        href = self.parser.select(cell_dl, 'a', 1).attrib.get('href', '')
        url = unicode('http://www.opensubtitles.org%s' % href)
        sub_id = href.split('/')[-1]

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.language = lang
        subtitle.nb_cd = nb_cd
        subtitle.description = NotLoaded
        return subtitle