def hlsparse(options, res, url):
    """Build one HLS fetcher per reachable variant of a master playlist.

    Args:
        options: Options object; a copy is handed to every HLS fetcher and
            the original is used to create the HTTP client.
        res: Response holding the master playlist. Its ``status_code``,
            ``text`` and ``cookies`` attributes are read.
        url: URL of the master playlist, used to resolve variant URIs.

    Returns:
        ``None`` when ``res`` is falsy. Otherwise a dict mapping the integer
        bitrate (BANDWIDTH / 1000) to an ``HLS`` instance. On failure the
        dict holds a ``ServiceError`` under key ``0``; entries added before a
        variant without BANDWIDTH was hit are kept alongside it.
    """
    streams = {}
    if not res:
        return None

    # Any 4xx/5xx answer is an error; a plain 400 must not be parsed as a
    # playlist (the variant probe below also treats < 400 as success).
    if res.status_code >= 400:
        streams[0] = ServiceError("Can't read HLS playlist. {0}".format(res.status_code))
        return streams

    # Element 1 of the parsed playlist is the list of (uri, attributes) entries.
    files = (parsem3u(res.text))[1]
    http = HTTP(options)
    for i in files:
        try:
            bitrate = float(i[1]["BANDWIDTH"]) / 1000
        except KeyError:
            streams[0] = ServiceError("Can't read HLS playlist")
            return streams

        urls = _get_full_url(i[0], url)
        # Probe the variant with the same cookies the fetcher will use later.
        res2 = http.get(urls, cookies=res.cookies)
        if res2.status_code < 400:
            streams[int(bitrate)] = HLS(copy.copy(options), urls, bitrate, cookies=res.cookies)
    return streams
def __init__(self, options, subtype, url, subfix=None):
    """Record the subtitle source and create an HTTP client for it.

    Args:
        options: Options object, stored and passed on to ``HTTP``.
        subtype: Subtitle format identifier, stored as given.
        url: Location of the subtitle resource.
        subfix: Optional value stored as ``self.subfix`` (default ``None``).
    """
    # Plain copies of the constructor arguments.
    self.url, self.options = url, options
    self.subtype, self.subfix = subtype, subfix

    # State that starts out empty/unset.
    self.subtitle = None
    self.bom = False

    # HTTP client configured from the same options.
    self.http = HTTP(options)
def __init__(self, options, _url):
    """Store the options and URL and reset all per-instance state.

    Args:
        options: Options object, stored and passed on to ``HTTP``.
        _url: URL this instance is created for, kept in ``self._url``.
    """
    self.options, self._url = options, _url

    # Nothing fetched, no error flagged and no subtitle attached yet.
    self._urldata = None
    self._error = False
    self.subtitle = None

    # Fresh (unshared) cookie dict plus an HTTP client built from the options.
    self.cookies = {}
    self.http = HTTP(options)
class VideoRetriever(object):
    """Common state and a ranged-download helper for stream fetchers."""

    def __init__(self, options, url, bitrate=0, **kwargs):
        """Store the stream description and create an HTTP client.

        Args:
            options: Options object, stored and passed on to ``HTTP``.
            url: Location of the stream.
            bitrate: Bitrate of the stream; converted with ``int()``.
            **kwargs: Extra settings. ``audio``, ``files``, ``keycookie`` and
                ``authorization`` are popped into attributes of the same
                name (``None`` when absent); whatever is left stays
                reachable through ``self.kwargs``.
        """
        self.options = options
        self.url = url
        self.bitrate = int(bitrate)
        self.kwargs = kwargs
        self.http = HTTP(options)
        self.finished = False
        # pop() also removes these keys from self.kwargs (same dict object).
        self.audio = kwargs.pop("audio", None)
        self.files = kwargs.pop("files", None)
        self.keycookie = kwargs.pop("keycookie", None)
        self.authorization = kwargs.pop("authorization", None)

    def __repr__(self):
        """Return a short description naming the fetcher class and bitrate."""
        return "<Video(fetcher=%s, bitrate=%s)>" % (self.__class__.__name__, self.bitrate)

    def name(self):
        """Placeholder that returns ``None``."""
        pass

    def _download_url(self, url, audio=False, total_size=None):
        """Download ``url`` into the output file using HTTP Range requests.

        The first request asks for bytes 0-8192. Unless ``total_size`` is
        given, the size is read from the part of the ``Content-Range``
        response header that follows the ``/``. Everything after the first
        chunk is then fetched with one more ranged request.

        Args:
            url: Resource to download.
            audio: When true the data goes to an ``m4a`` output created from
                a copy of the options; otherwise to the output selected by
                ``self.options.other``.
            total_size: Size in bytes, if already known.

        Raises:
            KeyError: No ``Content-Range`` header was returned and
                ``total_size`` was not supplied.

        Returns:
            ``None``. ``self.finished`` is set to ``True`` after a completed
            download; it is left untouched when no output file was opened.
        """
        # Cookies are optional; a missing entry simply sends none.
        cookies = self.kwargs.get("cookies")
        data = self.http.request("get", url, cookies=cookies, headers={'Range': 'bytes=0-8192'})
        if not total_size:
            try:
                content_range = data.headers['Content-Range']
            except KeyError:
                raise KeyError("Can't get the total size.")
            total_size = int(content_range[content_range.find("/") + 1:])

        bytes_so_far = 8192
        if audio:
            file_d = output(copy.copy(self.options), "m4a")
        else:
            file_d = output(self.options, self.options.other)

        if file_d is None:
            return

        # Close the output file even when a request or write fails midway.
        try:
            file_d.write(data.content)
            eta = ETA(total_size)
            while bytes_so_far < total_size:
                if not self.options.silent:
                    eta.update(bytes_so_far)
                    progressbar(total_size, bytes_so_far, ''.join(["ETA: ", str(eta)]))

                # The remainder is requested in one go, so the loop body runs once.
                old = bytes_so_far + 1
                bytes_so_far = total_size

                bytes_range = "bytes={0}-{1}".format(old, bytes_so_far)

                data = self.http.request("get", url, cookies=cookies, headers={'Range': bytes_range})
                file_d.write(data.content)
        finally:
            file_d.close()

        # Same (total, position) argument order as the in-loop call above.
        progressbar(total_size, total_size, "ETA: complete")
        progress_stream.write('\n')
        self.finished = True
def __init__(self, options, url, bitrate=0, **kwargs):
    """Store the stream description and create an HTTP client.

    Args:
        options: Options object, stored and passed on to ``HTTP``.
        url: Location of the stream.
        bitrate: Bitrate of the stream; converted with ``int()``.
        **kwargs: Extra settings. ``audio``, ``files`` and ``keycookie`` are
            popped into attributes of the same name (``None`` when absent);
            the remaining entries stay available as ``self.kwargs``.
    """
    self.options, self.url = options, url
    self.bitrate = int(bitrate)
    self.kwargs = kwargs
    self.http = HTTP(options)
    self.finished = False

    # Lift the well-known extras out of kwargs; popping also removes them
    # from self.kwargs because both names refer to the same dict.
    for key in ("audio", "files", "keycookie"):
        setattr(self, key, kwargs.pop(key, None))
class Service(object):
    """Base class for site services: URL matching plus shared fetch state."""

    # Exact host names handled by the service (also matched with "www.").
    supported_domains = []
    # Regular expressions matched against the host name.
    supported_domains_re = []

    def __init__(self, options, _url):
        """Store the options and URL and reset all per-instance state."""
        self.options, self._url = options, _url
        self._urldata = None
        self._error = False
        self.subtitle = None
        self.cookies = {}
        self.http = HTTP(options)

    @property
    def url(self):
        """The URL this service instance was created for."""
        return self._url

    def get_urldata(self):
        """Return the text of ``self.url``, fetching it on first use only."""
        if self._urldata is not None:
            return self._urldata
        response = self.http.request("get", self.url)
        self._urldata = response.text
        return self._urldata

    @classmethod
    def handles(cls, url):
        """Return ``True`` when the host part of ``url`` belongs to this service."""
        netloc = urlparse(url).netloc

        # Apply supported_domains_re regexp to the netloc. This
        # is meant for 'dynamic' domains, e.g. containing country
        # information etc.
        patterns = [re.compile(expr) for expr in cls.supported_domains_re]
        if any(pattern.match(netloc) for pattern in patterns):
            return True

        if netloc in cls.supported_domains:
            return True

        # For every listed domain, try with www. subdomain as well.
        return any(netloc == "www." + domain for domain in cls.supported_domains)

    def get_subtitle(self, options):
        """Placeholder that returns ``None``."""
        pass

    def exclude(self):
        """Return ``True`` when any exclude entry occurs in the output name."""
        entries = self.options.exclude
        if not entries:
            return False
        for entry in entries:
            if is_py2:
                entry = entry.decode("utf-8")
            if entry in self.options.output:
                return True
        return False

    # the options parameter is unused, but is part of the
    # interface, so we don't want to remove it. Thus, the
    # pylint ignore.
    def find_all_episodes(self, options):  # pylint: disable-msg=unused-argument
        """Fallback: log a warning and return only this instance's own URL."""
        log.warning("--all-episodes not implemented for this service")
        return [self.url]
class Service(object):
    """Shared behaviour for services: host matching, page cache, defaults."""

    supported_domains = []
    supported_domains_re = []

    def __init__(self, options, _url):
        """Keep the options and URL, start with empty state, build the HTTP client."""
        self.options = options
        self._url = _url
        # Nothing downloaded, no error recorded, no subtitle attached yet.
        self._urldata, self._error, self.subtitle = None, False, None
        self.cookies = {}
        self.http = HTTP(options)

    @property
    def url(self):
        """Read-only access to the URL given at construction."""
        return self._url

    def get_urldata(self):
        """Fetch ``self.url`` once and return its text on every call."""
        cached = self._urldata
        if cached is None:
            cached = self._urldata = self.http.request("get", self.url).text
        return cached

    @classmethod
    def handles(cls, url):
        """Tell whether the network location of ``url`` is served by this class."""
        host = urlparse(url).netloc

        # Apply supported_domains_re regexp to the netloc. This
        # is meant for 'dynamic' domains, e.g. containing country
        # information etc.
        compiled = [re.compile(x) for x in cls.supported_domains_re]
        for domain_re in compiled:
            if domain_re.match(host):
                return True

        # For every listed domain, try with www. subdomain as well.
        for domain in cls.supported_domains:
            if host == domain or host == 'www.' + domain:
                return True

        return False

    def get_subtitle(self, options):
        """Placeholder that returns ``None``."""
        pass

    def exclude(self):
        """Report whether one of the exclude entries is part of the output name."""
        if not self.options.exclude:
            return False
        for needle in self.options.exclude:
            if is_py2:
                needle = needle.decode("utf-8")
            if needle in self.options.output:
                return True
        return False

    # the options parameter is unused, but is part of the
    # interface, so we don't want to remove it. Thus, the
    # pylint ignore.
    def find_all_episodes(self, options):  # pylint: disable-msg=unused-argument
        """Default implementation: warn and return a list with just ``self.url``."""
        log.warning("--all-episodes not implemented for this service")
        return [self.url]
def __init__(self, options, url, bitrate=0, **kwargs):
    """Store the stream description and create an HTTP client.

    Args:
        options: Options object, stored and passed on to ``HTTP``.
        url: Location of the stream.
        bitrate: Bitrate of the stream; converted with ``int()``.
        **kwargs: Extra settings. ``audio``, ``files``, ``keycookie`` and
            ``authorization`` are popped into attributes of the same name
            (``None`` when absent); the remaining entries stay available as
            ``self.kwargs``.
    """
    self.options, self.url = options, url
    self.bitrate = int(bitrate)
    self.kwargs = kwargs
    self.http = HTTP(options)
    self.finished = False

    # Lift the well-known extras out of kwargs; popping also removes them
    # from self.kwargs because both names refer to the same dict.
    for key in ("audio", "files", "keycookie", "authorization"):
        setattr(self, key, kwargs.pop(key, None))
def hlsparse(options, res, url):
    """Build one HLS fetcher per reachable variant of a master playlist.

    Args:
        options: Options object; a copy is handed to every HLS fetcher and
            the original is used to create the HTTP client.
        res: Response holding the master playlist. Its ``status_code``,
            ``text`` and ``cookies`` attributes are read.
        url: URL of the master playlist, used to resolve variant URIs.

    Returns:
        ``None`` when ``res`` is falsy. Otherwise a dict mapping the integer
        bitrate (BANDWIDTH / 1000) to an ``HLS`` instance. On failure the
        dict holds a ``ServiceError`` under key ``0``; entries added before a
        variant without BANDWIDTH was hit are kept alongside it.
    """
    streams = {}

    if not res:
        return None

    # Any 4xx/5xx answer is an error; a plain 400 must not be parsed as a
    # playlist (the variant probe below also treats < 400 as success).
    if res.status_code >= 400:
        streams[0] = ServiceError("Can't read HLS playlist. {0}".format(res.status_code))
        return streams

    # Element 1 of the parsed playlist is the list of (uri, attributes) entries.
    files = (parsem3u(res.text))[1]
    http = HTTP(options)
    for i in files:
        try:
            bitrate = float(i[1]["BANDWIDTH"]) / 1000
        except KeyError:
            streams[0] = ServiceError("Can't read HLS playlist")
            return streams

        urls = _get_full_url(i[0], url)
        # Only variants that answer without an HTTP error are offered.
        res2 = http.get(urls, cookies=res.cookies)
        if res2.status_code < 400:
            streams[int(bitrate)] = HLS(copy.copy(options), urls, bitrate, cookies=res.cookies)
    return streams
def __init__(self, _url):
    """Keep the URL, start with empty state and create a default HTTP client.

    Args:
        _url: URL this instance is created for, kept in ``self._url``.
    """
    self._url = _url
    # Nothing fetched and no error flagged yet.
    self._urldata, self._error = None, False
    self.http = HTTP()
class subtitle(object):
    """Download a subtitle resource and save it converted to SRT.

    The converter is chosen by ``subtype``: ``tt``, ``json``, ``sami``,
    ``smi``, ``wrst`` or ``raw``.
    """

    def __init__(self, options, subtype, url, subfix=None):
        """Record the subtitle source and create an HTTP client for it.

        Args:
            options: Options object, stored and passed on to ``HTTP``.
            subtype: Subtitle format identifier, compared in ``download``.
            url: Location of the subtitle resource.
            subfix: Optional string appended to ``options.output`` before
                the files are written.
        """
        self.url = url
        self.subtitle = None
        self.options = options
        self.subtype = subtype
        self.http = HTTP(options)
        self.subfix = subfix
        # Set by download() when the payload starts with a UTF-8 BOM.
        self.bom = False

    def download(self):
        """Fetch the subtitle, convert it by ``subtype`` and write the files.

        Logs a warning and returns without writing when the response status
        is not 200. When ``options.get_raw_subtitles`` is set, the
        unconverted text is additionally saved with ``subtype`` as the file
        type. The converted data is always saved as ``srt``.
        """
        subdata = self.http.request("get", self.url, cookies=self.options.cookies)
        if subdata.status_code != 200:
            log.warning("Can't download subtitle file")
            return

        data = None
        if "mtgx" in self.url and subdata.content[:3] == b"\xef\xbb\xbf":
            subdata.encoding = "utf-8"
            self.bom = True

        if self.subtype == "tt":
            data = self.tt(subdata)
        elif self.subtype == "json":
            data = self.json(subdata)
        elif self.subtype == "sami":
            data = self.sami(subdata)
        elif self.subtype == "smi":
            data = self.smi(subdata)
        elif self.subtype == "wrst":
            if "tv4play" in self.url and subdata.content[:3] == b"\xef\xbb\xbf":
                subdata.encoding = "utf-8"
                self.bom = True
            data = self.wrst(subdata)
        elif self.subtype == "raw":
            data = self.raw(subdata)

        if self.subfix:
            self.options.output = self.options.output + self.subfix

        if self.options.get_raw_subtitles:
            subdata = self.raw(subdata)
            self.save_file(subdata, self.subtype)

        self.save_file(data, "srt")

    def save_file(self, data, subtype):
        """Write ``data`` to the output file opened for ``subtype``.

        On Windows under Python 3 the file is opened with an explicit UTF-8
        encoding. Nothing is written when ``output`` returns an object
        without a ``read`` attribute.
        """
        if platform.system() == "Windows" and is_py3:
            file_d = output(self.options, subtype, mode="wt", encoding="utf-8")
        else:
            file_d = output(self.options, subtype, mode="wt")
        if hasattr(file_d, "read") is False:
            return
        # Release the handle even when the write itself fails.
        try:
            file_d.write(data)
        finally:
            file_d.close()

    def raw(self, subdata):
        """Return the response text unchanged (UTF-8 encoded on Python 2)."""
        if is_py2:
            data = subdata.text.encode("utf-8")
        else:
            data = subdata.text
        return data

    def tt(self, subdata):
        """Convert an XML document with ``body/div/p`` cues to SRT text.

        Each ``p`` element needs a ``begin`` attribute and either ``end`` or
        a duration (``dur``, falling back to ``duration``).
        """
        i = 1
        data = ""
        if is_py2:
            subs = subdata.text.encode("utf8")
        else:
            subs = subdata.text

        # Drop the first default-namespace declaration so that the bare tag
        # names used with find()/findall() below match.
        subdata = re.sub(' xmlns="[^"]+"', '', subs, count=1)
        tree = ET.XML(subdata)
        xml = tree.find("body").find("div")
        plist = list(xml.findall("p"))
        for node in plist:
            tag = norm(node.tag)
            if tag == "p" or tag == "span":
                begin = node.attrib["begin"]
                if "end" in node.attrib:
                    # An explicit end time makes the duration irrelevant, so
                    # a cue without dur/duration is fine here.
                    end = node.attrib["end"]
                else:
                    if "dur" in node.attrib:
                        duration = node.attrib["dur"]
                    else:
                        duration = node.attrib["duration"]
                    begin2 = begin.split(":")
                    duration2 = duration.split(":")
                    # NOTE: only the seconds fields are added; hours and
                    # minutes are taken from the begin time as they are.
                    try:
                        sec = float(begin2[2]) + float(duration2[2])
                    except ValueError:
                        sec = 0.000
                    end = "%02d:%02d:%06.3f" % (int(begin2[0]), int(begin2[1]), sec)
                data += '%s\n%s --> %s\n' % (i, begin.replace(".", ","), end.replace(".", ","))
                data = tt_text(node, data)
                data += "\n"
                i += 1

        if is_py2:
            data = data.encode("utf8")
        return data

    def json(self, subdata):
        """Convert a JSON list of cues to SRT text.

        Every entry must provide ``startMillis``, ``endMillis`` and ``text``.
        """
        data = json.loads(subdata.text)
        number = 1
        subs = ""
        for i in data:
            subs += "%s\n%s --> %s\n" % (number, timestr(int(i["startMillis"])), timestr(int(i["endMillis"])))
            if is_py2:
                subs += "%s\n\n" % i["text"].encode("utf-8")
            else:
                subs += "%s\n\n" % i["text"]
            number += 1

        return subs

    def sami(self, subdata):
        """Convert an XML document with ``Subtitle`` elements under ``Font`` to SRT text.

        ``Subtitle`` elements provide ``SpotNumber``, ``TimeIn`` and
        ``TimeOut``; the text of the other elements becomes the cue lines.
        """
        text = subdata.text
        if is_py2:
            text = text.encode("utf8")
        # Escape every ampersand so bare '&' characters cannot break the XML
        # parser; the escaping is undone again on the finished text below.
        text = text.replace('&', '&amp;')
        tree = ET.fromstring(text)
        subt = tree.find("Font")
        subs = ""
        n = 0
        # Element.iter() replaces getiterator(), which Python 3.9 removed.
        for i in subt.iter():
            if i.tag == "Subtitle":
                n = i.attrib["SpotNumber"]

                if i.attrib["SpotNumber"] == "1":
                    subs += "%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))
                else:
                    # Later cues are separated from the previous one by a blank line.
                    subs += "\n%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))
            else:
                # Text found before the first Subtitle element is ignored.
                if int(n) > 0 and i.text:
                    subs += "%s\n" % decode_html_entities(i.text)

        if is_py2:
            subs = subs.encode('utf8')
        subs = subs.replace('&amp;', '&')
        return subs

    def smi(self, subdata):
        """Convert ``<SYNC Start=...>`` / ``<P Class=SVCC>`` markup to SRT text.

        The payload is read as ISO-8859-1. A cue lasts from its own SYNC
        time to the next SYNC line carrying a different time.
        """
        if requests_version < 0x20300:
            if is_py2:
                subdata = subdata.content
            else:
                subdata = subdata.content.decode("latin")
        else:
            subdata.encoding = "ISO-8859-1"
            subdata = subdata.text
        ssubdata = StringIO(subdata)
        timea = 0
        number = 1
        data = None
        subs = ""
        # Strips every tag except <i> and </i>.
        TAG_RE = re.compile(r'<(?!\/?i).*?>')
        bad_char = re.compile(r'\x96')
        for i in ssubdata.readlines():
            i = i.rstrip()
            sync = re.search(r"<SYNC Start=(\d+)>", i)
            if sync:
                if int(sync.group(1)) != int(timea):
                    # A lone blank is an empty caption. The text is compared
                    # before entity decoding, so the entity form counts too.
                    if data and data not in (" ", "&nbsp;"):
                        subs += "%s\n%s --> %s\n" % (number, timestr(timea), timestr(sync.group(1)))
                        text = "%s\n" % TAG_RE.sub('', data.replace("<br>", "\n"))
                        text = decode_html_entities(text)
                        # Make sure the cue ends with an empty line.
                        if text[len(text) - 2] != "\n":
                            text += "\n"
                        subs += text
                        number += 1
                timea = sync.group(1)
            text = re.search("<P Class=SVCC>(.*)", i)
            if text:
                data = text.group(1)
        recomp = re.compile(r'\r')
        text = bad_char.sub('-', recomp.sub('', subs))
        if is_py2 and isinstance(text, unicode):
            return text.encode("utf-8")
        return text

    def wrst(self, subdata):
        """Convert cue text with ``-->`` timing lines (WEBVTT style) to SRT text.

        ``WEBVTT`` and ``X-TIMESTAMP`` lines are skipped. Timestamps without
        an hour field get hour 0. If the cue identified as ``1`` starts after
        hour 9, ten hours are subtracted from that and all later timestamps.
        With ``options.convert_subtitle_colors`` the ``<30>``..``<37>`` tags
        become ``<font color=...>`` tags; otherwise all tags are removed.
        """
        ssubdata = StringIO(subdata.text)
        srt = ""
        subtract = False
        number_b = 1
        number = 0
        block = 0
        subnr = False
        if self.bom:
            # Skip one leading character (presumably the decoded BOM).
            ssubdata.read(1)
        for i in ssubdata.readlines():
            match = re.search(r"^[\r\n]+", i)
            match2 = re.search(r"([\d:\.]+ --> [\d:\.]+)", i)
            match3 = re.search(r"^(\d+)\s", i)
            if i[:6] == "WEBVTT":
                continue
            elif "X-TIMESTAMP" in i:
                continue
            elif match and number_b == 1 and self.bom:
                continue
            elif match and number_b > 1:
                # Blank line: the current cue is finished.
                block = 0
                srt += "\n"
            elif match2:
                # Timing line; number the cue ourselves when the source did not.
                if not subnr:
                    srt += "%s\n" % number_b
                matchx = re.search(r'(?P<h1>\d+):(?P<m1>\d+):(?P<s1>[\d\.]+) --> (?P<h2>\d+):(?P<m2>\d+):(?P<s2>[\d\.]+)', i)
                if matchx:
                    hour1 = int(matchx.group("h1"))
                    hour2 = int(matchx.group("h2"))
                    if int(number) == 1:
                        if hour1 > 9:
                            subtract = True
                    if subtract:
                        hour1 -= 10
                        hour2 -= 10
                else:
                    matchx = re.search(r'(?P<m1>\d+):(?P<s1>[\d\.]+) --> (?P<m2>\d+):(?P<s2>[\d\.]+)', i)
                    hour1 = 0
                    hour2 = 0
                time = "{0:02d}:{1}:{2} --> {3:02d}:{4}:{5}\n".format(hour1, matchx.group("m1"), matchx.group("s1").replace(".", ","),
                                                                      hour2, matchx.group("m2"), matchx.group("s2").replace(".", ","))
                srt += time
                block = 1
                subnr = False
                number_b += 1

            elif match3 and block == 0:
                # Cue identifier supplied by the source, outside a cue body.
                number = match3.group(1)
                srt += "%s\n" % number
                subnr = True
            else:
                if self.options.convert_subtitle_colors:
                    colors = {'30': '#000000', '31': '#ff0000', '32': '#00ff00', '33': '#ffff00',
                              '34': '#0000ff', '35': '#ff00ff', '36': '#00ffff', '37': '#ffffff'}
                    sub = i
                    for tag, color in colors.items():
                        regex1 = '<' + tag + '>'
                        replace = '<font color="' + color + '">'
                        sub = re.sub(regex1, replace, sub)

                    sub = re.sub('</.+>', '</font>', sub)
                else:
                    sub = re.sub('<[^>]*>', '', i)
                srt += sub.strip()
                srt += "\n"
        srt = decode_html_entities(srt)
        if is_py2:
            return srt.encode("utf-8")
        return srt
class subtitle(object):
    """Download a subtitle resource and save it converted to SRT.

    The converter is chosen by ``subtype``: ``tt``, ``json``, ``sami``,
    ``smi``, ``wrst`` or ``raw``.
    """

    def __init__(self, options, subtype, url, subfix=None):
        """Record the subtitle source and create an HTTP client for it.

        Args:
            options: Options object, stored and passed on to ``HTTP``.
            subtype: Subtitle format identifier, compared in ``download``.
            url: Location of the subtitle resource.
            subfix: Optional string appended to ``options.output`` before
                the files are written.
        """
        self.url = url
        self.subtitle = None
        self.options = options
        self.subtype = subtype
        self.http = HTTP(options)
        self.subfix = subfix
        # Set by download() when the payload starts with a UTF-8 BOM.
        self.bom = False

    def download(self):
        """Fetch the subtitle, convert it by ``subtype`` and write the files.

        Logs a warning and returns without writing when the response status
        is not 200. When ``options.get_raw_subtitles`` is set, the
        unconverted text is additionally saved with ``subtype`` as the file
        type. The converted data is always saved as ``srt``.
        """
        subdata = self.http.request("get", self.url, cookies=self.options.cookies)
        if subdata.status_code != 200:
            log.warning("Can't download subtitle file")
            return

        data = None
        if "mtgx" in self.url and subdata.content[:3] == b"\xef\xbb\xbf":
            subdata.encoding = "utf-8"
            self.bom = True

        if self.subtype == "tt":
            data = self.tt(subdata)
        elif self.subtype == "json":
            data = self.json(subdata)
        elif self.subtype == "sami":
            data = self.sami(subdata)
        elif self.subtype == "smi":
            data = self.smi(subdata)
        elif self.subtype == "wrst":
            if "tv4play" in self.url and subdata.content[:3] == b"\xef\xbb\xbf":
                subdata.encoding = "utf-8"
                self.bom = True
            # URLs containing "dplay" are always decoded as UTF-8.
            if "dplay" in self.url:
                subdata.encoding = "utf-8"
            data = self.wrst(subdata)
        elif self.subtype == "raw":
            data = self.raw(subdata)

        if self.subfix:
            self.options.output = self.options.output + self.subfix

        if self.options.get_raw_subtitles:
            subdata = self.raw(subdata)
            self.save_file(subdata, self.subtype)

        self.save_file(data, "srt")

    def save_file(self, data, subtype):
        """Write ``data`` to the output file opened for ``subtype``.

        On Windows under Python 3 the file is opened with an explicit UTF-8
        encoding. Nothing is written when ``output`` returns an object
        without a ``read`` attribute.
        """
        if platform.system() == "Windows" and is_py3:
            file_d = output(self.options, subtype, mode="wt", encoding="utf-8")
        else:
            file_d = output(self.options, subtype, mode="wt")
        if hasattr(file_d, "read") is False:
            return
        # Release the handle even when the write itself fails.
        try:
            file_d.write(data)
        finally:
            file_d.close()

    def raw(self, subdata):
        """Return the response text unchanged (UTF-8 encoded on Python 2)."""
        if is_py2:
            data = subdata.text.encode("utf-8")
        else:
            data = subdata.text
        return data

    def tt(self, subdata):
        """Convert an XML document with ``body/div/p`` cues to SRT text.

        Each ``p`` element needs a ``begin`` attribute and either ``end`` or
        a duration (``dur``, falling back to ``duration``).
        """
        i = 1
        data = ""
        if is_py2:
            subs = subdata.text.encode("utf8")
        else:
            subs = subdata.text

        # Drop the first default-namespace declaration so that the bare tag
        # names used with find()/findall() below match.
        subdata = re.sub(' xmlns="[^"]+"', '', subs, count=1)
        tree = ET.XML(subdata)
        xml = tree.find("body").find("div")
        plist = list(xml.findall("p"))
        for node in plist:
            tag = norm(node.tag)
            if tag == "p" or tag == "span":
                begin = node.attrib["begin"]
                if "end" in node.attrib:
                    # An explicit end time makes the duration irrelevant, so
                    # a cue without dur/duration is fine here.
                    end = node.attrib["end"]
                else:
                    if "dur" in node.attrib:
                        duration = node.attrib["dur"]
                    else:
                        duration = node.attrib["duration"]
                    begin2 = begin.split(":")
                    duration2 = duration.split(":")
                    # NOTE: only the seconds fields are added; hours and
                    # minutes are taken from the begin time as they are.
                    try:
                        sec = float(begin2[2]) + float(duration2[2])
                    except ValueError:
                        sec = 0.000
                    end = "%02d:%02d:%06.3f" % (int(begin2[0]), int(begin2[1]), sec)
                data += '%s\n%s --> %s\n' % (i, begin.replace(".", ","), end.replace(".", ","))
                data = tt_text(node, data)
                data += "\n"
                i += 1

        if is_py2:
            data = data.encode("utf8")
        return data

    def json(self, subdata):
        """Convert a JSON list of cues to SRT text.

        Every entry must provide ``startMillis``, ``endMillis`` and ``text``.
        """
        data = json.loads(subdata.text)
        number = 1
        subs = ""
        for i in data:
            subs += "%s\n%s --> %s\n" % (number, timestr(int(i["startMillis"])), timestr(int(i["endMillis"])))
            if is_py2:
                subs += "%s\n\n" % i["text"].encode("utf-8")
            else:
                subs += "%s\n\n" % i["text"]
            number += 1

        return subs

    def sami(self, subdata):
        """Convert an XML document with ``Subtitle`` elements under ``Font`` to SRT text.

        ``Subtitle`` elements provide ``SpotNumber``, ``TimeIn`` and
        ``TimeOut``; the text of the other elements becomes the cue lines.
        """
        text = subdata.text
        if is_py2:
            text = text.encode("utf8")
        # Escape every ampersand so bare '&' characters cannot break the XML
        # parser; the escaping is undone again on the finished text below.
        text = text.replace('&', '&amp;')
        tree = ET.fromstring(text)
        subt = tree.find("Font")
        subs = ""
        n = 0
        # Element.iter() replaces getiterator(), which Python 3.9 removed.
        for i in subt.iter():
            if i.tag == "Subtitle":
                n = i.attrib["SpotNumber"]

                if i.attrib["SpotNumber"] == "1":
                    subs += "%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))
                else:
                    # Later cues are separated from the previous one by a blank line.
                    subs += "\n%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))
            else:
                # Text found before the first Subtitle element is ignored.
                if int(n) > 0 and i.text:
                    subs += "%s\n" % decode_html_entities(i.text)

        if is_py2:
            subs = subs.encode('utf8')
        subs = subs.replace('&amp;', '&')
        return subs

    def smi(self, subdata):
        """Convert ``<SYNC Start=...>`` / ``<P Class=SVCC>`` markup to SRT text.

        The payload is read as ISO-8859-1. A cue lasts from its own SYNC
        time to the next SYNC line carrying a different time.
        """
        if requests_version < 0x20300:
            if is_py2:
                subdata = subdata.content
            else:
                subdata = subdata.content.decode("latin")
        else:
            subdata.encoding = "ISO-8859-1"
            subdata = subdata.text
        ssubdata = StringIO(subdata)
        timea = 0
        number = 1
        data = None
        subs = ""
        # Strips every tag except <i> and </i>.
        TAG_RE = re.compile(r'<(?!\/?i).*?>')
        bad_char = re.compile(r'\x96')
        for i in ssubdata.readlines():
            i = i.rstrip()
            sync = re.search(r"<SYNC Start=(\d+)>", i)
            if sync:
                if int(sync.group(1)) != int(timea):
                    # A lone blank is an empty caption. The text is compared
                    # before entity decoding, so the entity form counts too.
                    if data and data not in (" ", "&nbsp;"):
                        subs += "%s\n%s --> %s\n" % (number, timestr(timea), timestr(sync.group(1)))
                        text = "%s\n" % TAG_RE.sub('', data.replace("<br>", "\n"))
                        text = decode_html_entities(text)
                        # Make sure the cue ends with an empty line.
                        if text[len(text) - 2] != "\n":
                            text += "\n"
                        subs += text
                        number += 1
                timea = sync.group(1)
            text = re.search("<P Class=SVCC>(.*)", i)
            if text:
                data = text.group(1)
        recomp = re.compile(r'\r')
        text = bad_char.sub('-', recomp.sub('', subs))
        if is_py2 and isinstance(text, unicode):
            return text.encode("utf-8")
        return text

    def wrst(self, subdata):
        """Convert cue text with ``-->`` timing lines (WEBVTT style) to SRT text.

        ``WEBVTT`` and ``X-TIMESTAMP`` lines are skipped. Timestamps without
        an hour field get hour 0. If the cue identified as ``1`` starts after
        hour 9, ten hours are subtracted from that and all later timestamps.
        With ``options.convert_subtitle_colors`` the ``<30>``..``<37>`` tags
        become ``<font color=...>`` tags; otherwise all tags are removed.
        """
        ssubdata = StringIO(subdata.text)
        srt = ""
        subtract = False
        number_b = 1
        number = 0
        block = 0
        subnr = False
        if self.bom:
            # Skip one leading character (presumably the decoded BOM).
            ssubdata.read(1)
        for i in ssubdata.readlines():
            match = re.search(r"^[\r\n]+", i)
            match2 = re.search(r"([\d:\.]+ --> [\d:\.]+)", i)
            match3 = re.search(r"^(\d+)\s", i)
            if i[:6] == "WEBVTT":
                continue
            elif "X-TIMESTAMP" in i:
                continue
            elif match and number_b == 1 and self.bom:
                continue
            elif match and number_b > 1:
                # Blank line: the current cue is finished.
                block = 0
                srt += "\n"
            elif match2:
                # Timing line; number the cue ourselves when the source did not.
                if not subnr:
                    srt += "%s\n" % number_b
                matchx = re.search(r'(?P<h1>\d+):(?P<m1>\d+):(?P<s1>[\d\.]+) --> (?P<h2>\d+):(?P<m2>\d+):(?P<s2>[\d\.]+)', i)
                if matchx:
                    hour1 = int(matchx.group("h1"))
                    hour2 = int(matchx.group("h2"))
                    if int(number) == 1:
                        if hour1 > 9:
                            subtract = True
                    if subtract:
                        hour1 -= 10
                        hour2 -= 10
                else:
                    matchx = re.search(r'(?P<m1>\d+):(?P<s1>[\d\.]+) --> (?P<m2>\d+):(?P<s2>[\d\.]+)', i)
                    hour1 = 0
                    hour2 = 0
                time = "{0:02d}:{1}:{2} --> {3:02d}:{4}:{5}\n".format(hour1, matchx.group("m1"), matchx.group("s1").replace(".", ","),
                                                                      hour2, matchx.group("m2"), matchx.group("s2").replace(".", ","))
                srt += time
                block = 1
                subnr = False
                number_b += 1

            elif match3 and block == 0:
                # Cue identifier supplied by the source, outside a cue body.
                number = match3.group(1)
                srt += "%s\n" % number
                subnr = True
            else:
                if self.options.convert_subtitle_colors:
                    colors = {'30': '#000000', '31': '#ff0000', '32': '#00ff00', '33': '#ffff00',
                              '34': '#0000ff', '35': '#ff00ff', '36': '#00ffff', '37': '#ffffff'}
                    sub = i
                    for tag, color in colors.items():
                        regex1 = '<' + tag + '>'
                        replace = '<font color="' + color + '">'
                        sub = re.sub(regex1, replace, sub)

                    sub = re.sub('</.+>', '</font>', sub)
                else:
                    sub = re.sub('<[^>]*>', '', i)
                srt += sub.strip()
                srt += "\n"
        srt = decode_html_entities(srt)
        if is_py2:
            return srt.encode("utf-8")
        return srt