示例#1
0
 def download(cls, url, language, options):
     """
     Download CCs from the given ``url`` for the given ``language``,
     and return a raw string with the result.

     youtube_dl is asked to skip the media download and to write only
     the subtitles, in TTML format, next to a temporary path.
     The resulting file is read back as UTF-8 text.
     The temporary files are released on every exit path,
     including the failure ones.

     :param url: the URL passed to ``youtube_dl`` for downloading
     :param language: the language code requested via ``subtitleslangs``;
                      it is also part of the name of the output file
     :param options: a dict-like object; if ``cls.OPTION_AUTO`` is set
                     to a true value, the automatic subtitles are
                     requested instead of the regular ones
     :raises NotDownloadedError: if youtube_dl raises any exception,
                                 or if the expected output file
                                 has not been created
     """
     auto = options.get(cls.OPTION_AUTO, False)
     handler, tmp = gf.tmp_file()
     try:
         # only the path is needed as the output template:
         # remove the file created together with the handler
         if os.path.exists(tmp):
             os.remove(tmp)
         tmp = gf.to_unicode_string(tmp)
         # path of the subtitle file expected for this template and language
         out = u"%s.%s.ttml" % (tmp, language)
         ydl_options = {
             "outtmpl": tmp,
             "subtitlesformat": u"ttml",
             "subtitleslangs": [language],
             "writesubtitles": not auto,
             "writeautomaticsub": auto,
             "skip_download": True,
             "logger": YDLogger(),
         }
         try:
             with youtube_dl.YoutubeDL(ydl_options) as ydl:
                 ydl.download([url])
         except Exception:
             # any youtube_dl failure is reported as a download failure
             raise NotDownloadedError
         if not os.path.exists(out):
             raise NotDownloadedError
         try:
             with io.open(out, "r", encoding="utf-8") as out_file:
                 data = out_file.read()
         finally:
             # the output file exists at this point: always remove it
             gf.delete_file(None, out)
         return data
     finally:
         # release the temporary handler/path on success and on failure
         gf.delete_file(handler, tmp)
示例#2
0
 def from_code(cls, code):
     """
     Return the element of ``cls.ALL_LANGUAGES`` corresponding
     to the given ``code``, or ``None`` if there is no match.

     If ``code`` is already a ``LanguageObject`` contained in
     ``cls.ALL_LANGUAGES``, it is returned unchanged.
     Otherwise ``code`` is converted with ``gf.to_unicode_string``
     and the first element comparing equal to it is returned.
     """
     known = cls.ALL_LANGUAGES
     # fast path: the caller passed a known language object
     if isinstance(code, LanguageObject) and code in known:
         return code
     wanted = gf.to_unicode_string(code)
     # first known language equal to the requested code, if any
     return next((lang for lang in known if lang == wanted), None)
示例#3
0
    def predict(self, obj):
        """
        Predict the labels for the given ``obj``.

        ``obj`` is either a list of features or a Span.
        In the latter case, the features are obtained by calling
        ``sentence_to_features`` on the Span.

        Return a tuple ``(labels, probability)``, where ``labels``
        is the list of labels produced by ``self.tagger.tag``,
        each converted with ``gf.to_unicode_string``, and ``probability``
        is the value returned by ``self.tagger.probability``
        for that list of labels.

        :raises TypeError: if ``obj`` is neither a list nor a Span
        """
        # reject unsupported inputs before doing any work
        if not isinstance(obj, (list, Span)):
            raise TypeError(u"The obj should be either a Span (sentence) object or a list of features (dict) objects.")
        # a list is used as-is, a Span is converted to features first
        feats = obj if isinstance(obj, list) else sentence_to_features(obj)
        tagged = self.tagger.tag(feats)
        labels = [gf.to_unicode_string(label) for label in tagged]
        return labels, self.tagger.probability(labels)
示例#4
0
    def parse(cls, raw_data, language=None):
        """
        Parse the given ``raw_data`` string,
        and return a Document object.

        The ``<span>`` tags are removed, each ``<br/>`` is replaced
        by a placeholder, and runs of whitespace are collapsed,
        before the string is parsed as XML.
        Each TTML ``<p>`` element becomes a RawCCSpan with one
        RawCCLineSpan per line, and a time interval built from
        its ``begin`` and ``end`` attributes.

        :param raw_data: the TTML document, as a string
        :param language: the language to be used when no ``<tt>`` element
                         provides a usable ``xml:lang`` attribute
        :return: a Document whose ``raw`` is the parsed RawCCListSpan
        """
        # constants
        PLACEHOLDER_BR = u" ||| "
        PLACEHOLDER_NO_TEXT = u"()"
        PATTERN_SPAN_OPEN = re.compile(r"<span[^>]*>")
        PATTERN_SPAN_CLOSE = re.compile(r"</span>")
        PATTERN_BR = re.compile(r"<br[ ]*/>")
        PATTERN_SPACES = re.compile(r"\s+")
        TTML_NS = "{http://www.w3.org/ns/ttml}"
        TTML_TT = "%stt" % TTML_NS
        TTML_P = "%sp" % TTML_NS
        TTML_BEGIN = "begin"
        TTML_END = "end"
        XML_NS = "{http://www.w3.org/XML/1998/namespace}"
        XML_LANG = "%slang" % XML_NS

        # remove spans
        s = PATTERN_SPAN_OPEN.sub(u"", raw_data)
        s = PATTERN_SPAN_CLOSE.sub(u"", s)
        # replace br with placeholder
        s = PATTERN_BR.sub(PLACEHOLDER_BR, s)
        # remove duplicated spaces
        s = PATTERN_SPACES.sub(u" ", s).strip()

        # encode to utf-8 as required by lxml
        if gf.is_unicode(s):
            s = s.encode("utf-8")

        # create tree
        root = etree.fromstring(s)

        # parse language: the first usable xml:lang attribute wins,
        # otherwise the language given by the caller is kept
        xml_lang = language
        for elem in root.iter(TTML_TT):
            lang_attr = elem.get(XML_LANG)
            if lang_attr is None:
                # attribute missing: do not overwrite the fallback
                continue
            try:
                xml_lang = gf.to_unicode_string(lang_attr)
                break
            except Exception:
                # best effort: an unconvertible value is ignored
                continue

        raw_ccl = RawCCListSpan()

        # parse fragments
        for elem in root.iter(TTML_P):
            begin = gf.time_from_hhmmssmmm(elem.get(TTML_BEGIN).strip())
            end = gf.time_from_hhmmssmmm(elem.get(TTML_END).strip())
            # text missing
            text = elem.text
            if text is None:
                text = u""
            # strip leading/trailing spaces
            text = text.strip()
            # if no text is available, replace it with ()
            if text == u"":
                text = PLACEHOLDER_NO_TEXT
            # split lines if the <br/> is present
            lines = [line.strip() for line in text.split(PLACEHOLDER_BR)]
            # make sure we return unicode strings, dropping empty lines
            lines = [gf.to_unicode_string(line) for line in lines if len(line) > 0]
            # mark the end of each line with the end-of-line token
            lines = [u"%s %s" % (line, EndOfLineToken.RAW) for line in lines]
            # append span objects
            raw_ccl.append(
                RawCCSpan(
                    elements=[RawCCLineSpan(raw=line) for line in lines],
                    time_interval=TimeInterval(TimeValue(begin),
                                               TimeValue(end)),
                ))

        # create new Document object
        doc = Document(raw=raw_ccl, language=xml_lang)

        return doc