Example #1
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    available = available_languages(wordlist)
    best, score = langcodes.best_match(lang, list(available),
                                       min_score=match_cutoff)
    if score == 0:
        raise LookupError("No wordlist %r available for language %r"
                          % (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, best, langcodes.get(best).language_name('en'))
        )

    return read_cBpack(available[best])
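The docstring above leans on `langcodes` to reconcile tag variants. A minimal sketch of that behavior, independent of wordfreq (exact normalization may differ slightly between langcodes versions):

import langcodes

# All of these variants resolve to the base language 'pt'.
for tag in ['pt-BR', 'pt_br', 'PT_BR', 'por']:
    print(tag, '->', langcodes.get(tag).language)
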
Example #2
def get_frequency_list(lang, wordlist='best', match_cutoff=None):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    if match_cutoff is not None:
        warnings.warn("The `match_cutoff` parameter is deprecated",
                      DeprecationWarning)
    available = available_languages(wordlist)

    # TODO: decrease the maximum distance. This distance is so high just
    # because it allows a test where 'yue' matches 'zh', and maybe the
    # distance between those is high because they shouldn't match.
    best, _distance = langcodes.closest_match(lang,
                                              list(available),
                                              max_distance=70)
    if best == 'und':
        raise LookupError("No wordlist %r available for language %r" %
                          (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)." %
            (lang, best, langcodes.get(best).language_name('en')))

    return read_cBpack(available[best])
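This revision switches from `best_match` to `closest_match`, which returns the placeholder 'und' when nothing is acceptable. A hedged sketch of that call (newer langcodes API; exact distance values depend on the bundled CLDR data):

import langcodes

# A close match: 'pt-BR' should resolve to the supported 'pt' code.
match, distance = langcodes.closest_match('pt-BR', ['en', 'pt', 'zh'])
print(match, distance)

# Nothing acceptable within the distance budget: expect 'und'.
match, _ = langcodes.closest_match('ja', ['en', 'pt'], max_distance=10)
print(match)
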
Example #3
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    available = available_languages(wordlist)
    best, score = langcodes.best_match(lang,
                                       list(available),
                                       min_score=match_cutoff)
    if score == 0:
        raise LookupError("No wordlist %r available for language %r" %
                          (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)." %
            (lang, best, langcodes.get(best).language_name('en')))

    return read_cBpack(available[best])
Example #4
def valid_language(code):
    if not code or code == 'und' or '-pro' in code:
        return False
    if ALPHA3_RE.match(code):
        return True
    try:
        lcode = langcodes.get(code)
        return lcode.language is not None and len(lcode.language) <= 3
    except LanguageTagError:
        return False
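`valid_language` depends on module-level names (`ALPHA3_RE`, `LanguageTagError`), so only the langcodes part is sketched here; the exception's import path is an assumption about where langcodes defines it:

import langcodes
from langcodes.tag_parser import LanguageTagError  # assumed import path

# The language subtag of a long tag is still a short code, so it passes the check.
print(langcodes.get('en-Latn-US').language)   # 'en'

try:
    langcodes.get('not a language tag')       # malformed input
except LanguageTagError:
    print('rejected as an invalid tag')
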
Example #5
def valid_language(code):
    if not code or code == 'und' or '-pro' in code:
        return False
    if ALPHA3_RE.match(code):
        return True
    try:
        lcode = langcodes.get(code)
        return lcode.language is not None and len(lcode.language) <= 3
    except LanguageTagError:
        return False
Example #6
def get_population_data():
    import langcodes

    filename = data_filename("supplementalData.xml")
    root = ET.fromstring(open(filename).read())
    territories = root.findall("./territoryInfo/territory")

    language_population = defaultdict(int)
    language_writing_population = defaultdict(int)

    for territory in territories:
        t_code = territory.attrib['type']
        t_population = float(territory.attrib['population'])
        t_literacy_rate = float(territory.attrib['literacyPercent']) / 100

        for language in territory:
            attrs = language.attrib
            l_code = attrs['type'].replace('_', '-')
            l_proportion = float(attrs.get('populationPercent', 0)) / 100
            if 'writingPercent' in attrs:
                writing_prop = float(attrs['writingPercent']) / 100
            elif 'literacyPercent' in attrs:
                writing_prop = float(attrs['literacyPercent']) / 100
            else:
                writing_prop = t_literacy_rate

            l_population = t_population * l_proportion
            l_writing = t_population * l_proportion * writing_prop

            # Distinguish data in different territories, and also in different
            # scripts when necessary, while also accumulating more general data

            # We need to use maximize() on the bare language code, not just
            # assume_script(), because assumed defaults like 'zh-Hans' are unwritten
            # in the data. We need this if we want to count the relative use of
            # Simplified vs. Traditional Chinese, for example.
            written_ls = (
                langcodes.get(l_code).maximize()._filter_attributes(['language', 'script'])
            )
            written_lst = written_ls.update_dict({'territory': t_code})

            spoken_lt = written_lst._filter_attributes(['language', 'territory'])
            spoken_l = written_lst._filter_attributes(['language'])

            written_lt = written_lst._filter_attributes(['language', 'territory'])
            written_l = written_lst._filter_attributes(['language'])

            for lang in set([spoken_lt, spoken_l]):
                language_population[str(lang)] += int(round(l_population))

            for lang in set([written_lst, written_lt, written_ls, written_l]):
                language_writing_population[str(lang)] += int(round(l_writing))

    return language_population, language_writing_population
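A brief illustration of why the code above calls `maximize()`: it fills in the likely script and territory for a bare language code (values come from CLDR likely-subtags data, so they may shift between releases):

import langcodes

print(str(langcodes.get('zh').maximize()))   # typically 'zh-Hans-CN'
print(str(langcodes.get('sr').maximize()))   # typically 'sr-Cyrl-RS'
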
Example #7
def update_names(names_fwd, names_rev, name_quads):
    for name_language, referent, name, priority in name_quads:
        # Get just the language from name_language, not the territory or script.
        short_language = langcodes.get(name_language).language
        rev_all = names_rev.setdefault('und', {})
        rev_language = names_rev.setdefault(short_language, {})
        for rev_dict in (rev_all, rev_language):
            rev_dict.setdefault(normalize_name(name), []).append(
                (name_language, referent, priority))

        fwd_key = '{}@{}'.format(referent.lower(), name_language)
        if fwd_key not in names_fwd:
            names_fwd[fwd_key] = name
Example #8
def get_language_name(code):
    """
    Get the English name of a language ConceptNet supports.
    """
    if code == 'mul':
        return 'Multilingual'
    elif code == 'roa-opt':
        # langcodes just has no way to produce a name for an extlang code that
        # Wiktionary came up with, but it's in a lot of etymological data, so
        # we need to describe it correctly
        return 'Old Portuguese'
    else:
        if code in LCODE_ALIASES:
            code = LCODE_ALIASES[code]
        return langcodes.get(code, normalize=False).display_name()
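For reference, a hedged sketch of what `display_name()` returns (names come from the CLDR data bundled with langcodes):

import langcodes

print(langcodes.get('fr').display_name())      # 'French'
print(langcodes.get('en-US').display_name())   # 'English (United States)'
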
Example #9
def valid_language(code):
    """
    Check whether a language code refers to a language we could represent in
    ConceptNet.
    """
    if code is None:
        return False
    if not code or code == 'und' or '-pro' in code:
        return False
    if ALPHA3_RE.match(code):
        return True
    try:
        lcode = langcodes.get(code)
        return lcode.language is not None and len(lcode.language) <= 3
    except LanguageTagError:
        return False
Example #10
def valid_language(code):
    """
    Check whether a language code refers to a language we could represent in
    ConceptNet.
    """
    if code is None:
        return False
    if not code or code == 'und' or '-pro' in code:
        return False
    if ALPHA3_RE.match(code):
        return True
    try:
        lcode = langcodes.get(code)
        return lcode.language is not None and len(lcode.language) <= 3
    except LanguageTagError:
        return False
Example #11
def test_alpha2_to_alpha3():
    """
    Test that each valid alpha2 code has a corresponding, unique alpha3 code.
    """
    seen = set()
    for letter1 in string.ascii_lowercase:
        for letter2 in string.ascii_lowercase:
            code = letter1 + letter2
            language = langcodes.get(code, normalize=False)
            if language.is_valid():
                alpha3 = language.to_alpha3()

                # These four 2-letter codes exist only as aliases, and don't have
                # their own unique 3-letter codes. All other 2-letter codes should
                # uniquely map to 3-letter codes.
                if code not in {'in', 'iw', 'ji', 'jw'}:
                    assert alpha3 not in seen
                    seen.add(alpha3)
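A quick check of the mapping this test relies on, assuming a langcodes version that provides `Language.to_alpha3()`:

import langcodes

print(langcodes.get('fr', normalize=False).to_alpha3())   # 'fra'
print(langcodes.get('de', normalize=False).to_alpha3())   # 'deu'
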
Example #12
    def language(self, value):

        # Fixme: better ???
        invalid = False
        try:
            self._language = langcodes.find(value)
        except LookupError:
            try:
                self._language = langcodes.get(value)
            except:
                invalid = True
        except:
            invalid = True

        if invalid:
            self._language = ''
            if value:
                self._logger.warning('Unknown language {}'.format(value))
Example #13
def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #   Number of bytes of text: Int,
    #   Triples of detected languages in order of certainty:
    #       (Language name: str,
    #       Language code: str
    #       Percent of text in this language: float
    #       Confidence score: float))

    text = CLD2_BAD_CHARS_RE.sub('', text)
    lang = pycld2.detect(text)[2][0][1]

    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh'
    code = langcodes.get(lang).language
    return code
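The normalization step is plain langcodes behavior: deprecated or overly specific codes reduce to a modern base language. A small sketch (results depend on the registry data langcodes ships with):

import langcodes

print(langcodes.get('iw').language)        # 'he' (deprecated code for Hebrew)
print(langcodes.get('zh-Hant').language)   # 'zh'
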
Example #14
def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #   Number of bytes of text: Int,
    #   Triples of detected languages in order of certainty:
    #       (Language name: str,
    #       Language code: str
    #       Percent of text in this language: float
    #       Confidence score: float))

    text = CLD2_BAD_CHARS_RE.sub('', text)
    lang = pycld2.detect(text)[2][0][1]

    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh'
    code = langcodes.get(lang).language
    return code
Example #15
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.
    """
    parsed = parse_url(url)
    domain = parsed.netloc

    if domain == "dbpedia.org":
        # Handle old DBPedia URLs that had no language code
        lang = "en"
    else:
        domain_parts = domain.split(".", 1)
        if domain_parts[1] == "dbpedia.org":
            lang = domain_parts[0]

            # If we can't name this language in English, it's probably
            # not really a language.
            if langcodes.get(lang).language_name("en") == lang:
                return None
        else:
            return None

    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
Example #16
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.
    """
    parsed = parse_url(url)
    domain = parsed.netloc

    if domain == 'dbpedia.org':
        # Handle old DBPedia URLs that had no language code
        lang = 'en'
    else:
        domain_parts = domain.split('.', 1)
        if domain_parts[1] == 'dbpedia.org':
            lang = domain_parts[0]

            # If we can't name this language in English, it's probably
            # not really a language.
            if langcodes.get(lang).language_name('en') == lang:
                return None
        else:
            return None

    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
Example #17
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
Example #18
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
Example #19
import string
import langcodes

# Iterate through all 2- and 3-letter language codes, and for all languages
# that have enough data to represent their own name, show:
#
# - The original code
# - The code after normalization
# - The language's name in English
# - The language's name in that language (its autonym)

for let1 in string.ascii_lowercase:
    for let2 in string.ascii_lowercase:
        for let3 in [''] + list(string.ascii_lowercase):
            code = let1 + let2 + let3
            lcode = langcodes.get(code)
            autonym = lcode.autonym()
            name = lcode.language_name()
            if autonym != lcode.language:
                print('%-3s %-3s %-30s %s' % (code, lcode.language, name, autonym))
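A few concrete values this loop would print, assuming the CLDR name data used by langcodes is available:

import langcodes

print(langcodes.get('fr').language_name())   # 'French'
print(langcodes.get('fr').autonym())         # 'français'
print(langcodes.get('ja').autonym())         # '日本語'
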
Example #20
    for row in reader:
        d = Dict(lang=row['language'].strip(),
                 code=row['code'].split(',')[-1].strip())
        Langs.append(d)

# for l in Langs:
#     lcode = langcodes.get(l.code)
#     print(l.code, lcode.display_name(), lcode.autonym())

# #Langs = ["en_GB", "en_US", "fr_CA", "fr_FR", "de_DE", "ru_RU", "es_MX", "es_ES"]
#Langs.sort()

ltxt = '[\n'
for L in Langs:
    lng = L.code
    lcode = langcodes.get(lng)
    autonym = lcode.autonym()
    name = lcode.display_name()
    #ui = '"%1 / {auto}".arg(app.tr("{name}"))'.format(name=name, auto=autonym)
    ui = 'app.tr("{name}")'.format(name=name, auto=autonym)

    # fill all translations
    check_translations(lcode, name)

    if autonym == lng:
        print("Skipping since we don't know much about it: " + lng)
        continue

    ltxt += '  { "key": "%s", "name": %s },\n' % (lng, ui)
    print(lng, '/', autonym, '/', name, '/', L.lang)
Example #21
    def _process(self, resource, text, ratio=True):
        d = pq(text, parser='html')
        for img in d.items('img'):
            width = img.attr.width
            height = img.attr.height
            src = img.attr.src
            src = urllib.parse.unquote(src)
            if src is None:
                self.logger.warn("[%s] has an img tag without src attribute" %
                                 resource)
                continue
            if width is None or height is None:
                wh = self._size(resource, src, width, height)
                if wh is not None:
                    width, height = wh
                else:
                    width, height = None, None
            if width is not None:
                width, height = int(width), int(height)

            # Adapt width/height if this is a scaled image (an "@2x"-style asset)
            mo = re.match(r'.*@(\d+)x\.[^.]*$', src)
            if mo and width is not None:
                factor = int(mo.group(1))
                width //= factor
                height //= factor
                srcset = ['{} {}x'.format(src, factor)]
                for f in reversed(range(1, factor)):
                    tname = src.replace('@{}x.'.format(factor),
                                        '@{}x.'.format(f))
                    self._resize(src, os.path.basename(tname),
                                 float(f) / factor)
                    srcset.append('{} {}x'.format(tname, f))
                srcset = srcset[:-1]
                img.attr.src = tname
                img.attr.srcset = ','.join(srcset)

            # Put new width/height
            if width is not None:
                img.attr.width = '{}'.format(width)
                img.attr.height = '{}'.format(height)

            # If image is a SVG in /obj/, turns into an object
            if "/obj/" in src and src.endswith(".svg"):
                img[0].tag = 'object'
                img.attr("type", "image/svg+xml")
                img.attr("data", src)
                img.text('&#128444; {}'.format(img.attr.alt or ""))
                del img.attr.src
                del img.attr.alt

            # PDF files
            elif src.endswith('.pdf'):
                img[0].tag = 'object'
                img.attr("type", "application/pdf")
                options = "&".join([
                    f"{k}={v}" for k, v in dict(
                        toolbar=0,
                        navpanes=0,
                        scrollbar=0,
                        view="Fit",
                        # pdf.js in Firefox
                        zoom="page-fit",
                        pagemode="none").items()
                ])
                img.attr("data", f"{src}#{options}")
                fallback = pq('<a />')
                fallback.attr("href", src)
                fallback.text(img.attr.alt or "PDF")
                img.append(fallback)
                del img.attr.src
                del img.attr.alt

            # On-demand videos (should be in /videos)
            elif src.endswith('.m3u8'):
                id = os.path.splitext(os.path.basename(src))[0]
                img[0].tag = 'video'
                img[0].set("controls", None)
                img.attr("preload", "none")
                img.attr("crossorigin", "anonymous")
                img.attr(
                    "poster",
                    self.site.media_url('images/posters/{}.jpg'.format(id)))
                del img.attr.src
                del img.attr.alt
                # Add sources
                m3u8 = pq('<source>')
                m3u8.attr.src = self.site.media_url(
                    'videos/{}.m3u8'.format(id))
                m3u8.attr.type = 'application/vnd.apple.mpegurl'
                img.append(m3u8)
                progressive = pq('<source>')
                progressive.attr.src = self.site.media_url(
                    'videos/{}/progressive.mp4'.format(id))
                progressive.attr.type = 'video/mp4; codecs="mp4a.40.2,avc1.4d401f"'
                img.append(progressive)
                # Add subtitle tracks if any
                vtts = [
                    v for v in self.site.content.node_from_relative_path(
                        "media/videos").walk_resources()
                    if v.name.endswith('.vtt')
                    and v.name.startswith('{}.'.format(id))
                ]
                for vtt in vtts:
                    code = vtt.name[len(id) + 1:-4]
                    track = pq('<track>')
                    track.attr.src = self.site.media_url(vtt.relative_path[6:])
                    track.attr.kind = 'subtitles'
                    track.attr.srclang = code
                    if resource.meta.language == code:
                        track[0].set("default", None)
                    if '-' not in code:
                        track.attr.label = langcodes.get(code).autonym()
                    else:
                        details = langcodes.get(code).describe(code)
                        lang = details['language']
                        del details['language']
                        track.attr.label = u"{} ({})".format(
                            lang, u", ".join(details.values()))
                    img.append(track)

            # If image is a video not in /videos turn into a simple
            # video tag like an animated GIF.
            elif src.endswith(".mp4") or src.endswith(".ogv"):
                img[0].tag = 'video'
                for attr in 'muted loop autoplay playsinline controls'.split():
                    img[0].set(attr, None)
                del img.attr.alt

            # Lazy load
            if img[0].tag == "img" and width:
                lftext = img.parents('#lf-text')
                if lftext:
                    parents = img.parents()
                    rootEl = pq(parents[parents.index(lftext[0]) + 1])
                    if len(rootEl.prev_all()) > 3:
                        img.attr.loading = "lazy"
                    img.attr.decoding = "async"

            # If image is contained in a paragraph, enclose into a
            # responsive structure.
            parent = None
            parents = [p.tag for p in img.parents()]
            if parents[-1] == 'p':
                parent = img.parent()
            elif parents[-2:] == ['p', 'a']:
                parent = img.parent().parent()
            if parent and parent.contents().length == 1:
                img.addClass('lf-media')
                inner = pq('<span />')
                outer = pq('<div />')
                inner.addClass('lf-media-inner')
                outer.addClass('lf-media-outer')
                if width is not None and ratio:
                    inner.css.padding_bottom = '{:.3f}%'.format(
                        float(height) * 100. / width)
                    outer.css.width = '{}px'.format(width)
                outer.append(inner)

                # Check opacity
                if src in self.cache:
                    opaque = self.cache[src]['opaque']
                    if opaque:
                        img.addClass('lf-opaque')
                        try:
                            bg = "url({})".format(self.cache[src]["lqip"])
                            img.css("background-image", bg)
                        except KeyError:
                            pass

                # If we have a title, also enclose in a figure
                figure = pq('<figure />')
                if img.attr.title:
                    figcaption = pq('<figcaption />')
                    figcaption.html(img.attr.title)
                    del img.attr.title
                    figure.append(outer)
                    figure.append(figcaption)
                else:
                    figure.append(outer)

                # Put image in inner tag
                if img.parent()[0].tag == 'a':
                    inner.append(img.parent())
                else:
                    inner.append(img)
                # Replace parent with our enclosure
                parent.replace_with(
                    lxml.html.tostring(figure[0], encoding='unicode'))

        return d
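The subtitle-track branch above labels tracks either with an autonym or with `describe()` rendered in the track's own language. A hedged sketch of both calls (exact wording depends on the CLDR name data available):

import langcodes

print(langcodes.get('fr').autonym())              # 'français'
print(langcodes.get('fr-CA').describe('fr-CA'))   # e.g. {'language': 'français', 'territory': 'Canada'}
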
Example #22
def get_sorted_languages():
    return [(lang, langcodes.get(lang).autonym())
            for lang in SUPPORTED_LANGUAGE_CODES]
Example #23
 def parse_track(self, item):
     options = {}
     error = False
     original = item  # preserve for error messages
     item = item.replace('\r', ' ').replace('\n', ' ')
     try:
         head, _emptyStr, lang_kind, _emptyStr, tail = re.split(
             r"(^| )\((.*?)\)( |$)", item)
         lang_kind = lang_kind.split()  # split input into a list of words
         kinds = set(lang_kind) & set(('captions', 'descriptions',
                                       'chapters', 'metadata', 'subtitles'))
         # Find kind
         for kind in kinds:
             if 'kind' not in options: options['kind'] = kind
             else:
                 error = True
                 continue
             lang_kind.remove(kind)
         # Find language
         for lang in lang_kind:
             if 'language' not in options:
                 if langcodes.code_to_names(
                         'language',
                         langcodes.get(
                             langcodes.standardize_tag(lang)).language):
                     options['language'] = langcodes.standardize_tag(lang)
                 else:  # lang is not a lang code. Try interpreting as a language name
                     try:
                         options['language'] = str(langcodes.find(lang))
                     except:
                         error = True
                         continue
             else:
                 error = True
                 continue
         item = head + ' ' + tail
     except:
         error = True
     if 'kind' not in options: options['kind'] = 'subtitles'
     if 'language' not in options:
         try:
             options['language'] = langcodes.standardize_tag(getlocale()[0])
         except:
             options['language'] = 'en'
     # find label
     try:
         head, _emptyStr, _quote, label, _emptyStr, tail = re.split(
             r"""(^| )(["'])(.*?)\2( |$)""", item)
         if head and tail: error = True
         item = head + tail
         options['label'] = label.strip()
     except:
         try:
             options['label'] = options['kind'].capitalize(
             ) + ' in ' + langcodes.get(
                 options['language']).autonym().capitalize()
         except:
             error = True
             options['label'] = None
     # get filename
     options['src'] = self.uri_check(item)
     # return error
     if error:
         self.state_machine.reporter.error(
             'Error in "%s" directive: \n Problems encountered parsing track "%s" \n\n'
             'Guessing the following values: \n'
             'filename: "%s" \n'
             'kind: "%s" \n'
             'language: "%s" \n'
             'label: "%s" \n\n'
             'Track kinds should be chosen from one of the following: \n'
             'captions, descriptions, chapters, metadata, subtitles \n'
             'Track languages should be given as BCP 47 compliant language codes. \n'
             'Track declarations should take the following form: \n'
             'filename (kind language_code) "label"\n'
             'Tracks must have one filename and one language_code. \n'
             'If a kind is not specified, "subtitles" will be assumed. \n'
             'If a label is not provided, it will be auto-generated from the kind and language specified.'
             % (self.name, original, options['src'], options['kind'],
                options['language'], options['label']),
             nodes.literal_block(self.block_text, self.block_text),
             line=self.lineno)
     track_node = track(self.block_text, **options)
     return track_node
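`parse_track` combines two lookups: standardizing whatever code the author wrote, and falling back to resolving a language *name* with `langcodes.find()`. A hedged sketch of both:

import langcodes

print(langcodes.standardize_tag('eng_US'))   # 'en-US'
print(str(langcodes.find('French')))         # 'fr'
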
Example #24
def convert_lang_code(code):
    """
    Map a language code to the canonical one that ConceptNet 5 uses, using the
    'langcodes' library.
    """
    return str(langcodes.get(code))
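For example, the canonical forms this returns (hedged; exact output depends on the langcodes version and its registry data):

import langcodes

print(str(langcodes.get('pt_BR')))   # 'pt-BR'
print(str(langcodes.get('iw')))      # 'he'
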
Example #25
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    The text will be run through a number of pre-processing steps that vary
    by language; see the docstring of `wordfreq.preprocess.preprocess_text`.

    If `include_punctuation` is True, punctuation will be included as separate
    tokens. Otherwise, punctuation will be omitted in the output.

    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling. In Chinese, we use the Jieba tokenizer, with a custom word list
    to match the words whose frequencies we can look up. In Japanese and
    Korean, we use the MeCab tokenizer.

    The `external_wordlist` option only affects Chinese tokenization.  If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.
    """
    # Use globals to load CJK tokenizers on demand, so that we can still run
    # in environments that lack the CJK dependencies
    global _mecab_tokenize, _jieba_tokenize

    language = langcodes.get(lang)
    info = get_language_info(language)
    text = preprocess_text(text, language)

    if info['tokenizer'] == 'mecab':
        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
        # Get just the language code out of the Language object, so we can
        # use it to select a MeCab dictionary
        tokens = _mecab_tokenize(text, language.language)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    elif info['tokenizer'] == 'jieba':
        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    else:
        # This is the default case where we use the regex tokenizer. First
        # let's complain a bit if we ended up here because we don't have an
        # appropriate tokenizer.
        if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
            logger.warning(
                "The language '{}' is in the '{}' script, which we don't "
                "have a tokenizer for. The results will be bad."
                .format(lang, info['script'])
            )
            _WARNED_LANGUAGES.add(lang)
        tokens = simple_tokenize(text, include_punctuation=include_punctuation)

    return tokens
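A hypothetical call to this function through the public wordfreq package (assuming wordfreq is installed; the Japanese line additionally needs the optional MeCab dependencies):

from wordfreq import tokenize

print(tokenize("Hasn't this been tokenized?", 'en'))
print(tokenize('これはテストです', 'ja'))   # needs wordfreq's optional MeCab packages
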
Example #26
    def upload(self, file, lang):
        """
        Call the youtube API and push the file to youtube
        :param file: file to upload
        :param lang: language of the file
        :return:
        """
        # todo split up event creation and upload
        # todo change function name
        # todo add the license properly

        title = self._build_title(lang)
        if self.t.subtitle:
            subtitle = self.t.subtitle
        else:
            subtitle = ''
        if self.t.abstract:
            abstract = self.strip_tags(self.t.abstract)
        else:
            abstract = ''

        if self.t.description:
            description = self.strip_tags(self.t.description)
        else:
            description = ''

        if self.t.url:
            if self.t.url.startswith('//'):
                url = 'https:' + self.t.url
            else:
                url = self.t.url
        else:
            url = ''

        topline = [
            "#" + x.replace(' ', '') for x in [self.t.acronym, self.t.track]
            if x
        ]
        if self.t.acronym and lang and lang != self.t.languages[0]:
            topline.append(("#" + self.t.acronym + lang).replace(' ', ''))

        description = '\n\n'.join([
            subtitle, abstract, description, ' '.join(self.t.people), url,
            ' '.join(topline)
        ])
        description = self.strip_tags(description)

        if self.t.voctoweb_url:
            description = os.path.join(self.t.voctoweb_url,
                                       self.t.slug) + '\n\n' + description

        if self.t.youtube_privacy:
            privacy = self.t.youtube_privacy
        else:
            privacy = 'private'

        license = self.t.get_raw_property('Meta.License')
        if license and 'https://creativecommons.org/licenses/by' in license:
            license = 'creativeCommon'
        else:
            license = 'youtube'

        metadata = {
            'snippet': {
                'title':
                title,
                # YouTube does not allow <> in description -> escape them
                'description':
                description.replace('<', '&lt;').replace('>', '&gt;'),
                'channelId':
                self.channelId,
                'tags':
                self._select_tags(lang),
                'defaultLanguage':
                langcodes.get(self.t.languages[0]).language,
                'defaultAudioLanguage':
                langcodes.get(lang or self.t.languages[0]).language,
            },
            'status': {
                'privacyStatus': privacy,
                'embeddable': True,
                'publicStatsViewable': True,
                'license': license,
            },
            'recordingDetails': {
                'recordingDate': self.t.date,
            },
        }

        # limit title length to 100 (YouTube api conformity)
        metadata['snippet']['title'] = metadata['snippet']['title'][:100]
        # limit Description length to 5000 (YouTube api conformity)
        metadata['snippet']['description'] = metadata['snippet'][
            'description'][:5000]

        if self.t.youtube_category:
            metadata['snippet']['categoryId'] = int(self.t.youtube_category)

        (mimetype, encoding) = mimetypes.guess_type(file)
        size = os.stat(file).st_size

        logging.debug(
            'guessed mime type for file %s as %s and its size as %u bytes' %
            (file, mimetype, size))

        # https://developers.google.com/youtube/v3/docs/videos#resource
        r = requests.post(
            'https://www.googleapis.com/upload/youtube/v3/videos',
            params={
                'uploadType': 'resumable',
                'part': 'snippet,status,recordingDetails'
            },
            headers={
                'Authorization': 'Bearer ' + self.accessToken,
                'Content-Type': 'application/json; charset=UTF-8',
                'X-Upload-Content-Type': mimetype,
                'X-Upload-Content-Length': str(size),
            },
            data=json.dumps(metadata))

        if 200 != r.status_code:
            if 400 == r.status_code:
                raise YouTubeException(r.json()['error']['message'] + '\n' +
                                       r.text + '\n\n' +
                                       json.dumps(metadata, indent=2))
            else:
                raise YouTubeException(
                    'Video creation failed with error-code %u: %s' %
                    (r.status_code, r.text))

        if 'location' not in r.headers:
            raise YouTubeException(
                'Video creation did not return a location-header to upload to: %s'
                % (r.headers, ))

        logging.info(
            'successfully created video and received upload-url from %s' %
            (r.headers['server'] if 'server' in r.headers else '-'))
        logging.debug('uploading video-data to %s' % r.headers['location'])

        with open(file, 'rb') as fp:
            upload = requests.put(r.headers['location'],
                                  headers={
                                      'Authorization':
                                      'Bearer ' + self.accessToken,
                                      'Content-Type': mimetype,
                                  },
                                  data=fp)

            if 200 != upload.status_code and 201 != upload.status_code:
                raise YouTubeException(
                    'uploading video failed with error-code %u: %s' %
                    (upload.status_code, upload.text))

        video = upload.json()

        outjpg = os.path.join(self.t.publishing_path,
                              self.t.local_filename_base + '_youtube.jpg')

        try:
            r = subprocess.check_output(
                'ffmpeg -loglevel error -i ' + self.thumbnail.path +
                ' -f image2 -vcodec mjpeg -pix_fmt yuv420p -q:v 0 -y ' +
                outjpg,
                shell=True)
            logging.info("thumbnails reformatted for youtube")
        except Exception as e_:
            raise YouTubeException("Could not scale thumbnail: " +
                                   r.decode('utf-8')) from e_

        YoutubeAPI.update_thumbnail(self.accessToken, video['id'], outjpg)

        youtube_url = 'https://www.youtube.com/watch?v=' + video['id']
        logging.info('successfully uploaded video as %s', youtube_url)

        return video['id']
Example #27
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    The text will be run through a number of pre-processing steps that vary
    by language; see the docstring of `wordfreq.preprocess.preprocess_text`.

    If `include_punctuation` is True, punctuation will be included as separate
    tokens. Otherwise, punctuation will be omitted in the output.

    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling. In Chinese, we use the Jieba tokenizer, with a custom word list
    to match the words whose frequencies we can look up. In Japanese and
    Korean, we use the MeCab tokenizer.

    The `external_wordlist` option only affects Chinese tokenization.  If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.
    """
    # Use globals to load CJK tokenizers on demand, so that we can still run
    # in environments that lack the CJK dependencies
    global _mecab_tokenize, _jieba_tokenize

    language = langcodes.get(lang)
    info = get_language_info(language)
    text = preprocess_text(text, language)

    if info['tokenizer'] == 'mecab':
        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize

        # Get just the language code out of the Language object, so we can
        # use it to select a MeCab dictionary
        tokens = _mecab_tokenize(text, language.language)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    elif info['tokenizer'] == 'jieba':
        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize

        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    else:
        # This is the default case where we use the regex tokenizer. First
        # let's complain a bit if we ended up here because we don't have an
        # appropriate tokenizer.
        if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
            logger.warning(
                "The language '{}' is in the '{}' script, which we don't "
                "have a tokenizer for. The results will be bad.".format(
                    lang, info['script']))
            _WARNED_LANGUAGES.add(lang)
        tokens = simple_tokenize(text, include_punctuation=include_punctuation)

    return tokens
Example #28
def test_updated_iana():
    aqk = langcodes.get('aqk')
    assert aqk.language_name('en') == 'Aninka'
Example #29
def _language_code_to_name(code):
    return langcodes.get(code).language_name('en')
Example #30
import string
import langcodes

# Iterate through all 2- and 3-letter language codes, and for all languages
# that have enough data to represent their own name, show:
#
# - The original code
# - The code after normalization
# - The language's name in English
# - The language's name in that language (its autonym)

for let1 in string.ascii_lowercase:
    for let2 in string.ascii_lowercase:
        for let3 in [''] + list(string.ascii_lowercase):
            code = let1 + let2 + let3
            lcode = langcodes.get(code)
            autonym = lcode.autonym()
            name = lcode.language_name()
            if autonym != lcode.language:
                print('%-3s %-3s %-30s %s' %
                      (code, lcode.language, name, autonym))
Example #31
    valid_set=list(known_multilingualities.keys()),
    format_func=lambda m: f"{m} : {known_multilingualities[m]}",
)

if "other" in state["multilinguality"]:
    other_multilinguality = leftcol.text_input(
        "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
        value="my-multilinguality",
    )
    leftcol.write(f"Registering other-{other_multilinguality} multilinguality")
    state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"

valid_values, invalid_values = list(), list()
for langtag in state["languages"]:
    try:
        lc.get(langtag)
        valid_values.append(langtag)
    except:
        invalid_values.append(langtag)
leftcol.markdown("#### Languages")
if len(invalid_values) > 0:
    leftcol.markdown("Found the following invalid values:")
    leftcol.error(invalid_values)

langtags = leftcol.text_area(
    "What languages are represented in the dataset? expected format is BCP47 tags separated for ';' e.g. 'en-US;fr-FR'",
    value=";".join(valid_values),
)
state["languages"] = langtags.strip().split(";") if langtags.strip() != "" else []

Example #32
def subtitles(subtitles,
              wikifilename,
              username,
              statuscallback=None,
              errorcallback=None):
    """Convert and upload subtitles to corresponding TimedText pages."""
    statuscallback = statuscallback or (lambda text, percent: None)
    errorcallback = errorcallback or (lambda text: None)

    percent = 0
    c = Converter(ffmpeg_path='/usr/bin/ffmpeg',
                  ffprobe_path='/usr/bin/ffprobe')

    for langcode, filename in subtitles.items():
        try:
            lang = langcodes.get(langcode)
            langcode = str(lang).lower()

            langdesc = lang.describe()
            langname = langdesc['language']
            del langdesc['language']
            if langdesc:
                langname += u' (%s)' % ', '.join(langdesc.values())

            statuscallback(u'Loading subtitle in ' + langname, int(percent))
            subtitletext = ''

            info = c.probe(filename)
            if not info:
                continue
            if len(info.streams) != 1:
                continue
            if info.streams[0].type != 'subtitle':
                continue
            format = info.streams[0].codec

            if format.lower() != 'srt':
                target = filename + '.srt'
                cmd = ['/usr/bin/ffmpeg', '-i', filename, '-f', 'srt', target]
                statuscallback("Running cmd: %s" % cmd, None)
                subprocess.check_call(cmd, stderr=None)
                filename = target

            f = open(filename)
            subtitletext = f.read()
            f.close()
            subtitletext = subtitletext.decode(
                chardet.detect(subtitletext)['encoding'])

            percent += 50.0 / len(subtitles)
            statuscallback('Uploading subtitle in ' + langname, int(percent))

            # ENSURE PYWIKIBOT OAUTH PROPERLY CONFIGURED!
            site = pywikibot.Site('commons', 'commons', user=username)
            page = pywikibot.Page(
                site, u'TimedText:' + wikifilename.decode('utf-8') + u'.' +
                langcode.lower() + u'.srt')
            page.text = subtitletext
            if not page.exists():
                page.save(summary=u'Import ' + langname + u' subtitles for ' +
                          '[[:File:' + wikifilename.decode('utf-8') + ']]',
                          minor=False)

            percent += 50.0 / len(subtitles)
            statuscallback('Finished processing subtitle in ' + langname,
                           int(percent))
        except TaskAbort:
            raise
        except Exception as e:
            statuscallback(type(e).__name__ + ": " + str(e), None)
            pass
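A hedged illustration of the `describe()` dictionary that the loop above turns into a human-readable label such as "Portuguese (Brazil)":

import langcodes

desc = langcodes.get('pt-BR').describe()
print(desc)   # e.g. {'language': 'Portuguese', 'territory': 'Brazil'}
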
Example #33
def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
             combine_numbers=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Some of the processing steps are specific to one language, such as Chinese,
    but what broadly happens to the text depends on what general writing system
    the language uses, out of these categories:

    - Alphabetic scripts: English, Spanish, Russian, etc.
    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
    - CJK scripts: Chinese, Japanese, Korean
    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

    The options `include_punctuation`, `external_wordlist`, and
    `combine_numbers` are passed on to the appropriate tokenizer:

    - `include_punctuation` preserves punctuation as tokens, instead of
      removing it.

    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
      instead of wordfreq's wordlist.

    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.


    Alphabetic scripts
    ------------------

    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
    European languages, which are relatively straightforward to tokenize.

    Text in these scripts will be normalized to NFC form, then passed
    through a regular expression that implements the Word Segmentation section
    of Unicode Annex #29, and then case-folded to lowercase.

    The effect is mostly to split the text on spaces and punctuation. There are
    some subtleties involving apostrophes inside words, which the regex will
    only split when they occur before a vowel. ("Hasn't" is one token, but
    "l'enfant" is two.)

    If the language is Turkish, the case-folding rules will take this into
    account, so that capital I and İ map to ı and i respectively.


    Abjad scripts
    -------------

    Languages in the Arabic or Hebrew scripts are written with optional vowel
    marks, and sometimes other decorative markings and ligatures. In these
    languages:

    - The text will be NFKC-normalized, which is a stronger and lossier form
      than NFC. Here its purpose is to reduce ligatures to simpler characters.

    - Marks will be removed, as well as the Arabic tatweel (an extension of
      a word that is used for justification or decoration).

    After these steps, the text will go through the same process as the
    alphabetic scripts above.


    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling.

    - Chinese text first gets converted to a canonical representation we call
      "Oversimplified Chinese", where all characters are replaced by their
      Simplified Chinese form, no matter what, even when this misspells a word or
      a name. This representation is then tokenized using the Jieba tokenizer,
      trained on the list of Chinese words that can be looked up in wordfreq.

    - Japanese and Korean will be NFKC-normalized, then tokenized using the
      MeCab tokenizer, using dictionary files that are included in this
      package.

    The `external_wordlist` option only affects Chinese tokenization.  If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.


    Brahmic scripts and other languages
    -----------------------------------

    Any kind of language not previously mentioned will just go through the same
    tokenizer that alphabetic languages use.

    We've tweaked this tokenizer for the case of Indic languages in Brahmic
    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
    languages where the default Unicode algorithm wouldn't quite work.

    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
    does not support these languages yet. It will split on spaces and
    punctuation, giving tokens that are far too long.
    """
    # Reduce whatever language code was passed in to a normal form,
    # containing just the language subtag.
    lang = langcodes.get(lang).prefer_macrolanguage().language
    if lang == 'ja' or lang == 'ko':
        result = tokenize_mecab_language(text, lang, include_punctuation)
    elif lang == 'zh' or lang == 'yue':
        result = chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
    elif lang == 'sr':
        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        result = simple_tokenize(text, include_punctuation)
    else:
        result = simple_tokenize(text, include_punctuation)

    if combine_numbers:
        result = [smash_numbers(token) for token in result]
    return result
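The first line of the function body reduces whatever tag was passed in to a bare language subtag, preferring macrolanguages. A small sketch, grounded in the `zh`/`yue` branch above (exact behavior depends on langcodes' registry data):

import langcodes

print(langcodes.get('cmn-Hans-CN').prefer_macrolanguage().language)   # 'zh'
print(langcodes.get('yue').prefer_macrolanguage().language)           # 'yue' (kept distinct from 'zh')
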
Example #34
def _language_code_to_name(code):
    return langcodes.get(code).language_name('en')
Example #35
def get_sorted_languages():
    return [
        (lang, langcodes.get(lang).autonym())
        for lang in SUPPORTED_LANGUAGE_CODES
    ]
Example #36
def test_cldr_v40():
    en = langcodes.get('en')
    assert en.language_name('dsb') == 'engelšćina'
Example #37
import string
import langcodes

# Iterate through all 2- and 3-letter language codes, and for all languages
# that have enough data to represent their own name, show:
#
# - The original code
# - The code after normalization
# - The language's name in English
# - The language's name in that language (its autonym)

en = langcodes.get('en')

for let1 in string.ascii_lowercase:
    for let2 in string.ascii_lowercase:
        for let3 in [''] + list(string.ascii_lowercase):
            code = let1 + let2 + let3
            lcode = langcodes.get(code)
            if lcode.has_name_data():
                autonym = lcode.autonym()
                name = lcode.language_name()
                print('%-3s %-3s %-30s %s' %
                      (code, lcode.language, name, autonym))
Example #38
    def _process(self, resource, text, ratio=True):
        d = pq(text, parser='html')
        for img in d.items('img'):
            width = img.attr.width
            height = img.attr.height
            src = img.attr.src
            src = urllib.parse.unquote(src)
            if src is None:
                self.logger.warn(
                    "[%s] has an img tag without src attribute" % resource)
                continue
            if width is None or height is None:
                wh = self._size(resource, src, width, height)
                if wh is not None:
                    width, height = wh
                else:
                    width, height = None, None
            if width is not None:
                width, height = int(width), int(height)

            # Adapt width/height if this is a scaled image (an "@2x"-style asset)
            mo = re.match(r'.*@(\d+)x\.[^.]*$', src)
            if mo and width is not None:
                factor = int(mo.group(1))
                width //= factor
                height //= factor
                srcset = ['{} {}x'.format(src, factor)]
                for f in reversed(range(1, factor)):
                    tname = src.replace('@{}x.'.format(factor),
                                        '@{}x.'.format(f))
                    self._resize(src, os.path.basename(tname), float(f)/factor)
                    srcset.append('{} {}x'.format(tname, f))
                srcset = srcset[:-1]
                img.attr.src = tname
                img.attr.srcset = ','.join(srcset)

            # Put new width/height
            if width is not None:
                img.attr.width = '{}'.format(width)
                img.attr.height = '{}'.format(height)

            # If image is a SVG in /obj/, turns into an object
            if "/obj/" in src and src.endswith(".svg"):
                img[0].tag = 'object'
                img.attr("type", "image/svg+xml")
                img.attr("data", src)
                del img.attr.src
                img.text('&#128444; {}'.format(img.attr.alt or ""))

            # On-demand videos (should be in /videos)
            elif src.endswith('.m3u8'):
                id = os.path.splitext(os.path.basename(src))[0]
                img[0].tag = 'video'
                img[0].set("controls", None)
                img.attr("preload", "none")
                img.attr("crossorigin", "anonymous")
                img.attr("poster", self.site.media_url(
                    'images/posters/{}.jpg'.format(id)))
                del img.attr.src
                del img.attr.alt
                # Add sources
                m3u8 = pq('<source>')
                m3u8.attr.src = self.site.media_url(
                    'videos/{}.m3u8'.format(id))
                m3u8.attr.type = 'application/vnd.apple.mpegurl'
                img.append(m3u8)
                progressive = pq('<source>')
                progressive.attr.src = self.site.media_url(
                    'videos/{}/progressive.mp4'.format(id))
                progressive.attr.type = 'video/mp4; codecs="avc1.4d401f, mp4a.40.2"'
                img.append(progressive)
                # Add subtitle tracks if any
                vtts = [v
                        for v in self.site.content.node_from_relative_path(
                                "media/videos").walk_resources()
                        if v.name.endswith('.vtt')
                        and v.name.startswith('{}.'.format(id))]
                for vtt in vtts:
                    code = vtt.name[len(id)+1:-4]
                    track = pq('<track>')
                    track.attr.src = self.site.media_url(vtt.relative_path[6:])
                    track.attr.kind = 'subtitles'
                    track.attr.srclang = code
                    if resource.meta.language == code:
                        track[0].set("default", None)
                    if '-' not in code:
                        track.attr.label = langcodes.get(code).autonym()
                    else:
                        details = langcodes.get(code).describe(code)
                        lang = details['language']
                        del details['language']
                        track.attr.label = u"{} ({})".format(
                            lang, u", ".join(details.values()))
                    img.append(track)

            # If image is a video not in /videos turn into a simple
            # video tag like an animated GIF.
            elif src.endswith(".mp4") or src.endswith(".ogv"):
                img[0].tag = 'video'
                for attr in {'muted', 'loop', 'autoplay', 'playsinline', 'controls'}:
                    img[0].set(attr, None)
                del img.attr.alt

            # Lazy load if we have a large enough image
            if img[0].tag == "img" and width and width*height > 20000:
                img.attr.loading = "lazy"

            # If image is contained in a paragraph, enclose into a
            # responsive structure.
            parent = None
            parents = [p.tag for p in img.parents()]
            if parents[-1] == 'p':
                parent = img.parent()
            elif parents[-2:] == ['p', 'a']:
                parent = img.parent().parent()
            if parent:
                img.addClass('lf-media')
                inner = pq('<span />')
                outer = pq('<div />')
                inner.addClass('lf-media-inner')
                outer.addClass('lf-media-outer')
                if width is not None and ratio:
                    inner.css.padding_bottom = '{:.3f}%'.format(
                        float(height)*100./width)
                    outer.css.width = '{}px'.format(width)
                outer.append(inner)

                # If we have a title, also enclose in a figure
                figure = pq('<figure />')
                if img.attr.title:
                    figcaption = pq('<figcaption />')
                    figcaption.html(img.attr.title)
                    del img.attr.title
                    figure.append(outer)
                    figure.append(figcaption)
                else:
                    figure.append(outer)

                # Put image in inner tag
                if img.parent()[0].tag == 'a':
                    inner.append(img.parent())
                else:
                    inner.append(img)
                # Replace parent with our enclosure
                parent.replace_with(lxml.html.tostring(figure[0],
                                                       encoding='unicode'))

        return d