Пример #1
0
def check_wiktionary_language(target_lang):
    """Assert that no two Wiktionary language names resolve to one code.

    Walks the names in ``WIKT_LANGUAGE_NAMES[target_lang]``, ignoring those
    that start with ``'Proto-'``, and resolves each one with
    ``langcodes.find``. An ``AssertionError`` naming both language names is
    raised as soon as a code comes up a second time.
    """
    name_by_code = {}
    candidates = (
        name for name in WIKT_LANGUAGE_NAMES[target_lang]
        if not name.startswith('Proto-')
    )
    for name in candidates:
        tag = str(langcodes.find(name))
        assert tag not in name_by_code, \
            "%r and %r have the same code" % (name_by_code[tag], name)
        name_by_code[tag] = name
Пример #2
0
def language_to_code(language_name):
    """
    Return the ``language`` attribute of the ``langcodes.find`` match.

    :param language_name: text handed to ``langcodes.find``
    :return: the matched object's ``language`` attribute, or ``""`` when the
        lookup raises ``LookupError``
    """
    try:
        return langcodes.find(language_name).language
    except LookupError:
        return ""
Пример #3
0
 async def voice(self, ctx, ttvoice):
     # Set ``self.ttsvoice`` from ``ttvoice`` and report the outcome on
     # ``ctx``. A two-character value is treated as a language code; anything
     # else is looked up as a language name.
     # NOTE: documented with comments rather than a docstring so that no new
     # runtime-visible ``__doc__`` is introduced on this handler.
     if len(ttvoice) == 2:
         lang = langcodes.Language.make(language=ttvoice)
     else:
         try:
             lang = langcodes.find(ttvoice)
         except LookupError:
             # langcodes.find raises LookupError when no language matches the
             # name; previously that escaped and the error reply below was
             # never sent for name input.
             await ctx.send('Incorrect language code.')
             return
     # A display name of "Unknown language [<input>]" is treated as an
     # unrecognised code.
     if str(lang.display_name()) != f'Unknown language [{ttvoice}]':
         self.ttsvoice = str(lang.language)
         await ctx.send('Voice changed to %s' % lang.display_name())
     else:
         await ctx.send('Incorrect language code.')
Пример #4
0
def yake_keywords(
    texts: List[str],
    language: str = "slovenian",
    max_len: int = 1,
    stopwords: List[str] = None,
):
    """Run one YAKE keyword extractor over every text.

    Args:
        texts: Texts to process.
        language: Full language name; it is resolved with ``langcodes.find``
            because the extractor is given a language code, not a name.
        max_len: Forwarded to ``yake.KeywordExtractor`` as ``n``.
        stopwords: Forwarded to ``yake.KeywordExtractor`` unchanged.

    Returns:
        A list holding one ``extract_keywords`` result per input text, in
        input order.
    """
    language_code = langcodes.find(language).language
    extractor = yake.KeywordExtractor(
        lan=language_code, n=max_len, stopwords=stopwords
    )
    keywords_per_text = []
    for text in texts:
        keywords_per_text.append(extractor.extract_keywords(text))
    return keywords_per_text
Пример #5
0
 def tll(self, message, language, *args):
     # Translate the text of ``message`` into ``language`` and send the
     # result to the channel the message came from.
     #
     # ``language`` is resolved with ``langcodes.find``; if that fails, an
     # explanatory message is sent instead and nothing is translated.
     # ``args`` is accepted but not used here.
     try:
         language_code = str(langcodes.find(language))
     except LookupError:
         yield from self.client.send_message(message.channel,
                                             'Could not find language')
         return
     except Exception:
         # Was a bare ``except:``, which also trapped KeyboardInterrupt and
         # SystemExit; only ordinary errors should be reported to the user.
         yield from self.client.send_message(
             message.channel, 'A language lookup error occured')
         return
     # The text to translate is everything after the first two
     # space-separated tokens of the raw message content.
     yield from self.client.send_message(
         message.channel,
         translate.translate(
             message.content.split(' ', 2)[2], language_code))
Пример #6
0
    def _create_language(self, movie_uri, lang):
        """Add a language resource to the graph and return its URI.

        The URI is ``self.baseURI`` indexed by the turtle-formatted language
        name. Two triples are added to ``self.g``: the resource's RDF type
        (``self.dbpedia.Language``) and its ``languageCode`` literal.
        ``movie_uri`` is accepted but not used here.
        """
        lang_key = to_turtle_fmt(lang)

        # Special case: per the original author, "Castellano" is the only
        # name the lookup does not detect, so it is mapped by hand.
        lang_code = (
            "es" if lang_key == "Castellano"
            else langcodes.find(lang).language
        )

        uri = self.baseURI[lang_key]
        self.g.add((uri, RDF.type, self.dbpedia.Language))
        self.g.add((uri, self.dbpedia.languageCode, Literal(lang_code)))
        return uri
Пример #7
0
def languageDetect(lang):
    """Map a language name to one of the regional tags used for analysis.

    The name is resolved with ``langcodes.find``. ``'en'`` becomes
    ``'en-GB'``; ``'es'``, ``'ca'``, ``'eu'`` and ``'gl'`` get ``'-ES'``
    appended. Any other resolved code raises ``Exception``.
    """
    code = langcodes.find(lang).language
    if code == 'en':
        return code + '-GB'
    if code in ('es', 'ca', 'eu', 'gl'):
        return code + '-ES'
    raise Exception(
        'Sorry, language not supported for analysis.\nPlease, select a valid language.'
    )
Пример #8
0
    def language(self, value):
        """Store the language resolved from ``value`` in ``self._language``.

        ``value`` is first looked up as a language name with
        ``langcodes.find``. If that raises ``LookupError`` it is retried as a
        tag with ``langcodes.get``. When neither works, ``self._language`` is
        set to ``''`` and, for a non-empty ``value``, a warning is logged
        through ``self._logger``.
        """
        # TODO: find a cleaner way to resolve name-or-tag input.
        invalid = False
        try:
            self._language = langcodes.find(value)
        except LookupError:
            # Not a known name: fall back to parsing it as a language tag.
            try:
                self._language = langcodes.get(value)
            except Exception:
                invalid = True
        except Exception:
            # Any other lookup failure (e.g. an unsuitable ``value`` type) is
            # treated as an unknown language. These handlers were bare
            # ``except:`` clauses, which also trapped KeyboardInterrupt and
            # SystemExit.
            invalid = True

        if invalid:
            self._language = ''
            if value:
                self._logger.warning('Unknown language {}'.format(value))
 def extract_lang(self, lang: str) -> Optional[str]:
     """
     Extract language code from raw text

     :param lang: language raw text
     :return: the ``language`` attribute of the ``langcodes.find`` match, or
         ``None`` when the text is blank or the language is unknown
     :rtype: str or None
     """
     lng = lang.strip()
     if not lng:
         return None
     try:
         return langcodes.find(lng).language
     except LookupError:
         # Report each unknown language only once.
         if lng not in self.missing_languages:
             self.missing_languages.add(lng)
             print("unknown language: {}".format(lng))
     return None
Пример #10
0
import langcodes

from string import punctuation
import os
import logging
import codecs

from six import string_types

from builtins import str


# The language management should be in `pke.utils` but it would create a circular import.

def get_alpha_2(l):
    """Return the ``language`` attribute of ``langcodes.find(l)``."""
    return langcodes.find(l).language

# lang_stopwords = {get_alpha_2(l): l for l in stopwords._fileids}

# Resolved code -> Snowball stemmer name for every Snowball language except
# 'porter'; 'en' is then pointed at the 'porter' stemmer.
lang_stem = {
    get_alpha_2(stemmer_name): stemmer_name
    for stemmer_name in set(SnowballStemmer.languages) - {'porter'}
}
lang_stem['en'] = 'porter'

# Per-key flags that default to True on first access.
PRINT_NO_STEM_WARNING = defaultdict(lambda: True)
PRINT_NO_STWO_WARNING = defaultdict(lambda: True)


def get_stopwords(lang):
    """Provide stopwords for the given language, or default value.

    If stopwords are not available for a given language, a default value is
    returned and a warning is displayed
Пример #11
0
 def parse_track(self, item):
     """Parse one track declaration and return a ``track`` node for it.

     The declaration is expected in the form
     ``filename (kind language_code) "label"``. Missing parts are filled in:
     the kind falls back to ``'subtitles'``, the language to the current
     locale (or ``'en'``), and the label to ``"<Kind> in <autonym>"``.

     Parsing is best-effort. Any problem sets an error flag, after which an
     error is reported through ``self.state_machine.reporter.error`` listing
     the values that were guessed; the node is still built and returned.

     :param item: raw text of a single track declaration
     :return: ``track`` node built from ``self.block_text`` and the parsed
         ``kind``, ``language``, ``label`` and ``src`` options
     """
     options = {}
     error = False
     original = item  # preserve for error messages
     item = item.replace('\r', ' ').replace('\n', ' ')
     try:
         # The pattern has three groups, so the unpacking succeeds only when
         # exactly one "(...)" group is found; otherwise the ValueError is
         # caught below and flagged as an error.
         head, _emptyStr, lang_kind, _emptyStr, tail = re.split(
             r"(^| )\((.*?)\)( |$)", item)
         lang_kind = lang_kind.split()  # split input into a list of words
         kinds = set(lang_kind) & set(('captions', 'descriptions',
                                       'chapters', 'metadata', 'subtitles'))
         # Find kind
         for kind in kinds:
             if 'kind' not in options:
                 options['kind'] = kind
             else:
                 # A second kind word is an error; it is left in lang_kind
                 # and so is also seen by the language loop below.
                 error = True
                 continue
             lang_kind.remove(kind)
         # Find language
         for lang in lang_kind:
             if 'language' not in options:
                 if langcodes.code_to_names(
                         'language',
                         langcodes.get(
                             langcodes.standardize_tag(lang)).language):
                     options['language'] = langcodes.standardize_tag(lang)
                 else:  # lang is not a lang code. Try interpreting as a language name
                     try:
                         options['language'] = str(langcodes.find(lang))
                     except Exception:
                         error = True
                         continue
             else:
                 # Only one language word is allowed.
                 error = True
                 continue
         # Drop the parenthesised part from the remaining text.
         item = head + ' ' + tail
     except Exception:
         error = True
     if 'kind' not in options:
         options['kind'] = 'subtitles'
     if 'language' not in options:
         try:
             options['language'] = langcodes.standardize_tag(getlocale()[0])
         except Exception:
             options['language'] = 'en'
     # find label
     try:
         # Four groups here, so exactly one quoted string must be present for
         # the unpacking to succeed.
         head, _emptyStr, _quote, label, _emptyStr, tail = re.split(
             r"""(^| )(["'])(.*?)\2( |$)""", item)
         # Text on both sides of the label is flagged as an error.
         if head and tail:
             error = True
         item = head + tail
         options['label'] = label.strip()
     except Exception:
         # No usable quoted label: generate one from the kind and language.
         try:
             options['label'] = options['kind'].capitalize(
             ) + ' in ' + langcodes.get(
                 options['language']).autonym().capitalize()
         except Exception:
             error = True
             options['label'] = None
     # get filename
     options['src'] = self.uri_check(item)
     # return error
     if error:
         self.state_machine.reporter.error(
             'Error in "%s" directive: \n Problems encountered parsing track "%s" \n\n'
             'Guessing the following values: \n'
             'filename: "%s" \n'
             'kind: "%s" \n'
             'language: "%s" \n'
             'label: "%s" \n\n'
             'Track kinds should be chosen from one of the following: \n'
             'captions, descriptions, chapters, metadata, subtitles \n'
             'Track languages should be given as BCP 47 compliant language codes. \n'
             'Track declarations should take the following form: \n'
             'filename (kind language_code) "label"\n'
             'Tracks must have one filename and one language_code. \n'
             'If a kind is not specified, "subtitles" will be assumed. \n'
             'If a label is not provided, it will be auto-generated from the kind and language specified.'
             % (self.name, original, options['src'], options['kind'],
                options['language'], options['label']),
             nodes.literal_block(self.block_text, self.block_text),
             line=self.lineno)
     track_node = track(self.block_text, **options)
     return track_node
Пример #12
0
 def lang_convert(self, language):
     """Return the ``language`` attribute of ``langcodes.find(language)``."""
     match = langcodes.find(language)
     return match.language
sort_by = 'highest-rated'  # popularity|newest
timeout = 15
csv_file_name = 'Udemy Free Courses.csv'

# %%
# - Choose language
# lc.get('en').display_name()
# Keep asking until the typed language name resolves, or the user enters "x".
while True:
    language_name = input('Please type your language (or type x to quit): ')
    language_err = False
    lang_code = ''
    if language_name.lower().strip() == "x":
        exit(0)
        # NOTE(review): presumably kept for interactive shells where exit()
        # may return instead of raising -- confirm before removing.
        break
    try:
        lang_code = lc.find(language_name).language
    except LookupError as lookup_failure:
        print(lookup_failure)
        language_err = True
    except Exception as unexpected:
        print("Something went wrong")
        print(type(unexpected).__name__)
        language_err = True
    else:
        # Lookup succeeded: lang_code is set, stop prompting.
        break

#print(lang_code)


# %%
# - Methods
def generate_page_dataframe(elements_collection):
Пример #14
0
def territories_from_language(lang: str) -> List[str]:
    """Return the territories registered for a language, converted by ``coco``.

    The language name is resolved with ``langcodes.find``; its ``language``
    attribute is the key into ``LANGUAGE2TERRITORIES``. The stored value is
    passed to ``coco.convert`` with ``src="ISO2"`` and ``to="ISO3"``.
    """
    territories = LANGUAGE2TERRITORIES[langcodes.find(lang).language]
    return coco.convert(territories, src="ISO2", to="ISO3")
Пример #15
0
def language_to_iso2(lang: str) -> str:
    """Return the ``language`` attribute of the ``langcodes.find`` match."""
    found = langcodes.find(lang)
    return found.language