from guessit import UnicodeMixin, base_text_type, u
from guessit.fileutils import load_file_in_same_dir
import logging

__all__ = ['Country']

log = logging.getLogger(__name__)

# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 3166-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt')

country_matrix = [l.strip().split('|')
                  for l in _iso3166_contents.strip().split('\n')]

country_matrix += [['Unknown', 'un', 'unk', '', ''],
                   ['Latin America', '', 'lat', '', '']]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({'latinoamérica': 'lat',
                          'brazilian': 'bra',
                          'españa': 'esp',
                          'uk': 'gbr'})
from __future__ import unicode_literals
from guessit import fileutils
from guessit.textutils import to_unicode
import logging

log = logging.getLogger(__name__)

# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 3166-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(__file__, "ISO-3166-1_utf8.txt").decode("utf-8")

country_matrix = [l.strip().split("|")
                  for l in _iso3166_contents.strip().split("\n")]

country_matrix += [["Unknown", "un", "unk", "", ""],
                   ["Latin America", "", "lat", "", ""]]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({"latinoamérica": "lat",
                          "brazilian": "bra",
                          "españa": "esp",
                          "uk": "gbr"})

country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix)
country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix)
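# A minimal usage sketch of the lookup tables built above (not part of the
# original module). 'uk' resolves through the exception registered in
# country_to_alpha3; the exact English name returned depends on the row
# content of the bundled ISO-3166-1_utf8.txt data file.
token = "UK".lower()                           # keys are stored lower-case

alpha3 = country_to_alpha3.get(token)          # "gbr", via the exception above
alpha2 = country_alpha3_to_alpha2.get(alpha3)  # "gb"
name = country_alpha3_to_en_name.get(alpha3)   # English name from the data row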
# from __future__ import unicode_literals
from guessit import fileutils
from guessit.textutils import to_unicode
import logging

log = logging.getLogger(__name__)

# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 3166-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(
    __file__, 'ISO-3166-1_utf8.txt').decode('utf-8')

country_matrix = [l.strip().split('|')
                  for l in _iso3166_contents.strip().split('\n')]

country_matrix += [['Unknown', 'un', 'unk', '', ''],
                   ['Latin America', '', 'lat', '', '']]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(
    dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(
    dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({'latinoamérica': 'lat',
                          'brazilian': 'bra',
                          'españa': 'esp',
                          'uk': 'gbr'})
from guessit.fileutils import load_file_in_same_dir
import logging

__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
           'search_language', 'guess_language']

log = logging.getLogger(__name__)

# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Description of the fields:
# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given),
# an alpha-2 code (when given), an English name, and a French name of a language
# are all separated by pipe (|) characters."
_iso639_contents = load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt')

# drop the BOM from the beginning of the file
_iso639_contents = _iso639_contents[1:]

language_matrix = [l.strip().split('|')
                   for l in _iso639_contents.strip().split('\n')]

# update information in the language matrix
language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'],
                    ['ass', '', '', 'Assyrian', 'assyrien']]

# remove unused languages that shadow other common ones with a non-official form;
# filter into a new list rather than calling remove() while iterating
language_matrix = [lang for lang in language_matrix
                   if lang[2] not in ('se',   # Northern Sami shadows Swedish
                                      'br')]  # Breton shadows Brazilian
from guessit import fileutils
import logging

__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'search_language']

log = logging.getLogger(__name__)

# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Description of the fields:
# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given),
# an alpha-2 code (when given), an English name, and a French name of a language
# are all separated by pipe (|) characters."
_iso639_contents = fileutils.load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt').decode('utf-8')

# drop the BOM from the beginning of the file
_iso639_contents = _iso639_contents[1:]

language_matrix = [l.strip().split('|')
                   for l in _iso639_contents.strip().split('\n')]

# remove unused languages that shadow other common ones with a non-official form;
# filter into a new list rather than calling remove() while iterating
language_matrix = [lang for lang in language_matrix
                   if lang[2] not in ('se',   # Northern Sami shadows Swedish
                                      'br')]  # Breton shadows Brazilian
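# Why the filter above replaces the original remove()-in-a-loop: deleting from
# a list while iterating over it shifts the remaining elements down, so the
# element right after each removal is silently skipped. A self-contained
# illustration (plain Python, not part of the module):
rows = ['keep', 'drop', 'drop', 'keep']
for item in rows:
    if item == 'drop':
        rows.remove(item)
print(rows)  # ['keep', 'drop', 'keep'] -- the second 'drop' survived

rows = ['keep', 'drop', 'drop', 'keep']
rows = [item for item in rows if item != 'drop']
print(rows)  # ['keep', 'keep'] -- filtering builds a correct new list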
from guessit import fileutils
import logging

__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
           'search_language']

log = logging.getLogger(__name__)

# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Description of the fields:
# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given),
# an alpha-2 code (when given), an English name, and a French name of a language
# are all separated by pipe (|) characters."
_iso639_contents = fileutils.load_file_in_same_dir(
    __file__, 'ISO-639-2_utf-8.txt').decode('utf-8')

# drop the BOM from the beginning of the file
_iso639_contents = _iso639_contents[1:]

language_matrix = [l.strip().split('|')
                   for l in _iso639_contents.strip().split('\n')]

# update information in the language matrix
language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'],
                    ['ass', '', '', 'Assyrian', 'assyrien']]

# remove unused languages that shadow other common ones with a non-official form;
# filter into a new list rather than calling remove() while iterating
language_matrix = [lang for lang in language_matrix
                   if lang[2] not in ('se',   # Northern Sami shadows Swedish
                                      'br')]  # Breton shadows Brazilian
# from __future__ import unicode_literals
from guessit import fileutils
import logging

log = logging.getLogger(__name__)

# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 3166-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt').decode('utf-8')

country_matrix = [l.strip().split('|')
                  for l in _iso3166_contents.strip().split('\n')]

country_matrix += [['Unknown', 'un', 'unk', '', ''],
                   ['Latin America', '', 'lat', '', '']]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({'latinoamérica': 'lat',
                          'brazilian': 'bra',
                          'españa': 'esp',
                          'uk': 'gbr'})
from guessit import fileutils
import os.path
import re
import logging

log = logging.getLogger('guessit.language')

# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Description of the fields:
# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given),
# an alpha-2 code (when given), an English name, and a French name of a language
# are all separated by pipe (|) characters."
language_matrix = [l.strip().decode('utf-8').split('|')
                   for l in fileutils.load_file_in_same_dir(
                       __file__, 'ISO-639-2_utf-8.txt').split('\n')]

lng3 = frozenset(filter(bool, (l[0] for l in language_matrix)))
lng3term = frozenset(filter(bool, (l[1] for l in language_matrix)))
lng2 = frozenset(filter(bool, (l[2] for l in language_matrix)))
lng_en_name = frozenset(filter(bool, (lng for l in language_matrix
                                      for lng in l[3].lower().split('; '))))
lng_fr_name = frozenset(filter(bool, (lng for l in language_matrix
                                      for lng in l[4].lower().split('; '))))
lng_all_names = lng3 | lng3term | lng2 | lng_en_name | lng_fr_name

lng3_to_lng3term = dict((l[0], l[1]) for l in language_matrix if l[1])
lng3term_to_lng3 = dict((l[1], l[0]) for l in language_matrix if l[1])

lng3_to_lng2 = dict((l[0], l[2]) for l in language_matrix if l[2])
lng2_to_lng3 = dict((l[2], l[0]) for l in language_matrix if l[2])

# we only return the first given english name, hoping it is the most used one
lng3_to_lng_en_name = dict((l[0], l[3].split('; ')[0]) for l in language_matrix)
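# A short usage sketch of the code sets and mappings above (not part of the
# original module); the values follow from the standard ISO 639-2 row for
# French, whose bibliographic/terminologic/alpha-2 codes are fre/fra/fr.
token = 'fr'

if token in lng_all_names:                   # membership over every code and name
    alpha3 = lng2_to_lng3.get(token, token)  # 'fr' -> 'fre'
    assert lng3_to_lng2[alpha3] == token     # and back again
    print(lng3_to_lng_en_name[alpha3])       # first English name, i.e. 'French'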