예제 #1
0
from guessit import UnicodeMixin, base_text_type, u
from guessit.fileutils import load_file_in_same_dir
import logging

__all__ = ['Country']

log = logging.getLogger(__name__)


# Country table parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Record layout: an English name, an alpha-2 code (when given), an
# alpha-3 code (when given), a numeric code, and an ISO 3166-2 code,
# all separated by pipe (|) characters.
_iso3166_contents = load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt')

country_matrix = [record.strip().split('|')
                  for record in _iso3166_contents.strip().split('\n')]

# Extra, non-ISO entries used by the guessing heuristics.
country_matrix.append(['Unknown', 'un', 'unk', '', ''])
country_matrix.append(['Latin America', '', 'lat', '', ''])

# Map every known spelling of a country (English name, alpha-2 code,
# alpha-3 code — updated in that order) to its alpha-3 code.
country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update((c[1].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update((c[2].lower(), c[2].lower()) for c in country_matrix)

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({ 'latinoamérica': 'lat',
예제 #2
0
from __future__ import unicode_literals
from guessit import fileutils
from guessit.textutils import to_unicode
import logging

log = logging.getLogger(__name__)


# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 31666-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(__file__, "ISO-3166-1_utf8.txt").decode("utf-8")

country_matrix = [l.strip().split("|") for l in _iso3166_contents.strip().split("\n")]

country_matrix += [["Unknown", "un", "unk", "", ""], ["Latin America", "", "lat", "", ""]]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({"latinoamérica": "lat", "brazilian": "bra", "españa": "esp", "uk": "gbr"})

country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix)
country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix)
예제 #3
0
#

from __future__ import unicode_literals
from guessit import fileutils
from guessit.textutils import to_unicode
import logging

log = logging.getLogger(__name__)

# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 31666-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(
    __file__, 'ISO-3166-1_utf8.txt').decode('utf-8')

country_matrix = [
    l.strip().split('|') for l in _iso3166_contents.strip().split('\n')
]

country_matrix += [['Unknown', 'un', 'unk', '', ''],
                   ['Latin America', '', 'lat', '', '']]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(
    dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(
    dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
           'search_language', 'guess_language']


log = logging.getLogger(__name__)


# Language table downloaded from
# http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Record layout: an alpha-3 (bibliographic) code, an alpha-3
# (terminologic) code (when given), an alpha-2 code (when given), an
# English name, and a French name of a language, all separated by
# pipe (|) characters.
_iso639_contents = load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt')

# The data file starts with a BOM; drop its first character.
_iso639_contents = _iso639_contents[1:]

language_matrix = [record.strip().split('|')
                   for record in _iso639_contents.strip().split('\n')]


# Languages missing from the downloaded table.
language_matrix.append(['mol', '', 'mo', 'Moldavian', 'moldave'])
language_matrix.append(['ass', '', '', 'Assyrian', 'assyrien'])

for lang in language_matrix:
    # remove unused languages that shadow other common ones with a non-official form
    if (lang[2] == 'se' or # Northern Sami shadows Swedish
예제 #5
0
import logging

__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'search_language']


log = logging.getLogger(__name__)


# Language table downloaded from
# http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Record layout (pipe (|) separated): an alpha-3 (bibliographic) code,
# an alpha-3 (terminologic) code (when given), an alpha-2 code (when
# given), an English name, and a French name of a language.
_iso639_contents = fileutils.load_file_in_same_dir(
    __file__, 'ISO-639-2_utf-8.txt').decode('utf-8')

# The data file begins with a BOM; strip the first character.
_iso639_contents = _iso639_contents[1:]

language_matrix = [entry.strip().split('|')
                   for entry in _iso639_contents.strip().split('\n')]


# Remove unused languages that shadow other common ones with a
# non-official form.  Rebuild the list instead of calling remove()
# while iterating over it: mutating a list during iteration makes the
# iterator silently skip the element that follows each removed entry.
language_matrix = [lang for lang in language_matrix
                   if lang[2] != 'se'    # Northern Sami shadows Swedish
                   and lang[2] != 'br']  # Breton shadows Brazilian

예제 #6
0
import logging

__all__ = ['is_iso_language', 'is_language', 'lang_set', 'Language',
           'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
           'search_language']

log = logging.getLogger(__name__)

# Language table downloaded from
# http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Record layout (pipe (|) separated): an alpha-3 (bibliographic) code,
# an alpha-3 (terminologic) code (when given), an alpha-2 code (when
# given), an English name, and a French name of a language.
_iso639_contents = fileutils.load_file_in_same_dir(
    __file__, 'ISO-639-2_utf-8.txt').decode('utf-8')

# The data file begins with a BOM; strip the first character.
_iso639_contents = _iso639_contents[1:]

language_matrix = [row.strip().split('|')
                   for row in _iso639_contents.strip().split('\n')]

# Languages missing from the downloaded table.
language_matrix.append(['mol', '', 'mo', 'Moldavian', 'moldave'])
language_matrix.append(['ass', '', '', 'Assyrian', 'assyrien'])

for lang in language_matrix:
    # remove unused languages that shadow other common ones with a non-official form
    if (lang[2] == 'se' or  # Northern Sami shadows Swedish
예제 #7
0
#

from __future__ import unicode_literals
from guessit import fileutils
import logging

log = logging.getLogger(__name__)


# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 31666-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = fileutils.load_file_in_same_dir(__file__,
                                                    'ISO-3166-1_utf8.txt').decode('utf-8')

country_matrix = [ l.strip().split('|')
                   for l in _iso3166_contents.strip().split('\n') ]

country_matrix += [ [ 'Unknown', 'un', 'unk', '', '' ],
                    [ 'Latin America', '', 'lat', '', '' ]
                    ]

country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))

# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({ 'latinoamérica': 'lat',
예제 #8
0
import os.path
import re
import logging

log = logging.getLogger('guessit.language')


# Language table downloaded from
# http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Record layout: an alpha-3 (bibliographic) code, an alpha-3
# (terminologic) code (when given), an alpha-2 code (when given), an
# English name, and a French name of a language, all separated by
# pipe (|) characters.
language_matrix = [line.strip().decode('utf-8').split('|')
                   for line in fileutils.load_file_in_same_dir(
                       __file__, 'ISO-639-2_utf-8.txt').split('\n')]

# Sets of every known code / name, with empty fields filtered out.
lng3 = frozenset(l[0] for l in language_matrix if l[0])
lng3term = frozenset(l[1] for l in language_matrix if l[1])
lng2 = frozenset(l[2] for l in language_matrix if l[2])
lng_en_name = frozenset(name
                        for l in language_matrix
                        for name in l[3].lower().split('; ')
                        if name)
lng_fr_name = frozenset(name
                        for l in language_matrix
                        for name in l[4].lower().split('; ')
                        if name)
lng_all_names = lng3 | lng3term | lng2 | lng_en_name | lng_fr_name

# Mappings between the different code flavours; records missing the
# counterpart code are left out.
lng3_to_lng3term = dict((l[0], l[1]) for l in language_matrix if l[1])
lng3term_to_lng3 = dict((l[1], l[0]) for l in language_matrix if l[1])

lng3_to_lng2 = dict((l[0], l[2]) for l in language_matrix if l[2])
lng2_to_lng3 = dict((l[2], l[0]) for l in language_matrix if l[2])

# we only return the first given english name, hoping it is the most used one