Python values 예제들, encodings.aliases.aliases.values Python 예제들

예제 #1

0

파일 보기

파일: htmlmov.py 프로젝트: ru-Dust/GeoTagMesg

def get_coding(url):
    """Guess the Python codec name of the page served at *url*.

    The charset is looked for in the ``Content-Type`` response header first.
    When the header carries none, the first 14 lines of the response body are
    scanned instead.  A charset name is lower-cased, has ``-`` replaced by
    ``_`` and is then resolved through the module-level ``cdn`` mapping
    (either as one of its keys or as one of its values).

    Returns:
        The resolved codec name, ``'utf_8'`` when no usable charset is found
        in the header or the scanned lines, or ``None`` when the header names
        a charset unknown to ``cdn`` or when the page cannot be downloaded.
    """
    patern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)

    def resolve(name):
        # Normalise to the spelling used by ``cdn`` (lower case, underscores).
        ch = name.lower().replace('-', '_')
        if ch in cdn:
            return cdn[ch]
        if ch in cdn.values():
            return ch
        return None

    try:
        # The context manager closes the connection on every exit path.
        with urllib.request.urlopen(url) as response:
            # A missing Content-Type header yields None; treat it as empty so
            # the body still gets scanned instead of reporting a load error.
            chrset = patern.findall(response.info()['Content-Type'] or '')
            if chrset:
                return resolve(chrset[0])
            for number, line in enumerate(response, start=1):
                # Only the first 14 lines are inspected.
                if number == 15:
                    break
                # ``line`` is bytes; its repr still contains the charset text.
                chrset = patern.findall(str(line))
                if chrset:
                    codec = resolve(chrset[0])
                    if codec is not None:
                        return codec
            return 'utf_8'
    except Exception:
        # Best effort: report the failing URL and let the caller handle None.
        print('Помилка завантаження сторінки:\n', url)
        return None

예제 #2

0

파일 보기

파일: models.py 프로젝트: klada/django-fulltextfeed

def get_available_charsets():
    """Return ``(value, label)`` choice pairs for every known codec.

    Codec names come from ``aliases.values()`` with ``_`` replaced by ``-``;
    duplicates are dropped and the result is sorted alphabetically.
    """
    names = sorted({codec.replace('_', '-') for codec in aliases.values()})
    return [(name, name) for name in names]

예제 #3

0

파일 보기

파일: htmlmov.py 프로젝트: ru-Dust/GeoTagMesg

def cod_page(f):
    """Return the Python codec name announced by the response *f*.

    The charset is read from the ``Content-Type`` header returned by
    ``f.info()``, lower-cased, has ``-`` replaced by ``_`` and is resolved
    through the module-level ``cdn`` mapping (as a key or as a value).

    Returns:
        The resolved codec name, ``'utf_8'`` when the header is missing or
        carries no charset, or ``None`` when the charset is unknown to
        ``cdn``.
    """
    # A missing header yields None; treat it like a header without charset.
    content_type = f.info()['Content-Type'] or ''
    # The flag is passed explicitly: an inline ``(?i)`` at the end of the
    # pattern is rejected by Python 3.11+.
    cod_p = re.findall(r'charset=([-\w\d]+)', content_type, re.IGNORECASE)
    if not cod_p:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn:
        return cdn[ch]
    if ch in cdn.values():
        return ch
    return None

예제 #4

0

파일 보기

파일: data_utils.py 프로젝트: mrugeles/AirlineTweets

    def get_encodings(self, filepath):
        """ Prints the encodings with which a given file can be read as CSV

        Every canonical codec name from ``encodings.aliases`` is tried with
        ``pd.read_csv``; a line ``successful <encoding>`` is printed for each
        codec that reads the file without raising.

        Parameters
        ----------
        filepath: string
            Path to the file to analyse.
        """
        from encodings.aliases import aliases

        for encoding in set(aliases.values()):
            try:
                pd.read_csv(filepath, encoding=encoding)
            except Exception:
                # A failed read only means this codec does not fit the file.
                continue
            print('successful', encoding)

예제 #5

0

파일 보기

파일: test_encodings.py 프로젝트: modulexcite/code-snippets-2

def write_encodings(filename, line_number, final_encoding):
    """Decode one line of a file with every known codec and save the results.

    The file is read in binary mode and line ``line_number`` (1-based, as
    ``lines[line_number - 1]``) is decoded with every codec in the union of
    the module-level ``encs`` set and ``aliases.values()``.  The successful
    decodings are written to ``<absolute path of filename>.encodings``, one
    row per codec: the codec name left-justified to 20 characters, the
    decoded text, then ``os.linesep``.

    Parameters:
        filename: path of the file to inspect.
        line_number: 1-based number of the line to decode.
        final_encoding: encoding applied to the decoded text before writing;
            only used when running on Python 2.

    Any exception while reading the file (including an out-of-range line
    number) is printed and ends the process with exit status 1.
    """
    # To ensure that we cover as many as possible encodings,
    # we take the union between our predefined encoding set and the
    # set of the values from the encodings.aliases.aliases.
    encodings = encs.union(set(aliases.values()))

    # Maps codec name -> text decoded with that codec.
    data = dict()

    # Read line from file
    try:
        with io.open(filename, "rb") as f:
            lines = f.readlines()
            line = lines[line_number - 1]
            print("\nProcessing line number: " + str(line_number))
            if len(line) < 3:
                print("!!!Warning!!!: Possible empty line.")
            print("")
    except Exception:
        _, err, _ = sys.exc_info()
        print("Error reading " + filename)
        print(err)
        sys.exit(1)

    # Decode it using every possible encoding
    for enc in encodings:
        try:
            data[enc] = line.decode(enc)
        except Exception:
            _, err, _ = sys.exc_info()
            print("Cannot decode using " + enc)
            # print(err)

    # We write the results in a new utf-8 text file
    # We use the same filename + an '.encodings' extension
    # NOTE(review): open() is called without an explicit encoding, so on
    # Python 3 the file is written in the locale's default encoding rather
    # than necessarily utf-8 - confirm this is intended.
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.encodings'
    print("\nWriting successfully tested encodings in " + newfilename)

    with open(newfilename, 'w') as out:
        # Number of rows written completely.
        c = 0
        for enc in sorted(data.keys()):
            try:
                out.write("%-20s" % enc)
                if (sys.version_info[0] < 3):
                    line = data[enc].encode(final_encoding)
                else:
                    line = data[enc]
                out.write(line)
                out.write(os.linesep)
                c += 1
            except Exception:
                # The codec name may already have been written at this point.
                _, err, _ = sys.exc_info()
                print("Cannot encode " + enc + " to " + final_encoding)
                # print(err)

    print("\n" + str(c) + " out of " + str(len(encodings)) +
          " tested encodings were written.\n")

예제 #6

0

파일 보기

def get_encodings_list():
    """
    Return the sorted list of available encodings.

    The names are the distinct values of ``aliases``.  If that list cannot
    be built, a short hard-coded list of common encodings is returned
    instead.
    """
    try:
        # A set removes the duplicates in linear time; the order is restored
        # by sorting, exactly as the previous reduce-based version did.
        return sorted(set(aliases.values()))
    except Exception:
        return ['UTF-8', 'UTF-16', 'CP1251', 'CP866', 'KOI8-R']

예제 #7

0

파일 보기

def find_encoding(file_path):
    """Return the codecs with which pandas can read *file_path* as CSV.

    Every canonical codec name from ``encodings.aliases`` is tried with
    ``pd.read_csv``; the names that read the file without raising are
    returned as a list (in no particular order).
    """
    from encodings.aliases import aliases

    encodings = []
    for alias in set(aliases.values()):
        try:
            pd.read_csv(file_path, encoding=alias)
        except Exception:
            # Wrong codecs fail in several ways (decode errors, unknown or
            # non-text codecs, parser errors); all of them mean "no match".
            continue
        encodings.append(alias)
    return encodings

예제 #8

0

파일 보기

    def train(self):
        '''
        Load the CSV at ``self.path`` and run the interactive training and
        prediction pipeline on it.

        The file is read with ``pd.read_csv`` using a codec taken from
        ``set(aliases.values())``.  After loading, the data is cleaned,
        optionally split into folds, scaled, split into train/validation
        parts and passed to ``self.model_``; the resulting scores are
        displayed sorted by ``best_score``.  The user is then prompted for a
        file path that is handed to ``self.load_datafile``.

        Returns the string ``"[RESULT] predicted label : ..."`` on success.
        When any step raises, the error is printed, the loop is left and the
        method returns ``None``.
        '''
        # NOTE(review): every path through the loop body either returns or
        # breaks, so only the first codec yielded by the set is ever tried -
        # confirm whether a failed read was meant to move on to the next one.
        for encoding in set(aliases.values()):
            try:
                dataframe = pd.read_csv(self.path, encoding=encoding)
                print(f"[SUCCESS] Dataset Loaded Successfully")
                self.display_option(dataframe)
                # Type of the first label value decides the branch below.
                label_type = dataframe[self.label][0]
                self.data_clean(df=dataframe)  # Data cleaning null columns
                if self.fold == True:
                    df = self.create_folds(df=dataframe)
                else:
                    df = self.with_out_kfold(df=dataframe)

                if (isinstance(label_type, np.int64)):
                    print("True")
                    std_df = df.drop([self.label], axis=1)
                else:
                    # Non-int64 labels: ask which encoder to use.
                    data_input = input(
                        f"[INPUT] ENTER THE NAME OF ENCODER BECAUSE LABEL DTYPE IS OBJECT : "
                    )
                    split_df, inv_label = self.split_data(df, data_input)
                    X = split_df.drop([self.label], axis=1)
                    print("False")
                    std_df = self.scale_data(X)
                # Label column first, followed by the feature columns.
                new_scaled_data = pd.concat([df[self.label], std_df], axis=1)
                X_train, X_test, y_train, y_test = self.train_val_data(
                    new_scaled_data)
                scores = self.model_(X_train, X_test, y_train, y_test)
                self.display_option(
                    scores.sort_values(by="best_score",
                                       ascending=False,
                                       ignore_index=True))
                filename = input(f"[INPUT] ENTER THE PATH : ")
                if os.path.exists(filename):
                    print(f"[INFO] LOADED SUCCESSFULLY")
                else:
                    print(f"[ERROR] FILE NOT FOUND")
                # The file is passed on even when it was reported missing.
                pred = self.load_datafile(
                    filename, new_scaled_data.drop([self.label], axis=1))
                # NOTE(review): inv_label is only assigned in the non-int64
                # branch above; with int64 labels this line raises NameError,
                # which the handler below prints - confirm intended result.
                return f"[RESULT] predicted label : {inv_label[pred]}"
            except Exception as e:
                print(f"[ERROR] {e}")
                break

예제 #9

0

파일 보기

파일: DataScienceHelperLibrary.py 프로젝트: bwenner/DisasterResponsePipeline

def ReadCsvFiles(files, delimiter=',', merge=False):
    '''
    Load one or more CSV files into pandas data frames.

    Each file is first read with the default encoding.  Files that fail are
    retried with every canonical codec name from ``aliases`` until one of
    them succeeds.

    INPUT:
    files: file name or list of file names
    delimiter: column separator passed to pandas.read_csv
    merge: currently unused

    OUTPUT:
    allsucc: overall status (bool), True when every file could be loaded
    dfs: dict mapping each file name to its data frame (None if not loaded)

    RAISES:
    ValueError: files is None or empty
    '''
    if files is None or len(files) == 0:
        raise ValueError('Fnc "ReadCsvFiles": files is None or empty')
    if not isinstance(files, (list, tuple)):
        files = [files]
    dfs = {}
    PrintLine('Start reading files')
    notworked = []
    for file in files:
        try:
            curdf = pd.read_csv(file, delimiter=delimiter)
            dfs[file] = curdf
            print('Dataframe loaded from {}: shape = {}'.format(
                file, curdf.shape))
        except Exception as e:
            print('Could not read file ', file, ': ', str(e))
            notworked.append(file)
            dfs[file] = None
    log = 'Reading files successfully finished'

    for file in notworked:
        print('Trying to load file with encodings: ', file)
        for encoding in set(aliases.values()):
            try:
                # Keep the caller's delimiter on the retry as well.
                curdf = pd.read_csv(file, delimiter=delimiter,
                                    encoding=encoding)
            except Exception:
                # This codec does not fit; try the next one.
                continue
            dfs[file] = curdf
            print('Encoding found to load file: ', encoding)
            break

    allsucc = True
    # Iterate over (name, frame) pairs so unloaded files are really detected.
    for name, frame in dfs.items():
        if frame is None:
            print('File could not be loaded: ', name)
            allsucc = False
    PrintLine(log)
    return allsucc, dfs

예제 #10

0

파일 보기

파일: utils.py 프로젝트: vijvijay/rapidapp

def known_encodings():
    """\
    Return the names of all character encodings known to Python,
    aliases included.

    Every key and value of ``encodings.aliases.aliases`` is upper-cased, has
    ``_`` replaced by ``-`` and the sorted result is passed through
    ``unique``.

    """
    from encodings.aliases import aliases
    display_names = sorted(
        raw_name.upper().replace('_', '-')
        for raw_name in list(aliases.keys()) + list(aliases.values())
    )
    return unique(display_names)

예제 #11

0

파일 보기

def main():
    '''
    Try to read the input file with every codec Python knows and report,
    per codec, which of the expected terms occur in the decoded text.

    The input file comes from ``ARGS.infile`` and the expected terms from
    ``ARGS.exp``.  Without expected terms, every codec that can read the
    file is reported as readable.
    '''
    logging.basicConfig(level=logging.INFO,
                        format='[%(levelname)8s]: %(message)s')
    infile = ARGS.infile
    expected_words = ARGS.exp
    for enc in set(aliases.values()):
        try:
            with open(infile, 'r', encoding=enc) as inp:
                try:
                    text = inp.read()
                    hits, misses = [], []
                    for term in expected_words:
                        (hits if term in text else misses).append(term)

                    if not expected_words:
                        logging.info(
                            '%s: readable. Use expected terms (--exp) to narrow results.',
                            enc)
                    else:
                        if misses:
                            logging.debug('%s: Missed %s', enc, misses)
                        if hits:
                            logging.info('%s: Found %s', enc, hits)
                except UnicodeError as exception:
                    # The file content is not valid for this codec.
                    logging.debug('%s: %s', enc, type(exception).__name__)
        except LookupError as exception:
            # The codec is unknown or cannot be used for text files.
            logging.debug('%s: %s', enc, type(exception).__name__)

예제 #12

0

파일 보기

파일: file_exchange.py 프로젝트: Sk1f161/ERP

 def _get_encoding(self, cr, user, context=None):
     """Return sorted ``(codec, label)`` pairs for every known codec.

     The label is the codec name with ``_`` replaced by ``-``.
     """
     return sorted(
         (codec, codec.replace('_', '-')) for codec in set(aliases.values()))

예제 #13

0

파일 보기

파일: CSVDialogs.py 프로젝트: miketian2020/fracturing

    def _initUI(self):
        """Build the dialog's widgets and arrange them in a grid layout.

        Creates, top to bottom: the file chooser row, the encoding combo box,
        the header check box, the delimiter selector, a tab widget with the
        preview and column-type tables, the status bar and the Load/Cancel
        button box, and connects their signals to the dialog's slots.

        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: file name label, line edit and "open file" tool button.
        self._filenameLabel = QtGui.QLabel('Choose File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        self._filenameLineEdit.textEdited.connect(self._updateFilename)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-open.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._openFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: encoding selection.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        # Distinct values of _encodings, sorted and then upper-cased.
        encoding_names = list(
            [x.upper() for x in sorted(list(set(_encodings.values())))])
        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: header check box.
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selection; the current selection is stored
        # before the change signal is connected.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)
        self._delimiter = self._delimiterBox.currentSelected()
        self._delimiterBox.delimiter.connect(self._updateDelimiter)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        # Rows 4-6: tabs with the preview table and the column-type table.
        self._tabWidget = QtGui.QTabWidget(self)
        self._previewTableView = QtGui.QTableView(self)
        self._datatypeTableView = QtGui.QTableView(self)
        self._tabWidget.addTab(self._previewTableView, 'Preview')
        self._tabWidget.addTab(self._datatypeTableView, 'Change Column Types')
        layout.addWidget(self._tabWidget, 4, 0, 3, 4)

        # Column 1 of the type table is edited through DtypeComboDelegate.
        self._datatypeTableView.horizontalHeader().setDefaultSectionSize(200)
        self._datatypeTableView.setItemDelegateForColumn(
            1, DtypeComboDelegate(self._datatypeTableView))

        self._loadButton = QtGui.QPushButton('Load Data', self)
        #self.loadButton.setAutoDefault(False)

        self._cancelButton = QtGui.QPushButton('Cancel', self)
        # self.cancelButton.setDefault(False)
        # self.cancelButton.setAutoDefault(True)

        # Row 9: Load (accept role) and Cancel (reject role) buttons.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._loadButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)
        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)
        layout.addWidget(self._buttonBox, 9, 2, 1, 2)
        self._loadButton.setDefault(False)
        self._filenameLineEdit.setFocus()

        # Row 8: status bar.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        # Checked only now, after the toggled signal has been connected above.
        self._headerCheckBox.setChecked(True)
        layout.addWidget(self._statusBar, 8, 0, 1, 4)
        self.setLayout(layout)

예제 #14

0

파일 보기

파일: CSVDialogs.py 프로젝트: miketian2020/fracturing

    def _initUI(self):
        """Build the dialog's widgets and arrange them in a grid layout.

        Creates, top to bottom: the output file row, the encoding combo box
        (pre-selected to UTF_8), the header check box, the delimiter
        selector, the status bar and the Export/Cancel button box.

        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: output file label, line edit and "save as" tool button.
        self._filenameLabel = QtGui.QLabel('Output File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-save-as.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._createFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: encoding selection.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        # Distinct values of _encodings, sorted and then upper-cased.
        encoding_names = list(
            map(lambda x: x.upper(), sorted(list(set(_encodings.values())))))

        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        # Pre-select UTF_8; index() raises ValueError if that name is absent.
        self._idx = encoding_names.index('UTF_8')
        self._encodingComboBox.setCurrentIndex(self._idx)
        #self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: header check box.
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        #self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selection.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        self._exportButton = QtGui.QPushButton('Export Data', self)
        self._cancelButton = QtGui.QPushButton('Cancel', self)

        # Row 5: Export (accept role) and Cancel (reject role) buttons.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._exportButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)

        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)

        layout.addWidget(self._buttonBox, 5, 2, 1, 2)
        self._exportButton.setDefault(False)
        self._filenameLineEdit.setFocus()

        # Row 4: status bar.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        layout.addWidget(self._statusBar, 4, 0, 1, 4)
        self.setLayout(layout)

예제 #15

0

파일 보기

파일: constant.py 프로젝트: SIRIUS-GOP/sms_service

# Words that appear in the names of secondary Unicode ranges.
UNICODE_SECONDARY_RANGE_KEYWORD = [
    'Supplement',
    'Extended',
    'Extensions',
    'Modifier',
    'Marks',
    'Punctuation',
    'Symbols',
    'Forms',
    'Operators',
    'Miscellaneous',
    'Drawing',
    'Block',
    'Shapes',
    'Supplemental',
    'Tags',
]  # type: List[str]

# Captures the name that follows "encoding", "charset" or "coding".
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
    IANA_FLAGS := IGNORECASE) if False else re_compile(
    r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
    IGNORECASE)

# Canonical codec names, minus the "*_codec" entries and three excluded names.
IANA_SUPPORTED = sorted(
    name
    for name in set(aliases.values())
    if not name.endswith("_codec")
    and name not in {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],

예제 #16

0

파일 보기

파일: clean.py 프로젝트: hamidkhbl/udacity_data_science

# Run this code cell to install and import the pycountry library
#!pip install pycountry
from pycountry import countries
import pandas as pd

# Run this code cell to see an example of how the library works
countries.get(name='Spain')

# Run this code cell to see how you can also look up countries without specifying the key
countries.lookup('Kingdom of Spain')

# Encoding detection
from encodings.aliases import aliases

alias_values = set(aliases.values())

# Try every known codec and report the ones that can read the file.
for encoding in alias_values:
    try:
        df = pd.read_csv("mystery.csv", encoding=encoding)
        print('successful', encoding)
    except Exception:
        # A failed read only means this codec does not fit; try the next one.
        continue

# Fill null
# Fill with mean of a group

df_melt = pd.read_csv('gdp_data.csv')
df_melt['GDP_filled'] = df_melt.groupby('Country Name')['GDP'].transform(
    lambda x: x.fillna(x.mean()))

예제 #17

0

파일 보기

파일: clean_wire_encoding.py 프로젝트: zseder/webcorpus

Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
"""

import sys
import re
from encodings.aliases import aliases

from splitcode import header, footer

# Captures the charset value of an HTML <meta> tag, quotes included.
charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)")
# Every alias and every canonical codec name, lower-cased.
available_encodings = {alias.lower() for alias in aliases}
available_encodings.update(codec.lower() for codec in aliases.values())
# Used to detect the invalid positions reported in a UnicodeError message.
position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)")
position_pattern = re.compile(r"position ([0-9]*):")

def test_encoding(t, enc, stop_at=None):
    """
    tests a "t" text decoding with enc and returns how many decode errors
    occured in the whole text
    """
    c = 0
    while True:
        try:
            t = t.decode(enc)
            break
        except LookupError:

예제 #18

0

파일 보기

파일: text.py 프로젝트: bitesofcode/projex

    'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has',
    'i', 'if', 'in', 'is', 'it',
    'just',
    'like',
    'man', 'may', 'more', 'most', 'my',
    'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over',
    'say', 'see', 'she', 'should', "shouldn't", 'so',
    'than', 'that', 'the', 'then', 'there', 'they', 'this', 'to',
    'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'wouldn', "won't",
    'you'
}

DEFAULT_ENCODING = 'utf-8'
# Distinct canonical codec names, in alphabetical order.
SUPPORTED_ENCODINGS = sorted(set(aliases.values()))

# Expressions are compiled once, at import time.
EXPR_UPPERCASE = re.compile('^[A-Z]+$')  # upper-case letters only
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')  # upper-case letters and digits only
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')  # one run of letters and digits
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    # HTMLParser subclass that carries an extra ``_raw`` list.
    # NOTE(review): presumably ``_raw`` collects text while tags are being
    # stripped - the rest of the class is not visible here; confirm.
    def __init__(self):
        # The base initialiser is called explicitly, then the parser state
        # is reset.
        HTMLParser.__init__(self)
        self.reset()
        # Starts out empty for every new instance.
        self._raw = []

예제 #19

0

파일 보기

파일: banner.py 프로젝트: m1gnus/RSArmageddon

def print_encodings():
    """Print the canonical codec names, one per line, in sorted order.

    Names ending in ``_codec`` are left out.
    """
    names = {codec for codec in aliases.values() if not codec.endswith("_codec")}
    for name in sorted(names):
        print(name)

예제 #20

0

파일 보기

파일: generator.py 프로젝트: 5l1v3r1/percent_encoding_generator

#!/usr/bin/env python3

from sys import argv, stderr, stdout, exit
import argparse
from encodings.aliases import aliases
from codecs import encode as cencode
from pathlib import Path

# Every canonical codec name except rot_13 and the base* codecs.
# The list is built with a single filter: removing items from a list while
# iterating over it skips the element that follows each removal.
ENCODINGS = [
    name for name in set(aliases.values())
    if name != 'rot_13' and not name.startswith('base')
]


def encode(s, encoding='utf-16'):
    """Encode *s* with *encoding* and percent-encode every resulting byte.

    Each byte becomes ``%`` followed by its value in lower-case hex, padded
    to two digits.  ``None`` is returned when the codec is unknown.  Any
    other failure is reported on stderr and also yields ``None``.  A
    ``TypeError`` triggers a retry with ``bytes(s)`` only when its text
    equals the exact message checked below.
    """
    as_percent = '%{:0>2x}'.format
    try:
        return ''.join(map(as_percent, cencode(s, encoding)))
    except TypeError as err:
        if str(err) == "TypeError: a bytes-like object is required, not 'str'":
            return ''.join(map(as_percent, cencode(bytes(s), encoding)))
    except LookupError:
        return None
    except Exception as err:
        print(f'[+] Failed encoding for: {encoding}', file=stderr)
        print(f'Error Message: {err}', file=stderr)

예제 #21

0

파일 보기

파일: main.py 프로젝트: Dreace/MojibakeRecover

from encodings.aliases import aliases

encoding_list = list(set(aliases.values()))


def main():
    """Print every encode/decode round trip that works on the input string.

    The string read from the prompt is encoded with each known codec and the
    resulting bytes are decoded with each known codec again; every
    combination that succeeds is printed as ``<encode>-><decode>:<text>``.
    """
    src_text = input("需要恢复的字符串：")
    for item_i in encoding_list:
        # The encoded bytes depend on the outer codec only, so they are
        # computed once per codec instead of once per codec pair.
        try:
            encoded = src_text.encode(encoding=item_i)
        except Exception:
            continue
        for item_j in encoding_list:
            try:
                guess_text = encoded.decode(encoding=item_j)
                # Printing stays inside the try: text the console cannot
                # display must not abort the search.
                print(f'{item_i}->{item_j}:{guess_text}')
            except Exception:
                pass


if __name__ == '__main__':
    main()

예제 #22

0

파일 보기

파일: helpers.py 프로젝트: jasonbutt/hivedesktop

def check_if_encoding_exist(encoding):
    """Return True when *encoding* is an alias or a canonical codec name."""
    if encoding in aliases:
        return True
    return encoding in aliases.values()

예제 #23

0

파일 보기

파일: constant.py 프로젝트: nayanasp/project

    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]  # type: List[str]

# Captures the name that follows "encoding", "charset" or "coding".
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)

# Canonical codec names, minus the "*_codec" entries and three excluded names.
IANA_SUPPORTED = sorted(
    {name for name in aliases.values() if not name.endswith("_codec")}
    - {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],

예제 #24

0

파일 보기

파일: mycodecs.py 프로젝트: pombredanne/atango

# -*- coding: utf-8 -*-
from encodings.aliases import aliases
import nkf

# Every alias and every canonical codec name known to Python.
all_encodings = set(aliases).union(aliases.values())


def normalize_encoding(encoding):
    """Lower-case *encoding* and map the Shift-JIS spellings to 'cp932'."""
    lowered = encoding.lower()
    shift_jis_names = ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis', 'sjis')
    return 'cp932' if lowered in shift_jis_names else lowered


def decode(text, encoding=None, *args):
    """Decode *text*, guessing the encoding with nkf when needed.

    The encoding is guessed when none is given or when it is ISO-8859-1; a
    guess of ``BINARY`` or ``ISO-8859-1`` is replaced by ``utf8``.  The name
    is then normalised.  When the normalised name is not in
    ``all_encodings`` the text is converted with ``nkf -w`` and decoded as
    utf8; otherwise it is decoded directly, passing ``*args`` on to
    ``text.decode``.
    """
    needs_guess = not encoding or encoding in ('ISO-8859-1', 'iso-8859-1')
    if needs_guess:
        encoding = nkf.guess(text)
        if encoding in ('BINARY', 'ISO-8859-1'):
            encoding = 'utf8'
    codec = normalize_encoding(encoding)
    if codec in all_encodings:
        return text.decode(codec, *args)
    return nkf.nkf('-w', text).decode('utf8')

예제 #25

0

파일 보기

# In[4]:

# TODO: Figure out what the encoding is of the myster.csv file
# HINT: pd.read_csv('mystery.csv', encoding=?) where ? is the string for an encoding like 'ascii'
# HINT: This link has a list of encodings that Python recognizes https://docs.python.org/3/library/codecs.html#standard-encodings

# Python has a file containing a dictionary of encoding names and associated aliases
# This line imports the dictionary and then creates a set of all available encodings
# You can use this set of encodings to search for the correct encoding
# If you'd like to see what this file looks like, execute the following Python code to see where the file is located
#    from encodings import aliases
#    aliases.__file__

from encodings.aliases import aliases

alias_values = set(aliases.values())

# TODO: iterate through the alias_values list trying out the different encodings to see which one or ones work
# HINT: Use a try - except statement. Otherwise your code will produce an error when reading in the csv file
#       with the wrong encoding.
# HINT: In the try statement, print out the encoding name so that you know which one(s) worked.

for encoding in alias_values:
    try:
        pd.read_csv('mystery.csv', encoding=encoding)
        print("Successfully read the csv with encoding of ", encoding)
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C still
        # interrupts the scan.
        print("Failed: Encoding of ", encoding)

# # Conclusion
#

예제 #26

0

파일 보기

파일: text.py 프로젝트: ottochiu/projex

# Maps the textual constants 'true', 'false' and 'null' to Python values.
CONSTANT_EVALS = {'true': True, 'false': False, 'null': None}

# Frequently used English words.
# Every line ends with a comma: without it, two adjacent string literals are
# silently joined into one (e.g. 'by' 'can' -> 'bycan').
COMMON_TERMS = {
    'a', 'about', 'all', 'and', 'are', 'as', 'at', 'be', 'but', 'by',
    'can', 'cannot', 'could', "couldn't", 'do', 'did', "didn't", 'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has', 'i', 'if', 'in', 'is', 'it',
    'just', 'like', 'man', 'may', 'more', 'most', 'my', 'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over', 'say', 'see', 'she', 'should',
    "shouldn't", 'so', 'than', 'that', 'the', 'then', 'there', 'they', 'this',
    'to', 'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will',
    'with', 'would', 'wouldn', "won't", 'you',
}

DEFAULT_ENCODING = 'utf-8'
# Distinct canonical codec names, in alphabetical order.
SUPPORTED_ENCODINGS = list(sorted(set(aliases.values())))

# precompile all expressions
EXPR_UPPERCASE = re.compile('^[A-Z]+$')
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    # HTMLParser subclass that carries an extra ``_raw`` list.
    # NOTE(review): presumably ``_raw`` collects text while tags are being
    # stripped - the rest of the class is not visible here; confirm.
    def __init__(self):
        # The base initialiser is called explicitly, then the parser state
        # is reset.
        HTMLParser.__init__(self)
        self.reset()
        # Starts out empty for every new instance.
        self._raw = []

예제 #27

0

파일 보기

파일: prep_mamra.py 프로젝트: roman-verbit-ai/lt

import re
from pathlib import Path
from encodings.aliases import aliases

# regex list
re_p_content = re.compile(r"<P>.*?</P>", re.DOTALL)
re_a_content = re.compile(r"<A.*?</A>", re.DOTALL)
re_b_content = re.compile(r"<B>.*?</B>", re.DOTALL)
re_tag = re.compile(r"</?.*?>", re.DOTALL)
re_empty_lines = re.compile(r"\n\s*\n")
re_parenth = re.compile(r"[{(].*?[})]")

# encodings
heb_encodings = ['utf-8', 'cp1255', 'iso8859_8', 'cp424', 'cp856', 'cp862']
# Canonical codec names use underscores ('utf_8'), so the spelling is
# normalised before subtracting; otherwise 'utf-8' would stay in the
# fallback set and be tried a second time.
other_encodings = set(aliases.values()) - {
    name.replace('-', '_') for name in heb_encodings
}


def read_heb_file(file_path):
    """Read *file_path*, trying the Hebrew encodings before all others.

    Returns whatever ``read_file`` yields for the Hebrew encodings when that
    result is truthy; otherwise the result of trying the remaining
    encodings.
    """
    return (read_file(file_path, heb_encodings)
            or read_file(file_path, other_encodings))


def read_file(file_path, enc_list):