예제 #1
0
def _known_codec(label):
    """Map a raw charset label onto a codec name listed in ``cdn``.

    The label is lower-cased and hyphens become underscores before the
    lookup.  Returns the codec name, or None when ``cdn`` does not know it.
    """
    ch = label.lower().replace('-', '_')
    if ch in cdn.keys():
        return cdn[ch]
    if ch in cdn.values():
        return ch
    return None


def get_coding(url):
    """Detect the character encoding of the page at *url*.

    The ``charset=`` parameter of the Content-Type response header is
    consulted first.  When the header declares no charset, the first 15
    lines of the body are searched for a ``charset=...`` declaration.

    Returns:
        The codec name (as resolved through ``cdn``), ``'utf_8'`` when no
        usable declaration is found in the body, or ``None`` when the header
        names a charset unknown to ``cdn`` or the page cannot be loaded.
    """
    pattern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)
    max_lines = 15
    try:
        # The context manager closes the connection on every exit path.
        with urllib.request.urlopen(url) as response:
            # A missing header yields None; treat it as "no charset declared"
            # instead of reporting a download failure.
            header = response.info()['Content-Type'] or ''
            declared = pattern.findall(header)
            if declared:
                return _known_codec(declared[0])

            for line_no, line in enumerate(response, start=1):
                # Lines are bytes; str() gives their repr, which is enough
                # for an ASCII-only pattern.
                found = pattern.findall(str(line))
                if found:
                    codec = _known_codec(found[0])
                    if codec is not None:
                        return codec
                if line_no >= max_lines:
                    break
            return 'utf_8'
    except Exception:
        # Best effort: report the failure and let the caller decide.
        print('Помилка завантаження сторінки:\n', url)
        return None
예제 #2
0
def get_available_charsets():
    """Return sorted ``(name, name)`` pairs for every distinct codec name.

    Names come from the values of ``aliases`` with underscores replaced by
    hyphens; each name appears twice in its tuple.
    """
    names = sorted({codec.replace('_', '-') for codec in aliases.values()})
    return [(name, name) for name in names]
예제 #3
0
def cod_page(f):
    """Return the codec announced by the Content-Type header of response *f*.

    Parameters:
        f: an object whose ``info()`` result can be indexed with
           ``'Content-Type'`` (e.g. the result of ``urlopen``).

    Returns:
        ``'utf_8'`` when the header is missing or carries no ``charset=``
        parameter, the codec name resolved through ``cdn`` when the charset
        is known, and ``None`` when the charset is not listed in ``cdn``.
    """
    # A missing header is returned as None; fall back to an empty string so
    # the search simply finds nothing.
    content_type = f.info()['Content-Type'] or ''
    # The flag is passed as an argument: an inline "(?i)" that is not at the
    # very start of the pattern is an error on Python 3.11+.
    cod_p = re.findall(r'charset=([-\w\d]+)', content_type, re.IGNORECASE)
    if not cod_p:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn.keys():
        return cdn[ch]
    if ch in cdn.values():
        return ch
    return None
예제 #4
0
    def get_encodings(self, filepath):
        """ Prints every codec with which pandas can read a given CSV file

        Each distinct codec name from ``encodings.aliases`` is tried in
        alphabetical order; a line ``successful <codec>`` is printed for each
        one that lets ``pd.read_csv`` parse the file without raising.

        Parameters
        ----------
        filepath: string
            Path to the file to analyse.
        """
        from encodings.aliases import aliases

        for encoding in sorted(set(aliases.values())):
            try:
                pd.read_csv(filepath, encoding=encoding)
            except Exception:
                # Failure is the expected outcome for most codecs; only
                # ordinary errors are ignored so Ctrl-C still interrupts.
                continue
            print('successful', encoding)
예제 #5
0
def write_encodings(filename, line_number, final_encoding):
    """Decode one line of a file with every known codec and save the results.

    The raw bytes of line *line_number* (1-based) of *filename* are decoded
    with each codec in ``encs`` plus every value of ``aliases``.  Every
    successful decoding is written as one row (codec name padded to 20
    columns, then the decoded text) to ``<filename>.encodings``.

    Parameters:
        filename: path of the file to inspect.
        line_number: 1-based number of the line to decode.
        final_encoding: on Python 2, the encoding used to turn the decoded
            text back into bytes before writing; on Python 3 it only appears
            in the failure message.

    Exits the process with status 1 when the file or the requested line
    cannot be read.
    """
    # To ensure that we cover as many as possible encodings,
    # we take the union between our predefined encoding set and the
    # set of the values from the encodings.aliases.aliases.
    encodings = encs.union(set(aliases.values()))

    data = dict()

    # Read line from file
    try:
        if line_number < 1:
            # lines[line_number - 1] would otherwise silently count from the
            # end of the file.
            raise IndexError("line numbers start at 1, got " +
                             str(line_number))
        with io.open(filename, "rb") as f:
            lines = f.readlines()
        line = lines[line_number - 1]
        print("\nProcessing line number: " + str(line_number))
        if len(line) < 3:
            print("!!!Warning!!!: Possible empty line.")
        print("")
    except Exception:
        _, err, _ = sys.exc_info()
        print("Error reading " + filename)
        print(err)
        sys.exit(1)

    # Decode it using every possible encoding
    for enc in encodings:
        try:
            data[enc] = line.decode(enc)
        except Exception:
            print("Cannot decode using " + enc)

    # We write the results in a new text file
    # We use the same filename + an '.encodings' extension
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.encodings'
    print("\nWriting successfully tested encodings in " + newfilename)

    with open(newfilename, 'w') as out:
        c = 0
        for enc in sorted(data.keys()):
            try:
                if (sys.version_info[0] < 3):
                    text = data[enc].encode(final_encoding)
                else:
                    text = data[enc]
                # One write per row: a failing row leaves no dangling codec
                # name behind.  "\n" (not os.linesep) is the correct
                # terminator for a file opened in text mode.
                out.write("%-20s" % enc + text + "\n")
                c += 1
            except Exception:
                print("Cannot encode " + enc + " to " + final_encoding)

    print("\n" + str(c) + " out of " + str(len(encodings)) +
          " tested encodings were written.\n")
예제 #6
0
def get_encodings_list():
    """
    Return the sorted list of distinct codec names known to Python.

    The names are the values of ``aliases``.  If they cannot be obtained, a
    short hard-coded list of common encodings is returned instead.
    """
    try:
        # A set removes duplicates in linear time and needs no ``reduce``.
        return sorted(set(aliases.values()))
    except Exception:
        return ['UTF-8', 'UTF-16', 'CP1251', 'CP866', 'KOI8-R']
예제 #7
0
def find_encoding(file_path):
    """Return the codecs with which pandas can parse *file_path* as CSV.

    Every distinct codec name from ``encodings.aliases`` is tried in
    alphabetical order; the names for which ``pd.read_csv`` does not raise
    are returned as a list.  A successful parse does not prove the codec is
    the right one, only that the bytes are decodable with it.
    """
    from encodings.aliases import aliases

    encodings = []
    for alias in sorted(set(aliases.values())):
        try:
            pd.read_csv(file_path, encoding=alias)
        except Exception:
            # Wrong or non-text codec: skip it.
            continue
        encodings.append(alias)
    return encodings
예제 #8
0
    def train(self):
        '''
        Load the CSV at ``self.path`` and run the interactive pipeline.

        Steps, in order: read the dataset with pandas, display it, clean it
        via ``self.data_clean``, build the working frame (``create_folds``
        when ``self.fold`` is true, ``with_out_kfold`` otherwise), prepare
        the feature columns, split with ``self.train_val_data``, obtain and
        display the scores from ``self.model_`` (sorted by ``best_score``),
        then ask for a file path and pass it to ``self.load_datafile``.

        Returns:
            A "[RESULT] predicted label : ..." string on success.  When any
            step raises, the error is printed and ``None`` is returned
            implicitly.

        NOTE(review): the ``except`` handler ends with ``break`` and the
        ``try`` body ends with ``return``, so the loop body runs at most
        once -- only the first codec yielded by the set is ever tried.
        Confirm whether a retry with the next codec was intended.
        '''
        for encoding in set(aliases.values()):
            try:
                dataframe = pd.read_csv(self.path, encoding=encoding)
                print(f"[SUCCESS] Dataset Loaded Successfully")
                self.display_option(dataframe)
                # First value of the label column; its type selects the
                # branch below.
                label_type = dataframe[self.label][0]
                self.data_clean(df=dataframe)  # Data cleaning null columns
                if self.fold == True:
                    df = self.create_folds(df=dataframe)
                else:
                    df = self.with_out_kfold(df=dataframe)

                if (isinstance(label_type, np.int64)):
                    print("True")
                    # Integer label: the features are used as they are (no
                    # call to scale_data on this path).
                    # NOTE(review): ``inv_label`` is not assigned on this
                    # path, so the final ``return`` would raise NameError,
                    # which the handler below reports -- confirm.
                    std_df = df.drop([self.label], axis=1)
                else:
                    # Non-integer label: the user names an encoder, which is
                    # passed to split_data.
                    data_input = input(
                        f"[INPUT] ENTER THE NAME OF ENCODER BECAUSE LABEL DTYPE IS OBJECT : "
                    )
                    split_df, inv_label = self.split_data(df, data_input)
                    X = split_df.drop([self.label], axis=1)
                    print("False")
                    std_df = self.scale_data(X)
                # Put the label column back next to the prepared features.
                new_scaled_data = pd.concat([df[self.label], std_df], axis=1)
                X_train, X_test, y_train, y_test = self.train_val_data(
                    new_scaled_data)
                scores = self.model_(X_train, X_test, y_train, y_test)
                self.display_option(
                    scores.sort_values(by="best_score",
                                       ascending=False,
                                       ignore_index=True))
                filename = input(f"[INPUT] ENTER THE PATH : ")
                # The existence check only prints a message; load_datafile
                # is called either way.
                if os.path.exists(filename):
                    print(f"[INFO] LOADED SUCCESSFULLY")
                else:
                    print(f"[ERROR] FILE NOT FOUND")
                pred = self.load_datafile(
                    filename, new_scaled_data.drop([self.label], axis=1))
                return f"[RESULT] predicted label : {inv_label[pred]}"
            except Exception as e:
                # Any failure (including a wrong codec) ends the loop.
                print(f"[ERROR] {e}")
                break
def ReadCsvFiles(files, delimiter=',', merge=False):
    '''
    Load one or more CSV files into pandas data frames.

    Each file is first read with the default encoding.  Files that fail are
    retried with every codec name from ``aliases`` (alphabetical order)
    until one succeeds.

    INPUT:
    files: a single file name, or a list/tuple of file names
    delimiter: column separator passed to pandas.read_csv
    merge: accepted for backward compatibility; currently unused

    OUTPUT:
    allsucc: True only if every file could be loaded
    dfs: dict mapping each file name to its data frame (None if it could
         not be loaded)

    RAISES:
    ValueError: if files is None or empty
    '''
    if files is None or len(files) == 0:
        raise ValueError('Fnc "ReadCsvFiles": files is None or empty')
    if isinstance(files, tuple):
        files = list(files)
    elif not isinstance(files, list):
        files = [files]
    dfs = {}
    PrintLine('Start reading files')
    notworked = []
    for file in files:
        try:
            curdf = pd.read_csv(file, delimiter=delimiter)
            dfs[file] = curdf
            print('Dataframe loaded from {}: shape = {}'.format(
                file, curdf.shape))
        except Exception as e:
            print('Could not read file ', file, ': ', str(e))
            notworked.append(file)
            dfs[file] = None

    for file in notworked:
        print('Trying to load file with encodings: ', file)
        for encoding in sorted(set(aliases.values())):
            try:
                # Keep the caller's delimiter on the retry as well.
                curdf = pd.read_csv(file, delimiter=delimiter,
                                    encoding=encoding)
            except Exception:
                continue
            dfs[file] = curdf
            print('Encoding found to load file: ', encoding)
            break

    allsucc = True
    # Iterate over (name, frame) pairs; enumerate() over the dict would only
    # yield indexes and file names, never a None frame.
    for name, frame in dfs.items():
        if frame is None:
            print('File could not be loaded: ', name)
            allsucc = False
    if allsucc:
        log = 'Reading files successfully finished'
    else:
        log = 'Reading files finished with errors'
    PrintLine(log)
    return allsucc, dfs
예제 #10
0
파일: utils.py 프로젝트: vijvijay/rapidapp
def known_encodings():
    """Return every encoding name Python knows, aliases included.

    Alias keys and values from ``encodings.aliases`` are upper-cased, their
    underscores are turned into hyphens, and the sorted result is passed
    through ``unique`` before being returned.
    """
    from encodings.aliases import aliases
    every_name = list(aliases.keys()) + list(aliases.values())
    rendered = sorted(name.upper().replace('_', '-') for name in every_name)
    return unique(rendered)
예제 #11
0
def main():
    '''
    Try to read the input file with every known codec and report, per
    codec, which of the expected terms occur in the decoded text.

    Without expected terms, every codec that decodes the file is reported
    as readable.  Codecs that cannot be used are logged at debug level.
    '''
    logging.basicConfig(level=logging.INFO,
                        format='[%(levelname)8s]: %(message)s')
    source_path = ARGS.infile
    wanted = ARGS.exp
    for codec in list(set(aliases.values())):
        try:
            with open(source_path, 'r', encoding=codec) as handle:
                try:
                    text = handle.read()
                    hits = [term for term in wanted if term in text]
                    misses = [term for term in wanted if term not in text]

                    if not wanted:
                        logging.info(
                            '%s: readable. Use expected terms (--exp) to narrow results.',
                            codec)
                    else:
                        if misses:
                            logging.debug('%s: Missed %s', codec, misses)
                        if hits:
                            logging.info('%s: Found %s', codec, hits)

                except UnicodeError as exception:
                    # Covers UnicodeDecodeError too (it is a subclass).
                    logging.debug('%s: %s', codec, type(exception).__name__)

        except LookupError as exception:
            # Raised for codecs that cannot be used to open a text file.
            logging.debug('%s: %s', codec, type(exception).__name__)
예제 #12
0
 def _get_encoding(self, cr, user, context=None):
     result = [(x, x.replace('_', '-')) for x in set(aliases.values())]
     result.sort()
     return result
예제 #13
0
    def _initUI(self):
        """Build the dialog's widgets and arrange them in a grid layout.

        Grid rows used: 0 file chooser, 1 encoding selection, 2 header
        checkbox, 3 delimiter selection, 4-6 tab widget with the two table
        views, 8 status bar, 9 button box.  Signals are connected to the
        corresponding ``_update*`` / ``_openFile`` slots as the widgets are
        created.
        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: label, editable file name and an "open file" tool button.
        self._filenameLabel = QtGui.QLabel('Choose File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        self._filenameLineEdit.textEdited.connect(self._updateFilename)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-open.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._openFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: encoding selection.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        # Distinct values of ``_encodings``, sorted, then upper-cased.
        # NOTE(review): ``_encodings`` is presumably encodings.aliases.aliases
        # -- confirm against the module imports.
        encoding_names = list(
            [x.upper() for x in sorted(list(set(_encodings.values())))])
        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: header checkbox.
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selection; the widget's current selection becomes
        # the initial delimiter.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)
        self._delimiter = self._delimiterBox.currentSelected()
        self._delimiterBox.delimiter.connect(self._updateDelimiter)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        # Rows 4-6: tabs holding the preview table and the column-type table.
        self._tabWidget = QtGui.QTabWidget(self)
        self._previewTableView = QtGui.QTableView(self)
        self._datatypeTableView = QtGui.QTableView(self)
        self._tabWidget.addTab(self._previewTableView, 'Preview')
        self._tabWidget.addTab(self._datatypeTableView, 'Change Column Types')
        layout.addWidget(self._tabWidget, 4, 0, 3, 4)

        # Column 1 of the type table is rendered/edited by DtypeComboDelegate.
        self._datatypeTableView.horizontalHeader().setDefaultSectionSize(200)
        self._datatypeTableView.setItemDelegateForColumn(
            1, DtypeComboDelegate(self._datatypeTableView))

        self._loadButton = QtGui.QPushButton('Load Data', self)
        #self.loadButton.setAutoDefault(False)

        self._cancelButton = QtGui.QPushButton('Cancel', self)
        # self.cancelButton.setDefault(False)
        # self.cancelButton.setAutoDefault(True)

        # Row 9: button box; its accepted/rejected signals are forwarded to
        # this object's ``accepted`` / ``rejected``.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._loadButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)
        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)
        layout.addWidget(self._buttonBox, 9, 2, 1, 2)
        self._loadButton.setDefault(False)
        self._filenameLineEdit.setFocus()

        # Row 8: status bar without a size grip.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        # Checked only after ``toggled`` has been connected above.
        # NOTE(review): this presumably invokes _updateHeader once during
        # construction -- confirm that is intended.
        self._headerCheckBox.setChecked(True)
        layout.addWidget(self._statusBar, 8, 0, 1, 4)
        self.setLayout(layout)
예제 #14
0
    def _initUI(self):
        """Build the export dialog's widgets and arrange them in a grid layout.

        Grid rows used: 0 output file chooser, 1 encoding selection
        (preselected to ``UTF_8``), 2 header checkbox, 3 delimiter
        selection, 4 status bar, 5 button box.
        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: label, file name field and a "save as" tool button.
        self._filenameLabel = QtGui.QLabel('Output File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-save-as.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._createFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: encoding selection.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        # Distinct values of ``_encodings``, sorted, then upper-cased.
        # NOTE(review): ``_encodings`` is presumably encodings.aliases.aliases
        # -- confirm against the module imports.
        encoding_names = list(
            map(lambda x: x.upper(), sorted(list(set(_encodings.values())))))

        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        # Preselect UTF_8; list.index raises ValueError if the name is absent.
        self._idx = encoding_names.index('UTF_8')
        self._encodingComboBox.setCurrentIndex(self._idx)
        #self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: header checkbox (no signal connected here).
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        #self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selection.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        self._exportButton = QtGui.QPushButton('Export Data', self)
        self._cancelButton = QtGui.QPushButton('Cancel', self)

        # Row 5: button box; its accepted/rejected signals are forwarded to
        # this object's ``accepted`` / ``rejected``.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._exportButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)

        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)

        layout.addWidget(self._buttonBox, 5, 2, 1, 2)
        self._exportButton.setDefault(False)
        self._filenameLineEdit.setFocus()

        # Row 4: status bar without a size grip.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        layout.addWidget(self._statusBar, 4, 0, 1, 4)
        self.setLayout(layout)
예제 #15
0
# Words that mark a Unicode range name as a secondary range.
UNICODE_SECONDARY_RANGE_KEYWORD = [
    'Supplement',
    'Extended',
    'Extensions',
    'Modifier',
    'Marks',
    'Punctuation',
    'Symbols',
    'Forms',
    'Operators',
    'Miscellaneous',
    'Drawing',
    'Block',
    'Shapes',
    'Supplemental',
    'Tags',
]  # type: List[str]

# Captures the name following an "encoding" / "charset" / "coding" keyword.
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
    IGNORECASE)

# Sorted distinct codec names, minus "*_codec" entries and three exclusions.
IANA_SUPPORTED = sorted(
    name for name in set(aliases.values())
    if not name.endswith("_codec")
    and name not in {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
예제 #16
0
# Run this code cell to install and import the pycountry library
#!pip install pycountry
from pycountry import countries
import pandas as pd

# Run this code cell to see an example of how the library works
countries.get(name='Spain')

# Run this code cell to see how you can also look up countries without specifying the key
countries.lookup('Kingdom of Spain')

# Encoding detection
from encodings.aliases import aliases

alias_values = set(aliases.values())

# Find the encodings that work for the file. Codecs are tried in
# alphabetical order so the output is reproducible; only ordinary errors are
# ignored, so the loop can still be interrupted.
for encoding in sorted(alias_values):
    try:
        df = pd.read_csv("mystery.csv", encoding=encoding)
    except Exception:
        continue
    print('successful', encoding)

# Fill null values
# Fill with the mean of each group (here: per country)

df_melt = pd.read_csv('gdp_data.csv')
df_melt['GDP_filled'] = df_melt.groupby('Country Name')['GDP'].transform(
    lambda x: x.fillna(x.mean()))
예제 #17
0
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
"""

import sys
import re
from encodings.aliases import aliases

from splitcode import header, footer

# Captures the charset value declared inside a <meta ...> tag.
charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)")

# Every alias and every canonical codec name, lower-cased.
available_encodings = {name.lower() for name in aliases}
available_encodings.update(name.lower() for name in aliases.values())

# Locate the invalid positions reported in a UnicodeError message:
# either a "position A-B" interval or a single "position N:".
position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)")
position_pattern = re.compile(r"position ([0-9]*):")

def test_encoding(t, enc, stop_at=None):
    """
    tests a "t" text decoding with enc and returns how many decode errors
    occured in the whole text
    """
    c = 0
    while True:
        try:
            t = t.decode(enc)
            break
        except LookupError:
예제 #18
0
파일: text.py 프로젝트: bitesofcode/projex
    'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has',
    'i', 'if', 'in', 'is', 'it',
    'just',
    'like',
    'man', 'may', 'more', 'most', 'my',
    'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over',
    'say', 'see', 'she', 'should', "shouldn't", 'so',
    'than', 'that', 'the', 'then', 'there', 'they', 'this', 'to',
    'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'wouldn', "won't",
    'you'
}

DEFAULT_ENCODING = 'utf-8'
# Distinct codec names known to Python, in alphabetical order.
SUPPORTED_ENCODINGS = sorted(set(aliases.values()))

# Expressions compiled once at import time.
EXPR_UPPERCASE = re.compile('^[A-Z]+$')   # upper-case letters only
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')  # upper-case letters and digits only
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')  # one alphanumeric run
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    # HTMLParser subclass; only the constructor is visible in this excerpt.
    # NOTE(review): judging by the name it is presumably used to strip markup
    # from text -- confirm against the handler methods.
    def __init__(self):
        # The base class is initialised through an explicit call rather
        # than super().
        HTMLParser.__init__(self)
        self.reset()
        # Starts out empty.  NOTE(review): presumably collects raw text
        # fragments seen while parsing -- confirm.
        self._raw = []
예제 #19
0
def print_encodings():
    """Print every distinct codec name except ``*_codec`` ones, sorted, one per line."""
    names = {name for name in aliases.values() if not name.endswith("_codec")}
    for name in sorted(names):
        print(name)
#!/usr/bin/env python3

from sys import argv, stderr, stdout, exit
import argparse
from encodings.aliases import aliases
from codecs import encode as cencode
from pathlib import Path

# Distinct codec names, without rot_13 and the base* codecs.
# The list is built in one pass; removing items from a list while iterating
# over it skips the element that follows each removal.
ENCODINGS = [
    name for name in set(aliases.values())
    if name != 'rot_13' and not name.startswith('base')
]


def encode(s, encoding='utf-16'):
    """Encode *s* with *encoding* and return every byte URI-encoded.

    Each resulting byte is rendered as ``%xx`` (two lower-case hex digits).
    Bytes-to-bytes codecs (such as ``hex_codec``) reject ``str`` input; in
    that case the string is first converted to UTF-8 bytes and the codec is
    applied to those.

    Returns:
        The ``%xx`` string, or ``None`` when the codec is unknown or the
        encoding fails (a failure other than an unknown codec is reported on
        stderr).
    """
    try:
        try:
            raw = cencode(s, encoding)
        except TypeError:
            # Comparing the exception text is fragile (it never contains the
            # "TypeError: " prefix); decide on the input type instead.
            if not isinstance(s, str):
                raise
            raw = cencode(s.encode('utf-8'), encoding)
        return ''.join(['%{:0>2x}'.format(b) for b in raw])
    except LookupError:
        return None
    except Exception as e:
        print(f'[+] Failed encoding for: {encoding}', file=stderr)
        print(f'Error Message: {e}', file=stderr)
        return None
예제 #21
0
from encodings.aliases import aliases

# Distinct codec names known to Python.
encoding_list = list({*aliases.values()})


def main():
    """Ask for a garbled string and print every encode/decode round trip that works.

    For each ordered pair of codecs the text is encoded with the first and
    decoded with the second; pairs that raise are skipped silently.
    """
    garbled = input("需要恢复的字符串:")
    for enc_from in encoding_list:
        for enc_to in encoding_list:
            try:
                raw = garbled.encode(encoding=enc_from)
                recovered = raw.decode(encoding=enc_to)
                # Printing stays inside the try: output errors are skipped too.
                print(f'{enc_from}->{enc_to}:{recovered}')
            except Exception:
                continue


if __name__ == '__main__':
    main()
예제 #22
0
def check_if_encoding_exist(encoding):
    """Return True when *encoding* is an alias or a canonical codec name in ``aliases``."""
    if encoding in aliases:
        return True
    return encoding in aliases.values()
예제 #23
0
    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]  # type: List[str]

# Captures the name following an "encoding" / "charset" / "coding" keyword.
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)

# Sorted distinct codec names, minus "*_codec" entries and three exclusions.
IANA_SUPPORTED = sorted(
    name for name in set(aliases.values())
    if not name.endswith("_codec")
    and name not in {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
예제 #24
0
# -*- coding: utf-8 -*-
from encodings.aliases import aliases
import nkf

# Every alias and every canonical codec name Python knows.
all_encodings = set(aliases) | set(aliases.values())


def normalize_encoding(encoding):
    """Lower-case *encoding* and map the Shift-JIS spellings to ``cp932``."""
    lowered = encoding.lower()
    shift_jis_names = ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis',
                       'sjis')
    return 'cp932' if lowered in shift_jis_names else lowered


def decode(text, encoding=None, *args):
    """Decode the byte string *text*, guessing the encoding when needed.

    With no encoding (or ISO-8859-1) the encoding is guessed with ``nkf``;
    a guess of BINARY or ISO-8859-1 is replaced by utf8.  If the normalized
    name is not a known Python codec, ``nkf`` converts the text to UTF-8
    first.  Extra positional arguments are passed on to ``text.decode``.
    """
    needs_guess = not encoding or encoding in ('ISO-8859-1', 'iso-8859-1')
    if needs_guess:
        guessed = nkf.guess(text)
        encoding = 'utf8' if guessed in ('BINARY', 'ISO-8859-1') else guessed
    codec = normalize_encoding(encoding)
    if codec in all_encodings:
        return text.decode(codec, *args)
    return nkf.nkf('-w', text).decode('utf8')
예제 #25
0
# In[4]:

# TODO: Figure out what the encoding of the mystery.csv file is
# HINT: pd.read_csv('mystery.csv', encoding=?) where ? is the string for an encoding like 'ascii'
# HINT: This link has a list of encodings that Python recognizes https://docs.python.org/3/library/codecs.html#standard-encodings

# Python has a file containing a dictionary of encoding names and associated aliases
# This line imports the dictionary and then creates a set of all available encodings
# You can use this set of encodings to search for the correct encoding
# If you'd like to see what this file looks like, execute the following Python code to see where the file is located
#    from encodings import aliases
#    aliases.__file__

from encodings.aliases import aliases

alias_values = set(aliases.values())

# TODO: iterate through the alias_values list trying out the different encodings to see which one or ones work
# HINT: Use a try - except statement. Otherwise your code will produce an error when reading in the csv file
#       with the wrong encoding.
# HINT: In the try statement, print out the encoding name so that you know which one(s) worked.

# Try every codec; only ordinary errors count as "failed" so the loop can
# still be interrupted from the keyboard.
for encoding in alias_values:
    try:
        pd.read_csv('mystery.csv', encoding=encoding)
        print("Successfully read the csv with encoding of ", encoding)
    except Exception:
        print("Failed: Encoding of ", encoding)

# # Conclusion
#
예제 #26
0
파일: text.py 프로젝트: ottochiu/projex
# Literal names mapped to the Python constants they stand for.
CONSTANT_EVALS = {'true': True, 'false': False, 'null': None}

# Frequently used English words.
# Every entry needs its own comma: two adjacent string literals are silently
# concatenated (a missing comma used to turn 'by' and 'can' into 'bycan').
COMMON_TERMS = {
    'a', 'about', 'all', 'and', 'are', 'as', 'at', 'be', 'but', 'by',
    'can', 'cannot', 'could', "couldn't", 'do', 'did', "didn't", 'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has', 'i', 'if', 'in', 'is', 'it',
    'just', 'like', 'man', 'may', 'more', 'most', 'my', 'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over', 'say', 'see', 'she', 'should',
    "shouldn't", 'so', 'than', 'that', 'the', 'then', 'there', 'they', 'this',
    'to', 'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will',
    'with', 'would', 'wouldn', "won't", 'you',
}

DEFAULT_ENCODING = 'utf-8'
# Distinct codec names known to Python, in alphabetical order.
SUPPORTED_ENCODINGS = sorted(set(aliases.values()))

# Expressions compiled once at import time.
EXPR_UPPERCASE = re.compile('^[A-Z]+$')   # upper-case letters only
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')  # upper-case letters and digits only
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')  # one alphanumeric run
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    # HTMLParser subclass; only the constructor is visible in this excerpt.
    # NOTE(review): judging by the name it is presumably used to strip markup
    # from text -- confirm against the handler methods.
    def __init__(self):
        # The base class is initialised through an explicit call rather
        # than super().
        HTMLParser.__init__(self)
        self.reset()
        # Starts out empty.  NOTE(review): presumably collects raw text
        # fragments seen while parsing -- confirm.
        self._raw = []
예제 #27
0
import re
from pathlib import Path
from encodings.aliases import aliases

# regex list
# Patterns for the markup being processed; DOTALL lets "." span line breaks.
re_p_content = re.compile(r"<P>.*?</P>", re.DOTALL)  # a whole <P> element
re_a_content = re.compile(r"<A.*?</A>", re.DOTALL)   # a whole <A> element
re_b_content = re.compile(r"<B>.*?</B>", re.DOTALL)  # a whole <B> element
re_tag = re.compile(r"</?.*?>", re.DOTALL)           # any single tag
re_empty_lines = re.compile(r"\n\s*\n")              # blank-line run
re_parenth = re.compile(r"[{(].*?[})]")              # (...) or {...} span

# Codecs tried first, followed by every remaining codec Python knows.
heb_encodings = ['utf-8', 'cp1255', 'iso8859_8', 'cp424', 'cp856', 'cp862']
other_encodings = set(aliases.values()).difference(heb_encodings)


def read_heb_file(file_path):
    """Read *file_path*, trying ``heb_encodings`` before ``other_encodings``.

    Returns whatever ``read_file`` yields for the first group when that is
    truthy, otherwise its result for the second group.
    """
    return (read_file(file_path, heb_encodings)
            or read_file(file_path, other_encodings))


def read_file(file_path, enc_list):