def get_coding(url):
    """Detect the character encoding of the web page at *url*.

    The ``charset=`` value is searched for first in the Content-Type
    header and then in the first 15 lines of the body.  The raw name is
    lower-cased, '-' is replaced with '_' and the result is mapped
    through the module-level ``cdn`` dict.

    Returns a codec name, 'utf_8' as a fallback, or None when the page
    cannot be downloaded (or the header charset is unknown, matching the
    original behaviour).
    """
    pattern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)

    def _resolve(raw):
        # Normalise the raw charset token and map it through cdn;
        # returns None when the name is unknown to cdn.
        ch = raw.lower().replace('-', '_')
        if ch in cdn:
            return cdn[ch]
        if ch in cdn.values():
            return ch
        return None

    try:
        response = urllib.request.urlopen(url)
        header_match = pattern.findall(response.info()['Content-Type'])
        if header_match:
            # Header branch: the original returned None for charsets
            # unknown to cdn, which _resolve preserves.
            return _resolve(header_match[0])
        # No charset in the header: scan at most 15 body lines for one.
        for lineno, line in enumerate(response, start=1):
            body_match = pattern.findall(str(line))
            if body_match:
                resolved = _resolve(body_match[0])
                return resolved if resolved is not None else 'utf_8'
            if lineno >= 15:
                break
        return 'utf_8'
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; download/parse errors still fall here.
        print('Помилка завантаження сторінки:\n', url)
        return None
def get_available_charsets():
    """Return sorted ``(name, name)`` selection tuples for every charset
    known to Python, with underscores shown as dashes."""
    names = {alias.replace('_', '-') for alias in aliases.values()}
    return [(name, name) for name in sorted(names)]
def cod_page(f):
    """Return the codec name for an HTTP response object *f*.

    Extracts ``charset=`` from the Content-Type header, normalises it
    (lower case, '-' -> '_') and maps it through the module-level
    ``cdn`` dict.  Falls back to 'utf_8' when the header carries no
    charset; returns None for a charset unknown to ``cdn``.
    """
    # BUG FIX: the original pattern ended with an inline '(?i)'; global
    # inline flags not at the start of a pattern raise re.error on
    # Python 3.11+, so pass re.IGNORECASE explicitly instead.
    cod_p = re.findall(r'charset=([-\w\d]+)', f.info()['Content-Type'],
                       re.IGNORECASE)
    if len(cod_p) == 0:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn.keys():
        return cdn[ch]
    if ch in cdn.values():
        return ch
def get_encodings(self, filepath):
    """
    Prints (and returns) the encodings that can read a given file.

    Parameters
    ----------
    filepath: string
        Path to the file to analyse.

    Returns
    -------
    list of str
        Codec names that successfully parsed the file as CSV.
        (The original printed them and returned None; returning the
        list is backward compatible for callers that ignored it.)
    """
    from encodings.aliases import aliases
    successful = []
    for encoding in set(aliases.values()):
        try:
            pd.read_csv(filepath, encoding=encoding)
        except Exception:
            # Codec failed to decode the file, or is unavailable on this
            # platform (e.g. 'mbcs' off Windows raises LookupError).
            continue
        print('successful', encoding)
        successful.append(encoding)
    return successful
def write_encodings(filename, line_number, final_encoding):
    """Decode one line of *filename* with every known encoding and log results.

    The line at 1-based *line_number* is read as raw bytes and decoded with
    every codec in ``encs`` union ``aliases.values()``.  Every successful
    decoding is written, prefixed by the codec name, to a new file named
    ``<abs path of filename>.encodings``.  Exits the process with status 1
    when the input file cannot be read.

    NOTE(review): *final_encoding* is only used on Python 2 (the
    ``sys.version_info`` branch below); on Python 3 the decoded text is
    written with the platform default encoding of ``open(..., 'w')`` —
    confirm whether that is intended.
    """
    # To ensure that we cover as many as possible encodings,
    # we take the union between our predefined encoding set and the
    # set of the values from the encodings.aliases.aliases.
    encodings = encs.union(set(aliases.values()))
    data = dict()
    # Read line from file (binary mode: we want the raw bytes to decode).
    try:
        with io.open(filename, "rb") as f:
            lines = f.readlines()
            line = lines[line_number - 1]
            print("\nProcessing line number: " + str(line_number))
            if len(line) < 3:
                print("!!!Warning!!!: Possible empty line.")
                print("")
    except Exception:
        _, err, _ = sys.exc_info()
        print("Error reading " + filename)
        print(err)
        sys.exit(1)
    # Decode it using every possible encoding; failures are reported and
    # simply skipped.
    for enc in encodings:
        try:
            data[enc] = line.decode(enc)
        except Exception:
            _, err, _ = sys.exc_info()
            print("Cannot decode using " + enc)
            # print(err)
    # We write the results in a new utf-8 text file
    # We use the same filename + an '.encodings' extension
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.encodings'
    print("\nWriting successfully tested encodings in " + newfilename)
    with open(newfilename, 'w') as out:
        c = 0  # count of encodings actually written
        for enc in sorted(data.keys()):
            try:
                # Codec name padded to a fixed-width column.
                out.write("%-20s" % enc)
                if (sys.version_info[0] < 3):
                    # Python 2: must encode unicode back to bytes.
                    line = data[enc].encode(final_encoding)
                else:
                    line = data[enc]
                out.write(line)
                out.write(os.linesep)
                c += 1
            except Exception:
                _, err, _ = sys.exc_info()
                print("Cannot encode " + enc + " to " + final_encoding)
                # print(err)
    print("\n" + str(c) + " out of " + str(len(encodings)) +
          " tested encodings were written.\n")
def get_encodings_list():
    """Return a sorted list of the unique codec names known to Python.

    Falls back to a small list of common (mostly Cyrillic) encodings if
    the alias table is unavailable for any reason.
    """
    try:
        # The original deduplicated with an O(n^2) reduce(); a set does
        # the same job in O(n) and sorted() returns the same result.
        return sorted(set(aliases.values()))
    except Exception:
        return ['UTF-8', 'UTF-16', 'CP1251', 'CP866', 'KOI8-R']
def find_encoding(file_path):
    """Return every codec name that can successfully read *file_path* as CSV.

    Requires Python 3.6 or higher.
    """
    from encodings.aliases import aliases
    encodings = []
    for alias in set(aliases.values()):
        try:
            pd.read_csv(file_path, encoding=alias)
        except Exception:
            # BUG FIX: the original had `except: UnicodeDecodeError`,
            # a bare except followed by a no-op expression.  Exception
            # still covers decode errors and missing codecs (LookupError)
            # without swallowing SystemExit/KeyboardInterrupt.
            continue
        encodings.append(alias)
    return encodings
def train(self):
    '''
    load_data_folder: give the path of the folder
    csv : return a csv file

    Full training pipeline driven off ``self.path``: tries every known
    text encoding until pandas can load the CSV, cleans/folds/scales the
    data, ranks models, then interactively asks for a file to predict on.
    Returns a result string on success; prints the error and stops after
    the first failing encoding attempt otherwise.
    '''
    # Try each codec until one can read the dataset; the surrounding
    # try/except plus `break` means only the first attempt's failure is
    # reported (the loop does not keep probing other encodings).
    for encoding in set(aliases.values()):
        try:
            dataframe = pd.read_csv(self.path, encoding=encoding)
            print(f"[SUCCESS] Dataset Loaded Successfully")
            self.display_option(dataframe)
            # First value of the label column decides numeric vs. object
            # handling below.
            label_type = dataframe[self.label][0]
            self.data_clean(df=dataframe)  # Data cleaning null columns
            if self.fold == True:
                df = self.create_folds(df=dataframe)
            else:
                df = self.with_out_kfold(df=dataframe)
            if (isinstance(label_type, np.int64)):
                print("True")
                std_df = df.drop([self.label], axis=1)
            else:
                # Object-typed label: ask the user which encoder to apply.
                data_input = input(
                    f"[INPUT] ENTER THE NAME OF ENCODER BECAUSE LABEL DTYPE IS OBJECT : "
                )
                split_df, inv_label = self.split_data(df, data_input)
                X = split_df.drop([self.label], axis=1)
                print("False")
                std_df = self.scale_data(X)
            # Reattach the label to the scaled features.
            new_scaled_data = pd.concat([df[self.label], std_df], axis=1)
            X_train, X_test, y_train, y_test = self.train_val_data(
                new_scaled_data)
            scores = self.model_(X_train, X_test, y_train, y_test)
            self.display_option(
                scores.sort_values(by="best_score",
                                   ascending=False,
                                   ignore_index=True))
            filename = input(f"[INPUT] ENTER THE PATH : ")
            if os.path.exists(filename):
                print(f"[INFO] LOADED SUCCESSFULLY")
            else:
                print(f"[ERROR] FILE NOT FOUND")
            pred = self.load_datafile(
                filename, new_scaled_data.drop([self.label], axis=1))
            # NOTE(review): in the numeric-label branch `inv_label` is never
            # assigned, so this line raises NameError there and lands in the
            # except below — verify intended behaviour.
            return f"[RESULT] predicted label : {inv_label[pred]}"
        except Exception as e:
            print(f"[ERROR] {e}")
            break
def ReadCsvFiles(files, delimiter=',', merge=False):
    '''
    Read a list of CSV files into DataFrames, retrying with every known
    text encoding for files pandas cannot read with the default one.

    INPUT:
        files: list of file names (a single name is wrapped in a list)
        delimiter: column separator passed to pandas on the first attempt
        merge: unused, kept for interface compatibility
    OUTPUT:
        (allsucc, dfs): True when every file loaded, and a dict mapping
        file name -> DataFrame (or None when loading failed entirely)
    '''
    # BUG FIX: the original tested `len(files) < 0`, which is never true,
    # so an empty list slipped through the guard unnoticed.
    if files is None or len(files) == 0:
        raise ValueError('Fnc "ReadCsvFiles": files is None or empty')
    if type(files) is not list:
        files = [files]
    dfs = {}
    PrintLine('Start reading files')
    notworked = []
    for file in files:
        try:
            curdf = pd.read_csv(file, delimiter=delimiter)
            dfs[file] = curdf
            print('Dataframe loaded from {}: shape = {}'.format(
                file, curdf.shape))
        except Exception as e:
            print('Could not read file ', file, ': ', str(e))
            notworked.append(file)
            dfs[file] = None
    log = 'Reading files successfully finished'
    if len(notworked) > 0:
        # Second pass: brute-force every known codec for the failures.
        for file in notworked:
            print('Trying to load file with encodings: ', file)
            for encoding in set(aliases.values()):
                try:
                    curdf = pd.read_csv(file, encoding=encoding)
                    dfs[file] = curdf
                    print('Encoding found to load file: ', encoding)
                    break
                except Exception:
                    # This codec failed too; try the next one.
                    pass
    allsucc = True
    # BUG FIX: the original iterated `enumerate(dfs)`, which yields
    # (index, key) pairs, so `val` was always a file-name string and the
    # None check below could never fire.
    for key, val in dfs.items():
        if val is None:
            print('File could not be loaded: ', key)
            allsucc = False
    PrintLine(log)
    return allsucc, dfs
def known_encodings():
    """\
    Render a list of all-known-to-Python character encodings
    (including all known aliases), upper-cased with '-' separators.
    """
    from encodings.aliases import aliases
    # Collect alias names and canonical names, then normalise each to
    # its display form (upper case, dashes instead of underscores).
    raw_names = list(aliases.keys()) + list(aliases.values())
    raw_names.sort()
    display_names = [name.upper().replace('_', '-') for name in raw_names]
    display_names.sort()
    return unique(display_names)
def main():
    '''
    parses available encoding types and checks if expected terms are
    present as such when decoded
    '''
    logging.basicConfig(level=logging.INFO,
                        format='[%(levelname)8s]: %(message)s')
    # Input file and expected search terms come from module-level ARGS
    # (argparse result populated elsewhere in this file).
    infile = ARGS.infile
    expected_words = ARGS.exp
    available_encs = list(set(aliases.values()))
    for enc in available_encs:
        try:
            with open(infile, 'r', encoding=enc) as inp:
                try:
                    contents = inp.read()
                    # Per-encoding buckets of matched / unmatched terms.
                    found, missed = defaultdict(list), defaultdict(list)
                    for word in expected_words:
                        if word in contents:
                            found[enc].append(word)
                        else:
                            missed[enc].append(word)
                    if expected_words:
                        if missed[enc]:
                            logging.debug('%s: Missed %s', enc, missed[enc])
                        if found[enc]:
                            logging.info('%s: Found %s', enc, found[enc])
                    else:
                        # No terms supplied: only report readability.
                        logging.info(
                            '%s: readable. Use expected terms (--exp) to narrow results.',
                            enc)
                except (UnicodeError, UnicodeDecodeError) as exception:
                    # File opened but could not be decoded with this codec.
                    logging.debug('%s: %s', enc, type(exception).__name__)
        except LookupError as exception:
            # Codec name not available on this platform (e.g. 'mbcs').
            logging.debug('%s: %s', enc, type(exception).__name__)
    return
def _get_encoding(self, cr, user, context=None): result = [(x, x.replace('_', '-')) for x in set(aliases.values())] result.sort() return result
def _initUI(self):
    """Initiates the user interface with a grid layout and several
    widgets.
    """
    self.setModal(self._modal)
    self.setWindowTitle(self._windowTitle)
    layout = QtGui.QGridLayout()
    # Row 0: file-name entry plus a "choose file" tool button.
    self._filenameLabel = QtGui.QLabel('Choose File', self)
    self._filenameLineEdit = QtGui.QLineEdit(self)
    self._filenameLineEdit.textEdited.connect(self._updateFilename)
    chooseFileButtonIcon = QtGui.QIcon(
        QtGui.QPixmap(':/icons/document-open.png'))
    self._chooseFileAction = QtGui.QAction(self)
    self._chooseFileAction.setIcon(chooseFileButtonIcon)
    self._chooseFileAction.triggered.connect(self._openFile)
    self._chooseFileButton = QtGui.QToolButton(self)
    self._chooseFileButton.setDefaultAction(self._chooseFileAction)
    layout.addWidget(self._filenameLabel, 0, 0)
    layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
    layout.addWidget(self._chooseFileButton, 0, 3)
    # Row 1: encoding combo box, filled with every codec name known to
    # Python (upper-cased for display).
    self._encodingLabel = QtGui.QLabel('File Encoding', self)
    encoding_names = list(
        [x.upper() for x in sorted(list(set(_encodings.values())))])
    self._encodingComboBox = QtGui.QComboBox(self)
    self._encodingComboBox.addItems(encoding_names)
    self._encodingComboBox.activated.connect(self._updateEncoding)
    layout.addWidget(self._encodingLabel, 1, 0)
    layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)
    # Row 2: "has header" checkbox.
    self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
    self._headerCheckBox = QtGui.QCheckBox(self)
    self._headerCheckBox.toggled.connect(self._updateHeader)
    layout.addWidget(self._hasHeaderLabel, 2, 0)
    layout.addWidget(self._headerCheckBox, 2, 1)
    # Row 3: column-delimiter selector.
    self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
    self._delimiterBox = DelimiterSelectionWidget(self)
    self._delimiter = self._delimiterBox.currentSelected()
    self._delimiterBox.delimiter.connect(self._updateDelimiter)
    layout.addWidget(self._delimiterLabel, 3, 0)
    layout.addWidget(self._delimiterBox, 3, 1, 1, 3)
    # Rows 4-6: tabbed preview / column-type tables.
    self._tabWidget = QtGui.QTabWidget(self)
    self._previewTableView = QtGui.QTableView(self)
    self._datatypeTableView = QtGui.QTableView(self)
    self._tabWidget.addTab(self._previewTableView, 'Preview')
    self._tabWidget.addTab(self._datatypeTableView, 'Change Column Types')
    layout.addWidget(self._tabWidget, 4, 0, 3, 4)
    self._datatypeTableView.horizontalHeader().setDefaultSectionSize(200)
    # Column 1 of the dtype table gets a combo-box editor delegate.
    self._datatypeTableView.setItemDelegateForColumn(
        1, DtypeComboDelegate(self._datatypeTableView))
    # Row 9: Load / Cancel button box.
    self._loadButton = QtGui.QPushButton('Load Data', self)
    #self.loadButton.setAutoDefault(False)
    self._cancelButton = QtGui.QPushButton('Cancel', self)
    # self.cancelButton.setDefault(False)
    # self.cancelButton.setAutoDefault(True)
    self._buttonBox = QtGui.QDialogButtonBox(self)
    self._buttonBox.addButton(self._loadButton,
                              QtGui.QDialogButtonBox.AcceptRole)
    self._buttonBox.addButton(self._cancelButton,
                              QtGui.QDialogButtonBox.RejectRole)
    self._buttonBox.accepted.connect(self.accepted)
    self._buttonBox.rejected.connect(self.rejected)
    layout.addWidget(self._buttonBox, 9, 2, 1, 2)
    self._loadButton.setDefault(False)
    self._filenameLineEdit.setFocus()
    # Row 8: status bar (no size grip — this is a dialog).
    self._statusBar = QtGui.QStatusBar(self)
    self._statusBar.setSizeGripEnabled(False)
    self._headerCheckBox.setChecked(True)
    layout.addWidget(self._statusBar, 8, 0, 1, 4)
    self.setLayout(layout)
def _initUI(self):
    """Initiates the user interface with a grid layout and several
    widgets.
    """
    self.setModal(self._modal)
    self.setWindowTitle(self._windowTitle)
    layout = QtGui.QGridLayout()
    # Row 0: output-file entry plus a "save as" tool button.
    self._filenameLabel = QtGui.QLabel('Output File', self)
    self._filenameLineEdit = QtGui.QLineEdit(self)
    chooseFileButtonIcon = QtGui.QIcon(
        QtGui.QPixmap(':/icons/document-save-as.png'))
    self._chooseFileAction = QtGui.QAction(self)
    self._chooseFileAction.setIcon(chooseFileButtonIcon)
    self._chooseFileAction.triggered.connect(self._createFile)
    self._chooseFileButton = QtGui.QToolButton(self)
    self._chooseFileButton.setDefaultAction(self._chooseFileAction)
    layout.addWidget(self._filenameLabel, 0, 0)
    layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
    layout.addWidget(self._chooseFileButton, 0, 3)
    # Row 1: encoding combo box, defaulting to UTF_8.
    self._encodingLabel = QtGui.QLabel('File Encoding', self)
    encoding_names = list(
        map(lambda x: x.upper(), sorted(list(set(_encodings.values())))))
    self._encodingComboBox = QtGui.QComboBox(self)
    self._encodingComboBox.addItems(encoding_names)
    self._idx = encoding_names.index('UTF_8')
    self._encodingComboBox.setCurrentIndex(self._idx)
    #self._encodingComboBox.activated.connect(self._updateEncoding)
    layout.addWidget(self._encodingLabel, 1, 0)
    layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)
    # Row 2: "has header" checkbox.
    self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
    self._headerCheckBox = QtGui.QCheckBox(self)
    #self._headerCheckBox.toggled.connect(self._updateHeader)
    layout.addWidget(self._hasHeaderLabel, 2, 0)
    layout.addWidget(self._headerCheckBox, 2, 1)
    # Row 3: column-delimiter selector.
    self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
    self._delimiterBox = DelimiterSelectionWidget(self)
    layout.addWidget(self._delimiterLabel, 3, 0)
    layout.addWidget(self._delimiterBox, 3, 1, 1, 3)
    # Row 5: Export / Cancel button box.
    self._exportButton = QtGui.QPushButton('Export Data', self)
    self._cancelButton = QtGui.QPushButton('Cancel', self)
    self._buttonBox = QtGui.QDialogButtonBox(self)
    self._buttonBox.addButton(self._exportButton,
                              QtGui.QDialogButtonBox.AcceptRole)
    self._buttonBox.addButton(self._cancelButton,
                              QtGui.QDialogButtonBox.RejectRole)
    self._buttonBox.accepted.connect(self.accepted)
    self._buttonBox.rejected.connect(self.rejected)
    layout.addWidget(self._buttonBox, 5, 2, 1, 2)
    self._exportButton.setDefault(False)
    self._filenameLineEdit.setFocus()
    # Row 4: status bar (no size grip — this is a dialog).
    self._statusBar = QtGui.QStatusBar(self)
    self._statusBar.setSizeGripEnabled(False)
    layout.addWidget(self._statusBar, 4, 0, 1, 4)
    self.setLayout(layout)
UNICODE_SECONDARY_RANGE_KEYWORD = [ 'Supplement', 'Extended', 'Extensions', 'Modifier', 'Marks', 'Punctuation', 'Symbols', 'Forms', 'Operators', 'Miscellaneous', 'Drawing', 'Block', 'Shapes', 'Supplemental', 'Tags' ] # type: List[str] RE_POSSIBLE_ENCODING_INDICATION = re_compile( r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)', IGNORECASE) IANA_SUPPORTED = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, list(set(aliases.values())))) # type: List[str] IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int # pre-computed code page that are similar using the function cp_similarity. IANA_SUPPORTED_SIMILAR = { "cp037": ["cp1026", "cp1140", "cp273", "cp500"], "cp1026": ["cp037", "cp1140", "cp273", "cp500"], "cp1125": ["cp866"], "cp1140": ["cp037", "cp1026", "cp273", "cp500"], "cp1250": ["iso8859_2"], "cp1251": ["kz1048", "ptcp154"], "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"], "cp1253": ["iso8859_7"], "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"], "cp1257": ["iso8859_13"],
# Run this code cell to install and import the pycountry library #!pip install pycountry from pycountry import countries import pandas as pd # Run this code cell to see an example of how the library works countries.get(name='Spain') # Run this code cell to see how you can also look up countries without specifying the key countries.lookup('Kingdom of Spain') #encoding from encodings.aliases import aliases alias_values = set(aliases.values()) # This code finds the encodings that works for the file for encoding in set(aliases.values()): try: df = pd.read_csv("mystery.csv", encoding=encoding) print('successful', encoding) except: pass # Fill null # Fill with mean of a group df_melt = pd.read_csv('gdp_data.csv') df_melt['GDP_filled'] = df_melt.groupby('Country Name')['GDP'].transform( lambda x: x.fillna(x.mean()))
Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA """ import sys import re from encodings.aliases import aliases from splitcode import header, footer charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)") available_encodings = set((_.lower() for _ in aliases.keys())) available_encodings |= set((_.lower() for _ in aliases.values())) # for detect invalid positions in UnicodeError message position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)") position_pattern = re.compile(r"position ([0-9]*):") def test_encoding(t, enc, stop_at=None): """ tests a "t" text decoding with enc and returns how many decode errors occured in the whole text """ c = 0 while True: try: t = t.decode(enc) break except LookupError:
'for', 'from', 'have', 'he', 'her', 'him', 'his', 'has', 'i', 'if', 'in', 'is', 'it', 'just', 'like', 'man', 'may', 'more', 'most', 'my', 'no', 'not', 'now', 'of', 'on', 'only', 'or', 'out', 'over', 'say', 'see', 'she', 'should', "shouldn't", 'so', 'than', 'that', 'the', 'then', 'there', 'they', 'this', 'to', 'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'wouldn', "won't", 'you' } DEFAULT_ENCODING = 'utf-8' SUPPORTED_ENCODINGS = list(sorted(set(aliases.values()))) # precompile all expressions EXPR_UPPERCASE = re.compile('^[A-Z]+$') EXPR_CAPITALS = re.compile('^[A-Z0-9]+$') EXPR_PHRASE = re.compile('[A-Za-z0-9]+') EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*') logger = logging.getLogger(__name__) class HTMLStripper(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.reset() self._raw = []
def print_encodings():
    """Print every known codec name, sorted, skipping the internal
    '*_codec' transform entries."""
    names = {c for c in set(aliases.values()) if not c.endswith("_codec")}
    for name in sorted(names):
        print(name)
#!/usr/bin/env python3
from sys import argv, stderr, stdout, exit
import argparse
from encodings.aliases import aliases
from codecs import encode as cencode
from pathlib import Path

# All unique codec names, minus ROT13 and the base64/base32/... codecs,
# which are transforms rather than character encodings.
# BUG FIX: the original popped 'base*' entries while iterating the same
# list, which skips the element following each removal; build the
# filtered list in a single pass instead.
ENCODINGS = [e for e in set(aliases.values())
             if e != 'rot_13' and not e.startswith('base')]


def encode(s, encoding='utf-16'):
    'UTF-16 encode the string and return each char URI encoded'
    try:
        return ''.join(['%{:0>2x}'.format(b) for b in cencode(s, encoding)])
    except TypeError as e:
        # BUG FIX: str(e) does not include the 'TypeError: ' prefix, so
        # the original comparison could never match and the retry branch
        # was dead code.
        if str(e) == "a bytes-like object is required, not 'str'":
            return ''.join(
                ['%{:0>2x}'.format(b) for b in cencode(bytes(s), encoding)])
    except LookupError as e:
        # Unknown codec name: signal with None rather than crashing.
        return None
    except Exception as e:
        print(f'[+] Failed encoding for: {encoding}', file=stderr)
        print(f'Error Message: {e}', file=stderr)
from encodings.aliases import aliases

# Every unique codec name Python knows about.
encoding_list = list(set(aliases.values()))


def main():
    """Brute-force recovery of a mojibake string: re-encode the input
    with every codec and decode with every other, printing each
    combination that survives the round trip."""
    src_text = input("需要恢复的字符串:")
    for src_enc in encoding_list:
        for dst_enc in encoding_list:
            try:
                guess_text = src_text.encode(encoding=src_enc).decode(
                    encoding=dst_enc)
            except Exception:
                # This encode/decode pair is invalid for the input; skip.
                continue
            print(f'{src_enc}->{dst_enc}:{guess_text}')


if __name__ == '__main__':
    main()
def check_if_encoding_exist(encoding):
    """Return True when *encoding* is a known codec alias or canonical
    codec name."""
    known = set(aliases) | set(aliases.values())
    return encoding in known
"Block", "Shapes", "Supplemental", "Tags", ] # type: List[str] RE_POSSIBLE_ENCODING_INDICATION = re_compile( r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)", IGNORECASE, ) IANA_SUPPORTED = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, list(set(aliases.values())), )) # type: List[str] IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int # pre-computed code page that are similar using the function cp_similarity. IANA_SUPPORTED_SIMILAR = { "cp037": ["cp1026", "cp1140", "cp273", "cp500"], "cp1026": ["cp037", "cp1140", "cp273", "cp500"], "cp1125": ["cp866"], "cp1140": ["cp037", "cp1026", "cp273", "cp500"], "cp1250": ["iso8859_2"], "cp1251": ["kz1048", "ptcp154"], "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"], "cp1253": ["iso8859_7"], "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
# -*- coding: utf-8 -*- from encodings.aliases import aliases import nkf all_encodings = set(aliases.values()) | set(aliases.keys()) def normalize_encoding(encoding): encoding = encoding.lower() if encoding in ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis', 'sjis'): return 'cp932' return encoding def decode(text, encoding=None, *args): if not encoding or encoding in ('ISO-8859-1', 'iso-8859-1'): encoding = nkf.guess(text) if encoding in ('BINARY', 'ISO-8859-1'): encoding = 'utf8' encoding = normalize_encoding(encoding) if not encoding in all_encodings: return nkf.nkf('-w', text).decode('utf8') return text.decode(encoding, *args)
# In[4]:

# TODO: Figure out what the encoding is of the myster.csv file
# HINT: pd.read_csv('mystery.csv', encoding=?) where ? is the string for an encoding like 'ascii'
# HINT: This link has a list of encodings that Python recognizes https://docs.python.org/3/library/codecs.html#standard-encodings

# Python has a file containing a dictionary of encoding names and associated aliases
# This line imports the dictionary and then creates a set of all available encodings
# You can use this set of encodings to search for the correct encoding

# If you'd like to see what this file looks like, execute the following Python code to see where the file is located
# from encodings import aliases
# aliases.__file__

from encodings.aliases import aliases

alias_values = set(aliases.values())

# TODO: iterate through the alias_values list trying out the different encodings to see which one or ones work
# HINT: Use a try - except statement. Otherwise your code will produce an error when reading in the csv file
# with the wrong encoding.
# HINT: In the try statement, print out the encoding name so that you know which one(s) worked.
for encoding in alias_values:
    try:
        pd.read_csv('mystery.csv', encoding=encoding)
        print("Successfully read the csv with encoding of ", encoding)
    except Exception:
        # Narrowed from a bare `except:` — Exception still covers decode
        # and codec-lookup failures without swallowing KeyboardInterrupt.
        print("Failed: Encoding of ", encoding)

# # Conclusion
#
CONSTANT_EVALS = {'true': True, 'false': False, 'null': None} COMMON_TERMS = { 'a', 'about', 'all', 'and', 'are', 'as', 'at', 'be', 'but', 'by' 'can', 'cannot', 'could', "couldn't", 'do', 'did', "didn't", 'for', 'from', 'have', 'he', 'her', 'him', 'his', 'has', 'i', 'if', 'in', 'is', 'it', 'just', 'like', 'man', 'may', 'more', 'most', 'my', 'no', 'not', 'now', 'of', 'on', 'only', 'or', 'out', 'over', 'say', 'see', 'she', 'should', "shouldn't", 'so', 'than', 'that', 'the', 'then', 'there', 'they', 'this', 'to', 'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'wouldn', "won't", 'you' } DEFAULT_ENCODING = 'utf-8' SUPPORTED_ENCODINGS = list(sorted(set(aliases.values()))) # precompile all expressions EXPR_UPPERCASE = re.compile('^[A-Z]+$') EXPR_CAPITALS = re.compile('^[A-Z0-9]+$') EXPR_PHRASE = re.compile('[A-Za-z0-9]+') EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*') logger = logging.getLogger(__name__) class HTMLStripper(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.reset() self._raw = []
import re from pathlib import Path from encodings.aliases import aliases # regex list re_p_content = re.compile(r"<P>.*?</P>", re.DOTALL) re_a_content = re.compile(r"<A.*?</A>", re.DOTALL) re_b_content = re.compile(r"<B>.*?</B>", re.DOTALL) re_tag = re.compile(r"</?.*?>", re.DOTALL) re_empty_lines = re.compile(r"\n\s*\n") re_parenth = re.compile(r"[{(].*?[})]") # encodings heb_encodings = ['utf-8', 'cp1255', 'iso8859_8', 'cp424', 'cp856', 'cp862'] other_encodings = set(aliases.values()) - set(heb_encodings) def read_heb_file(file_path): # try hebrew encodings file_content = read_file(file_path, heb_encodings) if not file_content: # try all other encodings file_content = read_file(file_path, other_encodings) return file_content def read_file(file_path, enc_list):