def test_filename_in_exception(self):
    # When possible, include the file name in the exception.
    path = 'some_file_path'
    lines = (
        b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
    )

    class Bunk:
        def __init__(self, lines, path):
            self.name = path
            self._lines = lines
            self._index = 0

        def readline(self):
            if self._index == len(self._lines):
                raise StopIteration
            line = self._lines[self._index]
            self._index += 1
            return line

    with self.assertRaises(SyntaxError):
        ins = Bunk(lines, path)
        # Make sure lacking a name isn't an issue.
        del ins.name
        detect_encoding(ins.readline)
    with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
        ins = Bunk(lines, path)
        detect_encoding(ins.readline)

def read_py_file(filepath):
    if sys.version_info < (3,):
        return open(filepath, 'rU').read()
    else:
        # see https://docs.python.org/3/library/tokenize.html#tokenize.detect_encoding
        # first just see if the file is properly encoded
        try:
            with open(filepath, 'rb') as f:
                tokenize.detect_encoding(f.readline)
        except SyntaxError as err:
            # SyntaxError is raised:
            # (1) in badly authored files (non-utf-8 bytes in a comment line),
            # (2) when a coding cookie is specified but wrong,
            # (3) when no coding is specified and the default 'utf-8' fails
            #     to decode, and
            # (4) when the encoding in a PEP 263 declaration does not match
            #     the encoding detected from the BOM.
            raise CouldNotHandleEncoding(filepath, err)
        try:
            return tokenize.open(filepath).read()
        except UnicodeDecodeError as err:
            # UnicodeDecodeError is raised if, e.g., utf-8 is declared but
            # latin-1 bytes such as \xe9 appear
            # (see http://stackoverflow.com/a/5552623)
            raise CouldNotHandleEncoding(filepath, err)

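# Illustrative sketch (not part of the module above): the SyntaxError branch
# in read_py_file() can be exercised directly by handing detect_encoding() a
# buffer whose coding cookie names a codec Python does not know (case 2).
import io
import tokenize

bogus = io.BytesIO(b'# -*- coding: not-a-real-codec -*-\nprint("hi")\n')
try:
    tokenize.detect_encoding(bogus.readline)
except SyntaxError as err:
    print(err)  # "unknown encoding: not-a-real-codec"
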
def update_fileinfo(cls, fileinfo, document=None):
    import io
    import tokenize
    if not document:
        try:
            with open(fileinfo.fullpathname, 'rb') as buffer:
                encoding, lines = tokenize.detect_encoding(buffer.readline)
                fileinfo.encoding = encoding
        except IOError:
            pass
    else:
        s = document.gettext(0, 1024).encode('utf-8', errors='ignore')
        buffer = io.BytesIO(s)
        encoding, lines = tokenize.detect_encoding(buffer.readline)
        fileinfo.encoding = encoding

def patch(self, filename):
    self.current_file = filename

    with tokenize.open(filename) as fp:
        content = fp.read()

    old_content = content
    for operation in self.operations:
        content = operation.patch(content)

    if content == old_content:
        # no change
        self.check(content)
        if self.options.to_stdout:
            self.write_stdout(content)
        return False

    with open(filename, "rb") as fp:
        encoding, _ = tokenize.detect_encoding(fp.readline)

    if not self.options.quiet:
        print("Patch %s" % filename)

    if not self.options.to_stdout:
        with open(filename, "w", encoding=encoding) as fp:
            fp.write(content)
    else:
        self.write_stdout(content)

    self.check(content)
    return True

def open_source_file(filename):
    # pylint: disable=consider-using-with
    with open(filename, "rb") as byte_stream:
        encoding = detect_encoding(byte_stream.readline)[0]
    stream = open(filename, newline=None, encoding=encoding)
    data = stream.read()
    return stream, encoding, data

def insert_suppressions(
    source: bytes,
    comments: Iterable[SuppressionComment],
    *,
    code_width: int = DEFAULT_CODE_WIDTH,
    min_comment_width: int = DEFAULT_MIN_COMMENT_WIDTH,
) -> InsertSuppressionsResult:
    """
    Given `source` bytes, forms a new sequence of lines with `comments`
    inserted before the lines they target.
    """
    encoding = tokenize.detect_encoding(BytesIO(source).readline)[0]
    tokens = tuple(tokenize.tokenize(BytesIO(source).readline))
    indentations = _get_indentations(tokens)
    physical_to_logical = LineMappingInfo.compute(tokens=tokens).physical_to_logical
    comments_queue = deque(sorted(comments))  # sort by line number

    updated_lines = []
    for line_number, line_bytes in enumerate(BytesIO(source).readlines(), start=1):
        while comments_queue:
            target_line = physical_to_logical[comments_queue[0].before_line]
            if target_line == line_number:
                indent = indentations[line_number]
                width = max(code_width - len(indent), min_comment_width)
                for line in comments_queue.popleft().to_lines(width):
                    updated_lines.append(f"{indent}{line}\n".encode(encoding))
            else:
                break
        updated_lines.append(line_bytes)

    return InsertSuppressionsResult(
        updated_source=b"".join(updated_lines),
        failed_insertions=tuple(comments_queue),
    )

def roundtrip(filename, output=sys.stdout):
    with open(filename, "rb") as pyfile:
        encoding = tokenize.detect_encoding(pyfile.readline)[0]
    with open(filename, "r", encoding=encoding) as pyfile:
        source = pyfile.read()
    tree = compile(source, filename, "exec", ast.PyCF_ONLY_AST)
    Unparser(tree, output)

def get_source(self, fullname):
    """Concrete implementation of InspectLoader.get_source."""
    path = self.get_filename(fullname)
    try:
        source_bytes = self.get_data(path)
    except IOError:
        raise ImportError("source not available through get_data()")
    if py3k:
        import io, tokenize
        readsource = io.BytesIO(source_bytes).readline
        try:
            encoding = tokenize.detect_encoding(readsource)
        except SyntaxError as exc:
            raise ImportError("Failed to detect encoding")
        newline_decoder = io.IncrementalNewlineDecoder(None, True)
        try:
            return newline_decoder.decode(source_bytes.decode(encoding[0]))
        except UnicodeDecodeError as exc:
            raise ImportError("Failed to decode source file")
    else:
        return source_bytes  # XXX proper encoding

def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same
        as for bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in
        the first two lines, that line will be excluded from the output -
        compiling a unicode string with an encoding declaration is a
        SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    response = urllib.request.urlopen(url)
    buffer = io.BytesIO(response.read())
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return "".join(strip_encoding_cookie(text))
    else:
        return text.read()

def __init__(self, source: IO, modname: str, srcname: str,
             decoded: bool = False) -> None:
    self.modname = modname  # name of the module
    self.srcname = srcname  # name of the source file

    # cache the source code as well
    pos = source.tell()
    if not decoded:
        warnings.warn('decode option for ModuleAnalyzer is deprecated.',
                      RemovedInSphinx40Warning, stacklevel=2)
        self._encoding, _ = tokenize.detect_encoding(source.readline)
        source.seek(pos)
        self.code = source.read().decode(self._encoding)
    else:
        self._encoding = None
        self.code = source.read()

    # will be filled by parse()
    self.annotations = None  # type: Dict[Tuple[str, str], str]
    self.attr_docs = None  # type: Dict[Tuple[str, str], List[str]]
    self.finals = None  # type: List[str]
    self.overloads = None  # type: Dict[str, List[Signature]]
    self.tagorder = None  # type: Dict[str, int]
    self.tags = None  # type: Dict[str, Tuple[str, int, int]]

def test_cookie_second_line_empty_first_line(self):
    lines = (b'\n', b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n")
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso8859-15')
    expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
    self.assertEqual(consumed_lines, expected)

def test_matched_bom_and_cookie_second_line(self):
    lines = (b'\xef\xbb\xbf#! something\n', b'# coding=utf-8\n',
             b'print(something)\n', b'do_something(else)\n')
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8-sig')
    self.assertEqual(consumed_lines,
                     [b'#! something\n', b'# coding=utf-8\n'])

def test_cookie_second_line_no_bom(self):
    lines = (b'#! something\n', b'# vim: set fileencoding=ascii :\n',
             b'print(something)\n', b'do_something(else)\n')
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'ascii')
    expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
    self.assertEqual(consumed_lines, expected)

def test_bom_no_cookie(self):
    lines = (b'\xef\xbb\xbf# something\n', b'print(something)\n',
             b'do_something(else)\n')
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8-sig')
    self.assertEqual(consumed_lines,
                     [b'# something\n', b'print(something)\n'])

def check(file):
    """check(file_or_dir)

    If file_or_dir is a directory and not a symbolic link, then recursively
    descend the directory tree named by file_or_dir, checking all .py files
    along the way.  If file_or_dir is an ordinary Python source file, it is
    checked for whitespace related problems.  The diagnostic messages are
    written to standard output using the print statement.
    """
    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("%r: listing directory" % (file,))
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            if (os.path.isdir(fullname) and
                    not os.path.islink(fullname) or
                    os.path.normcase(name[-3:]) == ".py"):
                check(fullname)
        return

    with open(file, 'rb') as f:
        encoding, lines = tokenize.detect_encoding(f.readline)

    try:
        f = open(file, encoding=encoding)
    except IOError as msg:
        errprint("%r: I/O Error: %s" % (file, msg))
        return

    if verbose > 1:
        print("checking %r ..." % file)

    try:
        process_tokens(tokenize.generate_tokens(f.readline))
    except tokenize.TokenError as msg:
        errprint("%r: Token Error: %s" % (file, msg))
        return
    except IndentationError as msg:
        errprint("%r: Indentation Error: %s" % (file, msg))
        return
    except NannyNag as nag:
        badline = nag.get_lineno()
        line = nag.get_line()
        if verbose:
            print("%r: *** Line %d: trouble in tab city! ***" % (file, badline))
            print("offending line: %r" % (line,))
            print(nag.get_msg())
        else:
            if ' ' in file:
                file = '"' + file + '"'
            if filename_only:
                print(file)
            else:
                print(file, badline, repr(line))
        return

    if verbose:
        print("%r: Clean bill of health." % (file,))

def read_text_file(filename, encoding=None):
    """Read text file.

    Give back the contents, and the encoding we used.

    Unless specified manually, we have no way of knowing what text
    encoding this file may be in.  On Python 3 we can use tokenize to
    detect the encoding; on Python 2 we can use chardet.  Otherwise we
    fall back to PEP 263-style hints and finally to utf-8.
    """
    # Only if the encoding is not manually specified do we try to
    # detect it.
    if encoding is None and detect_encoding is not None:
        with open(filename, 'rb') as filehandler:
            encoding = detect_encoding(filehandler.readline)[0]

    with open(filename, 'rb') as filehandler:
        data = filehandler.read()

    if encoding is not None:
        return data.decode(encoding), encoding

    if HAVE_CHARDET:
        encoding_result = chardet.detect(data)
        if encoding_result and encoding_result['encoding'] is not None:
            encoding = encoding_result['encoding']
            return data.decode(encoding), encoding

    # Look for hints, PEP263-style
    if data[:3] == b'\xef\xbb\xbf':
        encoding = 'utf-8'
        return data.decode(encoding), encoding

    data_len = len(data)
    for canary in ENCODING_HINTS:
        if canary in data:
            pos = data.index(canary)
            # Use slicing rather than indexing so this works on both
            # Python 2 and Python 3, where indexing bytes yields an int.
            if pos > 1 and data[pos - 1:pos] not in (b' ', b'\n', b'\r'):
                continue
            pos += len(canary)
            coding = b''
            while pos < data_len and data[pos:pos + 1] not in (b' ', b'\n'):
                coding += data[pos:pos + 1]
                pos += 1
            encoding = coding.decode('ascii').strip()
            try:
                return data.decode(encoding), encoding
            except (LookupError, UnicodeError):
                # Try the next one
                pass

    # Fall back to utf-8
    encoding = 'utf-8'
    return data.decode(encoding), encoding

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.
    """
    if isinstance(txt, unicode):
        return txt
    if isinstance(txt, bytes):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        encoding = "ascii"
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    else:
        return text.read()

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.
    """
    if isinstance(txt, unicode_type):
        return txt
    if isinstance(txt, bytes):
        buf = BytesIO(txt)
    else:
        buf = txt
    try:
        encoding, _ = detect_encoding(buf.readline)
    except SyntaxError:
        encoding = "ascii"
    buf.seek(0)
    text = TextIOWrapper(buf, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    else:
        return text.read()

def execute(self):
    # Try to detect the encoding for you.
    with open(self.script, 'rb') as file:
        try:
            encoding = tokenize.detect_encoding(file.readline)[0]
        except SyntaxError:
            encoding = "utf-8"

    # Set the global values for the module.
    global_values = {
        '__file__': self.script,  # Use actual filename of the script.
        '__name__': '__main__',   # Make sure the 'if __name__ == "__main__"' hook works.
    }

    with open(self.script, 'r', encoding=encoding) as file:
        # Do not inherit any 'from __future__ import ...' statements
        # that may be used by AnimaFX.
        # Additionally set the current filename.
        module = compile(file.read(), self.script, 'exec', False)

    try:
        exec(module, global_values)

    # Reraise any occurring exceptions
    except (SystemExit, KeyboardInterrupt) as e:
        raise e

    # Print the exception
    except BaseException as e:
        traceback.print_exception(e.__class__, e, e.__traceback__)
        return False

    return True

def encode(self, chars):
    if isinstance(chars, bytes):
        # This is either plain ASCII, or Tk was returning mixed-encoding
        # text to us. Don't try to guess further.
        return chars

    # Preserve a BOM that might have been present on opening
    if self.fileencoding == 'utf-8-sig':
        return chars.encode('utf-8-sig')

    # See whether there is anything non-ASCII in it.
    # If not, no need to figure out the encoding.
    try:
        return chars.encode('ascii')
    except UnicodeEncodeError:
        pass

    # Check if there is an encoding declared
    try:
        encoded = chars.encode('ascii', 'replace')
        enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
        return chars.encode(enc)
    except SyntaxError as err:
        failed = str(err)
    except UnicodeEncodeError:
        failed = "Invalid encoding '%s'" % enc

    messagebox.showerror(
        "I/O Error",
        "%s.\nSaving as UTF-8" % failed,
        parent=self.text)

    # Fallback: save as UTF-8, with BOM - ignoring the incorrect
    # declared encoding
    return chars.encode('utf-8-sig')

def _readSourceCodeFromFilename3(source_filename):
    import tokenize

    try:
        with open(source_filename, "rb") as source_file:
            encoding = tokenize.detect_encoding(source_file.readline)[0]  # @UndefinedVariable

            # Rewind to get the whole file.
            source_file.seek(0)
            source_code = source_file.read()

        return source_code.decode(encoding)
    except SyntaxError as e:
        if Options.isFullCompat():
            if PythonVersions.doShowUnknownEncodingName():
                match = re.match("unknown encoding for '.*?': (.*)", e.args[0])
                complaint = match.group(1)
            else:
                complaint = "with BOM"

            e.args = (
                "encoding problem: %s" % complaint,
                (source_filename, 1, None, None)
            )

            if hasattr(e, "msg"):
                e.msg = e.args[0]

        raise

def _LoadModule(self, name, fp, path, info, deferredImports, parent=None,
                namespace=False):
    """Load the module, given the information acquired by the finder."""
    suffix, mode, type = info
    if type == imp.PKG_DIRECTORY:
        return self._LoadPackage(name, path, parent, deferredImports,
                                 namespace)
    module = self._AddModule(name, file_name=path, parent=parent)

    if type == imp.PY_SOURCE:
        logging.debug("Adding module [%s] [PY_SOURCE]", name)
        # Load & compile Python source code
        fp = open(path, "rb")
        encoding, lines = tokenize.detect_encoding(fp.readline)
        fp = open(path, "U", encoding=encoding)
        codeString = fp.read()
        if codeString and codeString[-1] != "\n":
            codeString = codeString + "\n"
        try:
            module.code = compile(codeString, path, "exec")
        except SyntaxError:
            raise ImportError("Invalid syntax in %s" % path)
    elif type == imp.PY_COMPILED:
        logging.debug("Adding module [%s] [PY_COMPILED]", name)
        # Load Python bytecode
        if isinstance(fp, bytes):
            magic = fp[:4]
        else:
            magic = fp.read(4)
        if magic != imp.get_magic():
            raise ImportError("Bad magic number in %s" % path)
        skip_bytes = 8
        if isinstance(fp, bytes):
            module.code = marshal.loads(fp[skip_bytes + 4:])
            module.source_is_zip_file = True
        else:
            fp.read(skip_bytes)
            module.code = marshal.load(fp)
    elif type == imp.C_EXTENSION:
        logging.debug("Adding module [%s] [C_EXTENSION]", name)

    # If there's a custom hook for this module, run it.
    self._RunHook("load", module.name, module)

    if module.code is not None:
        if self.replace_paths:
            topLevelModule = module
            while topLevelModule.parent is not None:
                topLevelModule = topLevelModule.parent
            module.code = self._ReplacePathsInCode(topLevelModule,
                                                   module.code)

        # Scan the module code for import statements
        self._ScanCode(module.code, module, deferredImports)

    module.in_import = False
    return module

def _read_file(filename):
    # read the file contents, obeying the python encoding marker
    with open(filename, 'rb') as fp:
        encoding, _ = tokenize.detect_encoding(fp.readline)
    with open(filename, 'rt', encoding=encoding) as fp:
        content = fp.read()
    content += '\n\n'
    return content

def read_source_code(filename):
    with open(filename, 'rb') as source_file:
        encoding, first_lines = tokenize.detect_encoding(source_file.readline)
        source_bytes = b''.join(first_lines) + source_file.read()

    newline_decoder = io.IncrementalNewlineDecoder(None, translate=True)
    source_code = newline_decoder.decode(source_bytes.decode(encoding))
    return source_code.splitlines(True)

def read_pyfile(filename):
    """Read and return the contents of a Python source file (as a string),
    taking into account the file encoding."""
    with open(filename, "rb") as pyfile:
        encoding = tokenize.detect_encoding(pyfile.readline)[0]
    with open(filename, "r", encoding=encoding) as pyfile:
        source = pyfile.read()
    return source

def _stdin_get_value_py3():
    # type: () -> str
    stdin_value = sys.stdin.buffer.read()
    fd = io.BytesIO(stdin_value)
    try:
        coding, _ = tokenize.detect_encoding(fd.readline)
        return stdin_value.decode(coding)
    except (LookupError, SyntaxError, UnicodeError):
        return stdin_value.decode("utf-8")

def _stdin_get_value_py3():
    stdin_value = sys.stdin.buffer.read()
    fd = io.BytesIO(stdin_value)
    try:
        (coding, lines) = tokenize.detect_encoding(fd.readline)
        return io.StringIO(stdin_value.decode(coding))
    except (LookupError, SyntaxError, UnicodeError):
        return io.StringIO(stdin_value.decode("utf-8"))

def test_cookie_second_line_noncommented_first_line(self):
    lines = (b"print('\xc2\xa3')\n",
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n")
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    expected = [b"print('\xc2\xa3')\n"]
    self.assertEqual(consumed_lines, expected)

def _LoadModule(self, name, fp, path, info, deferredImports, parent=None):
    """Load the module, given the information acquired by the finder."""
    suffix, mode, type = info
    if type == imp.PKG_DIRECTORY:
        return self._LoadPackage(name, path, parent, deferredImports)
    module = self._AddModule(name, file_name=path, parent=parent)

    if type == imp.PY_SOURCE:
        logging.debug("Adding module [%s] [PY_SOURCE]", name)
        # Load & compile Python source code: if the file is already open,
        # it was opened with the correct encoding; otherwise detect it
        # manually.
        if not fp:
            with open(path, "rb") as f:
                encoding = tokenize.detect_encoding(f.readline)[0]
            fp = open(path, "r", encoding=encoding)
        codeString = fp.read()
        if codeString and codeString[-1] != "\n":
            codeString = codeString + "\n"
        try:
            module.code = compile(codeString, path, "exec",
                                  optimize=self.optimizeFlag)
        except SyntaxError:
            raise ImportError("Invalid syntax in %s" % path)
    elif type == imp.PY_COMPILED:
        logging.debug("Adding module [%s] [PY_COMPILED]", name)
        # Load Python bytecode
        if isinstance(fp, bytes):
            fp = io.BytesIO(fp)
            module.source_is_zip_file = True
        module.code = pkgutil.read_code(fp)
        if module.code is None:
            raise ImportError("Bad magic number in %s" % path)
    elif type == imp.C_EXTENSION:
        logging.debug("Adding module [%s] [C_EXTENSION]", name)

    # If there's a custom hook for this module, run it.
    self._RunHook("load", module.name, module)

    if module.code is not None:
        if self.replace_paths:
            topLevelModule = module
            while topLevelModule.parent is not None:
                topLevelModule = topLevelModule.parent
            module.code = self._ReplacePathsInCode(topLevelModule,
                                                   module.code)

        # Scan the module code for import statements
        self._ScanCode(module.code, module, deferredImports)

        # Verify __package__ in use
        self._ReplacePackageInCode(module)

    module.in_import = False
    return module

def check(file, depth):
    if depth > 1 and os.path.isfile(os.path.join(file, ".git")):
        return
    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("listing directory", file)
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            if ((recurse and os.path.isdir(fullname) and
                    not os.path.islink(fullname) and
                    not os.path.split(fullname)[1].startswith(".")) or
                    name.lower().endswith(".py")):
                check(fullname, depth + 1)
        return

    if verbose:
        print("checking", file, "...", end=' ')
    with open(file, 'rb') as f:
        try:
            encoding, _ = tokenize.detect_encoding(f.readline)
        except SyntaxError as se:
            errprint("%s: SyntaxError: %s" % (file, str(se)))
            return
    try:
        print(file)
        with open(file, encoding=encoding) as f:
            r = Reindenter(f)
    except IOError as msg:
        errprint("%s: I/O Error: %s" % (file, str(msg)))
        return

    newline = spec_newline if spec_newline else r.newlines
    if isinstance(newline, tuple):
        errprint("%s: mixed newlines detected; cannot continue without "
                 "--newline" % file)
        return

    if r.run():
        if verbose:
            print("changed.")
            if dryrun:
                print("But this is a dry run, so leaving it alone.")
        if not dryrun:
            bak = file + ".bak"
            if makebackup:
                shutil.copyfile(file, bak)
                if verbose:
                    print("backed up", file, "to", bak)
            with open(file, "w", encoding=encoding, newline=newline) as f:
                r.write(f)
            if verbose:
                print("wrote new", file)
        return True
    else:
        if verbose:
            print("unchanged.")
        return False

def test_cookie_first_line_no_bom(self):
    lines = (
        b'# -*- coding: latin-1 -*-\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso-8859-1')
    self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

def test_no_bom_no_encoding_cookie(self):
    lines = (
        b'# something\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, list(lines[:2]))

def _detect_encoding(source: bytes) -> str:
    """Return the name of the encoding of `source`, as detected from its
    BOM or PEP 263 coding cookie (defaulting to 'utf-8')."""
    return tokenize.detect_encoding(io.BytesIO(source).readline)[0]

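# Usage sketch for _detect_encoding() (illustrative, not from the original
# module): tokenize normalizes cookie spellings, so "latin-1" is reported
# as "iso-8859-1", and sources without a BOM or cookie fall back to "utf-8".
print(_detect_encoding(b'# -*- coding: latin-1 -*-\nx = 1\n'))  # iso-8859-1
print(_detect_encoding(b'x = 1\n'))                             # utf-8
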
def _get_complexity(src_code):
    to_count = 'print'
    encoding = detect_encoding(
        (l.encode() for l in src_code.split(os_linesep, 1)).__next__)[0]
    with BytesIO(src_code.encode(encoding, 'ignore')) as src_stream:
        return dict(
            Counter(t[1] for t in tokenize(src_stream.readline)
                    if t[0] is t_name and t[1] == to_count))

def test_matched_bom_and_cookie_first_line(self):
    lines = (
        b'\xef\xbb\xbf# coding=utf-8\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8-sig')
    self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

def load_settings(filename, settings):
    encoding = 'utf-8'
    with open(filename, 'rb') as fp:
        try:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        except SyntaxError:
            pass

    with open(filename, 'r', encoding=encoding) as fp:
        exec(compile(fp.read(), filename, 'exec'), settings, settings)

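# Hypothetical usage of load_settings(): the settings file is ordinary
# Python, executed with the `settings` dict serving as both globals and
# locals. The file name and key below are made up for illustration.
settings = {}
load_settings('app_settings.py', settings)  # assumes such a file exists
print(settings.get('DEBUG'))
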
def open_source_file(filename):
    byte_stream = open(filename, 'rb')
    encoding = detect_encoding(byte_stream.readline)[0]
    stream = open(filename, 'U', encoding=encoding)
    try:
        data = stream.read()
    except UnicodeError:  # wrong encoding
        # detect_encoding returns utf-8 if no encoding specified
        msg = 'Wrong (%s) or no encoding specified' % encoding
        raise ASTNGBuildingException(msg)
    return stream, encoding, data

def _read(filename):
    try:
        with open(filename, 'rb') as f:
            (encoding, _) = tokenize.detect_encoding(f.readline)
    except (LookupError, SyntaxError, UnicodeError):
        # Fall back if file encoding is improperly declared
        with open(filename, encoding='latin-1') as f:
            return f.readlines()
    with open(filename, 'r', encoding=encoding) as f:
        return f.readlines()

def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.
    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]

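# `iternext` is a helper from the surrounding compatibility module; a
# minimal sketch of what it must provide, assuming it simply turns a
# sequence into the no-argument "next line" callable detect_encoding()
# expects (detect_encoding() treats StopIteration as end of input):
def iternext(sequence):
    return iter(sequence).__next__
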
def test_short_files(self):
    readline = self.get_readline((b'print(something)\n',))
    encoding, consumed_lines = detect_encoding(readline)
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, [b'print(something)\n'])

    encoding, consumed_lines = detect_encoding(self.get_readline(()))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, [])

    readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
    encoding, consumed_lines = detect_encoding(readline)
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, [b'print(something)\n'])

    readline = self.get_readline((b'\xef\xbb\xbf',))
    encoding, consumed_lines = detect_encoding(readline)
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, [])

def tokopen(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buf = io.open(filename, "rb")  # Tweaked to use io.open for Python 2
    encoding, lines = detect_encoding(buf.readline)
    buf.seek(0)
    text = io.TextIOWrapper(buf, encoding, line_buffering=True)
    text.mode = "r"
    return text

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text

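# Demonstration (not part of the original module): write a Latin-1 file
# with a PEP 263 cookie, then read it back through the open() defined above.
import os
import tempfile

with tempfile.NamedTemporaryFile('wb', suffix='.py', delete=False) as tmp:
    tmp.write(b'# -*- coding: latin-1 -*-\ns = "caf\xe9"\n')
text = open(tmp.name)  # the custom open() above, not the builtin
try:
    print(text.read())  # the \xe9 byte decodes as 'é'
finally:
    text.close()
    os.remove(tmp.name)
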
def load_setup():
    """Run the setup script (i.e. the setup.py file).

    This function loads the setup file in all cases (even if it has
    already been loaded before), because we are monkey-patching its
    setup function with a particular one.
    """
    with open("setup.py", "rb") as f:
        encoding, lines = detect_encoding(f.readline)
    with open("setup.py", encoding=encoding) as f:
        imp.load_module("setup", f, "setup.py", (".py", "r", imp.PY_SOURCE))

def open_source_file(filename):
    byte_stream = open(filename, 'rb')
    encoding = detect_encoding(byte_stream.readline)[0]
    stream = open(filename, 'U', encoding=encoding)
    try:
        data = stream.read()
    except UnicodeError:  # wrong encoding
        # detect_encoding returns utf-8 if no encoding specified
        msg = 'Wrong (%s) or no encoding specified' % encoding
        raise AstroidBuildingException(msg)
    return stream, encoding, data

def test_cookie_second_line_noncommented_first_line(self):
    lines = (
        b"print('\xc2\xa3')\n",
        b'# vim: set fileencoding=iso8859-15 :\n',
        b"print('\xe2\x82\xac')\n"
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    expected = [b"print('\xc2\xa3')\n"]
    self.assertEqual(consumed_lines, expected)

def test_utf8_normalization(self):
    # See get_normal_name() in tokenizer.c.
    encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
    for encoding in encodings:
        for rep in ("-", "_"):
            enc = encoding.replace("-", rep)
            lines = (b"#!/usr/bin/python\n",
                     b"# coding: " + enc.encode("ascii") + b"\n",
                     b"1 + 3\n")
            rl = self.get_readline(lines)
            found, consumed_lines = detect_encoding(rl)
            self.assertEqual(found, "utf-8")

def test_cookie_second_line_empty_first_line(self):
    lines = (
        b'\n',
        b'# vim: set fileencoding=iso8859-15 :\n',
        b"print('\xe2\x82\xac')\n"
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso8859-15')
    expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
    self.assertEqual(consumed_lines, expected)

def test_bom_no_cookie(self):
    lines = (
        b'\xef\xbb\xbf# something\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines,
                     [b'# something\n', b'print(something)\n'])

def stdin_get_value() -> str:
    """Get and cache it so plugins can use it."""
    stdin_value = sys.stdin.buffer.read()
    fd = io.BytesIO(stdin_value)
    try:
        coding, _ = tokenize.detect_encoding(fd.readline)
        fd.seek(0)
        return io.TextIOWrapper(fd, coding).read()
    except (LookupError, SyntaxError, UnicodeError):
        return stdin_value.decode("utf-8")

def _LoadModule(self, name, fp, path, info, deferredImports, parent=None,
                namespace=False):
    """Load the module, given the information acquired by the finder."""
    suffix, mode, type = info
    if type == imp.PKG_DIRECTORY:
        return self._LoadPackage(name, path, parent, deferredImports,
                                 namespace)
    module = self._AddModule(name)
    module.file = path
    module.parent = parent

    if type == imp.PY_SOURCE:
        # Load & compile Python source code
        if sys.version_info[0] >= 3:
            # For Python 3, read the file with the correct encoding
            import tokenize
            fp = open(path, "rb")
            encoding, lines = tokenize.detect_encoding(fp.readline)
            fp = open(path, "U", encoding=encoding)
        codeString = fp.read()
        if codeString and codeString[-1] != "\n":
            codeString = codeString + "\n"
        module.code = compile(codeString, path, "exec")
    elif type == imp.PY_COMPILED:
        # Load Python bytecode
        if isinstance(fp, str):
            magic = fp[:4]
        else:
            magic = fp.read(4)
        if magic != imp.get_magic():
            raise ImportError("Bad magic number in %s" % path)
        if isinstance(fp, str):
            module.code = marshal.loads(fp[8:])
            module.inZipFile = True
        else:
            fp.read(4)
            module.code = marshal.load(fp)

    # If there's a custom hook for this module, run it.
    self._RunHook("load", module.name, module)

    if module.code is not None:
        if self.replacePaths:
            topLevelModule = module
            while topLevelModule.parent is not None:
                topLevelModule = topLevelModule.parent
            module.code = self._ReplacePathsInCode(topLevelModule,
                                                   module.code)

        # Scan the module code for import statements
        self._ScanCode(module.code, module, deferredImports)

    module.inImport = False
    return module

def decode_source(source_bytes):  # copied from _bootstrap_external.py
    """Decode bytes representing source code and return the string.

    Universal newline support is used in the decoding.
    """
    import _io
    import tokenize  # To avoid bootstrap issues.
    source_bytes_readline = _io.BytesIO(source_bytes).readline
    encoding = tokenize.detect_encoding(source_bytes_readline)
    newline_decoder = _io.IncrementalNewlineDecoder(None, True)
    return newline_decoder.decode(source_bytes.decode(encoding[0]))

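# Example (illustrative): the coding cookie selects the codec, and the
# IncrementalNewlineDecoder folds '\r\n' into '\n'.
src = b'# coding: latin-1\r\nname = "caf\xe9"\r\n'
print(decode_source(src))  # '# coding: latin-1\nname = "café"\n'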