def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each one to its own line in
    file ``filepath`` — i.e. no enclosing top-level JSON structure (array, etc.).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    # line terminator must match the stream type implied by ``mode``
    if 't' in mode:
        newline = '\n'
    else:
        newline = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for obj in json_objects:
            serialized = json.dumps(
                obj, ensure_ascii=ensure_ascii,
                separators=separators, sort_keys=sort_keys)
            f.write(serialized + newline)
def detect_language(text):
    """
    Detect the most likely language of a text and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Uses the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    to take advantage of optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if the optional ``cld2-cffi`` dependency is not installed
    """
    # the cld2 binding is an optional dependency; probe for it up front
    try:
        cld2_detect
    except NameError:
        raise ImportError(
            '`cld2-cffi` must be installed to use textacy\'s automatic language detection; '
            'you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`.'
        )
    # cld2 wants bytes on py2 and unicode on py3
    input_text = unicode_to_bytes(text) if is_python2 else text
    is_reliable, _, best_guesses = cld2_detect(input_text, bestEffort=True)
    if is_reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            best_guesses)
    # best_guesses rows are (name, code, percent, score); return the 2-letter code
    return best_guesses[0][1]
def test_read_write_file_bytes(self):
    """Bytes content should round-trip through write_file/read_file for every compression ext."""
    content = unicode_to_bytes(self.text)
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_bytes' + extension)
        fileio.write_file(content, path, mode='wb', auto_make_dirs=True)
        round_tripped = fileio.read_file(path, mode='rb')
        self.assertEqual(round_tripped, content)
def test_read_write_file_lines_bytes(self):
    """Byte lines should round-trip through write_file_lines/read_file_lines for every ext."""
    lines = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_lines_bytes' + extension)
        fileio.write_file_lines(lines, path, mode='wb', auto_make_dirs=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in fileio.read_file_lines(path, mode='rb')]
        self.assertEqual(round_tripped, lines)
def coerce_content_type(content, file_mode):
    """
    If the `content` to be written to file and the `file_mode` used to open it
    are incompatible (either bytes with text mode or unicode with bytes mode),
    try to coerce the content type so it can be written.
    """
    wants_text = 't' in file_mode
    wants_bytes = 'b' in file_mode
    if wants_text and isinstance(content, compat.bytes_):
        return compat.bytes_to_unicode(content)
    if wants_bytes and isinstance(content, compat.unicode_):
        return compat.unicode_to_bytes(content)
    # already compatible with the requested mode; pass through untouched
    return content
def coerce_content_type(content, file_mode):
    """
    If the `content` to be written to file and the `file_mode` used to open it
    are incompatible (either bytes with text mode or unicode with bytes mode),
    try to coerce the content type so it can be written.
    """
    wants_text = 't' in file_mode
    wants_bytes = 'b' in file_mode
    if wants_text and isinstance(content, compat.bytes_type):
        return compat.bytes_to_unicode(content)
    if wants_bytes and isinstance(content, compat.unicode_type):
        return compat.unicode_to_bytes(content)
    # already compatible with the requested mode; pass through untouched
    return content
def test_read_write_text_bytes(tmpdir):
    """Bytes content should round-trip through write_text/read_text; .xz on py2 must raise."""
    data = compat.unicode_to_bytes(TEXT)
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        filename = str(tmpdir.join('test_read_write_file_bytes' + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == '.xz' and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(
                    filename, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(data, filename, mode='wb', make_dirs=True)
        assert next(io.read_text(filename, mode='rb')) == data
def write_file_lines(lines, filepath, mode='wt', encoding=None, auto_make_dirs=False):
    """
    Write the content in ``lines`` to disk at ``filepath``, line by line. Files
    with appropriate extensions are compressed with gzip or bz2 automatically.
    Any intermediate folders not found on disk may automatically be created.
    """
    # pick a line terminator that matches the stream type implied by ``mode``
    if 't' in mode:
        newline = '\n'
    else:
        newline = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.writelines(line + newline for line in lines)
def test_read_write_text_bytes(tmpdir):
    """Bytes content should round-trip through write_text/read_text; .xz on py2 must raise."""
    data = compat.unicode_to_bytes(TEXT)
    for ext in (".txt", ".gz", ".bz2", ".xz"):
        filename = str(tmpdir.join("test_read_write_file_bytes" + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == ".xz" and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(filename, mode="wb", encoding="utf-8", make_dirs=True)
            continue
        io.write_text(data, filename, mode="wb", make_dirs=True)
        assert next(io.read_text(filename, mode="rb")) == data
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Byte lines should round-trip through write_text/read_text in lines mode; .xz on py2 must raise."""
    sent_bytes = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        filename = str(tmpdir.join('test_read_write_file_lines_bytes' + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == '.xz' and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(
                    filename, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(sent_bytes, filename, mode='wb', make_dirs=True, lines=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in io.read_text(filename, mode='rb', lines=True)]
        assert round_tripped == sent_bytes
def test_read_write_file_bytes(self):
    """Bytes content should round-trip through write_file/read_file; .xz on py2 must raise."""
    content = unicode_to_bytes(self.text)
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_bytes' + extension)
        # lzma is unavailable on python 2, so opening .xz must fail there
        if PY2 is True and extension == '.xz':
            self.assertRaises(
                ValueError, fileio.open_sesame,
                path, 'wb', 'utf-8', True)
        else:
            fileio.write_file(content, path, mode='wb', auto_make_dirs=True)
            round_tripped = fileio.read_file(path, mode='rb')
            self.assertEqual(round_tripped, content)
def test_read_write_file_lines_bytes(self):
    """Byte lines should round-trip through write_file_lines/read_file_lines; .xz on py2 must raise."""
    lines = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_lines_bytes' + extension)
        # lzma is unavailable on python 2, so opening .xz must fail there
        if PY2 is True and extension == '.xz':
            self.assertRaises(
                ValueError, fileio.open_sesame,
                path, 'wb', 'utf-8', True)
        else:
            fileio.write_file_lines(lines, path, mode='wb', auto_make_dirs=True)
            # strip trailing newlines added by the writer before comparing
            round_tripped = [
                line.strip()
                for line in fileio.read_file_lines(path, mode='rb')]
            self.assertEqual(round_tripped, lines)
def detect_language(text):
    """
    Detect the most likely language of a text and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Uses the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    to take advantage of optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str
    """
    # cld2 wants bytes on py2 and unicode on py3
    input_text = unicode_to_bytes(text) if PY2 else text
    is_reliable, _, best_guesses = cld2_detect(input_text, bestEffort=True)
    if is_reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            best_guesses)
    # best_guesses rows are (name, code, percent, score); return the 2-letter code
    return best_guesses[0][1]
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Byte lines should round-trip through write_text/read_text in lines mode; .xz on py2 must raise."""
    sent_bytes = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for ext in (".txt", ".gz", ".bz2", ".xz"):
        filename = str(tmpdir.join("test_read_write_file_lines_bytes" + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == ".xz" and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(filename, mode="wb", encoding="utf-8", make_dirs=True)
            continue
        io.write_text(sent_bytes, filename, mode="wb", make_dirs=True, lines=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in io.read_text(filename, mode="rb", lines=True)
        ]
        assert round_tripped == sent_bytes
def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each one to its own line in
    file ``filepath`` — i.e. no enclosing top-level JSON structure (array, etc.).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    # serialization options are loop-invariant; bundle them once up front
    dump_kwargs = {
        'ensure_ascii': ensure_ascii,
        'separators': separators,
        'sort_keys': sort_keys,
    }
    # line terminator must match the stream type implied by ``mode``
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for obj in json_objects:
            f.write(json.dumps(obj, **dump_kwargs) + newline)