def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each one to its own line in
    file ``filepath`` — i.e. no enclosing top-level JSON structure (array, etc.).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    # line terminator must match the stream type implied by ``mode``
    if 't' in mode:
        newline = '\n'
    else:
        newline = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for obj in json_objects:
            serialized = json.dumps(
                obj, ensure_ascii=ensure_ascii,
                separators=separators, sort_keys=sort_keys)
            f.write(serialized + newline)
def detect_language(text):
    """
    Detect the most likely language of a text and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Uses the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    to take advantage of optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if the optional ``cld2-cffi`` dependency is not installed
    """
    # the cld2 binding is an optional dependency; probe for it up front
    try:
        cld2_detect
    except NameError:
        raise ImportError(
            '`cld2-cffi` must be installed to use textacy\'s automatic language detection; '
            'you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`.'
        )
    # cld2 wants bytes on py2 and unicode on py3
    input_text = unicode_to_bytes(text) if is_python2 else text
    is_reliable, _, best_guesses = cld2_detect(input_text, bestEffort=True)
    if is_reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            best_guesses)
    # best_guesses rows are (name, code, percent, score); return the 2-letter code
    return best_guesses[0][1]
def test_read_write_file_bytes(self):
    """Bytes content should round-trip through write_file/read_file for every compression ext."""
    content = unicode_to_bytes(self.text)
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_bytes' + extension)
        fileio.write_file(content, path, mode='wb', auto_make_dirs=True)
        round_tripped = fileio.read_file(path, mode='rb')
        self.assertEqual(round_tripped, content)
def test_read_write_file_lines_bytes(self):
    """Byte lines should round-trip through write_file_lines/read_file_lines for every ext."""
    lines = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_lines_bytes' + extension)
        fileio.write_file_lines(lines, path, mode='wb', auto_make_dirs=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in fileio.read_file_lines(path, mode='rb')]
        self.assertEqual(round_tripped, lines)
def coerce_content_type(content, file_mode):
    """
    If the `content` to be written to file and the `file_mode` used to open it
    are incompatible (either bytes with text mode or unicode with bytes mode),
    try to coerce the content type so it can be written.
    """
    wants_text = 't' in file_mode
    wants_bytes = 'b' in file_mode
    if wants_text and isinstance(content, compat.bytes_):
        return compat.bytes_to_unicode(content)
    if wants_bytes and isinstance(content, compat.unicode_):
        return compat.unicode_to_bytes(content)
    # already compatible with the requested mode; pass through untouched
    return content
def coerce_content_type(content, file_mode):
    """
    If the `content` to be written to file and the `file_mode` used to open it
    are incompatible (either bytes with text mode or unicode with bytes mode),
    try to coerce the content type so it can be written.
    """
    wants_text = 't' in file_mode
    wants_bytes = 'b' in file_mode
    if wants_text and isinstance(content, compat.bytes_type):
        return compat.bytes_to_unicode(content)
    if wants_bytes and isinstance(content, compat.unicode_type):
        return compat.unicode_to_bytes(content)
    # already compatible with the requested mode; pass through untouched
    return content
def test_read_write_text_bytes(tmpdir):
    """Bytes content should round-trip through write_text/read_text; .xz on py2 must raise."""
    data = compat.unicode_to_bytes(TEXT)
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        filename = str(tmpdir.join('test_read_write_file_bytes' + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == '.xz' and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(
                    filename, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(data, filename, mode='wb', make_dirs=True)
        assert next(io.read_text(filename, mode='rb')) == data
def write_file_lines(lines, filepath, mode='wt', encoding=None, auto_make_dirs=False):
    """
    Write the content in ``lines`` to disk at ``filepath``, line by line. Files
    with appropriate extensions are compressed with gzip or bz2 automatically.
    Any intermediate folders not found on disk may automatically be created.
    """
    # pick a line terminator that matches the stream type implied by ``mode``
    if 't' in mode:
        newline = '\n'
    else:
        newline = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.writelines(line + newline for line in lines)
def test_read_write_text_bytes(tmpdir):
    """Bytes content should round-trip through write_text/read_text; .xz on py2 must raise."""
    data = compat.unicode_to_bytes(TEXT)
    for ext in (".txt", ".gz", ".bz2", ".xz"):
        filename = str(tmpdir.join("test_read_write_file_bytes" + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == ".xz" and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(filename, mode="wb", encoding="utf-8", make_dirs=True)
            continue
        io.write_text(data, filename, mode="wb", make_dirs=True)
        assert next(io.read_text(filename, mode="rb")) == data
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Byte lines should round-trip through write_text/read_text in lines mode; .xz on py2 must raise."""
    sent_bytes = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        filename = str(tmpdir.join('test_read_write_file_lines_bytes' + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == '.xz' and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(
                    filename, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(sent_bytes, filename, mode='wb', make_dirs=True, lines=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in io.read_text(filename, mode='rb', lines=True)]
        assert round_tripped == sent_bytes
def test_read_write_file_bytes(self):
    """Bytes content should round-trip through write_file/read_file; .xz on py2 must raise."""
    content = unicode_to_bytes(self.text)
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_bytes' + extension)
        # lzma is unavailable on python 2, so opening .xz must fail there
        if PY2 is True and extension == '.xz':
            self.assertRaises(
                ValueError, fileio.open_sesame,
                path, 'wb', 'utf-8', True)
        else:
            fileio.write_file(content, path, mode='wb', auto_make_dirs=True)
            round_tripped = fileio.read_file(path, mode='rb')
            self.assertEqual(round_tripped, content)
def test_read_write_file_lines_bytes(self):
    """Byte lines should round-trip through write_file_lines/read_file_lines; .xz on py2 must raise."""
    lines = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
    for extension in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_lines_bytes' + extension)
        # lzma is unavailable on python 2, so opening .xz must fail there
        if PY2 is True and extension == '.xz':
            self.assertRaises(
                ValueError, fileio.open_sesame,
                path, 'wb', 'utf-8', True)
        else:
            fileio.write_file_lines(lines, path, mode='wb', auto_make_dirs=True)
            # strip trailing newlines added by the writer before comparing
            round_tripped = [
                line.strip()
                for line in fileio.read_file_lines(path, mode='rb')]
            self.assertEqual(round_tripped, lines)
def detect_language(text):
    """
    Detect the most likely language of a text and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Uses the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    to take advantage of optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str
    """
    # cld2 wants bytes on py2 and unicode on py3
    input_text = unicode_to_bytes(text) if PY2 else text
    is_reliable, _, best_guesses = cld2_detect(input_text, bestEffort=True)
    if is_reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            best_guesses)
    # best_guesses rows are (name, code, percent, score); return the 2-letter code
    return best_guesses[0][1]
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Byte lines should round-trip through write_text/read_text in lines mode; .xz on py2 must raise."""
    sent_bytes = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for ext in (".txt", ".gz", ".bz2", ".xz"):
        filename = str(tmpdir.join("test_read_write_file_lines_bytes" + ext))
        # lzma is unavailable on python 2, so opening .xz must fail there
        if ext == ".xz" and compat.is_python2 is True:
            with pytest.raises(ValueError):
                io.open_sesame(filename, mode="wb", encoding="utf-8", make_dirs=True)
            continue
        io.write_text(sent_bytes, filename, mode="wb", make_dirs=True, lines=True)
        # strip trailing newlines added by the writer before comparing
        round_tripped = [
            line.strip()
            for line in io.read_text(filename, mode="rb", lines=True)
        ]
        assert round_tripped == sent_bytes
def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each one to its own line in
    file ``filepath`` — i.e. no enclosing top-level JSON structure (array, etc.).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    # serialization options are loop-invariant; bundle them once up front
    dump_kwargs = {
        'ensure_ascii': ensure_ascii,
        'separators': separators,
        'sort_keys': sort_keys,
    }
    # line terminator must match the stream type implied by ``mode``
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for obj in json_objects:
            f.write(json.dumps(obj, **dump_kwargs) + newline)