Python get_text_lines 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: textcode.sfdb

메소드/함수: get_text_lines

hotexamples.com에서의 예제들: 2

Python get_text_lines - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python textcode.sfdb.get_text_lines의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: analysis.py 프로젝트: akugarg/scancode-toolkit

def numbered_text_lines(
    location,
    demarkup=False,
    plain_text=False,
    start_line=1,
):
    """
    Yield tuples of (line number, text line) from the file at `location`. Return
    an empty iterator if no text content is extractible. Text extraction is
    based on detected file type. Long lines are broken down in chunks, therefore
    two items can have the same line number.

    Line numbers start at ``start_line`` which is 1-based by default.

    If `demarkup` is True, attempt to detect if a file contains HTML/XML-like
    markup and cleanup this markup.

    If `plain_text` is True treat the file as a plain text file and do not
    attempt to detect its type and extract its content with special procedures.
    This is used mostly when loading license texts and rules.

    Note: For testing or building from strings, `location` can be a list of
    unicode line strings instead of a path string.

    Note: the markup and JS map branches return a fully built list rather than
    a lazy iterator, so that an extraction failure is detected here and the
    function can fall back to the next strategy.
    """
    if not location:
        return iter([])

    if not isinstance(location, str):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        if TRACE:
            logger_debug('numbered_text_lines:', 'location is not a file')
        return enumerate(iter(location), start_line)

    if plain_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'plain_text')
        return enumerate(unicode_text_lines(location), start_line)

    T = typecode.get_type(location)

    if TRACE:
        logger_debug('numbered_text_lines: T.filetype_file:', T.filetype_file)
        logger_debug('numbered_text_lines: T.is_text_with_long_lines:',
                     T.is_text_with_long_lines)
        logger_debug('numbered_text_lines: T.is_binary:', T.is_binary)

    # TODO: we should have a command line to force digging inside binaries
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, pdf office doc, text or binary?
    if T.is_pdf and T.is_pdf_with_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_pdf')
        return enumerate(unicode_text_lines_from_pdf(location), start_line)

    if T.filetype_file.startswith('Spline Font Database'):
        if TRACE:
            logger_debug('numbered_text_lines:', 'Spline Font Database')
        return enumerate(
            (as_unicode(l) for l in sfdb.get_text_lines(location)),
            start_line,
        )

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            # materialize eagerly so that any failure surfaces inside the try
            lines = list(enumerate(markup.demarkup(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup')
            return lines
        except Exception:
            # Best effort: try again below as plain text or binary. Only
            # regular exceptions are caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup failed')

    if T.is_js_map:
        try:
            # materialize eagerly so that any failure surfaces inside the try
            lines = list(enumerate(js_map_sources_lines(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map')
            return lines
        except Exception:
            # Best effort: try again below as plain text or binary. Only
            # regular exceptions are caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map failed')

    if T.is_text:
        numbered_lines = enumerate(unicode_text_lines(location), start_line)
        # text with very long lines such minified JS, JS map files or large JSON
        if (not location.endswith('package.json')
                and (T.is_text_with_long_lines or T.is_compact_js
                     or T.filetype_file == 'data' or 'locale' in location)):

            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines:',
                             'break_numbered_unicode_text_lines')
        return numbered_lines

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this for
    # binaries only. This may conflict with "strings" extraction as currently
    # implemented.
    if T.is_binary:
        # fall back to binary
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_binary')

        return enumerate(unicode_text_lines_from_binary(location), start_line)

    return iter([])

예제 #2

파일 보기

파일: test_sfdb.py 프로젝트: xavierfigueroav/scancode-toolkit

 def check_get_text_lines(self, test_file, expected_file):
     """
     Assert that the lines returned by ``sfdb.get_text_lines`` for
     `test_file` equal the lines of `expected_file`.

     Both arguments are resolved through ``self.get_test_loc`` first. The
     expected file is read in binary mode and split with line endings kept.
     """
     test_file = self.get_test_loc(test_file)
     expected_file = self.get_test_loc(expected_file)
     # use a context manager so the file handle is closed deterministically
     with open(expected_file, 'rb') as expected_data:
         expected = expected_data.read().splitlines(True)
     assert expected == list(sfdb.get_text_lines(test_file))