示例#1
0
def splitby_example():
    """Demonstrate ``textmining.splitby`` on two sample texts.

    Part 1 chunks a short text with the default split function, which is
    described by the package as grouping lines into paragraphs (a boundary
    lies between a non-blank line and a blank line).

    Part 2 chunks a messier text with a custom boundary function and prints
    each document with blank lines removed and whitespace stripped.
    """
    # ---- Part 1: default (paragraph) boundary ----
    paragraph_text = """
    Hello there
    how are you today?
    I hope you
    are doing well.
    Thanks for using the textmining module!
    """
    print('\nsplitby_example 1\n')
    for group in textmining.splitby(paragraph_text.splitlines()):
        # Each group is a list of the raw (unstripped) lines.
        print(group)

    # ---- Part 2: custom boundary for a header-delimited layout ----
    # Three documents are embedded in the string below; each one begins
    # with a header line starting with the word "Document".
    messy_text = """
    Document One:
    -------------
    First line of Document One
    Second line of Document One
    Third line of Document One
    Document Two:
    -------------
    First line of Document Two
    Second line of Document Two
    Document Three:
    ---------------
    First line of Document Three
    """

    def is_header(line):
        # A header is any line whose first non-whitespace text is "Document".
        return line.strip().startswith('Document')

    def starts_new_document(line1, line2):
        # Only the upcoming line decides the boundary; the previous line
        # (line1) is ignored.
        return is_header(line2)

    print('\nsplitby_example 2\n')
    for chunk in textmining.splitby(messy_text.splitlines(),
                                    starts_new_document):
        # Only chunks that open with a header are real documents; anything
        # else (e.g. leading blank lines) is dropped.
        if is_header(chunk[0]):
            stripped = (raw.strip() for raw in chunk)
            # Keep the non-empty stripped lines and print them as one block,
            # followed by an empty separator line.
            print('\n'.join(piece for piece in stripped if piece))
            print()
示例#2
0
def get_docs_from_reuters_file(corpus):
    """
    Split the news items of one Reuters SGM file into separate documents.

    :param corpus: the content of the Reuters file; must provide
        ``splitlines()`` (presumably the file's text as one string)
    :return: list of documents, one per news item, each a newline-joined
        string of the item's non-blank, whitespace-stripped lines
    """
    # Every news item in a Reuters SGM file opens with a "<REUTERS ...>" tag.
    # The previous marker was the empty string, which every line "starts
    # with", so each single line was treated as its own document.
    doc_start = '<REUTERS'

    # Split function for this document structure: a new document begins
    # whenever the upcoming line opens a news item.
    def document_boundary(line1, line2):
        return line2.strip().startswith(doc_start)

    docs = []
    # Loop over documents
    lines = corpus.splitlines()
    for document in tm.splitby(lines, document_boundary):
        # Skip empty groups and anything that does not open with the item
        # tag (e.g. a preamble before the first news item).
        if not document or not document[0].strip().startswith(doc_start):
            continue
        # document is a list of lines. Remove blank lines and strip out
        # whitespace to create a clean document.
        clean_lines = [line.strip() for line in document if line.strip()]
        docs.append('\n'.join(clean_lines))
    return docs
示例#3
0
def splitby_example():
    """Demonstrate ``textmining.splitby`` with default and custom boundaries.

    Prints the groups produced by the default split function for a short
    text, then prints three cleaned-up documents extracted from a messier
    text using a custom boundary function.
    """
    # The splitby function in the textmining package is very useful. It
    # allows for flexible chunking of a long text document into smaller
    # groups of lines according to a user-defined split function.

    # First let's use the default split function which splits a sequence
    # of lines into groups corresponding to paragraphs. The function
    # defines a paragraph boundary to lie between a non-blank line and
    # blank line.
    text = """

    Hello there
    how are you today?

    I hope you
    are doing well.

    Thanks for using the textmining module!


    """
    lines = text.splitlines()
    # print is written in call form with a single argument so that the
    # example produces the same output on Python 2 and Python 3.
    print('\nsplitby_example 1\n')
    for paragraph in textmining.splitby(lines):
        # paragraph is a list of lines.
        # Notice that the last paragraph will just contain
        # lines of spaces as there is no text in it.
        print(paragraph)

    # Now let's use a custom split function to process a more complicated
    # document structure. We want to extract three cleaned-up documents
    # from the following messy text string (this is a common preprocessing
    # task in text mining!)
    text = """

    Document One:
    -------------

    First line of Document One
    Second line of Document One

    Third line of Document One

    Document Two:
    -------------

    First line of Document Two

    Second line of Document Two
    Document Three:
    ---------------
    First line of Document Three


    """

    # Define new split function for this special document structure.
    # Only the upcoming line (line2) decides whether a boundary occurs.
    def document_boundary(line1, line2):
        return line2.strip().startswith('Document')

    # Loop over documents
    lines = text.splitlines()
    print('\nsplitby_example 2\n')
    for document in textmining.splitby(lines, document_boundary):
        # Skip if first line (document[0]) doesn't match document structure
        if not document[0].strip().startswith('Document'):
            continue
        # document is a list of lines. Remove blank lines and strip out
        # whitespace to create a clean document.
        clean_lines = [line.strip() for line in document if line.strip()]
        # Print out clean document
        print('\n'.join(clean_lines))
        # Blank separator line; print('') rather than print() because a bare
        # print() would output "()" under Python 2.
        print('')
示例#4
0
def splitby_example():
    """Show how ``textmining.splitby`` chunks text into groups of lines.

    The first part prints the groups yielded by the default split function;
    the second part uses a custom boundary function and prints each
    extracted document with blank lines dropped and whitespace stripped.
    """
    # The splitby function in the textmining package is very useful.
    # It allows a long text to be flexibly split into shorter texts or
    # entries according to a user-defined split function.

    # First let's use the default split function which splits a sequence
    # of lines into groups corresponding to paragraphs. The function
    # defines a paragraph boundary to lie between a non-blank line and
    # blank line.
    # Use the default split function.
    text = """

    Hello there
    how are you today?

    I hope you
    are doing well.

    Thanks for using the textmining module!


    """
    lines = text.splitlines()
    # Single-argument call form of print behaves identically on Python 2
    # and Python 3.
    print('\nsplitby_example 1\n')
    for paragraph in textmining.splitby(lines):
        # paragraph is a list of lines.
        # Notice that the last paragraph will just contain
        # lines of spaces as there is no text in it.
        print(paragraph)

    # Use a custom split function to process more complicated text,
    # removing meaningless whitespace to produce cleaner entries.
    text = """

    Document One:
    -------------

    First line of Document One
    Second line of Document One

    Third line of Document One

    Document Two:
    -------------

    First line of Document Two

    Second line of Document Two
    Document Three:
    ---------------
    First line of Document Three


    """

    # Define the document-boundary function: a boundary occurs whenever the
    # upcoming line (line2) starts with "Document"; line1 is not consulted.
    def document_boundary(line1, line2):
        return line2.strip().startswith('Document')

    # Loop over the documents.
    lines = text.splitlines()
    print('\nsplitby_example 2\n')
    for document in textmining.splitby(lines, document_boundary):
        # Skip if the first line doesn't match the document structure.
        if not document[0].strip().startswith('Document'):
            continue
        # Drop blank lines and strip whitespace to produce the clean entry.
        clean_lines = [line.strip() for line in document if line.strip()]
        # Output the cleaned document.
        print('\n'.join(clean_lines))
        # Empty separator line; print('') is used because a bare print()
        # would output "()" under Python 2.
        print('')