def splitby_example():
    """Demonstrate ``textmining.splitby`` with the default and a custom splitter.

    Two demos are printed to stdout:

    1. The default split function, which groups lines into paragraphs. It
       places a paragraph boundary between a non-blank line and a blank
       line.
    2. A user-defined split function that cuts a messy text into three
       documents, which are then cleaned up and printed.

    Returns nothing; the function exists only for its printed output.
    """
    # The splitby function in the textmining package allows for flexible
    # chunking of a long text document into smaller groups of lines
    # according to a user-defined split function.

    # Demo 1: default split function (paragraph boundaries).
    # NOTE(review): the sample must span several lines for splitlines() to
    # yield more than one entry; the blank-line layout here is illustrative.
    text = """
    Hello there how are you today?
    I hope you are doing well.

    Thanks for using the textmining module!

    """
    lines = text.splitlines()
    print('\nsplitby_example 1\n')
    for paragraph in textmining.splitby(lines):
        # paragraph is a list of lines.
        # Notice that the last paragraph will just contain
        # lines of spaces as there is no text in it.
        print(paragraph)

    # Demo 2: a custom split function for a more complicated document
    # structure. We want to extract three cleaned-up documents from the
    # following messy text string (a common preprocessing task in text
    # mining).
    text = """

    Document One:
    -------------

    First line of Document One
    Second line of Document One

    Third line of Document One


    Document Two:
    -------------
    First line of Document Two

    Second line of Document Two


    Document Three:
    ---------------
    First line of Document Three

    """

    # Split function for this document structure: a split is reported in
    # front of every line that starts with the word 'Document'. Only the
    # second line of each pair is inspected.
    def document_boundary(line1, line2):
        return line2.strip().startswith('Document')

    # Loop over documents
    lines = text.splitlines()
    print('\nsplitby_example 2\n')
    for document in textmining.splitby(lines, document_boundary):
        # Skip groups whose first line is not a document header (e.g. the
        # leading blank lines before 'Document One:').
        if not document[0].strip().startswith('Document'):
            continue
        # document is a list of lines. Remove blank lines and strip out
        # whitespace to create a clean document.
        clean_lines = [line.strip() for line in document if line.strip()]
        # Print out clean document, followed by an empty separator line.
        print('\n'.join(clean_lines))
        print()
def get_docs_from_reuters_file(corpus, start_tag='<REUTERS'):
    """
    Split the news items contained in one Reuters SGML file into documents.

    The text is cut into groups of lines with ``tm.splitby``. The boundary
    callback reports a split in front of every line whose stripped text
    starts with ``start_tag``. Groups whose first line does not start with
    the tag (for example a preamble before the first news item) are
    discarded.

    :param corpus: the content of the Reuters file as a single string
    :param start_tag: prefix marking the first line of a news item. It must
        be non-empty: every string starts with the empty string, so an
        empty prefix would match every line and nothing would be filtered.
    :return: list of documents; each document is one string made of the
        stripped, non-blank lines of a news item joined by newlines
    """
    # Split function for this document structure. Only the second line of
    # each pair is inspected: a split belongs in front of a line that opens
    # a new news item.
    def document_boundary(line1, line2):
        return line2.strip().startswith(start_tag)

    docs = []
    # Loop over documents
    lines = corpus.splitlines()
    for document in tm.splitby(lines, document_boundary):
        # Skip empty groups and groups whose first line (document[0]) does
        # not open a news item.
        if not document or not document[0].strip().startswith(start_tag):
            continue
        # document is a list of lines. Remove blank lines and strip out
        # whitespace to create a clean document.
        clean_lines = [line.strip() for line in document if line.strip()]
        docs.append('\n'.join(clean_lines))
    return docs
def splitby_example():
    """Show how ``textmining.splitby`` chunks a list of lines.

    Prints two demonstrations to stdout: paragraph splitting with the
    default split function, then document extraction with a custom split
    function. Nothing is returned.
    """
    # The splitby function in the textmining package allows for flexible
    # chunking of a long text document into smaller groups of lines
    # according to a user-defined split function.

    # First use the default split function, which splits a sequence of
    # lines into groups corresponding to paragraphs. It defines a paragraph
    # boundary to lie between a non-blank line and a blank line.
    # NOTE(review): the sample has to be multi-line for the demo to work;
    # the blank-line layout here is illustrative.
    paragraph_text = """
    Hello there how are you today?
    I hope you are doing well.

    Thanks for using the textmining module!

    """
    print('\nsplitby_example 1\n')
    for paragraph in textmining.splitby(paragraph_text.splitlines()):
        # paragraph is a list of lines.
        # Notice that the last paragraph will just contain
        # lines of spaces as there is no text in it.
        print(paragraph)

    # Now use a custom split function to process a more complicated
    # document structure: extract three cleaned-up documents from the
    # following messy text string (a common preprocessing task in text
    # mining).
    document_text = """

    Document One:
    -------------

    First line of Document One
    Second line of Document One

    Third line of Document One


    Document Two:
    -------------
    First line of Document Two

    Second line of Document Two


    Document Three:
    ---------------
    First line of Document Three

    """

    def is_header(line):
        # A document header is a line whose first word is 'Document'.
        return line.strip().startswith('Document')

    # Split function for this document structure: report a split in front
    # of every header line. The first line of the pair is not needed.
    def document_boundary(line1, line2):
        return is_header(line2)

    print('\nsplitby_example 2\n')
    for document in textmining.splitby(document_text.splitlines(),
                                       document_boundary):
        # Skip groups that do not begin with a header (document[0]), such
        # as the blank lines in front of 'Document One:'.
        if not is_header(document[0]):
            continue
        # document is a list of lines. Drop blank lines and strip
        # surrounding whitespace to create a clean document.
        stripped = (line.strip() for line in document)
        clean_lines = [line for line in stripped if line]
        # Print the clean document followed by an empty separator line.
        # print() is used as a function so the code is valid on Python 3.
        print('\n'.join(clean_lines))
        print()
def splitby_example():
    """Walk through ``textmining.splitby`` on two small sample texts.

    The first sample is split with the default split function, the second
    with a custom one. All results are printed to stdout; nothing is
    returned.
    """
    # The splitby function in the textmining package is very useful.
    # It allows a long text to be flexibly divided into shorter texts or
    # entries according to a user-defined split function.

    # First let's use the default split function which splits a sequence
    # of lines into groups corresponding to paragraphs. The function
    # defines a paragraph boundary to lie between a non-blank line and
    # blank line.
    # NOTE(review): the sample needs real line breaks for splitlines() to
    # produce several lines; the blank-line layout here is illustrative.
    text = """
    Hello there how are you today?
    I hope you are doing well.

    Thanks for using the textmining module!

    """
    lines = text.splitlines()
    print('\nsplitby_example 1\n')
    for paragraph in textmining.splitby(lines):
        # paragraph is a list of lines.
        # Notice that the last paragraph will just contain
        # lines of spaces as there is no text in it.
        print(paragraph)

    # Use a custom split function to handle a more complicated text,
    # removing meaningless whitespace to obtain cleaner entries.
    text = """

    Document One:
    -------------

    First line of Document One
    Second line of Document One

    Third line of Document One


    Document Two:
    -------------
    First line of Document Two

    Second line of Document Two


    Document Three:
    ---------------
    First line of Document Three

    """

    # Boundary function for this text: a split is reported in front of
    # every line that starts with 'Document'.
    def document_boundary(line1, line2):
        return line2.strip().startswith('Document')

    # Loop over the documents.
    lines = text.splitlines()
    print('\nsplitby_example 2\n')
    for document in textmining.splitby(lines, document_boundary):
        # Skip the group if its first line does not match the document
        # structure.
        first_line = document[0].strip()
        if not first_line.startswith('Document'):
            continue
        # Remove blank lines and surrounding whitespace to get the
        # cleaned-up entry.
        clean_lines = []
        for line in document:
            content = line.strip()
            if content:
                clean_lines.append(content)
        # Output the entry followed by an empty separator line. print() is
        # called as a function so the code is valid on Python 3.
        print('\n'.join(clean_lines))
        print()