示例#1
0
def get_word_list(file_name):
    """Return the lower-cased words of a Project Gutenberg book.

    If ``file_name`` does not exist yet, the book is first fetched from
    ``http://www.gutenberg.org/cache/epub/<id>/<file_name>`` and saved under
    that name, where ``<id>`` is ``file_name[2:-4]`` (e.g. ``'pg32325.txt'``
    gives ``'32325'``).  The saved file is then read back.

    Only the text from the ``*** START OF THIS PROJECT GUTENBERG EBOOK``
    marker up to (not including) the ``*** END OF THIS PROJECT GUTENBERG
    EBOOK`` marker is kept.  Note that the slice begins *at* the start
    marker, so the words of that marker line are part of the result.
    Every character in ``string.punctuation`` except the apostrophe is
    replaced by a space, the text is lower-cased and split on whitespace.

    Args:
        file_name: Path of the local book file; also used to build the
            download URL when the file is missing.

    Returns:
        A list of the words in the book, in order of appearance.

    Raises:
        ValueError: If either marker is missing from the text.
    """
    if not exists(file_name):
        # Fetch *before* creating the file: if the download fails we must not
        # leave an empty file behind, or every later call would take the
        # "already cached" path and fail on the missing markers.
        # NOTE(review): URL is presumably pattern.web.URL -- confirm against
        # the file's imports.
        page = URL('http://www.gutenberg.org/cache/epub/' +
                   file_name[2:-4] + '/' + file_name)
        contents = page.read().strip()
        with open(file_name, 'w') as book:
            book.write(contents)

    # The context manager guarantees the handle is closed even on error.
    with open(file_name, 'r') as book:
        txt = book.read()

    start = txt.index('*** START OF THIS PROJECT GUTENBERG EBOOK')
    end = txt.index('*** END OF THIS PROJECT GUTENBERG EBOOK')
    txt = txt[start:end]

    for dot in string.punctuation:
        # Keep apostrophes so contractions such as "don't" stay one word.
        # Compare by value: identity checks against a literal are unreliable.
        if dot != "'":
            txt = txt.replace(dot, " ")

    return txt.lower().split()