Exemplo n.º 1
0
        # comment out lines dependent on methods that you haven't yet implemented
        print "Statistics for", self.filename

        print self.numsents, "sentences"
        print self.numtokens, "word tokens"
        print self.numtypes, "word types"

        print '{0:.2f} average tokens per type'.format(self.numtokens/self.numtypes)
        print '{0:.2f} average sentence length'.format(self.numtokens/self.numsents)

        print '{0:.2f} average word token length'.format(expectation(self.token_lengths))
        print '{0:.2f} average word type length'.format(expectation(self.type_lengths))
        
        print "Hapax legomena comprise {0:.2%} of the types".format(num_hapaxes(self.word_counts)/self.numtypes)

        #self.plot_freq(self.filename+'_freq.png')
        #self.plot_lengths(self.filename+'_lengths.png')

if __name__ == '__main__':
    # Script entry point: report statistics for the corpus file named by the
    # first command-line argument, then for two NGram models built from the
    # corpus's word counts and character counts.
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare
        # IndexError traceback; exit with a usage message instead
        # (sys.exit with a string prints it to stderr and exits non-zero).
        sys.exit('usage: {0} <corpus-file>'.format(sys.argv[0]))

    filename = sys.argv[1]
    corpus = Corpus(filename, casefold=True)
    corpus.display_stats()

    # Bare print statement: emits a blank line between the reports.
    print
    word_model = NGram(1, 'word', corpus.word_counts)
    word_model.display_stats()

    print
    char_model = NGram(1, 'character', corpus.char_counts)
    char_model.display_stats()
Exemplo n.º 2
0
        print '{0:.2f} average tokens per type'.format(self.numtokens /
                                                       self.numtypes)
        print '{0:.2f} average sentence length'.format(self.numtokens /
                                                       self.numsents)

        print '{0:.2f} average word token length'.format(
            expectation(self.token_lengths))
        print '{0:.2f} average word type length'.format(
            expectation(self.type_lengths))

        print "Hapax legomena comprise {0:.2%} of the types".format(
            num_hapaxes(self.word_counts) / self.numtypes)

        #self.plot_freq(self.filename+'_freq.png')
        #self.plot_lengths(self.filename+'_lengths.png')


if __name__ == '__main__':
    # Script entry point: the first command-line argument names the corpus
    # file. Statistics are displayed for the corpus itself, then for an
    # NGram model over its word counts and one over its character counts.
    if len(sys.argv) < 2:
        # A missing argument would otherwise raise IndexError on
        # sys.argv[1]; report the expected invocation and exit non-zero.
        sys.exit('usage: {0} <corpus-file>'.format(sys.argv[0]))

    filename = sys.argv[1]
    corpus = Corpus(filename, casefold=True)
    corpus.display_stats()

    # Bare print statement: emits a blank line between the reports.
    print
    word_model = NGram(1, 'word', corpus.word_counts)
    word_model.display_stats()

    print
    char_model = NGram(1, 'character', corpus.char_counts)
    char_model.display_stats()