Example No. 1
    def create(self, text=''):

        #rfi = readabilityFilesInstaller()
        # NewDaleChallWordsFile = open(rfi.getPath("Dale-Chall wordlist")[0]).read()
        #  NewDaleChallWordsList = NewDaleChallWordsFile.split(';')

        # Build the array of per-word difficulty values
        ta = textanalyzer("eng")
        raw_sentences = ta.getSentences(text)

        values = []
        sentence_values = []
        max_words = 0
        print "\n\n\nhalllo\n\n\n"
        for sentence in raw_sentences:
            raw_words = ta.getWords(sentence)
            if len(raw_words) > max_words:
                max_words = len(raw_words)
            for word in raw_words:
                value = 0.0
                #   if word.lower() in NewDaleChallWordsList:
                #        value = 0.25
                #   else:
                #        value = 0.5

                if word.isdigit():
                    value = 0.0
                sentence_values.append(value)
            values.append(sentence_values)
            sentence_values = []

        # Pad shorter sentences so every row has max_words entries (fill value 1.0)
        for value in values:
            while len(value) < max_words:
                value.append(1.0)

        values.reverse()
        a = array(values)

        fig = dale_plt.figure()

        # Build the y-axis labels (sentence numbers, descending)

        i = len(values)
        ylabels = []
        while i > 0:
            ylabels.append(i)
            i = i - 1

        yticks(arange(len(values)) + 0.5, ylabels)

        # Create the pcolor plot
        pcolor(a, cmap=self.my_cmap, norm=normalize(vmin=0.0, vmax=1.0))
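
The example above fills a matrix of per-word difficulty values, pads every sentence row to the same length, and renders it with matplotlib's `pcolor` through the old lowercase `normalize` alias and global pyplot state. Below is a minimal, self-contained sketch of the same idea against current matplotlib (`Normalize`, `pcolormesh`); the whitespace tokenization, the easy-word set, the `RdYlGn_r` colormap, and the 0.25/0.5/1.0 difficulty values are illustrative assumptions, not the project's `textanalyzer` or its custom colormap.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

def word_difficulty_matrix(sentences, easy_words):
    # One row per sentence, one value per word: 0.25 for easy words,
    # 0.5 otherwise (the values the commented-out Dale-Chall check used).
    rows = []
    for sentence in sentences:
        words = sentence.split()
        rows.append([0.25 if w.lower() in easy_words else 0.5 for w in words])
    max_words = max(len(r) for r in rows)
    for r in rows:
        r.extend([1.0] * (max_words - len(r)))   # pad shorter sentences
    return np.array(rows[::-1])                  # first sentence on top

sentences = ["This is easy", "Jurisprudence is convoluted and arcane"]
easy = {"this", "is", "easy", "and"}
a = word_difficulty_matrix(sentences, easy)

fig, ax = plt.subplots()
ax.pcolormesh(a, cmap="RdYlGn_r", norm=Normalize(vmin=0.0, vmax=1.0))
ax.set_yticks(np.arange(len(sentences)) + 0.5)
ax.set_yticklabels([str(i) for i in range(len(sentences), 0, -1)])
plt.show()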
Example No. 2
    def create_enhanced_dale_chall_list(self):
        # Sites whose privacy policies are used to build the frequent-word list
        alexa_list = [
            'Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia',
            'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay',
            'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin',
            'Craigslist', 'Ask'
        ]

        #bring all privacy texts into one list
        corpus = []
        data = get_all_policies()
        for site in data:
            if site in alexa_list:
                corpus.append(data[site]["text"])

        # Tokenize the combined corpus into a list of words
        t = textanalyzer("eng")
        words = t.getWords("".join(corpus))

        # Open the Dale-Chall word list
        dale_chall_list = open(
            '../nltk_contrib/dale_chall_wordlist.txt').read().split(';')

        # Keep only the words from the 20 privacy policies that are not on the Dale-Chall list of easy words
        new_corpus = []

        for word in words:
            if word.lower() not in dale_chall_list and word not in alexa_list:
                new_corpus.append(word.lower())

        # Build a frequency distribution of the remaining words
        fdist = FreqDist(new_corpus)
        # Plot the cumulative frequencies of the 80 most common words
        fdist.plot(80, cumulative=True)

        # Collect the words that together account for 33% of the tokens not on the Dale-Chall list (cumulative frequency)
        most_frequ = []
        cum_percentage = 0.0
        for sample in fdist:
            cum_percentage += fdist.freq(sample)
            most_frequ.append(sample)
            if cum_percentage > 0.33:
                break

        # Write the resulting word list to a file
        privacy_file = open("privacy_wordlist.txt", "w")
        privacy_file.write(";".join(most_frequ))
        privacy_file.close()
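
The cutoff loop above relies on `for sample in fdist` visiting samples in decreasing order of frequency, which held for older NLTK releases; in recent NLTK versions `FreqDist` is a `collections.Counter`, so plain iteration is not sorted and `most_common()` is the safe equivalent. A minimal sketch of the same cumulative 33% cutoff, with an illustrative token list standing in for the tokenized privacy-policy corpus:

from nltk import FreqDist

# Illustrative tokens, not the real corpus
tokens = ["privacy", "data", "cookie", "cookie", "privacy", "privacy", "share"]
fdist = FreqDist(tokens)

most_frequent = []
cumulative = 0.0
for word, _ in fdist.most_common():
    cumulative += fdist.freq(word)       # freq() = count / total tokens
    most_frequent.append(word)
    if cumulative > 0.33:
        break

with open("privacy_wordlist.txt", "w") as handle:
    handle.write(";".join(most_frequent))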
Example No. 3
    def create(self, data={}):
             
        sites = data.keys()
        
        ######################################
        # To calculate a grade level score:
        # 1. Randomly select three separate 100 word passages. 
        # 2. Count the number of sentences in each 100 word sample (estimate to nearest tenth).
        # 3. Count the number of syllables in each 100 word sample. (Each numeral is a syllable. For example, 2007 is 5 syllables -- two-thou-sand-se-ven -- and one word.)
        # 4. Plot the average sentence length and the average number of syllables on the graph.
        # The area in which it falls is the approximate grade level.
        ######################################
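        # Worked example of step 2 (illustrative numbers, not from the data):
        # a 100-word sample that starts 30 words before the end of a 60-word
        # sentence contributes 30/60 = 0.5 of a sentence; three full sentences
        # of 20, 25 and 15 words add 3; the final 10 words of a 40-word
        # sentence add 10/40 = 0.25, for 0.5 + 3 + 0.25 = 3.75 sentences.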
        
        for site in sites: 
            site_sentences = []
            site_words = []
            sentence_lengths = []
            
            sentences_count = []
            syllables_count = []
            
            ta = textanalyzer("eng")
            site_sentences = ta.getSentences(data[site]['text'])
            
            words = ta.getWords(data[site]['text'])
            word_count = len(words)
            
            for sentence in site_sentences:
                site_words.append(ta.getWords(sentence))
                sentence_lengths.append(len(ta.getWords(sentence)))
            
            sample_size = 0
            if word_count < 100:
                sample_size = word_count
                number_of_iterations = 1
            else:
                sample_size = 100
                if word_count < 200:
                    number_of_iterations = 1
                elif word_count < 300:
                    number_of_iterations = 2
                else:
                    number_of_iterations = 3
                
            j = 1
            
            while j <= number_of_iterations:
                count_index = j - 1
                
                if word_count < 100:
                    start = 0
                else:
                    start = randint(0, word_count - (sample_size * number_of_iterations))
                
                # Count syllables in the sample
                sample_words = words[start:start + sample_size]
    
                # Count sentences

                # Find the sentence in which the sample window starts
                i = 0
                start_value = start
                while (start_value - sentence_lengths[i]) > 0:
                    start_value = start_value - sentence_lengths[i]
                    i += 1
            
                sentences_count_rest = sentence_lengths[i] - start_value
                sentences_count.append(0.0)
                words_to_count_for = sample_size - sentences_count_rest
                rest = float(sentences_count_rest) / sentence_lengths[i]
                
                # Count off 100 words (minus the leftover words from the preceding sentence)
                i += 1
                while (words_to_count_for - sentence_lengths[i]) > 0:
                    words_to_count_for = words_to_count_for - sentence_lengths[i]
                    sentences_count[count_index] = sentences_count[count_index] + 1
                    i += 1
                
                # Add the fractional sentences before and after the fully counted ones
                rest = rest + (float(words_to_count_for) / sentence_lengths[i])
                
                # Record the values for the current sample
                sentences_count[count_index] = sentences_count[count_index] + rest
                syllables_count.append(ta.countSyllables(sample_words))
                
                # If the text has fewer than 100 words, scale the counts up to a 100-word basis
                if word_count < 100:
                    sentences_count[count_index] = sentences_count[count_index] * 100.0 / word_count
                    syllables_count[count_index] = syllables_count[count_index] * 100.0 / word_count
                
                # Move on to the next sample
                j += 1
                
            data[site]['Syllables'] = float(sum(syllables_count)) / len(syllables_count)
            data[site]['Sentences'] = float(sum(sentences_count)) / len(sentences_count)
                      
        fig = fry_plt.figure(figsize=(8.15,5.03))
        
        ax = fig.add_subplot(111)
        
        # Hide the axes frame
        ax.set_frame_on(False)
        
        for site in sites:
            ax.plot(self.get_x_value(data[site]['Syllables']), self.get_y_value(data[site]['Sentences']), 'bo', ms=5)
            ax.annotate(site, (self.get_x_value(data[site]['Syllables']) - 6, self.get_y_value(data[site]['Sentences'])))

        fig.figimage(self.im, 82, 40)
     
        fry_plt.xlim([108, 174])
        fry_plt.xlabel("Average Number of Syllables per 100 words")
        fry_plt.xticks(arange(108, 174, 4))
        
        fry_plt.ylim([0,29])
        fry_plt.ylabel("Average Number of Sentences per 100 words")
        # Tick labels according to the Fry graph
        y_ticks = ['2.0','2.5','3.0','3.3','3.5','3.6','3.7','3.8','4.0','4.2','4.3','4.5','4.8','5.0','5.2','5.6','5.9','6.3','6.7','7.1','7.7','8.3','9.1','10.0','11.1','12.5','14.3','16.7','20','25+']
        fry_plt.yticks(arange(30), y_ticks)
        
        labels = sites
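
The sample loop above tracks sentence boundaries with manual index arithmetic; the float() casts matter because under Python 2 plain integer division would drop exactly the fractional sentences the Fry procedure asks for. As a cross-check, here is a minimal standalone sketch of the per-sample sentence count; the helper name and the sentence lengths are illustrative, not part of the project.

def sentences_in_sample(sentence_lengths, start, sample_size=100):
    # Count the sentences covered by `sample_size` consecutive words starting
    # at word index `start`, counting partial sentences as fractions.
    total = 0.0
    words_left = sample_size
    position = start
    for length in sentence_lengths:
        if position >= length:              # sample begins after this sentence
            position -= length
            continue
        covered = min(length - position, words_left)
        total += float(covered) / length
        words_left -= covered
        position = 0
        if words_left == 0:
            break
    return total

# Sentences of 60, 20, 25, 15 and 40 words, sample starting at word 30:
print(sentences_in_sample([60, 20, 25, 15, 40], start=30))   # 3.75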