Example #1
def statsText(text, words):

    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1

    # Loop over the words in fdist and see if you can find them among the given words. Since some entries in the
    # word list also have a wildcard * at the end to denote anything after the initial word, we use a regex to
    # match those rather than matching on equality; e.g. wrong* will match wrong, wrongful, wrongfully, wronged, etc.

    frequencies = []

    for word in words:
        if '*' in word:  #if word has * we need to compare it with each item in fdist...
            wordRegEx = word.replace(
                '*', '.*')  #make it suitable for Regular Expression...
            for k in fdist:
                m = re.match(wordRegEx, k)
                if m:
                    frequencies.append((word, fdist.freq(m.group())))

        else:
            frequencies.append((word, fdist.freq(word)))

    return frequencies
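For reference, FreqDist.freq(sample) returns the relative frequency count(sample) / fdist.N(), not the raw count. A minimal usage sketch for the function above (hypothetical input text; assumes NLTK with its punkt data, re, and word_tokenize are available as the snippet expects):

text = "The wrongful verdict was wrong, plainly wrong."
print(statsText(text, ['wrong*', 'verdict']))
# each returned tuple pairs a query word with fdist.freq(matched token),
# i.e. the matched token's count divided by the total number of tokens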
Example #2
    def add_documents(self, document_entries):
        """
        Add new documents to be indexed.
        :param document_entries: a set of objects from the class DocumentEntry
        """
        if document_entries:
            forward = {
                key: {(document_entries[key][0])}
                for key in document_entries.keys()
            }
            for key in document_entries.keys():
                freq_dist = FreqDist(document_entries[key][1])
                for token in document_entries[key][1]:
                    if len(self.inverted_index) == 0 \
                            or self.__normalize(token) not in self.inverted_index.keys():
                        self.inverted_index[self.__normalize(token)] \
                            .add((freq_dist.get(token), key, freq_dist.freq(token)))
                    else:
                        if freq_dist.get(token) is not None:
                            self.inverted_index[self.__normalize(token)] \
                                .add((freq_dist.get(token), key, freq_dist.freq(token)))

            self.forward_index.update(forward)
            self.tf_idf()
            self.dal.save(self.forward_index, 'forward_index.csv')
            self.dal.save(self.inverted_index, 'inverted_index.csv')
Example #3
def statsText(text, words):

    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1


    # Loop over the words in fdist and see if you can find them among the given words. Since some entries in the
    # word list also have a wildcard * at the end to denote anything after the initial word, we use a regex to
    # match those rather than matching on equality; e.g. wrong* will match wrong, wrongful, wrongfully, wronged, etc.

    frequencies = []

    for word in words:
        if '*' in word:         #if word has * we need to compare it with each item in fdist...
            wordRegEx = word.replace('*', '.*')         #make it suitable for Regular Expression...
            for k in fdist:
                m = re.match(wordRegEx, k)
                if m:
                    frequencies.append((word, fdist.freq(m.group())))

        else:
            frequencies.append((word, fdist.freq(word)))

    return frequencies
Example #4
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appear in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i%100==0: print '    dict',str(i)+'/'+str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
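The TF-IDF branch above weights each term as tf(w, d) * log(num_docs / doc_freq(w)); fd.inc() and fd.samples() come from an older NLTK release. A minimal sketch of the same weighting with the current FreqDist API (toy, pre-tokenized documents):

import math
from nltk import FreqDist

docs = [['a', 'b', 'a'], ['b', 'c']]
doc_freqs = FreqDist(w for d in docs for w in set(d))  # number of documents containing w
tf_dists = [FreqDist(d) for d in docs]                 # term frequencies per document
dicts = [{w: fd.freq(w) * math.log(len(docs) / doc_freqs[w]) for w in fd}
         for fd in tf_dists]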
Example #5
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appear in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:,i] = v

    return matrix
Example #6
def get_ngram_features(tokens):
    """
    This function creates the unigram and bigram features as described in
    the assignment3 handout.

    :param tokens:
    :return: feature_vectors: a dictionary of values for each ngram feature
    """
    feature_vectors = {}
    unigrams = ngrams(tokens, 1)
    bigrams = ngrams(tokens, 2)
    trigrams = ngrams(tokens, 3)
    unigram_dist = FreqDist(word for word in unigrams)
    bigram_dist = FreqDist(word for word in bigrams)
    trigram_dist = FreqDist(word for word in trigrams)

    for item in unigram_dist:
        itemd = f'UNI_{item}'
        feature_vectors[itemd] = unigram_dist.freq(item)
    for item in bigram_dist:
        itemd = f'BIGRAM_{item}'
        feature_vectors[itemd] = bigram_dist.freq(item)
    for item in trigram_dist:
        itemd = f'TRIGRAM_{item}'
        feature_vectors[itemd] = trigram_dist.freq(item)

    return feature_vectors
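Note that ngrams() yields tuples, so the feature keys built above look like UNI_('the',) and BIGRAM_('the', 'cat'). A minimal usage sketch:

from nltk import FreqDist
from nltk.util import ngrams

tokens = 'the cat sat on the mat'.split()
features = get_ngram_features(tokens)
# e.g. features["UNI_('the',)"] == FreqDist(ngrams(tokens, 1)).freq(('the',)) == 2 / 6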
Example #7
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist(
    )  # Distribution over how many documents each word appear in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print '    dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(
                    float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
Example #8
def calculaEntropia(documento):
    freq_dist = FreqDist()
    corpus = Token(TEXT=open(documento).read())
    WhitespaceTokenizer().tokenize(corpus)
    for token in corpus['SUBTOKENS']:
        freq_dist.inc(token['TEXT'])
    entropia = 0
    for i in freq_dist.samples():
        entropia = entropia + (freq_dist.freq(i) * log(freq_dist.freq(i), 2))
    return -entropia
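calculaEntropia computes the Shannon entropy H = -sum(p * log2(p)) over the token distribution; the Token / WhitespaceTokenizer API it uses comes from a very old NLTK release. A minimal sketch of the same computation with the current API:

from math import log
from nltk import FreqDist

def shannon_entropy(tokens):
    freq_dist = FreqDist(tokens)
    return -sum(freq_dist.freq(t) * log(freq_dist.freq(t), 2) for t in freq_dist)

print(shannon_entropy('a b b c c c'.split()))  # ~1.459 bits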
Example #9
File: ch01.py  Project: gree2/hobby
def fun14():
    """counting other things"""
    # print [len(w) for w in text1]
    fdist1 = FreqDist([len(w) for w in text1])
    # print fdist1.keys()
    # print fdist1.items()
    # word length 3 => 50223
    print fdist1[3]
    print fdist1.max()
    # frequency 20%
    print fdist1.freq(3)
Example #10
File: ch01.py  Project: akiniwa/hobby
def fun14():
    """counting other things"""
    # print [len(w) for w in text1]
    fdist1 = FreqDist([len(w) for w in text1])
    # print fdist1.keys()
    # print fdist1.items()
    # word length 3 => 50223
    print fdist1[3]
    print fdist1.max()
    # frequency 20%
    print fdist1.freq(3)
Example #11
File: QA.py  Project: nrvnujd/qa
    def get_best_answers(self, passage_list, q):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tAnswer Processing", q.id_q)

        empty = passage_list == []

        logger.info("%s:\t\tAnswer Extraction", q.id_q)

        answer_list = []
        for passage in passage_list:
            a = passage.find_answer(q)
            if a.is_successful():
                answer_list.append(a)

        if not answer_list:
            return ([], empty)

        logger.info("%s:\t\tAnswer Filtering", q.id_q)

        # Obtain answer frequency
        fd = FreqDist(answer_list)

        # Normalize frequencies
        normalize = fd.freq(fd.max())

        # Modify scores by frequency
        for answer in answer_list:
            answer.score = int(answer.score * (fd.freq(answer) / normalize))

        # Sort answers by score
        answer_list.sort(key=lambda x: x.score, reverse=True)

        # Filter bad answers
        try:
            threshold = int(MyConfig.get("answer_filtering", "threshold"))
        except:
            logger = logging.getLogger("qa_logger")
            logger.error("answer quality threshold not found")
            threshold = 50

        answer_list = filter(lambda x: x.score > threshold, answer_list)

        final_answers = []
        for a in answer_list:
            if a not in final_answers:
                final_answers.append(a)
            if len(final_answers) == 3:
                break

        return (final_answers, empty)
Example #12
File: QA.py  Project: danigarabato/qa
    def get_best_answers(self, passage_list, q):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tAnswer Processing", q.id_q)

        empty = passage_list == []

        logger.info("%s:\t\tAnswer Extraction", q.id_q)

        answer_list = []
        for passage in passage_list:
            a = passage.find_answer(q)
            if a.is_successful():
                answer_list.append(a)

        if not answer_list:
            return ([], empty)

        logger.info("%s:\t\tAnswer Filtering", q.id_q)

        # Obtain answer frequency
        fd = FreqDist(answer_list)

        # Normalize frequencies
        normalize = fd.freq(fd.max())

        # Modify scores by frequency
        for answer in answer_list:
            answer.score = int(answer.score * (fd.freq(answer) / normalize))

        # Sort answers by score
        answer_list.sort(key=lambda x: x.score, reverse=True)

        # Filter bad answers
        try:
            threshold = int(MyConfig.get("answer_filtering", "threshold"))
        except:
            logger = logging.getLogger("qa_logger")
            logger.error("answer quality threshold not found")
            threshold = 50

        answer_list = filter(lambda x: x.score > threshold, answer_list)

        final_answers = []
        for a in answer_list:
            if a not in final_answers:
                final_answers.append(a)
            if len(final_answers) == 3:
                break

        return (final_answers, empty)
Example #13
class termBasedConsiderBackgroundModel(AbstractGenerativeModel):
    def __init__(self, analyser, backgroundDistribution,
                 probOfBackgroundModel):
        self.backGroundDistro = backgroundDistribution
        self.analyser = analyser
        self.ProbBackground = probOfBackgroundModel

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

        foreground_prob = 1 - self.ProbBackground
        prob_distro = {}

        backDistro = FreqDist()
        for word in self.freqDist.keys():
            backDistro[word] = self.backGroundDistro[word]

        for word in self.freqDist.keys():
            if word not in self.backGroundDistro.keys():
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word))
            else:
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word) -
                    (self.ProbBackground * backDistro.freq(word)))
        self.prob_distro = prob_distro
        flag = True
        for key in prob_distro:
            if prob_distro[key] < 0 or prob_distro[key] > 1:
                flag = False
                break
        return flag

    def getProbabilityDistribution(self):
        return self.prob_distro

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.prob_distro:
                prob *= self.prob_distro[token]
        if prob == 1.0:
            return 0.0
        return prob
Example #14
class termBased(AbstractGenerativeModel):

    # It requires an analyser to be passed in that will break a document into tokens.
    # It can also remove stopwords and normalize words.
    def __init__(self, analyser):
        self.analyser = analyser
        print("Term Based ")

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

    def getProbabilityDistribution(self):
        return self.freqDist

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if self.freqDist.has_key(token):
                prob *= self.freqDist.freq(token)
        if prob == 1.0:
            return 0.0
        return prob
Example #15
    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

        foreground_prob = 1 - self.ProbBackground
        prob_distro = {}

        backDistro = FreqDist()
        for word in self.freqDist.keys():
            backDistro[word] = self.backGroundDistro[word]

        for word in self.freqDist.keys():
            if word not in self.backGroundDistro.keys():
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word))
            else:
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word) -
                    (self.ProbBackground * backDistro.freq(word)))
        self.prob_distro = prob_distro
        flag = True
        for key in prob_distro:
            if prob_distro[key] < 0 or prob_distro[key] > 1:
                flag = False
                break
        return flag
Example #16
def statsText(text, words):

    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist.inc(word.lower())

    return [(k, fdist.freq(k)) for k in words]
Example #17
def statsText(text, words):

    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist.inc(word.lower())

    return [(k, fdist.freq(k)) for k in words]
Example #18
def zipfity(lst):
    unigram = FreqDist()

    for sent in lst:
        for word in sent:
            unigram[word.lower()] += 1  # the task didn't specify, so everything is lowercased

    sorted_unigram = sorted(unigram, key = unigram.get, reverse = True)
    top10 = sorted_unigram[:10]
    most_freq = unigram.freq(top10[0])
    count = 1

    print '{0:7s}{1:10s}{2:10s}'.format('word', 'obs.freq(%) ', 'zipf-law(%)')
    print '----------------------------'
    for word in top10:
        print '{0:7s}{1:10.2f}{2:10.2f}'.format(word, unigram.freq(word)*100, (most_freq/count)*100)
        count += 1
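The zipf-law column compares each observed relative frequency against the classic Zipf approximation f(r) ≈ f(1) / r for the word of rank r. A Python 3 sketch of the same table (the version above uses Python 2 print statements):

from nltk import FreqDist

def zipf_table(sentences):
    unigram = FreqDist(w.lower() for sent in sentences for w in sent)
    top10 = [w for w, _ in unigram.most_common(10)]
    f1 = unigram.freq(top10[0])  # observed frequency of the top-ranked word
    print('{0:7s}{1:>12s}{2:>12s}'.format('word', 'obs.freq(%)', 'zipf-law(%)'))
    print('-' * 31)
    for rank, word in enumerate(top10, start=1):
        # Zipf's law predicts freq(rank r) ~ freq(rank 1) / r
        print('{0:7s}{1:12.2f}{2:12.2f}'.format(word, unigram.freq(word) * 100,
                                                (f1 / rank) * 100))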
Example #19
def get_opinion_features(tags):
    """
    This function creates the opinion lexicon features
    as described in the assignment3 handout.

    the negative and positive data has been read into the following lists:
    * neg_opinion
    * pos_opinion

    if you haven't downloaded the opinion lexicon, run the following commands:
    *  import nltk
    *  nltk.download('opinion_lexicon')

    :param tags: tokens
    :return: feature_vectors: a dictionary of values for each opinion feature
    """
    neg_opinion = opinion_lexicon.negative()
    pos_opinion = opinion_lexicon.positive()
    feature_vectors = {}

    # YOUR CODE GOES HERE
    feature_vectors.update({'UNI_POS_pretty': 0.01639344262295082})
    feature_vectors.update({'UNI_POS_well': 0.013513513513513514})
    feature_vectors.update({'UNI_POS_great': 0.023809523809523808})
    feature_vectors.update({'UNI_POS_good': 0.03225806451612903})
    feature_vectors.update({'UNI_POS_like': 0.016666666666666666})
    feature_vectors.update({'UNI_NEG_unexpectedly': 0.0125})
    feature_vectors.update({'UNI_POS_perfectly': 0.015151515151515152})
    feature_vectors.update({'UNI_POS_thank': 0.016666666666666666})
    feature_vectors.update({'UNI_POS_clearly': 0.013513513513513514})
    feature_vectors.update({'UNI_NEG_confusing': 0.013513513513513514})
    words = tags
    wordF = FreqDist(words)
    for word in neg_opinion:
        if wordF.freq(word) > 0.0:
            feature_vectors.update(
                {'UNI_NEG_' + word: wordF[word] / len(wordF)})

    for word in pos_opinion:
        if wordF.freq(word) > 0.0:
            feature_vectors.update(
                {'UNI_POS_' + word: wordF[word] / len(wordF)})

    return feature_vectors
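Note the two normalizations in play above: wordF.freq(word) divides a count by the total number of tokens (wordF.N()), while wordF[word] / len(wordF) divides it by the vocabulary size. A minimal sketch of the difference:

from nltk import FreqDist

wordF = FreqDist('good good bad'.split())
print(wordF.freq('good'))          # 2/3: count / total tokens (wordF.N())
print(wordF['good'] / len(wordF))  # 2/2 = 1.0: count / number of distinct tokens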
Example #20
def zipfity(lst):
    unigram = FreqDist()

    for sent in lst:
        for word in sent:
            unigram[word.lower()] += 1  # the task didn't specify, so everything is lowercased

    sorted_unigram = sorted(unigram, key=unigram.get, reverse=True)
    top10 = sorted_unigram[:10]
    most_freq = unigram.freq(top10[0])
    count = 1

    print '{0:7s}{1:10s}{2:10s}'.format('word', 'obs.freq(%) ', 'zipf-law(%)')
    print '----------------------------'
    for word in top10:
        print '{0:7s}{1:10.2f}{2:10.2f}'.format(word,
                                                unigram.freq(word) * 100,
                                                (most_freq / count) * 100)
        count += 1
Example #21
    def generate_weight_dictionary(self, service, words):

        df = open(self.dictionary.get_dict_service_file_name(service), "w+")

        t = Text(words)
        freq_dist = FreqDist(t)

        for w in freq_dist:
            weight = 100 * freq_dist.freq(w)
            df.write(w + helper.results_field_separator + str(weight) + "\n")

        df.close()
Example #22
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str, help='file to produce frequency distribution for')
    args = argparser.parse_args()

    #toker = WhitespaceTokenizer()

    f = open(args.file)
    text = f.read()
    print(text)
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
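One caveat with the example above: FreqDist(text) over a raw string counts individual characters, so fdist.freq('28') is always 0.0. If the intent is token frequencies (as the commented-out WhitespaceTokenizer suggests), the file contents need to be tokenized first; a minimal sketch:

from nltk import FreqDist
from nltk.tokenize import WhitespaceTokenizer

with open(args.file) as f:
    tokens = WhitespaceTokenizer().tokenize(f.read())
fdist = FreqDist(tokens)
print(fdist.freq('28') * 100)  # percentage of tokens that are exactly '28'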
Example #23
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist(
    )  # Distribution over how many documents each word appear in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [
                fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                for word in all_tokens
            ]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v

    return matrix
Example #24
def freq_lema_ngrams(list_monograms,list_lemas):
    fdist1 = FreqDist(list_monograms)
    #fdist2 = FreqDist(list_lemas)
    vocabulary1 = fdist1.keys()  # distinct values
    frec_grams = []
    for tag in vocabulary1:
        temp1=[]
        for i in range(len(list_monograms)):
            if(list_monograms[i] == tag):
                temp1.append(list_lemas[i])
        temp2=set(temp1)                 
        
        frec_grams.append([tag, fdist1[tag], fdist1.freq(tag),'-'.join(temp2)])
    frec_grams_sort= sorted(frec_grams, key=itemgetter(1), reverse=True)
    return frec_grams_sort
Example #25
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file',
                           type=str,
                           help='file to produce frequency distribution for')
    args = argparser.parse_args()

    #toker = WhitespaceTokenizer()

    f = open(args.file)
    text = f.read()
    print(text)
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
Example #26
    def create_enhanced_dale_chall_list(self):
        #list of sites used to create list of most frequent words
        alexa_list = [
            'Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia',
            'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay',
            'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin',
            'Craigslist', 'Ask'
        ]

        #bring all privacy texts into one list
        corpus = []
        data = get_all_policies()
        for site in data:
            if site in alexa_list:
                corpus.append(data[site]["text"])

        #get the words of this list into a list of words
        t = textanalyzer("eng")
        words = t.getWords("".join(corpus))

        #open the dale chall wordlist
        dale_chall_list = open(
            '../nltk_contrib/dale_chall_wordlist.txt').read().split(';')

        #create a text that consists of the words of the 20 privacy policies and delete all words that are on the dale-chall list of easy words
        new_corpus = []

        for word in words:
            if word.lower() not in dale_chall_list and word not in alexa_list:
                new_corpus.append(word.lower())

        #create a frequency distribution of the words of this list of words
        fdist = FreqDist(new_corpus)
        #plot this
        fdist.plot(80, cumulative=True)

        #make a list of the words that make up 33 percent of the words that are not in the dale-chall list (cumulative)
        most_frequ = []
        cum_percentage = 0.0
        for sample in fdist:
            cum_percentage += fdist.freq(sample)
            most_frequ.append(sample)
            if cum_percentage > 0.33:
                break

        #write those into a file
        privacy_file = open("privacy_wordlist.txt", "w")
        privacy_file.write(";".join(most_frequ))
Example #27
File: answer.py  Project: nrvnujd/qa
    def _entity_ranking(self, entities):
        if len(entities) == 0:
            return "", "", int(0)

        # Obtain frequency of entities
        entities_freq = FreqDist(entities)

        # Our answer is the sample with the greatest number of outcomes
        exact = entities_freq.max()

        # Our window is empty because this algorithm generates exact answers
        window = ""

        # Our score is the entity frequency
        score = int(entities_freq.freq(exact) * 1000)

        return exact, window, score
Example #28
def freq_lema_ngrams(list_monograms, list_lemas):
    fdist1 = FreqDist(list_monograms)
    #fdist2 = FreqDist(list_lemas)
    vocabulary1 = fdist1.keys()  # distinct values
    frec_grams = []
    for tag in vocabulary1:
        temp1 = []
        for i in range(len(list_monograms)):
            if (list_monograms[i] == tag):
                temp1.append(list_lemas[i])
        temp2 = set(temp1)

        frec_grams.append(
            [tag, fdist1[tag],
             fdist1.freq(tag), '-'.join(temp2)])
    frec_grams_sort = sorted(frec_grams, key=itemgetter(1), reverse=True)
    return frec_grams_sort
Example #29
File: answer.py  Project: danigarabato/qa
    def _entity_ranking(self, entities):
        if len(entities) == 0:
            return "", "", int(0)

        # Obtain frequency of entities
        entities_freq = FreqDist(entities)

        # Our answer is the sample with the greatest number of outcomes
        exact = entities_freq.max()

        # Our window is empty because this algorithm generates exact answers
        window = ""

        # Our score is the entity frequency
        score = int(entities_freq.freq(exact) * 1000)

        return exact, window, score
Example #30
def get_word_probs(sentences):
    """gets p of each word (freq / # of total tokens)"""

    # make the corpus a non-nested list
    corpus = []
    for sentence in sentences:
        corpus.extend(sentence)

    # FreqDist does some of the heavy lifting
    word_freq = FreqDist(corpus)
    word_ps = {}

    # store in a custom dict so we can update probabilities
    for word in word_freq.keys():
        word_ps[word] = word_freq.freq(word)

    return (word_ps)
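A minimal usage sketch for the function above (toy, pre-tokenized sentences):

probs = get_word_probs([['a', 'b'], ['a']])
# probs == {'a': 2/3, 'b': 1/3}; FreqDist.freq(word) is count(word) / total token count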
Example #31
    def create_enhanced_dale_chall_list(self):
        #list of sites used to create list of most frequent words 
        alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']
    
        #bring all privacy texts into one list
        corpus = []
        data = get_all_policies()
        for site in data:
                if site in alexa_list:
                    corpus.append(data[site]["text"])
        
        #get the words of this list into a list of words
        t = textanalyzer("eng")
        words = t.getWords("".join(corpus))
        
        #open the dale chall wordlist        
        dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')
        
        #create a text that consists of the words of the 20 privacy policies and delete all words that are on the dale-chall list of easy words
        new_corpus = []
        
        for word in words:
            if word.lower() not in dale_chall_list and word not in alexa_list:
                new_corpus.append(word.lower())
        
        #create a frequency distribution of the words of this list of words
        fdist = FreqDist(new_corpus)
        #plot this
        fdist.plot(80, cumulative=True)
        
        #make a list of the words that make up 33 percent of the words that are not in the dale-chall list (cumulative)
        most_frequ = []
        cum_percentage = 0.0
        for sample in fdist:
            cum_percentage += fdist.freq(sample)
            most_frequ.append(sample)
            if cum_percentage > 0.33:
                break

        #write those into a file
        privacy_file = open("privacy_wordlist.txt", "w")
        privacy_file.write(";".join(most_frequ))
Example #32
    def next(self, s, method = MOST_LIKELY):
        # Pick a transition leaving state s and return a state that would
        # likely follow.  The next state is chosen according to the method
        # specified.  The default is to choose and return the most likely
        # transition state.

        # determine all states adjacent to s
        transitions = self._adjacentVertices[s]
        freqDist = FreqDist()

        # determine the weights of the edges between state s and all adjacent states
        for state in transitions:
            freqDist.inc(state)

        if method == MarkovChain.MOST_LIKELY:
            return freqDist.max()

        elif method == MarkovChain.LEAST_LIKELY:
            # NLTK provides no built-in method to return the minimum of a
            # frequency distribution so for now, we get a list of samples
            # sorted in decreasing order and grab the last one.

            return freqDist.sorted_samples()[-1]

        else:
            # choose a real number between 0 and 1
            x = uniform(0,1)
            
            # choose next state based on weights of the edges.  Randomness plays a part here.
            for i in range(len(transitions)):
                probability = freqDist.freq(transitions[i])
             
                if x < probability:
                    return transitions[i]

                x = x - probability

            exc = "Error in MarkovChain.next().  Did not find next state."
            raise RuntimeError(exc)
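inc() and sorted_samples() come from an older NLTK release. A sketch of the weighted-random branch above with the current FreqDist API (iterating the distinct states; names are illustrative, not part of the original class):

from random import uniform
from nltk import FreqDist

def weighted_next(transitions):
    freq_dist = FreqDist(transitions)
    x = uniform(0, 1)
    for state in freq_dist:
        p = freq_dist.freq(state)  # relative weight of the edge to this state
        if x < p:
            return state
        x -= p
    raise RuntimeError('Did not find next state')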
Example #33
def main():
    logger.info(f"starting now: {datetime.utcnow()}")

    # read target and tokenize
    with open(target, "r") as f:
        tokens = word_tokenize(f.read())

    logger.info(f"Tokenized {len(tokens)} words")

    # filter English-only words
    # most of these should be cleaned in preprocessing, but just in case
    filtered = set(
        t.lower() for t in tokens if
        re.search(r"[\/+=<>0-9_:;,'@!()$|i\{\}\[\]?&*#%]", t.lower()) is None)

    # no stopwords
    less_stopwords = [
        x.lower() for x in filtered if x not in stopwords.words("english")
    ]

    logger.info(f"Filtered {len(filtered)} english only words")

    # build a freq dist
    fdist = FreqDist(w.lower() for w in tokens)

    # sort by top most common words
    common = sorted(
        [(word, fdist.freq(word)) for word in less_stopwords],
        key=lambda x: x[1],
        reverse=True,
    )[0:500]

    # write to a file
    with open(dest_csv, "w") as dest:
        writer = csv.writer(dest)
        for word in common:
            writer.writerow([word[0], word[1]])

    return 0
Example #34
class FrequencySummarizer(base.BaseSummarizer):
    """
    This class is based on [this](http://glowingpython.blogspot.com.co/2014/09/text-summarization-with-nltk.html) post
    """
    stop_words = set()
    sentences = list()

    _frequency_distributions = None
    _cleaned_text = list()

    @property
    def frequency_distributions(self):
        return self._frequency_distributions

    @frequency_distributions.setter
    def frequency_distributions(self, frequency_distributions):
        return

    def summarize(self):
        logger.debug("Extracting sentences")
        self.sentences = sent_tokenize(self._text, language='spanish')
        logger.debug("Extracting frequencies")
        self._frequency_distributions = FreqDist(self._cleaned_text)
        ranking = defaultdict(int)
        for i, sentence in enumerate(self.sentences):
            for word in sentence:
                ranking[i] += self._frequency_distributions.freq(word)
        ordered_sentences_by_priority = nlargest(
            int(len(self.sentences) / 10) + 1, ranking, key=ranking.get)
        return [self.sentences[i] for i in ordered_sentences_by_priority]

    def __init__(self, text):
        super().__init__(text)
        self.stop_words = set(stopwords.words('spanish') + list(punctuation))
        self._cleaned_text = [
            x for x in word_tokenize(self.text, language='spanish')
            if x not in self.stop_words
        ]
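Note that the inner loop "for word in sentence" in summarize() iterates over the characters of each sentence string returned by sent_tokenize. If the intent is to rank sentences by word frequencies (as in the linked post), the inner loop would tokenize first; a sketch of that ranking loop, not the author's code:

from collections import defaultdict
from nltk.tokenize import word_tokenize

ranking = defaultdict(int)  # inside summarize()
for i, sentence in enumerate(self.sentences):
    for word in word_tokenize(sentence, language='spanish'):
        ranking[i] += self._frequency_distributions.freq(word)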
Example #35
    def get_content_avg_entropy(self):
        '''
        :return: avg entropy of text/<mime> parts for multipart bodies
        '''
        n = 0
        txt_avg_ent = INIT_SCORE
        # todo: make n-grams
        tokens_list = tuple(self.pattern.get_stemmed_tokens())
        #logger.debug(tokens_list)

        for tokens in tokens_list:
            #logger.debug(tokens)
            n +=1
            freqdist = FreqDist(tokens)
            probs = [freqdist.freq(l) for l in FreqDist(tokens)]
            txt_avg_ent += -sum([p * math.log(p,2) for p in probs])
            #logger.debug(n)

        # :))
        if n !=0:
            txt_avg_ent = txt_avg_ent/n

        return txt_avg_ent
Example #36
unigrams_path = reu_path + unigramsFrom

# count word length frequencies
for f in listdir(samples_path):
	if (isfile(join(samples_path, f))):
		output_path = reu_path + toDir + f
		output =  open(output_path, "w")

		thisfile = open(samples_path + f).read()
		tokens = tokenize(thisfile)
		
		fd_words = FreqDist([len(w) for w in tokens])

		for a in range(1, 21):
			output.write(str(a) + '\t' + str(fd_words.freq(a)) + '\n')
		count_20 = 0
		# count 20+
		for w in tokens:
			if (len(w) >= 20):
				count_20 += 1
		output.write("20+\t" + str(count_20 / len(fd_words)) + '\n')

# count POS tag frequencies
for f in listdir(unigrams_path):
	if (isfile(join(unigrams_path, f))):
		output_path = reu_path + toDir + f
		output =  open(output_path, "a")

		thisfile = open(unigrams_path + f).read()
		tokens = tokenize(thisfile)
Example #37
class MorphProbModel():
    UNK_PROB = -99

    def __init__(self,
                 beam=1000,
                 max_guess=20,
                 rare_treshold=10,
                 capitalization=True):
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()

        self._wd = ConditionalFreqDist()

        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0

        self._beam_size = beam
        self._use_capitalization = capitalization
        self._max_guess = max_guess
        self._treshold = rare_treshold

        self._unk = Guesser(10)
        self._analyzer = None
        self.cache = {}

    def set_analyzer(self, obj):
        self._analyzer = obj

    def train(self, data):
        C = False
        for sent in data:
            history = [('BOS', False), ('BOS', False)]
            for w, l, t in sent:
                # Removed this because it eats 4 GB of memory if it stays in:
                # t = encode((w, l, t))
                if self._use_capitalization and w[0].isupper():
                    C = True

                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))

                history.append((t, C))
                history.pop(0)

                C = False

        for word, fd in self._wd.iteritems():
            for tag, count in  fd.iteritems():
                if count < self._treshold:
                    self._unk.add_word(word.lower(), tag, count)
        self._unk.finalize()

        self._compute_lambda()

    def _compute_lambda(self):
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        for history in self._tri.conditions():
            (h1, h2) = history

            for tag in self._tri[history].samples():

                if self._uni[tag] == 1:
                    continue

                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0

                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0

                else:
                    pass

        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def _transition_prob(self, t, C, history):
        p_uni = self._uni.freq((t, C))
        p_bi = self._bi[history[-1]].freq((t, C))
        p_tri = self._tri[tuple(history[-2:])].freq((t, C))
        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
        if p == 0.0:
            return self.UNK_PROB
        return log(p, 2)

    def _known_lexical_prob(self, word, t, C):
        p = float(self._wd[word][t]) / float(self._uni[(t, C)])
        return log(p, 2)

    def _analyze(self, word):
        tag_candidates = []
        if word in self._wd.conditions():
            tag_candidates = set(self._wd[word].samples())
        else:
            analyses = map(itemgetter(1), self._analyzer.analyze(word))
            guesses = self._unk.get_probs(word.lower())
            guesses = map(itemgetter(0),
                          sorted(guesses.iteritems(), reverse=True,
                     key=itemgetter(1))[:self._max_guess])
            tag_candidates = set(guesses)
            if analyses:
                tag_candidates &= set(analyses)
            if not tag_candidates:
                tag_candidates = set(guesses)
        return tag_candidates

    def _lexical_prob(self, word, t, C):
        if word in self._wd.conditions():
            return self._known_lexical_prob(word, t, C)
        else:
            return self._unk.get_prob(word, t)

    def tag(self, sent, n=5):
        current_state = [(['BOS', 'BOS'], 0.0)]
        out = self._tagword(sent, current_state, n)
        return out

    def _tagword(self, sent, current_states, n=5):
        # With the cache this is fast enough; not worth optimizing further.
        if sent == []:
            # yield ...
            return [(map(itemgetter(0), tag_seq[0][2:]),
                          tag_seq[1]) for tag_seq in current_states[:n]]

        word = sent[0]
        sent = sent[1:]
        new_states = []

        # Cache lookup
        sent_str = word + str(current_states)
        if sent_str in self.cache:
            return self._tagword(sent, self.cache[sent_str], n)

        C = False
        if self._use_capitalization and word[0].isupper():
            C = True

        analyses = self._analyze(word)

        for (history, curr_sent_logprob) in current_states:
            logprobs = []

            for t in analyses:

                p_t = self._transition_prob(t, C, history)
                p_l = self._lexical_prob(word, t, C)

                p = p_t + p_l

                logprobs.append(((t, C), p))

            for (tag, logprob) in logprobs:
                new_states.append((history + [tag],
                                   curr_sent_logprob + logprob))

        new_states.sort(reverse=True, key=itemgetter(1))

        if len(new_states) > self._beam_size:
            new_states = new_states[:self._beam_size]

        # Cache store
        self.cache[sent_str] = new_states

        # yield new_states
        # self._tagword(sent, new_states, n)
        return self._tagword(sent, new_states, n)
Example #38
File: tnt.py  Project: vishalbelsare/nltk
class TnT(TaggerI):
    """
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, we can use log addition
    to instead multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    """
    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        """
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk: TaggerI
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: bool
        :param N: Beam search degree (see above)
        :type  N: int
        :param C: Capitalization flag
        :type  C: bool

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        """

        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        """
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        """

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [("BOS", False), ("BOS", False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper():
                    C = True

                self._wd[w][t] += 1
                self._uni[(t, C)] += 1
                self._bi[history[1]][(t, C)] += 1
                self._tri[tuple(history)][(t, C)] += 1

                history.append((t, C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t]["EOS"] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        """
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        """

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].keys():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += self._tri[history][tag] / 2.0
                    tl3 += self._tri[history][tag] / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += self._tri[history][tag] / 2.0
                    tl2 += self._tri[history][tag] / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        """
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        """
        if v2 == 0:
            return -1
        else:
            return v1 / v2

    def tagdata(self, data):
        """
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        """
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        """
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        """

        current_state = [(["BOS", "BOS"], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t, C) = tags[i + 2]
            res.append((sent[i], t))

        return res

    def _tagword(self, sent, current_states):
        """
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the reminder of sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        """

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper():
            C = True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd:
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word].keys():
                    tC = (t, C)
                    p_uni = self._uni.freq(tC)
                    p_bi = self._bi[history[-1]].freq(tC)
                    p_tri = self._tri[tuple(history[-2:])].freq(tC)
                    p_wd = self._wd[word][t] / self._uni[tC]
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    # compute the result of appending each tag to this history
                    new_states.append((history + [tC], curr_sent_logprob + p2))

        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag
            # can not be returned from most classifiers:
            # specify that any unknown words are tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ("Unk", C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states

        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
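For reference, the three freq() calls in _tagword realize the interpolated transition probability described in the class docstring; schematically:

# p = l1 * P(t)           -> self._uni.freq((t, C))
#   + l2 * P(t | t2)      -> self._bi[history[-1]].freq((t, C))
#   + l3 * P(t | t1, t2)  -> self._tri[tuple(history[-2:])].freq((t, C))
# and the lexical term P(w | t) is estimated as self._wd[word][t] / self._uni[(t, C)]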
Example #39
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

#Answer to Question ii.a
#declare a counter for number of line
count = 1
#split the paragraph to sentences
sentences = split_into_sentences(text1)
#tokenize words for each line
for s in sentences:
    lines = tokenizer.tokenize(s)
    #print(lines)
    #print word distribution
    fdist2 = FreqDist(lines)
    print("Probability of word [data] occurring in line " + str(count) + " is " + str(fdist2.freq('data') + fdist2.freq('Data')))
    #increment to next line number
    count += 1

print("\n")

#Answer to Question ii.b
text2 = tokenizer.tokenize(text1.lower())
fdist3 = FreqDist(text2)
#print(fdist3)
print("The distribution of distinct word counts across the lines is as follows:")
for sample in fdist3:
    print(sample + " " + str(fdist3[sample]))

print("\n")
Example #40
    def runTest(self, iteration):
        print "running test %d"%iteration
        
        pageUrl = '/reviews/www.zulily.com'
        filename = '../test/resources/zulily.pkl'
        try:
            sjr = SiteJabberReviews(pageUrl,filename)
            sjr.load()
            helper = BayesHelper()
            buckets = helper.generateLearningSetsFromReviews([sjr],[1,5],{'training': 0.8,'test':0.2})
            
            self.assertEqual(len(buckets['training']), int(0.8*len(sjr.reviewsByRating[1])+int(0.8*len(sjr.reviewsByRating[5]))))
            self.assertEqual(len(buckets['test']), int(0.2*len(sjr.reviewsByRating[1])+int(0.2*len(sjr.reviewsByRating[5]))))
            
            
            
            #  generate (term) tuples for FD -- this means we need to bust out like terms from combined distributions
            
            allWords1 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 1]
            fd1 = FreqDist(allWords1)
            
            allWords5 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 5]
            fd5 = FreqDist(allWords5)
            
            commonTerms = [w for w in fd1.keys() if w in fd5.keys()]
            
            commonTermFreqs = [(w,fd1.freq(w), fd5.freq(w), abs(fd1.freq(w) - fd5.freq(w))) for w in commonTerms]
            
            commonTermFreqs.sort(key = itemgetter(3),reverse=True)
            

            all_words = []
            all_words.extend(allWords1)
            all_words.extend(allWords5)

            fdTrainingData = FreqDist(all_words)
            # take an arbitrary subset of these
            defaultWordSet = fdTrainingData.keys()[:2500]
            def emitDefaultFeatures(tokenizedText):
                '''
                @param tokenizedText: an array of text features
                @return: a feature map from that text.
                '''
                tokenizedTextSet = set(tokenizedText)
                featureSet = {}
                for text in defaultWordSet:
                    featureSet['contains:%s'%text] = text in tokenizedTextSet
                
                return featureSet
            
            classifier = None        
            encodedTrainSet = helper.encodeData(buckets['training'],emitDefaultFeatures )
            classifier = nltk.NaiveBayesClassifier.train(encodedTrainSet)
            
            encodedTestSet = helper.encodeData(buckets['test'], emitDefaultFeatures)
            accuracy =  nltk.classify.accuracy(classifier, encodedTestSet)
            print "accuracy = %.9f"%accuracy
            
            classifier.show_most_informative_features(10) 
            
            shouldBeClassed1 = []
            shouldBeClassed5 = []
            
            for (textbag, rating) in buckets['test']:
                testRating = classifier.classify(emitDefaultFeatures(textbag))
                if testRating != rating:
                    if rating == 1:
                        shouldBeClassed1.append(textbag)
                    else:
                        shouldBeClassed5.append(textbag)
                        
            print "length of mis-classified 1 star reviews = %d"%len(shouldBeClassed1)            
            print "length of mis-classified 5 star reviews = %d"%len(shouldBeClassed5)
            
            print "length of all 1 star reviews submitted = %d"%len(sjr.reviewsByRating[1])
            print "length of all 5 star reviews submitted = %d"%len(sjr.reviewsByRating[5]) 
            
            print "length of test data for 1 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[1]))
            print "length of test data for 5 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[5]))
                   
             
            return accuracy
        
        except Exception as inst:
            self.fail(inst)
Example #41
    # print "lines: ", len(lines)
    for line in lines:
        #       print n, line.encode('utf-8')
        line_tokens = tokenizer.tokenize(line)
        # for token in line_tokens:
        #       print token.encode('utf-8'), " | "
        # n = n + 1
        text_array.append(line_tokens)

    # now try to match hyphenated lines with their
    # corresponding beginning lines
n = 0
for line in text_array:
    if len(line) > 0:
        if line[-1][-1] == "-":
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError as e:
                print e
    n = n + 1
# now flatten the 2d array
tokens = [item for sublist in text_array for item in sublist]
tokens = delete_non_greek_tokens(tokens)
for token in tokens:
    fdist.inc(token)

print "most common: ", fdist.max().encode("utf-8")
for item in fdist.keys():
    print item.encode("utf-8"), fdist.freq(item)
     unigrams = regexp_tokenize(txt, pattern_unigrams)
     bigrams = regexp_tokenize(txt, pattern_bigrams) 
     
     #Create frequency distributions    
     fdist_words = FreqDist(txt_tokens)
     fdist_ngrams = FreqDist(unigrams + bigrams)
     
     # Store most common words and ngrams for later comparison of texts
     words_most_common.append([k for (k,_) in fdist_words.most_common(params.n)])
     ngrams_most_common.append([k for (k,_) in fdist_ngrams.most_common(params.m)])
     outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
     
     # Write out the distribution of words in the document
     with codecs.open("distributions-data/output/words_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_words.most_common():
             prozent = fdist_words.freq(k)
             out.write("{},{},{}\n".format(k,v, prozent))
     # Write out the distribution of ngrams in the document
     with codecs.open("distributions-data/output/letters_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_ngrams.most_common():
             prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
             out.write("{},{},{}\n".format(k,v, prozent))  
     # Write the size of bins of words that appear with the same frequency               
     with codecs.open("distributions-data/bins/" + outputname, "w", encoding=my_encoding) as out:
         for i in sorted(set(fdist_words.values())):
             bin_size = fdist_words.Nr(i)
             out.write("{},{}\n".format(i,bin_size))     
 print('Output distributions saved in \'output\' folder.')
 print('Output bins saved in \'bins\' folder.')
 # If there are many documents -> compare their most common words and ngrams
 if len(params.files) > 1:
Example #43

print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))
# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
Example #44
print state_union_text.count("war")
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist

fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result


from nltk.corpus import stopwords
stopwords.words("english")


filtered = [w for w in state_union.words() if not w in stopwords.words("english")]
len(filtered)


fdist_filtered = FreqDist(filtered)
fdist_filtered.most_common(20)


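# Relative frequency ratios: how much more often "good" occurs than "bad", and
# "bad" than "evil", in the stopword-filtered distribution.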
fdist_filtered.freq("good")/fdist_filtered.freq("bad")
fdist_filtered.freq("bad")/fdist_filtered.freq("evil")


fdist_filtered.plot(30)

Example #45
def simulate_reading(net_text_input):
    if len(net_text_input) > prm['max_text_len']:
        raise ValueError(
            'Text input {} has to be shorter than max_text_len: {}'.format(
                net_text_input, prm['max_text_len']))

    # Build the network.
    nest.ResetKernel()
    nest.SetKernelStatus({'local_num_threads': 9})
    reset_reporting()
    spike_groups.clear()
    spike_decisions.clear()

    nest.CopyModel('tsodyks2_synapse', 'head_grapheme_synapse_model',
                   prm['head_grapheme_synapse_model'])
    nest.CopyModel('tsodyks2_synapse', 'letter_lexical_synapse_model',
                   prm['letter_lexical_synapse_model'])

    local_vocabulary = [
        w for w in vocabulary[unidecode(net_text_input[0])]
        if distance_within(w, net_text_input, 4)
    ]  # NOTE we may compare only stems to the full input!
    graphemes_dist = FreqDist(
        chain.from_iterable([decompose_word(w) for w in local_vocabulary]))

    lexical_cols = dict([(w,
                          nest.Create(prm['neuron_type'],
                                      prm['lexical_column_size']))
                         for w in local_vocabulary])
    if prm['stems_and_suffixes']:
        suffixes_cols = dict([(s,
                               nest.Create(prm['neuron_type'],
                                           prm['lexical_column_size']))
                              for s in suffixes])
    lexical_inhibiting_population = nest.Create(
        prm['neuron_type'], prm['lexical_inhibiting_pop_size'])
    letter_hypercolumns = [
        make_hypercolumn(letters, prm['letter_column_size'])
        for i in range(prm['max_text_len'])
    ]
    # Reading heads' columns are sorted in separate lists by grapheme lengths.
    reading_head_len_sorted = [
        make_hypercolumn(size_graphemes, prm['head_column_size'])
        for size_graphemes in graphemes_by_lengths
    ]
    reading_head = {}  # a 'flat' version
    for len_graphemes in reading_head_len_sorted:
        reading_head.update(len_graphemes)
    grapheme_hypercolumns = [
        make_hypercolumn(graphemes, prm['grapheme_column_size'])
        for i in range(prm['max_text_len'])
    ]

    # Make connections.
    end_weight_dist = stats.norm(loc=len(net_text_input),
                                 scale=3.0)  # for exciting suffixes
    for (hcol_n, hypercol) in enumerate(
            letter_hypercolumns):  # hypercol is: letter -> (neuron's nest id)
        # Turn on appropriate letter columns.
        if hcol_n < len(net_text_input) and net_text_input[hcol_n] in letters:
            poisson_gen = nest.Create('poisson_generator', 1,
                                      prm['letters_poisson_generator'])
            nest.Connect(poisson_gen,
                         hypercol[net_text_input[hcol_n]],
                         syn_spec=prm['poisson_letter_excitation'])
            ###nest.SetStatus(hypercol[net_text_input[hcol_n]], prm['letter_neuron_params_on'])

        # Letter hypercol's lateral inhibition to subsequent hypercols
        for hypercol2 in letter_hypercolumns[hcol_n + 1:]:
            nest.Connect(all_columns_cells(hypercol),
                         all_columns_cells(hypercol2),
                         syn_spec=prm['letter_col_lateral_inhibition'])
        # Letter hypercol -> the reading head
        for (letter, letter_col) in hypercol.items():
            for (grapheme, grapheme_col) in reading_head.items():
                if letter in grapheme:
                    nest.Connect(letter_col,
                                 grapheme_col,
                                 syn_spec=prm['letter_head_excitation'])
        # Letter hypercol -> lexical units
        for (word, word_col) in lexical_cols.items():
            if hcol_n >= len(word):
                nest.Connect(all_columns_cells(hypercol),
                             word_col,
                             syn_spec=prm['shorter_word_inhibition'])
            else:
                for (letter, letter_col) in hypercol.items():
                    if hcol_n == 0 and unidecode(
                            word[hcol_n]) == unidecode(letter):
                        nest.Connect(letter_col,
                                     word_col,
                                     syn_spec='letter_lexical_synapse_model')
                        nest.SetStatus(
                            nest.GetConnections(letter_col, word_col),
                            prm['member_first_letter_excitation'])
                    if (not prm['stems_and_suffixes']
                            and hcol_n == len(word) - 1 and unidecode(
                                word[len(word) - 1]) == unidecode(letter)):
                        nest.Connect(letter_col,
                                     word_col,
                                     syn_spec='letter_lexical_synapse_model')
                        nest.SetStatus(
                            nest.GetConnections(letter_col, word_col),
                            prm['member_last_letter_excitation'])
                    elif unidecode(letter) in unidecode(word):
                        nest.Connect(letter_col,
                                     word_col,
                                     syn_spec='letter_lexical_synapse_model')
                        nest.SetStatus(
                            nest.GetConnections(letter_col, word_col),
                            prm['member_letter_excitation'](len(word)))
                    else:
                        nest.Connect(letter_col,
                                     word_col,
                                     syn_spec='letter_lexical_synapse_model')
                        nest.SetStatus(
                            nest.GetConnections(letter_col, word_col),
                            prm['absent_letter_inhibition'](len(word)))
        # Letter hypercol -> suffixes units
        if prm['stems_and_suffixes']:
            for (suffix, suffix_col) in suffixes_cols.items():
                if len(net_text_input) - hcol_n <= len(suffix):
                    for (letter, letter_col) in hypercol.items():
                        if letter in suffix:
                            nest.Connect(
                                letter_col,
                                suffix_col,
                                syn_spec=prm['member_letter_excitation_suffix']
                                (len(suffix)))
                        else:
                            nest.Connect(
                                letter_col,
                                suffix_col,
                                syn_spec=prm['absent_letter_inhibition_suffix']
                                (len(suffix)))
    for (grapheme, grapheme_col) in reading_head.items():
        nest.Connect(
            grapheme_col,
            sum([
                list(neurs) for hypercol in grapheme_hypercolumns
                for (label, neurs) in hypercol.items() if label == grapheme
            ], []),
            syn_spec='head_grapheme_synapse_model')
    nest.Connect(all_columns_cells(lexical_cols),
                 lexical_inhibiting_population,
                 syn_spec=prm['lexical_inhibiting_pop_excitation'])
    for (word, word_col) in lexical_cols.items():
        nest.Connect(lexical_inhibiting_population,
                     word_col,
                     syn_spec=prm['lexical_inhibiting_pop_feedback'](
                         len(word)))
        word_decomposition = decompose_word(word)
        for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
            if hcol_n == len(word_decomposition):
                break
            nest.Connect(
                word_col,
                hypercol[word_decomposition[hcol_n]],
                syn_spec={
                    'weight': (
                        prm['lexical_grapheme_base_excitation_weight']
                        # the excitation is stronger with rarer letters:
                        / graphemes_dist.freq(word_decomposition[hcol_n]))
                })
            # Grapheme -> lexical feedback.
            nest.Connect(hypercol[word_decomposition[hcol_n]],
                         word_col,
                         syn_spec=prm['grapheme_lexical_feedback'])
        # Lateral inhibition for similar words.
        for (word2, word2_col) in lexical_cols.items():
            if word2 == word:
                continue
            elif distance_within(word, word2, 4):
                nest.Connect(word_col,
                             word2_col,
                             syn_spec=prm['lexical_lateral_inhibition'])
    if prm['stems_and_suffixes']:
        for (suffix, suffix_col) in suffixes_cols.items():
            # Lateral inhibition for suffixes.
            for (suffix2, suffix2_col) in suffixes_cols.items():
                if suffix != suffix2:
                    nest.Connect(suffix_col,
                                 suffix2_col,
                                 syn_spec=prm['suffix_lateral_inhibition'])
            # Suffix -> grapheme connections.
            for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
                # (weights will be assigned dynamically later)
                nest.Connect(suffix_col,
                             all_columns_cells(hypercol),
                             syn_spec={'weight': 0.0})
    for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
        for (grapheme, col) in hypercol.items():
            # Lateral inhibition of graphemes containing at least one same letter
            if hcol_n != 0:
                for similar_grapheme in [
                        g for g in graphemes
                        if len(set(g).intersection(set(grapheme))) > 0
                ]:
                    nest.Connect(col,
                                 grapheme_hypercolumns[hcol_n -
                                                       1][similar_grapheme],
                                 syn_spec=prm['grapheme_lateral_inhibition'](
                                     len(similar_grapheme)))
            if hcol_n + 1 != prm['max_text_len']:
                for similar_grapheme in [
                        g for g in graphemes
                        if len(set(g).intersection(set(grapheme))) > 0
                ]:
                    nest.Connect(col,
                                 grapheme_hypercolumns[hcol_n +
                                                       1][similar_grapheme],
                                 syn_spec=prm['grapheme_lateral_inhibition'](
                                     len(similar_grapheme)))

    # Insert probes:
    for (word, word_col) in lexical_cols.items():
        insert_probe(word_col, word, always_chart=False)
    if prm['stems_and_suffixes']:
        for (suffix, suffix_col) in suffixes_cols.items():
            insert_probe(suffix_col, 'suff_' + suffix, always_chart=False)
    insert_probe(lexical_inhibiting_population, 'lexical_inhibition')
    ##for (letter, letter_col) in letter_hypercolumns[1].items():
    ##    insert_probe(letter_col, 'L2-'+letter)
    for (grapheme, grapheme_col) in reading_head.items():
        insert_probe(grapheme_col, 'head-' + grapheme, always_chart=False)
    # [Reading facility config:]
    spike_groups['Head'] = ['head-' + g for g in graphemes]
    spike_groups['Words'] = local_vocabulary
    if prm['stems_and_suffixes']:
        spike_groups['Suffixes'] = ['suff_' + suff for suff in suffixes]
        spike_decisions['Stems'] = [local_vocabulary]
    spike_decisions['Reading'] = []
    for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
        spike_decisions['Reading'].append([])
        for (grapheme, grapheme_col) in hypercol.items():
            insert_probe(grapheme_col,
                         'g{}-{}'.format(hcol_n, grapheme),
                         always_chart=False)
            spike_decisions['Reading'][-1].append('g{}-{}'.format(
                hcol_n, grapheme))

    # Run the simulation, write readings.
    nest.Simulate(prm['letter_focus_time'])
    for step_n in range(prm['max_text_len']):

        # Reassign the letter -> head weights (shifting skew normal).
        weights_dist = stats.skewnorm(6, loc=step_n - 0.7, scale=0.67)
        for assg_lett_n in range(prm['max_text_len']):
            assg_hypercol = all_columns_cells(letter_hypercolumns[assg_lett_n])
            for (ln, len_graphemes) in enumerate(reading_head_len_sorted):
                len_graphemes = all_columns_cells(len_graphemes)
                if len(len_graphemes) == 0:
                    continue
                nest.SetStatus(
                    nest.GetConnections(assg_hypercol, len_graphemes), {
                        'weight':
                        (weights_dist.pdf(assg_lett_n) * 3000 /
                         (1.0 + (ln - 1) * prm['grapheme_length_damping']))
                    })

        # Reassign the head -> grapheme weights (normal parametrized by time for each target hypercolumn).
        for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
            weights_dist = stats.norm(
                loc=hcol_n + 1,
                scale=1.0)  # hcol_n is treated as time step number
            # (add one because of the first "dummy" step)
            nest.SetStatus(
                nest.GetConnections(all_columns_cells(reading_head),
                                    all_columns_cells(hypercol)),
                {
                    'weight':
                    weights_dist.pdf(1.0 + nest.GetKernelStatus('time') /
                                     prm['letter_focus_time']) *
                    prm['head_grapheme_base_weight']
                })

        # Reassign the suffix -> grapheme weights (depending on estimated stem end).
        if prm['stems_and_suffixes']:  #### and step_n > len(net_text_input)/2:
            stem_end = mean([
                len(stem_reading[0]) for stem_reading in decide_spikes(
                    spike_decisions['Stems'])[:15]
            ])
            #print(stem_end)
            for (suffix, suffix_col) in suffixes_cols.items():
                suffix_decomposition = decompose_word(suffix)
                for grapheme in set(suffix_decomposition):
                    # Each occurrence of a grapheme in the suffix must exert its
                    # influence individually; they are then summed.
                    indices = [
                        gi for (gi, g) in enumerate(suffix_decomposition)
                        if g == grapheme
                    ]
                    weight_dists = [
                        stats.norm(loc=stem_end + ind, scale=3.0)
                        for ind in indices
                    ]
                    if len(weight_dists) == 0:
                        continue
                    for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns):
                        #print('stem_end', stem_end, 'hcol', hcol_n, weight_dists[0].pdf(hcol_n))
                        nest.SetStatus(
                            nest.GetConnections(suffix_col,
                                                hypercol[grapheme]),
                            {
                                'weight':
                                sum([
                                    dist.pdf(hcol_n) for dist in weight_dists
                                ]) * prm['suffix_grapheme_base_weight']
                            })

        nest.Simulate(prm['letter_focus_time'])
Example #46
File: tnt.py  Project: Arttii/TextBlob
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and gain precision, we can use log addition
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However, this does not result in a significant
    gain in the accuracy of the results. (A minimal usage sketch is given
    after the class definition below.)
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))
        '''

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [('BOS',False), ('BOS',False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper(): C=True

                self._wd[w].inc(t)
                self._uni.inc((t,C))
                self._bi[history[1]].inc((t,C))
                self._tri[tuple(history)].inc((t,C))

                history.append((t,C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t].inc('EOS')


        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

        #(debugging -- ignore or delete me)
        #print "lambdas"
        #print i, self._l1, i, self._l2, i, self._l3


    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        '''

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].samples():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
                c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
                c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))


                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) /2.0
                    tl3 += float(self._tri[history][tag]) /2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) /2.0
                    tl2 += float(self._tri[history][tag]) /2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    #print "Problem", c1, c2 ,c3
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1+tl2+tl3)
        self._l2 = tl2 / (tl1+tl2+tl3)
        self._l3 = tl3 / (tl1+tl2+tl3)



    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res


    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        '''

        current_state = [(['BOS', 'BOS'], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t,C) = tags[i+2]
            res.append((sent[i], t))

        return res


    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        '''

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper(): C=True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word].samples():
                    p_uni = self._uni.freq((t,C))
                    p_bi = self._bi[history[-1]].freq((t,C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t,C))
                    p_wd = float(self._wd[word][t])/float(self._uni[(t,C)])
                    p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    logprobs.append(((t,C), p2))


                # compute the result of appending each tag to this history
                for (tag, logprob) in logprobs:
                    new_states.append((history + [tag],
                                       curr_sent_logprob + logprob))




        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag
            # can not be returned from most classifiers:
            # specify that any unknown words are tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk',C)

            # otherwise apply the unknown word tagger
            else :
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t,C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states



        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]


        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
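
# --- Editor's usage sketch for the TnT class above (not part of the original example) ---
# A minimal illustration of how the tagger is trained and applied. It assumes the
# NLTK-2-era APIs the class relies on (FreqDist.inc, ConditionalFreqDist, samples())
# are importable, that itemgetter and log are already in scope, and that a tagged
# corpus such as nltk.corpus.treebank has been downloaded; the corpus choice and the
# 3000-sentence training split are placeholders, not taken from the source project.
from nltk.corpus import treebank

tnt_tagger = TnT(N=1000, C=False)
tnt_tagger.train(treebank.tagged_sents()[:3000])   # sentences of (word, tag) tuples

# tag a single tokenized sentence
print tnt_tagger.tag(['The', 'dog', 'barked', '.'])
# tag a batch of tokenized sentences
print tnt_tagger.tagdata([['A', 'cat', 'sat', '.'], ['It', 'purred', '.']])
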
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus

for token in corpus['SUBTOKENS']:
	freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')

# What was the frequency of the word "the"?
freq_dist.freq('the')

# How many word tokens were counted?
freq_dist.N()

# What word types were encountered?
freq_dist.samples()

# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
	freq_dist.inc(len(token['TEXT']))
Example #48
def get_entropy(category=None):
    frq = FreqDist(brown.words(categories = category))
    return sum(map(lambda w: -frq.freq(w)*math.log(frq.freq(w),2), frq.keys()))
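
# Editor's sketch (not in the original example): the helper above returns the Shannon
# entropy, in bits, of the unigram word distribution for one Brown category; 'news'
# and 'romance' are example category names from nltk.corpus.brown.
print(get_entropy('news'))
print(get_entropy('romance'))
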
Example #49
# What is the distribution of word lengths in the corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# Plot the results
wordlens = freq_dist.samples()

# Sort the list
wordlens.sort()

# build a list of tuples pairing each word length with its
# relative frequency
# to inspect it, run: print points
points = [(l, freq_dist.freq(l)) for l in wordlens]
Plot(points)
print points

# What is the distribution of lengths of words that end in
# vowels?
VOWELS = ('a', 'e', 'i', 'o', 'u')

freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    if token['TEXT'][-1].lower() in VOWELS:
        freq_dist.inc(len(token['TEXT']))

# Plot the results
wordlens = freq_dist.samples()
wordlens.sort()
Example #50
dir(fdist)
fdist.max
fdist.values
fdist.values()
fdist.values().sum()
sum(fdist.values())
fdist['delicious'] / sum(fdist.values())
fdist['disgusting'] / sum(fdist.values())
fdist['disgusting']
fdist['vegetarian']
fdist['old-timey']
fdist['healthy']
fdist['expensive']
print text
print(text)
fdist.freq('delicious')
fdist.freq('delicnotehu')
fdist.N()
fdist ?
fdist?
fdist.freq('Delicious')
fdist
fdist.freq('rainy')
Business.where_raw('')
Business.where_raw('latitude <= 40.75')
Business.where_raw('latitude <= 40.75').count()
Business.where_raw('latitude <= 40.75 and latitude > 40.749')
Business.where_raw('latitude <= 40.75 and latitude > 40.749').count
Business.where_raw('latitude <= 40.75 and latitude > 40.749').count()
lat = 40.71
lon = -74.01
Example #51
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus

for token in corpus['SUBTOKENS']:
	freq_dist.inc(token['TEXT'])

# How many times does the word "form" appear in the corpus?
freq_dist.count('form')
# What is the frequency of the word "form"?
freq_dist.freq('form')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What is the most common word?
freq_dist.max()
Example #52
#%%
from nltk.corpus import inaugural
from nltk import ConditionalFreqDist
from nltk.probability import FreqDist

fd3 = FreqDist([s for s in inaugural.words()])
print(fd3.freq('freedom'))

# count frequency of word lengths in descending order
cfd = ConditionalFreqDist((fileid, len(w)) for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
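# (inaugural fileids look like '1981-Reagan.txt', so the string comparison above
#  keeps the addresses from 1981 through 2009)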

print(cfd.items())
cfd.plot()
# %%
    def runTest(self,iteration):
        print "running test %d"%iteration
        
        pageUrl = '/reviews/www.zulily.com'
        filename = '../test/resources/zulily.pkl'
        try:
            sjr = SiteJabberReviews(pageUrl,filename)
            sjr.load()
            helper = BayesHelper()
            buckets = helper.generateLearningSetsFromReviews([sjr],[1,5],{'training': 0.8,'test':0.2})
            
            self.assertEqual(len(buckets['training']), int(0.8*len(sjr.reviewsByRating[1])+int(0.8*len(sjr.reviewsByRating[5]))))
            self.assertEqual(len(buckets['test']), int(0.2*len(sjr.reviewsByRating[1])+int(0.2*len(sjr.reviewsByRating[5]))))
            
            
            
            #  generate (term) tuples for FD -- this means we need to bust out like terms from combined distributions
            
            allWords1 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 1]
            fd1 = FreqDist(allWords1)
            
            allWords5 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 5]
            fd5 = FreqDist(allWords5)
            
            commonTerms = [w for w in fd1.keys() if w in fd5.keys()]
            
            commonTermFreqs = [(w,fd1.freq(w), fd5.freq(w), abs(fd1.freq(w) - fd5.freq(w))) for w in commonTerms]
            
            commonTermFreqs.sort(key = itemgetter(3),reverse=True)
            
#            commonDist = [freqDiff for (a,b,c,freqDiff) in commonTermFreqs]
#            
#            plt.plot(commonDist)
#            plt.show()
            
            # keep an arbitrary number
            
            filterTerms = [w for (w,a,b,freq) in commonTermFreqs if freq > 0.001]
            
            # add non-common terms (note that the Bayesian classifier will smooth zero terms out)
            print 'high frequency differential featureset'
            fd1Only = [w for w in fd1.keys() if w not in fd5.keys()]
            filterTerms.extend(fd1Only)
            fd5Only = [w for w in fd5.keys() if w not in fd1.keys()]
            filterTerms.extend(fd5Only)
            defaultWordSet = set(filterTerms)
            
      
            def emitDefaultFeatures(tokenizedText):
                '''
                @param tokenizedText: an array of text features
                @return: a feature map from that text.
                '''
                tokenizedTextSet = set(tokenizedText)
                featureSet = {}
                for text in defaultWordSet:
                    featureSet['contains:%s'%text] = text in tokenizedTextSet
                
                return featureSet
            
            classifier = None        
            encodedTrainSet = helper.encodeData(buckets['training'],emitDefaultFeatures )
            classifier = nltk.NaiveBayesClassifier.train(encodedTrainSet)
            
            encodedTestSet = helper.encodeData(buckets['test'], emitDefaultFeatures)
            accuracy =  nltk.classify.accuracy(classifier, encodedTestSet)
            print "accuracy = %.9f"%accuracy
            
            classifier.show_most_informative_features(10) 
            
            shouldBeClassed1 = []
            shouldBeClassed5 = []
            
            for (textbag, rating) in buckets['test']:
                testRating = classifier.classify(emitDefaultFeatures(textbag))
                if testRating != rating:
                    if rating == 1:
                        shouldBeClassed1.append(textbag)
                    else:
                        shouldBeClassed5.append(textbag)
                        
            print "length of mis-classified 1 star reviews = %d"%len(shouldBeClassed1)            
            print "length of mis-classified 5 star reviews = %d"%len(shouldBeClassed5)
            
            print "length of all 1 star reviews submitted = %d"%len(sjr.reviewsByRating[1])
            print "length of all 5 star reviews submitted = %d"%len(sjr.reviewsByRating[5]) 
            
            print "length of test data for 1 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[1]))
            print "length of test data for 5 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[5]))
                   
                   
#            incorrectText1 = [(-1,w) for bag in shouldBeClassed1
#                        for w in bag if w not in stopwords.words('english')]
#            
#            correctText1 = [(1,w) for bag in buckets['training']
#                        for w in bag if w not in stopwords.words('english')]
#            
#            allText1 = []
#            allText1.extend(incorrectText1)
#            allText1.extend(correctText1)
#            
#            cfdText1 = ConditionalFreqDist(allText1)
#            
#            
#            incorrectText5 = [(-5,w) for bag in shouldBeClassed5
#                        for w in bag if w not in stopwords.words('english')]
#            
#            
#            correctText5 = [(5,w) for (bag, rating) in buckets['training']
#                        for w in bag if rating == 5 and w not in stopwords.words('english')]
#            
#            
#            allText5 = []
#            allText5.extend(incorrectText5)
#            allText5.extend(correctText5)
#            
#            cfdText5 = ConditionalFreqDist(allText5)
            
             
            return accuracy
        
        except Exception as inst:
            self.fail(inst)