Example #1
    def __init__(self, tokens):

        stopwords = set([('5', 'stars')])
        num_words = len(tokens)

        finder = BigramCollocationFinder.from_words(tokens)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder.apply_freq_filter(int(.0002 * num_words))  # may need some parameter tuning
        bigrams = finder.nbest(bigram_measures.pmi, 15)

        tfinder = TrigramCollocationFinder.from_words(tokens)
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        tfinder.apply_freq_filter(int(.0001 * num_words))
        trigrams = tfinder.nbest(trigram_measures.pmi, 10)

        # merge bigrams and trigrams
        phrases = list(bigrams)  # copy so the original bigram list is not mutated
        combined = []
        for bigram in bigrams:
            other_bigrams = list(set(bigrams) - {bigram})
            for other_bigram in other_bigrams:
                if bigram[1] == other_bigram[0]:
                    combined.append((bigram[0], bigram[1], other_bigram[1]))
        for trigram in trigrams:
            if trigram in combined:
                phrases.append(trigram)
                # drop the two constituent bigrams now covered by the trigram
                for bg in ((trigram[0], trigram[1]), (trigram[1], trigram[2])):
                    if bg in phrases:
                        phrases.remove(bg)

        p = POS()
        self.phrases = [
            phrase for phrase in phrases
            if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords
        ]
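
A minimal standalone sketch of the collocation step above, using only NLTK's public collocation API on a toy token list (the project-specific POS filter and stopword handling are omitted, and the frequency thresholds here are assumptions):

import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

tokens = ['battery', 'life', 'is', 'great', 'and', 'battery', 'life',
          'lasts', 'long', 'battery', 'life', 'is', 'great']  # toy data

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)                    # keep bigrams seen at least twice
print(finder.nbest(bigram_measures.pmi, 5))    # bigrams passing the filter, ranked by PMI

trigram_measures = nltk.collocations.TrigramAssocMeasures()
tfinder = TrigramCollocationFinder.from_words(tokens)
tfinder.apply_freq_filter(2)
print(tfinder.nbest(trigram_measures.pmi, 5))  # trigrams passing the filter, ranked by PMI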
Example #2
    def frequent_nouns_counts(self, tokens):
        """Return frequently mentioned nouns WITH COUNTS"""

        unigram_fd = nltk.FreqDist(tokens)
        pos = POS()
        common_unigrams = unigram_fd.most_common(
            int(self.top_pct * len(unigram_fd)))

        nouns = [
            pair for pair in common_unigrams if
            pair[0] not in self.stopwords() and pos.percent_noun(pair[0]) > 0.5
        ]

        return nouns
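
POS.percent_noun is called throughout these examples but its implementation is not shown. A plausible sketch, purely an assumption, that estimates how noun-like a word is from the share of its WordNet synsets that are nouns:

from nltk.corpus import wordnet as wn

def percent_noun(word):
    """Fraction of the word's WordNet synsets tagged as nouns (assumed heuristic)."""
    synsets = wn.synsets(word)
    if not synsets:
        return 0.0
    return sum(1 for s in synsets if s.pos() == 'n') / len(synsets)

print(percent_noun('battery'))  # likely 1.0: 'battery' has only noun senses in WordNet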
Example #3
    def phrases(self, tokens):

        bigrams = self.top_bigrams(tokens)
        trigrams = self.top_trigrams(tokens)
        phrases = self.merged_phrases(bigrams, trigrams)

        # clean the phrases to be NOUN PHRASES and w/o phrasal stopwords.
        p = POS()
        stopwords = {('5', 'stars')}
        phrases = [
            phrase for phrase in phrases
            if p.percent_noun(phrase[-1]) > 0.5 and phrase not in stopwords
        ]

        return phrases
Example #4
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: LogLikelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: LogLikelihood n-gram length
        @param percen: Precision of output, default = 80 (80% returned)
        @param NE: use named entities, default True
        @param Col: use collocations, default True
        @param Gram: use n-grams, default True
        @param Chu: use chunking, default True
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print('Starting to build', llwl)
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print('LL Loaded')
        self.POS = POS()
        print('POS Loaded')
        self.GD = GetData()
        print('GD Loaded')
        self.Cu = Chunker(self.POS)
        print('Cu Loaded')
        self.FL = Filter()
        print('FL Loaded')
        self.CC = Collocation(self.POS)
        print('CC Loaded')
        self.Ng = NGram()
        print('Ng Loaded')
        self.S = Select(percentil=self.p)
        print('S Loaded')
        self.To = Tokenize(self.FL)
        print('To Loaded')
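
The constructor above eagerly builds every component of the pipeline and prints progress as each one loads. A hypothetical instantiation (the enclosing class name is not shown in this example; KeyphraseExtractor is assumed here):

# Assumed class name -- only the __init__ body appears above.
extractor = KeyphraseExtractor(llwl='Brown', llNL=2, percen=80,
                               NE=True, Col=True, Gram=True, Chu=True)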
Example #5
from SSBParser import SSBParser
from POS import POS
from unknown_word_handler import unknown_word_handler



parser = SSBParser()
train_clean_file = 'BROWN-clean.pos.txt'
test_clean_file = 'SnapshotBROWN-clean.pos.txt'
parser.gen_clean_file('BROWN.pos.all', train_clean_file)
parser.gen_clean_file('SnapshotBROWN.pos.all.txt', test_clean_file)

print('1(i) : Baseline statistical tagger implemented for entire Brown corpus.')
print('***********************************************************************')

print('1(ii) : Calculating performance for snapshot')
pos = POS(train_clean_file, test_clean_file)
print('***********************************************************************')


print('1(iii) : Calculating performance for news collected from the web; see news.txt for the input file')
uwh = unknown_word_handler('news.txt', 'news-clean.txt', pos)
test_clean_file = 'news-clean.txt'
pos = POS(train_clean_file, test_clean_file)
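
The POS class used above trains and evaluates the baseline statistical tagger; its implementation is not shown. A minimal sketch of such a baseline using NLTK's bundled Brown corpus and a unigram (most-frequent-tag) tagger, offered only as an illustration of the idea rather than the project's file-based pipeline:

import nltk
from nltk.corpus import brown  # requires nltk.download('brown')

tagged_sents = brown.tagged_sents()
split = int(0.9 * len(tagged_sents))
train_sents, test_sents = tagged_sents[:split], tagged_sents[split:]

# Most-frequent-tag baseline, backing off to 'NN' for unseen words.
backoff = nltk.DefaultTagger('NN')
tagger = nltk.UnigramTagger(train_sents, backoff=backoff)
print(tagger.evaluate(test_sents))  # held-out tagging accuracy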
Example #6
    def __init__(self, tokens, category):

        # these generic words will be ignored
        stopwords = [
            'product', 'price', 'reviews', 'unit', 'model', 'purchase',
            'amount', 'item'
        ]
        words = category.split()
        # add the category and its plural (crude) to stopwords
        for el in words:
            stopwords.append(el)
            stopwords.append(el + 's')

        # will have to change this!
        self.tokens = tokens
        self.num_words = len(self.tokens)

        # calculate freq dist from tokens
        self.unigram_fd = nltk.FreqDist(self.tokens)
        self.unique_words = len(self.unigram_fd)

        # get frequent unigram nouns
        pos = POS()
        common_unigrams = self.unigram_fd.most_common(
            int(.02 * self.unique_words))
        self.unigrams = [
            pair for pair in common_unigrams
            if pair[0] not in stopwords and pos.percent_noun(pair[0]) > 0.5
        ]

        # use threshold? get slightly better w/o the threshold.
        # threshold = .001
        # self.unigrams = [pair for pair in common_unigrams \
        # 	if pair[1] > int(threshold*self.num_words) and pair[0] not in stopwords
        # 	and pos.percent_noun(pair[0]) > 0.5]

        # create a pandas DataFrame indexed by word, review corpus
        zipped = list(zip(*self.unigrams))  # [(words...), (counts...)]
        df_reviews = pd.DataFrame(list(zipped[1]),
                                  index=list(zipped[0]),
                                  columns=['count_reviews'])

        # a list of words from 'generic' corpus
        generic_words = self.chat_words()

        # create a pandas DataFrame indexed by word, generic
        self.generic_words = [w.lower() for w in generic_words]
        generic_fd = (nltk.FreqDist(generic_words) +
                      nltk.FreqDist(nltk.bigrams(generic_words)) +
                      nltk.FreqDist(nltk.trigrams(generic_words)))
        zipped_generic = list(zip(*generic_fd.items()))
        df_generic = pd.DataFrame(list(zipped_generic[1]),
                                  index=list(zipped_generic[0]),
                                  columns=['count_generic'])

        # merge the two on words
        df = df_reviews.join(df_generic)
        self.df = df.fillna(0)

        # compute term frequency and inverse generic frequency
        term_freq = self.term_freq_log()
        inverse_generic_freq = self.inverse_generic_freq()

        self.scores = term_freq * inverse_generic_freq
        self.scores = self.scores.sort_values()
        self.unigrams = list(reversed(self.scores.index))
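
term_freq_log and inverse_generic_freq are not shown in this example. A plausible sketch of what they might compute, purely an assumption, built on the count_reviews and count_generic columns assembled above (a TF-IDF-style weighting against the 'generic' chat corpus):

import numpy as np

def term_freq_log(df):
    # Log-damped frequency in the review corpus (assumed form).
    return np.log1p(df['count_reviews'])

def inverse_generic_freq(df):
    # Down-weight terms that are also common in the generic corpus (assumed form).
    total_generic = df['count_generic'].sum()
    return np.log((1.0 + total_generic) / (1.0 + df['count_generic']))

# scores = term_freq_log(df) * inverse_generic_freq(df)  # highest = most review-specific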