Пример #1
0
class NGram(object):
    '''
    Builds n-grams from text and extracts runs of capitalised words.

    Uses nltk for sentence/word tokenising, n-gram construction and
    frequency counting. A Filter instance (defined outside this class) is
    kept on self.F and its strip() is applied to each sentence before
    word tokenising.
    '''

    def __init__(self):
        '''
        Constructor; creates the Filter used by NGramUn.
        '''
        self.F = Filter()

    def Grams(self, pos, n=3, boundy=1):
        '''
        Collect the frequent n-grams of every size from 2 up to n - 1.

        @param pos: Sequence of sentences, each a sequence of tokens. It is
                    traversed once per n-gram size, so it must be re-iterable.
        @param n: Exclusive upper bound on the n-gram size (n=3 gives bigrams only)
        @param boundy: Frequency threshold handed on to Gram

        @return: One flat list holding the results of Gram for each size,
                 in increasing size order. Empty when n <= 2.
        '''
        grams = []
        for size in range(2, n):
            # Forward boundy: it used to be accepted here but silently ignored.
            grams.extend(self.Gram(pos, n=size, boundy=boundy))
        return grams

    def Gram(self, text, n=3, boundy=1):
        '''
        @param text: Sequence of sentences (each a sequence of tokens) to be
                     turned into n-grams
        @param n: Length of the n-grams
        @param boundy: An n-gram is kept only if it occurs MORE than this
                       many times over all sentences

        @return: List of the distinct n-grams of text that pass the threshold
        '''
        grams = []
        for sent in text:
            # extend() accepts both a list and an iterator, so this works
            # whichever of the two nltk.ngrams hands back, and avoids
            # re-copying the accumulated list for every sentence.
            grams.extend(nltk.ngrams(sent, n))

        freq = nltk.FreqDist(grams)

        return [gram for gram in freq.keys() if int(freq[gram]) > boundy]

    def NGramUn(self, text, n=3):
        '''
        Split raw text into sentences and build the n-grams of each sentence.

        @param text: Raw text; split with nltk.sent_tokenize, then every
                     sentence is passed through self.F.strip and
                     nltk.word_tokenize
        @param n: Length of the n-grams

        @return: List with one nltk.ngrams result per sentence (unfiltered,
                 no frequency threshold applied)
        '''
        sentences = nltk.sent_tokenize(text)
        tokenised = [nltk.word_tokenize(self.F.strip(sent)) for sent in sentences]
        return [nltk.ngrams(words, n) for words in tokenised]

    def capitalList(self, text):
        '''
        Find runs of two or more consecutive capitalised words.

        @param text: Sequence of sentences, each a sequence of tagged words
                     whose first item is the word string, e.g. (word, tag)

        @return: List of runs; each run is the list of consecutive tagged
                 words whose first letter is upper case. Runs are maximal
                 (never a suffix of another reported run), may start at any
                 position and may reach the end of the sentence. Single
                 capitalised words are not reported.
        '''
        runs = []

        for sent in text:
            run = []
            for tagged in sent:
                # [:1] instead of [0] so an empty token counts as
                # "not capitalised" rather than raising IndexError.
                if tagged[0][:1].isupper():
                    run.append(tagged)
                    continue
                if len(run) >= 2:
                    runs.append(run)
                run = []
            # A run that lasts until the final word has no terminating
            # lower-case word, so it has to be flushed here.
            if len(run) >= 2:
                runs.append(run)

        return runs
Пример #2
0
class POS(object):
    '''
    Class for POS tagging, use POS tagger from NLTK.

    The trained tagger is cached on disk so that the Brown corpus only has
    to be processed when no cache file can be read.
    '''

    # Cache file holding the serialised tagger; read and (re)written by __init__.
    _CACHE_PATH = "./Corpus/Brown-Uni.pkl"

    def __init__(self):
        '''
        Constructor initiates the filter, along with the tagger which will be
        used: it is loaded from the cache file, or, when that raises IOError,
        trained on the Brown corpus and then written to the cache file.

        An IOError while writing the cache is not caught and propagates.
        '''
        self.FF = Filter()

        try:
            # Attempt to open the .pkl file and load it. The with-block closes
            # the handle even when load() raises.
            # NOTE(review): load/dump are defined outside this class
            # (presumably pickle's) - the cache file must be trusted.
            with open(self._CACHE_PATH, 'rb') as cache_file:
                self.unigram_tagger = load(cache_file)
        except IOError:
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(simplify_tags=True)
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t2)

            with open(self._CACHE_PATH, 'wb') as cache_file:
                dump(self.unigram_tagger, cache_file, -1)

    def POSTag(self, text, s='false'):
        '''
        Method to POS tag text.

        @param text: Input to tag; its expected form depends on s
        @param s: 'false' - text is a raw string: it is split into sentences,
                            each sentence is passed through self.FF.strip and
                            word-tokenised, then tagged
                  'tok'   - text is a sequence of already tokenised sentences
                  other   - text is a single tokenised sentence

        @return: List of tagged sentences for 'false' and 'tok'; the tagger
                 output for the one sentence otherwise
        '''
        if s == 'false':
            sentences = nltk.sent_tokenize(text)
            tokenised = [nltk.word_tokenize(self.FF.strip(sent)) for sent in sentences]
            return [self.unigram_tagger.tag(words) for words in tokenised]

        if s == 'tok':
            return [self.unigram_tagger.tag(sent) for sent in text]

        return self.unigram_tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        '''
        Method to build n-grams over POS-tagged sentences.

        @param text: Raw text when s is 'false'; otherwise a sequence of
                     sentences that is used as given (no tagging is done)
        @param s: 'false' to run POSTag on text first, anything else to skip it
        @param n: Length of the n-grams

        @return: List with one nltk.ngrams result per sentence
        '''
        tagged = self.POSTag(text) if s == 'false' else text
        return [nltk.ngrams(sent, n) for sent in tagged]
Пример #3
0
class POS(object):
    '''
    Class for POS tagging, use POS tagger from NLTK.

    The trained tagger is cached on disk so that the Brown corpus only has
    to be processed when no cache file can be read.
    '''

    # Cache file holding the serialised tagger; read and (re)written by __init__.
    _CACHE_PATH = "./Corpus/Brown-Uni.pkl"

    def __init__(self):
        '''
        Constructor initiates the filter, along with the tagger which will be
        used: it is loaded from the cache file, or, when that raises IOError,
        trained on the Brown corpus and then written to the cache file.

        An IOError while writing the cache is not caught and propagates.
        '''
        self.FF = Filter()

        try:
            # Attempt to open the .pkl file and load it. The with-block closes
            # the handle even when load() raises.
            # NOTE(review): load/dump are defined outside this class
            # (presumably pickle's) - the cache file must be trusted.
            with open(self._CACHE_PATH, 'rb') as cache_file:
                self.unigram_tagger = load(cache_file)
        except IOError:
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(
                simplify_tags=True)
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents,
                                                     backoff=t2)

            with open(self._CACHE_PATH, 'wb') as cache_file:
                dump(self.unigram_tagger, cache_file, -1)

    def POSTag(self, text, s='false'):
        '''
        Method to POS tag text.

        @param text: Input to tag; its expected form depends on s
        @param s: 'false' - text is a raw string: it is split into sentences,
                            each sentence is passed through self.FF.strip and
                            word-tokenised, then tagged
                  'tok'   - text is a sequence of already tokenised sentences
                  other   - text is a single tokenised sentence

        @return: List of tagged sentences for 'false' and 'tok'; the tagger
                 output for the one sentence otherwise
        '''
        if s == 'false':
            sentences = nltk.sent_tokenize(text)
            tokenised = [
                nltk.word_tokenize(self.FF.strip(sent)) for sent in sentences
            ]
            return [self.unigram_tagger.tag(words) for words in tokenised]

        if s == 'tok':
            return [self.unigram_tagger.tag(sent) for sent in text]

        return self.unigram_tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        '''
        Method to build n-grams over POS-tagged sentences.

        @param text: Raw text when s is 'false'; otherwise a sequence of
                     sentences that is used as given (no tagging is done)
        @param s: 'false' to run POSTag on text first, anything else to skip it
        @param n: Length of the n-grams

        @return: List with one nltk.ngrams result per sentence
        '''
        tagged = self.POSTag(text) if s == 'false' else text
        return [nltk.ngrams(sent, n) for sent in tagged]