Example #1
def transform(self, sentence):
    # Normalize the sentence (regex-based replacements), tokenize,
    # then lowercase, stem, and drop stopwords.
    sentence_mod = exp_replace.replace_reg(sentence)
    tokens = nltk.word_tokenize(sentence_mod)
    tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]

    # Map the tokens through the fitted dictionary to bag-of-words form
    # and return the sentence's sparse topic distribution.
    corpus_sentence = self.dictionary.doc2bow(tokens)
    return self.lda[corpus_sentence]
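
A minimal standalone sketch of the gensim calls this method relies on, using toy token lists and an illustrative topic count (the documents and numbers here are assumptions, not the project's real data):

    # Toy reproduction of the doc2bow -> lda[bow] pipeline from transform().
    from gensim import corpora, models

    docs = [["great", "day", "love", "it"],
            ["terribl", "traffic", "hate", "it"]]
    dictionary = corpora.Dictionary(docs)           # token -> integer id
    corpus = [dictionary.doc2bow(d) for d in docs]  # each doc as (id, count) pairs
    lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)

    bow = dictionary.doc2bow(["love", "day"])
    print(lda[bow])  # sparse topic distribution: [(topic_id, probability), ...]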
Example #2
def grams_feature(features, sentence):
    # Normalize the sentence with regex-based replacements.
    sentence_reg = exp_replace.replace_reg(sentence)

    # Spell check (optional):
    # sentence_reg = TextBlob(sentence_reg)
    # sentence_reg = str(sentence_reg.correct())

    # Lowercase and stem every token.
    tokens = nltk.word_tokenize(sentence_reg)
    tokens = [porter.stem(t.lower()) for t in tokens]

    # Join each bigram into a single "w1 w2" string.
    bigrams = nltk.bigrams(tokens)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    ngrams = tokens + bigrams

    # Mark each unigram and bigram as a binary "present" feature.
    for gram in ngrams:
        features['contains(%s)' % gram] = 1.0
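
A brief usage sketch for the feature extractor above; it assumes the module-level `porter` stemmer and the project's `exp_replace` helper are in scope, as in the original code, and the printed keys are illustrative:

    # Hypothetical call site: collect unigram/bigram presence features
    # for one sentence into a dict usable by an NLTK classifier.
    features = {}
    grams_feature(features, "I just love waiting in line")
    print(sorted(features.keys()))
    # e.g. ['contains(i)', 'contains(i just)', 'contains(just)', ...]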
Example #3
def fit(self, documents):
    # Apply the same normalization/tokenization/stemming pipeline as
    # transform() to every training document.
    documents_mod = [exp_replace.replace_reg(sentence) for sentence in documents]
    tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
    tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop]
              for sentence in tokens]

    # Build the token dictionary, convert each document to bag-of-words,
    # and train the LDA topic model.
    self.dictionary = corpora.Dictionary(tokens)
    corpus = [self.dictionary.doc2bow(text) for text in tokens]
    self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary,
                                        num_topics=self.nbtopic, alpha=self.alpha)

    # Persist the model and dictionary for later reuse.
    self.lda.save('topics.tp')
    self.dictionary.save('topics_dict.tp')
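
A hypothetical end-to-end sketch tying fit() and transform() together; `TopicModel` is a stand-in name for the class that owns these methods (the excerpt does not show the real class name or its constructor), and the example sentences are illustrative:

    # Assumed wrapper class exposing the fit()/transform() methods from
    # Examples #1 and #3; it is expected to initialize self.porter,
    # self.stop, self.nbtopic, and self.alpha.
    model = TopicModel()
    model.fit(["Great, another Monday morning...",
               "I absolutely love this weather"])
    # transform() returns a sparse [(topic_id, probability), ...] list.
    print(model.transform("What a fantastic day"))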