Example #1
    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w)
                                    for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev,
                                                  prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
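A minimal usage sketch for the method above, assuming it is the tag() method of the averaged-perceptron tagger that later examples construct as PerceptronTagger() (the textblob_aptagger import below is an assumption about where the class lives):

from textblob_aptagger import PerceptronTagger  # assumed location of the class above

tagger = PerceptronTagger()
# With tokenize=True the corpus is first split into sentences and words;
# the result is a flat list of (word, tag) tuples.
print(tagger.tag("Simple is better than complex. Flat is better than nested."))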
Example #2
def extract_global_bag_of_words_processed(df_comments):
    corpus = []   
    i = 0
    lemmatizer = WordNetLemmatizer()    
    tb = Blobber(pos_tagger=PerceptronTagger())
    sentencer = SentenceTokenizer()
    for _,row in df_comments.iterrows():  
        comm = row['comment_content']
        tokens = []   
        for sent in sentencer.tokenize(comm.decode('ascii','ignore')):
            tagged = tb(sent.lower()).tags    
            # Remove stops
            filtered_words = [w for w in tagged if not w[0] in stopwords.words('english')]
                   
            # Remove punctuation
            filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1]) for w in filtered_words if len(re.findall('[a-z]+', w[0].lower())) > 0]             
                    
            # Lemmatize
            filtered_words = [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in filtered_words]  
            
            filtered_words = [w for w in filtered_words if len(w) > 1]
            
            for word in filtered_words:
                tokens.append(word)  
        corpus.append(' '.join(tokens))
        i += 1
        if i % 1000 == 0:
            print i, "words processed for Ngrams"
                
            
    return corpus
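The snippet relies on a penn_to_wn helper that is not shown here. A minimal sketch of a common implementation (an assumption, not the project's actual code) maps Penn Treebank tags onto WordNet POS constants so that WordNetLemmatizer gets a valid pos argument:

from nltk.corpus import wordnet

def penn_to_wn(penn_tag):
    # Map a Penn Treebank tag prefix to the matching WordNet POS constant.
    if penn_tag.startswith('J'):
        return wordnet.ADJ
    if penn_tag.startswith('V'):
        return wordnet.VERB
    if penn_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # treat everything else as a noun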
Example #3
 def _create_sentence_objects(self):
     '''Returns a list of Sentence objects given
     a list of sentence strings. Attempts to handle sentences that
     have more than one punctuation mark at the end of the sentence.
     Examples: "An ellipses is no problem..." or "This is awesome!!!"
     '''
     sent_tokenizer = SentenceTokenizer()
     sentence_objects = []
     sentences = sent_tokenizer.itokenize(self.raw)
     char_index = 0  # Keeps track of character index within the blob
     for sent in sentences:
         # Compute the start and end indices of the sentence
         # within the blob
         start_index = self.raw.index(sent, char_index)
         char_index += len(sent)
         end_index = start_index + len(sent)
         # Sentences share the same models as their parent blob
         s = Sentence(sent,
                      start_index=start_index,
                      end_index=end_index,
                      tokenizer=self.tokenizer,
                      np_extractor=self.np_extractor,
                      pos_tagger=self.pos_tagger,
                      analyzer=self.analyzer,
                      parser=self.parser,
                      classifier=self.classifier)
         sentence_objects.append(s)
     return sentence_objects
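This method is what populates TextBlob.sentences; a brief usage sketch (assuming a standard textblob install) shows the character offsets it computes for each sentence:

from textblob import TextBlob

blob = TextBlob("An ellipses is no problem... This is awesome!!!")
for s in blob.sentences:
    # Each Sentence keeps its start/end character offsets within the parent blob.
    print(s.start_index, s.end_index, str(s))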
Example #4
def comment_to_sentences(comment, remove_stops=False):
    sentencer = SentenceTokenizer()

    corpus = []
    for sent in sentencer.tokenize(comment):
        if len(sent) > 0:
            corpus.append(comment_to_wordlist(sent, remove_stops))

    return corpus
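comment_to_wordlist is defined elsewhere in the original project; a hypothetical minimal version, modeled on common word2vec-style preprocessing, might lowercase the sentence, keep only letters, and optionally drop English stopwords:

import re
from nltk.corpus import stopwords

def comment_to_wordlist(sentence, remove_stops=False):
    # Keep letters only, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    if remove_stops:
        stops = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in stops]
    return tokens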
Example #5
 def test_overrides(self):
     b = tb.Blobber(tokenizer=SentenceTokenizer(),
                     np_extractor=ConllExtractor())
     blob = b("How now? Brown cow?")
     assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
     assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"]))
     blob2 = b("Another blob")
     # blobs have the same tokenizer
     assert_true(blob.tokenizer is blob2.tokenizer)
     # but aren't the same object
     assert_not_equal(blob, blob2)
Example #6
    def test_get_np_for_CONLLExtractor(self):
        text_list = self.text_list

        from textblob.taggers import NLTKTagger
        from textblob.tokenizers import SentenceTokenizer
        chunker = ConllExtractor()

        tb = Blobber(pos_tagger=NLTKTagger(),
                     tokenizer=SentenceTokenizer(),
                     np_extractor=chunker)

        for text in text_list:
            b = tb(text)
            print(b.noun_phrases)
            print(b.parse())
Example #7
def plagiarism_check(reader, pdfurl):
    global cred_score
    text = ''
    for i in range(5, reader.numPages):
        text += reader.getPage(i).extractText()
    sentences = TextBlob(text, tokenizer=SentenceTokenizer()).tokens  # .tokens holds the sentence strings (cf. Example #5)
    sentences = [' '.join(sentence.split()) for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 50]
    t = random.sample(sentences, min(len(sentences), 3))  # can increase this number
    for sentence in t:
        print(sentence)
        res = requests.get('https://www.google.ca/search?q="' + sentence + '"')
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        results = soup.select('h3.r a')
        for result in results[:min(len(results), 3)]:  # can increase this number
            if result.get('href') != pdfurl:
                cred_score -= 0.05
                output['plagiarism'] = -0.05
                return
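A hedged usage sketch for the function above, assuming reader is an older-API PyPDF2 PdfFileReader (which is what numPages/getPage/extractText imply); the URL is a placeholder:

import io
import requests
import PyPDF2

pdfurl = 'https://example.com/paper.pdf'  # placeholder URL
reader = PyPDF2.PdfFileReader(io.BytesIO(requests.get(pdfurl).content))
plagiarism_check(reader, pdfurl)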
Example #8
    def test_get_np_for_all(self):
        text_list = self.text_list

        from textblob.taggers import NLTKTagger
        from textblob.tokenizers import SentenceTokenizer
        from pattern.en import parsetree  # provides parsetree() used below
        chunker = ConllExtractor()

        tb = Blobber(pos_tagger=NLTKTagger(),
                     tokenizer=SentenceTokenizer(),
                     np_extractor=chunker)

        for text in text_list:
            # tbinstance=tb(text)
            # sentences=tbinstance.sentences
            # print(sentences)
            # for s in sentences:
            #     s.
            pst = parsetree(text)
            print(pst)
            for sentence in pst:
                for chunk in sentence.chunks:
                    if chunk.type == "NP":
                        print chunk.type, [(w.string, w.type)
                                           for w in chunk.words]
Example #9
 def setUp(self):
     self.tokenizer = SentenceTokenizer()
     self.text = "Beautiful is better than ugly. Simple is better than complex."
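A companion test that this setUp plausibly supports (a sketch, not the original test case), using the same assert_equal helper as Example #5 and relying on the tokenizer keeping sentence-final punctuation:

    def test_tokenize(self):
        tokens = self.tokenizer.tokenize(self.text)
        assert_equal(tokens, ["Beautiful is better than ugly.",
                              "Simple is better than complex."])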
Example #10
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
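            # Also add every variant of the word with one character deleted.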
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print tokens
Example #11
 def __init__(self):
     self.lemmatizer = WordNetLemmatizer()    
     self.tb = Blobber(pos_tagger=PerceptronTagger())
     self.sentencer = SentenceTokenizer()
Example #12
 def __init__(self):
     self.sentencer = SentenceTokenizer()
     self.max = 8
     self.min = 2
Example #13
 def __init__(self):
     self.sentencer = SentenceTokenizer()
     self.worder = WordTokenizer()
Example #14
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')
        
    featureMatrix = np.empty([df_comments.shape[0],25])
    
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix
    
    feature_count = 0
    
    for _,row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)
        
        featureMatrix[index][3] =  len(comm)
        
        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr
        
        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)
        
        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)
        
        
        if len(tokens) == 0:
            featureMatrix[index][13] =  0
            featureMatrix[index][14] =  0
            featureMatrix[index][15] =  0
            featureMatrix[index][16] =  0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            
            featureMatrix[index][13] =  len(spelt_wrong)
            featureMatrix[index][14] =  len(spelt_wrong)/float(len(unique_tokens))
            featureMatrix[index][15] =  len(bad_words_list)
            featureMatrix[index][16] =  len(bad_words_list)/float(len(unique_tokens))
            
            
        featureMatrix[index][19] =  F_K_score(sentences, tokens)
        
        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
    
        probDist = clf.prob_classify(testSet)                
        sentiment = probDist.prob('pos')            
        subj_obj = get_subjectivity(probDist)
    
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] =  sentiment
        featureMatrix[index][23] =  subj_obj
        featureMatrix[index][24] =  polarity_overlap
        
        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count
    
    print "DONE"
    
    feature_count = 0
    # Grouped
    for _,group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _,row in group.iterrows()]
        
        # Get average time
        sumTime = 0 
        count = 0                
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])
        
        # Average length
        sumLen = 0 
        
        
        thread_tokens = []    
        
        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii','ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            
            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1            
            avgTime = sumTime/float(count)
            
            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen/float(count)
            
            ######################################################################
            # Get chunked sentences
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = [] 
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
                
                # The cumulative tokens up to this point
                thread_tokens += doc
            
            ######################################################################
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                article_tokens = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
            
            ######################################################################
            
            
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])        
            
            featureMatrix[index][1] =  mktime(row['date']) - first  
            
            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))  
            
            featureMatrix[index][17] =  np.mean([termf(comm.count(w), tokens) for w in set(tokens)])  
            featureMatrix[index][18] =  tf_idf(comm, thread_comments)     
            
            featureMatrix[index][20] =  onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] =  onSubForumTopic(tokens, article_tokens)
    
    
            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count
    
    return featureMatrix
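The feature extractor above depends on a number of small helpers (words, capital_frequency, lexical_diversity, termf, and so on) that are defined elsewhere in the project. Hedged sketches of two of the simplest ones, to make the corresponding features concrete (assumptions, not the original definitions):

import re

def words(text):
    # Lowercased alphabetic tokens; the project's actual tokenization may differ.
    return re.findall(r"[a-z']+", text.lower())

def lexical_diversity(tokens):
    # Share of distinct tokens among all tokens; 0.0 for an empty token list.
    return len(set(tokens)) / float(len(tokens)) if tokens else 0.0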