Example #1
 def _create_sentence_objects(self):
     '''Returns a list of Sentence objects given
     a list of sentence strings. Attempts to handle sentences that
     have more than one punctuation mark at the end of the sentence.
     Examples: "An ellipses is no problem..." or "This is awesome!!!"
     '''
     sent_tokenizer = SentenceTokenizer()
     sentence_objects = []
     sentences = sent_tokenizer.itokenize(self.raw)
     char_index = 0  # Keeps track of character index within the blob
     for sent in sentences:
         # Compute the start and end indices of the sentence
         # within the blob
         start_index = self.raw.index(sent, char_index)
         char_index += len(sent)
         end_index = start_index + len(sent)
         # Sentences share the same models as their parent blob
         s = Sentence(sent,
                      start_index=start_index,
                      end_index=end_index,
                      tokenizer=self.tokenizer,
                      np_extractor=self.np_extractor,
                      pos_tagger=self.pos_tagger,
                      analyzer=self.analyzer,
                      parser=self.parser,
                      classifier=self.classifier)
         sentence_objects.append(s)
     return sentence_objects
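The start_index/end_index computed above are character offsets into the parent blob's raw string, so each Sentence can be sliced back out of the original text. A minimal sketch of that invariant (assumes a standard TextBlob install; this snippet is not part of the example above):

from textblob import TextBlob

blob = TextBlob("Beautiful is better than ugly. Simple is better than complex.")
for s in blob.sentences:
    # each Sentence carries offsets into the parent blob's raw text
    assert blob.raw[s.start_index:s.end_index] == s.raw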
Example #2
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ["Beautiful is better than ugly.", "Simple is better than complex."])

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens,
            ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
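For reference, a minimal sketch of the tokenizer API these tests exercise (assumes a standard TextBlob install):

from textblob.tokenizers import SentenceTokenizer, sent_tokenize

tokenizer = SentenceTokenizer()
print(tokenizer.tokenize("Beautiful is better than ugly. Simple is better than complex."))
# ['Beautiful is better than ugly.', 'Simple is better than complex.']

# itokenize() and the module-level sent_tokenize() are the lazy, generator-based variants
print(list(sent_tokenize("Simple is better than complex.")))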
Example #3
def extract_global_bag_of_words_processed(df_comments):
    corpus = []
    i = 0
    lemmatizer = WordNetLemmatizer()
    tb = Blobber(pos_tagger=PerceptronTagger())
    sentencer = SentenceTokenizer()
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        tokens = []
        for sent in sentencer.tokenize(comm.decode('ascii', 'ignore')):
            tagged = tb(sent.lower()).tags
            # Remove stopwords
            filtered_words = [w for w in tagged if not w[0] in stopwords.words('english')]
            # Remove punctuation, keeping only alphabetic word forms
            filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1]) for w in filtered_words if len(re.findall('[a-z]+', w[0].lower())) > 0]
            # Lemmatize using the WordNet POS mapped from the Penn tag
            filtered_words = [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in filtered_words]
            # Drop single-character tokens
            filtered_words = [w for w in filtered_words if len(w) > 1]
            for word in filtered_words:
                tokens.append(word)
        corpus.append(' '.join(tokens))
        i += 1
        if i % 1000 == 0:
            print i, "words processed for Ngrams"

    return corpus
Example #4
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ["Beautiful is better than ugly.", "Simple is better than complex."])

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens,
            ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #5
def comment_to_sentences(comment, remove_stops=False):
    sentencer = SentenceTokenizer()

    corpus = []
    for sent in sentencer.tokenize(comment):
        if len(sent) > 0:
            corpus.append(comment_to_wordlist(sent, remove_stops))

    return corpus
Example #6
def comment_to_sentences(comment, remove_stops=False):
    sentencer = SentenceTokenizer()

    corpus = []
    for sent in sentencer.tokenize(comment):
        if len(sent) > 0:
            corpus.append(comment_to_wordlist(sent, remove_stops))

    return corpus
Example #7
    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w)
                                    for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev,
                                                  prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
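A hedged usage sketch for the method above, assuming tagger is an already trained instance of this perceptron tagger class (the variable name is illustrative):

text = "Beautiful is better than ugly.\nSimple is better than complex."
tagger.tag(text)                  # tokenize=True: SentenceTokenizer/WordTokenizer do the splitting
tagger.tag(text, tokenize=False)  # caller guarantees '\n' between sentences and ' ' between words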
Example #8
class LexicalBigramUnigramAnalyzer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.decode('ascii', 'ignore')):
            tagged = self.tb(sent.lower()).tags

            # Map Penn Treebank tags to WordNet POS and drop stopwords
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]

            # Candidate bigrams: adjacent (word, POS) pairs, kept only when they
            # match one of four POS patterns
            ng = zip(tagged, tagged[1:])
            rule1 = [(t[0], t[1]) for t in ng if t[0][1] == wn.ADJ and t[1][1] == wn.NOUN]
            rule2 = [(t[0], t[1]) for t in ng if (t[0][1] == wn.ADV and t[1][1] == wn.VERB) or (t[0][1] == wn.VERB and t[1][1] == wn.ADV)]
            rule3 = [(t[0], t[1]) for t in ng if t[0][1] == wn.VERB and t[1][1] == wn.VERB]
            rule4 = [(t[0], t[1]) for t in ng if t[0][1] == wn.NOUN and t[1][1] == wn.NOUN]

            filtered_list = rule1 + rule2 + rule3 + rule4

            # Lemmatize bigrams and unigrams with their WordNet POS
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' + self.lemmatizer.lemmatize(t[1][0], t[1][1]) for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
Example #9
File: blob.py Project: pathouse/TextBlob
 def _create_sentence_objects(self):
     '''Returns a list of Sentence objects from the raw text.
     '''
     sent_tokenizer = SentenceTokenizer()
     sentence_objects = []
     sentences = sent_tokenizer.itokenize(self.raw)
     char_index = 0  # Keeps track of character index within the blob
     for sent in sentences:
         # Compute the start and end indices of the sentence
         # within the blob
         start_index = self.raw.index(sent, char_index)
         char_index += len(sent)
         end_index = start_index + len(sent)
         # Sentences share the same models as their parent blob
         s = Sentence(sent, start_index=start_index, end_index=end_index,
             tokenizer=self.tokenizer, np_extractor=self.np_extractor,
             pos_tagger=self.pos_tagger, analyzer=self.analyzer,
             parser=self.parser, classifier=self.classifier)
         sentence_objects.append(s)
     return sentence_objects
Example #10
 def test_overrides(self):
     b = tb.Blobber(tokenizer=SentenceTokenizer(),
                     np_extractor=ConllExtractor())
     blob = b("How now? Brown cow?")
     assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
     assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"]))
     blob2 = b("Another blob")
     # blobs have the same tokenizer
     assert_true(blob.tokenizer is blob2.tokenizer)
     # but aren't the same object
     assert_not_equal(blob, blob2)
Example #11
class TestSentenceTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            "Beautiful is better than ugly.", "Simple is better than complex."
        ])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
                     ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")
Example #12
class CharacterAnalyzer(object):
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.max = 8
        self.min = 2

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            # Strip punctuation, then collect all character n-grams of length min..max
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            for n in range(self.min, self.max + 1):
                ngr = [words[i:i + n] for i in range(len(words) - n + 1)]
                if len(ngr) > 0:
                    tokens += ngr
        return tokens
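Classes like CharacterAnalyzer define __call__(self, doc) returning a token list, which matches scikit-learn's custom-analyzer protocol, so they can plausibly be plugged into a vectorizer. A sketch under that assumption; the vectorizer wiring is not shown in the original code:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer=CharacterAnalyzer())
X = vectorizer.fit_transform(["How are you? I am fine!", "OMG! I am soooo LOL!!!"])
print(X.shape)  # documents x character n-grams of length 2..8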
Example #13
File: blob.py Project: DDani/TextBlob
 def _create_sentence_objects(self):
     '''Returns a list of Sentence objects given
     a list of sentence strings. Attempts to handle sentences that
     have more than one punctuation mark at the end of the sentence.
     Examples: "An ellipses is no problem..." or "This is awesome!!!"
     '''
     sent_tokenizer = SentenceTokenizer()
     sentence_objects = []
     sentences = sent_tokenizer.itokenize(self.raw)
     char_index = 0  # Keeps track of character index within the blob
     for sent in sentences:
         # Compute the start and end indices of the sentence
         # within the blob
         start_index = self.raw.index(sent, char_index)
         char_index += len(sent)
         end_index = start_index + len(sent)
         # Sentences share the same models as their parent blob
         s = Sentence(sent, start_index=start_index, end_index=end_index,
             tokenizer=self.tokenizer, np_extractor=self.np_extractor,
             pos_tagger=self.pos_tagger, analyzer=self.analyzer,
             parser=self.parser, classifier=self.classifier)
         sentence_objects.append(s)
     return sentence_objects
Example #14
    def test_get_np_for_CONLLExtractor(self):
        text_list = self.text_list

        from textblob.taggers import NLTKTagger
        from textblob.tokenizers import SentenceTokenizer
        chunker = ConllExtractor()

        tb = Blobber(pos_tagger=NLTKTagger(),
                     tokenizer=SentenceTokenizer(),
                     np_extractor=chunker)

        for text in text_list:
            b = tb(text)
            print(b.noun_phrases)
            print(b.parse())
Example #15
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ["Beautiful is better than ugly.", "Simple is better than complex."])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens,
            ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")
Example #16
class CharacterSkipGramAnalyzer(object):
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)

            for word in words:
                tokens.append(word.strip())
                if len(word) > 2:
                    for j in range(0, len(word)):
                        term = word[:j] + word[j + 1:]
                        tokens.append(term.strip())
        return tokens
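The inner loop above emits, for every word longer than two characters, the word itself plus each variant with one character deleted (a simple character-level skip-gram). A standalone illustration of that step:

word = "fine"
variants = [word[:j] + word[j + 1:] for j in range(len(word))]
print(variants)  # ['ine', 'fne', 'fie', 'fin']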
Example #17
def plagiarism_check(reader, pdfurl):
    global cred_score
    text = ''
    for i in range(5, reader.numPages):
        text += reader.getPage(i).extractText()
    sentences = TextBlob(text, tokenizer=SentenceTokenizer())
    sentences = [' '.join(sentence.split()) for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 50]
    t = random.sample(sentences, min(len(sentences), 3))  # can increase this number
    for sentence in t:
        print(sentence)
        res = requests.get('https://www.google.ca/search?q="' + sentence + '"')
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        results = soup.select('h3.r a')
        for result in results[:min(len(results), 3)]:  # can increase this number
            if result.get('href') != pdfurl:
                cred_score -= 0.05
                output['plagiarism'] = -0.05
                return
Example #18
    def test_get_np_for_all(self):
        text_list = self.text_list

        from textblob.taggers import NLTKTagger
        from textblob.tokenizers import SentenceTokenizer
        chunker = ConllExtractor()

        tb = Blobber(pos_tagger=NLTKTagger(),
                     tokenizer=SentenceTokenizer(),
                     np_extractor=chunker)

        for text in text_list:
            # tbinstance=tb(text)
            # sentences=tbinstance.sentences
            # print(sentences)
            # for s in sentences:
            #     s.
            pst = parsetree(text)
            print(pst)
            for sentence in pst:
                for chunk in sentence.chunks:
                    if chunk.type == "NP":
                        print(chunk.type, [(w.string, w.type)
                                           for w in chunk.words])
Example #19
 def __init__(self):
     self.sentencer = SentenceTokenizer()
     self.worder = WordTokenizer()
Example #20
 def setUp(self):
     self.tokenizer = SentenceTokenizer()
     self.text = "Beautiful is better than ugly. Simple is better than complex."
Example #21
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print(tokens)
Example #22
 def __init__(self):
     self.sentencer = SentenceTokenizer()
     self.max = 8
     self.min = 2
Example #23
 def __init__(self):
     self.lemmatizer = WordNetLemmatizer()    
     self.tb = Blobber(pos_tagger=PerceptronTagger())
     self.sentencer = SentenceTokenizer()
Example #24
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer


sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print(tokens)
Example #25
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')
        
    featureMatrix = np.empty([df_comments.shape[0],25])
    
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix
    
    feature_count = 0
    
    for _,row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)
        
        featureMatrix[index][3] = len(comm)

        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr

        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)

        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)

        if len(tokens) == 0:
            featureMatrix[index][13] = 0
            featureMatrix[index][14] = 0
            featureMatrix[index][15] = 0
            featureMatrix[index][16] = 0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)

            featureMatrix[index][13] = len(spelt_wrong)
            featureMatrix[index][14] = len(spelt_wrong)/float(len(unique_tokens))
            featureMatrix[index][15] = len(bad_words_list)
            featureMatrix[index][16] = len(bad_words_list)/float(len(unique_tokens))

        featureMatrix[index][19] = F_K_score(sentences, tokens)

        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)

        probDist = clf.prob_classify(testSet)
        sentiment = probDist.prob('pos')
        subj_obj = get_subjectivity(probDist)

        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] = sentiment
        featureMatrix[index][23] = subj_obj
        featureMatrix[index][24] = polarity_overlap

        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count
    
    print "DONE"
    
    feature_count = 0
    # Grouped
    for _,group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _,row in group.iterrows()]
        
        # Get average time
        sumTime = 0 
        count = 0                
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])
        
        # Average length
        sumLen = 0 
        
        
        thread_tokens = []    
        
        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii','ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            
            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1            
            avgTime = sumTime/float(count)
            
            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen/float(count)
            
            ######################################################################
            # Get chunked sentences
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = [] 
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
                
                # The cumulative tokens up to this point
                thread_tokens += doc
            
            ######################################################################
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                article_tokens = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
            
            ######################################################################
            
            
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])

            featureMatrix[index][1] = mktime(row['date']) - first

            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))

            featureMatrix[index][17] = np.mean([termf(comm.count(w), tokens) for w in set(tokens)])
            featureMatrix[index][18] = tf_idf(comm, thread_comments)

            featureMatrix[index][20] = onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] = onSubForumTopic(tokens, article_tokens)

            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count
    
    return featureMatrix
Example #26
 def setUp(self):
     self.tokenizer = SentenceTokenizer()
     self.text = "Beautiful is better than ugly. Simple is better than complex."
            if type(chunk) == nltk.Tree:
                entity_names.append(' '.join(c[0] for c in chunk.leaves()))
            else:
                entity_names.append(chunk[0])
        entity_names = [
            word.strip(string.punctuation).lower() for word in entity_names
            if len(word.strip(string.punctuation)) > 1
        ]

    words = [w for w in entity_names if not w in stops]
    return words


lemmatizer = WordNetLemmatizer()
tb = Blobber(pos_tagger=PerceptronTagger())
sentencer = SentenceTokenizer()


def comment_to_words_for_topics(comment_body):

    tokens = []
    for sent in sentencer.tokenize(comment_body.decode('ascii', 'ignore')):
        tagged = tb(sent.lower()).tags
        filtered_words = [
            w for w in tagged if not w[0] in stopwords.words('english')
        ]
        # Remove punctuation
        filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1])
                          for w in filtered_words
                          if len(re.findall('[a-z]+', w[0].lower())) > 0]