def _create_sentence_objects(self):
    '''Returns a list of Sentence objects given a list of sentence strings.
    Attempts to handle sentences that have more than one punctuation mark
    at the end of the sentence.
    Examples: "An ellipsis is no problem..." or "This is awesome!!!"
    '''
    sent_tokenizer = SentenceTokenizer()
    sentence_objects = []
    sentences = sent_tokenizer.itokenize(self.raw)
    char_index = 0  # Keeps track of character index within the blob
    for sent in sentences:
        # Compute the start and end indices of the sentence
        # within the blob
        start_index = self.raw.index(sent, char_index)
        char_index += len(sent)
        end_index = start_index + len(sent)
        # Sentences share the same models as their parent blob
        s = Sentence(sent, start_index=start_index, end_index=end_index,
                     tokenizer=self.tokenizer, np_extractor=self.np_extractor,
                     pos_tagger=self.pos_tagger, analyzer=self.analyzer,
                     parser=self.parser, classifier=self.classifier)
        sentence_objects.append(s)
    return sentence_objects
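A quick external check of the offset bookkeeping above: each Sentence a blob produces should slice back out of the original text via its start/end indices. A minimal sketch, assuming a standard textblob install; the sample string is only illustrative:

from textblob import TextBlob

# Illustrative text; any multi-sentence string works.
blob = TextBlob("An ellipsis is no problem... This is awesome!!! Short one.")

for sentence in blob.sentences:
    # start_index/end_index come from the bookkeeping in _create_sentence_objects,
    # so slicing the raw text with them should reproduce the sentence itself.
    assert blob.raw[sentence.start_index:sentence.end_index] == sentence.raw
    print(sentence.start_index, sentence.end_index, sentence.raw)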
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
                     ["Beautiful is better than ugly.", "Simple is better than complex."])

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
                     ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
def extract_global_bag_of_words_processed(df_comments):
    corpus = []
    i = 0
    lemmatizer = WordNetLemmatizer()
    tb = Blobber(pos_tagger=PerceptronTagger())
    sentencer = SentenceTokenizer()
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        tokens = []
        for sent in sentencer.tokenize(comm.decode('ascii', 'ignore')):
            tagged = tb(sent.lower()).tags
            # Remove stops
            filtered_words = [w for w in tagged if not w[0] in stopwords.words('english')]
            # Remove punctuation
            filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1])
                              for w in filtered_words
                              if len(re.findall('[a-z]+', w[0].lower())) > 0]
            # Lemmatize
            filtered_words = [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in filtered_words]
            filtered_words = [w for w in filtered_words if len(w) > 1]
            for word in filtered_words:
                tokens.append(word)
        corpus.append(' '.join(tokens))
        i += 1
        if i % 1000 == 0:
            print i, "words processed for Ngrams"
    return corpus
def comment_to_sentences(comment, remove_stops=False):
    sentencer = SentenceTokenizer()
    corpus = []
    for sent in sentencer.tokenize(comment):
        if len(sent) > 0:
            corpus.append(comment_to_wordlist(sent, remove_stops))
    return corpus
def tag(self, corpus, tokenize=True):
    '''Tags a string `corpus`.'''
    # Assume untokenized corpus has \n between sentences and ' ' between words
    s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
    w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

    def split_sents(corpus):
        for s in s_split(corpus):
            yield w_split(s)

    prev, prev2 = self.START
    tokens = []
    for words in split_sents(corpus):
        context = self.START + [self._normalize(w) for w in words] + self.END
        for i, word in enumerate(words):
            tag = self.tagdict.get(word)
            if not tag:
                features = self._get_features(i, word, context, prev, prev2)
                tag = self.model.predict(features)
            tokens.append((word, tag))
            prev2 = prev
            prev = tag
    return tokens
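A minimal usage sketch for the tag() method above. It is not the library's documented API; it assumes `PerceptronTagger` is the class this method belongs to and that its model and tagdict have already been trained or loaded (for example from a pickled model) before calling:

# Hedged sketch: `tagger` stands for a PerceptronTagger instance whose model
# and tagdict are already available; how they get loaded depends on the project.
tagger = PerceptronTagger()
tagged = tagger.tag("Simple is better than complex. Readability counts.")
# tag() returns a flat list of (word, tag) tuples across all sentences,
# since it splits the input with SentenceTokenizer and WordTokenizer first.
for word, tag in tagged:
    print(word, tag)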
class LexicalBigramUnigramAnalyzer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.decode('ascii', 'ignore')):
            tagged = self.tb(sent.lower()).tags
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]
            ng = zip(tagged, tagged[1:])
            rule1 = [(t[0], t[1]) for t in ng if t[0][1] == wn.ADJ and t[1][1] == wn.NOUN]
            rule2 = [(t[0], t[1]) for t in ng if (t[0][1] == wn.ADV and t[1][1] == wn.VERB)
                     or (t[0][1] == wn.VERB and t[1][1] == wn.ADV)]
            rule3 = [(t[0], t[1]) for t in ng if t[0][1] == wn.VERB and t[1][1] == wn.VERB]
            rule4 = [(t[0], t[1]) for t in ng if t[0][1] == wn.NOUN and t[1][1] == wn.NOUN]
            filtered_list = rule1 + rule2 + rule3 + rule4
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' +
                                self.lemmatizer.lemmatize(t[1][0], t[1][1])
                                for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.'''
    sent_tokenizer = SentenceTokenizer()
    sentence_objects = []
    sentences = sent_tokenizer.itokenize(self.raw)
    char_index = 0  # Keeps track of character index within the blob
    for sent in sentences:
        # Compute the start and end indices of the sentence
        # within the blob
        start_index = self.raw.index(sent, char_index)
        char_index += len(sent)
        end_index = start_index + len(sent)
        # Sentences share the same models as their parent blob
        s = Sentence(sent, start_index=start_index, end_index=end_index,
                     tokenizer=self.tokenizer, np_extractor=self.np_extractor,
                     pos_tagger=self.pos_tagger, analyzer=self.analyzer,
                     parser=self.parser, classifier=self.classifier)
        sentence_objects.append(s)
    return sentence_objects
def test_overrides(self):
    b = tb.Blobber(tokenizer=SentenceTokenizer(),
                   np_extractor=ConllExtractor())
    blob = b("How now? Brown cow?")
    assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
    assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"]))
    blob2 = b("Another blob")
    # blobs have the same tokenizer
    assert_true(blob.tokenizer is blob2.tokenizer)
    # but aren't the same object
    assert_not_equal(blob, blob2)
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            "Beautiful is better than ugly.",
            "Simple is better than complex."
        ])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
                     ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")
class CharacterAnalyzer(object):
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.max = 8
        self.min = 2

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            for n in range(self.min, self.max + 1):
                ngr = [words[i:i + n] for i in range(len(words) - n + 1)]
                if len(ngr) > 0:
                    tokens += ngr
        return tokens
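Because CharacterAnalyzer is a callable that maps a raw document to a list of tokens (character 2- to 8-grams per sentence), it can be plugged into a vectorizer that accepts a custom analyzer. A minimal sketch, assuming scikit-learn is installed; the corpus is illustrative only:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical corpus for illustration.
docs = ["How are you? I am fine!", "Simple is better than complex."]

# CountVectorizer calls the analyzer once per document and indexes
# whatever tokens it returns.
vectorizer = CountVectorizer(analyzer=CharacterAnalyzer())
X = vectorizer.fit_transform(docs)
print(X.shape)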
def test_get_np_for_CONLLExtractor(self):
    text_list = self.text_list
    from textblob.taggers import NLTKTagger
    from textblob.tokenizers import SentenceTokenizer
    chunker = ConllExtractor()
    tb = Blobber(pos_tagger=NLTKTagger(),
                 tokenizer=SentenceTokenizer(),
                 np_extractor=chunker)
    for text in text_list:
        b = tb(text)
        print(b.noun_phrases)
        print(b.parse())
class CharacterSkipGramAnalyzer(object):
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)
            for word in words:
                tokens.append(word.strip())
                if len(word) > 2:
                    # Emit every variant of the word with one character deleted
                    for j in range(0, len(word)):
                        term = word[:j] + word[j + 1:]
                        tokens.append(term.strip())
        return tokens
def plagiarism_check(reader, pdfurl):
    global cred_score
    text = ''
    for i in range(5, reader.numPages):
        text += reader.getPage(i).extractText()
    sentences = TextBlob(text, tokenizer=SentenceTokenizer())
    sentences = [' '.join(sentence.split()) for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 50]
    t = random.sample(sentences, min(len(sentences), 3))  # can increase this number
    for sentence in t:
        print(sentence)
        res = requests.get('https://www.google.ca/search?q="' + sentence + '"')
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        results = soup.select('h3.r a')
        for result in results[:min(len(results), 3)]:  # can increase this number
            if result.get('href') != pdfurl:
                cred_score -= 0.05
                output['plagiarism'] = -0.05
    return
def test_get_np_for_all(self):
    text_list = self.text_list
    from textblob.taggers import NLTKTagger
    from textblob.tokenizers import SentenceTokenizer
    chunker = ConllExtractor()
    tb = Blobber(pos_tagger=NLTKTagger(),
                 tokenizer=SentenceTokenizer(),
                 np_extractor=chunker)
    for text in text_list:
        # tbinstance = tb(text)
        # sentences = tbinstance.sentences
        # print(sentences)
        # for s in sentences:
        #     s.
        pst = parsetree(text)
        print(pst)
        for sentence in pst:
            for chunk in sentence.chunks:
                if chunk.type == "NP":
                    print chunk.type, [(w.string, w.type) for w in chunk.words]
def __init__(self):
    self.sentencer = SentenceTokenizer()
    self.worder = WordTokenizer()
def setUp(self):
    self.tokenizer = SentenceTokenizer()
    self.text = "Beautiful is better than ugly. Simple is better than complex."
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)
    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print tokens
def __init__(self):
    self.sentencer = SentenceTokenizer()
    self.max = 8
    self.min = 2
def __init__(self):
    self.lemmatizer = WordNetLemmatizer()
    self.tb = Blobber(pos_tagger=PerceptronTagger())
    self.sentencer = SentenceTokenizer()
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')

    featureMatrix = np.empty([df_comments.shape[0], 25])

    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix

    feature_count = 0
    for _, row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)

        featureMatrix[index][3] = len(comm)

        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr

        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)

        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)

        if len(tokens) == 0:
            featureMatrix[index][13] = 0
            featureMatrix[index][14] = 0
            featureMatrix[index][15] = 0
            featureMatrix[index][16] = 0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            featureMatrix[index][13] = len(spelt_wrong)
            featureMatrix[index][14] = len(spelt_wrong) / float(len(unique_tokens))
            featureMatrix[index][15] = len(bad_words_list)
            featureMatrix[index][16] = len(bad_words_list) / float(len(unique_tokens))

        featureMatrix[index][19] = F_K_score(sentences, tokens)

        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
        probDist = clf.prob_classify(testSet)
        sentiment = probDist.prob('pos')
        subj_obj = get_subjectivity(probDist)
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] = sentiment
        featureMatrix[index][23] = subj_obj
        featureMatrix[index][24] = polarity_overlap

        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count

    print "DONE"
    feature_count = 0

    # Grouped
    for _, group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _, row in group.iterrows()]

        # Get average time
        sumTime = 0
        count = 0
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])

        # Average length
        sumLen = 0

        thread_tokens = []

        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii', 'ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)

            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1
            avgTime = sumTime / float(count)

            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen / float(count)

            ##################################################################
            # Get chunked sentences
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc
                       if len(word.strip(string.punctuation)) > 1]
                # The cumulative tokens up to this point
                thread_tokens += doc

            ##################################################################
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                article_tokens = [word.strip(string.punctuation) for word in doc
                                  if len(word.strip(string.punctuation)) > 1]

            ##################################################################
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])
            featureMatrix[index][1] = mktime(row['date']) - first
            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))
            featureMatrix[index][17] = np.mean([termf(comm.count(w), tokens) for w in set(tokens)])
            featureMatrix[index][18] = tf_idf(comm, thread_comments)
            featureMatrix[index][20] = onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] = onSubForumTopic(tokens, article_tokens)

            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count

    return featureMatrix
        if type(chunk) == nltk.Tree:
            entity_names.append(' '.join(c[0] for c in chunk.leaves()))
        else:
            entity_names.append(chunk[0])

    entity_names = [word.strip(string.punctuation).lower() for word in entity_names
                    if len(word.strip(string.punctuation)) > 1]
    words = [w for w in entity_names if not w in stops]
    return words


lemmatizer = WordNetLemmatizer()
tb = Blobber(pos_tagger=PerceptronTagger())
sentencer = SentenceTokenizer()


def comment_to_words_for_topics(comment_body):
    tokens = []
    for sent in sentencer.tokenize(comment_body.decode('ascii', 'ignore')):
        tagged = tb(sent.lower()).tags
        filtered_words = [w for w in tagged if not w[0] in stopwords.words('english')]
        # Remove punctuation
        filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1])
                          for w in filtered_words
                          if len(re.findall('[a-z]+', w[0].lower())) > 0]