def tag(self, corpus, tokenize=True):
    """Tag a string `corpus`; return a list of (word, tag) tuples."""
    # Untokenized input is assumed to use '\n' between sentences and
    # ' ' between words.
    if tokenize:
        sent_split = SentenceTokenizer().tokenize
        word_split = WordTokenizer().tokenize
    else:
        sent_split = lambda text: text.split('\n')
        word_split = lambda sent: sent.split()

    prev, prev2 = self.START
    tagged = []
    for sentence in sent_split(corpus):
        words_in_sent = word_split(sentence)
        context = self.START + [self._normalize(w) for w in words_in_sent] + self.END
        for i, word in enumerate(words_in_sent):
            # Unambiguous words come straight from the tag dictionary;
            # everything else goes through the perceptron model.
            guess = self.tagdict.get(word)
            if not guess:
                feats = self._get_features(i, word, context, prev, prev2)
                guess = self.model.predict(feats)
            tagged.append((word, guess))
            prev2, prev = prev, guess
    return tagged
def extract_global_bag_of_words_processed(df_comments): corpus = [] i = 0 lemmatizer = WordNetLemmatizer() tb = Blobber(pos_tagger=PerceptronTagger()) sentencer = SentenceTokenizer() for _,row in df_comments.iterrows(): comm = row['comment_content'] tokens = [] for sent in sentencer.tokenize(comm.decode('ascii','ignore')): tagged = tb(sent.lower()).tags # Remove stops filtered_words = [w for w in tagged if not w[0] in stopwords.words('english')] # Remove punctuation filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1]) for w in filtered_words if len(re.findall('[a-z]+', w[0].lower())) > 0] # Lemmatize filtered_words = [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in filtered_words] filtered_words = [w for w in filtered_words if len(w) > 1] for word in filtered_words: tokens.append(word) corpus.append(' '.join(tokens)) i += 1 if i % 1000 == 0: print i, "words processed for Ngrams" return corpus
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects given a list of sentence strings.
    Attempts to handle sentences that have more than one punctuation mark
    at the end of the sentence. Examples: "An ellipses is no problem..."
    or "This is awesome!!!"
    '''
    tokenizer = SentenceTokenizer()
    objects = []
    search_from = 0  # character offset in the blob where the next search starts
    for raw_sentence in tokenizer.itokenize(self.raw):
        # Locate the sentence inside the blob so start/end indices are exact
        # even when the same sentence text occurs more than once.
        begin = self.raw.index(raw_sentence, search_from)
        search_from += len(raw_sentence)
        objects.append(Sentence(
            raw_sentence,
            start_index=begin,
            end_index=begin + len(raw_sentence),
            # Sentences share the same models as their parent blob.
            tokenizer=self.tokenizer,
            np_extractor=self.np_extractor,
            pos_tagger=self.pos_tagger,
            analyzer=self.analyzer,
            parser=self.parser,
            classifier=self.classifier,
        ))
    return objects
def comment_to_sentences(comment, remove_stops=False):
    """Split `comment` into sentences; return a wordlist per non-empty sentence."""
    tokenizer = SentenceTokenizer()
    return [comment_to_wordlist(sentence, remove_stops)
            for sentence in tokenizer.tokenize(comment)
            if sentence]
def test_overrides(self):
    """A Blobber's overridden models are shared by every blob it creates."""
    blobber = tb.Blobber(tokenizer=SentenceTokenizer(),
                         np_extractor=ConllExtractor())
    first = blobber("How now? Brown cow?")
    assert_true(isinstance(first.tokenizer, SentenceTokenizer))
    assert_equal(first.tokens, tb.WordList(["How now?", "Brown cow?"]))
    second = blobber("Another blob")
    # Both blobs hold the very same tokenizer instance...
    assert_true(first.tokenizer is second.tokenizer)
    # ...but are themselves distinct objects.
    assert_not_equal(first, second)
def test_get_np_for_CONLLExtractor(self):
    """Print noun phrases and parses produced with the ConllExtractor."""
    from textblob.taggers import NLTKTagger
    from textblob.tokenizers import SentenceTokenizer
    blobber = Blobber(pos_tagger=NLTKTagger(),
                      tokenizer=SentenceTokenizer(),
                      np_extractor=ConllExtractor())
    for text in self.text_list:
        blob = blobber(text)
        print(blob.noun_phrases)
        print(blob.parse())
def plagiarism_check(reader, pdfurl):
    """Spot-check sentences from the PDF against Google search results.

    Samples up to three long sentences from pages 5+ of `reader`, searches
    each verbatim, and lowers the global `cred_score` for every top hit whose
    URL differs from `pdfurl` (i.e. the text appears somewhere else).
    Side effects: mutates global `cred_score` and `output['plagiarism']`.
    """
    global cred_score
    text = ''
    for page_num in range(5, reader.numPages):
        text += reader.getPage(page_num).extractText()
    sentences = TextBlob(text, tokenizer=SentenceTokenizer())
    # Normalize internal whitespace, then keep only substantial sentences.
    sentences = [' '.join(sentence.split()) for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 50]
    sample = random.sample(sentences, min(len(sentences), 3))  # can increase this number
    for sentence in sample:
        print(sentence)
        res = requests.get('https://www.google.ca/search?q="' + sentence + '"')
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        results = soup.select('h3.r a')
        for result in results[:min(len(results), 3)]:  # can increase this number
            # BUG FIX: was `results.get('href')` — `results` is the list from
            # soup.select() and has no .get(); check each result's link.
            if result.get('href') != pdfurl:
                cred_score -= 0.05
                output['plagiarism'] = -0.05
    return
def test_get_np_for_all(self): text_list = self.text_list from textblob.taggers import NLTKTagger from textblob.tokenizers import SentenceTokenizer chunker = ConllExtractor() tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer(), np_extractor=chunker) for text in text_list: # tbinstance=tb(text) # sentences=tbinstance.sentences # print(sentences) # for s in sentences: # s. pst = parsetree(text) print(pst) for sentence in pst: for chunk in sentence.chunks: if chunk.type == "NP": print chunk.type, [(w.string, w.type) for w in chunk.words]
def setUp(self):
    """Create a fresh tokenizer and a two-sentence fixture for each test."""
    self.text = "Beautiful is better than ugly. Simple is better than complex."
    self.tokenizer = SentenceTokenizer()
import string from FeatureExtraction.mainExtractor import CharacterAnalyzer from textblob.tokenizers import SentenceTokenizer, WordTokenizer sentencer = SentenceTokenizer() worder = WordTokenizer() sentences = ['How are you? I am fine!'] tokens = [] for sent in sentencer.tokenize(sentences[0].lower()): words = ''.join([ch for ch in sent if ch not in string.punctuation]) words = worder.tokenize(words) for word in words: tokens.append(word.strip()) if len(word) > 2: for j in range(0, len(word)): term = word[:j] + word[j + 1:] tokens.append(term.strip()) print tokens
def __init__(self):
    """Set up the shared NLP helpers: sentence tokenizer, POS-tagging blobber, lemmatizer."""
    self.sentencer = SentenceTokenizer()
    self.tb = Blobber(pos_tagger=PerceptronTagger())
    self.lemmatizer = WordNetLemmatizer()
def __init__(self):
    """Create the sentence tokenizer and the size bounds used by this object."""
    self.sentencer = SentenceTokenizer()
    # Lower/upper bounds — presumably token/n-gram length limits; confirm at call sites.
    self.min = 2
    self.max = 8
def __init__(self):
    """Create the word and sentence tokenizers used by this object."""
    self.worder = WordTokenizer()
    self.sentencer = SentenceTokenizer()
def extract_feature_matrix(df_comments, df_thread_groupby):
    """Build a (n_comments x 25) feature matrix for the comments.

    Pass 1 (per comment): fills columns 3-16, 19 and 22-24 with surface,
    lexical and sentiment features.  Pass 2 (per thread group, rows in group
    order): fills columns 0-2, 17-18 and 20-21 with timing, length and
    topical-overlap features.  Both passes address rows through a
    comment_id -> row-index map so they write into the same row.
    """
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')
    # np.empty: uninitialized — every cell is expected to be written below.
    featureMatrix = np.empty([df_comments.shape[0],25])
    # Map comment_id -> row index so the threaded pass can find each row.
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix
    feature_count = 0
    # ---- Pass 1: per-comment features ----
    for _,row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)
        featureMatrix[index][3] = len(comm)  # raw character length
        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr
        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)
        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)
        # Guard: spelling/swear ratios are undefined for empty comments.
        if len(tokens) == 0:
            featureMatrix[index][13] = 0
            featureMatrix[index][14] = 0
            featureMatrix[index][15] = 0
            featureMatrix[index][16] = 0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            featureMatrix[index][13] = len(spelt_wrong)
            featureMatrix[index][14] = len(spelt_wrong)/float(len(unique_tokens))
            featureMatrix[index][15] = len(bad_words_list)
            featureMatrix[index][16] = len(bad_words_list)/float(len(unique_tokens))
        featureMatrix[index][19] = F_K_score(sentences, tokens)  # readability score
        # Sentiment features from the pickled classifier.
        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
        probDist = clf.prob_classify(testSet)
        sentiment = probDist.prob('pos')
        subj_obj = get_subjectivity(probDist)
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] = sentiment
        featureMatrix[index][23] = subj_obj
        featureMatrix[index][24] = polarity_overlap
        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count
    print "DONE"
    feature_count = 0
    # ---- Pass 2: thread-relative features ----
    # Grouped
    for _,group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _,row in group.iterrows()]
        # Get average time
        sumTime = 0
        count = 0
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])
        # Average length
        sumLen = 0
        thread_tokens = []
        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii','ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            # Ongoing average time between consecutive comments in the thread
            sumTime += mktime(row['date']) - previous
            count += 1
            avgTime = sumTime/float(count)
            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen/float(count)
            ######################################################################
            # Get chunked sentences: named entities collapsed to single tokens
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        # Named entity subtree -> join its leaves into one token.
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
                # The cumulative tokens up to this point
                thread_tokens += doc
            ######################################################################
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                # NOTE(review): this reassigns (not appends), so only the last
                # article sentence's tokens survive — thread_tokens above uses
                # `+=`; confirm whether `+=` was intended here too.
                article_tokens = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
            ######################################################################
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])
            featureMatrix[index][1] = mktime(row['date']) - first
            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))
            # NOTE(review): np.mean([]) yields nan for empty token sets — the
            # len(tokens)==0 guard from pass 1 is not applied here; confirm.
            featureMatrix[index][17] = np.mean([termf(comm.count(w), tokens) for w in set(tokens)])
            featureMatrix[index][18] = tf_idf(comm, thread_comments)
            featureMatrix[index][20] = onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] = onSubForumTopic(tokens, article_tokens)
            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count
    return featureMatrix