class TestConllExtractor(unittest.TestCase):

    def setUp(self):
        self.extractor = ConllExtractor()
        self.text = '''
Python is a widely used general-purpose, high-level programming language.
Its design philosophy emphasizes code readability, and its syntax allows
programmers to express concepts in fewer lines of code than would be possible
in other languages. The language provides constructs intended to enable clear
programs on both a small and large scale.
'''
        self.sentence = "Python is a widely used general-purpose, high-level programming language"

    @attr('slow')
    def test_extract(self):
        noun_phrases = self.extractor.extract(self.text)
        assert_true("Python" in noun_phrases)
        assert_true("design philosophy" in noun_phrases)
        assert_true("code readability" in noun_phrases)

    @attr('slow')
    def test_parse_sentence(self):
        parsed = self.extractor._parse_sentence(self.sentence)
        assert_true(isinstance(parsed, nltk.tree.Tree))

    @attr('slow')
    def test_filter_insignificant(self):
        chunk = self.extractor._parse_sentence(self.sentence)
        tags = [tag for word, tag in chunk.leaves()]
        assert_true('DT' in tags)
        filtered = filter_insignificant(chunk.leaves())
        tags = [tag for word, tag in filtered]
        assert_true('DT' not in tags)

def main():
    st.title("Twitter sentiment analysis")
    st.subheader("Discover the positive and negative opinions about a product")
    st.markdown("""
    #### Sentiment analysis is the automated process of analyzing text data
    and sorting it into sentiments: positive, negative, or neutral.
    """)

    st.subheader("Search by topic")
    message = st.text_area("Enter Text", "Type Here ..")
    if st.button("Analyze"):
        new_tweets = api.search(q=message)
        for tweet in new_tweets:
            analysis = TextBlob(tweet.text,
                                analyzer=NaiveBayesAnalyzer(),
                                np_extractor=ConllExtractor())
            polarity = 'Positive'
            if analysis.sentiment.p_pos < 0.50:
                polarity = 'Negative'
            st.subheader("Sentiment Analysis and Topic of Interest")
            st.write(tweet.text)
            st.write(polarity)
            st.write("Confidence : Positive score: ",
                     analysis.sentiment.p_pos * 100,
                     " Negative score: ",
                     analysis.sentiment.p_neg * 100)
            st.write("Areas of interest: ", analysis.noun_phrases)
            st.subheader(
                "---------------------------------------------------------------------------"
            )

    st.subheader('Enter a Twitter Username to search tweets for: ')
    messageID = st.text_area("Enter a ID here", "...")
    if st.button("Process"):
        new_tweetsID = api.user_timeline(screen_name=messageID, count=20)
        for tweet in new_tweetsID:
            analysis = TextBlob(tweet.text,
                                analyzer=NaiveBayesAnalyzer(),
                                np_extractor=ConllExtractor())
            polarity = 'Positive'
            if analysis.sentiment.p_pos < 0.50:
                polarity = 'Negative'
            st.subheader("Sentiment Analysis and Topic of Interest")
            st.write("Tweet : ", tweet.text)
            st.write("Sentiment:", polarity)
            st.write("Confidence : Positive score: ",
                     analysis.sentiment.p_pos * 100,
                     " Negative score: ",
                     analysis.sentiment.p_neg * 100)
            st.write("Areas of interest: ", analysis.noun_phrases)
            st.subheader(
                "---------------------------------------------------------------------------"
            )

    st.sidebar.subheader("About App")
    st.sidebar.text("TSA with Streamlit")
    st.sidebar.info("Kudos to the Streamlit Team")
    st.sidebar.subheader("Developed By")
    st.sidebar.text("Sahrul ALom Choudhari")

def noun_phrases(query):
    # noun-phrase chunking
    # extractor = FastNPExtractor()
    extractor = ConllExtractor()
    blob = TextBlob(query, np_extractor=extractor)
    noun_phrases = blob.noun_phrases
    return noun_phrases

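# A minimal usage sketch for noun_phrases() above, assuming TextBlob and
# ConllExtractor are importable and the NLTK corpora that ConllExtractor
# depends on (conll2000, punkt) have been downloaded. The sample sentence
# and the output shown are illustrative only.
phrases = noun_phrases("TextBlob makes noun phrase extraction easy.")
print(phrases)  # e.g. a WordList such as ['textblob', 'noun phrase extraction']
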
def main():
    # FILENAME = "CellPhoneReview-1000.json"
    # print('Reading data...')
    # review_data = open(FILENAME).readlines()
    # document = [json.loads(d)['reviewText'] for d in review_data][0]
    document = ("These are awesome and make my phone look so stylish! "
                "I have only used one so far and have had it on for almost a year! "
                "CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!")
    print(document)

    nltk_tagger = NLTKTagger()
    extractor = ConllExtractor()
    blob = TextBlob(document, pos_tagger=nltk_tagger, np_extractor=extractor)
    print(blob.tags)
    print(blob.noun_phrases)

    pattern_tagger = PatternTagger()
    blob2 = TextBlob(document, pos_tagger=pattern_tagger, np_extractor=extractor)
    print(blob2.tags)
    print(blob2.noun_phrases)

    tagged = nltk.pos_tag(tokenize(document.lower()))
    print(tagged)

    grammar = ('''
    NP: {<DT>?(<RB.?>*<VB.?>*<NNPS?>+<NNS?>+ | <JJ>*<NNS?>+)}  # NP
    ''')
    chunkParser = nltk.RegexpParser(grammar)
    tree = chunkParser.parse(tagged)
    noun_phrases = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            noun_phrase = ' '.join([elem[0] for elem in subtree])
            noun_phrases.append(noun_phrase)
    print(noun_phrases)

def get_textblob_entities(text):
    extractor = ConllExtractor()
    blob = TextBlob(text, np_extractor=extractor)
    entities = []
    for entity in blob.noun_phrases:
        entities.append({'text': entity.strip()})
    return entities

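# A minimal usage sketch for get_textblob_entities(), assuming the same
# TextBlob/ConllExtractor setup as above; the input string and the exact
# phrases returned are illustrative, since they depend on the Conll model.
entities = get_textblob_entities("Python is a high-level programming language.")
print(entities)  # e.g. [{'text': 'python'}, {'text': 'high-level programming language'}]
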
def Preprocess(Reviews):
    extractor = ConllExtractor()
    text = TextBlob(Reviews, np_extractor=extractor)
    pos = []
    # spelling-correct each lowercased sentence;
    # singularize() / lemmatize() could also be applied here
    sents = [sent.lower().correct() for sent in text.sentences]
    for sent in sents:
        pos.append(sent.tags)
    return text.sentences, pos, text.noun_phrases

def test_overrides(self):
    b = tb.Blobber(tokenizer=SentenceTokenizer(),
                   np_extractor=ConllExtractor())
    blob = b("How now? Brown cow?")
    assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
    assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"]))
    blob2 = b("Another blob")
    # blobs have the same tokenizer
    assert_true(blob.tokenizer is blob2.tokenizer)
    # but aren't the same object
    assert_not_equal(blob, blob2)

def get_NP_extractor(self):
    """Return a TextBlob Blobber configured with the Conll NP extractor."""
    from textblob.np_extractors import ConllExtractor
    from textblob import Blobber
    extractor = ConllExtractor()
    tb = Blobber(np_extractor=extractor)
    return tb

def parse(string):
    extractor = ConllExtractor()
    # text = "open the browser"
    text = string

    # Commands that Dhwani can handle and their respective targets
    commands = [
        'open', 'close', 'play', 'start', 'pause', 'stop', 'increase',
        'increment', 'decrease', 'decrement', 'set', 'shutdown'
    ]
    actions = [
        'door', 'browser', 'song', 'music', 'player', 'brightness', 'volume'
    ]
    attributes = []

    blob = TextBlob(text)
    token_string = blob.words
    mood = blob.sentiment.polarity
    # print(mood)

    found = False
    e = 0.005  # To set the range for the neutral mood.
    for index in range(len(token_string)):
        if token_string[index] in commands:
            found = True
            # Command found; look ahead for the target
            for ahead in range(index, len(token_string)):
                if token_string[ahead] in actions:
                    # Call the execution function for (command, action)
                    # Update the index with ahead + 1
                    print("Add the execute command function")
                    index = ahead + 1
                    if token_string[ahead] in ('brightness', 'volume'):
                        # If brightness or volume, look for a third attribute
                        pass
        elif index == len(token_string) and not found:
            # Invalid input. Say, "Can you rephrase your words?"
            print("Can you please rephrase your words?")

    if -1 <= mood <= (0 - e):
        # Bad mood. Call a function to cheer him/her up. Songs/jokes
        stt.say("Call the mood, cheer man!")
    if (0 + e) <= mood <= 1:
        # Happy mood. Say, "looks like you are enjoying your day!"
        stt.say("Enjoy the party man!")
    if (0 - e) <= mood <= (0 + e):
        # Neutral sentiment output. Make a pun joke
        stt.say("Ohh cool")

def on_status(self, data):
    # print(data.text)
    analysis = TextBlob(data.text,
                        analyzer=NaiveBayesAnalyzer(),
                        np_extractor=ConllExtractor())
    # print("Confidence : Positive score: ", analysis.sentiment.p_pos * 100,
    #       " Negative score: ", analysis.sentiment.p_neg * 100)
    self.db.sports.insert({
        "name": data.user.name,
        "text": data.text,
        "created_at": data.created_at,
        "positive_score": analysis.sentiment.p_pos * 100,
        "negative_score": analysis.sentiment.p_neg * 100
    })
    return True

def __init__(self, text):
    self.text = text
    self.conll_extractor = ConllExtractor()
    self.topia_extractor = extract.TermExtractor()

    # Our custom tokenizer
    self.custom_sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()
    self.tokenized_sentences = self.custom_sent_tokenizer.tokenize(self.text)

    # This method will apply the stemmers to the sentences
    self.stemming()
    print(nltk.sent_tokenize(self.text))

    self.np_textblob()
    self.np_topia()

def parseContents(contentList):
    tupleList = []
    posTagger = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0",
                        "POSTagger", "en-pos-maxent.bin")
    chunker = OpenNLP("/home/rohith/nitk/apache-opennlp-1.6.0",
                      "ChunkerME", "en-chunker.bin")
    for item in contentList:
        attr = item[0]
        content = item[1]
        content = content.replace('\n', '')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            print('#' + sentence, file=sys.stderr)
            extractor = ConllExtractor()
            np = TextBlob(sentence, np_extractor=extractor).noun_phrases
            yield attr, np.lemmatize()

def __init__(self, list_of_sentences, default_np_extractor=None,
             regexp_grammer=None, if_postagged=False):
    """
    Args:
        list_of_sentences: a list of lists, each element being a
            pos-tagged sentence, e.g.
            [[('I', 'PRP'), ('went', 'VBD'), ('there', 'RB'),
              ('for', 'IN'), ('phirni', 'NN')], [], [], ...]

        default_np_extractor:
            if a list is passed, the noun phrases from the various
            np_extractors will be appended;
            if a string is passed, only the noun phrases from that
            np extractor will be appended.

            Options:
                regex_np_extractor
                regex_textblob_conll_np
                textblob_np_conll
                textblob_np_base
    """
    self.if_postagged = if_postagged
    self.noun_phrases = list()
    self.conll_extractor = ConllExtractor()
    self.topia_extractor = extract.TermExtractor()
    self.list_of_sentences = list_of_sentences
    self.np_extractor = (default_np_extractor
                         if default_np_extractor is not None
                         else "textblob_np_conll")
    if not regexp_grammer:
        self.regexp_grammer = r"CustomNounP:{<JJ|VB|FW|VBN>?<NN.*>*<NN.*>}"

    eval("self.{0}()".format(self.np_extractor))
    self.noun_phrases = {self.np_extractor: self.noun_phrases}
    return

blob.sentiment

# Tokenizers
from nltk.tokenize import TabTokenizer
tokenizer = TabTokenizer()
blob = TextBlob("This is\ta rather tabby\tblob.", tokenizer=tokenizer)
blob.tokens

# An alternative way: pass the tokenizer to TextBlob.tokenize()
from nltk.tokenize import BlanklineTokenizer
tokenizer = BlanklineTokenizer()
blob = TextBlob("A token\n\nof appreciation")
blob.tokenize(tokenizer)

# Noun phrase chunkers
from textblob.np_extractors import ConllExtractor
extractor = ConllExtractor()
blob = TextBlob("Python is a high-level programming language.",
                np_extractor=extractor)
blob.noun_phrases

# POS taggers
from textblob.taggers import NLTKTagger
nltk_tagger = NLTKTagger()
blob = TextBlob("Tag! You're It!", pos_tagger=nltk_tagger)
blob.pos_tags

# Parsers
from textblob.parsers import PatternParser
blob = TextBlob("Parsing is fun.", parser=PatternParser())
blob.parse()

# TextBlobs that share the same models
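# The comment above introduces blobs that share models; a minimal sketch using
# textblob.Blobber, the factory TextBlob provides for exactly that purpose.
# The sample strings are illustrative only.
from textblob import Blobber

tb = Blobber(pos_tagger=NLTKTagger())
blob1 = tb("This is one blob.")
blob2 = tb("This is another blob.")
blob1.pos_tagger is blob2.pos_tagger  # True: both blobs reuse one tagger instance
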
import sys
import operator  # needed by extractKeywords() below
import codecs

from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
from textblob.taggers import NLTKTagger
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer

from rake import RakeKeywordExtractor
from stopwordList import getList

## GLOBAL VARIABLES
top_fraction = 1
LEMMA_OBJ = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
nltk_tagger = NLTKTagger()
stopwords = getList()
COLL_OBJ = ConllExtractor()


def extractKeywords(phrase_list):
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(),
                                  key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return [x[0] for x in sorted_phrase_scores[0:int(n_phrases)]]


def extractChunks(CONTENT):
    BLOB_OBJ = TextBlob(CONTENT,

auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

print('Choose an option (1 or 2): ')
print('1. Choose a topic to search tweets for. ')
print('2. Choose a Twitter Username to search tweets for. ')
input_data = input()

if input_data == '1':
    print('Enter a topic: ')
    topic_name = input()
    new_tweets = api.search(q=topic_name)
    for tweet in new_tweets:
        analysis = TextBlob(tweet.text,
                            analyzer=NaiveBayesAnalyzer(),
                            np_extractor=ConllExtractor())
        polarity = 'Positive'
        if analysis.sentiment.p_pos < 0.50:
            polarity = 'Negative'
        print("Sentiment Analysis and Topic of Interest")
        print("Tweet : ", tweet.text)
        print("Sentiment:", polarity)
        print("Confidence : Positive score: ",
              analysis.sentiment.p_pos * 100,
              " Negative score: ",
              analysis.sentiment.p_neg * 100)
        print("Areas of interest: ", analysis.noun_phrases)
        print(
            "---------------------------------------------------------------------------"
        )
else:
    print('2. Enter a Twitter Username to search tweets for: ')

def post(self):
    cprint(figlet_format("Now executing %s" % self.__class__.__name__,
                         font='mini'), attrs=['bold'])
    text = self.get_argument("text")
    link = self.get_argument("link")
    tokenizer = None
    conll_extractor = ConllExtractor()
    topia_extractor = extract.TermExtractor()

    if link:
        print "Link is present, so have to run goose to extract text"
        print link

    text = text.replace("\n", "")
    if not tokenizer:
        tokenizer = SentenceTokenizationOnRegexOnInterjections()
        result = tokenizer.tokenize(text)
    else:
        result = nltk.sent_tokenize(text)

    tags = TAG_CLASSIFIER_LIB.predict(result)
    sentiments = SENTI_CLASSIFIER_LIB_THREE_CATEGORIES.predict(result)

    def assign_proba(__list):
        return {
            "mixed": round(__list[0], 2),
            "negative": round(__list[1], 2),
            "neutral": round(__list[2], 2),
            "positive": round(__list[3], 2),
        }

    sentiment_probabilities = map(
        assign_proba,
        SENTI_CLASSIFIER_LIB_THREE_CATEGORIES.predict_proba(result))

    new_result = list()
    for sentence, tag, sentiment, probability in zip(
            result, tags, sentiments, sentiment_probabilities):
        try:
            subcategory = list(
                eval('{0}_SB_TAG_CLASSIFIER_LIB.predict(["{1}"])'.format(
                    tag[0:4].upper(), sentence)))[0]
        except:
            subcategory = None

        # probability is a dict of label -> score, so compare its values
        if max(probability.values()) < .7:
            polarity_result = "can't decide"
        else:
            polarity_result = "decided"

        file_name, dependencies, indexeddependencies = save_tree(sentence)
        if file_name:
            with open(file_name, "rb") as image_file:
                encoded_string = base64.b64encode(image_file.read())
        else:
            encoded_string = None

        blob = TextBlob(sentence)
        tb_nps = list(blob.noun_phrases)
        blob = TextBlob(sentence, np_extractor=conll_extractor)
        tb_conll_nps = list(blob.noun_phrases)
        te_nps = [e[0] for e in topia_extractor(sentence)]
        print sentence, dependencies, "\n"

        new_result.append({
            "sentence": sentence,
            "encoded_string": encoded_string,
            "polarity": sentiment,
            "sentiment_probabilities": probability,
            "dependencies": dependencies,
            "indexeddependencies": indexeddependencies,
            "polarity_result": polarity_result,
            "noun_phrases": ["a", "b", "c"],
            "tag": tag,
            "tb_conll_nps": tb_conll_nps,
            "te_nps": te_nps,
            "subcategory": subcategory
        })

    self.write({
        "success": True,
        "error": False,
        "result": new_result,
    })
    self.finish()
    return

def test_can_pass_np_extractor_to_constructor(self):
    e = ConllExtractor()
    blob = tb.TextBlob('Hello world!', np_extractor=e)
    assert_true(isinstance(blob.np_extractor, ConllExtractor))

def test_can_use_different_np_extractors(self):
    e = ConllExtractor()
    text = "Python is a high-level scripting language."
    blob = tb.TextBlob(text)
    blob.np_extractor = e
    assert_true(isinstance(blob.np_extractor, ConllExtractor))

def words(text):
    list_digit_words = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
        'eight', 'nine'
    ]
    extractor = ConllExtractor()
    blob = TextBlob(text, np_extractor=extractor)
    noun_phrases = blob.noun_phrases
    # print(noun_phrases)

    print('The voicemail is from ', end='')
    for np in noun_phrases:
        npp = np.split(' ')
        if len(npp) <= 2:
            if np != 'phone number' and np != 'good day' and np != 'great day':
                if np not in list_digit_words:
                    # if len(np) <= 3:
                    print(np.title() + ' ', end='')

    verbs = list()
    for word, tag in blob.tags:
        if tag == 'VB':
            verbs.append(word.lemmatize())
    # print(verbs)

    verbs_l = list()
    for i in verbs:
        i_l = i.lower()
        verbs_l.append(i_l)

    nouns = list()
    # print(verbs_l)
    if 'please' in verbs_l:
        next_word = verbs_l[verbs_l.index('please') + 1]
        print("Please" + ' ' + next_word + '\n', end='')
        if next_word == 'call':
            print(' regarding ', end='')
        for word, tag in blob.tags:
            if tag == 'NN':
                nouns.append(word.lemmatize())
        # print(nouns)
        num = len(nouns)
        for item in random.sample(nouns, num):
            word = Word(item)
            if word != 'phone' and word != 'number' and word != 'name':
                # if word != 'number':
                print(word, end=' ')
    else:
        print('Please \n ', end='')
        for verb in verbs_l:
            print(verb, end=' ')
        for word, tag in blob.tags:
            if tag == 'NN':
                nouns.append(word.lemmatize())
        # print(nouns)
        # print("\nThe voicemail is about ", end='')
        num = len(nouns)
        for item in random.sample(nouns, num):
            word = Word(item)
            if word != 'phone' and word != 'number' and word != 'name':
                if word not in list_digit_words:
                    # if word != 'number':
                    print(word, end=' ')

def extract_named_entities(user_input):
    """
    This method extracts 'named entities' from user input text.

    :param user_input: the user input text to extract named entities from
    :type user_input: string
    :return artists_tracks_albums
    :rtype list
    """
    # Targets and extracts named entities by filtering extracted noun phrases
    # based on the POS tags of the words they contain.
    artists_tracks_albums = []

    # Using the Conll noun phrase extractor
    search_query_conllextractor = TextBlob(user_input,
                                           np_extractor=ConllExtractor())
    for noun_phrase in search_query_conllextractor.noun_phrases:
        np = TextBlob(noun_phrase)
        for np_word, np_word_tag in np.tags:
            for search_query_word, search_query_word_tag in search_query_conllextractor.tags:
                if search_query_word.lower() == np_word.lower():
                    # Gets the original POS tag back, as breaking down the noun
                    # phrase for use as a TextBlob object re-tags its words
                    # incorrectly.
                    np_word_tag = search_query_word_tag
                    # The '&' and '+' checks cover things like R&B, where '&'
                    # would be an np_word.
                    if np_word_tag != "NNP" and np_word_tag != "NNPS" and np_word != "&" and np_word != "+":
                        # Checks whether the word has already been removed,
                        # e.g. in the query "I like music. Give me rap music."
                        # this prevents an error from trying to remove 'music'
                        # twice if it is only in one extracted noun phrase.
                        if search_query_word.lower() in np.words:
                            np.words.remove(search_query_word.lower())  # np.words are lowercase
        if len(np.words) != 0:
            artists_tracks_albums.append(' '.join(np_word for np_word in np.words))

    # Using the FastNP noun phrase extractor - TextBlob's default
    search_query_fastnpextractor = TextBlob(user_input)
    for noun_phrase in search_query_fastnpextractor.noun_phrases:
        np = TextBlob(noun_phrase)
        for np_word, np_word_tag in np.tags:
            for search_query_word, search_query_word_tag in search_query_fastnpextractor.tags:
                if search_query_word.lower() == np_word.lower():
                    np_word_tag = search_query_word_tag
                    if np_word_tag != "NNP" and np_word_tag != "NNPS" and np_word != "&" and np_word != "+":
                        if search_query_word.lower() in np.words:
                            np.words.remove(search_query_word.lower())
        if np.words not in artists_tracks_albums and len(np.words) != 0:
            artists_tracks_albums.append(' '.join(np_word for np_word in np.words))

    for item in artists_tracks_albums:
        # Matches words like 'r b', which should be 'r&b': the '&' was getting
        # removed in the join, although recognised as an np_word and therefore
        # it should have been joined.
        if re.match(r"\b([a-z]) (?=[a-z]\b)", item):
            item_index = artists_tracks_albums.index(item)
            artists_tracks_albums[item_index] = item.replace(" ", "&")

    artists_tracks_albums = list(dict.fromkeys(artists_tracks_albums))  # deletes any duplicates
    artists_tracks_albums = [item.lower() for item in artists_tracks_albums]  # makes all lowercase
    return artists_tracks_albums

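# A hedged usage sketch for extract_named_entities(); the query and the result
# shown are illustrative, since the output depends on both NP extractors and
# the POS tagger.
print(extract_named_entities("Play me something by Kendrick Lamar"))
# e.g. ['kendrick lamar']
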
from nltk.corpus import wordnet as wn
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
from textblob.np_extractors import ConllExtractor

from utils import flatten

STOPS = stopwords.words('english')
EN_US_DICT = enchant.Dict("en_US")
EN_GB_DICT = enchant.Dict("en_GB")
PORTER = PorterStemmer()
WN_LEMMATIZER = WordNetLemmatizer()
SENTENCE_DETECTOR = data.load('tokenizers/punkt/english.pickle')
TAGGER = PerceptronTagger()
EXTRACTOR = ConllExtractor()


def get_fast_blob(text):
    return TextBlob(text, pos_tagger=TAGGER, np_extractor=EXTRACTOR)


def get_blob(text):
    return TextBlob(text, np_extractor=EXTRACTOR)


def tokenize_string(doc):
    return word_tokenize(doc)


def sentence_tokenize(s):

def get_noun_phrase(sentence):
    extractor = ConllExtractor()
    blob = TextBlob(sentence, np_extractor=extractor)
    print(blob.noun_phrases)

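# A minimal usage sketch for get_noun_phrase(); the sentence is illustrative
# and the phrases printed depend on the ConllExtractor model (NLTK's conll2000
# corpus must be downloaded first).
get_noun_phrase("The Eiffel Tower is a famous landmark in Paris.")
# e.g. prints a WordList like ['eiffel tower', 'famous landmark']
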
def tweets():
    connection = httplib.HTTPSConnection('parseapi.back4app.com', 443)
    params = urllib.urlencode({
        "where": json.dumps({"manualTwitterURL": {"$ne": ""}}),
        "include": "user",
        "keys": "manualTwitterURL,user.objectId"
    })
    connection.connect()
    connection.request(
        'GET', '/classes/Lead?%s' % params, '', {
            "X-Parse-Application-Id": "9LT6MCUSdT4mnzlNkG2pS8L51wvMWvugurQJnjwB",
            "X-Parse-REST-API-Key": "6gwEVURQBIkh9prcc3Bgy8tRiJTFYFbJJkQsB45w"
        })
    result = json.loads(connection.getresponse().read())

    twitterURL = []
    leadid = []
    userid = []
    oldest = {}
    for i in range(0, len(result['results'])):
        twitterURL.append(result['results'][i]['manualTwitterURL'])
        leadid.append(result['results'][i]['objectId'])
        userid.append(result['results'][i]['user']['objectId'])

    for i in range(0, len(twitterURL)):
        alltweets = []
        oldestid = json.load(open("lastTweetId.txt"))
        try:
            oldestid = oldest.get(twitterURL[i])
        except IndexError:
            oldestid = '0'
        if oldestid == '0':
            new_tweets = api.user_timeline(screen_name=twitterURL[i], count=20)
        else:
            new_tweets = api.user_timeline(screen_name=twitterURL[i],
                                           count=20, since_id=oldestid)
        alltweets.extend(new_tweets)

        for tweet in alltweets:
            analysis = TextBlob(tweet.text,
                                analyzer=NaiveBayesAnalyzer(),
                                np_extractor=ConllExtractor())
            if analysis.sentiment.p_pos > 0.75:
                polarity = 'Positive'
            elif analysis.sentiment.p_neg > 0.75:
                polarity = 'Negative'
            else:
                polarity = 'Neutral'
            oldest[twitterURL[i]] = tweet.id

            if (analysis.sentiment.p_pos > 0.70
                    or analysis.sentiment.p_neg > 0.70):
                try:
                    interestTopic = analysis.noun_phrases[0]
                except IndexError:
                    interestTopic = 'null'
                connection = httplib.HTTPSConnection('parseapi.back4app.com', 443)
                connection.connect()
                connection.request(
                    'POST', '/classes/Insight',
                    json.dumps({
                        "user": {
                            "__type": "Pointer",
                            "className": "_User",
                            "objectId": userid[i]
                        },
                        "lead": {
                            "__type": "Pointer",
                            "className": "Lead",
                            "objectId": leadid[i]
                        },
                        "type": "topic",
                        "confidence": analysis.sentiment.p_pos * 100,
                        "tweet": tweet.text,
                        "insight": polarity,
                        "tweetId": tweet.id,
                        "interestTopic": interestTopic,
                        "description": "insight"
                    }), {
                        "X-Parse-Application-Id": "9LT6MCUSdT4mnzlNkG2pS8L51wvMWvugurQJnjwB",
                        "X-Parse-REST-API-Key": "6gwEVURQBIkh9prcc3Bgy8tRiJTFYFbJJkQsB45w",
                        "Content-Type": "application/json"
                    })

    json.dump(oldest, open("lastTweetId.txt", "w"))
    return 'Successfully added data to Insights!'

def extract_phrase(text):
    """Take a preprocessed tweet and return a dict of noun phrases with their counts as values."""
    extractor = ConllExtractor()
    # pass the extractor to TextBlob so it is actually used
    blob = TextBlob(text, np_extractor=extractor)
    return dict(blob.np_counts)

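# A minimal usage sketch for extract_phrase(); the tweet text is illustrative
# and the exact counts returned depend on the ConllExtractor model.
counts = extract_phrase("great camera and great battery life on this phone")
print(counts)  # e.g. {'great camera': 1, 'great battery life': 1}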