Example #1

from textblob import Word  # Word(...).definitions is used in the pruning step below

# Check out noun phrases; they will be useful for frequent feature extraction
comments.noun_phrases


# In[9]:


#compactness pruning:
cleaned = list()
for phrase in comments.noun_phrases:
    count = 0
    for word in phrase.split():
        # Count the number of small words and words without an English definition
        if len(word) <= 2 or (not Word(word).definitions):
            count += 1
    # Keep the phrase only if short or 'nonsensical' words make up less than
    # 40% (arbitrary threshold) of it; phrases failing this check are pruned.
    if count < len(phrase.split())*0.4:
        cleaned.append(phrase)
        
print("After compactness pruning:\nFeature Size:")
len(cleaned)


# In[10]:


for phrase in cleaned:    
    match = list()
Example #2
from textblob import TextBlob, Word
from textblob.wordnet import Synset, VERB

# `wiki` is assumed to be a TextBlob created earlier in the original snippet
print(wiki.tags)
print(wiki.noun_phrases)
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print(testimonial.sentiment)
zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")
print(zen.words)
print(zen.sentences)
for sentence in zen.sentences:
    print(sentence.sentiment)
sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words)
print(sentence.words[2].singularize())
print(sentence.words[-1].pluralize())
w = Word("octopi")
print(w.lemmatize())
w = Word("went")
print(w.lemmatize("v"))
word = Word("octopus")
print(word.synsets)
print(Word("hack").get_synsets(pos=VERB))
print(Word("octopus").definitions)
octopus = Synset('octopus.n.02')
shrimp = Synset('shrimp.n.03')
print(octopus.path_similarity(shrimp))
animals = TextBlob("cat dog octopus")
print(animals.words.pluralize())
b = TextBlob("I havv goood speling!")
print(b.correct())
w = Word('falibility')
Example #3
## Removing rare words

import nltk
import pandas as pd
from sklearn import model_selection, preprocessing

sil = pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"] = df["text"].apply(
    lambda x: " ".join(x for x in x.split() if x not in sil))

## Lemmatization

from textblob import Word
nltk.download("wordnet")

df["text"] = df["text"].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

## Creating the train-test split

train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df["text"], df["label"], random_state=1)

encoder = preprocessing.LabelEncoder()

# fit on the training labels, then encode the test labels with the same mapping
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

##print(train_y[0:5])

##print(test_y[0:5])
Example #4
import random

from textblob import TextBlob, Word
from textblob.np_extractors import ConllExtractor


def words(text):
    list_digit_words = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine'
    ]
    extractor = ConllExtractor()
    blob = TextBlob(text, np_extractor=extractor)
    noun_phrases = blob.noun_phrases
    #print(noun_phrases)
    print('The voicemail is from ', end='')
    for np in noun_phrases:
        npp = np.split(' ')
        if len(npp) <= 2:

            if np != 'phone number' and np != 'good day' and np != 'great day':
                if np not in list_digit_words:

                    #if len(np)<=3:
                    print(np.title() + ' ', end='')

    verbs = list()
    for word, tag in blob.tags:
        if tag == 'VB':
            verbs.append(word.lemmatize())
    #print(verbs)

    verbs_l = list()

    for i in verbs:
        i_l = i.lower()
        verbs_l.append(i_l)
    nouns = list()
    #print(verbs_l)
    if 'please' in verbs_l:
        next_word = verbs_l[verbs_l.index('please') + 1]
        print("Please" + ' ' + next_word + '\n', end='')
        if next_word == 'call':
            print(' regarding ', end='')

            for word, tag in blob.tags:
                if tag == 'NN':
                    nouns.append(word.lemmatize())
#print(nouns)
            num = len(nouns)
            for item in random.sample(nouns, num):
                word = Word(item)
                if (word != 'phone' and word != 'number' and word != 'name'):
                    #if (word!='number'):
                    print(word, end=' ')
    else:
        print('Please \n ', end='')
        for verb in verbs_l:
            print(verb, end=' ')
        for word, tag in blob.tags:
            if tag == 'NN':
                nouns.append(word.lemmatize())


#print(nouns)

#print("\nThe voicemail is about ", end='')
        num = len(nouns)
        for item in random.sample(nouns, num):
            word = Word(item)
            if (word != 'phone' and word != 'number' and word != 'name'):
                if word not in list_digit_words:
                    #if (word!='number'):
                    print(word, end=' ')
Example #5
from textblob import TextBlob
from textblob import Word
s1 = " The Game Of Thrones"
s2 = " Winter is comming in thrones"
s3 = " White walker are comming to kill all"
d1 = TextBlob(s1)
d2 = TextBlob(s2)
d3 = TextBlob(s3)

word = Word("thrones")
doclist = [d1, d2, d3]
for doc in doclist:
    wordlist = doc.words
    #noofwords=len(wordlist);

    for word in wordlist:

        tf = doc.words.count(word)

        print(tf)
        print(word + " occurs " + str(tf) + " times")

#ntf=tf/noofwords;
#print(ntf)
Example #6
from textblob import TextBlob, Word
from textblob.wordnet import VERB

raw_query = "Physics is a better subject to study than Mathematics. I like Physics more than I like Mathematics. Physicists are more intelligent than Mathematicians."

# Get input ready for use
query = TextBlob(raw_query)
print('Query: ', query)
tags = query.tags
print('Tags: ', tags)
nouns = query.noun_phrases
print('Nouns: ', nouns)
sentiment = query.sentiment
print('Sentiment: ', sentiment)
words = query.words
print('Words: ', words)
sentences = query.sentences
print('Sentences: ', sentences)
parse = query.parse()
print('Parse: ', parse)
# NOTE: detect_language() and translate() were removed in recent TextBlob releases
language = query.detect_language()
print('Language: ', language)
# TODO : add spelling checks to correct the input sentences for better searches
corrected = query.correct()
print('Corrected: ', corrected)

# Search for results
w = Word('Octopus')
print('\nSynsets: ', w.synsets)
print('\nDefinitions: ', w.definitions)
print(Word("hack").get_synsets(pos=VERB))

import re


# NOTE: the definition line of this helper was cut off in the source; the
# signature below is reconstructed from the call further down.
def remove_special_characters(text, remove_digits=False):
    # [A-z] also matches [, \, ], ^, _ and backtick, so spell out [a-zA-Z]
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


print(
    remove_special_characters("Well this was fun! What do you think? 123#@!",
                              remove_digits=True))

###########################################
######## STEP 7. Correct Spelling #########
###########################################

from textblob import Word

w = Word('fianlly')
print(w.correct())

w = Word('flaot')
print(w.spellcheck())

###########################################
############ STEP 8. Stemming #############
###########################################

# there are several stemmers: PorterStemmer, LancasterStemmer, etc.
# I will go with SnowballStemmer because I know it
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")
print(ss.stem("jumping"))
Example #8
import re

import numpy as np
import pandas as pd
from textblob import Word
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# NOTE: the definition line of this helper was cut off in the source; the name
# is taken from the call further down.
def clean_str(string):
    string = re.sub(r"\?", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[0-9]\w+|[0-9]", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


data = pd.read_csv('../dataset/dataset.csv')
x = data['news'].tolist()
y = data['type'].tolist()

for index, value in enumerate(x):
    print("processing data:", index)
    x[index] = ' '.join(
        [Word(word).lemmatize() for word in clean_str(value).split()])

vect = TfidfVectorizer(stop_words='english', min_df=2)
X = vect.fit_transform(x)
Y = np.array(y)

print("no of features extracted:", X.shape[1])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)

print("train size:", X_train.shape)
print("test size:", X_test.shape)
Example #9
Tokenization refers to dividing text into a sequence of words or sentences. In our example, we use the textblob library to first transform our tweets into a blob and then convert them into a series of words.
"""

TextBlob(combi['tweet2'][1]).words
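
# A minimal, self-contained illustration of the same tokenization step (the
# sample sentence below is made up; only TextBlob itself is assumed):
from textblob import TextBlob

sample = TextBlob("Tokenization splits text into words. It can also split it into sentences.")
print(sample.words)      # a WordList of the individual tokens, punctuation stripped
print(sample.sentences)  # a list with two Sentence objects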

import nltk
nltk.download('wordnet')

"""#Lemmatization

Lemmatization is a more effective option than stemming because it converts the word into its root form rather than just stripping suffixes. It uses a vocabulary and morphological analysis to obtain the root word, so we usually prefer lemmatization over stemming.
"""

from textblob import Word
combi['tweet3'] = combi['tweet2'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
combi['tweet3'].head()
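
# A minimal sketch of what the lambda above does for a single word, assuming the
# WordNet corpus has been downloaded via nltk.download('wordnet'):
print(Word("leaves").lemmatize())      # 'leaf'  (default part of speech is noun)
print(Word("running").lemmatize("v"))  # 'run'   (pass "v" to lemmatize as a verb)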

import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = combi['tweet3'].to_string()

wordcloud = WordCloud(width=1600, height=800,max_font_size=200).generate(text)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

neg_tweets = combi[combi.sentiment == 0  ]
neg_string = []
for t in neg_tweets.tweet3:
Example #10
object_synsets = json.load(open(config.object_synset_path, 'r'))
attribute_synsets = json.load(open(config.attribute_synset_path, 'r'))

# synsets['crisp'] = 'chip.n.04'
synsets = {}
for key, val in object_synsets.items():
    synsets[key] = val
for key, val in attribute_synsets.items():
    if key not in synsets:
        synsets[key] = val

answer_dict = cPickle.load(open(config.answer_dict_path, 'rb'))

vocab_with_synset = []
for v in answer_dict['vocab']:
    v_w = Word('_'.join(v.split()))
    if len(v_w.synsets) > 0:
        synsets[v] = v_w.synsets[0].name()
    if v in synsets:
        vocab_with_synset.append(v)

hypernym_set = set()
vocab_hypernyms = {}
max_depth = defaultdict(int)
for v in tqdm(vocab_with_synset, desc='make hypernymset'):
    # Synset('chip.n.04')
    word_synset = wn.synset(synsets[v])
    # set([Synset('nutriment.n.01'), Synset('food.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01'), Synset('matter.n.03'), Synset('dish.n.02'), Synset('snack_food.n.01'), Synset('substance.n.07'), Synset('chip.n.04')])
    hypernyms = []
    hypernym2distance = {}
    for hypernym, distance in list(word_synset.hypernym_distances()):
Example #11
data = data.drop(data[data.sentiment == 'hate'].index)
data = data.drop(data[data.sentiment == 'neutral'].index)
data = data.drop(data[data.sentiment == 'worry'].index)

# Making all letters lowercase
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing Punctuation, Symbols
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Removing Stop Words using NLTK
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#Lemmatisation
data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
#Correcting Letter Repetitions

def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

# Find the 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
Example #12
def Lemmatize(data):
    data['Requirement'] = data['Requirement'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    return data
Example #13
File: nlp_2.py  Project: KennedyTurner1/nlp
from pathlib import Path
from textblob import TextBlob

# load the entire novel into a TextBlob object
blob = TextBlob(Path('RomeoAndJuliet.txt').read_text())

#print(blob.words.count('joy'))  # count occurrences of the word 'joy' in the text of R&J

#print(blob.noun_phrases.count('lady capulet'))  # the phrase 'lady capulet' is mentioned 46 times

# synonyms and antonyms of words

from textblob import Word

happy = Word('happy')

#print(happy.definitions)  # gives the dictionary definitions of 'happy'

print(happy.synsets)
Example #14
import re

import mechanize
import pandas as pd
from bs4 import BeautifulSoup
from textblob import Word

# NOTE: `languages` (the list of language names searched for below) is assumed
# to be defined elsewhere in the original module.


def export_language_file(csvfile_lang):

    words = pd.read_csv(csvfile_lang)
    wordlist = list(words['word'].values)

    for cc,the_word in enumerate(wordlist):

        # We want to look for the root word first.
        # If we don't find anything,
        # try removing any common suffixes.

        found_result = False


        # ------------
        # Original word

        browser = mechanize.Browser()
        response = browser.open('http://www.etymonline.com/')
        browser.select_form(nr=0)
        try:
            browser['search'] = the_word
        except TypeError:        
            pass
        resp = browser.submit()
        html_doc = resp.read()
        
        # now use beautifulsoup to go through resp.read()

        soup = BeautifulSoup(html_doc, 'html.parser')
        matching_words = {}
    
        # dt = dictionary term
        dts = soup.find_all('dt')
    
        # dd = dictionary definition
        dds = soup.find_all('dd')

        # Look for original word 
        # in each search result
        # (searching for original word)
        for dt,dd in zip(dts,dds):

            # get the text of the term
            # in the form:
            # this (n.)
            # that (n.)
            # foo (v.)
            # bar (adj.)
            this_word_full = dt.get_text()

            # remove (n./adj./v.) 
            this_word = this_word_full.split(' ')[:-2]
            this_word = ''.join(this_word)

            if the_word.lower()==this_word.lower():
                # We have an exact match!

                # key is the_word 
                # or 
                # key is this_word

                # get etymology from the dd tag
                # corresponding to our dt tag
                etym = dd.get_text()

                # etymology is the value
                matching_words[this_word] = etym

                # yay!
                found_result = True

                break


        # Now check for common suffixes...
        if found_result == False:

            # pick out synonyms from the synset that are basically the same word (share first 3 letters)
            # (and also strip them of part of speech,
            #  n., v., and so on...)
            #
            # synsets look like:
            # swing.n.04
            # 
            # so use .split('.')[0] (first token before .)
            #

            try:
                synset = Word(the_word).synsets
            except Exception:
                # use an empty list so the comprehensions below don't crash
                synset = []
            synset_strings = [syn.name().split('.')[0] for syn in synset]

            final_synset = []
            for ss,sss in zip(synset,synset_strings):
                if sss[:3] == the_word[:3]:
                    final_synset.append(sss)

            final_synset = list(set(final_synset))

            # Look for synonyms
            # in each search result
            # (searching for each synonym)
            for dt,dd in zip(dts,dds):

                # get the text of the term
                this_word_full = dt.get_text()

                # remove (n./adj./v.) 
                this_word = this_word_full.split(' ')[:-2]
                this_word = ''.join(this_word)

                for syn in final_synset:
                    if syn.lower() == this_word.lower():
                        # We have an exact match!

                        # key is the_word 
                        # or 
                        # key is this_word

                        # get etymology from the dd tag
                        # corresponding to our dt tag
                        etym = dd.get_text()

                        # etymology is the value
                        matching_words[this_word] = etym

                        # yay!
                        found_result = True

                        break


        # if found_result is False,
        #    FAIL...!!! 
        #    we are skipping this word
        # 
        # 
        # but otherwise...
        #     we extract etymology info!
        if found_result:

            # remember:
            # this_word and the_word are all lowercase...
            # just to avoid confusion.

            # 
            # avoid false positives...
            if the_word in matching_words.keys():

                etymology = matching_words[the_word]

                # create a grid with location (in etymology) of each language's reference.
                ## old way (incorrect, 'Germanic' always flags 'German' too)
                #etymology_grid = [etymology.index(lang) if lang in etymology else -1 for lang in languages]
                etymology_grid = []

                for lang in languages:

                    # need to do re
                    # need to compile these in a list

                    m = re.search(lang+r'\b', etymology)

                    # double check if N is actually Old N
                    n = re.search('Old '+lang+r'\b', etymology)

                    if (m and not n):
                        etymology_grid.append( m.start() )
                    else:
                        etymology_grid.append(-1)

                # each word is tagged with whatever language 
                # is referenced FIRST in the etymology

                # get whole grid
                etymology_grid_gt0 = [eg for eg in etymology_grid if eg >= 0 ]

                # check whether there IS a result
                if len(etymology_grid_gt0)>0:

                    # etymology_grid_gt0 contains the ranked order of each language. min means it comes first.
                    #
                    # (Pdb) p etymology_grid
                    # [-1, 239, 227, -1, 188, 184, 71, 71, 138, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1]
                    #
                    # (Pdb) etymology_grid_gt0
                    # [239, 227, 188, 184, 71, 71, 138, 0]
                    #
                    # (Pdb) p [languages[ii] for ii in range(len(etymology_grid)) if etymology_grid[ii] > 0]
                    # ['Greek', 'Latin', 'Norse', 'Old Norse', 'German', 'Dutch']
                    #
                    sorted_etymology_grid = sorted(etymology_grid_gt0)

                    ranking = []
                    for ii,et in enumerate(etymology_grid):

                        if et>=0:
                            val = sorted_etymology_grid.index(et)
                        else:
                            val = -1

                        ranking.append(val)

                    ranked_langs = [languages[r] for r in ranking if r > -1] 

                    try:
                        language1_name = ranked_langs[0]
                    except:
                        language1_name = ''

                    try:
                        language2_name = ranked_langs[1]
                    except:
                        language2_name = ''

                    #first_lang_ref = min(etymology_grid_gt0)
                    #last_lang_ref = max(etymology_grid_gt0)
    
                    print ""
                    print "Tagging word %d of %d: %s"%(cc,len(wordlist),the_word)

                    print the_word,":",language1_name

                    words.loc[words['word']==the_word,'root language'] = language1_name
                    words.loc[words['word']==the_word,'second language'] = language2_name
                    words.loc[words['word']==the_word,'ranked languages'] = ",".join(ranked_langs)

        if cc % 50 == 0:
            print("Exporting to file...")
            words.to_csv(csvfile_lang, na_rep="")
            print("done")

    print("Exporting to file...")
    words.to_csv(csvfile_lang, na_rep="")
    print("done")
data.head()

stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data1['text'] = data1['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head()

##### Removing punctuation

data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)
data1['text'] = data1['text'].str.replace(r'[^\w\s]', '', regex=True)

nltk.download('wordnet')
from textblob import Word

data['text']  = data['text'] .apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data.head()

data1['text']  = data1['text'] .apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data1.head()

##Remove special characters
import re

data['text']=data['text'].apply(lambda x :re.sub(r'\W+', ' ', x))
data1['text']=data1['text'].apply(lambda x :re.sub(r'\W+', ' ', x))

data.head()

####Remove numbers
data['text'] = data['text'].str.replace(r'\d+', '', regex=True)
Example #16
File: 11_02.08.py  Project: gvc2000/curso
# Section 11.2.8 snippets
from textblob import Word

index = Word('index')

index.pluralize()

cacti = Word('cacti')

cacti.singularize()

from textblob import TextBlob

animals = TextBlob('dog cat fish bird').words

animals.pluralize()



##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their     #
# best efforts in preparing the book. These efforts include the          #
# development, research, and testing of the theories and programs        #
# to determine their effectiveness. The authors and publisher make       #
# no warranty of any kind, expressed or implied, with regard to these    #
# programs or to the documentation contained in these books. The authors #
# and publisher shall not be liable in any event for incidental or       #
# consequential damages in connection with, or arising out of, the       #
# furnishing, performance, or use of these programs.                     #
##########################################################################
Example #17
 async def meaning(self, ctx, defword):
     "defines a word"
     definitions = Word(defword).definitions
     await ctx.send(definitions)
Example #18
def analyze_querydata(read_file, write_file):
    count = 0
    msg = ""
    error = ""
    start = time.time()

    filename = read_file
    filename2 = write_file

    alltweets = csv.reader(open(filename, 'r'))

    with open(filename2, 'w', newline='') as analyzetweets:
        writer = csv.writer(analyzetweets)
        blob_words = set()
        # while True:
        for row in alltweets:
            count = count + 1
            print(str(count))
            try:
                tweet_string = row[1]
                lang_predict = multinomial.predict(tweet_string)
                blob = TextBlob(tweet_string)
                if lang_predict != 'ENGLISH':
                   received_text2 = blob.translate(to='en')
                else:
                   received_text2 = blob

                pass
            except urllib.error.URLError as urlE:
                error = urlE
                print(error)

                received_text = tweet_string

                number_of_tokens = len(list(blob.words))
                number_of_tokens = number_of_tokens

                time_created = row[0]
                blob_words = list(blob.words)
                writer.writerow([count, error, time_created, received_text, number_of_tokens, blob_words])
            except socket.timeout as socketE:
                error = socketE
                print(error)

                received_text = tweet_string

                number_of_tokens = len(list(blob.words))
                number_of_tokens = number_of_tokens

                time_created = row[0]
                blob_words = list(blob.words)
                writer.writerow([count, error, time_created, received_text, number_of_tokens, blob_words])
            except exceptions.NotTranslated as NT:
                error = NT
                print(error)
                received_text = tweet_string

                number_of_tokens = len(list(blob.words))
                number_of_tokens = number_of_tokens

                time_created = row[0]
                blob_words = list(blob.words)
                writer.writerow([count, error, time_created, received_text, number_of_tokens, blob_words])
            except exceptions.TranslatorError as TE:
                error = TE
                print(error)
                received_text = tweet_string

                number_of_tokens = len(list(blob.words))
                number_of_tokens = number_of_tokens

                time_created = row[0]
                blob_words = list(blob.words)
                writer.writerow([count, error, time_created, received_text, number_of_tokens, blob_words])
            else:
                # using textBlob for NLP of languages other than Malay
                blob_sentiment, blob_subjectivity = received_text2.sentiment.polarity, received_text2.sentiment.subjectivity
                textblob_sentiment = ''
                if blob.sentiment.polarity > 0:
                    textblob_sentiment = 'positive'
                elif blob.sentiment.polarity < 0:
                    textblob_sentiment = 'negative'
                elif blob.sentiment.polarity == 0.0:
                    textblob_sentiment = 'neutral'

                number_of_tokens = len(list(blob.words))

                # Extracting Main Points
                nouns = list()
                for word, tag in received_text2.tags:
                    if tag == 'NN':
                        nouns.append(word.lemmatize())
                        len_of_words = len(nouns)
                        rand_words = random.sample(nouns, len(nouns))
                        final_word = list()
                        for item in rand_words:
                            word = Word(item).pluralize()
                            final_word.append(word)
                            summary = final_word
                            end = time.time()
                            final_time = end-start
                            received_text = tweet_string
                            number_of_tokens = number_of_tokens
                            time_created = row[0]
                            blob_words = list(blob.words)
                            blob_sentiment = blob_sentiment
                            blob_subjectivity = blob_subjectivity
                            summary = summary
                            final_time = final_time
                            msg = "pass"
                
                # using Malaya for Malay lang tweet
                if lang_predict == 'MALAY':
                   malaya_sentiment = bayes_sentiment.predict(tweet_string)
                   malaya_proba = bayes_sentiment.predict(tweet_string, get_proba=True)
                   proba_value = malaya_proba.get(malaya_sentiment)

                   writer.writerow([count, msg, time_created, received_text, number_of_tokens, blob_words, malaya_sentiment, proba_value, blob_subjectivity, summary, final_time])
                else:
                   writer.writerow([count, msg, time_created, received_text, number_of_tokens, blob_words, textblob_sentiment, blob_sentiment,blob_subjectivity, summary, final_time])

                pass

    analyzetweets.close()
    print("Done building tweets sentiment!")
Example #19
import math
from textblob import Word
from textblob.wordnet import VERB
from textblob.wordnet import NOUN
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

word_noun_forms = Word("match").get_synsets(pos=VERB)
word_verb_forms = Word("match").get_synsets(pos=NOUN)

word = TextBlob("modified")
ant_word = TextBlob("complicated")

vad_word = analyser.polarity_scores("modified")
vad_word2 = analyser.polarity_scores("complicated")

print(vad_word)
print(vad_word2)
print(word.sentiment.polarity)
print(ant_word.sentiment.polarity)
#print(word_noun_forms)
#print(word_verb_forms)

Example #20
File: 11_02.10.py  Project: gvc2000/curso
# Section 11.2.10 snippets
from textblob import Word

word = Word('varieties')

word.stem()

word.lemmatize()



##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their     #
# best efforts in preparing the book. These efforts include the          #
# development, research, and testing of the theories and programs        #
# to determine their effectiveness. The authors and publisher make       #
# no warranty of any kind, expressed or implied, with regard to these    #
# programs or to the documentation contained in these books. The authors #
# and publisher shall not be liable in any event for incidental or       #
# consequential damages in connection with, or arising out of, the       #
# furnishing, performance, or use of these programs.                     #
##########################################################################
Example #21
def clean_textblob(sentence):
    stop_words = list(get_stop_words('en'))
    tokens = re.split(r'\W+', sentence)
    tokens = [w for w in tokens if w not in stop_words]
    # lemmatize token by token (the original re-joined the string first and then
    # iterated over it character by character)
    return " ".join(Word(w).lemmatize() for w in tokens)
Example #22
     except IndexError:
         continue
     
 # NLP for the collection of tweets
 
 try:
 
     # clean up text
     tweets = standardize_text(tweetDF, "Text")
     
     # get rid of stop words
     tweets['stopwords'] = tweets['Text'].apply(lambda x: len([x for x in x.split() if x in stopTemp]))
     tweets['Text'] = tweets['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in stopTemp))
     
     # lemmatize
     tweets['Text'] = tweets['Text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
     
     # tokenize the text strings
     tweets["tokens"] = tweets["Text"].apply(tokenizer.tokenize)
     
     # add a sentence lengths variable
     sentence_lengths = [len(tokens) for tokens in tweets["tokens"]]
     tweets['length'] = pd.Series(sentence_lengths, index=tweets.index)
     
     # apply sentiment analysis
     embeddings = get_word2vec_embeddings(word2vec, tweets)
     totSent = pd.DataFrame(embeddings).mean(axis=0)
     
     # create index labels
     totSent.index = totSent.index.map(str)
     totSent.index = 's' + totSent.index.astype(str)
Example #23
def sentimentPredict(df):
    df_ppm = df
    original_cols = df_ppm.columns
    health_plans = pd.Categorical(df_ppm['healthplan']).categories
    regions = pd.Categorical(df_ppm['region']).categories
    rptqtrs = pd.Categorical(df_ppm['rptqtr']).categories

    # In[3]:

    df_ppm.shape

    # In[5]:

    df_ppm.columns

    # In[4]:

    print(df_ppm[['sat1', 'sat1_oe']].head(10))

    # In[274]:

    #for d in df_ppm.describe().columns:
    #    print (df_ppm.describe()[d])

    # In[5]:

    df_ppm.info()

    # In[5]:

    df_ppm.isnull().sum()

    # In[7]:

    df_ppm.dtypes

    # In[6]:

    df_ppm['sat1'] = df_ppm['sat1'].astype(str)
    for ind, row in df_ppm.iterrows():
        try:
            if (row['sat1_oe'] == '') | (row['sat1_oe'] == ' '):
                if (row['sat1'] != '') & (float(row['sat1']) > 8.0):
                    df_ppm.loc[ind, 'sat1_oe'] = 'Positive Feedback'
                elif (row['sat1'] != '') & (float(row['sat1']) <= 7.0):
                    df_ppm.loc[ind, 'sat1_oe'] = 'Negative Feedback'
                elif (row['sat1'] != '') & (float(row['sat1']) == 8.0):
                    df_ppm.loc[ind, 'sat1_oe'] = 'Passive Feedback'
                elif row['sat1'] == '':
                    df_ppm.loc[ind, 'sat1_oe'] = ''
            else:
                continue
        except:
            continue

    # In[29]:

    df_ppm.head(10)

    # In[7]:

    print(df_ppm[['sat1', 'sat1_oe']].head(10))

    # In[8]:

    #length of comments
    data_nonnull = df_ppm[(df_ppm['sat1_oe'] != '')
                          & (df_ppm['sat1_oe'] != ' ')]
    data_nonnull.shape

    # In[9]:

    data_nonnull['sat1'] = data_nonnull['sat1'].astype(str)
    dfppm = data_nonnull[(data_nonnull['sat1'] != '')]

    # In[10]:

    dfppm.shape

    # In[11]:

    dfppm.reset_index(drop=True, inplace=True)
    dfppm['review_length'] = dfppm['sat1_oe'].apply(lambda x: len(str(x)))
    dfppm[['sat1_oe', 'review_length']].head(30)

    # In[12]:

    #word count
    dfppm['word_count'] = dfppm['sat1_oe'].apply(
        lambda t: len(str(t).split(" ")))
    dfppm[['sat1_oe', 'word_count']].head(20)

    # In[13]:

    dfppm.shape

    # In[14]:

    dfppm.columns

    # In[15]:

    def avg_word_len(text):
        tokens = str(text).split(" ")
        word_l = [len(word) for word in tokens]
        total_l = sum(word_l)
        avg_l = total_l / len(word_l)
        return (avg_l)

    #average word length
    dfppm['avg_word_length'] = dfppm['sat1_oe'].apply(
        lambda t: avg_word_len(t))
    dfppm[['sat1_oe', 'avg_word_length']].head(10)

    # In[16]:

    #replace quotation with nothing
    dfppm['cleaned'] = dfppm['sat1_oe'].apply(
        lambda t: str(t).replace("'", ""))
    #convert into lower case
    dfppm['cleaned'] = dfppm['cleaned'].str.lower()

    #replace punctuations
    dfppm['cleaned'] = dfppm['cleaned'].str.replace(r'[^\w\s]', '', regex=True)
    dfppm[['sat1_oe', 'cleaned']].head(10)

    # In[13]:

    #Stopwords removal
    def prepareStopWords():

        stopwordsList = []

        # Load default stop words and add a few more specific to my text.
        stopwordsList = stopwords.words('english')
        stopwordsList.append('dont')
        stopwordsList.append('didnt')
        stopwordsList.append('doesnt')
        stopwordsList.append('cant')
        stopwordsList.append('couldnt')
        stopwordsList.append('couldve')
        stopwordsList.append('im')
        stopwordsList.append('ive')
        stopwordsList.append('isnt')
        stopwordsList.append('theres')
        stopwordsList.append('wasnt')
        stopwordsList.append('wouldnt')
        stopwordsList.append('a')
        stopwordsList.append('also')
        stopwordsList.append('could')
        stopwordsList.append('would')
        stopwordsList.append('might')
        stopwordsList.append('may')

        return stopwordsList

    stop = prepareStopWords()
    stop = [i.replace("'", "") for i in stop]

    dfppm['bagofwords'] = dfppm['cleaned'].apply(
        lambda x: " ".join(x for x in x.split() if x not in stop))
    dfppm[['sat1_oe', 'cleaned', 'bagofwords']].head(10)

    # In[14]:

    #common words removal
    #keep the cutoff small: removing more than ~10 of the most common words would also drop the token 'reimbursement'
    freq = pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts()
    print(freq)

    # In[108]:

    #common words removal
    #selecting only 5 because if it is more than 10 then the token reimbursement will be lost
    freq10 = pd.Series(' '.join(
        dfppm['bagofwords']).split()).value_counts()[:10]
    print(freq10)

    # In[287]:

    #common words removal
    #selecting only 5 because if it is more than 10 then the token reimbursement will be lost
    freq5 = pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts()[:6]
    print(freq5)

    # In[242]:

    type(freq5)

    # In[243]:

    freq5.index

    # In[15]:

    freq_s = freq.loc[[
        'uhc', 'insurance', 'unitedhealthcare', 'unitedhealthcares',
        'patients', 'patient', 'insurances', 'united', 'uniteds', 'healthcare'
    ], ]
    freq_s

    # In[16]:

    #all_words=pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts()
    #freq5 = list(freq5.index)
    dfppm['bagofwords'] = dfppm['bagofwords'].apply(
        lambda x: " ".join(x for x in x.split() if x not in freq_s))
    dfppm[['sat1_oe', 'cleaned', 'bagofwords']].head(10)

    # In[17]:

    #Rare words removal
    #Optional
    in_freq = pd.Series(' '.join(
        dfppm['bagofwords']).split()).value_counts()[-5:]
    print(in_freq)

    # In[18]:

    #Optional
    in_freq = list(in_freq.index)
    dfppm['bagofwords_rarewords'] = dfppm['bagofwords'].apply(
        lambda x: " ".join(x for x in x.split() if x not in in_freq))
    dfppm.columns

    # In[116]:

    #Spelling corrections

    #spell_checked_comments = dfppm['bagofwords'].apply(lambda t: str(TextBlob(t).correct()))
    #dfppm['spell_checked'] = spell_checked_comments
    #dfppm[['sat1_oe','cleaned','spell_checked','bagofwords']].head(20)

    # In[19]:

    print(dfppm.columns)
    #Stemming
    st = PorterStemmer()
    dfppm['stemmed_text'] = dfppm['bagofwords'].apply(
        lambda x: " ".join([st.stem(word) for word in x.split()]))
    dfppm['stemmed_text'].head(10)

    # In[20]:

    #Lemmatization
    #fb_corpus = " ".join(t for t in total_df['feedback'])
    #wnl = nltk.WordNetLemmatizer()
    #content_tokens = fb_corpus.split(" ")
    #cleaned_tokens = [x for x in content_tokens if x is not ""]
    #CORPUS_tokens = [wnl.lemmatize(t) for t in cleaned_tokens]
    #CORPUS = " ".join(CORPUS_tokens)

    from textblob import Word
    dfppm['bagofwords'] = dfppm['bagofwords'].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    dfppm['bagofwords'].head(5)

    # In[21]:

    #TF-IDF matrix

    tfidf = TfidfVectorizer(max_features=1000,
                            lowercase=True,
                            analyzer='word',
                            stop_words='english',
                            ngram_range=(1, 1))
    sklearn_tfidf = TfidfVectorizer(norm='l2',
                                    min_df=0,
                                    use_idf=True,
                                    smooth_idf=False,
                                    sublinear_tf=False)
    tfidf_set = sklearn_tfidf.fit_transform(dfppm['bagofwords'])
    tfidf_set

    # In[38]:

    #Polarity
    sia = SentimentIntensityAnalyzer()
    dfppm['polarity'] = dfppm['cleaned'].apply(
        lambda s: sia.polarity_scores(s))
    dfppm['polarity_compound'] = dfppm['cleaned'].apply(
        lambda s: sia.polarity_scores(s)['compound'])
    dfppm['polarity_positive'] = dfppm['cleaned'].apply(
        lambda s: sia.polarity_scores(s)['pos'])
    dfppm['polarity_negative'] = dfppm['cleaned'].apply(
        lambda s: sia.polarity_scores(s)['neg'])
    dfppm['polarity_neutral'] = dfppm['cleaned'].apply(
        lambda s: sia.polarity_scores(s)['neu'])
    dfppm['subjectivity'] = dfppm['cleaned'].apply(
        lambda s: TextBlob(s).sentiment.subjectivity)
    #total_df.to_csv(os.path.join(path,'new_scores_ppm.csv'),index=False)

    dfppm[[
        'sat1_oe', 'cleaned', 'bagofwords', 'polarity', 'polarity_compound',
        'polarity_positive', 'polarity_negative', 'polarity_neutral'
    ]].head(10)

    #Tested with both 'cleaned' and 'bagofwords' headers to see how sentiment values are varying

    # In[24]:

    dfppm.shape

    # In[172]:

    dfppm.columns

    # In[25]:

    parent_df = dfppm

    # In[26]:

    parent_df.shape

    # In[175]:

    parent_df.dtypes

    # In[27]:

    pd.Categorical(parent_df['sat1']).categories

    # In[28]:

    parent_df.isnull().sum()

    # In[31]:

    parent_df = parent_df[parent_df['sat1'] != 'nan']
    parent_df.shape

    # In[29]:

    parent_df.select_dtypes(include=['O']).columns
    #sat2 is a string here - actually should be numeric

    # In[30]:

    parent_df['sat1'] = parent_df['sat1'].astype(int)
    parent_df.select_dtypes(include=['int32', 'int64']).columns

    # In[31]:

    pos_ppm_nps = parent_df[(parent_df['sat1'] == 9) |
                            (parent_df['sat1'] == 10)]

    neu_ppm_nps = parent_df[(parent_df['sat1'] == 8)]
    neg_ppm_nps = parent_df[parent_df['sat1'] < 8]

    # In[32]:

    pos_ppm_nps.shape

    # In[36]:

    neg_ppm_nps.shape

    # In[134]:

    neu_ppm_nps.shape

    # In[42]:

    for ind, row in parent_df.iterrows():
        if (parent_df.loc[ind, 'sat1'] == 8):
            parent_df.loc[ind, 'sentiment'] = 'neutral'
        elif (parent_df.loc[ind, 'sat1'] < 8):
            parent_df.loc[ind, 'sentiment'] = 'negative'
        elif (parent_df.loc[ind, 'sat1'] > 8):
            parent_df.loc[ind, 'sentiment'] = 'positive'

    value_cnts_actual = parent_df['sentiment'].value_counts()
    print(value_cnts_actual)

    # In[43]:

    for ind, row in parent_df.iterrows():
        if (parent_df.loc[ind, 'polarity_positive'] >
                parent_df.loc[ind, 'polarity_negative']):
            parent_df.loc[ind, 'sentiment_predicted'] = 'positive'
        elif (parent_df.loc[ind, 'polarity_positive'] <
              parent_df.loc[ind, 'polarity_negative']):
            parent_df.loc[ind, 'sentiment_predicted'] = 'negative'
        elif (parent_df.loc[ind, 'polarity_positive'] == parent_df.loc[
                ind, 'polarity_negative']):
            parent_df.loc[ind, 'sentiment_predicted'] = 'neutral'

    value_cnts_pred = parent_df['sentiment_predicted'].value_counts()
    print(value_cnts_pred)

    # In[44]:

    actuals = parent_df['sentiment']
    predicted = parent_df['sentiment_predicted']

    actuals_list = parent_df['sentiment'].tolist()
    predicted_list = parent_df['sentiment_predicted'].tolist()

    vader_conf_matrix = pd.crosstab(actuals, predicted)

    confusionMatrix = ConfusionMatrix(actuals_list, predicted_list)
    confusionMatrix.print_stats()

    cmatrix = confusion_matrix(actuals_list, predicted_list)
    print(classification_report(actuals_list, predicted_list))
    accuracy_score(actuals_list, predicted_list)

    # In[45]:

    parent_df['textblob_polarity'] = parent_df['cleaned'].apply(
        lambda s: TextBlob(s).sentiment.polarity)

    # In[41]:

    parent_df.columns

    # In[46]:

    for ind, row in parent_df.iterrows():
        if (parent_df.loc[ind, 'textblob_polarity'] > 0):
            parent_df.loc[ind, 'textblob_sentiment'] = 'positive'
        elif (parent_df.loc[ind, 'textblob_polarity'] < 0):
            parent_df.loc[ind, 'textblob_sentiment'] = 'negative'
        elif (parent_df.loc[ind, 'textblob_polarity'] == 0):
            parent_df.loc[ind, 'textblob_sentiment'] = 'neutral'

    textblob_sentiments = parent_df['textblob_sentiment'].tolist()
    tb_confMatrix = ConfusionMatrix(actuals_list, textblob_sentiments)
    #tb_cm = pd.crosstab(actuals_list,textblob_sentiments)

    tb_confMatrix.print_stats()

    finalCols = list(original_cols) + [
        'polarity_compound', 'polarity_positive', 'polarity_negative',
        'polarity_neutral', 'sentiment', 'sentiment_predicted'
    ]
    visualdf = parent_df[finalCols]
    return (visualdf)
targetKeywords = di['targetKeywords']

#targetParagraphs = [di['targetParagraphs']]
targetParagraphs = di['targetParagraphs']

targetTitle = di['targetTitle']

# In[403]:

tstart = time.time()

# In[404]:

#Vector dimension extraction
postText_cleaned_splitted = [
    Word(i).lemmatize().strip().lower()
    for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(postText)))
]
targetTitle_cleaned_splitted = [
    Word(i).lemmatize().strip().lower()
    for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetTitle)))
]
targetDescription_cleaned_splitted = [
    Word(i).lemmatize().strip().lower()
    for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetDescription)))
]

#targetParagraphs_cleaned_splitted = [Word(j).lemmatize().strip().lower() for i in eval(str(targetParagraphs)) for j in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetParagraphs)))]
targetParagraphs_cleaned_splitted = [
    Word(i).lemmatize().strip().lower()
    for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetParagraphs)))
Example #25
def lemmatize_ingredient(single_ingredient):
  words = single_ingredient.split()
  converted_words = [Word(word) for word in words]
  lemmatized_words = [converted_word.lemmatize() for converted_word in converted_words]
  lemmatized_text = " ".join(lemmatized_words)
  return(lemmatized_text)
sw = stopwords.words('english')
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join(x for x in x.split() if x not in sw))

df.head()

#Removing rare words
sil = pd.Series(' '.join(df['Phrase']).split()).value_counts()[-1000:]
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join(x for x in x.split() if x not in sil))

#Lemmatization
import nltk
from textblob import Word
nltk.download('wordnet')
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join([Word(i).lemmatize() for i in x.split()]))

df.head()

df.info()

from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
df["sonuç"] = encoder.fit_transform(df["sonuç"])

df.head()

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df["Phrase"],
                                                    df["sonuç"],
Example #27
    for line in low:
        #print(line)
        # strip punctuation and digits (str.translate(None, ...) is Python 2 only)
        line = line.translate(str.maketrans('', '', '",.()!;/?0123456789'))
        #tem = TextBlob(line)
        #s = tem.correct()
        list = []
        for word in line.split():
            list.append(str(word).strip())
        wordDict[i] = list
        i = i + 1
# Spell correction: this uses the TextBlob library and takes quite a while.
correct_Dict = {}
for index, words in wordDict.items():
    list = []
    for word in words:
        w = Word(word)
        list.append(w.correct())
    correct_Dict[index] = list
# Singularization: also uses TextBlob and takes a while.
singular_Dict = {}
for index, words in correct_Dict.items():
    list = []
    for word in words:
        list.append(str(word.singularize()))
    singular_Dict[index] = list

# Stemming: after stemming, some words are no longer recognizable, so this block
# is commented out. It uses NLTK.
'''
stemed_Dict = {}  # dictionary of stemmed words
from nltk.stem import PorterStemmer
pst = PorterStemmer()
Example #28
 def lemmatize(self, list_of_tokens):
     # Given a tokenized text, lemmatizes all nouns (otherwise it requires POS analysis)
     # THIS TAKES A LONG TIME EVEN FOR A SHORT STRING
     pos_tags = [TextBlob(e).tags[0] for e in list_of_tokens]
     return [Word(word).lemmatize(wordnet_pos_code(tag)) for (word, tag) in pos_tags]
Example #29
from textblob import TextBlob, Word

sentence = TextBlob("he wass verry siccck ")
print(sentence.correct())


word = Word("gooa")
print(word.spellcheck())

Example #30
    ## 2. Removing digits
    df['user_story'] = df['user_story'].apply(lambda x: ''.join([x for x in x if not x.isdigit()]))


    ## 2.4 Common word (removal)
    pd.Series(' '.join(df['user_story']).split()).value_counts()[:10]
    # uncommon words
    pd.Series(' '.join(df['user_story']).split()).value_counts()[-10:]

    ## 2.5 Tokenization

    ## 2.6 Stemming
    user_story_stemmed = df['user_story'].apply(lambda x: " ".join([ps.stem(word) for word in x.split()]))

    ## 2.7 Lemmatization
    user_story_lemmatized = df['user_story'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    df['user_story'] = df['user_story'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    # remove stopwords again since the corpus is now lemmatized
    # first lemmatize the added stopwords
    stop_words.add(Word("user").lemmatize())
    stop_words.add(Word("want").lemmatize())
    stop_words.add(Word("able").lemmatize())
    stop_words.add(Word("would").lemmatize())
    stop_words.add(Word("like").lemmatize())
    stop_words.add(Word("need").lemmatize())

    # filter out stopwords
    df['user_story'] = df['user_story'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

#### Advanced text processing