def metatransformation(client, db, query, to_train=True):
    """
    Arguments:
    Query: MongoDB query 
    to_train:  True: return a train and test dataset
    False: return only data to predict out of sample
    
    Step :1
        - Extract the data from MonGoDb
        
    Step 2:
        - Exclude multi tickers

    Step :3
        - take negation into account:
            - "not","no","none","neither","never" or “nobody”
        - Convert digit to "numbertag"        
        - Remove @USER
        - Remove ticker
        - Remove special characters
        - Lower test

    Step 4: Remove stop words
    Step 5: Lemmanize
    Step 6: Train/test set

    """

    text = pd.DataFrame(list(db.messages.find(query)))

    # Count how many tickers each message mentions

    text["count_stock"] = text["symbols"].apply(lambda x: len(x))

    # Keep only single-ticker messages

    text = text[text["count_stock"].isin([1])]

    # Take negation into account (prefix with "negtag_")
    # Convert digits to "numbertag"
    # Replace @user mentions with "user"
    # Remove HTML-entity residue ("&#")
    # Remove tickers ($XYZ)
    # Remove all special characters
    # Remove single characters
    # Remove "ya"
    # Remove "bitcoin" / "Bitcoin"
    # Remove "btc"
    text["body_transform"] = text["body"].replace(
        regex={
            r"\bnothing\b": "negtag_nothing",
            r"\bno\b": "negtag_no",
            r"\bnone\b": "negtag_none",
            r"\bneither\b": "negtag_neither",
            r"\bnever\b": "negtag_never",
            r"\bnobody\b": "negtag_nobody",
            r"\d+": "numbertag ",
            r"([@?])(\w+)\b": "user",
            r"\b&#\b": " ",
            r"[$][A-Za-z][\S]*": "",
            r"\W": " ",
            r"\s+[a-zA-Z]\s+": " ",
            r"\^[a-zA-Z]\s+": " ",
            r"\s+": " ",
            r"^b\s+": "",
            r"\bya\b": "",
            r"\bbitcoin\b": "",
            r"\bBitcoin\b": "",
            r"\bbtc\b": "",
        })
    # Lower

    text["body_transform"] = text["body_transform"].str.lower()

    # Remove stop words

    stop = stopwords.words('english')

    text["body_transform"] = text["body_transform"].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

    # Lemmatize

    lemmatizer = WordNetLemmatizer()
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

    text["body_transform"] = text["body_transform"].apply(lambda x: " ".join(
        [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)]))

    # Split the dataset

    X_ = text["body_transform"]
    y_ = text["sentiment_"]

    count_ = text.groupby("sentiment")["sentiment"].count()

    print("The shape of the data is {}, and {}".format(text.shape, count_))

    if to_train:
        X_train, X_test, y_train, y_test = train_test_split(X_,
                                                            y_,
                                                            test_size=0.1,
                                                            random_state=0)

        return X_train, X_test, y_train, y_test

    else:

        return X_
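
A minimal usage sketch for the function above. The connection details, database name, and query shape below are assumptions for illustration, not part of the original snippet:

# Hypothetical usage: MongoDB URI, database name, and query are assumptions.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")   # assumed local MongoDB instance
db = client["stocktwits"]                           # assumed database name
query = {"sentiment_": {"$ne": None}}               # assumed query shape

X_train, X_test, y_train, y_test = metatransformation(client, db, query, to_train=True)
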
Example #2

import pandas as pd
import nltk
from nltk import pos_tag, RegexpParser, Tree
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import re
import string
import config

lemmatizer = WordNetLemmatizer()
stopword = list(
    set(stopwords.words('english')).difference(config.REMOVE_FROM_STOPWORDS))
stopword.extend(config.STOPWORDS_EXTEND)
indicators = config.INDICATOR_LIST


def text_cleaner(sent):
    '''
    clean punctuation and correct abbreviations (syntactical noise)
    '''
    sent = sent.lower()
    sent = re.sub(r"\'s", " is ", sent)
    # sent = re.sub(r"\'", "", sent)
    sent = re.sub(r"@", " ", sent)
    sent = re.sub(r"\'ve", " have ", sent)
Example #3
 def __init__(self, data, column_name):
     self.data = data
     self.column_name = column_name
     self.stemmer = PorterStemmer()
     self.lemmatiser = WordNetLemmatizer()
Example #4
 def __init__(self):
     nltk.download('wordnet')
     nltk.download('stopwords')
     self.wordnet_lemmatizer = WordNetLemmatizer()
Example #5
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

paragraph = """  My biological mother found out later that my mother had never graduated from college and that my 
                 father had never graduated from high school. She refused to sign the final adoption papers. 
                 She only relented a few months later when my parents promised that I would go to college.
                This was the start in my life. And 17 years later I did go to college. But I naively chose a 
                college that was almost as expensive as Stanford, and all of my working-class parents’ savings 
                were being spent on my college tuition. After six months, I couldn’t see the value in it. 
                I had no idea what I wanted to do with my life and no idea how college was going to help 
                me figure it out. And here I was spending all of the money my parents had saved their 
                entire life. So I decided to drop out and trust that it would all work out OK.
                It was pretty scary at the time, but looking back it was one of the best decisions I ever made.
                The minute I dropped out I could stop taking the required classes that didn’t interest me, 
                and begin dropping in on the ones that looked far more interesting.It wasn’t all romantic. 
                I didn’t have a dorm room, so I slept on the floor in friends’ rooms, I returned coke 
                bottles for the $0.05 deposits to buy food with, and I would walk the 7 miles across 
                town every Sunday night to get one good meal a week at the Hare Krishna temple. I loved it.               
            """

sentences = nltk.sent_tokenize(paragraph) ## converted paragraph into sentences
lemmatizer = WordNetLemmatizer() ## created lemmatizer object

# Lemmatization Process
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words)
Example #6
def tokenize(fileItem: list) -> None:
    ps = PorterStemmer().stem
    wnl = WordNetLemmatizer()
    lem = wnl.lemmatize
    lemmaCache = dict()

    tokenDict = dict()
    filePath = fileItem[1]
    docID = int(fileItem[0])

    with open(filePath, 'r') as content_file:
        textContent = content_file.read()
        jsonOBJ = json.loads(textContent)
        htmlContent = jsonOBJ["content"]

        # initialize BeautifulSoup object and pass in html content
        soup = BeautifulSoup(htmlContent, 'html.parser')

        # Deletes HTML comments, javascript, and css from text
        for tag in soup(text=lambda text: isinstance(text, Comment)):
            tag.extract()
        for element in soup.findAll(['script', 'style']):
            element.extract()

        # Collect all words found from html response WITH TAGS IN A TUPLE WITH EACH WORD ('word', 'tag')
        # Tags below are in order of importance/weight
        tagNamesList = [
            'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'a',
            'p', 'span', 'div'
        ]
        tagsTextList = []
        for tag in tagNamesList:
            tagsTextList.append(soup.find_all(tag))

        ##### REDIS ONLY START #####
        # urlContent = jsonOBJ["url"]

        # # return if html text has identical hash
        # # Add all tokens found from html response with tags removed
        # varTemp = soup.get_text()
        # if util.isHashSame(varTemp):
        #    util.addDuplicateURL(docID, urlContent)
        #    return

        # # Add unique url to redis
        # util.addUniqueURL(docID, urlContent)
        ##### REDIS ONLY END #####

        taggedTextDict = dict()
        for i, tagSubList in enumerate(tagsTextList):
            taggedTextDict[tagNamesList[i]] = list()
            for phrase in tagSubList:
                for word in re.split(r"[^a-z0-9']+",
                                     phrase.get_text().lower()):
                    taggedTextDict.get(tagNamesList[i]).append(word)

        # Store words as tokens in tokenDict, ignore words that are bad
        for tag, wordList in taggedTextDict.items():
            for word in wordList:
                if (len(word) == 0):  # ignore empty strings
                    continue
                if (
                        len(word) > 30 and tag != 'a'
                ):  # ignore words like ivborw0kggoaaaansuheugaaabaaaaaqcamaaaaolq9taaaaw1bmveuaaaacagiahb0bhb0bhr0ahb4chh8dhx8eicifisiukt4djzankywplcwhltkfpl8nn0clpvm9qumvvxu8wnvbrezesepkyxvwzxbpbnjqb3jtcxruc3vvdxhzdnhyehtefjvdf5xtjkv
                    continue  # But accept any URLs that may be large
                if (word[0] == "'"):  # ignore words that start with '
                    continue
                if (len(word) == 1
                        and word.isalpha()):  # ignore single characters
                    continue

                # Do not change numbers/digits;
                # only lemmatize tokens that are 3 letters or longer
                if not any(char.isdigit() for char in
                           word) and len(word) > 2 and word not in lemmaCache:
                    # Lemmatization of a word with a number is usually itself.
                    # lemmatization of in, on, as, is usually itself.
                    # Checking for the above and if word is not already cached saves time.
                    # Get the part of speech of the word, to make lemmatization more accurate
                    pos = tag_map[pos_tag((word, ))[0][1][0]]
                    lemWord = lem(word, pos)  # lemmatized word

                    # Catch words that lemmatization misses and use the Porter stemmer in their place
                    if word[-2:] == "ly" or word[-4:] == "ness" or word[
                            -3:] == "ish":  # Catches any ly, ness, or ish that lemmatize doesnt catch. Words are less accurate, but cuts off extraneous words.
                        lemWord = ps(word)
                    lemmaCache[word] = lemWord
                else:
                    lemmaCache[word] = word  # the lemma of the word is itself

                if lemmaCache[word] in tokenDict:
                    tokenDict.get(lemmaCache[word]).incFreq()
                else:
                    tokenDict[lemmaCache[word]] = Posting(docID, 1, tag)

                if len(lemmaCache) > 5000000:
                    # Cache up to 5 million lemmas, then clear to avoid excessive memory use
                    lemmaCache.clear()

        # Write tokens and their Postings to a text file ("store on disk")
        buildIndex(tokenDict)
Example #7
csv_f=csv.reader(f)
Abstracts_2011=[]
for row in csv_f:
    Abstracts_2011.append(row[0])    	
Abstracts_str_2011=' '.join(Abstracts_2011)

#Clean and return stopwords from 2011 abstracts          
alpha_only_2011 = re.sub("[^a-zA-Z]", " ", Abstracts_str_2011)  
words_2011 = alpha_only_2011.lower().split()
#Was testing stopword removal with counter   
#counter1=Counter(words)
meaningful_words_2011 = [w for w in words_2011 if not w in stops] 
#Was testing stopword removal with counter     
#counter2=Counter(meaningful_words)

patent_lemmatizer=WordNetLemmatizer()
lemmatized_words_2011 = [patent_lemmatizer.lemmatize(w) for w in meaningful_words_2011]

#Read 2012 Abstracts from CSV file    
g = open('PowerAbstracts_csv_2012.csv')
csv_g=csv.reader(g)
Abstracts_2012=[]
for row in csv_g:
    Abstracts_2012.append(row[0])    	
Abstracts_str_2012=' '.join(Abstracts_2012)

#Clean and return stopwords from 2012 abstracts          
alpha_only_2012 = re.sub("[^a-zA-Z]", " ", Abstracts_str_2012)  
words_2012 = alpha_only_2012.lower().split()
#Was testing stopword removal with counter   
#counter1=Counter(words)
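
The excerpt cuts off here; a natural continuation mirroring the 2011 block above (an assumption, not the original code) would be:

meaningful_words_2012 = [w for w in words_2012 if w not in stops]
lemmatized_words_2012 = [patent_lemmatizer.lemmatize(w) for w in meaningful_words_2012]
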
Example #8
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

positive_file_path = 'Data/positive.txt'
negative_file_path = 'Data/negative.txt'
myStemmer = WordNetLemmatizer()
n_lines = 10000000


def create_vocab(pos_file, neg_file):
    all_tokens = []
    for f in [pos_file, neg_file]:
        with open(f, 'r') as f:
            sentences = f.readlines()
            for s in sentences[:n_lines]:
                words_in_sentence = word_tokenize(s.lower())
                all_tokens += list(words_in_sentence)
    all_tokens = [myStemmer.lemmatize(i) for i in all_tokens]
    unique_words = Counter(all_tokens)
    vocab = []
    for w in unique_words:
        if 1000 > unique_words[w] > 50:
            vocab.append(w)
    print("The size of the vocab is:", len(vocab))
    return vocab
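
A short usage sketch for the function above; the file paths come from the snippet, while the pickle output path is an assumption:

if __name__ == '__main__':
    vocab = create_vocab(positive_file_path, negative_file_path)
    # Persist the vocabulary so it can be reused when building feature vectors.
    with open('Data/vocab.pickle', 'wb') as handle:   # assumed output path
        pickle.dump(vocab, handle)
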
Example #9
def prep_text(df, model, datapath='./data/', stemming=0):
    """
    0. replace NA/missing values with ""
    1. filter the non english patents for now,
    2. remove punctuations,
    3. stop words
    4. lemmatize
    5. stemming?
    and what else?

    :return:
    """
    print("Embeddings loaded, Preparing data for semantic analysis...")

    #0. replace NA/missing values with ""
    df.fillna('', inplace=True)


    #1. filter the non english patents for now,
    print ("No Multiligual support for semantic search yet, Please give English inputs only.")
    df = df[df.lang=='en']
    print (df.shape[0], "english patents exist.")

    # combine all text
    df['text'] = df[['titles', 'abstract', 'descriptions', 'claims']].values.tolist()


    # 2. remove punctuations,
    print("Removing punctuations")

    df['text'] = df.text.apply(lambda text: str(text))
    df['text'] = df.text.apply(word_tokenize)
    df['text'] = df.text.apply(lambda text: [word for word in text if word.isalpha()])


    # filter out stop words from all languages
    print("Removing stopwords...")

    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    stop_words_fr = set(stopwords.words('french'))
    stop_words_de = set(stopwords.words('german'))
    stop_words.update(stop_words_de)
    stop_words.update(stop_words_fr)

    df['text'] = df.text.apply(lambda text: [w for w in text if not w in stop_words])


    # converts the word to its meaningful base form, infer the POS automatically
    print("Lemmatizing...")

    lemmatizer = WordNetLemmatizer()
    df['text'] = df.text.apply(lambda text: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text])



    # stemming of words
    if stemming:
        porter = PorterStemmer()
        df['text'] = df.text.apply(lambda text: [porter.stem(word) for word in text])



    # load model and filter words that exist in our pretrained word2vec model
    print("Infering semantics")

    df['text'] = df.text.apply(lambda text: [word for word in text if word in model.vocab])
    df.dropna(inplace=True)
    df = df[df.text.apply(lambda text: len(text)>0)]

    print("Counting occurences")
    # count frequency of words
    df['freq_dict'] = df.text.apply(lambda text: dict(Counter(text)))


    # save in ftr to disk
    df = df.reset_index(drop=True)
    df.to_feather(datapath + 'df_tok_freq.ftr')

    print ("Dataframe saved to ", datapath + "df_tok_freq.ftr  with columms", df.columns.to_list())
Example #10
 def __init__(self):
     self.wnl = WordNetLemmatizer()
Example #11
def stemText(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    stemmed = []
    for word in text:
        stemmed.append(wordnet_lemmatizer.lemmatize(word))
    return stemmed
Example #12
def lemmatize_tweet(tweet):
    lemmatizer = WordNetLemmatizer()
    word_list = word_tokenize(tweet)
    result = ''
    result = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return result
Example #13
def lemmatizationFunct(x):
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    finalLem = [lemmatizer.lemmatize(s) for s in x]
    return finalLem
Example #14
File: util.py  Project: JimSEvans/fnc-p1
 def __init__(self):
     self.lemmatizer = WordNetLemmatizer()
     self.mystopwords = stopwords.words('english') + ['n\'t','wo']
Example #15
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stems = stem_lemmatize_tokens(tokens, lemmatizer, stemmer)
    return stems
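
The helper stem_lemmatize_tokens is not included in this excerpt; the sketch below is one plausible implementation (an assumption, not the original), lemmatizing each token and then stemming the result:

def stem_lemmatize_tokens(tokens, lemmatizer, stemmer):
    # Hypothetical helper: lemmatize first, then stem each token.
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
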
Example #16
 abstract_process = re.sub(r'[\d]', '', abstract_process)
 abstract_process = abstract_process.lower()
 abstract_process = abstract_process.strip()
 word_tokens = abstract_process.split(
 )  #splits words from comments into list
 #Step 6: Map words to contracted word dictionary and substitute them
 word_tokens = [
     APPOSTOPHES[word] if word in APPOSTOPHES else word
     for word in word_tokens
 ]
 '''remove any empty items from the list, as they
 cause errors in pos_tag'''
 word_tokens = [w for w in word_tokens if len(w) > 0]
 #Step 7: Parts of Speech tagging of each word
 from nltk.stem import WordNetLemmatizer
 wnl = WordNetLemmatizer()
 new_word_token_list = []
 new_word_token = nltk.pos_tag(word_tokens)
 new_word_token_list.append(new_word_token)
 [new_word_token_list] = new_word_token_list
 #Step 8: Lemmatize Tokenized Text
 post_lemm = []
 for word, tag in new_word_token_list:
     wntag = tag[0].lower()
     wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
     if not wntag:
         lemma = word
         post_lemm.append(lemma)
     else:
         lemma = wnl.lemmatize(word, wntag)
         post_lemm.append(lemma)
Example #17
def lemmatize(chunklist):
  lemmalist=[]
  lemmatizer = WordNetLemmatizer() 
  for i in range(len(chunklist)):
    lemmalist.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(chunklist[i])]))
  return lemmalist
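
get_wordnet_pos is not defined in this excerpt; a common way to write it (an assumption, not the original helper) maps the first letter of the Penn Treebank tag from nltk.pos_tag to a WordNet POS constant:

from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    # Map e.g. 'VBD' -> wordnet.VERB; default to NOUN when the tag is unknown.
    # Requires nltk's 'averaged_perceptron_tagger' resource.
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
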
Example #18
 def __init__(self, decode_error="strict", strip_accents='unicode', lowercase=True, contractions=True,\
                 ignore_list=[], stopwords=None, remove_html=True, treat_urls="join", extract_phrases=True,\
                 treat_ner="replace_ner", lemmatize=False, stemming=True, spellcheck=False, tokenize=True,\
                 join_char="_", 
                 ):
     '''A comprehensive text pre-processing class
     ...
     Attributes
     ----------
     contractions: bool
         whether to decontract words like you're -> you are
     
     ignore_list: list
         list of characters to remove from the text
         
     stopwords: set
         set of words to be considered as stopwords, if None then the stopwords won't be removed
         
     remove_html: bool
         remove html tags; important for text data scraped from web pages
         
     treat_urls: str:
         how the URLs present in the text should be treated
         takes values from (join, remove, replace); if "replace" is selected, urls will be replaced with _url_
     
     extract_phrases: extracts most common phrases from the text and joins them with a predefined character
     
     treat_ner: string
         extracts and treats named entities from the text
         replace: the entities will be replaced by a common token (_ner_)
         replace_ner: entities will be replaced by corresponding named entity eg. _person_, _location_ etc.
         join: entities token will be joined together by a joining character
         None: doesn't extract NER
         
     lemmatize: lemmatize the tokens
     
     stemming: tokens are stemmed
     
     spellcheck: should the spelling be corrected
     
     tokenize: text will be returned as a list of tokens
     
     Methods
     -------
     
     Returns
     -------
     '''
     self.lowercase = lowercase
     self.decode_error = decode_error
     self.strip_accents = strip_accents
     self.contractions = contractions
     self.ignore_list = ignore_list
     self.stopwords = stopwords
     self.remove_html = remove_html
     self.treat_urls = treat_urls
     self.extract_phrases = extract_phrases
     self.treat_ner = treat_ner
     self.lemmatize = lemmatize
     self.stemming = stemming
     self.spellcheck = spellcheck
     self.tokenize = tokenize
     self.join_char = join_char
     self.compound_pattern = re.compile(r'\w+(\-\w+)+') #here-there
     self.stanford_ner = load_stanford_ner_tagger("stanford_ner_path")
     self.spell_checker = SpellChecker()
     self.stemmer = SnowballStemmer("english")
     self.lemmatizer = WordNetLemmatizer()
     disable = ['parser']
     if self.treat_ner is None:
         disable.append('ner')  # spaCy's named-entity component is called 'ner'
     self.nlp = spacy.load('en_core_web_sm', disable=disable)
Example #19
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
nltk.download('stopwords')
nltk.download('wordnet')

temporal_data = load_files(r"txt_sentoken")
X, y = temporal_data.data, temporal_data.target

documents = []
stemmer = WordNetLemmatizer()

# Pre-processing tasks
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove the prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Convert to lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)

# Bag of Words model to convert text documents into numerical features
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Training and testing splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluating the model
print(confusion_matrix(y_test, y_pred))
Example #20
def main(index_dir, silent, context_size, folder, use_gender):
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    words = get_lexicon()
    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think", "thing",
            "don’t", "like", "got", "people", "going", "talk", "right",
            "happened", ">>"
        ])
    print("Stop words", stop_words)

    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0

    # Create folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Create lemmatizer (stored in a variable named 'stemmer')
    stemmer = WordNetLemmatizer()
    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))

            starttime = None

            for p in postings:
                if starttime is None:
                    starttime = p.start

                # Cut after 30s
                if p.end - starttime > 30 * count:
                    pickle.dump(
                        dic,
                        open(
                            os.path.join(
                                folder,
                                'Doc_%d_Chunk_%d.p' % (doc_id, count - 1)),
                            'wb'))
                    dic = {}
                    count += 1
                    starttime = p.end

                # Get words in posting
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    # stemmed_word = stemmer.stem(word)
                    if word not in stop_words and len(word) > 1:
                        stemmed_word = stemmer.lemmatize(word)
                        # print("Word {} -> {}".format(word, stemmed_word))
                        if stemmed_word not in word_idx_dic.keys():
                            word_idx_dic[stemmed_word] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[stemmed_word]
                        if idx_token in dic:
                            dic[idx_token] += 1
                        else:
                            dic[idx_token] = 1
    pickle.dump(word_idx_dic, open(os.path.join(folder, "word_idx.p"), "wb"))
Example #21
 def __init__(self, stem=False):
     self.wnl = WordNetLemmatizer()
     if stem:
         self.stemmer = SnowballStemmer('english')
     else:
         self.stemmer = Bunch(stem=lambda x: x)
Example #22
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    tokenized = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokenized]
Example #23
def senti_features(corpus):
    tokenizer = TweetTokenizer(preserve_case=False,
                               reduce_len=True,
                               strip_handles=True)
    lemma = WordNetLemmatizer()
    NEGATE = \
    ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
    X = []
    afinn = Afinn(emoticons=True)
    analyzer = SentimentIntensityAnalyzer()
    for line in corpus:
        token = tokenizer.tokenize(line)
        token = [
            word for word in token if word not in stopwords.words('english')
        ]
        #token = [porter.stem(i.lower()) for i in token]
        token = [lemma.lemmatize(word) for word in token]
        poseachtweet = []
        negeachtweet = []
        poseachtweet1 = []
        negeachtweet1 = []
        neutral = 0
        prev_neg = 0
        for lem in token:
            a, b = 0, 0
            syn = list(swn.senti_synsets(lem))
            for sy in syn:
                a += sy.pos_score()
                b += sy.neg_score()
            if (len(syn) != 0):
                a = a / len(syn)
                b = b / len(syn)
            if prev_neg == 1:
                a, b = b, a
            poseachtweet.append(a)
            negeachtweet.append(b * -1)

            sc = afinn.score(lem)
            if prev_neg == 1:
                sc = sc * -1
                prev_neg = 0
            if sc > 0:
                poseachtweet1.append(sc)
                negeachtweet1.append(0)
            elif sc < 0:
                negeachtweet1.append(sc)
                poseachtweet1.append(0)
            else:
                negeachtweet1.append(0)
                poseachtweet1.append(0)

            if lem in NEGATE:
                prev_neg = 1

        max_pos = 0
        max_neg = 0
        imbal = 0
        avg_pos = 0
        avg_neg = 0
        pol = 0
        contrast = 0
        max_pos1 = 0
        max_neg1 = 0
        imbal1 = 0
        avg_pos1 = 0
        avg_neg1 = 0
        pol1 = 0
        contrast1 = 0
        tweetscore1 = 0
        avg_pos2 = 0
        avg_neg2 = 0
        pol2 = 0
        contrast2 = 0
        polarity = 0
        subjectivity = 0

        if (len(token) != 0):
            max_pos = max(poseachtweet)
            max_neg = min(negeachtweet)
            imbal = max_pos + max_neg
            avg_pos = np.count_nonzero(poseachtweet) / len(token)
            avg_neg = np.count_nonzero(negeachtweet) / len(token)
            pol = sum(poseachtweet) + sum(negeachtweet)
            if (max_pos != 0) and (max_neg != 0):
                contrast = 1

            max_pos1 = max(poseachtweet1)
            max_neg1 = min(negeachtweet1)
            imbal1 = max_pos1 + max_neg1
            avg_pos1 = np.count_nonzero(poseachtweet1) / len(token)
            avg_neg1 = np.count_nonzero(negeachtweet1) / len(token)
            pol1 = sum(poseachtweet1) + sum(negeachtweet1)
            if (max_pos1 != 0) and (max_neg1 != 0):
                contrast1 = 1
            tweetscore1 = afinn.score(line) / len(token)

            vs = analyzer.polarity_scores(line)
            avg_pos2 = vs['pos']
            avg_neg2 = vs['neg']
            pol2 = vs['compound']
            if (avg_pos2 != 0) and (avg_neg2 != 0):
                contrast2 = 1

            polarity = TextBlob(str(line)).sentiment.polarity
            subjectivity = TextBlob(str(line)).sentiment.subjectivity

        X.append([
            int(contrast),
            float(avg_pos),
            float(avg_neg),
            float(imbal),
            float(pol),
            int(contrast1),
            float(tweetscore1),
            float(avg_pos2),
            float(avg_neg2),
            float(polarity),
            float(subjectivity)
        ])
    return X
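
A quick usage sketch for the function above; the two sample tweets are made up for illustration:

sample_corpus = ["I don't like the ending at all", "what a great movie, loved it"]
features = senti_features(sample_corpus)
print(len(features), len(features[0]))   # 2 rows, one 11-dimensional feature vector per line
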
Example #24
def lemmatize(token):
    """Returns lemmatization of a token"""
    return WordNetLemmatizer().lemmatize(token, pos='v')
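
A small illustration of why the pos argument matters: under WordNet's default noun part of speech "running" is returned unchanged, while pos='v' reduces it to its verb base form.

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize("running"))            # 'running' (default pos is noun)
print(wnl.lemmatize("running", pos="v"))   # 'run'
print(lemmatize("running"))                # 'run' -- the helper above always passes pos='v'
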
Example #25
def my_clean(text, stops=False, stemming=False):
    text = str(text)
    text = re.sub(r" US ", " american ", text)
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"aren't", "are not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"%", " percent ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"doesn't", "does not ", text)
    text = re.sub(r"he's", "he is ", text)
    text = re.sub(r"she's", "she is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.lower().split()
    text = [w for w in text if len(w) >= 2]
    if stemming and stops:
        text = [
            word for word in text if word not in stopwords.words('english')
        ]
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
        text = [
            word for word in text if word not in stopwords.words('english')
        ]
    elif stops:
        text = [
            word for word in text if word not in stopwords.words('english')
        ]
    elif stemming:
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text
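
A quick check of the three cleaning modes (the sample sentence is made up for illustration):

s = "He isn't running in the US elections, that's 5k donations!"
print(my_clean(s))                              # decontracted, punctuation handled, lowercased
print(my_clean(s, stops=True))                  # stopwords removed as well
print(my_clean(s, stops=True, stemming=True))   # stemmed and lemmatized on top of that
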
Example #26
def example_three():

    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w, 'v') for w in text1]

    return len(set(lemmatized))
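
text1 is not defined in this excerpt; in many NLTK exercises it is the Moby Dick text exposed by nltk.book, which is one plausible source here (an assumption):

from nltk.book import text1   # assumption: requires NLTK's 'book' collection to be downloaded

print(example_three())        # number of distinct tokens after verb lemmatization of text1
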
Example #27
File: word2vec.py  Project: deesaw/DNLP02
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re
from textblob import TextBlob
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
spell = SpellChecker()
ps = PorterStemmer()
wordnet = WordNetLemmatizer()


def speak(text):
    try:
        a = TextBlob(text).correct()
    except:
        a = text
    finally:
        return (str(a))


df = pd.read_csv('ICD_CODES.tsv', sep='\t')
df['Issue'] = df['CODES'].str.split(n=1).str[1]
df['CODES'] = df.CODES.str.split().str.get(0)
df['Issuew2v'] = df['Issue']
df['Issuetextblob'] = df['Issue']
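
The rest of the original script is not shown; a minimal continuation consistent with the imports above (an assumption, not the original code) would tokenize, lemmatize, and feed the issue descriptions to Word2Vec:

corpus = []
for issue in df['Issuew2v'].astype(str):
    # Keep letters only, lowercase, drop stopwords, and lemmatize each token.
    tokens = re.sub('[^a-zA-Z]', ' ', issue).lower().split()
    tokens = [wordnet.lemmatize(t) for t in tokens if t not in stopwords.words('english')]
    corpus.append(tokens)

model = Word2Vec(corpus, min_count=1)   # assumed hyperparameters
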
Example #28
# Adjectives
AJ: { <CA>(<CC>?<CA>)* }

# Entities
EN: {<AJ>?<NN.*|FW>+}

# Noun-phrases
NP: {<DT>?<CC>?(<CC><CD>)*<EN>(<CC>?<EN>)*}

# Rest should be considered as a Verb-Phrase Chunk
VP: {<.*>+}
}<NP>+{
'''
PARSER = RegexpParser(GRAMMAR)
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')


class TextParser:
    @staticmethod
    def calculate_similarity(a, b) -> float:
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def generate_pos_tag_sets(input_string: str) -> next:
        """
    Break given string into sentences, and return their pos-tagged lists.\n
    **REQUIRES AN ACTIVE POS TAGGER TO BE RUNNING!!**
        :param input_string: input string. may contain one or more sentences
        """
Example #29
    if days < 0:
        creditScore['days_employed'] = creditScore['days_employed'].replace(days, 0)
        
creditScore['years_employed'] = (creditScore['days_employed']/365).astype(int).round()

# change float type to int for total_income
creditScore['total_income'] = creditScore['total_income'].astype(int)

#handle duplicates in education
creditScore['education'] = creditScore['education'].str.lower()

#handle duplicates in purpose
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmma = WordNetLemmatizer()

for purpose in creditScore['purpose']:
    words = nltk.word_tokenize(purpose)

    if 'education' in words or 'university' in words or 'educated' in words:
        creditScore['purpose'].replace(purpose, 'education',inplace=True)
    
    if 'car' in words or 'cars' in words:
        creditScore['purpose'].replace(purpose,'car',inplace=True)
        
    if 'house' in words or 'housing' in words or 'estate' in words or 'property' in words:
        creditScore['purpose'].replace(purpose,'real estate',inplace=True)

    if 'wedding' in words:
        creditScore['purpose'].replace(purpose,'wedding',inplace=True)
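
The lemmatizer created above (wordnet_lemmma) is never used in the loop as excerpted; one way it could be applied (an assumption, not the original code) is to normalize plural or derived forms before the keyword checks:

def normalize_purpose(purpose):
    # Hypothetical helper: 'cars' -> 'car', 'weddings' -> 'wedding', etc.
    return [wordnet_lemmma.lemmatize(w) for w in nltk.word_tokenize(purpose)]
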
Example #30
 def __init__(self):
     self.list_of_categories = ["sports", "health", "religion", "politics", "technology", "science", "culture", "travel", "food", "business"]
     self.lemmatizer = WordNetLemmatizer()
     file = open("classifier", "rb")
     self.clf = pickle.load(file)