Example #1
    def __init__(self):
        # load extra stopwords from a file and merge them with Sastrawi's defaults
        with open('./stopwords.txt') as f:
            more_stopword = f.read().split('\n')

        SWfactory = StopWordRemoverFactory()
        stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
        self.stopword = StopWordRemover(stopword_data)
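The remover stored in self.stopword would then be applied with its remove() method elsewhere in the class; a minimal usage sketch, where the method name preprocess is a hypothetical addition rather than part of the original class:

    def preprocess(self, text):
        # hypothetical helper: apply the merged stopword list to lowercased input
        return self.stopword.remove(text.lower())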
Example #2
def modif(kalimat):
    # nltk.download('punkt')

    # case folding & removing punctuation marks
    kalimat = kalimat.translate(str.maketrans('', '',
                                              string.punctuation)).lower()

    tokens = nltk.tokenize.word_tokenize(kalimat)  # tokenization

    fac = StopWordRemoverFactory()  # set up the stopword list
    stop = fac.get_stop_words()
    stop.append("kak")  # add "kak" to the stopword list

    stop.remove("tidak")  # keep "tidak" by removing it from the stopword list
    stop.remove("boleh")  # keep "boleh"
    stop.remove("bisa")  # keep "bisa"
    stop.remove("dimana")  # keep "dimana"

    removed = []
    for t in tokens:
        if t not in stop:
            removed.append(t)  # stopword removal

    pat = " ".join(removed)

    return pat
Example #3
    def stopwords_removal(self, text, stopwords, output_stopwords):
        # dataOutputPath and pathStopwords are defined elsewhere in the original project
        with open(dataOutputPath, encoding='utf-8') as f:
            text = f.read()

        with open(stopwords, encoding='utf-8') as f:
            list_stopwords = f.read()

        stop_factory = StopWordRemoverFactory()
        more_stopwords = list_stopwords.split("\n")

        # add the new stopwords: build the remover from the merged list so the extra
        # words actually take effect (ArrayDictionary and StopWordRemover are importable
        # from Sastrawi.StopWordRemover.StopWordRemoverFactory)
        data = stop_factory.get_stop_words() + more_stopwords
        stopword = StopWordRemover(ArrayDictionary(data))
        remove_stopwords = stopword.remove(text)

        with open(pathStopwords, 'w', encoding='utf-8') as f:
            f.write(remove_stopwords)

        print(
            "Stopwords Removal success!\nCount Words Frequency on process...")

        return remove_stopwords
Example #4
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwordsFact = factory_stopwords.get_stop_words()
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)  # remove http links
        removedPic = re.sub(r"pic\.twitter\S+", '',
                            removedHttp)  # remove pic.twitter links
        lower = removedPic.lower()  # case folding
        tokenized = tokenizer.tokenize(lower)  # tokenization + punctuation removal
        stopwords = []  # stopword removal
        for kata in tokenized:
            if kata not in stopwordsFact:
                stopwords.append(kata)
        stemmed = []  # stemming
        for kata in stopwords:
            stemmed.append(stemmer.stem(kata))
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
Example #5
def word_tokenizer(text):
    # tokenize and stem the text, dropping Indonesian stopwords
    tokens = word_tokenize(text)
    fac2 = StemmerFactory()
    stemmer = fac2.create_stemmer()
    factory = StopWordRemoverFactory()
    stop_words = set(factory.get_stop_words())  # build the stopword set once, not per token
    tokens = [stemmer.stem(t) for t in tokens if t not in stop_words]

    return tokens
Example #6
def generate_sastrawi_stopwords():
    # get Sastrawi stopwords as list
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()

    # write to txt file
    with open(stopwords_list_path + '/sastrawi-stopwords.txt', 'w') as file:
        for word in stopwords:
            file.write(word + "\n")
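generate_sastrawi_stopwords relies on a stopwords_list_path variable defined elsewhere in that project; a hypothetical setup, only to make the snippet runnable:

stopwords_list_path = '.'  # assumed output directory; the real project defines this elsewhere
generate_sastrawi_stopwords()
# writes one stopword per line to ./sastrawi-stopwords.txt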
def Stopword_removal(sentence):
    stopword_factory = StopWordRemoverFactory()
    stopwords = set(stopword_factory.get_stop_words())
    words = sentence.split()
    # keep only the words that are not in the stopword list
    output = " ".join(word for word in words if word not in stopwords)

    return output
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words that have a term frequency lower than min_cut
        or higher than max_cut will be ignored.
        """
        factory = StopWordRemoverFactory()
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(factory.get_stop_words() + list(punctuation))
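The docstring describes the min_cut/max_cut thresholds, but the code that applies them is not shown here. Below is a minimal sketch of how such a frequency filter is typically implemented, assuming from collections import defaultdict; the method name _compute_frequencies and its details are assumptions, not the author's code:

    def _compute_frequencies(self, word_sent):
        # count how often each non-stopword appears across the tokenized sentences
        freq = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word] += 1
        if not freq:
            return freq
        # normalize by the most frequent word, then drop words outside (min_cut, max_cut)
        max_freq = float(max(freq.values()))
        for word in list(freq):
            freq[word] /= max_freq
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq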
def bahasa_stopwords(additional_words=[]):
    # merge stopwords from a CSV file with Sastrawi's defaults, plus any caller-supplied words
    stopword_a = pd.read_csv('Data/indonesia_tweet/stopwordbahasa.csv',
                             names=['stopword'])
    stopword = stopword_a['stopword'].tolist()
    factory = StopWordRemoverFactory()
    stopword_b = factory.get_stop_words()
    stopword.extend(stopword_b)
    if not isinstance(additional_words, list):
        raise TypeError("additional_words must be a list")
    if len(additional_words) > 0:
        stopword.extend(additional_words)
    stopword = list(dict.fromkeys(stopword))  # drop duplicates, keep order
    return stopword
Example #10
def cleanTweets(Tweets):
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words() + ['twitter', 'rt', 'pic', 'com', 'yg', 'ga', 'https'])
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for i, tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt']  # if you want to ignore retweets ==> if not re.match(r'^RT.*', txt):
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', txt)  # clean urls
        txt = txt.lower()  # Lowercase
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@'])  # Add more if you want
        txt = [strip_non_ascii(t, symbols) for t in txt]  # remove all non ASCII characters
        txt = ' '.join([t for t in txt if len(t) > 1])
        Tweets[i]['cleanTxt'] = txt  # this is not a good Python practice, only for learning.
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join([t for t in txt if t not in stopwords])
    return Tweets
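cleanTweets depends on a Tokenizer object and a strip_non_ascii helper that are defined elsewhere in that project; a minimal sketch of what they might look like, purely as an assumption so the example can run:

from nltk.tokenize import TweetTokenizer

Tokenizer = TweetTokenizer()  # hypothetical stand-in; any object with a .tokenize() method works

def strip_non_ascii(token, symbols=set()):
    # hypothetical helper: drop non-ASCII characters and any extra symbols (e.g. '@')
    return ''.join(ch for ch in token if ord(ch) < 128 and ch not in symbols)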
Example #11
def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()

    stem = StemmerFactory() 
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower() # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str) # eliminate username, url, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str) # strip the HTML entity &amp; (it is just an encoded &)
    clean_str = re.sub(r'[^\w\s]',' ', clean_str) # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str) # remove extra whitespace
    clean_str = clean_str.strip() # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()]) # stem
    clean_str = stopword.remove(clean_str) # remove stopwords
    return clean_str
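A quick usage sketch of pre_processing; the sample tweet is an assumption, and the exact output depends on Sastrawi's stemmer and stopword list:

sample = "@user Saya suka sekali produk ini!! Cek https://contoh.id #mantap"
print(pre_processing(sample))
# prints the lowercased, stemmed text with mentions, URLs, hashtags,
# punctuation and Indonesian stopwords stripped out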
Example #12
def preprocess(data):
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    count = 0
    for i in range(len(data)):
        lowerText = data[i].lower()  # case folding
        tokenizedText = tokenizer.tokenize(lowerText)  # punctuation removal and tokenization
        swRemovedText = []  # stopword removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        cleanData.append(swRemovedText)
        count += 1
        print(count, "data cleaned")
    return cleanData
Example #13
    def stopwords_removal(self, list_stopwords, output_stopwords):
        # listStopwordsPath and stopwordsRemovalPath are defined elsewhere in the original project
        with open(listStopwordsPath, 'r', encoding='utf-8') as f:
            list_stopwords = f.read()

        stop_factory = StopWordRemoverFactory()
        more_stopwords = list_stopwords.split("\n")

        # build the remover from the merged list so the extra stopwords take effect
        data = stop_factory.get_stop_words() + more_stopwords
        stopwords = StopWordRemover(ArrayDictionary(data))
        remove_stopwords = stopwords.remove(self.text)

        with open(stopwordsRemovalPath, 'w', encoding='utf-8') as f:
            f.write(remove_stopwords)

        print("Done!")

        return remove_stopwords
def Preprocessing(data):  # preprocessing pipeline
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    factory_stemmer = StemmerFactory()
    stemmer = factory_stemmer.create_stemmer()
    for i in range(len(data)):
        lowerText = data[i].lower()  # case folding
        tokenizedText = tokenizer.tokenize(lowerText)  # punctuation removal and tokenization
        swRemovedText = []  # stopword removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        stemmedText = []
        for k in range(len(swRemovedText)):#Stemming
            stemmedText.append(stemmer.stem(swRemovedText[k]))
        cleanData.append(stemmedText)
    return cleanData
Example #15
def mnb():
    factory = StopWordRemoverFactory()
    stop_word_list = factory.get_stop_words()
    stop = stop_word_list + list(punctuation)
    # note: this TfidfVectorizer (built with the Sastrawi stopword list) is configured
    # but never used below; the model is trained on CountVectorizer + TfidfTransformer output
    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=5,
                            norm='l2',
                            encoding='latin-1',
                            ngram_range=(1, 2),
                            stop_words=stop)
    df = convert_to_tidf()
    X_train, X_test, y_train, y_test = train_test_split(df['questions'],
                                                        df['labels'],
                                                        random_state=0)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    feed = MultinomialNB().fit(X_train_tfidf, y_train)
    return feed, count_vect
Example #16
import pandas
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from dateutil.parser import parse
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
# keep "di" and "adalah" in the text by removing them from the stopword list
if "di" in a: a.remove("di")
if "adalah" in a: a.remove("adalah")
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)

sf = StemmerFactory()
stemmerId = sf.create_stemmer()

def date_detection(doc, fuzzy=True):
    # return True if dateutil can find a date in the text, False otherwise
    try:
        parse(doc, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
    except Exception:
        return False

def all_caps_detection(doc):
Example #17
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
from collections import Counter
from modulenorm.modNormalize import normalize
from modulenorm.modTokenizing import tokenize
from rake_nltk import Rake

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stopword Removal
stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()
data = stop_factory.get_stop_words()

# Standard word checker
dictFile = os.path.dirname(os.path.realpath(__file__)) + "/improveDict.txt"
swChecker = SWChecker(dictFile)
prosaHelper = Prosa()
usenorm = normalize()

sentence = "q cinta lo tp lo kaga sejak weekend lalu"
# text_norm = usenorm.enterNormalize(sentence) # newline normalization, one review per line
# # text_norm = usenorm.lowerNormalize(text_norm) # normalize uppercase to lowercase
# text_norm = usenorm.repeatcharNormalize(text_norm) # normalize repeated periods
# text_norm = usenorm.linkNormalize(text_norm) # normalize links in the text
# text_norm = usenorm.spacecharNormalize(text_norm) # normalize character spacing
# text_norm = usenorm.ellipsisNormalize(text_norm) # normalize ellipses (…)
            synonyms = [synonym.lower() for synonym in synonyms.split()]
            for synonym in synonyms:
                table[synonym] = primary.lower()

spelling = []
for idx, value in enumerate(tokenisasi):
    temp = []
    for idy, value1 in enumerate(value):
        temp.append(''.join(
            table.get(word.lower(), word)
            for word in re.findall(r'(\W+|\w+)', value1)))
    spelling.append(temp)

#stopword
stop_factory = StopWordRemoverFactory()
data_stopword = stop_factory.get_stop_words()
stopword = stop_factory.create_stop_word_remover()
stopword_removal = []
for idx, value in enumerate(spelling):
    temp = []
    for idy, value1 in enumerate(value):
        temp.append(stopword.remove(value1))
    stopword_removal.append(temp)

#stemming:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stemming = []
for idx, value in enumerate(stopword_removal):
    temp = []
    for idy, value1 in enumerate(value):
Example #19
# ABOUT : Simple Chatbot Itenas
# ==========================================================

import nltk
import string
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from colorama import Fore, Style

with open("itenasDoc.txt", "r", errors='ignore') as f:
    text = f.read().lower()

factory = StopWordRemoverFactory()
stopwords_indonesia = factory.get_stop_words()

sents_tokenize = nltk.sent_tokenize(text)

# Initialize the word lists that will be used
sapa_user = ['hello', 'hallo', 'hi', 'hallo itenas']
greet_user = [
    'Hallo juga :D', "Senang bertemu dengan mu, apa yang bisa aku bantu?",
    "Hiiiiii ^_^"
]
kelimat_perpisahan = ['selesai', 'quit', 'dadah', 'berhenti']


def tokenize_function(data):
    # Remove punctuation that will not be useful later in the similarity search
    texts = [token for token in data if token not in string.punctuation]
Example #20
#stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
getStopWord = factory.get_stop_words()

print(getStopWord)
#
# Perform stopword removal
# and extend the stoplist
# with Python Sastrawi

# import the StopWordRemoverFactory class (plus the classes needed for a custom list)
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

factory = StopWordRemoverFactory()

# add extra stopwords
more_stopword = ['dengan', 'saya']
data = factory.get_stop_words() + more_stopword

# build the remover from the merged list so the added words take effect
stopword = StopWordRemover(ArrayDictionary(data))
# sentence
kalimat = 'dengan menggunakan python dan library sastrawi saya dapat melakukan proses stopword removal'
stop = stopword.remove(kalimat)
print(stop)
Example #22
def get_stopwords():
    factory = StopWordRemoverFactory()
    stop_words = factory.get_stop_words()
    return stop_words
rawdata = []
for j in range(0, 8):
    with open(str(j + 1) + '.txt', 'r') as f:
        rawdata.append(f.read().replace('\n', ' '))

import nltk
from nltk.tokenize import word_tokenize as token
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string, numpy as np

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

#rawdata
print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        #if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation)+tuple([str(k) for k in range(10)])+tuple('¿')):
        temp.append(word)
    doc.append(temp)

dictionary = []
for i in doc:
Example #24
#     df = convert_to_tidf()
#     X_train, X_test, y_train, y_test = train_test_split(df['questions'], df['labels'], random_state=0)
#     count_vect = CountVectorizer()
#     X_train_counts = count_vect.fit_transform(X_train)
#     tfidf_transformer = TfidfTransformer()
#     X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#     feed = MultinomialNB().fit(X_train_tfidf, y_train)
#     return feed, count_vect

#X_test.iloc[0]

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
factory = StopWordRemoverFactory()
stop_word_list = factory.get_stop_words()

# stopwords added
stopwords = stop_word_list + list(punctuation)
# create vectorizer
vect = TfidfVectorizer(sublinear_tf=True, min_df=3, norm='l2', encoding='id', ngram_range=(1, 2), stop_words=stopwords)
# create data
df = convert_to_tidf()
features = vect.fit_transform(df.questions).toarray()
labels = df.id_label
# cross validation technique
X_train, X_test, y_train, y_test = train_test_split(df['questions'], df['labels'], random_state=0)

# import and instantiate CountVectorizer
# vect = CountVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='id', ngram_range=(1, 2),stop_words=stopwords)
Example #25
    return replaced

def remove_stopwords(text):
    words = tokenizer.tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

def preprocess_text(text, abbreviation_dict):
    abbreviation_replaced = replace_abbreviation(text, abbreviation_dict)
    regex_filtered = regex_filter(abbreviation_replaced)
    stopwords_removed = remove_stopwords(regex_filtered)
    return stopwords_removed

tokenizer = WordPunctTokenizer()
factory = StopWordRemoverFactory()
stop_words = factory.get_stop_words()

remove_mentions = r'@[A-Za-z0-9_]+' 
remove_links = r'https?://[A-Za-z0-9./]+' 
remove_retweets = r'RT' 
remove_hashtags = r'#[A-Za-z0-9_]+' 
remove_pics = r'pic.twitter.com/[A-Za-z0-9]+'
letters_only = r'[^\w\s-]'
combined_pat = r'|'.join((remove_mentions,remove_links,remove_retweets,remove_hashtags,remove_pics, letters_only))


data_path = 'data/all_tweets.csv'
tweet_data = pd.read_csv(data_path)

abbreviation_dict = get_abbreviated_dict()
clean_tweets = tweet_data['tweet'].apply(lambda x: preprocess_text(x, abbreviation_dict))
def hoax_detection():
    #Reading data as pandas dataframe
    frame = pd.read_csv('MasterBeritaAfterCleanCombined.csv',
                        error_bad_lines=False,
                        encoding='latin1')
    frame2 = pd.read_csv('new_TestData.csv',
                         error_bad_lines=False,
                         encoding='latin1')

    # TODO: remove this line
    #    frame = frame.head(5)

    berita = ''
    berita = stem(berita)
    data = {'no': ['1'], 'berita': [berita], 'tagging': ['Hoax']}
    #    frame2 = pd.DataFrame(data, columns=['no','berita','tagging'])

    # Inspecting shape
    frame.shape
    frame2.shape

    #Inspecting top 5 rows
    frame.head()
    frame2.head()

    #Setting the DataFrame index (row labels) using one or more existing columns
    frame = frame.set_index("no")
    frame.head()

    frame2 = frame2.set_index("no")
    frame2.head()

    y = frame.tagging
    y.head()

    y2 = frame2.tagging

    frame.drop("tagging", axis=1)
    frame.head()

    frame2.drop("tagging", axis=1)

    # print(frame['berita'])

    # print(frame['berita'])

    X_train = frame['berita']
    y_train = y
    print(X_train.shape)
    print(y_train.shape)
    # print(X_train)
    # print(y_train)
    # print(len(X_train))
    # print(len(y_train))

    # uux_train, X_test , uuy_train, y_test = train_test_split(frame2['berita'], y2, test_size=0.33, random_state=53)

    X_test = frame2['berita']
    y_test = y2
    print(len(X_test))

    # stemming
    # print(frame['berita'][0])

    # print(frame2['berita'])

    X_train.head()

    y_train.head()

    X_train, X_test, y_train, y_test = train_test_split(frame['berita'],
                                                        y,
                                                        test_size=0.33,
                                                        random_state=53)

    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()

    # count_vectorizer = case folding, tokenizing, remove stopwords
    # analyze = count_vectorizer.build_analyzer()
    # analyze("Saya mau MAKAN dimakan di tempat makan")
    # print(count_vectorizer)
    # count_vectorizer = CountVectorizer(lowercase=True, stop_words=frozenset(stopwords))

    # Fit and transform the training data.
    # count_train = count_vectorizer.fit_transform(X_train)

    # print(count_train)
    # Transform the test set
    # count_test = count_vectorizer.transform(X_test)

    # Initialize the `tfidf_vectorizer`
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                       stop_words=frozenset(stopwords),
                                       max_df=0.7)

    # Fit and transform the training data
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    # Transform the test set
    tfidf_test = tfidf_vectorizer.transform(X_test)

    print(tfidf_test)

    print('separator')

    # Get the feature names of `tfidf_vectorizer`
    print(tfidf_vectorizer.get_feature_names()[-20:])

    tfidf_df = pd.DataFrame(tfidf_train.A,
                            columns=tfidf_vectorizer.get_feature_names())

    #    tfidf_df.to_excel('output-hoax-only.xlsx')

    #    print(tfidf_df)
    # Get the feature names of `count_vectorizer`
    # print(count_vectorizer.get_feature_names()[0:10])

    import matplotlib.pyplot as plt

    def plot_confusion_matrix(cm,
                              classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        See full source and example: 
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
        
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    start = timeit.default_timer()

    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    multinomialpred = pred
    print("#Result:#Multinomial#", pred)
    print("accuracy:   %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time Multinomial: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=['Hoax', 'Valid'],
        title='MultinomialNB Confusion Matrix (Predict: Test)')

    #    y_pred_prob = clf.predict_proba(tfidf_test)
    #    print(y_pred_prob)
    #    hoax_probs = y_pred_prob[:,1]
    #
    #
    #    fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=hoax_probs, pos_label='spam')
    #    # Plot
    #    plt.plot(fpr,tpr, color='red')
    #    plt.title('Receiver Operating Characteristic Curve', size=20)
    #    plt.plot([0, 1], [0, 1], color='green', linestyle=':')
    #    plt.xlabel('False Positive Rate', size=15)
    #    plt.ylabel('True Positive Rate', size=15)
    #    plt.show()

    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    multinomialpred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    plot_confusion_matrix(
        cm,
        classes=['Hoax', 'Valid'],
        title='MultinomialNB Confusion Matrix (Predict: Training)')

    start = timeit.default_timer()
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    passiveaggressivepred = pred
    print("#Result:#PassiveAggressiveClassifier#", pred)
    print("accuracy:   %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=['Hoax', 'Valid'],
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Test)')

    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    passiveaggressivepred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=['Hoax', 'Valid'],
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Training)'
    )

    start = timeit.default_timer()
    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    print("#Result:#SVM#", pred)
    svmpred = pred
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm,
                          classes=['Hoax', 'Valid'],
                          title='SVM Confusion Matrix (Predict: Test)')

    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    svmpred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm,
                          classes=['Hoax', 'Valid'],
                          title='SVM Confusion Matrix (Predict: Training)')

    def most_informative_feature_for_binary_classification(
            vectorizer, classifier, n=100):
        """
        See: https://stackoverflow.com/a/26980472
        
        Identify most important features if given a vectorizer and binary classifier. Set n to the number
        of weighted features you would like to show. (Note: current implementation merely prints and does not 
        return top classes.)
        """

        class_labels = classifier.classes_
        feature_names = vectorizer.get_feature_names()
        topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
        topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

        for coef, feat in topn_class1:
            print(class_labels[0], coef, feat)

        print()

        for coef, feat in reversed(topn_class2):
            print(class_labels[1], coef, feat)

    print('y_test')
    print(y_test)

    #    print('score')
    #    print(score)

    #    y_pred_prob = clf.predict_proba(tfidf_test)
    #    spam_probs = y_pred_prob[:,1]
    #    print(spam_probs)
    #
    #    # Build confusion metrics
    #    fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=spam_probs, pos_label='spam')
    #    # Plot
    #    plt.plot(fpr,tpr, color='red')
    #    plt.title('Receiver Operating Characteristic Curve', size=20)
    #    plt.plot([0, 1], [0, 1], color='green', linestyle=':')
    #    plt.xlabel('False Positive Rate', size=15)
    #    plt.ylabel('True Positive Rate', size=15)
    #    plt.show()

    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_test,
                                     linear_clf.decision_function(tfidf_test),
                                     pos_label=linear_clf.classes_[1])  # positive class must match decision_function's orientation
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero],
             tpr[close_zero],
             'o',
             markersize=10,
             label='threshold zero(default)',
             fillstyle='none',
             c='k',
             mew=2)
    plt.plot([0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
    plt.legend(loc=4)
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (recall)')
    plt.title('roc_curve')
    plt.show()
    from sklearn.metrics import auc
    print('AUC score is: ', auc(fpr, tpr))

    # plot precision recall curve Multinomial
    #    disp = plot_precision_recall_curve(linear_clf, tfidf_test, y_test)
    #    y_score = linear_clf.decision_function(X_test)
    #    average_precision = average_precision_score(y_test, y_score)
    #    disp.ax_.set_title('2-class Precision-Recall curve: '
    #                       'AP={0:0.2f}'.format(average_precision))
    #    disp.show()
    #
    #    most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

    feature_names = tfidf_vectorizer.get_feature_names()
    sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

    ### Most fake
    sorted(zip(clf.coef_[0], feature_names))[:20]

    tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
    for i in tokens_with_weights:
        print(i)
        break

    result = dict()
    result['multinomial'] = multinomialpred
    result['passive'] = passiveaggressivepred
    result['svm'] = svmpred

    # print(result)
    return result
"""

# coding: utf-8

# In[73]:

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import pandas as pd

# In[74]:

stemm = StemmerFactory()
stemmer = stemm.create_stemmer()
stop = StopWordRemoverFactory()
stopwords = stop.get_stop_words()

# In[75]:

data = pd.read_csv(r"D:\dataset_textmining\dataset4.csv", encoding="ISO-8859-1")
dataset_uji = pd.read_csv(r"D:\dataset_textmining\datauji4.csv",
                          encoding="ISO-8859-1")

# ### Get the Komentar (comments) column

# In[76]:

desc = data.loc[:, 'Komentar']
dataset = data.loc[:, ["Komentar", "Hasil Akhir"]]
data_uji = dataset_uji.loc[:, 'Komentar']
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if the word starts with a letter or a digit
        val = re.search(r"^[a-zA-Z0-9][a-zA-Z0-9-]*$", w)
        # add tokens
        if (w in ['b', 'rt', 'at', 'user', 'url'] or val is None
                or len(w) <= 3):
            continue
        else:
            tokens.append(w)
    return tokens


#List of Stop Words
factory2 = StopWordRemoverFactory()
stopwords = factory2.get_stop_words()

# Load the Blog article
data = pd.read_csv(r"D:\Kuliah\Big Data\Merge\CrawlingJaktim.csv")
file = data['Tweets'].tolist()

# Word Vectorization
print("Memulai proses vektorisasi kata...")
vectorizer_count = CountVectorizer(preprocessor=my_preprocessor,
                                   tokenizer=my_tokenizer,
                                   stop_words=stopwords,
                                   min_df=2,
                                   max_df=0.95)
vectorizer_tfidf = TfidfVectorizer(preprocessor=my_preprocessor,
                                   tokenizer=my_tokenizer,
                                   stop_words=stopwords,
    def __init__(self):
        # instantiate the factory rather than calling its method unbound with an unrelated self
        self._stopwords = StopWordRemoverFactory().get_stop_words()
Example #30
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()

factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()


def clean_text(text):

    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")

    # removing excessive whitespace
    text = " ".join(text.split())

    # split the text into a list of words
    words = text.split()

    # removing stopwords
    words = [word for word in words if word not in stopwords]

    # stemming word in query
    words = [stemmer.stem(word) for word in words]

    return words
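A short usage sketch of clean_text; the sample sentence is an assumption, and the exact tokens returned depend on Sastrawi's stemmer and stopword list:

if __name__ == '__main__':
    contoh = "Saya sedang mempelajari pemrosesan bahasa alami dengan Python"
    print(clean_text(contoh))
    # prints a list of stemmed tokens with punctuation and Indonesian stopwords removed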