def _processTweet(self, tweet):
    tweet = tweet.lower()  # convert text to lower-case
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  # remove URLs
    tweet = re.sub(r'@[^\s]+', '', tweet)  # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
    tweet = "".join(char for char in tweet if char not in string.punctuation)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet).strip()  # collapse whitespace
    tweet = re.sub(r'\d', '', tweet)  # remove digits

    # Take the built-in Sastrawi stopwords and merge in the extra ones from stopword.txt
    stop_factory = StopWordRemoverFactory().get_stop_words()
    with open("stopword.txt", "r") as f:
        more_stopword = f.read().split()
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    stopword_remover = StopWordRemover(dictionary)

    # Stemming factory
    factory1 = StemmerFactory()
    stemmer = factory1.create_stemmer()  # create the stemmer

    # tweet = stopword_remover.remove(tweet)
    # tweet = stemmer.stem(tweet)  # stem the tweet
    tweet = word_tokenize(tweet)  # tokenize into words
    # return [word for word in tweet if word not in self._stopwords]
    return tweet

def __filtering_sastrawi(self, documents):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    list_stop = stop_factory + self.stop_more
    dictionary = ArrayDictionary(list_stop)
    stopwords = StopWordRemover(dictionary)
    stop = stopwords.remove(documents)
    return stop

def __init__(self):
    with open('./stopwords.txt') as f:
        more_stopword = f.read().split('\n')
    SWfactory = StopWordRemoverFactory()
    stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
    self.stopword = StopWordRemover(stopword_data)

def stop_stem_remover(kalimat):
    """
    Remove stopwords and apply stemming.
    input  : kalimat : a sentence from the corpus
    return : the cleaned sentence
    """
    # drop words that carry little meaning
    # factory = StopWordRemoverFactory()
    stop_factory = StopWordRemoverFactory().get_stop_words()
    add_stop_word = ['dkk', 'et', 'al', 'all']  # manually added stopwords
    stop = stop_factory + add_stop_word
    dicts = ArrayDictionary(stop)
    all_stop = StopWordRemover(dicts)
    kalimat = all_stop.remove(kalimat)

    # stemming (reduce words to their root form)
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()
    kalimat = stemmer.stem(kalimat)
    return kalimat

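# Usage sketch for stop_stem_remover (the sample sentence is illustrative; the exact
# output depends on the installed Sastrawi stopword and stemming dictionaries):
contoh_kalimat = "Penelitian tersebut dilakukan oleh Budi dkk pada tahun lalu"
print(stop_stem_remover(contoh_kalimat))  # stopwords dropped, remaining words stemmed
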
def removeStopWord(query):
    factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['!', '.', ',', '?']
    data = factory + more_stopword
    dic = ArrayDictionary(data)
    stopword = StopWordRemover(dic)
    return stopword.remove(query)

def Stopword(doc):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['ini', 'itu', 'the']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    data_str = StopWordRemover(dictionary)
    dokumen = data_str.remove(doc)
    return dokumen

def stopword(self):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    self.stopword = StopWordRemover(dictionary)

def remove_stopwords_id(kalimat):
    # take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['daring', 'online', 'nih']
    # merge the stopword lists
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)
    tokens = nltk.tokenize.word_tokenize(remover.remove(kalimat))
    return " ".join(tokens)

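# Usage sketch for remove_stopwords_id; assumes NLTK's 'punkt' tokenizer data is
# available (nltk.download('punkt')). The sample sentence is illustrative.
print(remove_stopwords_id("kuliah daring nih ternyata cukup melelahkan"))
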
def __stopward_removal(self, tokens):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['dong', 'atuh', 'plis']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    str_remove = StopWordRemover(dictionary)
    tokens = word_tokenize(str_remove.remove(' '.join(tokens)))
    return tokens

def stopword(text):
    # Take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    print(stop_factory)
    more_stopword = ['diatur', 'perjodohan']
    # Merge the stopword lists
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)
    hasil = remover.remove(text)
    # print(hasil)
    return hasil

def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()
    stem = StemmerFactory()
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower()  # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str)  # eliminate usernames, URLs, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str)  # remove &amp; (the HTML-escaped ampersand)
    clean_str = re.sub(r'[^\w\s]', ' ', clean_str)  # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str)  # collapse extra whitespace
    clean_str = clean_str.strip()  # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()])  # stem each word
    clean_str = stopword.remove(clean_str)  # remove stopwords
    return clean_str

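# Usage sketch for pre_processing; assumes stopwordbahasa.csv (one stopword per line)
# sits next to the script. The sample tweet and URL are made up.
contoh_tweet = "@user Selamat pagi! Cek info lengkapnya di https://t.co/xyz #promo"
print(pre_processing(contoh_tweet))  # mention, URL, hashtag and punctuation removed, rest stemmed
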
def preprocess_sentence(self, q=""):
    # tokenize, lowercase, remove stopwords, stem
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(dictionary)

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    res = " ".join(tokenizer.tokenize(q))
    res = res.lower()
    res = stopword.remove(res)
    res = stemmer.stem(res)
    return res

def generateStopWords(pat, txt):
    # Take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopwords = [' ?', '?', ' .', '.', ' ,', ',']
    # Merge the stopword lists
    data = stop_factory + more_stopwords
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)

    # If stopword removal leaves an empty string, fall back to the original value
    temppat = remover.remove(pat)
    if temppat == '' or temppat is None:
        temppat = pat
    temptxt = remover.remove(txt)
    if temptxt == '' or temptxt is None:
        temptxt = txt
    return temppat, temptxt

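# Usage sketch for generateStopWords: both strings are cleaned, and whenever stopword
# removal would leave an empty result the original string is kept instead. The
# sample arguments are illustrative.
pat_bersih, txt_bersih = generateStopWords("apa itu ?", "jadwal kuliah hari senin")
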
def remove_stopwords(self, csv_src="", csv_dest="", cols_to_clean=["KOMPETENSI"], sep=";"):
    # factory = StopWordRemoverFactory()
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(dictionary)  # factory.create_stop_word_remover(dictionary=dictionary)
    tokenizer = RegexpTokenizer(r'\w+')

    df = pd.read_csv(csv_src, sep=sep)
    for c in cols_to_clean:
        df[c] = df[c].map(lambda x: " ".join(tokenizer.tokenize(x)))  # keep only word tokens, drop symbols
        df[c] = df[c].map(lambda x: stopword.remove(x))  # remove stopwords
    df.to_csv(csv_dest, sep=sep, index=None)
    print("lower %d rows" % len(df))

exit()

start = time.time()
os.system('cls')

# create the stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# create the stopword remover
stop_factory = StopWordRemoverFactory().get_stop_words()
more_stopword = ['halaman', 'kompas', 'com', 'all', '-']
# merge the stopword lists
stop_factory += more_stopword
dictionary = ArrayDictionary(stop_factory)
stopword = StopWordRemover(dictionary)

for i in range(total_documents):
    print(i, "/", total_documents, "documents cleaned")
    print("Cleaning...")
    try:
        with open("download/" + site + "/scrapped/" + site + "-" + str(i + 1) + "-bersih.html",
                  'r', encoding="utf8") as f:
            soup = bs(f, 'html.parser')
            url = soup.url.text
            title = soup.title.text
            top = soup.top.text

sentimen_count = df['sentiment'].value_counts()
sentimen_count

words_positif = ' '.join(df_positif['tweet_bersih'])
words_negatif = ' '.join(df_negatif['tweet_bersih'])
words_netral = ' '.join(df_netral['tweet_bersih'])

# MORE STOPWORDS
stop_factory = StopWordRemoverFactory().get_stop_words()
more_stopword = StopwordsID.more_stopword
# merge the stopword lists
data = stop_factory + more_stopword
dictionary = ArrayDictionary(data)
StopWordRemover(dictionary)
stopwords = data

mask = np.array(Image.open("shape.png"))

########################################################################################################
while True:
    choice = displayMenu(menuItems)
    if choice == 1:
        print(df)
    elif choice == 2:

def __init__(self, tweet):
    self.tweet = tweet
    stop_factory = StopWordRemoverFactory().get_stop_words()
    stop_factory = stop_factory + self.additional_stopwords
    dictionary = ArrayDictionary(stop_factory)
    self.strword = StopWordRemover(dictionary)

def createStopword(more_stopword=[]):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    new_stop_word = stop_factory + more_stopword
    dictionary = ArrayDictionary(new_stop_word)
    stopword = StopWordRemover(dictionary)
    return stopword

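# Usage sketch for createStopword (the extra stopwords here are illustrative):
remover = createStopword(['nih', 'dong'])
print(remover.remove("materi kuliah nih sudah dibagikan dong"))
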
import re

import Sastrawi as sts
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

stop_word = StopWordRemoverFactory().get_stop_words()
more_stopword = [
    'yg', 'ajah', 'iya', 'mba', 'mas', 'kak', 'pak', 'pahi', 'mah', 'muehehe',
    'men', 'kehfine', 'alhamdulilah', 'alhamdulillah', 'nih', 'om', 'selamat',
    'sama', 'sabar', 'gak', 'yak', 'semoga', 'bu', 'adik', 'omen', 'tumben',
    'tp', 'sy', 'kmu', 'jg', 'kyk', 'dll'
]
d_sword = stop_word + more_stopword
dictionary = ArrayDictionary(d_sword)
swr = StopWordRemover(dictionary)

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = '(RT)'
combined_pat = r'|'.join((pat1, pat2, pat3))

df_t = df['text']

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8").replace(u"\ufffd", "?")

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex

# factory = StopWordRemoverFactory()
# Start from the built-in stopword list and adjust it for this task
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Month-name regexes
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regexes for matching a complete date
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'

ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
tool = victorinox()

population1_dict = {}
population2_dict = {}
population_root_path = r"corpus/population"
population_files = glob(os.path.join(population_root_path, "**/*.txt"), recursive=True)

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()

default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)

en_stopword = set(stopwords.words('english'))
en_stemmer = PorterStemmer()


def remove_numbers(text):
    # the [a-zA-Z]+ tokenizer keeps only alphabetic tokens, so digits are dropped
    words = tokenizer.tokenize(text)
    return " ".join(words)


def remove_punctuation(text):
    words = text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    return " ".join(stripped)

    'melalui', 'tentang', 'februari', 'dilakukan', 'pusat', 'selatan', 'atas', 'data',
    'lp', 'dalam', 'juni', 'adanya', 'mengenai', 'jkt', 'atau', 'jawaban', 'tinggi',
    'telah', 'maret', 'bapak', 'oktober', 'januari', 'juli', 'mei', 'september', 'xi',
    'agung', 'ada', 'dengan', 'kedua', 'di', 'selatan', 'nama', 'ada', 'terkait',
    'tentang', 'yang', 'nomor', 'tidak', 'dengan', 'terhadap', 'sept', 'november',
    'nov', 'dalam', 'atau', 'bapak', 'nama', 'kami', 'ada', 'melalui',
    'assalamualaikum', 'wr', 'wb', 'jp', 'lp', 'md', 'mh', 'melakukuan', 'sbg',
    'selasa', 'oleh', 'segera', 'tahun', 'melakukan', 'oleh', 'agustus', 'atau',
    'dki', 'kab', 'belum', 'untuk', 'adanya', 'kecamatan', 'yang', 'yg',
    'memberikan', 'mengenai', 'ayat', 'tanggal', 'dan', 'bukan', 'dab', 'dan',
    'ke', 'qq'
]

sw = stopword1 + more_stopwords
dictionary = ArrayDictionary(sw)
strw = StopWordRemover(dictionary)

removestop = []
for line in Wtd:
    word_token = nltk.word_tokenize(line)
    word_token = [word for word in word_token if word not in sw]
    removestop.append(" ".join(word_token))
doc_clean = removestop

kata1 = {
    "adatno": "adat",
    "admnistrasi": "administrasi",
    "ahali": "ahli",
    "agutus": "agustus",
    "asset": "aset",
    "bantenh": "banten",

import pandas
import pickle
import re

import numpy as np
from dateutil.parser import parse
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

# Indonesian stopword remover, but keep "di" and "adalah" in the text
factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
if "di" in a:
    a.remove("di")
if "adalah" in a:
    a.remove("adalah")
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)

sf = StemmerFactory()
stemmerId = sf.create_stemmer()


def date_detection(doc, fuzzy=True):
    try:
        parse(doc, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
    except:
        return False


def all_caps_detection(doc):

def api_echo():
    if request.method == 'POST':
        # create the stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        # build the stopword remover from dataset/stopword.csv
        more_stopword = []
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])
        dictionary = ArrayDictionary(more_stopword)
        stopword_remover = StopWordRemover(dictionary)

        newsTrainer = Trainer(tokenizer)
        kesehatan = []
        konsultasi = []
        marketing = []
        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({'text': line.rstrip(), 'category': 'kesehatan'})
        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({'text': line.rstrip(), 'category': 'konsultasi'})
        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({'text': line.rstrip(), 'category': 'marketing'})

        # Train the classifier by passing each text to the trainer one by one.
        newsSet = kesehatan + konsultasi + marketing
        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # With sufficient trained data the classifier can be used directly.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query'].encode("utf8")
        # query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # stem the query and remove stopwords
        out = stemmer.stem(query)
        out = stopword_remover.remove(out)
        classification = newsClassifier.classify(out)

        # `classification` holds the detected categories, sorted by score
        # return classification[0][0]
        return jsonify(classification)
