# Lazily-built (stemmer, stop-word remover) pair shared by all calls.
_STEMMING_TOOLS = None


def _get_stemming_tools():
    """Return the shared Sastrawi (stemmer, stop-word remover) pair.

    The pair is created on first use and then reused, so the Sastrawi
    factories are not re-run on every call to ``stemming_words``.
    """
    global _STEMMING_TOOLS
    if _STEMMING_TOOLS is None:
        # Imported here so Sastrawi is only required once stemming is used.
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        from Sastrawi.StopWordRemover.StopWordRemoverFactory import (
            StopWordRemoverFactory,
        )

        # Single tuple assignment so a concurrent reader never observes a
        # half-initialised pair.
        _STEMMING_TOOLS = (
            StemmerFactory().create_stemmer(),
            StopWordRemoverFactory().create_stop_word_remover(),
        )
    return _STEMMING_TOOLS


def stemming_words(words):
    """Clean a piece of text and return its stemmed, stop-word-free form.

    Steps, in order:

    1. strip URLs (``scheme://host/path``),
    2. drop every non-ASCII character,
    3. drop tokens that start with ``#`` or ``@``,
    4. stem the remainder with the Sastrawi stemmer,
    5. remove stop words with the Sastrawi stop-word remover,
    6. collapse runs of whitespace into single spaces.

    Args:
        words: The raw text to process (a ``str``).

    Returns:
        The processed text as a single space-separated string.
    """
    stemmer, stopword = _get_stemming_tools()

    without_urls = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', words)
    ascii_only = without_urls.encode('ascii', 'ignore').decode('ascii')

    # Split on any whitespace (not only ' ') so that a hashtag or mention
    # following a newline or tab is recognised and dropped as well.
    kept = ' '.join(
        token for token in ascii_only.split()
        if not token.startswith(('#', '@'))
    )

    # Stem first, then remove stop words from the stemmed text.
    stemmed = stemmer.stem(kept)
    without_stopwords = stopword.remove(stemmed)

    # Return the result with whitespace normalised.
    return ' '.join(without_stopwords.split())
positif = inSetLexicon['positif'] # tokenization from nltk.tokenize import sent_tokenize, word_tokenize # stemming from Sastrawi.Stemmer.StemmerFactory import StemmerFactory factory = StemmerFactory() stemmer = factory.create_stemmer() #stopword removal from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory factory = StopWordRemoverFactory() stopword = factory.create_stop_word_remover() # -------------global variable------------- # dataStatis = 'buang air kecil karena kurang buang pikiran air, semua pergi dilakukan untuk mencari pasangan hidup, serta bersuka-sukaan.' # dataStatis = 'aku mencium telapak kaki ayah, sangat ingin makan bawang agar sehat.' # dataStatis = 'aku tarik tarik napas habis jalan bebas hambat, tarik napas habis, tarik membuat orak senyum.' # dataStatis = 'Jangan terlalu sering memenggal lidah, suka mengganggu kita yang mau belajar suka mencium telapak kaki ayah.' # dataStatis = 'menaruh hati kepadanya, menyakitkanku' dataStatis = 'perang mulut, gatal' # dataStatis = 'hati kecil buang air kecil, kecil hati, buang hati air' # -------------import excel dataset------------- def importExcelDataSet(): hasil = []
def _prepare_network_text(raw_text):
    """Normalise raw PDF text into the stemmed form fed to the text network.

    Newlines and the characters ``: . / , ( )`` become spaces, the text is
    lower-cased, whitespace is collapsed, digits are deleted, and the result
    is stemmed and stripped of stop words.
    """
    to_space = str.maketrans({ch: ' ' for ch in "\n:./,()"})
    text = raw_text.translate(to_space).lower()
    text = ' '.join(text.split())
    # Digits are removed after whitespace collapsing, as in the original flow.
    text = text.translate({ord(k): None for k in digits})

    text = StemmerFactory().create_stemmer().stem(text)
    return StopWordRemoverFactory().create_stop_word_remover().remove(text)


def _collect_benford_values(page_texts):
    """Collect the digit strings found in the given page texts.

    Every match of ``[\\d.]*\\d+`` is taken; bare four-digit numbers starting
    with 19 or 20 are skipped, and dots are stripped from the rest.
    """
    year_like = re.compile(r"^(19|20)\d{2}$")
    values = []
    for page_text in page_texts:
        for number in re.findall(r"[\d.]*\d+", page_text):
            if year_like.match(number):
                continue
            number = number.replace(".", "")
            # NOTE(review): strings longer than 17 characters are cut to 16,
            # while exactly 17 are kept whole — preserved as-is; confirm
            # whether the two limits were meant to match.
            if len(number) > 17:
                number = number[:16]
            values.append(number)
    return values


def process(path_file):
    """Fill in the missing derived columns for an uploaded file, then redirect.

    Looks up the ``corpus`` row for the sanitised file name and, for each of
    ``text_raw``, ``text_network_object`` and ``benford_object`` that is still
    NULL, computes the value from the PDF in ``UPLOAD_PATH`` and stores it.
    All updates are committed together; if any step raises, nothing is
    committed and the connection is still closed.

    Args:
        path_file: File name as supplied by the caller; it is passed through
            ``secure_filename`` before use.

    Returns:
        A redirect to ``home_blueprint.index_filename`` for the sanitised
        file name.
    """
    filename = secure_filename(path_file)
    if filename:
        conn = sqlite3.connect(
            os.path.join(current_app.config['DB_PATH'], 'auditree.db'),
            timeout=10)
        try:
            c = conn.cursor()
            c.execute(
                "SELECT text_raw,text_network_object,benford_object FROM corpus WHERE filename = ?",
                [filename])
            row = c.fetchone()
            if row is not None:
                text_raw, text_network, benford = row
                file_path = os.path.join(
                    current_app.config['UPLOAD_PATH'], filename)

                if text_raw is None:
                    # Raw text content.
                    content_text = Corpus.get_text_from_pdf(file_path)
                    c.execute(
                        "UPDATE corpus SET text_raw = ? WHERE filename = ?",
                        [content_text, filename])

                if text_network is None:
                    # Text network graph built from the cleaned, stemmed text.
                    # NOTE(review): PdfFileReader/extractText are the legacy
                    # PyPDF2 names — kept to match the installed version.
                    reader = PyPDF2.PdfFileReader(file_path)
                    raw_text = ''.join(
                        page.extractText() for page in reader.pages)
                    graph = Corpus.process_graph(
                        _prepare_network_text(raw_text))
                    c.execute(
                        "UPDATE corpus SET text_network_object = ? WHERE filename = ?",
                        [json.dumps(graph), filename])

                if benford is None:
                    # First-digit analysis over the numbers found in the PDF.
                    reader = PyPDF2.PdfFileReader(file_path)
                    values = _collect_benford_values(
                        page.extractText() for page in reader.pages)
                    df = pd.DataFrame(values, columns=['nilai'])
                    # Builtin float instead of np.float, which NumPy 1.24
                    # removed; the resulting dtype is the same float64.
                    fld = bf.first_digits(
                        data=df['nilai'].astype(float),
                        digs=1, decimals=8, confidence=95)
                    result_json = fld.to_json(orient="split")
                    c.execute(
                        "UPDATE corpus SET benford_object = ? WHERE filename = ?",
                        [result_json, filename])

                conn.commit()
        finally:
            # Always release the connection, even when a PDF/analysis step
            # raises; uncommitted updates are discarded in that case.
            conn.close()

    return redirect(
        url_for('home_blueprint.index_filename', filename=filename))