Example #1
def stemming_words(words):
    # Import the libraries used below
    import re
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

    # Build the Sastrawi stemmer and stopword remover
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()

    # Clean the text before stemming: strip URLs
    hasil1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                    words)
    # Drop non-ASCII characters such as emoji
    hasil2 = hasil1.encode('ascii', 'ignore').decode('ascii')
    # Remove hashtags
    hasil3 = ' '.join(word for word in hasil2.split(' ')
                      if not word.startswith('#'))
    # Remove @mentions
    hasil4 = ' '.join(word for word in hasil3.split(' ')
                      if not word.startswith('@'))
    # Reduce each word to its root form
    katadasar = stemmer.stem(str(hasil4))
    # Remove stopwords from the stemmed text with Sastrawi
    stop = stopword.remove(katadasar)
    # Collapse repeated whitespace
    hasil5 = " ".join(stop.split())

    # Return the cleaned, stemmed text
    return hasil5
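
A quick usage sketch for the function above; the sample tweet is invented for
illustration, and Sastrawi must be installed (e.g. pip install Sastrawi):

tweet = 'Cek https://contoh.id sekarang @teman #promo saya suka sekali produknya'
print(stemming_words(tweet))
# Prints the tweet with the URL, @mention, and #hashtag stripped, each word
# stemmed to its root, and Indonesian stopwords removed.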
Example #2
# Positive terms from the (presumably InSet) sentiment lexicon, loaded elsewhere
positif = inSetLexicon['positif']

# tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# -------------global variables-------------
# dataStatis holds the static Indonesian test sentence fed to the pipeline;
# the commented lines are alternative test inputs.
# dataStatis = 'buang air kecil karena kurang buang pikiran air, semua pergi dilakukan untuk mencari pasangan hidup, serta bersuka-sukaan.'
# dataStatis = 'aku mencium telapak kaki ayah, sangat ingin makan bawang agar sehat.'
# dataStatis = 'aku tarik tarik napas habis jalan bebas hambat, tarik napas habis, tarik membuat orak senyum.'
# dataStatis = 'Jangan terlalu sering memenggal lidah, suka mengganggu kita yang mau belajar suka mencium telapak kaki ayah.'
# dataStatis = 'menaruh hati kepadanya, menyakitkanku'
dataStatis = 'perang mulut, gatal'

# dataStatis = 'hati kecil buang air kecil, kecil hati, buang hati air'
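
# A hypothetical demonstration (not in the original snippet) of how the objects
# above combine on dataStatis; word_tokenize needs nltk's 'punkt' data.
hasil_stem = stemmer.stem(dataStatis)       # lowercase, strip punctuation, stem to roots
hasil_bersih = stopword.remove(hasil_stem)  # drop Indonesian stopwords
tokens = word_tokenize(hasil_bersih)        # split into word tokens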


# -------------import excel dataset-------------
def importExcelDataSet():
    hasil = []
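
The example is cut off at this point. A minimal sketch of how importExcelDataSet
might continue, assuming a pandas-readable file named dataset.xlsx with a
'tweet' column; the file name and column name are assumptions, not from the
original:

import pandas as pd

def importExcelDataSet():
    hasil = []
    df = pd.read_excel('dataset.xlsx')  # hypothetical file name
    for baris in df['tweet']:           # hypothetical column name
        hasil.append(str(baris))
    return hasil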
Example #3
    # Assumed module-level imports (not shown in this excerpt): os, re, json,
    # sqlite3, PyPDF2, pandas as pd, benford as bf (apparently the benford_py
    # package), string.digits, Flask's current_app/redirect/url_for, werkzeug's
    # secure_filename, and the Sastrawi stemmer/stopword factories.
    def process(path_file):
        text = ""
        # Sanitize the supplied name so it is safe to use as a filename
        filename = secure_filename(path_file)
        if filename != '':
            # Look up any cached analysis results for this file
            conn = sqlite3.connect(os.path.join(current_app.config['DB_PATH'],
                                                'auditree.db'),
                                   timeout=10)
            c = conn.cursor()
            c.execute(
                "SELECT text_raw,text_network_object,benford_object FROM corpus WHERE filename = ?",
                [filename])
            data = c.fetchall()
            if len(data) != 0:
                file_path = os.path.join(current_app.config['UPLOAD_PATH'],
                                         filename)
                if data[0][0] is None:
                    # Raw text not cached yet: extract it and store it
                    content_text = Corpus.get_text_from_pdf(file_path)
                    c.execute(
                        "UPDATE corpus SET text_raw = ? WHERE filename = ?",
                        [content_text, filename])

                if data[0][1] is None:
                    # Text network not cached yet: pull the raw text from the PDF
                    reader = PyPDF2.PdfReader(file_path)
                    for page in reader.pages:
                        text += page.extract_text()

                    # Cleaning: turn punctuation into spaces, lowercase,
                    # collapse whitespace, and strip all digits
                    text = re.sub(r"[\n:./,()]", " ", text)
                    text = text.lower()
                    text = ' '.join(text.split())
                    text = text.translate({ord(k): None for k in digits})

                    # Stemming: reduce each Indonesian word to its root form
                    factory = StemmerFactory()
                    stemmer = factory.create_stemmer()
                    text = stemmer.stem(text)

                    # Stopword removal
                    factory = StopWordRemoverFactory()
                    stopword = factory.create_stop_word_remover()
                    text = stopword.remove(text)

                    # Build the text network graph and cache it as JSON
                    graph = Corpus.process_graph(text)
                    c.execute(
                        "UPDATE corpus SET text_network_object = ? WHERE filename = ?",
                        [json.dumps(graph), filename])

                if data[0][2] is None:
                    # Benford analysis not cached yet: collect numbers from the
                    # PDF, skipping anything that looks like a year (19xx/20xx)
                    p = re.compile(r"^(19|20)\d{2}$")
                    res = []
                    reader = PyPDF2.PdfReader(file_path)
                    for page in reader.pages:
                        arr = re.findall(r"[\d.]*\d+", page.extract_text())
                        for a in arr:
                            a = str(a)
                            if p.findall(a):
                                continue
                            # Drop dot separators and cap very long digit runs
                            a = a.replace(".", "")
                            if len(a) > 17:
                                a = a[:16]
                            res.append(a)
                    # Run a first-digit Benford's-law test at 95% confidence
                    df = pd.DataFrame(list(res), columns=['nilai'])
                    fld = bf.first_digits(data=df['nilai'].astype(float),
                                          digs=1,
                                          decimals=8,
                                          confidence=95)
                    result_json = fld.to_json(orient="split")
                    c.execute(
                        "UPDATE corpus SET benford_object = ? WHERE filename = ?",
                        [result_json, filename])
            conn.commit()
            conn.close()
        return redirect(
            url_for('home_blueprint.index_filename', filename=filename))
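
To exercise the Benford step on its own, a minimal sketch mirroring the
first_digits call above, using the benford_py package (import benford as bf)
on invented numbers; a real analysis needs a much larger sample:

import benford as bf
import pandas as pd

# Synthetic figures purely for illustration
df = pd.DataFrame({'nilai': ['123', '456', '1890', '23', '77', '3120']})
fld = bf.first_digits(data=df['nilai'].astype(float), digs=1,
                      decimals=8, confidence=95)
print(fld.to_json(orient="split"))

Depending on the benford_py version, first_digits also prints a summary table
and may display a plot of the observed digit distribution.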