def _processTweet(self, tweet):
    tweet = tweet.lower()  # convert text to lower-case
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  # remove URLs
    tweet = re.sub(r'@[^\s]+', '', tweet)  # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
    tweet = "".join(char for char in tweet if char not in string.punctuation)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet).strip()  # collapse whitespace
    tweet = re.sub(r'\d', '', tweet)  # remove digits

    # Take the built-in Sastrawi stopwords and merge in the extra ones from stopword.txt
    stop_factory = StopWordRemoverFactory().get_stop_words()
    with open("stopword.txt", "r") as f:
        more_stopword = f.read().split()
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    stopword_remover = StopWordRemover(dictionary)

    # Stemming factory
    factory1 = StemmerFactory()
    stemmer = factory1.create_stemmer()  # create the stemmer

    # tweet = stopword_remover.remove(tweet)
    # tweet = stemmer.stem(tweet)  # stem the tweet
    tweet = word_tokenize(tweet)  # tokenize into words
    # return [word for word in tweet if word not in self._stopwords]
    return tweet

def __filtering_sastrawi(self, documents):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    list_stop = stop_factory + self.stop_more
    dictionary = ArrayDictionary(list_stop)
    stopwords = StopWordRemover(dictionary)
    stop = stopwords.remove(documents)
    return stop

def __init__(self):
    with open('./stopwords.txt') as f:
        more_stopword = f.read().split('\n')
    SWfactory = StopWordRemoverFactory()
    stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
    self.stopword = StopWordRemover(stopword_data)

def stop_stem_remover(kalimat):
    """
    Remove stopwords and apply stemming.
    input  : kalimat : a sentence from the corpus
    return : the cleaned sentence
    """
    # drop words that carry little meaning
    # factory = StopWordRemoverFactory()
    stop_factory = StopWordRemoverFactory().get_stop_words()
    add_stop_word = ['dkk', 'et', 'al', 'all']  # manually added stopwords
    stop = stop_factory + add_stop_word
    dicts = ArrayDictionary(stop)
    all_stop = StopWordRemover(dicts)
    kalimat = all_stop.remove(kalimat)

    # stemming (reduce words to their root form)
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()
    kalimat = stemmer.stem(kalimat)
    return kalimat

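# Usage sketch for stop_stem_remover (the sample sentence is illustrative; the exact
# output depends on the installed Sastrawi stopword and stemming dictionaries):
contoh_kalimat = "Penelitian tersebut dilakukan oleh Budi dkk pada tahun lalu"
print(stop_stem_remover(contoh_kalimat))  # stopwords dropped, remaining words stemmed
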
def removeStopWord(query):
    factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['!', '.', ',', '?']
    data = factory + more_stopword
    dic = ArrayDictionary(data)
    stopword = StopWordRemover(dic)
    return stopword.remove(query)

def Stopword(doc):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['ini', 'itu', 'the']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    data_str = StopWordRemover(dictionary)
    dokumen = data_str.remove(doc)
    return dokumen

def stopword(self):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    self.stopword = StopWordRemover(dictionary)

def remove_stopwords_id(kalimat):
    # take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['daring', 'online', 'nih']
    # merge the stopword lists
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)
    tokens = nltk.tokenize.word_tokenize(remover.remove(kalimat))
    return " ".join(tokens)

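# Usage sketch for remove_stopwords_id; assumes NLTK's 'punkt' tokenizer data is
# available (nltk.download('punkt')). The sample sentence is illustrative.
print(remove_stopwords_id("kuliah daring nih ternyata cukup melelahkan"))
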
def __stopward_removal(self, tokens):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['dong', 'atuh', 'plis']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    str_remove = StopWordRemover(dictionary)
    tokens = word_tokenize(str_remove.remove(' '.join(tokens)))
    return tokens

def stopword(text):
    # Take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    print(stop_factory)
    more_stopword = ['diatur', 'perjodohan']
    # Merge the stopword lists
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)
    hasil = remover.remove(text)
    # print(hasil)
    return hasil

def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()
    stem = StemmerFactory()
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower()  # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str)  # eliminate usernames, URLs, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str)  # remove &amp; (the HTML-escaped ampersand)
    clean_str = re.sub(r'[^\w\s]', ' ', clean_str)  # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str)  # collapse extra whitespace
    clean_str = clean_str.strip()  # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()])  # stem each word
    clean_str = stopword.remove(clean_str)  # remove stopwords
    return clean_str

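# Usage sketch for pre_processing; assumes stopwordbahasa.csv (one stopword per line)
# sits next to the script. The sample tweet and URL are made up.
contoh_tweet = "@user Selamat pagi! Cek info lengkapnya di https://t.co/xyz #promo"
print(pre_processing(contoh_tweet))  # mention, URL, hashtag and punctuation removed, rest stemmed
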
def preprocess_sentence(self, q=""):
    # tokenize, lowercase, remove stopwords, stem
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(dictionary)

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    res = " ".join(tokenizer.tokenize(q))
    res = res.lower()
    res = stopword.remove(res)
    res = stemmer.stem(res)
    return res

def generateStopWords(pat, txt):
    # Take the built-in Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopwords = [' ?', '?', ' .', '.', ' ,', ',']
    # Merge the stopword lists
    data = stop_factory + more_stopwords
    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)

    # If stopword removal leaves an empty string, fall back to the original value
    temppat = remover.remove(pat)
    if temppat == '' or temppat is None:
        temppat = pat
    temptxt = remover.remove(txt)
    if temptxt == '' or temptxt is None:
        temptxt = txt
    return temppat, temptxt

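# Usage sketch for generateStopWords: both strings are cleaned, and whenever stopword
# removal would leave an empty result the original string is kept instead. The
# sample arguments are illustrative.
pat_bersih, txt_bersih = generateStopWords("apa itu ?", "jadwal kuliah hari senin")
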
def remove_stopwords(self, csv_src="", csv_dest="", cols_to_clean=["KOMPETENSI"], sep=";"):
    # factory = StopWordRemoverFactory()
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(dictionary)  # factory.create_stop_word_remover(dictionary=dictionary)
    tokenizer = RegexpTokenizer(r'\w+')

    df = pd.read_csv(csv_src, sep=sep)
    for c in cols_to_clean:
        df[c] = df[c].map(lambda x: " ".join(tokenizer.tokenize(x)))  # keep only word tokens, drop symbols
        df[c] = df[c].map(lambda x: stopword.remove(x))  # remove stopwords
    df.to_csv(csv_dest, sep=sep, index=None)
    print("lower %d rows" % len(df))

exit()

start = time.time()
os.system('cls')

# create the stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# create the stopword remover
stop_factory = StopWordRemoverFactory().get_stop_words()
more_stopword = ['halaman', 'kompas', 'com', 'all', '-']
# merge the stopword lists
stop_factory += more_stopword
dictionary = ArrayDictionary(stop_factory)
stopword = StopWordRemover(dictionary)

for i in range(total_documents):
    print(i, "/", total_documents, "documents cleaned")
    print("Cleaning...")
    try:
        with open("download/" + site + "/scrapped/" + site + "-" + str(i + 1) + "-bersih.html",
                  'r', encoding="utf8") as f:
            soup = bs(f, 'html.parser')
            url = soup.url.text
            title = soup.title.text
            top = soup.top.text

sentimen_count = df['sentiment'].value_counts()
sentimen_count

words_positif = ' '.join(df_positif['tweet_bersih'])
words_negatif = ' '.join(df_negatif['tweet_bersih'])
words_netral = ' '.join(df_netral['tweet_bersih'])

# MORE STOPWORDS
stop_factory = StopWordRemoverFactory().get_stop_words()
more_stopword = StopwordsID.more_stopword
# merge the stopword lists
data = stop_factory + more_stopword
dictionary = ArrayDictionary(data)
StopWordRemover(dictionary)
stopwords = data

mask = np.array(Image.open("shape.png"))

########################################################################################################
while True:
    choice = displayMenu(menuItems)
    if choice == 1:
        print(df)
    elif choice == 2:

def __init__(self, tweet):
    self.tweet = tweet
    stop_factory = StopWordRemoverFactory().get_stop_words()
    stop_factory = stop_factory + self.additional_stopwords
    dictionary = ArrayDictionary(stop_factory)
    self.strword = StopWordRemover(dictionary)

def createStopword(more_stopword=[]):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    new_stop_word = stop_factory + more_stopword
    dictionary = ArrayDictionary(new_stop_word)
    stopword = StopWordRemover(dictionary)
    return stopword

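# Usage sketch for createStopword (the extra stopwords here are illustrative):
remover = createStopword(['nih', 'dong'])
print(remover.remove("materi kuliah nih sudah dibagikan dong"))
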
import re

import Sastrawi as sts
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

stop_word = StopWordRemoverFactory().get_stop_words()
more_stopword = [
    'yg', 'ajah', 'iya', 'mba', 'mas', 'kak', 'pak', 'pahi', 'mah', 'muehehe',
    'men', 'kehfine', 'alhamdulilah', 'alhamdulillah', 'nih', 'om', 'selamat',
    'sama', 'sabar', 'gak', 'yak', 'semoga', 'bu', 'adik', 'omen', 'tumben',
    'tp', 'sy', 'kmu', 'jg', 'kyk', 'dll'
]
d_sword = stop_word + more_stopword
dictionary = ArrayDictionary(d_sword)
swr = StopWordRemover(dictionary)

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = '(RT)'
combined_pat = r'|'.join((pat1, pat2, pat3))

df_t = df['text']

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8").replace(u"\ufffd", "?")

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex

# factory = StopWordRemoverFactory()
# Start from the built-in stopword list and adjust it for this task
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Month-name regexes
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regexes for matching a complete date
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'

ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
tool = victorinox()

population1_dict = {}
population2_dict = {}
population_root_path = r"corpus/population"
population_files = glob(os.path.join(population_root_path, "**/*.txt"), recursive=True)

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()

default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)

en_stopword = set(stopwords.words('english'))
en_stemmer = PorterStemmer()


def remove_numbers(text):
    # the [a-zA-Z]+ tokenizer keeps only alphabetic tokens, so digits are dropped
    words = tokenizer.tokenize(text)
    return " ".join(words)


def remove_punctuation(text):
    words = text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    return " ".join(stripped)

    'melalui', 'tentang', 'februari', 'dilakukan', 'pusat', 'selatan', 'atas', 'data',
    'lp', 'dalam', 'juni', 'adanya', 'mengenai', 'jkt', 'atau', 'jawaban', 'tinggi',
    'telah', 'maret', 'bapak', 'oktober', 'januari', 'juli', 'mei', 'september', 'xi',
    'agung', 'ada', 'dengan', 'kedua', 'di', 'selatan', 'nama', 'ada', 'terkait',
    'tentang', 'yang', 'nomor', 'tidak', 'dengan', 'terhadap', 'sept', 'november',
    'nov', 'dalam', 'atau', 'bapak', 'nama', 'kami', 'ada', 'melalui',
    'assalamualaikum', 'wr', 'wb', 'jp', 'lp', 'md', 'mh', 'melakukuan', 'sbg',
    'selasa', 'oleh', 'segera', 'tahun', 'melakukan', 'oleh', 'agustus', 'atau',
    'dki', 'kab', 'belum', 'untuk', 'adanya', 'kecamatan', 'yang', 'yg',
    'memberikan', 'mengenai', 'ayat', 'tanggal', 'dan', 'bukan', 'dab', 'dan',
    'ke', 'qq'
]

sw = stopword1 + more_stopwords
dictionary = ArrayDictionary(sw)
strw = StopWordRemover(dictionary)

removestop = []
for line in Wtd:
    word_token = nltk.word_tokenize(line)
    word_token = [word for word in word_token if word not in sw]
    removestop.append(" ".join(word_token))
doc_clean = removestop

kata1 = {
    "adatno": "adat",
    "admnistrasi": "administrasi",
    "ahali": "ahli",
    "agutus": "agustus",
    "asset": "aset",
    "bantenh": "banten",

import pandas
import pickle
import re

import numpy as np
from dateutil.parser import parse
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

# Indonesian stopword remover, but keep "di" and "adalah" in the text
factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
if "di" in a:
    a.remove("di")
if "adalah" in a:
    a.remove("adalah")
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)

sf = StemmerFactory()
stemmerId = sf.create_stemmer()


def date_detection(doc, fuzzy=True):
    try:
        parse(doc, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
    except:
        return False


def all_caps_detection(doc):

def api_echo():
    if request.method == 'POST':
        # create the stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        # build the stopword remover from dataset/stopword.csv
        more_stopword = []
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])
        dictionary = ArrayDictionary(more_stopword)
        stopword_remover = StopWordRemover(dictionary)

        newsTrainer = Trainer(tokenizer)
        kesehatan = []
        konsultasi = []
        marketing = []
        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({'text': line.rstrip(), 'category': 'kesehatan'})
        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({'text': line.rstrip(), 'category': 'konsultasi'})
        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({'text': line.rstrip(), 'category': 'marketing'})

        # Train the classifier by passing each text to the trainer one by one.
        newsSet = kesehatan + konsultasi + marketing
        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # With sufficient trained data the classifier can be used directly.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query'].encode("utf8")
        # query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # stem the query and remove stopwords
        out = stemmer.stem(query)
        out = stopword_remover.remove(out)
        classification = newsClassifier.classify(out)

        # `classification` holds the detected categories, sorted by score
        # return classification[0][0]
        return jsonify(classification)
