Example #1
    def normalisasi(self, dokumen):
        store = []
        for artikel in dokumen:
            split = artikel.split()
            split.sort()
            store.append(split)

        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        store_n = []
        for x, sentence in enumerate(store):
            store_n.append([])
            for word in sentence:
                plus = stopword.remove(word)
                has = self.hasNumbers(plus)

                # keep only clean tokens of 3-13 characters, with no digits or hyphens
                if plus != '' and \
                        not has and \
                        '-' not in plus and \
                        3 <= len(plus) <= 13:
                    store_n[x].append(plus)

        # rebuild the lists instead of deleting entries while iterating,
        # which would skip elements
        store_n = [[kata for kata in kalimat if kata != ''] for kalimat in store_n]
        return store_n
Example #2
    def stopwords_removal(self, text, stopwords, output_stopwords):
        with open(dataOutputPath, encoding='utf-8') as f:
            text = f.read()

        with open(stopwords, encoding='utf-8') as f:
            list_stopwords = f.read()

        stop_factory = StopWordRemoverFactory()
        more_stopwords = list_stopwords.split("\n")

        # add the new stopwords to Sastrawi's defaults and build a remover that uses them
        data = stop_factory.get_stop_words() + more_stopwords
        dictionary = ArrayDictionary(data)
        stopword = StopWordRemover(dictionary)
        remove_stopwords = stopword.remove(text)

        with open(pathStopwords, 'w', encoding='utf-8') as f:
            f.write(remove_stopwords)

        print(
            "Stopwords Removal success!\nCount Words Frequency on process...")

        return remove_stopwords
Example #3
    def transform(self, X):
        # build the stopword remover and stemmer once, outside the loop
        stop_factory = StopWordRemoverFactory()
        stopword = stop_factory.create_stop_word_remover()
        stem_factory = StemmerFactory()
        stemmer = stem_factory.create_stemmer()

        tweet_final = []
        for tweet in X:
            tweet = tweet[0]
            tweet = tweet.lower()
            regex = re.compile(r'\shttp.+\s')
            tweet = regex.sub('', tweet)
            regex = re.compile(r'\shttps.+\s')
            tweet = regex.sub('', tweet)
            regex = re.compile(r'\spic.+\s')
            tweet = regex.sub('', tweet)
            regex = re.compile(r'\sftp.+\s')
            tweet = regex.sub('', tweet)
            regex = re.compile('[^a-zA-Z0-9]')
            tweet = regex.sub(' ', tweet)
            regex = re.compile('[0-9]+')
            tweet = regex.sub('', tweet)
            regex = re.compile(r'\W*\b\w{1,3}\b')
            tweet = regex.sub('', tweet)
            regex = re.compile(r'rt\s')
            tweet = regex.sub(' ', tweet)

            # remove stopwords
            tweet = stopword.remove(tweet)

            # stemming
            tweet = stemmer.stem(tweet)
            tweet_final.append(tweet)

        tweet_final = np.array(tweet_final)
        return tweet_final
Example #4
    def clean_data(self, path):
        path = str(path)
        bye = open('ujaranbaru.txt', 'w+')
        # create the stopword remover and stemmer once, outside the loop
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        stem_factory = StemmerFactory()
        stemmer = stem_factory.create_stemmer()
        with open(path, 'r') as readFile:
            reader = csv.reader(readFile)
            lines = list(reader)
            for i in range(len(lines)):
                str1 = ''.join(lines[i])
                caseF = str1.casefold()
                Runame = re.sub(r'@[^\s]+', '', caseF)
                Rhashtag = re.sub(r'#[^\s]+', '', Runame)
                CleanNumber = ''.join([ch for ch in Rhashtag if not ch.isdigit()])
                line = re.sub(r'[(),\'.!$]', '', CleanNumber)
                link = re.sub(r'https[^\s]+', '', line)
                garing = re.sub(r'\\[^\s]+', '', link)
                removeRT = garing.replace("rt", "")
                removespace = removeRT.lstrip()
                stopw = stopword.remove(removespace)
                steam = stemmer.stem(stopw)
                text = steam.split()
                if len(text) >= 5:
                    bye.write(steam + '\n')
                self.progressBar_6.setValue(int((i + 1) / len(lines) * 100))
        bye.close()
Example #5
def preprocessing_text(text):

    encoded_string = text.encode("ascii", "ignore")  # drop non-ASCII characters
    text = encoded_string.decode()

    text = re.sub(r'http\S+', '', text)  #remove url

    text = text.lower()  #lowercase

    text = ''.join([i for i in text if not i.isdigit()])  #remove number

    #text = ''.join([i for i in text if i not in text.punctuation])
    #text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text) #remove punctuation

    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*“”‘’_~+=|\t\n'''

    for char in text:
        if char in punctuations:
            text = text.replace(char, "")

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)

    return text
Example #6
def predict():
    '''
    For rendering results on HTML GUI
    '''
    model = pickle.load(open('model_news.pkl', 'rb'))
    cv_ = pickle.load(open('news_cv.pkl', 'rb'))
    enc_ = pickle.load(open('enc_news.pkl', 'rb'))
    y_for_test = request.form['news_']
    y_for_test = pd.Series(y_for_test)
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    xy = []
    for i in y_for_test.values:
        stops_ = stopword.remove(stemmer.stem(i))
        wordy_ = ''
        for st in stops_.split(' '):
            if st.isalpha():
                wordy_ += st + ' '
        xy.append(wordy_)
    x_t = cv_.transform(xy)
    resu = model.predict(x_t)
    print('prediction:')
    s = [str(i) for i in list(enc_.inverse_transform(resu))]
    res = ", ".join(s)
    return render_template(
        'index.html',
        prediction_text='Topiknya adalah {}. Ya kan?'.format(res))
Example #7
def blogging():
    # Insert to Blog DB
    _title = request.form["title"]
    _content = request.form["content"]
    _link = request.form["link"]
    _date = request.form["date"]
    # Normalizing
    _title = _title.replace("'", "\"")
    _content2 = _content.replace("'", "")
    _content = _content.replace("'", "")

    # ***PRE-PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _content2 = stopword.remove(_content2)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _content2 = stemmer.stem(_content2)

    BlogRepository().insert(Blog(_title, _content, _link, _content2, _date))

    # Insert to Tf DB
    return render_template('index.html')
Example #8
def preprocess(text):
	token_list = []
	vectorizer = TfidfVectorizer()
	factory = StemmerFactory()
	stop_factory = StopWordRemoverFactory()
	ps = factory.create_stemmer()
	stopword = stop_factory.create_stop_word_remover()
	pattern = r'[0-9]'
	text = re.sub(pattern, '', text)
	# expand common Indonesian shorthand into full words
	shorthand = [" no ", " sy ", " dg ", " kk ", " ga ", " sdh ", " bgn ", " klw ", " bbrp ", " kbr ",
	             " dri ", " dr ", " jgn ", " yg ", " tdk ", " jt ", " gk ", " atw ", " klu ", " tsb ",
	             " utk ", " tlh "]
	replacement = [" nomor ", " saya ", " dengan ", " kakak ", " tidak ", " sudah ", " bangun ", " kalau ", " beberapa ", " kabar ",
	               " dari ", " dari ", " jangan ", " yang ", " tidak ", " juta ", " tidak ", " atau ", " kalau ", " tersebut ",
	               " untuk ", " telah "]
	for i in range(len(shorthand)):
		# re.sub(shorthand[i],replacement[i],text)
		text = text.replace(shorthand[i],replacement[i])
	text = stopword.remove(text)
	text = re.sub(r'\b\w{1,2}\b', '', text)


	processed_text = ' '.join(ps.stem(token) for token in word_tokenize(text))
	token_list.append(processed_text)
	x = vectorizer.fit_transform(token_list)

	return processed_text
Example #9
    def __init__(self):
        with open('./stopwords.txt') as f:
            more_stopword = f.read().split('\n')

        # combine the custom stopwords with Sastrawi's defaults
        SWfactory = StopWordRemoverFactory()
        stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
        self.stopword = StopWordRemover(stopword_data)
Example #10
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwordsFact = factory_stopwords.get_stop_words()
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)  # remove http links
        removedPic = re.sub(r"pic.twitter\S+", '',
                            removedHttp)  # remove pic.twitter links
        lower = removedPic.lower()  # case folding
        tokenized = tokenizer.tokenize(lower)  # tokenization + punctuation removal
        stopwords = []  #Stopwords removal
        for kata in tokenized:
            if kata not in stopwordsFact:
                stopwords.append(kata)
        stemmed = []  # stemming
        for kata in stopwords:
            stemmed.append(stemmer.stem(kata))
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
Example #11
def pre_processing(doc):  # NLP Text Pre-processing
    # Regular Expression - strips digits and punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    texts = []

    # Stemmer - reduces inflected/affixed words to their base form
    stFactory = StemmerFactory()
    st = stFactory.create_stemmer()

    # Stopword - removes common words that carry no meaning
    swFactory = StopWordRemoverFactory()
    sw = swFactory.create_stop_word_remover()

    for i in doc:
        raw = i.lower()  # lowercase the text
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [t for t in tokens if sw.remove(t) != '']  # drop stopwords
        stemmed_tokens = [st.stem(t) for t in stopped_tokens]

        texts.append(stemmed_tokens)

    # Detokenize
    detok = []
    for row in texts:
        sq = ''
        for word in row:
            sq = sq + ' ' + word
        detok.append(sq)

    texts = detok
    return texts[0]
Example #12
def modif(kalimat):
    # nltk.download('punkt')
    pat = ""

    kalimat = kalimat.translate(str.maketrans('', '',
                                              string.punctuation)).lower()
    # case folding & removing punctuation marks

    tokens = nltk.tokenize.word_tokenize(kalimat)  # tokenization

    fac = StopWordRemoverFactory()  # set up the stopword list
    stop = fac.get_stop_words()
    stop.append("kak")  # add "kak" to the stopword list

    stop.remove("tidak")  # keep "tidak" by dropping it from the stopword list
    stop.remove("boleh")  # keep "boleh"
    stop.remove("bisa")  # keep "bisa"
    stop.remove("dimana")
    removed = []
    for t in tokens:
        if t not in stop:
            removed.append(t)  #stopword removal

    pat = ""

    for w in removed:
        pat += w + " "

    return (pat)
Example #13
class Test_StopWordRemoverFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StopWordRemoverFactory()
        return super(Test_StopWordRemoverFactoryTest, self).setUp()

    def test_createStopWordRemover(self):
        self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover)
    
    def test_stopwordRemoval(self):
        sremover = self.factory.create_stop_word_remover()
        self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang'))
        self.assertEqual('makan rumah', sremover.remove('makan di rumah yang'))
    
    def test_tokens_stopwordRemoval(self):
        tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian']
        sremover = self.factory.create_stop_word_remover()
        clean_tokens = sremover.remove_tokens(tokens)
        text = ' '.join(clean_tokens)
        self.assertEqual('pergi sekolah bagus impian', text)
        self.assertEqual('pergi', clean_tokens[0])
        self.assertEqual('sekolah', clean_tokens[1])
        self.assertEqual('bagus', clean_tokens[2])
        self.assertEqual('impian', clean_tokens[3])

    def test_execution_time(self):
        start = time.time()
        sentence  = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.'
        sremover = self.factory.create_stop_word_remover()
        sremover.remove(sentence)
        end = time.time()
        execution_time = end - start
        # print(execution_time)

        self.assertTrue(execution_time < 1)
Example #14
def preprocessing_text(text):

    encoded_string = text.encode("ascii", "ignore")  # drop non-ASCII characters
    text = encoded_string.decode()

    text = text.lower()  #lowercase

    text = ''.join([i for i in text if not i.isdigit()])  #remove number

    #text = ''.join([i for i in text if i not in text.punctuation])

    text = re.sub(r'http\S+', '', text)  #remove url

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)

    text = re.sub("[^\w\s]", '', text)  #remove punctuation
    text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)  #remove punctuation

    return text
Example #15
    def run(self):
        i = 1
        status = True
        while status:
            df = pd.read_csv('data.csv', sep=",")

            data = list(df["indonesia"].astype(str).str.lower())
            kd = []
            i = 1

            # build the stopword remover and stemmer once, before the loop
            factory = StopWordRemoverFactory()
            stopword = factory.create_stop_word_remover()
            stem_factory = StemmerFactory()
            stemmer = stem_factory.create_stemmer()

            for d in data:
                # Tokenize after stopword removal
                stop = nltk.tokenize.word_tokenize(stopword.remove(str(d)))

                # Stem the joined tokens (stemming str(stop) would include brackets and quotes)
                katadasar = stemmer.stem(' '.join(stop))

                kd.append(katadasar)
                self.update_progressbar.emit(i + 1)
                i = i + 1

            with open('post-preprocessing.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
                spamwriter = csv.writer(csvfile)
                spamwriter.writerow(["teks"])
                for d in kd:
                    spamwriter.writerow([d])

            status = False
Example #16
def preproses(request):
    baca_db = CrawlDetikNews.objects.all()
    kounter = 0
    for baca in baca_db:
        kounter += 1
        if kounter > 497 and kounter <= 500:
            # create stemmer
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            # stemming process
            sentence = baca.headline + " " + baca.content
            output = stemmer.stem(sentence)
            baca.stemming = output

            # ------------------- Stopword Removal
            fa = StopWordRemoverFactory()
            stopword = fa.create_stop_word_remover()
            kalimat = output
            stop = stopword.remove(kalimat)
            stop = stop.replace(' - ', ' ')
            output = stop
            baca.stopword = output

            baca.save()

    return render(request, 'beranda/preprocessing.html', {
        "rootword": output,
        "ori": sentence
    })
Example #17
def search():
	if request.method == 'GET':
		data = json.load(open('data/testing_data.json', encoding="utf-8"))
		algo = request.args.get('algo', '1')
		query1 = request.args.get('q1', '')
		query2 = request.args.get('q2', '')
		query3 = request.args.get('q3', '')
		max_response = request.args.get("max_resp", 10)
		start = time.time()
		print("timer start")

		queries = query1 + " " + query2 + " " + query3
		# remove stopwords
		factory = StopWordRemoverFactory()
		stopword = factory.create_stop_word_remover()
		queries = stopword.remove(queries)

		#stemming
		factory = StemmerFactory()
		stemmer = factory.create_stemmer()
		queries = stemmer.stem(queries)

		if algo == '1':
			response = jsonify(tfidf(data, queries, max_response))
		else:
			response = jsonify(lsa(data, queries, max_response))
		end = time.time()
		print("timer stop. Runtime: ")
		print(end - start)
		return response
Example #18
    def stopword(self):
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        filtering = []
        for artikel in self.dokumen:
            plus = stopword.remove(artikel.isi)
            filtering.append(plus)
        return filtering
Example #19
    def remove_stop_word(self, X):
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        for i in range(X.shape[0]):
            X[i] = stopword.remove(X[i])
            if len(X[i]) == 0:
                X[i] = ''
        return X
Example #20
    def stopword(self):
        # extend Sastrawi's default stopword list with custom words
        stop_factory = StopWordRemoverFactory()
        more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
        data = stop_factory.get_stop_words() + more_stopword

        dictionary = ArrayDictionary(data)
        self.stopword = StopWordRemover(dictionary)
Example #21
    def StopwordRemoval(self, text):
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()

        text = ' '.join(text)
        removed = stopword.remove(text)

        return removed.split(' ')
Example #22
    def __init__(self, title: str, plot: str, human_synopsis: str):
        self.title = title
        self.plot = plot
        self.human_synopsis = human_synopsis
        self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
        self.stemmer = StemmerFactory().create_stemmer()

        self.ringkasan = ""
Example #23
    def remove_stopword(self):
        # remove stopwords from every word in the sentence, except the target word
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        self.__sentence = [
            stopword.remove(word) if word != self.word else word
            for word in self.__sentence
        ]
Example #24
def generate_sastrawi_stopwords():
    # get Sastrawi stopwords as list
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()

    # write to txt file
    with open(stopwords_list_path + '/sastrawi-stopwords.txt', 'w') as file:
        for word in stopwords:
            file.write(word + "\n")
Example #25
def word_tokenizer(text):
    # tokenizes and stems the text, skipping stopwords
    tokens = word_tokenize(text)
    fac2 = StemmerFactory()
    stemmer = fac2.create_stemmer()
    factory = StopWordRemoverFactory()
    stop_words = factory.get_stop_words()  # fetch the stopword list once, not per token
    tokens = [stemmer.stem(t) for t in tokens if t not in stop_words]

    return tokens
Example #26
def main():
    usrinp = int(input("1. Get data, 2. Crime Word Cloud, :"))
    if usrinp == 1:
        # Twitter API credentials redacted; supply your own keys here
        consumer_key = 'CONSUMER_KEY'
        consumer_secret = 'CONSUMER_SECRET'
        access_token = 'ACCESS_TOKEN'
        access_secret = 'ACCESS_SECRET'

        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)

        api = tweepy.API(auth)

        portal = [
            'detikcom',
            'kompascom',
            'CNNIndonesia',
            'Metro_TV',
            'Beritasatu',
            'liputan6dotcom',
            'SINDOnews',
        ]
        crime_text = [
            'begal', 'hacker', 'korupsi', 'tipu', 'koruptor', 'tewas', 'leceh',
            'seksual', 'curi', 'tembak', 'curi', 'sabu', 'ganja', 'narkoba'
        ]

        text_file = open('tweets.txt', 'w')

        for i in portal:
            stuff = api.user_timeline(screen_name=i,
                                      count=10000,
                                      include_rts=True)

            for j in stuff:
                # Process a single status
                # text_file.write(json.dumps(j._json))
                word = j.text.lower()
                if any(ext in word for ext in crime_text):
                    text_file.write(j.text)
                    text_file.write("\n")
        text_file.close()

    elif usrinp == 2:
        # wc_text_file = open('tweets.txt').readlines()
        wc_text_file = open('crime_news.txt').readlines()
        factory_stem = StemmerFactory()
        stemmer = factory_stem.create_stemmer()

        factory_stop = StopWordRemoverFactory()
        stopword = factory_stop.create_stop_word_remover()

        for i in wc_text_file:
            output_stem = stemmer.stem(i)
            output_stop = stopword.remove(output_stem)

            print(output_stop)
Example #27
def process(txtString):
    factory = StopWordRemoverFactory()
    stopwords = factory.create_stop_word_remover()

    remove_digits = str.maketrans('', '', digits)
    doc = txtString.translate(remove_digits)
    doc = re.sub('[^A-Za-z0-9]+', ' ', doc)
    doc = doc.lower()

    return stopwords.remove(doc)
Example #28
def Stopword_removal(sentence):
    stopword_factory = StopWordRemoverFactory()
    stopwords = stopword_factory.get_stop_words()
    words = sentence.split()
    output = ""
    for word in words:
        if word not in stopwords:
            output = output + " " + word

    return output
Example #29
def debug():
    if 'admin' not in session:
        return redirect(url_for("index"))
    df = pd.read_excel("dataset.xlsx")

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()

    print(stopword.remove("Kepala otak kau lek, medan aman atau berkah?"))
    return "sarjana kombur"
Example #30
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words that have a term frequency lower than min_cut
        or higher than max_cut will be ignored.
        """
        factory = StopWordRemoverFactory()
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(factory.get_stop_words() + list(punctuation))
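The docstring in Example #30 describes a min_cut/max_cut term-frequency filter, but the snippet only shows the constructor. Below is a minimal standalone sketch of how such a filter could be applied together with the Sastrawi stopword list; the helper name frequency_filter and its use of collections.Counter are illustrative assumptions, not part of the original class.

from collections import Counter
from string import punctuation

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def frequency_filter(words, min_cut=0.1, max_cut=0.9):
    # Hypothetical helper: keep words whose normalized frequency lies strictly
    # between min_cut and max_cut, ignoring stopwords and punctuation.
    stopwords = set(StopWordRemoverFactory().get_stop_words() + list(punctuation))
    counts = Counter(w for w in words if w not in stopwords)
    if not counts:
        return {}
    max_count = max(counts.values())
    return {word: count / max_count for word, count in counts.items()
            if min_cut < count / max_count < max_cut}

On a longer token list the returned dictionary maps each mid-frequency word to its normalized frequency, mirroring the cut-off rule the docstring describes.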