def process_tweet(tweet):
    """Process a tweet.
    Input:
        tweet: a string containing a tweet
    Output:
        a string of cleaned, stemmed tokens joined by single spaces
    """
    stemmer = TurkishStemmer()
    stopwords_turkish = stopwords.words('turkish')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://.*[\r\n]*', '', tweet)
    # remove only the hash sign from hashtags
    tweet = re.sub(r'#', '', tweet)
    # tokenize the tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_turkish and      # remove stopwords
                word not in string.punctuation):   # remove punctuation
            stem_word = stemmer.stemWord(word)     # stem the word
            tweets_clean.append(stem_word)

    return ' '.join(tweets_clean)
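# A minimal usage sketch for process_tweet, assuming the imports below are the
# ones the function relies on; the sample tweet text is invented for illustration.
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from snowballstemmer import TurkishStemmer

# nltk.download('stopwords')  # needed once before stopwords.words('turkish') works
print(process_tweet("RT @kanal: Dolar ve altın fiyatları yükseldi https://t.co/abc #ekonomi"))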
def my_form_post():
    get_article = request.form['text']
    snow = TurkishStemmer()
    get_article = get_article.lower()
    cleanr = re.compile('<.*?>')
    get_article = re.sub(cleanr, ' ', get_article)
    get_article = re.sub(r'[?|!|:|´|\'|"|#]', r'', get_article)
    get_article = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', get_article)
    # Stemming and removing stopwords
    words = [
        snow.stemWord(word) for word in get_article.split()
        if word not in set(stopwords.words('turkish'))
    ]
    get_article = ' '.join(words)
    predict = model.predict([get_article])
    predicted = predict[0]
    predicted = predicted.upper()
    predicted = predicted.replace("_", " ")
    return '''
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="/static/mainstyle3.css">
            <title>Tahmin Zamanı</title>
        </head>
        <body>
            <div class="container">
                <h1>Haber başlığın şununla ilgili olabilir</h1>
                <h2 class="rainbow">{}</h2>
            </div>
        </body>
    </html>'''.format(predicted)
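# A hedged sketch of the Flask wiring this view appears to assume; the route
# path "/predict" and the pickled model filename are illustrative guesses, not
# taken from the original project. The model is assumed to be a scikit-learn
# pipeline that accepts raw text.
import re
import pickle
from flask import Flask, request
from nltk.corpus import stopwords
from snowballstemmer import TurkishStemmer

app = Flask(__name__)
with open("model.pkl", "rb") as f:   # hypothetical filename
    model = pickle.load(f)

app.add_url_rule("/predict", "my_form_post", my_form_post, methods=["POST"])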
def run():
    turkStem = TurkishStemmer()
    input_data = input("Lütfen sorunuzu girin.")
    words = nltk.word_tokenize(input_data)
    words = [word.lower() for word in words if word.isalpha()]
    after_stem = [turkStem.stemWord(word) for word in words]
    print("AFTER SNOWBALL STEMMER: ", after_stem)
    return after_stem
def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join(str(word) for word in stemmed_words)
    return text
def _make_stem(job):
    global df
    df_str = df["stem"].astype(str)
    turk_stemmer = TurkishStemmer()
    length = df.shape[0]
    for index in range(length):
        _print_progress_bar(index, length, job=job,
                            prefix=f"{job} Progress:", length=50)
        words = df_str[index].split()
        words = " ".join(turk_stemmer.stemWords(words))
        # use .loc to avoid pandas chained-assignment issues
        df.loc[index, "stem"] = words
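# A minimal sketch for exercising _make_stem in isolation; the DataFrame
# contents and the no-op _print_progress_bar stand-in are assumptions, since
# the original helper and data are not shown in this snippet.
import pandas as pd
from snowballstemmer import TurkishStemmer

def _print_progress_bar(iteration, total, job="", prefix="", length=50):
    # stand-in: print a simple percentage instead of a real progress bar
    print(f"\r{prefix} {100 * (iteration + 1) / total:.0f}%", end="")

df = pd.DataFrame({"stem": ["evlerden geliyorum", "kitapları okudum"]})
_make_stem("Stemming")
print()
print(df["stem"].tolist())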
def stemming_words(self, text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join(str(word) for word in stemmed_words)
    return text
def pam_turkish(input_file):
    """
    Runs the PAM algorithm on the specified Turkish text file, extracts
    n-grams, and returns the topics of the file together with their subtopics.
    """
    print("importing...")
    from snowballstemmer import TurkishStemmer
    from nltk.corpus import stopwords

    stemmer = TurkishStemmer()
    stops = set(stopwords.words('turkish'))

    print("preparing corpus...")
    # SimpleTokenizer expects a callable, so pass the bound stemWord method
    # rather than calling it.
    train_corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stemWord),
        stopwords=lambda x: len(x) <= 2 or x in stops)
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    train_corpus.process(open(input_file, encoding='utf-8'))

    # make the PA model and train it
    print("training model...")
    mdl = tp.PAModel(k1=5, k2=25, min_cf=10, min_df=1, corpus=train_corpus, seed=42)
    for i in range(0, 100, 10):  # increase 100 for more accurate results, at the cost of time
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    # mdl.summary()

    # save the PAM model for reuse; load it later with mdl.load('trained_pam.bin')
    mdl.save('trained_pam.bin')

    # Create n-grams; max_len determines bigram or trigram (3 means trigram)
    ngrams = train_corpus.extract_ngrams(min_cf=2, max_len=3)
    for c in ngrams:
        if len(c.words) == 3:
            print(c.words[0], c.words[1], c.words[2], sep='\t')  # n-gram words

    topic_result = []
    for k in range(mdl.k1):
        print("== Topic #{} ==".format(k))
        subs = []
        subs_prob = []
        sub_topics = mdl.get_sub_topics(k, top_n=5)
        for subtopic, probability in sub_topics:
            for word, p in mdl.get_topic_words(subtopic, top_n=1):
                subs.append(word)
                subs_prob.append(probability)
        for word, prob in mdl.get_topic_words(k, top_n=1):
            print(word, prob, sep='\t')
            topic_result.append({"topic": word, "prob": prob,
                                 "subs": subs, "subs_prob": subs_prob})
    return topic_result
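# A brief usage sketch for pam_turkish, assuming tomotopy is imported as tp at
# module level and that 'haberler.txt' is a placeholder path to a UTF-8 file
# with one Turkish document per line.
import tomotopy as tp

topics = pam_turkish("haberler.txt")
for t in topics:
    print(t["topic"], t["prob"], t["subs"])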
class Infer(object):
    turkish_stemmer = TurkishStemmer()

    def __init__(self):
        pass

    @staticmethod
    def load_wordIdx_txt(dict_dir):
        import json
        with open(dict_dir, "r") as json_file:
            return json.load(json_file)

    @staticmethod
    def stemming(doc):
        words = doc.split()
        for idx in range(len(words)):
            words[idx] = Infer.turkish_stemmer.stemWord(words[idx])
        return words

    def get_pred_class(self, doc):
        words = Infer.stemming(doc)
        print(words)
        """
        numeric_doc = [Infer.word2idx[word] for word in words]
        print(numeric_doc, len(numeric_doc))
        """


# The class body cannot refer to the class name itself, so the vocabulary is
# loaded right after the class definition.
Infer.word2idx = Infer.load_wordIdx_txt(
    "D:/Users/gurkan.sahin/Desktop/NLP/cnn_text_class/word2idx.txt")
def FileOrder(self):
    kelime = TurkishStemmer()
    for i in self.fullText:
        if i == "" or i == "\n":
            pass
        else:
            self.parsText.append(i)
    for i in self.parsText:
        if kelime.stemWord(i.lower()) == "kaynak":
            self.source_indis = self.number
        if kelime.stemWord(i.lower()) == "önsöz":
            self.Onsoz = self.number
        if kelime.stemWord(i.lower()) == "ekler":
            self.IndısEk = self.number
        else:
            self.number += 1
    print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.fullText) - self.number)
    print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.number)
    print("\t Kaynakca Başlangıç indisi: ", self.source_indis)
    print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
    print("\t Toplam Yapılan Atıf: ", (self.number - self.source_indis))
def DosyaDuzenle(self):
    kelime = TurkishStemmer()
    for i in self.allText:
        if i == "" or i == "\n":
            pass
        else:
            self.parsText.append(i)
    for i in self.parsText:
        if kelime.stemWord(i.lower()) == "kaynak":
            self.kaynak_indis = self.sayac
        if kelime.stemWord(i.lower()) == "önsöz":
            self.Onsoz = self.sayac
        if kelime.stemWord(i.lower()) == "ekler":
            self.IndısEk = self.sayac
        else:
            self.sayac += 1
    print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.allText) - self.sayac)
    print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.sayac)
    print("\t Kaynakca Başlangıç indisi: ", self.kaynak_indis)
    print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
    print("\t Toplam Yapılan Atıf: ", (self.sayac - self.kaynak_indis))
    'hem', 'milyon', 'kez', 'otuz', 'beş', 'elli', 'bizi', 'da', 'sekiz', 've',
    'çok', 'bu', 'veya', 'ya', 'kırk', 'onların', 'ona', 'bana', 'yetmiş',
    'milyar', 'şunu', 'senden', 'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler',
    'kim', 'neden', 'senin', 'yedi', 'niye', 'üç', 'şey', 'mı', 'tüm', 'onlari',
    'bunda', 'ise', 'şundan', 'hep', 'şuna', 'bin', 'ben', 'ondan', 'kimden',
    'bazı', 'belki', 'ne', 'bundan', 'gibi', 'de', 'onlardan', 'sizi', 'sizin',
    'daha', 'niçin', 'şunda', 'bunu', 'beni', 'ile', 'şu', 'şeyi', 'sizden',
    'defa', 'biz', 'için', 'dahi', 'siz', 'nerde', 'kime', 'birşey', 'birkez',
    'her', 'biri', 'on', 'mü', 'diye', 'acaba', 'sen', 'en', 'hepsi', 'bir',
    'bizden', 'sanki', 'benim', 'nerede', 'onu', 'benden', 'yüz', 'birkaç',
    'çünkü', 'nasıl', 'hiç', 'katrilyon'
]
stopwords.extend(newStop)

temp = []
snow = TurkishStemmer()
for eachNew in all_news:
    # Converting to lowercase
    eachNew.title = eachNew.title.lower()
    eachNew.content = eachNew.content.lower()
    # Removing HTML tags
    cleanr = re.compile('<.*?>')
    eachNew.title = re.sub(cleanr, ' ', eachNew.title)
    eachNew.content = re.sub(cleanr, ' ', eachNew.content)
    # Removing punctuation
    eachNew.title = re.sub(r'[?|!|:|´|\'|"|#]', r'', eachNew.title)
    eachNew.content = re.sub(r'[?|!|´|:|\'|"|#]', r'', eachNew.content)
    eachNew.title = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', eachNew.title)
    eachNew.content = re.sub(r'[.|:|´|,|)|(|\|/]', r' ', eachNew.content)
    # Stemming
    words = [
        snow.stemWord(word) for word in eachNew.title.split()
from snowballstemmer import TurkishStemmer

snow = TurkishStemmer()
s1 = "değiştir"
s2 = "istemiyorum"
print(snow.stemWord(s1))
print(snow.stemWord(s2))

import multiprocessing
from snowballstemmer import TurkishStemmer

tr_stemmer = TurkishStemmer()
text = "Merhaba selam alperen"
# stemWords expects a list of tokens, so split the text first
stemmed_words = tr_stemmer.stemWords(text.split())
my_data = " ".join(stemmed_words)
print(my_data)
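# For comparison, a small sketch of the two snowballstemmer entry points:
# stemWord takes a single token, stemWords takes a list of tokens and returns a
# list of stems. The sample words are illustrative only.
from snowballstemmer import TurkishStemmer

stemmer = TurkishStemmer()
print(stemmer.stemWord("kitaplar"))              # single token
print(stemmer.stemWords(["kitaplar", "evler"]))  # list of tokens -> list of stems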
# -*- coding: utf-8 -*-
"""NLP_TR_LDA_MULTICORE.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/116UKkvGAYKyopDukBoSInwrBhJ5wLHh5
"""
# https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925

"""# Turkish Stemmer Test"""

from snowballstemmer import TurkishStemmer

turkStem = TurkishStemmer()
turkStem.stemWord("gelmişti")

"""# Import Libraries and Load Data"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
WPT = nltk.WordPunctTokenizer()
stop_word_list = nltk.corpus.stopwords.words('turkish')
import json      # the datasets are stored as JSON text files
import numpy as np
import random
import pickle    # trained models will be saved as pickle files
from tensorflow.keras.models import Sequential   # linear stack of layers for the models
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation, GlobalAveragePooling1D  # layer building blocks
from tensorflow.keras.optimizers import SGD      # gradient descent optimizer
import nltk                                      # NLP toolkit
from snowballstemmer import TurkishStemmer       # Turkish-aware word stemming

nltk.download("punkt")   # required by nltk.word_tokenize to split sentences into words

with open("dataset.json") as file:   # open the dataset file
    intents = json.load(file)        # parsed JSON content

stemmer = TurkishStemmer()   # stemming with Turkish support

words = []       # list of extracted (tokenized) words
classes = []     # list of tags found in the JSON file
documents = []   # list of (pattern tokens, tag) pairs from the JSON file
ignore_letters = ["!", "'", "?", ",", "."]   # punctuation marks to skip inside sentences

for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        word = nltk.word_tokenize(pattern)   # split the pattern sentence into words
        words.extend(word)                   # add the tokens to the word list
        print(words)
        documents.append((word, intent["tag"]))   # store the token list together with its tag
        if intent["tag"] not in classes:
            classes.append(intent["tag"])         # register the tag
!pip install gensim

import gensim
from gensim.utils import simple_preprocess
#from gensim.parsing.preprocessing import STOPWORDS  # Does not support Turkish yet.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
nltk.download('wordnet')
import pandas as pd
from snowballstemmer import TurkishStemmer

stemmer = TurkishStemmer()

"""# Tokenizing and Stemming Functions"""

# Lemmatization was removed because it does not work well for Turkish.
def lemmatize_stemming(text):
    return stemmer.stemWord(text)

# Tokenize and stem; stop_word_list is expected to be defined elsewhere
# (e.g. NLTK's Turkish stopword list).
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_word_list and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
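# A short usage sketch for preprocess; stop_word_list is assumed to be the NLTK
# Turkish stopword list loaded earlier in the notebook, and the sample sentence
# is invented for illustration.
import nltk
nltk.download('stopwords')
stop_word_list = nltk.corpus.stopwords.words('turkish')

print(preprocess("Ekonomi haberleri bugün piyasaları yakından ilgilendiriyor"))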
import pandas as pd
import numpy as np
from snowballstemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

###################################################################

turkStem = TurkishStemmer()

df = pd.read_excel("istanbul_sozlesmesi_prep.xlsx")
df.drop(["fav_count"], axis=1, inplace=True)

# Vectorize the stemmed text with the Bag of Words approach
cv = CountVectorizer()
word_vector = cv.fit_transform(df["text"].apply(
    lambda x: " ".join([turkStem.stemWord(i) for i in x.split()])))

# Use the 600 labelled rows as the training and test sets
X = word_vector[:600, :]
y = df["category"].head(600)

X_train, X_test, y_train, y_test = train_test_split(X, y,
stopwords = list(stopwords.words('turkish'))
newStop = ['bir', 'ol', 'ola', 'belki', 'olur', 'bugün', 'yarın', 'şimdi', 'mu',
           'onlar', 'seksen', 'ama', 'trilyon', 'buna', 'bizim', 'şeyden',
           'yirmi', 'altı', 'iki', 'seni', 'doksan', 'dört', 'bunun', 'ki',
           'nereye', 'altmış', 'hem', 'milyon', 'kez', 'otuz', 'beş', 'elli',
           'bizi', 'da', 'sekiz', 've', 'çok', 'bu', 'veya', 'ya', 'kırk',
           'onların', 'ona', 'bana', 'yetmiş', 'milyar', 'şunu', 'senden',
           'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler', 'kim', 'neden',
           'senin', 'yedi', 'niye', 'üç', 'şey', 'mı', 'tüm', 'onlari', 'bunda',
           'ise', 'şundan', 'hep', 'şuna', 'bin', 'ben', 'ondan', 'kimden',
           'bazı', 'belki', 'ne', 'bundan', 'gibi', 'de', 'onlardan', 'sizi',
           'sizin', 'daha', 'niçin', 'şunda', 'bunu', 'beni', 'ile', 'şu',
           'şeyi', 'sizden', 'defa', 'biz', 'için', 'dahi', 'siz', 'nerde',
           'kime', 'birşey', 'birkez', 'her', 'biri', 'on', 'mü', 'diye',
           'acaba', 'sen', 'en', 'hepsi', 'bir', 'bizden', 'sanki', 'benim',
           'nerede', 'onu', 'benden', 'yüz', 'birkaç', 'çünkü', 'nasıl', 'hiç',
           'katrilyon']
stopwords.extend(newStop)

while True:
    print("Enter article:")
    X = input()
    if X == '0':
        break
    snow = TurkishStemmer()
    X = X.lower()
    cleanr = re.compile('<.*?>')
    X = re.sub(cleanr, ' ', X)
    X = re.sub(r'[?|!|:|´|\'|"|#]', r'', X)
    X = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', X)
    # Stemming and removing stopwords
    words = [snow.stemWord(word) for word in X.split() if word not in stopwords]
    X = ' '.join(words)
    text = X

    # Create and generate a word cloud image:
    wordcloud = WordCloud(max_font_size=50, max_words=50,
                          background_color="white").generate(text)
    # Display the generated image:
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
import pandas as pd
import mysql.connector
import numpy as np
import re
from pandas import DataFrame
from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pickle

# The pickled model can be loaded back later:
# with open('filename.pkl', 'rb') as f:
#     clf = pickle.load(f)

mydb = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="123hal123",
    database="comments",
)
mycursor = mydb.cursor()
import nltk
import tensorflow as tf
from snowballstemmer import TurkishStemmer
import numpy as np
import random
import json
#import requests
#import bs4

# nltk.download('punkt')

# Load the Covid-19 text dataset stored as a JSON file
with open(r"covidDataset.json", encoding="utf8") as file:
    data = json.load(file)

# Variable definitions
stemmer = TurkishStemmer()
words = []
labels = []
docs_x = []
docs_y = []
tag = " "
global cevap

# Split the sentences into words and tags
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
import re
import warnings
import nltk
from pandas import DataFrame
from nltk.corpus import stopwords as stop
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from snowballstemmer import TurkishStemmer

if False:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

warnings.filterwarnings(action='ignore')

wpt = nltk.WordPunctTokenizer()
PorterStemmer = PorterStemmer()
SnowballStemmer = TurkishStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stop.words('turkish'))


def remove_hyperlink(sentence: str) -> str:
    """
    Remove hyperlinks, e-mail addresses and mentions from the given sentence.

    Args:
        sentence: input sentence, :type str
    Returns:
        the sentence with hyperlinks removed
    """
    sentence = re.sub(r"\S*@\S*\s?", " ", sentence)
    sentence = re.sub(r"www\S+", " ", sentence)
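# A quick usage sketch for remove_hyperlink; the sample sentence is invented
# for illustration only.
print(remove_hyperlink("Detaylar için info@example.com veya www.example.com adresine bakın"))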
class Preprocess(object):
    vocab = []
    word2idx = {}
    idx2word = {}
    turkish_stemmer = TurkishStemmer()

    def __init__(self):
        self.corpus = []
        self.label = []

    def read_corpus(self, corpus_dir, label_dir, first_k_char_stem=0):
        with open(corpus_dir, "r") as sentences:
            for __sentence in sentences:
                #stemmed_line = Preprocess.stemming(__sentence, first_k_char_stem)  # first_k_char stemming
                stemmed_line = Preprocess.snowball_stemmer(__sentence)
                self.corpus.append(stemmed_line)
                [self.add_vocab(word) for word in stemmed_line]

        with open(label_dir, "r") as labels:
            for __label in labels:
                self.label.append(int(__label.strip()))

    @staticmethod
    def snowball_stemmer(sentence):
        words = sentence.split()
        for idx in range(len(words)):
            words[idx] = Preprocess.turkish_stemmer.stemWord(words[idx])
        return words

    @staticmethod
    def stemming(sentence, first_k_char_stem):
        words = sentence.split()
        if first_k_char_stem != 0:
            for idx in range(len(words)):
                words[idx] = words[idx][:first_k_char_stem]
        return words

    def add_vocab(self, word):
        if word not in Preprocess.vocab:
            Preprocess.vocab.append(word)
            # index 0 is reserved for the padding word
            Preprocess.word2idx[word] = len(Preprocess.vocab)
            Preprocess.idx2word[len(Preprocess.vocab)] = word

    @staticmethod
    def get_vocab():
        return Preprocess.vocab

    def get_corpus(self):
        return self.corpus

    def get_label(self):
        return self.label

    @staticmethod
    def get_word2idx():
        return Preprocess.word2idx

    @staticmethod
    def get_idx2word():
        return Preprocess.idx2word
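# A minimal usage sketch for Preprocess, assuming TurkishStemmer was imported
# before the class definition; 'corpus.txt' (one sentence per line) and
# 'labels.txt' (one integer label per line) are hypothetical file names, not
# paths from the original project.
pre = Preprocess()
pre.read_corpus("corpus.txt", "labels.txt")
print(len(Preprocess.get_vocab()), "unique words")
print(pre.get_corpus()[:2], pre.get_label()[:2])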
clean_text = " ".join(list_without_punct)
logging.info('Preprocessing has finished')
print('unique word count: ', len(set(clean_text.split())))
print('whole word count: ', len(clean_text.split()))

logging.info('Tokenize words')
words = tokenize.word_tokenize(clean_text)

nltk.download('stopwords')
stop_word_list = nltk.corpus.stopwords.words('turkish')
filtered_words = [token for token in words if token not in stop_word_list]

logging.info('Stemming words')
from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
stemmed_clean_text = []
for w in filtered_words:
    stemmed_clean_text.append(turkStem.stemWord(w))

logging.info('Convert list into list of list for word2Vec')
list_of_list = [[x] for x in stemmed_clean_text]

# CBOW model (gensim < 4.0 keyword names; newer gensim uses vector_size instead of size)
logging.info('Cbow Model will be trained')
cbowModel = gensim.models.Word2Vec(list_of_list, size=100, window=2,
                                   min_count=1, workers=4, sg=0)
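# A brief follow-up sketch showing how the trained model could be queried;
# "ekonomi" is an arbitrary example token that only works if it survived the
# stopword removal and stemming above. Note that single-word "sentences" give
# the CBOW window little real context, so similarities may be weak.
try:
    print(cbowModel.wv.most_similar("ekonomi", topn=5))
except KeyError:
    print("token not in vocabulary")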
# =============================================================================
# Words whose frequency is 1 across the whole dataset should be removed as part
# of feature selection
# =============================================================================
#pd.Series(" ".join(df["News"]).split()).value_counts()

# =============================================================================
# Tokenizing
# =============================================================================
df = df.apply(lambda x: TextBlob(x).words)

# =============================================================================
# Stemming
# =============================================================================
stemmer = TurkishStemmer()
df = df.apply(lambda x: " ".join(stemmer.stemWord(word) for word in x))

# =============================================================================
# Adding class labels: 0 ekonomi, 1 magazin, 2 saglik, 3 spor
# =============================================================================
Category = ["ekonomi" for i in range(150)]
Category.extend(["magazin" for i in range(150)])
Category.extend(["saglik" for i in range(150)])
Category.extend(["spor" for i in range(150)])
Category.extend(["ekonomi" for i in range(80)])
Category.extend(["magazin" for i in range(80)])
Category.extend(["saglik" for i in range(80)])
Category.extend(["spor" for i in range(80)])

dframe = pd.DataFrame(df, columns=["News"])
!pip install gensim

import gensim
from gensim.utils import simple_preprocess
#from gensim.parsing.preprocessing import STOPWORDS  # Does not support Turkish yet.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

import nltk
nltk.download('wordnet')

"""**Checking Stemmer**"""

import pandas as pd
from snowballstemmer import TurkishStemmer

stemmer = TurkishStemmer()
original_words = ['Başarılı', 'insanlar', 'adamlar', 'öldüler', 'içindekiler',
                  'kapısındaki', 'yiyecekler,', 'çıkaranlar', 'lahanalar',
                  'takımların', 'sırası', 'futbolcuların', 'yedikleri']
singles = [stemmer.stemWord(plural) for plural in original_words]
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

"""**Stemming and Tokenizing Functions**"""

# Lemmatization is removed because it is not appropriate for Turkish
def lemmatize_stemming(text):
    return stemmer.stemWord(text)

# Tokenize and stem
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):