import os
import re
import sys
from collections import Counter

from nltk.stem.isri import ISRIStemmer


def main():
    # Open all files related to removing stop words or punctuation from the data.
    sw_in = open(r"../data/arstoplist.txt")
    stopwords = sw_in.read().splitlines()
    punctlist = open("../data/arabpunct.txt").read().splitlines()
    directory = sys.argv[1]  # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")
    st = ISRIStemmer()
    #rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []
    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)
    print("Files read.")

    stemmed = {}
    types = {}
    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []
    for line in filelist:
        #line = line.strip()
        #tokenize = word_tokenize(line)  # Tokenize the text.
        tokenize = tokenizer(line)
        #tokenize.sort()  # Comment this out after the test set has been used?
        # Define all patterns that shall be excluded.
        rx_ar = re.compile(u'^[\u0621-\u064A]+$')  # This excludes Arabic words that have numbers attached to them.
        rx_ar2 = re.compile(u'^(\u0622{2,})')
        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()

    # Re-insert:
    for w in compl_list:
        types[w] = 0
    #if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
    #    if len(w) > 1:  # ERROR
    #        new_w = w[:-1]  # ERROR! This strips off Arabic letters although they are not in the punctlist
    #        types[new_w] = 0
    #        tokens.append(new_w)
    #    else:
    #        types[w] = 0
    #        tokens.append(w)
    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # Print the stemmed words and their unstemmed versions to a file.
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        if len(w) > 4:
            # Don't save words that are longer than 4 letters. Verbs in Arabic are usually
            # 3 letters long. In very rare cases they can be 2 or 4 letters long as well.
            pass
        else:
            wordlist.append(w)
            #f_out.write(w + "\t" + stemmed[w])
            #f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs:" + str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # Handle some corpus stats.
    corp_stat = Counter(tokens)
    for w in list(corp_stat.keys())[0:11]:
        print("token: " + w + "\tno.: " + str(corp_stat[w]))
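# The snippet above calls tokenizer(line) without defining it; the commented-out
# word_tokenize(line) suggests it just splits a line into word tokens. A minimal
# sketch of such a helper, assuming a whitespace/punctuation split is sufficient
# (the name and behaviour are assumptions, not the original implementation):
import re

def tokenizer(line):
    # Split on runs of whitespace or common Western and Arabic punctuation,
    # dropping empty strings produced at the edges.
    return [t for t in re.split(r'[\s\.,;:!?،؛؟"«»]+', line) if t]

# Example: tokenizer("مرحبا، كيف الحال؟") -> ['مرحبا', 'كيف', 'الحال']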
import pickle

# load the dataset
data = open('ManualAnnotatedFakeNewsDataset.txt').read()
#data = open('AutomaticAnnotatedFakeNewsDataset.txt').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# stemming
data1 = []
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())
#print(data1[:10])

# tashfeen
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
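# The snippet is cut off at the final loop. A minimal sketch of how the Tashaphyne
# pass might continue, mirroring the ISRI loop above; this continuation and the
# placeholder input are assumptions, not the original code:
from nltk import word_tokenize
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
texts = ["هذا خبر غير صحيح"]  # placeholder; the original iterates over the dataset texts
data2 = []
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        ArListem.light_stem(a)                      # segment the word into prefix/stem/suffix
        tweet = tweet + ArListem.get_stem() + " "   # keep the light stem
    data2.append(tweet.strip())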
def stemmimg_text(self, text):
    # Apply the ISRI stemmer to an already tokenized text (a list of words).
    st = ISRIStemmer()
    return [st.stem(w) for w in text]
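# A short usage sketch: the method expects a list of tokens, not a raw string
# (iterating over a string would stem single characters). The host class name and
# the tokenization step below are assumptions for illustration only:
from nltk.stem.isri import ISRIStemmer

class ArabicPreprocessor:  # hypothetical host class
    def stemmimg_text(self, text):
        st = ISRIStemmer()
        return [st.stem(w) for w in text]

tokens = "يذهب الطلاب إلى المدرسة".split()
print(ArabicPreprocessor().stemmimg_text(tokens))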
for doc in docs:
    # print (doc)
    for line in doc['content']:
        text = re.sub(r'[\d+ a-zA-Z? & , \xd8 « » . :"،]', ' ', line)  # remove non-alphabetical characters and non-arabic characters
        tkns = text.split()
        tokenss = []
        for token in tkns:
            tokenss.append(token)
        tokens.append(tokenss)  # produces list of lists of tokens
cleaned_data = [item for item in tokens if item != []]
return cleaned_data

stemmer = ISRIStemmer()
data = clean_data()  # this is a list of lists of tokens

def lemmatizer(token):
    #print ("Data lemmatized")
    token = stemmer.pre32(token)  # removes the three-letter and two-letter prefixes
    token = stemmer.suf32(token)  # removes the three-letter and two-letter suffixes
    token = stemmer.norm(token, num=1)  # removes diacritics
    return token

def stop_words():
    stop_words = stopwords.words('arabic')
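# A minimal sketch of how the pieces above are typically combined: run every token
# through lemmatizer() and drop Arabic stop words. The loop, the placeholder data,
# and the name processed_data are assumptions for illustration, not the original pipeline:
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
arabic_stopwords = set(stopwords.words('arabic'))  # requires nltk.download('stopwords')

def lemmatizer(token):
    token = stemmer.pre32(token)        # strip three- and two-letter prefixes
    token = stemmer.suf32(token)        # strip three- and two-letter suffixes
    token = stemmer.norm(token, num=1)  # normalize away diacritics
    return token

data = [["يذهب", "الطلاب", "إلى", "المدرسة"]]  # placeholder for clean_data() output
processed_data = [
    [lemmatizer(tok) for tok in doc if tok not in arabic_stopwords]
    for doc in data
]
print(processed_data)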
def __init__(self):
    self.stemmer = ISRIStemmer()
    self.stopWordsIndex = ArabicStopWordsIndex(self)
    self.stopWordsIndex.buildIndex()
import string
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()

text = "على قيادة المؤتمر الشعبي العام قراءة رسالة الشعب جيدا من خلال احتشاد ميدان السبعين ، والتي تعني تحمل مسؤليته"
words = text.split()
new_words = []
for word in words:
    # stem word
    new_word = isri.stem(word)
    #print("." + new_word + ".")
    # don't append if stemming turns it into whitespace/""
    if new_word != "":
        new_words.append(new_word)
# return this
new_text = ' '.join(new_words)
print(new_text)
def __init__(self):
    self.st = ISRIStemmer()
def _getstem(_word):
    st = ISRIStemmer()
    return st.stem(_word)
import json

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.isri import ISRIStemmer
from nltk.stem import RSLPStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tinysegmenter
import traceback
#from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian (Sastrawi) stemmer
stem_pt = RSLPStemmer()  # Portuguese (Brazilian) stemmer
stem_ja = tinysegmenter.TinySegmenter()  # Japanese segmenter
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')

def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)
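# A minimal sketch of how the per-language objects defined above might be used behind
# one helper: map a language code to its stemmer and fall back to the WordNet lemmatizer.
# The function name stem_tokens and the code-to-stemmer mapping are assumptions for
# illustration; they are not part of the original module and rely on the objects above:
def stem_tokens(tokens, lang):
    snowball = {"nl": stem_nl, "ru": stem_ru, "sv": stem_sv, "fr": stem_fr, "de": stem_de}
    if lang == "ar":
        return [stem_ar.stem(t) for t in tokens]            # NLTK ISRI stemmer
    if lang == "id":
        return [sastrawi_stemmer.stem(t) for t in tokens]   # Sastrawi stemmer
    if lang == "pt":
        return [stem_pt.stem(t) for t in tokens]            # RSLP stemmer
    if lang in snowball:
        return [snowball[lang].stem(t) for t in tokens]     # Snowball stemmers
    return [lemmatizer.lemmatize(t) for t in tokens]        # default: WordNet lemmatizer

# Example: stem_tokens(["running", "dogs"], "en") -> ['running', 'dog']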