Example #1
 def steamming(self):
     st = ISRIStemmer()
     lis = self.tokenizer()
     xx = ""
     for i in lis:
         xx = xx + ' ' + (st.stem(i))
     return xx
Example #2
def stemm(tweetstr):
    stemmer = ISRIStemmer()
    stemstr = []
    for s in tweetstr:
        st = stemmer.stem(s)
        stemstr.append(st)
    return stemstr
def finding_changeable_con(word, size):
    st = ISRIStemmer()
    stemmed_word = st.stem(word)
    if stemmed_word == word:
        for token in stem_dict:
            if token == word:
                print("Stemmed Word : " + token)
                for x in range(len(stem_dict[token])):
                    derived_word = stem_dict[token][x]
                    print("Derived Word : ")
                    print(derived_word)
                    print("Sentences : ")
                    occurrences_list = word_dict[derived_word]
                    concordances_output = get_changeable_con(
                        occurrences_list, size)
                    print(*concordances_output, sep="\n")

    else:
        for token in word_dict:
            if token == word:
                print("Word : " + token)
                print("Stemmed Word : " + stemmed_word)
                print("Sentences : ")
                occurrences_list = word_dict[token]
                concordances_output = get_changeable_con(
                    occurrences_list, size)
                print(*concordances_output, sep="\n")
                print("\n")
    print("\n")
Example #4
def stemm(tweetstr):
    stemmer = ISRIStemmer()
    stemstr = []
    for s in tweetstr:
        st = stemmer.stem(s)
        stemstr.append(st)
    return stemstr
 def Stem_word(self, body):
     st = ISRIStemmer()
     word = body.split(u" ")
     word_stem = list()
     for w in word:
         word_stem.append(st.stem(w))
     body = " ".join(word_stem) 
     return body
Example #6
def steaming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
 def stemming_ISR(self, text):
     st = ISRIStemmer()
     stemmed_words = []
     words = word_tokenize(text)
     for w in words:
         stemmed_words.append(st.stem(w))
     stemmed_text = " ".join(stemmed_words)
     return stemmed_text
Example #8
def stemTokenize(text):
    if locale == 'ar':
        stemmer = ISRIStemmer()
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'da':
        stemmer = SnowballStemmer('danish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'en':
        stemmer = SnowballStemmer('english')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'es':
        stemmer = SnowballStemmer('spanish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'hi':
        t = hindi_nlu.Processor(text)
        t.tokenize()
        return [t.generate_stem_words(w) for w in t.tokens]
    elif locale == 'mr':
        t = hindi_nlu.Processor(text)
        t.tokenize()
        return [t.generate_stem_words(w) for w in t.tokens]
    elif locale == 'nl':
        stemmer = SnowballStemmer('dutch')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'sv':
        stemmer = SnowballStemmer('swedish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    else:
        stemmer = SnowballStemmer('english')
        return [stemmer.stem(w) for w in word_tokenize(text)]
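
Note: stemTokenize() above reads a module-level locale string that is not shown in the snippet. The driver below is a hypothetical sketch (the locale value and sample text are illustrative, and the function plus NLTK's 'punkt' data are assumed to be available).

# Hypothetical usage sketch for stemTokenize() with the Arabic branch.
from nltk.stem.isri import ISRIStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# The function looks `locale` up at call time, so a module-level
# assignment like this is enough to select the ISRIStemmer branch.
locale = 'ar'

print(stemTokenize("البحث العلمي المنظم"))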
Example #9
class Stemming:
    def __init__(self):
        self.st = ISRIStemmer()

    def stemWord(self, text):
        word_tokens = word_tokenize(text)
        filtered_sentence = [self.st.stem(w) + ' ' for w in word_tokens]

        return ''.join(filtered_sentence)
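
Note: a minimal, hypothetical use of the Stemming class above (the sample text is illustrative and the snippet's imports are assumed to be in scope); because each stem is suffixed with ' ' before the join, the returned string ends with a trailing space.

# Hypothetical usage of the Stemming wrapper above; note the trailing space
# left by joining stems that each carry a ' ' suffix.
stemmer = Stemming()
print(repr(stemmer.stemWord("البحث العلمي المنظم")))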
Example #10
def stem(
    text
):  #[st.stem(word) for word in text if not word in set(stopwords.words('english'))]
    st = ISRIStemmer()
    temp_text = ""
    for word in text.split():
        #print(st.stem(word))
        temp_text += st.stem(word) + " "
    text = temp_text
    return text
class Books():
    def __init__(self, category_id):
        self.category_id = category_id
        print('Books Class instantiated for Category {}.'.format(category_id))

        # NLTK Stemmer
        self.st = ISRIStemmer()

        # get all stop words
        # individual letters (typos & printing issues)
        sw1 = get_stop_words('arabic') + stopwords.words("arabic")
        sw2 = [
            'ا', 'أ', 'إ', 'ذ', 'ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ',
            'ح', 'ج', 'ش', 'س', 'ي', 'ب', 'ل', 'ا', 'ال', 'ت', 'ن', 'م', 'ك',
            'ئ', 'ء', 'ؤ', 'ر', 'لا', 'ى', 'ة', 'و', 'ز', 'ظ'
        ]
        self.sw = set(sw1 + sw2)

    def not_sw(self, text):  # excludes stop words
        return (text not in self.sw) or self.st.stem(text) not in self.sw

    def not_small_big(self, text):
        # exclude tokens that are too short (single letters) or too long (run-together words)
        return (len(text) >= 3) and (len(text) <= 9)

    def get_book_id(self, index_url):
        return re.findall(r'13\d\\(\d+)', str(index_url))[0]

    def strip_text(self, text):
        return araby.strip_tatweel(araby.strip_tashkeel(text))

    # This function is the main reason for having this class
    # Since  Doc2Vec can take a `iter` to go through each file
    # one at a time, instead of loading all the books into memory.
    def __iter__(self):
        for i, file_name in enumerate(
                glob('../../data/' + str(self.category_id) + '/*.json')):
            print('Started Book: {}.'.format(self.get_book_id(file_name)))
            try:
                with open(str(file_name)) as f:
                    book_text = json.load(f)['text']

                #### Start Processing
                start_time = time.time()
                processed_book = araby.tokenize(
                    self.strip_text(book_text),
                    conditions=[self.not_sw, araby.is_arabicword])
                print('Cleaned Book: {} in {} seconds.'.format(
                    self.get_book_id(file_name),
                    time.time() - start_time))
                yield TaggedDocument(processed_book, [i])

            except:
                print("Fix {}".format(file_name))
def one_string_steming(sentence):
    '''
    Argument:
        String of words
    return:
        list of words with steming which the root of the word
    '''
    sentence = one_string_tokenization(sentence)
    stemmer = ISRIStemmer()
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence
def build_stem_dictionary(preprocessed_text, stop_words):
    # This method builds the Roots Dictionary as follows
    # {'stemmed_word1': ['derived_word1', 'derived_word2', ...],
    #  'stemmed_word2': ['derived_word1', 'derived_word2', 'derived_word3', ...], ...}
    st = ISRIStemmer()
    words_list = word_tokenize(preprocessed_text)
    for token in words_list:
        if token not in stop_words and token not in ['.']:
            stemmed_token = st.stem(token)
            if not stem_dict.get(stemmed_token):
                stem_dict[stemmed_token] = []
            if not token in stem_dict[stemmed_token]:
                stem_dict[stemmed_token].append(token)
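
Note: build_stem_dictionary() above fills a module-level stem_dict that is not shown in the snippet. The driver below is a hypothetical sketch (the sample text and stop-word set are illustrative, and NLTK with its 'punkt' data is assumed).

# Hypothetical driver for build_stem_dictionary().
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Module-level dictionary the snippet expects: stem -> derived surface forms.
stem_dict = {}

sample_text = "الكتاب والكتب في المكتبة ."
sample_stop_words = {"في"}

build_stem_dictionary(sample_text, sample_stop_words)
for stem, derived in stem_dict.items():
    print(stem, "->", derived)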
Example #14
def tokenize_documents(documents):

    stop_words = stopwords.words('english') + stopwords.words('spanish') #common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()

    punctuation = { ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # Returns False for common words, links, and punctuation
        if (token in filtered) or (token[0:4] == u'http') or \
           (token in string.punctuation):
            return False
        else:
            return True

    for doc in documents:

        row = doc[0]
        doc = doc[1]

        if doc is not None:

            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)

            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)

            cleaned_tokens = []
            for token in tokens:

                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token)) 

                    cleaned_tokens.append(token)    

            yield row
            yield cleaned_tokens
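
Note: tokenize_documents() above is a generator that yields the row identifier and the cleaned token list as two separate items per document. The consumer below is a hypothetical sketch (the sample documents are illustrative, and the snippet's imports are assumed to be in scope).

# Hypothetical consumer: zip an iterator with itself to read the
# alternating (row, cleaned_tokens) items in pairs.
docs = [
    (1, "RT @someone Gadhafi speaks to supporters http://example.com"),
    (2, "Noticias desde Trípoli esta mañana"),
]

gen = tokenize_documents(docs)
for row, tokens in zip(gen, gen):
    print(row, tokens)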
Example #15
 def stemLexicon(self, newLex):  #newLex = prepareLexicon()
     stemmed_Lexicon_words = []
     polarity_Lex = []
     stLex = ISRIStemmer()
     for index, column in newLex.iloc[:].iterrows():
         word = newLex.at[index, 'ngram']
         polarity = newLex.at[index, 'polarity']
         stemmed_Lexicon_words.append(stLex.stem(word))
         polarity_Lex.append(polarity)
     stemmed_Lexicon_DF = pd.DataFrame({
         'ngram': stemmed_Lexicon_words,
         'polarity': polarity_Lex
     })
     return stemmed_Lexicon_DF  # returns a pandas DataFrame
Example #16
def stem(string):

    # split given string into words
    words = string.split()
    stems_list = []

    isri_stemmer = ISRIStemmer()

    for word in words:
        # stem word
        stem_word = isri_stemmer.stem(word)
        # add the new stem to the list
        stems_list.append(stem_word)

    return stems_list
Example #17
 def sentencePreprocessingDF(self, df, row, col):
     arabic_sw_file = open("arabic_stop_words.txt", 'r+')
     ar_sw_list = arabic_sw_file.read()
     ar_sw_list = word_tokenize(ar_sw_list)
     # Includes stop-word removal, elongated-word removal, and stemming
     st = ISRIStemmer()
     tokenized_word_list = []
     tokenized_sentence = []
     words = word_tokenize(df.at[row, col])
     for word in words:
         if word not in ar_sw_list:
             word = self.replaceElongated(word)
             tokenized_word_list.append(st.stem(word))
             tokenized_sentence = " ".join(tokenized_word_list)
     return tokenized_sentence
Example #18
 def get_test_negative_array_stemmed_without_sw(self):
     stemmer = ISRIStemmer()
     test_negative_array_stemmed_without_sw = []
     review_words_stemmed_without_sw = []
     for review in self.get_test_negative_array(self):
         review_words = nltk.word_tokenize(review)
         review_words_without_sw = [
             i for i in review_words if not i in self.get_arabic_sw(self)
         ]
         review_words_stemmed_without_sw = []
         for word in review_words_without_sw:
             review_words_stemmed_without_sw.append(stemmer.stem(word))
         test_negative_array_stemmed_without_sw.append(" ".join(
             str(x) for x in review_words_stemmed_without_sw))
     return test_negative_array_stemmed_without_sw
Example #19
 def get_features(comment, lan):
     words = list(comment)
     if lan == 'ar':
         st = ISRIStemmer()
         features = [0] * len(word_features_ar2)
         for w in words:
             w = st.stem(w)
             if w in word_features_ar_dict:
                 features[word_features_ar_dict[w]] = 1
     else:
         features = [0] * len(word_features_en2)
         for w in words:
             w = stem(w)
             if w in word_features_en_dict:
                 features[word_features_en_dict[w]] = 1
     return features
Example #20
File: nlp_utils.py, Project: owo/jitalk
def stem_tokens(token_list, src_lang):
    """
    Returns the stem of a given word depending on the source language.
    """

    stemmed = []

    if src_lang == 'en':
        ps = PorterStemmer()
        for token in token_list:
            stemmed.append(ps.stem(token))
    
    if src_lang == 'ar':
        isri = ISRIStemmer()
        for token in token_list:
            stemmed.append(isri.stem(token))

    return stemmed
def WordsFiltires(tokenstem):
    """

    :param tokenstem:
    :return WordsFiltires:
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()

    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))
        WordsFiltires = ' '.join(stemmed_word)

    return WordsFiltires
	def stem_words(self):
		"""
		Stem all the words in each file	using
		ISRI Arabic stemmer based on algorithm:
			Arabic Stemming without a root dictionary.
		"""
		st = ISRIStemmer()
		for folder in os.listdir(self.processed_corpus_path):
			dir_path = os.path.join(os.sep, self.processed_corpus_path, folder)
			for a_file in os.listdir(dir_path):
				file_path = os.path.join(os.sep, dir_path, a_file)
				to_write = []
				with open (file_path, 'r') as infile:
					words = infile.readlines()
					for word in words:
						to_stem = word[:-1]
						stemmed = st.stem(to_stem)
						to_write.append(stemmed)
				# print(to_write)
				with open (file_path, 'w') as outfile:
					for word in to_write:
						outfile.write(word+'\n')
			print(folder+" stemmed ")
Example #23
def WordsFiltires(tokenstem):
    """
    This function is to remove
    1- remove stop words
    2- stemmer

    :param tokenstem:
    :return WordsFiltires:
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()

    # -----stop words with stemming-----------
    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))
        WordsFiltires = ' '.join(stemmed_word)

    return WordsFiltires
def GetProductsForSalary(x):
    s = False
    ps = PorterStemmer()
    words = word_tokenize(x)
    st = ISRIStemmer()
    length = len(words)
    # print(x)
    i = 0
    while i < length:
        z = st.stem(words[i])
        if re.search(r'\d', z):
            k = 0
            while k < len(z):
                if z[k] == "ش":
                    s = True
                    print("vvvvvvvvvvvvvvvvvvvvvvvvv", z)
                k += 1

        elif z == "شيكل" or z == "شيقل" or z == "ش" or z == "NIS" or z == "Nis":
            s = True

        i += 1
    print("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT====>", s)
    return s
Example #25
def app_req_similarity(app_words, req_words):
    """
    :param app_words:
    :param req_words:
    :return:Return the similarity (a score) between the request and a given app
            2 approaches are used (with and without stemming)
    """

    stemmer = ISRIStemmer()
    count_dict = {}
    stemmed_count_dict = {}

    # start calculating similarity
    for rw in req_words:
        if rw in app_words:
            if rw in count_dict:
                count_dict[rw] += 1
            else:
                count_dict[rw] = 1
        rw_stemmed = stemmer.stem(rw)
        if rw_stemmed in app_words:
            if rw_stemmed in stemmed_count_dict:
                stemmed_count_dict[rw_stemmed] += 1
            else:
                stemmed_count_dict[rw_stemmed] = 1

    # calculating score
    score = 0
    stemmed_score = 0
    for k in count_dict.keys():
        score = score + int(count_dict[k])
    score = score / len(req_words)
    for k in stemmed_count_dict.keys():
        stemmed_score = stemmed_score + int(stemmed_count_dict[k])
    stemmed_score = stemmed_score / len(req_words)
    print(score, stemmed_score)
    return score, stemmed_score
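
Note: a small, hypothetical call to app_req_similarity() above (the Arabic word lists are illustrative). Per the code, score counts request words that appear verbatim in app_words and stemmed_score counts request words whose ISRI stem appears in app_words, each divided by the request length.

# Hypothetical toy input for app_req_similarity(); assumes the function
# above and its ISRIStemmer import are in scope.
app_words = ["كتاب", "قرأ", "مكتبة"]
req_words = ["الكتاب", "كتاب", "قراءة"]

score, stemmed_score = app_req_similarity(app_words, req_words)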
Example #26
def clean_up_sentence(sentence):
    sentence = normalizeArabic(sentence)
    #using replacment dictionary to replace a popular words that clients may use
    for old, new in replacments.items():
        sentence = sentence.replace(old, new)
    sentence = sentence.replace('؟', ' ')
    #tokenize the pattern
    tokens = word_tokenize(sentence)
    #remove punctuation from each word
    remove_pun = str.maketrans('', '', string.punctuation)
    words = [w.translate(remove_pun) for w in tokens]
    #remove non-alphabetic characters
    alphabetic_words = [word for word in words if word.isalpha()]
    #remove Arabic stop words
    arabic_stop_word = stopwords.words('arabic')
    stop_words = set(arabic_stop_word)
    alphabetic_words = [
        word for word in alphabetic_words if not word in stop_words
    ]
    #stem each word
    stemer = ISRIStemmer()
    stemmed_words = [stemer.stem(word) for word in alphabetic_words]
    stemmed_words = list(dict.fromkeys(stemmed_words))
    return stemmed_words
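
Note: clean_up_sentence() above relies on a normalizeArabic() helper and a replacments mapping defined elsewhere in its project. The stand-ins below are hypothetical, purely illustrative sketches so the snippet can run in isolation (NLTK with its 'punkt' data and Arabic stopwords is assumed).

# Hypothetical stand-ins for the helpers used by clean_up_sentence().
import string

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Illustrative replacement mapping for colloquial client wording.
replacments = {"ايش": "ماذا"}

def normalizeArabic(text):
    # Illustrative placeholder: a real version would unify alef/yaa/taa forms.
    return text

print(clean_up_sentence("ايش هو البحث العلمي؟"))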
Example #27
File: test2.py, Project: pmsprenger/ma
def main():

    # Define which corpora to work with via sys.argv[1]
    corpora = sys.argv[1]

    # Define input data.
    k50 = "../out/mallet/testdez/" + corpora + "-50.txt"
    k100 = "../out/mallet/testdez/" + corpora + "-100.txt"
    k200 = "../out/mallet/testdez/" + corpora + "-200.txt"

    # Load ISRIStemmer.
    st = ISRIStemmer()

    # Create lists: all_plots, all_means.
    all_plots = []
    all_means = []

    # Create for loop over the three files.
    for i in (k50, k100, k200):
        # Open file, read it into variable f, close file.
        f_in = open(i)
        f = f_in.readlines()
        f_in.close()

        # Create lists: words, stemlist.
        words = []
        stemlist = []

        # Loop over the lines in f. Tokenize words, delete the numbers at the
        # beginning of each line (0:4). Append line to words.
        for line in f:
            line = tokenizer(line)
            del line[0:4]
            words.append(line)

        # Loop over words. Stem each word and append to stemlist.
        for listitem in words:
            stems = []
            for w in listitem:
                r = st.stem(w)
                stems.append(r)
            stemlist.append(stems)

        # Create lists: score, plotdata.
        score = []
        plotdata = []

        # Loop over lists in stemlist. Create a dictionary: d.
        # Loop over the words in topic:
        # if word is in d: add 1 to its value in d.
        # else: add word to d.
        for topic in stemlist:
            d = {}
            for item in topic:
                if item in d:
                    d[item] += 1
                else:
                    d[item] = 1

            # Get the value of each word in d and append it to plotdata.
            maximum = max(d, key=d.get)
            plotdata.append(d[maximum])

            # Calculate the score: 1 / len(d).
            # Append each d_score to score.
            d_score = 1 / len(d)
            score.append(d_score)

        # Calculate the mean of score. Append to all_means.
        mean = np.mean(score)
        all_means.append(mean)

        # Append plotdata to all_plots.
        all_plots.append(plotdata)
        print(plotdata)

    # Create figure: boxplot with data from "all_plots".
    xtick50 = "k=50, mean score over \n all topics: " + str(
        round(all_means[0], 4))
    xtick100 = "k=100, mean score over \n all topics: " + str(
        round(all_means[1], 4))
    xtick200 = "k=200, mean score over \n all topics: " + str(
        round(all_means[2], 4))
    fig = plt.figure(1, figsize=(9, 6))
    ax = fig.add_subplot(111)
    ax.boxplot(all_plots)
    ax.set_xticklabels([xtick50, xtick100, xtick200])
    ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ax.set_ylabel("Highest value of root repetition per topic",
                  rotation='vertical')
    ax.set_xlabel("k = topics")
    ax.set_title("UN")
    fig.savefig('../out/mallet/figures/testdez/un.png', bbox_inches='tight')
Example #28
File: Preprocess.py, Project: abasbh/uob
class Preprocess:    
    def __init__(self):
        self.st = ISRIStemmer()
        self.getStopwords()
        self.getNegationwords()
        self.getSymbol()


       
    def analysis(self,line):
        line = self.enLine(line)
        line = self.tokenize(line)
        line = self.remSW(line)
        line = self.getTerms(line)
        line = self.remNE(line)
        line = self.removeNA(line)
        line = self.asLine(line)
        return line


    def analysisList(self,line_list):
        newList = list()
        for line in line_list:
            line = self.enLine(line)
            line = self.tokenize(line)
            line = self.remSW(line)
            line = self.getTerms(line)
            line = self.remNE(line)
            line = self.removeNA(line)
            line = self.asLine(line)
            newList.append(line)
        return newList
    
    def getStopwords(self):
        '''get stopwords from the stopwords file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'stopword.txt')
        f = open(file_path, 'r')
        stopwords = [line.rstrip() for line in f]
        sw = dict.fromkeys(stopwords)
        f.close()
        self.sw = [z.decode('utf-8') for z in sw]


    def getNegationwords(self):
        '''get negation words from the negation file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'negation.txt')
        f = open(file_path, 'r')
        newords = [line.rstrip() for line in f]
        ne = dict.fromkeys(newords)
        f.close()
        self.ne = [n.decode('utf-8') for n in ne]

    def getSymbol(self):
        '''get symbol from symbol file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'symbol.txt')
        f = open(file_path, 'r')
        sy = [line.rstrip() for line in f]
        ne = dict.fromkeys(sy)
        f.close()
        self.sy = [s.decode('utf-8') for s in sy]

        
    def enLine(self,line):
        ''' convert line to unicode '''
        try:
            line = line.decode('utf-8')
            self.log_msg = "string is not UTF-8, length %d bytes" % len(line)
        except UnicodeError:
            self.log_msg = "string is UTF-8"

        for s in self.sy:
            try:
                s = s.decode('utf-8')
            except UnicodeError:
                log_msg = "string is UTF-8"
            line = line.replace(s, u' ' + s + u' ')
            
        #line = line.replace(u'.', u' . ')
        #line = line.replace(u'.', u' . ')
        return line

            
    def removeNA(self,token):
        '''remove non-Arabic'''
        #x = re.compile(ur'[\u064B-\u065F]+', re.UNICODE)
        #line = [x.sub('', word) for word in line]
        x = re.compile(ur'[^\u0621-\u064A|_]+[\u1F300-\u1F5FF\u1F600-\u1F64F\u1F680-\u1F6FF\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        x = re.compile(ur'[\u0023]+', re.UNICODE)
        token = [x.sub('', word) for word in token]

        token = [word for word in self.asLine(token).split()]
        return token


    def tokenize(self,line):
        if len(line) > 50000:
            n = len(line) / 50000
            l = list()
            for i in range(1,n):
                start = (i-1)*50000
                end = i * 50000
                l = l + word_tokenize(line[start:end])
            token = l
        else:
            token = word_tokenize(line)

        return token


    def remSW(self,token):
        token_clean = [x for x in token if x not in self.sw]
        return token_clean


    def remNE(self,token):
        for i in range(len(token)):
            if token[i] in self.ne:
                temp = token[i]
                for x in range(i+1,len(token)):
                    if token[x] in self.sy:
                        break
                    else:
                        token[x] = temp + '_' + token[x]
            
        token_clean = [x for x in token if x not in self.ne]
        token_clean = [x for x in token_clean if x not in self.sy]
        return token_clean


    def norma (self,word):
        if word[:2] == u'ال' :
            word = word[2:]
        #ألف 
        x = re.compile(ur'[\u0622|\u0623|\u0625]+', re.UNICODE)
        word = x.sub(ur'\u0627', word)
        #ياء + ألف مقصورة 
        x = re.compile(ur'[\u0649]+', re.UNICODE)
        word = x.sub(ur'\u064A', word)
        #تاء مربوطة + هاء
        x = re.compile(ur'[\u0629]+', re.UNICODE)
        word = x.sub(ur'\u0647', word)
        #تطويلة
        x = re.compile(ur'[\u0640]+', re.UNICODE)
        word = x.sub(ur'', word)
        return word


    def getTerms(self,token):     
        line = list()
        for i in range(len(token)):
            a = self.norma(token[i])
            a = self.st.stem(a)
            line.append(a)
        return line

    def asLine(self,token):
        return ' '.join(token)
Example #29
import json

from nltk.stem.isri import ISRIStemmer

# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str
# Read JSON file
with open('golden_corpus/build/golden_corpus_arabic.json') as data_file:
    golden_corpus = json.load(data_file)

stemmer = ISRIStemmer()
i = cpt_roots = 0
stemmed = ''
while i < len(golden_corpus) - 2:
    r = stemmer.stem(golden_corpus[i]["word"])
    if r == golden_corpus[i]["root"]:
        cpt_roots = cpt_roots + 1
    i = i + 1

rootssSuccessPercent = (cpt_roots*100)/float(len(golden_corpus))

print "======================================================"
print "================= Test ISRI-stemmer =================="
print "================= with Golden_Corpus  ================"
print "======================================================"
print "success rate roots = {:0.2f} %".format(rootssSuccessPercent)
print cpt_roots," root cases are passed from: ",len(golden_corpus)
print "======================================================"
print "=================     End Test        ================"
print "======================================================"
Example #30
 def stemer(self, word):
     stem = ISRIStemmer()
     root = stem.stem(word)
     return root
Example #31
case7p = [
"استبدلتموهم",
"فلتستقبلوهم"
]

case7 = [
"فلنبلونهم"
]

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')
    s = ISRIStemmer()
    nltk.data.path.append('/home/kariminf/Data/NLTK/')

    fout = open("isri_test.txt", "w")

    fout.write("it(\"Case of 7 chars\", function() {\n")
    for case in case7:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")

    fout.write("it(\"Case of plus than 7 chars\", function() {\n")
    for case in case7p:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")

    fout.close()
Example #32
import nltk
from nltk import word_tokenize

from nltk.stem.isri import ISRIStemmer

st = ISRIStemmer()

w = " البحث العلمي أو البحث أو التجربة التنموية هو أسلوب منظم في جمع المعلومات الموثوقة وتدوين الملاحظات والتحليل الموضوعي لتلك المعلومات باتباع أساليب ومناهج علمية محددة بقصد التأكد من صحتها أو تعديلها أو إضافة الجديد لها، ومن ثم التوصل إلى بعض القوانين والنظريات والتنبؤ بحدوث مثل هذه الظواهر والتحكم في أسبابها"

for a in word_tokenize(w):

    print(st.stem(a))
Example #33
def getRootAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        result.append(arstemmer.stem(word))
    return ' '.join(result)
def stemTokenize(text):
    stemmer = ISRIStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]


# Created by pyminifier (https://github.com/liftoff/pyminifier)
Example #35
import sys
from nltk.stem.isri import ISRIStemmer
arstemmer = ISRIStemmer()
token = sys.argv[1]
root = arstemmer.stem(token)
print (root)
Example #36
File: main.py, Project: imnawar/AnnArabic
def stem(w):
    isri_stemmer = ISRIStemmer()
    return isri_stemmer.stem(w)
Example #37
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  test_isri.py
#

from nltk.stem.isri import ISRIStemmer
stemmer = ISRIStemmer()
word = u"بمكتباتنا"
stem = stemmer.stem(word)
print stem.encode('utf8')