import os
import re
import sys
from collections import Counter

from nltk.stem.isri import ISRIStemmer


def main():
    # Open all files related to removing stop words or punctuation from the data.
    sw_in = open(r"../data/arstoplist.txt")
    stopwords = sw_in.read().splitlines()
    punctlist = open("../data/arabpunct.txt").read().splitlines()
    directory = sys.argv[1]  # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")
    st = ISRIStemmer()
    #rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []
    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)
    print("Files read.")

    stemmed = {}
    types = {}
    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []
    for line in filelist:
        #line = line.strip()
        #tokenize = word_tokenize(line)  # Tokenize the text.
        tokenize = tokenizer(line)
        #tokenize.sort()  # Comment this out after the test set has been used?
        # Define all patterns that shall be excluded.
        rx_ar = re.compile(u'^[\u0621-\u064A]+$')  # This excludes Arabic words that have numbers attached to them.
        rx_ar2 = re.compile(u'^(\u0622{2,})')
        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()

    # Re-insert:
    for w in compl_list:
        types[w] = 0
    #if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
    #    if len(w) > 1:  # ERROR
    #        new_w = w[:-1]  # ERROR! This strips off Arabic letters although they are not in the punctlist
    #        types[new_w] = 0
    #        tokens.append(new_w)
    #    else:
    #        types[w] = 0
    #        tokens.append(w)
    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # Print the stemmed words and their unstemmed versions to a file.
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        if len(w) > 4:
            # Don't save words that are longer than 4 letters. Verbs in Arabic are usually
            # 3 letters long. In very rare cases they can be 2 or 4 letters long as well.
            pass
        else:
            wordlist.append(w)
            #f_out.write(w + "\t" + stemmed[w])
            #f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs:" + str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # Handle some corpus stats.
    corp_stat = Counter(tokens)
    for w in list(corp_stat.keys())[0:11]:
        print("token: " + w + "\tno.: " + str(corp_stat[w]))
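# The snippet above calls tokenizer(line) without defining it; the commented-out
# word_tokenize(line) suggests it just splits a line into word tokens. A minimal
# sketch of such a helper, assuming a whitespace/punctuation split is sufficient
# (the name and behaviour are assumptions, not the original implementation):
import re

def tokenizer(line):
    # Split on runs of whitespace or common Western and Arabic punctuation,
    # dropping empty strings produced at the edges.
    return [t for t in re.split(r'[\s\.,;:!?،؛؟"«»]+', line) if t]

# Example: tokenizer("مرحبا، كيف الحال؟") -> ['مرحبا', 'كيف', 'الحال']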
import pickle

# load the dataset
data = open('ManualAnnotatedFakeNewsDataset.txt').read()
#data = open('AutomaticAnnotatedFakeNewsDataset.txt').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# stemming
data1 = []
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())
#print(data1[:10])

# tashfeen
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
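# The snippet is cut off at the final loop. A minimal sketch of how the Tashaphyne
# pass might continue, mirroring the ISRI loop above; this continuation and the
# placeholder input are assumptions, not the original code:
from nltk import word_tokenize
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
texts = ["هذا خبر غير صحيح"]  # placeholder; the original iterates over the dataset texts
data2 = []
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        ArListem.light_stem(a)                      # segment the word into prefix/stem/suffix
        tweet = tweet + ArListem.get_stem() + " "   # keep the light stem
    data2.append(tweet.strip())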
def stemmimg_text(self, text):
    # Apply the ISRI stemmer to an already tokenized text (a list of words).
    st = ISRIStemmer()
    return [st.stem(w) for w in text]
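# A short usage sketch: the method expects a list of tokens, not a raw string
# (iterating over a string would stem single characters). The host class name and
# the tokenization step below are assumptions for illustration only:
from nltk.stem.isri import ISRIStemmer

class ArabicPreprocessor:  # hypothetical host class
    def stemmimg_text(self, text):
        st = ISRIStemmer()
        return [st.stem(w) for w in text]

tokens = "يذهب الطلاب إلى المدرسة".split()
print(ArabicPreprocessor().stemmimg_text(tokens))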
for doc in docs:
    # print (doc)
    for line in doc['content']:
        text = re.sub(r'[\d+ a-zA-Z? & , \xd8 « » . :"،]', ' ', line)  # remove non-alphabetical characters and non-arabic characters
        tkns = text.split()
        tokenss = []
        for token in tkns:
            tokenss.append(token)
        tokens.append(tokenss)  # produces list of lists of tokens
cleaned_data = [item for item in tokens if item != []]
return cleaned_data

stemmer = ISRIStemmer()
data = clean_data()  # this is a list of lists of tokens

def lemmatizer(token):
    #print ("Data lemmatized")
    token = stemmer.pre32(token)  # removes the three-letter and two-letter prefixes
    token = stemmer.suf32(token)  # removes the three-letter and two-letter suffixes
    token = stemmer.norm(token, num=1)  # removes diacritics
    return token

def stop_words():
    stop_words = stopwords.words('arabic')
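# A minimal sketch of how the pieces above are typically combined: run every token
# through lemmatizer() and drop Arabic stop words. The loop, the placeholder data,
# and the name processed_data are assumptions for illustration, not the original pipeline:
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
arabic_stopwords = set(stopwords.words('arabic'))  # requires nltk.download('stopwords')

def lemmatizer(token):
    token = stemmer.pre32(token)        # strip three- and two-letter prefixes
    token = stemmer.suf32(token)        # strip three- and two-letter suffixes
    token = stemmer.norm(token, num=1)  # normalize away diacritics
    return token

data = [["يذهب", "الطلاب", "إلى", "المدرسة"]]  # placeholder for clean_data() output
processed_data = [
    [lemmatizer(tok) for tok in doc if tok not in arabic_stopwords]
    for doc in data
]
print(processed_data)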
def __init__(self):
    self.stemmer = ISRIStemmer()
    self.stopWordsIndex = ArabicStopWordsIndex(self)
    self.stopWordsIndex.buildIndex()
import string
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()

text = "على قيادة المؤتمر الشعبي العام قراءة رسالة الشعب جيدا من خلال احتشاد ميدان السبعين ، والتي تعني تحمل مسؤليته"
words = text.split()
new_words = []
for word in words:
    # stem word
    new_word = isri.stem(word)
    #print("." + new_word + ".")
    # don't append if stemming turns it into whitespace/""
    if new_word != "":
        new_words.append(new_word)
# return this
new_text = ' '.join(new_words)
print(new_text)
def __init__(self):
    self.st = ISRIStemmer()
def _getstem(_word):
    st = ISRIStemmer()
    return st.stem(_word)
import json

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.isri import ISRIStemmer
from nltk.stem import RSLPStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tinysegmenter
import traceback
#from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian (Sastrawi) stemmer
stem_pt = RSLPStemmer()  # Portuguese (Brazilian) stemmer
stem_ja = tinysegmenter.TinySegmenter()  # Japanese segmenter
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')

def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)
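# A minimal sketch of how the per-language objects defined above might be used behind
# one helper: map a language code to its stemmer and fall back to the WordNet lemmatizer.
# The function name stem_tokens and the code-to-stemmer mapping are assumptions for
# illustration; they are not part of the original module and rely on the objects above:
def stem_tokens(tokens, lang):
    snowball = {"nl": stem_nl, "ru": stem_ru, "sv": stem_sv, "fr": stem_fr, "de": stem_de}
    if lang == "ar":
        return [stem_ar.stem(t) for t in tokens]            # NLTK ISRI stemmer
    if lang == "id":
        return [sastrawi_stemmer.stem(t) for t in tokens]   # Sastrawi stemmer
    if lang == "pt":
        return [stem_pt.stem(t) for t in tokens]            # RSLP stemmer
    if lang in snowball:
        return [snowball[lang].stem(t) for t in tokens]     # Snowball stemmers
    return [lemmatizer.lemmatize(t) for t in tokens]        # default: WordNet lemmatizer

# Example: stem_tokens(["running", "dogs"], "en") -> ['running', 'dog']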