def read_data_from_file(filename):
    """Read a training or testing dataset, separating tweets from their labels."""

    tweets = []  # list of text samples
    labels = []  # list of label ids
    labels_index = {}  # dictionary mapping label name to numeric id

    istemmer = ISRIStemmer()
    read_file = open(filename, "r", encoding="utf-8")  # the file is only read here

    index = 0
    for line in read_file:

        line = line.split('\t')  # split the line into label and tweet

        label = line[0]
        tweet = line[1].strip(" \"")

        tweet = clean_str(tweet)
        tweet = istemmer.norm(tweet)

        if label not in labels_index:
            labels_index[label] = index
            index += 1

        tweets.append(tweet)
        labels.append(labels_index[label])

    read_file.close()

    return [tweets, labels]
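A minimal usage sketch (hypothetical data): the loader expects one tab-separated "<label>\t<tweet>" pair per line, and clean_str is assumed to be defined elsewhere in the same module.

# Hypothetical sample file and call; the file name and labels are made up for illustration.
sample = 'positive\t"مثال تغريدة أولى"\nnegative\t"مثال تغريدة ثانية"\n'
with open("sample_tweets.tsv", "w", encoding="utf-8") as f:
    f.write(sample)

tweets, labels = read_data_from_file("sample_tweets.tsv")
print(tweets)  # cleaned, normalized tweet texts
print(labels)  # numeric ids assigned in order of first appearance, e.g. [0, 1]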
Example #2
 def steamming(self):
     st = ISRIStemmer()
     tokens = self.tokenizer()
     # stem each token and join them back into one space-separated string
     return " ".join(st.stem(token) for token in tokens)
Example #3
def stemm(tweetstr):
	stemmer = ISRIStemmer()
	stemstr = []
	for s in tweetstr:
		st = stemmer.stem(s)
		stemstr.append(st)
	return stemstr
def read_data_from_file(filename, number_of_classes):
    """Read a training or testing dataset whose last `number_of_classes` columns are a binary label vector."""

    tweets = []  # list of text samples
    labels = []  # list of label ids

    istemmer = ISRIStemmer()
    read_file = open(filename, "r", encoding="utf-8")  # the file is only read here

    for line in read_file:
        tweet = ""

        filtered_line = line.split()  # whitespace-separated tokens

        # the last `number_of_classes` tokens form the binary label vector;
        # the first token (an id) is skipped when rebuilding the tweet text
        label = list(map(int, filtered_line[-number_of_classes:]))

        for word in filtered_line[1:-number_of_classes]:
            tweet += word + " "

        tweet = tweet[:-1]
        tweet = clean_str(tweet)

        tweet = istemmer.norm(tweet)

        tweets.append(tweet)
        labels.append(label)

    read_file.close()

    return [tweets, labels]
def finding_changeable_con(word, size):
    """Print concordances for `word`; if it is already a stem, expand it to its derived forms first."""
    st = ISRIStemmer()
    stemmed_word = st.stem(word)
    if stemmed_word == word:
        for token in stem_dict:
            if token == word:
                print("Stemmed Word : " + token)
                for x in range(len(stem_dict[token])):
                    derived_word = stem_dict[token][x]
                    print("Derived Word : ")
                    print(derived_word)
                    print("Sentences : ")
                    occurrences_list = word_dict[derived_word]
                    concordances_output = get_changeable_con(
                        occurrences_list, size)
                    print(*concordances_output, sep="\n")

    else:
        for token in word_dict:
            if token == word:
                print("Word : " + token)
                print("Stemmed Word : " + stemmed_word)
                print("Sentences : ")
                occurrences_list = word_dict[token]
                concordances_output = get_changeable_con(
                    occurrences_list, size)
                print(*concordances_output, sep="\n")
                print("\n")
    print("\n")
Example #6
def data_preprocessing(article):

    article = re.sub('\n', ' ', article)  # Replace newlines with spaces
    article = re.sub('الـ', '', article)  # Remove the prefix 'الـ'
    article = re.sub('لـ', '', article)  # Remove the prefix 'لـ'
    article = re.sub('بـ', '', article)  # Remove the prefix 'بـ'
    article = re.sub('ال', '', article)  # Remove the substring 'ال' (definite article)
    article = re.sub('عربية نت ', '', article)  # Remove the source name 'عربية نت'

    # Tokenize the article into words
    tokens = word_tokenize(str(article))
    # Define a list of punctuation
    remove_pun = str.maketrans('', '', string.punctuation)
    # Remove punctuation from each word
    words = [w.translate(remove_pun) for w in tokens]
    # Remove non-alphabetic characters
    alphabetic_words = [word for word in words if word.isalpha()]
    # Remove Arabic stopwords
    alphabetic_words = [
        word for word in alphabetic_words if word not in stop_words
    ]
    # Initialize the Arabic (ISRI) stemmer
    stemmer = ISRIStemmer()
    # Light stemming: strip length-two and length-three suffixes from each word
    stemmed_words = [stemmer.suf32(word) for word in alphabetic_words]
    # Join and return the stemmed_words
    return " ".join(stemmed_words)
Example #7
def stemm(tweetstr):
    stemmer = ISRIStemmer()
    stemstr = []
    for s in tweetstr:
        st = stemmer.stem(s)
        stemstr.append(st)
    return stemstr
 def stemming_ISR(self, text):
     st = ISRIStemmer()
     stemmed_words = []
     words = word_tokenize(text)
     for w in words:
         stemmed_words.append(st.stem(w))
     stemmed_text = " ".join(stemmed_words)
     return stemmed_text
 def Stem_word(self, body):
     st = ISRIStemmer()
     word = body.split(u" ")
     word_stem = list()
     for w in word:
         word_stem.append(st.stem(w))
     body = " ".join(word_stem) 
     return body
Example #10
def steaming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
Example #11
def stem(text):
    """Stem every whitespace-separated word in `text` and return the space-joined result."""
    st = ISRIStemmer()
    return " ".join(st.stem(word) for word in text.split())
def one_string_steming(sentence):
    '''
    Argument:
        sentence: a string of words
    Return:
        a list of stemmed words (each word reduced towards its root)
    '''
    sentence = one_string_tokenization(sentence)
    stemmer = ISRIStemmer()
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence
def build_stem_dictionary(preprocessed_text, stop_words):
    # This method builds the Roots Dictionary as follows
    # {'stemmed_word1': ['derived_word1', 'derived_word2', ...],
    #  'stemmed_word2': ['derived_word1', 'derived_word2', 'derived_word3', ...], ...}
    st = ISRIStemmer()
    words_list = word_tokenize(preprocessed_text)
    for token in words_list:
        if token not in stop_words and token != '.':
            stemmed_token = st.stem(token)
            stem_dict.setdefault(stemmed_token, [])
            if token not in stem_dict[stemmed_token]:
                stem_dict[stemmed_token].append(token)
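An illustration of the resulting structure (hypothetical text; the exact stems depend on the ISRI rules), assuming this runs in the same module with stem_dict declared as an empty module-level dict and the NLTK 'punkt' data installed.

stem_dict = {}  # module-level dict that build_stem_dictionary fills in
build_stem_dictionary("الكاتب كتب الكتاب .", stop_words=set())
print(stem_dict)  # roughly: {'كتب': ['الكاتب', 'كتب', 'الكتاب']}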
Example #14
def arabic_social_media_text_filter(txt, debug=0):
    """
    This filter is for filtering Arabic text from social media.

    :param txt: UTF-8 text (unicode)
    :param debug: any value greater than 0 prints messages about normalized vs. original text
    :return: the filtered text with diacritics removed
    """
    txt = social_media_text_filter(txt, debug=debug)
    # Remove diacritics
    st = ISRIStemmer()
    txt = st.norm(txt)
    return txt
Example #16
 def stemLexicon(self, newLex):  #newLex = prepareLexicon()
     stemmed_Lexicon_words = []
     polarity_Lex = []
     stLex = ISRIStemmer()
     for index, _ in newLex.iterrows():
         word = newLex.at[index, 'ngram']
         polarity = newLex.at[index, 'polarity']
         stemmed_Lexicon_words.append(stLex.stem(word))
         polarity_Lex.append(polarity)
     stemmed_Lexicon_DF = pd.DataFrame({
         'ngram': stemmed_Lexicon_words,
         'polarity': polarity_Lex
     })
     return stemmed_Lexicon_DF  # a DataFrame of stemmed ngrams and their polarities
Example #17
def tokenize_documents(documents):

    stop_words = stopwords.words('english') + stopwords.words('spanish') #common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()

    punctuation = { ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # Returns False for common words, links, and punctuation
        if (token in filtered) or (token[0:4] == u'http') or \
                (token in string.punctuation):
            return False
        else:
            return True

    for doc in documents:

        row = doc[0]
        doc = doc[1]

        if doc is not None:

            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)

            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)

            cleaned_tokens = []
            for token in tokens:

                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token)) 

                    cleaned_tokens.append(token)    

            yield row
            yield cleaned_tokens
Example #18
 def get_test_negative_array_stemmed_without_sw(self):
     stemmer = ISRIStemmer()
     test_negative_array_stemmed_without_sw = []
     review_words_stemmed_without_sw = []
     for review in self.get_test_negative_array(self):
         review_words = nltk.word_tokenize(review)
         review_words_without_sw = [
             i for i in review_words if i not in self.get_arabic_sw(self)
         ]
         review_words_stemmed_without_sw = []
         for word in review_words_without_sw:
             review_words_stemmed_without_sw.append(stemmer.stem(word))
         test_negative_array_stemmed_without_sw.append(" ".join(
             str(x) for x in review_words_stemmed_without_sw))
     return test_negative_array_stemmed_without_sw
Example #19
def stem(string):

    # split given string into words
    words = string.split()
    stems_list = []

    isri_stemmer = ISRIStemmer()

    for word in words:
        # stem word
        stem_word = isri_stemmer.stem(word)
        # add the stem to the list
        stems_list.append(stem_word)

    return stems_list
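A minimal usage sketch (hypothetical input):

print(stem("الكاتب كتب الكتاب"))  # a list of ISRI stems, roughly ['كتب', 'كتب', 'كتب']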
Example #20
 def sentencePreprocessingDF(self, df, row, col):
     # Stopword removal, elongated-word normalization, and stemming for one DataFrame cell
     with open("arabic_stop_words.txt", "r", encoding="utf-8") as arabic_sw_file:
         ar_sw_list = word_tokenize(arabic_sw_file.read())
     st = ISRIStemmer()
     tokenized_word_list = []
     words = word_tokenize(df.at[row, col])
     for word in words:
         if word not in ar_sw_list:
             word = self.replaceElongated(word)
             tokenized_word_list.append(st.stem(word))
     return " ".join(tokenized_word_list)
    def __init__(self, category_id):
        self.category_id = category_id
        print('Books Class instantiated for Category {}.'.format(category_id))

        # NLTK Stemmer
        self.st = ISRIStemmer()

        # get all stop words
        # individual letters (typos & printing issues)
        sw1 = get_stop_words('arabic') + stopwords.words("arabic")
        sw2 = [
            'ا', 'أ', 'إ', 'ذ', 'ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ',
            'ح', 'ج', 'ش', 'س', 'ي', 'ب', 'ل', 'ا', 'ال', 'ت', 'ن', 'م', 'ك',
            'ئ', 'ء', 'ؤ', 'ر', 'لا', 'ى', 'ة', 'و', 'ز', 'ظ'
        ]
        self.sw = set(sw1 + sw2)
Example #22
    def stem(string, stemmer="porter", **kwargs):

        if stemmer == "porter":
            impl = PorterStemmer()
        elif stemmer == "lancaster":
            impl = LancasterStemmer()
        elif stemmer == "regex":
            regexp = kwargs['regexp']
            if 'min' in kwargs:
                min = kwargs['min']
            else:
                mins = 0
            impl = RegexpStemmer(regexp=regexp, min=min)
        elif stemmer == "isri":
            impl = ISRIStemmer()
        elif stemmer == "snowball":
            if 'language' in kwargs:
                language = kwargs['language']
            else:
                language = 'english'
            impl = SnowballStemmer(language=language)
        elif stemmer == "rslp":
            impl = RSLPStemmer()
        elif stemmer == "cistem":
            if 'case_insensitive' in kwargs:
                case_insensitive = kwargs['case_insensitive']
            else:
                case_insensitive = False
            impl = Cistem(case_insensitive=case_insensitive)
        else:
            return string

        return impl.stem(string)
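The dispatcher above appears to be defined on a class, so this standalone sketch exercises the same NLTK stemmers directly rather than through the wrapper; the expected outputs are indicative only.

from nltk.stem.porter import PorterStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.stem.snowball import SnowballStemmer

print(PorterStemmer().stem("running"))                        # 'run'
print(ISRIStemmer().stem("يكتبون"))                           # an Arabic root, e.g. 'كتب'
print(SnowballStemmer(language="spanish").stem("corriendo"))  # e.g. 'corr'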
Example #23
 def get_features(comment, lan):
     words = list(comment)
     if lan == 'ar':
         st = ISRIStemmer()
         features = [0] * len(word_features_ar2)
         for w in words:
             w = st.stem(w)
             if w in word_features_ar_dict:
                 features[word_features_ar_dict[w]] = 1
     else:
         features = [0] * len(word_features_en2)
         for w in words:
             w = stem(w)
             if w in word_features_en_dict:
                 features[word_features_en_dict[w]] = 1
     return features
Example #24
    def text_stemming(self):
        """
        stem the text
        """
        if self.language == "french":
            stemmer = FrenchStemmer()
        elif self.language == "english":
            stemmer = PorterStemmer()
        elif self.language == "italian":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "german":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "spanish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "dutch":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "portuguese":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "danish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "greek":
            stemmer = GreekStemmer()
        elif self.language == "arabic":
            stemmer = ISRIStemmer()
        else:
            raise ValueError(
                "language must be one of: french, english, italian, german, "
                "spanish, dutch, portuguese, danish, greek, or arabic"
            )

        self.text = ' '.join(
            [stemmer.stem(word) for word in word_tokenize(self.text)])
Example #25
    def __init__(self, file_name=None, lang=_SPANISH, stemming=False):
        """
        Initializes the parameters for specific language
        """
        self._text = os.getenv('TEXT', default='text')
        self.languages = [_SPANISH, _ENGLISH, _ARABIC]
        self.lang = lang
        if self.lang not in self.languages:
            raise ("Language not supported: " + lang)
        self.text_model = TextPreprocessing(lang=self.lang)
        self.stem = stemming

        if self.lang == _ENGLISH:
            self.stemmer = PorterStemmer()
        elif self.lang == _ARABIC:
            from nltk.stem.isri import ISRIStemmer
            self.stemmer = ISRIStemmer()
        else:
            self.stemmer = SnowballStemmer(self.lang)

        self.emotions = {}
        self.stem_emotions = {}
        if file_name is not None:
            emo_file = file_name
        else:
            if self.lang in [_ENGLISH, _ITALIAN, _GERMAN, _ARABIC]:
                emo_file = self.lang[:2] + "." + _AFFECTIVE_FILE
            elif self.lang == _SPANISH:
                emo_file = "es." + _AFFECTIVE_FILE
            emo_file = os.path.join(PATH, 'data', emo_file)
        self.load_emotions(emo_file)
Example #26
class BasicStemmer(Stemmer):
    def __init__(self):
        self.stemmer = ISRIStemmer()
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):

        rootList = []

        for token in tokens:
            #token=stemmer.norm(token)
            root = self.stemmer.pre32(token)
            rootList.append(root)
            print(token, "  :  ", root)

        return rootList

    def stem(self, word):
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)

        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        with open(filePath, "r", encoding="windows-1256") as dict_file:
            lines = dict_file.readlines()
        dictionary = nltk.defaultdict(list)
        for line in lines:
            if not re.match("^;.*", line):
                parts = line.split('\t')
                if len(parts) != 4:
                    break
                else:
                    [rootStem, stem, tag, enGloss] = parts
                    dictionary[rootStem].append(
                        [stem, tag, ' '.join(enGloss.split(';'))])

        return dictionary

    def verify(self, word):
        if self.stopWordsIndex.access(word):
            return True

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
Example #27
 def get_training_array_stemmed_without_sw(self):
     stemmer = ISRIStemmer()
     training_array_stemmed_without_sw = []
     for review in self.get_positive_reviews_stemmed_without_sw(self):
         training_array_stemmed_without_sw.append((review, 'pos'))
     for review in self.get_negative_reviews_stemmed_without_sw(self):
         training_array_stemmed_without_sw.append((review, 'neg'))
     return training_array_stemmed_without_sw
Example #28
class Stemming:
    def __init__(self):
        self.st = ISRIStemmer()

    def stemWord(self, text):
        word_tokens = word_tokenize(text)
        stemmed_words = [self.st.stem(w) for w in word_tokens]

        return ' '.join(stemmed_words)
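A minimal usage sketch (hypothetical sentence); word_tokenize needs the NLTK 'punkt' data to be installed.

print(Stemming().stemWord("الطلاب يدرسون في المكتبة"))  # space-joined ISRI stems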
Example #29
def stem_tokens(token_list, src_lang):
    """
    Returns the stem of a given word depending on the source language.
    """

    stemmed = []

    if src_lang == 'en':
        ps = PorterStemmer()
        for token in token_list:
            stemmed.append(ps.stem(token))
    
    if src_lang == 'ar':
        isri = ISRIStemmer()
        for token in token_list:
            stemmed.append(isri.stem(token))

    return stemmed
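A minimal usage sketch (hypothetical token lists), assuming PorterStemmer and ISRIStemmer are imported at module level as the function expects; outputs are indicative only.

print(stem_tokens(["running", "stemmers"], "en"))  # e.g. ['run', 'stemmer']
print(stem_tokens(["يكتبون", "المكتبة"], "ar"))    # ISRI roots, e.g. ['كتب', 'كتب']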
def WordsFiltires(tokenstem):
    """

    :param tokenstem:
    :return WordsFiltires:
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()

    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))

    WordsFiltires = ' '.join(stemmed_word)
    return WordsFiltires
	def __init__(self, query, model, processed_corpus_path):
		self.model = model
		self.processed_corpus_path=processed_corpus_path
		self.query = query
		self.query_tokens=[]
		self.query_term_freq={}
		self.term_weights={}
		self.stemmer = ISRIStemmer()
		self.threshold = 0.005
		self.top_res = 5
		self.ar_stop_words=[]
		with open ("/home/tex/Documents/IR/Wikipedia-Search-Engine/project/rankretrievalmodel/Arabic/stop_words", 'r') as infile:
			self.ar_stop_words=[word[:-1] for word in infile.readlines()]

		self.tokenize() 
		self.remove_stop_words()
		self.stem_tokens()
		self.term_freq()
		self.tfidf()
class Books():
    def __init__(self, category_id):
        self.category_id = category_id
        print('Books Class instantiated for Category {}.'.format(category_id))

        # NLTK Stemmer
        self.st = ISRIStemmer()

        # get all stop words
        # individual letters (typos & printing issues)
        sw1 = get_stop_words('arabic') + stopwords.words("arabic")
        sw2 = [
            'ا', 'أ', 'إ', 'ذ', 'ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ',
            'ح', 'ج', 'ش', 'س', 'ي', 'ب', 'ل', 'ا', 'ال', 'ت', 'ن', 'م', 'ك',
            'ئ', 'ء', 'ؤ', 'ر', 'لا', 'ى', 'ة', 'و', 'ز', 'ظ'
        ]
        self.sw = set(sw1 + sw2)

    def not_sw(self, text):  # excludes stop words
        return (text not in self.sw) or self.st.stem(text) not in self.sw

    def not_small_big(
            self,
            text):  # exclude very short (< 3 chars) and very long (> 9 chars) tokens
        return (len(text) >= 3) and (len(text) <= 9)

    def get_book_id(self, index_url):
        return re.findall(r'13\d\\(\d+)', str(index_url))[0]

    def strip_text(self, text):
        return araby.strip_tatweel(araby.strip_tashkeel(text))

    # This function is the main reason for having this class
    # Since  Doc2Vec can take a `iter` to go through each file
    # one at a time, instead of loading all the books into memory.
    def __iter__(self):
        for i, file_name in enumerate(
                glob('../../data/' + str(self.category_id) + '/*.json')):
            print('Started Book: {}.'.format(self.get_book_id(file_name)))
            try:
                with open(str(file_name)) as f:
                    book_text = json.load(f)['text']

                #### Start Processing
                start_time = time.time()
                processed_book = araby.tokenize(
                    self.strip_text(book_text),
                    conditions=[self.not_sw, araby.is_arabicword])
                print('Cleaned Book: {} in {} seconds.'.format(
                    self.get_book_id(file_name),
                    time.time() - start_time))
                yield TaggedDocument(processed_book, [i])

            except Exception as exc:
                print("Fix {}: {}".format(file_name, exc))
Example #33
    def basic_init(self, lang=_SPANISH, sentence_delim=False, **kwargs):
        if sentence_delim is False:
            self._BEGIN_TAG = ""
            self._END_TAG = ""

        self.lang = lang
        self.sentence_delim = sentence_delim

        logger.info("sws for  {}".format(lang))
        self.stopWords = self.get_stopwords(lang)
        self.tokenizer = TweetTokenizer()
        self.stemmer = None
        if self.lang in [_SPANISH, _ITALIAN, _PORTUGUESE]:
            self.stemmer = SnowballStemmer(_SPANISH, ignore_stopwords=False)
        elif self.lang == _ENGLISH:
            from nltk.stem.porter import PorterStemmer
            self.stemmer = PorterStemmer()
        elif self.lang == _ARABIC:
            from nltk.stem.isri import ISRIStemmer
            self.stemmer = ISRIStemmer()
Example #34
    def __init__(self, configFileName, stopWordsFileName,
                 languageModelSerializationFileName, linksDBFileName, dataset):
        '''
        Constructor
        '''
        # The dataset to work on to extract the model
        self.dataset = []

        # Term/frequency language model
        self.languageModel = {}
        self.languageModelFreqInfo = {}

        # Dict of stop words
        self.stopWords = {}

        # Store the dataset
        self.dataset = dataset

        # Initialize number of terms per label
        self.numTermsPerLabel = {}

        # Initialize the links DB
        self.linksDB = {}
        self.linksDBFileName = linksDBFileName

        # Parse the configurations file
        self.ParseConfigFile(configFileName)

        # Instantiate the stemmer if stemming is enabled
        if self.enableStemming == "true":
            self.stemmer = ISRIStemmer()

        # Store the stop words
        self.UpdateStopWords(stopWordsFileName)

        # Store the serialization file
        self.languageModelSerializationFileName = languageModelSerializationFileName

        # Initialize total docs
        self.totalNumberOfDocs = len(self.dataset)
Example #35
def lightStemAr(word_list):
	result = []
	arstemmer = ISRIStemmer()
	for word in word_list:
		word = arstemmer.norm(word, num=1)  # remove diacritics representing Arabic short vowels
		if not word in arstemmer.stop_words:   # exclude stop words from being processed
			word = arstemmer.pre32(word)        # remove length three and length two prefixes in this order
			word = arstemmer.suf32(word)        # remove length three and length two suffixes in this order
			word = arstemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
			word = arstemmer.norm(word, num=2)       # normalize initial hamza to bare alif
		result.append(word)
	return ' '.join(result)
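A minimal usage sketch (hypothetical words): unlike full ISRI stemming, this light variant only strips diacritics, length-two and length-three affixes, and a connective waw, so the exact output depends on the ISRI affix lists.

print(lightStemAr(["وَالكتاب", "مدرستنا", "يذهبون"]))  # space-joined, lightly stemmed words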
Example #36
 def __init__(self):
     self.st = ISRIStemmer()
     self.getStopwords()
     self.getNegationwords()
     self.getSymbol()
Example #37
class Preprocess:    
    def __init__(self):
        self.st = ISRIStemmer()
        self.getStopwords()
        self.getNegationwords()
        self.getSymbol()


       
    def analysis(self,line):
        line = self.enLine(line)
        line = self.tokenize(line)
        line = self.remSW(line)
        line = self.getTerms(line)
        line = self.remNE(line)
        line = self.removeNA(line)
        line = self.asLine(line)
        return line


    def analysisList(self,line_list):
        newList = list()
        for line in line_list:
            line = self.enLine(line)
            line = self.tokenize(line)
            line = self.remSW(line)
            line = self.getTerms(line)
            line = self.remNE(line)
            line = self.removeNA(line)
            line = self.asLine(line)
            newList.append(line)
        return newList
    
    def getStopwords(self):
        '''get stopwords from the stopwords file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'stopword.txt')
        f = open(file_path, 'r')
        stopwords = [line.rstrip() for line in f]
        sw = dict.fromkeys(stopwords)
        f.close()
        self.sw = [z.decode('utf-8') for z in sw]


    def getNegationwords(self):
        '''get negation words from the negation file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'negation.txt')
        f = open(file_path, 'r')
        newords = [line.rstrip() for line in f]
        ne = dict.fromkeys(newords)
        f.close()
        self.ne = [n.decode('utf-8') for n in ne]

    def getSymbol(self):
        '''get symbol from symbol file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'symbol.txt')
        f = open(file_path, 'r')
        sy = [line.rstrip() for line in f]
        ne = dict.fromkeys(sy)
        f.close()
        self.sy = [s.decode('utf-8') for s in sy]

        
    def enLine(self,line):
        ''' convert line to unicode '''
        try:
            line = line.decode('utf-8')
            self.log_msg = "string decoded as UTF-8, length %d bytes" % len(line)
        except UnicodeError:
            self.log_msg = "string is not valid UTF-8"

        for s in self.sy:
            try:
                s = s.decode('utf-8')
            except UnicodeError:
                log_msg = "string is UTF-8"
            line = line.replace(s, u' ' + s + u' ')
            
        #line = line.replace(u'.', u' . ')
        #line = line.replace(u'.', u' . ')
        return line

            
    def removeNA(self,token):
        '''remove non-Arabic'''
        #x = re.compile(ur'[\u064B-\u065F]+', re.UNICODE)
        #line = [x.sub('', word) for word in line]
        x = re.compile(ur'[^\u0621-\u064A|_]+[\u1F300-\u1F5FF\u1F600-\u1F64F\u1F680-\u1F6FF\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        x = re.compile(ur'[\u0023]+', re.UNICODE)
        token = [x.sub('', word) for word in token]

        token = [word for word in self.asLine(token).split()]
        return token


    def tokenize(self,line):
        # tokenize very long lines in 50,000-character chunks
        if len(line) > 50000:
            token = list()
            for start in range(0, len(line), 50000):
                token = token + word_tokenize(line[start:start + 50000])
        else:
            token = word_tokenize(line)

        return token


    def remSW(self,token):
        token_clean = [x for x in token if x not in self.sw]
        return token_clean


    def remNE(self,token):
        for i in range(len(token)):
            if token[i] in self.ne:
                temp = token[i]
                for x in range(i+1,len(token)):
                    if token[x] in self.sy:
                        break
                    else:
                        token[x] = temp + '_' + token[x]
            
        token_clean = [x for x in token if x not in self.ne]
        token_clean = [x for x in token_clean if x not in self.sy]
        return token_clean


    def norma (self,word):
        if word[:2] == u'ال' :
            word = word[2:]
        #ألف 
        x = re.compile(ur'[\u0622|\u0623|\u0625]+', re.UNICODE)
        word = x.sub(ur'\u0627', word)
        #ياء + ألف مقصورة 
        x = re.compile(ur'[\u0649]+', re.UNICODE)
        word = x.sub(ur'\u064A', word)
        #تاء مربوطة + هاء
        x = re.compile(ur'[\u0629]+', re.UNICODE)
        word = x.sub(ur'\u0647', word)
        #تطويلة
        x = re.compile(ur'[\u0640]+', re.UNICODE)
        word = x.sub(ur'', word)
        return word


    def getTerms(self,token):     
        line = list()
        for i in range(len(token)):
            a = self.norma(token[i])
            a = self.st.stem(a)
            line.append(a)
        return line

    def asLine(self,token):
        return ' '.join(token)
Example #38
import os
import sys
import json
import io
from nltk.stem.isri import ISRIStemmer

# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str
# Read the JSON file
with io.open('golden_corpus/build/golden_corpus_arabic.json', encoding='utf-8') as data_file:
    golden_corpus = json.load(data_file)

stemmer = ISRIStemmer()
i = cpt_roots = 0
stemmed = ''
while i < len(golden_corpus) - 2:
    r = stemmer.stem(golden_corpus[i]["word"])
    if r == golden_corpus[i]["root"]:
        cpt_roots += 1
    i += 1

rootssSuccessPercent = (cpt_roots*100)/float(len(golden_corpus))

print "======================================================"
print "================= Test ISRI-stemmer =================="
print "================= with Golden_Corpus  ================"
print "======================================================"
print "success rate roots = {:0.2f} %".format(rootssSuccessPercent)
Example #39
import sys

import nltk
from nltk.stem.isri import ISRIStemmer

case7p = [
"استبدلتموهم",
"فلتستقبلوهم"
]

case7 = [
"فلنبلونهم"
]

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')
    s = ISRIStemmer()
    nltk.data.path.append('/home/kariminf/Data/NLTK/')

    fout = open("isri_test.txt", "w")

    fout.write("it(\"Case of 7 chars\", function() {\n")
    for case in case7:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")

    fout.write("it(\"Case of plus than 7 chars\", function() {\n")
    for case in case7p:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")
Example #40
def remove_diacritics(text):
	arstemmer = ISRIStemmer()
	result = arstemmer.norm(text, num=1)  # remove diacritics representing Arabic short vowels
	return result
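A minimal usage sketch: norm(text, num=1) strips only the short-vowel marks (fatha, damma, kasra, tanween, shadda, sukun) and leaves the letters unchanged.

print(remove_diacritics("مَدْرَسَةٌ"))  # expected output: مدرسة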
Example #41
def getRootAr(word_list):
	result = []
	arstemmer = ISRIStemmer()
	for word in word_list:
		result.append(arstemmer.stem(word))
	return ' '.join(result)