Example #1
 def __init__(self, P):
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.docs = self.get_answer_canditates(P)
     docs_stem = []
     for doc in self.docs:
         docs_stem.append(self.stem_string(doc))
     self.stopwords = stopwords.words('arabic')
     self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
     self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem)
Example #2
 def __init__(self, model_path):
     self.model_path = model_path
     print("loading fastText model ...")
     #self.model = pickle.load(open(self.model_path,"rb"))
     self.model = KeyedVectors.load_word2vec_format(self.model_path,
                                                    encoding='utf-8',
                                                    unicode_errors='ignore')
     print("done fastText loading model")
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
     self.vocab = self.model.vocab
Example #3
 def __init__(self, P):
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.docs = self.get_answer_canditates(P)
     docs_stem = []
     for doc in self.docs:
         docs_stem.append(self.stem_string(doc))
     # self.stopwords = stopwords.words('arabic')
     self.stopwords = open("stopwords-ur.txt").read().splitlines()
     self.stopwords = [i.lower() for i in self.stopwords]
     self.vectorizer = CountVectorizer(
         ngram_range=(1, 4))  # , stop_words=self.stopwords)
     self.count_matrix = self.vectorizer.fit_transform(docs_stem)
Example #4
class fastTextEmbedder:
    def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path,
                                                       encoding='utf-8',
                                                       unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab

    def tokenize_string(self, str):
        """
        :param str: string sentence
        :return: list of (token, stem) pairs for tokens that contain no symbols
        """
        str_tokens = self.tokenizer.tokenize(str)
        tokens_stemmed = []
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                tokens_stemmed.append((token, self.stemmer.stem(token)))
        return tokens_stemmed

    def embed_tokens(self, sent, max_len):
        """
        :param sent: string sentence
        :param max_len: maximum number of tokens to embed
        :return: (max_len, 300) matrix of token embeddings, zero-padded at the end
        """
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros((max_len, 300))
        j = 0  # next free row in the embedding matrix
        for i in range(min(len(sent_tokens), max_len)):
            # prefer the surface token; fall back to its stem
            if sent_tokens[i][0] in self.vocab:
                embedding[j] = self.model[sent_tokens[i][0]]
                j += 1
            elif sent_tokens[i][1] in self.vocab:
                embedding[j] = self.model[sent_tokens[i][1]]
                j += 1
        return embedding

    def embed(self, sent):
        """
        :param sent: string sentence
        :return: embedding of sentence as np vector of dim=300
        """
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros(300)
        for token in sent_tokens:
            if token[0] in self.vocab:
                embedding += self.model[token[0]]
            elif token[1] in self.vocab:
                embedding += self.model[token[1]]
        return embedding
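
A minimal usage sketch (not part of the original example): the vectors file name is a hypothetical placeholder for a fastText model in word2vec text format, and model.vocab assumes a gensim release before 4.0, where KeyedVectors still exposes .vocab.
embedder = fastTextEmbedder("cc.ar.300.vec")    # hypothetical local path to Arabic fastText vectors
sentence_vec = embedder.embed("...")            # placeholder Arabic sentence; sum of token vectors, shape (300,)
token_matrix = embedder.embed_tokens("...", 50) # zero-padded matrix, shape (50, 300)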
Example #5
class countReader:
    SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'

    def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        # self.stopwords = stopwords.words('arabic')
        self.stopwords = open("stopwords-ur.txt").read().splitlines()
        self.stopwords = [i.lower() for i in self.stopwords]
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 4))  # , stop_words=self.stopwords)
        self.count_matrix = self.vectorizer.fit_transform(docs_stem)

    def stem_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        str_processed = ""
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                str_processed += token + " " + self.stemmer.stem(token) + " "
        return str_processed

    def concatenateString(self, paragraph, start, length):
        final_string = paragraph[start]
        for i in range(1, length):
            final_string += " " + paragraph[start + i]
        return final_string

    def get_answer_canditates(self, paragraph):
        para_sents = nltk.sent_tokenize(paragraph)
        candidates = []
        for sent in para_sents:
            para_words = sent.split()
            for i in range(0, len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidate = self.concatenateString(para_words, i, j)
                    candidates.append(candidate)
        return candidates

    def read(self, P, Q):
        # P is unused here: the answer candidates were already indexed in __init__
        Q = self.stem_string(Q)
        query_vec = self.vectorizer.transform([Q])
        similarities = cosine_similarity(self.count_matrix, query_vec).ravel()
        max_index = np.argmax(similarities)
        return self.docs[max_index]
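
A hedged usage sketch (not part of the original example): it assumes a stopwords-ur.txt file in the working directory and NLTK's punkt data for sentence splitting; P and Q are placeholder strings.
P = "..."   # placeholder paragraph text
Q = "..."   # placeholder question
reader = countReader(P)      # builds the n-gram count index over answer candidates drawn from P
answer = reader.read(P, Q)   # candidate with the highest cosine similarity to the stemmed question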
Example #6
 def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
     self.k = k  # number of documents to return
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.docs = docs
     self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None)
     if tfidf_matrix is None or vectorizer is None:
         self.tfidf_matrix = self.vectorizer.fit_transform(docs)
     else:
         self.vectorizer = vectorizer
         self.tfidf_matrix = tfidf_matrix
Example #7
    def __init__(self):
        #preparing arabic emojis lexicon
        emojis_lexicon = pd.read_csv(
            'resources/emoji_lexicon.csv',
            header=None,
            names=['emoji', 'utf-8', 'unicode', 'arabic_translation'])
        self.emojis_lexicon_dict = dict()
        for index, row in emojis_lexicon.iterrows():
            self.emojis_lexicon_dict[row['emoji'].replace(
                ' ', '')] = row['arabic_translation']

        #preparing arabic stop words list
        f = open('resources/arabic_stop_words.txt', 'r',
                 encoding="utf8")  # read stop words from file
        self.stop_words = [
            line.strip() for line in f.readlines()
        ]  # construct list of stop words and remove the newline character

        #preparing punctuations list
        arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        english_punctuations = string.punctuation
        self.all_punctuations = set(arabic_punctuations + english_punctuations)

        # initializing the stemmer
        self.stemmer = ARLSTem()  # requires minimum NLTK version of 3.2.5

        self.arabic_diacritics = re.compile(
            """
                                         ّ    | # Tashdid
                                         َ    | # Fatha
                                         ً    | # Tanwin Fath
                                         ُ    | # Damma
                                         ٌ    | # Tanwin Damm
                                         ِ    | # Kasra
                                         ٍ    | # Tanwin Kasr
                                         ْ    | # Sukun
                                         ـ     # Tatwil/Kashida

                                     """, re.VERBOSE)
Example #8
def clean_sentence(text,
                   positive_lex=read_pos_lex(positive_lex_file_path),
                   negative_lex=read_neg_lex(negative_lex_file_path),
                   stemmer=ARLSTem(),
                   stopwords=custom_stop_words(stop_words_file_path)):
    text = clean_str(text)

    cleaned_sentence = re.sub(r"http\S+", link_word, text)  # link remove

    cleaned_sentence = re.sub(r"@\S+", mention_word,
                              cleaned_sentence)  # mention_replace

    cleaned_sentence = re.sub(r"#\S+", hashtag_word,
                              cleaned_sentence)  # hastag_replacer

    cleaned_sentence = re.sub(r"[A-Z]", "", cleaned_sentence)  # remove_capital

    cleaned_sentence = re.sub(r"[a-z]", "", cleaned_sentence)  # remove_small

    stemmed_sentence = stemmer.stem(cleaned_sentence)  # stemmer

    final = stemmer.norm(
        remove_number(
            remove_punctuations(
                normalization(remove_elongation(stemmed_sentence))))
    )  # removeelnogation #removenuber #normalize_word #remove punc

    final = word_tokenize(final)  # word_tokenize

    output = [w for w in final if not w in stopwords]

    pos_postfix = [positive_word for w in output if w in positive_lex]
    neg_postfix = [negative_word for w in output if w in negative_lex]

    # pos_words = [w for w in output if w in positive_lex]
    # neg_words = [w for w in output if w in negative_lex]
    # print(pos_words)
    # print(neg_words)

    output += pos_postfix
    output += neg_postfix

    textOnly = [w for w in output if match_preserved_word(w) == False]

    return ' '.join(output), ' '.join(textOnly)
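
A hedged call sketch (not part of the original example): it assumes the module-level names used above (read_pos_lex, read_neg_lex, custom_stop_words, the *_file_path constants, link_word, mention_word, hashtag_word, positive_word, negative_word, match_preserved_word, clean_str, and the normalization helpers) are defined elsewhere in the project.
raw = "..."  # placeholder tweet text
with_markers, text_only = clean_sentence(raw)
# with_markers keeps the appended positive/negative lexicon markers;
# text_only additionally drops the tokens that match_preserved_word() flags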
Example #9
 def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
     self.k = k  # number of documents to return
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.docs = docs
     try:
         self.stopwords = stopwords.words('arabic')
     except LookupError:
         nltk.download('stopwords')
         self.stopwords = stopwords.words('arabic')
     self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams),
                                       norm=None,
                                       stop_words=self.stopwords)
     if tfidf_matrix is None or vectorizer is None:
         docs_stemmed = self.docs_stem()
         self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
     else:
         self.vectorizer = vectorizer
         self.tfidf_matrix = tfidf_matrix
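
The fragment above only builds the TF-IDF index; below is a hedged sketch of how a query might be scored against it at retrieval time. The retrieve name is hypothetical, and cosine_similarity / numpy are assumed to be imported as in the other examples.
 def retrieve(self, query):  # hypothetical method, not part of the original fragment
     # stem the query tokens before vectorizing (the exact stemming should mirror docs_stem)
     query_stemmed = " ".join(self.stemmer.stem(t)
                              for t in self.tokenizer.tokenize(query))
     query_vec = self.vectorizer.transform([query_stemmed])
     scores = cosine_similarity(self.tfidf_matrix, query_vec).ravel()
     top_k = np.argsort(scores)[::-1][:self.k]
     return [self.docs[i] for i in top_k]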
Example #10
# preprocessing

# stop words
# f= open("/content/drive/My Drive/Data/ar_stopwords.txt", "r")
# ar_stopwords = f.read().split()

import nltk

nltk.download('stopwords')
# stop words
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
nltk.download('wordnet')

from nltk.stem.arlstem import ARLSTem

stemmmer = ARLSTem()

def remove_stowords(elements):
  corps = []
  for string in elements:
    # string = string.strip()
    string = string.split()
    string = [stemmmer.stem(word) for word in string if not word in arb_stopwords]
    string = ' '.join(string)
    corps.append(string)
  return corps

import os  # needed for os.path.join below

def predict(input_ans, prof_name, template_name):
  model_path = 'static/templates/'
  model_path = model_path + prof_name + '/' + template_name + '/models/'
  model_path = os.path.join(model_path, 'islamic_model.h5')
Example #11
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('stopwords')
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
nltk.download('wordnet')
from textblob import TextBlob
from nltk.stem.arlstem import ARLSTem
stem = ARLSTem()
from static.Shakkala_model.diacritization import diacritization


def removeStopWords(text):
    string = text.strip()
    string = string.split()
    string = [word for word in string if not word in arb_stopwords]
    string = ' '.join(string)
    return string


def diacritize(text):
    return diacritization(text)


def tokenization(text):
    return text.split()


def stemmer(text):
    return stem.stem(text)
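
A short usage sketch (not part of the original example): it assumes the module imported successfully, which requires the NLTK downloads above and the project-local static.Shakkala_model package; the input string is a placeholder.
text = removeStopWords("...")           # placeholder Arabic sentence, stop words removed
tokens = tokenization(text)             # whitespace tokens
stems = [stemmer(t) for t in tokens]    # ARLSTem stem of each token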
Example #12
class Arabic_preprocessing:
    def __init__(self):
        #preparing arabic emojis lexicon
        emojis_lexicon = pd.read_csv(
            'resources/emoji_lexicon.csv',
            header=None,
            names=['emoji', 'utf-8', 'unicode', 'arabic_translation'])
        self.emojis_lexicon_dict = dict()
        for index, row in emojis_lexicon.iterrows():
            self.emojis_lexicon_dict[row['emoji'].replace(
                ' ', '')] = row['arabic_translation']

        #preparing arabic stop words list
        f = open('resources/arabic_stop_words.txt', 'r',
                 encoding="utf8")  # read stop words from file
        self.stop_words = [
            line.strip() for line in f.readlines()
        ]  # construct list of stop words and remove the newline character

        #preparing punctuations list
        arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        english_punctuations = string.punctuation
        self.all_punctuations = set(arabic_punctuations + english_punctuations)

        # initializing the stemmer
        self.stemmer = ARLSTem()  # requires minimum NLTK version of 3.2.5

        self.arabic_diacritics = re.compile(
            """
                                         ّ    | # Tashdid
                                         َ    | # Fatha
                                         ً    | # Tanwin Fath
                                         ُ    | # Damma
                                         ٌ    | # Tanwin Damm
                                         ِ    | # Kasra
                                         ٍ    | # Tanwin Kasr
                                         ْ    | # Sukun
                                         ـ     # Tatwil/Kashida

                                     """, re.VERBOSE)

    def normalize_arabic(self, text):
        text = re.sub("[إأآاٱ]", "ا", text)
        text = re.sub("ى", "ي", text)
        #text = re.sub("ؤ", "ء", text)
        #text = re.sub("ئ", "ء", text)
        text = re.sub("ة", "ه", text)  # replace ta2 marboota by ha2
        text = re.sub("گ", "ك", text)
        text = re.sub("\u0640", '', text)  # remove tatweel
        return text

    def remove_punctuations(self, text):
        for p in self.all_punctuations:
            if p in text:
                text = text.replace(p, '')
        return text

    def remove_diacritics(self, text):
        text = re.sub(self.arabic_diacritics, '', text)
        return text

    def remove_repeating_char(self, text):
        return re.sub(r'(.)\1+', r'\1', text)

    def remove_mention(self, text):
        return re.sub(r'@\S+', '', text)

    def hashtag_match(self, match_object):
        return match_object.group(1).replace('_', ' ')

    def normalize_hashtag(self, text):
        return re.sub(r'#(\S+)', self.hashtag_match, text)

    def emojis_match(self, match_object):
        return ' ' + ' '.join(list(match_object.group(1))) + ' '

    def separate_emojis(self, text):
        emojis_unicode = r'([\U0001F600-\U0001F64F\U00002000-\U00003000]+)'
        return re.sub(emojis_unicode, self.emojis_match, text)

    def replace_emojis(self, text):
        new_text = ""
        for l in text:
            new_text += self.emojis_lexicon_dict[
                l] if l in self.emojis_lexicon_dict.keys() else l
        return new_text

    def remove_english_characters(self, text):
        return re.sub(r'[a-zA-Z]+', '', text)

    def clean_stop_words(self):
        # normalize, and remove diacritics from, stop words to increase the possibility of matching with normalized data
        self.stop_words = [
            self.remove_diacritics(self.normalize_arabic(word))
            for word in self.stop_words
        ]

    def preprocess_arabic_text(self,
                               text,
                               stem=True,
                               replace_emojis=True,
                               normalize_arabic=True):
        self.clean_stop_words()
        text = text.replace('\\n', ' ').replace('\n', ' ')
        text = self.remove_mention(text)
        text = self.normalize_hashtag(text)
        text = self.remove_punctuations(text)
        text = self.remove_diacritics(text)
        if normalize_arabic: text = self.normalize_arabic(text)
        text = self.separate_emojis(text)
        if replace_emojis: text = self.replace_emojis(text)
        text = self.remove_repeating_char(text)
        text = self.remove_english_characters(text)
        words = nltk.word_tokenize(text)
        words = [word for word in words if word not in self.stop_words]
        if stem: words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)  # return sentence (str), not list of words
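
A hedged usage sketch (not part of the original example): it assumes resources/emoji_lexicon.csv and resources/arabic_stop_words.txt exist relative to the working directory and that NLTK's punkt data is available for word_tokenize; the input is a placeholder.
prep = Arabic_preprocessing()
clean = prep.preprocess_arabic_text("...", stem=True, replace_emojis=True)  # placeholder tweet text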
Example #13
 def __init__(self):
     ARLSTem.__init__(self)
Example #14
 def __init__(self):
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = ARLSTem()
     self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
Example #15
class SWDbasline:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'

    def tokenize_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        tokens_stemmed = []
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                tokens_stemmed.append(self.stemmer.stem(token))
        return tokens_stemmed

    def IC(self, w, P):
        # inverse-count information weight of word w in the token list P (base-2 log)
        return log(1 + 1 / self.C(w, P), 2)

    def C(self, w, P):
        # number of occurrences of word w in the token list P
        count = 0
        for word in P:
            if word == w:
                count += 1
        return count

    def sliding_window_helper(self, P, Q, A):
        res = []
        for i in range(0, len(A)):
            S = list(set().union(Q, A[i]))
            cur = 0
            for j in range(0, len(P) - len(S) + 1):
                sum = 0
                for w in range(0, len(S)):
                    if P[j + w] in S:
                        sum += self.IC(P[j + w], P)
                cur = max(cur, sum)
            res.append(cur)
        return res

    def sliding_window(self, P, Q, A):
        return self.sliding_window_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A)

    def dist(self, P, q, a):
        res = len(P) + 1
        for i in range(0, len(P)):
            if P[i] == q or P[i] == a:
                if P[i] == q:
                    a, q = q, a
                index = self.find_after(P, q, i)
                if index != -1:
                    res = min(res, index - i)
        return res

    def find_after(self, L, w, i):
        for j in range(i, len(L)):
            if (L[j] == w):
                return j
        return -1

    def distance_based_helper(self, P, Q, A):
        res = []
        U = set(stopwords.words('arabic')) & set(P)
        SQ = list(set(P) & set(Q) - U)
        for i in range(0, len(A)):
            SA = list(((set(A[i]) & set(P)) - set(Q)) - U)
            d = len(P) + 1
            if (len(SQ) == 0 or len(SA) == 0):
                d = 1
            else:
                for q in SQ:
                    for a in SA:
                        d = min(d, self.dist(P, q, a))
            d *= 1 / (len(P) - 1)
            res.append(d)
        return res

    def distance_based(self, P, Q, A):
        return self.distance_based_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A)

    def argmax(self, l):
        return l.index(max(l))

    def SW(self, P, Q, A):
        return self.argmax(self.sliding_window(P, Q, A))

    def concatenateString(self, paragraph, start, length):
        final_string = paragraph[start]
        for i in range(1, length):
            final_string += " " + paragraph[start + i]
        return final_string

    def get_answer_canditates(self, paragraph):
        candidates = nltk.sent_tokenize(paragraph)
        return candidates

    def read_score(self, P, Q):
        """
        Implemnts SWD algorithm
        :param P: paragraph string
        :param Q: question string
        :return: answer index
        """
        A = self.get_answer_canditates(P)
        ret_sw = self.sliding_window(P, Q, A)
        ret_d = self.distance_based(P, Q, A)
        max_indx = self.argmax([x - y for x, y in zip(ret_sw, ret_d)])
        max_val = max([x - y for x, y in zip(ret_sw, ret_d)])
        return A[max_indx], abs(max_val)

    def read(self, P, Q):
        """
        Implemnts SWD algorithm
        :param P: paragraph string
        :param Q: question string
        :return: answer index
        """
        A = self.get_answer_canditates(P)
        ret_sw = self.sliding_window(P, Q, A)
        ret_d = self.distance_based(P, Q, A)
        return A[self.argmax([x - y for x, y in zip(ret_sw, ret_d)])]
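
A minimal usage sketch (not part of the original example): P and Q are placeholder paragraph/question strings, and NLTK's punkt and Arabic stopwords data must already be downloaded.
reader = SWDbasline()
P = "..."   # placeholder paragraph
Q = "..."   # placeholder question
answer = reader.read(P, Q)               # candidate sentence with the highest SW - D score
answer, score = reader.read_score(P, Q)  # same candidate together with |SW - D|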