Example #1
import numpy as np
from gensim.models import KeyedVectors
from nltk.stem.arlstem import ARLSTem
from nltk.tokenize import WordPunctTokenizer


class fastTextEmbedder:
    def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path,
                                                       encoding='utf-8',
                                                       unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab  # gensim < 4.0 API; with gensim >= 4.0 use self.model.key_to_index

    def tokenize_string(self, str):
        """
        :param str: string sentence
        :return: list of (token, stem) pairs; tokens containing punctuation symbols are dropped
        """
        str_tokens = self.tokenizer.tokenize(str)
        tokens_stemmed = []
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                tokens_stemmed.append((token, self.stemmer.stem(token)))
        return tokens_stemmed

    def embed_tokens(self, sent, max_len):
        """
        :param sent: string sentence
        :param max_len: maximum number of token embeddings to keep
        :return: (max_len, 300) matrix of token embeddings, zero-padded at the end
        """
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros((max_len, 300))
        j = 0  # next free row in the embedding matrix
        for i in range(min(len(sent_tokens), max_len)):
            # prefer the surface token; fall back to its stem if the token is out of vocabulary
            if sent_tokens[i][0] in self.vocab:
                embedding[j] = self.model[sent_tokens[i][0]]
                j += 1
            elif sent_tokens[i][1] in self.vocab:
                embedding[j] = self.model[sent_tokens[i][1]]
                j += 1
        return embedding

    def embed(self, sent):
        """
        :param sent: string sentence
        :return: sum of the token embeddings, as an np vector of dim=300
        """
        sent_tokens = self.tokenize_string(sent)
        embedding = np.zeros(300)
        for token in sent_tokens:
            if token[0] in self.vocab:
                embedding += self.model[token[0]]
            elif token[1] in self.vocab:
                embedding += self.model[token[1]]
        return embedding
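
A minimal usage sketch for Example #1 (not part of the original source): "cc.ar.300.vec" stands in for any 300-dimensional word2vec-format vector file, and the Arabic sentence is an illustrative placeholder.

if __name__ == "__main__":
    embedder = fastTextEmbedder("cc.ar.300.vec")         # placeholder path to pretrained vectors
    sentence = "يحب محمد القراءة"                          # "Mohammed loves reading"
    sentence_vec = embedder.embed(sentence)               # (300,) sum of token vectors
    token_matrix = embedder.embed_tokens(sentence, 50)    # (50, 300) zero-padded per-token matrix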
Example #2
import numpy as np
import nltk
from nltk.stem.arlstem import ARLSTem
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class countReader:
    SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'

    def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        # self.stopwords = stopwords.words('arabic')
        with open("stopwords-ur.txt", encoding="utf-8") as f:
            self.stopwords = [line.lower() for line in f.read().splitlines()]
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 4))  # , stop_words=self.stopwords)
        self.count_matrix = self.vectorizer.fit_transform(docs_stem)

    def stem_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        str_processed = ""
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                str_processed += token + " " + self.stemmer.stem(token) + " "
        return str_processed

    def concatenateString(self, paragraph, start, length):
        final_string = paragraph[start]
        for i in range(1, length):
            final_string += " " + paragraph[start + i]
        return final_string

    def get_answer_canditates(self, paragraph):
        # enumerate every word span of up to 14 words within each sentence as an answer candidate
        para_sents = nltk.sent_tokenize(paragraph)
        candidates = []
        for sent in para_sents:
            para_words = sent.split()
            for i in range(0, len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidate = self.concatenateString(para_words, i, j)
                    candidates.append(candidate)
        return candidates

    def read(self, P, Q):
        """
        :param P: paragraph string (unused here; candidates come from the paragraph given to __init__)
        :param Q: question string
        :return: the stored candidate span most similar to the question under the n-gram count model
        """
        Q = self.stem_string(Q)
        query_vec = self.vectorizer.transform([Q])
        similarities = cosine_similarity(self.count_matrix, query_vec).ravel()
        max_index = np.argmax(similarities)
        return self.docs[max_index]
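
A minimal usage sketch for Example #2 (not part of the original source): it assumes "stopwords-ur.txt" exists in the working directory, and the Arabic strings are illustrative placeholders.

if __name__ == "__main__":
    paragraph = "ذهب محمد إلى المدرسة. يحب محمد القراءة."   # "Mohammed went to school. Mohammed loves reading."
    question = "ماذا يحب محمد؟"                              # "What does Mohammed love?"
    reader = countReader(paragraph)             # enumerates candidate spans and builds the n-gram count matrix
    print(reader.read(paragraph, question))     # prints the candidate span most similar to the question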
Example #3
import re
import string
from math import log

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.arlstem import ARLSTem
from nltk.tokenize import WordPunctTokenizer


class Arabic_preprocessing:
    def __init__(self):
        #preparing arabic emojis lexicon
        emojis_lexicon = pd.read_csv(
            'resources/emoji_lexicon.csv',
            header=None,
            names=['emoji', 'utf-8', 'unicode', 'arabic_translation'])
        self.emojis_lexicon_dict = dict()
        for index, row in emojis_lexicon.iterrows():
            self.emojis_lexicon_dict[row['emoji'].replace(
                ' ', '')] = row['arabic_translation']

        #preparing arabic stop words list
        with open('resources/arabic_stop_words.txt', 'r',
                  encoding="utf8") as f:  # read stop words from file
            self.stop_words = [
                line.strip() for line in f.readlines()
            ]  # construct list of stop words and remove the newline character

        #preparing punctuations list
        arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        english_punctuations = string.punctuation
        self.all_punctuations = set(arabic_punctuations + english_punctuations)

        # initializing the stemmer
        self.stemmer = ARLSTem()  # requires minimum NLTK version of 3.2.5

        self.arabic_diacritics = re.compile(
            """
                                         ّ    | # Tashdid
                                         َ    | # Fatha
                                         ً    | # Tanwin Fath
                                         ُ    | # Damma
                                         ٌ    | # Tanwin Damm
                                         ِ    | # Kasra
                                         ٍ    | # Tanwin Kasr
                                         ْ    | # Sukun
                                         ـ     # Tatwil/Kashida

                                     """, re.VERBOSE)

    def normalize_arabic(self, text):
        text = re.sub("[إأآاٱ]", "ا", text)
        text = re.sub("ى", "ي", text)
        #text = re.sub("ؤ", "ء", text)
        #text = re.sub("ئ", "ء", text)
        text = re.sub("ة", "ه", text)  # replace ta2 marboota by ha2
        text = re.sub("گ", "ك", text)
        text = re.sub("\u0640", '', text)  # remove tatweel
        return text

    def remove_punctuations(self, text):
        for p in self.all_punctuations:
            if p in text:
                text = text.replace(p, '')
        return text

    def remove_diacritics(self, text):
        text = re.sub(self.arabic_diacritics, '', text)
        return text

    def remove_repeating_char(self, text):
        return re.sub(r'(.)\1+', r'\1', text)

    def remove_mention(self, text):
        return re.sub(r'@\S+', '', text)

    def hashtag_match(self, match_object):
        return match_object.group(1).replace('_', ' ')

    def normalize_hashtag(self, text):
        return re.sub(r'#(\S+)', self.hashtag_match, text)

    def emojis_match(self, match_object):
        return ' ' + ' '.join(list(match_object.group(1))) + ' '

    def separate_emojis(self, text):
        emojis_unicode = r'([\U0001F600-\U0001F64F\U00002000-\U00003000]+)'
        return re.sub(emojis_unicode, self.emojis_match, text)

    def replace_emojis(self, text):
        # replace each emoji character with its Arabic translation from the lexicon, if one exists
        new_text = ""
        for char in text:
            new_text += self.emojis_lexicon_dict.get(char, char)
        return new_text

    def remove_english_characters(self, text):
        return re.sub(r'[a-zA-Z]+', '', text)

    def clean_stop_words(self):
        # normalize, and remove diacritics from, stop words to increase the possibility of matching with normalized data
        self.stop_words = [
            self.remove_diacritics(self.normalize_arabic(word))
            for word in self.stop_words
        ]

    def preprocess_arabic_text(self,
                               text,
                               stem=True,
                               replace_emojis=True,
                               normalize_arabic=True):
        self.clean_stop_words()
        text = text.replace('\\n', ' ').replace('\n', ' ')
        text = self.remove_mention(text)
        text = self.normalize_hashtag(text)
        text = self.remove_punctuations(text)
        text = self.remove_diacritics(text)
        if normalize_arabic: text = self.normalize_arabic(text)
        text = self.separate_emojis(text)
        if replace_emojis: text = self.replace_emojis(text)
        text = self.remove_repeating_char(text)
        text = self.remove_english_characters(text)
        words = nltk.word_tokenize(text)
        words = [word for word in words if word not in self.stop_words]
        if stem: words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)  # return sentence (str), not list of words


class SWDbasline:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'

    def tokenize_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        tokens_stemmed = []
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                tokens_stemmed.append(self.stemmer.stem(token))
        return tokens_stemmed

    def IC(self, w, P):
        # inverse-count weighting: words that are rarer in the passage get a higher weight
        return log(1 + 1 / self.C(w, P), 2)

    def C(self, w, P):
        # number of occurrences of word w in the token list P
        count = 0
        for word in P:
            if word == w:
                count += 1
        return count

    def sliding_window_helper(self, P, Q, A):
        # for each candidate answer, slide a window of |Q union A[i]| tokens over P and keep
        # the largest sum of IC weights of window tokens that also appear in Q union A[i]
        res = []
        for i in range(len(A)):
            S = list(set().union(Q, A[i]))
            cur = 0
            for j in range(len(P) - len(S) + 1):
                window_score = 0
                for w in range(len(S)):
                    if P[j + w] in S:
                        window_score += self.IC(P[j + w], P)
                cur = max(cur, window_score)
            res.append(cur)
        return res

    def sliding_window(self, P, Q, A):
        # tokenize the candidates too, so each A[i] is a token list rather than a raw string
        A_tokens = [self.tokenize_string(a) for a in A]
        return self.sliding_window_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A_tokens)

    def dist(self, P, q, a):
        # minimal token distance in P between an occurrence of q and an occurrence of a
        res = len(P) + 1
        for i in range(len(P)):
            if P[i] == q or P[i] == a:
                if P[i] == q:
                    a, q = q, a  # make q the word that still needs to be found
                index = self.find_after(P, q, i)
                if index != -1:
                    res = min(res, index - i)
        return res

    def find_after(self, L, w, i):
        for j in range(i, len(L)):
            if (L[j] == w):
                return j
        return -1

    def distance_based_helper(self, P, Q, A):
        # for each candidate, compute the normalized minimal distance in P between a question
        # word and a candidate-only word (1 before normalization when no such pair exists)
        res = []
        U = set(stopwords.words('arabic')) & set(P)   # stop words that occur in the passage
        SQ = list(set(P) & set(Q) - U)                # question words that occur in the passage
        for i in range(len(A)):
            SA = list(((set(A[i]) & set(P)) - set(Q)) - U)  # candidate-only words in the passage
            d = len(P) + 1
            if len(SQ) == 0 or len(SA) == 0:
                d = 1
            else:
                for q in SQ:
                    for a in SA:
                        d = min(d, self.dist(P, q, a))
            d *= 1 / (len(P) - 1)  # normalize by passage length
            res.append(d)
        return res

    def distance_based(self, P, Q, A):
        # tokenize the candidates too, so each A[i] is a token list rather than a raw string
        A_tokens = [self.tokenize_string(a) for a in A]
        return self.distance_based_helper(self.tokenize_string(P),
                                          self.tokenize_string(Q), A_tokens)

    def argmax(self, l):
        return l.index(max(l))

    def SW(self, P, Q, A):
        return self.argmax(self.sliding_window(P, Q, A))

    def concatenateString(self, paragraph, start, length):
        final_string = paragraph[start]
        for i in range(1, length):
            final_string += " " + paragraph[start + i]
        return final_string

    def get_answer_canditates(self, paragraph):
        candidates = nltk.sent_tokenize(paragraph)
        return candidates

    def read_score(self, P, Q):
        """
        Implements the SWD (sliding-window + distance) algorithm
        :param P: paragraph string
        :param Q: question string
        :return: (best answer sentence, its SWD score)
        """
        A = self.get_answer_canditates(P)
        ret_sw = self.sliding_window(P, Q, A)
        ret_d = self.distance_based(P, Q, A)
        scores = [x - y for x, y in zip(ret_sw, ret_d)]
        max_indx = self.argmax(scores)
        return A[max_indx], abs(scores[max_indx])

    def read(self, P, Q):
        """
        Implements the SWD (sliding-window + distance) algorithm
        :param P: paragraph string
        :param Q: question string
        :return: the best-scoring answer sentence
        """
        A = self.get_answer_canditates(P)
        ret_sw = self.sliding_window(P, Q, A)
        ret_d = self.distance_based(P, Q, A)
        return A[self.argmax([x - y for x, y in zip(ret_sw, ret_d)])]
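
A minimal usage sketch for the SWD baseline in Example #3 (not part of the original source): the Arabic paragraph and question are illustrative placeholders, and NLTK's punkt models and Arabic stop word list are assumed to be downloaded.

if __name__ == "__main__":
    reader = SWDbasline()
    paragraph = "ذهب محمد إلى المدرسة. يحب محمد القراءة."   # "Mohammed went to school. Mohammed loves reading."
    question = "ماذا يحب محمد؟"                              # "What does Mohammed love?"
    answer, score = reader.read_score(paragraph, question)
    print(answer, score)    # best-scoring sentence of the paragraph and its SWD score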