Example #1
    class SpellErrors(QMultiTerm):
        """
        query that ignores  the spell errors of arabic letters such as:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms
        """
        def __init__(self, fieldname, text, boost=1.0):
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            self.words = [text]
            self.ASF = QArabicSymbolsFilter(shaping=True,
                                            tashkil=False,
                                            spellerrors=True,
                                            hamza=True)

        def _words(self, ixreader):
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare(self.text, indexed_text):
                        yield indexed_text

        def _compare(self, first, second):
            """ Normalize both terms and compare the results. """
            equiv = (self.ASF.normalize_all(first) ==
                     self.ASF.normalize_all(second))
            if equiv:
                self.words.append(second)
            return equiv
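
How such a query might be plugged into a search — a minimal sketch, assuming QMultiTerm is a whoosh.query.MultiTerm subclass (the _words() hook suggests it) and that an index directory with an "aya" field exists at a hypothetical path:

from whoosh.index import open_dir

ix = open_dir("indexes/main")  # hypothetical index path
with ix.searcher() as searcher:
    # a query spelled with ha' instead of ta' marbuta should still match
    q = SpellErrors("aya", u"رحمه")
    for hit in searcher.search(q, limit=10):
        print hit["aya"]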
Example #2
 def __init__(self, fieldname, text, boost=1.0):
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     ASF = QArabicSymbolsFilter(shaping=False,
                                tashkil=True,
                                spellerrors=False,
                                hamza=False)
     # ``text`` is expected to be an iterable of words here; a plain
     # string would be normalized character by character.
     self.words = [ASF.normalize_all(word) for word in text]
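
What each element looks like after normalization — a small sketch, assuming text is a list of vocalized words; the expected value follows the assertion in Example #6:

words = [u"عاصِمٌ"]
ASF = QArabicSymbolsFilter(shaping=False, tashkil=True,
                           spellerrors=False, hamza=False)
print [ASF.normalize_all(w) for w in words]  # harakat stripped: [u'عاصم']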
Example #3
 def __init__(self, fieldname, text, boost=1.0):
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     self.words = [text]
     self.ASF = QArabicSymbolsFilter(shaping=True,
                                     tashkil=False,
                                     spellerrors=True,
                                     hamza=True)
Example #4
    def transfer_vocalizations(self):
        """ load indexed vocalized words  from the main index and save them as a list in a dynamic py """
        QSE = QuranicSearchEngine(self.__ixpath)

        if QSE.OK:
            mfw = QSE.most_frequent_words(9999999, "aya_")
        else:
            mfw = []

        V = QArabicSymbolsFilter(shaping=False,
                                 tashkil=True,
                                 spellerrors=False,
                                 hamza=False).normalize_all

        vocalization_dict = {}
        for w in mfw:
            word = w[1]
            if V(word) in vocalization_dict:
                vocalization_dict[V(word)].append(word)
            else:
                vocalization_dict[V(word)] = [word]

        raw_str = self.dheader + u"\nvocalization_dict=" + str(
            vocalization_dict).replace(",", ",\n")

        fich = open(self.__dypypath + "vocalizations_dyn.py", "w+")
        fich.write(raw_str)
        fich.close()

        return raw_str
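
A sketch of how the generated module might be consumed afterwards (the bare import path is hypothetical; the real location depends on self.__dypypath):

from vocalizations_dyn import vocalization_dict

# all indexed vocalized forms of an unvocalized word
print vocalization_dict.get(u"عاصم", [])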
Example #5
    def make_spellerrors_dict(self):
        """ make the spell errors dictionary
        @deprecated: forget this!
        """

        D = QseDocIndex()
        R = QReader(D)
        nor = QArabicSymbolsFilter(True, True, True, True).normalize_all
        spell_err = {}
        for term in R.reader.all_terms():
            if term[0] in ["aya"]:
                normalized = nor(term[1])
                if normalized in spell_err:
                    spell_err[normalized].append(term[1])
                else:
                    spell_err[normalized] = [term[1]]

        raw_str = self.dheader + u"\nspell_err=" + str(spell_err)

        fich = open(self.__dypypath + "spellerrors_dyn.py", "w+")
        fich.write(raw_str)
        fich.close()
Example #6
def test_arabic_symbol_filter():
    ASF = QArabicSymbolsFilter()
    assert ASF.normalize_all(u"عاصِمٌ") == u"عاصم"
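
A hedged companion check for the spell-error flags: the docstring in Example #1 says ta' marbuta and ha' are conflated, so both spellings below should normalize to the same string (an assumption, not verified against alfanous):

def test_arabic_symbol_filter_spell_errors():
    ASF = QArabicSymbolsFilter(shaping=True, tashkil=False,
                               spellerrors=True, hamza=True)
    assert ASF.normalize_all(u"رحمة") == ASF.normalize_all(u"رحمه")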
Example #7
    def __init__(self,
                 QC_PATH="../../../store/quranic-corpus-morpology.xml",
                 DB="main.db"):

        import sqlite3

        print "connecting to database ...",
        maindb = sqlite3.connect(DB)
        cur = maindb.cursor()
        print "OK"

        print "creating tables:"
        cur.execute(""" drop table if exists wordQC""")
        cur.execute(""" create table if not exists  wordQC(
                        gid int unique,
                        word_gid int,
                        word_id int,
                        aya_id int,
                        sura_id int,

                        word varchar(25),
                        normalised varchar(25),
                        spelled varchar(25),
                        'order' int,
                        token varchar(25),
                        arabictoken varchar(25),
                        prefixes varchar(25),
                        suffixes varchar(25),


                        pos varchar(25),
                        type varchar(25),
                        arabicpos varchar(25),
                        mood varchar(25),
                        arabicmood varchar(25),
                        'case' varchar(25),
                        arabiccase varchar(25),
                        root varchar(25),
                        arabicroot varchar(25),
                        lemma varchar(25),
                        arabiclemma varchar(25),
                        special varchar(25),
                        arabicspecial varchar(25),

                        derivation varchar(25),
                        form varchar(25),
                        gender varchar(25),
                        person varchar(25),
                        number varchar(25),
                        voice varchar(25),
                        state varchar(25),
                        aspect varchar(25),

                        primary key(gid)

                    )

                    """)
        print ">wordQC table ... OK"

        print ">loading Qurany Corpus...",
        from quran_corpus_reader.main import API as QC
        A = QC(source=QC_PATH)
        print ".OK\n"
        IFEXIST = lambda d, attrib: d[attrib].encode("utf-8") if attrib in d else ""
        gid, word_gid = 0, 0
        print ">inserting values of gid...",
        # the filters never change, so build them once outside the loop
        QASF = QArabicSymbolsFilter(shaping=True,
                                    tashkil=True,
                                    spellerrors=False,
                                    hamza=False,
                                    uthmani_symbols=True)
        QASF_spelled = QArabicSymbolsFilter(shaping=True,
                                            tashkil=True,
                                            spellerrors=True,
                                            hamza=True,
                                            uthmani_symbols=True)
        for iteration in A.all_words_generator():

            QUERY = lambda d, glob: """insert into wordQC(gid, word_gid, word_id, aya_id, sura_id, 'order', token, arabictoken, prefixes, suffixes, type, pos, arabicpos, mood,
                arabicmood, 'case', arabiccase, root, arabicroot, lemma, arabiclemma, special, arabicspecial,
                word, normalised, spelled, derivation, form, gender, person, number, voice, state, aspect) values
                ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s","%(prefixes)s","%(suffixes)s","%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s",
                "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s",
                "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % {
                "gid": gid,
                "word_gid": word_gid,
                "word_id": iteration["word_id"],
                "aya_id": iteration["aya_id"],
                "sura_id": iteration["sura_id"],
                "order": order,
                "token": IFEXIST(d, "token"),
                "arabictoken": IFEXIST(d, "arabictoken"),
                "prefixes": ";".join([prefix["arabictoken"]
                                      for prefix in glob["prefixes"]]).encode("utf-8"),
                "suffixes": ";".join([suffix["arabictoken"]
                                      for suffix in glob["suffixes"]]).encode("utf-8"),
                "type": IFEXIST(d, "type"),
                "pos": IFEXIST(d, "pos"),
                "arabicpos": IFEXIST(d, "arabicpos"),
                "mood": IFEXIST(d, "mood"),
                "arabicmood": IFEXIST(d, "arabicmood"),
                "case": IFEXIST(d, "case"),
                "arabiccase": IFEXIST(d, "arabiccase"),
                "root": IFEXIST(d, "root"),
                "arabicroot": IFEXIST(d, "arabicroot"),
                "lemma": IFEXIST(d, "lemma"),
                "arabiclemma": IFEXIST(d, "arabiclemma"),
                "special": IFEXIST(d, "special"),
                "arabicspecial": IFEXIST(d, "arabicspecial"),
                "word": iteration["word"].encode("utf-8"),
                "normalised": QASF.normalize_all(iteration["word"]).encode("utf-8"),
                "spelled": QASF_spelled.normalize_all(iteration["word"]).encode("utf-8"),
                "derivation": IFEXIST(d, "derivation"),
                "form": IFEXIST(d, "form"),
                "gender": IFEXIST(d, "gender"),
                "person": IFEXIST(d, "person"),
                "number": IFEXIST(d, "number"),
                "voice": IFEXIST(d, "voice"),
                "state": IFEXIST(d, "state"),
                "aspect": IFEXIST(d, "aspect")
            }
            word_gid += 1
            if word_gid % 1000 == 0:
                print word_gid,  # progress marker every 1000 words

            order = 0
            for d in iteration["morphology"]["base"]:
                gid += 1
                order += 1
                cur.execute(QUERY(d, iteration["morphology"]))

        print("OK")
        maindb.commit()
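
Once the table is filled, spelling variants of a word can be looked up through the normalised and spelled columns. A minimal sketch (column names are taken from the schema above; the database path is the constructor default):

import sqlite3

db = sqlite3.connect("main.db")
cur = db.cursor()
cur.execute("select word, arabicroot, arabiclemma from wordQC"
            " where normalised = ?", (u"عاصم",))
for word, root, lemma in cur.fetchall():
    print word, root, lemma
db.close()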
Example #8
# coding: utf-8
"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.text_processing import QArabicSymbolsFilter, unicode_

if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = "عاصِمٌ"
    TEXT = ASF.normalize_all(TEXT)
    print(TEXT)

    WORD1 = unicode_("عَاصِمُ")
    WORD2 = unicode_("عَاصمُ")
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_("فاعل")
    PHRASE = unicode_("كانَ")
    print(WORD3.apply_harakat_list(LIST_HARAKAT1))
    print(LIST_HARAKAT1, "\n", LIST_HARAKAT2)
    print(unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2))
    print(WORD1.shakl_compare(WORD1, WORD2))
    for i in PHRASE.tokenize_shakl():
        print(i, end='')

    WORD4 = unicode_("عاصم")
    WORD5 = unicode_("عاصِم")

    print(WORD4 == WORD5)
Example #9
    def _search_aya(self, flags):
        """
		return the results of aya search as a dictionary data structure
		"""
        # flags
        defaults = self._defaults["flags"]
        query = flags.get("query", defaults["query"])
        sortedby = flags.get("sortedby", defaults["sortedby"])
        range = int(flags["perpage"]) if "perpage" in flags \
            else flags.get("range", defaults["range"])
        ## offset = (page - 1) * perpage  --  paging mode
        offset = ((int(flags["page"]) - 1) * range) + 1 if "page" in flags \
            else int(flags.get("offset", defaults["offset"]))
        recitation = flags.get("recitation", defaults["recitation"])
        translation = flags.get("translation", defaults["translation"])
        romanization = flags.get("romanization", defaults["romanization"])
        highlight = flags.get("highlight", defaults["highlight"])
        script = flags.get("script", defaults["script"])
        vocalized = IS_FLAG(flags, 'vocalized')
        fuzzy = IS_FLAG(flags, 'fuzzy')
        view = flags.get("view", defaults["view"])

        # pre-defined views
        if view == "minimal":
            # fuzzy = True
            # page = 25
            vocalized = False
            recitation = None
            translation = None
            prev_aya = next_aya = False
            sura_info = False
            word_info = False
            word_synonyms = False
            word_derivations = False
            word_vocalizations = False
            aya_position_info = aya_theme_info = aya_sajda_info = False
            aya_stat_info = False
            sura_stat_info = False
            annotation_aya = annotation_word = False
        elif view == "normal":
            prev_aya = next_aya = True
            sura_info = True
            word_info = True
            word_synonyms = False
            word_derivations = True
            word_vocalizations = True
            aya_position_info = aya_theme_info = aya_sajda_info = True
            aya_stat_info = True
            sura_stat_info = False
            annotation_aya = annotation_word = False
        elif view == "full":
            prev_aya = next_aya = True
            sura_info = True
            word_info = True
            word_synonyms = True
            word_derivations = True
            word_vocalizations = True
            aya_position_info = aya_theme_info = aya_sajda_info = True
            aya_stat_info = sura_stat_info = True
            annotation_aya = annotation_word = True
            romanization = "iso"
        elif view == "statistic":
            prev_aya = next_aya = False
            sura_info = True
            word_info = True
            word_synonyms = False
            word_derivations = True
            word_vocalizations = True
            aya_position_info = True
            aya_theme_info = aya_sajda_info = False
            aya_stat_info = True
            sura_stat_info = True
            annotation_aya = False
            annotation_word = False
        elif view == "linguistic":
            prev_aya = next_aya = False
            sura_info = False
            word_info = True
            word_synonyms = True
            word_derivations = True
            word_vocalizations = True
            aya_position_info = False
            aya_theme_info = aya_sajda_info = True
            aya_stat_info = False
            sura_stat_info = False
            annotation_aya = True
            annotation_word = True
            romanization = "buckwalter"
        elif view == "recitation":
            script = "uthmani"
            prev_aya = next_aya = True
            sura_info = True
            word_info = False
            word_synonyms = False
            word_derivations = False
            word_vocalizations = False
            aya_position_info = True
            aya_theme_info = False
            aya_sajda_info = True
            aya_stat_info = False
            sura_stat_info = False
            annotation_aya = False
            annotation_word = False
        else:  # if view == custom or undefined
            prev_aya = IS_FLAG(flags, 'prev_aya')
            next_aya = IS_FLAG(flags, 'next_aya')
            sura_info = IS_FLAG(flags, 'sura_info')
            sura_stat_info = IS_FLAG(flags, 'sura_stat_info')
            word_info = IS_FLAG(flags, 'word_info')
            word_synonyms = IS_FLAG(flags, 'word_synonyms')
            word_derivations = IS_FLAG(flags, 'word_derivations')
            word_vocalizations = IS_FLAG(flags, 'word_vocalizations')

            aya_position_info = IS_FLAG(flags, 'aya_position_info')
            aya_theme_info = IS_FLAG(flags, 'aya_theme_info')
            aya_stat_info = IS_FLAG(flags, 'aya_stat_info')
            aya_sajda_info = IS_FLAG(flags, 'aya_sajda_info')
            annotation_aya = IS_FLAG(flags, 'annotation_aya')
            annotation_word = IS_FLAG(flags, 'annotation_word')

        # preprocess query
        query = query.replace("\\", "")
        if not isinstance(query, unicode):
            query = unicode(query, 'utf8')

        if ":" not in query:
            query = unicode(
                transliterate("buckwalter", query,
                              ignore="'_\"%*?#~[]{}:>+-|"))

        # Search
        SE = self.FQSE if fuzzy else self.QSE
        res, termz, searcher = SE.search_all(
            query, self._defaults["results_limit"]["aya"], sortedby=sortedby)
        terms = [
            term[1] for term in list(termz)[:self._defaults["maxkeywords"]]
        ]
        terms_uthmani = map(STANDARD2UTHMANI, terms)
        # pagination
        offset = 1 if offset < 1 else offset
        range = max(range, self._defaults["minrange"])
        range = min(range, self._defaults["maxrange"])
        interval_end = offset + range - 1
        end = interval_end if interval_end < len(res) else len(res)
        start = offset if offset <= len(res) else -1
        reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
        # TODO: pagination should be done inside the search operation for better performance
        # closing the searcher
        searcher.close()
        output = {}

        ## disable annotations for aya words if there is more than one result
        if annotation_aya and len(res) > 1:
            annotation_aya = False

        ## strip vocalization unless vocalized output was requested
        V = QArabicSymbolsFilter(shaping=False,
                                 tashkil=not vocalized,
                                 spellerrors=False,
                                 hamza=False).normalize_all
        strip_vocalization = QArabicSymbolsFilter(shaping=False,
                                                  tashkil=True,
                                                  spellerrors=False,
                                                  hamza=False).normalize_all
        # highlight function that handles None values and undefined fields
        H = lambda X: self.QSE.highlight(X, terms, highlight) \
            if highlight != "none" and X \
            else X if X else u"-----"
        # Numbers are 0 if not defined
        N = lambda X: X if X else 0
        # parse keyword lists, used for sura names
        kword = re.compile(u"[^,،]+")
        keywords = lambda phrase: kword.findall(phrase)
        ##########################################
        extend_runtime = res.runtime
        # Words & Annotations
        words_output = {"individual": {}}
        if word_info:
            matches = 0
            docs = 0
            nb_vocalizations_globale = 0
            cpt = 1
            annotation_word_query = u"( 0 "
            for term in termz:
                if term[0] == "aya" or term[0] == "aya_":
                    if term[2]:
                        matches += term[2]
                    docs += term[3]
                    if term[0] == "aya_":
                        annotation_word_query += u" OR word:%s " % term[1]
                    else:  # if aya
                        annotation_word_query += u" OR normalized:%s " % STANDARD2UTHMANI(
                            term[1])
                    if word_vocalizations:
                        vocalizations = vocalization_dict.get(
                            strip_vocalization(term[1]), [])
                        nb_vocalizations_globale += len(vocalizations)
                    if word_synonyms:
                        synonyms = syndict.get(term[1], [])
                    derivations_extra = []
                    if word_derivations:
                        lemma = LOCATE(derivedict["word_"],
                                       derivedict["lemma"], term[1])
                        if lemma:  # if not None
                            derivations = FILTER_DOUBLES(
                                FIND(derivedict["lemma"], derivedict["word_"],
                                     lemma))
                        else:
                            derivations = []
                        # go deeper with derivations
                        root = LOCATE(derivedict["word_"], derivedict["root"],
                                      term[1])
                        if root:  # if not None
                            derivations_extra = list(
                                set(
                                    FILTER_DOUBLES(
                                        FIND(derivedict["root"],
                                             derivedict["word_"], root))) -
                                set(derivations))

                    words_output["individual"][cpt] = {
                        "word":
                        term[1],
                        "romanization":
                        transliterate(
                            romanization, term[1], ignore="", reverse=True) if
                        romanization in self.DOMAINS["romanization"] else None,
                        "nb_matches":
                        term[2],
                        "nb_ayas":
                        term[3],
                        "nb_vocalizations":
                        len(vocalizations)
                        if word_vocalizations else 0,  # unneeded
                        "vocalizations":
                        vocalizations if word_vocalizations else [],
                        "nb_synonyms":
                        len(synonyms) if word_synonyms else 0,  # unneeded
                        "synonyms":
                        synonyms if word_synonyms else [],
                        "lemma":
                        lemma if word_derivations else "",
                        "root":
                        root if word_derivations else "",
                        "nb_derivations":
                        len(derivations)
                        if word_derivations else 0,  # unneeded
                        "derivations":
                        derivations if word_derivations else [],
                        "nb_derivations_extra":
                        len(derivations_extra),
                        "derivations_extra":
                        derivations_extra,
                    }
                    cpt += 1
            annotation_word_query += u" ) "
            words_output["global"] = {
                "nb_words": cpt - 1,
                "nb_matches": matches,
                "nb_vocalizations": nb_vocalizations_globale
            }
        output["words"] = words_output
        # single loop that builds the adjacents, translations and annotations queries at the same time
        if prev_aya or next_aya or translation or annotation_aya:
            adja_query = trad_query = annotation_aya_query = u"( 0"

            for r in reslist:
                if prev_aya:
                    adja_query += u" OR gid:%s " % unicode(r["gid"] - 1)
                if next_aya:
                    adja_query += u" OR gid:%s " % unicode(r["gid"] + 1)
                if translation:
                    trad_query += u" OR gid:%s " % unicode(r["gid"])
                if annotation_aya:
                    annotation_aya_query += u" OR  ( aya_id:%s AND  sura_id:%s ) " % (
                        unicode(r["aya_id"]), unicode(r["sura_id"]))

            adja_query += u" )"
            trad_query += u" )" + u" AND id:%s " % unicode(translation)
            annotation_aya_query += u" )"

        # Adjacents
        if prev_aya or next_aya:
            adja_res, searcher = self.QSE.find_extended(adja_query, "gid")
            adja_ayas = {
                0: {
                    "aya_": u"----",
                    "uth_": u"----",
                    "sura": u"---",
                    "aya_id": 0,
                    "sura_arabic": u"---"
                },
                6237: {
                    "aya_": u"----",
                    "uth_": u"----",
                    "sura": u"---",
                    "aya_id": 9999,
                    "sura_arabic": u"---"
                }
            }
            for adja in adja_res:
                adja_ayas[adja["gid"]] = {
                    "aya_": adja["aya_"],
                    "uth_": adja["uth_"],
                    "aya_id": adja["aya_id"],
                    "sura": adja["sura"],
                    "sura_arabic": adja["sura_arabic"]
                }
            # accumulate the runtime once, not once per adjacent aya
            extend_runtime += adja_res.runtime
            searcher.close()

        # translations
        if translation:
            trad_res, searcher = self.TSE.find_extended(trad_query, "gid")
            extend_runtime += trad_res.runtime
            trad_text = {}
            for tr in trad_res:
                trad_text[tr["gid"]] = tr["text"]
            searcher.close()

        # annotations for aya words
        if annotation_aya or (annotation_word and word_info):
            annotation_word_query = annotation_word_query if annotation_word and word_info else u"()"
            annotation_aya_query = annotation_aya_query if annotation_aya else u"()"
            annotation_query = annotation_aya_query + u" OR  " + annotation_word_query
            annot_res, searcher = self.WSE.find_extended(
                annotation_query, "gid")
            extend_runtime += annot_res.runtime
            ## prepare annotations for use
            annotations_by_word = {}
            annotations_by_position = {}
            for annot in annot_res:
                if annotation_word and word_info:
                    if annot["normalized"] in terms_uthmani:
                        annotations_by_word.setdefault(
                            annot["normalized"], {}).setdefault(
                                annot["word"], []).append(annot)
                if annotation_aya:
                    annotations_by_position.setdefault(
                        (annot["sura_id"], annot["aya_id"]),
                        {})[annot["word_id"]] = annot
            searcher.close()

        ## merge word annotations into the word output
        if annotation_word and word_info:
            for cpt in xrange(1, len(output["words"]["individual"]) + 1):
                current_word = STANDARD2UTHMANI(
                    output["words"]["individual"][cpt]["word"])
                if current_word in annotations_by_word:
                    current_word_annotations = annotations_by_word[current_word]
                    output["words"]["individual"][cpt][
                        "annotations"] = current_word_annotations
                    output["words"]["individual"][cpt][
                        "nb_annotations"] = len(current_word_annotations)

        output["runtime"] = round(extend_runtime, 5)
        output["interval"] = {
            "start": start,
            "end": end,
            "total": len(res),
            "page": ((start - 1) / range) + 1,
            "nb_pages": ((len(res) - 1) / range) + 1
        }
        output["translation_info"] = {}
        ### Ayas
        cpt = start - 1
        output["ayas"] = {}
        for r in reslist:
            cpt += 1
            output["ayas"][cpt] = {

                "identifier": {"gid": r["gid"],
                               "aya_id": r["aya_id"],
                               "sura_id": r["sura_id"],
                               "sura_name": keywords(r["sura"])[0],
                               "sura_arabic_name": keywords(r["sura_arabic"])[0],
                               },

                "aya": {
                    "id": r["aya_id"],
                    "text": H(V(r["aya_"])) if script == "standard"
                    else H(r["uth_"]),
                    "text_no_highlight": r["aya"] if script == "standard"
                    else r["uth_"],
                    "translation": trad_text[r["gid"]] if (
                            translation != "None" and translation and trad_text.has_key(r["gid"])) else None,
                    "recitation": None if not recitation or not self._recitations.has_key(recitation) \
                        else u"https://www.everyayah.com/data/" + self._recitations[recitation]["subfolder"].encode(
                        "utf-8") + "/%03d%03d.mp3" % (r["sura_id"], r["aya_id"]),
                    "prev_aya": {
                        "id": adja_ayas[r["gid"] - 1]["aya_id"],
                        "sura": adja_ayas[r["gid"] - 1]["sura"],
                        "sura_arabic": adja_ayas[r["gid"] - 1]["sura_arabic"],
                        "text": V(adja_ayas[r["gid"] - 1]["aya_"]) if script == "standard"
                        else adja_ayas[r["gid"] - 1]["uth_"],
                    } if prev_aya else None
                    ,
                    "next_aya": {
                        "id": adja_ayas[r["gid"] + 1]["aya_id"],
                        "sura": adja_ayas[r["gid"] + 1]["sura"],
                        "sura_arabic": adja_ayas[r["gid"] + 1]["sura_arabic"],
                        "text": V(adja_ayas[r["gid"] + 1]["aya_"]) if script == "standard"
                        else adja_ayas[r["gid"] + 1]["uth_"],
                    } if next_aya else None
                    ,

                },

                "sura": {} if not sura_info
                else {
                    "name": keywords(r["sura"])[0],
                    "arabic_name": keywords(r["sura_arabic"])[0],
                    "english_name": keywords(r["sura_english"])[0],
                    "id": r["sura_id"],
                    "type": r["sura_type"],
                    "arabic_type": r["sura_type_arabic"],
                    "order": r["sura_order"],
                    "ayas": r["s_a"],
                    "stat": {} if not sura_stat_info
                    else {
                        "words": N(r["s_w"]),
                        "godnames": N(r["s_g"]),
                        "letters": N(r["s_l"])
                    }

                },

                "position": {} if not aya_position_info
                else {
                    "manzil": r["manzil"],
                    "juz": r["juz"],
                    "hizb": r["hizb"],
                    "rub": r["rub"] % 4,
                    "page": r["page"],
                    "page_IN": r["page_IN"],
                    "ruku": r["ruku"],
                },

                "theme": {} if not aya_theme_info
                else {
                    "chapter": r["chapter"],
                    "topic": r["topic"],
                    "subtopic": r["subtopic"]
                },

                "stat": {} if not aya_stat_info
                else {
                    "words": N(r["a_w"]),
                    "letters": N(r["a_l"]),
                    "godnames": N(r["a_g"])
                },

                "sajda": {} if not aya_sajda_info
                else {
                    "exist": (r["sajda"] == u"نعم"),
                    "type": r["sajda_type"] if (r["sajda"] == u"نعم") else None,
                    "id": N(r["sajda_id"]) if (r["sajda"] == u"نعم") else None,
                },

                "annotations": {} if not annotation_aya or not annotations_by_position.has_key(
                    (r["sura_id"], r["aya_id"]))
                else annotations_by_position[(r["sura_id"], r["aya_id"])]
            }

        return output
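
How _search_aya might be driven — a minimal sketch (the flag names come from the code above; `engine` and its construction are assumed):

flags = {
    "query": u"الحمد لله",
    "page": 1,
    "perpage": 10,
    "view": "normal",
    "highlight": "none",
    "script": "standard",
}
results = engine._search_aya(flags)  # hypothetical engine instance
for pos, aya in results["ayas"].items():
    print aya["aya"]["text"]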
Example #10
    def _search_word(self, flags):
        """
		return the results of word search as a dictionary data structure
		"""
        # flags
        defaults = self._defaults["flags"]
        query = flags.get("query", defaults["query"])
        sortedby = flags.get("sortedby", defaults["sortedby"])
        range = int(flags["perpage"]) if "perpage" in flags \
            else flags.get("range", defaults["range"])
        offset = ((int(flags["page"]) - 1) * range) + 1 if "page" in flags \
            else int(flags.get("offset", defaults["offset"]))
        romanization = flags.get("romanization", defaults["romanization"])
        highlight = flags.get("highlight", defaults["highlight"])
        script = flags.get("script", defaults["script"])
        vocalized = IS_FLAG(flags, 'vocalized')
        view = flags.get("view", defaults["view"])

        # pre-defined views
        aya = IS_FLAG(flags, 'aya')  # default; "normal", "statistic" etc. would otherwise leave it undefined
        if view == "minimal":
            vocalized = False
            aya = False
        elif view == "normal":
            pass
        elif view == "full":
            romanization = "iso"
            aya = True
        elif view == "statistic":
            pass
        elif view == "linguistic":
            romanization = "buckwalter"
        elif view == "recitation":
            script = "uthmani"
        # preprocess query
        query = query.replace("\\", "")
        if not isinstance(query, unicode):
            query = unicode(query, 'utf8')

        if ":" not in query:
            query = unicode(
                transliterate("buckwalter", query,
                              ignore="'_\"%*?#~[]{}:>+-|"))

        # Search
        SE = self.WSE
        res, termz, searcher = SE.search_all(
            query, self._defaults["results_limit"]["word"], sortedby=sortedby)
        terms = [
            term[1] for term in list(termz)[:self._defaults["maxkeywords"]]
        ]

        # pagination
        offset = 1 if offset < 1 else offset
        range = max(range, self._defaults["minrange"])
        range = min(range, self._defaults["maxrange"])
        interval_end = offset + range - 1
        end = interval_end if interval_end < len(res) else len(res)
        start = offset if offset <= len(res) else -1
        reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
        # closing the searcher
        searcher.close()

        output = {}

        ## strip vocalization unless vocalized output was requested
        V = QArabicSymbolsFilter(shaping=True,
                                 tashkil=not vocalized,
                                 spellerrors=False,
                                 hamza=False,
                                 uthmani_symbols=True).normalize_all
        V_shadda = QArabicSymbolsFilter(shaping=False,
                                        tashkil=False,
                                        spellerrors=False,
                                        hamza=False,
                                        shadda=True,
                                        uthmani_symbols=True).normalize_all
        # highlight function that handles None values and undefined fields
        H = lambda X: SE.highlight(X, terms, highlight) \
            if highlight != "none" and X \
            else X if X else u"-----"
        extend_runtime = res.runtime
        # Words & Annotations
        words_output = {"individual": {}}
        if True:
            matches = 0
            docs = 0
            cpt = 1
            for term in termz:
                if True:  # term[0] == "normalized" or term[0] == "word":
                    if term[2]:
                        matches += term[2]
                    docs += term[3]
                    words_output["individual"][cpt] = {
                        "field":
                        term[0],
                        "word":
                        term[1],
                        "romanization":
                        transliterate(
                            romanization, term[1], ignore="", reverse=True) if
                        romanization in self.DOMAINS["romanization"] else None,
                        "nb_matches":
                        term[2],
                        "nb_docs":
                        term[3],
                    }
                    cpt += 1
            words_output["global"] = {
                "nb_words": cpt - 1,
                "nb_matches": matches
            }
        output["keywords"] = words_output

        # single loop that builds the aya lookup query
        if aya:
            aya_query = u"( 0"
            for r in reslist:
                aya_query += u" OR ( sura_id:%s AND aya_id:%s )  " % (
                    unicode(r["sura_id"]), unicode(r["aya_id"]))
            aya_query += u" )"

        # original ayas
        if aya:
            aya_res, searcher = self.QSE.find_extended(aya_query, "gid")

            extend_runtime += aya_res.runtime
            aya_info = {}
            for ay in aya_res:
                if ay["sura_id"] in aya_info:
                    aya_info[ay["sura_id"]][ay["aya_id"]] = ay
                else:
                    aya_info[ay["sura_id"]] = {ay["aya_id"]: ay}
            searcher.close()

        output["runtime"] = round(extend_runtime, 5)
        output["interval"] = {
            "start": start,
            "end": end,
            "total": len(res),
            "page": ((start - 1) / range) + 1,
            "nb_pages": ((len(res) - 1) / range) + 1
        }
        ### Words
        cpt = start - 1
        output["words"] = {}
        for r in reslist:
            cpt += 1
            output["words"][cpt] = {
                "identifier": {
                    "gid": r["gid"],
                    "word_gid": r["word_gid"],
                    "aya_id": r["aya_id"],
                    "sura_id": r["sura_id"],
                    "word_id": r["word_id"],
                },

                "word": {
                    "text": r["word"],
                    "part": u"جذع",
                    "part_order": r["order"],
                    "token": r["arabictoken"],
                    "prefixes": r["prefix"],
                    "suffixes": r["suffix"],
                    "POS": {
                        "english": r["pos"],
                        "arabic": r["arabicpos"],
                    },
                    "mood": {
                        "english": r["mood"],
                        "arabic": r["arabicmood"],
                    },
                    "case": {
                        "english": r["case"],
                        "arabic": r["arabiccase"],
                    },
                    "root": {
                        # "english": r["root"],
                        "arabic": r["arabicroot"],
                    },
                    "lemma": {
                        # "english": r["lemma"],
                        "arabic": r["arabiclemma"],
                    },

                    "special": {
                        # "english": r["special"],
                        "arabic": r["arabicspecial"],
                    },
                    "derivation": r["derivation"],
                    "form": r["form"],
                    "gender": r["gender"],
                    "person": r["person"],
                    "number": r["number"],
                    "voice": r["voice"],
                    "state": r["state"],
                    "aspect": r["aspect"],
                },
                "aya": None if not aya \
                    else {
                    "text": SE.highlight(aya_info[r["sura_id"]][r["aya_id"]]["uth_"], [r["word"]], highlight, False),
                    "aya_id": aya_info[r["sura_id"]][r["aya_id"]]["aya_id"],
                    "sura_name": aya_info[r["sura_id"]][r["aya_id"]]["sura"],
                    "sura_arabic_name": aya_info[r["sura_id"]][r["aya_id"]]["sura_arabic"],

                },
            }
        return output
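
The word search follows the same pattern — a sketch under the same assumptions (output keys are taken from the structure built above):

flags = {"query": u"عاصم", "view": "full", "perpage": 5}
results = engine._search_word(flags)  # hypothetical engine instance
for pos, w in results["words"].items():
    print w["word"]["text"], w["word"]["root"]["arabic"]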