Пример #1
0
    class SpellErrors(QMultiTerm):
        """
        Query that ignores common spelling errors of Arabic letters, such as:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms

        An indexed term matches when it equals the query text after both have
        been passed through ``QArabicSymbolsFilter.normalize_all``.
        """

        def __init__(self, fieldname, text, boost=1.0):
            """
            Store the query parameters and build the normalizing filter.

            :param fieldname: name of the field whose terms are scanned.
            :param text: the query word to match against the indexed terms.
            :param boost: kept on the instance as ``self.boost``.
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            # Starts with the query word itself; _compare() appends every
            # indexed term it finds equivalent.
            self.words = [text]
            self.ASF = QArabicSymbolsFilter(shaping=True,
                                            tashkil=False,
                                            spellerrors=True,
                                            hamza=True)

        def _words(self, ixreader):
            """
            Yield each term of ``self.fieldname`` returned by
            ``ixreader.all_terms()`` that is equivalent to ``self.text``.
            """
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare(self.text, indexed_text):
                        yield indexed_text

        def _compare(self, first, second):
            """
            Normalize both strings and compare them.

            Side effect: when they are equivalent, ``second`` is appended to
            ``self.words``.

            :return: True when the normalized forms are equal.
            """
            # The stray debug print that used to live here was removed: it
            # wrote to stdout in the middle of query expansion.
            eqiv = (self.ASF.normalize_all(first) == self.ASF.normalize_all(
                second))
            if eqiv:
                self.words.append(second)
            return eqiv
Пример #2
0
    class SpellErrors( QMultiTerm ):
        """
        Query that ignores common spelling errors of Arabic letters:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms

        An indexed term matches when it equals the query text after both have
        been passed through ``QArabicSymbolsFilter.normalize_all``.
        """


        def __init__( self, fieldname, text, boost = 1.0 ):
            """
            Store the query parameters and build the normalizing filter.

            :param fieldname: name of the field whose terms are scanned.
            :param text: the query word to match against the indexed terms.
            :param boost: kept on the instance as ``self.boost``.
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            # Starts with the query word itself; _compare() appends every
            # indexed term it finds equivalent.
            self.words = [text]
            self.ASF = QArabicSymbolsFilter( shaping = True, tashkil = False, spellerrors = True, hamza = True )


        def _words( self, ixreader ):
            """
            Yield each term of ``self.fieldname`` returned by
            ``ixreader.all_terms()`` that is equivalent to ``self.text``.
            """
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare( self.text, indexed_text ):
                        yield indexed_text

        def _compare( self, first, second ):
            """
            Normalize both strings and compare them.

            Side effect: when they are equivalent, ``second`` is appended to
            ``self.words``.

            :return: True when the normalized forms are equal.
            """
            # The stray debug print that used to live here was removed: it
            # wrote to stdout in the middle of query expansion.
            eqiv = ( self.ASF.normalize_all( first ) == self.ASF.normalize_all( second ) )
            if eqiv:
                self.words.append( second )
            return eqiv
Пример #3
0
 def __init__(self, fieldname, text, boost=1.0):
     """
     Keep the query parameters and store a normalized copy of *text*.

     Each element obtained by iterating *text* is passed through
     ``QArabicSymbolsFilter.normalize_all`` and the results are kept, in
     order, in ``self.words``.

     :param fieldname: stored as ``self.fieldname``.
     :param text: iterable whose elements are normalized one by one.
     :param boost: stored as ``self.boost``.
     """
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     filter_options = dict(shaping=False,
                           tashkil=True,
                           spellerrors=False,
                           hamza=False)
     symbols_filter = QArabicSymbolsFilter(**filter_options)
     normalized = []
     for entry in text:
         normalized.append(symbols_filter.normalize_all(entry))
     self.words = normalized
Пример #4
0
        def __init__( self, fieldname, text, boost = 1.0 ):
            """
            Keep the query parameters and store a normalized copy of *text*.

            Each element obtained by iterating *text* is passed through
            ``QArabicSymbolsFilter.normalize_all`` and the results are kept,
            in order, in ``self.words``.

            :param fieldname: stored as ``self.fieldname``.
            :param text: iterable whose elements are normalized one by one.
            :param boost: stored as ``self.boost``.
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            filter_options = dict( shaping = False,
                                   tashkil = True,
                                   spellerrors = False,
                                   hamza = False )
            symbols_filter = QArabicSymbolsFilter( **filter_options )
            normalized = []
            for entry in text:
                normalized.append( symbols_filter.normalize_all( entry ) )
            self.words = normalized
Пример #5
0
# coding: utf-8
"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_

if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all(TEXT)
    print TEXT

    WORD1 = unicode_(u"عَاصِمُ")
    WORD2 = unicode_(u"عَاصمُ")
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_(u"فاعل")
    PHRASE = unicode_(u"كانَ")
    print WORD3.apply_harakat_list(LIST_HARAKAT1)
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2)
    print WORD1.shakl_compare(WORD1, WORD2)
    for i in PHRASE.tokenize_shakl():
        print i,

    WORD4 = unicode_(u"عاصم")
    WORD5 = unicode_(u"عاصِم")

    print WORD4 == WORD5
Пример #6
0
# coding: utf-8

"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_


if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all( TEXT )
    print TEXT

    WORD1 = unicode_( u"عَاصِمُ" )
    WORD2 = unicode_( u"عَاصمُ" )
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_( u"فاعل" )
    PHRASE = unicode_( u"كانَ" )
    print WORD3.apply_harakat_list( LIST_HARAKAT1 )
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat( LIST_HARAKAT1, LIST_HARAKAT2 )
    print WORD1.shakl_compare( WORD1, WORD2 )
    for i in PHRASE.tokenize_shakl():
        print i,
    
    WORD4 = unicode_( u"عاصم" )
    WORD5 = unicode_( u"عاصِم" )
Пример #7
0
    def __init__( self, QC_PATH = "../../store/quranic-corpus-morpology.xml", DB = "main.db" ):
        """ make word table """

        import sqlite3

        print "connecting to database ...",
        maindb = sqlite3.connect( DB )
        cur = maindb.cursor()
        print "OK"

        print "creating tables:"
	cur.execute( """ drop table if exists wordQC""" )
        cur.execute( 
                        """ create table if not exists  wordQC(
                        gid int unique,
                        word_gid int,
                        word_id int,
                        aya_id int,
                        sura_id int,

                        word varchar(25),
                        normalised varchar(25),
                        spelled varchar(25),
                        'order' int,
                        token varchar(25),
                        arabictoken varchar(25),
                        prefixes varchar(25),
                        suffixes varchar(25),


                        pos varchar(25),
                        type varchar(25),
                        arabicpos varchar(25),
                        mood varchar(25),
                        arabicmood varchar(25),
                        'case' varchar(25),
                        arabiccase varchar(25),
                        root varchar(25),
                        arabicroot varchar(25),
                        lemma varchar(25),
                        arabiclemma varchar(25),
                        special varchar(25),
                        arabicspecial varchar(25),

                        derivation varchar(25),
                        form varchar(25),
                        gender varchar(25),
                        person varchar(25),
                        number varchar(25),
                        voice varchar(25),
                        state varchar(25),
                        aspect varchar(25),

                        primary key(gid)

                    )

                    """ )
        print ">wordQC table ... OK"


        print ">loading Qurany Corpus...",
        from PyCorpus.QuranyCorpus import API as QC
        A = QC( source = QC_PATH )
        print ".OK\n"
        IFEXIST = lambda d, attrib: d[attrib].encode( "utf-8" ) if attrib in d else ""
        gid, word_gid = 0, 0
        print ">inserting values of gid...",
        for iteration in A.all_words_generator():
            QASF = QArabicSymbolsFilter( shaping = True, 
                                         tashkil = True, 
                                         spellerrors = False, 
                                         hamza = False, 
                                         uthmani_symbols = True )
            QASF_spelled = QArabicSymbolsFilter( shaping = True, 
                                                 tashkil = True, 
                                                 spellerrors = True, 
                                                 hamza = True, 
                                                 uthmani_symbols = True
                                                 )

            QUERY = lambda d, glob: """insert into wordQC(gid,word_gid,word_id,aya_id,sura_id,'order',token,arabictoken,prefixes, suffixes,type,pos,arabicpos,mood,
                arabicmood, 'case', arabiccase, root ,arabicroot, lemma ,arabiclemma, special, arabicspecial,
                word,normalised,spelled, derivation, form ,gender, person, number,voice, state, aspect) values
                ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s", "%(prefixes)s", "%(suffixes)s",  "%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s",
                "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s",
                "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % {
										    "gid":gid,
										    "word_gid":word_gid,
										    "word_id":iteration["word_id"],
										    "aya_id":iteration["aya_id"],
										    "sura_id":iteration["sura_id"],
										    "order":order,
										    "token":IFEXIST( d, "token" ),
										    "arabictoken":IFEXIST( d, "arabictoken" ),
										    "prefixes":";".join([prefix["arabictoken"] for prefix in glob["prefixes"] ]).encode( "utf-8" ),
										    "suffixes":";".join([suffix["arabictoken"] for suffix in glob["suffixes"] ]).encode( "utf-8" ),
										    "type":IFEXIST( d, "type" ),
										    "pos":IFEXIST( d, "pos" ),
										    "arabicpos":IFEXIST( d, "arabicpos" ),
										    "mood":IFEXIST( d, "mood" ),
										    "arabicmood":IFEXIST( d, "arabicmood" ),
										    "case":IFEXIST( d, "case" ),
										    "arabiccase":IFEXIST( d, "arabiccase" ),
										    "root":IFEXIST( d, "root" ),
										    "arabicroot":IFEXIST( d, "arabicroot" ),
										    "lemma":IFEXIST( d, "lemma" ),
										    "arabiclemma":IFEXIST( d, "arabiclemma" ),
										    "special":IFEXIST( d, "special" ),
										    "arabicspecial":IFEXIST( d, "arabicspecial" ),
										    "word":iteration["word"].encode( "utf-8" ),
										    "normalised":  QASF.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "spelled": QASF_spelled.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "derivation":IFEXIST( d, "derivation" ),
										    "form":IFEXIST( d, "form" ),
										    "gender":IFEXIST( d, "gender" ),
										    "person":IFEXIST( d, "person" ),
										    "number":IFEXIST( d, "number" ),
										    "voice":IFEXIST( d, "voice" ),
										    "state":IFEXIST( d, "state" ),
										    "aspect":IFEXIST( d, "aspect" )
										    }
            word_gid += 1
            if word_gid % 1000 == 0:
                print word_gid,
            print("\n")

            order = 0
            for d in iteration["morphology"]["base"]:
                gid += 1
                order += 1
                cur.execute( QUERY( d, iteration["morphology"] ) )

        print("OK")
        maindb.commit()