Пример #1
0
def show_collocations(text):
    """
    Show collocations found in the text.
    The collocations are looked up from a database extracted from a corpus.
    Every run of collocation words is wrapped in a balanced
    <mark class='coll'>...</mark> pair; each word is followed by a space.
    @param text: a given text.
    @type text: unicode.
    @return: the text with collocations quoted
    @rtype: unicode
    """
    import maskouk.collocations as colloc
    coll = colloc.CollocationClass(True)
    # long collocations are handled on the raw text before tokenizing
    text = coll.lookup4long_collocations(text)
    wordlist = araby.tokenize(text)
    vocalized_list, taglist = coll.lookup(wordlist)
    pieces = []
    opened = False
    for word, tag in zip(vocalized_list, taglist):
        # "CB"/"CI" are the collocation tags (apparently begin/inside)
        in_collocation = tag in ("CB", "CI")
        # close the mark when leaving a collocation, or when a new
        # collocation begins immediately after the previous one
        if opened and (not in_collocation or tag == "CB"):
            pieces.append("</mark>")
            opened = False
        if in_collocation and not opened:
            pieces.append("<mark class='coll'>")
            opened = True
        pieces.append(word + " ")
    # a collocation ending the text must not leave its mark open
    if opened:
        pieces.append("</mark>")
    return u"".join(pieces)
Пример #2
0
    def test_lookup4long(self):
        """####Detect long collocations in a phrase

        The phrase is passed to lookup4long_collocations and the result is
        compared with the expected, partly vocalized phrase.
        """
        collocations = msk.CollocationClass()
        phrase = u' قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت'
        expected = u' قلت لهم السّلامُ عَلَيكُمْ وَرَحْمَةُ اللهِ تَعَالَى وبركاته ثم رجعت'
        found = collocations.lookup4long_collocations(phrase)
        self.assertEqual(found, expected)
Пример #3
0
 def test_ngramfinder(self):
     """####Detect collocation in a phrase

     The tokenized phrase is passed to ngramfinder with n=2; the expected
     result keeps the two-word collocations as single items.
     """
     finder = msk.CollocationClass()
     tokens = araby.tokenize(u"لعبنا مباراة كرة القدم في بيت المقدس")
     expected = ['لعبنا', 'مباراة', 'كرة القدم', 'في', 'بيت المقدس']
     found = finder.ngramfinder(2, tokens)
     self.assertEqual(found, expected)
Пример #4
0
    def test_is_possible_collocation(self):
        """####Detect candidate collocations in phrase

        The cases are the consecutive word pairs of one sample sentence
        (the first pair joins the last word to the first one), each with
        the value is_possible_collocation is expected to return for it.
        """
        mydict = msk.CollocationClass()
        cases = (
            (['السباق', 'ظهر'], 100),
            (['ظهر', 'رئيس'], 100),
            (['رئيس', 'الوزراء'], 100),
            (['الوزراء', 'السيد'], 20),
            (['السيد', 'عبد'], 100),
            (['عبد', 'الملك'], 15),
            (['الملك', 'بن'], 100),
            (['بن', 'عامر'], 15),
            (['عامر', 'ومعه'], 100),
            (['ومعه', 'أمير'], 100),
            (['أمير', 'دولة'], 100),
            (['دولة', 'غرناطة'], 10),
            (['غرناطة', 'ونهر'], 100),
            (['ونهر', 'النيل'], 100),
            (['النيل', 'انطلاق'], 100),
            (['انطلاق', 'السباق'], 100),
        )
        # NOTE(review): confirm the keyword name matches the signature of
        # CollocationClass.is_possible_collocation (`length` vs. `lenght`).
        for wlist, expected in cases:
            # name the failing pair in the assertion message
            self.assertEqual(
                mydict.is_possible_collocation(wlist, length=2),
                expected,
                "unexpected result for pair %r" % (wlist,))
Пример #5
0
    def __init__(self):
        """
        Set the default option flags and build the analysis components.

        Creates the lexical, syntactic and semantic analyzers, the
        collocations dictionary and the vocalizer for unknown words.
        """
        # to display internal messages for debugging
        #~debug = False
        # limit of words to vocalize, default value is 1000 words.
        self.limit = 1000

        # option: enable the last mark on vocalized words in the output
        # default value is True, can be disabled for debugging purposes
        self.enabled_last_mark = True

        # option: do statistical vocalization based on collocations
        # default value is True, can be disabled for debugging purposes
        #self.enabled_stat_tashkeel = False
        self.enabled_stat_tashkeel = True

        # option: show the collocation marks
        # default value is False, can be enabled for debugging purposes
        self.enabled_show_collocation_mark = False

        # option: choose the tashkeel by scoring
        self.select_by_score_enabled = False
        # option: do syntactic analysis
        # default value is True, can be disabled for debugging purposes
        self.enabled_syntaxic_analysis = True

        # option: allow adjusting the vocalization result for the meeting
        # of two quiescent letters (iltiqa' al-sakinayn)
        # default value is True, can be disabled for debugging purposes
        self.enabled_ajust_vocalization = True

        # option: do semantic analysis
        # default value is True, can be disabled for debugging purposes
        self.enabled_semantic_analysis = True
        #~ self.enabled_semantic_analysis = False

        # enable the last mark (Harakat Al-I3rab)
        self.allow_syntax_last_mark = True

        # lexical analyzer, with its cache use disabled
        self.analyzer = qalsadi.analex.Analex()
        self.analyzer.disable_allow_cache_use()
        #~ self.analyzer.enable_allow_cache_use()

        # syntactic analyzer
        self.anasynt = aranasyn.anasyn.SyntaxAnalyzer()
        # semantic analyzer
        self.anasem = asmai.anasem.SemanticAnalyzer()
        # set the lexical analyzer debugging
        # NOTE(review): `debug` is not defined in this method (the local
        # assignment above is commented out); presumably a module-level
        # name -- confirm it exists.
        self.analyzer.set_debug(debug)
        # set the lexical analyzer word limit
        self.analyzer.set_limit(self.limit)
        # collocations dictionary for statistical tashkeel
        self.collo = coll.CollocationClass(self.enabled_show_collocation_mark)

        # unknown vocalizer for unrecognized words
        self.unknown_vocalizer = unknown_tashkeel.UnknownTashkeel()
Пример #6
0
 def test_lookup(self):
     """####Detect collocation in a phrase

     lookup is expected to return the pair (words, tags): the words of
     the phrase with the collocation words vocalized, and one tag per word.
     """
     collocations = msk.CollocationClass()
     tokens = araby.tokenize(u"لعبنا مباراة كرة القدم في بيت المقدس")
     expected_words = [
         'لعبنا', 'مباراة', 'كُرَة', 'الْقَدَمِ', 'في', 'بَيْت',
         'الْمَقْدِسِ'
     ]
     expected_tags = ['CO', 'CO', 'CB', 'CI', 'CO', 'CB', 'CI']
     found = collocations.lookup(tokens)
     self.assertEqual(found, (expected_words, expected_tags))
Пример #7
0
    def test_is_collocated(self):
        """#### Test if collocation exists in database

        A known word pair is expected to give back the joined phrase,
        an unknown pair is expected to give back False.
        """
        collocations = msk.CollocationClass()
        cases = (
            (['كرة', 'القدم'], u"كرة القدم"),
            (['شمس', 'النهار'], False),
        )
        for words, expected in cases:
            self.assertEqual(collocations.is_collocated(words), expected)
Пример #8
0
def extract_enteties(text):
    """
    Quote the entities of a text: numbers, named entities, collocations.
    A begin tag ('DB', 'NB' or 'CB', checked in that order) opens a new
    <mark> of the matching class, closing any mark still open; an inside
    tag ('NI', 'DI' or 'CI') only adds the vocalized word; any other word
    closes the open mark and is added unchanged.
    @param text: a given text.
    @type text: unicode.
    @return: the text with entity phrases quoted, items joined by spaces
    @rtype: unicode
    """
    import pyarabic.number
    import pyarabic.named
    import maskouk.collocations as colloc
    collocations = colloc.CollocationClass(True)
    tokens = araby.tokenize(text)
    number_tags = pyarabic.number.detect_numbers(tokens)
    number_forms = pyarabic.number.pre_tashkeel_number(tokens)
    named_tags = pyarabic.named.detect_named(tokens)
    named_forms = pyarabic.named.pretashkeel_named(tokens)
    coll_forms, coll_tags = collocations.lookup(tokens)

    pieces = []
    in_mark = False
    rows = zip(tokens, number_tags, number_forms, named_tags, named_forms,
               coll_tags, coll_forms)
    for token, nb_tag, nb_form, nmd_tag, nmd_form, col_tag, col_form in rows:
        # pick the opening mark, if this word begins an entity
        if nb_tag == 'DB':
            opening = ("<mark class='number'>", nb_form)
        elif nmd_tag == 'NB':
            opening = ("<mark class='named'>", nmd_form)
        elif col_tag == 'CB':
            opening = ("<mark class='coll'>", col_form)
        else:
            opening = None

        if opening is not None:
            if in_mark:
                pieces.append("</mark>")
            pieces.extend(opening)
            in_mark = True
            continue

        # words inside an entity keep the current mark
        if nmd_tag == "NI":
            pieces.append(nmd_form)
        elif nb_tag == "DI":
            pieces.append(nb_form)
        elif col_tag == "CI":
            pieces.append(col_form)
        else:
            if in_mark:
                pieces.append("</mark>")
                in_mark = False
            pieces.append(token)

    if in_mark:
        pieces.append("</mark>")
    return u" ".join(pieces)
Пример #9
0
def extract_enteties2(text):
    """
    Extract entities as numbers, named entities, collocations.
    Every run of entity words is wrapped in a balanced
    <mark class='number'|'named'|'coll'>...</mark> pair; each word is
    followed by a space. Numbers and named entities keep the original
    word, collocations use the vocalized word.
    @param text: a given text.
    @type text: unicode.
    @return: the text with entity phrases quoted
    @rtype: unicode
    """
    import pyarabic.number
    import pyarabic.named
    import maskouk.collocations as colloc
    coll = colloc.CollocationClass(True)
    wordlist = araby.tokenize(text)
    taglist_nb = pyarabic.number.detect_numbers(wordlist)
    taglist_nmd = pyarabic.named.detect_named(wordlist)
    vocalized_list, taglist_coll = coll.lookup(wordlist)

    pieces = []
    # class of the currently open mark, None when no mark is open
    current = None
    for word, voc, tagnb, tagnmd, tagcol in zip(wordlist, vocalized_list,
                                                taglist_nb, taglist_nmd,
                                                taglist_coll):
        # classify the word; numbers win over named entities, which win
        # over collocations
        if tagnb in ('DI', 'DB'):
            css, shown, begins = 'number', word, tagnb == 'DB'
        elif tagnmd in ('NI', 'NB'):
            css, shown, begins = 'named', word, tagnmd == 'NB'
        elif tagcol in ('CI', 'CB'):
            css, shown, begins = 'coll', voc, tagcol == 'CB'
        else:
            css, shown, begins = None, word, False

        # close the open mark when the category changes or a new entity
        # begins, so a mark never covers words of another entity
        if current is not None and (css != current or begins):
            pieces.append("</mark>")
            current = None
        if css is not None and current is None:
            pieces.append("<mark class='%s'>" % css)
            current = css
        pieces.append(shown + " ")

    # an entity ending the text must not leave its mark open
    if current is not None:
        pieces.append("</mark>")
    return "".join(pieces)
Пример #10
0
    def test_is_collocated_word(self):
        """####Test if a word has collocations in database

        For each first word, is_collocated_word is expected to return a
        dict mapping the second word to the stored collocation phrase.
        """
        collocations = msk.CollocationClass()
        cases = (
            (u"كرة", {'القدم': 'كُرَة الْقَدَمِ'}),
            (u"بيت", {
                'العدة': 'بَيْت الْعِدَّةِ',
                'المستأجر': 'بَيْت الْمُسْتَأْجِرِ',
                'المشتري': 'بَيْتِ الْمُشْتَرِي',
                'الرجل': 'بَيْت الرَّجُلِ',
                'البناء': 'بَيْت الْبِنَاءِ',
                'الزوج': 'بَيْت الزَّوْجِ',
                'المال': 'بيت المال',
                'المقدس': 'بَيْت الْمَقْدِسِ',
                'البائع': 'بَيْت الْبَائِعِ',
                'الخلاء': 'بَيْت الْخَلَاءِ',
                'الأب': 'بَيْت الْأَبِ',
                'الله': 'بَيْت اللّهِ'
            }),
        )
        for first_word, expected in cases:
            found = collocations.is_collocated_word(first_word)
            self.assertEqual(found, expected)
Пример #11
0
    def __init__(self, mycache_path=False):
        """
        Set the default option flags and build the analysis components.

        Creates the lexical, syntactic and semantic analyzers, the
        collocations dictionary and the vocalizer for unknown words.
        @param mycache_path: value handed as `cache_path` to the three
            analyzers; defaults to False (how the analyzers treat False
            is not visible here).
        """
        # configure logging
        # NOTE(review): basicConfig is called on every instantiation and
        # acts on the root logger.
        logging.basicConfig(level=logging.INFO)
        #~ logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger(__name__)
        #~ self.logger.info("Cache Path %s"%mycache_path)

        # to display internal messages for debugging
        #~debug = False
        # limit of words to vocalize, default value is 1000 words.
        self.limit = 1000

        # option: enable the last mark on vocalized words in the output
        # default value is True, can be disabled for debugging purposes
        self.enabled_last_mark = True

        # option: do statistical vocalization based on collocations
        # default value is True, can be disabled for debugging purposes
        #self.enabled_stat_tashkeel = False
        self.enabled_stat_tashkeel = True

        # option: show the collocation marks
        # default value is False, can be enabled for debugging purposes
        self.enabled_show_collocation_mark = False

        # option: choose the tashkeel by scoring
        self.select_by_score_enabled = False
        # option: do syntactic analysis
        # default value is True, can be disabled for debugging purposes
        self.enabled_syntaxic_analysis = True

        # option: allow adjusting the vocalization result for the meeting
        # of two quiescent letters (iltiqa' al-sakinayn)
        # default value is True, can be disabled for debugging purposes
        self.enabled_ajust_vocalization = True

        # option: do semantic analysis
        # default value is True, can be disabled for debugging purposes
        self.enabled_semantic_analysis = True
        #~ self.enabled_semantic_analysis = False

        # enable the last mark (Harakat Al-I3rab)
        self.allow_syntax_last_mark = True

        # lexical analyzer, with its cache use enabled
        self.analyzer = qalsadi.analex.Analex(cache_path = mycache_path)
        #~ self.logger.info("Cache Path cache %s"%self.analyzer.cache.DB_PATH)
        #~ self.logger.info("Cache Path cache %s"%self.analyzer.cache.db.path)
        #~ self.analyzer.disable_allow_cache_use()
        self.analyzer.enable_allow_cache_use()

        # syntactic analyzer
        self.anasynt = aranasyn.anasyn.SyntaxAnalyzer(cache_path = mycache_path)
        #~ self.logger.info("Cache Path cache syntax %s"%self.anasynt.cache.db.path)

        # flag meant to disable training while doing tashkeel
        self.syntax_train_enabled = False

        # semantic analyzer
        self.anasem = asmai.anasem.SemanticAnalyzer(cache_path = mycache_path)
        #~ self.logger.info("Cache Path cache anasem %s"%self.anasem.syncache.db.path)

        # set the lexical analyzer debugging
        # NOTE(review): `debug` is not defined in this method (the local
        # assignment above is commented out); presumably a module-level
        # name -- confirm it exists.
        self.analyzer.set_debug(debug)
        # set the lexical analyzer word limit
        self.analyzer.set_limit(self.limit)
        # collocations dictionary for statistical tashkeel
        self.collo = coll.CollocationClass(self.enabled_show_collocation_mark)

        # unknown vocalizer for unrecognized words
        self.unknown_vocalizer = unknown_tashkeel.UnknownTashkeel()
Пример #12
0
def test():
    """
    Print a walk-through of the CollocationClass lookups.

    Each step prints a title, then the input and the result of one call:
    is_collocated, is_collocated_word, ngramfinder, lookup,
    lookup4long_collocations and is_possible_collocation. Nothing is
    returned or asserted.
    """
    def report(given, found):
        # print one labelled input/output pair
        print("inuput:", given)
        print("output:", found)

    collocations = msk.CollocationClass()
    first = u"كرة"
    second = u"القدم"

    # is a word pair a known collocation?
    print("step1:test if wordlist is collocation")
    for pair in ([first, second], [u"شمس", u"النهار"]):
        report(pair, collocations.is_collocated(pair))

    # all collocations starting with a given word
    print("step2:get all collocations for a specific word")
    print(first, collocations.is_collocated_word(first))
    print("step3:get all collocations for a specific word")
    house = u"بيت"
    report(house, collocations.is_collocated_word(house))

    # collocations inside a phrase, as bigrams
    print("step4: detect collocations in phrase")
    sentence = u"لعبنا مباراة كرة القدم في بيت المقدس"
    tokens = araby.tokenize(sentence)
    report(sentence, collocations.ngramfinder(2, tokens))

    # collocations inside a phrase, word by word
    print("step4.1: detect collocations in phrase")
    sentence = u"لعبنا مباراة كرة القدم في بيت المقدس"
    tokens = araby.tokenize(sentence)
    report(sentence, collocations.lookup(tokens))

    # long collocations
    print("step5: long collocations")
    greeting = u" قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت"
    report(greeting, collocations.lookup4long_collocations(greeting))

    # long collocations again, also printed as ready-to-paste literals
    print("step5-b: long collocations")
    greeting = u" قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت"
    vocalized = collocations.lookup4long_collocations(greeting)
    report(greeting, vocalized)
    print("inpt = u'%s'"%greeting)
    print("output = u'%s'"%vocalized)

    # candidate collocations for every pair of consecutive words
    # NOTE(review): the keyword is spelled `lenght` here; confirm it matches
    # the signature of CollocationClass.is_possible_collocation.
    print("step6: detect possible collocations")
    sentence = u"ظهر رئيس الوزراء السيد عبد الملك بن عامر ومعه أمير دولة غرناطة ونهر النيل انطلاق السباق"
    tokens = araby.tokenize(sentence)
    previous = "__"
    for current in tokens:
        pair = [previous, current]
        report(pair, collocations.is_possible_collocation(pair, lenght = 2))
        previous = current

    # same pairs as a list literal; `previous` carries over from the loop
    # above, so the first pair starts with the last word of the sentence
    print("[\n")
    for current in tokens:
        pair = [previous, current]
        score = collocations.is_possible_collocation(pair, lenght = 2)
        print("[", pair, ",", score, "],")
        previous = current
    print("]")