示例#1
0
 def test_vocalized_similarity(self):
     """Check that a partially vocalized word matches its fully vocalized form.

     Callers of ``vocalized_similarity`` treat a negative return value as a
     mismatch (an error count). Any negative integer is truthy, so
     ``assertTrue`` alone cannot detect such a failure; the result is
     therefore also checked for not being negative.
     """
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))
     similarity = ar.vocalized_similarity(word1, word2)
     self.assertNotEqual(similarity, -2)
     self.assertTrue(similarity)
     # A negative error count would pass assertTrue; reject it explicitly.
     self.assertFalse(similarity < 0)
示例#2
0
 def test_vocalized_similarity(self):
     """Check that a partially vocalized word matches its fully vocalized form.

     Callers of ``vocalized_similarity`` treat a negative return value as a
     mismatch (an error count). Any negative integer is truthy, so
     ``assertTrue`` alone cannot detect such a failure; the result is
     therefore also checked for not being negative.
     """
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))
     similarity = ar.vocalized_similarity(word1, word2)
     self.assertNotEqual(similarity, -2)
     self.assertTrue(similarity)
     # A negative error count would pass assertTrue; reject it explicitly.
     self.assertFalse(similarity < 0)
示例#3
0
def detect_number_words(text):
    """
    Detect number words in a text.
    @param text: input text
    @type text: unicode
    @return : number words extracted from text
    @rtype: integer
    >>> text2number(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
    خمسمئة وثلاثة وعشرين
    """

    #~ words = araby.tokenize(text)
    #print words
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            vocalized = vocalize_number(araby.strip_tashkeel(\
            numberedwords).split(' '), tags)
            #calcul  vocalization similarity :
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, \
                nextword)
            if sim < 0:
                print u'\t'.join([str(sim), numberedwords, vocalized, \
                 str(numeric), u' '.join([previous, phrase, nextword]), \
                  nextword, voc_unit, str(sim_unit)]).encode('utf8')
示例#4
0
def detect_number_words(text):
    """
    Detect number words in a text.
    @param text: input text
    @type text: unicode
    @return : number words extracted from text
    @rtype: integer
    >>> text2number(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
    خمسمئة وثلاثة وعشرين
    """

    #~ words = araby.tokenize(text)
    #print words
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            vocalized = vocalize_number(araby.strip_tashkeel(\
            numberedwords).split(' '), tags)                
            #calcul  vocalization similarity : 
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, \
                nextword)                    
            if sim < 0:
                print u'\t'.join([str(sim), numberedwords, vocalized, \
                 str(numeric), u' '.join([previous, phrase, nextword]), \
                  nextword, voc_unit, str(sim_unit)]).encode('utf8')
示例#5
0
    def test_vocalized_similarity(self):
        """Check vocalized_similarity for a matching and a mismatching pair.

        A mismatch is reported as a negative number (``-1`` for word3 below),
        and negative numbers are truthy, so the matching case is checked for
        not being negative in addition to being truthy. ``assertEqual`` is
        used instead of a bare ``assert`` so the check survives ``python -O``
        and reports both values on failure.
        """
        # vocalized_similarity(word1, word2)
        word1 = u"ضَربٌ"
        word2 = u"ضَرْبٌ"
        word3 = u"ضَرْبٍ"
        similarity = Araby.vocalized_similarity(word1, word2)
        self.assertTrue(similarity)
        self.assertFalse(similarity < 0)
        self.assertEqual(Araby.vocalized_similarity(word1, word3), -1)
示例#6
0
    def compare(self, baseline, vocalized_output):
        """
        Compare a reference vocalized line with an automatic vocalization.

        The baseline is tokenized with araby.tokenize and compared word by
        word with the output. A word whose araby.vocalized_similarity score
        is negative counts as incorrect and its negated score is added to
        self.LettersError; it is then compared again with the "semi" form of
        the output word and, if still negative, counted in
        self.WLMIncorrect. Running totals (total, correct, incorrect,
        WLMIncorrect, LettersError, counter) and per-line values (lineTotal,
        lineCorrect, lineWLMIncorrect) are updated on self.

        @param baseline: reference vocalized line
        @param vocalized_output: either a list of dicts providing the
            "chosen" and "semi" keys, or a vocalized string
        @return: None; exits the program on an unsupported output type
        """
        # Per-line counters are incremented through ``self`` below, so they
        # must be reset on ``self`` as well; otherwise they keep growing
        # from one line to the next. The reset on ``myconsole`` is kept for
        # any code that reads the counters from there.
        myconsole.lineCorrect = 0
        myconsole.lineWLMIncorrect = 0
        self.lineCorrect = 0
        self.lineWLMIncorrect = 0

        inputlist = araby.tokenize(baseline)
        if isinstance(vocalized_output, list):
            outputlist = [x.get("chosen", '') for x in vocalized_output]
            outputlistsemi = [x.get("semi", '') for x in vocalized_output]
        elif isinstance(vocalized_output, str):
            outputlist = araby.tokenize(vocalized_output)
            # evaluation without last haraka uses the stripped words
            outputlistsemi = [araby.strip_lastharaka(x) for x in outputlist]
        else:
            print("Incompatible vocalized output, must be a list of dicts "
                  "or a string",
                  type(vocalized_output), vocalized_output)
            sys.exit()

        self.total += len(inputlist)
        self.lineTotal = len(inputlist)
        if len(inputlist) != len(outputlist):
            # Words cannot be aligned: report the mismatch and skip scoring.
            print("lists haven't the same length")
            print(len(inputlist), len(outputlist))
            print(u"# ".join(inputlist).encode('utf8'))
            print(u"# ".join(outputlist).encode('utf8'))
        else:
            for inword, outword, outsemiword in zip(inputlist, outputlist,
                                                    outputlistsemi):
                simi = araby.vocalized_similarity(inword, outword)
                if simi < 0:
                    self.LettersError += -simi
                    self.incorrect += 1
                    # evaluation without last haraka
                    simi2 = araby.vocalized_similarity(inword, outsemiword)
                    if simi2 < 0:
                        self.WLMIncorrect += 1
                        self.lineWLMIncorrect += 1
                else:
                    self.correct += 1
                    self.lineCorrect += 1
        self.counter += 1
示例#7
0
文件: number.py 项目: Guibod/pyarabic
def detect_number_words(text):
    """
    Report number phrases whose vocalization differs from the generated one.

    Every context returned by extract_number_context(text) that holds at
    least three items is read as (previous word, number phrase, next word).
    The phrase is converted with text2number, re-vocalized with
    vocalize_number, and compared with the original phrase through
    araby.vocalized_similarity. When that score is negative, three
    diagnostic lines are printed. The function returns nothing.

    Example:
        detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")

    @param text: input text
    @type text: unicode
    @return: None
    """

    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            wordlist = araby.strip_tashkeel(numberedwords).split(' ')
            vocalized = vocalize_number(wordlist, tags)
            # vocalization similarity of the number phrase
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, nextword)

            if sim < 0:
                # numberedwords is a string: joining it with ' ' would put a
                # space between every character, so it is printed as is.
                # NOTE(review): ' '.join(vocalized) assumes vocalize_number
                # returns a list of words -- confirm.
                # NOTE(review): printing the result of .encode() shows a
                # bytes repr under Python 3 -- confirm the target version.
                print('\t'.join(
                    [str(sim), numberedwords,
                     ' '.join(vocalized)]).encode('utf8'))
                print(str(numeric), ' '.join([previous, phrase,
                                              nextword]).encode('utf8'))
                print('\t'.join([nextword, voc_unit,
                                 str(sim_unit)]).encode('utf8'))
示例#8
0
def detect_number_words(text):
    """
    Report number phrases whose vocalization differs from the generated one.

    Every context returned by extract_number_context(text) that holds at
    least three items is read as (previous word, number phrase, next word).
    The phrase is converted with text2number, re-vocalized with
    vocalize_number, and compared with the original phrase through
    araby.vocalized_similarity. When that score is negative, three
    diagnostic lines are printed. The function returns nothing.

    Example:
        detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")

    @param text: input text
    @type text: unicode
    @return: None
    """

    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            wordlist = araby.strip_tashkeel(numberedwords).split(' ')
            vocalized = vocalize_number(wordlist, tags)
            # vocalization similarity of the number phrase
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, nextword)

            if sim < 0:
                # numberedwords is a string: joining it with ' ' would put a
                # space between every character, so it is printed as is.
                # NOTE(review): u' '.join(vocalized) assumes vocalize_number
                # returns a list of words -- confirm.
                print(u'\t'.join([str(sim), numberedwords,
                                  u' '.join(vocalized)]))
                print(str(numeric), u' '.join([previous, phrase, nextword]))
                print(u'\t'.join([nextword, voc_unit, str(sim_unit)]))
示例#9
0
def test():
    """
    Command-line driver: strip or vocalize input text line by line.

    Options are read from grabargs(). The input is either the 'text' option
    (split on newlines) or the file named by 'fname'. Each line that does
    not start with '# ' is either stripped of its tashkeel or vocalized.
    In compare mode the line is treated as a vocalized reference: it is
    stripped, re-vocalized, and compared word by word with the reference,
    and running accuracy statistics are printed. In file mode every result
    line is also written to the output file.
    """
    options = grabargs()

    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']

    # filename = "samples/randomtext.txt"
    # Without inline text or an input file there is nothing to do.
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        # File mode: open the input file and an output file whose default
        # name is derived from the input name.
        # NOTE(review): the bare except also hides failures other than a
        # missing input file (e.g. the output file cannot be created), and
        # neither file is closed anywhere in this function.
        try:
            myfile = open(filename)
            print("input file:", filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file:", outfilename)
            outfile = open(outfilename, "w")
        except:
            print(" Can't Open the given File ", filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    # counter is the 1-based number of the line being processed.
    counter = 1
    if not limit:
        limit = 100000000
    # The vocalizer is only created when vocalization is requested; each
    # option below switches off (or on) one of its features.
    if not strip_tashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            # print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()
            # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled

    # vocalizer.disableShowCollocationMark()
    # print "show delimiter", vocalizer.collo.showDelimiter
    # nolimit = True
    nolimit = False
    # Fetch the first line.
    # NOTE(review): .decode('utf8') relies on readline() returning a byte
    # string (Python 2 semantics) -- confirm the target version.
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # Running totals over all processed lines (used in compare mode).
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        # display the header of the per-line statistics table
        print(
            "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
        )

    while line and (nolimit or counter <= limit):
        # Lines starting with '# ' are skipped (treated as comments).
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:  # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # The input line is the vocalized reference: strip it,
                    # vocalize it again, and compare word by word.
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)

                    # stemmer = tashaphyne.stemming.ArabicLightStemmer()
                    # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    # ~inputlist = []
                    # ~for txt in texts:
                    # ~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        # Words cannot be aligned; report and skip scoring.
                        print("lists haven't the same length")
                        print(len(inputlist), len(outputlist))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            # A negative score marks a mismatch; its
                            # magnitude is added to the letter error count.
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

            # compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1

            # display stat for every line
            # NOTE(review): total is only increased in the compare branch
            # above, so these divisions raise ZeroDivisionError while total
            # is still 0; lineTotal is also unset when strip_tashkeel and
            # compare are both enabled.
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                if lineTotal:
                    print("%0.2f%%\t" %
                          round(lineCorrect * 100.00 / lineTotal, 2)
                          )  # line Fully correct
                    print("%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2)
                          )  # line Strip correct

            # ~ print result.strip('\n').encode('utf8'),
            # NOTE(review): the two print statements below use Python 2
            # syntax, unlike the print(...) calls elsewhere in this function.
            if text:
                print result.strip('\n').encode('utf8'),
            else:
                result_line = result.encode('utf8')
                print result_line
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")

        if progress and not nolimit:
            # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
            # NOTE(review): total stays 0 unless compare mode is on, so the
            # divisions by total below look like they fail with
            # ZeroDivisionError when progress is used without compare.
            sys.stderr.write(
                "\r[%d%%]%d/%d lines    Full %0.2f Strip %0.2f     " % (
                    counter * 100 / limit,
                    counter,
                    limit,
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2)  # Strip Correct
                ))
            # ~sys.stderr.write("treatment of "+line.encode('utf8'))
            sys.stderr.flush()

        # get the next line
        # NOTE(review): in text mode counter starts at 1 while the first
        # line is lines[0]; after the first processed line counter is 2, so
        # lines[1] looks like it is never read. A skipped '# ' line does not
        # advance counter, so the same index is fetched again -- confirm.
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None
    else:
        # The loop contains no break, so this branch runs whenever it ends.
        print("Done")
示例#10
0
def test():
    """
    Command-line driver: strip or vocalize input text line by line.

    Options are read from grabargs(). The input is either the 'text' option
    (split on newlines) or the file named by 'fname'. Each line that does
    not start with '#' is either stripped of its tashkeel or vocalized, and
    the result is printed UTF-8 encoded. In compare mode the line is treated
    as a vocalized reference: it is stripped, re-vocalized, and compared
    word by word with the reference, and running accuracy statistics are
    printed before the result.
    """
    options = grabargs()

    filename = options['fname']
    text     = options['text']
    strip_tashkeel  = options['strip_tashkeel']
    nocache         = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax   = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat     = options['disableStatistic']
    ignore = options['ignore']
    limit  = options['limit']
    compare = options['compare']
    progress = options['progress']
        
    #filename = "samples/randomtext.txt"    
    # Without inline text or an input file there is nothing to do.
    if not text and not filename:
        usage()
        sys.exit(0)
        
    if not text:
        # File mode: open the input file.
        # NOTE(review): the bare except hides the actual error, and the
        # file is not closed anywhere in this function.
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat 
    import pyarabic.araby as araby

    # counter is the 1-based number of the line being processed.
    counter = 1
    if not limit : 
        limit = 100000000
    # The vocalizer is only created when vocalization is requested; each
    # option below switches off one of its features.
    if not strip_tashkeel: 
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache : 
            vocalizer.disable_cache()
            print "nocache"
        if ignore : 
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()

    #vocalizer.disableShowCollocationMark()
    #print "show delimiter", vocalizer.collo.showDelimiter
    #nolimit = True
    nolimit = False
    # Fetch the first line.
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines)>0:
            line = lines[0]
    # Running totals over all processed lines (used in compare mode).
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        # display the header of the per-line statistics table
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
        
    while line and (nolimit or counter <= limit):
        if progress and not nolimit:
            #~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
            sys.stderr.write("\r[%d%%]%d/%d lines" %(counter * 100/ limit, counter, limit))
            #~sys.stderr.write("treatment of "+line.encode('utf8'))
            sys.stderr.flush()
        # Lines starting with '#' are skipped (treated as comments).
        if not line.startswith('#'):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:    #vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)                    
                if compare:
                    # The input line is the vocalized reference: strip it,
                    # vocalize it again, and compare word by word.
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(inputUnvocalizedLine)


                    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
                    #~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    #~inputlist =[]
                    #~for txt in texts:
                        #~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen",'') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [x.get("semi",'') for x in vocalized_dict]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        # Words cannot be aligned; report and skip scoring.
                        print "lists haven't the same length"
                        print len(inputlist), len(outputlist)
                        print u"#".join(inputlist).encode('utf8')
                        print u"#".join(outputlist).encode('utf8')
                    else:
                        for inword, outword, outsemiword in zip(inputlist, outputlist, outputlistsemi):
                            # A negative score marks a mismatch; its
                            # magnitude is added to the letter error count.
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi<0:
                                LettersError += -simi
                                incorrect    += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(inword, outsemiword)
                                if simi2<0: 
                                    WLMIncorrect     += 1
                                    lineWLMIncorrect += 1                                
                            else:
                                correct += 1
                                lineCorrect  += 1
                    
            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1

            #display stat for every line
            # NOTE(review): total is only increased in the compare branch
            # above, so these divisions raise ZeroDivisionError while total
            # is still 0; lineTotal is also unset when strip_tashkeel and
            # compare are both enabled.
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
                        counter-1, #id
                        round(correct*100.00/total, 2), #fully Correct
                        round((total-WLMIncorrect)*100.00/total, 2), #Strip Correct
                        incorrect, #fully WER
                        WLMIncorrect, #Strip WER
                        LettersError, #LER
                        total, #Total
                        ), 
                if lineTotal:
                    print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal, 2), #line Fully correct
                    print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal, 2), #line Strip correct
                        
            print result.encode('utf8')
        #get the next line
        # NOTE(review): in text mode counter starts at 1 while the first
        # line is lines[0]; after the first processed line counter is 2, so
        # lines[1] looks like it is never read. A skipped '#' line does not
        # advance counter, so the same index is fetched again -- confirm.
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter<len(lines):
                line = lines[counter]
            else:
                line = None