Пример #1
0
def VnTraining(args = sys.argv[2:]):
    """
    Train the Vietnamese RDR POS tagger from a gold-standard corpus.

    args is a command-line style list (default: sys.argv[2:], captured when
    the function is defined):
        args[0] -- path to the dictionary passed to readDictionary
        args[1] -- working directory; a "/" is appended to it
        args[2] -- name of the gold-standard training file inside that directory
        args[3] -- file name under which the learnt rules are written

    Steps performed:
      1. write "<train>.RAW" from the training file, then run the initial
         tagger over it to produce "<train>.INIT";
      2. for every (improveThreshold, matchThreshold) pair in the module-level
         `thresholds`, create the sub-directory "T<improve>-<match>/" in the
         working directory, build a rule tree from the .INIT / gold pair and
         write it into that sub-directory.

    os.mkdir is used as-is, so an already existing threshold directory raises.
    """
    lexiconPath = args[0]
    workDir = os.path.join(args[1] + "/")
    goldName = args[2]
    rulesName = args[3]

    print('\nTraining RDRPOSTagger for Vietnamese POS Tagging...')
    print("Initial tagging...")

    # All intermediate corpora live next to the gold-standard file.
    goldCorpus = workDir + goldName
    rawCorpus = goldCorpus + ".RAW"
    initCorpus = goldCorpus + ".INIT"

    getRawTextFromFile(goldCorpus, rawCorpus)
    lexicon = readDictionary(lexiconPath)
    VnInitTagger4Corpus(lexicon, rawCorpus, initCorpus)

    print("Done Initialization!")

    print('Building SCRDR-based POS tagging tree of rules...')

    for improve, match in thresholds:
        startedAt = time.time()
        runDir = workDir + "T%d-%d/" % (improve, match)
        os.mkdir(runDir)

        tree = PosTaggingRDRTree(improve, match)
        tree.buildTreeFromCorpus(initCorpus, goldCorpus)

        print("Write the tree to file...")
        # The alternative writer, tree.writeToFile(...), is left disabled as in
        # the original; only the variant without seen cases is emitted.
        tree.writeToFileWithoutSeenCases(runDir + rulesName)

        elapsed = time.time() - startedAt
        print("\nTraining time for threshold %d-%d: %f seconds\n" % (improve, match, elapsed))

    print('\nCompleted!')
Пример #2
0
def runVnRDRPOSTagger(args = sys.argv[1:]):
    """
    Command-line dispatcher for the Vietnamese RDR POS tagger.

    args is a command-line style list (default: sys.argv[1:], captured when
    the function is defined). args[0] selects the mode by case-insensitive
    substring match:
        contains "train" -- the remaining arguments (args[1:]) are handed to
                            VnTraining
        contains "tag"   -- args[1] is the rules file, args[2] the dictionary
                            path, args[3] the raw corpus to tag
    An empty list or an unrecognised mode prints the usage instructions.

    Raises IndexError when the "tag" mode is given fewer than three further
    arguments.
    """
    if not args:
        printInstructions()
        return

    mode = args[0].lower()
    if "train" in mode:
        # Forward the caller's own arguments. Calling VnTraining() bare made it
        # fall back to sys.argv[2:], silently ignoring an explicit `args`.
        # With the default args (sys.argv[1:]) this is the same list as before.
        VnTraining(args[1:])
    elif "tag" in mode:
        r = VnRDRTree()
        r.constructTreeFromRulesFile(args[1])
        DICT = readDictionary(args[2])
        r.tagRawCorpus(DICT, args[3])
    else:
        printInstructions()
Пример #3
0
# -*- coding: utf-8 -*-
import re
# from Utility.Utils import readDictionary, isAbbre, isVnProperNoun,  isVnUpperChar
from src.tagger.Utility.Utils import readDictionary, isAbbre, isVnProperNoun,  isVnUpperChar 
# Supplementary lexicons used by the initial tagger when a word is missing
# from the main frequency dictionary:
#   VNUNKNWORDS -- maps a word to the tag emitted for it
#   VNNAMES     -- words found here are tagged "Np"
# Both are loaded at import time from "./resource/..." relative paths
# (presumably resolved against the current working directory -- confirm in
# readDictionary). They previously lived under ../jSCRDRtagger/addDicts/.

VNUNKNWORDS = readDictionary("./resource/VNOTHERS.DICT")
VNNAMES = readDictionary("./resource/VNNAMES.DICT")


def VnInitTagger4Sentence(VNFREQ, sentence):
    """
    Initial tagger for Vietnamese sentence.
    VNUNKNWORDS and VNNAMES were not utilized in the version as described in our CICLing 2011 paper
    """
    words = sentence.strip().split()
    taggedSen = ''
    for word in words:
        if word in VNFREQ:
            taggedSen += word + "/" + VNFREQ[word] + " "
        elif word in VNUNKNWORDS:
            taggedSen += word + "/" + VNUNKNWORDS[word] + " "
        elif word in VNNAMES:
            taggedSen += word + "/Np "      
        else:         
            if (re.search(r"[0-9]+", word) != None):
                taggedSen += word + "/M "
            elif(len(word) == 1 and isVnUpperChar(word[0])):
                taggedSen += word + "/Y "
            else: