def Training(args = sys.argv[2:]):
    """
    Train RDRPOSTagger using an initialized corpus against a golden corpus.

    The initialized corpus must already have been generated by an external
    initial tagger. One SCRDR tree is built for every
    (improveThreshold, matchThreshold) pair found in ``thresholds`` (a name
    defined outside this function) and written to
    ``<dirPath>/T<improveThreshold>-<matchThreshold>/<learntRules>``.

    Args:
        args: ``[dirPath, correctTrain, initializedTrain, learntRules]`` --
            the working directory, followed by the names (relative to that
            directory) of the golden corpus, the initialized corpus and the
            rules file to write. NOTE: the default value is evaluated once,
            when the function is defined, not on each call.

    Raises:
        IndexError: if ``args`` holds fewer than four items.
    """
    from src.tagger.SCRDRlearner.PosTaggingRDRTree import PosTaggingRDRTree

    # Plain concatenation: every path below is built as dirPath + name.
    dirPath = args[0] + "/"
    correctTrain = args[1]
    initializedTrain = args[2]
    learntRules = args[3]

    print ('\nTraining RDRPOSTagger in the use of initialized corpus against golden corpus....')
    print ('Building SCRDR-based POS tagging tree of rules...')
    for (improveThreshold, matchThreshold) in thresholds:
        outputDir = "T%d-%d/" % (improveThreshold, matchThreshold)
        # exist_ok=True lets training be re-run in the same directory
        # instead of failing on the folder left by a previous run.
        os.makedirs(dirPath + outputDir, exist_ok=True)

        rdrTree = PosTaggingRDRTree(improveThreshold, matchThreshold)
        rdrTree.buildTreeFromCorpus(dirPath + initializedTrain, dirPath + correctTrain)

        print ("Write the tree to file...")
        rdrTree.writeToFileWithoutSeenCases(dirPath + outputDir + learntRules)
    print ('Completed!')
def VnTraining(args = sys.argv[2:]):
    """
    Train RDRPOSTagger for Vietnamese POS tagging from a golden corpus.

    The golden corpus is first reduced to raw text (``<corpus>.RAW``) and
    initially tagged with the dictionary-based Vietnamese tagger
    (``<corpus>.INIT``). One SCRDR tree is then built for every
    (improveThreshold, matchThreshold) pair found in ``thresholds`` (a name
    defined outside this function) and written to
    ``<dirPath>/T<improveThreshold>-<matchThreshold>/<learntRules>``. The
    training time of each threshold pair is printed.

    Args:
        args: ``[pathToDict, dirPath, correctTrain, learntRules]`` -- the
            dictionary file, the working directory, and the names (relative
            to that directory) of the golden corpus and of the rules file
            to write. NOTE: the default value is evaluated once, when the
            function is defined, not on each call.

    Raises:
        IndexError: if ``args`` holds fewer than four items.
    """
    # Imported locally, as the sibling entry points do, so this function
    # does not depend on a module-level import of the tree class.
    from src.tagger.SCRDRlearner.PosTaggingRDRTree import PosTaggingRDRTree

    pathToDict = args[0]
    # Plain concatenation: every path below is built as dirPath + name.
    dirPath = args[1] + "/"
    correctTrain = args[2]
    learntRules = args[3]

    goldenPath = dirPath + correctTrain
    rawPath = goldenPath + ".RAW"
    initPath = goldenPath + ".INIT"

    print( '\nTraining RDRPOSTagger for Vietnamese POS Tagging...')
    print( "Initial tagging...")
    getRawTextFromFile(goldenPath, rawPath)
    DICT = readDictionary(pathToDict)
    VnInitTagger4Corpus(DICT, rawPath, initPath)
    print ("Done Initialization!")

    print ('Building SCRDR-based POS tagging tree of rules...')
    for (improveThreshold, matchThreshold) in thresholds:
        timeStart = time.time()
        outputDir = "T%d-%d/" % (improveThreshold, matchThreshold)
        # exist_ok=True lets training be re-run in the same directory
        # instead of failing on the folder left by a previous run.
        os.makedirs(dirPath + outputDir, exist_ok=True)

        rdrTree = PosTaggingRDRTree(improveThreshold, matchThreshold)
        rdrTree.buildTreeFromCorpus(initPath, goldenPath)

        print ("Write the tree to file...")
        rdrTree.writeToFileWithoutSeenCases(dirPath + outputDir + learntRules)

        print ("\nTraining time for threshold %d-%d: %f seconds\n" % (improveThreshold, matchThreshold, time.time() - timeStart))
    print ('\nCompleted!')
def TaggingInitializedCorpus(args = sys.argv[2:]):
    """
    Tag an already-initialized corpus with a learnt rules file.

    The rules file is loaded into a PosTaggingRDRTree and the result of
    tagging the corpus is written next to the input, under the same path
    with a ``.TAGGED`` suffix.

    Args:
        args: ``[learntRulesPath, initTestCorpusPath]`` -- the rules file to
            load and the initialized corpus to tag.
    """
    rulesFile, corpusFile = args[0], args[1]
    print ("\nTagging initialized corpus:", corpusFile)

    from src.tagger.SCRDRlearner.PosTaggingRDRTree import PosTaggingRDRTree

    tree = PosTaggingRDRTree()
    tree.constructTreeFromRulesFile(rulesFile)

    taggedFile = corpusFile + ".TAGGED"
    tree.tagInitializedCorpus_new(corpusFile, taggedFile)
    print ('Completed!')