def run(args=sys.argv[1:]):
    """Command-line entry point: train a tagging model or tag a raw corpus.

    ``args[0]`` selects the mode (case-insensitive):

    * ``train`` -- ``args[1]`` is the gold standard training corpus. The
      learned model is written to ``<corpus>.RDR``; the intermediate
      ``.INIT``, ``.RAW`` and ``.sDict`` files are deleted once training
      has finished successfully.
    * ``tag`` -- ``args[1]`` is the model file, ``args[2]`` the lexicon and
      ``args[3]`` the raw corpus to tag.

    With no arguments or an unknown mode the usage help is printed. Any
    exception raised while training or tagging (including a missing
    argument) is reported and followed by the usage help.
    """
    if len(args) == 0:
        printHelp()
        return

    mode = args[0].lower()
    if mode not in ("train", "tag"):
        printHelp()
        return

    try:
        if mode == "train":
            print("\n====== Start ======")
            # Read the corpus path only after the banner so that a missing
            # argument is reported at the same point as before.
            corpus = args[1]
            raw_path = corpus + ".RAW"
            init_path = corpus + ".INIT"
            short_dict_path = corpus + ".sDict"
            model_path = corpus + ".RDR"

            print(
                "\nGenerate from the gold standard training corpus a lexicon "
                + corpus + ".DICT")
            for lexicon_kind in ('full', 'short'):
                createLexicon(corpus, lexicon_kind)

            print(
                "\nExtract from the gold standard training corpus a raw text corpus "
                + raw_path)
            getRawText(corpus, raw_path)

            print(
                "\nPerform initially POS tagging on the raw text corpus, to generate "
                + init_path)
            lexicon = readDictionary(short_dict_path)
            initializeCorpus(lexicon, raw_path, init_path)

            print(
                '\nLearn a tree model of rules for POS tagging from %s and %s'
                % (corpus, init_path))
            learner = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            learner.learnRDRTree(init_path, corpus)

            print("\nWrite the learned tree model to file " + model_path)
            learner.writeToFile(model_path)
            print('\nDone!')

            # Intermediate files are only cleaned up after a successful run.
            for leftover in (init_path, raw_path, short_dict_path):
                os.remove(leftover)
        else:
            tagger = RDRPOSTagger()
            print("\n=> Read a POS tagging model from " + args[1])
            tagger.constructSCRDRtreeFromRDRfile(args[1])
            print("\n=> Read a lexicon from " + args[2])
            lexicon = readDictionary(args[2])
            print("\n=> Perform POS tagging on " + args[3])
            tagger.tagRawCorpus(lexicon, args[3])
    except Exception as err:
        print("\nERROR ==> ", err)
        printHelp()
def run(args=sys.argv[1:]):
    """Train an English POS tagging model from a gold standard corpus.

    Only the ``train`` mode is handled here: ``args[0]`` must equal
    ``"train"`` (case-insensitive) and ``args[1]`` is the path of the gold
    standard training corpus. The learned model is written to
    ``<corpus>.RDR``; the intermediate ``.INIT``, ``.RAW`` and ``.sDict``
    files are removed once training has succeeded.

    With no arguments the usage help is printed. Any exception raised
    during training (including a missing ``args[1]``) is reported and
    followed by the usage help.

    The function uses single-argument ``print(...)`` calls and
    ``except ... as`` so that it parses on both Python 2.6+ and Python 3;
    the printed text is unchanged from the print-statement version.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "train":
        try:
            print("\n====== Start ======")
            corpus = args[1]
            rawPath = corpus + ".RAW"
            initPath = corpus + ".INIT"
            shortDictPath = corpus + ".sDict"
            modelPath = corpus + ".RDR"

            print("\nGenerate from the gold standard training corpus an English lexicon "
                  + corpus + ".DICT")
            createLexicon(corpus, 'full')
            createLexicon(corpus, 'short')

            print("\nExtract from the gold standard training corpus a raw text corpus "
                  + rawPath)
            getRawText(corpus, rawPath)

            print("\nPerform initially POS tagging on the raw text corpus, to create "
                  + initPath)
            DICT = readDictionary(shortDictPath)
            initializeEnCorpus(DICT, rawPath, initPath)

            print('\nLearn a tree model of rules for English POS tagging from %s and %s'
                  % (corpus, initPath))
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            rdrTree.learnRDRTree(initPath, corpus)

            # Two spaces after "file" reproduce the original statement's
            # output (trailing space in the literal plus the separator).
            print("\nWrite the learned tree model to file  " + modelPath)
            rdrTree.writeToFile(modelPath)
            print('\nDone!')

            # Intermediate files are only cleaned up after a successful run.
            os.remove(initPath)
            os.remove(rawPath)
            os.remove(shortDictPath)
        except Exception as e:
            print("\nERROR ==>  " + str(e))
            printHelp()
def run(args = sys.argv[1:]):
    """Train an English POS tagging model from a gold standard corpus.

    ``args[0]`` must equal ``"train"`` (case-insensitive) and ``args[1]``
    is the path of the gold standard training corpus. The learned model is
    written to ``<corpus>.RDR``; the intermediate ``.INIT``, ``.RAW`` and
    ``.sDict`` files are removed after a successful run.

    With no arguments the usage help is printed. Any exception raised
    during training (including a missing ``args[1]``) is reported and
    followed by the usage help.

    Written with single-argument ``print(...)`` calls and
    ``except ... as`` so it parses on both Python 2.6+ and Python 3 while
    printing exactly the same text as the print-statement version.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "train":
        try:
            print("\n====== Start ======")
            goldCorpus = args[1]

            print("\nGenerate from the gold standard training corpus an English lexicon "
                  + goldCorpus + ".DICT")
            createLexicon(goldCorpus, 'full')
            createLexicon(goldCorpus, 'short')

            print("\nExtract from the gold standard training corpus a raw text corpus "
                  + goldCorpus + ".RAW")
            getRawText(goldCorpus, goldCorpus + ".RAW")

            print("\nPerform initially POS tagging on the raw text corpus, to create "
                  + goldCorpus + ".INIT")
            DICT = readDictionary(goldCorpus + ".sDict")
            initializeEnCorpus(DICT, goldCorpus + ".RAW", goldCorpus + ".INIT")

            print('\nLearn a tree model of rules for English POS tagging from %s and %s'
                  % (goldCorpus, goldCorpus + ".INIT"))
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            rdrTree.learnRDRTree(goldCorpus + ".INIT", goldCorpus)

            # Two spaces after "file" keep the output identical to the
            # original print statement.
            print("\nWrite the learned tree model to file  " + goldCorpus + ".RDR")
            rdrTree.writeToFile(goldCorpus + ".RDR")
            print('\nDone!')

            # Remove the intermediate files; skipped when training failed.
            for suffix in (".INIT", ".RAW", ".sDict"):
                os.remove(goldCorpus + suffix)
        except Exception as e:
            print("\nERROR ==>  " + str(e))
            printHelp()
示例#4
0
def run(args=sys.argv[1:]):
    """Tag the lines read from standard input.

    ``args[0]`` must equal ``"tag"`` (case-insensitive); ``args[1]`` is the
    model file passed to ``constructSCRDRtreeFromRDRfile`` and ``args[2]``
    the lexicon file passed to ``readDictionary``. All of standard input is
    read with ``readlines()`` and handed to ``tagRawCorpus``.

    With no arguments the usage help is printed. Any exception raised while
    loading or tagging (including a missing argument) is reported and
    followed by the usage help.

    Uses ``except ... as`` and a single-argument ``print(...)`` so the
    function parses on both Python 2.6+ and Python 3; the printed text is
    the same as the print-statement version produced.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "tag":
        try:
            r = RDRPOSTagger()
            r.constructSCRDRtreeFromRDRfile(args[1])
            DICT = readDictionary(args[2])
            r.tagRawCorpus(DICT, sys.stdin.readlines())
        except Exception as e:
            print("\nERROR ==>  " + str(e))
            printHelp()
示例#5
0
    def __init__(self, language):
        """Create a POS tagger for ``language``.

        The model and lexicon file names are looked up in ``self.models``
        and ``self.dicts`` and both files are loaded from
        ``multilingual_posTagger_home``.

        Raises:
            Exception: if ``language`` has no model or no lexicon
                registered.
        """
        self.language = language

        model = self.models.get(language)
        lexicon = self.dicts.get(language)
        # Validate both lookups before loading anything: a missing lexicon
        # would otherwise only fail later, inside os.path.join, with an
        # error that does not mention the language.
        if not model or not lexicon:
            raise Exception(
                "Unsupported language for POS tagging: {}".format(language))

        self.tagger = RDRPOSTagger()

        # Load the POS tagging model for the requested language
        self.tagger.constructSCRDRtreeFromRDRfile(
            os.path.join(multilingual_posTagger_home, model))

        # Load the lexicon for the requested language
        self.dict = readDictionary(
            os.path.join(multilingual_posTagger_home, lexicon))
示例#6
0
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall accuracy

    Both corpora are read as whitespace-separated tokens and compared
    position by position. A word is "known" when it is contained in the
    lexicon loaded from ``fullDictFile``. Tags are compared
    case-insensitively.

    Parameters:
        fullDictFile: path of the lexicon passed to ``readDictionary``.
        goldStandardCorpus: path of the gold standard tagged corpus.
        taggedCorpus: path of the automatically tagged corpus.

    Returns:
        A tuple ``(known, unknown, overall)`` of percentages. A category
        without any token is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``. The integer ``0`` is returned (after printing
        a message) when the corpora differ in token count or contain
        different words at the same position.
    """
    # Context managers close both files even if reading fails.
    with open(taggedCorpus, "r") as taggedFile:
        tagged = taggedFile.read().split()
    with open(goldStandardCorpus, "r") as goldFile:
        goldStandard = goldFile.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)

    def percentage(correct, total):
        # An empty category scores 0.0 rather than dividing by zero.
        return correct * 100.0 / total if total else 0.0

    count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i, (taggedToken, goldToken) in enumerate(zip(tagged, goldStandard)):
        word1, tag1 = getWordTag(taggedToken)
        word2, tag2 = getWordTag(goldToken)
        # Tokens whose word is '' on either side are exempt from the
        # word-equality check.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print(
                "Words are not the same in gold standard and tagged corpora, at the index "
                + str(i))
            return 0

        isCorrect = tag1.lower() == tag2.lower()
        if isCorrect:
            count += 1

        if word1 in fullDICT:
            countKN += 1
            if isCorrect:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if isCorrect:
                countCorrectUNKN += 1

    numwords = len(tagged)
    return (percentage(countCorrectKN, countKN),
            percentage(countCorrectUNKN, countUNKN),
            percentage(count, numwords))
示例#7
0
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall accuracy

    Both corpora are read as whitespace-separated tokens and compared
    position by position. A word is "known" when it is contained in the
    lexicon loaded from ``fullDictFile``. Tags are compared
    case-insensitively.

    Returns a tuple ``(known, unknown, overall)`` of percentages; a category
    without any token is reported as ``0.0`` instead of raising
    ``ZeroDivisionError``. The integer ``0`` is returned (after printing a
    message) when the corpora differ in token count or contain different
    words at the same position.

    Written so that it parses on both Python 2.6+ and Python 3
    (single-argument ``print(...)``, no ``xrange``); the printed text is
    unchanged.
    """
    # Context managers close both files even if reading fails.
    with open(taggedCorpus, "r") as taggedFile:
        tagged = taggedFile.read().split()
    with open(goldStandardCorpus, "r") as goldFile:
        goldStandard = goldFile.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)

    def percentage(correct, total):
        # An empty category scores 0.0 rather than dividing by zero.
        return correct * 100.0 / total if total else 0.0

    count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i, (taggedToken, goldToken) in enumerate(zip(tagged, goldStandard)):
        word1, tag1 = getWordTag(taggedToken)
        word2, tag2 = getWordTag(goldToken)
        # Tokens whose word is '' on either side are exempt from the
        # word-equality check.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index " + str(i))
            return 0

        isCorrect = tag1.lower() == tag2.lower()
        if isCorrect:
            count += 1

        if word1 in fullDICT:
            countKN += 1
            if isCorrect:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if isCorrect:
                countCorrectUNKN += 1

    numwords = len(tagged)
    return (percentage(countCorrectKN, countKN),
            percentage(countCorrectUNKN, countUNKN),
            percentage(count, numwords))
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            rdrTree.learnRDRTree(args[1] + ".INIT", args[1])
            print "\nWrite the learned tree model to file ", args[1] + ".RDR"
            rdrTree.writeToFile(args[1] + ".RDR")
            print '\nDone!'
            os.remove(args[1] + ".INIT")
            os.remove(args[1] + ".RAW")
            os.remove(args[1] + ".sDict")
        except Exception, e:
            print "\nERROR ==> ", e
            printHelp()
    elif args[0].lower() == "tag":
        try:
            r = RDRPOSTagger4En()
            print "\n=> Read an English POS tagging model from", args[1]
            r.constructSCRDRtreeFromRDRfile(args[1])
            print "\n=> Read an English lexicon from", args[2]
            DICT = readDictionary(args[2])
            print "\n=> Perform English POS tagging on", args[3]
            r.tagRawEnCorpus(DICT, args[3])
        except Exception, e:
            print "\nERROR ==> ", e
            printHelp()
    else:
        printHelp()


if __name__ == "__main__":
    # Invoke the command-line entry point only when executed as a script.
    run()
            print '\nLearn a tree model of rules for English POS tagging from %s and %s' % (args[1], args[1] + ".INIT")       
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1]) 
            rdrTree.learnRDRTree(args[1] + ".INIT", args[1])  
            print "\nWrite the learned tree model to file ", args[1] + ".RDR"
            rdrTree.writeToFile(args[1] + ".RDR")                
            print '\nDone!'    
            os.remove(args[1] + ".INIT")
            os.remove(args[1] + ".RAW")
            os.remove(args[1] + ".sDict")   
        except Exception, e:
            print "\nERROR ==> ", e
            printHelp()
    elif args[0].lower() == "tag":
        try:
            r = RDRPOSTagger4En()
            print "\n=> Read an English POS tagging model from", args[1]
            r.constructSCRDRtreeFromRDRfile(args[1])
            print "\n=> Read an English lexicon from", args[2]
            DICT = readDictionary(args[2])
            print "\n=> Perform English POS tagging on", args[3]
            r.tagRawEnCorpus(DICT, args[3])
        except Exception, e:
            print "\nERROR ==> ", e
            printHelp()
    else:
        printHelp()
        
if __name__ == "__main__":
    # Invoke the command-line entry point only when executed as a script.
    run()
示例#10
0
 def englishSetup(self):
     """Initialize this instance with the English setup.

     Builds the rule tree from ``English.RDR`` and stores the lexicon read
     from ``English.DICT`` in ``self.DICT``; both files are located under
     ``<current_python_file_dir>/Models/POS/``.
     """
     modelPath = current_python_file_dir + "/Models/POS/English.RDR"
     self.constructSCRDRtreeFromRDRfile(modelPath)

     lexiconPath = current_python_file_dir + "/Models/POS/English.DICT"
     self.DICT = readDictionary(lexiconPath)
示例#11
0
		if e in e_to_f:
			e_to_f[e][f] = p
		else:
			e_to_f[e] = {}
			e_to_f[e][f] = p

	return e_to_f

# --- Script set-up: configuration, tagger, input and output files ---

# The first two command-line arguments are handed to read_config; the
# result is indexed by string keys below.
configs = read_config(sys.argv[1], sys.argv[2])
foreign_language = configs["foreign_language"]
# Model and lexicon paths are relative to the current working directory.
model_rdr = "./Models/UniPOS/UD_"+foreign_language+"/train.UniPOS.RDR"
model_dict = "./Models/UniPOS/UD_"+foreign_language+"/train.UniPOS.DICT"
foreign_language_tagger = RDRPOSTagger()
foreign_language_tagger.constructSCRDRtreeFromRDRfile(model_rdr)
foreign_language_dictionary = readDictionary(model_dict)
# Numeric settings are converted from their configured values.
k = int(configs["k"])
mu = float(configs["mu"])
sigma = float(configs["sigma"])

# NOTE(review): `file` shadows the Python 2 builtin of the same name, and
# neither handle is closed within the visible part of the script.
file = codecs.open(configs["input_file"],"r","utf-8")
bilingual_dictionary = read_dictionary(configs["bilingual_dictionary"])
output_file = codecs.open(configs["output_ranking"],"w","utf-8")

# Each input line is split on tabs; the second field is collected as the
# English text and the third as the foreign text (both stripped). A line
# with fewer than three fields raises IndexError.
english_lines = []
foreign_lines = []
for line in file:
	tokens = line.split("\t")
	english_lines.append(tokens[1].strip())
	foreign_lines.append(tokens[2].strip())
# NOTE(review): defaultdict() without a default factory raises KeyError on
# missing keys, just like a plain dict -- confirm this is intended.
output = defaultdict()