Пример #1
0
	def no_test_tonto(self):
		
		taggroups=self.taggroups
		
		bilphrasesSet=ruleLearningLib.AlignmentTemplateSet(taggroups)
		originalATList=list()
		
		numAt=0
		print >> sys.stderr, "Reading ALignment Templates/ Bilingual Phrases...."
		for line in sys.stdin:
			numAt+=1
			
			line=line.decode('utf-8').strip()
			at = ruleLearningLib.AlignmentTemplate()
			
			piecesOfline=line.split(u'|')
			textat=u'|'.join(piecesOfline[1:5])
			freq=piecesOfline[0].strip()
			
			sllemmastext=piecesOfline[5].strip()
			tllemmastext=piecesOfline[6].strip()
			sllemmas=sllemmastext.split(u'\t')
			tllemmas=tllemmastext.split(u'\t')
			
			at.parse(textat)
			
			
			at.freq=int(freq)
							
			tl_lemmas_from_dictionary_text=piecesOfline[7].strip()
			tl_lemmas_from_dictionary_list=tl_lemmas_from_dictionary_text.split(u'\t')
			
			bilphrase=copy.deepcopy(at)
			bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list
			bilphrase.lexicalise_all(sllemmas,tllemmas)
			bilphrase.id=numAt
			bilphrasesSet.add(bilphrase)
			
			originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list))
			
			#print bilphrase.tl_lemmas_from_dictionary
			
		print >> sys.stderr, " ....."+str(len(originalATList))+" items."
		solution=generaliseATs.generalise_by_linear_program(bilphrasesSet,originalATList,taggroups)
		
		for at in solution:
			print at
import argparse
import ruleLearningLib
import sys,gzip,math

ENCODING='utf-8'

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Filter TT1 alignment templates and keep only those with the lexical categories from a TT2.0 solution')
    parser.add_argument('--allowed_boxes')
    args = parser.parse_args(sys.argv[1:])
    
    allowedseqs=set()
    for line in open(args.allowed_boxes):
        line=line.decode(ENCODING).strip()
        cats=line.split(u"__")
        allowedseqs.add(tuple(cats))
    
    for line in sys.stdin:
        line=line.decode(ENCODING).strip()
        at = ruleLearningLib.AlignmentTemplate()
        at.parse(line)
        slseq=tuple([ l.get_pos() for l in at.parsed_sl_lexforms ])
        if slseq in allowedseqs:
            print line.encode(ENCODING)
Пример #3
0
    # When True, the parsed AT and every checked bilingual phrase are
    # echoed to stderr.
    DEBUG = False

    # Command line:
    #   --alignment_template   textual alignment template (UTF-8 encoded)
    #   --tag_groups_file_name file name passed to AT_LexicalTagsProcessor.initialize
    #   --emptyrestrictionsmatcheverything  flag (not used in the visible part)
    #   --tt1_beam             flag that changes how input lines are parsed (see below)
    parser = argparse.ArgumentParser(
        description='Chooses alignment templates.')
    parser.add_argument('--alignment_template', required=True)
    parser.add_argument('--tag_groups_file_name', required=True)
    parser.add_argument('--emptyrestrictionsmatcheverything',
                        action='store_true')
    parser.add_argument('--tt1_beam', action='store_true')
    args = parser.parse_args(sys.argv[1:])

    # NOTE(review): the second argument is None here; its meaning is not
    # visible in this file -- confirm against ruleLearningLib.
    ruleLearningLib.AT_LexicalTagsProcessor.initialize(
        args.tag_groups_file_name, None)

    # Parse the alignment template given on the command line.
    myAT = ruleLearningLib.AlignmentTemplate()
    myAT.parse(args.alignment_template.decode('utf-8'))
    if DEBUG:
        print >> sys.stderr, "AT: " + myAT.to_string()

    # One bilingual phrase per stdin line; the first '|'-separated field is
    # dropped before the rest of the line is parsed.
    for line in sys.stdin:
        line = line.strip().decode('utf-8')
        bilphrase = ruleLearningLib.AlignmentTemplate()
        if not args.tt1_beam:
            bilphrase.parse(u'|'.join(line.split(u'|')[1:]))
        else:
            # NOTE(review): with --tt1_beam an extra positional True is
            # passed to parse(); what it enables is not visible here.
            bilphrase.parse(u'|'.join(line.split(u'|')[1:]), True)
        # Explicit restrictions are only added when --tt1_beam is not set.
        if not args.tt1_beam:
            bilphrase.add_explicit_restrictions()
        if DEBUG:
            print >> sys.stderr, "Checking: " + bilphrase.to_string()
Пример #4
0
	def setUp(self):
		# Builds the fixtures used by the tests of this class:
		#   * at1..at20 and bil18..bil20: AlignmentTemplate objects parsed
		#     from the literals below,
		#   * bil18..bil20 additionally get tl_lemmas_from_dictionary,
		#   * self.atlist: [at1..at5],
		#   * self.taggroups: result of read_tag_groups on the "taggroups" file.
		# Finally AT_LexicalTagsProcessor is initialised from the files
		# "taggroups" and "tagsequences" (resolved relative to the cwd).
		self.at1 = ruleLearningLib.AlignmentTemplate()
		self.at2 = ruleLearningLib.AlignmentTemplate()
		self.at3 = ruleLearningLib.AlignmentTemplate()
		self.at4 = ruleLearningLib.AlignmentTemplate()
		self.at5 = ruleLearningLib.AlignmentTemplate()
		self.at6 = ruleLearningLib.AlignmentTemplate()
		self.at7 = ruleLearningLib.AlignmentTemplate()
		self.at8 = ruleLearningLib.AlignmentTemplate()
		self.at9 = ruleLearningLib.AlignmentTemplate()
		self.at10 = ruleLearningLib.AlignmentTemplate()
		self.at11 = ruleLearningLib.AlignmentTemplate()
		self.at12 = ruleLearningLib.AlignmentTemplate()
		self.at13 = ruleLearningLib.AlignmentTemplate()
		self.at14 = ruleLearningLib.AlignmentTemplate()
		self.at15 = ruleLearningLib.AlignmentTemplate()
		self.at16 = ruleLearningLib.AlignmentTemplate()
		self.at17 = ruleLearningLib.AlignmentTemplate()
		
		# Pairs of (template, bilingual phrase) fixtures.
		self.at18 = ruleLearningLib.AlignmentTemplate()
		self.bil18 = ruleLearningLib.AlignmentTemplate()
		
		self.at19= ruleLearningLib.AlignmentTemplate()
		self.bil19= ruleLearningLib.AlignmentTemplate()
		
		self.at20= ruleLearningLib.AlignmentTemplate()
		self.bil20= ruleLearningLib.AlignmentTemplate()

		# Each literal has four '|'-separated fields; the third one holds
		# index pairs such as "0:0 1:1".
		self.at1.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det> <n><f><pl>")
		self.at2.parse(u"<det><def><m><sg> dinero<n><m><sg> | <det><def><m><pl> diner<n><m><pl> | 0:0 1:1 | <det><def> <n><m><pl>")
		self.at3.parse(u"<det><def><m><sg> <n><m><sg> | <det><def><f><pl> <n><f><pl> | 0:0 1:1 | <det><def> <n><f><pl>")
		self.at4.parse(u"<det><ind><*gender><*numberat> <n><*gender><*numberat> | <det><ind><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det> <n>")
		self.at5.parse(u"<det><def><*gender><*numberat> <n><*gender><*numberat> | <det><def><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at6.parse(u"<vbser><ifi><p3><sg> <vblex><pp><m><sg> | anar<vaux><p3><sg> <vbser><inf> <vblex><pp><m><sg> | 0:1 1:2 | <vbser> <vblex>")
		self.at7.parse(u"<det><def><m><*numberat> <n><m><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at8.parse(u"<det><def><m><*numberat> <n><f><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at9.parse(u"<det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | <det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det> <n>")
		self.at10.parse(u"<det><def><f><sg> <n><f><sg> | <det><def><f><pl> <n><f><pl> | 0:0 1:1 | <det><def> <n><f><pl>")
		
		self.at13.parse(u"el<det><def><m><*numberat> <n><f><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at14.parse(u"el<det><def><*gender><*numberat> <n><f><sg> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at15.parse(u"el<det><def><*gender><*numberat> <n><m><sg> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
		self.at16.parse(u"<det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | <det><)000determinertype><)001gender><)001numberat> <n><)001gender><)001numberat> | 0:0 1:1 | <det> <n>")
		self.at17.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det> <n><f><pl>")
		
		self.at11.parse(u"<vbser><ifi><p3><sg> <vblex><pp><m><sg> | anar<vaux><p3><sg> <vbser><inf> <vblex><pp><m><sg> | 0:1 1:2 | <vbser> <vblex>")
		
		self.at12.parse(u"suyo<det><pos><mf><sp> <n><empty_tag_ntype><f><sg> <pr> | el<det><def><f><sp> <n><empty_tag_ntype><f><sg> <pr> | 0:0 1:1 2:2 | __CLOSEWORD__ <n> <pr>")
		
		self.at18.parse(u"<det><def><f><*numberat> <n><empty_tag_ntype><f><*numberat> | <det><def><f><)000numberat> <n><empty_tag_ntype><f><)001numberat> | 0:0 1:1 | <det><def><f><pl> <n>")
		self.bil18.parse(u"el<det><def><f><pl> casa<n><empty_tag_ntype><f><pl> | el<det><def><f><pl> casa<n><empty_tag_ntype><f><pl> | 0:0 1:1 | <det><def> <n>")
		self.bil18.tl_lemmas_from_dictionary=[u"el",u"casa"]
		
		self.at19.parse(u"el<det><def><*gender><*numberat> <n><empty_tag_ntype><*gender><*numberat> | <n><empty_tag_ntype><)001gender><)000numberat> | 0:0 1:0 | <det> <n>")
		self.bil19.parse(u"el<det><def><f><sg> casa<n><empty_tag_ntype><f><sg> | casa<n><empty_tag_ntype><f><sg> | 0:0 1:0 | <det> <n>")
		self.bil19.tl_lemmas_from_dictionary=[u"el",u"casa"]
		
		self.at20.parse(u"<n><empty_tag_ntype><f><*numberat> <adj><empty_tag_adjtype><f><*numberat> | <n><empty_tag_ntype><f><)000numberat> <adj><empty_tag_adjtype><f><)001numberat> | 0:0 1:1 | <n><empty_tag_ntype><f> <adj><empty_tag_adjtype><f>")
		self.bil20.parse(u"companyia<n><empty_tag_ntype><f><pl> elèctric<adj><empty_tag_adjtype><f><pl> | compañía<n><empty_tag_ntype><f><pl> eléctrico<adj><empty_tag_adjtype><f><pl> | 0:0 1:1 | <n> <adj>")
		self.bil20.tl_lemmas_from_dictionary=[u"compañía",u"eléctrico"]
		
		self.atlist=[self.at1,self.at2,self.at3,self.at4,self.at5]
		
		# 'with' closes the file even when read_tag_groups raises, so a
		# failing setUp does not leak the handle across the test run.
		with open("taggroups",'r') as myfile:
			self.taggroups=ruleLearningLib.read_tag_groups(myfile)
		
		ruleLearningLib.AT_LexicalTagsProcessor.initialize("taggroups","tagsequences")
Пример #5
0
	def test_explicit_restrictions(self):
		# A clone of at1 (so the shared fixture is not modified) after
		# add_explicit_restrictions() must equal the template below, which
		# differs from at1 only in its fourth '|'-separated field.
		candidate=self.at1.fast_clone()
		candidate.add_explicit_restrictions()
		
		reference=ruleLearningLib.AlignmentTemplate()
		reference.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det><ind><m><sg> <n><f><pl>")
		
		self.assertEqual(candidate, reference)