def setUp(self):
    """Create the expressions, phrase set and MWEs used by the test methods."""
    unittest.TestCase.setUp(self)

    # Parsed abstract-syntax expressions: one without wildcards, one with a
    # single wildcard leaf, one mixing `?` with wildcards, and one larger
    # tree that contains a string literal.
    self.myexpr = pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
    self.myexprw = pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
    self.myexprq = pgf.readExpr(
        "CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))"
    )
    self.myexprs = pgf.readExpr(
        '(PredVP (DetCN (DetQuant IndefArt NumSg) '
        '(PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) '
        '(UseQuantPN DefArt (SymbPN (MkSymb "U"))))) '
        '(UseComp (CompNP (MassNP (UseN dollar_N)))))'
    )

    # The same expressions wrapped as ExtendedExpr (second argument None).
    self.extExpr = ExtendedExpr(self.myexpr, None)
    self.extExprW = ExtendedExpr(self.myexprw, None)
    self.extExprQ = ExtendedExpr(self.myexprq, None)
    self.extExprS = ExtendedExpr(self.myexprs, None)

    # Phrase set holding one entry in "source ||| target ||| alignment" form.
    self.bilingualPhraseSet = BilingualPhraseSet()
    self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")

    # Parallel MWEs, written as "<first side> | <second side>".
    self.mwe1 = ParallelMWE()
    self.mwe1.parse(
        "( MassNP ( UseN safety_N ) )"
        " | "
        "( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )"
    )
    self.mwe2 = ParallelMWE()
    self.mwe2.parse(
        "( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) )"
        " | "
        "( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )"
    )

    # Bilingual expression parsed with ignoreFreq=True.
    self.bilphrase = BilingualExpr()
    self.bilphrase.parse(
        "( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )"
        " | "
        "( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )",
        ignoreFreq=True,
    )

    # Install a one-entry synonym dictionary on the ParallelMWE class itself
    # (class attribute, so it is visible to every ParallelMWE instance).
    ParallelMWE.synonymDict = {"politics_N": {"policy_N"}}
def testExtractCandidateMWEs(self):
    # Pair an expression with itself: both sides are equal, and four
    # candidate MWEs are expected to be extracted from the pair.
    pair = BilingualExpr()
    pair.set_exprs(self.extExpr, self.extExpr)
    self.assertTrue(pair.is_equal_sides())

    candidates = pair.extract_candidate_mwes()
    self.assertEqual(len(candidates), 4)

    for candidate in candidates:
        parsed = ParallelMWE()
        # Skip the first two " | "-separated fields of the candidate string
        # and parse what remains as a ParallelMWE.
        fields = candidate.split(" | ")
        parsed.parse(" | ".join(fields[2:]))
        self.assertTrue(parsed.is_equal_sides())
import gzip
import sys

if __name__ == "__main__":
    # Command-line interface.
    parser = argparse.ArgumentParser(description='Chooses rules.')
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms', action='store_true')
    parser.add_argument('--additional_references')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])

    set_debug(args.debug)

    # read synonyms
    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)

    # Optionally pre-load MWEs from a gzip file, one MWE per line.
    mweset = ParallelMWESet()
    if args.additional_references:
        # The `with` block closes the gzip handle even when parsing or adding
        # a line raises; the former trailing close() was skipped in that case.
        with gzip.open(args.additional_references) as myfile:
            for line in myfile:
                mwe = ParallelMWE()
                mwe.parse(line)
                mweset.add(mwe)

    # read mwes
    mwelist = list()
    for line in sys.stdin:
        line = line.strip()
class ExtendedExprTest(unittest.TestCase):
    """Tests for ExtendedExpr, BilingualExpr, BilingualPhraseSet and ParallelMWE."""

    def setUp(self):
        """Create the expressions, phrase set and MWEs used by the test methods."""
        unittest.TestCase.setUp(self)

        # Parsed abstract-syntax expressions: one without wildcards, one with
        # a single wildcard leaf, one mixing `?` with wildcards, and one
        # larger tree that contains a string literal.
        self.myexpr = pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
        self.myexprw = pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
        self.myexprq = pgf.readExpr(
            "CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))"
        )
        self.myexprs = pgf.readExpr(
            '(PredVP (DetCN (DetQuant IndefArt NumSg) '
            '(PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) '
            '(UseQuantPN DefArt (SymbPN (MkSymb "U"))))) '
            '(UseComp (CompNP (MassNP (UseN dollar_N)))))'
        )

        # The same expressions wrapped as ExtendedExpr (second argument None).
        self.extExpr = ExtendedExpr(self.myexpr, None)
        self.extExprW = ExtendedExpr(self.myexprw, None)
        self.extExprQ = ExtendedExpr(self.myexprq, None)
        self.extExprS = ExtendedExpr(self.myexprs, None)

        # Phrase set holding one entry in "source ||| target ||| alignment" form.
        self.bilingualPhraseSet = BilingualPhraseSet()
        self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")

        # Parallel MWEs, written as "<first side> | <second side>".
        self.mwe1 = ParallelMWE()
        self.mwe1.parse(
            "( MassNP ( UseN safety_N ) )"
            " | "
            "( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )"
        )
        self.mwe2 = ParallelMWE()
        self.mwe2.parse(
            "( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) )"
            " | "
            "( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )"
        )

        # Bilingual expression parsed with ignoreFreq=True.
        self.bilphrase = BilingualExpr()
        self.bilphrase.parse(
            "( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )"
            " | "
            "( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )",
            ignoreFreq=True,
        )

        # Install a one-entry synonym dictionary on the ParallelMWE class
        # itself (class attribute, so it is shared by all instances).
        ParallelMWE.synonymDict = {"politics_N": {"policy_N"}}

    def testNonLeafFunList(self):
        # assertEqual (not a bare assert) so the check survives `python -O`
        # and reports a list diff on failure.
        listOfFuns = self.extExpr.get_non_leaf_funtions()
        self.assertEqual(listOfFuns, ['AdjCN', 'PositA', 'UseN'])

    def testLeafFunList(self):
        listOfFuns = self.extExpr.get_leaf_functions()
        self.assertEqual(listOfFuns, ['crucial_A', 'item_N'])

        # The string literal "U" is expected to be reported as 'String_U'.
        listOfFuns = self.extExprS.get_leaf_functions()
        self.assertEqual(
            listOfFuns,
            ['IndefArt', 'NumSg', 'complete_A', 'collapse_N', 'DefArt',
             'String_U', 'dollar_N'],
        )

        # `?` and wildcard leaves are expected in the leaf list as well.
        listOfFuns = self.extExprQ.get_leaf_functions()
        self.assertEqual(
            listOfFuns, ['?', 'wildcard_3', 'wildcard_1', 'wildcard_2']
        )

    def testWildcardFunList(self):
        listOfFuns = self.extExprW.get_wildcard_leaf_functions()
        self.assertEqual(listOfFuns, ['wildcard_1'])

    def testExtractCandidateMWEs(self):
        # Pair an expression with itself: both sides are equal, and four
        # candidate MWEs are expected to be extracted from the pair.
        bilExpr = BilingualExpr()
        bilExpr.set_exprs(self.extExpr, self.extExpr)
        self.assertTrue(bilExpr.is_equal_sides())

        candidateMWEs = bilExpr.extract_candidate_mwes()
        self.assertEqual(len(candidateMWEs), 4)

        for mwestr in candidateMWEs:
            mwe = ParallelMWE()
            # Skip the first two " | "-separated fields of the candidate
            # string and parse what remains as a ParallelMWE.
            mwe.parse(" | ".join(mwestr.split(" | ")[2:]))
            self.assertTrue(mwe.is_equal_sides())

    def testPrint(self):
        # str(ExtendedExpr) must be readable by pgf.readExpr and print back
        # identically to the expression it was built from.
        strrep = str(self.extExpr)
        myexpragain = pgf.readExpr(strrep)
        self.assertEqual(str(self.myexpr), str(myexpragain))

        strrep = str(self.extExprS)
        myexpragain = pgf.readExpr(strrep)
        self.assertEqual(str(self.myexprs), str(myexpragain))

    def testBilingualPhraseSet(self):
        # With "NATO ||| la OTAN ||| 0-0 0-1" loaded, "NATO" is expected to
        # pair with "OTAN", with "la OTAN" and with "la".
        self.assertTrue(
            self.bilingualPhraseSet.contains_biligual_phrase("NATO", "OTAN"))
        self.assertTrue(
            self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la OTAN"))
        self.assertTrue(
            self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la"))

    def testCompositionally(self):
        # mwe1 (safety/security) must not be flagged as reproduced by the
        # politics/policy bilingual phrase.
        result = self.mwe1.is_bilexpr_matched_or_reproduced(self.bilphrase)
        self.assertFalse(result.reproduced)
if __name__ == "__main__":
    # Command-line interface.
    parser = argparse.ArgumentParser(
        description='select minimum amount of parallel MWEs to reproduce the bilingual expr.'
    )
    parser.add_argument('--only_print_scores', action='store_true')
    parser.add_argument('--bilingual_exprs', required=True)
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms', action='store_true')
    parser.add_argument('--invert_synonym_direction', action='store_true')
    parser.add_argument('--threshold', default='2')
    parser.add_argument('--debug', action='store_true')
    # parse_args() with no argument reads sys.argv[1:].
    args = parser.parse_args()

    set_debug(args.debug)

    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(
            args.use_synonyms,
            args.inverse_synonyms,
            args.invert_synonym_direction,
        )

    # read MWEs: one per stdin line
    mwes = []
    for line in sys.stdin:
        line = line.strip()
        mwe = ParallelMWE()
        mwe.parse(line)
        mwes.append(mwe)

    # Taken from the first MWE only; raises IndexError when stdin was empty.
    reprlistofnonleafs = mwes[0].get_representative()

    # read bilingual exprs from the gzip file given by --bilingual_exprs
    bilExprs = []
    for line in gzip.open(args.bilingual_exprs, 'r'):
        line = line.strip()
#!/usr/bin/env python
# coding=utf-8
# -*- encoding: utf-8 -*-
from lib.abstractLearningLib import BilingualExpr, set_debug, \
    GFProbabilisticBilingualDictionary, ParallelMWE
import sys
import argparse

if __name__ == "__main__":
    # Command-line interface.
    parser = argparse.ArgumentParser(description='Chooses rules.')
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms', action='store_true')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])

    set_debug(args.debug)

    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)

    # Each stdin line is parsed as one bilingual expression and every
    # candidate MWE extracted from it is printed on its own line.
    #
    # Sample input line (kept from the original source for manual debugging):
    #   1 | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron he_Pron) NumPl) (UseN supporter_N)) | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron it_Pron) NumPl) (UseN backer_N))
    for line in sys.stdin:
        line = line.strip()
        bilExpr = BilingualExpr()
        bilExpr.parse(line)
        for candidatemwe in bilExpr.extract_candidate_mwes():
            # Parenthesised single-argument form: identical output under
            # Python 2 and valid syntax under Python 3.
            print(candidatemwe)
# Command-line interface: each boolean flag switches on one validity check.
parser = argparse.ArgumentParser(description='filter final MWEs')
parser.add_argument('--different_sides',action='store_true')
parser.add_argument('--contains_lexical',action='store_true')
parser.add_argument('--not_contains_lexical',action='store_true')
parser.add_argument('--contains_non_wildcard',action='store_true')
parser.add_argument('--contains_wildcard',action='store_true')
parser.add_argument('--not_contains_wildcard',action='store_true')
parser.add_argument('--use_synonyms')
parser.add_argument('--inverse_synonyms',action='store_true')
args = parser.parse_args(sys.argv[1:])

# MWEs are read from standard input, one per line.
inputSource=sys.stdin

if args.use_synonyms:
    ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)

for line in inputSource:
    line=line.strip()
    mwe=ParallelMWE()
    mwe.parse(line)

    # isValid is the conjunction of every check enabled on the command line.
    isValid=True
    if args.different_sides:
        # Requires the two sides of the MWE to differ.
        isValid = isValid and not mwe.is_equal_sides()
    if args.contains_lexical:
        # Requires at least one open leaf function on either side.
        isValid = isValid and ( len(mwe.slexpr.get_open_leaf_functions())>0 or len(mwe.tlexpr.get_open_leaf_functions())>0 )
    # NOTE(review): the body of this branch lies outside the visible part of the file.
    if args.not_contains_lexical: