def testExtractCandidateMWEs(self):
     bilExpr=BilingualExpr()
     bilExpr.set_exprs(self.extExpr,self.extExpr)
     self.assertTrue(bilExpr.is_equal_sides())
     
     candidateMWEs=bilExpr.extract_candidate_mwes()
     self.assertEqual(len(candidateMWEs), 4)
     for mwestr in candidateMWEs:
         mwe =ParallelMWE()
         mwe.parse(" | ".join(mwestr.split(" | ")[2:]))
         self.assertTrue(mwe.is_equal_sides())
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.myexpr=pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
     self.myexprw=pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
     self.myexprq=pgf.readExpr("CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))")
     self.myexprs=pgf.readExpr('(PredVP (DetCN (DetQuant IndefArt NumSg) (PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) (UseQuantPN DefArt (SymbPN (MkSymb "U"))))) (UseComp (CompNP (MassNP (UseN dollar_N)))))')
     self.extExpr=ExtendedExpr(self.myexpr,None)
     self.extExprW=ExtendedExpr(self.myexprw,None)
     self.extExprQ=ExtendedExpr(self.myexprq,None)
     self.extExprS=ExtendedExpr(self.myexprs,None)
     self.bilingualPhraseSet=BilingualPhraseSet()
     self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")
     
     self.mwe1=ParallelMWE()
     self.mwe1.parse("( MassNP ( UseN safety_N ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )")
     
     self.mwe2=ParallelMWE()
     self.mwe2.parse("( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) ) | ( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )")
     
     self.bilphrase=BilingualExpr()
     self.bilphrase.parse("( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )  | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )", ignoreFreq=True)
     
     synDict=dict()
     synDict["politics_N"]=set(["policy_N"])
     ParallelMWE.synonymDict=synDict
Пример #3
0
    
    mwes=list()
    #read MWEs
    for line in sys.stdin:
        line=line.strip()
        mwe=ParallelMWE()
        mwe.parse(line)
        mwes.append(mwe)
    
    reprlistofnonleafs=mwes[0].get_representative()
    
    bilExprs=list()
    #read bilingual exprs
    for line in gzip.open(args.bilingual_exprs,'r'):
        line=line.strip()
        bilExpr=BilingualExpr()
        bilExpr.parse(line)
        if bilExpr.slexpr.get_non_leaf_funtions() == reprlistofnonleafs or args.only_print_scores:
            bilExprs.append(bilExpr)
    
    if args.only_print_scores:
        for mwe in mwes:
            mwe.compute_reproduced_and_matching_bilexprs([bilExpr for bilExpr in bilExprs if bilExpr.slexpr.get_non_leaf_funtions()== mwe.get_representative()])
            print mwe
    else:
        solution=select_mwes((mwes,bilExprs),int(args.threshold),0.0)
        for mwe in solution:
            print mwe

    #pool = Pool()
    #result = pool.map(abstractLearningLib.select_mwes , abstractLearningLib.MWEReader.s_groups)
class ExtendedExprTest(unittest.TestCase):
    
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.myexpr=pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
        self.myexprw=pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
        self.myexprq=pgf.readExpr("CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))")
        self.myexprs=pgf.readExpr('(PredVP (DetCN (DetQuant IndefArt NumSg) (PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) (UseQuantPN DefArt (SymbPN (MkSymb "U"))))) (UseComp (CompNP (MassNP (UseN dollar_N)))))')
        self.extExpr=ExtendedExpr(self.myexpr,None)
        self.extExprW=ExtendedExpr(self.myexprw,None)
        self.extExprQ=ExtendedExpr(self.myexprq,None)
        self.extExprS=ExtendedExpr(self.myexprs,None)
        self.bilingualPhraseSet=BilingualPhraseSet()
        self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")
        
        self.mwe1=ParallelMWE()
        self.mwe1.parse("( MassNP ( UseN safety_N ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )")
        
        self.mwe2=ParallelMWE()
        self.mwe2.parse("( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) ) | ( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )")
        
        self.bilphrase=BilingualExpr()
        self.bilphrase.parse("( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )  | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )", ignoreFreq=True)
        
        synDict=dict()
        synDict["politics_N"]=set(["policy_N"])
        ParallelMWE.synonymDict=synDict
    
    def testNonLeafFunList(self):
        listOfFuns=self.extExpr.get_non_leaf_funtions()
        assert listOfFuns == ['AdjCN', 'PositA', 'UseN']
        
    def testLeafFunList(self):
        listOfFuns=self.extExpr.get_leaf_functions()
        assert listOfFuns == ['crucial_A','item_N']
        
        listOfFuns=self.extExprS.get_leaf_functions()
        self.assertEqual(listOfFuns , ['IndefArt','NumSg','complete_A','collapse_N','DefArt','String_U','dollar_N'])
        
        listOfFuns=self.extExprQ.get_leaf_functions()
        self.assertEqual(listOfFuns, ['?','wildcard_3','wildcard_1','wildcard_2']) 
    
    def testWildcardFunList(self):
        listOfFuns=self.extExprW.get_wildcard_leaf_functions()
        self.assertEqual(listOfFuns,['wildcard_1'])
    
    def testExtractCandidateMWEs(self):
        bilExpr=BilingualExpr()
        bilExpr.set_exprs(self.extExpr,self.extExpr)
        self.assertTrue(bilExpr.is_equal_sides())
        
        candidateMWEs=bilExpr.extract_candidate_mwes()
        self.assertEqual(len(candidateMWEs), 4)
        for mwestr in candidateMWEs:
            mwe =ParallelMWE()
            mwe.parse(" | ".join(mwestr.split(" | ")[2:]))
            self.assertTrue(mwe.is_equal_sides())
    
    def testPrint(self):
        strrep=str(self.extExpr)
        myexpragain=pgf.readExpr(strrep)
        self.assertEqual(str(self.myexpr), str(myexpragain))
        
        strrep=str(self.extExprS)
        myexpragain=pgf.readExpr(strrep)
        self.assertEqual(str(self.myexprs), str(myexpragain))
    
    def testBilingualPhraseSet(self):
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "OTAN"))
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la OTAN"))
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la"))
    
    def testCompositionally(self):
        self.assertFalse(self.mwe1.is_bilexpr_matched_or_reproduced(self.bilphrase).reproduced)
#!/usr/bin/env python
# coding=utf-8
# -*- encoding: utf-8 -*-

from lib.abstractLearningLib import BilingualExpr, set_debug, \
    GFProbabilisticBilingualDictionary, ParallelMWE
import sys
import argparse

if __name__ == "__main__":
    
    parser = argparse.ArgumentParser(description='Chooses rules.')
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms',action='store_true')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])
    
    set_debug(args.debug)
    
    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)
    
    for line in sys.stdin:
    #for line in ['1 | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron he_Pron) NumPl) (UseN supporter_N)) | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron it_Pron) NumPl) (UseN backer_N))']:
        line=line.strip()
        bilExpr=BilingualExpr()
        bilExpr.parse(line)
        for candidatemwe in bilExpr.extract_candidate_mwes():
            print candidatemwe