        self.static_dics.setdefault(ls[0].decode('utf-8'), []).extend(ls[1].split(","))

    def __getitem__(self, tokens):
        # only single-token keys exist in the static dictionary
        if len(tokens) == 1:
            return self.static_dics.get(tokens[0], [])
        else:
            return []

    def __setitem__(self, tok, cats):
        self.static_dics[tok] = cats

    def has_key(self, tok):
        return (tok in self.static_dics)

    def get(self, toklist, defval):
        # defval is unused: __getitem__ already falls back to []
        ret = self.__getitem__(toklist)
        return ret

parser = CCGParser()
parser.combinators = [LApp, RApp, LB, RB, Conj, RT("NP[sbj]"), LBx]
parser.terminators = ["ROOT", "S", "S[wq]", "S[q]", "S[imp]"]
parser.lexicon = Lexicon()
parser.concatenator = ""

def tokenize(s):
    # whitespace tokenization; a sentence-final "." becomes its own token
    if len(s) == 0:
        return s
    elif s[-1] == ".":
        tokens = s[:-1].split()
        tokens.append(s[-1])
        return tokens
    else:
        return s.split()
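# Usage sketch (an addition, not part of the original source; it assumes the
# lexicon behind Lexicon() covers these tokens and that "." carries a
# terminator category). tokenize() detaches the final period so the lexicon
# entry for "." can close the derivation; parse trees expose leaves with
# .token and .catname, as the run() helpers later in this listing show.
tokens = tokenize("this is a pen.")  # -> ['this', 'is', 'a', 'pen', '.']
for t in parser.parse(tokens):
    for r in t.leaves():
        print(u"{0}\t{1}".format(r.token, r.catname))
    break  # keep the first derivation only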
"(NP/NP)\\NP" ] lexicon["don't"] = ["(S\\NP)/(S\\NP)"] return lexicon #-- special rule for English def Rel(lt, rt): if lt != Symbol("NP"): return None if rt == [BwdApp, Symbol("S[pss]"), Symbol("NP")]: return lt return None parser = CCGParser() parser.combinators = [ LApp, RApp, LB, RB, LT("NP"), LT("S\\NP"), RT("NP"), Conj, SkipComma, Rel ] parser.terminators = ["ROOT", "S", "S[q]", "S[wq]", "S[imp]"] parser.lexicon = default_lexicon() parser.concatenator = " " def run(text, type=0): for tokens in tokenize(text): print(u"test run : tokens={0}".format(str(tokens))) for t in parser.parse(tokens):
def default_lexicon():
    ret = {}
    ret[u"。"] = ["ROOT\\S", "ROOT\\S[imp]", "ROOT\\S[q]", "ROOT\\S[wq]"]
    ret[u"?"] = ["ROOT\\S[q]", "ROOT\\S[wq]"]
    for line in open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                  "ccglex.ma")):
        line = line.strip()
        if len(line) == 0:
            continue
        if line[0] == "#":
            continue
        ls = line.split('\t')
        ret.setdefault(ls[0].decode('utf-8'), []).extend(ls[2].split(","))
    return ret

parser = CCGParser()
parser.combinators = [LApp, RApp, LB, RB, LBx, Conj, SkipComma, RT("NP")]
parser.terminators = ["ROOT", "S", "S[wq]", "S[q]", "S[imp]"]
parser.lexicon = default_lexicon()
parser.concatenator = ""

if __name__ == "__main__":
    def __repr__(s):
        # print raw unicode only when stdout is UTF-8; otherwise fall back
        # to repr()
        if sys.stdout.encoding == 'UTF-8':
            return s
        else:
            return repr(s)

    for line in sys.stdin:
        line = line.strip()
        line = line.decode('utf-8')
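# Sketch of the ccglex.ma line format implied by the loader above; the entry
# below is hypothetical, not copied from the real file. Columns are
# tab-separated: the surface token in column 0 and comma-separated CCG
# categories in column 2 (column 1 is not read here); blank lines and lines
# starting with "#" are skipped.
#
#   猫	(column 1 ignored)	NP,NP/NP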
            else:
                return True
        elif term[0].value() == "forall":
            return False
        else:
            assert len(term) >= 2, lt
            return (check(term[1]) and check(term[2]))

    # skip the comma only when the right-hand side is a bare COMMA symbol
    # and the left-hand term passes check()
    if type(rt) == list or rt.value() != "COMMA":
        return None
    elif not check(lt):
        return None
    return lt

parser = CCGParser()
parser.combinators = [LApp, RApp, LB, RB, Conj, FwdRel, SkipCommaJP,
                      RT("NP[sbj]"), RBx]
parser.terminators = ["ROOT", "S", "S[exc]", "S[imp]", "S[null]", "S[q]",
                      "S[wq]", "S[null-q]", "S[nom]"]
parser.lexicon = default_lexicon()
parser.concatenator = ""

def run(text, type=0):
    for sentence in sentencize(text):
        print(u"test run : sentence={0}".format(sentence))
        parser.lexicon.guess(sentence)
        for t in parser.parse(sentence):
            if type == 0:
                for r in t.leaves():
                    if r.token in parser.lexicon.guess_dics:
                        print(u"{0}\t{1}\t(guess)".format(r.token, r.catname))
                    else:
                        print(u"{0}\t{1}".format(r.token, r.catname))
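# Usage sketch (the sentence is illustrative, not from the original source):
# sentencize() splits raw text into sentences, lexicon.guess() registers
# provisional categories for unknown tokens, and leaves resolved from a
# guess are flagged "(guess)" by run() above.
run(u"猫が鳴いた。")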