def feature_structures():
    """Demonstrate basic nltk.FeatStruct construction, feature access,
    nesting, and re-entrant (structure-shared) values.

    Fix: converted Python-2 ``print`` statements to ``print()`` calls so
    the block is valid under Python 3 like the rest of the file.
    """
    fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
    print("fs1=", fs1)
    print("fs1[TENSE]=", fs1['TENSE'])
    # Feature structures are mutable, dict-like objects.
    fs1['CASE'] = 'acc'
    fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
    print("fs2=", fs2)
    # Feature structures are general-purpose: non-linguistic data works too.
    person = nltk.FeatStruct(name='Lee', telno='212 444 1212', age=33)
    print("person=", person)
    # (1)...ADDRESS->(1) marks a re-entrant (shared) sub-structure.
    print(nltk.FeatStruct("""
        [NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
         SPOUSE=[Name='Kim', ADDRESS->(1)]]
        """))
def fuf_file_to_featstruct(fuf_filename):
    """
    Convert a FUF grammar file to a C{nltk.FeatStruct} and process the
    type definitions.

    Fix: replaced Python-2-only C{basestring} (a NameError under Python 3)
    with C{str}.

    @param fuf_filename: The name of the file that contains the grammar
    @type fuf_filename: string
    @return: The type table (C{fstypes.FeatureTypeTable}) and the grammar
        as a C{nltk.featstruct.FeatStruct}.
    """
    # Convert the fuf code into s-expression lists.
    sfp = SexpFileParser(fuf_filename)
    lsexp = sfp.parse()
    assert lsexp

    type_table = FeatureTypeTable()
    fs = nltk.FeatStruct()
    # Process the type definitions first; the grammar itself follows them.
    for sexp in lsexp:
        if isinstance(sexp[0], str) and sexp[0] == 'define-feature-type':
            assert len(sexp) == 3
            name, children = sexp[1], sexp[2]
            type_table.define_type(name, children)
        else:
            # Assume this sexp is the feature-structure (grammar) definition;
            # nothing is expected to follow it, so stop scanning here.
            fs = _convert_fuf_featstruct(sexp)
            break
    return type_table, fs
def _convert_fuf_featstruct(sexp):
    """Convert a parsed FUF s-expression into an C{nltk.FeatStruct}.

    Fix: replaced Python-2-only C{basestring} (a NameError under Python 3)
    with C{str}.
    """
    assert sexp.lparen == '('
    fs = nltk.FeatStruct()
    for child in sexp:
        if isinstance(child, str):
            # A bare string child means the whole sexp encodes a single
            # (feature, value) pair rather than a list of sub-sexps, so
            # convert the sexp itself and stop.
            feat, val = _convert_fuf_feature(sexp)
            fs[feat] = val
            break
        else:
            feat, val = _convert_fuf_feature(child)
            fs[feat] = val
    return fs
def _get_value(self, fs, path):
    """
    Find and return the value within the feature structure given a path.

    Walks C{path} inside C{fs}, creating empty sub-structures for missing
    intermediate steps and resolving any C{ReentranceLink} encountered by
    recursing into the appropriate ancestor. If the final step is missing,
    a fresh unique variable is stored and returned in its place.

    NOTE(review): this method mutates C{fs} in place (it inserts empty
    FeatStructs and replaces links); the statement order is significant.

    @param fs: Feature structre
    @type fs: C{nltk.featstruct.FeatStruct}
    @param path: list of keys to follow
    @type path: list
    @return: the feature value at the end of the path
    """
    target = None  # in case we find another link keep a copy (currently unused)
    ancestors = [fs]  # every structure visited, so links can climb back up
    # Walk only to the parent of the final step; the last key is handled
    # separately below.
    last_step = path[-1]
    path = path[:-1]
    for step in path:
        if step in fs and not isinstance(fs[step], ReentranceLink):
            # Ordinary existing sub-structure: descend into it.
            fs = fs[step]
            ancestors.append(fs)
        elif step not in fs:
            # Missing intermediate step: create an empty structure so the
            # rest of the path has somewhere to live.
            fs[step] = nltk.FeatStruct()
            fs = fs[step]
            ancestors.append(fs)
        elif isinstance(fs[step], ReentranceLink):
            # Resolve the link: climb 'up' levels through the ancestors,
            # then follow the link's own 'down' path from there.
            parent = ancestors[-1 * fs[step].up]
            new_path = fs[step].down
            fs[step] = self._get_value(parent, new_path)
            fs = fs[step]
    if isinstance(fs, nltk.sem.Variable):
        # The path bottomed out in a variable; return it as-is.
        return fs
    if last_step in fs:
        assert (not isinstance(fs[last_step], ReentranceLink))
        return fs[last_step]
    # All the way through the path but the value doesn't exist:
    # create a fresh unique variable as a placeholder.
    fs[last_step] = self._unique_var()
    return fs[last_step]
def parseFoma(sentence):
    """Analyse each token of *sentence* with the foma FST, turn every
    analysis into a lexical grammar rule, then chart-parse the sentence
    with the grammar extended by those rules.

    Each FST analysis and any unification error is printed as it is
    encountered; at the end every parse tree is printed, or a ``*`` marker
    followed by the sentence when no parse exists.
    """
    words = sentence.split()
    analyses_by_position = {}
    lexical_rules = []
    for position, word in enumerate(words):
        categories = []
        for analysis in list(fst.apply_up(str.encode(word))):
            decoded = analysis.decode('utf8')
            print(decoded)
            parts = decoded.split('+')
            lemma, cat = parts[0], parts[1]
            token_fs = nltk.FeatStruct("[PRED=" + lemma + "]")
            # Unify each remaining morphological tag into the token's
            # feature structure, reporting (but skipping) failures.
            for tag in tuple(parts[2:]):
                mapped = feat2LFG(tag)
                unified = token_fs.unify(mapped)
                if unified:
                    token_fs = unified
                else:
                    print("Error unifying:", token_fs, mapped)
            flat = flatFStructure(token_fs)
            categories.append(cat + flat)
            lexical_rules.append(cat + flat + " -> " + "'" + word + "'")
        analyses_by_position[position] = categories
    extended_grammar_text = grammarText + "\n" + "\n".join(lexical_rules)
    grammar = FeatureGrammar.fromstring(extended_grammar_text)
    parser = nltk.parse.FeatureChartParser(grammar)
    trees = list(parser.parse(words))
    if trees:
        for tree in trees:
            print(tree)
    else:
        print("*", sentence)
def feature_structure_unification():
    """Demonstrate unification of nltk.FeatStructs, including structure
    sharing (re-entrancy) and variables.

    Fixes: converted Python-2 ``print`` statements to ``print()`` calls;
    corrected two typos in the demo specs ('STREE' -> 'STREET',
    'ADRRESS' -> 'ADDRESS') so the re-entrancy/unification examples
    actually demonstrate shared addresses.
    """
    fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
    fs2 = nltk.FeatStruct(CITY='Paris')
    print(fs1.unify(fs2))

    # Result of unification, if fs1 subsumes fs2 or vice versa, is the more
    # specific of the two.
    fs0 = nltk.FeatStruct("""
        [NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
         SPOUSE=[Name='Kim', ADDRESS->(1)]]
        """)
    print("fs0=", fs0)
    fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]")
    # Unification propagates CITY into the shared ADDRESS structure.
    print(fs1.unify(fs0))
    print("fs1=", fs1)
    fs2 = nltk.FeatStruct("""
        [NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
         SPOUSE=[NAME=Kim, ADDRESS->(1)]]
        """)
    print("fs1.unify(fs2)=", fs1.unify(fs2))
    # ?x is a variable: unification binds both features to the same value.
    fs3 = nltk.FeatStruct("[ADDRESS=?x, ADDRESS2=?x]")
    print("fs2.unify(fs3)=", fs2.unify(fs3))
def _convert_fuf_feature(sexp):
    """Convert a single FUF (feature value) s-expression into a
    (feature-name, value) pair.

    Handles the special FUF forms 'alt', 'opt', 'pattern'/'cset',
    '===' (triple-eq) and '~' (list values); plain pairs fall through.

    Fixes: replaced Python-2-only ``basestring`` with ``str``; converted
    the stray Python-2 ``print`` debug statement away (it wrote converted
    list values to stdout from library code).
    """
    assert sexp.lparen == '(', sexp
    feat, name, index, val = ('', '', '', '')
    # Special handling for the alt feature
    if sexp[0] == 'alt':
        feat, name, index, val = parse_alt(sexp)
    elif sexp[0] == 'opt':
        feat, name, index, val = parse_opt(sexp)
    elif len(sexp) == 3 and sexp[1] == '===':
        feat, val = _convert_triple_eq(sexp)
    elif len(sexp) == 3 and sexp[1] == '~':
        # '~' marks a FUF list value: drop the marker and convert the list.
        del sexp[1]
        sexp[1] = _list_convert(sexp[1])
        feat, val = sexp
    else:
        assert len(sexp) == 2, sexp[1]
        assert isinstance(sexp[0], str), sexp
        feat, val = sexp

    # Special handling for pattern feature: its value is an ordered tuple.
    if feat in ('pattern', 'cset'):
        assert isinstance(val, SexpList) and val.lparen == '('
        return feat, nltk.FeatureValueTuple(val)

    # Special handling of the alt feature: number the choices 1..n.
    if feat == 'alt':
        assert isinstance(val, SexpList) and val.lparen == '('
        choices = list()
        for c in val:
            if isinstance(c, str):
                choices.append(c)
            else:
                choices.append(_convert_fuf_featstruct(c))
        val = nltk.FeatStruct(
            dict([('%d' % (i + 1), choice)
                  for i, choice in enumerate(choices)]))
        # Process the alt with a name.
        if len(name) > 0:
            return "%s_%s" % (feat, name), val
        # There is an index defined on this alt.
        if isinstance(index, SexpList):
            ifs = _convert_fuf_featstruct(index)
            val["_index_"] = ifs[":index"]
        return feat, val

    if isinstance(val, SexpList):
        # If value is a feature structure, then recurse.
        if val.lparen == '(':
            return feat, _convert_fuf_featstruct(val)
        # If value is a pointer, we'll resolve it later,
        # using _resolve_fs_links().
        if val.lparen == '{':
            return feat, ReentranceLink(val)
        else:
            assert False, 'unexpected sexp type'
    # Otherwise, return the value as a string.
    return feat, val
def feat2LFG(f):
    """Map a morphological feature tag *f* to its LFG feature structure.

    Tags missing from ``featureMapping`` yield the empty structure ``[]``.
    """
    mapped = featureMapping.get(f, "")
    spec = "".join(("[", mapped, "]"))
    return nltk.FeatStruct(spec)
# Parse a sentence with the feature-based grammar, tracing the chart parser.
tokens = 'Kim likes children'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
for tree in cp.parse(tokens):
    print(tree)

# 9.1.3 Terminology
# Simple values are usually called atoms.
# A special case of an atomic value is the boolean value.
# AGR is a complex value.
# Attribute-Value Matrix (AVM)

# 9.2 Processing feature structures
# Building feature structures; unification of two different feature structures.
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
print(fs1['GND'])
fs1['CASE'] = 'acc'
print(fs1)
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)
print(fs2['AGR'])
print(nltk.FeatStruct("[POS='N',AGR=[PER=3, NUM='pl', GND='fem']]"))
# Feature structures can also be used to represent other kinds of data.
print(nltk.FeatStruct(NAME='Lee', TELNO='13918181818', AGE=33))
# Lexical-feature example for a verb; then parse and display feature
# structures. Fix: converted Python-2 ``print`` statements to ``print()``
# calls so the chunk runs under Python 3 like the rest of the file.
surprise = {
    'CAT': 'V',
    'ORTH': 'surprised',
    'REL': 'surprise',
    'SRC': 'sbj',
    'EXP': 'obj'
}
# SRC = source; EXP = experiencer
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
tokens = 'Kim likes children'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
trees = cp.parse(tokens)
for tree in trees:
    tree.draw()
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
print(fs1['TENSE'])
print(nltk.FeatStruct("[POS='N', AGR=[PER=3, NUM='pl', GND='fem']]"))
# (1)...ADDRESS->(1) marks a re-entrant (shared) sub-structure.
print(nltk.FeatStruct(
    """[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],SPOUSE=[NAME='Kim', ADDRESS->(1)]]"""
))
fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
fs2 = nltk.FeatStruct(CITY='Paris')
print(fs1.unify(fs2))
# More codes and details on http://www.nltk.org/book_1ed/ch09.html
for subtree in tree.subtree(filter) ]  # NOTE(review): dangling fragment — the opening '[' of this comprehension is in an earlier, unseen chunk

# Grammar is the hard part, and also the key part.
# Chap. 9 continues with grammar; supplementary reading is needed.
# 9-1. A feature-based grammar example
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
tokens = 'Kim likes children'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
trees = cp.parse_one(tokens)

fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2['AGR']['PER'])

# Feature structures can be viewed as directed acyclic graphs (DAGs).
# When two paths have the same value, they are said to be equivalent.
# To represent re-entrancy in the matrix notation, the first occurrence of
# the shared feature structure is prefixed with a bracketed integer, e.g.
# (1); any later reference to that structure uses the notation ->(1),
# as shown below.
# NOTE(review): Python-2 print statement (SyntaxError under Python 3), and
# the literal '... ' inside the string is a pasted interactive-prompt
# continuation that will break FeatStruct parsing — confirm intent.
print nltk.FeatStruct(
    """[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], ... SPOUSE=[NAME='Kim', ADDRESS->(1)]]""")
import nltk
from nltk import load_parser
"""
http://www.shareditor.com/blogshow?blogId=71
7. Grammar analysis is still better when feature-based
"""
# Grammar analysis is about analysing the ordering of words.
# Grammatical-feature constraints: syntactic agreement, attributes,
# constraints, terminology.

# Feature structures
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)

# Parsing with the supplied feature grammar
# NOTE(review): hard-coded user-specific path — confirm it exists on the
# machine this runs on.
cp = load_parser('/Users/xingoo/nltk_data/grammars/book_grammars/sql0.fcfg')
query = 'What cities are located in China'
for tree in cp.parse(nltk.word_tokenize(query)):
    print(tree)
import nltk
from nltk import load_parser

# Feature-based grammar
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')

# Trace the feature-based chart parser
tokens = 'Kim likes children'.split()
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
for tree in cp.parse(tokens):
    print(tree)

# Building feature structures
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
print(fs1)
fs1['CASE'] = 'acc'
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)
print(nltk.FeatStruct("[POS='N', AGR=[PER=3, NUM='pl', GND='fem']]"))
print(nltk.FeatStruct(NAME='Lee', TELNO='01 27 86 42 96', AGE=33))

# Structure sharing (re-entrancy): ->(1) points back at the structure
# labelled (1).
print(
    nltk.FeatStruct("""[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], SPOUSE=[NAME='Kim', ADDRESS->(1)]]"""))
print(nltk.FeatStruct("[A='a', B=(1)[C='c'], D->(1), E->(1)]"))

# Unification
fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
import nltk

# Worked unification examples. After each unify() result, a hand-written
# FeatStruct with the expected outcome is printed for comparison.
# ?x is a variable; (1)/->(1) marks structure sharing (re-entrancy).
fs1 = nltk.FeatStruct("[A = ?x, B= [C = ?x]]")
fs2 = nltk.FeatStruct("[B = [D = d]]")
fs3 = nltk.FeatStruct("[B = [C = d]]")
fs4 = nltk.FeatStruct("[A = (1)[B = b], C->(1)]")
fs5 = nltk.FeatStruct("[A = (1)[D = ?x], C = [E -> (1), F = ?x] ]")
fs6 = nltk.FeatStruct("[A = [D = d]]")
fs7 = nltk.FeatStruct("[A = [D = d], C = [F = [D = d]]]")
fs8 = nltk.FeatStruct("[A = (1)[D = ?x, G = ?x], C = [B = ?x, E -> (1)] ]")
fs9 = nltk.FeatStruct("[A = [B = b], C = [E = [G = e]]]")
fs10 = nltk.FeatStruct("[A = (1)[B = b], C -> (1)]")

print('f1 and f2')
print(fs1.unify(fs2))
print()
# Expected result of fs1.unify(fs2):
print(nltk.FeatStruct("[A = ?x, B= [C = ?x, D = d]]"))

print('f1 and f3')
print(fs1.unify(fs3))
print()
# Expected: unifying binds ?x to 'd' in both places.
print(nltk.FeatStruct("[A = d, B= [C = d]]"))

print('f4 and f5')
print(fs4.unify(fs5))
print()
# Expected: the shared structure (1) accumulates features from both sides.
print(nltk.FeatStruct("[A = (1)[B = b, D = ?x, E -> (1), F = ?x], C->(1)]"))

print('f5 and f6')
print(fs5.unify(fs6))
print()
# Processing feature structures import nltk from nltk import load_parser # in nltk we define feature structures as follows. fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem') # we can view the structures as a kind of dictionary. # such we can acess values by indexing in the ususal way. print fs1['GND'] # adding to the feature structure fs1['CASE'] = 'acc' # We can also build more complex feature structures as follows. fs2 = nltk.FeatStruct(POS='N', ARG=fs1) print fs2 print fs2['ARG'] print fs2['ARG']['PER'] # feature structures are general purpose structures # such we don't have to only use the for linguistic features. print(nltk.FeatStruct(NAME='Lee', TELNO='01 27 86 42 96', AGE=33)) ''' We can think of feature structures as graphs more specifically. Directed acyclic graphs (DAGs)