def vis_parse(self, sent):
    """Greedily parse `sent`, yielding the parser state around every attachment.

    ROOT is prepended to the sentence and every token gets ``tok['s']`` set to
    its ``'form'`` (the token dicts, ROOT included, are mutated in place).
    While more than one token is pending, each adjacent pair is scored in both
    directions, the best-scoring attachment is selected, the current state is
    yielded, and only then is the attachment applied.  One final state is
    yielded once a single token remains.

    :param sent: list of token dicts; the ``'id'`` and ``'form'`` keys are
        read here.
    :return: generator of ``(self.oracle, sent, parsed, deps, scores)``
        tuples: ``sent`` is the ROOT-prefixed sentence, ``parsed`` the tokens
        still pending, ``deps`` the arcs added so far, and ``scores`` maps
        ``(i, i + 1)`` to the score of attaching pending token ``i`` under
        token ``i + 1`` and ``(i + 1, i)`` to the score of the opposite
        attachment.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    for tok in parsed:
        tok['s'] = tok['form']
    fcache = {}
    scache = {}
    # Bound up front so the final yield also works when the loop never runs
    # (empty input sentence, i.e. only ROOT is pending).
    scores = {}
    while len(parsed) > 1:
        # Find the best-scoring attachment among all adjacent pairs.
        best = float('-inf')
        best_pair = None
        scores = {}
        for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1['id']
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = self.featExt.extract(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scr = self.scorer.get_scores(feats)
                s1 = scr[0]
                s2 = scr[1]
                scache[tid] = s1, s2
            if s1 > best:
                best = s1
                best_pair = (tok1, tok2)
            if s2 > best:
                best = s2
                best_pair = (tok2, tok1)
            scores[(i, i + 1)] = s1
            scores[(i + 1, i)] = s2
        c, p = best_pair
        # Drop the cached features/scores of the tokens around the parent.
        # The slice end is exclusive, exactly as in the original code.
        # NOTE(review): the radius of 4 presumably matches the feature
        # extractor's context window -- confirm.
        i = parsed.index(p)
        frm = max(i - 4, 0)
        to = min(i + 4, len(parsed) - 1)
        for tok in parsed[frm:to]:
            fcache.pop(tok['id'], None)
            scache.pop(tok['id'], None)
        yield (self.oracle, sent, parsed, deps, scores)
        # Apply the action.  `parsed` is rebound to a new list, so lists that
        # were already yielded are not modified afterwards.
        deps.add(p, c)
        parsed = [x for x in parsed if x != c]
    yield (self.oracle, sent, parsed, deps, scores)
def _build_gold(self, sent):
    """Collect the gold arcs of `sent` into a new DependenciesCollection.

    The first element of `sent` is skipped; every other token contributes one
    arc whose parent is ``sent[token['parent']]``.

    :param sent: sequence of token dicts carrying a ``'parent'`` index.
    :return: the populated DependenciesCollection.
    """
    gold = DependenciesCollection()
    for child in sent[1:]:
        head = sent[child['parent']]
        gold.add(head, child)
    return gold
def vis_parse(self, sent):
    """Greedily parse `sent`, yielding the parser state around every attachment.

    ROOT is prepended to the sentence and every token gets ``tok['s']`` set to
    its ``'form'`` (the token dicts, ROOT included, are mutated in place).
    While more than one token is pending, each adjacent pair is scored in both
    directions, the best-scoring attachment is selected, the current state is
    yielded, and only then is the attachment applied.  One final state is
    yielded once a single token remains.

    :param sent: list of token dicts; the ``'id'`` and ``'form'`` keys are
        read here.
    :return: generator of ``(self.oracle, sent, parsed, deps, scores)``
        tuples: ``sent`` is the ROOT-prefixed sentence, ``parsed`` the tokens
        still pending, ``deps`` the arcs added so far, and ``scores`` maps
        ``(i, i + 1)`` to the score of attaching pending token ``i`` under
        token ``i + 1`` and ``(i + 1, i)`` to the score of the opposite
        attachment.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    for tok in parsed:
        tok['s'] = tok['form']
    fcache = {}
    scache = {}
    # Bound up front so the final yield also works when the loop never runs
    # (empty input sentence, i.e. only ROOT is pending).
    scores = {}
    while len(parsed) > 1:
        # Find the best-scoring attachment among all adjacent pairs.
        best = float('-inf')
        best_pair = None
        scores = {}
        for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1['id']
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = self.featExt.extract(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scr = self.scorer.get_scores(feats)
                s1 = scr[0]
                s2 = scr[1]
                scache[tid] = s1, s2
            if s1 > best:
                best = s1
                best_pair = (tok1, tok2)
            if s2 > best:
                best = s2
                best_pair = (tok2, tok1)
            scores[(i, i + 1)] = s1
            scores[(i + 1, i)] = s2
        c, p = best_pair
        # Drop the cached features/scores of the tokens around the parent.
        # The slice end is exclusive, exactly as in the original code.
        # NOTE(review): the radius of 4 presumably matches the feature
        # extractor's context window -- confirm.
        i = parsed.index(p)
        frm = max(i - 4, 0)
        to = min(i + 4, len(parsed) - 1)
        for tok in parsed[frm:to]:
            fcache.pop(tok['id'], None)
            scache.pop(tok['id'], None)
        yield (self.oracle, sent, parsed, deps, scores)
        # Apply the action.  `parsed` is rebound to a new list, so lists that
        # were already yielded are not modified afterwards.
        deps.add(p, c)
        parsed = [x for x in parsed if x != c]
    yield (self.oracle, sent, parsed, deps, scores)
def parse_labeled(self, sent):
    """Greedily build a labeled dependency structure for `sent`.

    ROOT is prepended to the sentence.  While more than one token is pending,
    every adjacent pair ``(tok1, tok2)`` is scored; the scorer's output is
    indexed by action id and ``self.id_to_action_mapper[aid]`` gives the
    ``(direction, label)`` of each action.  Direction 0 attaches ``tok1`` under
    ``tok2``, direction 1 attaches ``tok2`` under ``tok1``.  For each direction
    only the best ``(score, label)`` is kept, and the overall best candidate
    is applied.

    :param sent: list of token dicts (the ``'id'`` key is read here).
    :return: DependenciesCollection holding the labeled arcs.
    """
    id_to_action_mapper = self.id_to_action_mapper
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    scache = {}
    fe = self.featExt.extract
    gscore = self.scorer.get_scores
    lp = len(parsed)
    while lp > 1:
        # Collect the best candidate per direction for every adjacent pair.
        _pairs = []
        for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
            tid = tok1['id']
            if tid in scache:
                max_score_0, max_score_1, max_lbl_0, max_lbl_1 = scache[tid]
            else:
                scr = gscore(fe(parsed, deps, i, sent))
                scored = [(score, id_to_action_mapper[aid])
                          for aid, score in enumerate(scr)]
                s0 = [(s, lbl) for (s, (dr, lbl)) in scored if dr == 0]
                s1 = [(s, lbl) for (s, (dr, lbl)) in scored if dr == 1]
                max_score_0, max_lbl_0 = max(s0)
                max_score_1, max_lbl_1 = max(s1)
                # Only the per-direction maxima are cached; the raw score
                # vector is not needed again.
                scache[tid] = (max_score_0, max_score_1, max_lbl_0, max_lbl_1)
            # The last tuple element is the position of the parent in `parsed`.
            _pairs.append((max_score_0, tok1, tok2, max_lbl_0, i + 1))
            _pairs.append((max_score_1, tok2, tok1, max_lbl_1, i))
        # Plain tuple comparison is kept on purpose so tie-breaking between
        # equal scores stays exactly as before.
        _, c, p, lbl, locidx = max(_pairs)
        # Drop cached scores of the tokens around the parent (slice end is
        # exclusive, as in the original code).
        frm = max(locidx - 4, 0)
        to = min(locidx + 4, lp - 1)
        for tok in parsed[frm:to]:
            scache.pop(tok['id'], None)
        # Apply the action.
        deps.add(p, c, lbl)
        parsed.remove(c)
        lp -= 1
    return deps
def parse(self, sent):
    """Greedily build an unlabeled dependency structure for `sent`.

    ROOT is prepended to the sentence.  While more than one token is pending,
    every adjacent pair ``(tok1, tok2)`` gets two scores from the scorer:
    index 0 for attaching ``tok1`` under ``tok2`` and index 1 for attaching
    ``tok2`` under ``tok1``.  The best-scoring attachment is applied and the
    child is removed from the pending list.  Extracted features and scores
    are cached per left-token id and invalidated around each new parent.

    :param sent: list of token dicts (the ``'id'`` key is read here).
    :return: DependenciesCollection holding the arcs.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    scache = {}
    fcache = {}
    fe = self.featExt.extract
    gscore = self.scorer.get_scores
    lp = len(parsed)
    while lp > 1:
        # Score both attachment directions of every adjacent pair.
        _pairs = []
        for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
            tid = tok1['id']
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = fe(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scr = gscore(feats)
                s1 = scr[0]
                s2 = scr[1]
                scache[tid] = s1, s2
            # The last tuple element is the position of the parent in `parsed`.
            _pairs.append((s1, tok1, tok2, i + 1))
            _pairs.append((s2, tok2, tok1, i))
        # Plain tuple comparison is kept on purpose so tie-breaking between
        # equal scores stays exactly as before.
        _, c, p, locidx = max(_pairs)
        # Drop cached entries of the tokens around the parent (slice end is
        # exclusive, as in the original code).
        frm = max(locidx - 4, 0)
        to = min(locidx + 4, lp - 1)
        for tok in parsed[frm:to]:
            fcache.pop(tok['id'], None)
            scache.pop(tok['id'], None)
        # Apply the action.
        deps.add(p, c)
        parsed.remove(c)
        lp -= 1
    return deps
def _get_state(self, pending, features=None, score=float('-inf'), clas=None,
               deps=None, valid=True):
    """
    Build one beam state.

    The containers handed in are copied, so the returned state can be
    changed without touching the caller's objects (``features`` and ``deps``
    are shallow copies made with ``copy.copy``).

    :param pending: iterable of tokens; stored as a new list
    :param features: global features until the previous action; ``None``
        (the default) means a fresh empty list
    :param score: score of this state
    :param clas: class of the previous action
    :param deps: current deps; ``None`` (the default) means a fresh, empty
        DependenciesCollection created for this call, instead of a copy of
        one instance shared by every call
    :param valid: is this state valid
    :return: a dict with the keys ``'pending'``, ``'features'``, ``'score'``,
        ``'cls'``, ``'deps'`` and ``'valid'``
    """
    if features is None:
        features = []
    else:
        features = copy.copy(features)
    if deps is None:
        deps = DependenciesCollection()
    else:
        deps = copy.copy(deps)
    return {
        'pending': list(pending),
        'features': features,
        'score': score,
        'cls': clas,
        'deps': deps,
        'valid': valid,
    }
def __init__(self, sent):
    """Create the initial state for `sent`.

    :param sent: the sentence; stored as ``self.sent`` and also handed to the
        DependenciesCollection constructor.
    """
    # The input and the dependency collection built from it.
    self.sent = sent
    self.deps = DependenciesCollection(sent)
    # Everything else starts out empty / at zero.
    # NOTE(review): `i` looks like a position into `sent` -- confirm against
    # the methods that advance it.
    self.stack = []
    self.i = 0
    self.actions = []
    self._action_scores = []
    self.cost = 0
def parse(self, sent):
    """Greedily build an unlabeled dependency structure for `sent`.

    ROOT is prepended to the sentence.  While more than one token is pending,
    every adjacent pair ``(tok1, tok2)`` gets two scores from the scorer:
    index 0 for attaching ``tok1`` under ``tok2`` and index 1 for attaching
    ``tok2`` under ``tok1``.  The best-scoring attachment is applied and the
    child is removed from the pending list.  Scores are cached per left-token
    id and invalidated around each new parent.

    :param sent: list of token dicts (the ``'id'`` key is read here).
    :return: DependenciesCollection holding the arcs.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    scache = {}
    fe = self.featExt.extract
    gscore = self.scorer.get_scores
    lp = len(parsed)
    while lp > 1:
        # Score both attachment directions of every adjacent pair.
        _pairs = []
        for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
            tid = tok1['id']
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scr = gscore(fe(parsed, deps, i, sent))
                s1 = scr[0]
                s2 = scr[1]
                scache[tid] = s1, s2
            # The last tuple element is the position of the parent in `parsed`.
            _pairs.append((s1, tok1, tok2, i + 1))
            _pairs.append((s2, tok2, tok1, i))
        # Plain tuple comparison is kept on purpose so tie-breaking between
        # equal scores stays exactly as before.
        _, c, p, locidx = max(_pairs)
        # Drop cached scores of the tokens around the parent (slice end is
        # exclusive, as in the original code).
        frm = max(locidx - 4, 0)
        to = min(locidx + 4, lp - 1)
        for tok in parsed[frm:to]:
            scache.pop(tok['id'], None)
        # Apply the action.
        deps.add(p, c)
        parsed.remove(c)
        lp -= 1
    return deps
def beam_parse(self, sent):
    """Parse `sent` while keeping several candidate states per step.

    ROOT is prepended to the sentence.  One step is run per token that has to
    be attached (``len(parsed) - 1`` steps).  In each step a new
    ``Beam(self.beam_width)`` is filled: for every state of the previous step
    and every adjacent pair of its pending tokens, both attachment directions
    are offered to the beam together with that state's deps, pending list,
    caches and the pair's features.  The states returned by
    ``beam.get_beams()`` seed the next step.

    :param sent: list of token dicts (the ``'id'`` key is read here).
    :return: the ``"deps"`` entry of the last state in the final beam.
        NOTE(review): presumably the best-scoring state -- confirm against
        ``Beam.get_beams``.
    """
    extract = self.featExt.extract
    get_scores = self.scorer.get_scores
    pending = [ROOT] + sent[:]
    sent = [ROOT] + sent
    start_state = {
        "scache": {},
        "fcache": {},
        "deps": DependenciesCollection(),
        "parsed": pending,
        "features": [],
        "score": 0,
    }
    frontier = [start_state]
    for _ in range(len(pending) - 1):
        beam = Beam(self.beam_width)
        for state in frontier:
            st_parsed = state['parsed']
            st_fcache = state['fcache']
            st_scache = state['scache']
            st_deps = state['deps']
            neighbours = izip(st_parsed, islice(st_parsed, 1, None))
            for pos, (left, right) in enumerate(neighbours):
                key = left['id']
                # Features and scores are cached per state, keyed by the id
                # of the left token of the pair.
                if key not in st_fcache:
                    st_fcache[key] = extract(st_parsed, st_deps, pos, sent)
                feats = st_fcache[key]
                if key not in st_scache:
                    raw = get_scores(feats)
                    st_scache[key] = raw[0], raw[1]
                left_under_right, right_under_left = st_scache[key]
                # Argument order: score, child, parent, parent position, ...
                beam.add_to_beam(left_under_right, left, right, pos + 1,
                                 st_deps, st_parsed, st_fcache, st_scache,
                                 feats)
                beam.add_to_beam(right_under_left, right, left, pos,
                                 st_deps, st_parsed, st_fcache, st_scache,
                                 feats)
        frontier = beam.get_beams()
    return frontier[-1]["deps"]
def train(self, sent):
    """Run one training pass over `sent`, updating the scorer on mistakes.

    ROOT is prepended and ``self.scorer.tick()`` is called once.  While more
    than one token is pending, every adjacent pair is scored in both
    directions (class 0 attaches the left token under the right one, class 1
    the opposite) and the best-scoring candidate is checked with
    ``self.oracle.allow_connection``.  An allowed attachment is applied.
    Otherwise the next-best allowed candidate is looked up,
    ``self.scorer.add`` is called with -1 for the rejected candidate and +1
    for the alternative, all cached scores are dropped, and the step is
    retried.

    :param sent: list of token dicts (``'id'`` and ``'form'`` are read here).
    :return: None.  Gives up, after printing the sentence and the pending
        tokens, once more than 200 updates were made for this sentence.
    """
    updates = 0
    sent = [ROOT] + sent
    self.scorer.tick()
    deps = DependenciesCollection()
    parsed = sent[:]
    fcache = {}
    scache = {}
    while len(parsed) > 1:
        # Score both attachment directions of every adjacent pair.
        scored = []
        for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1['id']
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = self.featExt.extract(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scores = self.scorer.get_scores(feats)
                s1 = scores[0]
                s2 = scores[1]
                scache[tid] = s1, s2
            scored.append((s1, 0, feats, tok1, tok2))
            scored.append((s2, 1, feats, tok2, tok1))
        # Highest score first; the sort is stable, so ties keep their
        # insertion order.
        scored = sorted(scored, key=lambda cand: -cand[0])
        _, cls, f, c, p = scored[0]
        if self.oracle.allow_connection(sent, deps, p, c):
            # Drop cached entries of the tokens around the parent (slice end
            # is exclusive, as in the original code).
            i = parsed.index(p)
            frm = max(i - 4, 0)
            to = min(i + 4, len(parsed) - 1)
            for tok in parsed[frm:to]:
                fcache.pop(tok['id'], None)
                scache.pop(tok['id'], None)
            deps.add(p, c)
            parsed = [x for x in parsed if x != c]
        else:
            # The weights are about to change, so every cached score is stale.
            scache = {}
            # Find the best allowable pair.  NOTE: when no candidate is
            # allowed, the last candidate is used, as in the original code.
            for _, gcls, gf, gc, gp in scored[1:]:
                if self.oracle.allow_connection(sent, deps, gp, gc):
                    break
            self.scorer.add(f, cls, -1)
            self.scorer.add(gf, gcls, 1)
            updates += 1
            if updates > 200:
                print("STUCK, probably because of incomplete feature set")
                print(" ".join([x['form'] for x in sent]))
                print(" ".join([x['form'] for x in parsed]))
                return
def train(self, sent):
    """Run one training pass over `sent`, updating the scorer on mistakes.

    ROOT is prepended and ``self.scorer.tick()`` is called once.  While more
    than one token is pending, every adjacent pair is scored in both
    directions (class 0 attaches the left token under the right one, class 1
    the opposite) and the best-scoring candidate is checked with
    ``self.oracle.allow_connection``.  An allowed attachment is applied.
    Otherwise the next-best allowed candidate is looked up,
    ``self.scorer.add`` is called with -1 for the rejected candidate and +1
    for the alternative, all cached scores are dropped, and the step is
    retried.

    :param sent: list of token dicts (``'id'`` and ``'form'`` are read here).
    :return: None.  Gives up, after printing the sentence and the pending
        tokens, once more than 200 updates were made for this sentence.
    """
    updates = 0
    sent = [ROOT] + sent
    self.scorer.tick()
    deps = DependenciesCollection()
    parsed = sent[:]
    fcache = {}
    scache = {}
    while len(parsed) > 1:
        # Score both attachment directions of every adjacent pair.
        scored = []
        for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1["id"]
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = self.featExt.extract(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scores = self.scorer.get_scores(feats)
                s1 = scores[0]
                s2 = scores[1]
                scache[tid] = s1, s2
            scored.append((s1, 0, feats, tok1, tok2))
            scored.append((s2, 1, feats, tok2, tok1))
        # Sort from largest to smallest score (key is -s); the sort is
        # stable, so ties keep their insertion order.
        scored = sorted(scored, key=lambda cand: -cand[0])
        _, cls, f, c, p = scored[0]
        if self.oracle.allow_connection(sent, deps, p, c):
            # Drop cached entries of the tokens around the parent (slice end
            # is exclusive, as in the original code).
            i = parsed.index(p)
            frm = max(i - 4, 0)
            to = min(i + 4, len(parsed) - 1)
            for tok in parsed[frm:to]:
                fcache.pop(tok['id'], None)
                scache.pop(tok['id'], None)
            deps.add(p, c)
            parsed = [x for x in parsed if x != c]
        else:
            # The weights are about to change, so every cached score is stale.
            scache = {}
            # Find the best allowable pair.  NOTE: when no candidate is
            # allowed, the last candidate is used, as in the original code.
            for _, gcls, gf, gc, gp in scored[1:]:
                if self.oracle.allow_connection(sent, deps, gp, gc):
                    break
            self.scorer.add(f, cls, -1)
            self.scorer.add(gf, gcls, 1)
            updates += 1
            if updates > 200:
                print("STUCK, probably because of incomplete feature set")
                print(" ".join([x['form'] for x in sent]))
                print(" ".join([x['form'] for x in parsed]))
                return
def train_labeled(self, sent, iter_number, explore_policy=None):
    """Run one labeled training pass over `sent`, updating the scorer on mistakes.

    ROOT is prepended and ``self.scorer.tick()`` is called once.  While more
    than one token is pending, every action of every adjacent pair is scored;
    ``self.id_to_action_mapper[aid]`` gives the ``(direction, label)`` of an
    action id (direction 0 attaches the left token under the right one,
    direction 1 the opposite).  The best-scoring action is checked with
    ``self.oracle.action_cost``:

    * cost 0: the attachment is applied;
    * otherwise: the next-best zero-cost action is looked up,
      ``self.scorer.add`` is called with -1 for the predicted action and +1
      for the alternative, and all cached scores are dropped.  The predicted
      (non-zero-cost) attachment is still applied when
      ``explore_policy.should_explore(iter_number)`` says so; if not, the
      step is retried with the updated scorer.

    :param sent: list of token dicts (``'id'`` and ``'form'`` are read here).
    :param iter_number: passed through to ``explore_policy.should_explore``.
    :param explore_policy: optional object with ``should_explore``; when
        falsy, non-zero-cost actions are never applied.
    :return: None.  Gives up, after printing diagnostics, once more than 200
        updates were made for this sentence.
    """
    id_to_action_mapper = self.id_to_action_mapper
    updates = 0
    sent = [ROOT] + sent
    self.scorer.tick()
    deps = DependenciesCollection()
    parsed = sent[:]
    fcache = {}
    scache = {}
    while len(parsed) > 1:
        # Score every action of every adjacent pair.
        scored = []
        for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1['id']
            if tid in fcache:
                feats = fcache[tid]
            else:
                feats = self.featExt.extract(parsed, deps, i, sent)
                fcache[tid] = feats
            if tid in scache:
                scores = scache[tid]
            else:
                scores = self.scorer.get_scores(feats)
                scache[tid] = scores
            for aid, score in scores.iteritems():
                dr, lbl = id_to_action_mapper[aid]
                if dr == 0:
                    scored.append((score, (aid, lbl), feats, tok1, tok2))
                else:
                    assert (dr == 1)
                    scored.append((score, (aid, lbl), feats, tok2, tok1))
        # Highest score first; the sort is stable, so ties keep their
        # insertion order.
        scored = sorted(scored, key=lambda cand: -cand[0])
        _, cls, f, c, p = scored[0]
        correct = self.oracle.action_cost(parsed, p, c, cls[1]) == 0
        if not correct:
            # The weights are about to change, so every cached score is stale.
            scache = {}
            # Find the best zero-cost action.  NOTE: when none exists, the
            # last candidate is used, as in the original code.
            for _, gcls, gf, gc, gp in scored[1:]:
                if self.oracle.action_cost(parsed, gp, gc, gcls[1]) == 0:
                    break
            self.scorer.add(f, cls[0], -1)
            self.scorer.add(gf, gcls[0], 1)
            updates += 1
            if updates > 200:
                # Joined by hand so the output matches the former
                # comma-separated print statement.
                print(" ".join([
                    "STUCK, probably because of incomplete feature set",
                    str(id_to_action_mapper[cls[0]]),
                    str(id_to_action_mapper[gcls[0]]),
                ]))
                print(" ".join([x['form'] for x in sent]))
                print(" ".join([x['form'] for x in parsed]))
                return
        if correct or (explore_policy and explore_policy.should_explore(iter_number)):
            # Drop cached entries of the tokens around the parent (slice end
            # is exclusive, as in the original code).
            i = parsed.index(p)
            frm = max(i - 4, 0)
            to = min(i + 4, len(parsed) - 1)
            for tok in parsed[frm:to]:
                fcache.pop(tok['id'], None)
                scache.pop(tok['id'], None)
            deps.add(p, c, cls[1])
            parsed = [x for x in parsed if x != c]
def parse_with_span_constraints(self, sent, spans):
    """Greedily parse `sent` while respecting span constraints.

    `spans` is a list of tuples ``(s, e)`` of integers, where ``s`` is the
    index of the first token in the span and ``e`` the index of the last one.
    The indices are applied after ROOT has been prepended, so index 0 is ROOT
    and the first token of the original sentence is index 1.  Spans may not
    overlap or contain each other (this is not verified).  A span whose end
    lies beyond the sentence is ignored.

    The constraint is that all tokens in each span must share a head, and
    only that head may have children outside of the span.  It is enforced by
    skipping every adjacent pair whose tokens belong to different spans while
    either span still has more than one token left.

    Tokens inside a span get a ``'span_id'`` key (the token dicts are mutated
    in place).  Tags left over from an earlier call are removed first so they
    cannot refer to span ids that are unknown in this call.

    :param sent: list of token dicts (the ``'id'`` key is read here).
    :param spans: list of ``(s, e)`` index tuples, see above.
    :return: DependenciesCollection holding the arcs.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    # Remove stale tags from previous calls (the token dicts, and ROOT in
    # particular, outlive a single parse).
    for tok in sent:
        tok.pop('span_id', None)
    # Number of attachments still needed inside each span before it is
    # reduced to a single token; -1 stands for "not inside any span".
    remaining_toks_in_span = {-1: 0}
    for sid, (s, e) in enumerate(spans):
        if e >= len(sent):
            continue
        remaining_toks_in_span[sid] = (e - s)
        for tok in sent[s:e + 1]:
            tok['span_id'] = sid
    scache = {}
    fe = self.featExt.extract
    gscore = self.scorer.get_scores
    lp = len(parsed)
    while lp > 1:
        # Score both attachment directions of every allowed adjacent pair.
        _pairs = []
        for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
            # A pair is allowed when both tokens are inside the same span or
            # both are outside any span.  A span that is down to a single
            # token no longer counts as a span.
            sid1 = tok1.get('span_id', -1)
            sid2 = tok2.get('span_id', -1)
            if sid1 != sid2:
                if remaining_toks_in_span[sid1] > 0 or remaining_toks_in_span[sid2] > 0:
                    continue
            tid = tok1['id']
            if tid in scache:
                s1, s2 = scache[tid]
            else:
                scr = gscore(fe(parsed, deps, i, sent))
                s1 = scr[0]
                s2 = scr[1]
                scache[tid] = s1, s2
            # The last tuple element is the position of the parent in `parsed`.
            _pairs.append((s1, tok1, tok2, i + 1))
            _pairs.append((s2, tok2, tok1, i))
        # Plain tuple comparison is kept on purpose so tie-breaking between
        # equal scores stays exactly as before.
        _, c, p, locidx = max(_pairs)
        # Drop cached scores of the tokens around the parent (slice end is
        # exclusive, as in the original code).
        frm = max(locidx - 4, 0)
        to = min(locidx + 4, lp - 1)
        for tok in parsed[frm:to]:
            scache.pop(tok['id'], None)
        # Apply the action.
        deps.add(p, c)
        parsed.remove(c)
        remaining_toks_in_span[c.get('span_id', -1)] -= 1
        lp -= 1
    return deps