def create_instances(self, examples):
    """Build oracle training instances for every fully parsable sentence.

    Each sentence is replayed through a stack/buffer transition system,
    always taking the transition returned by ``self.get_oracle``.  One
    instance is recorded per step, describing the state *before* the
    transition is applied.

    :param examples: iterable of per-sentence dicts.  Only ``ex['word']`` is
        read directly; the dict is otherwise forwarded to
        ``self.get_oracle`` and ``self.extract_features``.  Presumably
        position 0 of ``ex['word']`` is the ROOT token (hence ``- 1``) --
        confirm against the data loader.
    :return: flat list of ``(features, legal_labels, gold_t)`` tuples for
        all kept sentences.  A sentence contributes nothing if
        ``self.get_oracle`` returns ``None`` for any of its states.
    """
    all_instances = []
    for ex in logged_loop(examples):
        n_words = len(ex['word']) - 1

        # Parser state: index 0 starts on the stack, words 1..n_words wait
        # in the buffer, arcs collects (head, dependent, label) triples.
        stack = [0]
        buf = list(range(1, n_words + 1))
        arcs = []
        instances = []

        # At most 2 * n_words transitions are replayed per sentence.
        for _ in range(n_words * 2):
            gold_t = self.get_oracle(stack, buf, ex)
            if gold_t is None:
                # No oracle transition for this state: abandon the sentence.
                break

            legal_labels = self.legal_labels(stack, buf)
            assert legal_labels[gold_t] == 1

            instances.append(
                (self.extract_features(stack, buf, arcs, ex),
                 legal_labels,
                 gold_t))

            if gold_t == self.n_trans - 1:
                # Shift: move the front of the buffer onto the stack.
                stack.append(buf[0])
                buf = buf[1:]
            elif gold_t < self.n_deprel:
                # Left-arc: stack[-1] becomes the head of stack[-2], which
                # is then removed from the stack.
                arcs.append((stack[-1], stack[-2], gold_t))
                stack = stack[:-2] + [stack[-1]]
            else:
                # Right-arc: stack[-2] becomes the head of stack[-1], which
                # is then removed from the stack.
                arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                stack = stack[:-1]
        else:
            # Runs only when the loop was not broken out of, i.e. every
            # step had an oracle transition; only then keep the instances.
            all_instances += instances

    return all_instances
def create_instances(self, examples):
    """Generate (features, legal transitions, gold transition) instances.

    For each sentence the gold transition sequence is simulated on a
    stack/buffer state.  Before every transition is applied, the current
    state is turned into a training instance.

    :param examples: iterable of per-sentence dicts.  ``ex['word']`` is the
        only key read here; the dict is also passed unchanged to
        ``self.get_oracle`` and ``self.extract_features``.  The ``- 1`` on
        its length suggests a leading ROOT entry -- confirm with the code
        that builds ``examples``.
    :return: a single flat list of ``(features, legal_labels, gold_t)``
        tuples.  Sentences for which ``self.get_oracle`` returns ``None``
        at any step are skipped completely.
    """
    all_instances = []
    for ex in logged_loop(examples):
        n_words = len(ex['word']) - 1

        # arcs = [(head, dependent, label), ...]
        stack = [0]
        buf = list(range(1, n_words + 1))
        arcs = []
        instances = []

        # The replay is capped at 2 * n_words transitions.
        for _ in range(n_words * 2):
            gold_t = self.get_oracle(stack, buf, ex)
            if gold_t is None:
                # The oracle has no answer here; give up on this sentence.
                break

            legal_labels = self.legal_labels(stack, buf)
            assert legal_labels[gold_t] == 1

            features = self.extract_features(stack, buf, arcs, ex)
            instances.append((features, legal_labels, gold_t))

            # Apply the gold transition to advance the parser state.
            if gold_t == self.n_trans - 1:
                # Shift (the last transition id).
                stack.append(buf[0])
                buf = buf[1:]
            elif gold_t < self.n_deprel:
                # Left-arc with label ``gold_t``: head is the stack top,
                # dependent is the element below it, which gets dropped.
                arcs.append((stack[-1], stack[-2], gold_t))
                stack = stack[:-2] + [stack[-1]]
            else:
                # Right-arc with label ``gold_t - n_deprel``: head is the
                # second element, dependent is the stack top (dropped).
                arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                stack = stack[:-1]
        else:
            # for/else: reached only if the loop finished without ``break``.
            all_instances += instances

    return all_instances
def create_instances(self, examples):
    """Build oracle training instances, honouring ``self.unlabeled``.

    Every sentence is replayed through a stack/buffer transition system
    using the transition chosen by ``self.get_oracle``.  The state before
    each transition is stored as one training instance.

    :param examples: iterable of per-sentence dicts.  ``ex['word']`` is read
        here; the dict is also forwarded to ``self.get_oracle`` and
        ``self.extract_features``.  Presumably position 0 is the ROOT token
        (hence ``- 1``) -- confirm against the data loader.
    :return: flat list of ``(features, legal_labels, gold_t)`` tuples.
        Sentences for which ``self.get_oracle`` ever returns ``None`` are
        left out entirely.
    """
    all_instances = []
    for ex in logged_loop(examples):
        n_words = len(ex['word']) - 1

        # arcs = [(head, dependent, label), ...]
        stack = [0]
        # The buffer starts with every word of the sentence.  Per the
        # original author these indices line up with the indices used for
        # the heads and stay within the sentence length -- TODO confirm.
        buf = list(range(1, n_words + 1))
        arcs = []
        instances = []

        # At most 2 * n_words transitions are executed per sentence.
        for _ in range(n_words * 2):
            # Correct transition for the current state, if one exists.
            gold_t = self.get_oracle(stack, buf, ex)
            if gold_t is None:
                break

            legal_labels = self.legal_labels(stack, buf)
            assert legal_labels[gold_t] == 1

            # One instance per state change: the features describe the
            # state *before* the transition and are paired with the id of
            # the correct transition.  (The original note gives the feature
            # layout as 18 + 18 + 12; that is decided by extract_features
            # and cannot be verified from here.)
            instances.append(
                (self.extract_features(stack, buf, arcs, ex),
                 legal_labels,
                 gold_t))

            # Advance the parser state.  The ids quoted below (2 / 0 / 1)
            # are from the original comments and presumably describe the
            # unlabeled configuration.
            if gold_t == self.n_trans - 1:
                # Shift (id 2 when unlabeled).
                stack.append(buf[0])
                buf = buf[1:]
            elif gold_t < self.n_deprel:
                # Left-arc (id 0 when unlabeled): drop stack[-2].
                arcs.append((stack[-1], stack[-2], gold_t))
                stack = stack[:-2] + [stack[-1]]
            else:
                # Right-arc (id 1 when unlabeled).  When unlabeled the raw
                # transition id is stored as the label; otherwise the label
                # index is recovered by subtracting n_deprel.
                if self.unlabeled:
                    right_label = gold_t
                else:
                    right_label = gold_t - self.n_deprel
                # New arc as (head, dependent, label); drop stack[-1].
                arcs.append((stack[-2], stack[-1], right_label))
                stack = stack[:-1]
        else:
            # Loop finished without ``break``: add this sentence's
            # instances to the overall collection.
            all_instances += instances

    return all_instances
def create_instances(self, examples):
    """Build oracle training instances from the given sentences.

    :param examples: iterable of per-sentence dicts.  According to the
        original docstring each has the form
        ``{word: [..], pos: [..], head: [..], label: [..]}`` with tokens
        stored as token2id ids; only ``ex['word']`` is read directly here.
    :return: flat list of ``(features, legal_labels, gold_t)`` tuples:
        the feature list from ``self.extract_features``, the result of
        ``self.legal_labels``, and the gold transition id.  Instances of
        all kept sentences are concatenated, not nested per sentence.
        A sentence is dropped if ``self.get_oracle`` returns ``None`` for
        any of its states.
    """
    all_instances = []
    for ex in logged_loop(examples):
        n_words = len(ex['word']) - 1

        # arcs = [(head, dependent, label), ...]
        stack = [0]
        buf = list(range(1, n_words + 1))
        arcs = []
        instances = []

        # Every word is pushed once and popped once, so 2 * n_words steps.
        for _ in range(n_words * 2):
            gold_t = self.get_oracle(stack, buf, ex)
            if gold_t is None:
                # No oracle transition: the sentence is abandoned.
                break

            legal_labels = self.legal_labels(stack, buf)
            assert legal_labels[gold_t] == 1

            features = self.extract_features(stack, buf, arcs, ex)
            instances.append((features, legal_labels, gold_t))

            if gold_t == self.n_trans - 1:
                # Shift: first buffer item moves onto the stack.
                stack.append(buf[0])
                buf = buf[1:]
            elif gold_t < self.n_deprel:
                # Left-arc ("L-XXX" transition): stack top heads the item
                # below it, which is removed from the stack.
                arcs.append((stack[-1], stack[-2], gold_t))
                stack = stack[:-2] + [stack[-1]]
            else:
                # Right-arc: second item heads the stack top, which is
                # removed; the label index is offset by n_deprel.
                arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                stack = stack[:-1]
        else:
            # for/else: executed only when no ``break`` occurred, i.e. the
            # whole transition sequence was replayed.
            all_instances += instances

    return all_instances