def preprocess(self, data, vocab, opt):
    """Convert raw examples into id features for three views of each sentence.

    For every example this builds (a) full-sentence features, (b) features for
    the prune=0 tree (shortest dependency path) and (c) features for the
    prune=1 tree (path plus 1-hop neighbours); the relation label is repeated
    once per view in the output tuple.
    """
    processed = []
    for example in data:
        tokens = example['token']
        if opt['lower']:
            tokens = [tok.lower() for tok in tokens]
        # Anonymize entity mentions with typed placeholder tokens.
        ss, se = example['subj_start'], example['subj_end']
        os_, oe = example['obj_start'], example['obj_end']
        tokens[ss:se + 1] = ['SUBJ-' + example['subj_type']] * (se - ss + 1)
        tokens[os_:oe + 1] = ['OBJ-' + example['obj_type']] * (oe - os_ + 1)
        tokens_idxs = map_to_ids(tokens, vocab.word2id)

        head = [int(h) for h in example['stanford_head']]
        # Every dependency tree must contain a root (head index 0).
        assert any(h == 0 for h in head)
        pos = map_to_ids(example['stanford_pos'], constant.POS_TO_ID)
        ner = map_to_ids(example['stanford_ner'], constant.NER_TO_ID)
        deprel = map_to_ids(example['stanford_deprel'], constant.DEPREL_TO_ID)
        seq_len = len(tokens)
        subj_positions = get_positions(example['subj_start'], example['subj_end'], seq_len)
        obj_positions = get_positions(example['obj_start'], example['obj_end'], seq_len)
        relation = constant.LABEL_TO_ID[example['relation']]

        # Token sequences along the pruned trees (prune=0 and prune=1).
        stp_tokens_idxs = tree_to_seq(
            head_to_tree(np.array(head), np.array(tokens), seq_len, 0,
                         np.array(subj_positions), np.array(obj_positions)),
            tokens)
        hop1_tokens_idxs = tree_to_seq(
            head_to_tree(np.array(head), np.array(tokens), seq_len, 1,
                         np.array(subj_positions), np.array(obj_positions)),
            tokens)
        (stp_tokens_idxs, stp_pos, stp_ner, stp_deprel,
         stp_subj_positions, stp_obj_positions) = get_path_input(
            tokens, pos, ner, deprel, stp_tokens_idxs,
            'SUBJ-' + example['subj_type'], 'OBJ-' + example['obj_type'], vocab)
        (hop1_tokens_idxs, hop1_pos, hop1_ner, hop1_deprel,
         hop1_subj_positions, hop1_obj_positions) = get_path_input(
            tokens, pos, ner, deprel, hop1_tokens_idxs,
            'SUBJ-' + example['subj_type'], 'OBJ-' + example['obj_type'], vocab)

        processed.append(
            (tokens_idxs, pos, ner, deprel, subj_positions, obj_positions, relation,
             stp_tokens_idxs, stp_pos, stp_ner, stp_deprel,
             stp_subj_positions, stp_obj_positions, relation,
             hop1_tokens_idxs, hop1_pos, hop1_ner, hop1_deprel,
             hop1_subj_positions, hop1_obj_positions, relation))
    return processed
def inputs_to_tree_reps(self, dep_head, seq_len, subj_pos, obj_pos, dep_rel, device):
    """Build batched adjacency and dependency-relation matrices.

    Each sentence's pruned tree becomes a (maxlen, maxlen) adjacency matrix and
    a matching dependency-relation matrix; both are stacked over the batch and
    moved to `device`.
    """
    maxlen = max(seq_len)
    trees = [
        head_to_tree(dep_head[idx], seq_len[idx], self.config.syntax['prune_k'],
                     subj_pos[idx], obj_pos[idx], dep_rel[idx])
        for idx in range(len(seq_len))
    ]
    # self_loop is configurable because the adjacency doubles as a masking
    # matrix during graph attention.
    adj_parts, rel_parts = [], []
    for tree in trees:
        adj_matrix, dep_rel_matrix = tree_to_adj(
            maxlen, tree, directed=False,
            self_loop=self.config.syntax['adj_self_loop'])
        adj_parts.append(adj_matrix.reshape(1, maxlen, maxlen))
        rel_parts.append(dep_rel_matrix.reshape(1, maxlen, maxlen))
    batch_adj_matrix = torch.from_numpy(np.concatenate(adj_parts, axis=0))
    batch_dep_rel_matrix = torch.from_numpy(np.concatenate(rel_parts, axis=0))
    return (Variable(batch_adj_matrix.to(device)),
            Variable(batch_dep_rel_matrix.to(device)))
def inputs_to_tree_reps(head, words, l, prune, subj_pos, obj_pos):
    """Stack per-sentence undirected adjacency matrices into one batch tensor.

    NOTE(review): `maxlen` and `self` are free variables resolved from the
    enclosing scope -- this reads as a closure defined inside a method.
    """
    head = head.cpu().numpy()
    words = words.cpu().numpy()
    subj_pos = subj_pos.cpu().numpy()
    obj_pos = obj_pos.cpu().numpy()
    trees = [
        head_to_tree(head[idx], words[idx], l[idx], prune,
                     subj_pos[idx], obj_pos[idx])
        for idx in range(len(l))
    ]
    stacked = np.concatenate(
        [tree_to_adj(maxlen, tree, directed=False, self_loop=False)
             .reshape(1, maxlen, maxlen)
         for tree in trees],
        axis=0)
    adj = torch.from_numpy(stacked)
    return Variable(adj.cuda()) if self.opt['cuda'] else Variable(adj)
def extract_trees(data, prune_k):
    """Group pruned dependency trees by relation label.

    Returns a dict mapping each relation name to the list of trees built from
    its examples (entity spans anonymized before tree construction).
    """
    relation2trees = {}
    for example in data:
        tokens = list(example['token'])
        # Anonymize entity mentions with typed placeholder tokens.
        ss, se = example['subj_start'], example['subj_end']
        os_, oe = example['obj_start'], example['obj_end']
        tokens[ss:se + 1] = ['SUBJ-' + example['subj_type']] * (se - ss + 1)
        tokens[os_:oe + 1] = ['OBJ-' + example['obj_type']] * (oe - os_ + 1)

        head = [int(x) for x in example['stanford_head']]
        deprel = example['stanford_deprel']
        n = len(tokens)
        subj_positions = get_positions(example['subj_start'], example['subj_end'], n)
        obj_positions = get_positions(example['obj_start'], example['obj_end'], n)

        relation = example['relation']
        relation2trees.setdefault(relation, [])
        _, tree = head_to_tree(head=np.array(head),
                               tokens=np.array(tokens),
                               len_=n,
                               prune=prune_k,
                               subj_pos=np.array(subj_positions),
                               obj_pos=np.array(obj_positions),
                               deprel=np.array(deprel))
        relation2trees[relation].append(tree)
    return relation2trees
def inputs_to_tree_reps(head, l):
    """Batch undirected adjacency matrices built from head lists alone.

    NOTE(review): `maxlen` and `self` are free variables from the enclosing
    scope -- this reads as a closure defined inside a method.
    """
    trees = [head_to_tree(head[idx], l[idx]) for idx in range(len(l))]
    stacked = np.concatenate(
        [tree_to_adj(maxlen, tree, directed=False).reshape(1, maxlen, maxlen)
         for tree in trees],
        axis=0)
    adj = torch.from_numpy(stacked)
    return Variable(adj.cuda()) if self.opt['cuda'] else Variable(adj)
def inputs_to_tree_reps(head, words, l, prune, subj_pos, obj_pos, deprel=None, maxlen=100):
    """Build directed, edge-typed adjacency matrices plus their reverses.

    Adjacency entries carry the dependency edge type (edge_info=True); the
    reverse-direction matrix shifts its edge labels by constant.DEPREL_COUNT
    so forward and backward edges occupy disjoint label ranges.  Both batch
    tensors are moved to CUDA unconditionally.
    """
    head = head.cpu().numpy()
    words = words.cpu().numpy()
    subj_pos = subj_pos.cpu().numpy()
    obj_pos = obj_pos.cpu().numpy()
    if deprel is not None:
        deprel = deprel.cpu().numpy()
        trees = [head_to_tree(head[idx], words[idx], l[idx], prune,
                              subj_pos[idx], obj_pos[idx], deprel[idx])
                 for idx in range(len(l))]
    else:
        trees = [head_to_tree(head[idx], words[idx], l[idx], prune,
                              subj_pos[idx], obj_pos[idx])
                 for idx in range(len(l))]
    # Adjacency edges carry the dependency (edge) type.
    forward_parts, backward_parts = [], []
    for tree in trees:
        adj = tree_to_adj(maxlen, tree, directed=True, edge_info=True)
        adj_r = adj.T.copy()
        # Shift reversed-edge labels past the forward label range.
        adj_r[adj_r > 1] += constant.DEPREL_COUNT
        forward_parts.append(adj.reshape(1, maxlen, maxlen))
        backward_parts.append(adj_r.reshape(1, maxlen, maxlen))
    adjs = Variable(torch.from_numpy(np.concatenate(forward_parts, axis=0)).cuda())
    adjs_r = Variable(torch.from_numpy(np.concatenate(backward_parts, axis=0)).cuda())
    return adjs, adjs_r
def inputs_to_tree_reps(head, token_id, l, prune, subj_positions, obj_positions):
    """Batch undirected adjacency matrices, replacing all-zero ones with a
    simple chain graph so propagation never dies out.

    NOTE(review): `maxlen` and `self` are free variables from the enclosing
    scope -- this reads as a closure defined inside a method.
    """
    trees = [head_to_tree(head[idx], token_id[idx], l[idx], prune,
                          subj_positions[idx], obj_positions[idx])
             for idx in range(len(l))]
    adj = [tree_to_adj(maxlen, tree, directed=False, self_loop=False)
               .reshape(1, maxlen, maxlen)
           for tree in trees]
    patched = []
    for instance in adj:
        size = len(instance[0][0])
        target_zero_array = [np.zeros((size, size), dtype=np.float32)]
        if (instance == target_zero_array).all():
            # Pruning removed every edge: fall back to a linear chain
            # (ones on the super- and sub-diagonals).
            chain = np.eye(size, k=1, dtype=np.float32) + np.eye(size, k=-1, dtype=np.float32)
            patched.append([chain])
        else:
            patched.append(instance)
    result = torch.from_numpy(np.concatenate(patched, axis=0))
    return result.cuda() if self.opt['cuda'] else result
def simulate_data(self, id, aug_id, subj_start, subj_end, obj_start, obj_end):
    """Build a single-example batch for example `id` using the supplied
    (simulated/augmented) subject and object spans instead of the gold spans.

    Returns a one-element list containing the feature tuple
    (tokens, pos, subj_positions, obj_positions, ner, depmap, ret, rel,
     resrel, deprel, domain, domain_id, redomian_id, aug_id, distance,
     relation).

    NOTE(review): SOURCE arrived with all formatting collapsed onto one
    physical line; the indentation below is a reconstruction of the obvious
    straight-line data flow.
    """
    d = self.data_index[id]
    sid = id
    imp = 0
    relation = self.label2id[d['relation']]
    tokens = list(d['token'])
    # Keep the raw surface tokens; entity markers are zipped in below and the
    # tree distance is appended at the end.
    raw_tokens = copy.deepcopy(tokens)
    pos = map_to_ids(d['stanford_pos'], constant.POS_TO_ID)
    ner = map_to_ids(d['stanford_ner'], constant.NER_TO_ID)
    deprel = map_to_ids(d['stanford_deprel'], constant.DEPREL_TO_ID)
    head = [int(x) for x in d['stanford_head']]
    l = len(tokens)
    subj_idx = list(range(subj_start, subj_end + 1))
    obj_idx = list(range(obj_start, obj_end + 1))
    tree, domains, distance = head_to_tree(head, deprel, subj_idx, obj_idx)
    raw_tokens.append(distance)
    # Entity types are read off the NER tags at the span starts -- assumes the
    # simulated spans begin on entity tokens; TODO confirm with callers.
    subj_type = d['stanford_ner'][subj_start]
    obj_type = d['stanford_ner'][obj_start]
    depmap, ret, rel, resrel, domain, domain_id, redomian_id = tree_to_adj(l, domains, tree)
    # Anonymize the simulated spans with typed placeholder tokens.
    tokens[subj_start:subj_end + 1] = ['SUBJ-' + subj_type] * (subj_end - subj_start + 1)
    tokens[obj_start:obj_end + 1] = ['OBJ-' + obj_type] * (obj_end - obj_start + 1)
    tokens = map_to_ids(tokens, self.vocab.word2id)
    subj_positions = get_positions(subj_start, subj_end, l)
    obj_positions = get_positions(obj_start, obj_end, l)
    # Pair each raw token in the spans with its placeholder label (zip objects
    # are spliced into the list).
    raw_tokens[subj_start:subj_end + 1] = zip(raw_tokens[subj_start:subj_end + 1], (
        ['SUBJ-' + subj_type] * (subj_end - subj_start + 1)))
    raw_tokens[obj_start:obj_end + 1] = zip(raw_tokens[obj_start:obj_end + 1], (
        ['OBJ-' + obj_type] * (obj_end - obj_start + 1)))
    batch = [(tokens, pos, subj_positions, obj_positions, ner, depmap, ret, rel, resrel,
              deprel, domain, domain_id, redomian_id, aug_id, distance, relation)]
    # NOTE(review): a large commented-out padding/sorting pipeline followed
    # here in the original (sort_all / get_long_tensor / padmat batching);
    # it was dead code -- the method returns the unpadded single-example batch.
    return batch
def packdata(self,d):
    """Convert one raw example `d` into a batch of model-ready feature tuples.

    Two modes: when `self.corefresolve` is False the gold subject/object spans
    are used directly; otherwise coreferent mention lists are expanded into
    candidate (subj, obj) pairs and one "aspect" feature tuple is built per
    selected pair (plus one for the gold pair if it was not selected).

    NOTE(review): SOURCE arrived with all formatting collapsed onto a few
    physical lines; the indentation below is a best-effort reconstruction of
    the original structure.
    """
    # Resolve the relation label: argmax of a distribution, a hard label id,
    # or the provided soft label.
    if not self.is_soft:
        relation = d['relation']
        if isinstance(relation, list):
            relation = relation.index(max(relation))
        else:
            relation = self.label2id[relation]
    else:
        relation = d['soft_label']
    rd = copy.deepcopy(d)
    head = [int(x) for x in d['stanford_head']]
    ners2id = constant.NER_TO_ID
    id2ners = dict([(v, k) for k, v in ners2id.items()])
    # Every dependency tree must contain a root (head index 0).
    assert any([x == 0 for x in head])
    tokens = list(rd['token'])
    containDot = True
    if self.opt['lower']:
        tokens = [t.lower() for t in tokens]
    if tokens[-1] != '.':
        containDot = False
    raw_tokens = copy.deepcopy(tokens)
    pos = map_to_ids(rd['stanford_pos'], constant.POS_TO_ID)
    ner = map_to_ids(rd['stanford_ner'], constant.NER_TO_ID)
    deprel = map_to_ids(rd['stanford_deprel'], constant.DEPREL_TO_ID)
    head = [int(x) for x in rd['stanford_head']]
    sid = rd['id']
    assert any([x == 0 for x in head])
    l = len(tokens)
    if not self.corefresolve:
        # Gold spans only.
        ss, se = rd['subj_start'], rd['subj_end']
        os, oe = rd['obj_start'], rd['obj_end']
        subj_id = list(range(ss, se + 1))
        obj_id = list(range(os, oe + 1))
        tree, domains, distance = head_to_tree(head, deprel, subj_id, obj_id)
        depmap, ret, rel, resrel, domain, domain_subj, domain_obj = tree_to_adj(l, domains, tree)
        # anonymize tokens
        tokens[ss:se + 1] = ['SUBJ-' + rd['subj_type']] * (se - ss + 1)
        tokens[os:oe + 1] = ['OBJ-' + rd['obj_type']] * (oe - os + 1)
        raw_tokens[ss:se + 1] = zip(raw_tokens[ss:se + 1], (['RAWSUBJ-' + d['subj_type']] * (se - ss + 1)))
        raw_tokens[os:oe + 1] = zip(raw_tokens[os:oe + 1], (['RAWOBJ-' + d['obj_type']] * (oe - os + 1)))
        raw_tokens.append(distance)
        subj_positions = get_positions(ss, se, l)
        obj_positions = get_positions(os, oe, l)
        tokens = map_to_ids(tokens, self.vocab.word2id)
        sdp_mask = 1 * (domain.T[1] == 0)
        # NOTE(review): this branch never defines `aspect`, so the final
        # `batch = [aspect]` below would raise NameError -- this path looks
        # unused or truncated; confirm before relying on it.
    else:
        src_subj = list(range(d['subj_start'], d['subj_end'] + 1))
        src_obj = list(range(d['obj_start'], d['obj_end'] + 1))
        if 'subj_list' in rd.keys():
            subj_list = rd['subj_list']
            obj_list = rd['obj_list']
        else:
            subj_list = [src_subj]
            obj_list = [src_obj]

        def notinter(a, b):
            # True when the two spans do not overlap.
            return len(set(a) & set(b)) == 0

        # Candidate pairs: every non-overlapping (subject mention, object
        # mention) combination.
        relpairs = []
        for subj in subj_list:
            for obj in obj_list:
                if notinter(subj, obj):
                    relpairs.append([subj, obj])
        entity_dep = constant.no_pass
        entity_ids = []
        for i in range(len(tokens)):
            if deprel[i] in entity_dep:
                entity_ids.append([i])
        entity_ner = [ners2id[d['subj_type']]]
        if d['obj_type'] in ners2id.keys():
            entity_ner.append(ners2id[d['obj_type']])
        # presumably NER id 3 triggers the extra POS ids 15/20 -- TODO confirm
        # against constant.NER_TO_ID / POS_TO_ID.
        if 3 in entity_ner:
            entity_pos = [15, 20]
        else:
            entity_pos = []
        tree, domains, distance, relpair, midhead, entity_chains, sdp_domain = head_to_treeEval(
            head, deprel, ner, pos, entity_ner, entity_pos, relpairs, build_mid=True)
        iscross = 0
        depmap, ret, rel, resrel, domain, sdp_domain, domain_subj, domain_obj = tree_to_adj(
            l, domains, tree, entity_chains, sdp_domain)
        obj_mask = [-1] * l
        subj_mask = [-1] * l
        aspect = []
        # Only the pair selected by head_to_treeEval is kept.
        relpairs = [relpair]
        for pair in relpairs:
            subj_span = pair[0]
            obj_span = pair[1]
            rtokens = copy.deepcopy(tokens)
            rrawtokens = copy.deepcopy(tokens)
            rsubjmask = copy.deepcopy(subj_mask)
            robjmask = copy.deepcopy(obj_mask)
            # Mark every other entity in the coref chains with a typed
            # ENTITY_* placeholder (chain element 0 is skipped).
            for entity_pair in entity_chains[1:]:
                entity_span = entity_pair[0]
                entityner = ner[entity_span[0]]
                if entityner == 2:
                    entityner = 3
                rtokens[entity_span[0]:entity_span[-1] + 1] = ['ENTITY_' + id2ners[entityner]] * (entity_span[-1] - entity_span[0] + 1)
                rrawtokens[entity_span[0]:entity_span[-1] + 1] = zip(rrawtokens[entity_span[0]:entity_span[-1] + 1], (
                    ['ENTITY_' + id2ners[entityner]] * (entity_span[-1] - entity_span[0] + 1)))
            rtokens[subj_span[0]:subj_span[-1] + 1] = ['SUBJ-' + rd['subj_type']] * (subj_span[-1] - subj_span[0] + 1)
            rsubjmask[subj_span[0]:subj_span[-1] + 1] = [0] * (subj_span[-1] - subj_span[0] + 1)
            rrawtokens[subj_span[0]:subj_span[-1] + 1] = zip(rrawtokens[subj_span[0]:subj_span[-1] + 1], (
                ['RAWSUBJ-' + d['subj_type']] * (subj_span[-1] - subj_span[0] + 1)))
            rtokens[obj_span[0]:obj_span[-1] + 1] = ['OBJ-' + rd['obj_type']] * (obj_span[-1] - obj_span[0] + 1)
            robjmask[obj_span[0]:obj_span[-1] + 1] = [0] * (obj_span[-1] - obj_span[0] + 1)
            rrawtokens[obj_span[0]:obj_span[-1] + 1] = zip(rrawtokens[obj_span[0]:obj_span[-1] + 1], (
                ['RAWOBJ-' + d['obj_type']] * (obj_span[-1] - obj_span[0] + 1)))
            rrawtokens.append(distance)
            rtokens = map_to_ids(rtokens, self.vocab.word2id)
            # Sentence mask; the trailing period (if any) is masked out.
            mask = [1] * len(rtokens)
            if containDot:
                mask[-1] = 0
            aspect.append((rtokens, pos, rsubjmask, robjmask, ner, depmap, ret, rel, resrel,
                           deprel, domain, sdp_domain, domain_subj, domain_obj, mask, sid,
                           iscross, distance, relation, rrawtokens))
        if [src_subj, src_obj] not in relpairs:
            # The gold pair was not among the selected pairs: rebuild the tree
            # for it and append its aspect tuple as well.
            tree, domains, distanceraw, relpair, midhead, entity_chains, sdp_domain = head_to_treeEval(
                head, deprel, ner, pos, entity_ner, entity_pos, [[src_subj, src_obj]], build_mid=True)
            distance = distanceraw
            depmap, ret, rel, resrel, domain, sdp_domain, domain_subj, domain_obj = tree_to_adj(
                l, domains, tree, entity_chains, sdp_domain)
            obj_span = src_obj
            subj_span = src_subj
            rtokens = copy.deepcopy(tokens)
            rrawtokens = copy.deepcopy(tokens)
            rsubjmask = copy.deepcopy(subj_mask)
            robjmask = copy.deepcopy(obj_mask)
            for entity_pair in entity_chains[1:]:
                # NOTE(review): uses entity_pair[-1] here but entity_pair[0]
                # in the loop above -- possibly intentional (chain tail vs
                # head), possibly a bug; confirm.
                entity_span = entity_pair[-1]
                entityner = ner[entity_span[0]]
                if entityner == 2:
                    entityner = 3
                rtokens[entity_span[0]:entity_span[-1] + 1] = ['ENTITY_' + id2ners[entityner]] * (entity_span[-1] - entity_span[0] + 1)
                rrawtokens[entity_span[0]:entity_span[-1] + 1] = zip(rrawtokens[entity_span[0]:entity_span[-1] + 1], (
                    ['ENTITY_' + id2ners[entityner]] * (entity_span[-1] - entity_span[0] + 1)))
            rtokens[subj_span[0]:subj_span[-1] + 1] = ['SUBJ-' + rd['subj_type']] * (subj_span[-1] - subj_span[0] + 1)
            rsubjmask[subj_span[0]:subj_span[-1] + 1] = [0] * (subj_span[-1] - subj_span[0] + 1)
            rrawtokens[subj_span[0]:subj_span[-1] + 1] = zip(rrawtokens[subj_span[0]:subj_span[-1] + 1], (
                ['RAWSUBJ-' + d['subj_type']] * (subj_span[-1] - subj_span[0] + 1)))
            rtokens[obj_span[0]:obj_span[-1] + 1] = ['OBJ-' + rd['obj_type']] * (obj_span[-1] - obj_span[0] + 1)
            robjmask[obj_span[0]:obj_span[-1] + 1] = [0] * (obj_span[-1] - obj_span[0] + 1)
            rrawtokens[obj_span[0]:obj_span[-1] + 1] = zip(rrawtokens[obj_span[0]:obj_span[-1] + 1], (
                ['RAWOBJ-' + d['obj_type']] * (obj_span[-1] - obj_span[0] + 1)))
            rrawtokens.append(distance)
            rtokens = map_to_ids(rtokens, self.vocab.word2id)
            mask = [1] * len(rtokens)
            if containDot:
                mask[-1] = 0
            aspect.append(
                (rtokens, pos, rsubjmask, robjmask, ner, depmap, ret, rel, resrel, deprel,
                 domain, sdp_domain, domain_subj, domain_obj, mask, sid, iscross, distance,
                 relation, rrawtokens))
    batch = [aspect]
    return batch
def input_to_adj(self, head, words, prune_k, subj_pos, obj_pos):
    """Turn one sentence's head list into an undirected adjacency tensor
    (no self loops) over its pruned dependency tree."""
    n = len(words)
    tree = head_to_tree(head, words, n, prune_k, subj_pos, obj_pos)
    matrix = tree_to_adj(n, tree, directed=False, self_loop=False)
    return torch.from_numpy(matrix)