def read_amr_file(inpath):
    """Load AMR/sentence pairs from a JSON file and build neighbor lists.

    ``inpath`` must contain a JSON array of objects, each with an ``'amr'``
    string, a ``'sent'`` string and optionally an ``'id'``.  Characters that
    are not in the module-level ``printable`` collection (presumably
    ``string.printable`` -- confirm against the file's imports) are dropped
    from both strings before they are tokenized on whitespace.

    For every instance the AMR tokens are handed to
    ``amr_utils.read_anonymized``, which fills a node list and a list of
    ``(i, j, label)`` edges.  Each node then gets:

    * in-neighbors: itself (label ``':self'``) plus every ``i`` of an edge
      ``(i, j, label)`` that ends at this node ``j``;
    * out-neighbors: itself (label ``':self'``) plus every ``j`` of an edge
      ``(i, j, label)`` that starts at this node ``i``.

    Returns:
        A 5-tuple ``(instances, max_node, max_in_neigh, max_out_neigh,
        max_sent)``.  ``instances`` is a list of 8-tuples ``(nodes,
        in_indices, in_edges, out_indices, out_edges, sentence_tokens, id,
        'amr')``; the four integers are the maxima over all instances of the
        node count, the in/out neighbor-list lengths and the sentence length
        (all 0 for an empty file).
    """
    nodes = []  # [batch, node_num]
    in_neigh_indices = []  # [batch, node_num, neighbor_num]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num]
    out_neigh_edges = []
    sentences = []  # [batch, sent_length]
    ids = []
    types = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0

    # Plain "r": the "U" flag was removed in Python 3.11, and line-ending
    # differences are irrelevant to the JSON parser anyway.
    with open(inpath, "r") as f:
        instances = json.load(f)

    for inst in instances:
        # ''.join(...) yields a string on both Python 2 and 3, whereas
        # filter() only returns a string on Python 2.
        amr = ''.join(ch for ch in inst['amr'] if ch in printable)
        sent = ''.join(ch for ch in inst['sent'] if ch in printable).strip().split()
        inst_id = inst.get('id')  # None when the instance carries no id

        amr_node = []
        amr_edge = []
        amr_utils.read_anonymized(amr.strip().split(), amr_node, amr_edge)

        # Every node starts out as its own neighbor via a ':self' edge.
        node_count = len(amr_node)
        in_indices = [[i] for i in range(node_count)]
        in_edges = [[':self'] for _ in range(node_count)]
        out_indices = [[i] for i in range(node_count)]
        out_edges = [[':self'] for _ in range(node_count)]
        for (i, j, lb) in amr_edge:
            in_indices[j].append(i)
            in_edges[j].append(lb)
            out_indices[i].append(j)
            out_edges[i].append(lb)

        nodes.append(amr_node)
        in_neigh_indices.append(in_indices)
        in_neigh_edges.append(in_edges)
        out_neigh_indices.append(out_indices)
        out_neigh_edges.append(out_edges)
        sentences.append(sent)
        ids.append(inst_id)
        types.append('amr')

        # Track padding sizes.  A graph without nodes has no neighbor lists,
        # and max() over an empty sequence would raise ValueError.
        if amr_node:
            max_in_neigh = max(max_in_neigh, max(len(x) for x in in_indices))
            max_out_neigh = max(max_out_neigh, max(len(x) for x in out_indices))
        max_node = max(max_node, node_count)
        max_sent = max(max_sent, len(sent))

    # list(...) keeps the Python 2 contract (a real, re-iterable list) on
    # Python 3, where zip() is a one-shot iterator.
    instances_out = list(zip(nodes, in_neigh_indices, in_neigh_edges,
                             out_neigh_indices, out_neigh_edges,
                             sentences, ids, types))
    return instances_out, max_node, max_in_neigh, max_out_neigh, max_sent
def stat(inpath):
    """Print the average number of AMR nodes per line of ``inpath``.

    Each line of the file is whitespace-tokenized and passed to
    ``amr_utils.read_anonymized``; the number of nodes it produces is
    accumulated and the mean over all lines is written to stdout.  An empty
    file prints ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_nodes = 0
    graph_count = 0
    # Plain "r": the "U" flag was removed in Python 3.11 and text mode
    # already applies universal newlines on Python 3.
    with open(inpath, "r") as f:
        for line in f:
            amr_node = []
            amr_edge = []  # filled by the reader but not needed for the statistic
            amr_utils.read_anonymized(line.strip().split(), amr_node, amr_edge)
            total_nodes += len(amr_node)
            graph_count += 1

    average = 1.0 * total_nodes / graph_count if graph_count else 0.0
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(average)
def read_amr_file(amr_path):
    """Load one AMR per line from ``amr_path`` and build neighbor lists.

    Each line is whitespace-tokenized and passed to
    ``amr_utils.read_anonymized``, which fills a node list and a list of
    ``(i, j, label)`` edges.  Each node then gets:

    * in-neighbors: itself (label ``':self'``) plus every ``i`` of an edge
      ``(i, j, label)`` that ends at this node ``j``;
    * out-neighbors: itself (label ``':self'``) plus every ``j`` of an edge
      ``(i, j, label)`` that starts at this node ``i``.

    Returns:
        A list with one 5-tuple per line: ``(nodes, in_indices, in_edges,
        out_indices, out_edges)``.
    """
    nodes = []  # [batch, node_num]
    in_neigh_indices = []  # [batch, node_num, neighbor_num]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num]
    out_neigh_edges = []

    with open(amr_path, "r") as f:
        for line in f:
            amr_node = []
            amr_edge = []
            amr_utils.read_anonymized(line.strip().split(), amr_node, amr_edge)

            # Every node starts out as its own neighbor via a ':self' edge.
            node_count = len(amr_node)
            in_indices = [[i] for i in range(node_count)]
            in_edges = [[':self'] for _ in range(node_count)]
            out_indices = [[i] for i in range(node_count)]
            out_edges = [[':self'] for _ in range(node_count)]
            for (i, j, lb) in amr_edge:
                in_indices[j].append(i)
                in_edges[j].append(lb)
                out_indices[i].append(j)
                out_edges[i].append(lb)

            nodes.append(amr_node)
            in_neigh_indices.append(in_indices)
            in_neigh_edges.append(in_edges)
            out_neigh_indices.append(out_indices)
            out_neigh_edges.append(out_edges)
            # NOTE: the previous version also tracked max node / neighbor
            # counts here but never returned them; that dead bookkeeping
            # raised ValueError (max() of an empty sequence) for a graph
            # without nodes, so it has been removed.

    # list(...) keeps the Python 2 contract (a real, re-iterable list) on
    # Python 3, where zip() is a one-shot iterator.
    return list(zip(nodes, in_neigh_indices, in_neigh_edges,
                    out_neigh_indices, out_neigh_edges))
def read_amr_file(inpath, use_bpe=False):
    """Load AMR/sentence/mask triples from a JSON file and build neighbor lists.

    ``inpath`` must contain a JSON array of objects.  Which fields are read
    depends on ``use_bpe``:

    * ``use_bpe=True``: ``'amr_bpe'``, ``'sent_bpe'`` and ``'sent_mask_bpe'``
      are split on whitespace and the AMR is parsed with
      ``amr_utils.read_bpe_anonymized``.
    * ``use_bpe=False``: ``'amr'`` is split on whitespace and parsed with
      ``amr_utils.read_anonymized``; ``'sent'`` and ``'sent_mask'`` are split
      on the literal token ``'<SPACE>'`` and must have the same length.

    In both modes ``'id'`` is optional.  The parser fills a node list and a
    list of ``(i, j, label)`` edges; each node then gets itself as first
    in- and out-neighbor (label ``':self'``), every ``i`` of an edge ending
    at it as in-neighbor, and every ``j`` of an edge starting at it as
    out-neighbor.

    Returns:
        A 5-tuple ``(instances, max_node, max_in_neigh, max_out_neigh,
        max_sent)``.  ``instances`` is a list of 8-tuples ``(nodes,
        in_indices, in_edges, out_indices, out_edges, sentence_tokens,
        sentence_mask_tokens, id)``; the four integers are the maxima over
        all instances (all 0 for an empty file).

    Raises:
        AssertionError: in non-BPE mode, when the sentence and its mask have
            a different number of tokens.
    """
    nodes = []  # [batch, node_num]
    in_neigh_indices = []  # [batch, node_num, neighbor_num]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num]
    out_neigh_edges = []
    sentences = []  # [batch, sent_length]
    sentences_pos = []  # [batch, sent_length]
    ids = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0

    # Plain "r": the "U" flag was removed in Python 3.11, and line-ending
    # differences are irrelevant to the JSON parser anyway.
    with open(inpath, "r") as f:
        instances = json.load(f)

    for inst in instances:
        amr_node = []
        amr_edge = []
        if use_bpe:
            amr = inst['amr_bpe']
            sent = inst['sent_bpe'].strip().split()
            sent_pos = inst['sent_mask_bpe'].strip().split()
            amr_utils.read_bpe_anonymized(amr.strip().split(), amr_node, amr_edge)
        else:
            amr = inst['amr']
            sent = inst['sent'].strip().split('<SPACE>')
            sent_pos = inst['sent_mask'].strip().split('<SPACE>')
            assert len(sent_pos) == len(sent), "sent_pos is {0}, sent is {1}".format(len(sent_pos), len(sent))
            amr_utils.read_anonymized(amr.strip().split(), amr_node, amr_edge)
        inst_id = inst.get('id')  # None when the instance carries no id

        # Every node starts out as its own neighbor via a ':self' edge.
        node_count = len(amr_node)
        in_indices = [[i] for i in range(node_count)]
        in_edges = [[':self'] for _ in range(node_count)]
        out_indices = [[i] for i in range(node_count)]
        out_edges = [[':self'] for _ in range(node_count)]
        for (i, j, lb) in amr_edge:
            in_indices[j].append(i)
            in_edges[j].append(lb)
            out_indices[i].append(j)
            out_edges[i].append(lb)

        nodes.append(amr_node)
        in_neigh_indices.append(in_indices)
        in_neigh_edges.append(in_edges)
        out_neigh_indices.append(out_indices)
        out_neigh_edges.append(out_edges)
        sentences.append(sent)
        sentences_pos.append(sent_pos)
        ids.append(inst_id)

        # Track padding sizes.  A graph without nodes has no neighbor lists,
        # and max() over an empty sequence would raise ValueError.
        if amr_node:
            max_in_neigh = max(max_in_neigh, max(len(x) for x in in_indices))
            max_out_neigh = max(max_out_neigh, max(len(x) for x in out_indices))
        max_node = max(max_node, node_count)
        max_sent = max(max_sent, len(sent))

    # list(...) keeps the Python 2 contract (a real, re-iterable list) on
    # Python 3, where zip() is a one-shot iterator.
    instances_out = list(zip(nodes, in_neigh_indices, in_neigh_edges,
                             out_neigh_indices, out_neigh_edges,
                             sentences, sentences_pos, ids))
    return instances_out, max_node, max_in_neigh, max_out_neigh, max_sent