Пример #1
0
def read_amr_file(inpath):
    """Load AMR graphs and their sentences from a JSON file.

    The file must contain a JSON list of objects with the keys ``'amr'``
    and ``'sent'`` and, optionally, ``'id'``. Characters that are not in
    ``string.printable`` are dropped from both fields before tokenizing
    on whitespace.

    ``amr_utils.read_anonymized`` is expected to fill the node list and the
    edge list in place; every edge is unpacked as ``(i, j, label)`` and
    registers ``i`` as an incoming neighbor of ``j`` and ``j`` as an
    outgoing neighbor of ``i``. Each node additionally lists itself with
    the label ``':self'`` as its first neighbor in both directions.

    Args:
        inpath: Path of the JSON file to read.

    Returns:
        A tuple ``(data, max_node, max_in_neigh, max_out_neigh, max_sent)``.
        ``data`` is a list with one 8-tuple per instance:
        ``(nodes, in_indices, in_edges, out_indices, out_edges, sentence,
        id, 'amr')``. The remaining items are the largest node count,
        incoming/outgoing neighbor counts (self entry included) and
        sentence length seen in the file; all are 0 for an empty file.
    """
    nodes = []  # [batch, node_num,]
    in_neigh_indices = []  # [batch, node_num, neighbor_num,]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num,]
    out_neigh_edges = []
    sentences = []  # [batch, sent_length,]
    ids = []
    types = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0

    # Plain "r": the "U" flag is rejected by Python >= 3.11, and newline
    # style is irrelevant to the JSON parser anyway.
    with open(inpath, "r") as f:
        instances = json.load(f)

    for inst in instances:
        # ''.join(...) yields a string on both Python 2 and 3, whereas
        # filter() returns an iterator on Python 3.
        amr = ''.join(ch for ch in inst['amr'] if ch in printable)
        sent = ''.join(
            ch for ch in inst['sent'] if ch in printable).strip().split()
        inst_id = inst.get('id')

        amr_node = []
        amr_edge = []
        amr_utils.read_anonymized(amr.strip().split(), amr_node, amr_edge)

        # 1. nodes
        nodes.append(amr_node)

        # 2. & 3. neighbor indices and edge labels, seeded with a self loop
        node_count = len(amr_node)
        in_indices = [[i] for i in range(node_count)]
        in_edges = [[':self'] for _ in range(node_count)]
        out_indices = [[i] for i in range(node_count)]
        out_edges = [[':self'] for _ in range(node_count)]
        for (i, j, lb) in amr_edge:
            in_indices[j].append(i)
            in_edges[j].append(lb)
            out_indices[i].append(j)
            out_edges[i].append(lb)
        in_neigh_indices.append(in_indices)
        in_neigh_edges.append(in_edges)
        out_neigh_indices.append(out_indices)
        out_neigh_edges.append(out_edges)

        # 4. sentence and bookkeeping
        sentences.append(sent)
        ids.append(inst_id)
        types.append('amr')

        # Update lengths. Folding the running maximum into the candidate
        # list keeps max() from failing on a graph without nodes.
        max_in_neigh = max([max_in_neigh] + [len(x) for x in in_indices])
        max_out_neigh = max([max_out_neigh] + [len(x) for x in out_indices])
        max_node = max(max_node, node_count)
        max_sent = max(max_sent, len(sent))

    # list(...) so callers get an indexable list on Python 3 as well.
    data = list(zip(nodes, in_neigh_indices, in_neigh_edges,
                    out_neigh_indices, out_neigh_edges, sentences, ids,
                    types))
    return data, max_node, max_in_neigh, max_out_neigh, max_sent
Пример #2
0
def stat(inpath):
    """Print the average number of AMR nodes per line of ``inpath``.

    Every line of the file is tokenized on whitespace and handed to
    ``amr_utils.read_anonymized``, which is expected to fill the node and
    edge lists in place. Blank lines are counted as instances too.

    Args:
        inpath: Path of a text file with one anonymized AMR per line.

    Returns:
        None. The average is written to standard output; ``0.0`` is
        printed for a file without lines instead of dividing by zero.
    """
    nodes = 0
    nums = 0
    # Plain "r": the "U" flag is rejected by Python >= 3.11.
    with open(inpath, "r") as f:
        for line in f:
            amr_node = []
            amr_edge = []
            # split() without arguments already ignores surrounding
            # whitespace, so no explicit strip() is needed.
            amr_utils.read_anonymized(line.split(), amr_node, amr_edge)
            nodes += len(amr_node)
            nums += 1
    average = 1.0 * nodes / nums if nums else 0.0
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(average)
Пример #3
0
def read_amr_file(amr_path):
    """Load one anonymized AMR per line and build its neighbor lists.

    Every line is tokenized on whitespace and passed to
    ``amr_utils.read_anonymized``, which is expected to fill the node list
    and the edge list in place. Every edge is unpacked as
    ``(i, j, label)`` and registers ``i`` as an incoming neighbor of ``j``
    and ``j`` as an outgoing neighbor of ``i``. Each node additionally
    lists itself with the label ``':self'`` as its first neighbor in both
    directions.

    Args:
        amr_path: Path of a text file with one anonymized AMR per line.

    Returns:
        A list with one 5-tuple per line:
        ``(nodes, in_indices, in_edges, out_indices, out_edges)``.
    """
    nodes = []  # [batch, node_num,]
    in_neigh_indices = []  # [batch, node_num, neighbor_num,]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num,]
    out_neigh_edges = []
    with open(amr_path, "r") as f:
        for inst in f:
            amr_node = []
            amr_edge = []
            # split() without arguments already ignores surrounding
            # whitespace, so no explicit strip() is needed.
            amr_utils.read_anonymized(inst.split(), amr_node, amr_edge)

            # 1. nodes
            nodes.append(amr_node)

            # 2. & 3. neighbor indices and edge labels, seeded with a
            # self loop
            node_count = len(amr_node)
            in_indices = [[i] for i in range(node_count)]
            in_edges = [[':self'] for _ in range(node_count)]
            out_indices = [[i] for i in range(node_count)]
            out_edges = [[':self'] for _ in range(node_count)]
            for (i, j, lb) in amr_edge:
                in_indices[j].append(i)
                in_edges[j].append(lb)
                out_indices[i].append(j)
                out_edges[i].append(lb)
            in_neigh_indices.append(in_indices)
            in_neigh_edges.append(in_edges)
            out_neigh_indices.append(out_indices)
            out_neigh_edges.append(out_edges)
            # No running maxima are tracked here: this variant never
            # returned them, and computing them failed on empty graphs.

    # list(...) so callers get an indexable list on Python 3 as well.
    return list(zip(nodes, in_neigh_indices, in_neigh_edges,
                    out_neigh_indices, out_neigh_edges))
def read_amr_file(inpath, use_bpe=False):
    """Load AMR graphs, sentences and sentence masks from a JSON file.

    The file must contain a JSON list of objects. With ``use_bpe`` the
    keys ``'amr_bpe'``, ``'sent_bpe'`` and ``'sent_mask_bpe'`` are read,
    the sentence fields are split on whitespace and the graph is parsed by
    ``amr_utils.read_bpe_anonymized``. Otherwise ``'amr'``, ``'sent'`` and
    ``'sent_mask'`` are read, the sentence fields are split on the literal
    ``'<SPACE>'`` marker and the graph is parsed by
    ``amr_utils.read_anonymized``. An optional ``'id'`` key is carried
    along in both modes.

    Both parsers are expected to fill the node list and the edge list in
    place. Every edge is unpacked as ``(i, j, label)`` and registers ``i``
    as an incoming neighbor of ``j`` and ``j`` as an outgoing neighbor of
    ``i``. Each node additionally lists itself with the label ``':self'``
    as its first neighbor in both directions.

    Args:
        inpath: Path of the JSON file to read.
        use_bpe: Select the BPE fields and the BPE graph parser.

    Returns:
        A tuple ``(data, max_node, max_in_neigh, max_out_neigh, max_sent)``.
        ``data`` is a list with one 8-tuple per instance:
        ``(nodes, in_indices, in_edges, out_indices, out_edges, sentence,
        sentence_mask, id)``. The remaining items are the largest node
        count, incoming/outgoing neighbor counts (self entry included) and
        sentence length seen in the file; all are 0 for an empty file.

    Raises:
        AssertionError: In non-BPE mode, when the sentence and its mask
            have a different number of tokens.
    """
    nodes = []  # [batch, node_num,]
    in_neigh_indices = []  # [batch, node_num, neighbor_num,]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num,]
    out_neigh_edges = []
    sentences = []  # [batch, sent_length,]
    sentences_pos = []  # [batch, sent_length,]
    ids = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0

    # Plain "r": the "U" flag is rejected by Python >= 3.11, and newline
    # style is irrelevant to the JSON parser anyway.
    with open(inpath, "r") as f:
        instances = json.load(f)

    for inst in instances:
        amr_node = []
        amr_edge = []
        if use_bpe:
            amr = inst['amr_bpe']
            sent = inst['sent_bpe'].strip().split()
            sent_pos = inst['sent_mask_bpe'].strip().split()
            amr_utils.read_bpe_anonymized(amr.strip().split(), amr_node, amr_edge)
        else:
            amr = inst['amr']
            sent = inst['sent'].strip().split('<SPACE>')
            sent_pos = inst['sent_mask'].strip().split('<SPACE>')
            # Kept as an assert so callers still see AssertionError.
            assert len(sent_pos) == len(sent), "sent_pos is {0}, sent is {1}".format(len(sent_pos), len(sent))
            amr_utils.read_anonymized(amr.strip().split(), amr_node, amr_edge)
        inst_id = inst.get('id')

        # 1. nodes
        nodes.append(amr_node)

        # 2. & 3. neighbor indices and edge labels, seeded with a self loop
        node_count = len(amr_node)
        in_indices = [[i] for i in range(node_count)]
        in_edges = [[':self'] for _ in range(node_count)]
        out_indices = [[i] for i in range(node_count)]
        out_edges = [[':self'] for _ in range(node_count)]
        for (i, j, lb) in amr_edge:
            in_indices[j].append(i)
            in_edges[j].append(lb)
            out_indices[i].append(j)
            out_edges[i].append(lb)
        in_neigh_indices.append(in_indices)
        in_neigh_edges.append(in_edges)
        out_neigh_indices.append(out_indices)
        out_neigh_edges.append(out_edges)

        # 4. sentence, mask and id
        sentences.append(sent)
        sentences_pos.append(sent_pos)
        ids.append(inst_id)

        # Update lengths. Folding the running maximum into the candidate
        # list keeps max() from failing on a graph without nodes.
        max_in_neigh = max([max_in_neigh] + [len(x) for x in in_indices])
        max_out_neigh = max([max_out_neigh] + [len(x) for x in out_indices])
        max_node = max(max_node, node_count)
        max_sent = max(max_sent, len(sent))

    # list(...) so callers get an indexable list on Python 3 as well.
    data = list(zip(nodes, in_neigh_indices, in_neigh_edges,
                    out_neigh_indices, out_neigh_edges, sentences,
                    sentences_pos, ids))
    return data, max_node, max_in_neigh, max_out_neigh, max_sent