Пример #1
0
def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, mode='train', **kwargs):
    """
    Build per-graph index matrices plus entity counts and entity-id pairs.

    Only graphs with between 1 and MAX_EDGES_PER_GRAPH edges are kept; all
    other graphs are discarded. One matrix row is allocated for every kept
    graph. A kept graph whose "vertexSet" is missing or malformed is skipped:
    its row stays all-zero and it contributes nothing to ``entity_cnt`` or
    ``entity_pair``.

    :param graphs: iterable of dicts with "tokens", "edgeSet" and "vertexSet";
        vertices carry "tokenpositions" and "kbID", edges carry "left" and
        "right"
    :param word2idx: mapping handed to embedding_utils.get_idx_sequence
    :param property2idx: mapping from property id to label index; the entry
        for embedding_utils.unknown is used as the fallback label
    :param max_sent_len: number of columns per sentence; longer token
        sequences are truncated to this length
    :param mode: not used by this function (kept for interface compatibility)
    :param kwargs: ignored
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt,
        entity_pair)`` where ``entity_cnt`` is an int32 array of vertex counts
        and ``entity_pair`` is a list (one item per processed graph) of lists
        of ``(left_kbID, right_kbID)`` tuples
    """
    # Graphs without edges and graphs with too many edges are both dropped.
    graphs = [
        g for g in graphs if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    # NOTE(review): this mapping accumulates across graphs, so a position
    # missing from the current graph can resolve to an id of an earlier
    # graph -- confirm whether that is intended.
    pos2id = dict()
    entity_pair = []
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        try:
            vertices = g["vertexSet"]
            vertex_count = len(vertices)
            graph_pos2id = {
                tuple(v['tokenpositions']): v['kbID'] for v in vertices
            }
        except (KeyError, TypeError):
            # Best-effort: a graph without a usable vertex set is skipped and
            # leaves no partial state behind.
            continue
        # Record the graph only after its vertex set parsed completely, so
        # entity_cnt and entity_pair always describe the same graphs.
        entity_cnt.append(vertex_count)
        pos2id.update(graph_pos2id)
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            # The edge's slot is remapped from its position and the number
            # of vertices by calculate_order_conversion.
            new_j = calculate_order_conversion(j, vertex_count)
            entity_matrix[index, new_j, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            y_matrix[index, new_j] = property_kbid
            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)
    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_pair
Пример #2
0
def to_indices_and_entity_pair(graphs,
                               word2idx,
                               property2idx,
                               max_sent_len,
                               replace_entities_with_unkown=False,
                               mode='train',
                               **kwargs):
    """
    Build one matrix row per non-blacklisted edge, plus its entity-id pair.

    Graphs whose "vertexSet" is missing or malformed are skipped. Rows that
    were reserved for the edges of skipped graphs are trimmed from the
    returned matrices, so every returned row has a matching ``entity_pair``
    entry.

    :param graphs: re-iterable collection of dicts with "tokens", "edgeSet"
        and "vertexSet"; vertices carry "tokenpositions" and "kbID", edges
        carry "kbID", "left" and "right"
    :param word2idx: mapping handed to embedding_utils.get_idx_sequence; its
        entry for embedding_utils.unknown replaces entity tokens when
        ``replace_entities_with_unkown`` is set
    :param property2idx: mapping from property id to label index; the entry
        for embedding_utils.unknown is used as the fallback label
    :param max_sent_len: number of columns per sentence; longer token
        sequences are truncated to this length
    :param replace_entities_with_unkown: if true, tokens at the positions
        listed in the edge's "left" and "right" are replaced by the unknown
        word index
    :param mode: labels are written to ``y_matrix`` only when this is "train"
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix, entity_pair]``
    """
    num_edges = sum(
        1 for g in graphs for e in g['edgeSet']
        if e['kbID'] not in property_blacklist)
    print("Dataset number of edges: {}".format(num_edges))
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    # NOTE(review): this mapping accumulates across graphs, so a position
    # missing from the current graph can resolve to an id of an earlier
    # graph -- confirm whether that is intended.
    pos2id = dict()
    entity_pair = []
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        try:
            graph_pos2id = {
                tuple(v['tokenpositions']): v['kbID'] for v in g['vertexSet']
            }
        except (KeyError, TypeError):
            # Best-effort: graphs without a usable vertex set are skipped.
            continue
        pos2id.update(graph_pos2id)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] in property_blacklist:
                continue
            if replace_entities_with_unkown:
                # Concatenate the entity positions once per edge instead of
                # once per token.
                entity_positions = edge["left"] + edge["right"]
                row_tokens = [
                    word2idx[embedding_utils.unknown]
                    if i in entity_positions else t
                    for i, t in enumerate(token_ids)
                ]
            else:
                row_tokens = token_ids
            sentences_matrix[index, :len(token_ids)] = row_tokens
            entity_matrix[index, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
            if mode == "train":
                _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                property_kbid = property2idx.get(
                    property_kbid, property2idx[embedding_utils.unknown])
                y_matrix[index] = property_kbid
            entity_pair.append((pos2id[tuple(edge['left'])],
                                pos2id[tuple(edge['right'])]))
            index += 1
    if index < num_edges:
        # Edges of skipped graphs were counted above but never written; drop
        # their all-zero rows so the matrices stay aligned with entity_pair.
        sentences_matrix = sentences_matrix[:index]
        entity_matrix = entity_matrix[:index]
        y_matrix = y_matrix[:index]
    return [sentences_matrix, entity_matrix, y_matrix, entity_pair]
Пример #3
0
def to_indices_with_relative_positions_and_pcnn_mask_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, position2idx, **kwargs):
    """
    Build one row per edge with relative-position features and a PCNN mask.

    Graphs whose "vertexSet" is missing or malformed are skipped. Rows that
    were reserved for the edges of skipped graphs are trimmed from the
    returned matrices, so every returned row has a matching ``entity_pair``
    entry.

    :param graphs: re-iterable collection of dicts with "tokens", "edgeSet"
        and "vertexSet"; vertices carry "tokenpositions" and "kbID", edges
        carry "left" and "right"
    :param word2idx: mapping handed to embedding_utils.get_idx_sequence
    :param property2idx: mapping from property id to label index; the entry
        for embedding_utils.unknown is used as the fallback label
    :param max_sent_len: number of columns per sentence; longer token
        sequences are truncated to this length
    :param position2idx: mapping from a clamped relative position to its index
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix, pcnn_mask,
        entity_pair]`` where ``entity_pair`` is a flat list of
        ``(left_kbID, right_kbID)`` tuples, one per edge
    :raises KeyError: if embedding_utils.unknown is not in ``property2idx``
    """
    num_edges = sum(1 for g in graphs for _ in g['edgeSet'])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    # NOTE(review): int8 cannot hold position2idx values above 127 -- confirm
    # the index range produced for the max_sent_len in use.
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    pcnn_mask = np.zeros((num_edges, 3, max_sent_len), dtype="float32")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1

    def clamp(offset):
        # Limit a relative position to [-max_entity_index, max_entity_index].
        if offset < -max_entity_index:
            return -max_entity_index
        if offset > max_entity_index:
            return max_entity_index
        return offset

    entity_pair = []
    # NOTE(review): this mapping accumulates across graphs, so a position
    # missing from the current graph can resolve to an id of an earlier
    # graph -- confirm whether that is intended.
    pos2id = dict()
    for g in tqdm.tqdm(graphs, ascii=True):
        try:
            graph_pos2id = {
                tuple(v['tokenpositions']): v['kbID'] for v in g['vertexSet']
            }
        except (KeyError, TypeError):
            # Best-effort: graphs without a usable vertex set are skipped.
            continue
        pos2id.update(graph_pos2id)
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        n_tokens = len(token_ids)
        for edge in g["edgeSet"]:
            sentences_matrix[index, :n_tokens] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # A missing "unknown" label now surfaces as a KeyError instead of
            # dropping into an interactive debugger.
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_vector = [
                (clamp(m1), clamp(m2))
                for _, m1, m2 in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="position")
            ]
            entity_matrix[index, :, :n_tokens] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]
            mask_0, mask_1, mask_2 = graph_utils.get_pcnn_mask(token_ids, edge)
            pcnn_mask[index, 0, :n_tokens] = mask_0
            pcnn_mask[index, 1, :n_tokens] = mask_1
            pcnn_mask[index, 2, :n_tokens] = mask_2
            y_matrix[index] = property_kbid
            index += 1
            entity_pair.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
    if index < num_edges:
        # Edges of skipped graphs were counted above but never written; drop
        # their all-zero rows so the matrices stay aligned with entity_pair.
        sentences_matrix = sentences_matrix[:index]
        entity_matrix = entity_matrix[:index]
        y_matrix = y_matrix[:index]
        pcnn_mask = pcnn_mask[:index]
    return [sentences_matrix, entity_matrix, y_matrix, pcnn_mask, entity_pair]
Пример #4
0
def to_indices_with_real_entities_completely(graphs,
                                             word2idx,
                                             property2idx,
                                             max_sent_len,
                                             mode='train',
                                             **kwargs):
    """
    Build per-graph index matrices with MAX_EDGES_PER_GRAPH edge slots each.

    Graphs without edges are dropped. Graphs with more than
    MAX_EDGES_PER_GRAPH edges are split into several entries that share the
    same tokens and each hold at most MAX_EDGES_PER_GRAPH edges. Edge slots
    that are not filled keep the zero they were initialised with.
    NOTE(review): the earlier description of this function said it adds N/A
    relations for entity pairs with no relation; presumably that refers to
    these zero-filled slots -- confirm.

    :param graphs: iterable of dicts with "tokens" and "edgeSet"
    :param word2idx: mapping handed to embedding_utils.get_idx_sequence
    :param property2idx: mapping from property id to label index; the entry
        for embedding_utils.unknown is used as the fallback label
    :param max_sent_len: number of columns per sentence; longer token
        sequences are truncated to this length
    :param mode: not used by this function
    :param kwargs: ignored
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix)``
    """
    chunks = []
    for graph in graphs:
        if len(graph['edgeSet']) == 0:
            continue
        if len(graph['edgeSet']) <= MAX_EDGES_PER_GRAPH:
            chunks.append(graph)
            continue
        # Too many edges: cut the edge list into consecutive windows.
        for start in range(0, len(graph['edgeSet']), MAX_EDGES_PER_GRAPH):
            window = {
                "tokens": graph["tokens"],
                "edgeSet": graph["edgeSet"][start:start + MAX_EDGES_PER_GRAPH],
            }
            chunks.append(window)
    graphs = chunks

    n_rows = len(graphs)
    sentences_matrix = np.zeros((n_rows, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((n_rows, MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((n_rows, MAX_EDGES_PER_GRAPH), dtype="int16")

    for row, graph in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_ids = embedding_utils.get_idx_sequence(graph["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        width = len(token_ids)
        sentences_matrix[row, :width] = token_ids
        edges = graph["edgeSet"][:MAX_EDGES_PER_GRAPH]
        for slot, edge in enumerate(edges):
            marked = graph_utils.get_entity_indexed_vector(
                token_ids, edge, mode="mark-bi")
            entity_matrix[row, slot, :width] = [mark for _, mark in marked]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, graph)
            # Property ids without an index fall back to the unknown label.
            y_matrix[row, slot] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
    return sentences_matrix, entity_matrix, y_matrix
Пример #5
0
def to_indices_with_relative_positions(graphs, word2idx, property2idx,
                                       max_sent_len, position2idx, **kwargs):
    """
    Build one row per edge with token ids and relative-position features.

    :param graphs: re-iterable collection of dicts with "tokens" and "edgeSet"
    :param word2idx: mapping handed to embedding_utils.get_idx_sequence
    :param property2idx: mapping from property id to label index; the entry
        for embedding_utils.unknown is used as the fallback label
    :param max_sent_len: number of columns per sentence; longer token
        sequences are truncated to this length
    :param position2idx: mapping from a clamped relative position to its index
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix]``
    :raises KeyError: if embedding_utils.unknown is not in ``property2idx``
    """
    num_edges = sum(1 for g in graphs for _ in g['edgeSet'])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    # NOTE(review): int8 cannot hold position2idx values above 127 -- confirm
    # the index range produced for the max_sent_len in use.
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1

    def clamp(offset):
        # Limit a relative position to [-max_entity_index, max_entity_index].
        if offset < -max_entity_index:
            return -max_entity_index
        if offset > max_entity_index:
            return max_entity_index
        return offset

    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        n_tokens = len(token_ids)
        for edge in g["edgeSet"]:
            sentences_matrix[index, :n_tokens] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # A missing "unknown" label now surfaces as a KeyError instead of
            # dropping into an interactive debugger.
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_vector = [
                (clamp(m1), clamp(m2))
                for _, m1, m2 in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="position")
            ]
            entity_matrix[index, :, :n_tokens] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]

            y_matrix[index] = property_kbid
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix]