def to_indices_with_relative_positions(graphs, word2idx):
    """Convert graphs to index matrices, encoding each token's position relative to both entities."""
    max_sent_len = model_params['max_sent_len']
    num_edges = sum(len(g['edgeSet']) for g in graphs)
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx[property_kbid]
            entity_vector = graph_utils.get_entity_indexed_vector(
                token_ids, edge, mode="position")
            # Clip relative positions to the valid range of the position embeddings.
            entity_vector = [
                (max(-max_entity_index, min(max_entity_index, m1)),
                 max(-max_entity_index, min(max_entity_index, m2)))
                for _, m1, m2 in entity_vector
            ]
            entity_matrix[index, :, :len(token_ids)] = [
                [position2idx[m1] for m1, _ in entity_vector],
                [position2idx[m2] for _, m2 in entity_vector],
            ]

            y_matrix[index] = property_kbid
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
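A minimal usage sketch for the converter above, inferred from how the graph fields are accessed (`tokens`, `edgeSet`, `left`, `right`, `kbID`); the sentence and KB id are hypothetical, and the module-level `word2idx`, `property2idx`, `position2idx` and `model_params` are assumed to be initialized:
# Hypothetical input graph -- values are made up for illustration only.
graph = {
    "tokens": ["Berlin", "is", "the", "capital", "of", "Germany", "."],
    "edgeSet": [
        # "left"/"right": token indices of the two entities; "kbID": KB property id.
        {"left": [0], "right": [5], "kbID": "P1376"},
    ],
}
sentences, entities, labels = to_indices_with_relative_positions([graph], word2idx)
# sentences: (num_edges, max_sent_len)    int32 token ids
# entities:  (num_edges, 2, max_sent_len) int8 position-embedding indices
# labels:    (num_edges,)                 int16 property label indices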
def to_indices_with_extracted_entities(graphs, word2idx):
    """Convert graphs to index matrices, packing up to MAX_EDGES_PER_GRAPH edges into one row per graph."""
    max_sent_len = model_params['max_sent_len']
    graphs = split_graphs(graphs)
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_sent_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_sent_ids) > max_sent_len:
            token_sent_ids = token_sent_ids[:max_sent_len]
        sentences_matrix[index, :len(token_sent_ids)] = token_sent_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_markers = [
                m for _, m in graph_utils.get_entity_indexed_vector(
                    g["tokens"], edge, mode=POSITION_EMBEDDING_MODE)
            ]
            entity_matrix[index, j, :len(token_sent_ids)] = \
                entity_markers[:len(token_sent_ids)]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(
                property_kbid, property2idx[embeddings.all_zeroes])
            y_matrix[index, j] = property_kbid
    return sentences_matrix, entity_matrix, y_matrix
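This variant produces one row per (split) graph rather than one row per edge. A quick shape check, assuming the same module globals are set up as above:
# Sketch: verify the per-graph layout of the returned matrices.
sentences, entities, labels = to_indices_with_extracted_entities(graphs, word2idx)
assert entities.shape == (sentences.shape[0], MAX_EDGES_PER_GRAPH,
                          model_params['max_sent_len'])
assert labels.shape == entities.shape[:2]  # one label slot per packed edge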
Example #3
def to_indices_with_extracted_entities(graphs, word2idx, property2idx,
                                       max_sent_len, mode='train', **kwargs):
    """
    :param graphs:
    :param word2idx:
    :param property2idx:
    :param max_sent_len:
    :return:
    """
    graphs_to_process = []
    for g in graphs:
        if len(g['edgeSet']) > 0:
            if len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH:
                graphs_to_process.append(g)
            else:
                for i in range(0, len(g['edgeSet']), MAX_EDGES_PER_GRAPH):
                    graphs_to_process.append(
                        {"tokens": g["tokens"],
                         "edgeSet": g["edgeSet"][i:i + MAX_EDGES_PER_GRAPH]})
    graphs = graphs_to_process
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len), dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_matrix[index, j, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(property_kbid, property2idx[embeddings.unknown])
            y_matrix[index, j] = property_kbid
    return sentences_matrix, entity_matrix, y_matrix
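Because the splitting is inlined here, a graph with more than MAX_EDGES_PER_GRAPH edges contributes several rows that share the same tokens. A small, self-contained illustration of that chunking with a hypothetical limit of 7:
# Hypothetical: MAX_EDGES_PER_GRAPH = 7 and a graph carrying 16 edges.
edges = list(range(16))
chunks = [edges[i:i + 7] for i in range(0, len(edges), 7)]
# chunks -> [[0..6], [7..13], [14, 15]]: three rows in the output matrices,
# each zero-padded up to 7 label/marker slots.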
def to_indices(graphs, word2idx):
    """Convert graphs to index matrices, one row per non-blacklisted edge, cropped to its containing sentence."""
    max_sent_len = model_params['max_sent_len']
    num_edges = sum(1 for g in graphs for e in g['edgeSet'])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    for g in tqdm.tqdm(graphs, ascii=True):
        token_sent_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_sent_ids) > max_sent_len:
            token_sent_ids = token_sent_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] not in property_blacklist:
                left_border, right_border = graph_utils.get_sentence_boundaries(
                    g["tokens"], edge)
                entity_markers = [
                    m for _, m in graph_utils.get_entity_indexed_vector(
                        g["tokens"], edge, mode=POSITION_EMBEDDING_MODE)
                ][left_border:right_border]
                # Crop to the containing sentence without clobbering the
                # full-sentence ids needed by the remaining edges of this graph.
                edge_token_ids = token_sent_ids[left_border:right_border]
                sentences_matrix[index, :len(edge_token_ids)] = edge_token_ids
                entity_matrix[index, :len(edge_token_ids)] = \
                    entity_markers[:len(edge_token_ids)]
                _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                property_kbid = property2idx.get(
                    property_kbid, property2idx[embeddings.all_zeroes])
                y_matrix[index] = property_kbid
                index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
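For contrast with to_indices_with_relative_positions, a hedged note on the flat marker layout this variant returns (the call assumes the same module globals):
sentences, entities, labels = to_indices(graphs, word2idx)
# entities is 2-D here, (num_edges, max_sent_len): one marker channel per edge
# under POSITION_EMBEDDING_MODE, rather than the (num_edges, 2, max_sent_len)
# relative-position pairs built by to_indices_with_relative_positions.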
Example #5
def to_indices(graphs, word2idx, property2idx, max_sent_len,
               replace_entities_with_unknown=False, mode='train', **kwargs):
    """
    :param graphs:
    :param word2idx:
    :param property2idx:
    :param max_sent_len:
    :return:
    """
    num_edges = len([e for g in graphs for e in g['edgeSet'] if e['kbID'] not in property_blacklist])
    print("Dataset number of edges: {}".format(num_edges))
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] not in property_blacklist:
                if replace_entities_with_unknown:
                    # Mask entity tokens so the classifier cannot memorize their surface forms.
                    sentences_matrix[index, :len(token_ids)] = [
                        word2idx[embeddings.unknown] if i in edge["left"] + edge["right"] else t
                        for i, t in enumerate(token_ids)
                    ]
                else:
                    sentences_matrix[index, :len(token_ids)] = token_ids
                entity_matrix[index, :len(token_ids)] = \
                    [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
                if mode == "train":
                    _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                    property_kbid = property2idx.get(property_kbid, property2idx[embeddings.unknown])
                    y_matrix[index] = property_kbid
                index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
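The replace_entities_with_unknown switch masks the entity tokens themselves, preventing the classifier from memorizing entity surface forms. A minimal, self-contained sketch of that masking step with hypothetical indices:
# Hypothetical: six token ids, entities at positions 0 and 5.
token_ids = [10, 11, 12, 13, 14, 15]
entity_positions = [0] + [5]  # plays the role of edge["left"] + edge["right"]
unknown_idx = 1               # plays the role of word2idx[embeddings.unknown]
masked = [unknown_idx if i in entity_positions else t
          for i, t in enumerate(token_ids)]
# masked -> [1, 11, 12, 13, 14, 1]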