import numpy as np
import tqdm

# The following module-level names are assumed to be defined elsewhere in this
# package: embeddings, graph_utils, model_params, property2idx, position2idx,
# property_blacklist, split_graphs, MAX_EDGES_PER_GRAPH, POSITION_EMBEDDING_MODE.


def to_indices_with_relative_positions(graphs, word2idx):
    max_sent_len = model_params['max_sent_len']
    num_edges = len([e for g in graphs for e in g['edgeSet']])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx[property_kbid]
            entity_vector = graph_utils.get_entity_indexed_vector(token_ids, edge, mode="position")
            # Clamp the relative offsets of both entities to
            # [-max_entity_index, max_entity_index] before the position2idx lookup.
            entity_vector = [(max(-max_entity_index, min(m1, max_entity_index)),
                              max(-max_entity_index, min(m2, max_entity_index)))
                             for _, m1, m2 in entity_vector]
            entity_matrix[index, :, :len(token_ids)] = \
                [[position2idx[m] for m, _ in entity_vector],
                 [position2idx[m] for _, m in entity_vector]]
            y_matrix[index] = property_kbid
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
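
# --- Added illustration (not part of the original module) ---
# A minimal, self-contained sketch of the clipping step above. The toy
# position2idx mapping here is hypothetical; the real one is built elsewhere
# in the package.
def _demo_position_clipping():
    max_entity_index = 4  # stands in for max_sent_len - 1
    position2idx = {p: i for i, p in enumerate(range(-max_entity_index, max_entity_index + 1))}
    raw_offsets = [-7, -2, 0, 3, 9]  # token offsets relative to an entity
    clipped = [max(-max_entity_index, min(m, max_entity_index)) for m in raw_offsets]
    print(clipped)                             # [-4, -2, 0, 3, 4]
    print([position2idx[m] for m in clipped])  # [0, 2, 4, 7, 8]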

def to_indices_with_extracted_entities(graphs, word2idx):
    max_sent_len = model_params['max_sent_len']
    graphs = split_graphs(graphs)
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len), dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_sent_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_sent_ids) > max_sent_len:
            token_sent_ids = token_sent_ids[:max_sent_len]
        sentences_matrix[index, :len(token_sent_ids)] = token_sent_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_markers = [m for _, m in graph_utils.get_entity_indexed_vector(
                g["tokens"], edge, mode=POSITION_EMBEDDING_MODE)]
            entity_matrix[index, j, :len(token_sent_ids)] = entity_markers[:len(token_sent_ids)]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(property_kbid, property2idx[embeddings.all_zeroes])
            y_matrix[index, j] = property_kbid
    return sentences_matrix, entity_matrix, y_matrix
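
# --- Added illustration (not part of the original module) ---
# Shape sketch for the three matrices above, with hypothetical sizes
# (3 graphs, MAX_EDGES_PER_GRAPH = 5, max_sent_len = 10): one sentence row
# per graph, one marker row and one label per edge slot.
def _demo_extracted_entities_shapes():
    sentences = np.zeros((3, 10), dtype="int32")
    entities = np.zeros((3, 5, 10), dtype="int8")
    labels = np.zeros((3, 5), dtype="int16")
    print(sentences.shape, entities.shape, labels.shape)  # (3, 10) (3, 5, 10) (3, 5)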

def to_indices(graphs, word2idx):
    max_sent_len = model_params['max_sent_len']
    # Count only edges that survive the blacklist filter below, so the
    # matrices end up without trailing all-zero rows.
    num_edges = sum(1 for g in graphs for e in g['edgeSet']
                    if e['kbID'] not in property_blacklist)
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    for g in tqdm.tqdm(graphs, ascii=True):
        token_sent_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_sent_ids) > max_sent_len:
            token_sent_ids = token_sent_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] not in property_blacklist:
                left_border, right_border = graph_utils.get_sentence_boundaries(g["tokens"], edge)
                entity_markers = [m for _, m in graph_utils.get_entity_indexed_vector(
                    g["tokens"], edge, mode=POSITION_EMBEDDING_MODE)][left_border:right_border]
                # Slice into a fresh variable: rebinding token_sent_ids here
                # would shrink the sentence for every subsequent edge of the
                # same graph.
                edge_token_ids = token_sent_ids[left_border:right_border]
                sentences_matrix[index, :len(edge_token_ids)] = edge_token_ids
                entity_matrix[index, :len(edge_token_ids)] = entity_markers[:len(edge_token_ids)]
                _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                property_kbid = property2idx.get(property_kbid, property2idx[embeddings.all_zeroes])
                y_matrix[index] = property_kbid
                index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
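
# --- Added illustration (not part of the original module) ---
# Why the slice in to_indices above must go into a fresh variable: rebinding
# token_sent_ids would compound the slicing across edges of the same graph.
# The boundary values here are hypothetical.
def _demo_boundary_slicing_bug():
    tokens = [10, 20, 30, 40, 50, 60]
    rebound = tokens
    for left, right in [(1, 5), (0, 3)]:  # boundaries for two edges
        rebound = rebound[left:right]     # second edge slices an already-sliced list
    print(rebound)      # [20, 30, 40] -- wrong for the second edge
    print(tokens[0:3])  # [10, 20, 30] -- what the second edge should see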

def to_indices_with_extracted_entities(graphs, word2idx, property2idx, max_sent_len,
                                       mode='train', **kwargs):
    """
    :param graphs: list of sentence graphs, each with a "tokens" list and an "edgeSet"
    :param word2idx: mapping from tokens to word-embedding indices
    :param property2idx: mapping from property KB ids to relation-label indices
    :param max_sent_len: maximum sentence length; longer sentences are truncated
    :return: a sentence matrix, an entity-marker matrix and a label matrix
    """
    # Drop graphs without edges and split graphs with more than
    # MAX_EDGES_PER_GRAPH edges into chunks that share the same token list.
    graphs_to_process = []
    for g in graphs:
        if len(g['edgeSet']) > 0:
            if len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH:
                graphs_to_process.append(g)
            else:
                for i in range(0, len(g['edgeSet']), MAX_EDGES_PER_GRAPH):
                    graphs_to_process.append({"tokens": g["tokens"],
                                              "edgeSet": g["edgeSet"][i:i + MAX_EDGES_PER_GRAPH]})
    graphs = graphs_to_process
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len), dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_matrix[index, j, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(property_kbid, property2idx[embeddings.unknown])
            y_matrix[index, j] = property_kbid
    return sentences_matrix, entity_matrix, y_matrix
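
# --- Added illustration (not part of the original module) ---
# Toy run of the chunking logic above with MAX_EDGES_PER_GRAPH = 2 and a
# hypothetical three-edge graph: the edge set is split into chunks that all
# share the same token list.
def _demo_edge_chunking():
    max_edges = 2
    g = {"tokens": ["Berlin", "is", "in", "Germany"],
         "edgeSet": [{"kbID": "P131"}, {"kbID": "P17"}, {"kbID": "P31"}]}
    chunks = [{"tokens": g["tokens"], "edgeSet": g["edgeSet"][i:i + max_edges]}
              for i in range(0, len(g["edgeSet"]), max_edges)]
    print([len(c["edgeSet"]) for c in chunks])  # [2, 1]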

def to_indices(graphs, word2idx, property2idx, max_sent_len,
               replace_entities_with_unknown=False, mode='train', **kwargs):
    """
    :param graphs: list of sentence graphs, each with a "tokens" list and an "edgeSet"
    :param word2idx: mapping from tokens to word-embedding indices
    :param property2idx: mapping from property KB ids to relation-label indices
    :param max_sent_len: maximum sentence length; longer sentences are truncated
    :param replace_entities_with_unknown: if True, mask entity tokens with the unknown-word index
    :param mode: labels (y_matrix) are filled only in "train" mode
    :return: a sentence matrix, an entity-marker matrix and a label matrix
    """
    num_edges = len([e for g in graphs for e in g['edgeSet']
                     if e['kbID'] not in property_blacklist])
    print("Dataset number of edges: {}".format(num_edges))
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embeddings.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] not in property_blacklist:
                # Optionally replace the entity tokens themselves with the
                # unknown-word index.
                sentences_matrix[index, :len(token_ids)] = \
                    [word2idx[embeddings.unknown] if i in edge["left"] + edge["right"] else t
                     for i, t in enumerate(token_ids)] \
                    if replace_entities_with_unknown else token_ids
                entity_matrix[index, :len(token_ids)] = \
                    [m for _, m in graph_utils.get_entity_indexed_vector(token_ids, edge, mode="mark-bi")]
                if mode == "train":
                    _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                    property_kbid = property2idx.get(property_kbid, property2idx[embeddings.unknown])
                    y_matrix[index] = property_kbid
                index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
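
# --- Added illustration (not part of the original module) ---
# Toy run of the entity-masking branch above; the unknown-word index and the
# edge spans are hypothetical stand-ins for word2idx[embeddings.unknown] and
# the real edge["left"]/edge["right"] token positions.
def _demo_entity_masking():
    unknown_idx = 1
    token_ids = [10, 20, 30, 40, 50]
    edge = {"left": [0], "right": [3, 4]}
    masked = [unknown_idx if i in edge["left"] + edge["right"] else t
              for i, t in enumerate(token_ids)]
    print(masked)  # [1, 20, 30, 1, 1]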

def to_indices_with_extracted_entities(graphs, word2idx):
    """
    :param graphs: list of sentence graphs with "Tokens", "mentionArg1",
                   "mentionArg2" and "relationType" fields
    :param word2idx: mapping from tokens to word-embedding indices
    :return: sentences_matrix[sentence_index, token_position] = word2idx id
             arg1_matrix[sentence_index, token_position] = marker
                 (0 = padding, 1 = sentence token, 2 = argument-1 span)
             arg2_matrix[sentence_index, token_position] = marker
                 (0 = padding, 1 = sentence token, 2 = argument-2 span)
             y_matrix[sentence_index] = relation-type id
    """
    max_sent_len = model_params['max_sent_len']  # e.g. 200
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    arg1_matrix = np.zeros((len(graphs), max_sent_len), dtype="int8")
    arg2_matrix = np.zeros((len(graphs), max_sent_len), dtype="int8")
    y_matrix = np.zeros((len(graphs), 1), dtype="int16")  # relation-type ids, e.g. 1-7
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_wordvec_ids = embeddings.get_idx_sequence(g["Tokens"], word2idx)
        # Truncate over-long sentences; without this the assignments below
        # fail for any sentence longer than max_sent_len.
        if len(token_wordvec_ids) > max_sent_len:
            token_wordvec_ids = token_wordvec_ids[:max_sent_len]
        sentences_matrix[index, :len(token_wordvec_ids)] = token_wordvec_ids
        # Mark all sentence tokens with 1, then overwrite the argument spans
        # with 2 (slice indexing also clips spans that run past max_sent_len).
        arg1_matrix[index, :len(token_wordvec_ids)] = 1
        arg2_matrix[index, :len(token_wordvec_ids)] = 1
        arg1_matrix[index, g["mentionArg1"]["start"]:g["mentionArg1"]["end"] + 1] = 2
        arg2_matrix[index, g["mentionArg2"]["start"]:g["mentionArg2"]["end"] + 1] = 2
        y_matrix[index] = property2idx.get(g["relationType"])
    return sentences_matrix, arg1_matrix, arg2_matrix, y_matrix
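
# --- Added illustration (not part of the original module) ---
# Toy marker rows produced by the scheme above for a 6-token sentence padded
# to length 8, with a hypothetical arg1 span at tokens 1-2 and arg2 at token 4.
def _demo_argument_markers():
    row = np.zeros(8, dtype="int8")
    row[:6] = 1        # 1 = real token, 0 = padding
    arg1 = row.copy()
    arg1[1:3] = 2      # 2 = argument-1 span
    arg2 = row.copy()
    arg2[4:5] = 2      # 2 = argument-2 span
    print(arg1)  # [1 2 2 1 1 1 0 0]
    print(arg2)  # [1 1 1 1 2 1 0 0]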