def load_fb15k_data(dataset, sim_relations, keep_fraction=100,
                    data_dir="data/FB15k-237/"):
    """Read the FB15k-237 train/valid/test splits and build batches for them.

    Args:
        dataset: Passed unchanged to the ``FB15kReader`` constructor.
        sim_relations: Not used by this loader; no similarity edges are added.
        keep_fraction: Forwarded to ``read_network`` for the train split only.
        data_dir: Directory handed to ``read_network`` for every split.
            Defaults to the previously hard-coded ``"data/FB15k-237/"``.

    Returns:
        An 8-tuple ``(num_nodes, train_data, valid_data, test_data,
        num_relations, valid_labels, test_labels, train_network)``, where the
        two counts are ``len(train_network.graph.nodes)`` and
        ``len(train_network.graph.relations)``.
    """
    train_network = FB15kReader(dataset)
    dev_network = FB15kReader(dataset)
    test_network = FB15kReader(dataset)

    train_network.read_network(data_dir=data_dir,
                               keep_fraction=keep_fraction,
                               split="train")
    train_network.print_summary()

    # Diagnostic only: mean node degree of the training graph.  Guarded so an
    # empty graph reports 0.0 instead of aborting with ZeroDivisionError.
    node_degrees = [
        node.get_degree() for node in train_network.graph.iter_nodes()
    ]
    avg_degree = sum(node_degrees) / len(node_degrees) if node_degrees else 0.0
    print("Average Degree: ", avg_degree)

    # The evaluation splits receive the training network as an argument.
    dev_network.read_network(data_dir=data_dir,
                             split="valid",
                             train_network=train_network)
    test_network.read_network(data_dir=data_dir,
                              split="test",
                              train_network=train_network)

    word_vocab = train_network.graph.node2id

    # Labels produced for the training split are discarded.
    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network,
                                                   train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(
        word_vocab, test_network, train_network)
    valid_data, valid_labels = reader_utils.prepare_batch_dgl(
        word_vocab, dev_network, train_network)

    num_nodes = len(train_network.graph.nodes)
    num_relations = len(train_network.graph.relations)
    return (num_nodes, train_data, valid_data, test_data, num_relations,
            valid_labels, test_labels, train_network)
def load_cn_data(dataset, sim_relations, eval_accuracy=False, data_dir="data/",
                 test_split="valid2"):
    """Read the ConceptNet TSV splits and build batches for them.

    Args:
        dataset: Passed unchanged to the ``ConceptNetTSVReader`` constructor.
        sim_relations: When truthy, ``add_sim_edges_bert()`` is called on the
            training network before batches are built.
        eval_accuracy: The evaluation splits are read with
            ``positive_only=not eval_accuracy``.
        data_dir: Directory handed to ``read_network`` for every split.
            Defaults to the previously hard-coded ``"data/"``.
        test_split: Split name read into the test network.  Defaults to
            ``"valid2"``, which is what this function has always loaded.

    Returns:
        An 8-tuple ``(num_nodes, train_data, valid1_data, test_data,
        num_relations, valid1_labels, test_labels, train_network)``, where the
        two counts are ``len(train_network.graph.nodes)`` and
        ``len(train_network.graph.relations)``.  Nothing derived from the
        ``"valid2"`` dev network is returned.
    """
    train_network = ConceptNetTSVReader(dataset)
    dev1_network = ConceptNetTSVReader(dataset)
    dev2_network = ConceptNetTSVReader(dataset)
    test_network = ConceptNetTSVReader(dataset)

    positive_only = not eval_accuracy

    train_network.read_network(data_dir=data_dir, split="train")
    train_network.print_summary()

    dev1_network.read_network(data_dir=data_dir,
                              split="valid1",
                              train_network=train_network,
                              positive_only=positive_only)
    dev2_network.read_network(data_dir=data_dir,
                              split="valid2",
                              train_network=train_network,
                              positive_only=positive_only)
    # NOTE(review): with the default, the "test" network loads the same split
    # as dev2_network.  Confirm whether that is intended; callers can now pass
    # test_split explicitly instead of editing this function.
    test_network.read_network(data_dir=data_dir,
                              split=test_split,
                              train_network=train_network,
                              positive_only=positive_only)

    # Add sim nodes
    if sim_relations:
        print("Adding sim edges..")
        train_network.add_sim_edges_bert()

    word_vocab = train_network.graph.node2id

    # Labels produced for the training split are discarded.
    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network,
                                                   train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(
        word_vocab, test_network, train_network)
    valid1_data, valid1_labels = reader_utils.prepare_batch_dgl(
        word_vocab, dev1_network, train_network)
    # The valid2 batch is not returned.  The call is kept because any side
    # effects of prepare_batch_dgl are not visible from this function.
    reader_utils.prepare_batch_dgl(word_vocab, dev2_network, train_network)

    num_nodes = len(train_network.graph.nodes)
    num_relations = len(train_network.graph.relations)
    return (num_nodes, train_data, valid1_data, test_data, num_relations,
            valid1_labels, test_labels, train_network)
def load_cn_full_data(dataset, sim_relations,
                      data_dir="/net/efs.mosaic/chaitanyam/ConvE/data/"):
    """Read the full ConceptNet train/valid/test splits and build batches.

    Args:
        dataset: Passed unchanged to the ``ConceptNetFullReader`` constructor.
        sim_relations: When truthy, ``add_sim_edges_bert()`` is called on the
            training network before batches are built.
        data_dir: Directory handed to ``read_network`` for every split.  The
            default is the absolute path that used to be hard-coded here, so
            existing callers are unaffected; pass a local path on other
            machines.

    Returns:
        An 8-tuple ``(num_nodes, train_data, valid_data, test_data,
        num_relations, valid_labels, test_labels, train_network)``, where the
        two counts are ``len(train_network.graph.nodes)`` and
        ``len(train_network.graph.relations)``.
    """
    train_network = ConceptNetFullReader(dataset)
    dev_network = ConceptNetFullReader(dataset)
    test_network = ConceptNetFullReader(dataset)

    train_network.read_network(data_dir=data_dir, split="train")
    train_network.print_summary()

    # Diagnostic only: mean node degree of the training graph.  Guarded so an
    # empty graph reports 0.0 instead of aborting with ZeroDivisionError.
    node_degrees = [
        node.get_degree() for node in train_network.graph.iter_nodes()
    ]
    avg_degree = sum(node_degrees) / len(node_degrees) if node_degrees else 0.0
    print("Average Degree: ", avg_degree)

    # The evaluation splits receive the training network as an argument.
    dev_network.read_network(data_dir=data_dir,
                             split="valid",
                             train_network=train_network)
    test_network.read_network(data_dir=data_dir,
                              split="test",
                              train_network=train_network)

    if sim_relations:
        print("Adding sim edges..")
        train_network.add_sim_edges_bert()

    word_vocab = train_network.graph.node2id

    # Labels produced for the training split are discarded.
    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network,
                                                   train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(
        word_vocab, test_network, train_network)
    valid_data, valid_labels = reader_utils.prepare_batch_dgl(
        word_vocab, dev_network, train_network)

    num_nodes = len(train_network.graph.nodes)
    num_relations = len(train_network.graph.relations)
    return (num_nodes, train_data, valid_data, test_data, num_relations,
            valid_labels, test_labels, train_network)
def load_atomic_data(dataset, sim_relations, data_dir="data/atomic/"):
    """Read the ATOMIC TSV train/valid/test splits and build batches.

    Args:
        dataset: Passed unchanged to the ``AtomicTSVReader`` constructor.
        sim_relations: When truthy, ``add_sim_edges_bert()`` is called on the
            training network before batches are built.
        data_dir: Directory handed to ``read_network`` for every split.
            Defaults to the previously hard-coded ``"data/atomic/"``; pass a
            different directory (e.g. ``"data/atomic-original/"``) to read
            another copy of the data without editing this function.

    Returns:
        An 8-tuple ``(num_nodes, train_data, valid_data, test_data,
        num_relations, valid_labels, test_labels, train_network)``, where the
        two counts are ``len(train_network.graph.nodes)`` and
        ``len(train_network.graph.relations)``.
    """
    train_network = AtomicTSVReader(dataset)
    dev_network = AtomicTSVReader(dataset)
    test_network = AtomicTSVReader(dataset)

    train_network.read_network(data_dir=data_dir, split="train")
    train_network.print_summary()

    # Diagnostic only: mean node degree of the training graph.  Guarded so an
    # empty graph reports 0.0 instead of aborting with ZeroDivisionError.
    node_degrees = [
        node.get_degree() for node in train_network.graph.iter_nodes()
    ]
    avg_degree = sum(node_degrees) / len(node_degrees) if node_degrees else 0.0
    print("Average Degree: ", avg_degree)

    # The evaluation splits receive the training network as an argument.
    dev_network.read_network(data_dir=data_dir,
                             split="valid",
                             train_network=train_network)
    test_network.read_network(data_dir=data_dir,
                              split="test",
                              train_network=train_network)

    # Taken before the optional sim edges are added, as in the original code.
    word_vocab = train_network.graph.node2id

    # Add sim nodes
    if sim_relations:
        print("Adding sim edges..")
        train_network.add_sim_edges_bert()

    # Labels produced for the training split are discarded.
    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network,
                                                   train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(
        word_vocab, test_network, train_network)
    valid_data, valid_labels = reader_utils.prepare_batch_dgl(
        word_vocab, dev_network, train_network)

    num_nodes = len(train_network.graph.nodes)
    num_relations = len(train_network.graph.relations)
    return (num_nodes, train_data, valid_data, test_data, num_relations,
            valid_labels, test_labels, train_network)
def load_data(dataset, reader_cls, data_dir, sim_relations):
    """Read train/valid/test splits with ``reader_cls`` and build batches.

    Args:
        dataset: Passed unchanged to each ``reader_cls`` constructor call.
        reader_cls: Callable invoked as ``reader_cls(dataset)`` three times to
            create the train, dev and test networks.
        data_dir: Directory handed to ``read_network`` for every split.
        sim_relations: When truthy, ``add_sim_edges_bert()`` is called on the
            training network before batches are built.

    Returns:
        A 6-tuple ``(train_data, valid_data, test_data, valid_labels,
        test_labels, train_network)``.  Unlike the dataset-specific loaders in
        this file, no node or relation counts are included.
    """
    train_network = reader_cls(dataset)
    dev_network = reader_cls(dataset)
    test_network = reader_cls(dataset)

    train_network.read_network(data_dir=data_dir, split="train")
    train_network.print_summary()

    # Diagnostic only: mean node degree of the training graph.  Guarded so an
    # empty graph reports 0.0 instead of aborting with ZeroDivisionError.
    node_degrees = [
        node.get_degree() for node in train_network.graph.iter_nodes()
    ]
    avg_degree = sum(node_degrees) / len(node_degrees) if node_degrees else 0.0
    print("Average Degree: ", avg_degree)

    # The evaluation splits receive the training network as an argument.
    dev_network.read_network(data_dir=data_dir,
                             split="valid",
                             train_network=train_network)
    test_network.read_network(data_dir=data_dir,
                              split="test",
                              train_network=train_network)

    # Taken before the optional sim edges are added, as in the original code.
    word_vocab = train_network.graph.node2id

    # Add sim nodes
    if sim_relations:
        print("Adding sim edges..")
        train_network.add_sim_edges_bert()

    # Labels produced for the training split are discarded.
    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network,
                                                   train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(
        word_vocab, test_network, train_network)
    valid_data, valid_labels = reader_utils.prepare_batch_dgl(
        word_vocab, dev_network, train_network)

    return (train_data, valid_data, test_data, valid_labels, test_labels,
            train_network)