def generate_subgraph_datasets(params, splits=['train', 'valid'], saved_relation2id=None, max_label_value=None):
    """Sample positive/negative links per split and pass them to ``links2subgraphs``.

    Steps, in order:
      1. Load the graph data with ``process_files``.
      2. Dump ``relation2id`` to ``data/<dataset>/relation2id.json`` under
         ``params.main_dir`` (only when ``'test'`` is not among ``splits``).
      3. Call ``sample_neg`` once per split and store its two results under
         the ``'pos'`` and ``'neg'`` keys of that split's dict.
      4. When ``'test'`` is among ``splits``, write the test negatives out
         with ``save_to_file``.
      5. Call ``links2subgraphs`` with the adjacency list and per-split dicts.

    Args:
        params: Configuration object. Attributes read here: ``file_paths``,
            ``main_dir``, ``dataset``, ``max_links``,
            ``num_neg_samples_per_link``, ``constrained_neg_prob`` and
            (when testing) ``test_file``.
        splits: Names of the splits to process; each must be a key of the
            triplets mapping returned by ``process_files``.
        saved_relation2id: Forwarded unchanged to ``process_files``.
        max_label_value: Forwarded unchanged to ``links2subgraphs``.

    Returns:
        None. Results are produced through the called helpers.
    """
    is_test_run = 'test' in splits

    adj_list, triplets, entity2id, relation2id, id2entity, id2relation = process_files(
        params.file_paths, saved_relation2id)

    relation_map_path = os.path.join(params.main_dir, f'data/{params.dataset}/relation2id.json')
    # NOTE(review): isdir() is False for a regular file, so an existing
    # relation2id.json is overwritten; only an existing *directory* at this
    # path suppresses the write -- confirm this is intended.
    if not (os.path.isdir(relation_map_path) or is_test_run):
        with open(relation_map_path, 'w') as out_file:
            json.dump(relation2id, out_file)

    # One working dict per requested split.
    graphs = {
        name: {'triplets': triplets[name], 'max_size': params.max_links}
        for name in splits
    }

    # Sample train and valid/test links
    for name, split_data in graphs.items():
        logging.info(f"Sampling negative links for {name}")
        positives, negatives = sample_neg(
            adj_list,
            split_data['triplets'],
            params.num_neg_samples_per_link,
            max_size=split_data['max_size'],
            constrained_neg_prob=params.constrained_neg_prob,
        )
        split_data['pos'] = positives
        split_data['neg'] = negatives

    if is_test_run:
        out_dir = os.path.join(params.main_dir, 'data/{}/'.format(params.dataset))
        neg_file_name = f'neg_{params.test_file}_{params.constrained_neg_prob}.txt'
        save_to_file(out_dir, neg_file_name, graphs['test']['neg'], id2entity, id2relation)

    links2subgraphs(adj_list, graphs, params, max_label_value)
def generate_subgraph_datasets(params, splits=['train', 'valid', 'test'], saved_relation2id=None, max_label_value=None):
    """Sample positive/negative links per split and pass them to ``links2subgraphs``.

    The loader and the per-split layout depend on ``params.dataset``:

    * ``'drugbank'`` -- loaded with ``process_files_ddi``; each split dict
      holds ``'triplets'`` and ``'max_size'``.
    * anything else -- loaded with ``process_files_decagon``; each split dict
      holds ``'triplets'`` (taken from the loader's ``triplets_mr``),
      ``'max_size'`` and ``'polarity_mr'``.

    After loading, ``relation2id`` is dumped to
    ``data/<dataset>/relation2id.json`` under ``params.main_dir`` (only when
    ``'test'`` is among ``splits``), ``sample_neg`` fills the ``'pos'`` and
    ``'neg'`` keys of every split dict, the test negatives are written with
    ``save_to_file`` (again only when testing), and finally
    ``links2subgraphs`` is called.

    Args:
        params: Configuration object. Attributes read here: ``dataset``,
            ``file_paths``, ``main_dir``, ``max_links``,
            ``num_neg_samples_per_link``, ``constrained_neg_prob`` and
            (when testing) ``test_file``.
        splits: Names of the splits to process; each must be a key of the
            mappings returned by the loader.
        saved_relation2id: Forwarded unchanged to the loader.
        max_label_value: Forwarded unchanged to ``links2subgraphs``.

    Returns:
        None. Results are produced through the called helpers.
    """
    testing = 'test' in splits
    # Decide the dataset layout once so that loading and graph construction
    # cannot disagree. Previously the loader used a plain ``else`` while the
    # graph loop required the literal name 'BioSNAP', leaving ``graphs`` empty
    # (and ``graphs['test']`` a KeyError) for any other dataset name.
    is_drugbank = params.dataset == 'drugbank'

    # NOTE(review): unlike the other paths in this function, this one is not
    # joined with params.main_dir, so unless the loader does that itself it
    # resolves against the current working directory -- confirm.
    triple_file = 'data/{}/relations_2hop.txt'.format(params.dataset)

    # Underscore-prefixed names are returned by the loaders but unused here.
    triplets_mr = polarity_mr = None
    if is_drugbank:
        (adj_list, triplets, _entity2id, relation2id,
         id2entity, id2relation, _rel) = process_files_ddi(
            params.file_paths, triple_file, saved_relation2id)
    else:
        (adj_list, _triplets, _entity2id, relation2id,
         id2entity, id2relation, _rel,
         triplets_mr, polarity_mr) = process_files_decagon(
            params.file_paths, triple_file, saved_relation2id)

    data_path = os.path.join(params.main_dir, f'data/{params.dataset}/relation2id.json')
    # NOTE(review): isdir() is False for a regular file, so an existing
    # relation2id.json is overwritten; only an existing *directory* at this
    # path suppresses the write -- confirm this is intended.
    if not os.path.isdir(data_path) and testing:
        with open(data_path, 'w') as f:
            json.dump(relation2id, f)

    graphs = {}
    for split_name in splits:
        if is_drugbank:
            graphs[split_name] = {
                'triplets': triplets[split_name],
                'max_size': params.max_links,
            }
        else:
            graphs[split_name] = {
                'triplets': triplets_mr[split_name],
                'max_size': params.max_links,
                'polarity_mr': polarity_mr[split_name],
            }

    # Sample train and valid/test links
    for split_name, split in graphs.items():
        logging.info(f"Sampling negative links for {split_name}")
        split['pos'], split['neg'] = sample_neg(
            adj_list,
            split['triplets'],
            params.num_neg_samples_per_link,
            max_size=split['max_size'],
            constrained_neg_prob=params.constrained_neg_prob,
        )

    if testing:
        directory = os.path.join(params.main_dir, 'data/{}/'.format(params.dataset))
        save_to_file(
            directory,
            f'neg_{params.test_file}_{params.constrained_neg_prob}.txt',
            graphs['test']['neg'],
            id2entity,
            id2relation,
        )

    links2subgraphs(adj_list, graphs, params, max_label_value)