Exemplo n.º 1
0
class TestPartialBuild(unittest.TestCase):
    """Class to test the partialBuild class from the knowledge graph script."""

    def setUp(self):
        """Builds a temporary resource tree, copies test fixtures, writes the edge and
        subclass-map data, and instantiates a PartialBuild against the temp directories.
        """
        warnings.simplefilter('ignore', ResourceWarning)

        # initialize file location
        current_directory = os.path.dirname(__file__)
        self.dir_loc = os.path.abspath(os.path.join(current_directory, 'data'))

        # set-up environment - make temp directory tree. makedirs with
        # exist_ok=True keeps setUp from raising FileExistsError when a prior
        # run died before tearDown could remove the resources directory.
        self.dir_loc_resources = os.path.abspath(
            os.path.join(current_directory, 'resources'))
        for subdir in ('knowledge_graphs', 'relations_data', 'node_data',
                       'ontologies', 'construction_approach'):
            os.makedirs(os.path.join(self.dir_loc_resources, subdir), exist_ok=True)

        # handle logging: silence log output and clear ALL stale log files
        # (the old code globbed twice and removed only the first match)
        self.logs = os.path.abspath(current_directory + '/builds/logs')
        logging.disable(logging.CRITICAL)
        for log_file in glob.glob(self.logs + '/*.log'):
            os.remove(log_file)

        # copy needed data: (source, destination) fixture pairs
        fixture_files = [
            # node metadata
            ('/node_data/node_metadata_dict.pkl',
             '/node_data/node_metadata_dict.pkl'),
            # ontology data
            ('/ontologies/empty_hp_with_imports.owl',
             '/ontologies/hp_with_imports.owl'),
            # merged ontology data
            ('/ontologies/so_with_imports.owl',
             '/knowledge_graphs/PheKnowLator_MergedOntologies.owl'),
            # relations data
            ('/RELATIONS_LABELS.txt',
             '/relations_data/RELATIONS_LABELS.txt'),
            # inverse relations
            ('/INVERSE_RELATIONS.txt',
             '/relations_data/INVERSE_RELATIONS.txt'),
            # empty master edges
            ('/Master_Edge_List_Dict_empty.json',
             '/Master_Edge_List_Dict_empty.json'),
        ]
        for src, dst in fixture_files:
            shutil.copyfile(self.dir_loc + src, self.dir_loc_resources + dst)

        # create edge list
        edge_dict = {
            "gene-phenotype": {
                "data_type":
                "entity-class",
                "edge_relation":
                "RO_0003302",
                "uri": [
                    "http://www.ncbi.nlm.nih.gov/gene/",
                    "http://purl.obolibrary.org/obo/"
                ],
                "edge_list": [["2", "SO_0000162"], ["2", "SO_0000196"],
                              ["2", "SO_0000323"], ["9", "SO_0001490"],
                              ["9", "SO_0000301"], ["9", "SO_0001560"],
                              ["9", "SO_0001560"], ["10", "SO_0000444"],
                              ["10", "SO_0002138"], ["10", "SO_0000511"]]
            },
            "gene-gene": {
                "data_type":
                "entity-entity",
                "edge_relation":
                "RO_0002435",
                "uri": [
                    "http://www.ncbi.nlm.nih.gov/gene/",
                    "http://www.ncbi.nlm.nih.gov/gene/"
                ],
                "edge_list": [["1", "2"], ["2", "3"], ["3", "18"],
                              ["17", "19"], ["4", "17"], ["5", "11"],
                              ["11", "12"], ["4", "5"]]
            },
            "disease-disease": {
                "data_type":
                "class-class",
                "edge_relation":
                "RO_0002435",
                "uri": [
                    "http://www.ncbi.nlm.nih.gov/gene/",
                    "http://www.ncbi.nlm.nih.gov/gene/"
                ],
                "edge_list": [["DOID_3075", "DOID_1080"],
                              ["DOID_3075", "DOID_4267"],
                              ["DOID_4800", "DOID_10190"],
                              ["DOID_4800", "DOID_80219"],
                              ["DOID_2729", "DOID_1962"],
                              ["DOID_2729", "DOID_5096"],
                              ["DOID_8837", "DOID_6774"],
                              ["DOID_8837", "DOID_8754"]]
            },
            "entity_namespaces": {
                "gene": "http://purl.uniprot.org/geneid/"
            }
        }

        # save edge data
        with open(self.dir_loc_resources + '/Master_Edge_List_Dict.json',
                  'w') as filepath:
            json.dump(edge_dict, filepath)

        # create subclass mapping data: every mapped entity maps to SO_0001217
        subcls_map = {
            key: ['SO_0001217']
            for key in ("1", "2", "3", "4", "5", "11", "12", "17", "18",
                        "5096", "6774", "19")
        }

        # save subclass map
        with open(
                self.dir_loc_resources +
                '/construction_approach/subclass_construction_map.pkl',
                'wb') as f:
            pickle.dump(subcls_map, f, protocol=4)

        # set write location
        self.write_location = self.dir_loc_resources + '/knowledge_graphs'

        # instantiate class
        self.kg = PartialBuild('subclass', 'yes', 'yes', 'yes', 1,
                               self.write_location)

        # update class attributes so the build can find the OWLTools binary
        dir_loc_owltools = os.path.join(current_directory, 'utils/owltools')
        self.kg.owl_tools = os.path.abspath(dir_loc_owltools)

        return None

    def test_class_initialization(self):
        """Tests initialization of the class."""

        # check build type
        self.assertEqual(self.kg.gets_build_type(), 'Partial Build')
        self.assertFalse(self.kg.gets_build_type() == 'Full Build')
        self.assertFalse(self.kg.gets_build_type() == 'Post-Closure Build')

        return None

    def test_construct_knowledge_graph(self):
        """Tests the construct_knowledge_graph method."""

        # test out the build
        self.kg.construct_knowledge_graph()
        full_kg_owl = '_'.join(self.kg.full_kg.split('_')[0:-1]) + '_OWL.owl'

        # check for each expected output file in the knowledge_graphs directory
        for suffix in ('_LogicOnly.nt', '_AnnotationsOnly.nt', '.nt'):
            f_name = full_kg_owl[:-4] + suffix
            self.assertTrue(
                os.path.exists(self.dir_loc_resources + '/knowledge_graphs/' +
                               f_name))

        return None

    def tearDown(self):
        """Restores warning filters and removes the temporary resource tree."""
        warnings.simplefilter('default', ResourceWarning)

        # remove resource directory
        shutil.rmtree(self.dir_loc_resources)

        return None
Exemplo n.º 2
0
def main():
    """Drives a full PheKnowLator run from the command line.

    Steps: parse CLI arguments, download ontology and edge data sources, build
    the master edge list in parallel, then construct the requested knowledge
    graph build ('partial', 'post-closure', or 'full' for anything else).
    """
    parser = argparse.ArgumentParser(description=('PheKnowLator: This program builds a biomedical knowledge graph using'
                                                  ' Open Biomedical Ontologies and linked open data. The program takes '
                                                  'the following arguments:'))
    # type=int: without it argparse delivers a str, which was silently passed
    # downstream as the worker count whenever the user supplied --cpus
    parser.add_argument('-p', '--cpus', type=int, help='# workers to use; defaults to use all available cores', default=None)
    parser.add_argument('-g', '--onts', help='name/path to text file containing ontologies', required=True)
    parser.add_argument('-e', '--edg', help='name/path to text file containing edge sources', required=True)
    parser.add_argument('-a', '--app', help='construction approach to use (i.e. instance or subclass)', required=True)
    parser.add_argument('-t', '--res', help='name/path to text file containing resource_info', required=True)
    parser.add_argument('-b', '--kg', help='build type: "partial", "full", or "post-closure"', required=True)
    parser.add_argument('-r', '--rel', help='yes/no - adding inverse relations to knowledge graph', required=True)
    parser.add_argument('-s', '--owl', help='yes/no - removing OWL Semantics from knowledge graph', required=True)
    parser.add_argument('-m', '--nde', help='yes/no - adding node metadata to knowledge graph', required=True)
    parser.add_argument('-o', '--out', help='name/path to directory where to write knowledge graph', required=True)
    args = parser.parse_args()

    ######################
    #### READ IN DATA ####
    ######################

    # STEP 1: CREATE INPUT DOCUMENTS
    # see https://github.com/callahantiff/PheKnowLator/wiki/Dependencies page for how to prepare input data files

    # STEP 2: DOWNLOAD AND PREPROCESS DATA
    # see the 'Data_Preparation.ipynb' and 'Ontology_Cleaning.ipynb' file for examples and guidelines

    # STEP 3: DOWNLOAD ONTOLOGIES
    print('\n' + '=' * 40 + '\nPKT: DOWNLOADING DATA: ONTOLOGY DATA\n' + '=' * 40 + '\n')
    start = time.time()
    ont = OntData(data_path=args.onts, resource_data=args.res)
    # ont = OntData(data_path='resources/ontology_source_list.txt', resource_data='resources/resource_info.txt')
    ont.downloads_data_from_url()
    end = time.time()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print('\nPKT: TOTAL SECONDS TO DOWNLOAD ONTOLOGIES: {} @ {}'.format(end - start, timestamp))

    # STEP 4: DOWNLOAD EDGE DATA SOURCES
    print('\n' + '=' * 37 + '\nPKT: DOWNLOADING DATA: CLASS DATA\n' + '=' * 37 + '\n')
    start = time.time()
    ent = LinkedData(data_path=args.edg, resource_data=args.res)
    # ent = LinkedData(data_path='resources/edge_source_list.txt', resource_data='resources/resource_info.txt')
    ent.downloads_data_from_url()
    end = time.time()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print('\nPKT: TOTAL SECONDS TO DOWNLOAD NON-ONTOLOGY DATA: {} @ {}'.format(end - start, timestamp))

    #####################
    # CREATE EDGE LISTS #
    #####################

    # set-up environment: default to all logical cores when --cpus is omitted
    cpus = psutil.cpu_count(logical=True) if args.cpus is None else args.cpus
    ray.init(ignore_reinit_error=True)

    print('\n' + '=' * 28 + '\nPKT: CONSTRUCT EDGE LISTS\n' + '=' * 28 + '\n')
    start = time.time()
    combined_edges = dict(ent.data_files, **ont.data_files)
    # master_edges = CreatesEdgeList(data_files=combined_edges, source_file='resources/resource_info.txt')
    master_edges = CreatesEdgeList(data_files=combined_edges, source_file=args.res)
    master_edges.runs_creates_knowledge_graph_edges(source_file=args.res, data_files=combined_edges, cpus=cpus)
    end = time.time()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print('\nPKT: TOTAL SECONDS TO BUILD THE MASTER EDGE LIST: {} @ {}'.format(end - start, timestamp))

    del ont, ent, master_edges  # clean up environment before build knowledge graph

    #########################
    # BUILD KNOWLEDGE GRAPH #
    #########################

    print('\n' + '=' * 33 + '\nPKT: BUILDING KNOWLEDGE GRAPH\n' + '=' * 33 + '\n')
    start = time.time()

    # any build type other than 'partial' or 'post-closure' falls through to a full build
    if args.kg == 'partial':
        kg = PartialBuild(construction=args.app,
                          node_data=args.nde,
                          inverse_relations=args.rel,
                          decode_owl=args.owl,
                          cpus=cpus,
                          write_location=args.out)
    elif args.kg == 'post-closure':
        kg = PostClosureBuild(construction=args.app,
                              node_data=args.nde,
                              inverse_relations=args.rel,
                              decode_owl=args.owl,
                              cpus=cpus,
                              write_location=args.out)
    else:
        kg = FullBuild(construction=args.app,
                       node_data=args.nde,
                       inverse_relations=args.rel,
                       decode_owl=args.owl,
                       cpus=cpus,
                       write_location=args.out)
    kg.construct_knowledge_graph()

    # ray.shutdown()  # uncomment if running this independently of the CI/CD builds
    end = time.time()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print('\nPKT: TOTAL SECONDS TO CONSTRUCT A KG: {} @ {}'.format(end - start, timestamp))