class DatasetTestCase(unittest.TestCase):
    """
    For testing metadata emitted by Dataset class

    Dataset creates a graph describing the metadata associated with the dataset
    in question, which should follow the HCLS specification for dataset
    descriptions:
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/
    """

    @classmethod
    def setUpClass(cls):
        cls.curie_map = curiemap.get()

        # parameters passed to code, to be returned in graph
        cls.monarch_archive_curie_prefix = "MonarchArchive"
        cls.identifier = "fakeingest"
        cls.ingest_description = "some ingest description"
        cls.ingest_url = "http://fakeingest.com"
        cls.ingest_title = "this ingest title"
        cls.ingest_logo_url = "logo.png"
        cls.license_url = "https://choosealicense.com/licenses/mit/"
        cls.license_url_default = "https://project-open-data.cio.gov/unknown-license/"
        cls.data_rights = "https://www.gnu.org/licenses/gpl-3.0.html"
        cls.distribution_type = "ttl"

        # parse test graph once, to test triples counts/statistics below
        cls.test_ttl = "tests/resources/fakeingest/test_graph_simple.ttl"
        cls.test_graph = RDFGraph()
        cls.test_graph.parse(cls.test_ttl, format="turtle")

        # expected things:
        cls.expected_curie_prefix = "MonarchArchive"
        cls.timestamp_date = datetime.today().strftime("%Y%m%d")

        cls.base_cito = 'http://purl.org/spar/cito/'
        cls.base_dcat = 'http://www.w3.org/ns/dcat#'
        cls.base_dcterms = 'http://purl.org/dc/terms/'
        cls.base_dctypes = 'http://purl.org/dc/dcmitype/'
        cls.base_pav = 'http://purl.org/pav/'
        cls.base_rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        cls.base_rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
        cls.base_schema = 'http://schema.org/'
        cls.base_void = 'http://rdfs.org/ns/void#'
        cls.base_owl = 'http://www.w3.org/2002/07/owl#'
        cls.base_logo_url = \
            "https://github.com/monarch-initiative/monarch-ui/blob/master/public/img/sources/"

        # expected summary level IRI
        cls.summary_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) + "#" + cls.identifier)

        # expected version level IRI
        cls.data_release_version = "19700101"
        cls.version_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix)
            + cls.data_release_version + "/" + "#" + cls.identifier)
        cls.version_level_IRI_default_version = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix)
            + cls.timestamp_date + "/" + "#" + cls.identifier)

        # expected distribution level IRI (for ttl resource)
        cls.distribution_level_IRI_ttl = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix)
            + cls.data_release_version + "/rdf/"
            + cls.identifier + "." + cls.distribution_type)
        cls.distribution_level_IRI_ttl_default_version = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix)
            + cls.timestamp_date + "/rdf/"
            + cls.identifier + "." + cls.distribution_type)

        # set expected IRIs for predicates and other things
        cls.iri_rdf_type = URIRef(cls.base_rdf + "type")
        cls.iri_title = URIRef(cls.base_dcterms + "title")
        cls.iri_dataset = URIRef(cls.base_dctypes + "Dataset")
        cls.iri_description = URIRef(cls.base_dcterms + "description")
        cls.iri_publisher = URIRef(cls.base_dcterms + "Publisher")
        cls.iri_source = URIRef(cls.base_dcterms + "source")
        cls.iri_logo = URIRef(cls.base_schema + "logo")
        cls.iri_mi_org = URIRef("https://monarchinitiative.org/")
        cls.iri_created = URIRef(cls.base_dcterms + "created")
        cls.iri_version = URIRef(cls.base_pav + "version")
        cls.iri_retrieved_on = URIRef(cls.base_pav + "retrievedOn")
        cls.iri_creator = URIRef(cls.base_dcterms + "creator")
        cls.iri_is_version_of = URIRef(cls.base_dcterms + "isVersionOf")
        cls.iri_distribution = URIRef(cls.base_dcat + "Distribution")
        cls.iri_created_with = URIRef(cls.base_pav + "createdWith")
        cls.iri_format = URIRef(cls.base_dcterms + "format")
        cls.iri_download_url = URIRef(cls.base_dcterms + "downloadURL")
        cls.iri_license = URIRef(cls.base_dcterms + "license")
        cls.iri_data_rights = URIRef(cls.base_dcterms + "rights")
        cls.iri_cites_as_authority = URIRef(cls.base_cito + "citesAsAuthority")
        cls.iri_rdfs_label = URIRef(cls.base_rdfs + "label")
        cls.iri_owl_ontology = URIRef(cls.base_owl + "Ontology")
        cls.iri_owl_version_iri = URIRef(cls.base_owl + "versionIRI")
        cls.iri_owl_version_info = URIRef(cls.base_owl + "versionInfo")
        cls.iri_returned_logo = URIRef(cls.base_logo_url + cls.ingest_logo_url)
        cls.iri_expected_download_url_value = URIRef(
            cls.curie_map.get("MonarchArchive")
            + cls.data_release_version + "/rdf/"
            + cls.identifier + "." + cls.distribution_type)
        cls.iri_dipper = URIRef("https://github.com/monarch-initiative/dipper")
        cls.iri_ttl_spec = URIRef("https://www.w3.org/TR/turtle/")

    @classmethod
    def tearDownClass(cls):
        pass

    def setUp(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=self.data_release_version,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=self.license_url,
            data_rights=self.data_rights)

        # put all triples in a list for debugging below
        self.all_triples = list(self.dataset.graph.triples((None, None, None)))

    def tearDown(self):
        pass

    def test_dataset_has_graph(self):
        self.assertIsInstance(
            self.dataset.graph, Graph, "dataset doesn't contain an RDF graph")

    def test_get_graph(self):
        self.assertIsInstance(
            self.dataset.get_graph(), RDFGraph,
            "get_graph() didn't return an RDF graph")

    def test_get_license(self):
        gpl2_iri = "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"
        self.dataset.license_url = gpl2_iri
        self.assertEqual(
            self.dataset.get_license(), gpl2_iri,
            "get_license() didn't return the expected license_url")

    def test_set_citation(self):
        citation_iri = \
            "http://purl.obolibrary.org/obo/uberon/releases/2016-01-26/uberon.owl"
        self.dataset.set_citation(citation_iri)
        self.assertTrue(self.dataset.citation.issuperset([citation_iri]))
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI,
                 URIRef(self.iri_cites_as_authority),
                 URIRef(citation_iri))))
        self.assertTrue(len(triples) == 1, "missing citation triple")

    def test_set_ingest_source_file_version_num(self):
        this_version = "version1234"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_num(file_iri, this_version)
        triples = list(
            self.dataset.graph.triples(
                (URIRef(file_iri), self.iri_version, Literal(this_version))))
        self.assertTrue(
            len(triples) == 1, "ingest source file version not set")

    def test_set_ingest_source_file_version_date(self):
        this_version = "1970-01-01"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_date(file_iri, this_version)
        triples = list(
            self.dataset.graph.triples(
                (URIRef(file_iri), self.iri_version,
                 Literal(this_version, datatype=XSD.date))))
        self.assertTrue(
            len(triples) == 1,
            "ingest source file version not set with literal type of date")

    #
    # Test summary level triples:
    #
    def test_summary_level_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1, "missing summary level type triple")

    def test_summary_level_title(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_title,
                 Literal(self.ingest_title))))
        self.assertTrue(
            len(triples) == 1, "missing summary level title triple")

    def test_summary_level_description(self):
        # by default, description is the class's docstring
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing summary level description triple")

    def test_summary_level_publisher(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing summary level publisher triple")

    def test_summary_level_source_web_page(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_source,
                 URIRef(self.ingest_url))))
        self.assertTrue(
            len(triples) == 1, "missing summary level source page triple")

    def test_summary_level_source_logo(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_logo,
                 URIRef(self.iri_returned_logo))))
        self.assertTrue(
            len(triples) == 1, "missing summary level source logo triple")

    def test_summary_level_ontology_type_declaration(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_rdf_type,
                 self.iri_owl_ontology)))
        self.assertTrue(
            len(triples) == 1, "missing summary level owl ontology type triple")

    def test_summary_level_owl_version_iri(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_owl_version_iri, None)))
        self.assertTrue(
            len(triples) == 1, "missing summary level owl version iri triple")
        self.assertEqual(
            triples[0][2], URIRef(self.version_level_IRI),
            "owl version iri triple has the wrong object")

    #
    # Test version level resource triples:
    #
    def test_version_level_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1, "missing version level type triple")

    def test_version_level_title(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_title, None)))
        self.assertTrue(
            len(triples) == 1, "missing version level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " Monarch version "
                    + self.data_release_version),
            "version level title triple has wrong value")

    def test_version_level_description(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing version level description triple")

    def test_version_level_created(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_created, None)))
        self.assertTrue(
            len(triples) == 1, "didn't get exactly 1 version level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date),
            "version level created triple has the wrong timestamp")

    def test_version_level_version_default(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1, "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.data_release_version, datatype=XSD.date),
            "version level version triple (default) has the wrong timestamp")

    def test_version_level_version_set_explicitly(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=self.data_release_version,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=None,
            data_rights=self.data_rights)
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1, "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.data_release_version, datatype=XSD.date),
            "version level version triple (set explicitly) is wrong")

    def test_version_level_creator(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_creator, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing version level creator triple")

    def test_version_level_publisher(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing version level publisher triple")

    def test_version_level_isVersionOf(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_is_version_of,
                 self.summary_level_IRI)))
        self.assertTrue(
            len(triples) == 1, "missing version level isVersionOf triple")

    def test_version_level_distribution(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_distribution,
                 self.distribution_level_IRI_ttl)))
        self.assertTrue(
            len(triples) == 1, "missing version level distribution triple")

    #
    # test distribution level triples
    #
    def test_distribution_level_dataset_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_rdf_type,
                 self.iri_dataset)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level dataset type triple")

    def test_distribution_level_distribution_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_rdf_type,
                 self.iri_distribution)))
        self.assertTrue(
            len(triples) == 1,
            "missing distribution level distribution type triple")

    def test_distribution_level_title(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_title, None)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " distribution " + self.distribution_type),
            "distribution level title triple has wrong value")

    def test_distribution_level_description(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level description triple")

    def test_distribution_level_created(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_created, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))

    def test_distribution_level_version(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level version triple")
        self.assertEqual(
            triples[0][2], Literal(self.data_release_version, datatype=XSD.date))

    def test_distribution_level_creator(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_creator,
                 self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level creator triple")

    def test_distribution_level_publisher(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_publisher,
                 self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level publisher triple")

    def test_distribution_level_created_with(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_created_with,
                 self.iri_dipper)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level createdWith triple")

    def test_distribution_level_format(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_format,
                 self.iri_ttl_spec)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level format triple")

    def test_distribution_level_download_url(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_download_url, None)))
        self.assertTrue(
            len(triples) == 1, "didn't get exactly 1 downloadURL triple")
        self.assertEqual(
            triples[0][2], self.iri_expected_download_url_value,
            "didn't get the expected downloadURL value")

    def test_distribution_level_license_url(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_license,
                 URIRef(self.license_url))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level license triple")

    def test_distribution_level_data_rights(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_data_rights,
                 URIRef(self.data_rights))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level data rights triple")

    def test_distribution_level_no_license_url_default_value(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=None,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=None,
            data_rights=self.data_rights)
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl_default_version,
                 self.iri_license,
                 URIRef(self.license_url_default))))
        self.assertTrue(
            len(triples) == 1, "distribution level default license triple not set")
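

# Illustrative sketch only (not part of the original test module): the IRIs the
# tests above assert are plain string compositions over the curie map, as set up
# in setUpClass().  The base expansion below is an assumed example value for the
# "MonarchArchive" prefix; the identifier and release version mirror the test
# fixtures.
def _example_hcls_iri_composition():
    base = "https://archive.monarchinitiative.org/"  # assumed expansion of "MonarchArchive"
    identifier = "fakeingest"
    version = "19700101"
    summary_level = base + "#" + identifier
    version_level = base + version + "/" + "#" + identifier
    distribution_level = base + version + "/rdf/" + identifier + ".ttl"
    return summary_level, version_level, distribution_level
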
class Source:
    """
    Abstract class for any data sources that we'll import and process.
    Each of the subclasses will fetch() the data, scrub() it as necessary,
    then parse() it into a graph.  The graph will then be written out to
    a single self.name().<dest_fmt> file.

    Also provides a means to marshal metadata in a consistent fashion

    Houses the global translation table (from ontology label to ontology term)
    so it may as well be used everywhere.
    """

    DIPPERCACHE = 'https://archive.monarchinitiative.org/DipperCache'
    namespaces = {}
    files = {}
    ARGV = {}

    def __init__(
            self,
            graph_type='rdf_graph',     # or streamed_graph
            are_bnodes_skized=False,    # typically True
            data_release_version=None,
            name=None,          # identifier; make an URI for nquads
            ingest_title=None,
            ingest_url=None,
            ingest_logo=None,   # this should be the name of file on 'MonarchLogoRepo'
            ingest_description=None,
            license_url=None,   # only if it is _our_ lic
            data_rights=None,   # their page that points to their current lic
            file_handle=None,
    ):
        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml('../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.data_release_version = data_release_version
        self.ingest_title = ingest_title
        self.ingest_url = ingest_url
        self.ingest_logo = ingest_logo
        self.ingest_description = ingest_description
        self.license_url = license_url
        self.data_rights = data_rights
        self.localtt = self.load_local_translationtable(name)
        self.remote_file_timestamps = dict()

        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = '/'.join(('raw', self.name))
        self.testname = name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            raw_pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, raw_pth)
        # else:  # raw data dir does exist. maybe should consider what is in it?

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            out_pth = os.path.abspath(self.outdir)
            LOG.info("created output directory %s", out_pth)
        else:
            out_pth = os.path.abspath(self.outdir)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as Protégé need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]
            LOG.info("Creating graph %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)
        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            dest_file = open(out_pth + '/' + name + '.nt', 'w')  # where is the close?
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        if self.ingest_description and getdoc(self) is not None:
            self.ingest_description = getdoc(self)

        self.dataset = Dataset(
            identifier=self.name,
            data_release_version=self.data_release_version,
            ingest_name=self.name,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo,
            ingest_description=self.ingest_description,  # description
            license_url=self.license_url,   # only _OUR_ lic
            data_rights=self.data_rights,   # tries to point to others lics
            graph_type=graph_type,
            file_handle=file_handle
        )

        # see jenkins file: human, mouse, zebrafish, fly, worm, rat
        self.COMMON_TAXON = ['9606', '10090', '7955', '7227', '6239']  # '10116'

    def fetch(self, is_dl_forced=False):
        """
        abstract method to fetch all data from an external resource.
        this should be overridden by subclasses
        :return: None
        """
        raise NotImplementedError

    def parse(self, limit):
        """
        abstract method to parse all data from an external resource
        that was fetched in fetch();
        this should be overridden by subclasses
        :return: None
        """
        raise NotImplementedError

    def write(self, fmt='turtle', stream=None, write_metadata_in_main_graph=False):
        """
        This convenience method will write out all of the graphs associated
        with the source.  Right now these are hardcoded to be a single main
        "graph" and a "src_dataset.ttl" and a "src_test.ttl".
        If you do not supply stream='stdout' it will default to writing these
        to files.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.
        :return: None
        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',         # ntriples
            'nquads': 'nq',
            'n3': 'n3'          # notation3
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                dest = '.'.join((dest, fmt))
            LOG.info("Setting outfile to %s", dest)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)
        else:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'

        graph_util = GraphUtils(None)

        # the _dataset description is always turtle
        graph_util.write(
            self.dataset.get_graph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            graph_util.write(self.testgraph, 'turtle', filename=self.testfile)

        if write_metadata_in_main_graph:
            self.graph = self.graph + self.dataset.get_graph()

        # print graph out
        if stream is None:
            outfile = dest
        elif stream.lower().strip() == 'stdout':
            outfile = None
        else:
            LOG.error("I don't understand our stream.")
            return

        graph_util.write(self.graph, fmt, filename=outfile)

    def whoami(self):
        ''' pointless convenience '''
        LOG.info("Ingest is %s", self.name)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        a method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        :param long_string:
        :return:
        """
        return ':'.join((prefix, Source.hash_id(long_string)))

    @staticmethod
    def hash_id(wordage):  # same as graph/GraphUtils.digest_id(wordage)
        """
        prepend 'b' to avoid leading with digit;
        truncate to a 20 char sized word with a leading 'b';
        return truncated sha1 hash of string.

        by the birthday paradox, expect a 50% chance of collision
        after 69 billion invocations;
        however these are only hoped to be unique within a single file

        Consider reducing to 17 hex chars to fit in a 64 bit word:
        16, discounting a leading constant, gives a 50% chance of collision
        at about 4.3 billion unique input strings
        (currently _many_ orders of magnitude below that)

        :param wordage: str string to be hashed
        :return: str hash of id
        """
        return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]

    def check_if_remote_is_newer(self, remote, local, headers):
        """
        Given a remote file location and the corresponding local file,
        this will check the datetime stamp on the files to see if the remote
        one is newer.
        This is a convenience method so that we don't have to re-fetch files
        that we already have saved locally

        :param remote: URL of file to fetch from remote server
        :param local: pathname to save file to locally
        :return: True if the remote file is newer and should be downloaded
        """
        is_remote_newer = False
        LOG.info("Checking if remote file is newer than local \n(%s)", local)

        # check if local file exists
        # if no local file, then remote is newer
        if os.path.exists(local):
            LOG.info("Local File exists as %s", local)
        else:
            LOG.info("Local File does NOT exist as %s", local)
            return True

        # get remote file details (if possible)
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)
        LOG.info("Request header for %s \n\tis: %s", remote, str(req.header_items()))

        try:
            response = urllib.request.urlopen(req)
        except urllib.error.URLError as err:
            resp_headers = None
            size = 0
            last_modified = None
            LOG.error('%s\n\tFor: %s', err, remote)
            is_remote_newer = None

        if is_remote_newer is not None:
            resp_headers = response.info()
            size = resp_headers.get('Content-Length')
            last_modified = resp_headers.get('Last-Modified')

            if size is not None and size != '':
                size = int(size)
            else:
                size = 0

            fstat = os.stat(local)
            LOG.info(
                "Local File date: %s",
                datetime.utcfromtimestamp(fstat[ST_CTIME]))

            if last_modified is not None:
                # Thu, 07 Aug 2008 16:20:19 GMT
                dt_obj = datetime.strptime(
                    last_modified, "%a, %d %b %Y %H:%M:%S %Z")
                self.remote_file_timestamps[remote] = dt_obj

                # get local file details
                # check date on local vs remote file
                if dt_obj > datetime.utcfromtimestamp(fstat[ST_CTIME]):
                    # check if file size is different
                    if fstat[ST_SIZE] < size:
                        LOG.info("New Remote File exists")
                        is_remote_newer = True
                    if fstat[ST_SIZE] > size:
                        LOG.warning("New Remote File exists but it is SMALLER")
                        is_remote_newer = True
                    # filesize is a fairly imperfect metric here
                    LOG.info("New Remote File has same filesize--will not download")
            elif fstat[ST_SIZE] != size:
                LOG.info(
                    "Remote File is %i \t Local File is %i",
                    size, fstat[ST_SIZE])
                is_remote_newer = True
            response.close()
        return is_remote_newer

    def get_files(self, is_dl_forced, files=None, delay=0):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.

        :param is_dl_forced - boolean
        :param files dict - override instance files dict
        :return: None
        """
        fstat = None
        if files is None:
            files = self.files
        for src_key in files:
            headers = None
            filesource = files[src_key]
            if 'clean' in filesource:
                cleaned_file_iri = filesource['clean']
            else:
                cleaned_file_iri = filesource['url']

            # attempt to fetch from a web cache
            remote_file = '/'.join(
                (self.DIPPERCACHE, self.name, filesource['file']))
            local_file = '/'.join((self.rawdir, filesource['file']))
            cache_response = self.fetch_from_url(
                remote_file, local_file, is_dl_forced)
            if cache_response:
                LOG.info(
                    "Found File '%s/%s' in DipperCache",
                    self.name, filesource['file'])
                self.dataset.set_ingest_source(cleaned_file_iri)
                if remote_file in self.remote_file_timestamps:
                    # Here the timestamp on the file in DipperCache is a best-effort
                    # representation of the earliest time the file _could_ have been
                    # retrieved from source, not necessarily when it _was_ retrieved
                    # (e.g. not 11 years ago), which will be not-before the timestamp
                    # (modulo timezones).
                    timestamp = Literal(
                        self.remote_file_timestamps[remote_file],
                        datatype=XSD.dateTime)
                    self.dataset.graph.addTriple(
                        cleaned_file_iri, self.globaltt['retrieved_on'], timestamp)
            else:
                LOG.warning(
                    "File %s/%s absent from DipperCache",
                    self.name, filesource['file'])

                if 'headers' in filesource:
                    headers = filesource['headers']
                LOG.info("Getting %s", src_key)

                # if the key 'clean' exists in the sources `files` dict
                # expose that instead of the longer url
                self.dataset.set_ingest_source(cleaned_file_iri)

                LOG.info('Fetching %s in %i seconds', cleaned_file_iri, delay)
                time.sleep(delay)

                if not self.fetch_from_url(
                        filesource['url'],
                        '/'.join((self.rawdir, filesource['file'])),
                        is_dl_forced, headers):
                    LOG.warning('FAILED FETCH of %s', filesource['url'])

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

            self.dataset.graph.addTriple(
                self.dataset.version_level_curie, self.globaltt["Source"],
                cleaned_file_iri)

            filedate = Literal(
                datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y%m%d"),
                datatype=XSD.date)
            self.dataset.graph.addTriple(
                cleaned_file_iri, self.globaltt['retrieved_on'], filedate)

    def fetch_from_url(
            self, remoteurl, localfile=None, is_dl_forced=False, headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is, fetch the remote file and save
        it to the specified localfile, reporting the basic file information
        once it is downloaded

        :param remoteurl: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :return: bool
        """
        response = None
        result = False
        rmt_check = self.check_if_remote_is_newer(remoteurl, localfile, headers)
        if (is_dl_forced is True) or (localfile is None) or (
                rmt_check is not None and rmt_check):
            if headers is None:
                headers = self._get_default_request_headers()
            try:
                request = urllib.request.Request(remoteurl, headers=headers)
                response = urllib.request.urlopen(request)
            except urllib.error.HTTPError as httpErr:
                # raise Exception(httpErr.read())
                LOG.error('NETWORK issue %s\n\tFor: %s', httpErr.read(), remoteurl)
                return False  # allows retry (e.g. not found in Cache)
            except urllib.error.URLError as urlErr:
                LOG.error('URLError %s\n\tFor: %s', urlErr, remoteurl)

            result = response is not None
            if localfile is not None and result:
                with open(localfile, 'wb') as binwrite:
                    while True:
                        chunk = response.read(CHUNK)
                        if not chunk:
                            break
                        binwrite.write(chunk)

                LOG.info("Finished. Wrote file to %s", localfile)
                if self.compare_local_remote_bytes(remoteurl, localfile, headers):
                    LOG.debug("local file is same size as remote after download")
                else:
                    raise Exception(
                        "Error downloading file: "
                        "local file size != remote file size")

                fstat = os.stat(localfile)
                LOG.info("file size: %s", fstat[ST_SIZE])
                LOG.info(
                    "file created: %s",
                    time.asctime(time.localtime(fstat[ST_CTIME])))
                response.close()
            else:
                LOG.error('Local filename is required')
                exit(-1)
        else:
            LOG.info("Using existing file %s", localfile)

        return result

    # TODO: rephrase as mysql-dump-xml specific format
    def process_xml_table(self, elem, table_name, processing_function, limit):
        """
        This is a convenience function to process the elements of an
        xml dump of a mysql relational database.
        The "elem" is akin to a mysql table, with its name of ```table_name```.
        It will process each ```row``` given the ```processing_function``` supplied.

        :param elem: The element data
        :param table_name: The name of the table to process
        :param processing_function: The row processing function
        :param limit:

        Appears to be making calls to the elementTree library
        although it is not explicitly imported here.

        :return:
        """
        line_counter = 0
        table_data = elem.find("[@name='" + table_name + "']")
        if table_data is not None:
            LOG.info("Processing " + table_name)
            row = {}
            for line in table_data.findall('row'):
                for field in line.findall('field'):
                    atts = dict(field.attrib)
                    row[atts['name']] = field.text
                processing_function(row)
                line_counter += 1
                if self.test_mode and limit is not None and line_counter > limit:
                    continue

            elem.clear()  # discard the element

    @staticmethod
    def _check_list_len(row, length):
        """
        Sanity check for csv parser
        :param row
        :param length
        :return: None
        """
        if len(row) != length:
            raise Exception(
                "row length does not match expected length of " +
                str(length) + "\nrow: " + str(row))

    @staticmethod
    def get_file_md5(directory, filename, blocksize=2**20):
        # reference:
        # http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python
        md5 = hashlib.md5()
        with open(os.path.join(directory, filename), "rb") as bin_reader:
            while True:
                buff = bin_reader.read(blocksize)
                if not buff:
                    break
                md5.update(buff)
        return md5.hexdigest()

    def get_remote_content_len(self, remote, headers=None):
        """
        :param remote:
        :return: size of remote file
        """
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)

        try:
            response = urllib.request.urlopen(req)
            resp_header = response.info()
            byte_size = resp_header.get('Content-length')
        except OSError as err:
            byte_size = None
            LOG.error('%s\n\tFor: %s', err, remote)

        return byte_size

    @staticmethod
    def get_local_file_size(localfile):
        """
        :param localfile:
        :return: size of file
        """
        byte_size = os.stat(localfile)
        return byte_size[ST_SIZE]

    def compare_local_remote_bytes(self, remotefile, localfile, remote_headers=None):
        """
        test to see if fetched file is the same size as the remote file
        using information in the content-length field in the HTTP header
        :return: True or False
        """
        is_equal = True
        remote_size = self.get_remote_content_len(remotefile, remote_headers)
        local_size = self.get_local_file_size(localfile)
        if remote_size is not None and local_size != int(remote_size):
            is_equal = False
            LOG.error(
                'local file and remote file different sizes\n'
                '%s has size %s, %s has size %s',
                localfile, local_size, remotefile, remote_size)
        return is_equal

    @staticmethod
    def file_len(fname):
        with open(fname) as lines:
            length = sum(1 for line in lines)
        return length

    def settestonly(self, testonly):
        """
        Set that this source should only be processed in testMode
        :param testonly:
        :return: None
        """
        self.test_only = testonly

    def settestmode(self, mode):
        """
        Set testMode to (mode).
        - True: run the Source in testMode;
        - False: run it in full mode
        :param mode:
        :return: None
        """
        self.test_mode = mode

    def getTestSuite(self):
        """
        An abstract method that should be overwritten with
        tests appropriate for the specific source.
        :return:
        """
        return None

    @staticmethod
    def remove_backslash_r(filename, encoding):
        """
        A helpful utility to remove Carriage Return from any file.
        This will read a file into memory,
        and overwrite the contents of the original file.

        TODO: This function may be a liability

        :param filename:
        :return:
        """
        with open(filename, 'r', encoding=encoding, newline='\n') as filereader:
            contents = filereader.read()

        contents = re.sub(r'\r', '', contents)
        with open(filename, "w") as filewriter:
            filewriter.truncate()
            filewriter.write(contents)

    @staticmethod
    def open_and_parse_yaml(yamlfile):
        """
        :param yamlfile: String, path to file containing label-id mappings
            in the first two columns of each row
        :return: dict where keys are labels and values are ids
        """
        # ??? what if the yaml file does not contain a dict datastructure?
        mapping = dict()
        if os.path.exists(os.path.join(os.path.dirname(__file__), yamlfile)):
            map_file = open(os.path.join(os.path.dirname(__file__), yamlfile), 'r')
            mapping = yaml.safe_load(map_file)
            map_file.close()
        else:
            LOG.warning("file: %s not found", yamlfile)

        return mapping

    @staticmethod
    def parse_mapping_file(file):
        """
        :param file: String, path to file containing label-id mappings
            in the first two columns of each row
        :return: dict where keys are labels and values are ids
        """
        id_map = {}
        if os.path.exists(os.path.join(os.path.dirname(__file__), file)):
            with open(os.path.join(os.path.dirname(__file__), file)) as tsvfile:
                reader = csv.reader(tsvfile, delimiter="\t")
                for row in reader:
                    key = row[0]
                    value = row[1]
                    id_map[key] = value

        return id_map

    @staticmethod
    def _get_default_request_headers():
        return {
            'User-Agent': USER_AGENT
        }

    # @staticmethod
    # def getTestSuite(ingest):  # WIP
    #     '''
    #     try to avoid having one of these per ingest
    #     '''
    #     import unittest
    #     testcase = ingest + 'TestCase'
    #     # construct import names ... how
    #     from tests.test_ + ingest import testcase
    #     return unittest.TestLoader().loadTestsFromTestCase(testcase)

    def load_local_translationtable(self, name):
        '''
        Load "ingest specific" translation from whatever they called something
        to the ontology label we need to map it to.
        To facilitate seeing more ontology labels in dipper ingests,
        a reverse mapping from ontology labels to external strings is also
        generated and available as a dict localtcid

        '---\n# %s.yaml\n"": ""  # example'
        '''
        localtt_file = '../../translationtable/' + name + '.yaml'

        try:
            with open(os.path.join(os.path.dirname(__file__), localtt_file)):
                pass
        except IOError:
            # write a stub file as a place holder if none exists
            with open(os.path.join(os.path.dirname(__file__), localtt_file),
                      'w') as write_yaml:
                print('---\n# %s.yaml\n"": ""  # example' % name, file=write_yaml)
        finally:
            with open(os.path.join(os.path.dirname(__file__), localtt_file),
                      'r') as read_yaml:
                localtt = yaml.safe_load(read_yaml)

        # inverse local translation.
        # note: keeping this invertible will be work.
        # Useful to not litter an ingest with external syntax
        self.localtcid = {v: k for k, v in localtt.items()}

        return localtt

    def resolve(self, word, mandatory=True, default=None):
        '''
        composite mapping:
        given f(x) and g(x), here localtt & globaltt respectively,
        return g(f(x)) | g(x) | f(x) | x in order of preference;
        returns x | default on fall through
        if finding a mapping is not mandatory (by default finding is mandatory).

        This may be specialized further from any mapping
        to a global mapping only; if need be.

        :param word: the string to find as a key in translation tables
        :param mandatory: boolean to cause failure when no key exists
        :param default: string to return if nothing is found (& not mandatory)
        :return:
            value from global translation table,
            or value from local translation table,
            or the query key if finding a value is not mandatory (in this order)
        '''
        assert word is not None

        # we may not agree with a remote source's use of a global term we have;
        # this provides opportunity for us to override
        if word in self.localtt:
            label = self.localtt[word]
            if label in self.globaltt:
                term_id = self.globaltt[label]
            else:
                logging.info(
                    "Translated to '%s' but no global term_id for: '%s'",
                    label, word)
                term_id = label
        elif word in self.globaltt:
            term_id = self.globaltt[word]
        else:
            if mandatory:
                raise KeyError("Mapping required for: ", word)
            logging.warning("We have no translation for: '%s'", word)
            if default is not None:
                term_id = default
            else:
                term_id = word
        return term_id

    @staticmethod
    def check_fileheader(expected, received, src_key=None):
        '''
        Compare file headers received versus file headers expected;
        if the expected headers are a subset (proper or not) of received
        headers, report success (warn if proper subset)

        param: expected  list
        param: received  list
        return: truthiness
        '''
        exp = set(expected)
        got = set(received)
        if expected != received:
            LOG.error(
                'file resource: %s\nExpected header:\n %s\nReceived header:\n %s',
                src_key, expected, received)

            # pass reordering and adding new columns (after protesting)
            # hard fail on missing expected columns (temper with mandatory cols?)
            if exp - got != set():
                LOG.error('Missing: %s', exp - got)
                raise AssertionError('Incoming headers are missing expected column.')
            if got - exp != set():
                LOG.warning('Additional new columns: %s', got - exp)
            else:
                LOG.warning('Check columns order')

        return (exp ^ got) & exp == set()

    def command_args(self):
        '''
        To make arbitrary variables from dipper-etl.py's calling environment
        available when working in source ingests in a hopefully universal way

        Does not appear to be populated till after an ingest's __init__() finishes.
        '''
        # testing
        LOG.info(
            'Command line arguments available to %s:\n%s',
            self.name,
            "\n".join(["\t'{}': '{}'".format(k, v) for k, v in self.ARGV.items()]))
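

# Illustrative sketch only: a minimal Source subclass following the pattern the
# class docstring above describes (fetch() the data, then parse() it into
# self.graph).  The ingest name, file key, and URL below are invented for the
# example and are not part of dipper; real ingests live in dipper/sources/.
class ExampleIngest(Source):
    files = {
        'data': {
            'file': 'data.tsv',                    # hypothetical raw file name
            'url': 'http://example.org/data.tsv',  # hypothetical source URL
        },
    }

    def __init__(self, graph_type='rdf_graph', are_bnodes_skolemized=False,
                 data_release_version=None):
        super().__init__(
            graph_type=graph_type,
            are_bnodes_skized=are_bnodes_skolemized,
            data_release_version=data_release_version,
            name='exampleingest',
            ingest_title='Example Ingest',
            ingest_url='http://example.org',
        )

    def fetch(self, is_dl_forced=False):
        # downloads (or reuses) raw/exampleingest/data.tsv and records
        # source-file metadata on self.dataset
        self.get_files(is_dl_forced)

    def parse(self, limit=None):
        # read the raw file and add triples to self.graph here, resolving
        # source vocabulary with self.resolve() / self.globaltt
        pass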