def execute(self, transform_manager, input=None):
    inputs = [input] if input else []
    inputs += [other(transform_manager) for other in self.others]

    transform_manager.start(self, inputs)

    inputs = [parse(open(fn)) for fn in inputs]
    triples = itertools.chain(*inputs)

    with transform_manager('nt') as output:
        serialize(triples, output)
        return output.name
def archive(self):
    notation = self.notation or hashlib.sha1(self.dataset).hexdigest()
    archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug,
                                notation.replace('/', '-'))

    archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation))
    data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL,
                                                                         self.store.slug,
                                                                         notation.replace('/', '-')))

    if not os.path.exists(archive_path):
        os.makedirs(archive_path, 0755)

    nt_fd, nt_name = tempfile.mkstemp('.nt')
    rdf_fd, rdf_name = tempfile.mkstemp('.rdf')
    try:
        nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w')

        # Dump every graph for this dataset as N-Triples, one triple per line,
        # so that `sort -u` can deduplicate them.
        for graph_name in self.graph_names:
            self._graph_triples(nt_out, graph_name)
        nt_out.close()

        sort = subprocess.Popen(['sort', '-u', nt_name], stdout=subprocess.PIPE)
        try:
            triples = itertools.chain(self._get_metadata(rdflib.URIRef(''), archive_graph_name),
                                      parse(sort.stdout, 'nt').get_triples())
            serialize(triples, rdf_out, rdf_name)
        finally:
            # Make sure stdout gets closed so that if the try block raises
            # an exception we don't keep a sort process hanging around.
            sort.stdout.close()
            sort.wait()
        rdf_out.close()

        previous_name = os.path.join(archive_path, 'latest.rdf')
        # Only update if the file has changed, or hasn't been archived before.
        if not os.path.exists(previous_name) or not filecmp._do_cmp(previous_name, rdf_name):
            new_name = os.path.join(archive_path,
                                    self.updated.astimezone(pytz.utc).isoformat() + '.rdf')
            shutil.move(rdf_name, new_name)
            os.chmod(new_name, 0644)

            # Repoint the latest.rdf symlink at the newly archived dump.
            if os.path.exists(previous_name):
                os.unlink(previous_name)
            os.symlink(new_name, previous_name)

            # Upload the metadata to the store using an absolute URI.
            metadata = self._get_metadata(data_dump_url, archive_graph_name)
            Uploader.upload([self.store], archive_graph_name, graph=metadata)
    finally:
        os.unlink(nt_name)
        if os.path.exists(rdf_name):
            os.unlink(rdf_name)

    self.filter_old_archives(archive_path)
def execute(self, transform_manager):
    endpoint = Endpoint(transform_manager.store.query_endpoint,
                        preferred_media_types=('text/plain',))

    if isinstance(self.query, basestring):
        query = self.query
    else:
        query_filename = self.query.execute(transform_manager)
        with open(query_filename, 'r') as query_file:
            query = query_file.read()

    with open(transform_manager('nt'), 'w') as output:
        transform_manager.start(self, [])
        serialize(endpoint.query(query, defer=True), output)
        transform_manager.end([output.name])
        return output.name
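# A rough stub, added for illustration, of the transform-manager protocol that
# the SPARQL execute() method above (and the normalization one further down)
# appears to rely on: calling the manager with a file extension returns a
# fresh output filename, start()/end() bracket a transform run, and `store`
# exposes the store whose `query_endpoint` is queried. The class name,
# constructor arguments and filename scheme are assumptions, not the project's
# real transform manager.
import os
import tempfile


class StubTransformManager(object):
    def __init__(self, store, output_directory=None):
        self.store = store
        self.output_directory = output_directory or tempfile.mkdtemp()
        self._counter = 0

    def __call__(self, extension):
        # Hand out a new filename ending in the requested extension.
        self._counter += 1
        return os.path.join(self.output_directory,
                            '{0}.{1}'.format(self._counter, extension))

    def start(self, transform, inputs):
        # A real manager would record provenance and timing here.
        pass

    def end(self, outputs):
        pass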
def run_normalization(self, normalization, triples):
    try:
        in_file, out_file = [tempfile.NamedTemporaryFile(suffix='.rdf', delete=False)
                             for i in range(2)]
        serialize(triples, in_file)

        # Ping-pong between the two temporary files, re-running the
        # normalization over its own output until it reports it is done.
        while not normalization.done:
            in_file.seek(0)
            out_file.seek(0)
            pipeline = normalization(parse(in_file).get_triples())
            serialize(pipeline, out_file)
            out_file.truncate()
            in_file, out_file = out_file, in_file

        in_file.seek(0)
        graph = rdflib.ConjunctiveGraph()
        graph.parse(in_file, preserve_bnode_ids=True)
        return graph
    finally:
        in_file.close()
        out_file.close()
def execute(self, transform_manager, input):
    transform_manager.start(self, [])

    endpoint = Endpoint(transform_manager.store.query_endpoint)
    for normalization in self.normalizations:
        normalization.endpoint = endpoint
        normalization.store = transform_manager.store

    # Keep re-running the remaining normalizations over the latest output
    # until every one of them reports that it is done.
    while self.normalizations:
        with open(input, 'r') as source:
            pipeline = parse(source).get_triples()
            for normalization in self.normalizations:
                pipeline = normalization(pipeline)
            with open(transform_manager('rdf'), 'w') as target:
                serialize(pipeline, target)
            input = target.name
        self.normalizations = [n for n in self.normalizations if not n.done]

    return input
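# A minimal sketch, added for illustration, of the normalization interface the
# two methods above appear to assume: a normalization is a callable over an
# iterable of triples that yields (possibly rewritten) triples, sets its
# `done` flag once it needs no further passes over the data, and has
# `endpoint` and `store` attributes assigned before it first runs. The class
# below is a hypothetical no-op example, not part of the original codebase.
class IdentityNormalization(object):
    def __init__(self):
        self.done = False
        self.endpoint = None  # assigned by the transform before the first pass
        self.store = None     # assigned by the transform before the first pass

    def __call__(self, triples):
        # Pass every triple through unchanged; a real normalization would
        # rewrite or augment the stream here.
        for triple in triples:
            yield triple
        # A single pass is enough for this no-op, so mark it as done once the
        # pipeline has been fully consumed.
        self.done = True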
def archive(self):
    notation = self.notation or hashlib.sha1(self.dataset).hexdigest()
    archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug,
                                notation.replace('/', '-'))

    archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation))
    data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL,
                                                                         self.store.slug,
                                                                         notation.replace('/', '-')))
    data_dump_with_labels_url = rdflib.URIRef('{0}archive/{1}/{2}/latest-with-labels.rdf'.format(SOURCE_URL,
                                                                                                 self.store.slug,
                                                                                                 notation.replace('/', '-')))

    if not os.path.exists(archive_path):
        os.makedirs(archive_path, 0755)

    nt_fd, nt_name = tempfile.mkstemp('.nt')
    rdf_fd, rdf_name = tempfile.mkstemp('.rdf')
    rdf_with_labels_fd, rdf_with_labels_name = tempfile.mkstemp('.rdf')
    try:
        nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w')
        rdf_with_labels_out = os.fdopen(rdf_with_labels_fd, 'w')

        # Dump every graph for this dataset as N-Triples so that `sort -u`
        # can deduplicate the triples.
        for graph_name in self.graph_names:
            self._graph_triples(nt_out, graph_name)
        nt_out.close()

        with tempfile.TemporaryFile() as sorted_triples:
            subprocess.call(['sort', '-u', nt_name], stdout=sorted_triples)

            # First pass: the plain dump.
            sorted_triples.seek(0)
            triples = itertools.chain(self._get_metadata(rdflib.URIRef(''),
                                                         data_dump_with_labels_url,
                                                         archive_graph_name),
                                      parse(sorted_triples, 'nt').get_triples())
            serialize(triples, rdf_out, 'rdf')
            rdf_out.close()

            # Second pass: the same triples, augmented with labels.
            sorted_triples.seek(0)
            triples = itertools.chain(self._get_metadata(rdflib.URIRef(''),
                                                         data_dump_with_labels_url,
                                                         archive_graph_name),
                                      self.with_labels(parse(sorted_triples, 'nt').get_triples()))
            serialize(triples, rdf_with_labels_out, 'rdf')
            rdf_with_labels_out.close()

        previous_name = os.path.join(archive_path, 'latest.rdf')
        # Only update if the file has changed, or hasn't been archived before.
        if not os.path.exists(previous_name) or not filecmp._do_cmp(previous_name, rdf_name):
            new_name = os.path.join(archive_path,
                                    self.updated.astimezone(pytz.utc).isoformat() + '.rdf')
            shutil.move(rdf_name, new_name)
            os.chmod(new_name, 0644)

            # Repoint the latest.rdf symlink at the newly archived dump.
            if os.path.exists(previous_name):
                os.unlink(previous_name)
            os.symlink(new_name, previous_name)

            new_with_labels_name = os.path.join(archive_path, 'latest-with-labels.rdf')
            shutil.move(rdf_with_labels_name, new_with_labels_name)
            os.chmod(new_with_labels_name, 0644)

            # Upload the metadata to the store using an absolute URI.
            metadata = self._get_metadata(data_dump_url, data_dump_with_labels_url, archive_graph_name)
            Uploader.upload([self.store], archive_graph_name, graph=metadata)
    finally:
        os.unlink(nt_name)
        if os.path.exists(rdf_name):
            os.unlink(rdf_name)

    self.filter_old_archives(archive_path)