Example #1
    def execute(self, transform_manager, input=None):
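        # Gather the input filenames: the optional direct input plus the
        # outputs of any chained transforms.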
        inputs = [input] if input else []
        inputs += [other(transform_manager) for other in self.others]

        transform_manager.start(self, inputs)

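        # Parse each input file and lazily chain their triples into a single stream.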
        inputs = [parse(open(fn)) for fn in inputs]
        triples = itertools.chain(*inputs)

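        # Serialize the combined stream as N-Triples to a managed output file
        # and return its name.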
        with transform_manager('nt') as output:
            serialize(triples, output)
            return output.name
Example #2
    def archive(self):
        notation = self.notation or hashlib.sha1(self.dataset).hexdigest()

        archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug, notation.replace('/', '-'))
        archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation))
        data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-')))

        if not os.path.exists(archive_path):
            os.makedirs(archive_path, 0755)

        nt_fd, nt_name = tempfile.mkstemp('.nt')
        rdf_fd, rdf_name = tempfile.mkstemp('.rdf')
        try:
            nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w')
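            # Dump the triples from every graph into the N-Triples temp file.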
            for graph_name in self.graph_names:
                self._graph_triples(nt_out, graph_name)
            nt_out.close()

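            # De-duplicate the dump with an external sort, reading its output as a stream.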
            sort = subprocess.Popen(['sort', '-u', nt_name], stdout=subprocess.PIPE)
            try:
                triples = itertools.chain(self._get_metadata(rdflib.URIRef(''),
                                                             archive_graph_name),
                                          parse(sort.stdout, 'nt').get_triples())
                serialize(triples, rdf_out, rdf_name)
            finally:
                # Make sure stdout gets closed so that if the try block raises
                # an exception we don't keep a sort process hanging around.
                sort.stdout.close()
                sort.wait()
            rdf_out.close()

            previous_name = os.path.join(archive_path, 'latest.rdf')
            # Only update if the file has changed, or hasn't been archived before.
            if not os.path.exists(previous_name) or not filecmp.cmp(previous_name, rdf_name, shallow=False):
                new_name = os.path.join(archive_path,
                                        self.updated.astimezone(pytz.utc).isoformat() + '.rdf')
                shutil.move(rdf_name, new_name)
                os.chmod(new_name, 0644)
                if os.path.exists(previous_name):
                    os.unlink(previous_name)
                os.symlink(new_name, previous_name)

                # Upload the metadata to the store using an absolute URI.
                metadata = self._get_metadata(data_dump_url, archive_graph_name)
                Uploader.upload([self.store], archive_graph_name, graph=metadata)
        finally:
            os.unlink(nt_name)
            if os.path.exists(rdf_name):
                os.unlink(rdf_name)
            self.filter_old_archives(archive_path)
Example #3
    def execute(self, transform_manager):

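        # Query the store's SPARQL endpoint, preferring plain-text (N-Triples) results.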
        endpoint = Endpoint(transform_manager.store.query_endpoint, preferred_media_types=('text/plain',))

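        # The query may be given inline as a string, or as another transform
        # whose output file contains the query text.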
        if isinstance(self.query, basestring):
            query = self.query
        else:
            query_filename = self.query.execute(transform_manager)
            with open(query_filename, 'r') as query_file:
                query = query_file.read()

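        # Stream the (deferred) query results into the output file as N-Triples.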
        with open(transform_manager('nt'), 'w') as output:
            transform_manager.start(self, [])
            serialize(endpoint.query(query, defer=True), output)
            transform_manager.end([output.name])
        return output.name
Example #4
    def run_normalization(self, normalization, triples):
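        # Apply a normalization repeatedly, ping-ponging the triples between
        # two scratch files until it reports itself done.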
        try:
            in_file, out_file = [tempfile.NamedTemporaryFile(suffix='.rdf', delete=False) for i in range(2)]
            serialize(triples, in_file)

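            # Each pass reads the previous output and writes a fresh one;
            # truncate() discards leftover bytes from a longer earlier pass,
            # then the files swap roles.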
            while not normalization.done:
                in_file.seek(0)
                out_file.seek(0)
                pipeline = normalization(parse(in_file).get_triples())
                serialize(pipeline, out_file)
                out_file.truncate()
                in_file, out_file = out_file, in_file

            in_file.seek(0)

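            # Load the final pass's output into an rdflib graph, keeping
            # blank node identifiers stable.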
            graph = rdflib.ConjunctiveGraph()
            graph.parse(in_file, preserve_bnode_ids=True)
            return graph
        finally:
            # The scratch files were created with delete=False, so close and
            # remove them explicitly.
            for f in (in_file, out_file):
                f.close()
                os.unlink(f.name)
Example #5
    def execute(self, transform_manager, input):
        transform_manager.start(self, [])

        endpoint = Endpoint(transform_manager.store.query_endpoint)

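        # Give each normalization access to the SPARQL endpoint and the store
        # being transformed.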
        for normalization in self.normalizations:
            normalization.endpoint = endpoint
            normalization.store = transform_manager.store

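        # Re-run the remaining normalizations until each reports itself done;
        # every pass writes a new file that becomes the next pass's input.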
        while self.normalizations:
            with open(input, 'r') as source:
                pipeline = parse(source).get_triples()
                for normalization in self.normalizations:
                    pipeline = normalization(pipeline)
                with open(transform_manager('rdf'), 'w') as target:
                    serialize(pipeline, target)

            input = target.name
            self.normalizations = [n for n in self.normalizations if not n.done]

        return input
Example #6
    def archive(self):
        notation = self.notation or hashlib.sha1(self.dataset).hexdigest()

        archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug, notation.replace('/', '-'))
        archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation))
        data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-')))
        data_dump_with_labels_url = rdflib.URIRef('{0}archive/{1}/{2}/latest-with-labels.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-')))

        if not os.path.exists(archive_path):
            os.makedirs(archive_path, 0755)

        nt_fd, nt_name = tempfile.mkstemp('.nt')
        rdf_fd, rdf_name = tempfile.mkstemp('.rdf')
        rdf_with_labels_fd, rdf_with_labels_name = tempfile.mkstemp('.rdf')
        try:
            nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w')
            rdf_with_labels_out = os.fdopen(rdf_with_labels_fd, 'w')
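            # Dump the triples from every graph into the N-Triples temp file.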
            for graph_name in self.graph_names:
                self._graph_triples(nt_out, graph_name)
            nt_out.close()

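            # Sort and de-duplicate the dump, then serialize it twice: once
            # as-is and once with human-readable labels added.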
            with tempfile.TemporaryFile() as sorted_triples:
                subprocess.call(['sort', '-u', nt_name], stdout=sorted_triples)

                sorted_triples.seek(0)
                triples = itertools.chain(self._get_metadata(rdflib.URIRef(''),
                                                             data_dump_with_labels_url,
                                                             archive_graph_name),
                                          parse(sorted_triples, 'nt').get_triples())
                serialize(triples, rdf_out, 'rdf')
                rdf_out.close()

                sorted_triples.seek(0)
                triples = itertools.chain(self._get_metadata(rdflib.URIRef(''),
                                                             data_dump_with_labels_url,
                                                             archive_graph_name),
                                          self.with_labels(parse(sorted_triples, 'nt').get_triples()))
                serialize(triples, rdf_with_labels_out, 'rdf')
                rdf_with_labels_out.close()

            previous_name = os.path.join(archive_path, 'latest.rdf')
            # Only update if the file has changed, or hasn't been archived before.
            if not os.path.exists(previous_name) or not filecmp.cmp(previous_name, rdf_name, shallow=False):
                new_name = os.path.join(archive_path,
                                        self.updated.astimezone(pytz.utc).isoformat() + '.rdf')
                shutil.move(rdf_name, new_name)
                os.chmod(new_name, 0644)
                if os.path.exists(previous_name):
                    os.unlink(previous_name)
                os.symlink(new_name, previous_name)

                new_with_labels_name = os.path.join(archive_path, 'latest-with-labels.rdf')
                shutil.move(rdf_with_labels_name, new_with_labels_name)
                os.chmod(new_with_labels_name, 0644)

                # Upload the metadata to the store using an absolute URI.
                metadata = self._get_metadata(data_dump_url, data_dump_with_labels_url, archive_graph_name)
                Uploader.upload([self.store], archive_graph_name, graph=metadata)
        finally:
            os.unlink(nt_name)
            # Either RDF file may already have been moved into the archive above.
            for name in (rdf_name, rdf_with_labels_name):
                if os.path.exists(name):
                    os.unlink(name)
            self.filter_old_archives(archive_path)