Example #1
import io
import json
import os
import shutil
from datetime import datetime

from rdflib import ConjunctiveGraph, BNode, XSD
from rdflib.term import _toPythonMapping  # private rdflib mapping used by hack_dates()
from SPARQLWrapper import SPARQLWrapper

# NOTE: the import paths of the project-local modules below are assumed from
# the surrounding repository and may need adjusting
from reporter import Reporter
from support import find_paths


class Storer(object):
    def __init__(self,
                 graph_set=None,
                 repok=None,
                 reperr=None,
                 context_map=None,
                 default_dir="_",
                 dir_split=0,
                 n_file_item=1,
                 nt=False,
                 nq=False):
        self.nt = nt
        self.nq = nq
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.default_dir = default_dir
        # A 'None' default avoids sharing one mutable dict across instances,
        # and setting the attribute unconditionally keeps __get_context() and
        # __load_graph() from failing when nt/nq output is requested
        self.context_map = {} if context_map is None else context_map

        if not nt and not nq:
            # Replace each context URL's file path with its parsed JSON content
            for context_url, context_file_path in self.context_map.items():
                with open(context_file_path, encoding="utf-8") as f:
                    self.context_map[context_url] = json.load(f)

        if graph_set is None:
            self.g = []
        else:
            self.g = graph_set.graphs()
        if repok is None:
            self.repok = Reporter(prefix="[Storer: INFO] ")
        else:
            self.repok = repok
        if reperr is None:
            self.reperr = Reporter(prefix="[Storer: ERROR] ")
        else:
            self.reperr = reperr
        self.preface_query = ""

    @staticmethod
    def hack_dates():
        if XSD.gYear in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYear)
        if XSD.gYearMonth in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYearMonth)
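
    # Rationale (added note): rdflib's default datatype mapping converts
    # gYear/gYearMonth literals into Python values, which can alter them on a
    # load/serialize round trip (see the issue linked in load()).  A minimal,
    # hypothetical illustration:
    #
    #   from rdflib import Literal
    #   Storer.hack_dates()
    #   lit = Literal("2015", datatype=XSD.gYear)
    #   lit.toPython()  # now yields the plain lexical form, unconverted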

    def store_graphs_in_file(self, file_path, context_path):
        self.repok.new_article()
        self.reperr.new_article()
        self.repok.add_sentence(
            "Store the graphs into a file: starting process")

        cg = ConjunctiveGraph()
        for g in self.g:
            cg.addN([item + (g.identifier, ) for item in list(g)])

        self.__store_in_file(cg, file_path, context_path)

    def store_all(self,
                  base_dir,
                  base_iri,
                  context_path,
                  tmp_dir=None,
                  g_set=None,
                  override=False,
                  remove_data=False):
        # Avoid a mutable default argument
        if g_set is None:
            g_set = []
        for g in g_set:
            self.g += [g]

        self.repok.new_article()
        self.reperr.new_article()

        self.repok.add_sentence("Starting the process")

        processed_graphs = {}
        for cur_g in self.g:
            result = self.store(cur_g, base_dir, base_iri, context_path,
                                tmp_dir, override, processed_graphs, False,
                                remove_data)
            if result is not None:  # 'store' returns None when storing fails
                processed_graphs = result

        stored_graph_path = []
        for cur_file_path in processed_graphs:
            stored_graph_path += [cur_file_path]

            self.__store_in_file(processed_graphs[cur_file_path],
                                 cur_file_path, context_path)

        return stored_graph_path

    def upload_and_store(self,
                         base_dir,
                         triplestore_url,
                         base_iri,
                         context_path,
                         tmp_dir=None,
                         g_set=None,
                         override=False):

        stored_graph_path = self.store_all(base_dir, base_iri, context_path,
                                           tmp_dir, g_set, override)

        # If some graphs were not stored properly, none of them will be uploaded
        # to the triplestore; instead, those that could have been added in
        # principle are highlighted with a ".notuploaded" marker file
        if None in stored_graph_path:
            for file_path in stored_graph_path:
                # Create a marker for each file that was not uploaded into the triplestore
                open("%s.notuploaded" % file_path, "w").close()
                self.reperr.add_sentence(
                    "[6] "
                    "The statements in the JSON-LD file '%s' were not "
                    "uploaded into the triplestore." % file_path)
        else:  # All the files have been stored
            self.upload_all(self.g, triplestore_url, base_dir)

    def query(self,
              query_string,
              triplestore_url,
              n_statements=None,
              base_dir=None):
        if query_string != "":
            try:
                tp = SPARQLWrapper(triplestore_url)
                tp.setMethod('POST')
                tp.setQuery(query_string)
                tp.query()

                if n_statements is None:
                    self.repok.add_sentence(
                        "Triplestore updated by means of a SPARQL Update query."
                    )
                else:
                    self.repok.add_sentence(
                        "Triplestore updated with %s more RDF statements." %
                        n_statements)

                return True

            except Exception as e:
                self.reperr.add_sentence(
                    "[1] "
                    "Graph was not loaded into the "
                    "triplestore due to communication problems: %s" % str(e))
                if base_dir is not None:
                    tp_err_dir = base_dir + os.sep + "tp_err"
                    if not os.path.exists(tp_err_dir):
                        os.makedirs(tp_err_dir)
                    cur_file_err = tp_err_dir + os.sep + \
                                   datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f_not_uploaded.txt')
                    with io.open(cur_file_err, "w", encoding="utf-8") as f:
                        f.write(query_string)

        return False

    def do_action_all(self, all_g, triplestore_url, base_dir, query_f):
        result = True

        self.repok.new_article()
        self.reperr.new_article()

        query_string = None
        total_new_statements = None

        for idx, cur_g in enumerate(all_g):
            # Send the accumulated query to the triplestore every ten graphs
            if idx % 10 == 0:
                if query_string is not None:
                    result &= self.query(query_string, triplestore_url,
                                         total_new_statements, base_dir)
                query_string = ""
                total_new_statements = 0
            else:
                query_string += " ; "

            # Count every graph's statements, including the first of each batch
            total_new_statements += len(cur_g)
            query_string += self.get_preface_query(cur_g) + query_f(cur_g)

        if query_string is not None and query_string != "":
            result &= self.query(query_string, triplestore_url,
                                 total_new_statements, base_dir)

        return result
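
    # Batching sketch (added note): graphs are flushed to the endpoint in
    # groups of ten, so e.g. 25 graphs produce three requests of 10, 10 and 5
    # update blocks, joined with " ; " inside each request.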

    def update_all(self, all_add_g, all_remove_g, triplestore_url, base_dir):
        return self.do_action_all(all_remove_g, triplestore_url, base_dir, Storer._make_delete_query) and \
               self.upload_all(all_add_g, triplestore_url, base_dir)

    def upload_all(self, all_g, triplestore_url, base_dir):
        return self.do_action_all(all_g, triplestore_url, base_dir,
                                  Storer._make_insert_query)

    def execute_upload_query(self, query_string, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        return self.query(query_string, triplestore_url)

    def upload(self, cur_g, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        query_string = Storer._make_insert_query(cur_g)

        return self.query(query_string, triplestore_url, len(cur_g))

    def set_preface_query(self, query_string):
        self.preface_query = query_string

    def get_preface_query(self, cur_g):
        if self.preface_query != "":
            if isinstance(cur_g.identifier, BNode):
                return "CLEAR DEFAULT ; "
            else:
                return "WITH <%s> " % str(
                    cur_g.identifier) + self.preface_query + " ; "
        else:
            return ""

    @staticmethod
    def _make_insert_query(cur_g):
        return Storer.__make_query(cur_g, "INSERT")

    @staticmethod
    def _make_delete_query(cur_g):
        return Storer.__make_query(cur_g, "DELETE")

    @staticmethod
    def __make_query(cur_g, query_type="INSERT"):
        if isinstance(cur_g.identifier, BNode):
            return "%s DATA { %s }" % (
                query_type, cur_g.serialize(format="nt11",
                                            encoding="utf-8").decode("utf-8"))
        else:
            return "%s DATA { GRAPH <%s> { %s } }" % \
                   (query_type, str(cur_g.identifier), cur_g.serialize(format="nt11", encoding="utf-8").decode("utf-8"))
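
    # Output sketch for __make_query (hypothetical IRI and triple): a named
    # graph yields
    #   INSERT DATA { GRAPH <https://example.org/g1> { <s> <p> "o" . } }
    # while a BNode-identified (default) graph yields an unscoped
    #   INSERT DATA { <s> <p> "o" . }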

    def __store_in_file(self, cur_g, cur_file_path, context_path):
        # Note: the following lines, down to the 'cur_json_ld' assignment, are
        # a workaround to include all the triples of the input graph in the
        # final stored file. Somehow, some of them (in particular the
        # provenance ones) are not written to the file otherwise.
        new_g = ConjunctiveGraph()
        for s, p, o in cur_g.triples((None, None, None)):
            g_iri = None
            for g_context in cur_g.contexts((s, p, o)):
                g_iri = g_context.identifier
                break

            new_g.addN([(s, p, o, g_iri)])

        if not self.nt and not self.nq and context_path:
            cur_json_ld = json.loads(
                new_g.serialize(
                    format="json-ld",
                    context=self.__get_context(context_path)).decode("utf-8"))

            if isinstance(cur_json_ld, dict):
                cur_json_ld["@context"] = context_path
            else:  # it is a list
                for item in cur_json_ld:
                    item["@context"] = context_path

            with open(cur_file_path, "w") as f:
                json.dump(cur_json_ld, f, indent=4, ensure_ascii=False)
        elif self.nt:
            new_g.serialize(cur_file_path, format="nt11", encoding="utf-8")
        elif self.nq:
            new_g.serialize(cur_file_path, format="nquads", encoding="utf-8")

        self.repok.add_sentence("File '%s' added." % cur_file_path)

    def dir_and_file_paths(self, cur_g, base_dir, base_iri):
        cur_subject = set(cur_g.subjects(None, None)).pop()
        # JSON-LD is the output format unless N-Triples/N-Quads were requested
        is_json = not (self.nt or self.nq)
        return find_paths(str(cur_subject),
                          base_dir,
                          base_iri,
                          self.default_dir,
                          self.dir_split,
                          self.n_file_item,
                          is_json=is_json)
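
    # Path derivation sketch (assumption: the exact behaviour is defined by
    # the external find_paths helper): the subject IRI is resolved against
    # base_iri, dir_split bounds how many resources share one directory and
    # n_file_item how many share one file; hypothetically, with
    # dir_split=10000 and n_file_item=1000, resource number 12345 could map
    # to <base_dir>/20000/13000.json.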

    def update(self,
               add_g,
               remove_g,
               base_dir,
               base_iri,
               context_path,
               tmp_dir=None,
               override=False,
               already_processed=None,
               store_now=True):
        self.repok.new_article()
        self.reperr.new_article()

        # A mutable default dict would be shared across calls, so build it here
        if already_processed is None:
            already_processed = {}

        if len(remove_g) > 0:
            cur_dir_path, cur_file_path = self.dir_and_file_paths(
                remove_g, base_dir, base_iri)

            if cur_file_path in already_processed:
                final_g = already_processed[cur_file_path]
            elif os.path.exists(cur_file_path):
                # This is a conjunctive graph that contains all the triples
                # (and graphs) the file actually defines - they could be more
                # than those using 'cur_subject' as subject.
                final_g = self.load(cur_file_path, tmp_dir=tmp_dir)
                already_processed[cur_file_path] = final_g
            else:
                # Nothing has been stored at this path yet: removal is a no-op
                final_g = ConjunctiveGraph()

            for s, p, o, g in [
                    item + (remove_g.identifier, ) for item in list(remove_g)
            ]:
                final_g.remove((s, p, o, g))

        if len(add_g) > 0:
            self.store(add_g, base_dir, base_iri, context_path, tmp_dir,
                       override, already_processed, store_now)
        elif len(remove_g) > 0 and store_now:
            self.__store_in_file(final_g, cur_file_path, context_path)

        return already_processed

    def store(self,
              cur_g,
              base_dir,
              base_iri,
              context_path,
              tmp_dir=None,
              override=False,
              already_processed=None,
              store_now=True,
              remove_data=False):
        self.repok.new_article()
        self.reperr.new_article()

        if already_processed is None:
            already_processed = {}

        if len(cur_g) > 0:
            cur_dir_path, cur_file_path = self.dir_and_file_paths(
                cur_g, base_dir, base_iri)

            try:
                if not os.path.exists(cur_dir_path):
                    os.makedirs(cur_dir_path)

                final_g = ConjunctiveGraph()
                final_g.addN(
                    [item + (cur_g.identifier, ) for item in list(cur_g)])

                # Remove the data
                if remove_data:
                    stored_g = None
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                    elif os.path.exists(cur_file_path):
                        stored_g = self.load(cur_file_path, cur_g, tmp_dir)

                    if stored_g is None:
                        # Nothing stored at this path yet, so nothing to remove
                        stored_g = ConjunctiveGraph()
                    for s, p, o, g in final_g.quads((None, None, None, None)):
                        stored_g.remove((s, p, o, g))

                    final_g = stored_g
                elif not override:  # Merging the data
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                        stored_g.addN(final_g.quads((None, None, None, None)))
                        final_g = stored_g
                    elif os.path.exists(cur_file_path):
                        # This is a conjunctive graph that contains all the
                        # triples (and graphs) the file actually defines - they
                        # could be more than those using 'cur_subject' as
                        # subject.
                        final_g = self.load(cur_file_path, cur_g, tmp_dir)

                already_processed[cur_file_path] = final_g

                if store_now:
                    self.__store_in_file(final_g, cur_file_path, context_path)

                return already_processed
            except Exception as e:
                self.reperr.add_sentence(
                    "[5] It was impossible to store the RDF statements in %s. %s"
                    % (cur_file_path, str(e)))

        return None

    def __get_context(self, context_url):
        if context_url in self.context_map:
            return self.context_map[context_url]
        else:
            return context_url

    def __get_first_context(self):
        for context_url in self.context_map:
            return self.context_map[context_url]

    def load(self, rdf_file_path, cur_graph=None, tmp_dir=None):
        self.repok.new_article()
        self.reperr.new_article()

        if os.path.isfile(rdf_file_path):
            Storer.hack_dates()
            # The line above has been added for handling gYear and gYearMonth correctly.
            # More info at https://github.com/RDFLib/rdflib/issues/806.

            try:
                cur_graph = self.__load_graph(rdf_file_path, cur_graph)
            except IOError:
                if tmp_dir is not None:
                    current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                    shutil.copyfile(rdf_file_path, current_file_path)
                    try:
                        cur_graph = self.__load_graph(current_file_path,
                                                      cur_graph)
                    except IOError as e:
                        self.reperr.add_sentence(
                            "[2] "
                            "It was impossible to handle the format used for "
                            "storing the file (stored in the temporary path) '%s'. "
                            "Additional details: %s" %
                            (current_file_path, str(e)))
                    os.remove(current_file_path)
                else:
                    self.reperr.add_sentence(
                        "[3] "
                        "It was impossible to load the file '%s', and no "
                        "temporary directory for retrying was specified in "
                        "advance." % rdf_file_path)
        else:
            self.reperr.add_sentence(
                "[4] "
                "The file specified ('%s') doesn't exist." % rdf_file_path)

        return cur_graph

    def __load_graph(self, file_path, cur_graph=None):
        formats = ["json-ld", "rdfxml", "turtle", "trig", "nt11", "nquads"]

        current_graph = ConjunctiveGraph()

        if cur_graph is not None:
            current_graph.parse(data=cur_graph.serialize(format="trig"),
                                format="trig")

        errors = ""
        for cur_format in formats:
            try:
                if cur_format == "json-ld":
                    with open(file_path) as f:
                        json_ld_file = json.load(f)
                        if isinstance(json_ld_file, dict):
                            json_ld_file = [json_ld_file]

                        for json_ld_resource in json_ld_file:
                            # Trick to force the use of a pre-loaded context if the format
                            # specified is JSON-LD
                            context_json = None
                            if "@context" in json_ld_resource:
                                cur_context = json_ld_resource["@context"]
                                if cur_context in self.context_map:
                                    context_json = self.__get_context(
                                        cur_context)["@context"]
                                    json_ld_resource["@context"] = context_json

                            current_graph.parse(data=json.dumps(
                                json_ld_resource, ensure_ascii=False),
                                                format=cur_format)
                else:
                    current_graph.parse(file_path, format=cur_format)

                return current_graph
            except Exception as e:
                errors = " | " + str(e)  # Try another format

        raise IOError(
            "1",
            "It was impossible to handle the format used for storing the file '%s'%s"
            % (file_path, errors))
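

# Usage sketch (hypothetical names and URLs; graph_set is assumed to expose a
# .graphs() method, as in the surrounding project):
#
#   s = Storer(graph_set=my_graph_set,
#              context_map={"https://example.org/context.json":
#                           "context.json"})
#   stored = s.store_all("./corpus", "https://example.org/corpus/",
#                        "https://example.org/context.json")
#   s.upload_all(s.g, "http://localhost:3030/ds/update", "./corpus")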
Example #2
                            required=True,
                            help="The output file.")

    args = arg_parser.parse_args()

    repok = Reporter(True, prefix="[creatent.py: INFO] ")
    reperr = Reporter(True, prefix="[creatent.py: ERROR] ")
    repok.new_article()
    reperr.new_article()

    s = Storer(context_map={context_path: context_file_path},
               dir_split=dir_split_number,
               n_file_item=items_per_file,
               default_dir=default_dir)

    for cur_dir, cur_subdir, cur_files in os.walk(args.input):
        with open(args.output, 'a') as f:
            for cur_file in cur_files:
                if match("^[0-9]+\.json", cur_file) is not None:
                    cur_g = s.load(cur_dir + os.sep + cur_file,
                                   tmp_dir=temp_dir_for_rdf_loading)
                    nt_strings = cur_g.serialize(
                        format="nt11", encoding="utf-8").decode("utf-8")
                    f.write(nt_strings)

    repok.add_sentence("Done.")
    if not reperr.is_empty():
        reperr.write_file(
            "creatent.rep.%s.err.txt" %
            (sub("_+", "_", sub(r"[\.%s/]" % os.sep, "_", args.input))))