示例#1
0
    def _download_map(self, trans, down_map, skip_list=None):
        """Download every document in down_map from the server in batches.

        :param trans: active transaction; its txid() is attached to each
            batch request.
        :param down_map: dict keyed by document URI; all keys are fetched.
        :param skip_list: optional list of local file paths that must not
            be deleted during the post-download cleanup.

        After downloading, local files that no URI maps to are deleted
        (non-mirror, non-limited, non-dry runs only).
        """
        # BUG FIX: the default used to be a mutable ``[]``; a None sentinel
        # avoids one shared list across calls.
        if skip_list is None:
            skip_list = []

        filehash = {}
        if not self.mirror:
            print("Reading files from filesystem...")
            for path in self.scan(self.path):
                filehash[path] = 1

        self.logger.debug("Downloading map")
        self.logger.debug(down_map)

        download_count = len(down_map)

        print("Downloading {} documents...".format(download_count))

        def _configure(batch):
            # The Documents object forgets these settings on clear(), so
            # they must be re-applied before every batch (previously this
            # 7-line sequence was duplicated verbatim).
            batch.set_database(self.database)
            batch.set_txid(trans.txid())
            batch.set_format('xml')
            batch.set_accept("multipart/mixed")
            if self.mirror:
                # Mirror runs also fetch metadata so docs can be rebuilt.
                batch.set_categories(['content', 'metadata'])
            else:
                batch.set_category('content')

        docs = Documents(self.connection)
        _configure(docs)

        dlprog = 0
        dlcount = 0
        for uri in down_map.keys():
            dlcount += 1
            docs.add_uri(uri)

            # A file we are about to (re-)download is not stale.
            if uri in filehash:
                del filehash[uri]

            if dlcount >= self.batchsize:
                dlprog += dlcount
                perc = (float(dlprog) / download_count) * 100.0
                print("{0:.0f}% ... {1}/{2} files"
                      .format(perc, dlprog, download_count))

                self._download_batch(docs, down_map)
                docs.clear()
                _configure(docs)
                dlcount = 0

        if dlcount > 0:
            # BUG FIX: this progress line previously reported only the size
            # of the final partial batch (dlcount) rather than cumulative
            # progress, so the last message never read 100%.
            dlprog += dlcount
            perc = (float(dlprog) / download_count) * 100.0
            print("{0:.0f}% ... {1} files".format(perc, dlprog))
            self._download_batch(docs, down_map)

        # Whatever is left in filehash exists locally but not on the
        # server: candidates for deletion, minus the caller's skip list.
        delfiles = []
        for path in filehash.keys():
            localfile = self.path + path
            if localfile not in skip_list:
                delfiles.append(localfile)

        if not self.mirror and delfiles:
            if self.regex or self.list:
                # A regex- or list-limited run only saw part of the server,
                # so the "missing" files may simply be out of scope.
                print("Limited download, not deleting {} files..."
                      .format(len(delfiles)))
            else:
                print("Deleting {} files...".format(len(delfiles)))
                if not self.dryrun:
                    for path in delfiles:
                        os.remove(path)
                    self._remove_empty_dirs(self.path)
示例#2
0
    def _download_map(self, trans, down_map, skip_list=[]):
        """Fetch every URI in down_map in batches, then prune local files.

        Local files not present in down_map (and not in skip_list) are
        removed afterwards, unless mirroring, limited, or a dry run.
        """
        local_files = {}
        if not self.mirror:
            print("Reading files from filesystem...")
            for rel in self.scan(self.path):
                local_files[rel] = 1

        self.logger.debug("Downloading map")
        self.logger.debug(down_map)

        download_count = len(down_map)

        print("Downloading {} documents...".format(download_count))

        def reset(batch):
            # Re-apply the per-request settings lost by clear().
            batch.set_database(self.database)
            batch.set_txid(trans.txid())
            batch.set_format('xml')
            batch.set_accept("multipart/mixed")
            if self.mirror:
                batch.set_categories(['content', 'metadata'])
            else:
                batch.set_category('content')

        docs = Documents(self.connection)
        reset(docs)

        completed = 0
        pending = 0
        for uri in down_map:
            pending += 1
            docs.add_uri(uri)

            # This path is being refreshed, so it is not a stale local file.
            local_files.pop(uri, None)

            if pending >= self.batchsize:
                completed += pending
                perc = (float(completed) / download_count) * 100.0
                print("{0:.0f}% ... {1}/{2} files"
                      .format(perc, completed, download_count))

                self._download_batch(docs, down_map)
                docs.clear()
                reset(docs)
                pending = 0

        if pending > 0:
            perc = (float(pending) / download_count) * 100.0
            print("{0:.0f}% ... {1} files".format(perc, pending))
            self._download_batch(docs, down_map)

        stale = []
        for rel in local_files:
            candidate = self.path + rel
            if candidate not in skip_list:
                stale.append(candidate)

        if not self.mirror and stale:
            if self.regex or self.list:
                print("Limited download, not deleting {} files..."
                      .format(len(stale)))
            else:
                print("Deleting {} files...".format(len(stale)))
                if not self.dryrun:
                    for victim in stale:
                        os.remove(victim)
                    self._remove_empty_dirs(self.path)
示例#3
0
    def _upload_map(self, trans, upload_map):
        """Upload from an internally constructed map.

        ``upload_map`` maps a document path to a dict with at least a
        'content' file path and optionally 'metadata' / 'uuid' entries.
        Files whose server timestamp is newer than the local mtime are
        skipped.  After uploading, server URIs under ``self.root`` with no
        corresponding local file are deleted (subject to the dry-run and
        limited-run flags).

        :param trans: active transaction; its txid() is attached to the
            bulk loader and the deletion request.
        :param upload_map: mutated in place — up-to-date keys are removed.
        :raises RuntimeError: if a uuid-addressed document's metadata
            carries no URI element.
        """
        print("Reading URIs from server...")
        uris = self.utils.uris(self.database)
        # Every server URI starts here; entries are removed as they are
        # re-uploaded, so the leftovers are the deletion candidates.
        urihash = {}
        for uri in uris:
            urihash[uri] = 1

        print("Getting timestamps from server...")
        stamps = self.get_timestamps(list(upload_map))
        if not stamps:
            print("No timestamps, assuming all files newer.")
        else:
            uptodate = []
            for key in upload_map:
                if key in stamps:
                    source = upload_map[key]['content']
                    statinfo = os.stat(source)
                    stamp = self._convert_timestamp(stamps[key])
                    # Local file older than the server copy => up-to-date.
                    if statinfo.st_mtime < stamp.timestamp():
                        uptodate.append(key)

            if uptodate:
                print("{} documents are up-to-date...".format(len(uptodate)))
                for key in uptodate:
                    del upload_map[key]
                    # Keep the server copy: drop it from the deletion set.
                    del urihash[key]

        upload_count = len(upload_map)
        print("Uploading {} files...".format(upload_count))

        docs = Documents(self.connection)

        bulk = BulkLoader(self.connection)
        bulk.set_database(self.database)
        bulk.set_txid(trans.txid())

        files = list(upload_map.keys())
        done = not files
        upload_size = 0
        ulcount = 0
        while not done:
            doc = files.pop(0)
            done = not files

            docs.clear()

            source = upload_map[doc]['content']
            target = self.root + doc

            body_content_type = "application/octet-stream"

            if 'metadata' in upload_map[doc]:
                # The metadata file supplies the content-type (and, for
                # uuid-addressed documents, the real target URI); both
                # elements are stripped before the remaining metadata is
                # sent along with the document.
                metasource = upload_map[doc]['metadata']
                metaxml = ET.parse(metasource)
                root = metaxml.getroot()

                txml = root \
                  .find("{http://marklogic.com/ns/mldbmirror/}content-type")
                if txml is not None:
                    body_content_type = txml.text
                    root.remove(txml)

                if 'uuid' in upload_map[doc] and upload_map[doc]['uuid']:
                    txml = root \
                      .find("{http://marklogic.com/ns/mldbmirror/}uri")
                    if txml is None:
                        raise RuntimeError("No URI provided in metadata.")
                    else:
                        target = txml.text
                        root.remove(txml)

                text = ET.tostring(root, encoding="unicode", method="xml")
                docs.set_metadata(text, "application/xml")
            else:
                # No metadata file: derive content-type, permissions and
                # collections from the configured match rules.  A plain
                # "permissions"/"collections" entry replaces the current
                # value; the "+" variants append to it.  Later matching
                # rules override earlier ones.
                collections = []
                permissions = []

                for cfg in self.config["config"]:
                    if isinstance(cfg["match"], list):
                        matches = cfg["match"]
                    else:
                        matches = [cfg["match"]]

                    for match in matches:
                        if re.match(match, target):
                            if "content-type" in cfg:
                                body_content_type = cfg["content-type"]
                            if "permissions" in cfg:
                                permissions = cfg["permissions"]
                            if "permissions+" in cfg:
                                permissions = permissions + cfg["permissions+"]
                            if "collections" in cfg:
                                collections = cfg["collections"]
                            if "collections+" in cfg:
                                collections = collections + cfg["collections+"]

                docs.set_collections(collections)
                docs.set_permissions(None)
                for perm in permissions:
                    for role in perm:
                        docs.add_permission(role, perm[role])

            if target in urihash:
                del urihash[target]

            ulcount += 1
            statinfo = os.stat(source)
            upload_size += statinfo.st_size

            docs.set_uri(target)

            # BUG FIX: was open()/read()/close(); the context manager
            # guarantees the handle is closed even if set_content raises.
            with open(source, "rb") as datafile:
                docs.set_content(datafile.read(), body_content_type)

            bulk.add(docs)

            if self.verbose:
                print("-> {}".format(target))

            if upload_size > self.threshold:
                perc = (float(ulcount) / upload_count) * 100.0
                print("{0:.0f}% ... {1} files, {2} bytes" \
                          .format(perc, bulk.size(), upload_size))
                if self.dryrun:
                    bulk.clear_content()
                else:
                    bulk.post()
                upload_size = 0

        if bulk.size() > 0:
            # Flush the final partial batch.
            perc = (float(ulcount) / upload_count) * 100.0
            print("{0:.0f}% ... {1} files, {2} bytes" \
                      .format(perc, bulk.size(), upload_size))
            if self.dryrun:
                bulk.clear_content()
            else:
                bulk.post()

        # Anything still in urihash exists on the server but not locally;
        # delete it, scoped to self.root, unless this is a limited run.
        docs.clear()
        docs.set_txid(trans.txid())
        docs.set_database(self.database)
        delcount = 0
        for uri in urihash:
            if uri.startswith(self.root):
                if self.verbose:
                    print("DEL {}".format(uri))
                docs.add_uri(uri)
                delcount += 1

        if delcount > 0:
            if self.regex or self.list:
                # BUG FIX: message was copy-pasted from the download path
                # ("Limited download ... files"); this is the upload path
                # deleting server URIs.
                print("Limited upload, not deleting {} URIs..." \
                          .format(delcount))
            else:
                print("Deleting {} URIs...".format(delcount))
                if not self.dryrun:
                    docs.delete()
示例#4
0
    def _upload_map(self, trans, upload_map):
        """Upload from an internally constructed map.

        ``upload_map`` maps a document path to a dict with at least a
        'content' file path and optionally 'metadata' / 'uuid' entries.
        Files whose server timestamp is newer than the local mtime are
        skipped (and ``upload_map`` is trimmed in place).  After uploading,
        server URIs under ``self.root`` with no corresponding local file
        are deleted, subject to the dry-run and limited-run flags.

        Raises RuntimeError if a uuid-addressed document's metadata
        carries no URI element.
        """
        print("Reading URIs from server...")
        uris = self.utils.uris(self.database)
        # Every server URI starts in urihash; entries are removed as they
        # are re-uploaded, so the leftovers become deletion candidates.
        urihash = {}
        for uri in uris:
            urihash[uri] = 1

        print("Getting timestamps from server...")
        stamps = self.get_timestamps(list(upload_map))
        if not stamps:
            print("No timestamps, assuming all files newer.")
        else:
            uptodate = []
            for key in upload_map:
                if key in stamps:
                    source = upload_map[key]['content']
                    statinfo = os.stat(source)
                    stamp = self._convert_timestamp(stamps[key])
                    # Local file older than the server copy => up-to-date.
                    if statinfo.st_mtime < stamp.timestamp():
                        uptodate.append(key)

            if uptodate:
                print("{} documents are up-to-date...".format(len(uptodate)))
                for key in uptodate:
                    del upload_map[key]
                    del urihash[key]    # remove it from this list so we don't delete it

        upload_count = len(upload_map)
        print("Uploading {} files...".format(upload_count))

        docs = Documents(self.connection)

        bulk = BulkLoader(self.connection)
        bulk.set_database(self.database)
        bulk.set_txid(trans.txid())

        files = list(upload_map.keys())
        done = not files
        upload_size = 0
        ulcount = 0
        while not done:
            doc = files.pop(0)
            done = not files

            # docs is reused across iterations; wipe the previous state.
            docs.clear()

            source = upload_map[doc]['content']
            target = self.root + doc

            body_content_type = "application/octet-stream"

            if 'metadata' in upload_map[doc]:
                # The metadata file supplies the content-type (and, for
                # uuid-addressed documents, the real target URI); both
                # elements are stripped before the remaining metadata is
                # sent along with the document.
                metasource = upload_map[doc]['metadata']
                metaxml = ET.parse(metasource)
                root = metaxml.getroot()

                txml = root \
                  .find("{http://marklogic.com/ns/mldbmirror/}content-type")
                if txml is not None:
                    body_content_type = txml.text
                    root.remove(txml)

                if 'uuid' in upload_map[doc] and upload_map[doc]['uuid']:
                    txml = root \
                      .find("{http://marklogic.com/ns/mldbmirror/}uri")
                    if txml is None:
                        raise RuntimeError("No URI provided in metadata.")
                    else:
                        target = txml.text
                        root.remove(txml)

                text = ET.tostring(root, encoding="unicode", method="xml")
                docs.set_metadata(text, "application/xml")
            else:
                # No metadata file: derive content-type, permissions and
                # collections from the configured match rules.  A plain
                # "permissions"/"collections" entry replaces the current
                # value; the "+" variants append to it.
                metasource = None
                collections = []
                permissions = []

                for cfg in self.config["config"]:
                    if type(cfg["match"]) is list:
                        matches = cfg["match"]
                    else:
                        matches = [cfg["match"]]

                    for match in matches:
                        if re.match(match, target):
                            if "content-type" in cfg:
                                body_content_type = cfg["content-type"]
                            if "permissions" in cfg:
                                permissions = cfg["permissions"]
                            if "permissions+" in cfg:
                                permissions = permissions + cfg["permissions+"]
                            if "collections" in cfg:
                                collections = cfg["collections"]
                            if "collections+" in cfg:
                                collections = collections + cfg["collections+"]

                docs.set_collections(collections)
                docs.set_permissions(None)
                for perm in permissions:
                    for key in perm:
                        docs.add_permission(key, perm[key])

            # The URI is being refreshed, so keep it off the deletion list.
            if target in urihash:
                del urihash[target]

            ulcount += 1
            statinfo = os.stat(source)
            upload_size += statinfo.st_size

            docs.set_uri(target)

            datafile = open(source, "rb")
            docs.set_content(datafile.read(), body_content_type)
            datafile.close()

            bulk.add(docs)

            if self.verbose:
                print("-> {}".format(target))

            # Post once the accumulated payload exceeds the size threshold.
            if upload_size > self.threshold:
                perc = (float(ulcount) / upload_count) * 100.0
                print("{0:.0f}% ... {1} files, {2} bytes" \
                          .format(perc, bulk.size(), upload_size))
                if self.dryrun:
                    bulk.clear_content()
                else:
                    bulk.post()
                upload_size = 0

        # Flush the final partial batch, if any.
        if bulk.size() > 0:
            perc = (float(ulcount) / upload_count) * 100.0
            print("{0:.0f}% ... {1} files, {2} bytes" \
                      .format(perc, bulk.size(), upload_size))
            if self.dryrun:
                bulk.clear_content()
            else:
                bulk.post()

        # Anything still in urihash exists on the server but not locally;
        # delete it, scoped to self.root, unless this is a limited run.
        docs.clear()
        docs.set_txid(trans.txid())
        docs.set_database(self.database)
        delcount = 0
        for uri in urihash:
            if uri.startswith(self.root):
                if self.verbose:
                    print("DEL {}".format(uri))
                docs.add_uri(uri)
                delcount += 1

        if delcount > 0:
            if self.regex or self.list:
                # NOTE(review): message says "download" inside the upload
                # path — presumably copy-pasted from _download_map; confirm.
                print("Limited download, not deleting {} files..." \
                          .format(delcount))
            else:
                print("Deleting {} URIs...".format(delcount))
                if not self.dryrun:
                    docs.delete()