Example #1
File: client.py  Project: EHRI/resync
    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings

        paths - override paths from mappings if specified

        set_path - set true to set the path information for each resource 
            included. This is used to build a resource list as the basis
            for creating a dump.

        Return ResourceList. Uses existing self.mapper settings.
        """
        # 0. Sanity checks, parse paths if specified
        if (len(self.mapper)<1):
            raise ClientFatalError("No source to destination mapping specified")
        if (paths is not None):
            # Expect comma separated list of paths
            paths=paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum,mapper=self.mapper)
        rlb.set_path=set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if (self.max_sitemap_entries is not None):
            rl.max_sitemap_entries = self.max_sitemap_entries
        return(rl)
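
A minimal invocation sketch for the method above, assuming a resync 1.x style Client whose mapping has been set with set_mappings; the paths argument is the comma separated string described in the docstring, and the URL and directories are illustrative only.

from resync.client import Client

client = Client()
client.set_mappings(['http://example.org/t', '/var/data/t'])   # source URI -> local dir
client.checksum = True          # read above as self.checksum, so md5 values get set
# Override the mapped path with two sub-directories and record local paths,
# e.g. as the basis for building a dump.
rl = client.build_resource_list(paths='/var/data/t/dir1,/var/data/t/dir2',
                                set_path=True)
print("%d resources found" % len(rl))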
Example #2
def sync_audit(map, counter):
    """Run resync audit."""
    client = Client()
    # ignore failures so the run continues; log them later
    client.ignore_failures = True
    client.set_mappings(map)
    # init_logging(verbose=True)
    src_resource_list = client.find_resource_list()
    rlb = ResourceListBuilder(mapper=client.mapper)
    dst_resource_list = rlb.from_disk()
    # Compare these resource lists respecting any comparison options
    (same, updated, deleted,
     created) = dst_resource_list.compare(src_resource_list)
    result = dict(created=[], updated=[], deleted=[])
    for item in created:
        record_id = item.uri.rsplit('/', 1)[1]
        result['created'].append(record_id)
    for item in updated:
        record_id = item.uri.rsplit('/', 1)[1]
        result['updated'].append(record_id)
    for item in deleted:
        record_id = item.uri.rsplit('/', 1)[1]
        result['deleted'].append(record_id)
    update_counter(counter, result)
    return dict(same=len(same),
                updated=len(updated),
                deleted=len(deleted),
                created=len(created))
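
The helper above depends on two names from its surrounding project, Client and update_counter, so only an outline of a call can be sketched; the URL, local directory and counter shape are assumptions, and the mapping pairs the remote resource list with the local mirror directory.

counter = {}   # whatever update_counter() expects; a plain dict is assumed here
summary = sync_audit(['http://example.org/resourcelist.xml', '/var/mirror/example'],
                     counter)
print(summary)   # {'same': ..., 'updated': ..., 'deleted': ..., 'created': ...}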
Example #3
 def test2_pretty_output(self):
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
     rl = rlb.from_disk()
     rl.md["modified"] = None  # don't write so we can test output easily
     self.assertEqual(
         rl.as_xml(pretty_xml=True),
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<rs:md capability="resourcelist" />\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:md length="20" /></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:md length="45" /></url>\n</urlset>',
     )
Example #4
 def test05_from_disk_paths(self):
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     # no path, should get no resources
     rl = rlb.from_disk(paths=[])
     self.assertEqual( len(rl), 0)
     # full path, 2 resources
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
     self.assertEqual( len(rl), 2)
     # new object with mapper covering larger space of disk
     rlb = ResourceListBuilder(set_path=True)
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata'])
     # same path with 2 resources
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
     self.assertEqual( len(rl), 2)
     # same path with 2 resources
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1','resync/test/testdata/dir2'])
     self.assertEqual( len(rl), 3)
     # path that is just a single file
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1/file_a'])
     self.assertEqual( len(rl), 1)
     rli = iter(rl)
     r = next(rli)
     self.assertTrue( r is not None )
     self.assertEqual( r.uri, 'http://example.org/t/dir1/file_a' )
     self.assertEqual( r.lastmod, '2012-07-25T17:13:46Z' )
     self.assertEqual( r.md5, None )
     self.assertEqual( r.length, 20 )
     self.assertEqual( r.path, 'resync/test/testdata/dir1/file_a' ) 
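
The URIs asserted in the test above come straight from the Mapper: once the base mapping covers resync/test/testdata, files under dir1 keep their dir1/ prefix in the URI. A small sketch of the two-way mapping, using src_to_dst as seen in the client code later on this page and dst_to_src, which from_disk is assumed to use internally:

from resync.mapper import Mapper

m = Mapper(['http://example.org/t', 'resync/test/testdata'])
# local file path -> URI, as done for every file found during the disk scan
print(m.dst_to_src('resync/test/testdata/dir1/file_a'))
# -> http://example.org/t/dir1/file_a
# URI -> local file path, as the client does when writing fetched resources
print(m.src_to_dst('http://example.org/t/dir1/file_a'))
# -> resync/test/testdata/dir1/file_a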
Example #5
 def test04_data(self):
     rlb = ResourceListBuilder(set_path=True,set_md5=True)
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual( len(rl), 2)
     r = rl.resources.get('http://example.org/t/file_a')
     self.assertTrue( r is not None )
     self.assertEqual( r.uri, 'http://example.org/t/file_a' )
     self.assertEqual( r.lastmod, '2012-07-25T17:13:46Z' )
     self.assertEqual( r.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==' )
     self.assertEqual( r.path, 'resync/test/testdata/dir1/file_a' ) 
Example #6
 def test4_data(self):
     rlb = ResourceListBuilder(do_md5=True)
     rlb.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
     rl = rlb.from_disk(set_path=True)
     self.assertEqual(len(rl), 2)
     r1 = rl.resources.get("http://example.org/t/file_a")
     self.assertTrue(r1 is not None)
     self.assertEqual(r1.uri, "http://example.org/t/file_a")
     self.assertEqual(r1.lastmod, "2012-07-25T17:13:46Z")
     self.assertEqual(r1.md5, "a/Jv1mYBtSjS4LR+qoft/Q==")
     self.assertEqual(r1.path, "resync/test/testdata/dir1/file_a")
Example #7
 def test04_data(self):
     rlb = ResourceListBuilder(set_path=True, set_md5=True)
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     r = rl.resources.get('http://example.org/t/file_a')
     self.assertTrue(r is not None)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
     self.assertEqual(r.path, 'tests/testdata/dir1/file_a')
Example #8
 def test04_data(self):
     rlb = ResourceListBuilder(set_path=True, set_hashes=['md5'])
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     r = rl.resources.get('http://example.org/t/file_a')
     self.assertTrue(r is not None)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
     self.assertEqual(r.path, 'tests/testdata/dir1/file_a')
Example #9
    def resource_list(self):
        """Return resource_list on disk based on current mappings

        Return resource_list. Uses existing self.mapper settings.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        rlb = ResourceListBuilder(do_md5=self.checksum,mapper=self.mapper)
        rlb.add_exclude_files(self.exclude_patterns)
        return( rlb.from_disk() )
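
add_exclude_files, called above, is the hook for skipping files during the disk scan. A short sketch in the same style as this example; the pattern strings and directory are illustrative, the import paths assume resync's usual module layout, and the exact matching rules live inside ResourceListBuilder and vary between versions:

from resync.mapper import Mapper
from resync.resource_list_builder import ResourceListBuilder

mapper = Mapper(['http://example.org/t', '/var/data/t'])
rlb = ResourceListBuilder(do_md5=True, mapper=mapper)
rlb.add_exclude_files(['*.tmp', '.DS_Store'])   # assumed glob-style patterns
rl = rlb.from_disk()
print("%d resources after exclusions" % len(rl))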
Example #10
 def test06_odd_file_names(self):
     """Verfify we can read unicode file names properly."""
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(['x:', 'tests/testdata/odd_file_names'])
     rl = rlb.from_disk(paths=['tests/testdata/odd_file_names'])
     # Get list of URIs to test
     uris = [x.uri for x in rl]
     self.assertTrue('x:/not_odd.txt' in uris)
     self.assertTrue('x:/with&ampersand.txt' in uris)
     self.assertTrue('x:/with spaces.txt' in uris)
     # File names for accented chars represented with combining chars
     self.assertTrue(u'x:/Pi\u006e\u0303a_Colada.txt' in uris)
     self.assertFalse(u'x:/Pi\u00f1a_Colada.txt' in uris)
     self.assertTrue(u'x:/A_\u0041\u0303_tilde.txt' in uris)
     self.assertFalse(u'x:/A_\u00c3_tilde.txt' in uris)
     # Snowman is single char
     self.assertFalse(u'x:snowman_\u2603.txt' in uris)
Example #11
 def test03_set_hashes(self):
     rlb = ResourceListBuilder(set_hashes=['md5'])
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     rli = iter(rl)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
     self.assertEqual(r.length, 20)
     self.assertEqual(r.path, None)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_b')
     self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
     self.assertEqual(r.md5, '452e54bdae1626ac5d6e7be81b39de21')
     self.assertEqual(r.length, 45)
     self.assertEqual(r.path, None)
Example #12
 def test03_set_md5(self):
     rlb = ResourceListBuilder(set_md5=True)
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual( len(rl), 2 )
     rli = iter(rl)
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_a' )
     self.assertEqual( r.lastmod, '2012-07-25T17:13:46Z' )
     self.assertEqual( r.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==' )
     self.assertEqual( r.length, 20 )
     self.assertEqual( r.path, None )
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_b' )
     self.assertEqual( r.lastmod, '2001-09-09T01:46:40Z' )
     self.assertEqual( r.md5, 'RS5Uva4WJqxdbnvoGzneIQ==' )
     self.assertEqual( r.length, 45 )
     self.assertEqual( r.path, None )
Example #13
 def test02_no_length(self):
     rlb = ResourceListBuilder(set_length=False)
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual( len(rl), 2 )
     rli = iter(rl)
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_a' )
     self.assertEqual( r.lastmod, '2012-07-25T17:13:46Z' )
     self.assertEqual( r.md5, None )
     self.assertEqual( r.length, None )
     self.assertEqual( r.path, None )
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_b' )
     self.assertEqual( r.lastmod, '2001-09-09T01:46:40Z' )
     self.assertEqual( r.md5, None )
     self.assertEqual( r.length, None )
     self.assertEqual( r.path, None )
Example #14
 def test03_set_md5(self):
     rlb = ResourceListBuilder(set_md5=True)
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     rli = iter(rl)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
     self.assertEqual(r.length, 20)
     self.assertEqual(r.path, None)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_b')
     self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
     self.assertEqual(r.md5, 'RS5Uva4WJqxdbnvoGzneIQ==')
     self.assertEqual(r.length, 45)
     self.assertEqual(r.path, None)
Example #15
 def test02_no_length(self):
     rlb = ResourceListBuilder(set_length=False)
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     rli = iter(rl)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, None)
     self.assertEqual(r.length, None)
     self.assertEqual(r.path, None)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_b')
     self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
     self.assertEqual(r.md5, None)
     self.assertEqual(r.length, None)
     self.assertEqual(r.path, None)
Example #16
 def test3_with_md5(self):
     rlb = ResourceListBuilder(do_md5=True)
     rlb.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
     rl = rlb.from_disk()
     xml = rl.as_xml()
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:md hash="md5:a/Jv1mYBtSjS4LR\+qoft/Q==" length="20" />',
             xml,
         ),
     )  # must escape + in md5
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:md hash="md5:RS5Uva4WJqxdbnvoGzneIQ==" length="45" />',
             xml,
         ),
     )
Example #17
 def test01_simple_scan(self):
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual(len(rl), 2)
     rli = iter(rl)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, None)
     self.assertEqual(r.length, 20)
     self.assertEqual(r.path, None)
     r = next(rli)
     self.assertEqual(r.uri, 'http://example.org/t/file_b')
     self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
     self.assertEqual(r.md5, None)
     self.assertEqual(r.length, 45)
     self.assertEqual(r.path, None)
     # Make sure at and completed were set
     self.assertTrue(rl.md_at is not None)
     self.assertTrue(rl.md_completed is not None)
Example #18
 def test01_simple_scan(self):
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     rl = rlb.from_disk()
     self.assertEqual( len(rl), 2 )
     rli = iter(rl)
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_a' )
     self.assertEqual( r.lastmod, '2012-07-25T17:13:46Z' )
     self.assertEqual( r.md5, None )
     self.assertEqual( r.length, 20 )
     self.assertEqual( r.path, None )
     r = next(rli)
     self.assertEqual( r.uri, 'http://example.org/t/file_b' )
     self.assertEqual( r.lastmod, '2001-09-09T01:46:40Z' )
     self.assertEqual( r.md5, None )
     self.assertEqual( r.length, 45 )
     self.assertEqual( r.path, None )
     # Make sure at and completed were set
     self.assertTrue( rl.md_at is not None )
     self.assertTrue( rl.md_completed is not None )
Example #19
    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings

        paths - override paths from mappings if specified

        set_path - set true to set the path information for each resource 
            included. This is used to build a resource list as the basis
            for creating a dump.

        Return ResourceList. Uses existing self.mapper settings.
        """
        # 0. Sanity checks, parse paths if specified
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (paths is not None):
            # Expect comma separated list of paths
            paths = paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        rlb.set_path = set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if (self.max_sitemap_entries is not None):
            rl.max_sitemap_entries = self.max_sitemap_entries
        return (rl)
Example #20
 def test05_from_disk_paths(self):
     rlb = ResourceListBuilder()
     rlb.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     # no path, should get no resources
     rl = rlb.from_disk(paths=[])
     self.assertEqual(len(rl), 0)
     # full path, 2 resources
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
     self.assertEqual(len(rl), 2)
     # new object with mapper covering larger space of disk
     rlb = ResourceListBuilder(set_path=True)
     rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata'])
     # same path with 2 resources
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
     self.assertEqual(len(rl), 2)
     # same path with 2 resources
     rl = rlb.from_disk(
         paths=['resync/test/testdata/dir1', 'resync/test/testdata/dir2'])
     self.assertEqual(len(rl), 3)
     # path that is just a single file
     rl = rlb.from_disk(paths=['resync/test/testdata/dir1/file_a'])
     self.assertEqual(len(rl), 1)
     rli = iter(rl)
     r = next(rli)
     self.assertTrue(r is not None)
     self.assertEqual(r.uri, 'http://example.org/t/dir1/file_a')
     self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(r.md5, None)
     self.assertEqual(r.length, 20)
     self.assertEqual(r.path, 'resync/test/testdata/dir1/file_a')
Example #21
    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ('audit' if (audit_only) else 'baseline sync')
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (not audit_only and self.mapper.unsafe()):
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(
                allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source resource list from %s (%s)" %
                (self.sitemap, str(e)))
        self.logger.info("Read source resource list, %d resources listed" %
                         (len(src_resource_list)))
        if (len(src_resource_list) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source resource list"
            )
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same, updated, deleted,
         created) = dst_resource_list.compare(src_resource_list)
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        audit=True,
                        same=len(same),
                        created=len(created),
                        updated=len(updated),
                        deleted=len(deleted))
        if (audit_only or len(created) + len(updated) + len(deleted) == 0):
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if (not self.noauth):
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if (not uauth.has_authority_over(resource.uri)):
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" %
                      len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" %
                            (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            num_created += self.update_resource(resource, file, 'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            num_updated += self.update_resource(resource, file, 'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted += self.delete_resource(resource, file, allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp > 0):
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        same=len(same),
                        created=num_created,
                        updated=num_updated,
                        deleted=num_deleted,
                        to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))
Example #22
    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ( 'audit' if (audit_only) else 'baseline sync' ) 
        self.logger.debug("Starting "+action)
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst 
        # 1.a source resource_list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource_list from %s (%s)" % (self.sitemap,str(e)))
        self.logger.info("Read source resource_list, %d resources listed" % (len(src_resource_list)))
        if (len(src_resource_list)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resource_list")
        # 1.b destination resource_list mapped back to source URIs
        rlb = ResourceListBuilder(mapper=self.mapper)
        rlb.do_md5=self.checksum
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource_lists respecting any comparison options
        (same,updated,deleted,created)=dst_resource_list.compare(src_resource_list)   
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        audit=True,same=len(same),created=len(created),
                        updated=len(updated),deleted=len(deleted))
        if (audit_only or len(created)+len(updated)+len(deleted)==0):
            self.logger.debug("Completed "+action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resource_list:
            if (not uauth.has_authority_over(resource.uri)):
                if (self.noauth):
                    #self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" % (len(created)+len(updated),delete_msg))
        self.last_timestamp = 0
        num_created=0
        num_updated=0
        num_deleted=0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri,file))
            num_created+=self.update_resource(resource,file,'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri,file))
            num_updated+=self.update_resource(resource,file,'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted+=self.delete_resource(resource,file,allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        same=len(same),created=num_created,
                        updated=num_updated,deleted=num_deleted)
        self.logger.debug("Completed %s" % (action))
Example #23
    def base_line(self, unzipdir):
        """
        Synchronize the unzipped contents of a resource dump with the local resources
        :param unzipdir: the directory of the unzipped packed contents.
        :return:
        """
        manifest_file_name = os.path.join(unzipdir, "manifest.xml")
        try:
            sitemap = Sitemap()
            manifest_doc = sitemap.parse_xml(fh=manifest_file_name)
            # the manifest_doc is a resync.resource_container.ResourceContainer
            capability = manifest_doc.capability
            assert capability == CAPA_RESOURCEDUMP_MANIFEST, "Capability is not %s but %s" % (
                CAPA_RESOURCEDUMP_MANIFEST, capability)
            self.status = Status.parsed
            self.__inform_sitemap_received__(capability, manifest_file_name)

            config = Config()
            netloc = config.boolean_prop(Config.key_use_netloc, False)
            base_uri, destination = DestinationMap().find_destination(
                self.pack_uri, netloc=netloc)
            assert destination is not None, "Found no destination folder in DestinationMap"
            mapper = Mapper((base_uri, destination))
            rlb = ResourceListBuilder(mapper=mapper)
            dst_resource_list = rlb.from_disk()
            # Compares on uri
            same, updated, deleted, created = dst_resource_list.compare(
                manifest_doc)

            raise NotImplementedError("This class is not fully implemented.")

            print(len(same), len(updated), len(deleted), len(created))

            print("same")
            for resource in same:
                print(resource)
            print("updated")
            for resource in updated:
                print(resource)
            print("deleted")
            for resource in deleted:
                print(resource)
            print("created")
            for resource in created:
                print(resource)
                base_uri, local_path = DestinationMap().find_local_path(
                    resource.uri)
                print(base_uri, local_path)

        except AssertionError as err:
            self.logger.debug("%s Error: %s" % (self.pack_uri, str(err)))
            self.status = Status.parse_error
            self.exceptions.append(err)
        except SitemapParseError as err:
            self.logger.debug("%s Unreadable source: %s" %
                              (self.source_uri, str(err)))
            self.status = Status.parse_error
            self.exceptions.append(err)

        self.status = (Status.processed_with_exceptions
                       if self.has_exceptions() else Status.processed)
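
A usage sketch under the assumptions visible in the code above: the instance (called processor here, a hypothetical name) was constructed elsewhere with its pack_uri, the resource dump ZIP has already been downloaded, and the surrounding project supplies Config and DestinationMap; only the standard library unzip step is concrete.

import tempfile
import zipfile

# Unpack a previously fetched resource dump, then reconcile it with local files.
with tempfile.TemporaryDirectory() as unzipdir:
    with zipfile.ZipFile('/tmp/resourcedump.zip') as zf:
        zf.extractall(unzipdir)
    processor.base_line(unzipdir)           # instance of the class shown above
    print(processor.status, processor.exceptions)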