def resume_sync(service, args):
    if args.clear:
        _clear_sync_states(service)
    else:
        state = _latest_sync_state(service)
        eprint('Resuming sync from state %s' % state.full_path)
        _sync(service, state)


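# NOTE: `eprint` is not defined in this excerpt. Given how it is used
# throughout (status and diagnostic messages, with stdout reserved for
# machine-readable output such as the --json listing), it is assumed to be
# the usual print-to-stderr helper. A minimal sketch under that assumption:
import sys


def eprint(*args, **kwargs):
    """Prints diagnostic messages to stderr, keeping stdout clean."""
    print(*args, file=sys.stderr, **kwargs)

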
def dedup_apply(service, args):
    eprint('Now computing and removing duplicates. Prefix order is %s.'
           % args.prefixes)
    duplicates, paths = compute_duplicates(service, get_snapshot(service, args))
    # Strips whitespace around each prefix.
    prefixes = [prefix.strip() for prefix in args.prefixes.split(',')]
    # Adds a trailing slash so that each prefix matches whole path components.
    prefixes = [prefix + ('/' if not prefix.endswith('/') else '')
                for prefix in prefixes]
    runner = ErrorHandlingRunner(service, delegate=GAPIBatchRunner)
    for md5, entries in duplicates.items():
        preferences = list(zip(
            entries, rank(prefixes, [paths[entry] for entry in entries])))
        # Keeps the best-ranked copy and queues the rest for deletion.
        for duplicate, _ in sorted(preferences, key=lambda x: x[1])[1:]:
            rid = '%s (%s)' % (paths[duplicate], duplicate.id)
            eprint('Queue request for deleting duplicate %s' % rid)
            runner.add(request_id=rid, request=duplicate.delete())
    if not args.dry_run:
        eprint('\n --- Now running %d deletion requests in batch.'
               % len(runner.requests))
        for rid, result, _ in runner.execute():
            eprint('Successfully deleted %s' % rid)
    else:
        eprint('Dry run. No changes applied.')


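# NOTE: `rank` is not shown in this excerpt. dedup_apply treats it as
# scoring each path by the position of the first prefix that matches it,
# so that duplicates under earlier (preferred) prefixes sort first and are
# kept. A minimal, hypothetical sketch under that assumption:
def rank(prefixes, paths):
    """Yields, for each path, the index of the first matching prefix,
    or len(prefixes) when no prefix matches at all."""
    for path in paths:
        yield next(
            (i for i, prefix in enumerate(prefixes)
             if str(path).startswith(prefix)),
            len(prefixes))

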
def _latest_sync_state(service) -> SyncState:
    path = getcwd()
    states = sorted(
        SyncState.sync_states(path, service),
        key=lambda x: x.timestamp, reverse=True)
    if len(states) > 1:
        eprint('More than one sync state found. Will use the most recent.')
    elif len(states) == 0:
        eprint('No sync state found in %s. Nothing to resume.' % path)
        sys.exit(-1)
    return states[0]


def connect():
    flow = _connect_flow(path.join(path.dirname(__file__), GAPI_ID))
    payload, need_auth = next(flow)
    if need_auth:
        eprint(
            'No valid authentication token found. Authorization required. '
            'Open the following URL in your browser:\n' + payload)
        payload, _ = flow.send(
            input('And enter the Google Drive auth key here: '))
    return payload


def store(self):
    eprint(
        'Saving sync state at %s. Use _resume_ to resume sync in case of '
        'failures.' % self.full_path)
    try:
        # Marks the state as stored up front and rolls back on failure, so
        # callers never see a half-written state flagged as stored.
        self.stored = True
        Path(self.full_path).write_text(
            json.dumps(SyncStateSchema().dump(self).data), encoding='utf-8')
    except Exception:
        self.stored = False
        raise


def _cacheable_exclusions(service, args) -> Snapshot:
    ss = Snapshot()
    # Stored snapshots and remote folders are cacheable exclusion sources.
    for _snapshot in args.exclude_snapshot:
        eprint('Reading stored snapshot at <%s>' % _snapshot)
        ss = ss.merge(load_snapshot(service, _snapshot))
    for remote_folder in args.exclude_folder:
        eprint('Examining contents of Google Drive folder <%s>' % remote_folder)
        ss = ss.merge(snapshot(service, [remote_folder]))
    return ss


def _new_sync_state(service, args) -> SyncState:
    # Computes MD5s for local files; remote entries come from the
    # cacheable exclusion snapshots.
    eprint('Computing MD5 hashes for files under %s' % args.local)
    local = []
    for local_file in _local_files(args.local):
        if not _mime_allow(local_file, args.include_pictures_only):
            eprint('Exclude file %s with MIME type \'%s\''
                   % (local_file.path, local_file.mime_type))
        else:
            local.append(local_file)
    if not args.allow_duplicates:
        _check_duplicates(_by_md5(local))
    return SyncState(_cacheable_exclusions(service, args), args, local, getcwd())


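# NOTE: `_by_md5` is not shown in this excerpt. Both _new_sync_state and
# _sync treat it as grouping LocalFile entries by their MD5 checksum, where
# any group with more than one entry is a set of local duplicates. A
# minimal sketch under that assumption:
from collections import defaultdict


def _by_md5(local_files):
    """Groups LocalFile entries by md5_checksum."""
    grouped = defaultdict(list)
    for local_file in local_files:
        grouped[local_file.md5_checksum].append(local_file)
    return grouped

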
def _sync(service, state: SyncState):
    args = state.args
    local = _by_md5(state.local_files)
    # Reads dynamic exclusions.
    state.snapshot = state.snapshot.merge(
        _uncacheable_exclusions(service, state.args))
    remote_folder = unique(
        ResourcePath.from_name(service, args.remote), args.remote)
    remote = set(entry.md5Checksum for entry in state.snapshot.entries
                 if isinstance(entry, DriveFile))
    # We sync whatever is missing from the remote MD5 set, subject to the
    # exclusion lists.
    to_sync = local.keys() - remote
    n_sync = len(to_sync)
    eprint('There are %d local files, %d remote files' % (len(local), len(remote)))
    eprint('%d files will be synced' % n_sync)
    # Uploads are better handled sequentially: they are unsupported by the
    # batch API (https://developers.google.com/drive/api/v3/batch) and are
    # apparently processed sequentially on Google's side
    # (https://stackoverflow.com/questions/10311969/what-is-the-limit-on-google-drive-api-usage).
    # Since the main bottleneck is likely the client's upload bandwidth
    # anyway, issuing concurrent upload requests would gain little.
    runner = ErrorHandlingRunner(service, delegate=SequentialRequestRunner)
    for key in to_sync:
        local_files = local[key]
        runner.add(request_id=local_files[0].path,
                   request=remote_folder.create_file(local_files[0].path))
    if not state.stored:
        state.store()
    eprint('Uploading %d files.' % n_sync)
    if state.args.dry_run:
        eprint('Dry run: no changes made.')
        return
    for i, result in enumerate(runner.execute(), start=1):
        eprint('%s successfully uploaded to %s (%d of %d)'
               % (result.id, state.args.remote, i, n_sync))
    state.clear()


def snapshot(service, folder_paths):
    folders = [
        unique(ResourcePath.from_name(service, folder_path), folder_path)
        for folder_path in folder_paths
    ]
    entries = []
    for folder in folders:
        if not isinstance(folder, DriveFolder):
            eprint('<%s> is not a folder. Aborting.' % str(folder))
            sys.exit(-1)
        entries.extend(folder.list(recurse=True))
    # Some entries may be scooped in more than once if root folders contain
    # some of the same subfolders (which would imply an entry has more than
    # one parent). We discard these spurious duplicates by keeping entries
    # with unique ids. Note the local is named unique_entries so it does
    # not shadow the unique() helper used above.
    unique_entries = set(entries)
    eprint('There were %d entries, %d unique.' % (len(entries), len(unique_entries)))
    return Snapshot(list(unique_entries))


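# NOTE: `unique` is not defined in this excerpt. _sync and snapshot use it
# to assert that a resource path resolved to exactly one Drive entry. A
# minimal, hypothetical sketch, assuming ResourcePath.from_name returns an
# iterable of matching resources:
def unique(resources, name):
    """Returns the single resource matching `name`, aborting when the
    path is ambiguous or missing."""
    resources = list(resources)
    if len(resources) != 1:
        eprint('Expected exactly one entry for <%s>, found %d. Aborting.'
               % (name, len(resources)))
        sys.exit(-1)
    return resources[0]

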
def _local_files(local_folder, recurse=True) -> List[LocalFile]:
    file_paths = os.listdir(local_folder)
    files = []
    folders = []
    for file_path in file_paths:
        full_path = os.path.join(local_folder, file_path)
        if os.path.isdir(full_path):
            folders.append(full_path)
            continue
        eprint('Analyzing %s' % full_path)
        files.append(
            LocalFile(
                path=full_path,
                mime_type=detector.from_file(full_path),
                md5_checksum=hashlib.md5(
                    Path(full_path).read_bytes()).hexdigest()))
    if recurse:
        for folder in folders:
            eprint('Recursing into %s' % folder)
            files.extend(_local_files(folder, recurse))
    return files


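# NOTE: `detector` is not defined in this excerpt. Given the
# `detector.from_file(full_path)` call above, it is assumed to be a
# libmagic-based MIME detector from the python-magic package:
import magic

detector = magic.Magic(mime=True)

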
def _credentials(apisecret):
    flow = client.flow_from_clientsecrets(
        apisecret,
        scope='https://www.googleapis.com/auth/drive',
        redirect_uri='urn:ietf:wg:oauth:2.0:oob')
    storage = ofile.Storage('.credentials.json')
    exists = path.exists('.credentials.json')
    if exists:
        credentials = storage.get()
    if not exists or credentials.invalid:
        # Yields the authorization URL and suspends until the caller sends
        # back the auth code the user obtained from it (see connect()).
        auth_uri = flow.step1_get_authorize_url()
        code = yield (auth_uri, True)
        credentials = flow.step2_exchange(code)
        storage.put(credentials)
    if credentials is None:
        raise ValueError('Failed to obtain access credentials.')
    eprint('Google Drive authentication successful.')
    yield (credentials, False)


def _check_duplicates(local):
    duplicates = {k: v for k, v in local.items() if len(v) > 1}
    if len(duplicates) == 0:
        return
    eprint(
        'Error: duplicates found in the local folder. Re-run with '
        '--allow-duplicates to run the synchronization anyway. Duplicates '
        'are listed as follows.\n')
    for md5, entries in duplicates.items():
        eprint('------ %s -------' % md5)
        for entry in entries:
            eprint(entry)
        eprint('')
    sys.exit(-1)


def execute(self) -> Generator[RequestResult, None, None]:
    pending = {str(request.id): request for request in self.requests}
    warnings = set()
    attempts = 1
    backoff = self.min_backoff
    start = time.monotonic()
    while ((time.monotonic() - start) < self.timeout
           and attempts <= self.max_retries):
        for result in self._run_requests(pending.values()):
            if result.error is None:
                # No errors with the current request, just return the element.
                del pending[result.id]
                yield result
                # Resets the retry and backoff counters as at least one
                # request got through.
                attempts = 1
                backoff = self.min_backoff
                continue
            # Got an error. Let's see what policies we have for that.
            policy, error_code = self.policies.matching(result.error)
            # Case 1: We don't know how to handle this. Just bubble it up.
            if policy is None:
                raise result.error
            # Prints the error-specific warning, once per error code unless
            # the policy asks for it on every occurrence.
            if policy.print_always or (error_code not in warnings):
                policy.print_warning(rid=result.id)
                warnings.add(error_code)
            # Case 2: Skip the current entry as if it were already satisfied.
            if policy.action == HttpErrorPolicy.SKIP:
                del pending[result.id]
                continue
            # Case 3: Retry once the current round is done.
            elif policy.action == HttpErrorPolicy.RETRY:
                pass
            # Case 4: Fail, now that the warning message has been printed.
            elif policy.action == HttpErrorPolicy.FAIL:
                raise result.error
            # Case 5: Should never happen, so we throw an error if it does.
            else:
                raise Exception('Don\'t know how to handle action %d.'
                                % policy.action)
        # All requests satisfied: we're done.
        if not pending:
            return
        # There are still pending requests left, so we retry them after
        # backing off exponentially.
        eprint('Some of the requests could not be fulfilled. '
               'Backing off and retrying.')
        time.sleep(backoff)
        backoff = backoff * 2
        attempts += 1
    raise Exception(
        'Could not process request(s). %s' % (
            'Too many retry attempts.' if attempts > self.max_retries
            else 'Operation timed out.'))


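# A minimal usage sketch of the runner (the `service` handle and the
# `some_drive_file` request below are assumptions, not part of this
# excerpt). Because execute() is a generator, results stream back as
# individual requests succeed rather than only after the whole batch:
#
#     runner = ErrorHandlingRunner(service, delegate=GAPIBatchRunner)
#     runner.add(request_id='example-id', request=some_drive_file.delete())
#     for result in runner.execute():
#         eprint('Completed %s' % result.id)
#
# Failed requests stay in `pending` and are retried with a doubling sleep
# (min_backoff, 2x, 4x, ...) until max_retries or timeout is exhausted.

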
def _uncacheable_exclusions(service, args) -> Snapshot:
    ss = Snapshot()
    eprint('Examining contents of Google Drive folder <%s>' % args.remote)
    ss = ss.merge(snapshot(service, [args.remote]))
    return ss


def read(path, service) -> 'SyncState':
    eprint('Reading sync state from %s.' % path)
    sss = SyncStateSchema()
    sss.context = {'service': service}
    return sss.load(json.loads(
        Path(path).read_text(encoding='utf-8'))).data


def clear(self):
    eprint('Removing sync state %s' % self.full_path)
    os.remove(self.full_path)


def print_warning(self, rid):
    eprint(self.warning.format(rid=rid))


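# NOTE: HttpErrorPolicy's constructor is not shown in this excerpt; the
# sketch below is hypothetical and only mirrors the attributes execute()
# and print_warning() actually read (action, print_always, warning). It
# illustrates a policy that skips entries whose request already failed
# with an "already exists"-style error:
skip_policy = HttpErrorPolicy(
    action=HttpErrorPolicy.SKIP,
    print_always=False,
    warning='Request {rid} appears already satisfied remotely; skipping.')

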
def dedup_list(service, args):
    eprint('Computing duplicates and resolving resource paths.')
    duplicates, paths = compute_duplicates(service, get_snapshot(service, args))
    summary = {
        md5: [
            {'id': duplicate.id, 'path': str(paths[duplicate])}
            for duplicate in entries
        ]
        for md5, entries in duplicates.items()
    }
    if len(duplicates) == 0:
        eprint('Hooray! There are no duplicates in the snapshot.')
    else:
        eprint('Duplicates were found.')
        if args.json:
            print(json.dumps(summary, indent=3))
        else:
            for md5, entries in summary.items():
                eprint('------ %s -------' % md5)
                for entry in entries:
                    eprint('%s (%s)' % (entry['path'], entry['id']))
                eprint('')