def sketch_data(datas, alpha, bins, minval, quantiles):
    # one quantile sketch per shard of data, appended to 'sketches', then merged
    # note: minval is accepted but unused below
    quantiles = ','.join(map(str, quantiles))
    with shell.tempdir():
        for data in datas:
            shell.run(f'bsv | bschema a:f64 | bquantile-sketch f64 -b {bins} -a {alpha} >> sketches',
                      stdin='\n'.join(map(str, data)) + '\n')
        csv = shell.run(f'cat sketches | bquantile-merge {quantiles} | bschema f64:a,f64:a | csv')
        return [float(v) for line in csv.splitlines() for [q, v] in [line.split(',')]]
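# Usage sketch for sketch_data (hedged: assumes py-shell's `shell` module and the
# bsv binaries on PATH, exactly as the function above requires; the sample numbers
# are illustrative, not from the source):
#
#   estimates = sketch_data(
#       datas=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],  # one sketch per shard
#       alpha=0.01,                                # sketch accuracy parameter
#       bins=128,                                  # sketch size in bins
#       minval=0,                                  # accepted but unused above
#       quantiles=[0.5, 0.9, 0.99],
#   )
#   assert len(estimates) == 3  # one merged estimate per requested quantile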
def test_appends():
    with shell.tempdir():
        stdin = """
        0,b,c,d
        1,e,f,g
        2,h,i,j
        """
        stdout = """
        prefix00
        prefix01
        prefix02
        """
        # partition the same input twice: output files are appended to,
        # hence each row appears twice in the bcat output below
        assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin))
        assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin))
        stdout = """
        prefix00:b,c,d
        prefix00:b,c,d
        prefix01:e,f,g
        prefix01:e,f,g
        prefix02:h,i,j
        prefix02:h,i,j
        """
        assert unindent(stdout).strip() == shell.run('bcat --prefix prefix*')
        stdout = """
        prefix00
        prefix01
        prefix02
        """
        assert unindent(stdout).strip() == shell.run('ls prefix*')
def test_basic():
    with shell.tempdir():
        shell.run('for char in a a b b c c; do echo $char | bsv >> $char; done')
        stdout = """
        a:a
        b:b
        c:c
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 1 a b c')
        stdout = """
        a:a
        a:a
        b:b
        b:b
        c:c
        c:c
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 2 a b c')
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 2 --prefix a b c')
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix a b c')
        stdout = """
        a
        b
        c
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 1 a b c')
        stdout = """
        a
        a
        b
        b
        c
        c
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bcat a b c')
def test_cp_dot_to_dot():
    with servers():
        with shell.tempdir():
            run('mkdir dir1 dir2')
            run('touch dir1/file1.txt dir2/file2.txt dir2/file3.txt')
            run('s4 cp -r . s4://bucket')
            assert run("s4 ls -r s4://bucket | awk '{print $NF}'").splitlines() == [
                'dir1/file1.txt',
                'dir2/file2.txt',
                'dir2/file3.txt',
            ]
            run('s4 cp -r s4://bucket .')
            assert sorted(run('find dir* -type f').splitlines()) == [
                'dir1/file1.txt',
                'dir2/file2.txt',
                'dir2/file3.txt',
            ]
            run('rm -rf dir*')
            run('s4 cp -r s4://bucket/dir2 .')
            assert sorted(run('find dir* -type f').splitlines()) == [
                'dir2/file2.txt',
                'dir2/file3.txt',
            ]
            run('rm -rf dir*')
            run('s4 cp -r s4://bucket/dir2/ .')
            assert sorted(run('find dir* -type f').splitlines()) == [
                'dir2/file2.txt',
                'dir2/file3.txt',
            ]
def clone_source():
    with shell.climb_git_root():
        orig = os.getcwd()
        with shell.tempdir(cleanup=False):
            shell.run(f"rsync -avhc {orig}/ . --exclude '.git' --exclude '.tox' --exclude '.backups' --exclude '__pycache__'")
            shell.run('mkdir .git')
            return os.getcwd()
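# Usage sketch (hedged: `shell.climb_git_root` and `shell.tempdir` are the
# py-shell context managers used above; since cleanup=False, the caller is
# responsible for deleting the returned directory):
#
#   src = clone_source()        # rsync'd copy of the repo, minus .git/.tox/etc
#   shell.run(f'ls {src}')      # build or test against the clone
#   shell.run(f'rm -rf {src}')  # manual cleanup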
def test_binary():
    with shell.tempdir():
        with open('1.txt', 'w') as f:
            f.write('123')
        run('cat 1.txt | lz4 -1 |', preamble, 'cp - s3://bucket/binary/1.txt')
        assert '123' == run(preamble, 'cp s3://bucket/binary/1.txt - | lz4 -d -c')
def test_props(args):
    num_buckets, csv = args
    result = expected(num_buckets, csv)
    with shell.tempdir():
        stdout = '\n'.join(sorted({l.split(':')[0] for l in result.splitlines()}))
        assert stdout == shell.run(f'bsv | bpartition {num_buckets} prefix', stdin=csv, echo=True)
        assert result == shell.run('bcat --prefix prefix*')
def test_basic():
    with shell.tempdir():
        stdin = """
        b,c,d
        e,f,g
        h,i,j
        """
        stdout = """
        prefix_02
        prefix_04
        prefix_05
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin))
        stdout = """
        prefix_02:h,i,j
        prefix_04:e,f,g
        prefix_05:b,c,d
        """
        assert unindent(stdout).strip() == shell.run('bcat -l -p prefix*')
        stdout = """
        prefix_02
        prefix_04
        prefix_05
        """
        assert unindent(stdout).strip() == shell.run('ls prefix*')
def test_props(args):
    buffer, lines, chunks_per_file = args
    if not chunks_per_file:
        chunks_per_file = ''
    with shell.tempdir():
        shell.run(f'_gen_csv 2 {lines} | bsv.{buffer} > data.bsv', echo=True)
        shell.run(f'cat data.bsv | bsplit.{buffer} prefix {chunks_per_file} > filenames')
        assert shell.run(f'cat data.bsv | csv.{buffer} | xxh3') == \
            shell.run(f'cat filenames | while read path; do cat $path; done | csv.{buffer} | xxh3')
def push():
    with shell.tempdir():
        with open('s4.conf', 'w') as f:
            f.write(conf)
        run('aws-ec2-scp -y s4.conf :.s4.conf', *ids)
    with shell.climb_git_root():
        run('aws-ec2-rsync -y . :/mnt/s4', cluster_name)
        run('aws-ec2-ssh -yc scripts/install_archlinux.sh', *ids)
    state['ids'] = ids
def test_cp():
    with shell.tempdir():
        run('mkdir -p foo/3')
        with open('foo/1.txt', 'w') as f:
            f.write('123')
        with open('foo/2.txt', 'w') as f:
            f.write('234')
        with open('foo/3/4.txt', 'w') as f:
            f.write('456')
        run(preamble, 'cp foo/ s3://bucket/cp/dst/ --recursive')
        assert rm_whitespace(run(preamble, 'ls bucket/cp/dst/')) == rm_whitespace("""
            PRE 3/
            _ _ _ 1.txt
            _ _ _ 2.txt
        """)
        assert rm_whitespace(run(preamble, 'ls bucket/cp/dst/ --recursive')) == rm_whitespace("""
            _ _ _ cp/dst/1.txt
            _ _ _ cp/dst/2.txt
            _ _ _ cp/dst/3/4.txt
        """)
        run(preamble, 'cp s3://bucket/cp/dst/ dst1/ --recursive')
        assert run('grep ".*" $(find dst1/ -type f|LC_ALL=C sort)') == rm_whitespace("""
            dst1/1.txt:123
            dst1/2.txt:234
            dst1/3/4.txt:456
        """)
        run(preamble, 'cp s3://bucket/cp/dst/ . --recursive')
        assert run('grep ".*" $(find dst/ -type f|LC_ALL=C sort)') == rm_whitespace("""
            dst/1.txt:123
            dst/2.txt:234
            dst/3/4.txt:456
        """)
        run('rm -rf dst')
        run(preamble, 'cp foo s3://bucket/cp/dst2 --recursive')
        assert rm_whitespace(run(preamble, 'ls bucket/cp/dst2/')) == rm_whitespace("""
            PRE 3/
            _ _ _ 1.txt
            _ _ _ 2.txt
        """)
        assert rm_whitespace(run(preamble, 'ls bucket/cp/dst2/ --recursive')) == rm_whitespace("""
            _ _ _ cp/dst2/1.txt
            _ _ _ cp/dst2/2.txt
            _ _ _ cp/dst2/3/4.txt
        """)
        run(preamble, 'cp s3://bucket/cp/dst . --recursive')
        assert run('grep ".*" $(find dst/ -type f|LC_ALL=C sort)') == rm_whitespace("""
            dst/1.txt:123
            dst/2.txt:234
            dst/3/4.txt:456
        """)
def test_props(csvs):
    result = expected(csvs)
    if result.strip():
        with shell.tempdir():
            paths = []
            for i, csv in enumerate(csvs):
                path = f'file{i}.bsv'
                shell.run(f'bsv > {path}', stdin=csv)
                paths.append(path)
            assert result.strip() == shell.run('brmerge', *paths, ' | bcut 1 | csv', echo=True)
            assert shell.run('cat', *paths, '| brsort | bcut 1 | csv') == shell.run('brmerge', *paths, ' | bcut 1 | csv')
def _tar_script(src, name, echo_only=False):
    name = ('-name %s' % name) if name else ''
    script = (
        'cd %(src)s\n'
        'src=$(pwd)\n'
        'cd $(dirname $src)\n'
        "FILES=$(find -L $(basename $src) -type f %(name)s -o -type l %(name)s)\n"
        'echo $FILES|tr " " "\\n"|grep -v \.git 1>&2\n'
        + ('' if echo_only else 'tar cfh - $FILES')
    ) % locals()
    with shell.tempdir(cleanup=False):
        with open('script.sh', 'w') as f:
            f.write(script)
        return os.path.abspath('script.sh')
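# Usage sketch for _tar_script (hedged: paths are illustrative; per the script
# body above, matched filenames are listed on stderr and, unless echo_only=True,
# a tar of them is streamed to stdout):
#
#   script = _tar_script('/path/to/src', '*.py')
#   shell.run(f'bash {script} > src.tar')  # stderr shows the file list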
def test_fails_when_too_many_columns():
    with shell.climb_git_root():
        stdin = 'a,' * (2**16 - 1)
        with shell.tempdir(cleanup=False):
            with open('input', 'w') as f:
                f.write(stdin)
            path = os.path.abspath('input')
        try:
            res = shell.run('cat', path, '| bin/_csv >/dev/null', warn=True)
        finally:
            shell.run('rm', path)
        assert res['exitcode'] == 1
        assert 'fatal: line with more than 65535 columns' == res['stderr']
def test_basic():
    with shell.tempdir():
        shell.run('echo -e "a,a\nc,c\ne,e\n" | bsv > a.bsv')
        shell.run('echo -e "b,b\nd,d\nf,f\n" | bsv > b.bsv')
        stdout = """
        a,a
        b,b
        c,c
        d,d
        e,e
        f,f
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bmerge a.bsv b.bsv | csv', stream=True)
def test_without_prefix():
    with shell.tempdir():
        stdin = """
        b,c,d
        e,f,g
        h,i,j
        """
        stdout = """
        02
        04
        05
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10', stdin=unindent(stdin))
def servers(timeout=30, extra_conf='', num_servers=3):
    util.log.setup(format='%(message)s')
    shell.set['stream'] = True
    with util.time.timeout(timeout):
        with shell.stream():
            with shell.tempdir():
                procs = start_all(extra_conf, num_servers)
                watch = [True]
                pool.thread.new(watcher, watch, procs)
                try:
                    yield
                finally:
                    watch[0] = False
                    for proc in procs:
                        proc.terminate()
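# Usage sketch (hedged: since `servers` yields, it is presumably wrapped with
# contextlib.contextmanager somewhere not shown here, which is how the
# `with servers():` calls in the tests above use it):
#
#   with servers(timeout=60, num_servers=3):
#       run('s4 cp -r . s4://bucket')  # all server procs are terminated on exit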
def test_props_compatability(csvs):
    result = expected(csvs)
    if result.strip():
        with shell.tempdir():
            bsv_paths = []
            for i, csv in enumerate(csvs):
                path = f'file{i}.bsv'
                shell.run(f'bsv > {path}', stdin=csv)
                bsv_paths.append(path)
            csv_paths = []
            for i, csv in enumerate(csvs):
                path = f'file{i}.csv'
                shell.run(f'cat - > {path}', stdin=csv)
                csv_paths.append(path)
            assert shell.run('LC_ALL=C sort -m -k1,1', *csv_paths, ' | cut -d, -f1') == \
                shell.run('bmerge', *bsv_paths, ' | bcut 1 | csv', echo=True)
def test_basic():
    with shell.tempdir():
        with open('input.txt', 'w') as f:
            f.write('123')
        run(preamble, 'cp input.txt s3://bucket/basic/dir/file.txt')
        run('echo asdf |', preamble, 'cp - s3://bucket/basic/dir/stdin.txt')
        assert run(preamble, 'ls s3://bucket/ --recursive').splitlines() == [
            '_ _ _ basic/dir/file.txt',
            '_ _ _ basic/dir/stdin.txt',
        ]
        assert run(preamble, 'cp s3://bucket/basic/dir/file.txt -') == "123"
        assert run(preamble, 'cp s3://bucket/basic/dir/stdin.txt -') == "asdf"
        run(preamble, 'cp s3://bucket/basic/dir/file.txt file.downloaded')
        with open('file.downloaded') as f:
            assert f.read() == "123"
        run(preamble, 'cp s3://bucket/basic/dir/stdin.txt stdin.downloaded')
        with open('stdin.downloaded') as f:
            assert f.read() == "asdf\n"
        run("mkdir foo")
        run(preamble, 'cp s3://bucket/basic/dir/stdin.txt foo/', stream=True)
        with open('foo/stdin.txt') as f:
            assert f.read() == "asdf\n"
def test_dupes():
    with shell.tempdir():
        shell.run('echo -e "a,a\na,a\nc,c\nc,c\ne,e\ne,e\n" | bsv > a.bsv')
        shell.run('echo -e "b,b\nd,d\nf,f\n" | bsv > b.bsv')
        stdout = """
        a,a
        a,a
        b,b
        c,c
        c,c
        d,d
        e,e
        e,e
        f,f
        """
        assert rm_whitespace(unindent(stdout)) == shell.run('echo a.bsv b.bsv | bmerge | csv', stream=True)
        assert rm_whitespace(unindent(stdout)) == shell.run('(echo a.bsv; echo b.bsv) | bmerge | csv', stream=True)
        assert rm_whitespace(unindent(stdout)) == shell.run('(echo a.bsv; echo; echo b.bsv) | bmerge | csv', stream=True)
def new(name: 'name of the instance',
        gigs: 'size in gigs of data disk' = 128,
        size: 'instance size' = shell.conf.get_or_prompt_pref('size', __file__, message='instance size'),
        location=shell.conf.get_or_prompt_pref('location', __file__, message='azure location'),
        no_wait: 'do not wait for ssh' = False,
        num: 'number of instances' = 1,
        init=_data_disk_init,
        group=None):
    assert not init.startswith('#!'), 'init commands are bash snippets, and should not include a hashbang'
    init = '#!/bin/bash\npath=/tmp/$(uuidgen); echo %s | base64 -d > $path; sudo -u ubuntu bash -e $path > /var/log/cloud_init_script.log 2>&1' % util.strings.b64_encode(init)
    if not group:
        group_name = name
    else:
        group_name = group
    # id() here is this module's instance-lookup helper, not the builtin
    assert not list(id(name)), 'name must be globally unique'
    if not _group_exists(group_name):
        run('az group create --name', group_name, '--location', location, echo=True)
        run('az network vnet create --resource-group', group_name, '--name', group_name, '--location', location, '--subnet-name', group_name, echo=True)
        run('az network nsg create --resource-group', group_name, '--name', group_name, '--location', location, echo=True)
        run('az network nsg rule create --resource-group', group_name, '--nsg-name', group_name, '-n ssh', '--priority 100', '--source-address-prefix "*"', '--destination-address-prefix "*"', '--destination-port-range 22', '--access Allow', '--protocol Tcp', echo=True)
    with shell.tempdir():
        with open('cloud-init.txt', 'w') as f:
            f.write(init)
        for i in range(num):
            run('az vm create',
                '--resource-group', group_name,
                '--vnet-name', group_name,
                '--subnet', group_name,
                '--nsg', group_name,
                '--name', (name if num == 1 else '%s-%s' % (name, i + 1)),
                '--image', 'Canonical:UbuntuServer:14.04.4-LTS:latest',
                '--ssh-key-value', '~/.ssh/id_rsa.pub',
                '--admin-username', 'ubuntu',
                '--data-disk-sizes-gb', gigs,
                '--custom-data', 'cloud-init.txt',
                '--size', size,
                ('--no-wait' if num > 1 else ''),
                stream=True)
    if not no_wait:  # honor --no-wait per its help string above
        wait_for_ssh(group=group, num=num)
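# CLI usage sketch (hedged: the annotation help strings suggest an
# argh/argparse-style entrypoint; flag names mirror the parameters above, but
# the command name itself is hypothetical):
#
#   $ azure-vm-new my-instance --gigs 256 --num 2 --no-wait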
def new(name: 'name of all instances',
        arg: 'one instance per arg, and that arg is str formatted into cmd, pre_cmd, and tags as {arg}' = None,
        label: 'one label per arg, to use as ec2 tag since arg is often inappropriate, defaults to arg if not provided' = None,
        pre_cmd: 'optional cmd which runs before cmd is backgrounded. will be retried on failure. format with {arg}.' = None,
        cmd: 'cmd which is run in the background. format with {arg}.' = None,
        tag: 'tag to set as "<key>=<value>"' = None,
        no_rm: 'stop instance instead of terminating when done' = False,
        chunk_size: 'how many args to launch at once' = 50,
        bucket: 's3 bucket to upload logs to' = shell.conf.get_or_prompt_pref('launch_logs_bucket', __file__, message='bucket for launch_logs'),
        spot: 'spot price to bid' = None,
        key: 'key pair name' = shell.conf.get_or_prompt_pref('key', aws.ec2.__file__, message='key pair name'),
        ami: 'ami id' = shell.conf.get_or_prompt_pref('ami', aws.ec2.__file__, message='ami id'),
        sg: 'security group name' = shell.conf.get_or_prompt_pref('sg', aws.ec2.__file__, message='security group name'),
        type: 'instance type' = shell.conf.get_or_prompt_pref('type', aws.ec2.__file__, message='instance type'),
        vpc: 'vpc name' = shell.conf.get_or_prompt_pref('vpc', aws.ec2.__file__, message='vpc name'),
        zone: 'ec2 availability zone' = None,
        role: 'ec2 iam role' = None,
        gigs: 'gb capacity of primary disk' = 8):
    optional = ['no_rm', 'zone', 'spot', 'tag', 'pre_cmd', 'label']
    for k, v in locals().items():
        assert v is not None or k in optional, 'required flag missing: --' + k.replace('_', '-')
    tags, args, labels = tuple(tag or ()), tuple(arg or ()), tuple(label or ())
    args = [str(a) for a in args]
    if labels:
        assert len(args) == len(labels), 'there must be an equal number of args and labels, %s != %s' % (len(args), len(labels))
    else:
        labels = args
    labels = [_tagify(x) for x in labels]
    for tag in tags:
        assert '=' in tag, 'tags should be "<key>=<value>", not: %s' % tag
    for label, arg in zip(labels, args):
        if label == arg:
            logging.info('going to launch arg: %s', arg)
        else:
            logging.info('going to launch label: %s, arg: %s', label, arg)
    if pre_cmd and os.path.exists(pre_cmd):
        logging.info('reading pre_cmd from file: %s', os.path.abspath(pre_cmd))
        with open(pre_cmd) as f:
            pre_cmd = f.read()
    if os.path.exists(cmd):
        logging.info('reading cmd from file: %s', os.path.abspath(cmd))
        with open(cmd) as f:
            cmd = f.read()
    for _ in range(10):
        launch = str(uuid.uuid4())
        path = 's3://%(bucket)s/launch_logs/launch=%(launch)s' % locals()
        try:
            shell.run('aws s3 ls', path)
        except:
            break
    else:
        assert False, 'failed to generate a unique launch id. clean up: s3://%(bucket)s/launch_logs/' % locals()
    logging.info('launch=%s', launch)
    data = json.dumps({'name': name,
                       'args': args,
                       'labels': labels,
                       'pre_cmd': pre_cmd,
                       'cmd': cmd,
                       'tags': tags,
                       'no_rm': no_rm,
                       'bucket': bucket,
                       'spot': spot,
                       'key': key,
                       'ami': ami,
                       'sg': sg,
                       'type': type,
                       'vpc': vpc,
                       'gigs': gigs})
    if 'LAUNCH_LOCAL' in os.environ:
        for arg in args:
            with shell.tempdir(), shell.set_stream():
                shell.run(pre_cmd.format(arg=arg))
                shell.run(cmd.format(arg=arg))
    else:
        shell.run('aws s3 cp - s3://%(bucket)s/launch_logs/launch=%(launch)s/params.json' % locals(), stdin=data)
        tags += ('launch=%s' % launch,)
        for i, (args_chunk, labels_chunk) in enumerate(zip(chunk(args, chunk_size), chunk(labels, chunk_size))):
            logging.info('launching chunk %s of %s, chunk size: %s', i + 1, len(args) // chunk_size + 1, chunk_size)
            instance_ids = aws.ec2.new(name, role=role, spot=spot, key=key, ami=ami, sg=sg, type=type, vpc=vpc, zone=zone, gigs=gigs, num=len(args_chunk))
            errors = []

            def run_cmd(instance_id, arg, label):
                def fn():
                    try:
                        if pre_cmd:
                            aws.ec2._retry(aws.ec2.ssh)(instance_id, yes=True, cmd=pre_cmd.format(arg=arg), prefixed=True)
                        aws.ec2.ssh(instance_id, no_tty=True, yes=True, cmd=_cmd(arg, cmd, no_rm, bucket), prefixed=True)
                        instance = aws.ec2._ls([instance_id])[0]
                        aws.ec2._retry(instance.create_tags)(Tags=[{'Key': k, 'Value': v}
                                                                   for tag in tags + ('label=%s' % label, 'chunk=%s' % i)
                                                                   for [k, v] in [tag.split('=', 1)]])
                        logging.info('tagged: %s', aws.ec2._pretty(instance))
                        logging.info('ran cmd against %s for label %s', instance_id, label)
                    except:
                        errors.append(traceback.format_exc())
                return fn

            pool.thread.wait(*map(run_cmd, instance_ids, args_chunk, labels_chunk), max_threads=10)
            if errors:
                logging.info(util.colors.red('errors:'))
                for e in errors:
                    logging.info(e)
                sys.exit(1)
    return 'launch=%s' % launch
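# CLI usage sketch (hedged: the command name is hypothetical; --arg/--label/--tag
# appear repeatable given the tuple(...) handling above, and cmd/pre_cmd may be
# inline strings or paths to files, per the os.path.exists checks):
#
#   $ aws-launch-new my-job \
#       --arg a --arg b \
#       --cmd 'process.py {arg}' \
#       --tag owner=me --spot 0.50
#
# Setting LAUNCH_LOCAL in the environment runs each arg's pre_cmd/cmd in a
# local tempdir instead of launching instances.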
def test_basic():
    with sh.tempdir():
        uid = str(uuid.uuid4())
        os.environ['BACKUP_RCLONE_REMOTE'] = os.environ['BACKUP_TEST_RCLONE_REMOTE']
        os.environ['BACKUP_DESTINATION'] = os.environ['BACKUP_TEST_DESTINATION'] + '/' + uid
        os.environ['BACKUP_STORAGE_CLASS'] = 'STANDARD_IA'
        os.environ['BACKUP_CHUNK_MEGABYTES'] = '100'
        os.environ['BACKUP_ROOT'] = os.getcwd()
        for k, v in os.environ.items():
            if k.startswith('BACKUP_'):
                print(k, '=>', v)
        ##
        sh.run('echo foo > bar.txt')
        sh.run('backup-add')
        assert diff() == [
            ('addition:', './bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        assert additions() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        sh.run('backup-commit')
        assert log() == [
            '0000000000.DATE.tar.lz4.gpg.00000 HASH 1510',
            'init',
        ]
        assert index() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        ##
        sh.run('echo foo > bar2.txt')
        sh.run('backup-add')
        assert diff() == [
            ('addition:', './bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        assert additions() == [
            ('./bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        sh.run('backup-commit')
        assert log() == [
            'index-only-update',
            '0000000000.DATE.tar.lz4.gpg.00000 HASH 1510',
            'init',
        ]
        assert index() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('./bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        ##
        sh.run('echo asdf > asdf.txt')
        sh.run('backup-add')
        assert diff() == [
            ('addition:', './asdf.txt', '0000000001.DATE.tar.lz4.gpg.00000', '36b807d5', '5'),
        ]
        assert additions() == [
            ('./asdf.txt', '0000000001.DATE.tar.lz4.gpg.00000', '36b807d5', '5'),
        ]
        sh.run('backup-commit')
        assert log() == [
            '0000000001.DATE.tar.lz4.gpg.00000 HASH 1513',
            'index-only-update',
            '0000000000.DATE.tar.lz4.gpg.00000 HASH 1510',
            'init',
        ]
        assert index() == [
            ('./asdf.txt', '0000000001.DATE.tar.lz4.gpg.00000', '36b807d5', '5'),
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('./bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
        ]
        ##
        with sh.tempdir():
            os.environ['BACKUP_ROOT'] = os.getcwd()
            _ = find('.')  # clone the repo with the first call to find()
            assert [find('.', commit) for commit in commits()] == [find(r'\.txt$', commit) for commit in commits()] == [
                [
                    ('./asdf.txt', '0000000001.DATE.tar.lz4.gpg.00000', '36b807d5', '5'),
                    ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                    ('./bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                ],
                [
                    ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                    ('./bar2.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                ],
                [
                    ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                ],
                [],
            ]
            assert [find('.*asdf.*', commit) for commit in commits()] == [
                [
                    ('./asdf.txt', '0000000001.DATE.tar.lz4.gpg.00000', '36b807d5', '5'),
                ],
                [],
                [],
                [],
            ]
        ##
        with sh.tempdir():
            os.environ['BACKUP_ROOT'] = os.getcwd()
            _ = find('.')
            assert [restore('.', commit) for commit in commits()] == [
                [
                    ('./bar.txt', 'd202d795'),
                    ('./bar2.txt', 'd202d795'),
                    ('./asdf.txt', '36b807d5'),
                ],
                [
                    ('./bar.txt', 'd202d795'),
                    ('./bar2.txt', 'd202d795'),
                ],
                [
                    ('./bar.txt', 'd202d795'),
                ],
                [],
            ]
            assert [restore(r'\./bar2\.txt$', commit) for commit in commits()] == [
                [
                    ('./bar2.txt', 'd202d795'),
                ],
                [
                    ('./bar2.txt', 'd202d795'),
                ],
                [],
                [],
            ]
            assert sh.run('cat bar.txt') == 'foo'
            assert sh.run('cat bar2.txt') == 'foo'
            assert sh.run('cat asdf.txt') == 'asdf'
def test_symlink():
    with sh.tempdir():
        uid = str(uuid.uuid4())
        os.environ['BACKUP_RCLONE_REMOTE'] = os.environ['BACKUP_TEST_RCLONE_REMOTE']
        os.environ['BACKUP_DESTINATION'] = os.environ['BACKUP_TEST_DESTINATION'] + '/' + uid
        os.environ['BACKUP_STORAGE_CLASS'] = 'STANDARD_IA'
        os.environ['BACKUP_CHUNK_MEGABYTES'] = '100'
        os.environ['BACKUP_ROOT'] = os.getcwd()
        for k, v in os.environ.items():
            if k.startswith('BACKUP_'):
                print(k, '=>', v)
        ##
        sh.run('echo foo > bar.txt')
        sh.run('ln -s bar.txt link.txt')
        sh.run('backup-add')
        assert diff() == [
            ('addition:', './bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('addition:', './link.txt', 'symlink', './bar.txt', '0'),
        ]
        assert additions() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('./link.txt', 'symlink', './bar.txt', '0'),
        ]
        sh.run('backup-commit')
        assert log() == [
            '0000000000.DATE.tar.lz4.gpg.00000 HASH 1510',
            'init',
        ]
        assert index() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('./link.txt', 'symlink', './bar.txt', '0'),
        ]
        ##
        sh.run('mkdir dir')
        sh.run('cd dir && ln -s ../bar.txt link.txt')
        sh.run('backup-add')
        assert diff() == [
            ('addition:', './dir/link.txt', 'symlink', './bar.txt', '0'),
        ]
        assert additions() == [
            ('./dir/link.txt', 'symlink', './bar.txt', '0'),
        ]
        sh.run('backup-commit')
        assert log() == [
            'index-only-update',
            '0000000000.DATE.tar.lz4.gpg.00000 HASH 1510',
            'init',
        ]
        assert index() == [
            ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
            ('./dir/link.txt', 'symlink', './bar.txt', '0'),
            ('./link.txt', 'symlink', './bar.txt', '0'),
        ]
        ##
        with sh.tempdir():
            os.environ['BACKUP_ROOT'] = os.getcwd()
            _ = find('.')  # clone the repo with the first call to find()
            assert [find('.', commit) for commit in commits()] == [
                [
                    ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                    ('./dir/link.txt', 'symlink', './bar.txt', '0'),
                    ('./link.txt', 'symlink', './bar.txt', '0'),
                ],
                [
                    ('./bar.txt', '0000000000.DATE.tar.lz4.gpg.00000', 'd202d795', '4'),
                    ('./link.txt', 'symlink', './bar.txt', '0'),
                ],
                [],
            ]
        ##
        with sh.tempdir():
            os.environ['BACKUP_ROOT'] = os.getcwd()
            restore('.')
            assert sh.run('find -printf "%y %p %l\n" | grep -v "\./\.backup" | grep -P "^(l|f)"').splitlines() == [
                'l ./link.txt bar.txt',
                'l ./dir/link.txt ../bar.txt',
                'f ./bar.txt',
            ]
            assert sh.run('cat link.txt') == 'foo'
            assert sh.run('cat dir/link.txt') == 'foo'
            assert sh.run('cat bar.txt') == 'foo'
def new(name: 'name of all instances',
        arg: 'one instance per arg, and that arg is str formatted into cmd, pre_cmd, and tags as "arg"' = None,
        label: 'one label per arg, to use as ec2 tag since arg is often inappropriate, defaults to arg if not provided' = None,
        pre_cmd: 'optional cmd which runs before cmd is backgrounded. will be retried on failure. format with %(arg)s.' = None,
        cmd: 'cmd which is run in the background. format with %(arg)s.' = None,
        tag: 'tag to set as "<key>=<value>"' = None,
        no_rm: 'stop instance instead of terminating when done' = False,
        chunk_size: 'how many args to launch at once' = 50,
        bucket: 's3 bucket to upload logs to' = shell.conf.get_or_prompt_pref('launch_logs_bucket', __file__, message='bucket for launch_logs'),
        spot: 'spot price to bid' = None,
        key: 'key pair name' = shell.conf.get_or_prompt_pref('key', aws.ec2.__file__, message='key pair name'),
        ami: 'ami id' = shell.conf.get_or_prompt_pref('ami', aws.ec2.__file__, message='ami id'),
        sg: 'security group name' = shell.conf.get_or_prompt_pref('sg', aws.ec2.__file__, message='security group name'),
        type: 'instance type' = shell.conf.get_or_prompt_pref('type', aws.ec2.__file__, message='instance type'),
        vpc: 'vpc name' = shell.conf.get_or_prompt_pref('vpc', aws.ec2.__file__, message='vpc name'),
        zone: 'ec2 availability zone' = None,
        gigs: 'gb capacity of primary disk' = 8):
    optional = ['no_rm', 'zone', 'spot', 'tag', 'pre_cmd', 'label']
    for k, v in locals().items():
        assert v is not None or k in optional, 'required flag missing: --' + k.replace('_', '-')
    tags, args, labels = tuple(tag or ()), tuple(arg or ()), tuple(label or ())
    args = [str(a) for a in args]
    if labels:
        assert len(args) == len(labels), 'there must be an equal number of args and labels, %s != %s' % (len(args), len(labels))
    else:
        labels = args
    labels = [_tagify(x) for x in labels]
    for tag in tags:
        assert '=' in tag, 'tags should be "<key>=<value>", not: %s' % tag
    for label, arg in zip(labels, args):
        if label == arg:
            logging.info('going to launch arg: %s', arg)
        else:
            logging.info('going to launch label: %s, arg: %s', label, arg)
    if pre_cmd and os.path.exists(pre_cmd):
        logging.info('reading pre_cmd from file: %s', os.path.abspath(pre_cmd))
        with open(pre_cmd) as f:
            pre_cmd = f.read()
    if os.path.exists(cmd):
        logging.info('reading cmd from file: %s', os.path.abspath(cmd))
        with open(cmd) as f:
            cmd = f.read()
    for _ in range(10):
        launch = str(uuid.uuid4())
        path = 's3://%(bucket)s/launch_logs/launch=%(launch)s' % locals()
        try:
            shell.run('aws s3 ls', path)
        except:
            break
    else:
        assert False, 'failed to generate a unique launch id. clean up: s3://%(bucket)s/launch_logs/' % locals()
    logging.info('launch=%s', launch)
    data = json.dumps({'name': name,
                       'args': args,
                       'labels': labels,
                       'pre_cmd': pre_cmd,
                       'cmd': cmd,
                       'tags': tags,
                       'no_rm': no_rm,
                       'bucket': bucket,
                       'spot': spot,
                       'key': key,
                       'ami': ami,
                       'sg': sg,
                       'type': type,
                       'vpc': vpc,
                       'gigs': gigs})
    if 'AWS_LAUNCH_RUN_LOCAL' in os.environ:
        for arg in args:
            with shell.tempdir(), shell.set_stream():
                shell.run(pre_cmd % {'arg': arg})
                shell.run(cmd % {'arg': arg})
    else:
        shell.run('aws s3 cp - s3://%(bucket)s/launch_logs/launch=%(launch)s/params.json' % locals(), stdin=data)
        tags += ('launch=%s' % launch,)
        for i, (args_chunk, labels_chunk) in enumerate(zip(chunk(args, chunk_size), chunk(labels, chunk_size))):
            logging.info('launching chunk %s of %s, chunk size: %s', i + 1, len(args) // chunk_size + 1, chunk_size)
            instance_ids = aws.ec2.new(name, spot=spot, key=key, ami=ami, sg=sg, type=type, vpc=vpc, zone=zone, gigs=gigs, num=len(args_chunk))
            errors = []

            def run_cmd(instance_id, arg, label):
                def fn():
                    try:
                        if pre_cmd:
                            aws.ec2._retry(aws.ec2.ssh)(instance_id, yes=True, cmd=pre_cmd % {'arg': arg}, prefixed=True)
                        aws.ec2.ssh(instance_id, no_tty=True, yes=True, cmd=_cmd(arg, cmd, no_rm, bucket), prefixed=True)
                        instance = aws.ec2._ls([instance_id])[0]
                        aws.ec2._retry(instance.create_tags)(Tags=[{'Key': k, 'Value': v}
                                                                   for tag in tags + ('label=%s' % label, 'chunk=%s' % i)
                                                                   for [k, v] in [tag.split('=', 1)]])
                        logging.info('tagged: %s', aws.ec2._pretty(instance))
                        logging.info('ran cmd against %s for label %s', instance_id, label)
                    except:
                        errors.append(traceback.format_exc())
                return fn

            pool.thread.wait(*map(run_cmd, instance_ids, args_chunk, labels_chunk), max_threads=10)
            if errors:
                logging.info(util.colors.red('errors:'))
                for e in errors:
                    logging.info(e)
                sys.exit(1)
    return 'launch=%s' % launch
def test_numeric():
    with shell.tempdir():
        shell.run('echo a,1,2.0 | bsv > data.bsv')
        assert 'a,1,2.000000' == shell.run('bcat data.bsv')