def find_s3_region(self, bucket: str) -> str:
    """
    Returns the regional endpoint hostname for an S3 bucket (extracted from AWS CLI).

    :return: the URL string.
    """
    from sh import aws
    cmd = aws('s3api', 'get-bucket-location', '--bucket', str(bucket),
              '--output', 'json')
    regs: Dict[str, str] = json.loads(str(cmd))
    # NOTE: buckets in us-east-1 report a null LocationConstraint, which
    # would render as "None" here.
    return str(bucket) + '.s3.' + str(regs['LocationConstraint']) + '.amazonaws.com'
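# A minimal usage sketch for find_s3_region (hedged: the `scanner` instance
# and bucket name below are hypothetical, and AWS credentials are assumed to
# be configured):
#
#   host = scanner.find_s3_region("example-bucket")
#   # e.g. "example-bucket.s3.eu-west-1.amazonaws.com"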
def object_exists_in_s3(key):
    """Check if an object exists in S3 and is not empty."""
    try:
        ret = sh.aws("s3api", "head-object",
                     "--bucket", os.getenv("BUCKET_NAME"), "--key", key)
        obj = json.loads(str(ret))
        return obj["ContentLength"] > 0
    except sh.ErrorReturnCode_255:
        # head-object exits non-zero when the object does not exist
        return False
def dumpBucket(bucketName):
    # Dump the bucket into bucket folder
    bucketDir = './buckets/' + bucketName
    dumped = None

    try:
        if not AWS_CREDS_CONFIGURED:
            sh.aws('s3', 'sync', 's3://' + bucketName, bucketDir,
                   '--no-sign-request', _fg=False)
        else:
            sh.aws('s3', 'sync', 's3://' + bucketName, bucketDir, _fg=False)
        dumped = True
    except sh.ErrorReturnCode_1 as e:
        # Loop through our list of known errors. If found, dumping failed.
        foundErr = False
        err_message = e.stderr.decode('utf-8')
        for err in ERROR_CODES:
            if err in err_message:
                foundErr = True
                break
        if foundErr:  # We caught a known error while dumping
            if not os.listdir(bucketDir):
                # The bucket directory is empty. The dump didn't work.
                dumped = False
            else:
                # The bucket directory is not empty.
                # At least 1 of the files was downloaded.
                dumped = True
        else:
            raise e

    # Check if folder is empty. If it is, delete it
    if not os.listdir(bucketDir):
        os.rmdir(bucketDir)

    return dumped
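# Hedged usage sketch for dumpBucket above; AWS_CREDS_CONFIGURED and
# ERROR_CODES are module-level globals assumed to be defined elsewhere in the
# source, and the bucket name is hypothetical:
#
#   dumped = dumpBucket("example-open-bucket")
#   if dumped:
#       print("Contents written to ./buckets/example-open-bucket")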
def create_policy_from_string(name, doc):
    """
    Create an IoT policy.

    :param name: policy name
    :param doc: JSON document string describing the policy
    :return: dictionary with the policy properties
    """
    response = sh.aws("iot", "create-policy", "--policy-name", name,
                      "--policy-document", doc)
    return json.loads(str(response))
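# Hedged usage sketch for create_policy_from_string; the policy document
# below is a hypothetical minimal IoT policy, not one from the source.
import json

doc = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {"Effect": "Allow", "Action": "iot:Connect", "Resource": "*"}
    ],
})
policy = create_policy_from_string("example-policy", doc)
print(policy["policyArn"])  # create-policy returns the new policy's ARN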
def find_aws_acct(self) -> str:
    """
    Returns the logged-in user's account ID.
    This will only be useful once we can tag sources and not just assets.
    For now this is unused.

    :return: the account string
    """
    from sh import aws
    cmd = aws('sts', 'get-caller-identity', '--output', 'json')
    acc: Dict[str, str] = json.loads(str(cmd))
    return 'AWS_ACCT_ID:' + str(acc['Account'])
def upload(input, output, clean_train_files, dryrun):
    """Upload all model checkpoints to s3 for archival.
    You may want to remove large files like optimizer.pt before that."""
    if clean_train_files:
        to_remove = ["optimizer.pt"]
        for root, subFolders, files in os.walk(input):
            for file in files:
                if file in to_remove:
                    path = Path(root) / file
                    print(f"Removing train file {path}")
                    if not dryrun:  # was `dry_run`, which is undefined here
                        path.unlink()
    if dryrun:
        dry_run_command = ["--dryrun"]
    else:
        dry_run_command = []
    command = ["s3", "sync"] + dry_run_command + ["--follow-symlinks", input, output]
    sh.aws(*command, _out=sys.stdout)
def find_aws_dynamic_ips(self, region: str) -> Dict[str, int]:
    from sh import aws
    ips: Dict[str, int] = {}
    cmd = aws('ec2', 'describe-instances', '--region', region,
              '--query', 'Reservations[*].Instances[*].[PublicIpAddress]',
              '--output', 'json')
    iplist: List[List[List[str]]] = json.loads(str(cmd))
    # This is required to unravel the list within list within list that AWS
    # responds with
    for innerlist in iplist:
        for theips in innerlist:
            ips[theips[0]] = 1
    return ips
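# Illustrative sketch (hypothetical data, not from the source) of the shape
# the --query above produces and why the double loop flattens it: one outer
# list per reservation, one inner list per instance, each holding [ip].
sample = [[["54.0.0.1"], ["54.0.0.2"]], [["3.0.0.9"]]]
flat = {entry[0]: 1 for reservation in sample for entry in reservation}
assert flat == {"54.0.0.1": 1, "54.0.0.2": 1, "3.0.0.9": 1}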
def getBucketSize(bucketName):
    """
    Use awscli to 'ls' the bucket, which will give us the total size of the bucket.
    NOTE: Function assumes the bucket exists and doesn't catch errors if it doesn't.
    """
    a = sh.aws('s3', 'ls', '--summarize', '--human-readable', '--recursive',
               '--no-sign-request', 's3://' + bucketName)
    # Get the last line of the output, get everything to the right of the
    # colon, and strip whitespace
    return a.splitlines()[-1].split(":")[1].strip()
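# Illustrative note on the parsing above: with --summarize and
# --human-readable, the last line of `aws s3 ls` output looks like
#
#   Total Size: 4.4 GiB
#
# so splitting on the colon and stripping whitespace yields "4.4 GiB"
# (the size shown is hypothetical).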
def delete_stack(stack_name):
    """ Deletes the given CloudFormation stack """
    print("Deleting stack {stack_name}".format(stack_name=stack_name))
    try:
        sh.aws(
            "cloudformation",
            "delete-stack",
            "--stack-name",
            stack_name,
        )
        # Wait for stack to be deleted
        print("Waiting for stack {stack_name} to be deleted".format(
            stack_name=stack_name))
        sh.aws("cloudformation", "wait", "stack-delete-complete",
               "--stack-name", stack_name)
        print("stack {stack_name} deleted".format(stack_name=stack_name))
    except Exception as e:
        print(traceback.format_exc())
def create_put_activate_pipeline(template_file_path):
    """
    :param template_file_path: string.
    """
    pipelineFilePath = "file://" + template_file_path
    uniqueId = "TeraSort" + str(uuid.uuid4().fields[-1])[:5]
    print("New pipeline from pipeline template: " + pipelineFilePath)

    print("Create Pipeline")
    cr = sh.aws("datapipeline", "create-pipeline",
                "--name", "TeraSort-10GB",
                "--unique-id", uniqueId,
                "--tags", "key=DPLTemplate,value=TeraSort-10GB-Template-v7")
    print(cr)
    pipelineId = json.loads(str(cr))['pipelineId']

    print("Put pipeline definition")
    pr = sh.aws("datapipeline", "put-pipeline-definition",
                "--pipeline-id", pipelineId,
                "--pipeline-definition", pipelineFilePath)
    print(pr)

    print("Activate pipeline")
    ar = sh.aws("datapipeline", "activate-pipeline", "--pipeline-id", pipelineId)
    print(ar)
    print("Activated pipeline")
def test_attach_and_delete(self):
    """Test if:
    * policies can be attached to certificates
    * certificates can be attached to things
    * we can detach and delete certificates
    """
    thing_name = "Thing-" + random_string()
    policy_name = "Policy-" + random_string()
    thing = iot.create_thing(thing_name)
    certs = iot.create_keys_and_certificate()
    policy = iot.create_policy(policy_name, "Allow", "iot:Publish",
                               "topic-" + random_string())
    arn = certs["certificateArn"]

    # attach policy and test if it is there
    iot.attach_policy(certs, policy)
    policies_data = json.loads(
        str(sh.aws("iot", "list-principal-policies", "--principal", arn)))
    attached_policy_name = policies_data["policies"][0]["policyName"]
    self.assertEqual(policy_name, attached_policy_name)

    # attach thing and test if it is there
    iot.attach_to_thing(thing, certs)
    things_data = json.loads(
        str(sh.aws("iot", "list-principal-things", "--principal", arn)))
    attached_thing_name = things_data["things"][0]
    self.assertEqual(thing_name, attached_thing_name)

    # delete certificate and make sure it is gone
    iot.delete_certificate(certs)
    try:
        iot.describe_certificate(certs)
    except Exception as e:
        # Exception.message does not exist in Python 3; inspect str(e) instead
        self.assertTrue("%s does not exist" % certs['certificateId'] in str(e))

    iot.delete_policy(policy)
    iot.delete_thing(thing)
def find_s3_buckets(self) -> Dict[str, int]:
    """
    Retrieve all S3 buckets that the logged-in user owns (extracted from AWS CLI).

    :return: a dictionary with the bucket names as keys.
    """
    from sh import aws
    buckets = {}
    cmd = aws('s3api', 'list-buckets', '--query', "Buckets[].Name",
              '--output', 'json')
    bucketjson: List[str] = json.loads(str(cmd))
    for i in bucketjson:
        buckets[i] = 1
    return buckets
def dumpBucket(bucketName, region):
    # Check to make sure the bucket is open
    b = checkBucket(bucketName, region)
    if b[0] != 200:
        raise ValueError("The specified bucket is not open.")

    # Dump the bucket into bucket folder
    bucketDir = './buckets/' + bucketName
    if not os.path.exists(bucketDir):
        os.makedirs(bucketDir)

    sh.aws('s3', 'sync', 's3://' + bucketName, bucketDir,
           '--no-sign-request', _fg=True)

    # Check if folder is empty. If it is, delete it
    if not os.listdir(bucketDir):
        # Delete empty folder
        os.rmdir(bucketDir)
def getBucketSize(bucketName):
    """
    Use awscli to 'ls' the bucket, which will give us the total size of the bucket.
    NOTE: Function assumes the bucket exists and doesn't catch errors if it doesn't.
    """
    try:
        if awsCredsConfigured:
            a = sh.aws('s3', 'ls', '--summarize', '--human-readable',
                       '--recursive', 's3://' + bucketName,
                       _timeout=sizeCheckTimeout)
        else:
            a = sh.aws('s3', 'ls', '--summarize', '--human-readable',
                       '--recursive', '--no-sign-request',
                       's3://' + bucketName, _timeout=sizeCheckTimeout)
        # Get the last line of the output, get everything to the right of
        # the colon, and strip whitespace
        return a.splitlines()[-1].split(":")[1].strip()
    except sh.TimeoutException:
        return "Unknown Size - timeout"
    except sh.ErrorReturnCode_255 as e:
        stderr = e.stderr.decode("UTF-8")
        if "AccessDenied" in stderr:
            return "AccessDenied"
        elif "AllAccessDisabled" in stderr:
            return "AllAccessDisabled"
        elif "NoSuchBucket" in stderr:
            return "NoSuchBucket"
        else:
            raise e
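# Hedged usage sketch for getBucketSize: besides a human-readable size
# string, it can return the sentinel strings handled above, so callers are
# assumed to branch on them (the bucket name is hypothetical):
size = getBucketSize("example-bucket")
if size in ("AccessDenied", "AllAccessDisabled", "NoSuchBucket",
            "Unknown Size - timeout"):
    print("Could not size bucket: " + size)
else:
    print("Bucket size: " + size)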
def delete_images(ecrName):
    """ Deletes the images in a given ECR repository """
    print("Deleting images from {ecrName}".format(ecrName=ecrName))
    try:
        # Get the image list from the repository
        imageList = sh.aws("ecr", "list-images", "--repository-name", ecrName)
        # Remove all images
        imageList = json.loads(str(imageList))
        for image in imageList['imageIds']:
            print("Removing image {imageDigest}".format(
                imageDigest=image['imageDigest']))
            sh.aws(
                "ecr",
                "batch-delete-image",
                "--repository-name",
                ecrName,
                "--image-ids",
                "imageDigest={imageDigest}".format(
                    imageDigest=image['imageDigest']),
            )
    except Exception as e:
        print(traceback.format_exc())
def get_fastq_files_from_s3(sra_accession):
    """
    If fastq files are present in S3, download them and return True.
    Otherwise return False.
    """
    bucket = os.getenv("BUCKET_NAME")
    dirs = ["pipeline-fastq", "pipeline-fastq-salivary"]
    found_one = False
    found_two = False
    for dir_ in dirs:
        for num in ["1", "2"]:
            key = "{}/{}/{}_{}.fastq.gz".format(dir_, sra_accession,
                                                sra_accession, num)
            if object_exists_in_s3(key):
                fprint("Downloading {}_{}.fastq.gz....".format(sra_accession, num))
                sh.aws("s3", "cp", "s3://{}/{}".format(bucket, key), ".")
                # false positive below:
                # https://github.com/PyCQA/pylint/issues/837#issuecomment-255109936
                if num == "1":  # pylint: disable=simplifiable-if-statement
                    found_one = True
                else:
                    found_two = True
    if found_one and found_two:
        return True
    return False
def transfer(df):
    """
    Function that makes the transfer to the worker.
    df is passed in the measure wrapper.
    """
    bucket_id = "jfhuete-pycones2021"
    temp_path = "/tmp"
    file_path = f"{temp_path}/sample.hdf5"
    temp_files_prefix = str(random.getrandbits(32))

    # Export to hdf5
    df.export_hdf5(file_path)

    # Split hdf5 file into smaller files
    sh.split(
        f"-b{CHUNK_SIZE_MB}M",
        file_path,
        f"{temp_path}/{temp_files_prefix}"
    )

    # Upload files to S3 in the background, then wait for all uploads
    temp_files = list(
        filter(lambda x: x.find(temp_files_prefix) == 0, os.listdir(temp_path))
    )
    processes = []
    for file in temp_files:
        processes.append(
            sh.aws(
                "s3api", "put-object",
                "--bucket", bucket_id,
                "--key", file,
                "--body", f"{temp_path}/{file}",
                _bg=True
            )
        )
    for process in processes:
        process.wait()

    task = read.delay(temp_files)
    task.wait()
def main():
    diff_only = False
    if len(sys.argv) > 1 and sys.argv[1] in ("-d", "--diff"):
        diff_only = True

    src = json.load(sys.stdin)  # the `encoding` kwarg was removed in Python 3.9

    # get current ipset
    newl = _get_new_ipset(src)
    curr = _get_curr_ipset(src['IPSet']['IPSetId'])
    (del_list, add_list) = _get_diff(curr, newl)

    if diff_only:
        print("Append:")
        pprint.pprint(add_list)
        print("Revoke:")
        pprint.pprint(del_list)
        exit(0)

    token = aws("waf", "get-change-token", "--output", "text").stdout
    ipset_list = {
        "IPSetId": src['IPSet']['IPSetId'],
        "ChangeToken": token.decode().rstrip(),  # .stdout is bytes; decode for json.dumps
        'Updates': []
    }
    for l in del_list:
        ipset_list['Updates'].append({
            "Action": "DELETE",
            "IPSetDescriptor": {"Type": "IPV4", "Value": "%s" % l}
        })
    for l in add_list:
        ipset_list['Updates'].append({
            "Action": "INSERT",
            "IPSetDescriptor": {"Type": "IPV4", "Value": "%s" % l}
        })
    print(json.dumps(ipset_list, indent=4))
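# Hedged follow-up (an assumption, not shown in the source): the printed
# structure matches the input shape of classic WAF's update-ip-set, so the
# output could plausibly be fed back through the CLI's generic
# --cli-input-json flag (script and file names below are hypothetical):
#
#   python update_ipset.py < ipset.json > updates.json
#   aws waf update-ip-set --cli-input-json file://updates.json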
def main(): """ main function """ #Delete the pipeline stack delete_stack(pipeline_stack_name) #Delete the ecs stack delete_stack(ecs_stack_name) #Get the ECR name try: ecrName = sh.aws( "cloudformation", "describe-stacks", "--stack-name", vpc_stack_name, "--query", "Stacks[0].Outputs[?OutputKey=='ECRRepositoryName'].OutputValue", "--output", "text") ecrName = str(ecrName) ecrName = ecrName.replace('\n', '') delete_images(ecrName) except Exception as e: print(traceback.format_exc()) #Delete the vpc stack delete_stack(vpc_stack_name)
def get_stat(start, end, profile, inst_name):
    raw = sh.aws(
        '--profile', profile,
        'lightsail', 'get-instance-metric-data',
        '--instance-name', inst_name,
        '--metric-name', 'NetworkOut',
        '--period', '2700000',
        '--start-time', str(start),
        '--unit', 'Bytes',
        '--statistics', 'Sum',
        '--end-time', str(end),
    )
    js = json.loads(str(raw))
    # Convert the summed byte count to MiB
    return js['metricData'][0]['sum'] / 2**20
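# Hedged usage sketch for get_stat: the CLI accepts ISO-8601 timestamps, so
# datetimes are assumed to stringify acceptably via isoformat(); the profile
# and instance names below are hypothetical.
from datetime import datetime, timedelta, timezone

end = datetime.now(timezone.utc)
start = end - timedelta(days=30)
mib = get_stat(start.isoformat(), end.isoformat(), "default", "my-instance")
print("NetworkOut over the last 30 days: %.1f MiB" % mib)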
def _get_curr_ipset(ipset_id):
    j = json.loads(
        aws("waf", "get-ip-set", "--ip-set-id", ipset_id,
            "--output", "json").stdout)
    return [c['Value'] for c in j['IPSet']['IPSetDescriptors']]
def find_aws_regions(self) -> List[str]:
    from sh import aws
    cmd = aws('ec2', 'describe-regions', '--output', 'json')
    regions: Dict[str, List[Dict[str, str]]] = json.loads(str(cmd))
    return [r['RegionName'] for r in regions['Regions']]
import json
import time

import sh

ip = ''

def print_ret(s, j):
    r = str(j)
    js = json.loads(r)
    print(s)
    print(js)
    return r

# Keep reallocating a static IP until it lands in one of the desired ranges
while not (ip.startswith('54.238') or ip.startswith('54.95')):
    raw = sh.aws('lightsail', 'allocate-static-ip',
                 '--profile', 'tokyo2', '--static-ip-name', 'try')
    print_ret('Allocation msg:', raw)
    time.sleep(2)

    raw = sh.aws('lightsail', 'attach-static-ip',
                 '--static-ip-name', 'try',
                 '--profile', 'tokyo2', '--instance-name', 'AWS-Tokyo-2')
    print_ret('Attach msg:', raw)
    time.sleep(2)

    raw = sh.aws('lightsail', 'get-instance',
                 '--profile', 'tokyo2', '--instance-name', 'AWS-Tokyo-2')
    r = print_ret('Instance message after allocation:', raw)
    ip = json.loads(r)['instance']['publicIpAddress']
    print(ip)
    if ip.startswith('54.238') or ip.startswith('54.95'):
        break
def load_single(self, xp_name):
    key = xp_name + "/output/model.tar.gz"
    dest_file_name = self.tmp_dir / xp_name / (xp_name + "_output.tgz")
    dest_dir = dest_file_name.parent
    final_dest_file = self.dest / ("aws_" + dest_dir.name)

    # (the original condition was `final_dest_file.exists() and False`,
    # which permanently disabled this early exit)
    if final_dest_file.exists():
        print("ALREADY PROCESSED", final_dest_file)
        # Nothing to do
        return

    print("PROCESSING", key)
    dest_dir.mkdir(parents=True, exist_ok=True)

    if not dest_file_name.exists():
        try:
            print(dest_file_name)
            s3_download(self.s3client, self.sagemaker_bucket, key, dest_file_name)
        except self.s3client.exceptions.ClientError as e:
            return None
    else:
        print("File was already downloaded to %s" % dest_file_name)

    print("Unpacking")
    to_remove = []
    to_remove_local = []
    with working_directory(dest_dir):
        sh.tar("-zxvf", dest_file_name.name)
        # Walk the unpacked tree ("." is dest_dir inside this block)
        for root, dirs, files in os.walk(".", topdown=False):
            for name in files:
                # Mark optimizer files for deletion
                if name == "optimizer.pt":
                    to_remove += [Path(root) / name]

    print("Cleaning up")
    # Remove the unwanted files
    for f in to_remove:
        print("remove", f)
        (dest_dir / f).unlink()
    # Remove the tar.gz
    dest_file_name.unlink()

    sh.aws("s3", "sync", str(dest_dir),
           "s3://lagunas-sparsity-experiments/backup/nn_pruning/output/squad_test_aws/" + xp_name,
           _out=sys.stdout, _err=sys.stderr)

    for f in to_remove_local:
        print("remove local", f)
        (dest_dir / f).unlink()

    print("Copying to final destination")
    shutil.copytree(dest_dir, final_dest_file, dirs_exist_ok=True)
    print("Removing temporary dir")
    shutil.rmtree(self.tmp_dir)

    # Special stuff: add links to compensate for a bug
    for link_name in ["pytorch_model.bin", "training_args.bin", "vocab.txt",
                      "tokenizer_config.json", "special_tokens_map.json"]:
        link = final_dest_file / "checkpoint-110660" / link_name
        if not link.exists():
            link.symlink_to(final_dest_file / link_name)
def main(): """ main function """ #Create the vpc stack print("Creating stack {vpc_stack_name}".format( vpc_stack_name=vpc_stack_name)) try: sh.aws( "cloudformation", "create-stack", "--stack-name", vpc_stack_name, "--template-body", "file://{vpc_file_location}".format( vpc_file_location=vpc_file_location), "--capabilities", "CAPABILITY_IAM") #Wait for stack to be created wait_for_creation(vpc_stack_name) except Exception as e: print(traceback.format_exc()) #Get the ECR Url try: ecrUrl = sh.aws( "cloudformation", "describe-stacks", "--stack-name", vpc_stack_name, "--query", "Stacks[0].Outputs[?OutputKey=='ECRRepositoryUrl'].OutputValue", "--output", "text") except Exception as e: print(traceback.format_exc()) #Build the image and push it to ECR print('Building Docker image and pushing it to {ecrUrl}'.format( ecrUrl=ecrUrl)) print(sh.bash("docker_image.sh", ecrUrl)) #Create the ecs stack print("Creating stack {ecs_stack_name}".format( ecs_stack_name=ecs_stack_name)) try: sh.aws( "cloudformation", "create-stack", "--stack-name", ecs_stack_name, "--template-body", "file://{ecs_file_location}".format( ecs_file_location=ecs_file_location)) #Wait for stack to be created wait_for_creation(ecs_stack_name) except Exception as e: print(traceback.format_exc()) #Create the pipeline stack print("Creating stack {pipeline_stack_name}".format( pipeline_stack_name=pipeline_stack_name)) try: sh.aws( "cloudformation", "create-stack", "--stack-name", pipeline_stack_name, "--template-body", "file://{pipeline_file_location}".format( pipeline_file_location=pipeline_file_location), "--parameters", "ParameterKey=GitHubToken,ParameterValue={github_key}".format( github_key=github_key), "--capabilities", "CAPABILITY_IAM") #Wait for stack to be created wait_for_creation(pipeline_stack_name) except Exception as e: print(traceback.format_exc())
#!/usr/bin/env python3
from pathlib import Path

import sh

# Work relative to this script's directory
sh.cd(Path(__file__).parent.absolute())
sh.mkdir('-p', 'datasets/modcloth/raw', 'datasets/electronics/raw')
print(sh.aws('s3', 'cp', 's3://seshlabucsc/df_modcloth.csv',
             './datasets/modcloth/raw'))
print(sh.aws('s3', 'cp', 's3://seshlabucsc/df_electronics.csv',
             './datasets/electronics/raw'))
def delete_images_with_digest(repo, image_digests):
    if image_digests:
        # Each entry is expected to be a pre-formatted "imageDigest=<digest>"
        # string; unpack the list so each one becomes its own CLI argument
        # (passing the list object itself would not form a valid command line)
        sh.aws("ecr", "batch-delete-image", "--repository-name", repo,
               "--image-ids", *image_digests)
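# Hedged usage sketch for delete_images_with_digest: batch-delete-image
# takes image IDs as "imageDigest=<digest>" arguments, so the digests below
# (hypothetical, truncated) are assumed to be pre-formatted that way:
#
#   delete_images_with_digest("example-repo", [
#       "imageDigest=sha256:0123...",
#       "imageDigest=sha256:4567...",
#   ])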
def run_bowtie(sra_accession, read_handling="equal"):
    """
    run bowtie2

    sra_accession - sra accession
    read_handling - if both fastq files are of equal length (indicated by the
        value "equal", the default), then both fastq files are used. If the
        value is 1 or 2, then only the given fastq file is used.
    """
    viruses = os.getenv("REFERENCES").split(",")
    viruses = [x.strip() for x in viruses]
    bowtie2 = partial(sh.bowtie2, _piped=True, _bg_exc=False)
    for virus in viruses:
        bowtie_args = [
            "--local",
            "-p", os.getenv("NUM_CORES"),
            "--no-unal",
            "-x", "/bt2/{}".format(virus),
        ]
        if read_handling == "equal":
            bowtie_args.extend([
                "-1", "{}_1.fastq.gz".format(sra_accession),
                "-2", "{}_2.fastq.gz".format(sra_accession),
            ])
        elif read_handling == 1:
            bowtie_args.extend(["-U", "{}_1.fastq.gz".format(sra_accession)])
        elif read_handling == 2:
            bowtie_args.extend(["-U", "{}_2.fastq.gz".format(sra_accession)])
        fprint("processing virus {} ...".format(virus))
        if object_exists_in_s3("{}/{}/{}/{}.sam".format(
                os.getenv("PREFIX"), sra_accession, virus, sra_accession)):
            fprint("output sam file already exists in s3 for virus {}, "
                   "skipping...".format(virus))
        else:
            with Timer() as timer:
                # Pipe bowtie2's stdout straight into `aws s3 cp - s3://...`
                for line in sh.aws(
                        bowtie2(*bowtie_args),
                        "s3", "cp", "-",
                        "s3://{}/{}/{}/{}/{}.sam".format(
                            os.getenv("BUCKET_NAME"), os.getenv("PREFIX"),
                            sra_accession, virus, sra_accession),
                        _iter=True):
                    fprint(line)
            fprint("bowtie2 duration for {}: {}".format(virus, timer.interval))
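# A minimal sketch of the sh piping pattern used above (hypothetical
# commands): passing one command's result as the first argument of another
# feeds it in as stdin. With _piped=True (as in run_bowtie) the data streams
# between the processes instead of being buffered in memory first.
import sh

line_count = sh.wc(sh.ls("-1"), "-l")
print(int(line_count))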