import os
from urllib.parse import urlparse

import s3fs


def exists(path: str) -> bool:
    url = urlparse(path)
    if url.scheme == 's3':
        fs = s3fs.S3FileSystem(anon=False)
        return fs.exists(path)
    return os.path.exists(path)
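# Hedged usage sketch for exists(); both paths below are hypothetical
# placeholders. The same helper dispatches to S3 or the local filesystem.
if exists("s3://mybucket/data/input.csv"):
    print("found on S3")
if exists("/tmp/input.csv"):
    print("found locally")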
#!/usr/bin/env python
import s3fs
import yaml

with open("/etc/backup/backup.yaml", "r") as stream:
    conf = yaml.safe_load(stream)

bucket = conf['s3']['bucket']
aws_key = conf['s3']['key']
aws_secret = conf['s3']['secret']

print("listing bucket %s" % bucket)
s3 = s3fs.S3FileSystem(key=aws_key, secret=aws_secret)
objects = s3.ls(bucket, detail=True)
for obj in objects:
    print("%s size %s date %s" % (obj['Key'], obj['Size'], str(obj['LastModified'])))
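# The script above assumes /etc/backup/backup.yaml has roughly this shape
# (all values are hypothetical placeholders):
#
#   s3:
#     bucket: my-backup-bucket
#     key: AKIAXXXXXXXXXXXXXXXX
#     secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx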
# coding: utf-8

import argparse
import logging
import os
import random
import time

import dgl
import dgl.function as fn
import networkx as nx
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dateutil.parser import parse as dt_parse
from dgl import DGLGraph

s3 = s3fs.S3FileSystem()
device = "cpu"

parser = argparse.ArgumentParser()
parser.add_argument('date_key')
args = parser.parse_args()
import numpy as np
import pandas as pd
import s3fs


def preprocess(s3_in_url, s3_out_bucket, s3_out_prefix, delimiter=","):
    """Preprocesses data based on business logic.

    - Reads the delimited file passed as s3_url and preprocesses the data by
      filtering the long tail in the customer ratings data, i.e. keeps
      customers who have rated 5 or more videos, and videos that have been
      rated by 9+ customers
    - Preprocessed data is then written to output

    Args:
        s3_in_url: s3 url to the delimited file to be processed,
            e.g. s3://amazon-reviews-pds/tsv/reviews.tsv.gz
        s3_out_bucket: s3 bucket where preprocessed data will be staged,
            e.g. mybucket
        s3_out_prefix: s3 url prefix to stage preprocessed data to use later
            in the pipeline, e.g. amazon-reviews-pds/preprocess/
        delimiter: delimiter to be used for parsing the file.
            Defaults to "," if none provided

    Returns:
        status of preprocessed data

    Raises:
        IOError: An error occurred accessing the s3 file
    """
    try:
        print("preprocessing data from {}".format(s3_in_url))
        # read s3 url into pandas dataframe;
        # pandas internally uses s3fs to read the s3 file directly
        # (note: `error_bad_lines` is deprecated in newer pandas in favor
        # of `on_bad_lines="skip"`)
        df = pd.read_csv(s3_in_url, sep=delimiter, error_bad_lines=False)

        # limit dataframe to customer_id, product_id, and star_rating;
        # `product_title` will be useful for validating recommendations
        df = df[['customer_id', 'product_id', 'star_rating', 'product_title']]

        # clean out the long tail because most people haven't seen most
        # videos, and people rate fewer videos than they actually watch
        customers = df['customer_id'].value_counts()
        products = df['product_id'].value_counts()

        # based on data exploration, only about 5% of customers have rated
        # 5 or more videos, and only 25% of videos have been rated by 9+
        # customers
        customers = customers[customers >= 5]
        products = products[products >= 10]

        print("# of rows before the long tail = {:10d}".format(df.shape[0]))
        reduced_df = df \
            .merge(pd.DataFrame({'customer_id': customers.index})) \
            .merge(pd.DataFrame({'product_id': products.index}))
        print("# of rows after the long tail = {:10d}".format(
            reduced_df.shape[0]))
        reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id'])
        print("# of rows after removing duplicates = {:10d}".format(
            reduced_df.shape[0]))

        # recreate customer and product lists since there are customers with
        # more than 5 reviews, but all of their reviews are on products with
        # fewer than 5 reviews (and vice versa)
        customers = reduced_df['customer_id'].value_counts()
        products = reduced_df['product_id'].value_counts()

        # sequentially index each user and item to hold the sparse format
        # where the indices indicate the row and column in our ratings matrix
        customer_index = pd.DataFrame({
            'customer_id': customers.index,
            'customer': np.arange(customers.shape[0])})
        product_index = pd.DataFrame({
            'product_id': products.index,
            'product': np.arange(products.shape[0])})
        reduced_df = reduced_df \
            .merge(customer_index) \
            .merge(product_index)

        nb_customer = reduced_df['customer'].max() + 1
        nb_products = reduced_df['product'].max() + 1
        feature_dim = nb_customer + nb_products
        print(nb_customer, nb_products, feature_dim)

        product_df = reduced_df[['customer', 'product', 'star_rating']]

        # split into train, validation and test data sets
        train_df, validate_df, test_df = np.split(
            product_df.sample(frac=1),
            [int(.6 * len(product_df)), int(.8 * len(product_df))]
        )
        print("# of rows train data set = {:10d}".format(train_df.shape[0]))
        print("# of rows validation data set = {:10d}".format(
            validate_df.shape[0]))
        print("# of rows test data set = {:10d}".format(test_df.shape[0]))

        # select columns required for training the model, excluding
        # "customer_id", "product_id", "product_title" to keep files small
        cols = ["customer", "product", "star_rating"]
        train_df = train_df[cols]
        validate_df = validate_df[cols]
        test_df = test_df[cols]

        # write output to s3 as delimited files
        fs = s3fs.S3FileSystem(anon=False)
        s3_out_prefix = s3_out_prefix[:-1] \
            if s3_out_prefix[-1] == "/" else s3_out_prefix

        s3_out_train = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "train/train.csv")
        print("writing training data to {}".format(s3_out_train))
        with fs.open(s3_out_train, "w") as f:
            train_df.to_csv(f, sep=str(','), index=False)

        s3_out_validate = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "validate/validate.csv")
        print("writing validation data to {}".format(s3_out_validate))
        with fs.open(s3_out_validate, "w") as f:
            validate_df.to_csv(f, sep=str(','), index=False)

        s3_out_test = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "test/test.csv")
        print("writing test data to {}".format(s3_out_test))
        with fs.open(s3_out_test, "w") as f:
            test_df.to_csv(f, sep=str(','), index=False)

        print("preprocessing completed")
        return "SUCCESS"
    except Exception as e:
        raise e
def __init__(self, client_id, client_secret, user_agent):
    self.reddit = praw.Reddit(client_id=client_id,
                              client_secret=client_secret,
                              user_agent=user_agent)
    self.sia = SIA()
    self.fs = s3fs.S3FileSystem()
import pickle

import s3fs


def save_object_to_s3(model_: object, path: str) -> None:
    fs = s3fs.S3FileSystem(anon=False)
    with fs.open(path, 'wb') as f:
        pickle.dump(model_, f)
    print('Storing to s3:', path)
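# Hedged companion sketch: read the pickled object back with the same
# anon=False filesystem. The helper name and the path in the commented call
# are hypothetical.
def load_object_from_s3(path: str) -> object:
    fs = s3fs.S3FileSystem(anon=False)
    with fs.open(path, 'rb') as f:
        return pickle.load(f)


# model = load_object_from_s3('s3://mybucket/models/model.pkl')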
def get_clip_keys(fs=None):
    if fs is None:
        fs = s3fs.S3FileSystem()
    keys = fs.find('brissonstagram/clips')
    return keys
def write_process(load_id_):
    s3 = s3fs.S3FileSystem(anon=False)
    now = datetime.datetime.now().strftime("%Y%m%dT%H%M")
    dataset_schema = RandomData(Constants.INPUT_DATASET_SCHEMA_DICT, load_id_)
    dataset_schema.read_files()
    athena_client = Athena()
    parquet_creation_timing = pd.DataFrame()

    if Settings.new_parquet:
        dataset_sufix = now.lower()
        create_table = True
    else:
        dataset_sufix = Settings.existing_parquet
        create_table = False

    for dataset, atomic_load_count, data in dataset_schema.generate_random_data_list():
        start_time = datetime.datetime.now()
        print(start_time)
        parquet_table = pa.Table.from_pandas(data)
        s3path = ("s3://simm-poc-s3-athena/simm/"
                  + str(Settings.use_dict_encoding).lower() + "/"
                  + Settings.compression + "/"
                  + "".join(Settings.partition_cols) + "/"
                  + dataset + "_" + dataset_sufix)
        partition_list = []
        for column in Settings.partition_cols:
            partition_list.append((column, data[column].iloc[0]))
        partition_path = "/".join(["=".join(i) for i in partition_list])
        s3partpath = s3path + "/" + partition_path

        if Settings.remove_existing_part and s3.exists(s3partpath):
            s3.rm(s3partpath, recursive=True)
            print("Remove partition: SUCCESS")

        print(partition_path)
        print(atomic_load_count)

        pq.write_to_dataset(parquet_table,
                            s3path,
                            filesystem=s3,
                            partition_cols=Settings.partition_cols,
                            coerce_timestamps="ms",
                            allow_truncated_timestamps=True,
                            use_dictionary=Settings.use_dict_encoding,
                            compression=Settings.compression)
        print("Push to S3: SUCCESS")

        if create_table:
            athena_client.create_table(dataset + "_" + dataset_sufix,
                                       partition_list, s3path)
            print("Create table: SUCCESS")
            create_table = False

        athena_client.refresh_metastore(dataset + "_" + dataset_sufix,
                                        partition_list)
        print("Refresh metastore: SUCCESS")
        end_time = datetime.datetime.now()

        creation_timing = {"Dataset": dataset,
                           "dataset_sufix": dataset_sufix,
                           "use_dict_encoding": str(Settings.use_dict_encoding).lower(),
                           "compression": Settings.compression,
                           "partition_cols": str(Settings.partition_cols),
                           "load_id": dataset_schema.load_id[dataset][load_id_] - 1,
                           "Dataset size": atomic_load_count,
                           "duration": end_time - start_time}
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        parquet_creation_timing = pd.concat(
            [parquet_creation_timing, pd.DataFrame(creation_timing, index=[0])],
            ignore_index=True)

    parquet_creation_timing.to_csv("C:\\Users\\micha\\Desktop\\SIMM\\output\\"
                                   "parquet_toS3_timing_" + dataset_sufix +
                                   "_" + now + ".csv")
class Download():
    def __init__(self):
        self.name = 'Ben'

    def download_data(self):
        return True

    def transform_json(self):
        return True


db = DynamoConn()
d = Download()
s3 = S3()
S3FS = s3fs.S3FileSystem()


def handler(event, context):
    data = {}
    for dataset_item in db.get_all():
        # print(dataset_item)
        dataset = dataset_item['dataset']
        columns = dataset_item['columns']
        map_type = dataset_item['map_type']
        if map_type != 'arc' and dataset != '' and dataset == 'lehd_rac':
            print(dataset, columns)
            paths = ['bnroths/chicago-data/%s' % dataset]
def __init__(self, json_path):
    "Initialization"
    super().__init__(json_path)

    if "from_s3" in self.json_data.keys():
        self.from_s3 = self.json_data["from_s3"]
    else:
        self.from_s3 = False

    self.raw_data_file = self.json_data["movie_path"]
    self.batch_size = self.json_data["batch_size"]
    self.pre_frame = self.json_data["pre_frame"]
    self.post_frame = self.json_data["post_frame"]
    self.start_frame = self.json_data["start_frame"]

    # This is compatible with negative frames
    self.end_frame = self.json_data["end_frame"]

    # This is used to limit the total number of samples;
    # -1 means take all and is the default fallback
    if "total_samples" in self.json_data.keys():
        self.total_samples = self.json_data["total_samples"]
    else:
        self.total_samples = -1

    if self.from_s3:
        s3_filesystem = s3fs.S3FileSystem()
        raw_data = h5py.File(
            s3_filesystem.open(self.raw_data_file, 'rb'), 'r')['data']
    else:
        raw_data = h5py.File(self.raw_data_file, "r")["data"]

    self.total_frame_per_movie = int(raw_data.shape[0])

    if self.end_frame < 0:
        self.img_per_movie = (self.total_frame_per_movie + 1 +
                              self.end_frame - self.start_frame -
                              self.post_frame)
    elif self.total_frame_per_movie < self.end_frame:
        self.img_per_movie = (self.total_frame_per_movie -
                              self.start_frame - self.post_frame)
    else:
        self.img_per_movie = self.end_frame + 1 - self.start_frame

    average_nb_samples = 1000
    local_data = raw_data[0:average_nb_samples, :, :].flatten()
    local_data = local_data.astype("float32")
    self.local_mean = np.mean(local_data)
    self.local_std = np.std(local_data)

    self.list_samples = np.arange(self.start_frame,
                                  self.start_frame + self.img_per_movie)

    if "randomize" in self.json_data.keys():
        self.randomize = self.json_data["randomize"]
    else:
        self.randomize = 1

    if self.randomize:
        np.random.shuffle(self.list_samples)

    # We cut the number of samples if asked to
    if self.total_samples > 0 and self.total_samples < len(self.list_samples):
        self.list_samples = self.list_samples[0:self.total_samples]
def __init__(self, bucket, directory):
    self.bucket_uri = '%s/%s/visits/%%s' % (bucket, directory)
    self.s3_fs = s3fs.S3FileSystem(session=LocalS3Session())
def main():
    # Load configuration
    args = parser.parse_args()

    # Torch stuff
    torch.cuda.set_device(args.rank)
    cudnn.benchmark = True

    # Create model by loading a snapshot
    body, head, cls_state = load_snapshot(args.snapshot)
    model = SegmentationModule(body, head, 256, 65, args.fusion_mode)
    model.cls.load_state_dict(cls_state)
    model = model.cuda().eval()

    # Create data loader
    transformation = SegmentationTransform(
        2048,
        (0.41738699, 0.45732192, 0.46886091),
        (0.25685097, 0.26509955, 0.29067996),
    )
    dataset = S3Dataset(args.data, transformation)
    data_loader = DataLoader(dataset,
                             batch_size=1,
                             pin_memory=True,
                             sampler=DistributedSampler(dataset, args.world_size, args.rank),
                             num_workers=2,
                             collate_fn=segmentation_collate,
                             shuffle=False)

    s3 = s3fs.S3FileSystem(
        s3_additional_kwargs={'ServerSideEncryption': 'AES256'})

    # Run testing
    scales = eval(args.scales)
    with torch.no_grad():
        for batch_i, rec in enumerate(data_loader):
            print("Testing batch [{:3d}/{:3d}]".format(batch_i + 1, len(data_loader)))

            img = rec["img"].cuda(non_blocking=True)
            probs, preds = model(img, scales, args.flip)

            for i, (prob, pred) in enumerate(
                    zip(torch.unbind(probs, dim=0), torch.unbind(preds, dim=0))):
                out_size = rec["meta"][i]["size"]
                img_name = rec["meta"][i]["idx"]

                # Save prediction
                prob = prob.cpu()
                pred = pred.cpu()
                pred_img = get_pred_image(pred, out_size, args.output_mode == "palette")
                save_img_to_s3(pred_img, path.join(args.output, img_name + ".png"), s3)

                # Optionally save probabilities
                if args.output_mode == "prob":
                    prob_img = get_prob_image(prob, out_size)
                    save_img_to_s3(prob_img,
                                   path.join(args.output, img_name + "_prob.png"), s3)
def get_scene_list(lon: float,
                   lat: float,
                   start_date: Union[dt.date, dt.datetime],
                   end_date: Union[dt.date, dt.datetime],
                   what: Union[str, Iterable[str]],
                   cloud_cover_le: float = 50,
                   use_ssl: bool = True,
                   also: Optional[List[str]] = None) -> List[str]:
    """
    Returns the scene list of a given location.

    Parameters
    ----------
    lon: float
        Float value defining the longitude of interest.
    lat: float
        Float value defining the latitude of interest.
    start_date: datetime.date or datetime.datetime
        Date to start looking for images to download.
    end_date: datetime.date or datetime.datetime
        Date to end looking for images to download.
    what: str or array_like
        Here you have to define what you want to download as a string or as
        an array_like of strings. Valid values are:
            'TCI', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08',
            'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL'
    cloud_cover_le: float
        Float indicating the maximum cloud cover allowed. If the value is 10
        it indicates the allowed cloud cover on the image must be lower than
        or equal to 10%. Default value is 50 (%).
    use_ssl: bool
        Whether to use SSL for the anonymous S3 connection. Defaults to True.
    also: list or None
        A list detailing if you want to download other COG files in the
        borders. Valid values are 'N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'.
        See below, where 'X' is the original target.

            +-----+-----+-----+
            | NW  |  N  | NE  |
            |     |     |     |
            +-----+-----+-----+
            |     |     |     |
            |  W  |  X  |  E  |
            +-----+-----+-----+
            |     |     |     |
            | SW  |  S  | SE  |
            +-----+-----+-----+
    """
    _also = {
        "N": {"x": 0, "y": 150_000},
        "NE": {"x": 150_000, "y": 150_000},
        "E": {"x": 150_000, "y": 0},
        "SE": {"x": 150_000, "y": -150_000},
        "S": {"x": 0, "y": -150_000},
        "SW": {"x": -150_000, "y": -150_000},
        "W": {"x": -150_000, "y": 0},
        "NW": {"x": -150_000, "y": 150_000},
    }

    if start_date > end_date:
        raise ValueError(
            "`start_date` has to be lower than or equal to `end_date`")

    if isinstance(what, str):
        what = [what]
    for w in what:
        if w.upper() not in [item.value for item in Properties]:
            raise ValueError(f"{w} is not a valid product")

    fs = s3fs.S3FileSystem(anon=True, use_ssl=use_ssl)

    start_date = dt.date(start_date.year, start_date.month, start_date.day)
    end_date = dt.date(end_date.year, end_date.month, end_date.day)

    rpaths = []
    path: Union[str, Path]

    m = mgrs.MGRS()

    # Get the remote and local paths for the original target
    coord = m.toMGRS(lat, lon, MGRSPrecision=0)
    number, a, b = coord[:-3], coord[-3:-2], coord[-2:]

    def check_tile(_c):
        name = _c.split("/")[-1]
        info = _c + "/" + name + ".json"
        with fs.open(info, "r") as f:
            info = json.load(f)
        date_str = name.split("_")[2]
        cc = info["properties"]["eo:cloud_cover"]
        date = dt.datetime.strptime(date_str, "%Y%m%d").date()
        if cloud_cover_le >= cc and start_date <= date <= end_date:
            package = []
            for w in what:
                package.append(str(_c + f"/{w}.tif"))
            rpaths.append(tuple(package))

    def check_package(path):
        _contents = fs.ls(path)
        with ThreadPoolExecutor() as exe:
            for _c in _contents:
                exe.submit(check_tile, _c)

    with ThreadPoolExecutor() as ex:
        for yy, mm in _iter_dates(start_date, end_date):
            path = f"sentinel-cogs/sentinel-s2-l2a-cogs/{number}/{a}/{b}/{yy}/{mm}"
            ex.submit(check_package, path)

    # Get the remote and local paths for the adjacent COGs to the target,
    # if required
    # TODO (josep) make it threaded as before
    if also is None:
        also = []
    for al in also:
        al = al.upper()
        if al not in list(_also.keys()):
            raise ValueError(f'"{al}" is not a valid value for `also` keyword')
        z, hem, x, y = m.MGRSToUTM(coord)
        x += _also[al]["x"]
        y += _also[al]["y"]
        _coord = m.UTMToMGRS(z, hem, x, y, MGRSPrecision=0)
        number, a, b = _coord[:-3], _coord[-3:-2], _coord[-2:]
        for yy, mm in _iter_dates(start_date, end_date):
            path = ("sentinel-cogs/sentinel-s2-l2a-cogs/"
                    f"{number}/{a}/{b}/{yy}/{mm}")
            _contents = fs.ls(path)
            for _c in _contents:
                name = _c.split("/")[-1]
                info = _c + "/" + name + ".json"
                with fs.open(info, "r") as f:
                    info = json.load(f)
                date_str = name.split("_")[2]
                cc = info["properties"]["eo:cloud_cover"]
                date = dt.datetime.strptime(date_str, "%Y%m%d").date()
                if cloud_cover_le >= cc and start_date <= date <= end_date:
                    package = []
                    for w in what:
                        package.append(str(_c + f"/{w}.tif"))
                    rpaths.append(tuple(package))

    if not rpaths:
        raise Exception('No data found')

    return rpaths
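# Hedged usage sketch for get_scene_list(), using valid `what` and `also`
# values from the docstring; the coordinates and date range are hypothetical
# placeholders.
import datetime as dt

scenes = get_scene_list(
    lon=-105.0, lat=39.7,            # hypothetical location
    start_date=dt.date(2020, 6, 1),
    end_date=dt.date(2020, 6, 30),
    what=["TCI", "B04"],
    cloud_cover_le=20,
    also=["N", "E"],
)
print(len(scenes), "matching scene packages")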
import os
import shutil
import tempfile
import zipfile

import loompy
import numpy
import pandas
import requests
import s3fs
import scipy
import zarr

from matrix.common.etl import get_dss_client

S3 = s3fs.S3FileSystem(anon=True)


def calculate_ss2_metrics_direct(bundle_fqids):
    """Calculate expected SS2 matrix values.

    Don't use matrices or the matrix service; calculate in a completely
    orthogonal way using the RSEM outputs directly.
    """
    def read_bundle(fqid):
        dss_client = get_dss_client(os.environ['DEPLOYMENT_STAGE'])

        bundle_uuid, bundle_version = fqid.split(".", 1)
        bundle = dss_client.get_bundle(uuid=bundle_uuid,
                                       version=bundle_version,
                                       replica="aws")
import s3fs as _s3fs  # assumed alias, inferred from the `_s3fs` usage below


def s3_file_exists(filepath):
    s3 = _s3fs.S3FileSystem(anon=False)
    return s3.exists(filepath)
# -------------------------------------------------------------------------------------------------------
# run it!

if __name__ == "__main__":
    print(40 * "*", "opasFileSupport Tests", 40 * "*")
    print("Running in Python %s" % sys.version_info[0])

    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    print("Fini. opasFileSupport Tests complete.")
    sys.exit()

    # test S3FileSystem (unreachable after sys.exit(); kept for manual runs)
    remfs = s3fs.S3FileSystem(anon=False,
                              key=localsecrets.S3_KEY,
                              secret=localsecrets.S3_SECRET)
    #fs.ls("embedded-graphics")
    filename_and_path = "pep-web-files/doc/g/BAP.01.0004.FIG001.jpg"
    try:
        if remfs.ls(filename_and_path) != []:  # exists
            with remfs.open(filename_and_path, mode='rb') as f:  # doctest: +SKIP
                image_bytes = f.read()
            print(image_bytes)
    except Exception as e:
        print(f"Error: {e}")
def copy_s3_file(s3_source_file, s3_target_file):
    s3 = _s3fs.S3FileSystem(anon=False)
    s3.cp(s3_source_file, s3_target_file)
def upload_df_to_s3(df, s3_path):
    s3 = s3fs.S3FileSystem(anon=False)
    with s3.open(s3_path, 'w') as f:
        df.to_csv(f)
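# Hedged usage sketch: write a DataFrame through the helper above, then read
# it back; the bucket and key are hypothetical placeholders.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
upload_df_to_s3(frame, "s3://mybucket/exports/frame.csv")
round_trip = pd.read_csv("s3://mybucket/exports/frame.csv", index_col=0)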
def read_parquet(fs: Union[s3fs.S3FileSystem, None],
                 path: str = None,
                 fallback_path: str = None,
                 columns: List[str] = None,
                 partition_filters: Union[List[Tuple[str]], None] = None,
                 non_partition_filters: Union[List[Tuple], List[List[Tuple]], None] = None) -> pd.DataFrame:
    """
    Read a parquet file from S3 into pandas. Accepts partition filters
    without overhead and automatically synchronizes to the local filesystem
    prior to reading if fs is None.

    :param fs: Filesystem. s3fs.S3FileSystem instance, or None if executing
        locally.
    :param path: path to the parquet folder.
    :param fallback_path: In case you perform a local execution and the path
        does not exist on your machine, fallback s3 path from which it will
        be copied.
    :param columns: List[str]
        Names of columns to read from the file.
    :param partition_filters: List[Tuple[str]] or None (default)
        One list element per filter. Each tuple contains 1) the column to
        filter for and 2) the value to filter on. List element order matters.
    :param non_partition_filters: List[Tuple] or List[List[Tuple]] or None (default)
        List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This
        implements partition-level (hive) filtering only, i.e., to prevent
        the loading of some files of the dataset.

        Predicates are expressed in disjunctive normal form (DNF). This means
        that the innermost tuples describe a single column predicate. These
        inner predicates are combined with a conjunction (AND) into a larger
        predicate. The outermost list then combines all filters with a
        disjunction (OR). By this, we should be able to express all kinds of
        filters that are possible using boolean logic.

        This function also supports passing in List[Tuple]. These predicates
        are evaluated as a conjunction. To express OR in predicates, one must
        use the (preferred) List[List[Tuple]] notation.
    :return: pandas.DataFrame
    """
    original_fallback_path = copy(fallback_path)
    if partition_filters:
        for filter in partition_filters:
            path += '/{}={}'.format(filter[0], filter[1])
            if fallback_path:
                fallback_path += '/{}={}'.format(filter[0], filter[1])

    if not fs and not os.path.exists(path):
        assert fallback_path, (
            'local execution is turned on and {} does not exist on machine, '
            'but fallback_path has not been set'.format(path))
        print('{} does not exist on local machine, so it\'ll be copied from s3'.format(path))
        ensure_path(path, del_if_exists=False, include_last=True)
        s3 = s3fs.S3FileSystem(anon=False)
        files = s3.ls(fallback_path, detail=False)
        if len(files) == 0:
            table_exists = len(s3.ls(original_fallback_path, detail=False)) > 0
            if table_exists:
                raise AssertionError(
                    'The table {} exists, but there are no data for the selected {} partition filters'
                    .format(original_fallback_path, partition_filters))
            else:
                raise AssertionError(
                    'The table {} does not exist'.format(original_fallback_path))
        for file in files:
            print('fetching ', file)
            s3.get(file, os.path.join(path, file.split('/')[-1]))

    print('Loading board from', path)
    non_partition_filters_columns = (
        [filter[0] for filter in non_partition_filters] if non_partition_filters else [])
    df = pq.ParquetDataset(path, filesystem=fs, filters=non_partition_filters) \
        .read_pandas(columns=list(set(columns + non_partition_filters_columns))
                     if columns else None).to_pandas()

    # WARNING: the filters argument is interfaced as of now but not actually
    # implemented, so non-partition filters are applied here after the read
    if non_partition_filters:
        for column, evaluator, value in non_partition_filters:
            if evaluator == '=':
                df = df[df[column] == value]
            else:
                raise NotImplementedError(
                    "{} filter condition not implemented. I suggest you "
                    "implement it now. It's not that hard".format(evaluator))
    if columns:
        df = df[columns]
    assert df.shape[0] > 0, (
        'The table {} exists, and there are data for the selected partition filters, '
        'but there are no data for the selected {} non-partition filters'
        .format(original_fallback_path, non_partition_filters))
    return df
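# Hedged usage sketch for read_parquet(); the bucket, table path, and column
# names are hypothetical placeholders.
import s3fs

fs = s3fs.S3FileSystem(anon=False)
events = read_parquet(
    fs,
    path='mybucket/tables/events',
    columns=['user_id', 'value'],
    partition_filters=[('year', '2020'), ('month', '01')],
    non_partition_filters=[('value', '=', 42)],
)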
def execute(queue_url, message_body, receipt_handle):
    logger.info("Message received")
    queue = get_queue(queue_url)
    msg = queue.Message(receipt_handle)
    try:
        # Parse and validate incoming message
        validate_message(message_body)
        body = json.loads(message_body)
        session = get_session(body.get("RoleArn"))
        client = session.client("s3")
        query_bucket, query_key, object_path, job_id, file_format = itemgetter(
            "QueryBucket", "QueryKey", "Object", "JobId", "Format")(body)
        # Fetch the query manifest via the resource API (assumption: the
        # original `s3_client` referenced a boto3 S3 resource)
        s3_resource = session.resource("s3")
        obj = s3_resource.Object(query_bucket, query_key)
        raw_data = obj.get()['Body'].read().decode('utf-8')
        data = json.loads(raw_data)
        cols = data["Columns"]
        input_bucket, input_key = parse_s3_url(object_path)
        validate_bucket_versioning(client, input_bucket)
        creds = session.get_credentials().get_frozen_credentials()
        s3 = s3fs.S3FileSystem(
            key=creds.access_key,
            secret=creds.secret_key,
            token=creds.token,
            default_cache_type="none",
            requester_pays=True,
            default_fill_cache=False,
            version_aware=True,
        )
        # Download the object in-memory and convert to PyArrow NativeFile
        logger.info("Downloading and opening %s object in-memory", object_path)
        with s3.open(object_path, "rb") as f:
            source_version = f.version_id
            logger.info("Using object version %s as source", source_version)
            # Write new file in-memory
            compressed = object_path.endswith(".gz")
            out_sink, stats = delete_matches_from_file(f, cols, file_format,
                                                       compressed)
        if stats["DeletedRows"] == 0:
            raise ValueError(
                "The object {} was processed successfully but no rows required deletion"
                .format(object_path))
        with pa.BufferReader(out_sink.getvalue()) as output_buf:
            new_version = save(s3, client, output_buf, input_bucket, input_key,
                               source_version)
        logger.info("New object version: %s", new_version)
        verify_object_versions_integrity(client, input_bucket, input_key,
                                         source_version, new_version)
        if body.get("DeleteOldVersions"):
            logger.info("Deleting object {} versions older than version {}".format(
                input_key, new_version))
            delete_old_versions(client, input_bucket, input_key, new_version)
        msg.delete()
        emit_deletion_event(body, stats)
    except (KeyError, ArrowException) as e:
        err_message = "Apache Arrow processing error: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IOError as e:
        err_message = "Unable to retrieve object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except MemoryError as e:
        err_message = "Insufficient memory to work on object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except ClientError as e:
        err_message = "ClientError: {}".format(str(e))
        if e.operation_name == "PutObjectAcl":
            err_message += ". Redacted object uploaded successfully but unable to restore WRITE ACL"
        if e.operation_name == "ListObjectVersions":
            err_message += ". Could not verify redacted object version integrity"
        handle_error(msg, message_body, err_message)
    except ValueError as e:
        err_message = "Unprocessable message: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except DeleteOldVersionsError as e:
        err_message = "Unable to delete previous versions: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IntegrityCheckFailedError as e:
        err_description, client, bucket, key, version_id = e.args
        err_message = "Object version integrity check failed: {}".format(err_description)
        handle_error(msg, message_body, err_message)
        rollback_object_version(
            client,
            bucket,
            key,
            version_id,
            on_error=lambda err: handle_error(None, "{}", err,
                                              "ObjectRollbackFailed", False),
        )
    except Exception as e:
        err_message = "Unknown error during message processing: {}".format(str(e))
        handle_error(msg, message_body, err_message)
def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
    """
    Sets up an S3 bucket with contents.

    The primary bucket name is "pandas-test". The following datasets are
    loaded:

    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl

    A private bucket "cant_get_it" is also created. The boto3 s3 resource is
    yielded by the fixture.
    """
    import boto3
    import s3fs

    test_s3_files = [
        ("tips#1.csv", tips_file),
        ("tips.csv", tips_file),
        ("tips.csv.gz", tips_file + ".gz"),
        ("tips.csv.bz2", tips_file + ".bz2"),
        ("items.jsonl", jsonl_file),
        ("simple_dataset.feather", feather_file),
    ]

    def add_tips_files(bucket_name):
        for s3_key, file_name in test_s3_files:
            with open(file_name, "rb") as f:
                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)

    bucket = "pandas-test"
    conn = boto3.resource("s3", endpoint_url=s3_base)
    cli = boto3.client("s3", endpoint_url=s3_base)

    try:
        cli.create_bucket(Bucket=bucket)
    except Exception:
        # OK if bucket already exists
        pass
    try:
        cli.create_bucket(Bucket="cant_get_it", ACL="private")
    except Exception:
        # OK if bucket already exists
        pass
    timeout = 2
    while not cli.list_buckets()["Buckets"] and timeout > 0:
        time.sleep(0.1)
        timeout -= 0.1

    add_tips_files(bucket)
    add_tips_files("cant_get_it")
    s3fs.S3FileSystem.clear_instance_cache()
    yield conn

    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})

    try:
        s3.rm(bucket, recursive=True)
    except Exception:
        pass
    try:
        s3.rm("cant_get_it", recursive=True)
    except Exception:
        pass
    timeout = 2
    while cli.list_buckets()["Buckets"] and timeout > 0:
        time.sleep(0.1)
        timeout -= 0.1
def build_matrix(self, as_of_times, label_name, label_type, feature_dictionary,
                 matrix_directory, matrix_metadata, matrix_uuid, matrix_type):
    """ Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
        to be included in the matrix
    :param matrix_directory: the directory in which to store the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_directory: str
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info('popped matrix %s build off the queue', matrix_uuid)
    matrix_filename = os.path.join(matrix_directory, '{}.csv'.format(matrix_uuid))

    # The output directory is either local or in s3
    path_parsed = urlparse(matrix_filename)
    scheme = path_parsed.scheme  # '' or 'file' means a regular file; 's3' means s3

    if scheme in ('', 'file'):
        if not self.replace and os.path.exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    elif scheme == 's3':
        if not self.replace and s3fs.S3FileSystem().exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    else:
        raise ValueError(f"URL scheme not supported: {scheme} (from {matrix_filename})")

    logging.info('Creating matrix %s > %s',
                 matrix_metadata['matrix_id'], matrix_filename)

    # make the entity time table and query the labels and features tables
    logging.info('Making entity date table for matrix %s', matrix_uuid)
    entity_date_table_name = self.make_entity_date_table(
        as_of_times, label_name, label_type, matrix_metadata['state'],
        matrix_type, matrix_uuid, matrix_metadata['label_timespan'])

    logging.info('Extracting feature group data from database into file '
                 'for matrix %s', matrix_uuid)
    features_csv_names = self.write_features_data(as_of_times,
                                                  feature_dictionary,
                                                  entity_date_table_name,
                                                  matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")
    try:
        logging.info('Extracting label data from database into file for '
                     'matrix %s', matrix_uuid)
        labels_csv_name = self.write_labels_data(
            label_name, label_type, entity_date_table_name, matrix_uuid,
            matrix_metadata['label_timespan'])
        features_csv_names.insert(0, labels_csv_name)
        logging.info(f"Label data extracted for matrix {matrix_uuid}")

        # stitch together the csvs
        logging.info('Merging feature files for matrix %s', matrix_uuid)
        output = self.merge_feature_csvs(features_csv_names,
                                         matrix_directory, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")
    finally:
        # clean up files and database before finishing
        for csv_name in features_csv_names:
            self.remove_file(csv_name)
    try:
        # store the matrix
        logging.info('Archiving matrix %s with metta', matrix_uuid)
        metta.archive_matrix(matrix_config=matrix_metadata,
                             df_matrix=output,
                             overwrite=True,
                             directory=self.matrix_directory,
                             format='csv')
        logging.info(f"Matrix {matrix_uuid} archived (using metta)")

        # If completely archived, save its information to the matrices table.
        # At this point, the existence of the matrix has already been tested,
        # so there is no need to delete from the db.
        if matrix_type == 'train':
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(matrix_id=matrix_metadata["matrix_id"],
                        matrix_uuid=matrix_uuid,
                        matrix_type=matrix_type,
                        labeling_window=matrix_metadata["label_timespan"],
                        n_examples=len(output),
                        lookback_duration=lookback,
                        feature_start_time=matrix_metadata["feature_start_time"],
                        matrix_metadata=json.dumps(matrix_metadata,
                                                   sort_keys=True, default=str))
        session = self.sessionmaker()
        session.add(matrix)
        session.commit()
        session.close()
    finally:
        if isinstance(output, str):
            os.remove(output)
def client(self) -> S3FileSystem:
    s3 = s3fs.S3FileSystem(key=self.access_key,
                           secret=self.secret_key,
                           client_kwargs={'region_name': self.aws_region})
    return s3
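# Hedged usage sketch for the client() accessor above; the owning class name
# and its attribute values are hypothetical placeholders.
#
#   conn = S3Config(access_key="AKIA...", secret_key="...",
#                   aws_region="eu-west-1")
#   conn.client().ls("mybucket")   # region-scoped, credentialed listing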
# imports inferred from the snippet's usage
import sys
import pprint
import tempfile
from re import match

import boto3
import s3fs
from rasterio.io import MemoryFile

sys.path.append("../model/robosat_pink/")
from robosat_pink.config import load_config

# original with 5/28
#config_location = '/home/ubuntu/planet-snowcover/experiments/co-train.toml'

# revised with neighboring watershed
#config_location = '/home/ubuntu/planet-snowcover/experiments/co-train-neigh.toml'

config_location = '/home/ubuntu/planet-snowcover/experiments/co-train-veg-colo-validate.toml'

config = load_config(config_location)
p = pprint.PrettyPrinter()

fs = s3fs.S3FileSystem(session=boto3.Session(
    profile_name=config['dataset']['aws_profile']))

imagery_searchpath = config['dataset']['image_bucket'] + '/' + \
    config['dataset']['imagery_directory_regex']
print("Searching for imagery...({})".format(imagery_searchpath))
imagery_candidates = fs.ls(config['dataset']['image_bucket'])
#print("candidates:")
#p.pprint(imagery_candidates)
imagery_locs = [c for c in imagery_candidates if match(imagery_searchpath, c)]
print("result:")
p.pprint(imagery_locs)

#get_ipython().system('export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt')
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector,
             hdfs_driver='libhdfs3', user=None):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs
       before using GCS.
    6. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS. None implies login user.
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError(
            'ERROR! A scheme-less dataset url ({}) is no longer supported. '
            'Please prepend "file://" for local filesystem.'.format(
                self._parsed_dataset_url.scheme))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs
        self._filesystem_factory = lambda: pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':
        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself, so we do it
            # manually. This is not necessary if using libhdfs.

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it
            # references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which
                # doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(
                        namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                        namenodes, user=user)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(
                        self._parsed_dataset_url, user=user)
                    self._filesystem_factory = \
                        lambda url=self._dataset_url, user=user: \
                        connector.hdfs_connect_namenode(urlparse(url), user=user)
            else:
                # Case 3b: No netloc, so let's try to connect to the default
                # namenode. HdfsNamenodeResolver will raise an exception if it
                # fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                    namenodes, user=user)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once the default
                    # namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(
                self._parsed_dataset_url, hdfs_driver, user=user)
            self._filesystem_factory = \
                lambda url=self._dataset_url, user=user: \
                connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)

    elif self._parsed_dataset_url.scheme == 's3':
        # Case 4: S3 support requires s3fs to be installed
        try:
            import s3fs
        except ImportError:
            raise ValueError('Must have s3fs installed in order to use datasets on s3. '
                             'Please install s3fs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form s3://bucket/path')

        fs = s3fs.S3FileSystem()
        self._filesystem = pyarrow.filesystem.S3FSWrapper(fs)
        self._filesystem_factory = lambda: pyarrow.filesystem.S3FSWrapper(
            s3fs.S3FileSystem())

    elif self._parsed_dataset_url.scheme in ['gs', 'gcs']:
        # Case 5: GCS support requires gcsfs to be installed
        try:
            import gcsfs
        except ImportError:
            raise ValueError('Must have gcsfs installed in order to use datasets on GCS. '
                             'Please install gcsfs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form gs://bucket/path '
                             'or gcs://bucket/path')

        fs = gcsfs.GCSFileSystem()
        self._filesystem = GCSFSWrapper(fs)
        self._filesystem_factory = lambda: GCSFSWrapper(gcsfs.GCSFileSystem())

    else:
        # Case 6
        raise ValueError(
            'Unsupported scheme in dataset url {}. Currently, only "file", "hdfs", '
            '"s3", "gs", and "gcs" are supported.'.format(
                self._parsed_dataset_url.scheme))
import os
import tempfile

import pandas as pd
import s3fs
import tensorflow as tf

__all__ = 'from_tfrecords',

s3_fs = s3fs.S3FileSystem()


def from_tfrecords(file_paths, schema=None, compression_type='auto', cast=True):
    file_paths = list(_normalize(file_paths))
    if compression_type == 'auto':
        compression_type = _get_compress_type(file_paths[0])

    dataset = tf.data.TFRecordDataset(file_paths,
                                      compression_type=compression_type)
    if schema:
        features, feature_lists = parse_schema(schema)
    else:
        features, feature_lists = detect_schema(dataset)

    if feature_lists:
        parser = read_sequence_example(features, feature_lists)
def open_s3fs_connection():
    s3 = s3fs.S3FileSystem()
    return s3
def save_data(run_all=False):
    S3FS = s3fs.S3FileSystem()
    s3 = S3()
    db = DynamoConn()
    dates = {}
    datasets = db.get_datasets()
    for dataset in datasets:
        # print(dataset)
        if datasets[dataset]['source'] == 'Plenario':
            today = datetime.datetime.today().date()
            date_list = set([today.strftime('%Y-%m')])
            date_list.add((today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            date_list = sorted(list(set(
                [(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                 for x in range(32)])))
            paths = []
            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}
            else:
                for month in date_list:
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print(paths)
                cnts = datasets[dataset]['cnts']
            # exit(0)
            print(paths)
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)
                columns = datasets[dataset]['columns']
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print(df.columns)
                print(df.head())
                df['dt'] = df[dt].astype(str).str[:7]
                dts = []
                groups = dict(list(df.groupby('dt')))
                print(groups.keys())
                # exit(0)
                for group in groups:
                    print(group)
                    year, month = group.split('-')
                    a = groups[group][['longitude', 'latitude']].to_json(orient='values')
                    cnts[group] = groups[group].count()[0]
                    dts.append(group)
                    filename = '../data/%s/%s-%s/all.json' % (dataset, year, month)
                    if not os.path.exists(os.path.dirname(filename)):
                        try:
                            os.makedirs(os.path.dirname(filename))
                        except OSError as exc:  # guard against race condition
                            if exc.errno != errno.EEXIST:
                                raise
                    with open(filename, 'w') as f:
                        f.write(a)

                    # write to s3
                    s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                              (dataset, year, month),
                                        dataset=dataset,
                                        dt="%s-%s" % (year, month),
                                        filename='all.json')
            db.update_col(dataset=dataset, col='cnts', update=json.dumps(cnts))
import pickle

import pandas as pd
import s3fs
from kafka import KafkaProducer

brokerlist = ('ec2-54-186-208-110.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-11-172-126.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-88-204-111.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-35-101-204.us-west-2.compute.amazonaws.com:9092')
producer = KafkaProducer(bootstrap_servers=brokerlist)

#
# Read the file in, iterate over events and publish
#

# 1. Get the GDELT field names from a helper file
colnames = pd.read_excel('CSV.header.fieldids.xlsx',
                         sheet_name='Sheet1',
                         index_col='Column ID',
                         usecols=1)['Field Name']

# 2. Read the events into a dataframe (pandas uses s3fs under the hood)
fs = s3fs.S3FileSystem(anon=False)
df_events = pd.read_csv('s3://gdelt-open-data/events/20180730.export.csv',
                        sep='\t',
                        low_memory=False,
                        header=None,
                        dtype=str,
                        names=colnames,
                        index_col=['GLOBALEVENTID'])

cnt = 0
for index, row in df_events.iterrows():
    topic = str(row["Actor1Geo_CountryCode"])
    if topic == 'US':
        sendmsg = pickle.dumps(row)
        producer.send(topic, sendmsg)
import sys, os, json

sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend')
sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend/aws')

from s3 import S3
from dynamo import DynamoConn

import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import s3fs
import decimal
from time import time
import requests as r
import geopandas as gpd
from shapely.geometry import mapping, shape
from config import cook_tracts, chicago_tracts, msa_tracts

arrow_s3fs = s3fs.S3FileSystem()
s3 = S3()
d = DynamoConn()

boundaries = {
    # 'chicago-zillow-opposite': None,
    'chicago': chicago_tracts,
}

stats = {
    'S000': 'total_jobs',
    # 'SA01': 'age_group_1',
    # 'SA02': 'age_group_2',
    # 'SA03': 'age_group_3',
    # 'SE01': 'salary_group_1',
    # 'SE02': 'salary_group_2',