import os
from urllib.parse import urlparse

import s3fs


def exists(path: str) -> bool:
    url = urlparse(path)
    if url.scheme == 's3':
        fs = s3fs.S3FileSystem(anon=False)
        return fs.exists(path)
    return os.path.exists(path)
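# Hedged usage sketch for exists(); both paths below are hypothetical
# placeholders. The same helper dispatches to S3 or the local filesystem.
if exists("s3://mybucket/data/input.csv"):
    print("found on S3")
if exists("/tmp/input.csv"):
    print("found locally")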
#!/usr/bin/env python
import s3fs
import yaml

with open("/etc/backup/backup.yaml", "r") as stream:
    conf = yaml.safe_load(stream)

bucket = conf['s3']['bucket']
aws_key = conf['s3']['key']
aws_secret = conf['s3']['secret']

print("listing bucket %s" % bucket)
s3 = s3fs.S3FileSystem(key=aws_key, secret=aws_secret)
objects = s3.ls(bucket, detail=True)
for obj in objects:
    print("%s size %s date %s" % (obj['Key'], obj['Size'], str(obj['LastModified'])))
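# The script above assumes /etc/backup/backup.yaml has roughly this shape
# (all values are hypothetical placeholders):
#
#   s3:
#     bucket: my-backup-bucket
#     key: AKIAXXXXXXXXXXXXXXXX
#     secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx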
# coding: utf-8

import argparse
import logging
import os
import random
import time

import dgl
import dgl.function as fn
import networkx as nx
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dateutil.parser import parse as dt_parse
from dgl import DGLGraph

s3 = s3fs.S3FileSystem()
device = "cpu"

parser = argparse.ArgumentParser()
parser.add_argument('date_key')
args = parser.parse_args()
import numpy as np
import pandas as pd
import s3fs


def preprocess(s3_in_url, s3_out_bucket, s3_out_prefix, delimiter=","):
    """Preprocesses data based on business logic.

    - Reads the delimited file passed as s3_url and preprocesses the data by
      filtering the long tail in the customer ratings data, i.e. keeps
      customers who have rated 5 or more videos, and videos that have been
      rated by 9+ customers
    - Preprocessed data is then written to output

    Args:
        s3_in_url: s3 url to the delimited file to be processed,
            e.g. s3://amazon-reviews-pds/tsv/reviews.tsv.gz
        s3_out_bucket: s3 bucket where preprocessed data will be staged,
            e.g. mybucket
        s3_out_prefix: s3 url prefix to stage preprocessed data to use later
            in the pipeline, e.g. amazon-reviews-pds/preprocess/
        delimiter: delimiter to be used for parsing the file.
            Defaults to "," if none provided

    Returns:
        status of preprocessed data

    Raises:
        IOError: An error occurred accessing the s3 file
    """
    try:
        print("preprocessing data from {}".format(s3_in_url))
        # read s3 url into pandas dataframe;
        # pandas internally uses s3fs to read the s3 file directly
        # (note: `error_bad_lines` is deprecated in newer pandas in favor
        # of `on_bad_lines="skip"`)
        df = pd.read_csv(s3_in_url, sep=delimiter, error_bad_lines=False)

        # limit dataframe to customer_id, product_id, and star_rating;
        # `product_title` will be useful for validating recommendations
        df = df[['customer_id', 'product_id', 'star_rating', 'product_title']]

        # clean out the long tail because most people haven't seen most
        # videos, and people rate fewer videos than they actually watch
        customers = df['customer_id'].value_counts()
        products = df['product_id'].value_counts()

        # based on data exploration, only about 5% of customers have rated
        # 5 or more videos, and only 25% of videos have been rated by 9+
        # customers
        customers = customers[customers >= 5]
        products = products[products >= 10]

        print("# of rows before the long tail = {:10d}".format(df.shape[0]))
        reduced_df = df \
            .merge(pd.DataFrame({'customer_id': customers.index})) \
            .merge(pd.DataFrame({'product_id': products.index}))
        print("# of rows after the long tail = {:10d}".format(
            reduced_df.shape[0]))
        reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id'])
        print("# of rows after removing duplicates = {:10d}".format(
            reduced_df.shape[0]))

        # recreate customer and product lists since there are customers with
        # more than 5 reviews, but all of their reviews are on products with
        # fewer than 5 reviews (and vice versa)
        customers = reduced_df['customer_id'].value_counts()
        products = reduced_df['product_id'].value_counts()

        # sequentially index each user and item to hold the sparse format
        # where the indices indicate the row and column in our ratings matrix
        customer_index = pd.DataFrame({
            'customer_id': customers.index,
            'customer': np.arange(customers.shape[0])})
        product_index = pd.DataFrame({
            'product_id': products.index,
            'product': np.arange(products.shape[0])})
        reduced_df = reduced_df \
            .merge(customer_index) \
            .merge(product_index)

        nb_customer = reduced_df['customer'].max() + 1
        nb_products = reduced_df['product'].max() + 1
        feature_dim = nb_customer + nb_products
        print(nb_customer, nb_products, feature_dim)

        product_df = reduced_df[['customer', 'product', 'star_rating']]

        # split into train, validation and test data sets
        train_df, validate_df, test_df = np.split(
            product_df.sample(frac=1),
            [int(.6 * len(product_df)), int(.8 * len(product_df))]
        )
        print("# of rows train data set = {:10d}".format(train_df.shape[0]))
        print("# of rows validation data set = {:10d}".format(
            validate_df.shape[0]))
        print("# of rows test data set = {:10d}".format(test_df.shape[0]))

        # select columns required for training the model, excluding
        # "customer_id", "product_id", "product_title" to keep files small
        cols = ["customer", "product", "star_rating"]
        train_df = train_df[cols]
        validate_df = validate_df[cols]
        test_df = test_df[cols]

        # write output to s3 as delimited files
        fs = s3fs.S3FileSystem(anon=False)
        s3_out_prefix = s3_out_prefix[:-1] \
            if s3_out_prefix[-1] == "/" else s3_out_prefix

        s3_out_train = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "train/train.csv")
        print("writing training data to {}".format(s3_out_train))
        with fs.open(s3_out_train, "w") as f:
            train_df.to_csv(f, sep=str(','), index=False)

        s3_out_validate = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "validate/validate.csv")
        print("writing validation data to {}".format(s3_out_validate))
        with fs.open(s3_out_validate, "w") as f:
            validate_df.to_csv(f, sep=str(','), index=False)

        s3_out_test = "s3://{}/{}/{}".format(
            s3_out_bucket, s3_out_prefix, "test/test.csv")
        print("writing test data to {}".format(s3_out_test))
        with fs.open(s3_out_test, "w") as f:
            test_df.to_csv(f, sep=str(','), index=False)

        print("preprocessing completed")
        return "SUCCESS"
    except Exception as e:
        raise e
def __init__(self, client_id, client_secret, user_agent):
    self.reddit = praw.Reddit(client_id=client_id,
                              client_secret=client_secret,
                              user_agent=user_agent)
    self.sia = SIA()
    self.fs = s3fs.S3FileSystem()
import pickle

import s3fs


def save_object_to_s3(model_: object, path: str) -> None:
    fs = s3fs.S3FileSystem(anon=False)
    with fs.open(path, 'wb') as f:
        pickle.dump(model_, f)
    print('Storing to s3:', path)
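# Hedged companion sketch: read the pickled object back with the same
# anon=False filesystem. The helper name and the path in the commented call
# are hypothetical.
def load_object_from_s3(path: str) -> object:
    fs = s3fs.S3FileSystem(anon=False)
    with fs.open(path, 'rb') as f:
        return pickle.load(f)


# model = load_object_from_s3('s3://mybucket/models/model.pkl')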
def get_clip_keys(fs=None):
    if fs is None:
        fs = s3fs.S3FileSystem()
    keys = fs.find('brissonstagram/clips')
    return keys
def write_process(load_id_):
    s3 = s3fs.S3FileSystem(anon=False)
    now = datetime.datetime.now().strftime("%Y%m%dT%H%M")
    dataset_schema = RandomData(Constants.INPUT_DATASET_SCHEMA_DICT, load_id_)
    dataset_schema.read_files()
    athena_client = Athena()
    parquet_creation_timing = pd.DataFrame()

    if Settings.new_parquet:
        dataset_sufix = now.lower()
        create_table = True
    else:
        dataset_sufix = Settings.existing_parquet
        create_table = False

    for dataset, atomic_load_count, data in dataset_schema.generate_random_data_list():
        start_time = datetime.datetime.now()
        print(start_time)
        parquet_table = pa.Table.from_pandas(data)
        s3path = ("s3://simm-poc-s3-athena/simm/"
                  + str(Settings.use_dict_encoding).lower() + "/"
                  + Settings.compression + "/"
                  + "".join(Settings.partition_cols) + "/"
                  + dataset + "_" + dataset_sufix)
        partition_list = []
        for column in Settings.partition_cols:
            partition_list.append((column, data[column].iloc[0]))
        partition_path = "/".join(["=".join(i) for i in partition_list])
        s3partpath = s3path + "/" + partition_path

        if Settings.remove_existing_part and s3.exists(s3partpath):
            s3.rm(s3partpath, recursive=True)
            print("Remove partition: SUCCESS")

        print(partition_path)
        print(atomic_load_count)

        pq.write_to_dataset(parquet_table,
                            s3path,
                            filesystem=s3,
                            partition_cols=Settings.partition_cols,
                            coerce_timestamps="ms",
                            allow_truncated_timestamps=True,
                            use_dictionary=Settings.use_dict_encoding,
                            compression=Settings.compression)
        print("Push to S3: SUCCESS")

        if create_table:
            athena_client.create_table(dataset + "_" + dataset_sufix,
                                       partition_list, s3path)
            print("Create table: SUCCESS")
            create_table = False

        athena_client.refresh_metastore(dataset + "_" + dataset_sufix,
                                        partition_list)
        print("Refresh metastore: SUCCESS")
        end_time = datetime.datetime.now()

        creation_timing = {"Dataset": dataset,
                           "dataset_sufix": dataset_sufix,
                           "use_dict_encoding": str(Settings.use_dict_encoding).lower(),
                           "compression": Settings.compression,
                           "partition_cols": str(Settings.partition_cols),
                           "load_id": dataset_schema.load_id[dataset][load_id_] - 1,
                           "Dataset size": atomic_load_count,
                           "duration": end_time - start_time}
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        parquet_creation_timing = pd.concat(
            [parquet_creation_timing, pd.DataFrame(creation_timing, index=[0])],
            ignore_index=True)

    parquet_creation_timing.to_csv("C:\\Users\\micha\\Desktop\\SIMM\\output\\"
                                   "parquet_toS3_timing_" + dataset_sufix +
                                   "_" + now + ".csv")
class Download():
    def __init__(self):
        self.name = 'Ben'

    def download_data(self):
        return True

    def transform_json(self):
        return True


db = DynamoConn()
d = Download()
s3 = S3()
S3FS = s3fs.S3FileSystem()


def handler(event, context):
    data = {}
    for dataset_item in db.get_all():
        # print(dataset_item)
        dataset = dataset_item['dataset']
        columns = dataset_item['columns']
        map_type = dataset_item['map_type']
        if map_type != 'arc' and dataset != '' and dataset == 'lehd_rac':
            print(dataset, columns)
            paths = ['bnroths/chicago-data/%s' % dataset]
def __init__(self, json_path):
    "Initialization"
    super().__init__(json_path)

    if "from_s3" in self.json_data.keys():
        self.from_s3 = self.json_data["from_s3"]
    else:
        self.from_s3 = False

    self.raw_data_file = self.json_data["movie_path"]
    self.batch_size = self.json_data["batch_size"]
    self.pre_frame = self.json_data["pre_frame"]
    self.post_frame = self.json_data["post_frame"]
    self.start_frame = self.json_data["start_frame"]

    # This is compatible with negative frames
    self.end_frame = self.json_data["end_frame"]

    # This is used to limit the total number of samples;
    # -1 means take all and is the default fallback
    if "total_samples" in self.json_data.keys():
        self.total_samples = self.json_data["total_samples"]
    else:
        self.total_samples = -1

    if self.from_s3:
        s3_filesystem = s3fs.S3FileSystem()
        raw_data = h5py.File(
            s3_filesystem.open(self.raw_data_file, 'rb'), 'r')['data']
    else:
        raw_data = h5py.File(self.raw_data_file, "r")["data"]

    self.total_frame_per_movie = int(raw_data.shape[0])

    if self.end_frame < 0:
        self.img_per_movie = (self.total_frame_per_movie + 1 +
                              self.end_frame - self.start_frame -
                              self.post_frame)
    elif self.total_frame_per_movie < self.end_frame:
        self.img_per_movie = (self.total_frame_per_movie -
                              self.start_frame - self.post_frame)
    else:
        self.img_per_movie = self.end_frame + 1 - self.start_frame

    average_nb_samples = 1000
    local_data = raw_data[0:average_nb_samples, :, :].flatten()
    local_data = local_data.astype("float32")
    self.local_mean = np.mean(local_data)
    self.local_std = np.std(local_data)

    self.list_samples = np.arange(self.start_frame,
                                  self.start_frame + self.img_per_movie)

    if "randomize" in self.json_data.keys():
        self.randomize = self.json_data["randomize"]
    else:
        self.randomize = 1

    if self.randomize:
        np.random.shuffle(self.list_samples)

    # We cut the number of samples if asked to
    if self.total_samples > 0 and self.total_samples < len(self.list_samples):
        self.list_samples = self.list_samples[0:self.total_samples]
def __init__(self, bucket, directory):
    self.bucket_uri = '%s/%s/visits/%%s' % (bucket, directory)
    self.s3_fs = s3fs.S3FileSystem(session=LocalS3Session())
def main():
    # Load configuration
    args = parser.parse_args()

    # Torch stuff
    torch.cuda.set_device(args.rank)
    cudnn.benchmark = True

    # Create model by loading a snapshot
    body, head, cls_state = load_snapshot(args.snapshot)
    model = SegmentationModule(body, head, 256, 65, args.fusion_mode)
    model.cls.load_state_dict(cls_state)
    model = model.cuda().eval()

    # Create data loader
    transformation = SegmentationTransform(
        2048,
        (0.41738699, 0.45732192, 0.46886091),
        (0.25685097, 0.26509955, 0.29067996),
    )
    dataset = S3Dataset(args.data, transformation)
    data_loader = DataLoader(dataset,
                             batch_size=1,
                             pin_memory=True,
                             sampler=DistributedSampler(dataset, args.world_size, args.rank),
                             num_workers=2,
                             collate_fn=segmentation_collate,
                             shuffle=False)

    s3 = s3fs.S3FileSystem(
        s3_additional_kwargs={'ServerSideEncryption': 'AES256'})

    # Run testing
    scales = eval(args.scales)
    with torch.no_grad():
        for batch_i, rec in enumerate(data_loader):
            print("Testing batch [{:3d}/{:3d}]".format(batch_i + 1, len(data_loader)))

            img = rec["img"].cuda(non_blocking=True)
            probs, preds = model(img, scales, args.flip)

            for i, (prob, pred) in enumerate(
                    zip(torch.unbind(probs, dim=0), torch.unbind(preds, dim=0))):
                out_size = rec["meta"][i]["size"]
                img_name = rec["meta"][i]["idx"]

                # Save prediction
                prob = prob.cpu()
                pred = pred.cpu()
                pred_img = get_pred_image(pred, out_size, args.output_mode == "palette")
                save_img_to_s3(pred_img, path.join(args.output, img_name + ".png"), s3)

                # Optionally save probabilities
                if args.output_mode == "prob":
                    prob_img = get_prob_image(prob, out_size)
                    save_img_to_s3(prob_img,
                                   path.join(args.output, img_name + "_prob.png"), s3)
def get_scene_list(lon: float,
                   lat: float,
                   start_date: Union[dt.date, dt.datetime],
                   end_date: Union[dt.date, dt.datetime],
                   what: Union[str, Iterable[str]],
                   cloud_cover_le: float = 50,
                   use_ssl: bool = True,
                   also: Optional[List[str]] = None) -> List[str]:
    """
    Returns the scene list of a given location.

    Parameters
    ----------
    lon: float
        Float value defining the longitude of interest.
    lat: float
        Float value defining the latitude of interest.
    start_date: datetime.date or datetime.datetime
        Date to start looking for images to download.
    end_date: datetime.date or datetime.datetime
        Date to end looking for images to download.
    what: str or array_like
        Here you have to define what you want to download as a string or as
        an array_like of strings. Valid values are:
            'TCI', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08',
            'B8A', 'B09', 'B11', 'B12', 'AOT', 'WVP', 'SCL'
    cloud_cover_le: float
        Float indicating the maximum cloud cover allowed. If the value is 10
        it indicates the allowed cloud cover on the image must be lower than
        or equal to 10%. Default value is 50 (%).
    use_ssl: bool
        Whether to use SSL for the anonymous S3 connection. Defaults to True.
    also: list or None
        A list detailing if you want to download other COG files in the
        borders. Valid values are 'N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'.
        See below, where 'X' is the original target.

            +-----+-----+-----+
            | NW  |  N  | NE  |
            |     |     |     |
            +-----+-----+-----+
            |     |     |     |
            |  W  |  X  |  E  |
            +-----+-----+-----+
            |     |     |     |
            | SW  |  S  | SE  |
            +-----+-----+-----+
    """
    _also = {
        "N": {"x": 0, "y": 150_000},
        "NE": {"x": 150_000, "y": 150_000},
        "E": {"x": 150_000, "y": 0},
        "SE": {"x": 150_000, "y": -150_000},
        "S": {"x": 0, "y": -150_000},
        "SW": {"x": -150_000, "y": -150_000},
        "W": {"x": -150_000, "y": 0},
        "NW": {"x": -150_000, "y": 150_000},
    }

    if start_date > end_date:
        raise ValueError(
            "`start_date` has to be lower than or equal to `end_date`")

    if isinstance(what, str):
        what = [what]
    for w in what:
        if w.upper() not in [item.value for item in Properties]:
            raise ValueError(f"{w} is not a valid product")

    fs = s3fs.S3FileSystem(anon=True, use_ssl=use_ssl)

    start_date = dt.date(start_date.year, start_date.month, start_date.day)
    end_date = dt.date(end_date.year, end_date.month, end_date.day)

    rpaths = []
    path: Union[str, Path]

    m = mgrs.MGRS()

    # Get the remote and local paths for the original target
    coord = m.toMGRS(lat, lon, MGRSPrecision=0)
    number, a, b = coord[:-3], coord[-3:-2], coord[-2:]

    def check_tile(_c):
        name = _c.split("/")[-1]
        info = _c + "/" + name + ".json"
        with fs.open(info, "r") as f:
            info = json.load(f)
        date_str = name.split("_")[2]
        cc = info["properties"]["eo:cloud_cover"]
        date = dt.datetime.strptime(date_str, "%Y%m%d").date()
        if cloud_cover_le >= cc and start_date <= date <= end_date:
            package = []
            for w in what:
                package.append(str(_c + f"/{w}.tif"))
            rpaths.append(tuple(package))

    def check_package(path):
        _contents = fs.ls(path)
        with ThreadPoolExecutor() as exe:
            for _c in _contents:
                exe.submit(check_tile, _c)

    with ThreadPoolExecutor() as ex:
        for yy, mm in _iter_dates(start_date, end_date):
            path = f"sentinel-cogs/sentinel-s2-l2a-cogs/{number}/{a}/{b}/{yy}/{mm}"
            ex.submit(check_package, path)

    # Get the remote and local paths for the adjacent COGs to the target,
    # if required
    # TODO (josep) make it threaded as before
    if also is None:
        also = []
    for al in also:
        al = al.upper()
        if al not in list(_also.keys()):
            raise ValueError(f'"{al}" is not a valid value for `also` keyword')
        z, hem, x, y = m.MGRSToUTM(coord)
        x += _also[al]["x"]
        y += _also[al]["y"]
        _coord = m.UTMToMGRS(z, hem, x, y, MGRSPrecision=0)
        number, a, b = _coord[:-3], _coord[-3:-2], _coord[-2:]
        for yy, mm in _iter_dates(start_date, end_date):
            path = ("sentinel-cogs/sentinel-s2-l2a-cogs/"
                    f"{number}/{a}/{b}/{yy}/{mm}")
            _contents = fs.ls(path)
            for _c in _contents:
                name = _c.split("/")[-1]
                info = _c + "/" + name + ".json"
                with fs.open(info, "r") as f:
                    info = json.load(f)
                date_str = name.split("_")[2]
                cc = info["properties"]["eo:cloud_cover"]
                date = dt.datetime.strptime(date_str, "%Y%m%d").date()
                if cloud_cover_le >= cc and start_date <= date <= end_date:
                    package = []
                    for w in what:
                        package.append(str(_c + f"/{w}.tif"))
                    rpaths.append(tuple(package))

    if not rpaths:
        raise Exception('No data found')

    return rpaths
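# Hedged usage sketch for get_scene_list(), using valid `what` and `also`
# values from the docstring; the coordinates and date range are hypothetical
# placeholders.
import datetime as dt

scenes = get_scene_list(
    lon=-105.0, lat=39.7,            # hypothetical location
    start_date=dt.date(2020, 6, 1),
    end_date=dt.date(2020, 6, 30),
    what=["TCI", "B04"],
    cloud_cover_le=20,
    also=["N", "E"],
)
print(len(scenes), "matching scene packages")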
import os
import shutil
import tempfile
import zipfile

import loompy
import numpy
import pandas
import requests
import s3fs
import scipy
import zarr

from matrix.common.etl import get_dss_client

S3 = s3fs.S3FileSystem(anon=True)


def calculate_ss2_metrics_direct(bundle_fqids):
    """Calculate expected SS2 matrix values.

    Don't use matrices or the matrix service; calculate in a completely
    orthogonal way using the RSEM outputs directly.
    """
    def read_bundle(fqid):
        dss_client = get_dss_client(os.environ['DEPLOYMENT_STAGE'])

        bundle_uuid, bundle_version = fqid.split(".", 1)
        bundle = dss_client.get_bundle(uuid=bundle_uuid,
                                       version=bundle_version,
                                       replica="aws")
import s3fs as _s3fs  # assumed alias, inferred from the `_s3fs` usage below


def s3_file_exists(filepath):
    s3 = _s3fs.S3FileSystem(anon=False)
    return s3.exists(filepath)
# -------------------------------------------------------------------------------------------------------
# run it!

if __name__ == "__main__":
    print(40 * "*", "opasFileSupport Tests", 40 * "*")
    print("Running in Python %s" % sys.version_info[0])

    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    print("Fini. opasFileSupport Tests complete.")
    sys.exit()

    # test S3FileSystem (unreachable after sys.exit(); kept for manual runs)
    remfs = s3fs.S3FileSystem(anon=False,
                              key=localsecrets.S3_KEY,
                              secret=localsecrets.S3_SECRET)
    #fs.ls("embedded-graphics")
    filename_and_path = "pep-web-files/doc/g/BAP.01.0004.FIG001.jpg"
    try:
        if remfs.ls(filename_and_path) != []:  # exists
            with remfs.open(filename_and_path, mode='rb') as f:  # doctest: +SKIP
                image_bytes = f.read()
            print(image_bytes)
    except Exception as e:
        print(f"Error: {e}")
def copy_s3_file(s3_source_file, s3_target_file):
    s3 = _s3fs.S3FileSystem(anon=False)
    s3.cp(s3_source_file, s3_target_file)
def upload_df_to_s3(df, s3_path):
    s3 = s3fs.S3FileSystem(anon=False)
    with s3.open(s3_path, 'w') as f:
        df.to_csv(f)
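# Hedged usage sketch: write a DataFrame through the helper above, then read
# it back; the bucket and key are hypothetical placeholders.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
upload_df_to_s3(frame, "s3://mybucket/exports/frame.csv")
round_trip = pd.read_csv("s3://mybucket/exports/frame.csv", index_col=0)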
def read_parquet(fs: Union[s3fs.S3FileSystem, None],
                 path: str = None,
                 fallback_path: str = None,
                 columns: List[str] = None,
                 partition_filters: Union[List[Tuple[str]], None] = None,
                 non_partition_filters: Union[List[Tuple], List[List[Tuple]], None] = None) -> pd.DataFrame:
    """
    Read a parquet file from S3 into pandas. Accepts partition filters
    without overhead and automatically synchronizes to the local filesystem
    prior to reading if fs is None.

    :param fs: Filesystem. s3fs.S3FileSystem instance, or None if executing
        locally.
    :param path: path to the parquet folder.
    :param fallback_path: In case you perform a local execution and the path
        does not exist on your machine, fallback s3 path from which it will
        be copied.
    :param columns: List[str]
        Names of columns to read from the file.
    :param partition_filters: List[Tuple[str]] or None (default)
        One list element per filter. Each tuple contains 1) the column to
        filter for and 2) the value to filter on. List element order matters.
    :param non_partition_filters: List[Tuple] or List[List[Tuple]] or None (default)
        List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This
        implements partition-level (hive) filtering only, i.e., to prevent
        the loading of some files of the dataset.

        Predicates are expressed in disjunctive normal form (DNF). This means
        that the innermost tuples describe a single column predicate. These
        inner predicates are combined with a conjunction (AND) into a larger
        predicate. The outermost list then combines all filters with a
        disjunction (OR). By this, we should be able to express all kinds of
        filters that are possible using boolean logic.

        This function also supports passing in List[Tuple]. These predicates
        are evaluated as a conjunction. To express OR in predicates, one must
        use the (preferred) List[List[Tuple]] notation.
    :return: pandas.DataFrame
    """
    original_fallback_path = copy(fallback_path)
    if partition_filters:
        for filter in partition_filters:
            path += '/{}={}'.format(filter[0], filter[1])
            if fallback_path:
                fallback_path += '/{}={}'.format(filter[0], filter[1])

    if not fs and not os.path.exists(path):
        assert fallback_path, (
            'local execution is turned on and {} does not exist on machine, '
            'but fallback_path has not been set'.format(path))
        print('{} does not exist on local machine, so it\'ll be copied from s3'.format(path))
        ensure_path(path, del_if_exists=False, include_last=True)
        s3 = s3fs.S3FileSystem(anon=False)
        files = s3.ls(fallback_path, detail=False)
        if len(files) == 0:
            table_exists = len(s3.ls(original_fallback_path, detail=False)) > 0
            if table_exists:
                raise AssertionError(
                    'The table {} exists, but there are no data for the selected {} partition filters'
                    .format(original_fallback_path, partition_filters))
            else:
                raise AssertionError(
                    'The table {} does not exist'.format(original_fallback_path))
        for file in files:
            print('fetching ', file)
            s3.get(file, os.path.join(path, file.split('/')[-1]))

    print('Loading board from', path)
    non_partition_filters_columns = (
        [filter[0] for filter in non_partition_filters] if non_partition_filters else [])
    df = pq.ParquetDataset(path, filesystem=fs, filters=non_partition_filters) \
        .read_pandas(columns=list(set(columns + non_partition_filters_columns))
                     if columns else None).to_pandas()

    # WARNING: the filters argument is interfaced as of now but not actually
    # implemented, so non-partition filters are applied here after the read
    if non_partition_filters:
        for column, evaluator, value in non_partition_filters:
            if evaluator == '=':
                df = df[df[column] == value]
            else:
                raise NotImplementedError(
                    "{} filter condition not implemented. I suggest you "
                    "implement it now. It's not that hard".format(evaluator))
    if columns:
        df = df[columns]
    assert df.shape[0] > 0, (
        'The table {} exists, and there are data for the selected partition filters, '
        'but there are no data for the selected {} non-partition filters'
        .format(original_fallback_path, non_partition_filters))
    return df
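# Hedged usage sketch for read_parquet(); the bucket, table path, and column
# names are hypothetical placeholders.
import s3fs

fs = s3fs.S3FileSystem(anon=False)
events = read_parquet(
    fs,
    path='mybucket/tables/events',
    columns=['user_id', 'value'],
    partition_filters=[('year', '2020'), ('month', '01')],
    non_partition_filters=[('value', '=', 42)],
)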
def execute(queue_url, message_body, receipt_handle):
    logger.info("Message received")
    queue = get_queue(queue_url)
    msg = queue.Message(receipt_handle)
    try:
        # Parse and validate incoming message
        validate_message(message_body)
        body = json.loads(message_body)
        session = get_session(body.get("RoleArn"))
        client = session.client("s3")
        query_bucket, query_key, object_path, job_id, file_format = itemgetter(
            "QueryBucket", "QueryKey", "Object", "JobId", "Format")(body)
        # Fetch the query manifest via the resource API (assumption: the
        # original `s3_client` referenced a boto3 S3 resource)
        s3_resource = session.resource("s3")
        obj = s3_resource.Object(query_bucket, query_key)
        raw_data = obj.get()['Body'].read().decode('utf-8')
        data = json.loads(raw_data)
        cols = data["Columns"]
        input_bucket, input_key = parse_s3_url(object_path)
        validate_bucket_versioning(client, input_bucket)
        creds = session.get_credentials().get_frozen_credentials()
        s3 = s3fs.S3FileSystem(
            key=creds.access_key,
            secret=creds.secret_key,
            token=creds.token,
            default_cache_type="none",
            requester_pays=True,
            default_fill_cache=False,
            version_aware=True,
        )
        # Download the object in-memory and convert to PyArrow NativeFile
        logger.info("Downloading and opening %s object in-memory", object_path)
        with s3.open(object_path, "rb") as f:
            source_version = f.version_id
            logger.info("Using object version %s as source", source_version)
            # Write new file in-memory
            compressed = object_path.endswith(".gz")
            out_sink, stats = delete_matches_from_file(f, cols, file_format,
                                                       compressed)
        if stats["DeletedRows"] == 0:
            raise ValueError(
                "The object {} was processed successfully but no rows required deletion"
                .format(object_path))
        with pa.BufferReader(out_sink.getvalue()) as output_buf:
            new_version = save(s3, client, output_buf, input_bucket, input_key,
                               source_version)
        logger.info("New object version: %s", new_version)
        verify_object_versions_integrity(client, input_bucket, input_key,
                                         source_version, new_version)
        if body.get("DeleteOldVersions"):
            logger.info("Deleting object {} versions older than version {}".format(
                input_key, new_version))
            delete_old_versions(client, input_bucket, input_key, new_version)
        msg.delete()
        emit_deletion_event(body, stats)
    except (KeyError, ArrowException) as e:
        err_message = "Apache Arrow processing error: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IOError as e:
        err_message = "Unable to retrieve object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except MemoryError as e:
        err_message = "Insufficient memory to work on object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except ClientError as e:
        err_message = "ClientError: {}".format(str(e))
        if e.operation_name == "PutObjectAcl":
            err_message += ". Redacted object uploaded successfully but unable to restore WRITE ACL"
        if e.operation_name == "ListObjectVersions":
            err_message += ". Could not verify redacted object version integrity"
        handle_error(msg, message_body, err_message)
    except ValueError as e:
        err_message = "Unprocessable message: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except DeleteOldVersionsError as e:
        err_message = "Unable to delete previous versions: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IntegrityCheckFailedError as e:
        err_description, client, bucket, key, version_id = e.args
        err_message = "Object version integrity check failed: {}".format(err_description)
        handle_error(msg, message_body, err_message)
        rollback_object_version(
            client,
            bucket,
            key,
            version_id,
            on_error=lambda err: handle_error(None, "{}", err,
                                              "ObjectRollbackFailed", False),
        )
    except Exception as e:
        err_message = "Unknown error during message processing: {}".format(str(e))
        handle_error(msg, message_body, err_message)
def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
    """
    Sets up an S3 bucket with contents.

    The primary bucket name is "pandas-test". The following datasets are
    loaded:

    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl

    A private bucket "cant_get_it" is also created. The boto3 s3 resource is
    yielded by the fixture.
    """
    import boto3
    import s3fs

    test_s3_files = [
        ("tips#1.csv", tips_file),
        ("tips.csv", tips_file),
        ("tips.csv.gz", tips_file + ".gz"),
        ("tips.csv.bz2", tips_file + ".bz2"),
        ("items.jsonl", jsonl_file),
        ("simple_dataset.feather", feather_file),
    ]

    def add_tips_files(bucket_name):
        for s3_key, file_name in test_s3_files:
            with open(file_name, "rb") as f:
                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)

    bucket = "pandas-test"
    conn = boto3.resource("s3", endpoint_url=s3_base)
    cli = boto3.client("s3", endpoint_url=s3_base)

    try:
        cli.create_bucket(Bucket=bucket)
    except Exception:
        # OK if bucket already exists
        pass
    try:
        cli.create_bucket(Bucket="cant_get_it", ACL="private")
    except Exception:
        # OK if bucket already exists
        pass
    timeout = 2
    while not cli.list_buckets()["Buckets"] and timeout > 0:
        time.sleep(0.1)
        timeout -= 0.1

    add_tips_files(bucket)
    add_tips_files("cant_get_it")
    s3fs.S3FileSystem.clear_instance_cache()
    yield conn

    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})

    try:
        s3.rm(bucket, recursive=True)
    except Exception:
        pass
    try:
        s3.rm("cant_get_it", recursive=True)
    except Exception:
        pass
    timeout = 2
    while cli.list_buckets()["Buckets"] and timeout > 0:
        time.sleep(0.1)
        timeout -= 0.1
def build_matrix(self, as_of_times, label_name, label_type, feature_dictionary,
                 matrix_directory, matrix_metadata, matrix_uuid, matrix_type):
    """ Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
        to be included in the matrix
    :param matrix_directory: the directory in which to store the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_directory: str
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info('popped matrix %s build off the queue', matrix_uuid)
    matrix_filename = os.path.join(matrix_directory, '{}.csv'.format(matrix_uuid))

    # The output directory is either local or in s3
    path_parsed = urlparse(matrix_filename)
    scheme = path_parsed.scheme  # '' or 'file' means a regular file; 's3' means s3

    if scheme in ('', 'file'):
        if not self.replace and os.path.exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    elif scheme == 's3':
        if not self.replace and s3fs.S3FileSystem().exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    else:
        raise ValueError(f"URL scheme not supported: {scheme} (from {matrix_filename})")

    logging.info('Creating matrix %s > %s',
                 matrix_metadata['matrix_id'], matrix_filename)

    # make the entity time table and query the labels and features tables
    logging.info('Making entity date table for matrix %s', matrix_uuid)
    entity_date_table_name = self.make_entity_date_table(
        as_of_times, label_name, label_type, matrix_metadata['state'],
        matrix_type, matrix_uuid, matrix_metadata['label_timespan'])

    logging.info('Extracting feature group data from database into file '
                 'for matrix %s', matrix_uuid)
    features_csv_names = self.write_features_data(as_of_times,
                                                  feature_dictionary,
                                                  entity_date_table_name,
                                                  matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")
    try:
        logging.info('Extracting label data from database into file for '
                     'matrix %s', matrix_uuid)
        labels_csv_name = self.write_labels_data(
            label_name, label_type, entity_date_table_name, matrix_uuid,
            matrix_metadata['label_timespan'])
        features_csv_names.insert(0, labels_csv_name)
        logging.info(f"Label data extracted for matrix {matrix_uuid}")

        # stitch together the csvs
        logging.info('Merging feature files for matrix %s', matrix_uuid)
        output = self.merge_feature_csvs(features_csv_names,
                                         matrix_directory, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")
    finally:
        # clean up files and database before finishing
        for csv_name in features_csv_names:
            self.remove_file(csv_name)
    try:
        # store the matrix
        logging.info('Archiving matrix %s with metta', matrix_uuid)
        metta.archive_matrix(matrix_config=matrix_metadata,
                             df_matrix=output,
                             overwrite=True,
                             directory=self.matrix_directory,
                             format='csv')
        logging.info(f"Matrix {matrix_uuid} archived (using metta)")

        # If completely archived, save its information to the matrices table.
        # At this point, the existence of the matrix has already been tested,
        # so there is no need to delete from the db.
        if matrix_type == 'train':
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(matrix_id=matrix_metadata["matrix_id"],
                        matrix_uuid=matrix_uuid,
                        matrix_type=matrix_type,
                        labeling_window=matrix_metadata["label_timespan"],
                        n_examples=len(output),
                        lookback_duration=lookback,
                        feature_start_time=matrix_metadata["feature_start_time"],
                        matrix_metadata=json.dumps(matrix_metadata,
                                                   sort_keys=True, default=str))
        session = self.sessionmaker()
        session.add(matrix)
        session.commit()
        session.close()
    finally:
        if isinstance(output, str):
            os.remove(output)
def client(self) -> S3FileSystem:
    s3 = s3fs.S3FileSystem(key=self.access_key,
                           secret=self.secret_key,
                           client_kwargs={'region_name': self.aws_region})
    return s3
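# Hedged usage sketch for the client() accessor above; the owning class name
# and its attribute values are hypothetical placeholders.
#
#   conn = S3Config(access_key="AKIA...", secret_key="...",
#                   aws_region="eu-west-1")
#   conn.client().ls("mybucket")   # region-scoped, credentialed listing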
# imports inferred from the snippet's usage
import sys
import pprint
import tempfile
from re import match

import boto3
import s3fs
from rasterio.io import MemoryFile

sys.path.append("../model/robosat_pink/")
from robosat_pink.config import load_config

# original with 5/28
#config_location = '/home/ubuntu/planet-snowcover/experiments/co-train.toml'

# revised with neighboring watershed
#config_location = '/home/ubuntu/planet-snowcover/experiments/co-train-neigh.toml'

config_location = '/home/ubuntu/planet-snowcover/experiments/co-train-veg-colo-validate.toml'

config = load_config(config_location)
p = pprint.PrettyPrinter()

fs = s3fs.S3FileSystem(session=boto3.Session(
    profile_name=config['dataset']['aws_profile']))

imagery_searchpath = config['dataset']['image_bucket'] + '/' + \
    config['dataset']['imagery_directory_regex']
print("Searching for imagery...({})".format(imagery_searchpath))
imagery_candidates = fs.ls(config['dataset']['image_bucket'])
#print("candidates:")
#p.pprint(imagery_candidates)
imagery_locs = [c for c in imagery_candidates if match(imagery_searchpath, c)]
print("result:")
p.pprint(imagery_locs)

#get_ipython().system('export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt')
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector,
             hdfs_driver='libhdfs3', user=None):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs
       before using GCS.
    6. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS. None implies login user.
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError(
            'ERROR! A scheme-less dataset url ({}) is no longer supported. '
            'Please prepend "file://" for local filesystem.'.format(
                self._parsed_dataset_url.scheme))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs
        self._filesystem_factory = lambda: pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':
        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself, so we do it
            # manually. This is not necessary if using libhdfs.

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it
            # references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which
                # doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(
                        namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                        namenodes, user=user)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(
                        self._parsed_dataset_url, user=user)
                    self._filesystem_factory = \
                        lambda url=self._dataset_url, user=user: \
                        connector.hdfs_connect_namenode(urlparse(url), user=user)
            else:
                # Case 3b: No netloc, so let's try to connect to the default
                # namenode. HdfsNamenodeResolver will raise an exception if it
                # fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                    namenodes, user=user)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once the default
                    # namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(
                self._parsed_dataset_url, hdfs_driver, user=user)
            self._filesystem_factory = \
                lambda url=self._dataset_url, user=user: \
                connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)

    elif self._parsed_dataset_url.scheme == 's3':
        # Case 4: S3 support requires s3fs to be installed
        try:
            import s3fs
        except ImportError:
            raise ValueError('Must have s3fs installed in order to use datasets on s3. '
                             'Please install s3fs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form s3://bucket/path')

        fs = s3fs.S3FileSystem()
        self._filesystem = pyarrow.filesystem.S3FSWrapper(fs)
        self._filesystem_factory = lambda: pyarrow.filesystem.S3FSWrapper(
            s3fs.S3FileSystem())

    elif self._parsed_dataset_url.scheme in ['gs', 'gcs']:
        # Case 5: GCS support requires gcsfs to be installed
        try:
            import gcsfs
        except ImportError:
            raise ValueError('Must have gcsfs installed in order to use datasets on GCS. '
                             'Please install gcsfs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form gs://bucket/path '
                             'or gcs://bucket/path')

        fs = gcsfs.GCSFileSystem()
        self._filesystem = GCSFSWrapper(fs)
        self._filesystem_factory = lambda: GCSFSWrapper(gcsfs.GCSFileSystem())

    else:
        # Case 6
        raise ValueError(
            'Unsupported scheme in dataset url {}. Currently, only "file", "hdfs", '
            '"s3", "gs", and "gcs" are supported.'.format(
                self._parsed_dataset_url.scheme))
import os
import tempfile

import pandas as pd
import s3fs
import tensorflow as tf

__all__ = 'from_tfrecords',

s3_fs = s3fs.S3FileSystem()


def from_tfrecords(file_paths, schema=None, compression_type='auto', cast=True):
    file_paths = list(_normalize(file_paths))
    if compression_type == 'auto':
        compression_type = _get_compress_type(file_paths[0])

    dataset = tf.data.TFRecordDataset(file_paths,
                                      compression_type=compression_type)
    if schema:
        features, feature_lists = parse_schema(schema)
    else:
        features, feature_lists = detect_schema(dataset)

    if feature_lists:
        parser = read_sequence_example(features, feature_lists)
def open_s3fs_connection():
    s3 = s3fs.S3FileSystem()
    return s3
def save_data(run_all=False):
    S3FS = s3fs.S3FileSystem()
    s3 = S3()
    db = DynamoConn()
    dates = {}
    datasets = db.get_datasets()
    for dataset in datasets:
        # print(dataset)
        if datasets[dataset]['source'] == 'Plenario':
            today = datetime.datetime.today().date()
            date_list = set([today.strftime('%Y-%m')])
            date_list.add((today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            date_list = sorted(list(set(
                [(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                 for x in range(32)])))
            paths = []
            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}
            else:
                for month in date_list:
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print(paths)
                cnts = datasets[dataset]['cnts']
            # exit(0)
            print(paths)
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)
                columns = datasets[dataset]['columns']
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print(df.columns)
                print(df.head())
                df['dt'] = df[dt].astype(str).str[:7]
                dts = []
                groups = dict(list(df.groupby('dt')))
                print(groups.keys())
                # exit(0)
                for group in groups:
                    print(group)
                    year, month = group.split('-')
                    a = groups[group][['longitude', 'latitude']].to_json(orient='values')
                    cnts[group] = groups[group].count()[0]
                    dts.append(group)
                    filename = '../data/%s/%s-%s/all.json' % (dataset, year, month)
                    if not os.path.exists(os.path.dirname(filename)):
                        try:
                            os.makedirs(os.path.dirname(filename))
                        except OSError as exc:  # guard against race condition
                            if exc.errno != errno.EEXIST:
                                raise
                    with open(filename, 'w') as f:
                        f.write(a)

                    # write to s3
                    s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                              (dataset, year, month),
                                        dataset=dataset,
                                        dt="%s-%s" % (year, month),
                                        filename='all.json')
            db.update_col(dataset=dataset, col='cnts', update=json.dumps(cnts))
import pickle

import pandas as pd
import s3fs
from kafka import KafkaProducer

brokerlist = ('ec2-54-186-208-110.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-11-172-126.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-88-204-111.us-west-2.compute.amazonaws.com:9092,'
              'ec2-52-35-101-204.us-west-2.compute.amazonaws.com:9092')
producer = KafkaProducer(bootstrap_servers=brokerlist)

#
# Read the file in, iterate over events and publish
#

# 1. Get the GDELT field names from a helper file
colnames = pd.read_excel('CSV.header.fieldids.xlsx',
                         sheet_name='Sheet1',
                         index_col='Column ID',
                         usecols=1)['Field Name']

# 2. Read the events into a dataframe (pandas uses s3fs under the hood)
fs = s3fs.S3FileSystem(anon=False)
df_events = pd.read_csv('s3://gdelt-open-data/events/20180730.export.csv',
                        sep='\t',
                        low_memory=False,
                        header=None,
                        dtype=str,
                        names=colnames,
                        index_col=['GLOBALEVENTID'])

cnt = 0
for index, row in df_events.iterrows():
    topic = str(row["Actor1Geo_CountryCode"])
    if topic == 'US':
        sendmsg = pickle.dumps(row)
        producer.send(topic, sendmsg)
import sys, os, json

sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend')
sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend/aws')

from s3 import S3
from dynamo import DynamoConn

import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import s3fs
import decimal
from time import time
import requests as r
import geopandas as gpd
from shapely.geometry import mapping, shape
from config import cook_tracts, chicago_tracts, msa_tracts

arrow_s3fs = s3fs.S3FileSystem()
s3 = S3()
d = DynamoConn()

boundaries = {
    # 'chicago-zillow-opposite': None,
    'chicago': chicago_tracts,
}

stats = {
    'S000': 'total_jobs',
    # 'SA01': 'age_group_1',
    # 'SA02': 'age_group_2',
    # 'SA03': 'age_group_3',
    # 'SE01': 'salary_group_1',
    # 'SE02': 'salary_group_2',