def test_write_csv_to_s3_no_header(generate_data):
    """write_csv_to_s3 with header=False emits data rows only (no header line)."""
    bucket = 'test-bucket'
    key = 'test.csv'

    conn = boto3.resource('s3', region_name='us-west-2')
    # BUG FIX: buckets outside us-east-1 must declare a LocationConstraint,
    # otherwise boto3 raises IllegalLocationConstraintException. This matches
    # the bucket setup already used by test_write_csv_to_s3.
    conn.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    )

    utils.write_csv_to_s3(generate_data(), bucket, key, header=False)

    body = conn.Object(bucket, key).get()['Body'].read().decode('utf-8')
    # without a header, a single generated row yields exactly one line
    assert len(body.rstrip().split('\n')) == 1
def test_write_csv_to_s3_no_header(generate_data):
    """write_csv_to_s3 with header=False emits data rows only (no header line)."""
    bucket = "test-bucket"
    key = "test.csv"

    conn = boto3.resource("s3", region_name="us-west-2")
    # BUG FIX: buckets outside us-east-1 must declare a LocationConstraint,
    # otherwise boto3 raises IllegalLocationConstraintException. This matches
    # the bucket setup already used by test_write_csv_to_s3.
    conn.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
    )

    utils.write_csv_to_s3(generate_data(), bucket, key, header=False)

    body = conn.Object(bucket, key).get()["Body"].read().decode("utf-8")
    # without a header, a single generated row yields exactly one line
    assert len(body.rstrip().split("\n")) == 1
def save(dataframe, bucket, prefix, mode, version, start_ds):
    """Write dataframe to an s3 location and generate a manifest

    :dataframe DataFrame: rollup data
    :bucket str: s3 bucket
    :prefix str: s3 prefix
    :mode str: either `daily` or `monthly`
    :version int: version of the rollup
    :start_ds str: yyyymmdd
    """
    # format the save location of the data
    start_date = arrow.get(start_ds, "YYYYMMDD")

    # select the relevant fields
    select_expr = [
        F.lit(start_date.format("YYYY-MM-DD")),
        "search_provider",
        "search_count",
        "country",
        "locale",
        "distribution_id",
        "default_provider",
        "profile_count",
        "profile_share",  # only for daily
        # BUG FIX: arrow dropped plural relative offsets on replace();
        # shift() is the supported API for date arithmetic.
        F.lit(start_date.shift(days=+1).format("YYYY-MM-DD")),
    ]

    # replace mode specific items, like rollup_date
    if mode == "monthly":
        select_expr[0] = F.lit(start_date.format("YYYY-MM"))
        # NOTE: beware of calling remove when there are Column elements in the
        # array because boolean operations are overloaded for dataframes.
        # BUG FIX: map() returns an iterator in Python 3 and has no .index();
        # materialize to a list before searching for the column name.
        shares_index = list(map(str, select_expr)).index("profile_share")
        del select_expr[shares_index]

    key = ("{}/{}/processed-{}.csv"
           .format(prefix, mode, start_date.format("YYYY-MM-DD")))

    # persist the dataframe to disk
    logging.info("Writing dataframe to {}/{}".format(bucket, key))
    utils.write_csv_to_s3(dataframe.select(select_expr), bucket, key, header=False)
    csv_paths = get_csv_locations(bucket, key)

    # write the manifest to disk
    write_manifest(bucket, prefix, mode, version, start_ds, csv_paths)
def test_write_csv_to_s3_existing(generate_data):
    """A second write to the same key overwrites the previous object."""
    bucket = 'test-bucket'
    key = 'test.csv'

    conn = boto3.resource('s3', region_name='us-west-2')
    # BUG FIX: buckets outside us-east-1 must declare a LocationConstraint,
    # otherwise boto3 raises IllegalLocationConstraintException. This matches
    # the bucket setup already used by test_write_csv_to_s3.
    conn.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    )

    utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)
    utils.write_csv_to_s3(generate_data(["foo", "bar"]), bucket, key)

    body = conn.Object(bucket, key).get()['Body'].read().decode('utf-8')
    # header + 2x row = 3
    assert len(body.rstrip().split('\n')) == 3
def test_write_csv_to_s3_existing(generate_data):
    """A second write to the same key overwrites the previous object."""
    bucket = "test-bucket"
    key = "test.csv"

    conn = boto3.resource("s3", region_name="us-west-2")
    # BUG FIX: buckets outside us-east-1 must declare a LocationConstraint,
    # otherwise boto3 raises IllegalLocationConstraintException. This matches
    # the bucket setup already used by test_write_csv_to_s3.
    conn.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
    )

    utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)
    utils.write_csv_to_s3(generate_data(["foo", "bar"]), bucket, key)

    body = conn.Object(bucket, key).get()["Body"].read().decode("utf-8")
    # header + 2x row = 3
    assert len(body.rstrip().split("\n")) == 3
def test_write_csv_to_s3(generate_data):
    """A fresh write produces a header line plus one data row."""
    bucket = "test-bucket"
    key = "test.csv"

    s3 = boto3.resource("s3", region_name="us-west-2")
    # non-default region requires an explicit location constraint
    s3.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
    )

    utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)

    raw = s3.Object(bucket, key).get()["Body"].read()
    lines = raw.decode("utf-8").rstrip().split("\n")
    # header + 1x row = 2
    assert len(lines) == 2
def write_dashboard_data(df, bucket, prefix, mode):
    """Write the dashboard data to a s3 location.

    The object is stored under the fixed topline naming scheme,
    e.g. ``<prefix>/topline-<mode>.csv``.
    """
    output_key = "{}/topline-{}.csv".format(prefix, mode)
    utils.write_csv_to_s3(df, bucket, output_key)