import json
from io import StringIO

import pandas as pd
from fastparquet import ParquetFile
from fastparquet import write as pwrite  # aliased to match the call sites below


def write_file(content, bucket, path, s3, path_with_filename, extension):
    if extension == "csv":
        # Parse the CSV payload and persist it as GZIP-compressed Parquet.
        df = pd.read_csv(StringIO(content), sep=",", encoding="utf-8")
        target = "{}/{}.{}".format(bucket, path, "parquet")
        print(path, "--->", target)
        pwrite(target, df, open_with=s3.open, compression='GZIP',
               append=False, has_nulls=True)
    elif extension == "json":
        # Split a GeoJSON feature collection into one geometry file per
        # ISO 3166-1 country code, using a Hive-style partition path.
        obj = json.loads(content)
        parts = path.split("/")
        filename = parts[-1].split(".")[0]
        for feature in obj['features']:
            # if feature['name'] == "United States of America" or feature['name'] == "Canada":
            if 'ISO3166-1' in feature['properties']:
                iso_code = feature['properties']['ISO3166-1']
                geometry = {
                    "type": feature["geometry"]["type"],
                    "coordinates": feature["geometry"]["coordinates"],
                }
                data = json.dumps(geometry, separators=(',', ':'))
                target = "{}/{}/p_iso3166={}/{}.json".format(
                    bucket, parts[0], iso_code, filename)
                print(path, "--->", target)
                with s3.open(target, 'wb') as f:
                    f.write(data.encode('utf-8'))  # encode: file is opened in binary mode
    elif extension == "gz":
        # Gzipped payloads are written through unchanged under a .json name.
        with s3.open("{}/{}.json".format(bucket, path_with_filename), 'wb') as f:
            f.write(content)
    else:
        # Anything else is copied to the target path as-is.
        target = "{}/{}".format(bucket, path)
        print(path, "--->", target)
        with s3.open(target, 'wb') as f:
            f.write(content)
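# A minimal usage sketch for write_file (illustrative only; the bucket name,
# paths, and CSV payload below are hypothetical, and `s3` is assumed to be an
# s3fs.S3FileSystem):
#
#   import s3fs
#   s3 = s3fs.S3FileSystem()
#   write_file("a,b\n1,2\n", "my-bucket", "raw/example", s3,
#              "raw/example.csv", "csv")
#   # writes my-bucket/raw/example.parquet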
def append(bucket, key1, key2, s3, output_filename):
    s3_open = s3.open
    # Read both Parquet inputs into DataFrames.
    path1 = '{}{}'.format(bucket, key1)
    pf1 = ParquetFile(path1, open_with=s3_open)
    df1 = pf1.to_pandas()
    path2 = '{}{}'.format(bucket, key2)
    pf2 = ParquetFile(path2, open_with=s3_open)
    df2 = pf2.to_pandas()
    # Concatenate the two frames (DataFrame.append is deprecated in pandas >= 1.4).
    data = pd.concat([df1, df2])
    pwrite('{}{}'.format(bucket, output_filename), data, open_with=s3_open,
           compression='GZIP', append=False, has_nulls=True)
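# Usage sketch for append (hypothetical keys; assumes the same fastparquet/s3fs
# setup as above). It reads two Parquet datasets and writes their concatenation
# as a single GZIP-compressed output file:
#
#   append("my-bucket/", "land/2020.parquet", "land/2021.parquet",
#          s3, "merged/2020_2021.parquet")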
def write(bucket, key, data, sep, object_encoding, append=False):
    # `error`, `util`, `KeyParts`, and `s3fsmap` are project-local helpers.
    if data.empty:
        raise RuntimeError(
            "[{}] An attempt to write an empty dataset has occurred. "
            "The requested dataset was: {}".format(
                error.Error.empty_dataframe(), data))
    # The sensitivity level encoded in the key selects which S3 filesystem to use.
    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open
    # Drop duplicate rows before writing and report how many were removed.
    size_before_dup_drop = len(data)
    data.drop_duplicates(inplace=True)
    dropped = size_before_dup_drop - len(data)
    if dropped > 0:
        print("{} duplicates have been dropped".format(dropped))
    util.debug_print("Using object encoding {}".format(object_encoding))
    path = '{}{}'.format(bucket, key)
    pwrite(path, data, open_with=s3_open, compression='GZIP', append=append,
           has_nulls=True, object_encoding=object_encoding)
    return path
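# Usage sketch for write (hypothetical bucket and key; assumes `s3fsmap` is
# keyed by the sensitivity level that KeyParts extracts from the key using
# `sep`):
#
#   df = pd.DataFrame({"id": [1, 1, 2], "value": ["a", "a", "b"]})
#   write("my-bucket/", "land/public/example.parquet", df, "/",
#         object_encoding="utf8")  # drops the duplicate row, then writes Parquet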