Example #1
import json
from io import StringIO

import pandas as pd
from fastparquet import write as pwrite


def write_file(content, bucket, path, s3, path_with_filename, extension):
    """Route raw file content into S3, converting it according to its extension."""
    if extension == "csv":
        # CSV payloads are parsed and rewritten as GZIP-compressed Parquet.
        df = pd.read_csv(StringIO(content), sep=",", encoding="utf-8")
        target = "{}/{}.{}".format(bucket, path, "parquet")
        print(path, "--->", target)
        pwrite(target, df, open_with=s3.open, compression='GZIP',
               append=False, has_nulls=True)
    elif extension == "json":
        # GeoJSON payloads are split into one geometry file per country,
        # partitioned by ISO 3166-1 code.
        obj = json.loads(content)
        parts = path.split("/")
        filename = parts[-1].split(".")[0]
        for feature in obj['features']:
            if 'ISO3166-1' in feature['properties']:
                iso_code = feature['properties']['ISO3166-1']
                geometry = {
                    "type": feature["geometry"]["type"],
                    "coordinates": feature["geometry"]["coordinates"],
                }
                data = json.dumps(geometry, separators=(',', ':'))
                target = "{}/{}/p_iso3166={}/{}.json".format(
                    bucket, parts[0], iso_code, filename)
                print(path, "--->", target)
                with s3.open(target, 'wb') as f:
                    f.write(data.encode('utf-8'))
    elif extension == "gz":
        # Already-decompressed gzip content is written back out under a .json key.
        with s3.open("{}/{}.json".format(bucket, path_with_filename), 'wb') as f:
            f.write(content)
    else:
        # Anything else is copied through to S3 unchanged.
        target = "{}/{}".format(bucket, path)
        print(path, "--->", target)
        with s3.open(target, 'wb') as f:
            f.write(content)
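A minimal call sketch for the CSV branch, assuming an s3fs filesystem and a CSV payload read from a local file; the bucket and key names here are hypothetical, not part of the original code:

import s3fs

s3 = s3fs.S3FileSystem()
# Hypothetical payload and keys: route a CSV string into S3 as Parquet.
with open("countries.csv", "r", encoding="utf-8") as f:
    payload = f.read()
write_file(payload, "my-bucket", "raw/countries", s3, "raw/countries", "csv")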
Example #2
import pandas as pd
from fastparquet import ParquetFile, write as pwrite


def append(bucket, key1, key2, s3, output_filename):
    """Concatenate two Parquet files from S3 and write the result back."""
    s3_open = s3.open
    # Read both source files into pandas DataFrames.
    pf1 = ParquetFile('{}{}'.format(bucket, key1), open_with=s3_open)
    df1 = pf1.to_pandas()
    pf2 = ParquetFile('{}{}'.format(bucket, key2), open_with=s3_open)
    df2 = pf2.to_pandas()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent row-wise concatenation.
    data = pd.concat([df1, df2])
    pwrite('{}{}'.format(bucket, output_filename), data, open_with=s3_open,
           compression='GZIP', append=False, has_nulls=True)
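A usage sketch for the merge, assuming the bucket string carries the trailing slash (the function joins bucket and key by plain concatenation); all names are hypothetical:

import s3fs

s3 = s3fs.S3FileSystem()
# Merge two daily extracts into a single Parquet file (illustrative keys).
append("my-bucket/", "data/2020-01-01.parquet", "data/2020-01-02.parquet",
       s3, "data/merged.parquet")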
Example #3
from fastparquet import write as pwrite

# KeyParts, s3fsmap, error and util are project-level helpers assumed to be
# importable from the surrounding package.

def write(bucket, key, data, sep, object_encoding, append=False):
    if data.empty:
        raise RuntimeError(
            "[{}] An attempt to write an empty dataset has occurred. "
            "The requested dataset was: {}".format(
                error.Error.empty_dataframe(), data))
    # The sensitivity level embedded in the key selects the S3 filesystem.
    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open
    # Drop exact duplicate rows and report how many were removed.
    size_before_dup_drop = len(data)
    data.drop_duplicates(inplace=True)
    dropped = size_before_dup_drop - len(data)
    if dropped > 0:
        print("{} duplicates have been dropped".format(dropped))
    util.debug_print("Using object encoding {}".format(object_encoding))
    path = '{}{}'.format(bucket, key)
    pwrite(path, data, open_with=s3_open, compression='GZIP', append=append,
           has_nulls=True, object_encoding=object_encoding)
    return path
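A hedged call sketch, assuming KeyParts extracts the sensitivity level ("standard" here) from the key using the given separator; the key layout and the contents of s3fsmap are assumptions, not taken from the original code:

import pandas as pd

df = pd.DataFrame({"id": [1, 1, 2], "value": ["a", "a", "b"]})
# One duplicate row will be dropped before the Parquet write.
path = write("my-bucket/", "standard/example/data.parquet", df, "/",
             object_encoding="utf8")
print(path)  # my-bucket/standard/example/data.parquet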
Example #4
def write(bucket, key, data, sep, object_encoding):
    if data.empty:
        raise RuntimeError(
            "[{}] An attempt to write an empty dataset has occurred. "
            "The requested dataset was: {}".format(
                error.Error.empty_dataframe(), data))
    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open
    path = '{}{}'.format(bucket, key)
    pwrite(path,
           data,
           open_with=s3_open,
           compression='GZIP',
           append=False,
           has_nulls=True,
           object_encoding=object_encoding)
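Example #4 is a simplified variant of Example #3: no duplicate dropping, no debug logging, and append is fixed to False. A call sketch under the same hypothetical key layout as above:

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
# Unlike Example #3, nothing is returned; the path is not echoed back.
write("my-bucket/", "standard/example/data.parquet", df, "/", "utf8")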