def to_redshift(data_frame):
    # date_str = data_base.strftime('%Y/%m/%d')
    print('connect_to_redshift')
    pr.connect_to_redshift(dbname='pricing',
                           host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
                           port='5432',
                           user='******',
                           password='******')
    print('connect_to_s3')
    pr.connect_to_s3(aws_access_key_id='******',
                     aws_secret_access_key='******',
                     bucket='kroton-analytics',
                     subdirectory='raw/uploads/esteira')
    print('pandas_to_redshift')
    # Column types can be passed explicitly, e.g.:
    # column_data_types=['VARCHAR(250)', 'DATE', 'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)',
    #                    'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)',
    #                    'VARCHAR(250)', 'FLOAT', 'FLOAT', 'FLOAT', 'FLOAT']
    # Leaving it out lets pandas_to_redshift infer the types from the DataFrame.
    pr.pandas_to_redshift(data_frame=data_frame,
                          index=False,
                          redshift_table_name='kroton_pricing.tb_dev_bd_ccr',
                          append=True)
    print('end save_to_redshift')
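# A minimal usage sketch for to_redshift(); the DataFrame below is illustrative only
# (the column names are placeholders, not the real tb_dev_bd_ccr layout), and pandas /
# pandas_redshift are assumed to be imported as pd / pr as in the other snippets.
df = pd.DataFrame({'real_mes': [10.5, 20.0], 'meta_ano': [100.0, 200.0]})
to_redshift(df)
pr.close_up_shop()  # close the Redshift/S3 connections opened inside to_redshift()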
def connect_to_s3():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_s3(aws_access_key_id=credentials['aws_access_key_id'],
                     aws_secret_access_key=credentials['aws_secret_access_key'],
                     bucket='jonobate-bucket')
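# connect_to_s3() above reads its keys from a credentials.json file sitting next to the
# script. A minimal sketch of creating that file; the key names must match the lookups
# above, and the values here are placeholders, not real credentials.
import json

credentials = {
    'aws_access_key_id': 'YOUR_ACCESS_KEY_ID',
    'aws_secret_access_key': 'YOUR_SECRET_ACCESS_KEY',
}
with open('credentials.json', 'w') as json_data:
    json.dump(credentials, json_data, indent=2)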
def pandas_df_to_redshist(bucket_name, s3_file_name):
    # Load and preprocess the S3 object into a DataFrame (helper defined elsewhere),
    # then push the result back out through pandas_redshift.
    # KEY_ID, KEY_SECRET and table_name are expected to be defined at module level.
    df = s3_to_pandas_with_processing(bucket_name, s3_file_name)
    # Write the DataFrame to S3 and then to redshift
    pr.connect_to_s3(aws_access_key_id=KEY_ID,
                     aws_secret_access_key=KEY_SECRET,
                     bucket=bucket_name,
                     subdirectory=s3_file_name)
    pr.pandas_to_redshift(data_frame=df, redshift_table_name=table_name)
def connect_aws(self):
    """ Connect to AWS using pandas_redshift """
    pr.connect_to_s3(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                     aws_secret_access_key=s3_credentials_AWS_SECRET_KEY,
                     bucket=s3_credentials_BUCKET,
                     subdirectory=BUCKET_FOLDER)
    pr.connect_to_redshift(dbname=redshift_credentials_dbname,
                           host=redshift_credentials_host,
                           port=redshift_credentials_port,
                           user=redshift_credentials_user,
                           password=redshift_credentials_password)
def dbconnect(df):
    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')
    pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                           user=user, password=password)
    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='TODO')
    pr.pandas_to_redshift(data_frame=df, redshift_table_name='weather_data', append=True)
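# dbconnect() above pulls every connection detail from environment variables, so they must
# be set (in the shell or as below) before it is called. A hedged sketch with placeholder
# values; note that dbconnect() still has bucket='TODO', so point that at a real S3 bucket
# before running this for real.
import os
import pandas as pd

os.environ['REDSHIFT_DB'] = 'analytics'
os.environ['REDSHIFT_HOST'] = 'example-cluster.redshift.amazonaws.com'
os.environ['REDSHIFT_PORT'] = '5439'
os.environ['REDSHIFT_USER'] = 'loader'
os.environ['REDSHIFT_PASS'] = 'change-me'
os.environ['ACCESS_KEY_ID'] = 'YOUR_ACCESS_KEY_ID'
os.environ['SECRET_ACCESS_KEY'] = 'YOUR_SECRET_ACCESS_KEY'

dbconnect(pd.DataFrame({'observed_at': ['2020-01-01 12:00'], 'temperature_c': [21.5]}))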
## difference
# for i in tqdm(reversed(range(len(li)))):
#     if i != 0:
#         li[i][9] = li[i][9] - li[i-1][9]
#         li[i][10] = li[i][10] - li[i-1][10]

df3 = pd.concat(li)  # concat li; df3 is the final dataframe
df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
               'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
               'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari']

dbname = os.getenv('REDSHIFT_DB')
host = os.getenv('REDSHIFT_HOST')
port = os.getenv('REDSHIFT_PORT')
user = os.getenv('REDSHIFT_USER')
password = os.getenv('REDSHIFT_PASS')

pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                       user=user, password=password)
pr.connect_to_s3(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                 aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                 bucket='nichiji-tmp')
pr.pandas_to_redshift(data_frame=df3, redshift_table_name='jisseki_nichiji', append=True)
import pandas as pd
import pandas_redshift as pr

# Connect to redshift
# This code calculates the number of trips for each route, processing one file at a time,
# and also finds the most used routes. The Citi Bike data is public.
pr.connect_to_redshift(dbname='production',
                       host='localhost',
                       port='5439',
                       user='******',
                       password='******')

# Connect to S3
pr.connect_to_s3(aws_access_key_id='sadadasfaftew',
                 aws_secret_access_key='ewwet4tsdvsrvrvrervwef',
                 bucket='data-science',
                 subdirectory='shwangdir')

# Upload a copy to S3 and redshift
# for i in range(5):
#     url = 'https://s3.amazonaws.com/tripdata/20180{}-citibike-tripdata.csv.zip'.format(i+1)
df = pd.read_csv('/Users/ankitkumar/Downloads/201801-citibike-tripdata.csv')
print(df)
pr.pandas_to_redshift(data_frame=df, redshift_table_name='analytics.trip_fact')

# Count trips per (start, end) station pair, most used routes first
dfroutes = (df.groupby(['start station id', 'end station id'])
              .size()
              .sort_values(ascending=False)
              .reset_index(name='count'))
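# The comment above also mentions the most used routes; dfroutes holds those counts but
# nothing is done with them yet. A hedged follow-up sketch: print the top ten routes and
# write the counts to Redshift as well (analytics.route_counts is a placeholder name).
print(dfroutes.head(10))
pr.pandas_to_redshift(data_frame=dfroutes, redshift_table_name='analytics.route_counts')
pr.close_up_shop()  # close the S3 and Redshift connections when done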
    sku_cd     | character varying(256) |           |          |
    dpt_cd     | character varying(256) |           |          |
    line_cd    | character varying(256) |           |          |
    class_cd   | character varying(256) |           |          |
    sku_name   | character varying(256) |           |          |
    urisu      | numeric(10,2)          |           |          |
    urikin     | numeric(10,2)          |           |          |
    gsagsu1    | numeric(10,2)          |           |          |
    gsaggk1    | numeric(10,2)          |           |          |
    gsagsu2    | numeric(10,2)          |           |          |
    gsaggk2    | numeric(10,2)          |           |          |
    gsagsu3    | numeric(10,2)          |           |          |
    gsaggk3    | numeric(10,2)          |           |          |
    garari     | numeric(10,2)          |           |          |
    """
pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                       host=os.getenv('REDSHIFT_HOST'),
                       port=os.getenv('REDSHIFT_PORT'),
                       user=os.getenv('REDSHIFT_USER'),
                       password=os.getenv('REDSHIFT_PASS'))
pr.connect_to_s3(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                 aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
                 bucket='jisseki-nichiji')
data = {"date_ymdh": ["2018063012"], "ten_cd": ["0001"]}
f = pd.DataFrame(data)
pr.pandas_to_redshift(f, 'jisseki_nichiji')
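# The test frame above only fills date_ymdh and ten_cd. A sketch of a single row that
# matches every column in the table layout quoted in the docstring; the values are made
# up, and append=True keeps the existing jisseki_nichiji data in place.
row = pd.DataFrame({
    'date_ymdh': ['2018063012'], 'ten_cd': ['0001'], 'sku_cd': ['12345'],
    'dpt_cd': ['01'], 'line_cd': ['02'], 'class_cd': ['03'], 'sku_name': ['sample item'],
    'urisu': [1.0], 'urikin': [100.0], 'gsagsu1': [0.0], 'gsaggk1': [0.0],
    'gsagsu2': [0.0], 'gsaggk2': [0.0], 'gsagsu3': [0.0], 'gsaggk3': [0.0],
    'garari': [100.0],
})
pr.pandas_to_redshift(row, 'jisseki_nichiji', append=True)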
print('5. Export to csv buffer')
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

print('6. Upload to S3')
session = Session(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                  aws_secret_access_key=s3_credentials_AWS_SECRET_KEY)
FILE_NAME = BUCKET_FOLDER + "/" + REDSHIFT_TABLE_NAME + "_" + DATE_NOW + ".csv"
s3 = session.resource('s3')
bucket = s3.Bucket(s3_credentials_BUCKET)
s3.Object(s3_credentials_BUCKET, FILE_NAME).put(Body=csv_buffer.getvalue())

print('3. Connect to S3')
pr.connect_to_s3(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                 aws_secret_access_key=s3_credentials_AWS_SECRET_KEY,
                 bucket=s3_credentials_BUCKET,
                 subdirectory=BUCKET_FOLDER)

print('4. Connect to Redshift')
pr.connect_to_redshift(dbname=redshift_credentials_dbname,
                       host=redshift_credentials_host,
                       port=redshift_credentials_port,
                       user=redshift_credentials_user,
                       password=redshift_credentials_password)


def delete_from_date(date):
    """ Delete data from table """
    query = "DELETE FROM {table} WHERE date_sent >= '{datef}'".format(
        table=REDSHIFT_TABLE_NAME, datef=date)
    print("PRINT SQL STATEMENT: ", query)
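# delete_from_date() above only prints the statement. A minimal sketch of the execution
# step, assuming pr.connect_to_redshift() has already been called and that the installed
# pandas_redshift version exposes the exec_commit() helper; delete_from_date_and_commit
# is a hypothetical name.
def delete_from_date_and_commit(date):
    """ Build the same DELETE as delete_from_date and actually run it """
    query = "DELETE FROM {table} WHERE date_sent >= '{datef}'".format(
        table=REDSHIFT_TABLE_NAME, datef=date)
    print("PRINT SQL STATEMENT: ", query)
    pr.exec_commit(query)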
def cleansing_format_data(date):
    colss_li = [i for i in range(0, 211)]
    del colss_li[11:21]
    colss_li.remove(2)
    colss_li.remove(7)
    response = s3client.list_objects(Bucket='ld-rawdata',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')
    if 'Contents' in response:
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # data for 23:00 (the last file of the day)
        bucket_name = 'ld-rawdata'
        file_name = key
        day = file_name[37:45]  # day string
        reader = pd.read_csv('s3n://' + bucket_name + '/' + file_name,
                             encoding="cp932", header=None, iterator=True,
                             chunksize=1000, usecols=colss_li)
        df = pd.concat((r for r in reader), ignore_index=True)
        li = []
        df = df[df[0].isin([day])]
        hour = 7
        base = df.loc[:, 0:10]
        # Make hours list
        for i in range(19):
            if hour < 24:
                base.loc[:, 0] = pd.datetime(int(day[0:4]), int(day[4:6]),
                                             int(day[6:8]), hour)
            elif hour > 23:
                base.loc[:, 0] = pd.datetime(int(day[0:4]), int(day[4:6]),
                                             int(day[6:8]), hour - 24) + timedelta(days=1)
            hour += 1
            li.append(pd.concat([base, df.loc[:, 21 + i * 10:30 + i * 10]], axis=1))
        # set columns
        for i in range(len(li)):
            li[i].columns = [j for j in range(19)]
        df3 = pd.concat(li)  # concat li; df3 is the final dataframe
        df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
        df3.columns = [
            'date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
            'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
            'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari'
        ]
        dbname = os.getenv('REDSHIFT_DB')
        host = os.getenv('REDSHIFT_HOST')
        port = os.getenv('REDSHIFT_PORT')
        user = os.getenv('REDSHIFT_USER')
        password = os.getenv('REDSHIFT_PASS')
        pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                               user=user, password=password)
        pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                         aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                         bucket='nichiji-tmp')
        pr.pandas_to_redshift(data_frame=df3,
                              redshift_table_name='jisseki_nichiji',
                              append=True)
df_upload = pd.DataFrame({
    'b_col': [1, 2, 3],
    'c_col': [True, False, True],
    'd_col': ['2020-01-01', '2020-02-04', '2020-03-06'],
})

# =============================================================
# Write a pandas DataFrame to redshift. Requires access to an S3 bucket and
# previously running pr.connect_to_redshift.
# If the table currently exists IT WILL BE DROPPED and then the pandas DataFrame
# will be put in its place.
# If you set append = True the table will be appended to (if it exists).
# =============================================================

# Connect to S3
pr.connect_to_s3(
    aws_access_key_id=str_accesskeyid,
    aws_secret_access_key=str_secretaccesskey,
    bucket=str_s3bucket,
    subdirectory=str_s3subdirectory
    # As of release 1.1.1 you are able to specify an aws_session_token (if necessary):
    # aws_session_token = <aws_session_token>
)

# Write the DataFrame to S3 and then to redshift
str_schema_table = '<schema>.<table>'
pr.pandas_to_redshift(data_frame=df_upload, redshift_table_name=str_schema_table)

# Confirm that the table has been uploaded to Redshift by reading it back
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)
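# The read-back itself is not shown above. A minimal sketch of it, using the library's
# redshift_to_pandas() and close_up_shop() helpers and the str_schema_table defined
# earlier (still a placeholder schema.table name).
df_check = pr.redshift_to_pandas('SELECT * FROM {}'.format(str_schema_table))
print(df_check.head())
pr.close_up_shop()  # close the S3 and Redshift connections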
def conectar_s3(self):
    pr.connect_to_s3(aws_access_key_id=self.aws_access_key_id,
                     aws_secret_access_key=self.aws_secret_access_key,
                     bucket=self.bucket)
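# A hypothetical wrapper class, shown only to make clear where the self.* attributes used
# by conectar_s3() come from; the class name, constructor, and usage are assumptions.
import pandas_redshift as pr

class ConexionS3:
    def __init__(self, aws_access_key_id, aws_secret_access_key, bucket):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.bucket = bucket

    def conectar_s3(self):
        pr.connect_to_s3(aws_access_key_id=self.aws_access_key_id,
                         aws_secret_access_key=self.aws_secret_access_key,
                         bucket=self.bucket)

# Usage with placeholder credentials:
# ConexionS3('YOUR_ACCESS_KEY_ID', 'YOUR_SECRET_ACCESS_KEY', 'my-bucket').conectar_s3()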