Example #1
import pandas_redshift as pr

def to_redshift(data_frame):
    #date_str =data_base.strftime('%Y/%m/%d')
    print('connect_to_redshift')
    pr.connect_to_redshift(
        dbname='pricing',
        host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
        port='5432',
        user='******',
        password='******')
    print('connect_to_s3')
    pr.connect_to_s3(
        aws_access_key_id="AKIAILQVO2DJQHRLFLQQ",
        aws_secret_access_key="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",
        bucket="kroton-analytics",
        subdirectory="raw/uploads/esteira")
    print('pandas_to_redshift')
    pr.pandas_to_redshift(
        data_frame=data_frame,
        ##columns_data_types = ['real_mes' float64, 'real_ano_anterior' float64, 'acumulado_ano' float64, 'meta_ano' float64],
        ##column_data_types=['VARCHAR(250)','DATE','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','FLOAT','FLOAT','FLOAT','FLOAT'],
        index=False,
        ##column_data_types = None,
        redshift_table_name='kroton_pricing.tb_dev_bd_ccr',
        append=True)
    return print('fim save_to_redshift')
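A minimal usage sketch for the helper above (the sample columns are hypothetical and borrowed from the commented column list); pr.close_up_shop() closes the Redshift and S3 connections once the upload is done:

import pandas as pd

sample_df = pd.DataFrame({'real_mes': [1.0], 'meta_ano': [2.0]})  # hypothetical sample data
to_redshift(sample_df)
pr.close_up_shop()  # close the connections opened by connect_to_redshift / connect_to_s3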
Example #2
def connect_to_s3():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_s3(aws_access_key_id = credentials['aws_access_key_id'],
                aws_secret_access_key = credentials['aws_secret_access_key'],
                bucket = 'jonobate-bucket')
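The layout of credentials.json is not shown in the snippet; a minimal sketch of the file it presumably expects (only the two keys read above):

import json

# Hypothetical contents of credentials.json; replace the placeholder values with real keys.
with open('credentials.json', 'w') as json_file:
    json.dump({'aws_access_key_id': 'YOUR_ACCESS_KEY_ID',
               'aws_secret_access_key': 'YOUR_SECRET_ACCESS_KEY'}, json_file, indent=2)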
def pandas_df_to_redshist(bucket_name, s3_file_name):
    # s3_to_pandas_with_processing is assumed to be a helper defined elsewhere in the module
    df = s3_to_pandas_with_processing(bucket_name, s3_file_name)
    # Write the DataFrame to S3 and then to Redshift
    pr.connect_to_s3(aws_access_key_id=KEY_ID,
                     aws_secret_access_key=KEY_SECRET,
                     bucket=bucket_name,
                     subdirectory=s3_file_name)
    ## to redshift (table_name, KEY_ID and KEY_SECRET are assumed to be module-level constants)
    pr.pandas_to_redshift(data_frame=df, redshift_table_name=table_name)
Example #4
    def connect_aws(self):
        """ Connect to AWS using pandas_redshift """
        pr.connect_to_s3(aws_access_key_id = s3_credentials_AWS_ACCESS_KEY,
                         aws_secret_access_key = s3_credentials_AWS_SECRET_KEY,
                         bucket = s3_credentials_BUCKET,
                         subdirectory = BUCKET_FOLDER)

        pr.connect_to_redshift(dbname    = redshift_credentials_dbname,
                                host     = redshift_credentials_host,
                                port     = redshift_credentials_port,
                                user     = redshift_credentials_user,
                                password = redshift_credentials_password)
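A sketch of how the credential variables referenced above might be populated (assumption: the original module reads them from environment variables; the variable names on the right are illustrative):

import os

s3_credentials_AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY_ID')
s3_credentials_AWS_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
s3_credentials_BUCKET = os.getenv('S3_BUCKET')
BUCKET_FOLDER = os.getenv('S3_BUCKET_FOLDER', 'uploads')
redshift_credentials_dbname = os.getenv('REDSHIFT_DB')
redshift_credentials_host = os.getenv('REDSHIFT_HOST')
redshift_credentials_port = os.getenv('REDSHIFT_PORT')
redshift_credentials_user = os.getenv('REDSHIFT_USER')
redshift_credentials_password = os.getenv('REDSHIFT_PASS')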
Example #5
def dbconnect(df):
    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='TODO')
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name='weather_data',
                          append=True)
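A usage sketch for dbconnect, assuming the REDSHIFT_* variables are already exported in the environment and that the S3 bucket name (left as 'TODO' above) has been filled in:

import pandas as pd

# Hypothetical single-row DataFrame matching the weather_data table
df = pd.DataFrame({'city': ['Tokyo'], 'temperature_c': [21.5]})
dbconnect(df)
pr.close_up_shop()  # close the connections when done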
Example #6
##difference
#for i in tqdm(reversed(range(len(li)))):
#    if i != 0:
#        li[i][9] = li[i][9] - li[i-1][9]
#        li[i][10] = li[i][10] - li[i-1][10]

df3 = pd.concat(li) #concat li. df3 is final dataframe
df3 = df3[[0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17]]
df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd', 'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2', 'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari']

dbname = os.getenv('REDSHIFT_DB')
host = os.getenv('REDSHIFT_HOST')
port = os.getenv('REDSHIFT_PORT')
user = os.getenv('REDSHIFT_USER')
password = os.getenv('REDSHIFT_PASS')

pr.connect_to_redshift(dbname = dbname,
                        host = host,
                        port = port,
                        user = user,
                        password = password)

pr.connect_to_s3(aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY'),
                bucket = 'nichiji-tmp'
                )

pr.pandas_to_redshift(data_frame = df3,
                        redshift_table_name = 'jisseki_nichiji',
                        append = True)
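Once the upload finishes, the connections opened above can be closed with the library's helper:

pr.close_up_shop()  # closes both the Redshift and the S3 connections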
Example #7
import pandas as pd
import pandas_redshift as pr

#connect to redshift
### This code calculates the number of trips for each route, processing one file at a time, and finds the most used routes. Citi Bike trip data is public.
pr.connect_to_redshift(dbname = 'production',
                        host ='localhost',
                        port ='5439',
                        user ='******',
                        password ='******')

# Connect to S3
pr.connect_to_s3(aws_access_key_id = 'sadadasfaftew',
                aws_secret_access_key = 'ewwet4tsdvsrvrvrervwef',
                bucket = 'data-science',
                subdirectory = 'shwangdir'
                )

#upload a copy to S3 and redshift

#for i in range(5):
    #url = 'https://s3.amazonaws.com/tripdata/20180{}-citibike-tripdata.csv.zip'.format(i+1)
df = pd.read_csv('/Users/ankitkumar/Downloads/201801-citibike-tripdata.csv')
print(df)
pr.pandas_to_redshift(data_frame = df,
                      redshift_table_name = 'analytics.trip_fact')

dfroutes = (df.groupby(['start station id', 'end station id']).size()
            .sort_values(ascending=False)
            .reset_index(name='count'))
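The dfroutes aggregate is never written back in the original snippet; a hedged follow-up using the same helper (the table name analytics.route_counts is hypothetical):

pr.pandas_to_redshift(data_frame=dfroutes,
                      redshift_table_name='analytics.route_counts')
pr.close_up_shop()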
Example #8
"""
 sku_cd    | character varying(256) |           |          |
 dpt_cd    | character varying(256) |           |          |
 line_cd   | character varying(256) |           |          |
 class_cd  | character varying(256) |           |          |
 sku_name  | character varying(256) |           |          |
 urisu     | numeric(10,2)          |           |          |
 urikin    | numeric(10,2)          |           |          |
 gsagsu1   | numeric(10,2)          |           |          |
 gsaggk1   | numeric(10,2)          |           |          |
 gsagsu2   | numeric(10,2)          |           |          |
 gsaggk2   | numeric(10,2)          |           |          |
 gsagsu3   | numeric(10,2)          |           |          |
 gsaggk3   | numeric(10,2)          |           |          |
 garari    | numeric(10,2)          |           |          |
"""

pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                       host=os.getenv('REDSHIFT_HOST'),
                       port=os.getenv('REDSHIFT_PORT'),
                       user=os.getenv('REDSHIFT_USER'),
                       password=os.getenv('REDSHIFT_PASS'))

pr.connect_to_s3(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                 aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
                 bucket='jisseki-nichiji')

data = {"date_ymdh": ["2018063012"], "ten_cd": ["0001"]}
f = pd.DataFrame(data)

pr.pandas_to_redshift(f, 'jisseki_nichiji')
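A quick read-back to confirm the row landed, using redshift_to_pandas (which runs a query and returns a DataFrame):

check = pr.redshift_to_pandas('SELECT * FROM jisseki_nichiji LIMIT 10')
print(check)
pr.close_up_shop()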
Example #9
print('5. Export to csv buffer')
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

print('6. Upload to S3')
session = Session(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                  aws_secret_access_key=s3_credentials_AWS_SECRET_KEY)

# BUCKET_FOLDER, REDSHIFT_TABLE_NAME and DATE_NOW are assumed to be module-level constants
FILE_NAME = BUCKET_FOLDER + "/" + REDSHIFT_TABLE_NAME + "_" + DATE_NOW + ".csv"
s3 = session.resource('s3')
bucket = s3.Bucket(s3_credentials_BUCKET)
s3.Object(s3_credentials_BUCKET, FILE_NAME).put(Body=csv_buffer.getvalue())

print('3. Connect to S3')
pr.connect_to_s3(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                 aws_secret_access_key=s3_credentials_AWS_SECRET_KEY,
                 bucket=s3_credentials_BUCKET,
                 subdirectory=BUCKET_FOLDER)

print('4. Connect to Redshift')
pr.connect_to_redshift(dbname=redshift_credentials_dbname,
                       host=redshift_credentials_host,
                       port=redshift_credentials_port,
                       user=redshift_credentials_user,
                       password=redshift_credentials_password)


def delete_from_date(date):
    """ Delete data from table """
    query = "DELETE FROM {table} WHERE date_sent >= '{datef}'".format(
        table=REDSHIFT_TABLE_NAME, datef=date)
    print("PRINT SQL STATEMENT: ", query)
Example #10
def cleansing_format_data(date):
    colss_li = [i for i in range(0, 211)]
    del colss_li[11:21]
    colss_li.remove(2)
    colss_li.remove(7)

    # s3client is assumed to be a boto3 S3 client (boto3.client('s3')) created elsewhere
    response = s3client.list_objects(Bucket='ld-rawdata',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')

    if 'Contents' in response:
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # data for 23:00

    bucket_name = 'ld-rawdata'
    file_name = key
    day = file_name[37:45]  #day string
    reader = pd.read_csv('s3n://' + bucket_name + '/' + file_name,
                         encoding="cp932",
                         header=None,
                         iterator=True,
                         chunksize=1000,
                         usecols=colss_li)
    df = pd.concat((r for r in reader), ignore_index=True)

    li = []
    df = df[df[0].isin([day])]
    hour = 7
    base = df.loc[:, 0:10]

    #Make hours list
    for i in range(19):
        if hour < 24:
            # pd.datetime has been removed from recent pandas; datetime is assumed to be
            # imported as `from datetime import datetime, timedelta`
            base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                      int(day[6:8]), hour)
        elif hour > 23:
            base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                      int(day[6:8]), hour - 24) + timedelta(days=1)

        hour += 1
        li.append(pd.concat([base, df.loc[:, 21 + i * 10:30 + i * 10]],
                            axis=1))

    #set columns
    for i in range(len(li)):
        li[i].columns = [j for j in range(19)]

    df3 = pd.concat(li)  #concat li. df3 is final dataframe
    df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
    df3.columns = [
        'date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
        'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
        'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari'
    ]

    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='nichiji-tmp')

    pr.pandas_to_redshift(data_frame=df3,
                          redshift_table_name='jisseki_nichiji',
                          append=True)
Example #11
df_upload = pd.DataFrame({
    'b_col': [1, 2, 3],
    'c_col': [True, False, True],
    'd_col': ['2020-01-01', '2020-02-04', '2020-03-06'],
})

# =============================================================
# Write a pandas DataFrame to redshift. Requires access to an S3 bucket and previously running pr.connect_to_redshift.
# If the table currently exists IT WILL BE DROPPED and then the pandas DataFrame will be put in its place.
# If you set append = True the table will be appended to (if it exists).
# =============================================================

# Connect to S3
pr.connect_to_s3(
    aws_access_key_id=str_accesskeyid,
    aws_secret_access_key=str_secretaccesskey,
    bucket=str_s3bucket,
    subdirectory=str_s3subdirectory
    # As of release 1.1.1 you are able to specify an aws_session_token (if necessary):
    # aws_session_token = <aws_session_token>
)

# Write the DataFrame to S3 and then to redshift
str_schema_table = '<schema>.<table>'
pr.pandas_to_redshift(data_frame=df_upload,
                      redshift_table_name=str_schema_table)

# confirm that the table has been uploaded to Redshift by reading
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)
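The read-back hinted at in the comment above can be done with redshift_to_pandas; a minimal sketch (remember to replace the <schema>.<table> placeholder first):

df_check = pr.redshift_to_pandas('SELECT * FROM {}'.format(str_schema_table))
print(df_check.head())
pr.close_up_shop()  # close the S3 and Redshift connections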
Example #12
    def conectar_s3(self):
        pr.connect_to_s3(aws_access_key_id=self.aws_access_key_id,
                         aws_secret_access_key=self.aws_secret_access_key,
                         bucket=self.bucket)
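A usage sketch for the method above, wrapped in a small hypothetical class that only holds the credential attributes conectar_s3 reads from self:

import pandas_redshift as pr

class S3Connector:
    """Hypothetical wrapper; only the attributes used by conectar_s3 are defined."""

    def __init__(self, aws_access_key_id, aws_secret_access_key, bucket):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.bucket = bucket

    def conectar_s3(self):
        pr.connect_to_s3(aws_access_key_id=self.aws_access_key_id,
                         aws_secret_access_key=self.aws_secret_access_key,
                         bucket=self.bucket)

connector = S3Connector('YOUR_ACCESS_KEY_ID', 'YOUR_SECRET_ACCESS_KEY', 'my-bucket')
connector.conectar_s3()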