Exemplo n.º 1
0
def doSelect(bucketName, objectName, hostName, selectExpression, quiet):
    """Run an S3 Select query on a CSV object and record timing metrics.

    The query is sent with ``select_object_content``; the object is read as
    uncompressed CSV whose first line is used as the header
    (``FileHeaderInfo: USE``) and results are requested as CSV.

    Args:
        bucketName: Bucket holding the object (passed as ``Bucket``).
        objectName: Key of the object to query (passed as ``Key``).
        hostName: Host alias handed to the ``mc`` lookup helpers together
            with the module-level ``hostDict`` to resolve the endpoint URL,
            access key and secret key.
        selectExpression: SQL expression to evaluate (passed as ``Expression``).
        quiet: When truthy, record payloads and stats are not printed; a
            single "DONE" line is printed instead. The flag is also
            forwarded to ``printElapsedTime``.

    Returns:
        None. As a side effect one dict describing the run is appended to
        the module-level ``metrics`` list. Its ``bytesScanned`` and
        ``bytesProcessed`` entries are ``None`` when the response stream
        contained no ``Stats`` event.
    """
    startTime = datetime.datetime.now()

    # Set up the client; the mc helper functions look up the URL, access key
    # and secret key for hostName.
    s3 = boto3.client('s3',
                      endpoint_url=mc.getURL(hostName, hostDict),
                      aws_access_key_id=mc.getAccessKey(hostName, hostDict),
                      aws_secret_access_key=mc.getSecretKey(hostName, hostDict),
                      region_name='us-east-1')

    # Make the select_object_content call; the response carries an event stream.
    # TODO: assumes dataset is CSV and is not compressed... should relax this
    eventStream = s3.select_object_content(
        Bucket=bucketName,
        Key=objectName,
        ExpressionType='SQL',
        Expression=selectExpression,
        InputSerialization={
            'CSV': {
                "FileHeaderInfo": "USE",
            },
            'CompressionType': 'NONE',
        },
        OutputSerialization={'CSV': {}},
    )

    # Pre-initialise the stats so the metrics entry below can always be built,
    # even if the stream ends without delivering a 'Stats' event.
    bs = None
    bp = None

    # Iterate through the response events.
    for event in eventStream['Payload']:
        if 'Records' in event:
            # NOTE(review): each payload chunk is decoded on its own; if a
            # chunk can end in the middle of a multi-byte UTF-8 sequence this
            # will raise UnicodeDecodeError - confirm against the service docs.
            record = event['Records']['Payload'].decode('utf-8')
            if not quiet:
                print(record)
        elif 'Stats' in event:
            statsDetails = event['Stats']['Details']
            bs = statsDetails['BytesScanned']
            bp = statsDetails['BytesProcessed']
            if not quiet:
                print("Stats details bytesScanned: ", bs)
                print("Stats details bytesProcessed: ", bp)

    if quiet:
        print("**DONE - (Output not echoed!)**")

    endTime = datetime.datetime.now()
    # printElapsedTime is defined elsewhere; its return value is used as a
    # timedelta below.
    elapsedTime = printElapsedTime(startTime, endTime, quiet)
    elapsedTimeSecs = elapsedTime.total_seconds()

    metrics.append({"expression": selectExpression,
                    'host': hostName,
                    'bucket': bucketName,
                    'object': objectName,
                    'elapsedTimeDays': elapsedTime,
                    'elapsedTimeSecs': elapsedTimeSecs,
                    'bytesScanned': bs,
                    'bytesProcessed': bp
                    })
Exemplo n.º 2
0
def getColumntHeaders(bucketName, objectName, hostName, delim=","):
    """Return the column names found on the first line of an object.

    Only the first line of the object body is read. It is decoded as UTF-8,
    its trailing line terminator is removed, and the remainder is split on
    ``delim`` with ``str.split`` (so quoted fields that contain the
    delimiter are not treated specially).

    Args:
        bucketName: Bucket holding the object.
        objectName: Key of the object whose header line is read.
        hostName: Host alias handed to the ``mc`` lookup helpers together
            with the module-level ``hostDict`` to resolve the endpoint URL,
            access key and secret key.
        delim: Separator between column names. Defaults to ``","``.

    Returns:
        list of str: The column names in file order. An empty first line
        yields ``['']``.
    """
    # Set up the resource; the mc helper functions look up the URL, access key
    # and secret key for hostName.
    s3 = boto3.resource('s3',
                        endpoint_url=mc.getURL(hostName, hostDict),
                        aws_access_key_id=mc.getAccessKey(hostName, hostDict),
                        aws_secret_access_key=mc.getSecretKey(hostName, hostDict)
                        )
    # Set up the object handle from bucketName and objectName.
    o = s3.Object(bucketName, objectName)

    body = o.get()['Body']
    try:
        # Read just the first line (bytes).
        # NOTE(review): _raw_stream is a private attribute of the streaming
        # body; kept as in the original - consider a public line reader.
        firstLine = body._raw_stream.readline()
    finally:
        # Only the header is needed, so release the HTTP stream right away
        # instead of leaving it open with the rest of the object unread.
        body.close()

    # readline() keeps the line terminator; strip it so the last column name
    # does not come back as e.g. "col3\n" or "col3\r\n".
    headerLine = firstLine.decode('utf-8').rstrip('\r\n')

    # Return the list of column header names.
    return headerLine.split(delim)
Exemplo n.º 3
0
def doSelectShowPayload(bucketName, objectName, hostName, selectExpression):
    """Run an S3 Select query on a CSV object and print everything returned.

    Every ``Records`` payload is decoded as UTF-8 and printed; when a
    ``Stats`` event arrives, its scanned and processed byte counts are
    printed too. Nothing is returned.

    Args:
        bucketName: Bucket holding the object (passed as ``Bucket``).
        objectName: Key of the object to query (passed as ``Key``).
        hostName: Host alias handed to the ``mc`` lookup helpers together
            with the module-level ``hostDict`` to resolve the endpoint URL,
            access key and secret key.
        selectExpression: SQL expression to evaluate (passed as ``Expression``).
    """
    # Build the client; connection settings come from the mc lookup helpers.
    client = boto3.client(
        's3',
        endpoint_url=mc.getURL(hostName, hostDict),
        aws_access_key_id=mc.getAccessKey(hostName, hostDict),
        aws_secret_access_key=mc.getSecretKey(hostName, hostDict),
        region_name='us-east-1',
    )

    # Describe the query up front, then issue it in one call.
    # TODO: assumes dataset is CSV and is not compressed... should relax this
    request = {
        'Bucket': bucketName,
        'Key': objectName,
        'ExpressionType': 'SQL',
        'Expression': selectExpression,
        'InputSerialization': {
            'CSV': {"FileHeaderInfo": "USE"},
            'CompressionType': 'NONE',
        },
        'OutputSerialization': {'CSV': {}},
    }
    response = client.select_object_content(**request)

    # Walk the event stream: record chunks first, stats otherwise.
    for message in response['Payload']:
        if 'Records' in message:
            print(message['Records']['Payload'].decode('utf-8'))
            continue

        if 'Stats' not in message:
            # Any other event type is ignored.
            continue

        details = message['Stats']['Details']
        scanned = details['BytesScanned']
        processed = details['BytesProcessed']

        print("Stats details bytesScanned: ", scanned)
        print("Stats details bytesProcessed: ", processed)