def printHostInfo(hostName):
    """Print a host alias followed by the URL it maps to.

    The URL is obtained from ``mc.getURL`` using ``hostName`` and the
    ``hostDict`` mapping defined outside this function. Output has the
    form ``Host '<hostName>' (<url>)``.
    """
    hostUrl = mc.getURL(hostName, hostDict)
    # !s mirrors print()'s own str() conversion of each argument.
    print(f"Host '{hostName!s}' ({hostUrl!s})")
def doSelect(bucketName, objectName, hostName, selectExpression, quiet):
    """Run an S3 Select query on one object and record timing metrics.

    Args:
        bucketName: Bucket holding the object to query.
        objectName: Key of the object to query.
        hostName: Host alias; passed to the ``mc`` lookups (with
            ``hostDict``) to resolve the endpoint URL and credentials.
        selectExpression: SQL expression passed to ``select_object_content``.
        quiet: When truthy, record payloads and stats are not echoed and a
            single "DONE" marker is printed instead.

    Side effects:
        Prints to stdout and appends one dict to ``metrics`` (defined
        outside this function) with the expression, host, bucket, object,
        elapsed time, and the scanned/processed byte counts. The byte
        counts are ``None`` if the response contained no Stats event.
    """
    startTime = datetime.datetime.now()

    # Set up the client; mc functions look up the URL, access key, and
    # secret key for the given host alias.
    s3 = boto3.client(
        's3',
        endpoint_url=mc.getURL(hostName, hostDict),
        aws_access_key_id=mc.getAccessKey(hostName, hostDict),
        aws_secret_access_key=mc.getSecretKey(hostName, hostDict),
        region_name='us-east-1',
    )

    # Make the select_object_content call; it returns an event stream.
    # TODO: assumes dataset is CSV and is not compressed... should relax this
    eventStream = s3.select_object_content(
        Bucket=bucketName,
        Key=objectName,
        ExpressionType='SQL',
        Expression=selectExpression,
        InputSerialization={
            'CSV': {
                "FileHeaderInfo": "USE",
            },
            'CompressionType': 'NONE',
        },
        OutputSerialization={'CSV': {}},
    )

    # The byte counters are only assigned when a 'Stats' event is seen.
    # Default them so metrics.append() below cannot hit an unbound local
    # when the stream carries no Stats event.
    bs = None
    bp = None

    # Iterate through the response (eventStream).
    for event in eventStream['Payload']:
        if 'Records' in event:
            record = event['Records']['Payload'].decode('utf-8')
            if not quiet:
                print(record)
        elif 'Stats' in event:
            statsDetails = event['Stats']['Details']
            bs = statsDetails['BytesScanned']
            bp = statsDetails['BytesProcessed']
            if not quiet:
                print("Stats details bytesScanned: ", bs)
                print("Stats details bytesProcessed: ", bp)

    if quiet:
        print("**DONE - (Output not echoed!)**")

    endTime = datetime.datetime.now()
    # printElapsedTime's result is fed to timedelta.total_seconds, so it
    # has to be a datetime.timedelta.
    elapsedTime = printElapsedTime(startTime, endTime, quiet)
    elapsedTimeSecs = datetime.timedelta.total_seconds(elapsedTime)

    metrics.append({
        "expression": selectExpression,
        'host': hostName,
        'bucket': bucketName,
        'object': objectName,
        'elapsedTimeDays': elapsedTime,
        'elapsedTimeSecs': elapsedTimeSecs,
        'bytesScanned': bs,
        'bytesProcessed': bp,
    })
def getColumntHeaders(bucketName, objectName, hostName, delim=","):
    """Return the column header names from the first line of an object.

    Args:
        bucketName: Bucket holding the object.
        objectName: Key of the object whose first line is read.
        hostName: Host alias; passed to the ``mc`` lookups (with
            ``hostDict``) to resolve the endpoint URL and credentials.
        delim: Delimiter used to split the header line. Defaults to ``","``.

    Returns:
        A list of strings: the first line decoded as UTF-8, with its
        trailing line terminator removed, split on ``delim``.
    """
    # Set up the resource; mc functions look up the URL, access key, and
    # secret key for the given host alias.
    s3 = boto3.resource(
        's3',
        endpoint_url=mc.getURL(hostName, hostDict),
        aws_access_key_id=mc.getAccessKey(hostName, hostDict),
        aws_secret_access_key=mc.getSecretKey(hostName, hostDict),
    )

    # Set up the object (o) based on bucketName and objectName.
    o = s3.Object(bucketName, objectName)

    body = o.get()['Body']
    try:
        # Read only the first line (a bytes-like object).
        # NOTE(review): _raw_stream is a private attribute of the streaming
        # body; kept as in the original to avoid depending on a newer API.
        firstLine = body._raw_stream.readline()
    finally:
        # Only the first line is consumed; close the stream so the
        # underlying connection is released instead of left dangling.
        body.close()

    # readline() keeps the line terminator; drop it so the last column
    # name does not end in "\n" / "\r\n", then split into column names.
    columns = firstLine.decode('utf-8').rstrip('\r\n').split(delim)

    # Return the list of column header names.
    return columns
def doSelectShowPayload(bucketName, objectName, hostName, selectExpression):
    """Run an S3 Select query on one object and echo everything it returns.

    The endpoint URL and credentials are resolved from ``hostName`` via the
    ``mc`` lookups and ``hostDict``. Each Records payload is decoded as
    UTF-8 and printed; when a Stats event arrives, the scanned and
    processed byte counts are printed.
    """
    # Client for the endpoint and credentials behind the host alias.
    client = boto3.client(
        's3',
        endpoint_url=mc.getURL(hostName, hostDict),
        aws_access_key_id=mc.getAccessKey(hostName, hostDict),
        aws_secret_access_key=mc.getSecretKey(hostName, hostDict),
        region_name='us-east-1',
    )

    # TODO: assumes dataset is CSV and is not compressed... should relax this
    inputFormat = {
        'CSV': {"FileHeaderInfo": "USE"},
        'CompressionType': 'NONE',
    }
    outputFormat = {'CSV': {}}

    # The call returns a response whose 'Payload' is a stream of events.
    response = client.select_object_content(
        Bucket=bucketName,
        Key=objectName,
        ExpressionType='SQL',
        Expression=selectExpression,
        InputSerialization=inputFormat,
        OutputSerialization=outputFormat,
    )

    for message in response['Payload']:
        if 'Records' in message:
            print(message['Records']['Payload'].decode('utf-8'))
            continue
        if 'Stats' not in message:
            # Any other event type is ignored.
            continue
        details = message['Stats']['Details']
        scannedBytes = details['BytesScanned']
        processedBytes = details['BytesProcessed']
        print("Stats details bytesScanned: ", scannedBytes)
        print("Stats details bytesProcessed: ", processedBytes)