def doSelectShowPayload(bucketName, objectName, hostName, selectExpression):
    """Run an S3 Select query and echo every raw event along with its content.

    Debugging variant: each event from the response stream is printed
    verbatim before its decoded records or its scan statistics, so the
    record output is interleaved with the raw event dumps.

    Args:
        bucketName: Bucket holding the object to query.
        objectName: Key of the object to query.
        hostName: Host name used to look up the endpoint URL, access key
            and secret key through ``mc.getMinioHostInfo()``.
        selectExpression: SQL expression handed to ``select_object_content``.

    Returns:
        None. All results are written to stdout.
    """
    # Connection settings are resolved per host through mc.getMinioHostInfo().
    client = boto3.client(
        's3',
        endpoint_url=mc.getMinioHostInfo().getURL(hostName),
        aws_access_key_id=mc.getMinioHostInfo().getAccessKey(hostName),
        aws_secret_access_key=mc.getMinioHostInfo().getSecretKey(hostName),
        region_name='us-east-1',
    )

    # TODO: the input is hard-wired to uncompressed CSV read with
    # FileHeaderInfo=USE; this should be relaxed.
    inputFormat = {
        'CSV': {
            "FileHeaderInfo": "USE",
        },
        'CompressionType': 'NONE',
    }
    outputFormat = {'CSV': {}}

    # select_object_content returns its results as an event stream.
    response = client.select_object_content(
        Bucket=bucketName,
        Key=objectName,
        ExpressionType='SQL',
        Expression=selectExpression,
        InputSerialization=inputFormat,
        OutputSerialization=outputFormat,
    )

    for event in response['Payload']:
        # Raw dump of the event (debug output; it interleaves with the records).
        print(event)

        if 'Records' in event:
            text = event['Records']['Payload'].decode('utf-8')
            print(text, end="")
            continue

        if 'Stats' not in event:
            continue

        details = event['Stats']['Details']
        scanned = details['BytesScanned']
        processed = details['BytesProcessed']
        print("Stats details bytesScanned: ", scanned)
        print("Stats details bytesProcessed: ", processed)
def getColumntHeaders(bucketName, objectName, hostName, delim=","):
    """Return the column header names found on the first line of an object.

    Only the first line of the object is read; it is decoded as UTF-8,
    stripped of its trailing line terminator and split on ``delim``.

    Args:
        bucketName: Bucket holding the object.
        objectName: Key of the object whose header line is read.
        hostName: Host name used to look up the endpoint URL, access key
            and secret key through ``mc.getMinioHostInfo()``.
        delim: Column delimiter used to split the header line.

    Returns:
        List of column names. An empty first line yields ``['']``.
    """
    endpoint = mc.getMinioHostInfo().getURL(hostName)
    # The flag is only printed; the is_secure argument it was meant for is
    # not passed to boto3.
    secureFlag = ("https://" in endpoint)
    print(secureFlag)

    # Connection settings are resolved per host through mc.getMinioHostInfo().
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is acceptable for the endpoints this is used against.
    s3 = boto3.resource(
        's3',
        endpoint_url=endpoint,
        aws_access_key_id=mc.getMinioHostInfo().getAccessKey(hostName),
        aws_secret_access_key=mc.getMinioHostInfo().getSecretKey(hostName),
        verify=False,
        # is_secure=secureFlag
    )

    # Handle on the object identified by bucketName / objectName.
    o = s3.Object(bucketName, objectName)

    # Read just the first line (bytes), then release the response stream
    # instead of leaving it open with the rest of the object unread.
    body = o.get()['Body']
    try:
        firstLine = body._raw_stream.readline()
    finally:
        body.close()

    # readline() keeps the line terminator; strip it so the last column name
    # does not come back with a trailing "\n" or "\r\n" attached.
    columns = firstLine.decode('utf-8').rstrip('\r\n').split(delim)

    return columns
def iterateThroughTests(whichHosts):
    """Run every test select expression against each dataset on the chosen hosts.

    For each entry of ``TestSelectExpressions`` the expression is printed,
    then every entry of ``TestDatasets`` whose ``'host'`` is in
    ``whichHosts`` is queried through ``doSelect``. A host for which
    ``getAlias`` returns an empty string is reported as an error instead of
    being queried.

    Args:
        whichHosts: Collection of host names to run the tests against.

    Returns:
        None.
    """
    # The module-level `quiet` is passed through to doSelect as-is.
    # (Optionally override it here: quiet = True)
    for expression in TestSelectExpressions:
        printSelectExpression(expression)

        for dataset in TestDatasets:
            host = dataset['host']
            if host not in whichHosts:
                continue

            if mc.getMinioHostInfo().getAlias(host) == "":
                print("ERROR: No host matching '", host, "' found is configured.", sep="")
            else:
                print(">>> Querrying '", host, "'...", sep="")
                doSelect(dataset['bucket'], dataset['object'], host, expression, quiet)
            print()
def showHostInfo(hostName):
    """Print the configuration known for ``hostName``, followed by a blank line.

    When ``getAlias`` returns a non-empty string the status, alias, URL,
    access key, secret key and API values are printed, each looked up
    through ``mc.getMinioHostInfo()``. Otherwise a "no host" message is
    printed.

    Args:
        hostName: Host name to look up.

    Returns:
        None.
    """
    print('Host config information for ', hostName, sep="")

    if mc.getMinioHostInfo().getAlias(hostName) != "":
        # (label, accessor name) pairs, listed in the order they are printed.
        fields = (
            (" status:", "getStatus"),
            (" alias:", "getAlias"),
            (" URL: ", "getURL"),
            (" accessKey:", "getAccessKey"),
            (" secretKey:", "getSecretKey"),
            (" api: ", "getAPI"),
        )
        for label, accessor in fields:
            print(label, getattr(mc.getMinioHostInfo(), accessor)(hostName))
    else:
        print("-- No host matching '", hostName, "' found is configured.", sep="")

    print()
doSelectShowPayload("sjm-airlines", "DelayedFlights.csv", h, s) print() #main if __name__ == "__main__": #supress printing extra information (eg: quiet supressing printing data returned #by the slect query against th data set) quiet = True metrics = list() #create dictionary of information about hosts configured in minio client (mc) config file. #dictionary will cotain all info (url, accessKey, secretKey) etc hostDict = mc.getMinioHostInfo() #True to test individual select calls, False to skip if False: testIndividualSelectCalls() #True to test all the select statemetns against all the hosts, False to skip if False: whichHosts = ['s3', 'play', 'm0', 'z0'] iterateThroughTests(whichHosts) showGraphs = False processMetrics(metrics, quiet, showGraphs) if True: showHarshaPayloadBug()
def doSelect(bucketName, objectName, hostName, selectExpression, quiet):
    """Run an S3 Select query, time it and record the result in ``metrics``.

    The query is issued with ``select_object_content`` against the endpoint
    configured for ``hostName``. Record payloads and scan statistics are
    echoed unless ``quiet`` is true. One dictionary describing the run is
    appended to the module-level ``metrics`` list.

    Args:
        bucketName: Bucket holding the object to query.
        objectName: Key of the object to query.
        hostName: Host name used to look up the endpoint URL, access key
            and secret key through ``mc.getMinioHostInfo()``.
        selectExpression: SQL expression handed to ``select_object_content``.
        quiet: When true, suppress the record and statistics output and
            print a short completion marker instead.

    Returns:
        None. The outcome is appended to ``metrics``; ``bytesScanned`` and
        ``bytesProcessed`` are ``None`` when the stream carried no Stats
        event.
    """
    startTime = datetime.datetime.now()

    endpoint = mc.getMinioHostInfo().getURL(hostName)
    # The flag is only printed; the is_secure argument it was meant for is
    # not passed to boto3.
    secureFlag = ("https://" in endpoint)
    print(secureFlag)

    # Connection settings are resolved per host through mc.getMinioHostInfo().
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is acceptable for the endpoints this is used against.
    s3 = boto3.client(
        's3',
        endpoint_url=endpoint,
        aws_access_key_id=mc.getMinioHostInfo().getAccessKey(hostName),
        aws_secret_access_key=mc.getMinioHostInfo().getSecretKey(hostName),
        # is_secure=secureFlag,
        verify=False,
        region_name='us-east-1',
    )

    # select_object_content returns its results as an event stream.
    # TODO: the input is hard-wired to uncompressed CSV read with
    # FileHeaderInfo=USE; this should be relaxed.
    eventStream = s3.select_object_content(
        Bucket=bucketName,
        Key=objectName,
        ExpressionType='SQL',
        Expression=selectExpression,
        InputSerialization={
            'CSV': {
                "FileHeaderInfo": "USE",
            },
            'CompressionType': 'NONE',
        },
        OutputSerialization={'CSV': {}},
    )

    # Pre-set the statistics so the metrics entry below can still be built
    # when the stream ends without a Stats event; previously that case
    # raised UnboundLocalError.
    bs = None
    bp = None

    for event in eventStream['Payload']:
        # Debugging aid (interleaves with the record output when enabled):
        # print(event)
        if 'Records' in event:
            record = event['Records']['Payload'].decode('utf-8')
            if not quiet:
                print(record, end="")
        elif 'Stats' in event:
            statsDetails = event['Stats']['Details']
            bs = statsDetails['BytesScanned']
            bp = statsDetails['BytesProcessed']
            if not quiet:
                print("Stats details bytesScanned: ", bs)
                print("Stats details bytesProcessed: ", bp)

    if quiet:
        print("**DONE - (Output not echoed!)**")

    endTime = datetime.datetime.now()
    # printElapsedTime's return value is converted with total_seconds(), so
    # it is presumably a datetime.timedelta; verify against its definition.
    elapsedTime = printElapsedTime(startTime, endTime, quiet)
    elapsedTimeSecs = elapsedTime.total_seconds()

    metrics.append({
        "expression": selectExpression,
        'host': hostName,
        'bucket': bucketName,
        'object': objectName,
        'elapsedTimeDays': elapsedTime,
        'elapsedTimeSecs': elapsedTimeSecs,
        'bytesScanned': bs,
        'bytesProcessed': bp,
    })
def printHostInfo(hostName):
    """Print ``hostName`` together with the URL configured for it.

    Args:
        hostName: Host name whose URL is looked up through
            ``mc.getMinioHostInfo()``.

    Returns:
        None.
    """
    url = mc.getMinioHostInfo().getURL(hostName)
    # !s keeps the str() conversion that print() applies to each argument.
    print(f"Host '{hostName!s}' ({url!s})")
doSelectShowPayload( "sjm-airlines", "DelayedFlights.csv", h, s) print() #main if __name__ == "__main__" : #os.environ['SSL_CERT_FILE'] = 'prgx_ca.pem' #supress printing extra information (eg: quiet supressing printing data returned #by the slect query against th data set) quiet = True metrics = list() #initialize the getMinioHostInfo() class mc.getMinioHostInfo() #True to test individual select calls, False to skip if True: testIndividualSelectCalls() #showGraphs = True #processMetrics( metrics, quiet, showGraphs) #True to test all the select statemetns against all the hosts, False to skip if False: whichHosts = [ 'c1' ] iterateThroughTests(whichHosts)