def insert_dynamo(table_name, dict_data, key_name, force=True):
    aws_session = getSession()
    dynamo = aws_session.client('dynamodb')
    try:
        # Conditional put: fails if an item with this key already exists
        _ = dynamo.put_item(
            TableName=table_name,
            Item=dict_data,
            ReturnConsumedCapacity='TOTAL',
            ConditionExpression='attribute_not_exists({0})'.format(key_name))
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'ConditionalCheckFailedException':
            if force:
                pm.print_info('Forcing to rewrite [{0}:{1}]'.format(
                    key_name, dict_data[key_name]))
                pm.print_dict(dict_data)
                # Retry without the condition to overwrite the existing item
                _ = dynamo.put_item(TableName=table_name,
                                    Item=dict_data,
                                    ReturnConsumedCapacity='TOTAL')
            else:
                pm.print_warning('Key already exists [{0}:{1}]'.format(
                    key_name, dict_data[key_name]))
        else:
            pm.print_error('Unknown DynamoDB problem')
            pm.print_error(str(e), exit_code=1)
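
# Usage sketch for insert_dynamo, assuming a hypothetical table and key; the
# item uses the typed attribute format the low-level boto3 DynamoDB client
# expects.
def _example_insert_dynamo():
    item = {
        'user_id': {'S': 'u-001'},  # 'S' = string attribute
        'score': {'N': '42'}        # 'N' = number attribute (sent as a string)
    }
    # With force=False an existing 'user_id' is reported, not overwritten
    insert_dynamo('example-table', item, 'user_id', force=False)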

def replace(rule, configuration):
    # Find placeholders of the form "{var-name}" in the rule string
    replacement = {}
    for match in re.finditer('{[a-z]+(-*[a-z]*)*}', rule):
        var = match.group(0)
        var_name = var[1:-1]
        if var_name not in configuration:
            pm.print_error('Unknown rule for [{0}]'.format(var))
            pm.print_error('Configuration during error:')
            pm.print_dict(configuration)
            exit(1)
        replacement[var] = configuration[var_name]

    # Replace every placeholder with its configured value
    replaced_rule = copy.deepcopy(rule)
    for old, new in replacement.items():
        replaced_rule = replaced_rule.replace(str(old), str(new))
    return replaced_rule
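
# Usage sketch for replace: placeholders are lowercase, optionally hyphenated
# names in braces, each of which must appear as a key in the configuration
# dict (the rule and configuration below are hypothetical).
def _example_replace():
    rule = 's3://{bucket-name}/{prefix}/data.csv'
    configuration = {'bucket-name': 'my-bucket', 'prefix': 'raw'}
    assert replace(rule, configuration) == 's3://my-bucket/raw/data.csv'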

def read_env_var(name):
    if name in os.environ:
        return os.environ[name]
    pm.print_error('Environment variable [{0}] not found'.format(name),
                   exit_code=1)
    return None

def run_shell_command(command, wait=True):
    pm.print_info('Running [{0}]'.format(command))
    session = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               shell=True)
    if wait:
        # communicate() already waits for the process to finish
        stdout, stderr = session.communicate()
        output_lines = stdout.decode('utf-8').split('\n')
        for ol in output_lines:
            pm.print_info_2(ol, padding=3)
        if session.returncode != 0:
            e = stderr.decode('utf-8')
            pm.print_error('Command failed! [{0}]'.format(command))
            pm.print_info_2('Error: {0}'.format(str(e)))
            pm.print_error('Exit', exit_code=1)
        else:
            # On success, echo anything the command wrote to stderr as info
            for ee in stderr.decode('utf-8').split('\n'):
                pm.print_info(ee, padding=4)
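
# Usage sketch for run_shell_command (the commands are arbitrary examples).
# With wait=True the call blocks, echoes stdout, and exits the process on a
# non-zero return code; with wait=False it only spawns the command.
def _example_run_shell_command():
    run_shell_command('ls -la /tmp')           # blocking, output echoed
    run_shell_command('sleep 10', wait=False)  # fire and forget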

def sns_publish(topic, message):
    sns = sns_resource()
    response = None
    try:
        response = sns.publish(TopicArn=topic, Message=message)
    except ClientError as err:
        pm.print_warning('SNS [{0}] error'.format(topic))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    return response

def connect(self):
    if not self._connected:
        connection_string = "host='{}' port={} dbname='{}' user={} password={}".format(
            self._host, self._port, self._dbname, self._user, self._pwd)
        try:
            self._connection = psycopg2.connect(connection_string)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_separator()
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
            # Return early so a failed connection is not marked as connected
            return
        self._connected = True

def downloads3(file_local_path, s3_bucketname, file_remote_path, verbose=True):
    s3 = s3_resource()
    if not isfiles3(s3_bucketname, file_remote_path):
        return False
    try:
        s3.Bucket(s3_bucketname).download_file(file_remote_path,
                                               file_local_path)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            # The object does not exist.
            if verbose:
                pm.print_error('[AWS][S3] The object does not exist.')
            return False
        # Something else has gone wrong.
        pm.print_error('[AWS][S3] Unknown error')
        pm.print_error_2(str(e))
        pm.print_error('', exit_code=1)
    return True
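
# Usage sketch for downloads3 (bucket and paths are hypothetical); the boolean
# return value signals whether the object was actually fetched.
def _example_downloads3():
    if downloads3('/tmp/data.csv', 'my-bucket', 'exports/data.csv'):
        pm.print_info('Download complete')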

def connect(self):
    if not self._connected:
        # Check that the database exists before trying to connect
        if not athena_exist(self._dbname, self._s3bucket, self._remotepath,
                            verbose=False):
            self._connected = False
            pm.print_error('Athena [{0}] does not exist'.format(self._dbname),
                           exit_code=1)
        # Use explicit credentials when the AWS environment is configured,
        # otherwise fall back to the default credential chain
        connect_kwargs = {'s3_staging_dir': self._output_location,
                          'region_name': os.environ['AWS_REGION']}
        if check_aws_env():
            connect_kwargs['aws_access_key_id'] = os.environ['AWS_ACCESS_KEY_ID']
            connect_kwargs['aws_secret_access_key'] = os.environ['AWS_SECRET_ACCESS_KEY']
        try:
            self._connection = pyathena.connect(**connect_kwargs)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_separator()
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
            # Return early so a failed connection is not marked as connected
            return
        self._connected = True

def get_query(self, query, close=True):
    df = None
    self.connect()
    if self._connected:
        try:
            df = pd.read_sql(query, self._connection)
        except Exception as e:
            pm.print_error('Query problem')
            pm.print_separator()
            pm.print_error(query)
            pm.print_separator()
            pm.print_error(str(e), raise_error=Exception)
    else:
        pm.print_error('Database not connected')
        pm.print_error('Exiting', exit_code=1)
    if close:
        self.disconnect()
    return df

def athena_query(query, athena_database, s3_bucketname, file_remote_path,
                 verbose=True):
    athena = athena_resource()
    output_location = 's3://' + '/'.join([s3_bucketname,
                                          file_remote_path]) + '/'
    query_result = None
    response = None
    try:
        response = athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    try:
        # Poll until the query reaches a terminal state
        while True:
            status = athena.get_query_execution(
                QueryExecutionId=response['QueryExecutionId'])
            current_status = status['QueryExecution']['Status']['State']
            if current_status not in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=True)
            elif current_status == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=False)
                query_result = athena.get_query_results(
                    QueryExecutionId=response['QueryExecutionId'])
                break
            else:
                if verbose:
                    pm.print_error('Query {0}'.format(current_status))
                return None
            # Avoid hammering the API while polling
            # (mirrors reload_partitions_in_table below)
            time.sleep(5)
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    return query_result
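
# Usage sketch for athena_query (database, bucket, and query are
# hypothetical). On success the raw get_query_results response is returned,
# with rows available under ['ResultSet']['Rows'] as the boto3 Athena client
# structures them.
def _example_athena_query():
    result = athena_query('SELECT COUNT(*) FROM events',
                          'analytics_db', 'my-bucket', 'athena-results')
    if result is not None:
        rows = result['ResultSet']['Rows']
        pm.print_info('Returned {0} rows (including header)'.format(len(rows)))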

def get_pymake_var(var):
    pymakefile = pkg_resources.resource_filename('pymake', 'pymakefile.json')
    pymakevars = json2dict(pymakefile)
    # Sanity check: a valid pymakefile always defines 'project-name'
    if 'project-name' not in pymakevars:
        pm.print_error('Unknown var [{0}]'.format('project-name'))
        pm.print_error('Pymakefile during error:')
        pm.print_dict(pymakevars)
        exit(1)
    if var in pymakevars:
        return pymakevars[var]
    pm.print_error('Unknown var [{0}]'.format(var))
    pm.print_error('Pymakefile during error:')
    pm.print_dict(pymakevars)
    exit(1)

def deletes3(s3_bucketname, file_remote_path):
    if isfiles3(s3_bucketname, file_remote_path):
        s3 = s3_resource()
        try:
            s3.Object(s3_bucketname, file_remote_path).delete()
        except botocore.exceptions.ClientError as e:
            # Something else has gone wrong.
            pm.print_error('[AWS][S3] Unknown error')
            pm.print_error(str(e))
            pm.print_error('', exit_code=1)
        return True
    pm.print_warning('File [s3://{0}/{1}] does not exist'.format(
        s3_bucketname, file_remote_path))
    return False

def isfiles3(s3_bucketname, file_remote_path):
    s3 = s3_resource()
    try:
        s3.Object(s3_bucketname, file_remote_path).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            # The object does not exist.
            return False
        # Something else has gone wrong.
        pm.print_error('[AWS][S3] Unknown error')
        pm.print_error(str(e))
        pm.print_error('', exit_code=1)
    return True

def connect(self):
    if not self._connected:
        connection_string = 'DRIVER={ODBC Driver 13 for SQL Server};'
        connection_string += 'SERVER={0};DATABASE={1};UID={2};PWD={3};'.format(
            self._host, self._dbname, self._user, self._pwd)
        try:
            import pyodbc
        except ImportError:
            pm.print_error('Package pyodbc is not installed')
            try:
                import pydockerutils
            except ImportError:
                pm.print_error('Installation recipes are available in the pydockerutils package @')
                pm.print_error(' - [https://github.com/nenetto/pydockerutils]')
                pm.print_error('Exiting', exit_code=1)
            pm.print_warning('Please run the install_pyodb command from pydockerutils in the shell')
            # pyodbc is unavailable, so we cannot fall through to connect()
            pm.print_error('Exiting', exit_code=1)
        try:
            self._connection = pyodbc.connect(connection_string)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
            return
        self._connected = True
        if self._verbose:
            pm.print_info('Connection Success')

def summary_table(df, fixedtable_file_xlsx, schema_file_xlsx,
                  summary_file_xlsx, log_file):
    try:
        from pydqc.infer_schema import infer_schema
        from pydqc.data_summary import data_summary
    except ImportError:
        pm.print_error('To use this function, you need to install package pydqc')
        pm.print_error(' - https://github.com/nenetto/pydqc')
        pm.print_error(' - pip[3] install https://github.com/nenetto/pydqc')
        pm.print_error('', exit_code=1)

    # Fix table
    try:
        df = fixtable(df)
    except Exception as e:
        with open(log_file, 'a') as log:
            log.write('Error fixing table:\n')
            log.write(str(e))
        pm.print_error('Error processing file:')
        pm.print_error(str(e))
        return

    # Save in log the void columns
    nan_columns = df.columns[df.isna().all()].tolist()
    with open(log_file, 'a') as log:
        log.write('Columns with all NaNs:\n')
        for c in nan_columns:
            log.write(' - {0}\n'.format(c))

    # Clean df: drop all-NaN columns and their companion "_type" columns
    nan_columns_type = [c + '_type' for c in nan_columns]
    clean_columns = [c for c in df.columns
                     if c not in nan_columns and c not in nan_columns_type]
    df = df[clean_columns]
    df.to_excel(fixedtable_file_xlsx, index=False)

    dirpath = tempfile.mkdtemp()

    # Infer schema
    pm.print_info('Inferring Schema')
    try:
        infer_schema(df, fname='', output_root=dirpath, sample_size=1.0,
                     type_threshold=0.5, n_jobs=1, base_schema=None)
        shutil.copyfile(os.path.join(dirpath, 'data_schema_.xlsx'),
                        schema_file_xlsx)
        df_schema = get_schema(schema_file_xlsx)
    except Exception as e:
        with open(log_file, 'a') as log:
            log.write('Error Inferring Schema:\n')
            log.write(str(e))
        pm.print_error('Error Inferring Schema')
        pm.print_error(str(e))
        return

    pm.print_info('Schema Detected')
    pm.print_info('Generating Summary')
    data_summary(table_schema=df_schema, table=df, output_root=dirpath,
                 fname='', sample_size=1.0, keep_images=False)
    try:
        shutil.copyfile(os.path.join(dirpath, 'data_summary_.xlsx'),
                        summary_file_xlsx)
    except Exception as e:
        with open(log_file, 'a') as log:
            log.write('Error Generating Summary:\n')
            log.write(str(e))
        pm.print_error('Error Generating Summary')
        pm.print_error(str(e))
        return

    pm.print_info('Summary Generated')
    # Remove temp dir
    shutil.rmtree(dirpath)

def reload_partitions_in_table(athena_database, athena_table, s3_bucketname,
                               file_remote_path, verbose=True):
    if not athena_exist(athena_database, s3_bucketname, file_remote_path,
                        False):
        pm.print_error('Database does not exist', exit_code=1)
    athena = athena_resource()
    output_location = 's3://' + '/'.join([s3_bucketname,
                                          file_remote_path]) + '/'
    response = None
    try:
        response = athena.start_query_execution(
            QueryString='MSCK REPAIR TABLE {0};'.format(athena_table),
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })
    except ClientError as err:
        pm.print_error('Reload partitions failed on table [{0}.{1}]'.format(
            athena_database, athena_table))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    try:
        # Poll until the repair query reaches a terminal state
        while True:
            status = athena.get_query_execution(
                QueryExecutionId=response['QueryExecutionId'])
            current_status = status['QueryExecution']['Status']['State']
            if current_status not in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=True)
            elif current_status == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=False)
                _ = athena.get_query_results(
                    QueryExecutionId=response['QueryExecutionId'])
                break
            else:
                if verbose:
                    pm.print_error('Query {0}'.format(current_status))
                return None
            time.sleep(5)
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    else:
        pm.print_info('Reload partitions succeeded on table [{0}.{1}]'.format(
            athena_database, athena_table))
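
# Usage sketch for reload_partitions_in_table (all names are hypothetical).
# MSCK REPAIR TABLE makes partitions newly written to S3 visible to Athena,
# so this is typically called after a batch upload.
def _example_reload_partitions():
    reload_partitions_in_table('analytics_db', 'events',
                               'my-bucket', 'athena-results')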