def run_job(self, input_dict):
    """Run the EMR job."""
    self.construct_job(input_dict)
    name = '{}_{}_training_{}'.format(self.env, self.ecosystem,
                                      self.current_time)
    bootstrap_uri = 's3://{bucket}/bootstrap.sh'.format(
        bucket=self.bucket_name)
    log_file_name = '{}.log'.format(name)
    log_uri = 's3://{bucket}/{log_file}'.format(
        bucket='{}-automated-analytics-spark-jobs'.format(self.env),
        log_file=log_file_name)
    emr_config_obj = EMRConfig(name=name,
                               s3_bootstrap_uri=bootstrap_uri,
                               training_repo_url=self.training_repo_url,
                               log_uri=log_uri,
                               ecosystem=self.ecosystem,
                               properties=self.properties,
                               hyper_params=self.hyper_params)
    configs = emr_config_obj.get_config()
    status = self.aws_emr.run_flow(configs)
    logger.info("EMR job is running {}".format(status))
    status_code = status.get('ResponseMetadata', {}).get('HTTPStatusCode')
    if status_code != 200:
        logger.error(
            "EMR job failed with the status code {}".format(status_code),
            extra={"status": status})
    return status

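# Usage sketch (hypothetical; `TrainingJob` stands in for whichever class owns
# run_job/construct_job, and the dict values are illustrative). The required
# keys mirror the ones construct_job validates:
#
#     job = TrainingJob()
#     status = job.run_job({
#         'environment': 'dev',
#         'data_version': '2019-01-03',
#         'bucket_name': 'my-training-bucket',
#         'github_repo': 'https://github.com/my-org/my-training-repo',
#         'hyper_params': {'max_depth': 5},
#     })
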
def from_github(self, package, url_df, api_url, api_token):
    """Find the keywords from the GitHub GraphQL API."""
    url_ = self.utility.get_url(url_df, package)
    keywords = list()
    if isinstance(url_, str):
        query_params = self.utility.get_query_params(url_)
        logger.info("Query parameters are: {}, {}".format(
            query_params[0], query_params[1]))
        payload = {
            'query': '{{organization(login: "{0}"){{name url repository(name: "{1}")'
                     '{{name url description repositoryTopics(first: 10)'
                     '{{nodes{{topic {{name}}}}}}}}}}}}'.format(
                         str(query_params[0]), str(query_params[1]))
        }
        headers = {'Authorization': 'token %s' % api_token}
        try:
            response = requests.post(url=api_url, json=payload, headers=headers)
            keywords = list(self.clean_response(response.json()))
        except Exception:
            logger.error(
                "Either the GitHub token is not present or no response came back.")
    return keywords

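# For reference, with query_params == ('npm', 'left-pad') the rendered GraphQL
# query body would look like (values illustrative):
#
#     {organization(login: "npm"){name url repository(name: "left-pad")
#       {name url description repositoryTopics(first: 10){nodes{topic {name}}}}}}
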
def __init__(self, query_job_config=None, credential_path=None):
    """Initialize the BigqueryBuilder object."""
    self.original_credential_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS') \
        or credential_path
    try:
        json.loads(self.original_credential_path)
        json_credentials = True
    except Exception as e:
        logger.error("Not JSON credentials, reverting to local env JSON file: {}"
                     .format(e))
        json_credentials = False

    if json_credentials:
        # The credentials arrived as a JSON string; persist them to a temp
        # file so the bigquery client can pick them up via the env variable.
        tfile = tempfile.NamedTemporaryFile(mode='w+', delete=False)
        tfile.write(self.original_credential_path)
        tfile.flush()
        tfile.seek(0)
        self.new_credential_path = tfile.name
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.new_credential_path
    else:
        self.new_credential_path = self.original_credential_path

    if isinstance(query_job_config, bigquery.job.QueryJobConfig):
        self.query_job_config = query_job_config
    else:
        self.query_job_config = bigquery.job.QueryJobConfig()

    self.client = None
    if self.new_credential_path:
        self.client = bigquery.Client(
            default_query_job_config=self.query_job_config)
    else:
        raise ValueError("Please provide a valid credential_path")

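# Usage sketch (hypothetical values): the constructor accepts either a path to
# a service-account JSON file or the raw JSON string itself, in which case the
# string is spilled to a temp file first:
#
#     builder = BigqueryBuilder(credential_path='/path/to/credentials.json')
#     # or, with inline JSON credentials:
#     builder = BigqueryBuilder(credential_path='{"type": "service_account", ...}')
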
def list_bucket_objects(self):
    """List all the objects in bucket."""
    try:
        return self._s3.Bucket(self.bucket_name).objects.all()
    except Exception as exc:
        logger.error(
            "An Exception occurred while listing objects in bucket\n {}".format(
                str(exc)))

def get_status(self, cluster_id):
    """Get the status of the EMR instance."""
    try:
        cluster = self._emr.describe_cluster(ClusterId=cluster_id)
        return cluster.get('Cluster', {}).get('Status')
    except ClientError:
        logger.error("Unable to get the cluster info",
                     extra={"cluster_id": cluster_id})

def upload_file(self, src, target):
    """Upload file into S3 Bucket."""
    try:
        return self._s3.Bucket(self.bucket_name).upload_file(src, target)
    except Exception as exc:
        logger.error(
            "An Exception occurred while uploading a file \n{}".format(
                str(exc)))

def list_bucket_keys(self):
    """List all the keys in bucket."""
    try:
        return [i.key for i in self.list_bucket_objects()]
    except Exception as exc:
        logger.error(
            "An Exception occurred while listing bucket keys\n {}".format(
                str(exc)))

def read_generic_file(self, filename):
    """Retrieve remote object content."""
    try:
        return self._s3.Object(self.bucket_name,
                               filename).get()['Body'].read()
    except Exception as exc:
        logger.error(
            "An Exception occurred while retrieving an object\n {}".format(
                str(exc)))

def load_hyper_params():
    """Load the hyper parameters from the command line args."""
    if len(argv) > 1:
        input_data = argv[1:]
        try:
            if input_data:
                return loads(input_data[0])
        except Exception:
            logger.error("Unable to decode the hyper params")

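# Usage sketch: the hyper params are expected as a single JSON string in the
# first positional argument (script name illustrative):
#
#     $ python training_job.py '{"max_depth": 10, "n_estimators": 200}'
#
# load_hyper_params() then returns {'max_depth': 10, 'n_estimators': 200}.
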
def check_field_exists(input_data, fields):
    """Check that the given fields exist in the input data."""
    if isinstance(input_data, dict):
        for field in fields:
            if not input_data.get(field):
                logger.error(
                    "Please provide a valid value for the field {}".format(field))
    if isinstance(input_data, (list, dict, set, frozenset)):
        return list(set(fields).difference(set(input_data)))
    raise ValueError(
        "Expected a dict, list, set or frozenset, got {}".format(type(input_data)))

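# Behaviour sketch (illustrative values): the return value is the list of
# required fields missing from the input, so an empty list means everything
# is present:
#
#     check_field_exists({'environment': 'dev'}, ['environment', 'bucket_name'])
#     # -> ['bucket_name']
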
def from_existing_df(self, df_, package):
    """Find the keywords from the existing dump."""
    if not df_.empty:
        data_lst = df_.loc[
            df_['name'] == str(package),
            ['name', 'description', 'keywords', 'dependencies']].iloc[0]
        return data_lst
    logger.error("Node package details DataFrame does not exist.")
    return self.df_

def read_pickle_file(self, filename):
    """Read Pickle file from the S3 bucket."""
    try:
        pickle_content = pickle.loads(self.read_generic_file(filename))
        return pickle_content
    except ValueError:
        logger.error("Not a valid pickle file provided.")
    except Exception as exc:
        logger.error(
            "An Exception occurred while retrieving a pickle file \n{}".format(
                str(exc)))

def s3_delete_object(self, object_key):
    """Delete an object in the bucket."""
    try:
        return self._s3.Bucket(self.bucket_name).delete_objects(
            Delete={"Objects": [{'Key': object_key}]})
    except Exception as exc:
        logger.error(
            "An Exception occurred while deleting object\n {}".format(
                str(exc)))

def s3_clean_bucket(self):
    """Clean the bucket."""
    try:
        all_keys = self.list_bucket_keys()
        self.s3_delete_objects(all_keys)
        logger.info("`{}` bucket has been cleaned.".format(self.bucket_name))
    except Exception as exc:
        logger.error(
            "An Exception occurred while cleaning the bucket\n {}".format(
                str(exc)))

def load_matlab_multi_matrix(self, local_filename):
    """Load a '.mat' file & return a dict representation.

    :local_filename: The path of the object.
    :returns: A dict containing numpy matrices against the keys of the
              multi-matrix.
    """
    try:
        model_dict = loadmat(os.path.join(self.src_dir, local_filename))
        return model_dict
    except Exception as exc:
        logger.error("Unable to load mat file \n{}".format(str(exc)))

def list_bucket_objects(self, prefix=None):
    """List all the objects in bucket."""
    try:
        if prefix:
            return self._s3.Bucket(
                self.bucket_name).objects.filter(Prefix=prefix)
        else:
            return self._s3.Bucket(self.bucket_name).objects.filter()
    except Exception as exc:
        logger.error(
            "An Exception occurred while listing objects in bucket\n {}".format(
                str(exc)))

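# Usage sketch (hypothetical instance and prefix): a prefix narrows the
# listing to one "folder" of the bucket, otherwise everything is returned:
#
#     for obj in s3.list_bucket_objects(prefix='intermediate-model/'):
#         print(obj.key)
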
def get_version(self, api_data):
    """Give the latest version for the package."""
    if api_data:
        try:
            latest_version = api_data['dist-tags']['latest']
            return latest_version
        except Exception:
            logger.info("Unable to fetch latest version from API data.")
            return ''
    else:
        logger.error("API Data is not available.")
        return ''

def read_json_file(self, data_in_bytes):  # pragma: no cover
    """Read a big json file."""
    try:
        coded_data = data_in_bytes.decode('utf-8')
        io_data = io.StringIO(coded_data)
        json_data = io_data.readlines()
        data = list(map(json.loads, json_data))
        df = pd.DataFrame(data)
        return df
    except Exception:
        logger.error("Unable to read json file.")
        return self.df_

def get_file_content(url, session=None):
    """Customize get file content."""
    if session:
        session.timeout = 10
    if pip_download._scheme_re.search(
            url.decode() if not isinstance(url, str) else url):
        try:
            resp = session.get(url)
            resp.raise_for_status()
            return resp.content.decode()
        except Exception as _exc:
            logger.error('IGNORE: {}'.format(str(_exc)))
            return ''
    return url

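# Behaviour sketch: if the argument looks like a URL (per pip's scheme regex)
# the content is fetched over the session, otherwise the argument is assumed
# to already be the file content and is returned untouched (values
# illustrative):
#
#     get_file_content('https://example.com/requirements.txt', session=session)
#     get_file_content('requests==2.22.0\n')  # -> 'requests==2.22.0\n'
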
def read_yaml_file(self, filename):
    """Read Yaml file from the S3 bucket."""
    try:
        yaml = YAML()
        yaml_content = yaml.load(self.read_generic_file(filename))
        # convert to dict
        return json.loads(json.dumps(yaml_content))
    except ValueError:
        logger.error("Not a valid yaml file provided.")
    except Exception as exc:
        logger.error(
            "An Exception occurred while retrieving a yaml file \n{}".format(
                str(exc)))

def read_json_file(self, filename):
    """Read JSON file from the S3 bucket."""
    try:
        utf_data = self.read_generic_file(filename)
        # python <= 3.5 requires string to load
        if isinstance(utf_data, (bytearray, bytes)):
            utf_data = utf_data.decode('utf-8')
        return json.loads(utf_data)
    except ValueError:
        logger.error("Not a valid json file provided.")
    except Exception as exc:
        logger.error(
            "An Exception occurred while retrieving a json file \n{}".format(
                str(exc)))

def construct_packages(self, content):
    """Construct the package list from package.json content."""
    if content:
        content = content.decode() if not isinstance(content, str) else content
        dependencies = {}
        try:
            decoded_json = demjson.decode(content)
        except Exception as _exc:
            logger.error("IGNORE {}".format(str(_exc)))
            decoded_json = self.handle_corrupt_packagejson(content)
        if decoded_json and isinstance(decoded_json, dict):
            dependencies = decoded_json.get('dependencies', {})
        return list(dependencies.keys()
                    if isinstance(dependencies, dict) else [])
    return []

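# Behaviour sketch (illustrative package.json content): only the keys of the
# "dependencies" object are returned:
#
#     construct_packages('{"dependencies": {"lodash": "^4.0.0", "express": "^4.16.0"}}')
#     # -> ['lodash', 'express']
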
def construct_job(self, input_dict):
    """Submit the EMR job."""
    required_fields = [
        'environment', 'data_version', 'bucket_name', 'github_repo'
    ]
    missing_fields = check_field_exists(input_dict, required_fields)
    if missing_fields:
        logger.error("Missing parameters in input_dict",
                     extra={"missing_fields": missing_fields})
        raise ValueError(
            "Required fields are missing in the input {}".format(
                missing_fields))

    self.env = input_dict.get('environment')
    self.data_version = input_dict.get('data_version')
    github_repo = input_dict.get('github_repo')
    if not check_url_alive(github_repo):
        logger.error("Unable to find the github_repo {}".format(github_repo))
        raise ValueError(
            "Unable to find the github_repo {}".format(github_repo))
    self.training_repo_url = github_repo

    self.hyper_params = input_dict.get('hyper_params', '{}')
    aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
        or input_dict.get('aws_access_key')
    aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY") \
        or input_dict.get('aws_secret_key')
    github_token = os.getenv("GITHUB_TOKEN", input_dict.get('github_token'))
    self.bucket_name = input_dict.get('bucket_name')

    if self.hyper_params:
        try:
            self.hyper_params = json.dumps(input_dict.get('hyper_params', {}),
                                           separators=(',', ':'))
        except Exception:
            logger.error(
                "Invalid hyper params",
                extra={"hyper_params": input_dict.get('hyper_params')})

    self.properties = {
        'AWS_S3_ACCESS_KEY_ID': aws_access_key,
        'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
        'AWS_S3_BUCKET_NAME': self.bucket_name,
        'MODEL_VERSION': self.data_version,
        'DEPLOYMENT_PREFIX': self.env,
        'GITHUB_TOKEN': github_token
    }

    self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                             aws_secret_access_key=aws_secret_key)
    self.aws_emr_client = self.aws_emr.connect()
    if not self.aws_emr.is_connected():
        logger.error("Unable to connect to the EMR instance.")
        raise ValueError("Unable to connect to the EMR instance.")
    logger.info("Successfully connected to the EMR instance.")

def s3_delete_objects(self, object_keys):
    """Delete multiple objects in the bucket."""
    try:
        if not isinstance(object_keys, list):
            raise ValueError("Expected {}, got {}".format(
                type(list()), type(object_keys)))
        return self._s3.Bucket(self.bucket_name).delete_objects(
            Delete={"Objects": [{'Key': k} for k in object_keys]})
    except Exception as exc:
        logger.error(
            "An Exception occurred while deleting objects \n {}".format(
                str(exc)))

def load_matlab_multi_matrix(self, s3_path):
    """Load a '.mat' file & return a dict representation.

    :s3_path: The path of the object in the S3 bucket.
    :returns: A dict containing numpy matrices against the keys of the
              multi-matrix.
    """
    local_filename = os.path.join('/tmp', s3_path.split('/')[-1])
    self._s3.Bucket(self.bucket_name).download_file(s3_path, local_filename)
    model_dict = loadmat(local_filename)
    if not model_dict:
        logger.error("Unable to load the model for scoring")
    return model_dict

def from_npm_registry(self, package):
    """Find the keywords from the NPM registry (through the API)."""
    data_dict = self.dict_
    api_url = "https://registry.npmjs.org/" + str(package)
    try:
        api_data = requests.get(api_url).text
        json_data = json.loads(api_data)
        data_dict['name'] = json_data.get('name', '')
        data_dict['description'] = json_data.get('description', '')
        data_dict['keywords'] = json_data.get('keywords', [])
        data_dict['dependencies'] = self.get_dependencies(json_data)
        return data_dict
    except Exception:
        logger.error("Can't fetch the keywords from the NPM registry.")
        return data_dict

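# Result sketch for an illustrative package (actual field values depend on
# what the registry returns; `kwd` is a hypothetical instance):
#
#     kwd.from_npm_registry('express')
#     # -> {'name': 'express', 'description': '...', 'keywords': [...],
#     #     'dependencies': [...]}
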
def make_kwd_dependencies_df(self, data_df, unique_packages):
    """Create the keyword and dependencies DataFrames."""
    keyword_df = self.df_
    dependencies_df = self.df_
    try:
        keyword_df = data_df.loc[data_df['name'].isin(unique_packages),
                                 ['name', 'keywords']]
    except Exception:
        logger.error("Keyword is not present.")
    try:
        dependencies_df = data_df.loc[data_df['name'].isin(unique_packages),
                                      ['name', 'dependencies']]
    except Exception:
        logger.error("Dependencies are not present.")
    return keyword_df, dependencies_df

def save_on_s3(self, folder_path):
    """Store all the contents on S3."""
    try:
        if os.path.exists(folder_path):
            if 'intermediate-model' in folder_path:
                self.s3_client.s3_upload_folder(
                    folder_path=folder_path,
                    prefix=self.version_name + '/intermediate-model')
            else:
                self.s3_client.s3_upload_folder(
                    folder_path=folder_path,
                    prefix=self.version_name)
            logger.info("Folders are successfully saved on S3.")
        else:
            logger.error("Folder path doesn't exist.")
    except Exception:
        raise

def get_training_file_url(user, repo, branch='master',
                          training_file_path='training/train.py'):
    """Get the training file from the github repo."""
    if not user or not repo:
        logger.error("Please provide the github user and repo",
                     extra={"user": user, "repo": repo})
        raise ValueError("Please provide the github user:{} and repo:{}"
                         .format(user, repo))
    file_url = urljoin(GITHUB_CONTENT_BASEURL,
                       '/'.join((user, repo, branch, training_file_path)))
    if not check_url_alive(file_url):
        logger.error("Unable to reach the github training file path",
                     extra={'github_url': file_url})
        raise ValueError("Could not fetch the training file")
    return file_url

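# Usage sketch (hypothetical user/repo; assumes GITHUB_CONTENT_BASEURL points
# at the raw GitHub content host, e.g. https://raw.githubusercontent.com/):
#
#     get_training_file_url('my-org', 'my-training-repo')
#     # -> 'https://raw.githubusercontent.com/my-org/my-training-repo/master/training/train.py'
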
def parse_requirements(content, session=PipSession(), *args, **kwargs):
    """Customize pip parse_requirements."""
    _content = get_file_content(content, session=session)
    lines_enum = pip_req.preprocess(_content, None)
    for line_number, line in lines_enum:
        try:
            req_iter = pip_req.process_line(line, 'requirements.txt',
                                            line_number, None, None, None,
                                            session, None, use_pep517=None,
                                            constraint=None)
            for req in req_iter:
                if req.name:
                    yield normalize_name(req.name)
        except Exception as _exc:
            logger.error('IGNORE: {} T(EXC):{} T(con):{}'.format(
                str(_exc), type(_exc), type(content)))
            logger.error('IGNORE CONTENT: {}'.format(content))

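# Usage sketch (illustrative content; assumes normalize_name canonicalizes
# names to lowercase):
#
#     list(parse_requirements('Django==2.2\nrequests>=2.20\n'))
#     # -> ['django', 'requests']
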