def construct_job(self, input_dict):
    """Submit the EMR job."""
    required_fields = ['environment', 'data_version', 'bucket_name', 'github_repo']
    missing_fields = check_field_exists(input_dict, required_fields)
    if missing_fields:
        logger.error("Missing parameters in input_dict",
                     extra={"missing_fields": missing_fields})
        raise ValueError("Required fields are missing in the input {}"
                         .format(missing_fields))

    self.env = input_dict.get('environment')
    self.data_version = input_dict.get('data_version')
    github_repo = input_dict.get('github_repo')
    if not check_url_alive(github_repo):
        logger.error("Unable to find the github_repo {}".format(github_repo))
        raise ValueError("Unable to find the github_repo {}".format(github_repo))
    self.training_repo_url = github_repo

    self.hyper_params = input_dict.get('hyper_params', '{}')
    aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
        or input_dict.get('aws_access_key')
    aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY") \
        or input_dict.get('aws_secret_key')
    github_token = os.getenv("GITHUB_TOKEN", input_dict.get('github_token'))
    self.bucket_name = input_dict.get('bucket_name')

    if input_dict.get('hyper_params'):
        try:
            # Serialize the hyper params compactly so they can be passed
            # through as a single property value.
            self.hyper_params = json.dumps(input_dict.get('hyper_params'),
                                           separators=(',', ':'))
        except Exception:
            logger.error("Invalid hyper params",
                         extra={"hyper_params": input_dict.get('hyper_params')})

    self.properties = {
        'AWS_S3_ACCESS_KEY_ID': aws_access_key,
        'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
        'AWS_S3_BUCKET_NAME': self.bucket_name,
        'MODEL_VERSION': self.data_version,
        'DEPLOYMENT_PREFIX': self.env,
        'GITHUB_TOKEN': github_token
    }

    self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                             aws_secret_access_key=aws_secret_key)
    self.aws_emr_client = self.aws_emr.connect()
    if not self.aws_emr.is_connected():
        logger.error("Unable to connect to the EMR instance.")
        raise ValueError("Unable to connect to the EMR instance.")
    logger.info("Successfully connected to the EMR instance.")
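
# Hedged usage sketch for construct_job: every value below is an illustrative
# placeholder (no real credentials, buckets, or repositories), and the owning
# object name `runner` is assumed.
#
#     runner.construct_job({
#         'environment': 'dev',
#         'data_version': '2019-01-01',
#         'bucket_name': 'example-training-bucket',
#         'github_repo': 'https://github.com/example-org/training-repo',
#         'hyper_params': {'lr': 0.01},
#     })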
def connect(self):
    """Connect to the S3 database."""
    try:
        session = boto3.session.Session(
            aws_access_key_id=self._aws_access_key_id,
            aws_secret_access_key=self._aws_secret_access_key,
            region_name=self.region_name)
        # The signature version is needed to connect to new regions,
        # which support only v4.
        if self._local_dev:
            self._s3 = session.resource(
                's3',
                config=botocore.client.Config(signature_version='s3v4'),
                use_ssl=self._use_ssl,
                endpoint_url=self._endpoint_url)
        else:
            self._s3 = session.resource(
                's3',
                config=botocore.client.Config(signature_version='s3v4'),
                use_ssl=self._use_ssl)
        logger.info("Connecting to S3")
        return self._s3
    except Exception as exc:
        logger.error("An exception occurred while establishing the AmazonS3 "
                     "connection: {}".format(str(exc)))
def run_job(self, input_dict):
    """Run the EMR job."""
    self.construct_job(input_dict)
    name = '{}_{}_training_{}'.format(self.env, self.ecosystem, self.current_time)
    bootstrap_uri = 's3://{bucket}/bootstrap.sh'.format(bucket=self.bucket_name)
    log_file_name = '{}.log'.format(name)
    log_uri = 's3://{bucket}/{log_file}'.format(
        bucket='{}-automated-analytics-spark-jobs'.format(self.env),
        log_file=log_file_name)
    emr_config_obj = EMRConfig(name=name,
                               s3_bootstrap_uri=bootstrap_uri,
                               training_repo_url=self.training_repo_url,
                               log_uri=log_uri,
                               ecosystem=self.ecosystem,
                               properties=self.properties,
                               hyper_params=self.hyper_params)
    configs = emr_config_obj.get_config()
    status = self.aws_emr.run_flow(configs)
    logger.info("EMR job is running {}".format(status))
    status_code = status.get('ResponseMetadata', {}).get('HTTPStatusCode')
    if status_code != 200:
        logger.error("EMR job failed with status code {}".format(status_code),
                     extra={"status": status})
    return status
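
# Hedged example of the names this method derives (values are illustrative):
# for env='dev', ecosystem='npm', current_time='2019-01-01', the cluster is
# named 'dev_npm_training_2019-01-01', bootstrapped from
# s3://<bucket_name>/bootstrap.sh, and its logs land at
# s3://dev-automated-analytics-spark-jobs/dev_npm_training_2019-01-01.log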
def from_github(self, package, url_df, api_url, api_token):
    """Find the keywords from the GitHub GraphQL API."""
    url_ = self.utility.get_url(url_df, package)
    keywords = list()
    if isinstance(url_, str):
        query_params = self.utility.get_query_params(url_)
        logger.info("Query parameters are: {}, {}".format(query_params[0],
                                                          query_params[1]))
        # Named `payload` so it does not shadow the stdlib json module.
        payload = {
            'query': '{{organization(login: "{0}"){{name url repository(name: "{1}")'
                     '{{name url description repositoryTopics(first: 10)'
                     '{{nodes{{topic {{name}}}}}}}}}}}}'
                     .format(str(query_params[0]), str(query_params[1]))
        }
        headers = {'Authorization': 'token %s' % api_token}
        try:
            response = requests.post(url=api_url, json=payload, headers=headers)
            keywords = list(self.clean_response(response.json()))
        except Exception:
            logger.error("Either the GitHub token is missing or no response "
                         "was received.")
    return keywords
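
# For query_params = ('example-org', 'example-repo') the rendered GraphQL
# query is (org and repo names illustrative):
#     {organization(login: "example-org"){name url
#      repository(name: "example-repo"){name url description
#      repositoryTopics(first: 10){nodes{topic {name}}}}}}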
def split_training_testing_data(self):
    """Split data into training and testing sets."""
    data_in_bytes = self.load_user_item_data()
    data = data_in_bytes.decode("utf-8")
    data_list = data.split('\n')
    pairs_train = []
    pairs_test = []
    user_id = 0
    np.random.seed(int(time.time()))
    logger.info("Splitting data into training and testing.")
    for line in data_list:
        arr = line.strip().split()
        # The first token on each line is the item count; the rest are item ids.
        arr = np.asarray([int(x) for x in arr[1:]])
        n = len(arr)
        idx = np.random.permutation(n)
        for i in range(min(self.num_train_per_user, n)):
            pairs_train.append((user_id, arr[idx[i]]))
        if n > self.num_train_per_user:
            for i in range(self.num_train_per_user, n):
                pairs_test.append((user_id, arr[idx[i]]))
        user_id += 1
    num_users = user_id
    pairs_train = np.asarray(pairs_train)
    pairs_test = np.asarray(pairs_test)
    num_items = np.maximum(np.max(pairs_train[:, 1]),
                           np.max(pairs_test[:, 1])) + 1
    logger.info("Number of users and items are respectively {}, {}"
                .format(num_users, num_items))
    return [pairs_train, pairs_test, num_users, num_items]
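
# Hedged sketch of the user-item line format this consumes (it matches what
# save_manifest_file_temporary writes): '<count> <item_id> ...', so for
#     '3 10 42 7'
# the user gets items [10, 42, 7], split into train/test pairs by a random
# permutation with at most self.num_train_per_user training pairs per user.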
def update_s3_bucket(self, data, bucket_name, filename='collated.json'):
    """Update the S3 bucket."""
    if self.s3_client is None:
        # Create the S3 client if it does not exist yet.
        self.s3_client = AmazonS3(
            bucket_name=bucket_name,
            aws_access_key_id=os.getenv('AWS_S3_ACCESS_KEY_ID'),
            aws_secret_access_key=os.getenv('AWS_S3_SECRET_ACCESS_KEY'))
    # Connect with the newly created or existing S3 client.
    self.s3_client.connect()
    if not self.s3_client.is_connected():
        raise ValueError("Unable to connect to s3.")
    json_data = dict()
    if self.s3_client.object_exists(filename):
        logger.info("{} exists, updating it.".format(filename))
        json_data = self.s3_client.read_json_file(filename)
        if not json_data:
            raise ValueError("Unable to get the json data path: {}/{}"
                             .format(bucket_name, filename))
    json_data.update(data)
    self.s3_client.write_json_file(filename, json_data)
    logger.info("Updated the file successfully!")
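
# Hedged usage sketch (the bucket name, payload, and owning object name are
# illustrative):
#     collator.update_s3_bucket({'npm': {'status': 'done'}},
#                               'example-collation-bucket')
# merges the given dict into collated.json, creating the file if it is absent.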
def load_s3(self):
    """Establish the connection with S3."""
    self.s3_object.connect()
    if self.s3_object.is_connected():
        logger.info("S3 connection established.")
        return self.s3_object
    raise Exception("Unable to establish the S3 connection.")
def check_path(self, path):
    """Check the given datastore path, creating it if it does not exist."""
    logger.info("Given path is: {}".format(path))
    if not os.path.exists(path):
        os.makedirs(path)
    return path
def save_numpy_matrix_temporary(self, content, filename, datastore):
    """Store a numpy matrix in temporary storage."""
    path = self.check_path(datastore)
    np.savez(os.path.join(path, filename), matrix=content)
    logger.info("Numpy matrix has been stored successfully.")
def save_json_file_temporary(self, content, filename, datastore):
    """Store a JSON file in temporary storage."""
    path = self.check_path(datastore)
    with open(os.path.join(path, filename), 'w') as f:
        json.dump(content, f)
    logger.info("JSON file has been stored successfully.")
def get_github_repo_info(repo_url):
    """Get the GitHub repository information."""
    logger.info("Received repository for the information",
                extra={'github_url': repo_url})
    if repo_url.endswith('.git'):
        repo_url = repo_url[:-len('.git')]
    user, repo = repo_url.split('/')[-2:]
    # Strip the host prefix of SSH-style URLs such as git@github.com:user/repo.
    user = user.split(':')[-1]
    return user, repo
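
# Examples of the URL shapes this handles (org/repo names illustrative):
#     get_github_repo_info('https://github.com/example-org/example-repo.git')
#         # -> ('example-org', 'example-repo')
#     get_github_repo_info('git@github.com:example-org/example-repo')
#         # -> ('example-org', 'example-repo')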
def from_existing_df(self, df_, package):
    """Find the keywords from the existing dump."""
    if not df_.empty:
        data_lst = df_.loc[df_['name'] == str(package),
                           ['name', 'description', 'keywords', 'dependencies']]
        if not data_lst.empty:
            return data_lst.iloc[0]
    else:
        logger.info("The node package details dataframe does not exist.")
    return self.df_
def s3_clean_bucket(self):
    """Clean the bucket."""
    try:
        all_keys = self.list_bucket_keys()
        self.s3_delete_objects(all_keys)
        logger.info("`{}` bucket has been cleaned.".format(self.bucket_name))
    except Exception as exc:
        logger.error("An exception occurred while cleaning the bucket:\n {}"
                     .format(str(exc)))
def check_url_alive(url, accept_codes=[401]):
    """Validate whether the GitHub repo exists."""
    try:
        logger.info("Checking whether the url is alive", extra={"url": url})
        response = request.urlopen(url)
        status_code = response.getcode()
        if status_code in accept_codes or status_code // 100 in (2, 3):
            return True
    except request.HTTPError as exc:
        # urlopen raises HTTPError for 4xx responses, so a 401 from a private
        # repository surfaces here rather than through getcode().
        if exc.code in accept_codes:
            return True
        logger.debug("Unable to reach url", extra={"exception": str(exc)})
    except Exception as exc:
        logger.debug("Unable to reach url", extra={"exception": str(exc)})
    return False
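
# 401 is accepted by default because an unauthenticated request to a private
# (but existing) GitHub repository is rejected rather than answered with 404.
# Hedged examples (require network access; the second URL is illustrative):
#     check_url_alive('https://github.com/fabric8-analytics')        # True
#     check_url_alive('https://github.com/example-org/no-such-repo') # False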
def save_manifest_file_temporary(self, content, filename, datastore):
    """Store the manifest file in temporary storage."""
    path = self.check_path(datastore)
    with open(os.path.join(path, filename), 'w') as f:
        for lst in content:
            f.write("{} {}\n".format(lst[0],
                                     " ".join(str(x) for x in lst[1:])))
    logger.info("Manifest file has been stored successfully.")
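
# Hedged sketch of the line format written above: lst[0] is the item count
# produced by make_user_data, so content [['3', '10', '42', '7']] yields the
# line '3 10 42 7'.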
def load_raw_data(self):
    """Load the raw data from the S3 bucket."""
    npm_raw_data_path = os.path.join(self.version_name, "data/manifest.json")
    try:
        raw_data_dict_ = self.s3_client.read_json_file(npm_raw_data_path)
        logger.info("Size of the raw manifest file is: {}"
                    .format(len(raw_data_dict_)))
        return raw_data_dict_
    except Exception:
        raise Exception("Unable to read the raw manifest file from S3.")
def get_version(self, api_data):
    """Return the latest version of the package."""
    if api_data:
        try:
            return api_data['dist-tags']['latest']
        except Exception:
            logger.info("Unable to fetch the latest version from API data.")
            return ''
    logger.error("API data is not available.")
    return ''
def get_result(self, job_id=None, job_query_obj=None):
    """Get the result of the job."""
    if job_id is None:
        job_query_obj = job_query_obj or self.job_query_obj
        for row in job_query_obj.result():
            yield {k: v for k, v in row.items()}
    else:
        job_obj = self.client.get_job(job_id)
        # Poll until the job leaves the PENDING state.
        while job_obj.state == 'PENDING':
            job_obj = self.client.get_job(job_id)
            logger.info("Job state for job id {} is {}".format(job_id,
                                                               job_obj.state))
            time.sleep(_POLLING_DELAY)
        yield from self.get_result(job_query_obj=job_obj)
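
# Hedged usage sketch (the client object and job id are illustrative): rows
# stream either from the stored query object or, given a job id, after the
# job has been polled out of the PENDING state.
#     for row in bq_client.get_result(job_id='example-job-id'):
#         print(row)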
def load_existing_data(self):
    """Load the node registry dump from the S3 bucket."""
    npm_clean_json_data_path = os.path.join(
        self.version_name, "data/node-package-details-with-url.json")
    try:
        logger.info("Reading the existing node package data dump from S3.")
        existing_data = self.s3_client.read_generic_file(npm_clean_json_data_path)
        existing_df = self.utility.read_json_file(existing_data)
        logger.info("Size of the raw df with url is: {}".format(len(existing_df)))
        return existing_df
    except Exception:
        raise Exception("S3 connection error")
def load_existing_data(self):
    """Load the node registry dump from the S3 bucket."""
    npm_clean_json_data_path = os.path.join("training-utils",
                                            "node-package-details.json")
    if self.s3_client.object_exists(npm_clean_json_data_path):
        try:
            logger.info("Reading dump data from the training-utils folder.")
            existing_data = self.s3_client.read_json_file(npm_clean_json_data_path)
            logger.info("Size of raw json: %d", len(existing_data))
            return existing_data
        except Exception:
            raise Exception("S3 connection error")
    raise ValueError("The given path is not present.")
def get_dependencies(self, api_data):
    """Return the dependencies of the latest version of the package."""
    version = self.get_version(api_data)
    logger.info("Latest version is: {}".format(version))
    versions_dict = api_data.get('versions', dict())
    try:
        if versions_dict:
            latest_version_data_dict = versions_dict.get(version, dict())
            if latest_version_data_dict:
                latest_dependencies = latest_version_data_dict.get(
                    'dependencies', dict())
                return list(latest_dependencies.keys())
    except Exception:
        pass
    # No version data or dependency information available.
    return list()
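
# Hedged sketch of the npm registry payload shape this expects (field names
# follow the public registry API; the values are illustrative):
#     api_data = {
#         'dist-tags': {'latest': '1.2.3'},
#         'versions': {'1.2.3': {'dependencies': {'lodash': '^4.17.0'}}},
#     }
#     get_dependencies(api_data)  # -> ['lodash']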
def load_existing_data(self):
    """Load the node registry dump from the S3 bucket."""
    npm_clean_json_data_path = os.path.join("training-utils",
                                            "node-package-details-with-url.json")
    if self.s3_client.object_exists(npm_clean_json_data_path):
        try:
            logger.info("Reading dump data from the training-utils folder.")
            existing_data = self.s3_client.read_generic_file(npm_clean_json_data_path)
            existing_df = self.utility.read_json_file(existing_data)
            logger.info("Size of the raw df with url is: {}".format(len(existing_df)))
            return existing_df
        except Exception:
            raise Exception("S3 connection error")
    raise ValueError("The given path is not present.")
def handle_response(self):
    """Process and collect the responses of the async requests."""
    results = list()
    for resp in self.responses:
        pkg_name, req_obj = resp
        if isinstance(req_obj, int):
            # Cached entries carry only the status code.
            if req_obj == 200:
                results.append(pkg_name)
        else:
            logger.info("Received status:{} for pkg:{}".format(
                req_obj.status_code, pkg_name))
            if req_obj.status_code == 200:
                results.append(pkg_name)
    return results
def save_on_s3(self, folder_path):
    """Store all the contents on S3."""
    if os.path.exists(folder_path):
        if 'intermediate-model' in folder_path:
            self.s3_client.s3_upload_folder(
                folder_path=folder_path,
                prefix=self.version_name + '/intermediate-model')
        else:
            self.s3_client.s3_upload_folder(folder_path=folder_path,
                                            prefix=self.version_name)
        logger.info("Folders are successfully saved on S3.")
    else:
        logger.error("Folder path doesn't exist.")
def make_user_data(self, manifest_list, unique_packages):
    """Return the user data, which is required for making the test data."""
    manifest_user_data = list()
    logger.info("Length of the manifest list is: {}".format(len(manifest_list)))
    logger.info("Number of unique packages is: {}".format(len(unique_packages)))
    if unique_packages:
        pkg_idx_map = self.create_package_map(unique_packages)
        for manifest in manifest_list:
            this_user_items = [str(pkg_idx_map[pkg]) for pkg in manifest]
            # Prefix each record with its item count.
            user_items = [str(len(this_user_items))] + this_user_items
            manifest_user_data.append(user_items)
    return list(manifest_user_data)
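
# Hedged worked example: with pkg_idx_map {'pkg-a': 0, 'pkg-b': 1} and a
# manifest ['pkg-a', 'pkg-b'], the emitted record is ['2', '0', '1'], the
# format consumed by save_manifest_file_temporary and the data splitter.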
def connect(self):
    """Connect to the EMR instance."""
    try:
        session = boto3.session.Session(
            aws_access_key_id=self._aws_access_key_id,
            aws_secret_access_key=self._aws_secret_access_key,
            region_name=self.region_name)
        self._emr = session.client(
            'emr',
            config=botocore.client.Config(signature_version='s3v4'),
            use_ssl=self._use_ssl)
        logger.info("Connecting to EMR")
        return self._emr
    except Exception as exc:
        logger.error("An exception occurred while establishing the AmazonEMR "
                     "connection: {}".format(str(exc)))
def create_content_matrix(self, pkg_tag_map, all_packages, vocabulary):  # pragma: no cover
    """Create the content matrix."""
    tag_idx_map = self.create_vocabulary_map(vocabulary)
    content_matrix = np.zeros([len(all_packages), len(vocabulary)])
    if tag_idx_map:
        for idx, package in enumerate(all_packages):
            try:
                package_tags = [tag_idx_map[tag]
                                for tag in pkg_tag_map[package]]
                if idx == 0:
                    logger.info("Setting to 1: {}".format(package_tags))
                content_matrix[idx, package_tags] = 1
            except KeyError:
                # Skip packages with no tags or with tags outside the vocabulary.
                continue
    return content_matrix
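
# Hedged worked example: with all_packages = ['a', 'b'], vocabulary =
# ['web', 'cli'], and pkg_tag_map = {'a': ['cli'], 'b': ['web']}, the
# resulting binary matrix is
#     [[0., 1.],
#      [1., 0.]]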
def is_fetch_done(self, callback=lambda x: x):
    """Check whether all the requests have been processed."""
    _flag = True
    # Iterate over a copy, since finished entries are removed in-place.
    for resp in list(self.process_queue):
        _flag = False
        others, url, req_obj = resp
        logger.info("other:{}, url:{}, req_obj:{}".format(others, url, req_obj))
        if url in self.cache:
            req_obj.cancel()
            self.process_queue.remove(resp)
            self.responses.append(self.cache[url])
        elif req_obj.done():
            req_obj.cancel()
            self.process_queue.remove(resp)
            result = (others, callback(req_obj))
            self.cache[url] = result
            self.responses.append(result)
    return _flag
def preprocess_data(self):
    """Preprocess the data and save it into temporary storage."""
    package_tag_map, vocabulary, manifest_user_data, unique_packages = \
        self.preprocess_data_obj.update_pkg_tag_map()
    package_tag_map = {k: list(v) for k, v in package_tag_map.items()}
    self.obj_.save_manifest_file_temporary(manifest_user_data,
                                           'manifest_user_data.dat',
                                           TEMPORARY_DATA_PATH)
    package_id_map = self.utils.create_package_map(unique_packages)
    id_package_map = dict(zip(range(len(unique_packages)),
                              list(unique_packages)))
    user_train_data, item_train_data, user_test_data, item_test_data = \
        self.obj_.train_test_data()
    content_matrix = self.utils.create_content_matrix(package_tag_map,
                                                      unique_packages,
                                                      vocabulary)
    self.obj_.save_json_file_temporary(package_id_map,
                                       'package_to_index_map.json',
                                       TEMPORARY_PATH)
    self.obj_.save_json_file_temporary(id_package_map,
                                       'index_to_package_map.json',
                                       TEMPORARY_PATH)
    self.obj_.save_json_file_temporary(package_tag_map,
                                       'package_tag_map.json',
                                       TEMPORARY_PATH)
    self.obj_.save_file_temporary(
        user_train_data,
        "packagedata-train-" + str(self.num_users) + "-users.dat",
        TEMPORARY_DATA_PATH)
    self.obj_.save_file_temporary(
        user_test_data,
        "packagedata-test-" + str(self.num_users) + "-users.dat",
        TEMPORARY_DATA_PATH)
    self.obj_.save_file_temporary(
        item_train_data,
        "packagedata-train-" + str(self.num_users) + "-items.dat",
        TEMPORARY_DATA_PATH)
    self.obj_.save_file_temporary(
        item_test_data,
        "packagedata-test-" + str(self.num_users) + "-items.dat",
        TEMPORARY_DATA_PATH)
    self.obj_.save_numpy_matrix_temporary(content_matrix,
                                          'content_matrix.npz',
                                          TEMPORARY_DATA_PATH)
    logger.info("All items are saved successfully in the temporary location.")
def find_keywords(self, df_, list_):
    """Find the keywords for the given list of lists of raw data."""
    package_lst = self.utility.flatten_list(list_)
    out_lst = list()
    for package in package_lst:
        # Keyword data order: [name, description, keywords, dependencies].
        pkg_kwd_lst = self.utility.make_list_from_series(
            self.from_existing_df(df_, package))
        if not pkg_kwd_lst or not isinstance(pkg_kwd_lst[2], list):
            logger.info("Finding from the NPM registry.")
            pkg_kwd_dict = self.from_npm_registry(package)
            pkg_kwd_lst = list(pkg_kwd_dict.values())
        if len(pkg_kwd_lst[2]) == 0:
            logger.info("Trying to fetch from GitHub.")
            api_url = 'https://api.github.com/graphql'
            api_token = self.get_data.github_token
            pkg_kwd_lst[2] = self.from_github(package, df_, api_url, api_token)
        out_lst.append(pkg_kwd_lst)
    return pd.DataFrame(out_lst,
                        columns=['name', 'description', 'keywords',
                                 'dependencies'])