import os
import re

# Repo-internal helpers. The module paths below are assumed from the
# one-module-per-class naming convention used elsewhere in this repo.
from file_helper import FileHelper
from local_settings_loader import LocalSettingsLoader
from shell_executor import ShellExecutor


class YoutubeMusicVideoSync(object):
    YOUTUBE_VIDEO_ID_LENGTH = 11

    def __init__(self):
        self.YOUTUBE_VIDEO_ID_LIST = LocalSettingsLoader(
        ).LOCAL_SETTINGS['YOUTUBE_VIDEO_ID_LIST']
        self.file_helper_obj = FileHelper()
        self.shell_executor = ShellExecutor()

    def sync_mvs(self):
        # execute_shell_command returns the command's output lines;
        # the first line is the expanded folder path.
        mv_folder_path = self.shell_executor.execute_shell_command(
            cmd='echo $SYSTEM_MUSIC_VIDEOS_FOLDER')
        file_name_list = os.listdir(mv_folder_path[0])

        # Recover the YouTube video id from each file name. The id is the
        # trailing 11 characters of the file stem; since ids may themselves
        # contain '-', the '-'-separated parts are rejoined from the end
        # until the id length is reached.
        existing_mv_list = []
        for file_name in file_name_list:
            file_name = ''.join(file_name.split('.')[:-1])
            parts = file_name.split('-')
            parts_len = len(parts)
            file_name = ''
            for i in range(0, parts_len):
                if i == 0:
                    file_name = parts[parts_len - i - 1]
                else:
                    file_name = '{}-{}'.format(
                        parts[parts_len - i - 1], file_name)
                if len(file_name) >= self.YOUTUBE_VIDEO_ID_LENGTH:
                    break
            if len(file_name) == self.YOUTUBE_VIDEO_ID_LENGTH and ' ' not in file_name:
                existing_mv_list.append(file_name)

        # Download every configured video that is not on disk yet.
        for item in self.YOUTUBE_VIDEO_ID_LIST:
            if item not in existing_mv_list:
                print('INFO: New video found to sync with id: {video_id}'.format(
                    video_id=item))
                self.shell_executor.execute_shell_command(
                    cmd='downloadYoutubeVideo {video_id}'.format(video_id=item))

        # Register every on-disk video missing from local_settings.py by
        # prepending its id to the YOUTUBE_VIDEO_ID_LIST literal.
        for item in existing_mv_list:
            if item not in self.YOUTUBE_VIDEO_ID_LIST:
                self.YOUTUBE_VIDEO_ID_LIST.append(item)
                dir_path = os.path.dirname(os.path.realpath(__file__))
                file_path = os.path.join(dir_path, 'local_settings.py')
                file_content = self.file_helper_obj.read_file(
                    dir_path=file_path)
                # Keep the settings file's list indentation intact.
                replace_str = '\'' + item + '\',\n    '
                file_content = re.sub(
                    r'(\'YOUTUBE_VIDEO_ID_LIST\': \[)',
                    r'\1{replace_str}'.format(replace_str=replace_str),
                    file_content)
                self.file_helper_obj.write_on_file_force(
                    dir_path=file_path, file_content=file_content)
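# --- Usage sketch (not part of the original module) -----------------------
# A minimal, hypothetical restatement of the id-extraction rule sync_mvs
# applies to file names, to make the loop above easier to follow: the id is
# the trailing 11 characters of the file stem, rebuilt from the trailing
# '-'-separated parts because YouTube ids may themselves contain '-'.
# The helper and the file names are illustrative only.

def _extract_video_id(file_name, id_length=11):
    stem = ''.join(file_name.split('.')[:-1])
    parts = stem.split('-')
    candidate = parts[-1]
    i = 2
    # Rejoin parts from the end until the candidate reaches the id length.
    while len(candidate) < id_length and i <= len(parts):
        candidate = '{}-{}'.format(parts[-i], candidate)
        i += 1
    if len(candidate) == id_length and ' ' not in candidate:
        return candidate
    return None


assert _extract_video_id('Artist - Song-abcDEF12345.mp4') == 'abcDEF12345'
assert _extract_video_id('Artist - Song-abc-DEF1234.mp4') == 'abc-DEF1234'

# Running the sync assumes the shell environment defines
# SYSTEM_MUSIC_VIDEOS_FOLDER and a downloadYoutubeVideo command,
# as the class expects.
if __name__ == '__main__':
    YoutubeMusicVideoSync().sync_mvs()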
import logging
import re
from datetime import datetime

# Repo-internal helper; the module path is assumed from the import used by
# the application controller elsewhere in this repo.
from shell_executor import ShellExecutor, ShellException

logger = logging.getLogger(__name__)


class hdfsManager(object):
    """ Manager class for HDFS directory operations. """

    def __init__(self, topic):
        self.topic = topic
        self.shell_exec = ShellExecutor()

    def makedir(self, dir_path):
        """
        Makes an HDFS directory at dir_path

        :param dir_path: HDFS path
        :return: None
        """
        try:
            self.shell_exec.safe_execute("hadoop fs -mkdir -p %s" % dir_path)
            logger.info("Created %s" % dir_path)
        except ShellException:
            logger.error("HDFS makedir failed. %s" % dir_path)
            raise

    def rmdir(self, dir_path):
        """
        Removes HDFS directory at dir_path

        :param dir_path: HDFS path
        :return: None
        """
        try:
            self.shell_exec.safe_execute("hadoop fs -rm -r %s" % dir_path)
            logger.info("Deleted %s" % dir_path)
        except ShellException:
            logger.error("HDFS rmdir failed. %s" % dir_path)
            raise

    def putfile(self, local_path, hdfs_path):
        """
        Copies a resource from a local path to an HDFS path

        :param local_path: Path to resource on the local filesystem
        :param hdfs_path: HDFS destination path
        :return: None
        """
        try:
            self.shell_exec.safe_execute(
                "hadoop fs -put %s %s" % (local_path, hdfs_path))
            logger.info("Put %s to HDFS path %s" % (local_path, hdfs_path))
        except ShellException:
            logger.error("HDFS putfile failed. %s %s" % (local_path, hdfs_path))
            raise

    def force_putfile(self, local_path, hdfs_path):
        """
        Copies a resource from a local path to an HDFS path, overwriting any
        existing destination (hadoop fs -put -f).

        :param local_path: Path to resource on the local filesystem
        :param hdfs_path: HDFS destination path
        :return: None
        """
        try:
            self.shell_exec.safe_execute(
                "hadoop fs -put -f %s %s" % (local_path, hdfs_path))
            logger.info("Force put %s to HDFS path %s" % (local_path, hdfs_path))
        except ShellException:
            logger.error("HDFS force putfile failed. %s %s" % (local_path, hdfs_path))
            raise

    def get_start_dir(self, last_dir, start_dir):
        """
        Picks the directory to resume from: whichever of last_dir and
        start_dir is more recent, compared on the numeric part of the
        directory name.

        :param last_dir: Last processed HDFS directory, or None
        :param start_dir: User-supplied starting directory, or None
        :return: The directory to proceed from
        """
        if start_dir is None:
            return last_dir
        if last_dir is None:
            return start_dir
        # Compare the numeric parts as integers; comparing them as strings
        # would order '9...' after '10...'.
        last_dir_int = int(re.sub("[^0-9]", "", str(last_dir)))
        start_dir_int = int(re.sub("[^0-9]", "", str(start_dir)))
        if last_dir_int < start_dir_int:
            return start_dir
        return last_dir

    def get_new_dirs(self, last_dir, start_dir, hdfs_path):
        """
        :param last_dir: Last processed HDFS directory
        :param start_dir: User-supplied starting directory, or None
        :param hdfs_path: HDFS topic location to list
        :return: A list of HDFS directories pending processing. Expected
                 format is a list of tuples like:
                 ('2015-08-19 10:12', '/data/ds_ctg/trinity/thrive_test/d_20150819-1710')
        """
        # List the trinity topic location for the dataset
        cmd = "hadoop fs -ls %s" % hdfs_path
        result = self.shell_exec.safe_execute(cmd)

        # result.output holds the directory listing. Sample line:
        # drwxr-xr-x - sys_bio_ctgdq bio_hadoop_ds 0 2015-08-19 10:12 /data/ds_ctg/trinity/thrive_test/d_20150819-1710
        output = result.output

        # Pattern capturing
        # (1) the HDFS modification timestamp, e.g. 2015-08-19 10:12
        # (2) the directory path, e.g. /data/ds_ctg/trinity/thrive_test/d_20150819-1710
        dentry_pattern = r".*([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}) (.*)"
        all_dirs = re.findall(dentry_pattern, output)

        # Sort dirs according to date. The date is obtained from the dirname
        # (e.g. 'd_20150311-1610') by retaining only the numeric parts of the
        # string and converting to int (e.g. 201503111610).
        all_dirs.sort(key=lambda s: int(re.sub("[^0-9]", "", s[1])))

        proceed_dir = self.get_start_dir(last_dir, start_dir)
        try:
            if proceed_dir is None:
                # First load: process all directories found.
                pending_dir_info = all_dirs
            else:
                # Find the position of the last processed directory. Each
                # entry is a (timestamp, path) tuple, so search inside the
                # tuple for proceed_dir. Using next() without a default makes
                # a missing directory raise StopIteration instead of silently
                # reprocessing everything.
                lastindex = next(i for i, sublist in enumerate(all_dirs)
                                 if proceed_dir in sublist)
                # Note: the -1 end of the slice also drops the newest
                # directory, which may still be being written to.
                pending_dir_info = all_dirs[lastindex + 1:-1]
        except StopIteration:
            logger.error("Last processed directory %s not found in topic location %s"
                         % (proceed_dir, hdfs_path))
            raise

        pending_dir = [item[1] for item in pending_dir_info]
        logger.info("Retrieved processed directory %s" % proceed_dir)
        logger.info("Pending directories %s" % pending_dir)
        return pending_dir_info

    def retrieve_hdfs_ts(self, dir_info, outfile):
        """
        Retrieves server & HDFS timestamps for every message in a directory.
        Note that this method is called in a loop in hdfs_thread_manager.

        :param dir_info: (HDFS timestamp, path) tuple for a pending directory
        :param outfile: File to write the output lines to
        :return: None
        """
        hdfs_ts, path = dir_info
        cmd = "hadoop fs -text %s/*" % path
        result = self.shell_exec.safe_execute(cmd)

        # Individual messages decoded from the snappy files
        msg = result.output

        # Pattern fetching event_id and server_timestamp from each message
        pattern = re.compile(r'.*"event_id":"(.*?)".*?"server_timestamp":([0-9]+)')
        event_data = re.findall(pattern, msg)

        for item in event_data:
            event_id, server_unix_ts = item
            # Convert a UNIX timestamp in milliseconds to a standard timestamp
            server_ts = datetime.fromtimestamp(
                int(server_unix_ts) / 1000).strftime("%Y-%m-%d %H:%M:%S")
            # Append seconds to hdfs_ts so Hive recognizes it as a timestamp
            output = (event_id, server_ts, hdfs_ts + ":00\n")
            # Separate fields with Control-A
            ctrl_A = '\x01'
            outfile.write(ctrl_A.join(output))

    def create_hdfs_ts_ptn(self, partition, table, hive_hdfs_ts_path):
        """
        Adds a partition to the server & HDFS timestamp Hive table.

        :param partition: The partition to be created. Expected format: YYYY/MM/DD/HH
        :param table: Base name of the Hive table ('_hdfs' suffix is appended)
        :param hive_hdfs_ts_path: HDFS location backing the partition
        :return: None
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = partition.split("/")
        create_ptn_cmd = '''
            hive -e "use flowview;
            alter table %s_hdfs
            add if not exists partition (year = %s, month = %s, day = %s, hour = %s)
            location '%s/%s/%s/%s/%s'";
        ''' % (table,
               ptn_year, ptn_month, ptn_day, ptn_hour,
               hive_hdfs_ts_path,
               ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            self.shell_exec.safe_execute(create_ptn_cmd, splitcmd=False, as_shell=True)
        except ShellException:
            logger.error("Error in creating hive table to store server and hdfs timestamp")
            raise
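# --- Usage sketch (not part of the original module) -----------------------
# A minimal sketch of the incremental-load flow, assuming a configured
# Hadoop client on PATH and the repo's ShellExecutor. The topic name and
# HDFS path reuse the sample values from the docstrings above; treat them
# and the output file name as hypothetical.
if __name__ == '__main__':
    mgr = hdfsManager(topic='thrive_test')
    topic_path = '/data/ds_ctg/trinity/thrive_test'
    # First load: last_dir/start_dir of None processes every directory found.
    pending = mgr.get_new_dirs(last_dir=None, start_dir=None,
                               hdfs_path=topic_path)
    with open('hdfs_ts.out', 'w') as outfile:
        for dir_info in pending:  # each entry is a (timestamp, path) tuple
            mgr.retrieve_hdfs_ts(dir_info, outfile)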
import getpass

# Repo-internal helpers; module paths assumed from the one-module-per-class
# naming convention used elsewhere in this repo.
from local_settings_loader import LocalSettingsLoader
from shell_executor import ShellExecutor


class GitOperations(object):

    def __init__(self):
        self.local_settings_loader_obj = LocalSettingsLoader()
        self.shell_executor_obj = ShellExecutor()
        self.system_user_name = getpass.getuser()
        # Every git command below is prefixed with a cd into the root repo
        # folder taken from the shell environment.
        self.git_folder_path = 'cd $SYSTEM_ROOT_GIT_REPO_FOLDER/'

    def perform_git_clone(self, git_url):
        self.shell_executor_obj.execute_shell_command(
            cmd='{git_folder_path} && git clone {git_url}'.format(
                git_folder_path=self.git_folder_path, git_url=git_url))

    def get_project_folder_list(self):
        # From the 'ls -l' output, drop the 'total ...' header line and the
        # trailing empty line, then keep the last column (the folder name).
        project_folder_list = self.shell_executor_obj.execute_shell_command(
            cmd='{git_folder_path} && ls -l'.format(
                git_folder_path=self.git_folder_path))
        del project_folder_list[0]
        project_folder_list = project_folder_list[:-1]
        project_name_list = []
        for project_folder in project_folder_list:
            parts = project_folder.split(' ')
            project_name_list.append(parts[-1])
        return project_name_list

    def clone_from_provider(self, provider_name, repo_list):
        project_name_list = self.get_project_folder_list()
        for item in repo_list:
            repo_name = item.split('/')[1]
            found = False
            for already_cloned_item in project_name_list:
                if repo_name in already_cloned_item:
                    found = True
                    break
            if found:
                print('======>>>>>', item, 'PREVIOUSLY CLONED')
                continue
            full_url = 'git@{provider_name}:{actual_path}.git'.format(
                provider_name=provider_name, actual_path=item)
            self.perform_git_clone(git_url=full_url)
            print('======>>>>>', item, 'CLONED')

    def clone_missing_repos(self):
        local_settings = self.local_settings_loader_obj.LOCAL_SETTINGS
        self.clone_from_provider(provider_name='github.com',
                                 repo_list=local_settings['GITHUB_REPO_LIST'])
        self.clone_from_provider(provider_name='bitbucket.org',
                                 repo_list=local_settings['BIT_BUCKET_REPO_LIST'])
        self.clone_from_provider(provider_name='gitlab.com',
                                 repo_list=local_settings['GITLAB_REPO_LIST'])

    def fetch_all_repos_and_reset_hard(self):
        project_name_list = self.get_project_folder_list()
        for project_name in project_name_list:
            project_folder_path = '{git_folder_path}{project_name}/'.format(
                git_folder_path=self.git_folder_path,
                project_name=project_name)
            self.shell_executor_obj.execute_shell_command(
                cmd='{project_folder_path} && printf "yes\n" | git fetch --all && gitResetHard'
                .format(project_folder_path=project_folder_path))
            branch_name = self.shell_executor_obj.execute_shell_command(
                cmd='{project_folder_path} && git branch'.format(
                    project_folder_path=project_folder_path))
            branch_name = branch_name[0].replace('* ', '')
            print('======>>>>>', project_name, branch_name, 'DONE')
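# --- Usage sketch (not part of the original module) -----------------------
# A minimal sketch, assuming $SYSTEM_ROOT_GIT_REPO_FOLDER is exported and
# local_settings.py lists repos as 'owner/repo' strings (the class builds
# git@<provider>:<owner/repo>.git URLs from them). The gitResetHard helper
# invoked above is expected to exist in the shell environment.
if __name__ == '__main__':
    ops = GitOperations()
    ops.clone_missing_repos()              # clone anything listed but absent
    ops.fetch_all_repos_and_reset_hard()   # then hard-sync every clone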
from cloud_provider import DigitalOceanCloudProvider
from input_adapter import FileInputAdapter
from shell_executor import ShellExecutor


class ApplicationController:

    def __init__(self, input_adapter, cloud_provider, shell_executor):
        self.input_adapter = input_adapter
        self.cloud_provider = cloud_provider
        self.shell_executor = shell_executor

    def run(self):
        config = self.input_adapter.read()
        print(config)
        ips = self.cloud_provider.run(config)
        self.shell_executor.run(config, ips)


ApplicationController(input_adapter=FileInputAdapter(file_name="input.json"),
                      cloud_provider=DigitalOceanCloudProvider(),
                      shell_executor=ShellExecutor()).run()
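# --- Usage sketch (not part of the original module) -----------------------
# The controller receives its collaborators through the constructor, so each
# can be swapped for a stand-in. Below is a hypothetical set of stubs for
# exercising run() without touching DigitalOcean or the shell; the
# read()/run() signatures are taken from how ApplicationController calls
# them above, and the config and IP values are placeholders.

class StubInputAdapter:
    def read(self):
        return {'example': 'config'}  # placeholder; the real schema lives in input.json


class StubCloudProvider:
    def run(self, config):
        return ['203.0.113.10']  # documentation-range IP, not a real host


class StubShellExecutor:
    def run(self, config, ips):
        print('would configure', ips, 'with', config)


ApplicationController(input_adapter=StubInputAdapter(),
                      cloud_provider=StubCloudProvider(),
                      shell_executor=StubShellExecutor()).run()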