def get_init_state(self, current_timestamp, df):
    # Rebuild the scheduler state at current_timestamp from the job trace.
    # Queued jobs: already created but not yet started at current_timestamp.
    queued_df = df.loc[(df['ctime'] <= current_timestamp) &
                       (df['start'] > current_timestamp)].copy()
    queued_df = queued_df.reset_index()
    tmp_queue_list = []
    for index, row in queued_df.iterrows():
        job_id = str(uuid.uuid4())
        job_args = {
            'create_time': row['ctime'],
            'job_name': row['jobname'],
            'start_time': None,
            'total_run_time': row['end'] - row['start'],
            'required_n_nodes': int(row['nhosts']),
            'used_nodes': [],
            'remained_running_time': None,
            'job_id': job_id
        }
        # Split the job into quantum-sized slices plus an optional remainder slice.
        num_full_slices = int(row['end'] - row['start']) // int(self.quantum)
        last_slice = int(row['end'] - row['start']) % int(self.quantum)
        tmp_list = []
        job_args['remained_running_time'] = self.quantum
        for i in range(num_full_slices):
            tmp_list.append(Job(**job_args))
        if num_full_slices == 0 or last_slice > 0:
            job_args['remained_running_time'] = last_slice
            tmp_list.append(Job(**job_args))
        tmp_queue_list.append(tmp_list)
    if len(tmp_queue_list) > 0:
        # Interleave the slices of all queued jobs round-robin into the waiting queue.
        max_slices = len(max(tmp_queue_list, key=len))
        for i in range(max_slices):
            for slices in tmp_queue_list:
                if i < len(slices):
                    self.waiting_queue.append(slices[i])
    else:
        self.waiting_queue = []

    # Running jobs: already started but not yet finished at current_timestamp.
    run_df = df.loc[(df['start'] <= current_timestamp) &
                    (df['end'] > current_timestamp)].copy()
    run_df = run_df.reset_index()
    for index, row in run_df.iterrows():
        job_args = {
            'create_time': row['ctime'],
            'job_name': row['jobname'],
            'start_time': row['start'],
            'total_run_time': row['end'] - row['start'],
            'required_n_nodes': int(row['nhosts']),
            'used_nodes': [],
            'remained_running_time': row['end'] - current_timestamp
        }
        available_nodes = Algorithm.get_available_nodes(self)
        if len(available_nodes) >= job_args['required_n_nodes']:
            job_args['used_nodes'] = available_nodes[0:job_args['required_n_nodes']]
            self.running_list.append(Job(**job_args))
            Algorithm.use_nodes(self, job_args['used_nodes'])
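# Not part of the original snippet: a minimal sketch of the Algorithm.get_available_nodes /
# Algorithm.use_nodes helpers assumed above, under the assumption that self.node_pool maps
# node names to 0 while a node is free (consistent with the __init__ shown later in this
# section). The real project's Algorithm class may differ.
class Algorithm:
    @staticmethod
    def get_available_nodes(scheduler):
        # Return the names of all nodes currently marked as free (value 0).
        return [name for name, state in scheduler.node_pool.items() if state == 0]

    @staticmethod
    def use_nodes(scheduler, node_names):
        # Mark the given nodes as busy so later allocations skip them.
        for name in node_names:
            scheduler.node_pool[name] = 1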
def parse(year, semester):
    # Queue one crawl task per department linked from the THU course index page.
    job = Job()
    url = 'http://course.thu.edu.tw/view-dept/' + str(year) + '/' + str(semester) + '/everything'
    res = requests.get(url)
    domain = 'http://course.thu.edu.tw'
    res = BeautifulSoup(res.text, 'lxml')
    for dp in res.select('tr a'):
        dp_url = domain + dp['href']
        job.add_job('thu', dp_parse, dp_url)
def data_parse(url):
    # Parse one PTT board index page and queue a body_parse task for every post link.
    job = Job()
    res = requests.get(url=url, cookies={'over18': '1'}).text
    soup = BeautifulSoup(res, 'lxml')
    rows = soup.select('.r-ent')
    for r in rows:
        links = r.select('a')
        if not links:
            # Deleted posts carry no link; skip them instead of raising IndexError.
            continue
        post_url = links[0]['href']
        job.add_job('ptt', body_parse, 'https://www.ptt.cc' + post_url)
def createJobFromImageFiles(self, filenames=None):
    """
    Create a job containing all images to convert
    @param: filenames List of image files
    @return: a Job instance containing all images to convert
    """
    masterJob = Job()
    # Guard against the default None so the loop always has an iterable.
    if filenames is None:
        filenames = []
    for filename in filenames:
        job = self.createJobFromImageFile(filename)
        masterJob.addJob(job)
    return masterJob
def parse(start, end):
    # Crawl the 591 rental listing API, 30 records per page.
    job = Job()
    if end == -1:
        # Total record count is unknown: fetch the first page to read it.
        url = ('https://rent.591.com.tw/home/search/rsList'
               '?is_new_list=1&type=1&kind=0&searchtype=1&region=1')
        res = requests.get(url, headers={
            'Host': 'rent.591.com.tw',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
        d = json.loads(res.text)
        end = int(d['records'].replace(",", ""))
    for i in range(start, end, 30):
        if i == 0:
            i = 1
        url = ('https://rent.591.com.tw/home/search/rsList'
               '?is_new_list=1&type=1&kind=0&searchtype=1&region=1'
               '&firstRow=' + str(i) + '&totalRows=' + str(end))
        job.add_job('s591', data_parse, url)
def createJobFromMediaDirectory(self, mediaDirectoryPath, videoFileExtensions, imageFileExtensions):
    """
    Creates a job to convert all supported media files.
    @param: mediaDirectoryPath Path to media directory
    @param: videoFileExtensions List of video file extensions
    @param: imageFileExtensions List of image file extensions
    @return: a Job instance containing all jobs to execute.
    """
    filenames = list()
    selectedVideoFilenames = list()
    selectedImageFilenames = list()
    # Retrieve list of all files in directory
    for root, dirs, files in os.walk(os.path.abspath(mediaDirectoryPath), topdown=False):
        for name in files:
            filename = os.path.join(root, name)
            filenames.append(filename)
    # Filter based on several criteria
    for filename in filenames:
        fileExtension = os.path.splitext(filename)[1][1:].strip()
        fileNameWithoutExtension = os.path.splitext(filename)[0].strip()
        fileBasenameWithoutExtension = basename(fileNameWithoutExtension)
        fileExtensionLowerCase = fileExtension.lower()
        # Accept only non-hidden files with a supported file extension
        if len(fileBasenameWithoutExtension) > 0 and len(fileExtensionLowerCase) > 0:
            if fileExtensionLowerCase in videoFileExtensions:
                selectedVideoFilenames.append(filename)
            elif fileExtensionLowerCase in imageFileExtensions:
                selectedImageFilenames.append(filename)
    masterJob = Job()
    videoJob = self.createJobFromVideoFiles(selectedVideoFilenames)
    masterJob.addJob(videoJob)
    imageJob = self.createJobFromImageFiles(selectedImageFilenames)
    masterJob.addJob(imageJob)
    return masterJob
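# A minimal usage sketch, not part of the original code: the factory object, the directory
# path, the extension lists, and the run() call below are assumptions for illustration only;
# the real project may expose these methods differently.
if __name__ == '__main__':
    factory = JobFactory()                      # hypothetical object providing the methods above
    job = factory.createJobFromMediaDirectory(
        '/path/to/media',                       # hypothetical media directory
        videoFileExtensions=['mp4', 'mkv', 'avi'],
        imageFileExtensions=['jpg', 'jpeg', 'png'])
    job.run()                                   # assumed Job entry point; actual API may differ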
def create(self, context_lib_name, payload):
    context = self.creator(kind='context', lib_name=context_lib_name,
                           name='Context', payload=payload)
    kwargs = {'context': context}
    logging.info(f'create new job for {context.name}, event {context.event}')
    # Look up per-context, per-event job settings; fall back to the defaults on any failure.
    try:
        with open(JOB_CONFIG) as f:
            kwargs.update(json.load(f)[context.name][context.event])
    except (OSError, KeyError, json.JSONDecodeError):
        logging.info('use default job')
    job = Job(**kwargs)
    # Wire up the runner and payload provider declared by the job.
    runner = self.creator(kind='runner', lib_name=job.runner_lib, name='Runner')
    job.set_runner(runner)
    payload_provider = self.creator(kind='job_payload', lib_name=job.payload_lib,
                                    name='JobPayloadProvider')
    job.set_payload_provider(payload_provider)
    return job
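# For illustration only, not from the original project: JOB_CONFIG is read as
# config[context.name][context.event] and merged into the Job kwargs, so a config file
# with hypothetical context names, event names, and field names might look like this:
#
# {
#     "github": {
#         "push": {"runner_lib": "shell", "payload_lib": "github_push"},
#         "pull_request": {"runner_lib": "docker", "payload_lib": "github_pr"}
#     }
# }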
def __init__(self, num_nodes, log_df, epoch_start, epoch_end):
    # Future job arrivals within the simulated epoch, ordered by creation time.
    self.log_queue = Queue.Queue()
    filtered_df = log_df.loc[(log_df['ctime'] >= epoch_start) &
                             (log_df['ctime'] < epoch_end)].copy()
    filtered_df = filtered_df.reset_index().sort_values('ctime')
    for index, row in filtered_df.iterrows():
        job_args = {
            'create_time': row['ctime'],
            'job_name': row['jobname'],
            'start_time': None,
            'total_run_time': row['end'] - row['start'],
            'required_n_nodes': int(row['nhosts']),
            'used_nodes': [],
            'remained_running_time': row['end'] - row['start']
        }
        self.log_queue.put(Job(**job_args))
    self.waiting_queue = []
    self.running_list = []
    # Node pool: node name -> 0 while the node is free.
    self.node_pool = {}
    for i in range(num_nodes):
        self.node_pool['b{}'.format(str(i))] = 0
import os

from job.job import Job
from local_windows.local_windows import LocalWindows

if __name__ == '__main__':
    job = Job('input.json')
    job.run_today_job()
def page_parse(board, start, end):
    # Queue one data_parse task per index page of the given PTT board.
    job = Job()
    for i in range(int(start), int(end) + 1):
        job.add_job('ptt', data_parse, 'https://www.ptt.cc/bbs/' + board + '/index%d.html' % i)
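# A driving sketch only, not part of the original crawler: page_parse above registers
# data_parse tasks on the 'ptt' queue, and each data_parse task in turn registers one
# body_parse task per post it finds. The board name and index-page range below are
# hypothetical; how the queued tasks are actually executed depends on the project's
# Job implementation.
if __name__ == '__main__':
    page_parse('Gossiping', 39000, 39010)   # hypothetical board and page range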