def run(self):
    """Drive the whole parallel run: launch tasks, poll them, log results.

    Repeatedly fills free process slots from the task pool, polls the
    running processes, and finally writes the csv/html output.
    """
    self.start_time = datetime.datetime.now()
    self._initlog()
    self._initpool()
    logm('Parallel tasks: {} starts at {}'.format(
        self.name, strTime(self.start_time)))
    # Keep cycling while anything is still pooled or executing.
    while self.task_pool or self.process_running:
        has_free_slot = len(self.process_running) < self.max_process
        if self.task_pool and has_free_slot:
            # Fill an empty slot with the next pooled task.
            self._runNewTask()
        elif not self._checkAll():
            # Nothing finished this sweep: back off before polling again.
            self._avgsleep()
    self.end_time = datetime.datetime.now()
    loggood('Parallel tasks:', self.name, 'end at', strTime(self.end_time))
    n_err = len(self.err_list)
    err_word = 'error' if n_err in [0, 1] else 'errors'
    loggood('Total {:d}'.format(self.runned_tasks),
            'tasks with {:d}'.format(n_err),
            err_word,
            'using', strDiffTime(self.start_time, self.end_time))
    if n_err:
        self._printErrTasks()
    logm('Putting results into log')
    logp('Output csv file to', self.output_filename)
    self.output(lastTime=True)
def _initpool(self):
    """Fill the task pool via the subclass hook and record its size."""
    logm('Initial pooled tasks')
    # Pool construction is delegated to the subclass implementation.
    self.setupTaskPool(self.task_pool)
    Task.printCount()
    pool_size = len(self.task_pool)
    self.total_tasks = pool_size
    logp('Total pooled tasks:', pool_size)
def output(self, lastTime=False):
    """Run the parent output, then expose the log dir via a symlink.

    Links ~/public_html/log/<name>/<start_time> -> the real log dir.
    If the link path already exists and points elsewhere, the old entry
    is renamed with a '.backup' suffix (no new link is created in that
    case — original behavior kept).

    Parameters
    ----------
    lastTime : bool
        Forwarded to the parent ``output`` (final flush when True).
    """
    # original output function
    # Bug fix: `super.output(...)` (no call parens) raised AttributeError.
    super().output(lastTime)
    # link all the ouptput to ~/public_html/log/<name>/<date>/
    # Bug fix: os.path.exists()/makedirs() never expand '~', so the
    # directory check always failed — expand the user dir up front.
    home_log_dir = os.path.expanduser(
        os.path.join('~/public_html/log', self.name))
    if not os.path.exists(home_log_dir):
        os.makedirs(home_log_dir)
    sim_dir = os.path.join(home_log_dir, mp.strTime(self.start_time))
    logm('Create symbolic link at for log files at', sim_dir)
    abs_logdir = os.path.dirname(os.path.realpath(self.output_filename))
    # Bug fix: `os.exists` does not exist; use os.path.exists.
    if os.path.exists(sim_dir) and not os.path.samefile(sim_dir, abs_logdir):
        if os.path.islink(sim_dir):
            os.rename(sim_dir, sim_dir + '.backup')
            logwarn('link exists. Backup link as {:s}.backup -> {:s}'
                    .format(sim_dir,
                            os.path.realpath(os.path.expanduser(sim_dir))))
        else:
            os.rename(sim_dir, sim_dir + '.backup')
            # Bug fix: message was missing a space ('as{:s}').
            logwarn('file or dir exists. Backup as {:s}.backup'
                    .format(sim_dir))
    elif not os.path.exists(sim_dir):
        # Bug fix: dir_fd expects an open file descriptor, not a path
        # string, and the link name must be the destination path — build
        # the full link path instead.
        os.symlink(abs_logdir, sim_dir, target_is_directory=True)
def __init__(self, output_filename="", max_process=None, dump=True,
             start_now=True, update=True, name=None, html=True, sort=True):
    """Set up a parallel task runner.

    Parameters
    ----------
    output_filename : str
        Path of the csv log file (read later by ``run()``/``output()``).
    max_process : int or None
        None -> 1 process; <= 0 -> ``cpu_count() + max_process`` (i.e.
        leave ``-max_process`` CPUs free, minimum 1); > 0 used as-is.
    dump : bool
        Use the built-in dump-to-csv output instead of a custom one.
    start_now : bool
        Call ``run()`` immediately at the end of ``__init__``.
    update : bool
        Refresh the log file whenever a task finishes.
    name : str or None
        Run name; defaults to the class name when not given.
    html : bool
        Also write an html report on output.
    sort : bool
        Sorting switch used by the output stage.
    """
    # clear previous task setup log
    if Task.typeCount:
        Task.refresh()
    logm('Setup parallel task')
    # whole parallel task name
    if not name:
        logwarn('No name given, guessing by class name')
        self.name = self.__class__.__name__
    else:
        self.name = name
    logp('Parallel task name:', self.name)
    # Setup number of max process
    if max_process is None:
        # set default max process
        max_process = 1
    if max_process <= 0:
        # used_process_num = MAX_CPU_NUM - max_process
        logp('Number of process depends on the number of CPU')
        if cpu_count() + max_process <= 0:
            logwarn('Number of process reaches 0! Will be set to 1')
            self.max_process = 1
        else:
            self.max_process = cpu_count() + max_process
    else:
        # normal max_process assignment
        if max_process == 1:
            logwarn('Not using parallel function, use 1 process')
        elif max_process > cpu_count():
            logwarn('# of processes exceeds # of CPUs: {:d}'.format(
                cpu_count()), 'This may decrease speed!')
        self.max_process = max_process
    logp('Use', self.max_process, 'processes')
    # for basic structure
    self.output_list = []
    self.err_list = []
    self.task_pool = []
    self.process_running = []
    self.updated_len = 0
    # Bug fix: run() and output() read self.output_filename, but this
    # was only stored as self.out_filename (AttributeError at the end of
    # a run). Set both names to stay backward compatible.
    self.output_filename = output_filename
    self.out_filename = output_filename
    # parameters of current running status
    self.runned_tasks = 0
    self.dump = dump
    if not self.dump:
        logp('Using custom output function')
    self.html = html
    self.sort = sort
    self.update = update
    if start_now:
        self.run()
def _checkAll(self):
    """Sweep running tasks, dropping finished ones.

    Returns True when at least one task finished during this sweep
    (after optionally refreshing the log and reporting progress),
    False otherwise.
    """
    before = len(self.process_running)
    still_running = []
    for task in self.process_running:
        if not self._checkIfEnd(task):
            still_running.append(task)
    self.process_running = still_running
    if len(self.process_running) == before:
        # Nothing ended this round.
        return False
    # Something finished: optionally flush the log, then report progress.
    if self.update and (self.task_pool or self.process_running):
        self.output()
    logm('Runned {:d}/{:d} tasks with {:d}'.format(
        self.runned_tasks, self.total_tasks, len(self.err_list)),
        'error' if len(self.err_list) in [0, 1] else 'errors')
    return True
def output(self, lastTime=False):
    """Flush accumulated results to csv (and optionally html)."""
    if not lastTime:
        fresh = len(self.output_list) - self.updated_len
        noun = 'result' if fresh == 1 else 'results'
        logm('Update log file with {:d} new'.format(fresh), noun)
    # output results
    if self.dump:
        # dump the output results with name respectively.
        self.dump2csv(lastTime)
    else:
        self.parse2csv(self.output_list)
    # Remember how many results have been flushed so far.
    self.updated_len = len(self.output_list)
    if self.html:
        self.write2html(lastTime)
def fastq_list_Sample(Sample_pth_list, args):
    '''Main program of Sample mode, Project mode inheritently calls this
    function.

    Validates each input fastq group, prepares per-sample arguments and
    output dirs, then delegates to run_sample().
    '''
    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple sample is input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')
    # Set the environment up and reject bad parameters here, then hand
    # each sample over to the run_<...>() family.
    if args.seq_type == 'paired':
        # paired mode, run by pairs
        # exapmle: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
        # => [
        #     ['A_R0.gz', 'A_R1.gz'],
        #     ['B_R0.gz', 'B_R1.gz'],
        #     [...], ...
        # ]
        for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
            # validation, samples should be paired
            if len(paired_sample_list) != 2:
                logerr('Input samples :', ' ,'.join(paired_sample_list),
                       'is not paired!', 'Skipping ...')
                continue
            sample_R1, sample_R2 = paired_sample_list[0], paired_sample_list[1]
            # samples should exist
            # Bug fix: these two messages contained a raw newline inside a
            # single-quoted literal (SyntaxError); reconstructed in the
            # same style as the message above.
            if not op.exists(sample_R1):
                logerr('Input', sample_R1, 'does not exist!', 'Skipping ...')
                continue
            if args.seq_type == 'paired' and not op.exists(sample_R2):
                logerr('Input', sample_R2, 'does not exist!', 'Skipping ...')
                continue
            # copy args so if we change the args.out_dir or args.resume,
            # other samples will not be affected
            temp_args = copy.deepcopy(args)
            # if multiple samples, show the working progress
            if len(Sample_pth_list) > 1:
                logm('({:d}/{:d}) Processing paired:'
                     .format(i, len(Sample_pth_list)),
                     sample_R1, sample_R2)
            else:
                logm('Processing paired:', sample_R1, sample_R2)
            # use absolute path
            sample_R1, sample_R2 = op.abspath(sample_R1), op.abspath(sample_R2)
            if not args.out_dir:
                # default output path <path_of_sample_R1>/Tophat
                # if called by Project mode, args.out_dir will be set
                # automatically
                temp_args.out_dir = op.join(op.dirname(sample_R1), 'Tophat')
            else:
                if len(Sample_pth_list) > 1:
                    # make sample sub_dir
                    sample_name = ('Sample_' +
                                   op.basename(sample_R1).split('.')[0])
                    logp('subdir for sample name guessed from filename:',
                         sample_name)
                    # args.out_dir is root path shared by all samples
                    temp_args.out_dir = op.join(args.out_dir, sample_name)
            # if not previous work, turn off the --resume option
            cond_log_exist = op.exists(
                op.join(temp_args.out_dir, 'logs/tophat.log'))
            if temp_args.resume and not cond_log_exist:
                logp('previous log file tophat.log not found,',
                     'resume function is temporarily off')
                temp_args.resume = False
            # create the out_dir and check if out_dir exists
            if op.isdir(temp_args.out_dir):
                if not temp_args.resume:
                    logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)
            run_sample(sample_R1, sample_R2, temp_args)
    else:
        # single mode, run one by one
        # flatting all sequence into a list
        # example: -S A.fq -S B.fq C.fq D.fa -S ...
        # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
        flatten_sample_list = list(
            itertools.chain.from_iterable(Sample_pth_list))
        for sample in flatten_sample_list:
            # sample should exist
            if not op.exists(sample):
                # Bug fix: logged sample_R1, which is undefined in this
                # branch (NameError) — report the actual sample.
                logerr('Input', sample, 'does not exist!', 'Skipping ...')
                continue
            logm('Processing', sample)
            sample = op.abspath(sample)
            # Bug fix: the original mutated the shared args.out_dir, so
            # every sample after the first reused the first sample's
            # result dir. Deep-copy per sample, as the paired branch does.
            temp_args = copy.deepcopy(args)
            # if run as sample mode, set out_dir as the dir of the samples
            if not temp_args.out_dir:
                temp_args.out_dir = op.join(op.dirname(sample), 'Tophat')
            if op.isdir(temp_args.out_dir):
                logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)
            # same function as paired-end mode, but leaving sample_R2 empty
            run_sample(sample, '', temp_args)
def fastq_list_Project(Project_pth_list, args):
    '''Main program of Project mode.

    Walks every Illumina project directory, reads each sample's
    SampleSheet.csv, builds the expected fastq.gz paths and hands each
    sample to run_sample().  (Docstring fixed: it previously said
    "Sample mode".)
    '''
    # flatting nested list
    # Ex -P A B ... -P C -P D ...
    # => [[A, B], [C], [D], [...], ...]
    # flatten
    # => [A, B, C, D, ...]
    flatten_projects_list = list(
        itertools.chain.from_iterable(Project_pth_list))
    logp('retreiving', len(flatten_projects_list), 'projects')
    for prj_pth in flatten_projects_list:
        if not op.exists(prj_pth) or not op.isdir(prj_pth):
            # not exist or not a directory
            logerr('Cannot find the directory of project:', prj_pth,
                   'Skipping...')
            continue
        # using abosolute path to prevent that some programs can not handle
        # relative path and easy for debugging
        prj_dir = op.abspath(prj_pth)
        # a typical sequencing path output after Illumina demultiplexing
        # .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired output path
        # .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        FCID_dir, prj_name = op.split(prj_dir)
        FCID_dir = op.split(FCID_dir)[0]
        # NOTE(review): [8:] assumes the dir name starts with 'Project_';
        # a differently named dir gets a truncated label — confirm.
        logm('Working project:', prj_name[8:])
        # determine result dir
        if not args.out_dir:
            prj_result_root = op.join(FCID_dir, 'Aligned', prj_name, 'TopHat')
        else:
            prj_result_root = op.join(op.abspath(args.out_dir), prj_name)
        # create result dir first
        if not op.exists(prj_result_root):
            os.makedirs(prj_result_root, mode=0o755)
        else:
            logwarn('project result exists')
        # obtain all sample dir in the project
        sample_list = sorted(glob.glob(op.join(prj_dir, 'Sample_*/')))
        total_sample = len(sample_list)
        logp('contains', str(total_sample), 'samples')
        for i, sample_dir in enumerate(sample_list, start=1):
            logm('({:d}/{:d}) Sample: {:s}'.format(
                i, total_sample, op.split(op.dirname(sample_dir))[1][7:]),
                'in project', prj_name[8:])
            # read SampleSheet.csv in the sample_dir
            ss_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(ss_pth):
                # Bug fix: the message contained a raw newline inside a
                # single-quoted literal (SyntaxError); reconstructed.
                logerr('SampleSheet.csv not found!', 'Skipping ...')
                continue
            # parsing SampleSheet.csv
            with open(ss_pth) as ss_csv_f:
                reader = csv.DictReader(ss_csv_f)
                for row in reader:
                    # copy args so per-sample overrides do not leak
                    temp_args = copy.deepcopy(args)
                    # typical sample name
                    # No35_ATGTCA_L003_R1_001
                    # => <sample_prefix>_R1/2_001
                    sample_prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'], row['Index'], int(row['Lane']))
                    # output dir
                    temp_args.out_dir = op.join(prj_result_root,
                                                'Sample_' + row['SampleID'])
                    logp('result goes to', temp_args.out_dir)
                    if not op.exists(temp_args.out_dir):
                        os.makedirs(temp_args.out_dir, mode=0o755)
                    else:
                        logwarn('sample result exists')
                    # reference
                    if not temp_args.ref_name:
                        temp_args.ref_name = row['SampleRef']
                    sample_R1 = op.join(sample_dir,
                                        sample_prefix + '_R1_001.fastq.gz')
                    if temp_args.seq_type == 'paired':
                        sample_R2 = op.join(sample_dir,
                                            sample_prefix + '_R2_001.fastq.gz')
                    else:
                        sample_R2 = ''
                    run_sample(sample_R1, sample_R2, temp_args)
def printCount():
    """Log every registered task type together with its count."""
    logm('Print task type and count respectively')
    for task_type, count in Task.typeCount.items():
        logp('Type', task_type + ':', str(count))
def refresh():
    """Reset the per-type task counters and the no-type warning list."""
    logm('Cleaning no-type warning and task type count')
    # Fresh, empty state for the next setup round.
    Task.typeCount = collections.OrderedDict()
    Task.warnedNoType = []