def strDiffTime(d_start, d_end, human=True):
    """Return the time elapsed from ``d_start`` to ``d_end``.

    With ``human=True`` (the default) the result is a string such as
    ``"1h 2m 3.00s"``: hours and minutes appear only when non-zero (minutes
    are always shown once hours are), seconds always appear with two
    decimals.  A negative interval is reported through ``logwarn``/``print``
    and rendered as the absolute interval with a leading ``"-"``.

    With ``human=False`` the signed interval is returned in seconds, as
    given by ``timedelta.total_seconds()``.
    """
    delta = d_end - d_start
    if not human:
        return delta.total_seconds()

    pieces = []
    if delta.days < 0:
        # a negative timedelta is normalised to days < 0, so this is the
        # sign test; warn, then format the reversed (positive) interval
        logwarn('Negative value when computing time difference of')
        print('start:', strTime(d_start), 'end:', strTime(d_end))
        pieces.append("-")
        delta = d_start - d_end

    day_hours, leftover = divmod(delta.seconds, 3600)
    hours = delta.days * 24 + day_hours
    minutes = leftover // 60

    if hours:
        pieces.append("{:0.0f}h ".format(hours))
        pieces.append("{:0.0f}m ".format(minutes))
    elif minutes:
        pieces.append("{:0.0f}m ".format(minutes))
    pieces.append("{:0.2f}s".format(
        delta.seconds % 60 + delta.microseconds / 1000000))
    return "".join(pieces)
def output(self, lastTime=False):
    """Run the parent ``output`` and link the log directory under the home dir.

    After the inherited ``output`` has run, a symbolic link
    ``~/public_html/log/<self.name>/<start time>`` is made to point at the
    directory that contains ``self.output_filename``.  If something already
    sits at that path and does not resolve to the log directory, it is
    renamed to ``<path>.backup`` before the new link is created.

    Args:
        lastTime: forwarded unchanged to the parent class's ``output``.

    Raises:
        OSError: if the directory, the backup rename, or the link cannot be
            created.
    """
    # original output function
    super().output(lastTime)

    # link all the output to ~/public_html/log/<name>/<date>/
    # the os functions never expand '~' themselves, so do it explicitly
    home_log_dir = os.path.expanduser(
        os.path.join('~/public_html/log', self.name))
    os.makedirs(home_log_dir, exist_ok=True)

    sim_dir = os.path.join(home_log_dir, mp.strTime(self.start_time))
    logm('Create symbolic link at for log files at', sim_dir)
    abs_logdir = os.path.dirname(os.path.realpath(self.output_filename))

    # lexists() also reports dangling links, which exists() treats as absent
    if os.path.lexists(sim_dir):
        if os.path.exists(sim_dir) and os.path.samefile(sim_dir, abs_logdir):
            # already pointing at the right place, nothing to do
            return
        if os.path.islink(sim_dir):
            # resolve the old target before renaming; afterwards the
            # original path no longer exists and cannot be resolved
            old_target = os.path.realpath(sim_dir)
            os.rename(sim_dir, sim_dir + '.backup')
            logwarn('link exists. Backup link as {:s}.backup -> {:s}'
                    .format(sim_dir, old_target))
        else:
            os.rename(sim_dir, sim_dir + '.backup')
            logwarn('file or dir exists. Backup as{:s}.backup'
                    .format(sim_dir))

    # dir_fd expects an open directory descriptor, not a path, so the link
    # is created through its full path instead
    os.symlink(abs_logdir, sim_dir, target_is_directory=True)
def __init__(self, output_filename="", max_process=None, dump=True,
             start_now=True, update=True, name=None, html=True, sort=True):
    """Configure the parallel task and, unless told otherwise, start it.

    Args:
        output_filename: stored as ``self.out_filename``.
        max_process: number of worker processes.  ``None`` means 1; a value
            of zero or below is added to ``cpu_count()`` (so ``-1`` leaves
            one CPU free), with a floor of 1.
        dump: stored as ``self.dump``; a false value is announced as
            "Using custom output function".
        start_now: when true, ``self.run()`` is called at the end.
        update: stored as ``self.update``.
        name: task name; the class name is used when it is empty.
        html: stored as ``self.html``.
        sort: stored as ``self.sort``.
    """
    # drop the per-type statistics left behind by an earlier setup
    if Task.typeCount:
        Task.refresh()
    logm('Setup parallel task')

    # name of the whole parallel task
    if name:
        self.name = name
    else:
        logwarn('No name given, guessing by class name')
        self.name = self.__class__.__name__
    logp('Parallel task name:', self.name)

    # number of worker processes; None falls back to a single process
    requested = 1 if max_process is None else max_process
    if requested > 0:
        # explicit positive request: take it as is, but warn on odd values
        if requested == 1:
            logwarn('Not using parallel function, use 1 process')
        elif requested > cpu_count():
            logwarn('# of processes exceeds # of CPUs: {:d}'.format(
                cpu_count()), 'This may decrease speed!')
        self.max_process = requested
    else:
        # zero/negative request: relative to the number of CPUs
        logp('Number of process depends on the number of CPU')
        relative = cpu_count() + requested
        if relative <= 0:
            logwarn('Number of process reaches 0! Will be set to 1')
            self.max_process = 1
        else:
            self.max_process = relative
    logp('Use', self.max_process, 'processes')

    # containers for the basic structure
    self.output_list, self.err_list = [], []
    self.task_pool, self.process_running = [], []
    self.updated_len = 0
    self.out_filename = output_filename

    # parameters describing the current running status
    self.runned_tasks = 0
    self.dump = dump
    if not dump:
        logp('Using custom output function')
    self.html, self.sort, self.update = html, sort, update

    if start_now:
        self.run()
def __init__(self, name, command, working_dir=None, task_type=None):
    """Create a task and count it under its task type.

    Args:
        name: stored as ``self.name``.
        command: stored as ``self.command``; ``command[0]`` doubles as the
            task type when ``task_type`` is not given (presumably an
            argv-style sequence - confirm against callers).
        working_dir: stored as ``self.working_dir``.
        task_type: explicit task type.  When empty, the type is guessed
            from ``command[0]`` and a warning is logged once per guessed
            type.

    Side effects:
        Appends to ``Task.warnedNoType`` and increments the matching entry
        of ``Task.typeCount``.
    """
    self.name = name

    if task_type:
        self.task_type = task_type
    else:
        # no task type, guess by command[0]
        guessed = command[0]
        if guessed not in Task.warnedNoType:
            Task.warnedNoType.append(guessed)
            logwarn('No task type specified! Guess by command[0]:', guessed)
            # adjacent literals instead of a backslash continuation: the
            # latter pulls the next source line's indentation into the text
            logwarn('Further warning for same task type '
                    'will be surpressed!')
        self.task_type = guessed

    # per-type counter, starting at 1 for a type seen for the first time
    Task.typeCount[self.task_type] = Task.typeCount.get(self.task_type, 0) + 1

    self.process_time = 0
    self.command = command
    self.working_dir = working_dir
def _initlog(self):
    """Resolve the directory and file name of the raw CSV log and report it.

    The directory comes from ``self.output_filename`` (default ``./log``)
    and is created when missing; if that fails, ``./log`` is used instead.
    The file name comes from ``self.output_filename`` as well: ``.csv`` is
    appended when there is no extension, a different extension only
    triggers a warning, and with no name at all
    ``<self.name>_<start time>.csv`` is used.  An already existing log file
    is reported together with its age.

    Raises:
        OSError: if even the fallback ``./log`` directory cannot be created.
    """
    # output all logs to ./log/<...>.csv
    # if html output, further html file also created
    out_name = self.output_filename

    # determine the log dir place
    log_dir = op.dirname(out_name) if out_name else ""
    if log_dir and log_dir != './log':
        log_dir = op.abspath(op.expanduser(log_dir))
        logp('using non-default log dir:', log_dir)
    else:
        log_dir = './log'

    # create log folder
    if not op.exists(log_dir):
        try:
            os.makedirs(log_dir, mode=0o755)
        except OSError as e:
            logerr('Cannot create log folder!',
                   'using default dir ./log/ instead')
            print(e)
            # really switch to the fallback so the final path below does
            # not point back at the directory that could not be created
            log_dir = './log'
            os.makedirs(log_dir, mode=0o755, exist_ok=True)

    # determine the log filename
    log_name = op.basename(out_name) if out_name else ""
    stem, ext = op.splitext(log_name)
    if stem:
        if not ext:
            # append file extension .csv if needed
            logwarn('No extension in given filename',
                    '".csv" will be auto appended.')
            log_name += '.csv'
        elif ext != '.csv':
            # format() belongs to the piece that holds the placeholder
            logwarn('Output file extension given "{:s}" is not ".csv",'
                    .format(ext),
                    'may create error when opening.')
    else:
        log_name = "{:s}_{:s}.csv".format(
            self.name.replace(' ', '_'),
            strTime(dt=self.start_time, str_format=myparallel.time_strf))

    # join dir and name setting
    out_name = op.join(log_dir, log_name)

    # warn if log file already exists
    if op.exists(out_name):
        log_mtime = datetime.datetime.fromtimestamp(op.getmtime(out_name))
        logwarn('log file already exists! Created {:s} ago'.format(
            strDiffTime(log_mtime, datetime.datetime.today())))
    # NOTE(review): the resolved path is only logged here; nothing visible
    # stores it back on the instance - confirm whether that is intended
    logp('Raw output log csv goes to', out_name)
def parse_args():
    '''Parse the command line and return the resulting options object.

    Builds the parser for ``tophat.py``, parses the process arguments with
    ``parse_known_args()`` and post-processes two options:

    * ``extra_args``: replaced by the list of unrecognised arguments when
      ``--extra-args`` was given, otherwise set to ``None``; unrecognised
      arguments without ``--extra-args`` end in ``parser.error()``.
    * ``max_process``: a value of zero or below is turned into
      ``mp.cpu_count() + value`` with a floor of 1; a value above the CPU
      count only triggers a warning.

    Returns the object produced by ``parse_known_args()`` (presumably an
    ``argparse.Namespace``, assuming ``ap`` is ``argparse`` - confirm
    against the file's imports).
    '''
    desc = '''
    This script parse the needed argument, first decompress the zipped
    fastq file, select proper reference genome index, and call Top Hat
    with proper arugments. More aruments input will be passed directly
    to Top Hat, be sure you know what you are doing.
    For more information please contact Liang Bo Wang or
    Bioinformatics and Biostatistics Core Lab, NTU CGM'''
    # if the parser will be inherited or used by other ArgumentParser,
    # then add_help should be set False.
    # RawTextHelpFormatter both description and help text use raw string
    # RawDescriptionHelpFormatter only description uses raw string
    parser = ap.ArgumentParser(prog='tophat.py',
                               formatter_class=ap.RawDescriptionHelpFormatter,
                               description=textwrap.dedent(desc),
                               add_help=True)
    p_addarg = parser.add_argument  # make the function name shorter

    # --- input ---
    # one must choose either Project(-P) or Sample(-S)
    in_type_grp = parser.add_mutually_exclusive_group(required=True)
    in_addarg = in_type_grp.add_argument  # make the function name shorter
    # Project mode
    in_addarg('-P', '--Project',
              metavar='DIR', action='append',
              dest='Project_list', nargs='+',
              help='''Path for a project directory. It should follow
              the structure of original Illumina direct output by
              demultiplexing.''')
    # Sample mode
    in_addarg('-S', '--Sample',
              metavar=('R1.fastq[.gz]', '{R2.fastq[.gz]}'),
              action='append',
              dest='Sample_list', nargs='+',
              help='''PATH to a pair of samples in paired mode or
              multiple samples in single mode.
              Ex -S A1 A2 -S B1 B2 (-t paried) or
              -S A B C -S D E -t single.
              Both FASTQ and .fastq.gz files are accepted.''')
    # nargs='+' implies that this option accepts multiple arguments
    # dest='<var_name>' then one can access the arguments using args.var_name
    # action='append' use the following examples
    # Ex1. -S A_R0.fq A_R1.fq
    #      => [['A_R0.fq', 'A_R1.fq']]
    # Ex2. -S A_R0.fq A_R1.fq -S B_R0.fq B_R1.fq -S ...
    #      => [
    #           ['A_R0.fq', 'A_R1.fq'],
    #           ['B_R0.fq', 'B_R1.fq'],
    #           [...], ...
    #         ]
    # so if input type is Sample, it will be a nested list

    # --- output ---
    p_addarg('-o', '--outdir',
             metavar='OUT_DIR', dest='out_dir',
             help='''In Sample mode, if only a (pair of) sample is given,
             then it will be its result dir directly,
             otherwise it will be the root path for all samples given.
             Ex -S A1 A2 -o DIR -t paired => outputs to DIR/,
             -S A1 A2 -S B1 B2 -o DIR -t paired => DIR/A/ and DIR/B/
             for results of A and B respectively.
             In Project mode, by default it assumes multiple results,
             so OUT_DIR will be the root path of all the project results.
             Ex OUT_DIR/Sample_<1>, OUT_DIR/Sample_<2>, ...''')

    # --- parameters for Top Hat ---
    # example of the Top Hat command line this script builds towards:
    # tophat -p 15
    #   -G /data/iGenome/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf
    #   -o TopHat_with_GTF/Sample_A-W
    #   --library-type=fr-unstranded
    #   --no-novel-juncs
    #   /data/iGenome/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome
    #   ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R1_001.fastq
    #   ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R2_001.fastq
    p_addarg('-r', '--readlength',
             # required=True,
             metavar='LEN', type=int, dest='read_length',
             help='''mate_inner_length given to Tophat.''')
    p_addarg('-t', '--seqtype',
             default='paired', choices=['paired', 'single'],
             dest='seq_type',
             help='''Sequencing type. Default is paired-end sequence.''')
    # quantification without a reference annotation
    # (store_false: args.annotation is True unless the flag is given)
    p_addarg('--no-annotation',
             action='store_false', dest='annotation',
             help='''If specified, Top Hat will do alternative splicing
             without knowledge of existed isoform of all genes.''')

    # --- reference arguments ---
    # user can specify species like human, mouse, or chicken through -R
    # or they specify the path to required bowtie index and gene annotation
    # If bowtie_path(--bowtie-index-path) or gene_path(--gene-path) is given,
    # program use the path directly.
    # Then it looks for the path given by ref_name(-R).
    # Otherwise it looks for the information inside SampleSheet.csv
    # Thus the priority is <*path> -> <ref_name> -> SampleSheet.csv
    p_addarg('-R', '--refname',
             metavar='NAME', dest='ref_name',
             help='''Name of the species or reference database.
             Ex. both human and hg19 goes to hg19;
             similarly, both mouse and mm10 goes to mm10.''')
    p_addarg('-B', '--bowtie-index',
             #required=True,
             metavar='BOWTIE_INDEX_PATH', dest='bowtie_index',
             help='''The path to FW index of whole genome sequence for
             Bowtie2. It should be ended with
             .../Bowtie2Index/genome''')
    p_addarg('-G', '--gtf',
             metavar='GTF_PATH', dest='gtf_path',
             help='''The path to the gene annotation GTF file with known
             transcripts, e.g., genes.gtf in most cases.
             ''')

    # --- miscellaneous arguments ---
    # multiprocessing
    # (the help text is formatted at parser-build time with this machine's
    # CPU count)
    p_addarg('-p', '--multiprocess',
             metavar='N', type=int, default=1, dest='max_process',
             help='''The maximum of parallel running processes.
             This number should be equal to or less then the number
             of CPU cores.
             If a negative number or zero is set, number of maxprocess
             depends on the number of CPU cores.
             For example, -1 uses CPU_NUM({:d}) - 1 = {:d} processes
             on this machine.
             Program use 1 process if not specified.'''.format(
                 mp.cpu_count(), mp.cpu_count() - 1))
    # resume Tophat
    p_addarg('--resume',
             action='store_true', dest='resume',
             help='''If specified, Tophat will try to resume the progress
             by looking for <out_dir>/logs/tophat.log''')
    # remove unzipped fastq files
    p_addarg('--remove-fastq',
             action='store_true', dest='rm_unzip_fq',
             help='''If specified, all unzipped fastq files will be
             removed. However, those fastq files existed before run
             will be intact.''')
    p_addarg('--extra-args',
             dest='extra_args', action='store_true',
             help='''Input additional arguments to Tophat directly
             WITHOUT ANY CHECKS.
             If specified, all unkown args will be collected''')

    # --- validation and first processing commands ---
    #args = parser.parse_args()
    # parse_known_args() keeps unrecognised arguments in a second list
    # instead of failing, so they can be handed on when --extra-args is set
    args, unknown_args = parser.parse_known_args()
    if args.extra_args:
        logp('getting extra args passed to Tophat:', ' '.join(unknown_args))
        # the boolean flag is replaced by the collected argument list
        args.extra_args = unknown_args
    else:
        if unknown_args:
            logerr('Get unkown args.',
                   'If they are passed to Tophat, please specify',
                   '--extra-args')
            parser.error('unrecognized arguments:' + ' '.join(unknown_args))
        else:
            args.extra_args = None

    # computing max_process
    if args.max_process > mp.cpu_count():
        logwarn('Set # of processes({:d})'.format(args.max_process),
                '> # of CPUs({:d})'.format(mp.cpu_count()),
                'the efficiency will be low.')
    elif args.max_process <= 0:
        # zero/negative values are relative to the number of CPUs
        args.max_process = mp.cpu_count() + args.max_process
        if args.max_process <= 0:
            logwarn('Negative # of processes({:d}) has been set, reset to 1'
                    .format(args.max_process))
            args.max_process = 1
    return args
def fastq_list_Sample(Sample_pth_list, args):
    '''Validate the given sample groups and hand each one to run_sample().

    Args:
        Sample_pth_list: nested list of sample paths, one inner list per
            ``-S`` occurrence, e.g. ``[['A_R1.gz', 'A_R2.gz'], [...]]``.
        args: parsed options.  ``out_dir``, ``seq_type`` and ``resume`` are
            read here; ``args.out_dir`` is made absolute in place when set.
            Every sample is run with its own deep copy of ``args`` so that
            per-sample changes (``out_dir``, ``resume``) do not leak into
            the other samples.

    In paired mode each inner list must hold exactly two existing files;
    invalid groups are logged and skipped.  In single mode the nested list
    is flattened and every existing file is run on its own.  Without
    ``args.out_dir`` the results go to ``<dir of the sample>/Tophat``.
    '''
    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple sample is input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')

    # This function mainly sets up the run environment and filters out
    # invalid arguments before handing over to the next-level
    # run_<...>() functions.
    if args.seq_type == 'paired':
        # paired mode, run by pairs
        # example: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
        #      => [
        #           ['A_R0.gz', 'A_R1.gz'],
        #           ['B_R0.gz', 'B_R1.gz'],
        #           [...], ...
        #         ]
        total_groups = len(Sample_pth_list)
        for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
            # validation, samples should be paired
            if len(paired_sample_list) != 2:
                logerr('Input samples :', ', '.join(paired_sample_list),
                       'is not paired!', 'Skipping ...')
                continue
            sample_R1, sample_R2 = paired_sample_list

            # samples should exist
            if not op.exists(sample_R1):
                logerr('Input', sample_R1, 'does not exist! Skipping')
                continue
            if not op.exists(sample_R2):
                logerr('Input', sample_R2, 'does not exist! Skipping')
                continue

            # copy args so if we change the args.out_dir or args.resume,
            # other samples will not be affected
            temp_args = copy.deepcopy(args)

            # if multiple samples, show the working progress
            if total_groups > 1:
                logm('({:d}/{:d}) Processing paired:'
                     .format(i, total_groups), sample_R1, sample_R2)
            else:
                logm('Processing paired:', sample_R1, sample_R2)

            # use absolute path
            sample_R1, sample_R2 = op.abspath(sample_R1), op.abspath(sample_R2)

            if not args.out_dir:
                # default output path <path_of_sample_R1>/Tophat
                # if called by Project mode, args.out_dir will be set
                # automatically
                temp_args.out_dir = op.join(op.dirname(sample_R1), 'Tophat')
            elif total_groups > 1:
                # make sample sub_dir
                sample_name = ('Sample_' +
                               op.basename(sample_R1).split('.')[0])
                logp('subdir for sample name guessed from filename:',
                     sample_name)
                # args.out_dir is root path shared by all samples
                temp_args.out_dir = op.join(args.out_dir, sample_name)

            # if not previous work, turn off the --resume option
            cond_log_exist = op.exists(
                op.join(temp_args.out_dir, 'logs/tophat.log'))
            if temp_args.resume and not cond_log_exist:
                logp('previous log file tophat.log not found,',
                     'resume function is temporarily off')
                temp_args.resume = False

            # create the out_dir and check if out_dir exists
            if op.isdir(temp_args.out_dir):
                if not temp_args.resume:
                    logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)

            run_sample(sample_R1, sample_R2, temp_args)
    else:
        # single mode, run one by one
        # flatting all sequence into a list
        # example: -S A.fq -S B.fq C.fq D.fa -S ...
        # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
        flatten_sample_list = list(
            itertools.chain.from_iterable(Sample_pth_list))
        for sample in flatten_sample_list:
            # sample should exist
            if not op.exists(sample):
                logerr('Input', sample, 'does not exist! Skipping')
                continue
            logm('Processing', sample)
            sample = op.abspath(sample)

            # work on a copy, as in paired mode: writing the default
            # out_dir onto the shared args would send every later sample
            # into the first sample's directory
            temp_args = copy.deepcopy(args)
            # if run as sample mode, set out_dir as the dir of the samples
            if not args.out_dir:
                temp_args.out_dir = op.join(op.dirname(sample), 'Tophat')
            # NOTE(review): with an explicit out_dir all single-mode samples
            # share that one directory (no per-sample subdir as in paired
            # mode) - confirm this is intended

            if op.isdir(temp_args.out_dir):
                logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)

            # same function as paired-end mode, but leaving sample_R2 empty
            run_sample(sample, '', temp_args)
def fastq_list_Project(Project_pth_list, args):
    '''Main program of Project mode: run every sample of every project.

    Args:
        Project_pth_list: nested list of project directories, one inner
            list per ``-P`` occurrence, e.g. ``[[A, B], [C]]``.
        args: parsed options.  ``out_dir``, ``ref_name`` and ``seq_type``
            are read here; each sample runs with its own deep copy.

    For each project directory the ``Sample_*/`` sub-directories are
    visited in sorted order.  Each row of a sample's ``SampleSheet.csv``
    (columns ``SampleID``, ``Index``, ``Lane`` and, when no reference name
    was given, ``SampleRef``) yields one ``run_sample()`` call on
    ``<SampleID>_<Index>_L<lane>_R1_001.fastq.gz`` (plus ``_R2_`` in paired
    mode).  Results go to ``<out_dir>/<project>/Sample_<SampleID>`` or, with
    no ``out_dir``, to ``<FCID dir>/Aligned/<project>/TopHat/...``.
    Missing project directories and missing sample sheets are logged and
    skipped.
    '''
    # flatting nested list
    # Ex -P A B ... -P C -P D ...
    # => [[A, B], [C], [D], [...], ...]
    # flatten
    # => [A, B, C, D, ...]
    flatten_projects_list = list(
        itertools.chain.from_iterable(Project_pth_list))
    logp('retreiving', len(flatten_projects_list), 'projects')

    for prj_pth in flatten_projects_list:
        # isdir() is already False for a path that does not exist
        if not op.isdir(prj_pth):
            # not exist or not a directory
            logerr('Cannot find the directory of project:', prj_pth,
                   'Skipping...')
            continue

        # using absolute path to prevent that some programs can not handle
        # relative path and easy for debugging
        prj_dir = op.abspath(prj_pth)

        # a typical sequencing path output after Illumina demultiplexing
        # .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired output path
        # .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        FCID_dir, prj_name = op.split(prj_dir)
        FCID_dir = op.split(FCID_dir)[0]

        # name shown in the log: drop the 'Project_' prefix only when it is
        # really there, so other directory names are not chopped
        if prj_name.startswith('Project_'):
            prj_label = prj_name[len('Project_'):]
        else:
            prj_label = prj_name
        logm('Working project:', prj_label)

        # determine result dir
        if not args.out_dir:
            prj_result_root = op.join(FCID_dir, 'Aligned', prj_name, 'TopHat')
        else:
            prj_result_root = op.join(op.abspath(args.out_dir), prj_name)

        # create result dir first
        if not op.exists(prj_result_root):
            os.makedirs(prj_result_root, mode=0o755)
        else:
            logwarn('project result exists')

        # obtain all sample dir in the project
        sample_list = sorted(glob.glob(op.join(prj_dir, 'Sample_*/')))
        total_sample = len(sample_list)
        logp('contains', str(total_sample), 'samples')

        for i, sample_dir in enumerate(sample_list, start=1):
            # the glob pattern guarantees the 'Sample_' prefix here
            sample_label = op.basename(op.dirname(sample_dir))[len('Sample_'):]
            logm('({:d}/{:d}) Sample: {:s}'
                 .format(i, total_sample, sample_label),
                 'in project', prj_label)

            # read SampleSheet.csv in the sample_dir
            ss_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(ss_pth):
                logerr('SampleSheet.csv not found! Skipping')
                continue

            # parsing SampleSheet.csv
            # newline='' lets the csv module do its own newline handling
            with open(ss_pth, newline='') as ss_csv_f:
                reader = csv.DictReader(ss_csv_f)
                for row in reader:
                    temp_args = copy.deepcopy(args)
                    # typical sample name
                    # No35_ATGTCA_L003_R1_001
                    # => <sample_prefix>_R1/2_001
                    sample_prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'], row['Index'], int(row['Lane']))

                    # output dir
                    temp_args.out_dir = op.join(prj_result_root,
                                                'Sample_' + row['SampleID'])
                    logp('result goes to', temp_args.out_dir)
                    if not op.exists(temp_args.out_dir):
                        os.makedirs(temp_args.out_dir, mode=0o755)
                    else:
                        logwarn('sample result exists')

                    # reference: the command line wins over the sheet
                    if not temp_args.ref_name:
                        temp_args.ref_name = row['SampleRef']

                    sample_R1 = op.join(sample_dir,
                                        sample_prefix + '_R1_001.fastq.gz')
                    if temp_args.seq_type == 'paired':
                        sample_R2 = op.join(sample_dir,
                                            sample_prefix + '_R2_001.fastq.gz')
                    else:
                        sample_R2 = ''
                    run_sample(sample_R1, sample_R2, temp_args)
# NOTE(review): `parser` is created before this fragment and is not visible
# here; the four options below are only registered on it.
# --sort: boolean flag, False unless given
parser.add_argument('--sort', action='store_true',
                    help="""whether sort the output in html log file.
                    default is False.""")
# --parse: store_false into `dump`, i.e. dump stays True unless --parse is given
parser.add_argument('--parse', action="store_false", dest='dump',
                    help="""Use self-defined logging function, which means
                    output will be parse using overrided function
                    parse2csv(). If the function is not overrrided,
                    it still calls default dump function, which logs raw
                    stdout, stderr outputs.
                    NOTICE! When using default output,
                    multiline output may cause problems during Excel import
                    or post parsing.""")
# --no_html: store_false into `html`, i.e. html stays True unless given
parser.add_argument('--no_html', action='store_false', dest='html',
                    help='No log in html format.')
# --no_update: store_false into `update`, i.e. update stays True unless given
parser.add_argument('--no_update', action='store_false', dest='update',
                    help="""Write output to log file only when all the tasks
                    have finished. WARNING! All information will be lost if
                    the program ended unexpectedly.""")

######### Below is for testing function ########
# Running the module as a script only prints a hint; nothing is executed.
if __name__ == '__main__':
    # setup argv here
    logwarn('One should not run this program directly!')
    print('Try to call test.py')