def _record(self, task):
    """Collect the outcome of a finished task.

    Logs how long the task took, reads what the task's process wrote to
    stdout and stderr, registers the task in ``self.err_list`` when stderr
    is non-empty or when the output cannot be decoded as UTF-8, and finally
    appends a summary row to ``self.output_list``.
    """
    elapsed = strDiffTime(task.start_time, datetime.datetime.today())
    logp('task', 'of', task.task_type + ':', task.name,
         'ended using', elapsed)

    raw_streams = task.process.communicate()
    try:
        normsg = raw_streams[0].decode('utf8')
        errmsg = raw_streams[1].decode('utf8')
    except UnicodeDecodeError as decode_err:
        # binary (non-text) output: keep a placeholder instead of the text
        logerr('Decoding output of', task.name, 'failed!')
        print(decode_err)
        print('The output is not a string text, skipping!!')
        normsg = errmsg = 'DECODE ERROR'
        self.err_list.append(task.name)
    else:
        if errmsg:
            # anything written to stderr marks the task as failed
            failed_label = '{:s}[{:s}]'.format(task.name, task.task_type)
            self.err_list.append(failed_label)
            logerr('task', task.name, 'ended with error!!')
            print(errmsg)

    # one summary row per finished task
    summary = {
        'name': task.name,
        'type': task.task_type,
        'normal msg': normsg,
        'error msg': errmsg,
        'process time': elapsed,
    }
    self.output_list.append(summary)
def cmd_gunzip(gz_pth, fastq_pth):
    '''Decompress ``gz_pth`` into ``fastq_pth`` with the external gunzip.

    The command is run without a shell and its stdout is redirected to the
    output file from Python, so paths containing spaces or shell
    metacharacters are handled correctly.

    On failure the error is logged (not raised) and any partially written
    output file is removed, so that callers checking for the existence of
    ``fastq_pth`` do not mistake a truncated file for a successful unzip.

    Returns None in every case.
    '''
    import os  # local import: only needed to clean up a failed output

    try:
        with open(fastq_pth, 'wb') as fastq_f:
            sp.check_call(['gunzip', '-c', gz_pth],
                          stdout=fastq_f, bufsize=-1)
        return
    except sp.CalledProcessError as e:
        # error when unzipping
        logerr('Unzip file:', op.basename(gz_pth),
               'failed! with exit({:d})'.format(e.returncode))
    except OSError as e:
        # gunzip is not installed or the output file cannot be created
        logerr('Unzip file:', op.basename(gz_pth), 'failed!', str(e))

    # only reached on failure: drop the incomplete output (best effort)
    if op.exists(fastq_pth):
        try:
            os.remove(fastq_pth)
        except OSError as e:
            logerr('Cannot remove incomplete output:', fastq_pth, str(e))
def _initlog(self):
    """Work out the directory and file name of the raw CSV log.

    The directory defaults to ``./log`` and the file name defaults to
    ``<name>_<start time>.csv``. The directory is created when missing;
    if it cannot be created, ``./log`` is used instead. A warning is
    logged when the extension is not ``.csv`` or when the log file
    already exists.
    """
    # output all logs to ./log/<...>.csv
    # if html output, further html file also created
    out_name = self.output_filename

    # determine the log dir place
    # NOTE(review): the directory is taken from self.out_fname while the
    # file name comes from self.output_filename -- confirm that both hold
    # the same user-supplied path.
    log_dir = op.dirname(self.out_fname) if out_name else ""
    if log_dir:
        if log_dir != './log':
            log_dir = op.abspath(op.expanduser(log_dir))
            logp('using non-default log dir:', log_dir)
    else:
        log_dir = './log'

    # create log folder
    if not op.exists(log_dir):
        try:
            os.makedirs(log_dir, mode=0o755)
        except OSError as e:
            logerr('Cannot create log folder!',
                   'using default dir ./log/ instead')
            print(e)
            # really switch to the default dir; otherwise the final path
            # would still point at the folder that could not be created
            log_dir = './log'
            if out_name:
                out_name = './log/' + op.basename(out_name)
            if not op.isdir(log_dir):
                os.makedirs(log_dir, mode=0o755)

    # determine the log filename
    log_name = op.basename(out_name) if out_name else ""
    log_stem, log_ext = op.splitext(log_name)
    if log_stem:
        # append file extension .csv if needed
        if not log_ext:
            logwarn('No extension in given filename',
                    '".csv" will be auto appended.')
            log_name += '.csv'
        elif log_ext != '.csv':
            # format() must be applied to the literal holding the
            # placeholder, not to the trailing one
            logwarn('Output file extension given "{:s}" is not ".csv",'
                    .format(log_ext),
                    'may create error when opening.')
    else:
        log_name = "{:s}_{:s}.csv".format(
            self.name.replace(' ', '_'),
            strTime(dt=self.start_time, str_format=myparallel.time_strf))

    # join dir and name setting
    out_name = op.join(log_dir, log_name)

    # warn if log file already exists
    if op.exists(out_name):
        log_mtime = datetime.datetime.fromtimestamp(op.getmtime(out_name))
        logwarn('log file already exists! Created {:s} ago'.format(
            strDiffTime(log_mtime, datetime.datetime.today())))
    # NOTE(review): the resolved path is only reported here; it is not
    # written back to self.output_filename in this method -- confirm
    # whether the writers are expected to pick it up.
    logp('Raw output log csv goes to', out_name)
def dump2csv(self, lastTime=False):
    """Write every collected task record to the CSV log file.

    The file is rewritten from scratch on each call. If it cannot be
    opened, the error is logged; on the last dump (``lastTime=True``)
    ``myparallel.wopenfile`` is used to obtain a handle, otherwise this
    dump is skipped.

    The handle is closed even when writing fails.
    """
    try:
        csv_f = open(self.output_filename, 'w')
    except IOError as e:
        logerr('Cannot open file', self.output_filename)
        print(e)
        if not lastTime:
            print('Try to closed the log file. Skip logging this time')
            return
        csv_f = myparallel.wopenfile(self.output_filename)

    try:
        csv_writer = csv.DictWriter(csv_f, myparallel.dump_headcol)
        csv_writer.writeheader()
        csv_writer.writerows(self.output_list)
    finally:
        # never leak the handle, e.g. when a row has unexpected keys
        csv_f.close()
def unzip(gz_pth):
    '''Return the path of the unzipped file that belongs to ``gz_pth``.

    The target is ``gz_pth`` without its last extension. If it already
    exists it is returned right away; otherwise ``cmd_gunzip`` is called
    and timed. Returns None when the target is still missing afterwards.
    '''
    gz_name = op.basename(gz_pth)
    target = op.join(op.dirname(gz_pth), op.splitext(gz_name)[0])

    # already unzipped earlier, nothing to do
    if op.exists(target):
        return target

    started = dt.datetime.today()
    logp('unzip', gz_name, 'starts at', strTime(started))
    cmd_gunzip(gz_pth, target)
    finished = dt.datetime.today()
    logp('ends at', strTime(finished),
         'using', strDiffTime(started, finished))

    if op.exists(target):
        return target
    logerr('Output fastq not found!', target)
    return None
def write2html(self, lastTime=False):
    """Render the HTML version of the log next to the CSV log.

    ``sample_output.html`` (looked up in the current working directory)
    is copied line by line; every line containing the ``{% outputblock %}``
    marker is replaced by the description followed by the result table.

    If the HTML file cannot be opened, the error is logged; on the last
    update (``lastTime=True``) ``myparallel.wopenfile`` is used to obtain
    a handle, otherwise this update is skipped.

    Both the template and the output file are closed on exit, so the
    written content is flushed even when rendering fails midway.
    """
    logp('update html log file')
    html_name = op.splitext(self.output_filename)[0] + '.html'
    try:
        html_f = open(html_name, 'w')
    except IOError as e:
        logerr('Cannot open file', html_name)
        print(e)
        if not lastTime:
            print('Try to closed the log html file. Skip this time')
            return
        html_f = myparallel.wopenfile(html_name)

    output_reg = re.compile(r'\{ ?% ?outputblock ?% ?\}')
    try:
        with open('sample_output.html') as template:
            for line in template:
                if output_reg.search(line):
                    html_f.write(self.makeDescription(lastTime))
                    # place to insert result table
                    html_f.write(self.makeHTMLTable(lastTime))
                else:
                    html_f.write(line)
    finally:
        html_f.close()
def ref_path(sample_path, ref_name):
    '''Return the root directory of the reference selected by ``ref_name``.

    Given a short name of the reference, the matching entry of the
    module-level ``refpath_dict`` is returned, for example
    hg19 -> /data/iGenome/Homo_sapiens/UCSC/hg19.

    Names are matched case-insensitively. Besides the keys of
    ``refpath_dict`` the species aliases human / homo sapiens /
    homo_sapiens (hg19), mouse (mm10) and chicken (galGal4) are accepted.

    When ``ref_name`` is None, the name is read from the ``SampleRef``
    column of the ``SampleSheet.csv`` next to ``sample_path``; the row is
    selected by matching ``SampleID`` against the part of the sample file
    name before the first underscore.

    Returns None (after logging the reason) when the reference is
    unknown or cannot be determined.
    '''
    # black magic here will further move to new script
    if ref_name is not None:
        ref_name = ref_name.strip().lower()

        # explicitly unknown reference: skip this sample quietly
        # ('unkown' is kept because existing sample sheets may use it)
        if ref_name in ('unkown', 'unknown'):
            return None

        # species name -> key in refpath_dict
        aliases = {
            'human': 'hg19',
            'homo sapiens': 'hg19',
            'homo_sapiens': 'hg19',
            'mouse': 'mm10',
            'chicken': 'galGal4',
        }
        if ref_name in aliases:
            return refpath_dict[aliases[ref_name]]
        if ref_name in refpath_dict:
            return refpath_dict[ref_name]
        # the name was lowercased above, so keys with capitals such as
        # galGal4 need a case-insensitive comparison
        for key in refpath_dict:
            if key.lower() == ref_name:
                return refpath_dict[key]

        # unrecognized reference name, reporting and skipping
        logerr('Cannot find the reference in current database:', ref_name)
        return None

    # no ref_name is given, searching the SampleSheet.csv
    logp('guess reference from SampleSheet.csv')
    ss_csv_path = op.join(op.dirname(sample_path), 'SampleSheet.csv')
    if not op.exists(ss_csv_path):
        logerr('SampleSheet.csv not found at', ss_csv_path)
        return None

    sample_name = op.basename(sample_path).split('_')[0]
    sample_found = False
    sample_ref = None
    with open(ss_csv_path) as ss_csv_f:
        for row in csv.DictReader(ss_csv_f):
            if row.get('SampleID') == sample_name:
                sample_found = True
                sample_ref = row.get('SampleRef')
                break

    if not sample_found:
        logerr('Sample:', sample_name, 'cannot be found in SampleSheet.csv')
        return None
    if sample_ref is None:
        # a short row yields None here; recursing with None would read
        # the sheet again and never terminate
        logerr('Sample:', sample_name,
               'has no SampleRef in SampleSheet.csv')
        return None
    return ref_path(sample_path, sample_ref)
def parse_args(argv=None):
    '''Parse command line options, return the option list

    Args:
        argv: optional list of argument strings. When None (the default)
            the arguments are taken from ``sys.argv[1:]``.

    Returns:
        The argparse namespace. ``extra_args`` holds the list of
        unrecognized arguments when ``--extra-args`` is given and None
        otherwise. ``max_process`` is normalized: zero or a negative
        number is counted back from the number of CPU cores and the
        result is never lower than 1.

    Exits through ``parser.error`` when unrecognized arguments are given
    without ``--extra-args``.
    '''
    desc = '''
        This script parse the needed argument, first decompress the zipped
        fastq file, select proper reference genome index, and call Top Hat
        with proper arugments.
        More aruments input will be passed directly to Top Hat, be sure you
        know what you are doing.
        For more information please contact Liang Bo Wang or
        Bioinformatics and Biostatistics Core Lab, NTU CGM'''

    # if the parser will be inherited or used by other ArgumentParser,
    # then add_help should be set False.
    # RawTextHelpFormatter both description and help text use raw string
    # RawDescriptionHelpFormatter only description uses raw string
    parser = ap.ArgumentParser(
        prog='tophat.py',
        formatter_class=ap.RawDescriptionHelpFormatter,
        description=textwrap.dedent(desc),
        add_help=True)
    p_addarg = parser.add_argument  # make the function name shorter

    # --- input ---
    # one must choose either Project(-P) or Sample(-S)
    in_type_grp = parser.add_mutually_exclusive_group(required=True)
    in_addarg = in_type_grp.add_argument  # make the function name shorter
    # Project mode
    in_addarg('-P', '--Project',
              metavar='DIR',
              action='append',
              dest='Project_list',
              nargs='+',
              help='''Path for a project directory. It should follow the
              structure of original Illumina direct output by
              demultiplexing.''')
    # Sample mode
    in_addarg('-S', '--Sample',
              metavar=('R1.fastq[.gz]', '{R2.fastq[.gz]}'),
              action='append',
              dest='Sample_list',
              nargs='+',
              help='''PATH to a pair of samples in paired mode or multiple
              samples in single mode. Ex -S A1 A2 -S B1 B2 (-t paried) or
              -S A B C -S D E -t single. Both FASTQ and .fastq.gz files
              are accepted.''')
    # nargs='+' implies that this option accepts mulitple arguments
    # dest='<var_name>' then one can access the arguments using args.var_name
    # action='append' use the following examples
    # Ex1. -S A_R0.fq A_R1.fq
    #   => [['A_R0.fq', 'A_R1.fq']]
    # Ex2. -S A_R0.fq A_R1.fq -S B_R0.fq B_R1.fq -S ...
    #   => [
    #        ['A_R0.fq', 'A_R1.fq'],
    #        ['B_R0.fq', 'B_R1.fq'],
    #        [...], ...
    #      ]
    # so if input type is Sample, it will be a nested list

    # --- output ---
    p_addarg('-o', '--outdir',
             metavar='OUT_DIR',
             dest='out_dir',
             help='''In Sample mode, if only a (pair of) sample is given,
             then it will be its result dir directly, otherwise it will be
             the root path for all samples given. Ex -S A1 A2 -o DIR
             -t paired => outputs to DIR/, -S A1 A2 -S B1 B2 -o DIR
             -t paired => DIR/A/ and DIR/B/ for results of A and B
             respectively. In Project mode, by default it assumes multiple
             results, so OUT_DIR will be the root path of all the project
             results. Ex OUT_DIR/Sample_<1>, OUT_DIR/Sample_<2>, ...''')

    # --- parameters for Top Hat ---
    # tophat -p 15
    #   -G /data/iGenome/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf
    #   -o TopHat_with_GTF/Sample_A-W
    #   --library-type=fr-unstranded
    #   --no-novel-juncs
    #   /data/iGenome/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome
    #   ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R1_001.fastq
    #   ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R2_001.fastq
    p_addarg('-r', '--readlength',
             metavar='LEN',
             type=int,
             dest='read_length',
             help='''mate_inner_length given to Tophat.''')
    p_addarg('-t', '--seqtype',
             default='paired',
             choices=['paired', 'single'],
             dest='seq_type',
             help='''Sequencing type. Default is paired-end sequence.''')
    # quantification without a reference annotation
    p_addarg('--no-annotation',
             action='store_false',
             dest='annotation',
             help='''If specified, Top Hat will do alternative splicing
             without knowledge of existed isoform of all genes.''')

    # --- reference argments ---
    # user and specify species like human, mouse, or chicken through -R
    # or they specify the path to required bowtie index and gene annotation
    # If bowtie_path(--bowtie-index-path) or gene_path(--gene-path) is given,
    # program use the path directly.
    # Then it looks for the path given by ref_name(-R).
    # Otherwise it looks for the information inside SampleSheet.csv
    # Thus the priority is <*path> -> <ref_name> -> SampleSheet.csv
    p_addarg('-R', '--refname',
             metavar='NAME',
             dest='ref_name',
             help='''Name of the species or reference database. Ex. both
             human and hg19 goes to hg19; similarly, both mouse and mm10
             goes to mm10.''')
    p_addarg('-B', '--bowtie-index',
             metavar='BOWTIE_INDEX_PATH',
             dest='bowtie_index',
             help='''The path to FW index of whole genome sequence for
             Bowtie2. It should be ended with .../Bowtie2Index/genome''')
    p_addarg('-G', '--gtf',
             metavar='GTF_PATH',
             dest='gtf_path',
             help='''The path to the gene annotation GTF file with known
             transcripts, e.g., genes.gtf in most cases. ''')

    # --- miscellaneous arguments ---
    # multiprocessing
    cpu_num = mp.cpu_count()
    p_addarg('-p', '--multiprocess',
             metavar='N',
             type=int,
             default=1,
             dest='max_process',
             help='''The maximum of parallel running processes. This number
             should be equal to or less then the number of CPU cores. If a
             negative number or zero is set, number of maxprocess depends
             on the number of CPU cores. For example, -1 uses
             CPU_NUM({:d}) - 1 = {:d} processes on this machine. Program
             use 1 process if not specified.'''.format(
                 cpu_num, cpu_num - 1))
    # resume Tophat
    p_addarg('--resume',
             action='store_true',
             dest='resume',
             help='''If specified, Tophat will try to resume the progress
             by looking for <out_dir>/logs/tophat.log''')
    # remove unzipped fastq files
    p_addarg('--remove-fastq',
             action='store_true',
             dest='rm_unzip_fq',
             help='''If specified, all unzipped fastq files will be
             removed. However, those fastq files existed before run will
             be intact.''')
    p_addarg('--extra-args',
             dest='extra_args',
             action='store_true',
             help='''Input additional arguments to Tophat directly WITHOUT
             ANY CHECKS. If specified, all unkown args will be
             collected''')

    # --- validation and first processing commands ---
    args, unknown_args = parser.parse_known_args(argv)
    if args.extra_args:
        logp('getting extra args passed to Tophat:', ' '.join(unknown_args))
        args.extra_args = unknown_args
    elif unknown_args:
        logerr('Get unkown args.',
               'If they are passed to Tophat, please specify',
               '--extra-args')
        parser.error('unrecognized arguments: ' + ' '.join(unknown_args))
    else:
        args.extra_args = None

    # computing max_process
    if args.max_process > cpu_num:
        logwarn('Set # of processes({:d})'.format(args.max_process),
                '> # of CPUs({:d})'.format(cpu_num),
                'the efficiency will be low.')
    elif args.max_process <= 0:
        # zero or negative: count back from the number of CPU cores
        args.max_process = cpu_num + args.max_process
        if args.max_process <= 0:
            logwarn('Negative # of processes({:d}) has been set, reset to 1'
                    .format(args.max_process))
            args.max_process = 1

    return args
def cmd_tophat(gz_pth_1, gz_pth_2, bowtie_ref, gene_ref, out_dir,
               read_length, max_process, resume, rm_unzip_fq, extra_args):
    '''Unzip the inputs when needed and run TopHat on them.

    Args:
        gz_pth_1: path of the first read file (.fastq, or a zipped file
            that is unzipped next to it first).
        gz_pth_2: path of the second read file, falsy for single-end.
        bowtie_ref: Bowtie2 index prefix handed to TopHat.
        gene_ref: path of the GTF annotation (-G), falsy to omit it.
        out_dir: TopHat output directory; also used as working directory.
        read_length: value for ``-r``, falsy to omit it.
        max_process: number of threads (-p).
        resume: if true, ``--resume out_dir`` is added.
        rm_unzip_fq: if true, fastq files unzipped by this call are
            removed afterwards; files that existed before stay intact.
        extra_args: list of extra arguments appended unchecked, or falsy.

    Failures of unzipping or of TopHat itself are logged, not raised.
    '''

    def fastq_of(gz_pth):
        # path of the unzipped file: the input minus its last extension
        gz_dir, gz_name = op.dirname(gz_pth), op.basename(gz_pth)
        return op.join(gz_dir, op.splitext(gz_name)[0])

    def unzip(gz_pth):
        gz_name = op.basename(gz_pth)
        fastq_path = fastq_of(gz_pth)
        # check if <gz_pth>.fastq exists
        if op.exists(fastq_path):
            # fastq exists, skipping
            return fastq_path
        unzip_s = dt.datetime.today()
        logp('unzip', gz_name, 'starts at', strTime(unzip_s))
        cmd_gunzip(gz_pth, fastq_path)
        unzip_e = dt.datetime.today()
        logp('ends at', strTime(unzip_e),
             'using', strDiffTime(unzip_s, unzip_e))
        if not op.exists(fastq_path):
            logerr('Output fastq not found!', fastq_path)
            return None
        return fastq_path

    def rm_fq(fastq_path):
        logp('removing unzipped fastq:', fastq_path)
        os.remove(fastq_path)

    def prepare(pth, created):
        # return the fastq path for pth (None on failure) and remember
        # in `created` the files that did not exist before this call
        if op.splitext(pth)[1] == '.fastq':
            return pth
        existed_before = op.exists(fastq_of(pth))
        fq_pth = unzip(pth)
        if fq_pth is not None and not existed_before:
            created.append(fq_pth)
        return fq_pth

    created_fastq = []
    try:
        # unzipping
        fq_pth_1 = prepare(gz_pth_1, created_fastq)
        if fq_pth_1 is None:
            logerr('Cannot unzip', gz_pth_1, 'Skipping')
            return
        fq_pth_2 = None
        if gz_pth_2:
            fq_pth_2 = prepare(gz_pth_2, created_fastq)
            if fq_pth_2 is None:
                logerr('Cannot unzip', gz_pth_2, 'Skipping')
                return

        # tophat command splitted by space
        cmd = ['tophat', '-p', str(max_process), '-o', out_dir]
        if gene_ref:  # genes.gtf
            cmd.extend(['-G', gene_ref])
        if resume:  # resume Tophat
            cmd.extend(['--resume', out_dir])
        if read_length:  # mate_inner_length
            cmd.extend(['-r', str(read_length)])
        if extra_args:
            logp('getting extra args for TopHat:', ' '.join(extra_args))
            cmd.extend(extra_args)
        cmd.extend([bowtie_ref, fq_pth_1])
        # paired end
        if fq_pth_2:
            cmd.append(fq_pth_2)

        logp('running command:', ' '.join(cmd))
        dt_start = dt.datetime.today()
        logp('starts at', strTime(dt_start))
        process = sp.Popen(
            cmd,
            stdout=sp.PIPE, stderr=sp.PIPE,  # pipeline
            bufsize=-1,  # means use system buffer size
            universal_newlines=True,  # parse '\n' automatically
            cwd=out_dir
        )
        # communicate() drains both pipes while waiting; calling wait()
        # first can block forever once a pipe buffer is full
        stdout, stderr = process.communicate()
        dt_end = dt.datetime.today()
        logp('ends at', strTime(dt_end),
             'using', strDiffTime(dt_start, dt_end))

        # non-zero also covers negative codes (terminated by a signal)
        if process.returncode != 0:
            if stderr.startswith('Nothing to resume.'):
                logp('successfully complete, skipping')
            logerr('Original message:\n' + stderr)
    finally:
        # remove unzipped fastq, but only those created by this call
        if rm_unzip_fq:
            for fq_pth in created_fastq:
                rm_fq(fq_pth)
def _sample_run_args(sample, args, multiple):
    '''Return a copy of args whose out_dir is resolved for one sample.

    The copy keeps changes to out_dir and resume from leaking into the
    runs of other samples. The result directory is created if missing.
    '''
    temp_args = copy.deepcopy(args)
    if not args.out_dir:
        # default output path <path_of_sample>/Tophat
        temp_args.out_dir = op.join(op.dirname(sample), 'Tophat')
    elif multiple:
        # args.out_dir is root path shared by all samples,
        # make sample sub_dir
        sample_name = 'Sample_' + op.basename(sample).split('.')[0]
        logp('subdir for sample name guessed from filename:', sample_name)
        temp_args.out_dir = op.join(args.out_dir, sample_name)

    # if not previous work, turn off the --resume option
    cond_log_exist = op.exists(
        op.join(temp_args.out_dir, 'logs/tophat.log'))
    if temp_args.resume and not cond_log_exist:
        logp('previous log file tophat.log not found,',
             'resume function is temporarily off')
        temp_args.resume = False

    # create the out_dir and check if out_dir exists
    if op.isdir(temp_args.out_dir):
        if not temp_args.resume:
            logwarn('results dir exists', temp_args.out_dir)
    else:
        os.makedirs(temp_args.out_dir, mode=0o755)
    return temp_args


def fastq_list_Sample(Sample_pth_list, args):
    '''Main program of Sample mode.

    Sets up the execution environment, filters out invalid input and
    hands every valid sample (pair) over to ``run_sample``.

    Args:
        Sample_pth_list: nested list of sample paths, one inner list per
            ``-S`` option.
        args: parsed options. ``args.out_dir`` is replaced in place by
            its absolute path when given; everything that differs per
            sample is set on a copy.

    In paired mode every inner list must hold exactly two existing
    files. In single mode all inner lists are flattened and every
    existing file is run on its own. Invalid input is logged and
    skipped.
    '''
    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple sample is input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')

    if args.seq_type == 'paired':
        # paired mode, run by pairs
        # exapmle: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
        #   => [
        #        ['A_R0.gz', 'A_R1.gz'],
        #        ['B_R0.gz', 'B_R1.gz'],
        #        [...], ...
        #      ]
        total = len(Sample_pth_list)
        for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
            # validation, samples should be paired
            if len(paired_sample_list) != 2:
                logerr('Input samples :', ' ,'.join(paired_sample_list),
                       'is not paired!', 'Skipping ...')
                continue
            sample_R1, sample_R2 = paired_sample_list
            # samples should exist
            if not op.exists(sample_R1):
                logerr('Input', sample_R1, 'does not exist! Skipping')
                continue
            if not op.exists(sample_R2):
                logerr('Input', sample_R2, 'does not exist! Skipping')
                continue

            # if multiple samples, show the working progress
            if total > 1:
                logm('({:d}/{:d}) Processing paired:'.format(i, total),
                     sample_R1, sample_R2)
            else:
                logm('Processing paired:', sample_R1, sample_R2)

            # use absolute path
            sample_R1 = op.abspath(sample_R1)
            sample_R2 = op.abspath(sample_R2)
            temp_args = _sample_run_args(sample_R1, args, total > 1)
            run_sample(sample_R1, sample_R2, temp_args)
    else:
        # single mode, run one by one
        # flatting all sequence into a list
        # example: -S A.fq -S B.fq C.fq D.fa -S ...
        # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
        flatten_sample_list = list(
            itertools.chain.from_iterable(Sample_pth_list))
        multiple = len(flatten_sample_list) > 1
        for sample in flatten_sample_list:
            # sample should exist
            if not op.exists(sample):
                logerr('Input', sample, 'does not exist! Skipping')
                continue
            logm('Processing', sample)
            sample = op.abspath(sample)
            # per-sample copy: the shared args must not keep the out_dir
            # of the first sample for all the following ones
            temp_args = _sample_run_args(sample, args, multiple)
            # same function as paired-end mode, but leaving sample_R2 empty
            run_sample(sample, '', temp_args)
def fastq_list_Project(Project_pth_list, args):
    '''Main program of Project mode.

    Walks through every given project directory, finds its ``Sample_*``
    sub-directories and calls ``run_sample`` once per row of each
    sample's ``SampleSheet.csv``.

    Args:
        Project_pth_list: nested list of project directories, one inner
            list per ``-P`` option.
        args: parsed options; a fresh copy is made for every run.
    '''
    # flatten the nested list
    # Ex -P A B ... -P C -P D ...
    #   => [[A, B], [C], [D], [...], ...]
    #   => [A, B, C, D, ...]
    all_projects = [pth for group in Project_pth_list for pth in group]
    logp('retreiving', len(all_projects), 'projects')

    for project in all_projects:
        if not op.isdir(project):
            # not exist or not a directory
            logerr('Cannot find the directory of project:', project,
                   'Skipping...')
            continue

        # using absolute path to prevent that some programs can not handle
        # relative path and easy for debugging
        project_dir = op.abspath(project)

        # a typical sequencing path output after Illumina demultiplexing
        #   .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired output path
        #   .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        parent_dir, project_name = op.split(project_dir)
        flowcell_dir = op.dirname(parent_dir)
        logm('Working project:', project_name[8:])

        # determine result dir
        if args.out_dir:
            result_root = op.join(op.abspath(args.out_dir), project_name)
        else:
            result_root = op.join(
                flowcell_dir, 'Aligned', project_name, 'TopHat')

        # create result dir first
        if op.exists(result_root):
            logwarn('project result exists')
        else:
            os.makedirs(result_root, mode=0o755)

        # obtain all sample dir in the project
        sample_dirs = sorted(glob.glob(op.join(project_dir, 'Sample_*/')))
        n_samples = len(sample_dirs)
        logp('contains', str(n_samples), 'samples')

        for idx, sample_dir in enumerate(sample_dirs, start=1):
            sample_label = op.basename(op.dirname(sample_dir))[7:]
            logm('({:d}/{:d}) Sample: {:s}'
                 .format(idx, n_samples, sample_label),
                 'in project', project_name[8:])

            # read SampleSheet.csv in the sample_dir
            sheet_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(sheet_pth):
                logerr('SampleSheet.csv not found! Skipping')
                continue

            # parsing SampleSheet.csv
            with open(sheet_pth) as sheet_f:
                for row in csv.DictReader(sheet_f):
                    run_args = copy.deepcopy(args)

                    # typical sample name
                    #   No35_ATGTCA_L003_R1_001
                    #   => <prefix>_R1/2_001
                    prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'], row['Index'], int(row['Lane']))

                    # output dir
                    run_args.out_dir = op.join(
                        result_root, 'Sample_' + row['SampleID'])
                    logp('result goes to', run_args.out_dir)
                    if op.exists(run_args.out_dir):
                        logwarn('sample result exists')
                    else:
                        os.makedirs(run_args.out_dir, mode=0o755)

                    # reference
                    if not run_args.ref_name:
                        run_args.ref_name = row['SampleRef']

                    read_1 = op.join(
                        sample_dir, prefix + '_R1_001.fastq.gz')
                    if run_args.seq_type == 'paired':
                        read_2 = op.join(
                            sample_dir, prefix + '_R2_001.fastq.gz')
                    else:
                        read_2 = ''
                    run_sample(read_1, read_2, run_args)
def run_sample(sample_R1, sample_R2, args):
    '''Resolve the references for one sample (pair) and run TopHat on it.

    The Bowtie2 index and, unless ``args.annotation`` is false, the GTF
    annotation are taken from the explicitly given paths when those are
    valid. Otherwise they are derived from the reference root returned
    by ``ref_path``, which is only consulted when at least one of the
    two is still needed.

    The sample is skipped (logged, nothing raised) when a required
    reference cannot be determined.
    '''
    if args.seq_type == 'paired':
        # for paired, make sure the order is R1, R2
        sample_R1, sample_R2 = sorted(
            [sample_R1, sample_R2], key=op.basename)

    cond_bowtie = bool(
        args.bowtie_index and
        op.exists(op.dirname(args.bowtie_index)) and
        args.bowtie_index[-7:] == '/genome'
    )
    cond_gtf = bool(
        args.annotation and
        args.gtf_path and
        op.exists(args.gtf_path) and
        op.splitext(args.gtf_path)[1] == '.gtf'  # gtf file
    )

    # explicit paths have priority; look the reference root up only when
    # something must be derived from it, so that no misleading lookup
    # errors are logged for fully specified runs
    need_ref_root = not cond_bowtie or (args.annotation and not cond_gtf)
    if need_ref_root:
        ref_root_path = ref_path(sample_R1, args.ref_name)
    else:
        ref_root_path = None
    ref_label = 'reference: ' + args.ref_name if args.ref_name else ''

    # --- Bowtie2 reference ---
    if cond_bowtie:
        logp('directly specify bowtie index path:', args.bowtie_index)
        genome_bowtie_ref = args.bowtie_index
    elif ref_root_path is None:
        logerr('Cannot determine the Bowtie index reference! Skipping')
        return
    else:
        # then appended with Sequence/Bowtie2Index/genome
        genome_bowtie_ref = op.join(ref_root_path,
                                    'Sequence/Bowtie2Index/genome')
        logp(ref_label, 'mapping to', genome_bowtie_ref)

    # --- gene annotation reference ---
    if not args.annotation:
        logp('run without a reference annotation')
        gene_gtf_ref = None
    elif cond_gtf:
        logp('directly specify genes annotation file path:', args.gtf_path)
        gene_gtf_ref = args.gtf_path
    elif ref_root_path is None:
        logerr('Cannot determine the gene annotation reference! Skipping')
        return
    else:
        gene_gtf_ref = op.join(ref_root_path,
                               'Annotation/Genes/genes.gtf')
        logp(ref_label, 'mapping to', gene_gtf_ref)

    cmd_tophat(
        gz_pth_1=sample_R1,
        gz_pth_2=sample_R2,
        bowtie_ref=genome_bowtie_ref,
        gene_ref=gene_gtf_ref,
        out_dir=args.out_dir,
        read_length=args.read_length,
        max_process=args.max_process,
        resume=args.resume,
        rm_unzip_fq=args.rm_unzip_fq,
        extra_args=args.extra_args
    )