def qc(self):
    ''' generate qc analysis script '''
    # NOTE: the docstring above was previously placed *after* the first
    # statement, where it was only a no-op string expression; moved to the
    # top so it is a real docstring.
    self.get_proj_base_inf()
    qc_dir = os.path.join(self.proj_dir, 'qc')
    qc_script = os.path.join(self.proj_dir, 'qc.sh')
    qc_error = os.path.join(self.proj_dir, 'qc.error')
    # remove in future
    pathlib.Path(qc_error).touch()
    script_inf = '''
#! /bin/sh
qc_pipe.py qc_collection \\
--OutDir {0} \\
--CleanDir {1} \\
--SampleInf {2} \\
--species {3} \\
--database {4} \\
--database-version {5} \\
--workers {6} \\
'''.format(qc_dir, self.clean_dir, self.sample_inf, self.species,
           self.database, self.database_version, self.worker_number)
    write_obj_to_file(script_inf, qc_script)
    # launch the generated shell script and return its handle/result
    return self.run_script_or_cmd(qc_script)
def get_diff_target(difffile, sRNA_target_dict, out_target):
    """Collect the target mRNAs of every differential miRNA listed in
    *difffile* (one id per line) and write the de-duplicated list to
    *out_target*."""
    targets = set()
    for line in open(difffile):
        mirna = line.strip()
        if mirna in sRNA_target_dict:
            targets.update(sRNA_target_dict[mirna])
    python_tools.write_obj_to_file(list(targets), out_target)
def get_diff_gene(difffile, outgene):
    """Extract the first column (gene ids) of a diff table, skipping the
    header line, and write them to *outgene*.

    Exits with an error message when *difffile* does not exist.
    """
    diff_gene_list = []
    if os.path.isfile(difffile):
        with open(difffile) as difffile_info:
            for n, eachline in enumerate(difffile_info):
                if n != 0:
                    # strip the trailing newline first: without it a line
                    # containing no tab would yield an id ending in '\n'
                    eachline_info = eachline.strip().split('\t')
                    diff_gene_list.append(eachline_info[0])
    else:
        sys.exit('%s not exists!' % difffile)
    python_tools.write_obj_to_file(diff_gene_list, outgene)
def kallisto_quant(self):
    """Run kallisto quantification.

    With more than one fastq the default (paired) quant mode is used;
    a single fastq falls back to --single mode with the configured
    fragment length and sd.  The executed command line is written to
    quant.cmd.log and returned.
    """
    fq_cmd = ' '.join(self.fq_list)
    if len(self.fq_list) > 1:
        cmd = '%s quant -i %s -o %s %s' % (self.kallisto, self.index, self.out_dir, fq_cmd)
        python_tools.circ_call_process(cmd)
    else:
        # NOTE(review): removed dead code `single_length = 2*self.fq_length`
        # -- it was computed but never used in the command below.
        cmd = '%s quant -i %s -o %s --single -l %s -s %s --plaintext -t %s %s ' % (
            self.kallisto, self.index, self.out_dir, self.fq_length,
            self.sd, self.thread, fq_cmd)
        python_tools.circ_call_process(cmd)
    quant_log = os.path.join(self.out_dir, 'quant.cmd.log')
    python_tools.write_obj_to_file(cmd, quant_log)
    return cmd
def run(self):
    """Write the report manifest and ignore list for the mapping-summary
    output directory."""
    ignore_files = [
        '.ignore', 'logs', 'mapping_dir', 'bam_dir',
        'mapping_stats.plot', 'Rplots.pdf', 'mapping_stats.report',
    ]
    pdf_report_files = [
        'mapping_stats_plot.png', 'mapping_stats.report', 'mapping_stats.txt',
    ]
    # record which files belong in the pdf report
    write_obj_to_file(pdf_report_files, path.join(self.OutDir, '.report_files'))
    # everything listed here is skipped during delivery
    with self.output().open('w') as ignore_handle:
        ignore_handle.write(''.join('{}\n'.format(name) for name in ignore_files))
def run(self):
    """Write the fastqc report manifest and ignore list for the QC output
    directory."""
    ignore_files = ['.ignore', 'logs', 'fastqc_results/*zip', '.report_files']
    patterns = [
        'fastqc_general_stats.txt',
        'gc_plot/*gc_distribution.line.png',
        'reads_quality_plot/*reads_quality.bar.png',
    ]
    # expand the glob patterns against the output directory
    pdf_report_files = rsync_pattern_to_file(self.OutDir, patterns)
    write_obj_to_file(pdf_report_files, path.join(self.OutDir, '.report_files'))
    with self.output().open('w') as ignore_handle:
        ignore_handle.write(''.join('{}\n'.format(name) for name in ignore_files))
def run(self):
    """Collect GO/KEGG enrichment result files into the report manifest and
    write the ignore list."""
    ignore_files = ['.ignore', 'logs', 'kegg/blast_out',
                    'kegg/kegg_pathway_logs', '.report_files',
                    'report.go.table.txt', 'report.kegg.table.txt', '*png']
    patterns = ['go/*/*go.enrichment.barplot.png',
                'kegg/*/*kegg.enrichment.barplot.png',
                'go/*/DAG/ALL*png',
                'go/*/*.ALL.go.enrichment.txt',
                'kegg/*/*ALL.kegg.enrichment.txt']
    report_files = rsync_pattern_to_file(self.OutDir, patterns)
    # pathway plots are discovered separately and appended
    report_files.extend(get_enrichment_data(self.OutDir))
    write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
    with self.output().open('w') as ignore_handle:
        for name in ignore_files:
            ignore_handle.write('{}\n'.format(name))
def run(self):
    """Write the expression-analysis report manifest and ignore list."""
    ignore_files = ['.ignore', 'logs', 'kallisto/*/run_info.json',
                    '.report_files', 'Rplots.pdf',
                    'expression_summary/pdf.*',
                    'expression_summary/html.*',
                    'expression_summary/ALL.Volcano_plot.*']
    patterns = ['expression_summary/*.png',
                'differential_analysis/*/*png',
                'expression_summary/*Gene.tpm.txt',
                'expression_summary/*example.diff.table.txt',
                'differential_analysis/*/*.edgeR.DE_results.txt']
    report_files = rsync_pattern_to_file(self.OutDir, patterns)
    write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
    with self.output().open('w') as ignore_handle:
        ignore_handle.write(''.join('{}\n'.format(name) for name in ignore_files))
def run(self):
    """Build snp.plot.sh (bcftools stats + plot-vcfstats + summary Rscript),
    run it, and store its output as the task log."""
    stats_cmd = 'bcftools stats -s - {0}/snp.filter.vcf > {0}/snp.stats'.format(
        OutDir)
    plot_cmd = 'plot-vcfstats -p {0}/snp_plot -s {0}/snp.stats'.format(OutDir)
    summary_cmd = ('Rscript {0} --snp_stats {1}/snp_plot/tstv_by_sample.0.dat '
                   '--sample_inf {2} --out_dir {1}').format(
                       SNP_PLOT, OutDir, SampleInf)
    snp_plot_cmd_file = path.join(OutDir, 'snp.plot.sh')
    write_obj_to_file([stats_cmd, plot_cmd, summary_cmd], snp_plot_cmd_file)
    # run the generated script and capture its combined output
    snp_summary_inf = run_cmd(['sh', snp_plot_cmd_file])
    with self.output().open('w') as snp_summary_log:
        snp_summary_log.write(snp_summary_inf)
def run(self):
    """Write the RSeQC report manifest and ignore list."""
    ignore_files = [
        '.ignore', 'logs',
        'read_duplication/*.DupRate_plot.*',
        'read_distribution/read_distribution.summary.txt',
        'junction_saturation',
        'inner_distance/*inner_distance_plot*',
        'inner_distance/*inner_distance.txt',
        'infer_experiment',
        'genebody_coverage/*geneBodyCoverage.curves.pdf',
        'genebody_coverage/*geneBodyCoverage.r',
        'Rplots.pdf',
    ]
    patterns = [
        'inner_distance/*inner_distance.bar.png',
        'read_duplication/*reads_duplication.point.png',
        'genebody_coverage/*genebody_coverage.point.png',
        'read_distribution/read_distribution.bar.png',
        'read_distribution/*read_distribution.pie.png',
    ]
    pdf_report_files = rsync_pattern_to_file(self.OutDir, patterns)
    write_obj_to_file(pdf_report_files, path.join(self.OutDir, '.report_files'))
    with self.output().open('w') as ignore_handle:
        for name in ignore_files:
            ignore_handle.write('{}\n'.format(name))
def run(self):
    """Merge per-sample stringtie GTFs into one assembly and extract the
    merged transcript fasta, logging both command outputs."""
    gtf_list_file = path.join(OutDir, 'assembly_dir', 'gtf.list')
    gtf_paths = []
    for each_sample in sample_list:
        gtf_paths.append(
            path.join(OutDir, 'assembly_dir', '{}.gtf'.format(each_sample)))
    write_obj_to_file(gtf_paths, gtf_list_file)
    merge_cmd = ['stringtie', '--merge', '-m', '200', '-T', '0.1',
                 '-f', '0.1', '-o', '{}/stringtie_merge.gtf'.format(OutDir),
                 gtf_list_file]
    if RefGtf:
        # guide the merge with the reference annotation when available
        merge_cmd += ['-G', RefGtf]
    fa_cmd = ['gffread', '{}/stringtie_merge.gtf'.format(OutDir),
              '-g', RefFa, '-w', '{}/stringtie_merge.fa'.format(OutDir)]
    merge_log_inf = run_cmd([merge_cmd, fa_cmd])
    with self.output().open('w') as merge_log:
        merge_log.write(merge_log_inf)
def pipe(self):
    """Generate pipe.sh for the project and launch it."""
    self.get_proj_base_inf()
    pipe_error = os.path.join(self.proj_dir, 'pipe.error')
    pipe_script = os.path.join(self.proj_dir, 'pipe.sh')
    pathlib.Path(pipe_error).touch()
    pipe_inf = '''
#! /bin/sh
mRNA_pipe_v2.py run_pipe \\
--proj-name {0} \\
--proj-dir {1} \\
--clean-dir {2} \\
--sample-inf {3} \\
--species {4} \\
--workers {5} \\
--database {6} \\
--database-version {7} \\
'''.format(self.proj_name, self.proj_dir, self.clean_dir,
           self.sample_inf, self.species, self.worker_number,
           self.database, self.database_version)
    if self.kegg_bg:
        # append the optional kegg background option, keeping the trailing
        # shell line-continuation
        pipe_inf = '{0} --kegg-bg {1} \\\n'.format(pipe_inf, self.kegg_bg)
    write_obj_to_file(pipe_inf, pipe_script)
    return self.run_script_or_cmd(pipe_script)
# NOTE(review): this chunk is the tail of a script's main section; the
# leading `pass` likely closes an if/try block cut off above this chunk.
pass
python_tools.circ_mkdir_unix(args.out_dir)
time_info = time.localtime()
# run timestamp -- fields are not zero-padded (e.g. 2024-1-5-9:3:7);
# presumably informational only: it is not used in the visible code.
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)
md5_file = os.path.join(args.out_dir, 'fq_md5.txt')
md5_list = []
for each in sample_dict:
    if args.nocheck:
        # --nocheck: only generate md5 values, no verification
        logging.info('get md5 of %s start' % each)
        log_list = sample_dict[each].get_dna_md5()
        logging.info('get md5 of %s finished' % each)
    else:
        logging.info('check md5 of %s start' % each)
        if args.type == 'dna':
            log_list = sample_dict[each].check_dna_md5(args.out_dir)
        else:
            log_list = sample_dict[each].check_rna_md5(args.out_dir)
        if log_list:
            for each_log in log_list:
                # log lines containing 'ok' passed verification
                if 'ok' in each_log:
                    logging.info(each_log)
                else:
                    logging.error(each_log)
        logging.info('check md5 of %s finished' % each)
    md5_list.extend(sample_dict[each].md5)
python_tools.write_obj_to_file(md5_list, md5_file)
# NOTE(review): the first three statements read like the body of a helper
# (something like `nohup_sh_job(sh_script, log_dir)`) whose def line is not
# visible in this chunk -- confirm indentation against the full file.
shell_name = path.basename(sh_script)
log_file = path.join(log_dir, '{0}.log'.format(shell_name))
# launch the script detached; stdout+stderr go to the per-script log file
system('nohup sh {0} > {1} 2>&1 &'.format(sh_script, log_file))

if __name__ == '__main__':
    arguments = docopt(__doc__, sys.argv[1:], version='v1')
    cp_info_file = arguments['--cp_info_file']
    log_dir = arguments['--log_dir']
    if not cp_info_file or not log_dir:
        print(__doc__)
        sys.exit(1)
    ## get cp info dict
    # tab-separated: source directory -> destination directory
    cp_info_dict = {}
    with open(cp_info_file) as cp_info_file_inf:
        for eachline in cp_info_file_inf:
            eachline_inf = eachline.strip().split('\t')
            cp_info_dict[eachline_inf[0]] = eachline_inf[1]
    ## creat cp shell script for each directory and lauch it
    cp_info_file_name = path.basename(cp_info_file)
    shell_script = '{0}.sh'.format(cp_info_file_name)
    cmd_list = []
    for each in cp_info_dict:
        from_dir = each
        to_dir = cp_info_dict[each]
        cmd_list.extend(shell_cmd_for_cp(from_dir, to_dir, log_dir))
    write_obj_to_file(cmd_list, shell_script)
    nohup_sh_job(shell_script, log_dir)
def accession2gi(accession): try: handle = Entrez.elink(dbfrom="nucleotide", id=accession, linkname="nucleotide_nuccore") except HTTPError: time.sleep(20) handle = Entrez.elink(dbfrom="nucleotide", id=accession, linkname="nucleotide_nuccore") record = Entrez.read(handle) handle.close() linked = record[0]['IdList'][0] time.sleep(10) return linked if __name__ == '__main__': accession_list = [each.strip() for each in open(args.accession)] gi_list = [] for each_id in accession_list: each_accession = "" try: each_accession = accession2gi(each_id) # gi_list.append(accession2gi(each_id)) except: print 'can not find gi number of accession: %s' % each_id else: gi_list.append(each_accession) python_tools.write_obj_to_file(gi_list, args.gi)
# NOTE(review): this chunk begins mid-condition -- something like
# `if os.path.isfile('rawdata_number.json') and os.stat(` precedes it
# outside the visible region; indentation below is reconstructed.
        'rawdata_number.json').st_size:
    # reuse cached per-sample fastq counts from a previous run
    sample_number_dict = python_tools.load_fn_to_obj('rawdata_number.json')
else:
    sample_number_dict = {}
total_size = []
cp_data_info = open(cp_data_info_file, 'w')
for each in sample_dict:
    if each in sample_number_dict:
        # sample seen before: remember how many fastqs it already had
        sample_dict[each].pre_num = sample_number_dict[each]
    else:
        sample_number_dict[each] = len(sample_dict[each].read1)
    # NOTE(review): placement of this assignment relative to the if/else is
    # ambiguous in the collapsed source; `cmd_line` is used after the inner
    # loop, so loop-level placement is assumed -- confirm against original.
    cmd_line = sample_dict[each].add_sup_data(args.out_dir)
    for n, each_fq in enumerate(sample_dict[each].read1):
        read1_fq = sample_dict[each].read1[n]
        read2_fq = sample_dict[each].read2[n]
        # sizes converted from bytes to GiB
        read1_fq_size = os.stat(read1_fq).st_size / float(1024**3)
        read2_fq_size = os.stat(read2_fq).st_size / float(1024**3)
        total_size.extend([read1_fq_size, read2_fq_size])
        read1_fq_size_out = round(read1_fq_size, 2)
        read2_fq_size_out = round(read2_fq_size, 2)
        cp_data_info.write('%s\t%s\t%sG\t%s\t%sG\n' %
                           (sample_dict[each].name, read1_fq,
                            read1_fq_size_out, read2_fq, read2_fq_size_out))
    python_tools.write_obj_to_file(cmd_line, cp_cmd, True)
cp_data_info.write('total : %sG' % round(sum(total_size), 2))
cp_data_info.close()
# persist the sample->fastq-count map for the next run
cp_data_info_json = os.path.join(cwd, 'rawdata_number.json')
python_tools.write_obj_to_json(sample_number_dict, cp_data_info_json)
def check_data(self):
    """Run QC threshold checks (Q30, mapping rates, duplication, TIN, CDS
    portion) against the collected summary dataframe, appending pass/fail
    messages to qc_check_out.txt, then write the final qc_summary.txt.

    Each section only runs when its source directory exists, so partial QC
    runs are tolerated.
    """
    qc_summary_check_file = path.join(self.qc_dir, 'qc_check_out.txt')
    # start the check log fresh (subsequent writes append)
    write_obj_to_file('QC check logs.', qc_summary_check_file)
    # check q30:
    if path.exists(self.fastqc_dir):
        self.get_fastqc_summary()
        # gt=False: flag samples whose Q30 falls BELOW the cutoff
        q30_check_msg = self.get_failed_msg('Q30', 'Q30(%)',
                                            self.Q30_CUT, gt=False)
        write_obj_to_file(q30_check_msg, qc_summary_check_file, append=True)
    # check mapping
    if path.exists(self.mapping_dir):
        self.get_mapping_summary()
        # STAR-style percentage strings ('12.3%') -> floats
        self.qc_summary_df.loc[:, 'unique_mapping_rate'] = [
            float(each.rstrip('%'))
            for each in self.qc_summary_df.loc[:, 'Uniquely mapped reads %']
        ]
        self.qc_summary_df.loc[:, 'multiple_mapping_rate'] = [
            float(each.rstrip('%'))
            for each in self.qc_summary_df.loc[:,
                                               '% of reads mapped to multiple loci']
        ]
        self.qc_summary_df.loc[:, 'total_mapping_rate'] = self.qc_summary_df.loc[:, 'unique_mapping_rate'] + self.qc_summary_df.loc[:, 'multiple_mapping_rate']
        # adaptive cutoff: 10 points under the cohort median, but never
        # below the configured floor
        mapping_rate_median = self.qc_summary_df.loc[:, 'total_mapping_rate'].median(
        )
        mapping_rate_cutoff = max((mapping_rate_median - 10),
                                  self.MAPPING_RATE_CUT)
        mapping_rate_check_msg = self.get_failed_msg('Mapping rate',
                                                     'total_mapping_rate',
                                                     mapping_rate_cutoff,
                                                     gt=False)
        write_obj_to_file(mapping_rate_check_msg, qc_summary_check_file,
                          append=True)
        # multi-mapping: 10 points over the median, capped by the ceiling
        multi_map_median = self.qc_summary_df.loc[:, 'multiple_mapping_rate'].median(
        )
        multi_map_cutoff = min((multi_map_median + 10), self.MULTI_MAP_CUT)
        multi_map_check_msg = self.get_failed_msg('Multiple mapping rate',
                                                  'multiple_mapping_rate',
                                                  multi_map_cutoff)
        write_obj_to_file(multi_map_check_msg, qc_summary_check_file,
                          append=True)
        # drop the helper columns so they do not leak into the summary table
        self.qc_summary_df = self.qc_summary_df.drop([
            'unique_mapping_rate', 'multiple_mapping_rate',
            'total_mapping_rate'
        ], axis=1)
    # check rseqc
    if path.exists(self.rseqc_dir):
        self.get_rseqc_summary()
        # check duplication
        dup_check_msg = self.get_failed_msg('Duplication', 'Duplication_seq',
                                            self.DUP_CUT)
        write_obj_to_file(dup_check_msg, qc_summary_check_file, append=True)
        # check tin
        tin_check_msg = self.get_failed_msg('TIN', 'TIN(median)',
                                            self.TIN_CUT, gt=False)
        write_obj_to_file(tin_check_msg, qc_summary_check_file, append=True)
        # check cds
        cds_check_msg = self.get_failed_msg('CDS reads portion',
                                            'CDS_portion', self.CDS_CUT,
                                            gt=False)
        write_obj_to_file(cds_check_msg, qc_summary_check_file, append=True)
    # write qc summary out
    # NOTE(review): assumed to run regardless of which sections executed --
    # confirm this dedent against the original file.
    self.qc_summary_df.index.name = 'Sample'
    qc_summary_out = path.join(self.qc_dir, 'qc_summary.txt')
    self.qc_summary_df.to_csv(qc_summary_out, sep='\t')
python_tools.circ_mkdir_unix(args.out_dir)
time_info = time.localtime()
# run timestamp -- fields are not zero-padded; not used in the visible code
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)
md5_file = os.path.join(args.out_dir, 'fq_md5.txt')
md5_list = []
for each in sample_dict:
    if args.nocheck:
        logging.info('get md5 of %s start' % each)
        log_list = sample_dict[each].get_dna_md5()
        logging.info('get md5 of %s finished' % each)
    else:
        logging.info('check md5 of %s start' % each)
        # NOTE(review): this branch logs 'check md5' but calls
        # get_dna_md5()/get_rna_md5(); a sibling script calls
        # check_dna_md5(out_dir)/check_rna_md5(out_dir) here -- confirm
        # whether generation (not verification) is intended.
        if args.type == 'dna':
            log_list = sample_dict[each].get_dna_md5()
        else:
            log_list = sample_dict[each].get_rna_md5()
        if log_list:
            for each_log in log_list:
                # log lines containing 'ok' passed verification
                if 'ok' in each_log:
                    logging.info(each_log)
                else:
                    logging.error(each_log)
        logging.info('check md5 of %s finished' % each)
    md5_list.extend(sample_dict[each].md5)
# header line first, then the collected md5 records (append mode)
python_tools.write_obj_to_file('Sample\tfile\tfile_size\tmd5', md5_file)
python_tools.write_obj_to_file(md5_list, md5_file, True)
def kallisto_index(self):
    """Build the kallisto index for the transcript fasta, log the command
    line to index.cmd.log and return it."""
    index_cmd = '%s index -i %s %s' % (self.kallisto, self.index,
                                       self.transcript_fa)
    python_tools.circ_call_process(index_cmd)
    python_tools.write_obj_to_file(index_cmd,
                                   os.path.join(self.out_dir, 'index.cmd.log'))
    return index_cmd
import sys import os import RNAseq_tools import python_tools if not len(sys.argv) == 3: print 'python ' + sys.argv[0] + ' gtf length_filtered_gtf' sys.exit(0) ori_gtf = sys.argv[1] out_gtf = sys.argv[2] tr_dict = RNAseq_tools.get_transcript_info(ori_gtf) filter_gtf_list = RNAseq_tools.gtf_length_filter(ori_gtf, tr_dict) python_tools.write_obj_to_file(filter_gtf_list, out_gtf)
# NOTE(review): the leading sys.exit(0) is presumably the tail of a usage
# check whose condition is outside the visible region -- confirm against
# the original file before relying on the top-level placement.
sys.exit(0)
lncRNA_feature = sys.argv[1]
novel_gtf = sys.argv[2]
add_gtf = sys.argv[3]
# transcript id / gene id -> biotype, read from the feature table
lncRNA_tr_dict = {}
lncRNA_gene_dict = {}
with open(lncRNA_feature) as lncRNA_feature_inf:
    for n, eachline in enumerate(lncRNA_feature_inf):
        if n != 0:
            # columns: ... [4]=transcript id, [5]=gene id, [-1]=biotype
            eachline_inf = eachline.strip().split('\t')
            tr_id = eachline_inf[4]
            gene_id = eachline_inf[5]
            tr_type = eachline_inf[-1]
            lncRNA_tr_dict[tr_id] = tr_type
            lncRNA_gene_dict[gene_id] = tr_type
out_list = []
for eachline in GFF_Reader(novel_gtf):
    gene_id = eachline.attr['gene_id']
    transcript_id = eachline.attr['transcript_id']
    # anything absent from the feature table defaults to 'TUCP'
    gene_type = tr_type = 'TUCP'
    if gene_id in lncRNA_gene_dict:
        gene_type = lncRNA_gene_dict[gene_id]
    if transcript_id in lncRNA_tr_dict:
        tr_type = lncRNA_tr_dict[transcript_id]
    # append biotype attributes to each original gtf line
    out_list.append('%s; gene_biotype "%s"; transcript_biotype "%s";' %
                    (eachline.get_gff_line().strip(), gene_type, tr_type))
python_tools.write_obj_to_file(out_list, add_gtf)
# NOTE(review): chunk starts mid-loop; `n`, `eachline_info`,
# `sample_index_dict`, `miRNA_seq_dict`, `seq_dict_list`, `seq_file_list`
# and merge_seq_list() are defined outside the visible region.
if n == 0:
    # header row: map column index -> sample id for *_novel_miRNA columns
    for m, each_col in enumerate(eachline_info):
        if '_novel_miRNA' in each_col:
            sample_id = each_col.split('_novel_miRNA')[0]
            sample_index_dict[m] = sample_id
else:
    novel_id = eachline_info[0]
    # three parallel sequence collections per novel miRNA -- presumably
    # mature/star/precursor; confirm against the full script
    novel_seq_list = [[], [], []]
    for m, each_col in enumerate(eachline_info):
        if m in sample_index_dict:
            sample_id = sample_index_dict[m]
            # '-' marks a sample without this novel miRNA
            if each_col != '-':
                each_col_list = each_col.split(',')
                each_col_list = [
                    each.strip() for each in each_col_list
                ]
                for each_col_info in each_col_list:
                    for i, each_seq in enumerate(
                            miRNA_seq_dict[each_col_info]
                            [sample_id]):
                        # note: ...[i] is the same object as each_seq
                        novel_seq_list[i].append(
                            miRNA_seq_dict[each_col_info]
                            [sample_id][i])
    # merge per-sample sequences into one representative per slot
    for j, each_seq_list in enumerate(novel_seq_list):
        each_merged_seq = merge_seq_list(each_seq_list)
        seq_dict_list[j][novel_id] = each_merged_seq
# final dump: one output file per sequence collection
for n, each_dict in enumerate(seq_dict_list):
    each_file = seq_file_list[n]
    python_tools.write_obj_to_file(each_dict, each_file)
# NOTE(review): depends on names defined outside this chunk
# (`sample_list`, `cutoff`, `cwd`, `target_reads_range`).
with open(args.datasize_file) as datasize_file_info:
    for n, eachline in enumerate(datasize_file_info):
        if n != 0:
            # columns: [0]=sample id, [1]=data size; oversized samples are
            # queued for down-sampling
            eachline_info = eachline.strip().split('\t')
            sample_id = eachline_info[0]
            datasize = float(eachline_info[1])
            if datasize > cutoff:
                sample_list.append(sample_id)
extract_cmd_file = os.path.join(cwd, 'extract_cmd.sh')
extract_cmd_list = []
# originals are moved into tmp/ before the truncated copies are written
tmp_dir = os.path.join(args.data_dir, 'tmp')
if not os.path.exists(tmp_dir):
    extract_cmd_list.append('mkdir -p "%s"' % tmp_dir)
# pick a random target line-count for each oversized sample
sample_datasize = random.sample(target_reads_range, len(sample_list))
for n, each_sample in enumerate(sample_list):
    extract_cmd_list.append(
        'echo "#### start extract data of sample %s ####"' % each_sample)
    extract_cmd_list.append('date')
    for read_num in (1, 2):
        each_fq_file = os.path.join(
            args.data_dir, '%s_R%s.fastq.gz' % (each_sample, read_num))
        each_fq_bak_file = os.path.join(
            tmp_dir, '%s_R%s.fastq.gz' % (each_sample, read_num))
        each_extract_read = sample_datasize[n]
        # keep the original in tmp/, then head-truncate into its place
        extract_cmd_list.append('mv "%s" "%s"' %
                                (each_fq_file, each_fq_bak_file))
        extract_cmd_list.append(
            'zcat "%s" | head -%s |gzip > "%s"' %
            (each_fq_bak_file, each_extract_read, each_fq_file))
    extract_cmd_list.append('date')
    extract_cmd_list.append(
        'echo "#### finish extract data of sample %s ####"\n' % each_sample)
python_tools.write_obj_to_file(extract_cmd_list, extract_cmd_file)
# NOTE(review): chunk starts mid-loop; `n`, `eachline_info`, `gene_id`,
# `go_anno_dict`, `ko_anno_dict` and `output_info_list` are defined outside
# the visible region; the nesting of the dict-lookup blocks relative to the
# else branches is reconstructed -- confirm against the original file.
if n == 0:
    # header row: pick column headers for the GO (or GO+InterPro) section
    if args.go:
        go_info = ['GO_term_accession', 'GO_domain', 'GO_term_name']
    else:
        go_info = [
            'GO_term_accession', 'GO_domain', 'GO_term_name',
            'InterPro_ID', 'InterPro_description'
        ]
else:
    # default placeholders for unannotated genes
    if args.go:
        go_info = ['--'] * 3
    else:
        go_info = ['--'] * 5
    if gene_id in go_anno_dict:
        go_info = []
        for each in go_anno_dict[gene_id]:
            go_info.append(','.join(each))
eachline_info.extend(go_info)
if n == 0:
    ko_info = ['KO_ID', 'KO_name']
else:
    ko_info = ['--'] * 2
    if gene_id in ko_anno_dict:
        ko_info = []
        for each in ko_anno_dict[gene_id]:
            ko_info.append(','.join(each))
eachline_info.extend(ko_info)
output_info_list.append('\t'.join(eachline_info))
# final write -- assumed to sit outside the row loop; confirm the dedent
python_tools.write_obj_to_file(output_info_list, args.output)
Entrez.email = '*****@*****.**' def accession2gi(accession): try: handle = Entrez.esearch(db="protein", term=accession) except HTTPError: time.sleep(20) handle = Entrez.esearch(db="protein", term=accession) record = Entrez.read(handle) handle.close() linked = record['IdList'][0] time.sleep(10) return linked if __name__ == '__main__': accession_list = [each.strip() for each in open(args.protein_acc)] gi_dict = {} for each_id in accession_list: each_accession = "NA" try: each_accession = accession2gi(each_id) # gi_list.append(accession2gi(each_id)) except Exception as e: print e print 'can not find gi number of accession: %s' % each_id gi_dict[each_id] = each_accession python_tools.write_obj_to_file(gi_dict, args.protein_id)