def run_kegg_pathview(self):
    """Run the pathview script and its checker for every edgeR result table.

    Each sub-directory of ``self.diff_dir`` is treated as one comparison.
    For every ``*.edgeR.DE_results.txt`` file found in it, a pathview
    command and a pathview-check command are built, executed through
    ``python_tools.circ_call_process`` and recorded.

    Returns:
        list: the command strings that were executed, in execution order.
    """
    log_dir = path.join(self.out_dir, 'kegg_pathway_logs')
    python_tools.circ_mkdir_unix(log_dir)

    executed = []
    for compare_name in listdir(self.diff_dir):
        compare_diff_dir = path.join(self.diff_dir, compare_name)
        compare_out_dir = path.join(self.out_dir, compare_name)
        table_pattern = '{}/*.edgeR.DE_results.txt'.format(compare_diff_dir)

        for diff_table in glob(table_pattern):
            prefix = path.basename(diff_table).split('.edgeR.DE_results')[0]
            # Tables whose prefix carries no 'UP' tag get the '.ALL' suffix.
            if 'UP' not in prefix:
                prefix = '{}.ALL'.format(prefix)

            kegg_table = path.join(
                compare_out_dir, '%s.kegg.enrichment.txt' % prefix)
            pathway_dir = path.join(compare_out_dir, '%s.pathway' % prefix)
            check_log = path.join(log_dir, '%s.log' % prefix)

            draw_cmd = 'python %s --kegg_table %s --blast_out %s --species %s --diff_out %s --out_dir %s' % (
                PATHVIEW, kegg_table, self.all_blast_out, self.species,
                diff_table, pathway_dir)
            check_cmd = 'python %s --kegg_table %s --pathway_dir %s --log_file %s' % (
                PATHVIEW_CK, kegg_table, pathway_dir, check_log)

            # The output directory must exist before the script is launched.
            python_tools.circ_mkdir_unix(pathway_dir)
            for cmd in (draw_cmd, check_cmd):
                python_tools.circ_call_process(cmd)
            executed += [draw_cmd, check_cmd]
    return executed
def run_result(self):
    """Rebuild the project result directory from the working directories.

    Posts a 'generate_result_start' message to ``self.monitor_dir``, removes
    any existing ``<project_dir>/<project_name>`` directory, recreates it,
    and copies each entry of ``self.work_dir`` into it with
    ``cp_dir_with_ignore`` under the same base name.
    """
    add_pipe_message(self.monitor_dir, 'generate_result_start')
    result_dir = path.join(self.project_dir, self.project_name)
    if path.exists(result_dir):
        # NOTE(review): the path is interpolated into a shell command
        # unquoted; a result_dir containing spaces or shell metacharacters
        # would be mis-parsed.
        system('rm -rf {}'.format(result_dir))
    circ_mkdir_unix(result_dir)
    for each_dir in self.work_dir:
        each_dir_result = path.join(result_dir, path.basename(each_dir))
        cp_dir_with_ignore(each_dir, each_dir_result)
def get_diff_splicing_table(rmats_output, out_dir, pvalue=0.05):
    """Write a quote-stripped copy of an rMATS table and its significant subset.

    Args:
        rmats_output: path of the tab-separated input table; it must have an
            ``FDR`` column.
        out_dir: directory receiving both output files (created if needed).
        pvalue: inclusive upper bound on ``FDR`` for a row to be kept.
            Previously this argument was ignored and 0.05 was always used;
            the default keeps that behaviour.

    Side effects:
        ``<out_dir>/<basename>`` gets the input with every double quote
        removed; ``<out_dir>/diff.<basename>`` gets the rows with
        ``FDR <= pvalue``, tab-separated, with missing values written as 'NA'.
    """
    circ_mkdir_unix(out_dir)
    rmats_output_name = path.basename(rmats_output)
    rmats_output_treat = path.join(out_dir, rmats_output_name)
    # NOTE(review): if out_dir is the directory that holds rmats_output, the
    # shell redirection truncates the input before sed reads it.
    system("sed -re 's/\"//g' {0} > {1}".format(rmats_output,
                                                 rmats_output_treat))
    diff_rmats_out = path.join(out_dir, 'diff.{}'.format(rmats_output_name))
    rmats_output_df = pd.read_table(rmats_output_treat, sep='\t')
    # Honour the caller-supplied threshold instead of a hard-coded 0.05.
    diff_rmats_output_df = rmats_output_df[rmats_output_df.FDR <= pvalue]
    diff_rmats_output_df.to_csv(diff_rmats_out, sep='\t', index=False,
                                na_rep='NA')
def run(self):
    """Generate a STAR genome index and store the command output.

    The index is written to ``<annotation_dir>/star_index``; whatever
    ``run_cmd`` returns for the STAR invocation is written to this task's
    output target.
    """
    index_dir = path.join(annotation_dir, 'star_index')
    circ_mkdir_unix(index_dir)

    # Build the argument vector option by option.
    star_args = ['STAR']
    star_args += ['--runThreadN', STAR_THREAD]
    star_args += ['--runMode', 'genomeGenerate']
    star_args += ['--sjdbOverhang', '149']
    star_args += ['--genomeFastaFiles', ref_fa]
    star_args += ['--sjdbGTFfile', ref_gtf]
    star_args += ['--genomeDir', index_dir]

    log_text = run_cmd(star_args)
    with self.output().open('w') as log_handle:
        log_handle.write(log_text)
def requires(self):
    """Publish the run configuration as module globals and list the sub-tasks.

    Copies the reference FASTA, GTF and species name from the task
    parameters into module-level globals, derives the genome, annotation
    and log directories from them, reads the blast and KOBAS database
    locations from ``config.getrc()``, creates the log directory, and
    returns the four tasks this one depends on.
    """
    global ref_fa, genome_dir, ref_gtf, annotation_dir, species_latin, species_ensembl, species_kegg, log_dir, ko_pep_dir, ko_db_dir
    ref_fa, ref_gtf, species_latin = self.ref_fa, self.ref_gtf, self.species_latin
    species_kegg, species_ensembl = get_kegg_biomart_id(species_latin)
    genome_dir = path.dirname(ref_fa)
    annotation_dir = path.dirname(ref_gtf)
    log_dir = path.join(annotation_dir, 'logs')
    kobasrc = config.getrc()
    ko_pep_dir = kobasrc['blastdb']
    ko_db_dir = kobasrc['kobasdb']
    circ_mkdir_unix(log_dir)
    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(ko_pep_dir)
    return [fa_index(), star_index(), go_annotation(), ko_annotation()]
def run(self):
    """Copy the files listed in ``.report_files`` into the report directory.

    If ``<from_dir>/.report_files`` exists, rsync copies the files it lists
    from ``self.from_dir`` to ``<to_dir>/<basename of from_dir>``.
    Otherwise nothing is copied. Either way, the rsync output or a
    'nothing for report' message is written to this task's output target.
    """
    dir_name = path.basename(self.from_dir)
    dest_dir = path.join(self.to_dir, dir_name)
    manifest = path.join(self.from_dir, '.report_files')

    if path.exists(manifest):
        circ_mkdir_unix(dest_dir)
        rsync_args = ['rsync', '-av']
        rsync_args.append('--files-from={}'.format(manifest))
        rsync_args.extend([self.from_dir, dest_dir])
        log_text = run_cmd(rsync_args)
    else:
        log_text = 'nothing for report in {}'.format(dir_name)

    with self.output().open('w') as log_handle:
        log_handle.write(log_text)
def run(self):
    """Symlink the clean fastq and bam directories into the analysis-data dir.

    Creates ``<proj_dir>/<proj_name>_analysis_data`` and asks ``run_cmd``
    to run two ``ln -s`` commands: ``clean_dir`` -> ``fq`` and
    ``<proj_dir>/mapping/bam_dir`` -> ``bam``. The value returned by
    ``run_cmd`` is written to this task's output target.
    """
    bam_source = path.join(self.proj_dir, 'mapping', 'bam_dir')
    data_dir = path.join(self.proj_dir,
                         '{}_analysis_data'.format(proj_name))
    bam_link = path.join(data_dir, 'bam')
    fq_link = path.join(data_dir, 'fq')
    circ_mkdir_unix(data_dir)

    # run_cmd receives both commands at once, fastq link first.
    link_cmds = [
        ['ln', '-s', clean_dir, fq_link],
        ['ln', '-s', bam_source, bam_link],
    ]
    log_text = run_cmd(link_cmds)
    with self.output().open('w') as log_handle:
        log_handle.write(log_text)
def run_KEGG_enrich(self):
    """Run KEGG enrichment for every ``*.diffgenes.txt`` list.

    For each comparison directory under ``self.diff_dir`` and each gene
    list in it, the matching blast records are extracted, then the KOBAS
    command from ``self.generate_kobas`` is run if the blast table exists,
    and the enrichment table is post-processed and converted to Excel if
    it exists.

    Returns:
        list: executed command strings, interleaved with
        '## <path> not exists!' markers for missing intermediate files.
    """
    blast_out_dir = path.join(self.out_dir, 'blast_out')
    python_tools.circ_mkdir_unix(blast_out_dir)

    commands = []
    for compare_name in listdir(self.diff_dir):
        compare_diff_dir = path.join(self.diff_dir, compare_name)
        gene_lists = glob('{}/*.diffgenes.txt'.format(compare_diff_dir))
        compare_out_dir = path.join(self.out_dir, compare_name)
        python_tools.circ_mkdir_unix(compare_out_dir)

        for gene_list in gene_lists:
            prefix = path.basename(gene_list).split('.edgeR.DE_results')[0]

            # Only the disabled run_kegg_pathview2 call used this path
            # (pathway_cmd = self.run_kegg_pathview2(<compare>, <diff_inf_file>)
            # followed by cmd_list.extend(pathway_cmd)); it is still computed
            # here to mirror the original statement sequence.
            diff_inf_prefix = prefix if 'UP' in prefix else prefix.split('.')[0]
            diff_inf_file = path.join(
                compare_diff_dir,
                '{}.edgeR.DE_results.txt'.format(diff_inf_prefix))

            kegg_output = path.join(
                compare_out_dir, '%s.kegg.enrichment.txt' % prefix)
            blast_table = path.join(blast_out_dir, '%s.blasttab' % prefix)
            extract_cmd = 'python %s --id %s --table %s --output %s' % (
                EXTRACT_INF_BY_ID, gene_list, self.all_blast_out, blast_table)
            kegg_cmd = self.generate_kobas(blast_table, kegg_output)

            python_tools.circ_call_process(extract_cmd)
            commands.append(extract_cmd)
            if not path.exists(blast_table):
                commands.append("## {} not exists!".format(blast_table))
                continue

            python_tools.circ_call_process(kegg_cmd)
            commands.append(kegg_cmd)
            if not path.exists(kegg_output):
                commands.append("## {} not exists!".format(kegg_output))
                continue

            self.treat_KEGG_table(kegg_output)
            txt_to_excel(kegg_output)
    return commands
import sys
import os
import python_tools

# Usage: <script> sample_list qc_dir out_dir
if not len(sys.argv) == 4:
    # Call form prints the same text on Python 2 and also parses on Python 3.
    print('python ' + sys.argv[0] + ' sample_list qc_dir out_dir')
    sys.exit(0)

sample_list_file = sys.argv[1]
qc_dir = sys.argv[2]
out_dir = sys.argv[3]

# One sample id per line; close the handle as soon as the list is built.
with open(sample_list_file) as sample_list_file_inf:
    sample_list = [each.strip() for each in sample_list_file_inf]

sample_info_dict = {}
reads_quality_dir = os.path.join(out_dir, 'reads_quality')
python_tools.circ_mkdir_unix(reads_quality_dir)

# Merged table for all samples; the handle stays open because it is used
# again after the header is written.
merged_quality_file = os.path.join(reads_quality_dir, 'all.reads_quality.txt')
merged_quality_file_inf = open(merged_quality_file, 'w')
merged_quality_file_inf.write('Sample_ID\tQuality\tCount\tPercent\n')

for each_sample in sample_list:
    # Per-sample accumulator: three counters and a list.
    # NOTE(review): the meaning of each slot is not visible here — confirm.
    sample_info_dict[each_sample] = [0, 0, 0, []]
    reads_quality_dict = {}
    each_reads_quality_file = os.path.join(
        reads_quality_dir, '{}_reads_quality.txt'.format(each_sample))
    # Two FastQC result directories per sample, suffixed _1 and _2
    # (presumably the two reads of a pair — confirm).
    for n in (1, 2):
        each_qc_dir = os.path.join(qc_dir, '%s_%s.clean.fq_fastqc' %
                                   (each_sample, n))
        each_qc_file = os.path.join(each_qc_dir, 'fastqc_data.txt')
        with open(each_qc_file) as each_qc_file_info:
            q30_flag = 0
out_dir = sys.argv[4]
# One comparison name per line; close the handle once the list is built.
with open(compare_file) as compare_file_inf:
    compare_list = [each.strip() for each in compare_file_inf]


def run_cmd(cmd):
    """Run *cmd* (an argument list, no shell) and return its captured stdout.

    ``communicate()`` both drains the pipe and waits for the child to exit.
    Calling ``wait()`` first, as the earlier version did, can deadlock when
    the child writes more than the OS pipe buffer holds.
    """
    p = subprocess.Popen(cmd,
                         shell=False,
                         universal_newlines=True,
                         stdout=subprocess.PIPE)
    output = p.communicate()[0]
    return output


## cp diff table
for each in compare_list:
    each_de_results = os.path.join(
        diff_dir, 'genes.counts.matrix.{0}.edgeR.DE_results'.format(each))
    each_de_results_dir = os.path.join(out_dir, each)
    if not os.path.exists(each_de_results_dir):
        python_tools.circ_mkdir_unix(each_de_results_dir)
    each_de_results_out_tmp = os.path.join(
        each_de_results_dir, 'tmp.{0}.edgeR.DE_results.txt'.format(each))
    each_de_results_out = os.path.join(
        each_de_results_dir, '{0}.edgeR.DE_results.txt'.format(each))
    ## add table header
    cmd_list = []
    cmd_list.append(['cp', each_de_results, each_de_results_out_tmp])
    cmd_list.append(['python', '/home/lxgui/scripts/diff_table_add_header.py',
                     '--table', each_de_results_out_tmp,
                     '--add_info', 'Gene_ID'])
    ## volcano plot
    cmd_list.append(['Rscript', '/home/lxgui/scripts/Volcano_Plot_20160406.R',
                     each_de_results_out_tmp, each, each_de_results_dir,
                     '0.001', '2'])
    cmd_list.append(['python', '/home/lxgui/scripts/add_gene_anno_v2.py',
                     each_de_results_out_tmp, gene_anno, each_de_results_out])
    cmd_list.append(['rm', each_de_results_out_tmp])
    # The comparison name has the form "<A>_vs_<B>"; each side has its own
    # "<side>-UP" subset file.
    each_sub_reg_list = each.split('_vs_')
    for each_sub in each_sub_reg_list:
        name = '%s-UP' % each_sub
        # NOTE(review): [0] raises IndexError when no subset file matches.
        each_sub_diff_result = glob.glob(
            r'{0}/genes.counts.matrix.{1}.edgeR.DE_results.*.{2}.subset'.format(
                diff_dir, each, name))[0]
        each_sub_diff_out = os.path.join(
            each_de_results_dir, '{0}.{1}.subset.txt'.format(each, name))
sample_dict[sample_id] = RNAseq_tools.rawdata() sample_dict[sample_id].name = sample_id else: pass if each_fq_path.endswith( 'R1.fastq.gz' ) and each_fq_path not in sample_dict[sample_id].read1: sample_dict[sample_id].read1.append(each_fq_path) elif each_fq_path.endswith( 'R2.fastq.gz' ) and each_fq_path not in sample_dict[sample_id].read2: sample_dict[sample_id].read2.append(each_fq_path) else: pass python_tools.circ_mkdir_unix(args.out_dir) time_info = time.localtime() output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon, time_info.tm_mday, time_info.tm_hour, time_info.tm_min, time_info.tm_sec) md5_file = os.path.join(args.out_dir, 'fq_md5.txt') md5_list = [] for each in sample_dict: if args.nocheck: logging.info('get md5 of %s start' % each) log_list = sample_dict[each].get_dna_md5() logging.info('get md5 of %s finished' % each) else: logging.info('check md5 of %s start' % each) if args.type == 'dna':
import time
import json
import re

# Launch directory; the two output paths at the bottom are built under it.
cwd = os.getcwd()

# Command-line interface: all three options are mandatory.
parser = argparse.ArgumentParser()
parser.add_argument('--sample_map', help='id map wgc id to sample id', required=True)
parser.add_argument('--out_dir', help='output directory', required=True)
parser.add_argument('--rawdata_list', help='directory list file', required=True)
args = parser.parse_args()

python_tools.circ_mkdir_unix(args.out_dir)

# Timestamp used as the prefix of the generated file names. Fields are not
# zero-padded (e.g. 2016-4-6-9:5:3).
time_info = time.localtime()
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon, time_info.tm_mday, time_info.tm_hour, time_info.tm_min, time_info.tm_sec)

data_dir_list_file = args.rawdata_list
data_dir_list = python_tools.file_to_list(data_dir_list_file)

# The same map file is loaded in both directions.
# NOTE(review): presumably arguments 2 and 3 are the 1-based key and value
# columns of a tab-separated table — confirm against python_tools.table_to_dict.
wgc_to_sample_dict = python_tools.table_to_dict(args.sample_map, 1, 2, False, '\t')
sample_to_wgc_dict = python_tools.table_to_dict(args.sample_map, 2, 1, False, '\t')

# Paths of the timestamped copy script and raw-data info file.
cp_cmd = os.path.join(cwd, '%s_cp.sh' % output_time)
cp_data_info_file = os.path.join(cwd, '%s_rawdata.info' % output_time)
import sys
import os
import argparse
import re
import python_tools

# Command-line interface: input maf fasta and output directory, both mandatory.
parser = argparse.ArgumentParser()
parser.add_argument('--maf_fasta', help='maf file.', required=True)
#parser.add_argument('--species', help = 'maf species.', required = True)
parser.add_argument('--out_dir', help='output directory', required=True)
args = parser.parse_args()

out_dir = os.path.abspath(args.out_dir)
if not os.path.exists(out_dir):
    python_tools.circ_mkdir_unix(out_dir)

# Ids already seen, so each one is handled only the first time it appears.
id_dict = {}
outfile_list = os.path.join(out_dir, 'maf_fasta.list')
# Left open on purpose: not closed within the visible part of the script.
outfile_list_info = open(outfile_list, 'w')

with open(args.maf_fasta) as maf_fasta_info:
    for eachline in maf_fasta_info:
        eachline = eachline.strip()
        # Any line containing '>' is treated as a header line.
        if '>' in eachline:
            # The id is the second '|'-separated field once the character
            # before 'TU' is turned into '|'.
            # NOTE(review): '.' is an unescaped regex wildcard, so ANY
            # character before 'TU' is replaced — confirm a literal dot was
            # not intended.
            header = re.sub('.TU', '|TU', eachline)
            tr_id = header.split('|')[1]
            if tr_id not in id_dict:
                id_dict[tr_id] = 1
                # NOTE(review): nesting of this line was reconstructed from a
                # whitespace-collapsed source — confirm it belongs inside the
                # first-seen branch.
                each_tr_out_file = os.path.join(out_dir, '%s.fa' % tr_id)
from __future__ import division import sys import os import python_tools if not len(sys.argv) == 4: print ' print ' + sys.argv[ 0] + ' sRNA.analsys.dir sRNA.qc.summary sRNA.length.dir' sys.exit(1) sRNA_analysis_dir = sys.argv[1] out_qc_summary = sys.argv[2] sRNA_length_dir = sys.argv[3] python_tools.circ_mkdir_unix(sRNA_length_dir) sRNA_ori_summary_file = os.path.join(sRNA_analysis_dir, 'SampleSummary.xls') sRNA_data_dict = {} with open(sRNA_ori_summary_file) as sRNA_ori_summary_file_inf: for n, eachline in enumerate(sRNA_ori_summary_file_inf): if n != 0: eachline_inf = eachline.strip().split('\t') sample_id = eachline_inf[0] total_reads = int(eachline_inf[4]) mapped_reads = int(eachline_inf[5]) # mapping_rate = 100*round(mapped_reads/mapped_reads, 2) # mapping_rate_rep = '%s%%' mapping_rate sRNA_data_dict[sample_id] = [total_reads, mapped_reads] sample_length_merged_file = os.path.join(sRNA_length_dir, 'Sample.length.txt') sample_length_merged_file_inf = open(sample_length_merged_file, 'w')
required=True) parser.add_argument('--seq_dir_name', help='sequencing data names, sep with ","', required=True) parser.add_argument('--analysis_data_dir', help='directory store analysis data', required=True) parser.add_argument('--sample_map', help='file map sample id to wgc id', required=True) args = parser.parse_args() sample_map_dict = python_tools.table_to_dict(args.sample_map, 1, 2, False, '\t') python_tools.circ_mkdir_unix(args.analysis_data_dir) sample_data_dict = {} seq_dir_name_list = args.seq_dir_name.split(',') for each_name in seq_dir_name_list: each_dir = os.path.join(args.seq_data_dir, each_name) each_dir_files = os.listdir(each_dir) for each_file in each_dir_files: each_file_path = os.path.join(each_dir, each_file) if os.path.isdir(each_file_path): wgc_id = each_file sample_id = sample_map_dict[wgc_id] seq_files = os.listdir(each_file_path) if sample_id not in sample_data_dict: sample_data_dict[sample_id] = RNAseq_tools.rawdata() sample_data_dict[sample_id].name = sample_id
print ' python ' + sys.argv[ 0] + ' compare_list_file enrich_dir go_file out_dir' sys.exit(0) compare_list_file = sys.argv[1] enrich_dir = sys.argv[2] go_file = sys.argv[3] out_dir = sys.argv[4] compare_list = [each.strip() for each in open(compare_list_file)] for each in compare_list: each_enrich = os.path.join(enrich_dir, each) each_enrich_out = os.path.join(out_dir, each) if not os.path.exists(each_enrich_out): python_tools.circ_mkdir_unix(each_enrich_out) reg_list = ['ALL'] each_sub_reg_list = each.split('_vs_') reg_list.extend(each_sub_reg_list) for each_sub in reg_list: name = each_sub if each_sub != 'ALL': name = '%s-UP' % each_sub each_diff_file1 = os.path.join(each_enrich, '%s-target.list' % name) each_diff_file2 = os.path.join(each_enrich, '%s.list' % name) if os.path.exists(each_diff_file1): each_diff_file = each_diff_file1 else: each_diff_file = each_diff_file2 each_go_file1 = os.path.join(each_enrich, '%s.%s.GO.enrich.xls' % (each, name))