def process2(R1, R2, output, adapter, threads, libpath, mapper, minlen, trim5, counts, rRNA):
    """Run the per-sample pipeline for one pair of FASTQ files.

    Steps, in order: trimmomatic (paired-end trimming), hisat2 or STAR
    (mapping to a sorted BAM), stringtie (abundance estimation), and
    optionally featureCounts and the rRNA step. All results are written to
    ``<output>/<sample>/`` where ``<sample>`` is the basename of ``R1`` up to
    its first underscore.

    Args:
        R1: Path of the first-mate FASTQ file.
        R2: Path of the second-mate FASTQ file.
        output: Root output directory; a per-sample subdirectory is created.
        adapter: Adapter file name, looked up under
            ``<script dir>/../library/adapter/``.
        threads: Thread count passed to every external tool.
        libpath: Directory holding ``annotation.gtf`` plus the index used by
            the mapper (``genome_tran`` prefix for hisat2, ``--genomeDir``
            for STAR).
        mapper: Either ``'hisat2'`` or ``'STAR'``.
        minlen: Value for trimmomatic ``MINLEN``.
        trim5: Value for trimmomatic ``HEADCROP``.
        counts: When true, also run featureCounts on the mapped BAM.
        rRNA: When true, also call ``rapvis_rRNA.rRNA`` on the raw reads.

    Raises:
        ValueError: If ``mapper`` is not a supported value. This is checked
            before any work is done; previously an unknown mapper failed
            with an unbound ``MapOut`` only after trimming had already run.
        OSError: If the sample output directory cannot be created.

    Note:
        The exit status of the external commands is not checked.
    """
    if mapper not in ('hisat2', 'STAR'):
        raise ValueError(
            "Unsupported mapper %r: expected 'hisat2' or 'STAR'" % (mapper,))

    file_name = R1.split("/")[-1].split("_")[0]
    outdir = os.path.join(output, file_name)

    ### make directory
    # exist_ok tolerates concurrent jobs creating the same directory while
    # still surfacing real failures (e.g. permission errors).
    os.makedirs(outdir, exist_ok=True)

    prefix = os.path.join(outdir, file_name)
    out_R1_p = prefix + "_R1.fq.gz"
    out_R1_u = prefix + "_R1_unpaired.gz"
    out_R2_p = prefix + "_R2.fq.gz"
    out_R2_u = prefix + "_R2_unpaired.gz"
    out_log = prefix + "_trimmomatic.log"

    print("\n%s Processing: %s, %s" % (current_time(), R1, R2))
    realpath = sys.path[0]

    ### trimmomatic
    subprocess.call(
        "trimmomatic PE -threads %d -phred33 %s %s %s %s %s %s "
        "ILLUMINACLIP:%s/../library/adapter/%s:1:30:10:5 "
        "SLIDINGWINDOW:4:20 MINLEN:%d HEADCROP:%d 2> %s"
        % (threads, R1, R2, out_R1_p, out_R1_u, out_R2_p, out_R2_u,
           realpath, adapter, minlen, trim5, out_log),
        shell=True)

    if mapper == 'hisat2':
        ### Mapping by hisat2
        SummaryFile = prefix + "_hisat_summary.txt"
        MapOut = prefix + "_hisat_sort.bam"
        subprocess.call(
            "hisat2 -p %d -x %s/genome_tran -1 %s -2 %s -U %s,%s -t --dta "
            "--summary-file %s --new-summary|samtools sort -@ %d -m 10G -o %s"
            % (threads, libpath, out_R1_p, out_R2_p, out_R1_u, out_R2_u,
               SummaryFile, threads, MapOut),
            shell=True)
    else:
        ### Mapping by STAR
        STARprefix = prefix + "_STAR_"
        subprocess.call(
            "STAR --runThreadN %d --outSAMtype BAM SortedByCoordinate "
            "--genomeDir %s --readFilesIn %s %s --readFilesCommand zcat "
            "--outFileNamePrefix %s --quantMode GeneCounts "
            "--outFilterScoreMinOverLread 0.1 "
            "--outFilterMatchNminOverLread 0.1 --outFilterMatchNmin 0 "
            "--outFilterMismatchNmax 2"
            % (threads, libpath, out_R1_p, out_R2_p, STARprefix),
            shell=True)
        MapOut = prefix + "_STAR_Aligned.sortedByCoord.out.bam"  ## sorted bam file

    ### Asemble by stringtie
    print("%s Asemble ..." % current_time())
    stringtieGTF = prefix + '_stringtie.gtf'
    stringtieGene = prefix + '_gene_abund.tab'
    subprocess.call(
        "stringtie %s -e -G %s/annotation.gtf -p %d -o %s -A %s"
        % (MapOut, libpath, threads, stringtieGTF, stringtieGene),
        shell=True)

    ### Gene counts
    if counts:
        countOut = prefix + '_gene_counts.txt'
        subprocess.call(
            "featureCounts -a %s/annotation.gtf -o %s %s -t exon -g gene_name "
            "-T %d -Q 30 -p" % (libpath, countOut, MapOut, threads),
            shell=True)

    ### rRNA
    if rRNA:
        rapvis_rRNA.rRNA(R1, R2, output, threads)
def merge_profiles(name, output):
    """Wait for all jobs to finish, then merge per-sample abundance tables.

    Polls ``GetRunningTasks(name)`` every 10 seconds until it returns 0.
    Then reads every ``<output>/*/*gene_abund.tab`` file, skipping its first
    line, and collects column index 8 (TPM) keyed by column index 1. The
    sample label is the name of the directory containing each file. Missing
    gene/sample combinations are filled with 0 and rows are sorted by their
    total across samples, highest first.

    Args:
        name: Job name passed to ``GetRunningTasks``.
        output: Root output directory that holds the per-sample folders.

    Returns:
        Path of the written table, ``<output>/merge_gene_TPM.txt``.

    Raises:
        SystemExit: With status 1 when no abundance table is found.
    """
    # Block until the scheduler reports no remaining jobs for `name`.
    while True:
        n = GetRunningTasks(name)
        if n == 0:
            break
        print("%s, Waitiing for task finished, remaining %d tasks" %
              (current_time(), n))
        time.sleep(10)

    print("%s, Merging profiles ... " % current_time())
    files = sorted(glob.glob("%s/*/*gene_abund.tab" % (output)))
    if not files:
        print("\n### Merge profiles failed, it is not exsit in %s/*/ \n" %
              (output))
        # Same effect as the interactive `exit(1)` helper, without relying
        # on the `site` module having injected it.
        raise SystemExit(1)

    dict_merge = {}
    for f in files:
        k_RNA = f.split("/")[-2]  # sample = parent directory name
        with open(f) as handle:
            for line in islice(handle, 1, None):  # skip the header line
                fields = line.strip().split("\t")
                if fields == ['']:
                    # Blank line (e.g. trailing newline): nothing to merge.
                    continue
                dict_merge.setdefault(fields[1], {})[k_RNA] = float(fields[8])  # TPM

    df = DataFrame(dict_merge).T
    df = df.fillna(value=0)  ### fill NA to 0
    df_sum = DataFrame(df.sum(axis=1), columns=['sum'])
    df = df.join(df_sum)
    df = df.sort_values(by="sum", ascending=False)  ### sort by sum
    df.drop(['sum'], axis=1, inplace=True)

    merge_out = os.path.join(output, "merge_gene_TPM.txt")
    df.to_csv(merge_out, sep="\t", header=True, index=True,
              index_label="gene", float_format="%.2f")
    return merge_out
def merge_gene_counts(output):
    """Merge per-sample gene count tables into one table.

    Reads every ``<output>/*/*gene_counts.txt`` file, skipping its first two
    lines, and collects column index 6 (the count) keyed by column index 0.
    The sample label is the name of the directory containing each file.
    Missing gene/sample combinations are filled with 0, genes whose total is
    0 are dropped, and rows are sorted by total, highest first. The result is
    written to ``<output>/merge_gene_counts.txt``.

    This step is best-effort: problems are reported on stdout and the
    function returns normally instead of raising.

    Args:
        output: Root output directory that holds the per-sample folders.

    Returns:
        None.
    """
    print("%s, Merging Gene Counts ... " % current_time())
    files = sorted(glob.glob("%s/*/*gene_counts.txt" % (output)))
    if not files:
        # The only case the "not exist" message actually describes.
        print("\n### Merge Gene Counts failed, it is not exsit in %s/*/ \n" %
              (output))
        return

    try:
        dict_merge = {}
        for f in files:
            k_RNA = f.split("/")[-2]  # sample = parent directory name
            with open(f) as handle:
                for line in islice(handle, 2, None):  # skip the two leading lines
                    fields = line.strip().split("\t")
                    dict_merge.setdefault(fields[0], {})[k_RNA] = int(fields[6])  # count

        df = DataFrame(dict_merge).T
        df = df.fillna(value=0)  ### fill NA to 0
        df_sum = DataFrame(df.sum(axis=1), columns=['sum'])
        df = df.join(df_sum)
        df = df[df['sum'] > 0]
        df = df.sort_values(by="sum", ascending=False)  ### sort by sum
        df.drop(['sum'], axis=1, inplace=True)

        merge_out2 = os.path.join(output, "merge_gene_counts.txt")
        df.to_csv(merge_out2, sep="\t", header=True, index=True,
                  index_label="gene", float_format="%.0f")
    except Exception as e:
        # Stay best-effort, but report what went wrong rather than claiming
        # the input files do not exist.
        print("\n### Merge Gene Counts failed in %s/*/ : %s \n" % (output, e))
def gene_dis(fi, output, libpath):
    """Plot gene detection and expression summaries from a merged TPM table.

    Reads ``fi`` as a tab-delimited table with a ``gene`` column and one
    column per sample, keeps only values greater than 0, and writes three
    PDFs with the prefix ``<output>/merge_gene_TPM``:

    * ``_species_type.pdf``: detected genes per sample, stacked by gene type
    * ``_species_EI.pdf``: detected genes per sample, stacked by expression
      interval
    * ``_density.pdf``: per-sample density of expression values (log scale)

    Gene types come from ``<libpath>/gene_type.txt`` (tab-separated). The
    type is read from column index 3; the key is column index 1 when the
    first gene identifier in the table starts with ``ENS``, otherwise column
    index 2. Types other than protein_coding, pseudogene, lincRNA and
    antisense are folded into ``pseudogene`` (when the name contains that
    word) or ``others``.

    Args:
        fi: Path of the merged TPM table.
        output: Directory the PDFs are written to.
        libpath: Directory containing ``gene_type.txt``.
    """
    print("%s, Caculating gene expression pattern ... " % current_time())
    data = pd.read_table(fi, header=0)
    prefix = os.path.join(output, 'merge_gene_TPM')

    ### long format: one row per (gene, sample) with value > 0
    data_melt = data.melt('gene', var_name='sample')
    data_melt = data_melt.query('value>0')
    data_melt.index = data_melt['gene']

    ### Gene species by gene type
    type_list = ("protein_coding", "pseudogene", "lincRNA", "antisense")

    def _collapse(biotype):
        """Map a raw gene type onto one of the five plotted categories."""
        if biotype in type_list:
            return biotype
        if "pseudogene" in biotype:
            return 'pseudogene'
        return 'others'

    # Categories are collapsed while parsing. The previous approach rewrote
    # the Series afterwards with integer keys on a string-labelled index,
    # which relies on a positional fallback that pandas has deprecated.
    gene_type = {}
    with open("%s/gene_type.txt" % libpath) as f:
        x = str(data_melt.index[0])
        key_col = 1 if x.startswith("ENS") else 2
        for line in f:
            line = line.strip().split("\t")
            gene_type[line[key_col]] = _collapse(line[3])
    gene_type = pd.Series(gene_type, name='gene_type', dtype="string")

    # Categories
    data_melt2 = pd.merge(data_melt, gene_type, how='left', sort=False,
                          right_index=True, left_index=True)
    cat_type = CategoricalDtype(categories=data.columns[1:], ordered=True)
    data_melt2['sample'] = data_melt2['sample'].astype(cat_type)

    # set width and height
    height = 6
    fontsize = 15
    # NOTE(review): the previous code derived a log-scaled `width` from
    # data.shape[0] and then set `aspect = width / width`, i.e. always 1.0.
    # The intent was presumably `width / height`; the effective value is
    # kept here so the figures do not change. Confirm before altering.
    aspect = 1.0

    def _save_stacked_plot(suffix):
        """Label, save and close the current stacked per-sample figure."""
        plt.xticks(rotation=90)
        plt.xlabel('Samples', fontsize=fontsize)
        plt.ylabel('Gene species', fontsize=fontsize)
        plt.savefig(prefix + suffix, bbox_inches='tight')
        plt.close()

    colors = list(reversed(sns.color_palette()[0:5]))
    hue_order = [
        "others", "pseudogene", "antisense", "lincRNA", "protein_coding"
    ]
    sns.displot(data_melt2, x="sample", hue="gene_type", hue_order=hue_order,
                palette=colors, multiple="stack", shrink=.8, height=height,
                aspect=aspect)
    _save_stacked_plot("_species_type.pdf")

    ### Gene species by expression interval
    values = pd.cut(
        data_melt['value'], [0, 1, 5, 10, 50, 100, 1000, 1000000],
        labels=['0~1', '1~5', '5~10', '10~50', '50~100', '100~1000', '>1000'])
    data_melt = data_melt.copy()  ### For SettingWithCopyWarning
    data_melt['ExpressionInterval'] = values
    sns.displot(data_melt, x="sample", hue="ExpressionInterval",
                multiple="stack", shrink=.8, height=height, aspect=aspect)
    _save_stacked_plot("_species_EI.pdf")

    ### expression density
    sns.kdeplot(data=data_melt, x="value", hue='sample', log_scale=True,
                common_norm=False)
    plt.xlabel('log10(TPM)', fontsize=15)
    out_box = prefix + "_density.pdf"
    plt.savefig(out_box, bbox_inches='tight')
    plt.close()
'-trim5', default=0, type=int, metavar='N', help='remove N bases from the begining of each read (default:0)') parser.add_argument('--counts', action='store_true', help='Get gene counts') parser.add_argument('--rRNA', action='store_true', help='whether mapping to rRNA(Human)') parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.0.2') args = parser.parse_args() print("\n%s ..... Start RNAseq processing" % (current_time())) start_time = time.time() process(args.input, args.output, args.adapter, args.threads, args.libraryPath, args.mapper, args.minlen, args.trim5, args.counts, args.rRNA) ### end_time = time.time() run_time = round((end_time - start_time) / 60, 5) print("\n%s ..... Finished all. Used time: %s m\n" % (current_time(), run_time))
def SubmitTask(fi, output, adapter, threads, libpath, mapper, tasks, name,
               minlen, trim5, queue, counts, rRNA):
    """Submit one ``rapvis_process.py`` job per FASTQ pair through ``qsub``.

    FASTQ files directly under ``fi`` (names ending in ``fastq``,
    ``fastq.gz``, ``fq.gz`` or ``fq``) are sorted and paired consecutively:
    files 0 and 1 form the first sample, 2 and 3 the second, and so on. For
    each pair a job script ``tmp.sh`` is written in the current directory,
    submitted, and deleted. Submission pauses while
    ``GetRunningTasks(name)`` reports at least ``tasks`` jobs.

    Args:
        fi: Directory containing the FASTQ files.
        output: Output directory forwarded to the job (``-o``).
        adapter: Adapter file name forwarded to the job (``-a``).
        threads: Thread count forwarded to the job (``-p``).
        libpath: Library path forwarded to the job (``-lib``).
        mapper: Mapper name forwarded to the job (``-m``).
        tasks: Maximum number of jobs allowed to run at once.
        name: Base job name; jobs are named ``<name>.<k>`` and log to
            ``<name>.o`` / ``<name>.e``.
        minlen: Forwarded as ``--minlen``.
        trim5: Forwarded as ``--trim5``.
        queue: Queue passed to ``qsub -q``.
        counts: When true, ``--counts`` is added to the job command.
        rRNA: When true, ``--rRNA`` is added to the job command.

    Raises:
        ValueError: If the number of FASTQ files is odd. This is checked
            before anything is submitted; previously the last sample failed
            with an IndexError after the others had already been queued.
    """
    import os  # local import: only needed here, to remove the job script

    ### get the data with fastq format
    fastq_suffixes = ('fastq', 'fastq.gz', 'fq.gz', 'fq')
    files = sorted(
        f for f in glob.glob("%s/*" % fi) if f.endswith(fastq_suffixes))
    if len(files) % 2:
        raise ValueError(
            "Expected paired-end input but found %d FASTQ files in %s"
            % (len(files), fi))

    # Computed once; the parameters themselves are no longer rebound to
    # strings inside the loop.
    rRNA_flag = '--rRNA' if rRNA else ''
    counts_flag = '--counts' if counts else ''
    realpath = sys.path[0]
    tmp = "tmp.sh"
    total = len(files) // 2

    for f_num, i in enumerate(range(0, len(files), 2), start=1):
        ### throttle: wait until fewer than `tasks` jobs are running
        while GetRunningTasks(name) >= tasks:
            time.sleep(10)
            print("%s, Submitted Tasks: %d, total: %d" %
                  (current_time(), f_num - 1, total))

        R1 = files[i]
        R2 = files[i + 1]
        jobName = name + '.' + str(f_num)
        with open(tmp, 'w') as script:
            script.write("#!/bin/bash\n")
            script.write("#$ -o %s.o\n" % name)
            script.write("#$ -e %s.e\n" % name)
            script.write("#$ -N %s\n" % jobName)
            script.write("source ~/.bashrc\n")
            script.write("source ~/.bash_profile\n")
            script.write(
                "python %s/rapvis_process.py -R1 %s -R2 %s -o %s -a %s -p %d -lib %s -m %s --minlen %d --trim5 %d %s %s\n"
                % (realpath, R1, R2, output, adapter, threads, libpath,
                   mapper, minlen, trim5, counts_flag, rRNA_flag))

        try:
            subprocess.call("qsub -cwd -q %s %s" % (queue, tmp), shell=True)
        finally:
            # Remove the script even if launching qsub raised.
            os.remove(tmp)
        time.sleep(1)