def get_repeat_gff(outfile):
    """Download the repetitive RNA types from UCSC into ``outfile``.

    The repeat classes to fetch come from the ``ucsc_rnatypes`` setting and
    the contig filter from ``ucsc_remove_contigs``; the actual retrieval is
    delegated to ``ModuleTrna.getRepeatDataFromUCSC``.
    """
    ucsc_handle = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ucsc_remove_contigs"]

    ModuleTrna.getRepeatDataFromUCSC(
        dbhandle=ucsc_handle,
        repclasses=rna_classes,
        outfile=outfile,
        remove_contigs_regex=contig_filter,
        job_memory="3G",
    )
def merge_features(infiles, outfile):
    """Combine the featurecounts outputs into one table for all samples.

    ``infiles`` is handed to ``ModuleTrna.merge_feature_data``; the merged
    table is written to ``outfile`` as tab-separated text, keeping both the
    header row and the index column.
    """
    ModuleTrna.merge_feature_data(infiles).to_csv(
        outfile,
        sep="\t",
        header=True,
        index=True,
    )
def process_reads(infile, outfile):
    """Run trimmomatic quality trimming, or copy the input through unchanged.

    When the ``trimmomatic_run`` setting is falsy the input file is simply
    copied to ``outfile``. Otherwise an ``ILLUMINACLIP`` step is assembled
    from the adapter/mismatch/threshold settings, placed in front of the
    user-supplied ``trimmomatic_options`` and passed on to
    ``ModuleTrna.process_trimmomatic``.
    """
    if not PARAMS["trimmomatic_run"]:
        # The placeholders are not filled in here: P.run receives only the
        # template, so they must keep matching this function's argument names.
        statement = "cp %(infile)s %(outfile)s"
        P.run(statement)
        return

    extra_options = PARAMS["trimmomatic_options"]

    clip_fields = ["ILLUMINACLIP"]
    for key in ("trimmomatic_adapter",
                "trimmomatic_mismatches",
                "trimmomatic_p_thresh",
                "trimmomatic_c_thresh"):
        clip_fields.append(str(PARAMS[key]))
    clip_step = ":".join(clip_fields)

    # The clipping step comes first, tab-separated from the remaining options.
    trimmomatic_options = clip_step + "\t" + extra_options

    ModuleTrna.process_trimmomatic(
        infile, outfile, PARAMS["trimmomatic_phred"], trimmomatic_options)
def merge_idx_stats(infiles, outfile):
    """Merge the per-sample count files into a single compressed table.

    ``infiles`` is handed to ``ModuleTrna.merge_counts_data``; the result is
    written to ``outfile`` as gzip-compressed, tab-separated text.
    """
    ModuleTrna.merge_counts_data(infiles).to_csv(
        outfile,
        sep="\t",
        compression="gzip",
    )
def connectToUCSC():
    """Return a UCSC connection built from the ``ucsc_*`` pipeline settings.

    Host, user and database are read from ``PARAMS`` and forwarded to
    ``ModuleTrna.connectToUCSC``.
    """
    connection_settings = {
        "host": PARAMS["ucsc_host"],
        "user": PARAMS["ucsc_user"],
        "database": PARAMS["ucsc_database"],
    }
    return ModuleTrna.connectToUCSC(**connection_settings)
def _sample_name(path):
    """Return the file name of ``path`` up to its first dot."""
    return os.path.basename(path).split('.')[0]


def build_bam_stats(infiles, outfile):
    '''Count number of reads mapped, duplicates, etc.

    Excludes regions overlapping repetitive RNA sequences.

    Parameters
    ----------
    infiles : list
        Either exactly three entries ``[bam, nreads, rna_file]`` (single
        sample), or ``[bam, nreads_1, ..., nreads_n, rna_file]``. In the
        second form the ``.nreads`` file whose name (up to the first dot)
        equals that of the bam file is used.
        The first entry is a filename in :term:`bam` format, the middle
        entries are files with the number of reads per sample, and the last
        entry is the file of repetitive RNA regions to mask.
    outfile : str
        Output filename with read stats. Additional outputs are written
        using ``outfile`` as a filename prefix.

    Raises
    ------
    ValueError
        If several candidate ``.nreads`` files are given and none of them
        matches the bam file name.
    '''
    # Not referenced again in this function. NOTE(review): presumably picked
    # up by P.run from this scope as the job's memory request - do not remove
    # without confirming.
    job_memory = "32G"

    if len(infiles) == 3:
        # Only one sample
        bamfile, readsfile, rna_file = infiles
    else:
        # Multiple samples: select the .nreads file belonging to this bam
        # file by comparing file names (directory and extensions ignored).
        bamfile = infiles[0]
        rna_file = infiles[-1]
        bam_name = _sample_name(bamfile)

        readsfile = None
        for candidate in infiles[1:-1]:
            if _sample_name(candidate) == bam_name:
                readsfile = candidate
                break

        if readsfile is None:
            raise ValueError(
                "no .nreads file matching bam file %r (sample %r) found in %r"
                % (bamfile, bam_name, list(infiles[1:-1])))

    nreads = ModuleTrna.getNumReadsFromReadsFile(readsfile)
    track = P.snip(os.path.basename(readsfile), ".nreads")

    # if a fastq file exists, submit for counting
    fastq_option = ""
    for suffix in (".fastq.gz", ".fastq.1.gz"):
        if os.path.exists(track + suffix):
            fastq_option = "--fastq-file=%s" % (track + suffix)
            break

    # The placeholders are not interpolated here: P.run is handed only the
    # template, so they must keep matching the local names above
    # (fastq_option, rna_file, nreads, outfile, bamfile) - do not rename.
    statement = (
        " cgat bam2stats %(fastq_option)s --force-output"
        " --mask-bed-file=%(rna_file)s --ignore-masked-reads"
        " --num-reads=%(nreads)i"
        " --output-filename-pattern=%(outfile)s.%%s"
        " < %(bamfile)s > %(outfile)s "
    )

    P.run(statement)