Пример #1
0
def get_repeat_gff(outfile):
    """Download the configured repetitive RNA types from UCSC.

    This task only gathers arguments; the actual work is delegated to
    ``ModuleTrna.getRepeatDataFromUCSC``.

    Parameters
    ----------
    outfile :
        Output destination, handed through unchanged.
    """
    # Evaluated in the same order as the keyword arguments below so that a
    # failing connection is reported before any configuration lookup.
    ucsc_handle = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ucsc_remove_contigs"]

    ModuleTrna.getRepeatDataFromUCSC(
        dbhandle=ucsc_handle,
        repclasses=rna_classes,
        outfile=outfile,
        remove_contigs_regex=contig_filter,
        job_memory="3G")
Пример #2
0
def merge_features(infiles, outfile):
    """Combine the featurecounts outputs of all samples into one table.

    The merged table returned by ``ModuleTrna.merge_feature_data`` is
    written to *outfile* as tab-separated text, including both the header
    row and the index column.
    """
    merged_table = ModuleTrna.merge_feature_data(infiles)
    merged_table.to_csv(outfile, index=True, header=True, sep="\t")
Пример #3
0
def process_reads(infile, outfile):
    """
    Run trimmomatic quality trimming, or copy the input when it is disabled.

    When ``PARAMS["trimmomatic_run"]`` is falsy, *infile* is simply copied
    to *outfile*. Otherwise an ``ILLUMINACLIP`` step built from the
    adapter/mismatch/threshold settings is put in front of the
    user-supplied trimmomatic options and the trimming itself is delegated
    to ``ModuleTrna.process_trimmomatic``.
    """
    if not PARAMS["trimmomatic_run"]:
        # %(infile)s / %(outfile)s are not filled in here; the statement is
        # handed to P.run() with the placeholders intact.
        statement = "cp %(infile)s %(outfile)s"
        P.run(statement)
        return

    user_options = PARAMS["trimmomatic_options"]

    clip_step = "ILLUMINACLIP:%s:%s:%s:%s" % (
        PARAMS["trimmomatic_adapter"],
        PARAMS["trimmomatic_mismatches"],
        PARAMS["trimmomatic_p_thresh"],
        PARAMS["trimmomatic_c_thresh"])

    # The clipping step goes first, separated from the rest by a tab.
    trimmomatic_options = clip_step + "\t" + user_options

    phred = PARAMS["trimmomatic_phred"]

    ModuleTrna.process_trimmomatic(
        infile, outfile, phred, trimmomatic_options)
Пример #4
0
def merge_idx_stats(infiles, outfile):
    """Merge the count data of *infiles* into one gzip-compressed table.

    The frame returned by ``ModuleTrna.merge_counts_data`` is written to
    *outfile* as tab-separated text with gzip compression.
    """
    merged_counts = ModuleTrna.merge_counts_data(infiles)
    merged_counts.to_csv(outfile, compression="gzip", sep="\t")
Пример #5
0
def connectToUCSC():
    """Return a UCSC connection built from the ``ucsc_*`` entries of PARAMS.

    Host, user and database are read from ``PARAMS`` and forwarded to
    ``ModuleTrna.connectToUCSC``; its return value is passed back as is.
    """
    connection_settings = {
        "host": PARAMS["ucsc_host"],
        "user": PARAMS["ucsc_user"],
        "database": PARAMS["ucsc_database"],
    }
    return ModuleTrna.connectToUCSC(**connection_settings)
Пример #6
0
def _sample_name(path):
    """Return the file name of *path* (directories dropped) up to its first dot."""
    return os.path.basename(path).split('.')[0]


def build_bam_stats(infiles, outfile):
    '''Count number of reads mapped, duplicates, etc. with ``cgat bam2stats``.

    Reads overlapping the regions of the mask file are excluded
    (``--mask-bed-file`` combined with ``--ignore-masked-reads``).

    Parameters
    ----------
    infiles : list
        Either exactly ``[bamfile, readsfile, rna_file]`` (one sample) or
        ``[bamfile, readsfile_1, ..., readsfile_n, rna_file]``. In the
        second form the first reads file whose sample name (file name up
        to the first dot) equals that of the bam file is used.
    infiles[0] : str
       Input filename in :term:`bam` format
    infiles[1:-1] : str
       Input filename(s) with number of reads per sample
    infiles[-1] : str
       File with repetitive RNA regions, passed as ``--mask-bed-file``
    outfile : str
       Output filename with read stats

    Raises
    ------
    ValueError
        If fewer than three input files are given, or if none of the
        reads files matches the bam file.
    '''

    # Deliberately kept although it is not referenced in this function:
    # presumably P.run() picks it up from this frame as the job's memory
    # request -- confirm before removing.
    job_memory = "32G"

    if len(infiles) == 3:
        # Only one sample
        bamfile, readsfile, rna_file = infiles
    elif len(infiles) > 3:
        # Multiple samples: pick the .nreads file whose sample name matches
        # the bam file. Only the file name is compared, so the number of
        # directory levels in the paths does not matter.
        bamfile = infiles[0]
        rna_file = infiles[-1]
        bam_name = _sample_name(bamfile)
        readsfile = next(
            (candidate for candidate in infiles[1:-1]
             if _sample_name(candidate) == bam_name),
            None)
        if readsfile is None:
            raise ValueError(
                "no reads file matching bam file '%s' found among: %s" % (
                    bamfile, ", ".join(infiles[1:-1])))
    else:
        raise ValueError(
            "expected at least 3 input files (bam, reads, rna), got %i" %
            len(infiles))

    nreads = ModuleTrna.getNumReadsFromReadsFile(readsfile)
    track = P.snip(os.path.basename(readsfile), ".nreads")

    # if a fastq file exists (looked up relative to the working directory),
    # submit it for counting
    if os.path.exists(track + ".fastq.gz"):
        fastqfile = track + ".fastq.gz"
    elif os.path.exists(track + ".fastq.1.gz"):
        fastqfile = track + ".fastq.1.gz"
    else:
        fastqfile = None

    if fastqfile is not None:
        fastq_option = "--fastq-file=%s" % fastqfile
    else:
        fastq_option = ""

    # The %(name)s placeholders are not filled in here; the statement is
    # handed to P.run() as is, so the local names they refer to
    # (fastq_option, rna_file, nreads, outfile, bamfile) must not be renamed.
    statement = '''
    cgat bam2stats
         %(fastq_option)s
         --force-output
         --mask-bed-file=%(rna_file)s
         --ignore-masked-reads
         --num-reads=%(nreads)i
         --output-filename-pattern=%(outfile)s.%%s
    < %(bamfile)s
    > %(outfile)s
    '''

    P.run(statement)