def process_remote(infile):

    repository, acc = iotools.open_file(infile).readlines()[0].strip().split()

    if repository == "ENCODE":
        location, filetype = get_encode_file(acc)
    elif repository == "URL":
        location = acc
        if acc.endswith("gz"):
            filetype = ".".join(acc.split(".")[-2])
        else:
            filetype = acc.split(".")[-1]
    else:
        raise ValueError("repository %s not yet supported" % repository)

    tmpfile = P.get_temp_filename(shared=False, suffix="." + filetype)

    preamble = "wget %(location)s -O %(tmpfile)s --quiet &&"
    postamble = "&&  rm %(tmpfile)s"

    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += ''' zcat %(tmpfile)s | sort -k1,1 -k2,2n | bgzip > %(tmp2)s &&
                        mv %(tmp2)s %(tmpfile)s &&
                        tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype
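# A minimal usage sketch (not from the source): a downstream task presumably
# splices the returned preamble and postamble around its own command so the
# remote file is fetched, used and cleaned up in one statement. The bedtools
# call and "regions.bed" are illustrative assumptions.
def example_coverage_from_remote(infile, outfile):
    preamble, postamble, tmpfile, filetype = process_remote(infile)
    statement = (preamble
                 + " bedtools coverage -a %(tmpfile)s -b regions.bed > %(outfile)s " % locals()
                 + postamble)
    P.run(statement)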
Example #2
def removeDuplicates(fastq1, outfile):
    '''Filter exact duplicates, if specified in config file'''

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, FASTQ1_SUFFIX) + FASTQ2_SUFFIX
        outfile1 = P.snip(outfile, '.gz')
        outfile2 = P.snip(outfile1, '.fastq.1') + '.fastq.2'
        logfile = P.snip(outfile1, '.fastq.1') + '.log'
        cluster_file = P.snip(outfile1, '1') + '*.clstr'

        to_filter = PARAMS['cdhit_dedup']
        if to_filter:
            tmpf1 = P.get_temp_filename('.')
            tmpf2 = P.get_temp_filename('.')
            statement = ("zcat %(fastq1)s > %(tmpf1)s &&"
                         " zcat %(fastq2)s > %(tmpf2)s &&"
                         " cd-hit-dup"
                         "  -i %(tmpf1)s"
                         "  -i2 %(tmpf2)s"
                         "  -o %(outfile1)s"
                         "  -o2 %(outfile2)s"
                         "  %(cdhit_options)s"
                         " &> %(logfile)s &&"
                         " gzip %(outfile1)s &&"
                         " gzip %(outfile2)s &&"
                         " gzip %(logfile)s &&"
                         " rm -f %(tmpf1)s &&"
                         " rm -f %(tmpf2)s &&"
                         " rm -f %(cluster_file)s")
            P.run(statement, job_options=PARAMS['cdhit_run_options'])
        else:
            E.warn('Deduplication step is being skipped for: %s' % fastq1)
            symlnk(fastq1, outfile)
            symlnk(fastq2, outfile2 + '.gz')

    else:
        outfile1 = P.snip(outfile, '.gz')
        logfile = P.snip(outfile1, '.fastq.1') + '.log'
        cluster_file = P.snip(outfile1, '1') + '*.clstr'

        to_filter = PARAMS['preprocess_dedup']
        if to_filter:
            tmpf1 = P.get_temp_filename('.')
            statement = ("zcat %(fastq1)s > %(tmpf1)s"
                         " cd-hit-dup"
                         "  -i %(tmpf1)s"
                         "  -o %(outfile1)s"
                         "  %(cdhit_options)s"
                         " &> %(logfile)s &&"
                         " gzip %(outfile1)s &&"
                         " gzip %(logfile)s &&"
                         " rm -f %(tmpf1)s &&"
                         " rm -f %(cluster_file)s")

            P.run(statement, job_options=PARAMS['cdhit_run_options'])
        else:
            E.warn('Deduplication step is being skipped for: %s' % fastq1)
            symlnk(fastq1, outfile)
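# removeDuplicates relies on a symlnk() helper that this listing does not
# show. A minimal sketch of what it presumably does: point the expected
# output path at the existing input so downstream tasks still find a file.
def symlnk(infile, outfile):
    try:
        os.symlink(os.path.abspath(infile), outfile)
    except FileExistsError:
        # link left over from a previous run
        pass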
Example #3
def assembleWithStringTie(infiles, outfile):

    infile, reference = infiles
    basefile = os.path.basename(infile)
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]
    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    statement = '''portcullis full
                       -t 1
                       -o portcullis/%(basefile)s/
                       -r %(portcullis_bedref)s
                       -b
                       %(portcullis_fastaref)s
                       %(infile)s &&
                   mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&
                   rm -r portcullis/%(basefile)s/ &&
                   stringtie %(tmpfile)s
                       -p %(stringtie_threads)s
                       -G <(zcat %(reference)s)
                       %(stringtie_options)s
                       2> %(outfile)s.log
                   | gzip > %(outfile)s &&
                   rm %(tmpfile)s'''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join([
            "mkdir -p %(tmpfilename)s", s, statement, "rm -r %(tmpfilename)s"
        ])

    P.run(statement, job_condaenv="portcullis")
Example #4
    def merge_bw(infiles, outfile):
        """Merge bigWigs using mergeBigWig"""

        infiles = " ".join(infiles)

        tmpfile = P.get_temp_filename()
        tmpfile2 = P.get_temp_filename()

        statement = '''bigWigMerge %(infiles)s %(tmpfile)s &&
                       LC_COLLATE=C sort -k1,1 -k2,2n -o %(tmpfile2)s %(tmpfile)s &&
                       bedGraphToBigWig %(tmpfile2)s %(contig_file)s %(outfile)s'''

        P.run(statement)
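# bedGraphToBigWig expects %(contig_file)s to be a two-column
# "contig<TAB>size" table. Assuming a samtools faidx index for the genome
# exists, it could be derived as below (a sketch; the pipeline may build
# this file elsewhere).
def build_contig_sizes(fasta_index, contig_file):
    # the first two columns of a .fai index are contig name and length
    with iotools.open_file(fasta_index) as inf, \
            iotools.open_file(contig_file, "w") as outf:
        for line in inf:
            fields = line.split("\t")
            outf.write("%s\t%s\n" % (fields[0], fields[1]))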
Example #5
def remove_reads(infiles, outfile):
    """remove all of the reads mapping at least once to the genome"""

    infile, pre_trna_genome = infiles

    temp_file = P.get_temp_filename(".")
    temp_file1 = P.get_temp_filename(".")

    statement = """samtools view -h %(infile)s> %(temp_file)s && 
                   perl %(cribbslab)s/perl/removeGenomeMapper.pl %(pre_trna_genome)s %(temp_file)s %(temp_file1)s &&
                   samtools view -b %(temp_file1)s > %(outfile)s"""

    job_memory = "50G"
    P.run(statement)
    os.unlink(temp_file)
    os.unlink(temp_file1)
Example #6
def buildGff(infile, outfile):
    '''Creates a gff for DEXSeq

    This takes the gtf and flattens it to an exon-based input as
    required by DEXSeq. The required python script is provided by
    DEXSeq and uses HTSeq.

    Parameters
    ----------

    infile : string
       Input filename in :term:`gtf` format

    outfile : string
        A :term:`gff` file for use in DEXSeq

    annotations_interface_geneset_all_gtf : string
       :term:`PARAMS`. Filename of :term:`gtf` file containing
       all ensembl annotations
    '''

    tmpgff = P.get_temp_filename(".")
    statement = "gunzip -c %(infile)s > %(tmpgff)s"
    P.run(statement)

    ps = PYTHONSCRIPTSDIR
    statement = '''python %(ps)s/dexseq_prepare_annotation.py
                %(tmpgff)s %(outfile)s'''
    P.run(statement, job_condaenv="splicing")

    os.unlink(tmpgff)
Example #7
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        iotools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
Example #8
    def run(self, infiles, outfile, params):

        files = " ".join(infiles)

        job_threads = params.job_threads

        # todo:
        # 1. add header.
        # 2. do batch+merge sort in order to avoid hitting temporary space limits.
        # 3. remove unnecessary info fields while sorting, add them later.

        # clear=True so the mkdir below does not collide with the file
        # created by get_temp_filename
        tmpdir = P.get_temp_filename(clear=True)
        retval = P.run(
            "mkdir {tmpdir}; "
            "bcftools view -h {infiles[0]} "
            "| cut -f 1-10 "
            "| bgzip > {outfile}; "
            "zcat {files} "
            "| awk -v OFS='\\t' "
            "'!/^#/ && $5 != \"<NON_REF>\" "
            "{{$8=\".\";$9=\".\";$6=\".\";$7=\"GT\";$10=\".\"; print}}' "
            "2> {outfile}.filter.log "
            "| sort -k1,1V -k2,2n "
            "--parallel {job_threads} "
            "-T {tmpdir} "
            "2> {outfile}.sort.log "
            "| uniq "
            "| bgzip "
            ">> {outfile}; "
            "tabix -p vcf {outfile}; "
            "rm -rf {tmpdir} ".format(**locals()))
Example #9
def map_with_bowtie(infiles, outfile):
    """
    map reads with bowtie to get general alignment so features can be counted
    over RNA gene_biotypes
    """
    fastq, genome = infiles
    tmp_fastq = P.get_temp_filename(".")
    temp_file = P.get_temp_filename(".")
    genome = genome.replace(".fa", "")

    statement = """gzip -dc %(fastq)s > %(tmp_fastq)s && bowtie -k 10 -v 2 --best --strata --sam  %(genome)s  %(tmp_fastq)s 2> %(outfile)s_bowtie.log | samtools view -bS |
                   samtools sort -T %(temp_file)s -o %(outfile)s &&
                   samtools index %(outfile)s
                """
    job_memory = "15G"
    P.run(statement)
Example #10
    def run(self, infile, outfile, params):

        if params.reference_bed is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # requires a consistent sort order, so sort both files.
        # It also requires the chromosome content to be identical,
        # so restrict output to common sets.
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        statements = [stmnt]
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa "
                          "| bgzip "
                          "> {outfile}.shared.bed.gz")
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_test.bed.gz")
        statements.append("{params.path} intersect "
                          "-b {tmpf_test} "
                          "-a {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_truth.bed.gz")
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        for section in self.sections:
            statements.append(
                "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

        statement = "; ".join(statements)
        retval = P.run(statement.format(**locals()))

        # these are small files, so doing it here. Implement tabix.count()
        # method
        counts = dict()
        for section in self.sections:
            # with pysam.Tabixfile(outfile + "." + section + ".bed.gz") as inf:
            inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
            counts[section] = len(list(inf.fetch()))
            inf.close()

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("section\tcounts\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

        return retval
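# A sketch of the tabix count() helper suggested in the comment above:
# pysam has no built-in record count, so this streams the file instead of
# materialising the records in a list.
def count_tabix_records(filename):
    inf = pysam.TabixFile(filename)
    try:
        return sum(1 for _ in inf.fetch())
    finally:
        inf.close()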
Example #11
def buildBedGraph(infile, outfile):
    '''build bedGraph files from bam files.

    Generate a scaled :term:`bedGraph` file from a :term:`bam` alignment file.

    Parameters
    ----------
    infile : str
       Input filename in :term:`bam` format
    outfile : str
       Output filename in :term:`bedGraph` format
    annotations_interface_contigs : str
       :term:`PARAMS`
       Input filename in :term:`bed` format
    '''
    inf = infile[0]
    inf_name = inf.replace(".bam", "")
    idxstats = infile[1]

    # scale by Million reads mapped
    reads_mapped = Bamtools.getNumberOfAlignments(inf)

    for idx in idxstats:
        file_name = idx.replace(".idxstats", "")
        if file_name == inf_name:
            # pass to a function that extracts the number of reads aligned to
            # spike-in and human genome
            regex = PARAMS['quant_regex'] + "*"
            scale = ModuleQuantchip.getSpikeInReads(idx, str(regex))
            contig_sizes = ModuleQuantchip.getContigSizes(idx)

    tmpfile = P.get_temp_filename()
    tmpfile2 = P.get_temp_filename()
    job_memory = "30G"
    statement = '''bedtools genomecov
    -ibam %(inf)s
    -g %(contig_sizes)s
    -bg
    -scale %(scale)f
    > %(tmpfile)s &&
    sort -k1,1 -k2,2n -o %(tmpfile2)s %(tmpfile)s &&
    cat %(tmpfile2)s | grep chr  > %(outfile)s &&
    rm -f %(tmpfile)s %(tmpfile2)s
    '''
    P.run(statement)
Example #12
def downsample(infile, outfile):
    '''downsample fastq files using seqtk tool.'''

    tmp_file = P.get_temp_filename(".")
    statement = '''zcat %(infile)s > %(tmp_file)s &&
                   seqtk sample -2 -s100 %(tmp_file)s %(downsample_read)s | gzip > %(outfile)s'''

    job_memory = "30G"
    P.run(statement)
    os.unlink(tmp_file)
Example #13
def map_tran_gene(outfile):
    '''Add an identifier to the transcript IDs'''

    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $3}' > %(tmp_cdna)s &&
                   cat %(tmp_cdna)s | awk '{print $0"."NR}' > %(outfile)s'''

    P.run(statement)
    os.unlink(tmp_cdna)
Example #14
def map_tr2gene(infile, outfile):
    '''Map the transcripts to genes.'''

    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $3}' > %(tmp_cdna)s &&
                   awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' %(infile)s %(tmp_cdna)s  > %(outfile)s'''

    P.run(statement)
    os.unlink(tmp_cdna)
Example #15
def remove_reads(infiles, outfile):
    """remove all of the reads mapping at least once to the genome"""

    infile, pre_trna_genome = infiles

    temp_file = P.get_temp_filename(".")
    temp_file1 = P.get_temp_filename(".")

    PY_SRC_PATH = os.path.abspath(os.path.dirname(__file__))

    statement = """samtools view -h %(infile)s> %(temp_file)s && 
                   perl %(PY_SRC_PATH)s/perl/removeGenomeMapper.pl %(pre_trna_genome)s %(temp_file)s %(temp_file1)s &&
                   samtools view -b %(temp_file1)s > %(outfile)s""" % locals()

    job_memory = "50G"
    P.run(statement)
    os.unlink(temp_file)
    os.unlink(temp_file1)
Example #16
    def download(self, genes=None, fields=None, scope=None, species=None):
        '''
        download an up to date ontology file, parse the xml data into a
        Python "ElementTree" and delete the ontology file.
        '''
        ontologyfile = P.get_temp_filename(".")
        os.system("wget -O %s %s" % (ontologyfile, self.datasource))
        tree = ET.parse(ontologyfile)
        os.remove(ontologyfile)
        self.dataset = tree
Example #17
def bustools_sort(infile, outfile):
    """
    Generate a sorted bus file
    """

    tmp = P.get_temp_filename(".")

    statement = """bustools sort -T %(tmp)s -t %(kallisto_threads)s -o %(outfile)s %(infile)s/output.bus"""

    P.run(statement)
Example #18
def capture_list(outfile):
    '''Build the transcripts-to-capture list for the cDNA fasta'''

    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $3}' > %(tmp_cdna)s  &&
                   cat %(tmp_cdna)s  | tr "." " " | awk '{print $1}' > %(outfile)s '''

    P.run(statement)
    os.unlink(tmp_cdna)
Example #19
def intron_bed2fa(outfile):
    '''This converts introns bed to introns fa'''

    tmp_bed = P.get_temp_filename(".")

    statement = '''zcat < %(intron_bed)s > %(tmp_bed)s &&
    bedtools getfasta -name -fo %(outfile)s -fi %(genome_file)s -bed %(tmp_bed)s'''

    P.run(statement)
    os.unlink(tmp_bed)
Example #20
def find_intron_fa_header(infile, outfile):
    '''Fix the INTRONS FASTA header'''

    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat %(cdna)s > %(tmp_cdna)s &&
                   awk '{print ">"$1"."NR" gene_id:"$2" gene_name:"$3}' %(infile)s > geneset.dir/cDNA_fasta_header.txt &&
                   awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' geneset.dir/cDNA_fasta_header.txt %(tmp_cdna)s > %(outfile)s'''

    P.run(statement)
    os.unlink(tmp_cdna)
Example #21
def tss_gene_parse(infile, outfile):
    """Filter a gtf using gene lists and then outut them as gtf"""

    bedfile = PARAMS['geneexpression_tss']

    tmpfile = P.get_temp_filename()

    statement = """zcat %(bedfile)s | grep -f %(infile)s > %(tmpfile)s &&
                   cat %(tmpfile)s | awk '{$4 = "TSS"; print}' OFS='\\t' | gzip > %(outfile)s"""

    P.run(statement)
Example #22
def buildRefFlat(infile, outfile):
    '''build flat geneset for Picard RnaSeqMetrics.
    '''

    tmpflat = P.get_temp_filename(".")

    statement = '''
    gtfToGenePred -genePredExt -geneNameAsName2 %(infile)s %(tmpflat)s &&
    paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s)
    > %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_memory"])
    os.unlink(tmpflat)
Example #23
    def run(self, outfile, params):

        if "--threads" in params.options or "-t " in params.options:
            job_threads = int(re.search(r"(-t|--threads)\s*(\d+)",
                                        params.options).groups()[1])

        fastq = resolve_argument(params.fastq, ",").split(",")
        if len(fastq) == 1:
            fastq = '-U "{}"'.format(fastq[0])
        else:
            fastq = '-1 "{}" -2 "{}"'.format(*fastq)

        tmpdir = P.get_temp_filename(clear=True)

        if "index" in params._fields:
            index = params.index
        else:
            index = params.reference_fasta

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            # pipes.quote needs to be shlex.quote in py3
            readgroup_option = "--rg-id {}".format(readgroup_id)

            # add additional level of quoting and remove "ID:{}"
            readgroup_string = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
            readgroup_string = " ".join(["--rg {}".format(x)
                                         for x in readgroup_string.split("\t")])
        else:
            readgroup_option = ""
            readgroup_string = ""

        return P.run(
            "mkdir {tmpdir}; "
            "{self.path} "
            "{readgroup_option} "
            "{readgroup_string} "
            "{params.options} "
            "-x {index} "
            "{fastq} "
            "2> {outfile}.log "
            "| samtools view -b /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort -T {tmpdir} -O bam /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile}; "
            "rm -rf {tmpdir}".format(**locals()),
            **params._asdict())
Example #24
def buildReferenceKallisto(infiles, outfile):
    '''
    Builds a reference transcriptome and decoy sequences for alevin and kallisto

    Parameters
    ----------
    infiles: list
        paths to the primary transcript fasta files
    genome_dir: str
        :term: `PARAMS` the directory of the reference genome
    genome: str
        :term: `PARAMS` the filename of the reference genome (without .fa)
    outfile: str
        path to output file
    '''
    prim_trans1, prim_trans2 = infiles
    genome_file1 = PARAMS['genome1']

    if PARAMS['mixed_species']:
        genome_file2 = PARAMS['genome2']
        statement = '''
                       grep "^>" <(gunzip -c %(genome_file1)s %(genome_file2)s) | cut -d " " -f 1 > decoys.txt &&
                       sed -i.bak -e 's/>//g' decoys.txt &&
                       cat %(prim_trans1)s %(prim_trans2)s %(genome_file1)s %(genome_file2)s > %(outfile)s
                       '''
    else:
        statement = '''
                       grep "^>" <(gunzip -c %(genome_file1)s) | cut -d " " -f 1 > decoys.txt &&
                       sed -i.bak -e 's/>//g' decoys.txt &&
                       cat %(prim_trans1)s %(genome_file1)s > %(outfile)s
                       '''

    P.run(statement)

Example #25
def busText(infile, outfile):
    '''
    Sort the bus file produced by kallisto and then convert it to a text file.
    '''

    tmp_bus = P.get_temp_filename(".")

    statement = '''
    sleep 10 &&
    bustools sort -o %(tmp_bus)s %(infile)s &&
    bustools text -o %(outfile)s %(tmp_bus)s
    '''

    P.run(statement)
    os.unlink(tmp_bus)
Example #26
def loadManualAnnotations(infile, outfile):

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with iotools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with iotools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
Example #27
        def aggregateAdaptors(infiles, outfile):
            '''
            Collate fasta files into a single contaminants file for
            adapter removal.
            '''
            tempfile = P.get_temp_filename()
            infiles = " ".join(infiles)

            statement = """
            cat %(infiles)s | fastx_reverse_complement > %(tempfile)s &&
            cat %(tempfile)s %(infiles)s | fastx_collapser > %(outfile)s &&
            rm -f %(tempfile)s
            """
            P.run(statement)
Example #28
def create_fragment_bed(infile, outfile):
    """Take the clusterInfo and create a bed file containing all of the fragments of tRNAs"""

    cluster_info = infile.replace("_cluster.fa", "_clusterInfo.fa")
    tmp_file = P.get_temp_filename(".")

    PY_SRC_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                               "python"))

    statement = """python %(PY_SRC_PATH)s/trna_fragment_bed.py -I %(cluster_info)s -S %(tmp_file)s &&
                   sort %(tmp_file)s | uniq > %(outfile)s"""

    P.run(statement)
    os.unlink(tmp_file)
Example #29
def run_rmats_pre(infiles, outfile, track):

    infile, gtffile = infiles

    od = os.path.abspath("rmats.dir")

    statement = '''rmats.py
                   --task prep
                   --tmp rmats.dir/%(track)s.dir
                   --gtf <(zcat %(gtffile)s)
                   --readLength %(rmats_readLength)s
                   -t %(rmats_paired)s
                   --od rmats.dir
                   --b1 <(echo %(infile)s)
                   &> rmats.dir/%(track)s.prep.log
                   '''

    if infile.endswith(".remote"):
        token = glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(infile,
                                           token,
                                           tmpfilename,
                                           filter_bed=PARAMS["contigs_bed"])
        s = re.sub(";\n", " &&\n", s)

        infile = ",".join(infile)
        statement = " && ".join([
            "mkdir -p %(tmpfilename)s", s, statement, "rm -r %(tmpfilename)s"
        ])

    P.run(statement,
          job_condaenv=PARAMS["rmats_env"],
          job_memory=PARAMS["rmats_prep_memory"])

    rmats_counter = ""
    for f_path in glob("rmats.dir/%(track)s.dir/*.rmats" % locals()):
        shutil.copy(f_path,
                    P.snip(outfile, ".rmats") + rmats_counter + ".rmats")
        if rmats_counter == "":
            rmats_counter = 1
        else:
            rmats_counter += 1
Example #30
def fix_intron_fasta(infiles, outfile):
    '''fix all of the headers for the introns FASTA file so that they
    contain the transcript ID, an identifier specifying that the transcript
    is an "intronic" transcript, and a unique number to avoid duplicates.'''

    introns_t2g, introns = infiles

    tmp_fasta = P.get_temp_filename(".")

    statement = '''awk '{print ">"$1"."NR"-I"" gene_id:"$2" gene_name:"$3}' %(introns_t2g)s > %(tmp_fasta)s  &&
                awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' %(tmp_fasta)s  %(introns)s > %(outfile)s '''

    P.run(statement)
    os.unlink(tmp_fasta)
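# To make the awk pair above concrete, the same transformation in pure
# Python (illustrative only): build one replacement header per t2g row,
# then swap it in for each '>' line of the FASTA in order. Like the awk,
# the running row number is appended after each rewritten header.
def rewrite_intron_headers(t2g_rows, fasta_lines):
    # t2g_rows: iterable of (transcript_id, gene_id, gene_name) tuples
    headers = (">%s.%d-I gene_id:%s gene_name:%s %d" % (t, i, g, n, i)
               for i, (t, g, n) in enumerate(t2g_rows, 1))
    for line in fasta_lines:
        yield next(headers) if line.startswith(">") else line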