Example #1
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="wc -l",
        suffixes=P.asList(PARAMS.get('%s_regex_linecount' % track, "")))
Example #2
def checkFileExistence(infile, outfile):
    '''check whether file exists.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="file",
        suffixes=P.asList(PARAMS.get('%s_regex_exist' % track, "")))
Example #3
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="md5sum",
        suffixes=P.asList(PARAMS.get('%s_regex_md5' % track, "")))
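
A note on the helper used throughout these examples: P.asList normalises a configuration value into a list before it is handed on. A minimal, illustrative stand-in for the behaviour these calls appear to rely on (comma-separated string or existing list in, list of stripped tokens out); this is an assumption for clarity, not the CGAT implementation:

def as_list(value):
    """Illustrative stand-in for P.asList (assumed behaviour)."""
    if isinstance(value, (list, tuple)):
        return list(value)           # lists pass through unchanged
    if value is None or value == "":
        return []                    # empty config value -> empty list
    return [x.strip() for x in str(value).split(",") if x.strip()]

# e.g. as_list("glam2.*,meme.*") -> ['glam2.*', 'meme.*']
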
Example #4
    def __call__(self, track, slice=None):

        exp_statement = """
        SELECT TPM, gene_id, sample_name
        FROM sailfish_genes AS A
        JOIN samples AS B
        ON A.sample_id = B.id"""

        exp_df = self.getDataFrame(exp_statement)

        factors_statement = '''
        SELECT factor, factor_value, sample_name
        FROM samples AS A
        JOIN factors AS B
        ON A.id = B.sample_id
        WHERE factor != 'genome'
        '''

        factors_df = self.getDataFrame(factors_statement)

        merged_df = pd.merge(exp_df,
                             factors_df,
                             left_on="sample_name",
                             right_on="sample_name")

        genes = Pipeline.asList(
            Pipeline.peekParameters(
                ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

        interest_df = merged_df[merged_df['gene_id'].isin(genes)].copy()

        interest_df['TPM'] = interest_df['TPM'].astype(float)

        return interest_df.reset_index().set_index("factor")
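
The tracker above joins expression values to sample factors, keeps only the configured genes of interest, and indexes the result by factor. A small self-contained sketch of the same merge/filter/re-index pattern on toy data (the values and gene names below are invented for illustration):

import pandas as pd

# toy stand-ins for the two database queries
exp_df = pd.DataFrame({"TPM": ["10.5", "3.2", "7.1"],
                       "gene_id": ["g1", "g2", "g1"],
                       "sample_name": ["s1", "s1", "s2"]})
factors_df = pd.DataFrame({"factor": ["condition", "condition"],
                           "factor_value": ["ctrl", "treated"],
                           "sample_name": ["s1", "s2"]})

merged_df = pd.merge(exp_df, factors_df, on="sample_name")

genes = ["g1"]  # would come from the pipeline parameters
interest_df = merged_df[merged_df["gene_id"].isin(genes)].copy()
interest_df["TPM"] = interest_df["TPM"].astype(float)

print(interest_df.reset_index().set_index("factor"))
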
Example #5
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #6
def runTest(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")

    pipeline_name = PARAMS.get(
        "%s_pipeline" % track,
        "pipeline_" + track[len("test_"):])

    pipeline_targets = P.asList(
        PARAMS.get("%s_target" % track,
                   "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    to_cluster = False

    template_statement = '''
    (cd %%(track)s.dir;
    python %%(pipelinedir)s/%%(pipeline_name)s.py
    %%(pipeline_options)s make %s) >& %%(outfile)s
    '''
    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(ignore_errors=True)
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(ignore_errors=True)
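
The %%(...)s doubling in template_statement is what carries those placeholders through the first substitution: the single %s slot is filled with the make target, each %% collapses to a single %, and the remaining %(...)s placeholders are then expected to be interpolated by P.run() from the task's local variables. A short illustration of the two-stage formatting (plain Python string semantics; the variable values are only examples):

template_statement = '''
(cd %%(track)s.dir;
python %%(pipelinedir)s/%%(pipeline_name)s.py
%%(pipeline_options)s make %s) >& %%(outfile)s
'''

# stage 1: fill in the make target; %% collapses to %
statement = template_statement % "full"

# stage 2: what P.run() is assumed to do with the remaining placeholders
local_vars = dict(track="test_mapping", pipelinedir="/path/to/pipelines",
                  pipeline_name="pipeline_mapping", pipeline_options="-v5",
                  outfile="test_mapping.log")
print(statement % local_vars)
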
Example #7
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} md5sum \;
    | perl -p -e "s/ +/\\t/g"
    | sort -k1,1
    > %(outfile)s'''
    P.run()
Example #8
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)

    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
    | sort -k1,1
    > %(outfile)s'''
    P.run()
Example #9
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #10
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError("required file %s for %s (stage %i) not exist." %
                             (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
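
For a concrete sense of what the loop assembles: with a hypothetical two-step path, the joined statement is a single shell pipeline that chains pslMap calls through process substitution. A toy reproduction of the statement-building (the filenames below are invented):

path = ["hg19ToMm10", "mm10ToRn5"]
outfile = "hg19ToRn5.psl.gz"

statement = []
for stage, part in enumerate(path):
    filename = part + ".over.psl.gz"
    if stage == 0:
        statement.append('''gunzip < %(filename)s''' % locals())
    else:
        statement.append('''
           pslMap stdin <(gunzip < %(filename)s) stdout
        ''' % locals())

statement.append("gzip")
statement = " | ".join(statement) + " > %(outfile)s " % locals()
print(statement)
# roughly: gunzip < hg19ToMm10.over.psl.gz
#          | pslMap stdin <(gunzip < mm10ToRn5.over.psl.gz) stdout
#          | gzip > hg19ToRn5.psl.gz
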
Example #11
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
Example #12
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
Example #13
def importRepeatsFromUCSC(outfile):
    """This task downloads UCSC repeats types as identified
    in the configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(dbhandle=connectToUCSC(),
                                            repclasses=P.asList(
                                                PARAMS["ucsc_repeattypes"]),
                                            outfile=outfile)
Example #14
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetetive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"])
Example #15
def importRepeatsFromUCSC(outfile):
    """This task downloads UCSC repeats types as identified
    in the configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_repeattypes"]),
        outfile=outfile,
        job_memory=PARAMS["job_memory"])
Example #16
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetetive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"],
        job_memory=PARAMS["job_memory"])
Example #17
def runMemeChIP(infile, outfile):
    '''run MemeChIP'''

    outdir = outfile.replace(".memechip", "")
    bfile = infile.replace(".foreground.fasta", ".background.bfile")

    motifDb = " -db ".join(
        P.asList(PARAMS["meme_motif_db"]
                 ))  # Meme-Chip needs eac db in list to have "-db" flag
    nmotifs = PARAMS["meme_nmotif"]
    meme_max_jobs = PARAMS["meme_meme_maxsize"]

    # nmeme - upper bound on the number of sequences passed to MEME.
    # This is required because MEME takes too long to run on very large sequence sets.
    # All input sequences are passed to MEME if there are no more than this limit.
    # Default nmeme = 600.

    # ccut - maximum length of a sequence before it is trimmed to a central region of this size.
    # A value of 0 indicates that sequences should not be trimmed.

    # meme-maxsize - change the largest allowed dataset size.
    # Default meme-maxsize is 100,000.
    # With the default settings for -nmeme (600) and -ccut (100), the largest
    # possible dataset size would be 60,000.

    # meme-maxsize of 10x10^6 is far too large; runs take >24 hrs.
    # Will try 600,000, equivalent to a maximum of 600 1,000-bp sequences;
    # to check 2,000 sequences of <= 1,000 bp, a meme-maxsize of 2x10^6 would be needed.

    job_memory = "5G"
    job_threads = "2"

    statement = '''meme-chip
                   -oc %(outdir)s
                   -db %(motifDb)s
                   -bfile %(bfile)s
                   -ccut 0
                   -meme-mod zoops
                   -meme-minw 5
                   -meme-maxw 30
                   -meme-nmotifs %(nmotifs)s
                   -meme-maxsize %(meme_max_jobs)s

                   %(infile)s
                   > %(outfile)s
                ''' % locals()

    print(statement)

    P.run()
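
The sizing notes above come down to dataset size being roughly the number of sequences times the maximum sequence length. Making that arithmetic explicit with the values quoted in the comments (illustrative only):

def required_meme_maxsize(n_sequences, max_seq_length):
    # MEME's dataset size is roughly the total number of bases passed in
    return n_sequences * max_seq_length

print(required_meme_maxsize(600, 100))    # 60000   - the -nmeme/-ccut defaults
print(required_meme_maxsize(600, 1000))   # 600000  - 600 sequences of 1 kb
print(required_meme_maxsize(2000, 1000))  # 2000000 - needs meme-maxsize of 2x10^6
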
Example #18
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "meme",
                               outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
Example #19
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
Example #20
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method + "_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method + "_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.asList(PARAMS[method + '_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method + "_num_sequences"],
        proportion=PARAMS[method + "_proportion"],
        min_sequences=PARAMS[method + "_min_sequences"],
        order=PARAMS[method + '_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch(outfile)
Example #21
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
Example #22
def publish():
    '''publish files.'''

    # directory, files

    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        for filetype, files in export_files.items():
            new_files = set(files)
            for f in files:
                for regex in P.asList(PARAMS['ucsc_exclude']):
                    if re.match(regex, f):
                        new_files.remove(f)
                        break

            export_files[filetype] = list(new_files)

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
Example #23
    def processReads(infile, outfiles):
        """process reads from .fastq and other sequence files.
        """
        trimmomatic_options = PARAMS["trimmomatic_options"]
        if PARAMS["trimmomatic_adapter"]:
            trimmomatic_options = (
                " ILLUMINACLIP:%s:%s:%s:%s "
                % (
                    PARAMS["trimmomatic_adapter"],
                    PARAMS["trimmomatic_mismatches"],
                    PARAMS["trimmomatic_p_thresh"],
                    PARAMS["trimmomatic_c_thresh"],
                )
                + trimmomatic_options
            )

        if PARAMS["auto_remove"]:
            trimmomatic_options = (
                " ILLUMINACLIP:%s:%s:%s:%s "
                % (
                    "contaminants.fasta",
                    PARAMS["trimmomatic_mismatches"],
                    PARAMS["trimmomatic_p_thresh"],
                    PARAMS["trimmomatic_c_thresh"],
                )
                + trimmomatic_options
            )

        job_threads = PARAMS["threads"]
        job_memory = "7G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = PipelinePreprocess.MasterProcessor(
            save=PARAMS["save"], summarize=PARAMS["summarize"], threads=PARAMS["threads"]
        )

        for tool in P.asList(PARAMS["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(PipelinePreprocess.FastxTrimmer(PARAMS["fastx_trimmer_options"], threads=PARAMS["threads"]))
            elif tool == "trimmomatic":
                m.add(PipelinePreprocess.Trimmomatic(trimmomatic_options, threads=PARAMS["threads"]))
            elif tool == "sickle":
                m.add(PipelinePreprocess.Sickle(PARAMS["sickle_options"], threads=PARAMS["threads"]))
            elif tool == "trimgalore":
                m.add(PipelinePreprocess.Trimgalore(PARAMS["trimgalore_options"], threads=PARAMS["threads"]))
            elif tool == "flash":
                m.add(PipelinePreprocess.Flash(PARAMS["flash_options"], threads=PARAMS["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = PARAMS["cutadapt_options"]
                if PARAMS["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(
                    PipelinePreprocess.Cutadapt(
                        cutadapt_options, threads=PARAMS["threads"], untrimmed=PARAMS["cutadapt_reroute_untrimmed"]
                    )
                )

        statement = m.build((infile,), "processed.dir/trimmed-", track)

        P.run()
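
The ILLUMINACLIP clause assembled above is plain string formatting. With hypothetical values (an adapter file adapters.fa, 2 allowed mismatches, palindrome/simple clip thresholds of 30 and 10), the option string prepended to trimmomatic_options would look like this:

trimmomatic_options = "LEADING:3 TRAILING:3 MINLEN:36"  # example base options
trimmomatic_options = (
    " ILLUMINACLIP:%s:%s:%s:%s " % ("adapters.fa", 2, 30, 10)
    + trimmomatic_options)
print(trimmomatic_options)
# ' ILLUMINACLIP:adapters.fa:2:30:10 LEADING:3 TRAILING:3 MINLEN:36'
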
Example #24
    >  %(outfile)s
    '''

    P.run()


##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:

    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" %
             (PARAMS["%s_label" % track], PARAMS["maf_master"]), track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)
     "../pipeline.ini",
     "pipeline.ini"],
    defaults={
        'annotations_dir': "",
        'paired_end': False})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# get options that are to be tested
cufflinks_options = {}
if "cufflinks_test_options" in PARAMS:
    options = P.asList(PARAMS["cufflinks_test_options"])
    for option in options:
        if option == "--pre-mrna-fraction" \
                or option == "--small-anchor-fraction" \
                or option == "--max-multiread-fraction":
            cufflinks_options[option] = [0, 0.5, 0.75, 1]
        elif option == "--min-isoform-fraction":
            cufflinks_options[option] = [0.05, 0.1, 0.5, 1]
        elif option == "--junc-alpha":
            cufflinks_options[option] = [0.001, 0.01, 0.1]
        elif option == "--min-frags-per-transfrag":
            cufflinks_options[option] = [1, 5, 10]
        elif option == "--overhang-tolerance":
            cufflinks_options[option] = [0, 2, 5, 8]
        elif option == "--overlap-radius":
            cufflinks_options[option] = [50, 100, 200]
Example #26
    | %(cmd-farm)s --split-at-regex="^chain" --chunk-size=1000 --max-lines=1000000 --log=%(outfile)s.log
    " cgat chain2psl --log=%(outfile)s.log
      | pslSwap stdin stdout "
    | gzip
    >  %(outfile)s
    '''

    P.run()

##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:
    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track], PARAMS["maf_master"]), track) for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)
Example #27
    "pipeline.ini"
],
                defaults={
                    'annotations_dir': "",
                    'paired_end': False
                })

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# get options that are to be tested
cufflinks_options = {}
if "cufflinks_test_options" in PARAMS:
    options = P.asList(PARAMS["cufflinks_test_options"])
    for option in options:
        if option == "--pre-mrna-fraction" \
                or option == "--small-anchor-fraction" \
                or option == "--max-multiread-fraction":
            cufflinks_options[option] = [0, 0.5, 0.75, 1]
        elif option == "--min-isoform-fraction":
            cufflinks_options[option] = [0.05, 0.1, 0.5, 1]
        elif option == "--junc-alpha":
            cufflinks_options[option] = [0.001, 0.01, 0.1]
        elif option == "--min-frags-per-transfrag":
            cufflinks_options[option] = [1, 5, 10]
        elif option == "--overhang-tolerance":
            cufflinks_options[option] = [0, 2, 5, 8]
        elif option == "--overlap-radius":
            cufflinks_options[option] = [50, 100, 200]
Example #28
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    data directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    '''
    fn = os.path.basename(track.asFile())
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern.lower())
                    if re.search(p, fn.lower()):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = map(int, P.asList(PARAMS["offsets_%s" % fn.lower()]))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
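
The '%' placeholder handling above turns a config key such as %=all.bam into a regular expression before matching it against the track name. A small standalone illustration of that matching step (the config entries and track name are made up):

import re

# hypothetical [bams] section entries: pattern -> value
bams_config = {"%": "all.bam", "liver_%": "liver_merged.bam"}

fn = "liver_stimulated"
bamfiles = []
for pattern, value in bams_config.items():
    if "%" in pattern:
        # '%' acts as a wildcard: expand it to a regex and test the track name
        p = pattern.replace("%", r"\S+")
        if re.search(p, fn):
            bamfiles.append(value)

print(bamfiles)  # ['all.bam', 'liver_merged.bam']
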
Example #29
                   > %(outfile)s.log.gz'''
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; 
                | gzip 
                > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################


@files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y)
        for x, y in itertools.product(
            glob.glob("*.features.gz"), P.asList(PARAMS["polyphen_models"]))])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = False

    # need to run in chunks for large feature files
    statement = """gunzip 
        < %(infile)s
        | %(cmd-farm)s
            --split-at-lines=10000
            --output-header
        "perl %(polyphen_home)s/bin/run_weka_cpp.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           -p 
Example #30
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = PARAMS["trimmomatic_options"]

        if PARAMS["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta",
                PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"],
                PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        elif PARAMS["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                PARAMS["trimmomatic_adapter"],
                PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"],
                PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = PARAMS["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = PipelinePreprocess.MasterProcessor(
            save=PARAMS["save"],
            summarize=PARAMS["summarize"],
            threads=PARAMS["threads"],
            qual_format=PARAMS['qual_format'])

        for tool in P.asList(PARAMS["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(PipelinePreprocess.FastxTrimmer(
                    PARAMS["fastx_trimmer_options"],
                    threads=PARAMS["threads"]))
            elif tool == "trimmomatic":
                m.add(PipelinePreprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=PARAMS["threads"]))
            elif tool == "sickle":
                m.add(PipelinePreprocess.Sickle(
                    PARAMS["sickle_options"],
                    threads=PARAMS["threads"]))
            elif tool == "trimgalore":
                m.add(PipelinePreprocess.Trimgalore(
                    PARAMS["trimgalore_options"],
                    threads=PARAMS["threads"]))
            elif tool == "flash":
                m.add(PipelinePreprocess.Flash(
                    PARAMS["flash_options"],
                    threads=PARAMS["threads"]))
            elif tool == "reversecomplement":
                m.add(PipelinePreprocess.ReverseComplement(
                    PARAMS["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(PipelinePreprocess.Pandaseq(
                    PARAMS["pandaseq_options"],
                    threads=PARAMS["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = PARAMS["cutadapt_options"]
                if PARAMS["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(PipelinePreprocess.Cutadapt(
                    cutadapt_options,
                    threads=PARAMS["threads"],
                    untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                    process_paired=PARAMS["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run()
Example #31
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''

    to_cluster = False
    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        ("track", "status",
         "job_finished",
         "nfiles", "nref",
         "missing", "extra",
         "different",
         "different_md5",
         "different_lines",
         "same",
         "same_md5",
         "same_lines",
         "same_exist",
         "files_missing",
         "files_extra",
         "files_different_md5",
         "files_different_lines"))) + "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = IOTools.isComplete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.asList(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.asList(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.asList(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.openFile(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(IOTools.openFile(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            same_exist = set([x for x in different
                              if regex_exist.search(x)])

            different = set([x for x in different
                             if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different
                           if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] ==
                  ref_data['nlines'][check_lines])
            same_lines = set(dd.index[dd])

        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different
                         if regex_md5.search(x)]

            dd = (cmp_data['md5'][check_md5] !=
                  ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] ==
                  ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])

        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) +
                             len(different_md5) + len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(map(str, (
            track,
            status,
            job_finished,
            len(cmp_data),
            len(ref_data),
            len(missing),
            len(extra),
            len(different_md5) + len(different_lines),
            len(different_md5),
            len(different_lines),
            len(same_md5) + len(same_lines) + len(same_exist),
            len(same_md5),
            len(same_lines),
            len(same_exist),
            ",".join(missing),
            ",".join(extra),
            ",".join(different_md5),
            ",".join(different_lines),
        ))) + "\n")

    outf.close()
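
The bookkeeping in the loop above is essentially set arithmetic over filenames: shared files are split into "existence only", "line count" and "md5" groups by the per-track regexes, and any missing, extra or differing file pushes the track to FAIL. A compact sketch of the first partitioning step with toy data (the filenames and regex are invented):

import re

ref_files = {"a.bed.gz", "b.log", "c.tsv"}   # files listed in the .ref table
cmp_files = {"a.bed.gz", "b.log", "d.tsv"}   # files produced by this run

regex_exist = re.compile(r".*\.log")         # these only need to exist

shared = cmp_files & ref_files               # present in both
missing = ref_files - cmp_files              # expected but not produced
extra = cmp_files - ref_files                # produced but not expected

# files matched by regex_exist need no further checks; the remainder would go
# on to the line-count / md5 comparisons exactly as in the function above
same_exist = {x for x in shared if regex_exist.search(x)}
to_compare = shared - same_exist

status = "OK" if not (missing or extra) else "FAIL"
print(status, sorted(missing), sorted(extra), sorted(same_exist), sorted(to_compare))
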
Example #32
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design),
         "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"),
                                              P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that
    it contains a similar output and similar fdr compared to cuffdiff.
    '''

    runDE(infiles, outfile, "deseq")

#########################################################################
#########################################################################
#########################################################################


@follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
Example #33
    %(options)s -
    -o %(outfiles)s
    --too-short-o too_short.dir/%(track)s_tooshort.fastq.gz
    --untrimmed-output untrimmed.dir/%(track)s_untrimmed.fastq.gz
    >> %(track)s.log
    '''

    P.run()


###############################################################################
# Read alignment to library (with bowtie or bowtie2)
###############################################################################

mapper = PARAMS['mapper']
library_files = P.asList(PARAMS['libraryfiles'])
library_names = P.asList(PARAMS['librarynames'])
library_dict = dict(zip(library_names, library_files))

if mapper == 'bowtie':

    @follows(mkdir("library.dir"))
    @subdivide(library_files, regex(r"(\S+).fasta"), r"library.dir/\1.*.ebwt")
    def BuildBowtieIndex(infiles, outfiles):
        basename = 'library.dir/' + P.snip(os.path.basename(infiles), ".fasta")
        statement = '''
        bowtie-build -f %(infiles)s %(basename)s
        '''

        P.run()
Example #34
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design), "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data),
                 ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that
    it contains a similar output and similar fdr compared to cuffdiff.
    '''

    runDE(infiles, outfile, "deseq")


#########################################################################
#########################################################################
#########################################################################
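
The @files argument above is computed up front: itertools.product pairs every counts file with every design file, and each pair becomes one task whose output name is derived from both. A few lines showing how that list expands (the paths are invented):

import itertools
import os


def snip(path, suffix):
    # minimal stand-in for P.snip: strip a known suffix
    return path[:-len(suffix)] if path.endswith(suffix) else path


counts = ["diff_methylation/tiles.counts.tsv.gz"]
designs = ["design_treatment.tsv", "design_timepoint.tsv"]

jobs = [((data, design),
         "diff_methylation/%s_%s.deseq.gz" %
         (snip(os.path.basename(data), ".counts.tsv.gz"),
          snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(counts, designs)]

for infiles, outfile in jobs:
    print(infiles, "->", outfile)
# e.g. -> diff_methylation/tiles_design_treatment.deseq.gz
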

Example #35
    defaults={"annotations_dir": "",
              "genesets_abinitio_coding": "pruned.gtf.gz",
              "genesets_abinitio_lncrna": "pruned.gtf.gz",
              "genesets_reference": "reference.gtf.gz",
              "genesets_refcoding": "refcoding.gtf.gz",
              "genesets_previous": ""})

PARAMS = P.PARAMS

PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    prefix="annotations_",
    update_interface=True))

PREVIOUS = P.asList(PARAMS["genesets_previous"])


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()
Example #36
@follows(mkdir("motifs"))
@transform(BEDFILES, regex(".*/(.*).bed.gz"), r"motifs/\1.control.fasta")
def exportMotifControlSequences(infile, outfile):
    '''for each interval, export the left and right
    sequence segment of the same size.
    '''
    PipelineMotifs.exportSequencesFromBedFile(infile,
                                              outfile,
                                              masker=PARAMS['motifs_masker'],
                                              mode="leftright")


############################################################
############################################################
############################################################
@active_if("meme" in P.asList(PARAMS["methods"])
           or "disc_meme" in P.asList(PARAMS["methods"]))
@transform(loadIntervals, suffix("_intervals.load"), ".meme.fasta")
def exportMemeIntervalSequences(infile, outfile):

    track = os.path.basename(P.snip(infile, "_intervals.load"))

    exportIntervalSequences(infile, outfile, track, "meme")


############################################################
@follows(mkdir("meme.dir"))
@active_if("meme" in P.asList(PARAMS["methods"]))
@transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),
           r"meme.dir/\1.meme")
def runMeme(infile, outfile):
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.


@jobs_limit(1, "polyphen")
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s 
    | gzip 
    > %(outfile)s 
    2> %(outfile)s.log
def quantifySequins():
    pass


##############################################################################
#  Sequins Quantify - End                                                    #
##############################################################################

##############################################################################
#  Sequins Add Models - Start                                                #
##############################################################################

add_models_gtfs = []

for add_type in ["skip_exons", "incomplete", "3prime"]:
    for fraction in P.asList(PARAMS['%s_fractions' % add_type]):
        for iteration in range(0, PARAMS['%s_iterations' % add_type]):
            add_models_gtfs.append(
                "sequins/add_models/%s/transcripts_%s_%s.gtf.gz" %
                (add_type, fraction, iteration))


@mkdir('sequins/add_models/skip_exons', 'sequins/add_models/incomplete',
       'sequins/add_models/3prime')
@originate(add_models_gtfs)
def buildAddModels(outfile):
    ''' build a set of reference transcriptomes with additional
    transcripts with skipped exons, incomplete transcripts and transcripts with alternative 3' ends '''

    # how to avoid hardcoding this?
    infile = 'annotations/sequins.gtf.gz'
        with IOTools.openFile(outfile, "w") as outf:
            outf.write("%s\n" % "\t".join(
                ("target_id", "length", "tpm", "est_counts")))

            for line in lines:
                if not line.startswith("# "):
                    outf.write(line)

# define simulation targets
SIMTARGETS = []

mapToSimulationTargets = {'kallisto': (extractKallistoCountSimulation, ),
                          'salmon': (extractSalmonCountSimulation, ),
                          'sailfish': (extractSailfishCountSimulation, )}

for x in P.asList(PARAMS["quantifiers"]):
    SIMTARGETS.extend(mapToSimulationTargets[x])


@follows(*SIMTARGETS)
def quantifySimulation():
    pass


@transform(SIMTARGETS,
           regex("simulation.dir/quant.dir/(\S+)/simulated_reads_(\d+)/abundance.tsv"),
           r"simulation.dir/quant.dir/\1/simulated_reads_\2/results.tsv",
           r"simulation.dir/simulated_read_counts_\2.tsv")
def mergeAbundanceCounts(infile, outfile, counts):
    ''' merge the abundance and simulation counts files for
    each simulation '''
Example #40
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = PARAMS["trimmomatic_options"]

        if PARAMS["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                PARAMS["trimmomatic_adapter"],
                PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"],
                PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        if PARAMS["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta",
                PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"],
                PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = PARAMS["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = PipelinePreprocess.MasterProcessor(
            save=PARAMS["save"],
            summarize=PARAMS["summarize"],
            threads=PARAMS["threads"])

        for tool in P.asList(PARAMS["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(PipelinePreprocess.FastxTrimmer(
                    PARAMS["fastx_trimmer_options"],
                    threads=PARAMS["threads"]))
            elif tool == "trimmomatic":
                m.add(PipelinePreprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=PARAMS["threads"]))
            elif tool == "sickle":
                m.add(PipelinePreprocess.Sickle(
                    PARAMS["sickle_options"],
                    threads=PARAMS["threads"]))
            elif tool == "trimgalore":
                m.add(PipelinePreprocess.Trimgalore(
                    PARAMS["trimgalore_options"],
                    threads=PARAMS["threads"]))
            elif tool == "flash":
                m.add(PipelinePreprocess.Flash(
                    PARAMS["flash_options"],
                    threads=PARAMS["threads"]))
            elif tool == "reversecomplement":
                m.add(PipelinePreprocess.ReverseComplement(
                    PARAMS["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(PipelinePreprocess.Pandaseq(
                    PARAMS["pandaseq_options"],
                    threads=PARAMS["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = PARAMS["cutadapt_options"]
                if PARAMS["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(PipelinePreprocess.Cutadapt(
                    cutadapt_options,
                    threads=PARAMS["threads"],
                    untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                    process_paired=PARAMS["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run()
Example #41
           r"filesummaries.dir/\1.seqsummary")
def checkFile(infile, outfile):
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    outf = open(outfile, 'w')
    outf.write("name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".format(
        seqdat.filename, seqdat.fileformat, seqdat.compressed,
        seqdat.paired, seqdat.interleaved))
    seqdat.readCount()
    outf.write("read_count\t{}\n".format(seqdat.readcount))
    outf.close()

##################################################
# Run Selected Assemblers
##################################################

# get the list of assemblers to run on the data
ASSEMBLERS = P.asList(PARAMS.get("Assembler_assemblers", ""))

###################################################
# Run Megahit
###################################################
@active_if("megahit" in ASSEMBLERS)
@follows(checkFile)
@follows(mkdir("megahit_out.dir"))
@transform(SEQUENCEFILES,
           SEQUENCEFILES_REGEX,
           r"megahit_out.dir/\1/\1.contigs.fa")
def runMegahit(infile, outfile):
    job_memory = str(PARAMS["Megahit_clus_memory"])+"G"
    job_threads = PARAMS["Megahit_clus_threads"]
    seqdat=PipelineMetaAssemblyKit.SequencingData(infile)
    assembler = PipelineMetaAssemblyKit.Megahit(seqdat,"megahit_out.dir",PARAMS)
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.


@jobs_limit(1, "polyphen")
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s
    | gzip
    > %(outfile)s
    2> %(outfile)s.log
Example #43
    defaults={"annotations_dir": "",
              "genesets_abinitio_coding": "pruned.gtf.gz",
              "genesets_abinitio_lncrna": "pruned.gtf.gz",
              "genesets_reference": "reference.gtf.gz",
              "genesets_refcoding": "refcoding.gtf.gz",
              "genesets_previous": ""})

PARAMS = P.PARAMS

PARAMS.update(P.peekParameters(
    PARAMS["annotations_annotations_dir"],
    "pipeline_annotations.py",
    prefix="annotations_",
    update_interface=True))

PREVIOUS = P.asList(PARAMS["genesets_previous"])


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()
Example #44
# Pipeline configuration
###################################################

# load options from the config file
import CGATPipelines.Pipeline as P
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.asList(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''

    to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.ini file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run()

    tf = tarfile.open(outfile)
Example #45
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = map(int, P.asList(PARAMS["offsets_%s" % fn.lower()]))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the "
            "same as number of offsets: %s" %
            (str(bamfiles), str(offsets)))

    return bamfiles, offsets