Example #1
def run_mapping(infile, outfile):
    ''' Map reads with the specified read mapper '''

    if PARAMS["mapper"] == "star":
        job_threads = PARAMS["star_threads"]
        job_memory = PARAMS["star_memory"]
        star_mapping_genome = PARAMS["star_genome"] or PARAMS["genome"]
        m = PipelineMapping.STAR(
            executable=P.substituteParameters(**locals())["star_executable"],
            strip_sequence=0)

    elif PARAMS["mapper"] == "bowtie":
        job_threads = PARAMS["bowtie_threads"]
        job_memory = PARAMS["bowtie_memory"]

        m = PipelineMapping.Bowtie(executable="bowtie",
                                   tool_options=PARAMS["bowtie_options"],
                                   strip_sequence=0)

        genome = PARAMS["bowtie_genome"]
        reffile = os.path.join(PARAMS["bowtie_index_dir"],
                               PARAMS["bowtie_genome"] + ".fa")

    else:
        raise ValueError("mapper '%s' not known" % PARAMS["mapper"])

    statement = m.build((infile, ), outfile)

    P.run()
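A pattern worth noting before the remaining examples: most of them assign `statement`, `job_threads`, `job_memory` or `to_cluster` as plain local variables and then call `P.run()` with no arguments. In the older CGAT pipelines, `P.run()` inspects the calling function's locals to pick these up (newer cgat-core code passes the statement explicitly, as in Example #3). A minimal sketch of that convention, with a hypothetical `run_from_caller_locals()` standing in for `P.run()`; this illustrates the idea and is not CGAT's actual implementation:

import sys

def run_from_caller_locals():
    '''Collect the statement and job options from the caller's locals.'''
    caller = sys._getframe(1).f_locals        # frame of the calling function
    statement = caller["statement"]           # required: the shell command
    threads = caller.get("job_threads", 1)    # optional job options
    memory = caller.get("job_memory", "4G")
    print("would submit: %s (threads=%s, mem=%s)" % (statement, threads, memory))

def example_task():
    job_threads = 4
    job_memory = "8G"
    statement = "echo mapping reads"
    run_from_caller_locals()                  # picks up the three locals above

example_task()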
Example #2
def runFastqc(infiles, outfile):
    '''run FastQC on each input file.

    Convert SRA files to FASTQ and check that mapping qualities are in
    Solexa format. Perform quality control checks on reads from
    .fastq files.
    '''
    # MM: only pass the contaminants file list if requested by user,
    # do not make this the default behaviour
    if PARAMS['use_custom_contaminants']:
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc",
                                   contaminants=PARAMS['contaminants_path'],
                                   qual_format=PARAMS['qual_format'])
    else:
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc",
                                   qual_format=PARAMS['qual_format'])

    if PARAMS["general_reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    job_memory = "8G"
    P.run()
Example #3
def runFastQC(infiles, outfile):
    '''run FastQC on each input file.

    Convert SRA files to FASTQ and check that mapping qualities are in
    Solexa format. Perform quality control checks on reads from
    .fastq files.

    '''
    # only pass the contaminants file list if requested by the user
    if PARAMS['use_custom_contaminants']:
        m = PipelineMapping.FastQC(nogroup=PARAMS["readqc_no_group"],
                                   outdir=os.path.dirname(outfile),
                                   contaminants=PARAMS['contaminants_path'],
                                   qual_format=PARAMS['qual_format'])
    else:
        m = PipelineMapping.FastQC(nogroup=PARAMS["readqc_no_group"],
                                   outdir=os.path.dirname(outfile),
                                   qual_format=PARAMS['qual_format'])

    if PARAMS["reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    P.run(statement)
Example #4
def mapReadsAgainstSpadesContigs(infiles, outfile):
    '''
    map reads against spades contigs
    '''
    inf = infiles[0]
    to_cluster = True
    index_dir = os.path.dirname(outfile)

    if "agg" not in infiles[1]:
        genome = re.search(
            ".*R[0-9]*", infiles[0]).group(0) + ".filtered.contigs.fa"
    else:
        genome = "agg-agg-agg.filtered.contigs.fa"

    if infiles[1].endswith(".bt2") or infiles[1].endswith(".ebwt"):
        infile, reffile = infiles[0],  os.path.join(index_dir, genome) + ".fa"
        m = PipelineMapping.Bowtie(
            executable=P.substituteParameters(**locals())["bowtie_executable"])

    elif infiles[1].endswith("bwt"):
        job_options = " -l mem_free=%s" % (PARAMS["bwa_memory"])
        bwa_index_dir = index_dir
        bwa_mem_options = PARAMS["bwa_mem_options"]
        bwa_threads = PARAMS["bwa_threads"]
        m = PipelineMapping.BWAMEM(remove_non_unique=True)
    else:
        raise ValueError("unexpected index suffix on %s" % infiles[1])
    statement = m.build((inf,), outfile)
    P.run()
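The regex in the first branch derives the per-sample contig file name from the read file name; with a hypothetical input it behaves like this (the file name below is an illustration, not one of the pipeline's actual tracks):

import re
fname = "stool-sample-R1.fastq.gz"   # hypothetical input read file
print(re.search(".*R[0-9]*", fname).group(0) + ".filtered.contigs.fa")
# stool-sample-R1.filtered.contigs.fa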
Example #5
def buildReferenceGeneSet(infile, outfile):
    """ filter full gene set and add attributes to create the reference gene set

    Performs merge and filter operations:
       * Merge exons separated by small introns (< 5bp).
       * Remove transcripts with very long introns (`max_intron_size`)
       * Remove transcripts located on contigs to be ignored (`remove_contigs`)
         (usually: chrM, _random, ...)
       * (Optional) Remove transcripts overlapping repetitive sequences
         (`rna_file`)

    This preserves all features in a gtf file (exon, CDS, ...)

    Runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id.

    Parameters
    ----------
    infile : str
       Input filename in :term:`gtf` format
    outfile : str
       Output filename in :term:`gtf` format
    annotations_interface_rna_gff : str
       :term:`PARAMS`. Filename of :term:`gtf` file containing
       repetitive rna annotations
    genome_dir : str
       :term:`PARAMS`. Directory of :term:`fasta` formatted files
    genome : str
       :term:`PARAMS`. Genome name (e.g. hg38)
    """

    tmp_mergedfiltered = P.getTempFilename(".")

    if "geneset_remove_repetetive_rna" in PARAMS:
        rna_file = PARAMS["annotations_interface_rna_gff"]
    else:
        rna_file = None

    gene_ids = PipelineMapping.mergeAndFilterGTF(
        infile,
        tmp_mergedfiltered,
        "%s.removed.gz" % outfile,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        max_intron_size=PARAMS["max_intron_size"],
        remove_contigs=PARAMS["geneset_remove_contigs"],
        rna_file=rna_file,
    )

    # Add tss_id and p_id
    PipelineMapping.resetGTFAttributes(
        infile=tmp_mergedfiltered,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        gene_ids=gene_ids,
        outfile=outfile,
    )

    os.unlink(tmp_mergedfiltered)
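The first filter in the docstring above, merging exons separated by small introns, can be pictured in isolation. A toy sketch over sorted (start, end) intervals; the real PipelineMapping.mergeAndFilterGTF operates on GTF records and applies all of the listed filters:

def merge_small_introns(exons, min_intron_size=5):
    '''Merge sorted (start, end) exons whose separating gap is below
    min_intron_size.'''
    merged = [exons[0]]
    for start, end in exons[1:]:
        last_start, last_end = merged[-1]
        if start - last_end < min_intron_size:   # intron too small: merge
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged

print(merge_small_introns([(0, 100), (103, 200), (500, 600)]))
# [(0, 200), (500, 600)]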
Example #6
def countTaggedReads(infiles, outfile):
    '''count number of reads in input files.'''
    to_cluster = True
    read1, read2 = infiles
    m = PipelineMapping.Counter()
    statement = m.build((read1,), outfile)
    P.run()
Example #7
def runFastqc(infiles, outfile):
    '''Convert SRA files to FASTQ and check that mapping qualities are in
    Solexa format. Perform quality control checks on reads from .fastq files.'''
    to_cluster = True
    m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"])
    statement = m.build((infiles, ), outfile)
    P.run()
Example #8
def qcDemuxedReads(infile, outfile):
    ''' Run FastQC on the demultiplexed and trimmed reads'''

    m = PipelineMapping.FastQc(nogroup=False, outdir="fastqc")
    statement = m.build((infile, ), outfile)
    exportdir = "fastqc"
    P.run()
Example #9
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))
    job_memory = "8G"

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
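The config loop above relies on i[22:] stripping the 22-character prefix "fastq_screen_database_" from each parameter name, leaving the database label. A minimal sketch with hypothetical PARAMS entries:

PARAMS = {
    "fastq_screen_database_human": "/indexes/hg38",   # hypothetical paths
    "fastq_screen_database_phix": "/indexes/phix",
}
for i, k in PARAMS.items():
    if i.startswith("fastq_screen_database"):
        print("DATABASE\t%s\t%s" % (i[22:], k))
# prints "DATABASE\thuman\t/indexes/hg38" and "DATABASE\tphix\t/indexes/phix"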
Example #10
def mapReads(infiles, outfile):
    '''Map reads to the genome using BWA '''
    to_cluster = True
    job_options = "-pe dedicated %i -R y" % PARAMS["bwa_threads"]
    m = PipelineMapping.BWA()
    statement = m.build((infiles,), outfile)
    P.run()
Example #11
def mapBowtieAgainstTranscriptomeGSE65525(infiles, outfile):
    ''' map reads using Bowtie against transcriptome

    bowtie parameterised according to Allon et al 2015, except for the
    random reporting of alignments where more than one "best" exists (-M 1):

    -n 1 number of mismatches allowed
    -l 15 seed length
    -e 300 maximum permitted sum of sequence qualities at all mismatched
           positions
    -M 1 if more than one "best" alignment exists, report one at random
    --best report in best to worst order
    --strata only report reads falling into the best stratum
    '''

    infile, reference = infiles
    job_threads = 2
    job_options = "-l mem_free=1.9G"
    bowtie_options = "-n1 -l 15 -e 300 -M 1 --best --strata"
    bowtie_index_dir = os.path.abspath(os.path.dirname(reference))
    genome = P.snip(os.path.basename(reference), ".1.ebwt")
    reffile = reference
    bowtie_threads = job_threads

    m = PipelineMapping.Bowtie(tool_options=bowtie_options,
                               remove_non_unique=0,
                               strip_sequence=0)

    statement = m.build((infile, ), outfile)
    P.run()
Example #12
def runFastQC(infiles, outfile):
    '''run FastQC on each input file.

    Check that mapping qualities are in Solexa format for downloaded
    .fastq and .bam files. Perform quality control checks on reads from
    .fastq and .bam files.

    '''

    infile = infiles
    outdir = os.path.dirname(outfile)

    if infile.endswith(".bam"):
        statement = '''fastqc --extract --outdir=%(outdir)s %(infile)s >& %(outfile)s'''

    else:
        #        outfile = os.path.join(outdir, os.path.basename(outfile.split(os.extsep, 2)[1] + ".fastqc"))
        m = PipelineMapping.FastQC(nogroup=PARAMS["readqc_no_group"],
                                   outdir=outdir,
                                   qual_format=PARAMS['readqc_qual_format'])
        statement = m.build((infile, ), outfile)

    if not os.path.isfile(outfile):
        P.run(statement)
Example #13
def mapReadsWithTophatFusion(infiles, outfile):
    '''map reads from .fastq or .sra files and find candidate fusions

    A list of known splice junctions is expected from the rnaseq pipeline.
    '''

    job_options = "-pe dedicated %i -R y" % PARAMS["tophat_threads"]

    if "--butterfly-search" in PARAMS["tophat_options"]:
        # for butterfly search - require insane amount of
        # RAM.
        job_options += " -l mem_free=50G"

    to_cluster = USECLUSTER
    m = PipelineMapping.TopHat_fusion()
    infile = infiles

    # if a file of reference junctions, as generated by the rnaseq pipeline,
    # has been specified in the ini, then pass this to tophat-fusion
    if PARAMS['tophatfusion_reference_junctions'] is not None:
        reffile = PARAMS['tophatfusion_reference_junctions']
        tophat_options = PARAMS[
            "tophat_options"] + " --raw-juncs %(reffile)s" % locals()

    tophatfusion_options = PARAMS["tophatfusion_options"]
    statement = m.build((infile, ), outfile)
    P.run()
Example #14
def buildBAM(infile, outfile):
    '''map reads with bowtie'''
    track = P.snip(os.path.basename(outfile), ".bam")
    job_threads = PARAMS["bowtie_threads"]
    m = PipelineMapping.Bowtie()
    reffile = PARAMS["samtools_genome"]
    statement = m.build((infile, ), outfile)
    P.run()
Example #15
def mapReads(infiles, outfile):
    '''Map reads to the genome using BWA (output=SAM), convert to BAM, sort and index BAM file '''
    to_cluster = USECLUSTER
    job_options = "-pe dedicated 2 -R y -l mem_free=8G"
    track = P.snip(os.path.basename(outfile), ".bam")
    m = PipelineMapping.BWA(remove_unique=PARAMS["bwa_remove_non_unique"])
    statement = m.build((infiles, ), outfile)
    P.run()
Example #16
def alignReadsToTranscriptome(infile, outfile):
    '''map reads to transcriptome with bowtie'''
    track = P.snip(os.path.basename(outfile), ".bam")
    job_threads = PARAMS["bowtie_threads"]
    m = PipelineMapping.Bowtie()
    reffile = PARAMS["bowtie_transcriptome"]
    bowtie_options = PARAMS["bowtie_options"]
    statement = m.build((infile,), outfile)
    P.run()
Example #17
def buildBAM(infile, outfile):
    '''map reads with bowtie'''
    to_cluster = True
    track = P.snip(os.path.basename(outfile), ".bam")
    job_options = "-pe dedicated %i -R y" % PARAMS["bowtie_threads"]
    m = PipelineMapping.Bowtie()
    reffile = PARAMS["samtools_genome"]
    statement = m.build((infile, ), outfile)
    P.run()
Example #18
def buildBAM(infile, outfile, options):
    '''map reads with bowtie'''
    job_threads = PARAMS["bowtie_threads"]
    m = PipelineMapping.Bowtie()
    reffile = PARAMS["samtools_genome"]
    bowtie_options = options
    statement = m.build((infile,), outfile)
    # print(statement)
    P.run()
Example #19
def buildBAM(infile, outfile, options):
    '''map reads with bowtie'''
    to_cluster = True
    job_options = "-pe dedicated %i -R y" % PARAMS["bowtie_threads"]
    m = PipelineMapping.Bowtie()
    reffile = PARAMS["samtools_genome"]
    bowtie_options = options
    statement = m.build((infile,), outfile)
    # print(statement)
    P.run()
Example #20
def mapReadsWithBismark(infile, outfile):
    '''map reads with bismark'''

    # can this handle paired end?
    # it appears bismark uses twice as many CPUs as expected!
    job_options = "-l mem_free=%s " % PARAMS["bismark_memory"]
    job_threads = (PARAMS["bismark_threads"] * 2) + 1
    outdir = "bismark.dir"
    bismark_options = PARAMS["bismark_options"]
    m = PipelineMapping.Bismark()
    statement = m.build((infile,), outfile)
    # print statement
    P.run()
Example #21
def mapReadsWithBowtieAgainstRayContigs(infile, outfile):
    '''
    map reads against contigs with bowtie
    '''
    PARAMS["bowtie_index_dir"] = "ray.dir"
    PARAMS["genome"] = TRACKS.getTracks(infile)[0].split(".")[0]

    infile, reffile = infile, os.path.join("ray.dir",
                                           TRACKS.getTracks(infile)[0])
    m = PipelineMapping.Bowtie(executable=P.substituteParameters(
        **locals())["bowtie_executable"])
    statement = m.build((infile, ), outfile)
    P.run()
Example #22
def mapReads(infile, outfile):
    '''Map reads to the genome using BWA, sort and index BAM file,
    generate alignment statistics and deduplicate using Picard'''

    job_threads = PARAMS["bwa_threads"]
    job_memory = PARAMS["bwa_memory"]

    if PARAMS["bwa_algorithm"] == "aln":
        m = PipelineMapping.BWA(
            remove_non_unique=PARAMS["bwa_remove_non_unique"],
            strip_sequence=False)

    elif PARAMS["bwa_algorithm"] == "mem":
        m = PipelineMapping.BWAMEM(
            remove_non_unique=PARAMS["bwa_remove_non_unique"],
            strip_sequence=False)
    else:
        raise ValueError("bwa algorithm '%s' not known" % algorithm)

    statement = m.build((infile, ), outfile)
    print(statement)
    P.run()
Example #23
def mapReads(infiles, outfile):
    '''Map reads to the genome using BWA (output=SAM), convert to BAM,
    sort and index BAM file, generate alignment statistics and
    deduplicate using Picard
    '''

    job_options = "-pe dedicated 2 -l mem_free=8G"
    track = P.snip(os.path.basename(outfile), ".bam")
    m = PipelineMapping.BWA(remove_unique=PARAMS["bwa_remove_non_unique"],
                            align_stats=True,
                            dedup=True)
    statement = m.build((infiles, ), outfile)
    P.run()
Example #24
def pseudoalignWithKallisto(infiles, outfile):
    ''' pseudoalign with kallisto '''

    infile, index = infiles

    job_threads = PARAMS['alignment_free_threads']
    job_memory = "6G"

    kallisto_options = PARAMS["kallisto_options"]
    kallisto_bootstrap = PARAMS["alignment_free_bootstrap"]

    m = PipelineMapping.Kallisto(pseudobam=1, readable_suffix='.tsv')
    statement = m.build((infile, ), outfile)

    P.run()
Example #25
def run_mapping(infile, outfile):
    ''' Map reads using the selected read mapper '''

    job_threads = PARAMS["bowtie_threads"]
    job_memory = PARAMS["bowtie_memory"]

    m = PipelineMapping.Bowtie(
        executable="bowtie",
        tool_options=PARAMS["bowtie_options"],
        strip_sequence=0)

    genome = PARAMS["bowtie_genome"]
    reffile = os.path.join(PARAMS["bowtie_index_dir"],
                           PARAMS["bowtie_genome"] + ".fa")

    statement = m.build((infile,), outfile)

    P.run()
Example #26
def runSailfish(infiles, outfile):
    '''quantify abundance'''

    to_cluster = True
    job_options = "-pe dedicated %i -R y" % PARAMS["sailfish_threads"]

    infile, index = infiles
    index = P.snip(index, "/transcriptome.sfi")

    sample = P.snip(os.path.basename(outfile), "_quant.sf")
    outdir = "quantification/%(sample)s" % locals()

    m = PipelineMapping.Sailfish(strand=PARAMS["sailfish_strandedness"],
                                 orient=PARAMS["sailfish_orientation"],
                                 threads=PARAMS["sailfish_threads"])

    statement = m.build((infile,), outfile)

    P.run()
Example #27
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
Example #28
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_fn, "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
Example #29
def runFeatureCountsAddModels(infiles, outfiles):
    '''
    First align with hisat2 and then quantify with featureCounts
    '''

    junctions, infile, annotations, sequins_genome_index, transcript_map = infiles

    ### align with hisat ###
    job_threads = PARAMS["hisat_threads"]
    job_memory = PARAMS["hisat_memory"]

    tmp_outfile = P.getTempFilename()

    hisat_index_dir = os.path.dirname(sequins_genome_index)
    genome = P.snip(os.path.basename(sequins_genome_index), ".1.ht2")

    m = PipelineMapping.Hisat(executable='hisat2',
                              strip_sequence=0,
                              stranded=PARAMS["hisat_strandedness"])

    statement = m.build((infile, ), tmp_outfile)

    P.run()

    ### quantify with featureCounts ###
    transcript_outfile, gene_outfile = outfiles

    Quantifier = PipelineRnaseq.FeatureCountsQuantifier(
        infile=tmp_outfile,
        transcript_outfile=transcript_outfile,
        gene_outfile=gene_outfile,
        job_threads=PARAMS['featurecounts_threads'],
        strand=PARAMS['featurecounts_strand'],
        options=PARAMS['featurecounts_options'],
        annotations=annotations)

    Quantifier.run_all()

    os.unlink(tmp_outfile)
Example #30
def filterPhiX(infiles, outfile):
    ''' Use mapping to bowtie to remove any phiX mapping reads '''

    infile, reffile = infiles
    outfile = P.snip(outfile, ".gz")
    bam_out = P.snip(infile, ".fastq.gz") + ".phix.bam"

    job_threads = PARAMS["phix_bowtie_threads"]
    job_memory = PARAMS["phix_bowtie_memory"]
    options = PARAMS["phix_bowtie_options"] + " --un %s" % outfile
    genome = PARAMS["phix_genome"]
    bowtie_threads = PARAMS["phix_bowtie_threads"]

    m = PipelineMapping.Bowtie(executable=PARAMS["phix_bowtie_exe"],
                               strip_sequence=False,
                               remove_non_unique=False,
                               tool_options=options)

    statement = m.build((infile, ), bam_out)
    statement += "checkpoint; gzip %(outfile)s"

    P.run()
Example #31
def mapBWAAgainstGenesetGSE53638(infiles, outfile):
    ''' map reads using BWA against transcriptome data

    bwa parameterised according to Soumillon et al 2014:
    -l 24 = seed length - 24 bp
    -k 2 = default number of mismatches allowed in seed - 2
    -n 0.04 = default percentage of mismatches allowed across read - 4%

    non-unique alignments will NOT be removed from the final bam
    '''

    infile, reference = infiles
    job_threads = 2
    job_options = "-l mem_free=1.9G"
    bwa_aln_options = "-l 24 -k 2 -n 0.04"
    bwa_index_dir = os.path.abspath(os.path.dirname(reference))
    genome = P.snip(os.path.basename(reference), ".sa")
    bwa_threads = job_threads
    bwa_samse_options = ""
    m = PipelineMapping.BWA(remove_non_unique=0, strip_sequence=0, set_nh=1)

    statement = m.build((infile, ), outfile)
    P.run()
Example #32
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step 
    removing all intervals in which there is a peak of
    more than readlength / 2 height in the control.

    .. note::

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif P.isEmpty(infile):
        E.warn("no data in %s" % filename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift // 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = PipelineMapping.getReadLengthFromBamfile(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak higher than %i reads" %
                   control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                     peak.refined_start,
                                                                                     peak.refined_end,
                                                                                     controlfiles,
                                                                                     offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                 peak.refined_start,
                                                                                 peak.refined_end,
                                                                                 samfiles,
                                                                                 offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig, peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
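The control filter described in the docstring can be isolated into a small sketch: an interval is dropped when the control signal inside it peaks above readlength / 2. Here a plain list of per-position control read counts stands in for the countPeaks/pysam machinery used above:

def passes_control_filter(control_coverage, readlength):
    '''Keep a peak only if the control never rises above readlength / 2.'''
    return max(control_coverage) <= readlength // 2

print(passes_control_filter([0, 3, 10, 4], readlength=36))   # True  (10 <= 18)
print(passes_control_filter([0, 3, 25, 4], readlength=36))   # False (25 > 18)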