def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    One column header is derived per input file by stripping *suffix*
    from the filename and quoting the result as an SQL identifier.
    '''
    header = ",".join([P.quote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        # compressed inputs: expose decompressed first two columns via
        # bash process substitution so downstream tools see plain files
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])
    tablename = P.toTable(outfile)
    # NOTE: P.run() interpolates the %(...)s placeholders from this
    # frame's locals (header, filenames, tablename, outfile) plus
    # pipeline globals such as scriptsdir - do not rename these locals.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    """Load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise. Column headers are the
    input filenames with *suffix* removed, quoted as SQL identifiers.
    """
    # one quoted column header per input file
    header = ",".join(P.quote(P.snip(f, suffix)) for f in infiles)
    # pick the reader matching the compression and expose the first two
    # columns of every input through bash process substitution
    reader = "zcat" if suffix.endswith(".gz") else "cat"
    filenames = " ".join(
        "<( %s %s | cut -f 1,2 )" % (reader, f) for f in infiles)
    tablename = P.toTable(outfile)
    # P.run() fills the %(...)s placeholders from this frame's locals;
    # header/filenames/tablename/outfile must keep exactly these names.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets.

    For each ``.bed`` input file the column *field* is read from the
    corresponding ``<track>_intervals`` database table and indexed by
    genomic position.  For every interval in *reference* the maximum
    *field* value of overlapping intervals in each set is written to
    *outfile* (one column per track; empty when there is no overlap).
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed")
        tablename = "%s_intervals" % P.quote(track)
        # NOTE(review): field/table names are interpolated directly into
        # the SQL; acceptable only because both derive from
        # pipeline-controlled names, not user input.
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc = dbhandle.cursor()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        # close each per-track cursor explicitly (previously left to GC)
        cc.close()
        idx.append(ix)
        tracks.append(track)
    # all rows have been fetched into the indexes - release the connection
    dbhandle.close()

    outs = IOTools.openFile(outfile, "w")
    try:
        outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")
        # close the reference file deterministically (was previously
        # opened inline and never closed)
        with open(reference, "r") as ref:
            for bed in Bed.iterator(infile=ref):
                row = []
                for ix in idx:
                    try:
                        intervals = list(
                            ix.get(bed.contig, bed.start, bed.end))
                    except KeyError:
                        # contig absent from this interval set
                        row.append("")
                        continue
                    if len(intervals) == 0:
                        peakval = ""
                    else:
                        # report the strongest overlapping interval
                        peakval = str(max(x[2] for x in intervals))
                    row.append(peakval)
                outs.write(str(bed) + "\t" + "\t".join(row) + "\n")
    finally:
        outs.close()
def loadBAMStats(infiles, outfile):
    """import bam statisticis.

    Loads the per-file summary table (transposed so that each track
    becomes a row) and then, for each of the ``nm`` and ``nh``
    suffixes, the corresponding per-file tables into separate
    database tables.
    """
    # one quoted column header per input file, ".readstats" stripped
    header = ",".join([P.quote(P.snip(x, ".readstats")) for x in infiles])
    # first two columns of each file, via bash process substitution
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    # NOTE: P.run() interpolates %(...)s placeholders from this frame's
    # locals (header, filenames, tablename, outfile) - do not rename.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        # the per-suffix data lives in sibling files "<infile>.<suffix>"
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        # each suffix is loaded into its own table "<tablename>_<suffix>"
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header=%(header)s --skip-titles --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadBAMStats(infiles, outfile):
    """import bam statisticis.

    First combines, transposes and loads the per-file summary tables;
    then appends one database table per extra suffix ("nm", "nh").
    """
    tablename = P.toTable(outfile)
    # one quoted header per input file with ".readstats" removed
    header = ",".join(P.quote(P.snip(f, ".readstats")) for f in infiles)
    # expose the first two columns of each file via process substitution
    filenames = " ".join("<( cut -f 1,2 < %s)" % f for f in infiles)

    E.info("loading bam stats - summary")
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so header/filenames/tablename/outfile must keep these exact names.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        # sibling files "<infile>.<suffix>" go into table "<table>_<suffix>"
        filenames = " ".join("%s.%s" % (f, suffix) for f in infiles)
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header=%(header)s --skip-titles --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadTranscriptomeValidation(infiles, outfile):
    """load transcriptome validation data into database."""
    # run the loading step on the cluster (pipeline framework reads this
    # local to decide job placement)
    to_cluster = USECLUSTER
    # one quoted column header per input, ".accepted.bam" stripped
    headers = ",".join([P.quote(P.snip(x, ".accepted.bam")) for x in infiles])
    # the validation data itself lives in the matching ".log" files;
    # infiles is rebound to the space-separated file list on purpose
    # because %(infiles)s is interpolated into the statement below
    infiles = " ".join(["%s.log" % x for x in infiles])
    tablename = P.toTable(outfile)
    # NOTE: P.run() interpolates %(...)s from this frame's locals
    # (headers, infiles, tablename, outfile) - do not rename them.
    statement = """ python %(scriptsdir)s/combine_tables.py --headers=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s """
    P.run()
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''
    # pipeline framework reads this local to decide job placement
    to_cluster = USECLUSTER
    tablename = P.toTable(outfile)
    # one quoted column header per track (".accepted.bam" removed)
    headers = ",".join(
        P.quote(P.snip(f, ".accepted.bam")) for f in infiles)
    # the actual inputs are the matching ".log" files; rebinding the
    # name `infiles` is deliberate - %(infiles)s is interpolated below
    infiles = " ".join("%s.log" % f for f in infiles)
    # P.run() fills %(...)s placeholders from this frame's locals, so
    # headers/infiles/tablename/outfile must keep these exact names.
    statement = ''' python %(scriptsdir)s/combine_tables.py --headers=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s '''
    P.run()
def writeSequencesForIntervals(track, filename, dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               masker=[],
                               proportion=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.
    '''
    # NOTE: the mutable default masker=[] is never mutated here, so the
    # shared-default pitfall does not apply; kept for interface
    # compatibility with existing callers.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % locals() + orderby
    cc = dbhandle.cursor()
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # decide how many intervals to keep
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info("writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (
        track, cutoff))
    data = data[:cutoff]
    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))
    # (a second, redundant IndexedFasta construction was removed here)

    # modify the ranges
    if shift:
        if shift == "leftright":
            # build same-width flanking intervals on both sides
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth,
                 peakcenter + halfwidth, interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # apply offset and clip to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # BUG FIX: this message previously formatted the builtin
            # ``id`` function (printing "<built-in function id>")
            # instead of the interval identifier.
            L.info("writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (
                track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info("writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (
                track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # apply each requested masker in turn; the loop variable is renamed
    # so it no longer clobbers the *masker* parameter
    for mask_method in masker:
        if mask_method not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask_method)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>-<end>"
        header = "%s_%s %s:%i-%i" % (
            track, str(interval_id), contig, start, end)
        outs.write(">%s\n%s\n" % (header, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals(track, filename, dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               masker=[],
                               proportion=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.
    '''
    # NOTE: the mutable default masker=[] is never mutated in this
    # function, so sharing it across calls is harmless; kept to preserve
    # the existing call interface.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % locals() + orderby
    cc = dbhandle.cursor()
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # number of intervals to keep after sorting
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (track, cutoff))
    data = data[:cutoff]
    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))
    # (a second, redundant IndexedFasta construction was removed here)

    # modify the ranges
    if shift:
        if shift == "leftright":
            # same-width flanking intervals to the left and right
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth,
                 peakcenter + halfwidth, interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # apply offset and clip to contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # BUG FIX: previously interpolated the builtin ``id``
            # function here instead of the interval identifier.
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # apply each requested masker; loop variable renamed so the
    # *masker* parameter is no longer clobbered
    for mask_method in masker:
        if mask_method not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask_method)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>-<end>"
        header = "%s_%s %s:%i-%i" % (
            track, str(interval_id), contig, start, end)
        outs.write(">%s\n%s\n" % (header, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals( track, filename, dbhandle, full = False, halfwidth = None, maxsize = None, proportion = None, masker = [], offset = 0, shuffled = False, min_sequences = None ):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions
    '''
    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )

    cc = dbhandle.cursor()
    # the sort criterion comes from the pipeline configuration, not a
    # function parameter (unlike the newer variant of this function)
    if PARAMS["score"] == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif PARAMS["score"] == "max":
        orderby = " ORDER BY score DESC"
    elif PARAMS["score"] == "min":
        orderby = " ORDER BY score ASC"
    else:
        raise ValueError("Unknown value passed as score parameter, check your ini file")

    tablename = "%s_intervals" % P.quote( track )
    # %(...)s placeholders are filled from locals(): tablename, halfwidth
    if full:
        # full intervals as stored in the table
        statement = '''SELECT start, end, interval_id, contig FROM %(tablename)s ''' % locals() + orderby
    elif halfwidth:
        # window of +/- halfwidth around the peak center
        statement = '''SELECT peakcenter - %(halfwidth)s, peakcenter + %(halfwidth)s, interval_id, contig FROM %(tablename)s ''' % locals() + orderby
    else:
        raise ValueError("either specify full or halfwidth" )
    cc.execute( statement )
    data = cc.fetchall()
    cc.close()

    # number of intervals to keep after sorting
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max( cutoff, min_sequences )
    else:
        cutoff = len(data)
    L.info( "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (track, cutoff) )
    L.info( "writeSequencesForIntervals %s: masker=%s" % (track,str(masker)))

    # NOTE(review): this re-creates the IndexedFasta opened above -
    # appears redundant; confirm IndexedFasta construction is side-effect
    # free before removing.
    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"]) )

    # get the sequences - stop once *maxsize* characters are collected
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    # NOTE: the loop variable ``id`` shadows the builtin of the same name
    for start, end, id, contig in data[:cutoff]:
        lcontig = fasta.getLength( contig )
        # apply offset and clip to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info( "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (track, id, start, end, offset))
            continue
        seq = fasta.getSequence( contig, "+", start, end )
        sequences.append( seq )
        new_data.append( (start, end, id, contig) )
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info( "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (track, maxsize, nseq, len(data) - nseq ) )
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [ list(x) for x in sequences ]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences( ["".join(x) for x in sequences ], masker )

    c = E.Counter()
    outs = IOTools.openFile(filename, "w" )
    # NOTE(review): the loop variable clobbers the *masker* parameter;
    # harmless only because masker is not used after this loop
    for masker in masker:
        sequences = maskSequences( sequences, masker )
    for sequence, d in zip( sequences, data ):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>..<end>"
        id = "%s_%s %s:%i..%i" % (track, str(id), contig, start, end)
        outs.write( ">%s\n%s\n" % (id, sequence ) )
        c.output += 1
    outs.close()
    E.info("%s" % c )
    return c.output