Example #1
def plotMetrics(infile, outfile):
    '''
    Intermediate target to plot metrics.
    '''

    IOTools.touch_file(outfile)
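
The tasks in these examples finish by touching an empty sentinel file so the pipeline framework can register the task as complete. A minimal standalone sketch of the same pattern, using only the standard library (touch_sentinel is a hypothetical name; the examples use the cgatcore IOTools/iotools helper instead):

import pathlib

def touch_sentinel(outfile):
    # create an empty file (or update its timestamp) to mark the task as done
    pathlib.Path(outfile).touch()

touch_sentinel("metrics.sentinel")  # hypothetical sentinel filename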
Example #2
def checkMkfastqInputs(infile, outfile):
    '''Check mkfastq input .sample files'''

    sample_information()

    IOTools.touch_file(outfile)
Example #3
def metrics(infiles, outfile):
    '''
    Intermediate target to run metrics tasks.
    '''

    IOTools.touch_file(outfile)
Example #4
def create_view(dbhandle,
                tables,
                tablename,
                outfile,
                view_type="TABLE",
                ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    '''

    database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = database.executewait(
            dbhandle, "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append([
            x.lower() for x in database.getColumnNames(dbhandle, table)
            if x != track
        ])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError("number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join([
        "t0.%s = t%i.%s" % (f, x + 1, y[1]) for x, y in enumerate(tables[1:])
    ])
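    # for the docstring example these become, for instance:
    #   FROM reads_summary as t0 , bam_stats as t1 , ...
    #   WHERE t0.track = t1.track AND t0.track = t2.track AND ...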

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()
    database.executewait(dbhandle, statement)

    nrows = database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %(tablename)s" % locals()).fetchone()[0]

    if nrows == 0:
        raise ValueError("empty view mapping, check statement = %s" %
                         (statement % locals()))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created view_mapping with %i rows" % nrows)
    touch_file(outfile)
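
A minimal usage sketch for create_view (hypothetical table and file names; it assumes dbhandle is whatever connection object the cgatcore database helpers above expect, e.g. a sqlite3 connection):

import sqlite3

dbhandle = sqlite3.connect("csvdb")

create_view(
    dbhandle,
    tables=(("reads_summary", "track"),
            ("bam_stats", "track")),
    tablename="view_mapping",
    outfile="view_mapping.sentinel",
    view_type="TABLE")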
Example #5
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #6
def subsetAndDownsample(infiles, outfile):
    '''
    Generate datasets that include subsets of the 10x samples.

    Optionally downsample UMI counts to normalise between samples.
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    agg_matrix_dir = os.path.join(os.path.dirname(infiles[0]),
                                  "agg.processed.dir")

    sample_table = pd.read_csv(infiles[1], sep="\t")

    subsets = [k.split("_", 1)[1] for k in PARAMS.keys()
               if k.startswith("datasets_")]

    # Titles of fields encoded in filenames
    name_field_titles = PARAMS["name_field_titles"]

    if PARAMS["downsampling_enabled"]:
        downsampling_function = PARAMS['downsampling_function']
    else:
        downsampling_function = "no"

    downsampling_apply = PARAMS["downsampling_apply"]

    job_memory = PARAMS["postprocess_memory"]

    statements = []

    for subset in subsets:

        if subset == "all":
            if not PARAMS["datasets_all"]:
                continue

            sample_ids = set(sample_table["sample_id"].values)
            sample_ids_str = ",".join(sample_ids)

        else:

            sample_ids = PARAMS["datasets" + "_" + subset]

            sample_ids_str = ",".join([x.strip() for x in
                                       sample_ids.split(",")])

        out_dir = os.path.join(os.path.dirname(outfile),
                               subset)

        tenx_dir = PARAMS["tenx_dir"]

        log_file = outfile.replace(".sentinel",
                                   "." + subset + ".log")

        statement = '''Rscript %(tenx_dir)s/R/cellranger_subsetAndDownsample.R
                       --tenxdir=%(agg_matrix_dir)s
                       --sampleids=%(sample_ids_str)s
                       --downsample=%(downsampling_function)s
                       --apply=%(downsampling_apply)s
                       --samplenamefields=%(name_field_titles)s
                       --outdir=%(out_dir)s
                       &> %(log_file)s
                    ''' % locals()

        statements.append(statement)

    P.run(statements)

    IOTools.touch_file(outfile)
Example #7
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
               for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        iotools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert numpy.array_equal(xedges, unspiked_xedges)

    tmpfile = iotools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(("expression", "fold", "fdr", "counts",
                             "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = iotools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to the fraction of spike-ins recovered per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(
                map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y],
                          100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(
                map(str, (fdr, power, power_counts.sum().sum(), 100.0 *
                          power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
Example #8
def postprocessAggrMatrix(infiles, outfile):
    '''
    Post-process the cellranger aggr count matrix.

    Batch, sample_name and aggregation ID metadata are added.

    Optionally, cells with barcodes shared between samples (within a
    sequencing batch) can be removed (a known index-hopping issue on the
    Illumina 4000).
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    infile = infiles[0]

    sample_table = infiles[1]

    agg_dir = os.path.dirname(infile)
    out_dir = os.path.dirname(outfile)

    # Clean barcode hopping
    if PARAMS["postprocess_barcodes"]:
        hopping = "--hopping"
    else:
        hopping = ""

    # Additional options
    options = PARAMS["postprocess_options"]

    mexdir = PARAMS["postprocess_mexdir"]

    if mexdir is None:
        raise ValueError('"postprocess_mexdir" parameter not set'
                         ' in file "pipeline.yml"')

    tenxdir = os.path.join(agg_dir, mexdir)
    if not os.path.exists(tenxdir):
        raise ValueError('The specified "postprocess_mexdir"'
                         ' directory does not exist in directory ' + agg_dir)

    job_memory = PARAMS["postprocess_memory"]

    blacklist = PARAMS["postprocess_blacklist"]

    log_file = outfile.replace(".sentinel", ".log")

    statement = '''Rscript %(tenx_dir)s/R/cellranger_postprocessAggrMatrix.R
                   --tenxdir=%(tenxdir)s
                   --sampletable=%(sample_table)s
                   --samplenamefields=%(name_field_titles)s
                   --downsample=no
                   %(hopping)s
                   --blacklist=%(blacklist)s
                   %(options)s
                   --outdir=%(out_dir)s
                   &> %(log_file)s
                '''

    P.run(statement)

    IOTools.touch_file(outfile)
Example #9
def cellrangerCount(infile, outfile):
    '''
    Execute the cellranger count pipeline for all samples.
    '''
    # set key parameters
    transcriptome = PARAMS["cellranger_transcriptome"]

    if transcriptome is None:
        raise ValueError('"cellranger_transcriptome" parameter not set'
                         ' in file "pipeline.yml"')

    if not os.path.exists(transcriptome):
        raise ValueError('The specified "cellranger_transcriptome"'
                         ' file does not exist')

    memory = PARAMS["cellranger_memory"]
    job_threads = PARAMS["cellranger_threads"]
    mem_per_core = int(float(memory) / job_threads)  # round down
    job_memory = str(mem_per_core) + "M"

    # cellranger expects memory in GB
    cellranger_memory = str(int((mem_per_core * job_threads)/1000) - 2)
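    # e.g. with cellranger_memory=40000 (MB) and cellranger_threads=8
    # (illustrative values): mem_per_core = 5000, job_memory = "5000M",
    # and cellranger_memory = "38" (GB, leaving ~2 GB of headroom)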

    # parse the sample name and expected cell number
    library_id, cellnumber, batch, trash = os.path.basename(infile).split(".")

    # build lists of the sample files
    seq_folders = []
    sample_ids = []

    # Parse the list of sequencing runs (i.e., paths) for the sample
    with open(infile, "r") as sample_list:
        for line in sample_list:
            seq_folder_path = line.strip()
            if seq_folder_path != "":
                seq_folders.append(seq_folder_path)
                sample_ids.append(os.path.basename(seq_folder_path))

    input_fastqs = ",".join(seq_folders)
    input_samples = ",".join(sample_ids)

    id_tag = library_id + "-count"

    log_file = id_tag + ".log"

    statement = (
        '''cellranger count
                   --id %(id_tag)s
                   --fastqs %(input_fastqs)s
                   --sample %(input_samples)s
                   --transcriptome %(transcriptome)s
                   --expect-cells %(cellnumber)s
                   --chemistry %(cellranger_chemistry)s
                   --jobmode=local
                   --localcores %(job_threads)s
                   --localmem %(cellranger_memory)s
                   --nopreflight
            &> %(log_file)s
        ''')

    P.run(statement)

    IOTools.touch_file(outfile)
Example #10
def full(outfile):
    touch_file(outfile)
Example #11
def aggregateAdaptors(infile, outfile):
    iotools.touch_file(outfile)
Example #12
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s; 
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s; 
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s; 
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;                   
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if iotools.is_empty(infiles[0]) or iotools.is_empty(infiles[1]):
            iotools.touch_file(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.is_empty(infiles[0]):
            iotools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if iotools.is_empty(fn):
                iotools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)